"""RSS Cache

This is a cache for RSS, CDF and Atom feeds built around the universal
feed parser from Mark Pilgrim, http://feedparser.org

The latest version may be found on http://jensge.org?feedmap

Required: feedparser
Required: bsddb3 for python < 2.3
"""

__version__ = "0.1"
__license__ = "Python"
__copyright__ = "Copyright 2005, Jens Georg"
__author__ = "Jens Georg "

import md5
import os.path
import time
import feedparser
import re
try:
    from cPickle import loads, dumps, UnpicklingError
except ImportError:
    # The pure-Python fallback module is spelled in lowercase.
    from pickle import loads, dumps, UnpicklingError
import urllib2
import urlparse
import sys

# if python >= 2.3 try builtin bsddb support else use bsddb3 module
if sys.version_info[:2] >= (2, 3):
    from bsddb.db import *
else:
    from bsddb3.db import *

_debug = False

USER_AGENT = "RssCache/%s +http://jensge.org" % __version__

# Permissions for the database file: read/write for the owner only (octal
# 600).  Spelled through int() because the octal literal syntax is not the
# same in every Python version.
_DB_FILE_MODE = int("600", 8)

# Database keys whose values are stored pickled; all other keys hold plain
# strings.
_PICKLED_KEYS = ("feed", "modified-since")


class FeedGoneException(Exception):
    """
    The feed was removed from the site
    """
    pass


class FeedMovedException(Exception):
    """
    This exception is meant to be raised if feedparser got a 301.

    Nothing in this module raises it; RssCache.update() merely ignores it.
    """
    pass


class FeedNotFoundException(Exception):
    pass


class FeedDownloadError(Exception):
    """
    This is a general download error, which is meant to be raised if
    feed.bozo is set and the exception is an urllib2.URLError
    """
    pass


class UnknownAuthMethodError(Exception):
    """
    The cached authentication type is neither "basic" nor "digest"
    """
    pass


class MalformedServerResponseError(Exception):
    """
    The WWW-Authenticate header of a 401 response could not be parsed
    """
    pass


# simple cache class for rss feeds
class RssCache:
    """
    Cache for one feed, stored in a Berkeley DB file below a cache
    directory.

    The following string keys are written to the database by this class:
    "lastdate" (time of the last update, seconds since the epoch),
    "feed" (pickled feedparser result), "etag", "modified-since" (pickled),
    "authtype", "realm" and "gone".
    """

    def __init__(self, cachedir, feed):
        """
        Open (and create if necessary) the cache for `feed` in `cachedir`.

        cachedir -- directory holding the database environment; it is
                    created when missing.
        feed     -- feed description object.  This class uses its
                    getDigest() (database file name), getCredentials(),
                    url, domain and expire members.
        """
        # Define the handles first so that close() and __del__ work even
        # when one of the steps below raises.
        self.env = None
        self.db = None

        self.dir = os.path.abspath(cachedir)
        self.feed = feed
        # "modified-since" is the key force_update() really stores.
        self.keywords = ["lastdate", "feed", "authtype", "realm", "etag",
                         "modified-since"]

        if not os.path.isdir(self.dir):
            os.mkdir(self.dir)

        # open environment for threaded access
        myflags = DB_THREAD | DB_CREATE
        flagsforenv = (DB_INIT_MPOOL | DB_INIT_LOCK | DB_INIT_LOG |
                       DB_INIT_TXN)

        self.env = DBEnv()
        self.env.set_lk_detect(DB_LOCK_DEFAULT)
        self.env.open(self.dir, myflags | flagsforenv)

        self.db = DB(self.env)
        self.db.open(self.feed.getDigest(), DB_HASH, myflags, _DB_FILE_MODE)

    def __del__(self):
        self.close()

    def invalidate_auth_cache(self):
        """
        This method removes the auth method and realm from the cache; this
        may be necessary if either of them has changed
        """
        # Remove each key on its own: a missing "authtype" must not keep a
        # stale "realm" in the cache.
        for key in ("authtype", "realm"):
            if self.db.has_key(key):
                del self.db[key]

    def auth_handler(self):
        """
        We try to return the auth handler suitable to our feed (which
        authentication method, which realm, etc.)

        The method tries to use cached values for this; if not available
        (because we're calling it for the first time or used
        invalidate_auth_cache()), it'll try to autodetect them.

        Returns a list holding one urllib2 auth handler, or an empty list
        when the feed has no credentials or the probe request did not
        answer with 401.

        Raises MalformedServerResponseError if the WWW-Authenticate header
        is missing or unparsable, and UnknownAuthMethodError if the cached
        method is neither "basic" nor "digest".
        """
        # This feed is not password protected, so don't return a handler
        try:
            user, password = self.feed.getCredentials()
        except Exception:
            return []

        if not (self.db.has_key("realm") and self.db.has_key("authtype")):
            if _debug:
                print("Detecting realm and authtype:")

            # autodetect authentication method and realm
            p = feedparser.parse(self.feed.url, agent = USER_AGENT)

            # There was another error (possibly no HTTP status at all), let
            # the main update procedure cope with this
            if p.get("status") != 401:
                return []

            challenge = p.get("headers", {}).get("www-authenticate", "")
            # The scheme is matched case-insensitively; it is lower-cased
            # before being stored.
            extractor = re.match('^(Digest|Basic) realm="(.*)"',
                                 challenge.split(",")[0], re.IGNORECASE)
            if not extractor:
                raise MalformedServerResponseError()

            self.db["authtype"] = extractor.group(1).lower()
            self.db["realm"] = extractor.group(2)

        authtype = self.db["authtype"]
        realm = self.db["realm"]

        if _debug:
            print("Realm: %s, Authtype: %s" % (realm, authtype))

        if authtype == "basic":
            auth = urllib2.HTTPBasicAuthHandler()
        elif authtype == "digest":
            auth = urllib2.HTTPDigestAuthHandler()
        else:
            raise UnknownAuthMethodError()

        auth.add_password(realm, self.feed.domain, user, password)
        return [auth]

    def force_update(self, really_force = False):
        """
        Fetch the feed now and store the result in the cache.

        really_force -- when true, the stored etag and modified-since
                        values are not sent; an empty etag and the epoch
                        are passed to feedparser instead.

        Outcomes are reported with print.  Nothing is stored when the
        download or parse failed without entries, on 304 (unchanged), on
        401 (authentication failed) and on 410 (gone; the feed is flagged
        with the "gone" key).  Otherwise "lastdate" and the pickled result
        are written, plus etag / modified-since when the response carries
        them.
        """
        # NOTE(review): a feed already marked gone is still fetched below;
        # confirm whether an early return is wanted here.
        if self.db.has_key("gone"):
            print("Feed from %s is gone..." % self.feed.url)

        if _debug and self.db.has_key("lastdate"):
            now = int(time.time())
            print("Updating cache. Lastdate: %s, now: %d, delta: %d" % (
                self.db["lastdate"], now, now - int(self.db["lastdate"])))

        if self.db.has_key('etag') and not really_force:
            etag = self.db['etag']
            if _debug:
                print("Setting etag to %s" % etag)
        else:
            etag = ''

        if self.db.has_key('modified-since') and not really_force:
            modified = loads(self.db['modified-since'])
            if _debug:
                print("Setting modified to %s" % str(modified))
        else:
            modified = (1970, 1, 1, 0, 0, 0, 0, 0, 0)

        d = feedparser.parse(self.feed.url,
                             agent = USER_AGENT,
                             handlers = self.auth_handler(),
                             etag = etag,
                             modified = modified)

        # A failed download presumably has no HTTP status, so look it up
        # without assuming the key is present.
        status = d.get("status")

        # look if there was an error parsing this feed
        if d.bozo == 1:
            if not d.entries and status != 304:
                error = d.get("bozo_exception")
                if isinstance(error, urllib2.URLError):
                    print("Error downloading %s: %s" % (self.feed.url,
                                                       str(error)))
                else:
                    print("Error while parsing feed %s: %s" % (
                        self.feed.url, str(error)))
                return
            else:
                # non-fatal error; drop the exception object before the
                # result gets pickled further down
                del d['bozo_exception']
                d.bozo = 0

        if status == 304:
            # feed did not change
            print("No need to update %s" % self.feed.url)
            return
        elif status == 301:
            # permanent redirect; the result is still cached below
            print("Feed from %s has been permanently redirected to %s" % (
                self.feed.url, d.url))
        elif status == 410:
            # feed gone (HTTP 410, not 401)
            print("Feed from %s is gone!" % self.feed.url)
            self.db["gone"] = "1"
            return
        elif status == 401:
            # Credentials missing or rejected: keep what is cached instead
            # of storing the error response.
            print("Authentication for feed %s failed" % self.feed.url)
            return
        else:
            # update etag and modified-since, each one independently of
            # the other
            new_etag = d.get("etag")
            if new_etag is not None:
                self.db['etag'] = new_etag
            new_modified = d.get("modified")
            if new_modified is not None:
                self.db['modified-since'] = dumps(new_modified)

        self.db["lastdate"] = str(int(time.time()))
        self.db["feed"] = dumps(d)

    def update(self):
        """
        Refresh the cache if it has never been filled or if the last update
        is at least self.feed.expire seconds old.
        """
        def too_old():
            expires_at = int(self.db["lastdate"]) + self.feed.expire
            return expires_at <= int(time.time())

        if not self.db.has_key("lastdate") or too_old():
            try:
                self.force_update()
            except FeedMovedException:
                # ignore FeedMovedException
                pass

    def invalidate(self):
        """
        Invalidate the cache contents and force the cache to refresh its
        contents on the next access
        """
        if _debug:
            print("Invalidating cache")
        if self.db.has_key("lastdate"):
            del self.db["lastdate"]

    def get_feed(self):
        """
        Return the cached feedparser result, updating the cache first if
        necessary.

        Raises KeyError if no feed has been stored.
        """
        self.update()
        # NOTE: the value is unpickled; the cache file must be trusted.
        content = loads(self.db["feed"])
        return content

    def sync(self):
        """
        Flush the database; a DBIncompleteError is ignored.
        """
        try:
            self.db.sync()
        except DBIncompleteError:
            pass

    def close(self):
        """
        Close the database and its environment.  Calling it again, or on an
        instance whose __init__ failed, does nothing.
        """
        if getattr(self, "db", None) is not None:
            self.db.close()
            self.db = None
        if getattr(self, "env", None) is not None:
            self.env.close()
            self.env = None

    def dump_meta(self):
        """
        Print one "key.....: value" line for every entry of self.keywords
        except "feed"; keys missing from the database show an empty value.
        """
        lines = []
        for key in self.keywords:
            if key == 'feed':
                continue
            if self.db.has_key(key):
                v = self.db[key]
                # Only unpickle what was stored pickled; plain strings are
                # shown as they are.
                if key in _PICKLED_KEYS:
                    try:
                        v = loads(v)
                    except (UnpicklingError, EOFError):
                        pass
            else:
                v = ""
            # pad the key with dots up to column 20
            dots = "." * (20 - len(key))
            lines.append("%s%s: %s" % (key, dots, v))
        print("\n".join(lines))

# vim:ts=4:sw=4:et:tw=76