"""RSS Cache

This is a cache for RSS, CDF and Atom feeds built around the universal feed
parser from Mark Pilgrim, http://feedparser.org

The latest version may be found on http://jensge.org?feedmap

Required: feedparser
Required: bsddb3 for python < 2.3
"""

__version__ = "0.1"
__license__ = "Python"
__copyright__ = "Copyright 2005, Jens Georg"
__author__ = "Jens Georg "

import md5
import os.path
import time
import feedparser
import re
try:
    from cPickle import loads, dumps
except ImportError:
    # The pure-python fallback module is spelled "pickle" (lower case).
    from pickle import loads, dumps
import urllib2
import urlparse
import sys

# Prefer the bsddb package bundled with python >= 2.3 and fall back to the
# standalone bsddb3 package whenever the bundled one cannot be imported.
# The wildcard import provides DB, DBEnv, the DB_* flags and
# DBIncompleteError used below.
try:
    from bsddb.db import *
except ImportError:
    from bsddb3.db import *

_debug = False

USER_AGENT = "RssCache/%s +http://jensge.org" % __version__

# Permission bits for the database file: octal 600 (read/write for the owner
# only).  Built with int() so that the value parses on every python version.
_DB_FILE_MODE = int("600", 8)


class FeedGoneException(Exception):
    """
    The feed was removed from the site
    """


class FeedMovedException(Exception):
    """
    This exception is raised if feedparser got an 301.
    """


class FeedNotFoundException(Exception):
    """
    The feed could not be found.

    NOTE(review): nothing in this module raises this exception.
    """


class FeedDownloadError(Exception):
    """
    This is a general download error, which is raised if feed.bozo is set
    and the exception is an urllib2.URLError

    NOTE(review): RssCache.update_cache() currently re-raises the
    urllib2.URLError itself instead of this class.
    """


class UnknownAuthMethodError(Exception):
    """
    The stored authentication type is neither "basic" nor "digest".
    """


class MalformedServerResponseError(Exception):
    """
    The www-authenticate header of a 401 response could not be parsed.
    """


# simple cache class for rss feeds
class RssCache:
    """
    Cache for a single feed, stored in a database inside ``cachedir``.

    The database file is named after the md5 hex digest of the feed url and
    holds the keys "lastdate" (time of the last update, seconds since the
    epoch as a string), "feed" (the pickled feedparser result) and, for
    password protected feeds, "authtype" and "realm".
    """

    def __init__(self, cachedir, url, expire, user="", password=""):
        """
        Open (and create if necessary) the cache for ``url``.

        cachedir -- directory holding the database environment; it is
                    created when missing
        url      -- address of the feed
        expire   -- maximum age of the cached feed in seconds
        user     -- user name for password protected feeds; leave empty
                    for feeds without authentication
        password -- password belonging to ``user``
        """
        # Define both handles first: close() (and therefore __del__) reads
        # them even if one of the steps below raises.
        self.db = None
        self.env = None

        self.dir = os.path.abspath(cachedir)
        self.url = url
        self.expire = expire
        self.keywords = ["lastdate", "feed", "authtype", "realm"]
        self.user = user
        self.password = password
        # network location part of the url, used when registering the
        # password with the auth handler
        self.domain = urlparse.urlparse(self.url)[1]

        if not os.path.isdir(self.dir):
            os.mkdir(self.dir)

        self.hash = md5.new(self.url)

        # open environment for threaded access
        myflags = DB_THREAD | DB_CREATE
        flagsforenv = DB_INIT_MPOOL | DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_TXN

        self.env = DBEnv()
        self.env.set_lk_detect(DB_LOCK_DEFAULT)
        self.env.open(self.dir, myflags | flagsforenv)

        self.db = DB(self.env)
        self.db.open(self.hash.hexdigest(), DB_HASH, myflags, _DB_FILE_MODE)

    def __del__(self):
        self.close()

    def invalidate_auth_cache(self):
        """
        This method removes the auth method and realm from the cache; this
        may be necessary, if either of those has changed
        """
        # Remove each key on its own so that a missing "authtype" does not
        # leave a stale "realm" behind.
        for key in ("authtype", "realm"):
            if self.db.has_key(key):
                del self.db[key]

    def auth_handler(self):
        """
        We try to return the auth handler suitable to our feed (which
        authentication method, which realm, etc.)

        The method tries to use cached values for this; if not available
        (because we're calling it for the first time or used
        invalidate_auth_cache()), it'll try to autodetect them.

        Returns a list which is either empty or holds one urllib2 auth
        handler.  Raises MalformedServerResponseError if the server's
        www-authenticate header cannot be parsed and UnknownAuthMethodError
        if the stored auth type is not supported.
        """
        # This feed is not password protected, so don't return a handler
        if not self.user:
            return []

        if not (self.db.has_key("realm") and self.db.has_key("authtype")):
            if _debug:
                print("Detecting realm and authtype:")

            # autodetect authentication method and realm
            p = feedparser.parse(self.url, agent=USER_AGENT)

            # There was another error, let the main update procedure
            # cope with this.  A request that failed before any response
            # arrived carries no status at all, hence the .get().
            if p.get("status") != 401:
                return []

            challenge = p.get("headers", {}).get("www-authenticate", "")
            extractor = re.match('^(Digest|Basic) realm="(.*)"',
                                 challenge.split(",")[0])
            if not extractor:
                raise MalformedServerResponseError()

            self.db["authtype"] = extractor.group(1).lower()
            self.db["realm"] = extractor.group(2)

            if _debug:
                print("Realm: %s, Authtype: %s"
                      % (self.db["realm"], self.db["authtype"]))

        authtype = self.db["authtype"]
        if authtype == "basic":
            auth = urllib2.HTTPBasicAuthHandler()
        elif authtype == "digest":
            auth = urllib2.HTTPDigestAuthHandler()
        else:
            raise UnknownAuthMethodError()

        auth.add_password(self.db["realm"], self.domain, self.user,
                          self.password)

        return [auth]

    def update_cache(self):
        """
        Download the feed and store it together with the current time.

        An urllib2.URLError reported by feedparser is raised to the caller
        and leaves the cache untouched; any other parser problem is dropped
        and the (possibly partial) result is stored anyway.
        """
        if _debug and self.db.has_key("lastdate"):
            now = int(time.time())
            print("Updating cache. Lastdate: %s, now: %d, delta: %d"
                  % (self.db["lastdate"], now,
                     now - int(self.db["lastdate"])))

        d = feedparser.parse(self.url, agent=USER_AGENT,
                             handlers=self.auth_handler())

        # look if there was an error parsing this feed
        if d.bozo == 1:
            # pass on URLErrors
            if isinstance(d.bozo_exception, urllib2.URLError):
                raise d.bozo_exception

            # we need to delete any exception, because we can't pickle the
            # response otherwise
            del d['bozo_exception']
            d.bozo = 0

        self.db["lastdate"] = str(int(time.time()))
        if _debug:
            print(repr(d))
        self.db["feed"] = dumps(d)

    def invalidate(self):
        """
        Invalidate the cache contents and force the cache to refresh its
        contents on the next access
        """
        if _debug:
            print("Invalidating cache")
        if self.db.has_key("lastdate"):
            del self.db["lastdate"]

    def get_feed(self):
        """
        Return the parsed feed, refreshing the cache first if it has never
        been filled or is older than ``expire`` seconds.
        """
        def too_old():
            return int(self.db["lastdate"]) + self.expire <= int(time.time())

        if not self.db.has_key("lastdate") or too_old():
            try:
                self.update_cache()
            except FeedMovedException:
                # ignore FeedMovedException
                pass

        # NOTE: the stored feed is unpickled, so the cache directory must
        # only be writable by trusted users.
        return loads(self.db["feed"])

    def sync(self):
        """
        Flush the database to disk, ignoring DBIncompleteError.
        """
        try:
            self.db.sync()
        except DBIncompleteError:
            pass

    def close(self):
        """
        Close the database and its environment; safe to call repeatedly.
        """
        if self.db is not None:
            self.db.close()
            self.db = None

        if self.env is not None:
            self.env.close()
            self.env = None

# vim:ts=4:sw=4:et:tw=76