#!/usr/local/bin/python
# Poor man's feedreader
# public domain.
# written 2012 by Moritz Wilhelmy, mw at furnace wzff de
#
# contributors:
# - s4msung at exigen.org
#
# TODO:
# - lock the seen-file (done? do I care about ~ on NFS?)
# - improve error handling, i.e. if things fail between opening the seen-file
#   and finishing, have a backup file...
# - Fix Email storage
# - Do something about RSS feeds without GUIDs in entries
#
# NOTE: this script targets Python 2 (cPickle, unicode/basestring,
# email.MIMEText, urllib.pathname2url).

import os
import sys
import codecs
import time
import fcntl
import subprocess

import feedparser
import mailbox
import email.utils
from urllib import pathname2url as urlenc
from email.MIMEText import MIMEText
from email.utils import formatdate, mktime_tz
from email.Header import Header

try:
    import cPickle as pickle
except ImportError:
    import pickle

######
home = os.getenv("HOME") or "."
base = home + "/lib/feeds/"
furls = base + "feeds"  # textfile containing urls.
fseen = base + "seen"   # things that have already been sent, just pickle it..
renderc = "/usr/local/bin/elinks -dump /dev/stdin"

# should also be possible to send them with sendmail if you need it..
mbox = mailbox.Maildir(home + "/.Maildir/.feeds")
######


class Bug(Exception):
    """Base class for conditions that indicate a programming error."""
    pass


class Unreachable(Bug):
    """Raised when a code path that should never be taken is reached."""
    pass


def render(html):
    """Render HTML to plain text by piping it through `renderc`.

    html is encoded as UTF-8 and fed to the renderer's stdin; whatever the
    renderer prints on stdout is returned unmodified (a byte string).
    """
    # communicate() feeds stdin and drains stdout concurrently.  Writing all
    # of the input first and only then reading the output (as os.popen2 usage
    # did) can deadlock once the renderer fills the stdout pipe buffer.
    # renderc is local, trusted configuration, hence shell=True is acceptable
    # and keeps the "command line as one string" setting working.
    proc = subprocess.Popen(renderc, shell=True, close_fds=True,
                            stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    out, _ = proc.communicate(html.encode("utf-8"))
    return out


def time_convert(t):
    """Format a feedparser time tuple as an RFC 2822 date string.

    Workaround for feedparser discarding the timezone information from the
    input data. (RSS and Atom do have a timezone field in their date/time
    specification. Unfortunately, feedparser just drops it for no particular
    reason).
    This means this reader might be some hours off with the timestamp.

    t is a 9-element time tuple; a zero UTC offset is appended so that
    mktime_tz() accepts it.  Raises TypeError if t is None.
    """
    return formatdate(mktime_tz(t[:] + (0,)))


def save_seen():
    """Write the `seen` dict back to the seen-file under an exclusive lock.

    Exits the process with status 1 if the file cannot be opened for writing.
    """
    try:
        fd = open(fseen, "r+b")  # doesn't nuke the file ("w" does)
    except IOError:
        try:
            fd = open(fseen, "wb")
        except IOError:
            sys.stderr.write("can't open %s for writing. "
                             "Check permissions, rinse, repeat\n" % fseen)
            sys.exit(1)
    with fd:
        # The lock is dropped implicitly when the file is closed.
        fcntl.flock(fd, fcntl.LOCK_EX)
        pickle.dump(seen, fd, -1)
        # Cut off leftovers of a previous, longer pickle.
        fd.truncate()


def create_feed_mail(author, title, body, date, id, link):
    """Build a text/plain MIME message for a single feed entry.

    author -- a string, or a feedparser dict with "name"/"email" keys
    title  -- unicode subject line
    body   -- already rendered plain-text body
    date   -- preformatted Date header value, or a false value to omit it
    id     -- entry id, used to derive the Message-Id (omitted if false)
    link   -- entry link, stored in the To header (omitted if false)

    Returns the MIMEText instance.  Raises Unreachable if author is of an
    unexpected type.
    """
    # Use the narrowest charset able to represent the body; if none of the
    # probes succeeds the last one (UTF-8) stays selected.  UnicodeError
    # covers both the encode failure (unicode body) and the implicit-decode
    # failure (byte-string body).
    for body_charset in ("US-ASCII", "ISO-8859-1", "UTF-8"):
        try:
            body.encode(body_charset)
        except UnicodeError:
            pass
        else:
            break
    mail = MIMEText(body, "plain", body_charset)

    # FIXME: Escaping?
    if isinstance(author, basestring):
        if isinstance(author, unicode):
            author = author.encode("utf-8")
        mail["From"] = '%s <>' % Header(author, "utf-8").encode()
    elif isinstance(author, feedparser.FeedParserDict):
        # FIXME: needs tweaking:
        # "or" also covers keys that are present but hold None.
        name = author.get("name") or u"Unknown"
        addr = (author.get("email") or u"").encode("utf-8")
        mail["From"] = '%s <%s>' % (Header(name, "utf-8").encode(), addr)
    else:
        raise Unreachable("Unknown author type. This shouldn't happen")

    # FIXME: Theoretically, there might be any mime type possible here:
    mail["Subject"] = Header(title.encode("utf-8"), "utf-8")
    if date:
        mail["Date"] = date.encode("utf-8")
    if id:
        # Helps filter duplicates.  Quoting a unicode string containing
        # non-ASCII characters fails on Python 2, so quote UTF-8 bytes.
        msg_id = id.encode("utf-8") if isinstance(id, unicode) else id
        mail["Message-Id"] = '<%s@localhost>' % urlenc(msg_id)
    if link:
        # XXX: come up with a better header?
        mail["To"] = ('<%s>' % link).encode("utf-8")
    return mail


# One feed URL per line; blank lines and lines starting with "#" are ignored.
with open(furls, "r") as u:
    feeds = [line.strip() for line in u
             if line.strip() and not line.strip().startswith("#")]

# url -> set of entry ids that have already been delivered.
# NOTE: the seen-file is unpickled as-is; it must only ever be written by
# this script.
seen = {}
try:
    with open(fseen, "rb") as s:
        seen = pickle.load(s)
except (IOError, EOFError):
    # Missing/unreadable file, or an empty one: start from scratch.
    pass

try:
    for url in feeds:
        try:
            feed = feedparser.parse(url)
        except Exception:
            sys.stderr.write("Error retrieving %s\n" % url)
            continue

        # Register the set right away so that ids collected before an
        # unexpected failure are still written out by save_seen() below.
        fs = seen.setdefault(url, set())
        for ent in feed.entries:
            if "id" not in ent:
                sys.stderr.write("Feed '%s' currently does not work with "
                                 "cron-feed.\n" % url)
                break
            if ent["id"] in fs:
                continue

            # XXX: take care of content types..
            title = (ent.get("title_detail") or {}).get("value") or u""
            # pray that elinks autodetects text/plain...
            summ = render(ent.get("summary") or u"")
            author = (ent.get("author_detail") or ent.get("author") or
                      feed.feed.get("author_detail") or
                      feed.feed.get("author") or "Unknown")
            link = ent.get("link", None)
            stamp = (ent.get("updated_parsed") or
                     feed.feed.get("updated_parsed"))
            try:
                date = time_convert(stamp)
            except Exception:
                # No usable timestamp: send the mail without a Date header.
                date = None

            mail = create_feed_mail(author, title, summ, date, ent["id"], link)
            mbox.add(mail)
            fs.add(ent["id"])
finally:
    # Persist what has been delivered so far even if a feed blew up, so the
    # next run does not mail the same entries again.
    save_seen()

# $Id: cron-feed,v 1.9 2012/05/07 18:08:00 mw Exp $