head 1.9; access; symbols; locks; comment @# @; 1.9 date 2012.05.07.18.08.00; author mw; state Exp; branches; next 1.8; 1.8 date 2012.05.06.21.56.28; author mw; state Exp; branches; next 1.7; 1.7 date 2012.05.05.22.08.06; author mw; state Exp; branches; next 1.6; 1.6 date 2012.05.05.19.14.36; author mw; state Exp; branches; next 1.5; 1.5 date 2012.05.05.19.02.27; author mw; state Exp; branches; next 1.4; 1.4 date 2012.05.05.15.29.34; author mw; state Exp; branches; next 1.3; 1.3 date 2012.05.05.15.25.40; author mw; state Exp; branches; next 1.2; 1.2 date 2012.03.20.11.12.22; author mw; state Exp; branches; next 1.1; 1.1 date 2012.03.19.23.39.22; author mw; state Exp; branches; next ; desc @feed parser thingy @ 1.9 log @actually print the URL... @ text @#!/usr/local/bin/python # Poor man's feedreader # public domain. # written 2012 by Moritz Wilhelmy, mw at furnace wzff de # # contributors: # - s4msung at exigen.org # # TODO: # - lock the seen-file (done? do I care about ~ on NFS?) # - improve error handling, i.e. if things fail between opening the seen-file # and finishing, have a backup file... # - Fix Email storage # - Do something about RSS feeds without GUIDs in entries import os, sys, codecs, time, fcntl import feedparser import mailbox import email.utils from urllib import pathname2url as urlenc from email.MIMEText import MIMEText from email.utils import formatdate, mktime_tz from email.Header import Header try: import cPickle as pickle except ImportError: import pickle ###### home = os.getenv("HOME") or "." base = home + "/lib/feeds/" furls = base + "feeds" # textfile containing urls. fseen = base + "seen" # things that have already been sent, just pickle it.. renderc= "/usr/local/bin/elinks -dump /dev/stdin" # should also be possible to send them with sendmail if you need it.. mbox = mailbox.Maildir(home + "/.Maildir/.feeds") ###### class Bug(Exception): pass class Unreachable(Bug): pass def render(html): i, o = os.popen2(renderc) i.write(html.encode("utf-8")) i.close() return o.read() def time_convert(t): """Workaround for feedparser discarding the timezone information from the input data. (RSS and Atom do have a timezone field in their date/time specification. Unfortunately, feedparser just drops it for no particular reason). This means this reader might be some hours off with the timestamp.""" return formatdate(mktime_tz(t[:] + (0,))) with open(furls, "r") as u: feeds = filter(lambda(x): not x.strip().startswith("#"), u.read().splitlines()) seen = {} try: with open(fseen, "r") as s: seen = pickle.load(s) except IOError: pass def create_feed_mail(author, title, body, date, id, link): for body_charset in ("US-ASCII", "ISO-8859-1", "UTF-8"): try: body.encode(body_charset) except UnicodeDecodeError: pass else: break #print body_charset mail = MIMEText(body, "plain", body_charset) # FIXME: Escaping? if type(author) in (str, unicode): mail["From"] = '%s <>' % Header(author.encode("utf-8"), "utf-8").encode("utf-8") elif type(author) == feedparser.FeedParserDict: # FIXME: needs tweaking: mail["From"] = '%s <%s>' %(Header(author.get("name", u"Unknown"), "utf-8"), author.get("email", u"").encode("utf-8")) else: raise Unreachable, "Unknown author type. This shouldn't happen" # FIXME: Theoretically, there might be any mime type possible here: mail["Subject"] = Header(title.encode("utf-8"), "utf-8") if date: mail["Date"] = date.encode("utf-8") if id: mail["Message-Id"] = '<%s@@localhost>' % urlenc(id) # Helps filter duplicates if link: mail["To"] = ('<%s>' % link).encode("utf-8") # XXX: come up with a better header? return mail for url in feeds: try: feed = feedparser.parse(url) except: print >> sys.stderr, "Error retrieving %s" % url continue fs = seen.get(url, set()) for ent in feed.entries: if not ent.has_key("id"): print >> sys.stderr, "Feed '%s' currently does not work with cron-feed." % url break if ent["id"] in fs: continue title = ent.title_detail["value"] # XXX: take care of content types.. summ = render(ent.summary) # pray that elinks autodetects text/plain... author = ent.get("author_detail") or ent.get("author") or\ feed.feed.get("author_detail") or feed.feed.get("author") or "Unknown" link = ent.get("link", None) try: date = time_convert(ent.get("updated_parsed", feed.feed.get("updated_parsed"))) except: date = None mail = create_feed_mail(author, title, summ, date, ent["id"], link) mbox.add(mail) fs.add(ent["id"]) seen[url] = fs try: fd = open(fseen, "r+") # doesn't nuke the file ("w" does) except: try: fd = open(fseen, "w") except: print >> sys.stderr, "can't open %s for writing. Check permissions, rinse, repeat" % fseen sys.exit(1) with fd as fd: fcntl.flock(fd, fcntl.LOCK_EX) pickle.dump(seen, fd, -1) fd.truncate() # $Id: cron-feed,v 1.8 2012/05/06 21:56:28 mw Exp mw $ @ 1.8 log @less getter nesting, truncate file after pickling @ text @d14 1 d105 1 a105 1 print >> sys.stderr, "Feed '%s' is broken. Please tell someone to fix it" d107 1 a107 1 if ent["id"] in fs: # XXX: broken feeds fail. Not sure it should be fixed d138 1 a138 1 # $Id: cron-feed,v 1.7 2012/05/05 22:08:06 mw Exp $ @ 1.7 log @Fix unicode in mail headers Thanks again to s4msung @ text @d110 2 a111 2 author = ent.get("author_detail", feed.feed.get("author_detail", feed.feed.get("author", "Unknown"))) d135 1 d137 1 a137 1 # $Id: cron-feed,v 1.6 2012/05/05 19:14:36 mw Exp $ @ 1.6 log @Put the entry link into the 'To'-Header Cleanup pending. @ text @d22 1 d82 1 a82 1 mail["From"] = u'"%s" <>' % author.encode("utf-8") d84 1 a84 1 mail["From"] = (author.get("name", "") + author.get("email", "<>")).encode("utf-8") d88 1 a88 1 mail["Subject"] = title.encode("utf-8") d136 1 a136 1 # $Id: cron-feed,v 1.5 2012/05/05 19:02:27 mw Exp mw $ @ 1.5 log @- add a Message-Id header - several fixes and cleanups (I already forgot what I did there) @ text @d67 1 a67 1 def create_feed_mail(author, title, body, date, id): d90 1 a90 1 d111 1 d113 1 a113 1 date = time_convert(ent.get("updated_parsed")) d116 1 a116 1 mail = create_feed_mail(author, title, summ, date, ent["id"]) d135 1 a135 1 # $Id: cron-feed,v 1.4 2012/05/05 15:29:34 mw Exp mw $ @ 1.4 log @don't blame the wrong people, it's feedparser's fault, not email or time. @ text @d15 1 a15 4 import os import codecs import time import fcntl d19 1 d38 6 d52 3 a54 1 input data.""" d67 1 a67 1 def create_feed_mail(author, title, body): d75 1 a75 1 print body_charset d78 9 a86 1 mail["From"] = author.encode("utf-8") d88 3 a92 1 d94 6 a99 4 feed = feedparser.parse(url) fs = set() if url in seen.keys(): fs = seen[url] a100 1 print(feed.keys()) d102 4 a105 2 if ent["id"] in fs: print "Debug: Skipping " + ent["id"] a106 1 print(feed.keys()) d109 7 a115 2 author = ent.get("author_detail", "No Author") mail = create_feed_mail(author, title, summ) d121 10 a130 1 with open(fseen, "w") as fd: d134 1 a134 1 # $Id: cron-feed,v 1.3 2012/05/05 15:25:40 mw Exp mw $ @ 1.3 log @code cleanup and unicode support Thanks to s4msung @ text @d47 2 a48 4 """Either the email module or the time module are for some reason fundamentally borked. The struct_time has elements [0..8], but formatdate tries to access a nonexistent 9th element, which then fails... I work around it by appending a zero to the tuple.""" a91 7 #os.popen("less","w").write(summ) #mail = Email() #mail.set_type(u"text/plain; charset=utf-8") # XXX: hardcoded :-/ #mail.add_header(u"From", '"' + ent["author_detail"] + '" <' + url + '>') # FIXME: some more logic, the author could be anywhere... #mail.add_header(u"Subject", title) #mail.add_header("Date", time_convert(feed["updated_parsed"])) #mail.set_payload(summ) d102 1 a102 1 # $Id: cron-feed,v 1.2 2012/03/20 11:12:22 mw Exp mw $ @ 1.2 log @cleanup lock seen-file @ text @d6 3 d15 9 a23 3 import feedparser, os, mailbox, email.utils, time, fcntl from email.message import Message as Email from email.utils import formatdate, mktime_tz d42 1 a42 1 i.write(html) d47 4 a50 4 # Either the email module or the time module are for some reason # fundamentally borked. The struct_time has elements [0..8], but formatdate # tries to access a nonexistent 9th element, which then fails... # I work around it by appending a zero to the tuple. d53 1 a53 1 with open(furls) as u: d58 1 a58 1 with open(fseen) as s: d63 16 d85 1 d88 1 a88 1 #print "Debug: Skipping " + ent["id"] d90 1 d93 1 d95 7 a101 6 mail = Email() mail.set_type("text/plain; charset=utf-8") # XXX: hardcoded :-/ mail.add_header("From", '"' + ent["author_detail"] + '" <' + url + '>') # FIXME: some more logic, the author could be anywhere... mail.add_header("Subject", title) mail.add_header("Date", time_convert(ent["updated_parsed"])) mail.set_payload(summ) d111 1 a111 1 # $Id: cron-feed,v 1.1 2012/03/19 23:39:22 mw Exp mw $ @ 1.1 log @Initial revision @ text @d1 1 a1 1 #!/usr/bin/env python d5 6 d12 1 a12 1 import feedparser, os, mailbox, email.utils, time d14 1 d24 1 a24 1 ffeeds = base + "feeds" # textfile containing urls. d26 1 a26 1 renderc= "elinks -dump /dev/stdin" d28 1 a28 1 mbox = mailbox.Maildir("/tmp/feeds") a30 2 seen = {} d37 9 a45 2 with open(ffeeds) as f: feeds = filter(lambda(x): x[0] != "#", f.read().splitlines()) d47 1 d54 2 a55 2 for feed in feeds: f = feedparser.parse(feed) d57 2 a58 2 if feed in seen.keys(): fs = seen[feed] d60 3 a62 3 for g in f.entries: if g["id"] in fs: #print "Debug: Skipping " + g["id"] d64 2 a65 2 title = g.title_detail["value"] # XXX: take care of content types.. summ = render(g.summary) # pray that elinks autodetects text/plain... d69 1 a69 1 mail.add_header("From", '<' + feed + '>') # FIXME: some more logic, the author could be anywhere... d71 1 a71 2 tmp = g["updated_parsed"][:] + (0,) # WTF?!?!? why, python, why??! mail.add_header("Date", email.utils.formatdate(email.utils.mktime_tz(tmp))) d74 1 a74 1 fs.add(g["id"]) d76 1 a76 1 seen[feed] = fs d79 1 d82 1 a82 1 # $Id$ @