head	1.9;
access;
symbols;
locks;
comment	@# @;


1.9
date	2012.05.07.18.08.00;	author mw;	state Exp;
branches;
next	1.8;

1.8
date	2012.05.06.21.56.28;	author mw;	state Exp;
branches;
next	1.7;

1.7
date	2012.05.05.22.08.06;	author mw;	state Exp;
branches;
next	1.6;

1.6
date	2012.05.05.19.14.36;	author mw;	state Exp;
branches;
next	1.5;

1.5
date	2012.05.05.19.02.27;	author mw;	state Exp;
branches;
next	1.4;

1.4
date	2012.05.05.15.29.34;	author mw;	state Exp;
branches;
next	1.3;

1.3
date	2012.05.05.15.25.40;	author mw;	state Exp;
branches;
next	1.2;

1.2
date	2012.03.20.11.12.22;	author mw;	state Exp;
branches;
next	1.1;

1.1
date	2012.03.19.23.39.22;	author mw;	state Exp;
branches;
next	;


desc
@feed parser thingy
@


1.9
log
@actually print the URL...
@
text
@#!/usr/local/bin/python
# Poor man's feedreader
# public domain.
# written 2012 by Moritz Wilhelmy, mw at furnace wzff de
#
# contributors:
# - s4msung at exigen.org
#
# TODO:
# - lock the seen-file (done? do I care about ~ on NFS?)
# - improve error handling, i.e. if things fail between opening the seen-file
#   and finishing, have a backup file...
# - Fix Email storage
# - Do something about RSS feeds without GUIDs in entries

import os, sys, codecs, time, fcntl
import feedparser
import mailbox
import email.utils
from urllib import pathname2url as urlenc
from email.MIMEText import MIMEText
from email.utils    import formatdate, mktime_tz
from email.Header   import Header

try:
	import cPickle as pickle
except ImportError:
	import pickle

######
home   = os.getenv("HOME") or "." # falls back to the current directory when $HOME is unset or empty
base   = home + "/lib/feeds/" # directory holding the two state files below
furls  = base + "feeds" # text file with the feed URLs, one per line; lines starting with "#" are skipped
fseen  = base + "seen"  # pickled dict: feed URL -> set of entry ids that have already been sent
renderc= "/usr/local/bin/elinks -dump /dev/stdin" # command render() feeds HTML to on stdin, reading text from stdout
# It should also be possible to send the mails with sendmail instead, if you need that.
mbox   = mailbox.Maildir(home + "/.Maildir/.feeds") # Maildir folder the generated mails are added to
######

class Bug(Exception):
	"""Base class for this script's own exceptions (see Unreachable)."""

class Unreachable(Bug):
	"""Raised when a code path that should be impossible is reached anyway."""

def render(html):
	"""Pipe html (a unicode string) through the renderc command as UTF-8 and return its output."""
	import subprocess # local import: replaces deprecated os.popen2, which could deadlock on large output
	proc = subprocess.Popen(renderc, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, close_fds=True)
	return proc.communicate(html.encode("utf-8"))[0] # writes stdin and drains stdout together, then reaps the child

def time_convert(t):
	"""Format a 9-field time tuple (feedparser's *_parsed values) as an RFC 2822
	date string. The tuple has no zone field, so a zero offset is appended for
	mktime_tz(); if the feed's real zone was dropped, the result may be hours off."""
	with_zone = t[:] + (0,)
	return formatdate(mktime_tz(with_zone))

# Feed list: one URL per line; blank lines and lines starting with "#" are skipped.
with open(furls, "r") as u:
	feeds = [line for line in u.read().splitlines() if line.strip() and not line.strip().startswith("#")]
seen = {} # feed URL -> set of entry ids that were already delivered
try:
	with open(fseen, "r") as s:
		seen = pickle.load(s) # NOTE: pickle is only safe because this script writes the seen-file itself
except IOError: # no readable seen-file (e.g. first run): start with an empty state
	pass

def create_feed_mail(author, title, body, date, id, link):
	"""Build a text/plain MIMEText mail for one feed entry and return it.
	author: string, or FeedParserDict with optional "name"/"email" (anything else
	raises Unreachable). date, id and link are skipped when false (None/empty)."""
	for body_charset in ("US-ASCII", "ISO-8859-1", "UTF-8"): # first charset that fits wins
		try:
			body.encode(body_charset)
		except UnicodeError: # unicode bodies raise the Encode-, byte strings the DecodeError subclass
			continue
		break
	mail = MIMEText(body, "plain", body_charset)

	# FIXME: Escaping?
	if isinstance(author, basestring):
		mail["From"] = '%s <>' % Header(author.encode("utf-8"), "utf-8").encode()
	elif isinstance(author, feedparser.FeedParserDict): # FIXME: needs tweaking:
		mail["From"] = '%s <%s>' %(Header(author.get("name", u"Unknown"), "utf-8"), author.get("email", u"").encode("utf-8"))
	else:
		raise Unreachable("Unknown author type. This shouldn't happen")
	# FIXME: Theoretically, there might be any mime type possible here:
	mail["Subject"] = Header(title.encode("utf-8"), "utf-8")
	if date: mail["Date"] = date.encode("utf-8")
	if id:   mail["Message-Id"] = '<%s@@localhost>' % urlenc(id) # Helps filter duplicates
	if link: mail["To"] = ('<%s>' % link).encode("utf-8") # XXX: come up with a better header?
	return mail

for url in feeds:
	try:
		feed = feedparser.parse(url)
	except Exception: # not a bare except, so Ctrl-C and sys.exit() still propagate
		print >> sys.stderr, "Error retrieving %s" % url
		continue
	fs = seen.get(url, set())
	# fs holds the ids of this feed's entries that were already delivered
	for ent in feed.entries:
		if not ent.has_key("id"):
			print >> sys.stderr, "Feed '%s' currently does not work with cron-feed." % url
			break
		if ent["id"] in fs:
			continue
		title = ent.title_detail["value"] # XXX: take care of content types..
		summ = render(ent.summary) # pray that elinks autodetects text/plain...
		author = ent.get("author_detail") or ent.get("author") or\
			 feed.feed.get("author_detail") or feed.feed.get("author") or "Unknown"
		link = ent.get("link", None)
		try:
			date = time_convert(ent.get("updated_parsed", feed.feed.get("updated_parsed")))
		except Exception: # no usable timestamp: the mail is built without a Date header
			date = None
		mail = create_feed_mail(author, title, summ, date, ent["id"], link)
		mbox.add(mail)
		fs.add(ent["id"])
	# store the (possibly newly created) id set back under this feed's url
	seen[url] = fs

try:
	fd = open(fseen, "r+") # doesn't nuke the file ("w" does)
except IOError: # could not open in place (e.g. no seen-file yet): try to create it
	try:
		fd = open(fseen, "w")
	except IOError:
		print >> sys.stderr, "can't open %s for writing. Check permissions, rinse, repeat" % fseen
		sys.exit(1)
# Rewrite the seen-file under an exclusive lock; leaving the with-block closes the file.
with fd:
	fcntl.flock(fd, fcntl.LOCK_EX)
	pickle.dump(seen, fd, -1)
	fd.truncate() # cut off leftover bytes when the new pickle is shorter than the old contents

# $Id: cron-feed,v 1.8 2012/05/06 21:56:28 mw Exp mw $
@


1.8
log
@less getter nesting, truncate file after pickling
@
text
@d14 1
d105 1
a105 1
			print >> sys.stderr, "Feed '%s' is broken. Please tell someone to fix it"
d107 1
a107 1
		if ent["id"] in fs: # XXX: broken feeds fail. Not sure it should be fixed
d138 1
a138 1
# $Id: cron-feed,v 1.7 2012/05/05 22:08:06 mw Exp $
@


1.7
log
@Fix unicode in mail headers
Thanks again to s4msung
@
text
@d110 2
a111 2
		author = ent.get("author_detail", feed.feed.get("author_detail",
				 feed.feed.get("author", "Unknown")))
d135 1
d137 1
a137 1
# $Id: cron-feed,v 1.6 2012/05/05 19:14:36 mw Exp $
@


1.6
log
@Put the entry link into the 'To'-Header
Cleanup pending.
@
text
@d22 1
d82 1
a82 1
		mail["From"] = u'"%s" <>' % author.encode("utf-8")
d84 1
a84 1
		mail["From"] = (author.get("name", "") + author.get("email", "<>")).encode("utf-8") 
d88 1
a88 1
	mail["Subject"] = title.encode("utf-8")
d136 1
a136 1
# $Id: cron-feed,v 1.5 2012/05/05 19:02:27 mw Exp mw $
@


1.5
log
@- add a Message-Id header
- several fixes and cleanups (I already forgot what I did there)
@
text
@d67 1
a67 1
def create_feed_mail(author, title, body, date, id):
d90 1
a90 1

d111 1
d113 1
a113 1
			date = time_convert(ent.get("updated_parsed"))
d116 1
a116 1
		mail = create_feed_mail(author, title, summ, date, ent["id"])
d135 1
a135 1
# $Id: cron-feed,v 1.4 2012/05/05 15:29:34 mw Exp mw $
@


1.4
log
@don't blame the wrong people, it's feedparser's fault, not email or time.
@
text
@d15 1
a15 4
import os
import codecs
import time
import fcntl
d19 1
d38 6
d52 3
a54 1
	input data."""
d67 1
a67 1
def create_feed_mail(author, title, body):
d75 1
a75 1
	print body_charset
d78 9
a86 1
	mail["From"] = author.encode("utf-8")
d88 3
a92 1

d94 6
a99 4
	feed = feedparser.parse(url)
	fs = set()
	if url in seen.keys():
		fs = seen[url]
a100 1
	print(feed.keys())
d102 4
a105 2
		if ent["id"] in fs:
			print "Debug: Skipping " + ent["id"]
a106 1
		print(feed.keys())
d109 7
a115 2
		author = ent.get("author_detail", "No Author")
		mail = create_feed_mail(author, title, summ)
d121 10
a130 1
with open(fseen, "w") as fd:
d134 1
a134 1
# $Id: cron-feed,v 1.3 2012/05/05 15:25:40 mw Exp mw $
@


1.3
log
@code cleanup and unicode support
Thanks to s4msung
@
text
@d47 2
a48 4
	"""Either the email module or the time module are for some reason
	fundamentally borked. The struct_time has elements [0..8], but formatdate
	tries to access a nonexistent 9th element, which then fails...
	I work around it by appending a zero to the tuple."""
a91 7
		#os.popen("less","w").write(summ)
		#mail = Email()
		#mail.set_type(u"text/plain; charset=utf-8") # XXX: hardcoded :-/
		#mail.add_header(u"From", '"' + ent["author_detail"] + '" <' + url + '>') # FIXME: some more logic, the author could be anywhere...
		#mail.add_header(u"Subject", title)
		#mail.add_header("Date", time_convert(feed["updated_parsed"]))
		#mail.set_payload(summ)
d102 1
a102 1
# $Id: cron-feed,v 1.2 2012/03/20 11:12:22 mw Exp mw $
@


1.2
log
@cleanup
lock seen-file
@
text
@d6 3
d15 9
a23 3
import feedparser, os, mailbox, email.utils, time, fcntl
from email.message import Message as Email
from email.utils   import formatdate, mktime_tz
d42 1
a42 1
	i.write(html)
d47 4
a50 4
	# Either the email module or the time module are for some reason
	# fundamentally borked. The struct_time has elements [0..8], but formatdate
	# tries to access a nonexistent 9th element, which then fails...
	# I work around it by appending a zero to the tuple.
d53 1
a53 1
with open(furls) as u:
d58 1
a58 1
	with open(fseen) as s:
d63 16
d85 1
d88 1
a88 1
			#print "Debug: Skipping " + ent["id"]
d90 1
d93 1
d95 7
a101 6
		mail = Email()
		mail.set_type("text/plain; charset=utf-8") # XXX: hardcoded :-/
		mail.add_header("From", '"' + ent["author_detail"] + '" <' + url + '>') # FIXME: some more logic, the author could be anywhere...
		mail.add_header("Subject", title)
		mail.add_header("Date", time_convert(ent["updated_parsed"]))
		mail.set_payload(summ)
d111 1
a111 1
# $Id: cron-feed,v 1.1 2012/03/19 23:39:22 mw Exp mw $
@


1.1
log
@Initial revision
@
text
@d1 1
a1 1
#!/usr/bin/env python
d5 6
d12 1
a12 1
import feedparser, os, mailbox, email.utils, time
d14 1
d24 1
a24 1
ffeeds = base + "feeds" # textfile containing urls.
d26 1
a26 1
renderc= "elinks -dump /dev/stdin"
d28 1
a28 1
mbox   = mailbox.Maildir("/tmp/feeds")
a30 2
seen   = {}

d37 9
a45 2
with open(ffeeds) as f:
	feeds = filter(lambda(x): x[0] != "#", f.read().splitlines())
d47 1
d54 2
a55 2
for feed in feeds:
	f  = feedparser.parse(feed)
d57 2
a58 2
	if feed in seen.keys():
		fs = seen[feed]
d60 3
a62 3
	for g in f.entries:
		if g["id"] in fs:
			#print "Debug: Skipping " + g["id"]
d64 2
a65 2
		title = g.title_detail["value"] # XXX: take care of content types..
		summ = render(g.summary) # pray that elinks autodetects text/plain...
d69 1
a69 1
		mail.add_header("From", '<' + feed + '>') # FIXME: some more logic, the author could be anywhere...
d71 1
a71 2
		tmp = g["updated_parsed"][:] + (0,) # WTF?!?!? why, python, why??!
		mail.add_header("Date", email.utils.formatdate(email.utils.mktime_tz(tmp)))
d74 1
a74 1
		fs.add(g["id"])
d76 1
a76 1
	seen[feed] = fs
d79 1
d82 1
a82 1
# $Id$
@