#!/usr/bin/env python # feedme: read RSS/Atom feeds and convert to Plucker files. # Copyright 2009,2011 Akkana Peck # Based on feedread, Copyright (C) 2009 Benjamin M. A'Lee # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details: # . # # Important TODO: # - Figure out why we get repeat stories on sites like BBC. VersionString = "FeedMe 0.8" import cPickle import time #, datetime import os, sys import re #import types import shutil import traceback from ConfigParser import ConfigParser import feedparser import urllib2 has_ununicode=True try : import ununicode except ImportError, e: has_ununicode=False # # Clean up old feed directories # def clean_up(config) : try : days = int(config.get('DEFAULT', 'save_days')) feeddir = config.get('DEFAULT', 'dir') feeddir = sub_tilde(feeddir) except : print "Error trying to get save_days and feed dir; can't clean up" return print "Cleaning up anything older than", days, "days from", feeddir now = time.time() for dir in os.listdir(feeddir) : d = os.path.join(feeddir, dir) howold = (now - os.path.getctime(d)) / 60 / 60 / 24 if howold > days : print "Deleting", d if os.path.isdir(d) : shutil.rmtree(d) else : os.unlink(d) ################################################################## # OUTPUT GENERATING FUNCTIONS # Define functions for each output format you need to support. # def run_conversion_cmd(appargs) : if True or verbose : cmd = " ".join(appargs) print >>sys.stderr, "Running:", cmd sys.stdout.flush() retval = os.spawnvp(os.P_WAIT, appargs[0], appargs) #retval = os.system(cmd) if retval != 0 : raise OSError(retval, "Couldn't run: " + ' '.join(appargs)) # # Generate a Plucker file # def make_plucker_file(indexfile, feedname, levels, ascii) : home = os.environ['HOME'] day = time.strftime("%a") docname = day + " " + feedname cleanfilename = day + "_" + feedname.replace(" ", "_") # Make sure the plucker directory exists: pluckerdir = os.path.join(home, ".plucker", "feedme") if not os.path.exists(pluckerdir) : os.makedirs(pluckerdir) # Run plucker. This should eventually be configurable -- # but how, with arguments like these? # Plucker mysteriously creates unbeamable files if the # document name has a colons in it. # So use the less pretty but safer underscored docname. #docname = cleanfilename appargs = [ "plucker-build", "-N", docname, "-f", os.path.join("feedme", cleanfilename), "--stayonhost", "--noimages", "--maxdepth", str(levels), "--zlib-compression", "--beamable", "-H", "file://" + indexfile ] if not ascii : appargs.append("--charset=utf-8") run_conversion_cmd(appargs) # # http://calibre-ebook.com/user_manual/conversion.html # def make_calibre_file(indexfile, feedname, extension, levels, ascii, author, flags) : home = os.environ['HOME'] day = time.strftime("%a") # Prepend daynum to the filename because fbreader can only sort by filename #daynum = time.strftime("%w") cleanfilename = day + "_" + feedname.replace(" ", "_") outdir = os.path.join(home, "feeds", extension[1:]) if not os.access(outdir, os.W_OK) : os.makedirs(outdir) appargs = [ "ebook-convert", indexfile, #os.path.join(home, "feeds", cleanfilename + extension), # directory should be configurable too, probably os.path.join(outdir, cleanfilename + extension), "--authors", author ] for flag in flags : appargs.append(flag) if True or verbose : cmd = " ".join(appargs) print >>sys.stderr, "Running:", cmd sys.stdout.flush() run_conversion_cmd(appargs) # # Generate a fictionbook2 file # def make_fb2_file(indexfile, feedname, levels, ascii) : make_calibre_file(indexfile, feedname, ".fb2", levels, ascii, "feedme", flags = [ "--disable-font-rescaling" ] ) # # Generate an ePub file # http://calibre-ebook.com/user_manual/cli/ebook-convert-3.html#html-input-to-epub-output # def make_epub_file(indexfile, feedname, levels, ascii) : make_calibre_file(indexfile, feedname, ".epub", levels, ascii, time.strftime("%m-%d %a") + " feeds", flags = [ '--no-default-epub-cover', '--dont-split-on-page-breaks' ]) # END OUTPUT GENERATING FUNCTIONS ################################################################## ################################################################## # MsgLog: Print messages and also batch them up to print at the end: # class MsgLog : def __init__(self) : self.msgstr = "" self.errstr = "" def msg(self, s) : self.msgstr += "\n" + s print "", s.encode('ascii', 'backslashreplace') def err(self, s) : self.errstr += "\n" + s print "ERROR:", s.encode('ascii', 'backslashreplace') def get_msgs(self) : return self.msgstr def get_errs(self) : return self.errstr import sys # file-like class that can optionally send output to a log file. Inspired by # http://www.redmountainsw.com/wordpress/archives/python-subclassing-file-types # and with help from KirkMcDonald. class tee() : def __init__(self, _fd1, _fd2) : self.fd1 = _fd1 self.fd2 = _fd2 def __del__(self) : if self.fd1 != sys.stdout and self.fd1 != sys.stderr : self.fd1.close() if self.fd2 != sys.stdout and self.fd2 != sys.stderr : self.fd2.close() def write(self, text) : self.fd1.write(text) self.fd2.write(text) def flush(self) : self.fd1.flush() self.fd2.flush() # # Interrupt handler: prompt for what to do. # def handleKeyboardInterrupt(msg) : # os.isatty() doesn't work, so: if not hasattr(sys.stdin, "isatty") : print "Interrupt, and not running interactively. Exiting." sys.exit(1) response = raw_input(msg) if response == '' : return '\0' if response[0] == 'q' : sys.exit(1) return response[0] def sub_tilde(name) : # config.get alas doesn't substitute $HOME or ~ if name[0:2] == "~/" : name = os.path.join(os.environ['HOME'], name[2:]) elif name[0:6] == "$HOME/" : name = os.path.join(os.environ['HOME'], name[6:]) return name def get_config_multiline(config, feedname, configname) : configlines = config.get(feedname, configname) if configlines != '' : configlines = configlines.split('\n') else : configlines = [] print "configlines for", configname, ":", configlines return configlines # # Get a single feed # def get_feed(feedname, config, cache, cachefile, msglog) : # Mandatory arguments: try : sitefeedurl = config.get(feedname, 'url') feeddir = config.get(feedname, 'dir') except : msglog.err("Error reading feedme.conf entry for: " + feedname) return feeddir = sub_tilde(feeddir) feeddir = os.path.join(feeddir, time.strftime("%m-%d-%a")) formats = config.get(feedname, 'formats').split(',') encoding = config.get(feedname, 'encoding') skip_pats = get_config_multiline(config, feedname, 'skip_pat') # Skip images if requested if config.get(feedname, 'skip_images') == 'true': skip_pats.append('') feedfile = feedname.replace(" ", "_") outdir = os.path.join(feeddir, feedfile) ascii = (config.get(feedname, 'ascii') != 'false') if ascii and not has_ununicode : ascii = False msglog.msg(feedname + ": Can't convert to ascii without ununicode") levels = int(config.get(feedname, 'levels')) #page_start = config.get(feedname, 'page_start') page_starts = get_config_multiline(config, feedname, 'page_start') page_ends = get_config_multiline(config, feedname, 'page_end') single_page_pats = get_config_multiline(config, feedname, 'single_page_pat') verbose = (config.get(feedname, 'verbose').lower() == 'true') if cache == None : nocache = True else : nocache = (config.get(feedname, 'nocache') == 'true') if verbose and nocache : msglog.msg(feedname + ": Ignoring cache") def output_encode(s, encoding) : if ascii and has_ununicode : #return unicodedata.normalize('NFKD', s).encode('ascii', 'ignore') # valid values in encode are replace and ignore return ununicode.toascii(s, in_encoding=encoding, errfilename=os.path.join(outdir, "errors")) elif isinstance(s, unicode) : return s.encode('utf-8', 'backslashreplace') else : return s global VersionString downloaded_string ="\n
(Downloaded by " + VersionString + ")\n" feed = feedparser.parse(sitefeedurl) # feedparser has no error return! One way is to check len(feed.feed). if len(feed.feed) == 0 : msglog.err("Can't read " + sitefeedurl) return # XXX Sometimes feeds die a few lines later getting feed.feed.title. # Here's a braindead guard against it -- but why isn't this # whole clause inside a try? It should be. if not 'title' in feed.feed : msglog.msg(sitefeedurl + " lacks a title!") feed.feed.title = '[' + feedname + ']' #return if not nocache : if sitefeedurl not in cache: cache[sitefeedurl] = [] feedcache = cache[sitefeedurl] newfeedcache = [] # suburls: mapping of URLs we've encountered to local URLs. # Any anchors (#anchor) will be discarded. # This is for sites like WorldWideWords that make many links # to the same page. suburls = [] # indexstr is the contents of the index.html file. # Kept as a string until we know whether there are new, non-cached # stories so it's worth updating the copy on disk. # The stylesheet is for FeedViewer and shouldn't bother plucker etc. day = time.strftime("%a") indexstr = u"""\n %s: %s \n

%s: %s: %s

\n""" % (day, feedname, day, feedname, feed.feed.title) if verbose: print >>sys.stderr, "********* Reading", sitefeedurl itemnum = 0 for item in feed.entries : try : # # Get the list of links (href) and a (hopefully) unique ID: # if 'links' in item : href = [i['href'].encode('utf-8') \ for i in item.links if i['rel'] == 'alternate'] else: href = [] if not 'id' in item : if len(href) > 0 : item.id = href[0] if verbose : msglog.msg("Using URL " + href[0] + " for ID.") else: if verbose : msglog.msg("Item in " + href[0] + " had no ID or URL.") next # or return? # Filter out file types known not to work # XXX Only mp3 for now. Obviously, make this more general. if item.link.endswith("mp3") : print "Filtering out mp3 link", item.link continue # Make sure ids don't have named anchors appended: anchor_index = item.id.rfind('#') if anchor_index >= 0 : anchor = item.id[anchor_index:] item.id = item.id[0:anchor_index] else : anchor = "" # See if we've already seen this page: try : pagenum = suburls.index(item.id) # We've already seen a link to this URL. It's probably # a link to a different named anchor within the same file. except ValueError : # Haven't seen it before. But is it in the cache already? if not nocache : # We want it in the cache, whether it's new or not: newfeedcache.append(item.id) if item.id in feedcache: if verbose : msglog.msg(item.id + " already cached -- skipping") continue # Add it to the cache and suburls. suburls.append(item.id) pagenum = len(suburls) - 1 itemnum += 1 if verbose : print >>sys.stderr, "\nItem:", item.title.encode('utf-8') # Now itemnum is the number of the entry on the index page; # pagenum is the html file of the subentry, e.g. 3.html. # # Follow the link and make a file for it: # if levels > 1 or not 'content' in item : try : # Try to trap keyboard interrupts, + others if verbose : print "Fetching link", item.link # For the sub-pages, we're getting HTML, not RSS. # Nobody seems to have RSS pointing to RSS. response = urllib2.urlopen(item.link) # At this point it would be lovely to check whether the # mime type is HTML. Unfortunately, all we have is a # httplib.HTTPMessage instance which is completely # undocumented (see http://bugs.python.org/issue3428). # It's not documented, but sometimes after urlopen # we can actually get a content type. If it's not # text/something, that's bad. ctype = response.headers['content-type'] if ctype and ctype != '' and ctype[0:4] != 'text' : msglog.error(item.link + " isn't text -- skipping") continue # Read the content of the link: # This can die with socket.error, "connection reset by peer" html = response.read() link = response.geturl() # urllib2 unfortunately doesn't read unicode, # so try to figure out the current encoding: if encoding == '' : enctype = response.headers['content-type'].split('charset=') if len(enctype) > 1 : encoding = enctype[-1] else : encoding = 'utf-8' # No docs say I should close this. I can only assume. response.close() # URL rewriting, so we can offer "Next page" and # similar links. # Do this *before* checking the single_page_pats # since it might need to be rewritten too. # Base URL which will be prepended to any relative links: baseurl = re.sub('(.+)/.*', r'\1', link.encode('ascii', 'xmlcharrefreplace')) siteurl = re.sub('([a-zA-Z]+://[^/]+)/.*', r'\1', baseurl) # XXX Next two regexps are dicey -- e.g. they don't # ensure that the end quote is the same as the start quote. # Rewrite any relative URLs in terms of the base URL html = re.sub('([hH][rR][eE][fF]\s*=\s*)(["\'])([^:/\'"]+?)(["\'])', r'\1\2' + baseurl + r'/\3\4', html) # Rewrite URLs that start with / in terms of the site URL: html = re.sub('([hH][rR][eE][fF]\s*=\s*)(["\'])/([^:\'"]+?)(["\'])', r'\1\2' + siteurl + r'/\3\4', html) # See if the single page pattern exists and works if len(single_page_pats) > 0 : for single_page_pat in single_page_pats : m = re.search(single_page_pat, html) if m : single_page = html[m.start():m.end()] if verbose : print >>sys.stderr, \ "\nFetching single-page pattern:", \ single_page.encode('utf-8') try : response = urllib2.urlopen(single_page) html2 = response.read() link = response.geturl() html = html2 response.close() if verbose : print >>sys.stderr, \ "Single page @", link break # found a single-page, don't need 2 except Exception, e : print >>sys.stderr, \ "Can't get single-page url", \ single_page, \ str(e) elif verbose : print >>sys.stderr, "single-page pattern", \ single_page_pat, "not found in", link # Throw out everything before the page_start pattern # and after the page_end pattern if len(page_starts) > 0 : for page_start in page_starts : #pat = re.compile(page_start) #match = pat.search(html) #if not match : # print >>sys.stderr, "Couldn't find", page_start #else : # html = html[match.start() : ] print "looking for page_start", page_start match = html.find(page_start) if match >= 0: if verbose : print "Found page_start", page_start html = html[match:] break if len(page_ends) > 0 : for page_end in page_ends : print "looking for page_end", page_end match = html.find(page_end) if match >= 0: if verbose : print "Found page_end", page_end html = html[0 : match] # Skip anything matching any of the skip_pats if len(skip_pats) > 0 : print len(skip_pats), "skip pats" for skip in skip_pats : if verbose : print >>sys.stderr, "Trying to skip", skip #print >>sys.stderr, "in", html.encode('utf-8') #sys.stderr.flush() # flags=DOTALL doesn't exist in re.sub until 2.7, #html = re.sub(skip, '', html, flags=re.DOTALL) # but does exist in a compiled re expression: regexp = re.compile(skip, flags=re.DOTALL) html = regexp.sub('', html) # Another way would be to use (.|\\n) in place of . # For some reason [.\n] doesn't work. #html = re.sub(skip, '', html, flags=re.DOTALL) else : print "no skip pats" except KeyboardInterrupt : response = handleKeyboardInterrupt(""" *** Caught keyboard interrupt reading a story! ***\n Options: q: Quit c: Continue trying to read this story s: Skip to next story n: Skip to next site Which (default = s): """) if response[0] == 'n' : # next site return elif response[0] != 'c' : # next story (default) continue # If the response was 'c', we continue and just # ignore the interrupt. except Exception, e : # Collect info about what went wrong: errmsg = "Couldnt read " + item.link + "\n" errmsg += "Title: " + item.title.encode('utf-8') if False and verbose : errmsg += "Item summary was:\n------\n" errmsg += item.summary + "\n------\n" errmsg += str(e) + '
\n' errmsg += str(sys.exc_info()[0]) + '
\n' errmsg += str(sys.exc_info()[1]) + '
\n' errmsg += traceback.format_exc(sys.exc_info()[2]) if verbose : print >>sys.stderr, "==============" msglog.err(errmsg) if verbose : print >>sys.stderr, "==============" raise # so this entry won't get stored or cached if not 'published_parsed' in item: if 'updated_parsed' in item: item.published_parsed = item.updated_parsed else: item.published_parsed = time.gmtime() def save_html_file(outdir, title, html, encoding) : # title is a unicode string, not yet encoded. # html is a string presumed to be in encoding (which may be ''). if verbose : print "Saving", title, "in", outdir utftitle = output_encode(title, encoding) fnam = str(pagenum) + ".html" # Make the parent directory if we haven't already # (don't do it before now since we didn't know whether # we had any content to save): if not os.access(outdir, os.W_OK) : if verbose : print "Making", outdir os.makedirs(outdir) of = open(os.path.join(outdir, fnam), "w") of.write("""\n %s

%s

\n """ % (utftitle, utftitle)) of.write(output_encode(html, encoding)) # add a "next item" link. # XXX Unfortunately this itemnum check isn't necessarily reliable # XXX since we may have skipped items. #if itemnum < len(feed.entries) - 1 : if item != feed.entries[-1] : of.write("
>->\n") of.write(downloaded_string) of.write("\n\n") of.close() return fnam # Plucker named anchors don't work unless preceded by a

# http://www.mail-archive.com/plucker-list@rubberchicken.org/msg07314.html # and the previous message. indexstr += "

 " if levels > 1 : fnam = save_html_file(outdir, item.title, html, encoding) if verbose : print >>sys.stderr, "Saved to file", fnam itemlink = '' indexstr += itemlink + '' + item.title + '\n' else : # For a single-level site, don't put links over each entry. itemlink = '' indexstr += "\n" + itemlink + item.title + "\n" # Under the title, add a link to jump to the next entry # if it isn't the last entry. if item != feed.entries[-1] : indexstr += "
>->\n
\n" # Add either the content or the summary: if levels == 1 and 'content' in item : content = item.content[0].value + "\n" elif 'summary_detail' in item: content = item.summary_detail.value + "\n" else : content = "[No content]" # Remove images from index content too # XXX should do this only if skip_imgs is true! content = re.sub('', '', content) indexstr += content if 'author' in item : indexstr += "\n
by: " + item.author + "
" # After the content, add another link to the title, # in case the user wants to click through after reading # the content: sublen = 16 if len(item.title) > sublen : # Truncate the title to sublen characters, and # temove any HTML tags, otherwise we'll likely have # tags like that open but don't close short_title = re.sub('<.*?>', '', item.title[0:sublen]) \ + "..." else : short_title = item.title indexstr += "\n
[[" + itemlink + short_title + "]]\n\n" # If there was an error parsing this entry, we won't save # a file so decrement the itemnum and loop to the next entry. except KeyboardInterrupt : sys.stderr.flush() response = handleKeyboardInterrupt(""" *** Caught keyboard interrupt while finishing a site! ***\n Options: q: Quit c: Continue trying to finish this site n: Skip to next site Which (default = n): """) if response[0] == 'c' : continue if response[0] == 'q' : sys.exit(1) # Default is to skip to the next site: return except Exception, e : itemnum -= 1 if verbose : print >>sys.stderr, "Skipping item", item.link.encode('utf-8') print >>sys.stderr, "error was", str(e).encode('utf-8') print >>sys.stderr, str(sys.exc_info()[0]) print >>sys.stderr, str(sys.exc_info()[1]) print >>sys.stderr, traceback.format_exc(sys.exc_info()[2]) # Only write the index.html file if there was content that # wasn't already in the cache. sys.stdout.flush() if itemnum > 0 : indexfile = os.path.join(outdir, "index.html") if verbose : print >>sys.stderr, "Writing", indexfile index = open(indexfile, "w") index.write(output_encode(indexstr, encoding)) index.write(downloaded_string) index.write("\n\n\n") index.close() #################################################### # Generate the output files # if 'plucker' in formats : make_plucker_file(indexfile, feedname, levels, ascii) if 'fb2' in formats : make_fb2_file(indexfile, feedname, levels, ascii) if 'epub' in formats : make_epub_file(indexfile, feedname, levels, ascii) # # All done. Update the cache file. # if not nocache : if verbose : print >>sys.stderr, feedname, ": Updating cache file" # Dump the new cache, not the old one: # XXX Find out how long this is taking. # XXX Should we split the cache into per site? t = time.time() cache[sitefeedurl] = newfeedcache cPickle.dump(cache, open(cachefile, 'w')) print >>sys.stderr, "Writing cache took", time.time() - t, "seconds" elif verbose : print >>sys.stderr, feedname, ": Not updating cache file" else : print >>sys.stderr, feedname, ": no new content" # # Find the cache file and load it, but don't parse yet # def init_cache() : # # Load the cache file # if 'XDG_CACHE_HOME' in os.environ: cachefile = os.path.join(os.environ['XDG_CACHE_HOME'], 'feedme', 'feedme.dat') else: cachefile = os.path.join(os.environ['HOME'], '.cache', 'feedme', 'feedme.dat') if not os.path.exists(cachefile) : dirname = os.path.dirname(cachefile) if not os.path.exists(dirname): os.makedirs(dirname) cache = {} elif not os.access(cachefile, os.W_OK) : print >>sys.stderr, "Error: can't write cache file", cachefile sys.exit(1) else : # Make a backup of the cache file, in case something goes wrong: shutil.copy2(cachefile, cachefile + ".bak") cache = cPickle.load(open(cachefile)) return cache, cachefile # # Read the configuration file (don't act on it yet) # def read_config_file() : # # Read the config file # if 'XDG_CONFIG_HOME' in os.environ: conffile = os.path.join(os.environ['XDG_CONFIG_HOME'], 'feedme', 'feedme.conf') else: conffile = os.path.join(os.environ['HOME'], '.config', 'feedme', 'feedme.conf') if not os.access(conffile, os.R_OK): print >>sys.stderr, "Error: no config file in", conffile sys.exit(1) config = ConfigParser({'verbose':'false', 'levels':'2', 'encoding':'', # blank means try several 'page_start':'', 'page_end':'', 'single_page_pat':'', 'skip_pat':'', 'nocache':'false', 'logfile':'', 'save_days':'7', 'ascii':'false'}) config.read(conffile) return config # # Main -- read the config file and loop over sites. # if __name__ == '__main__': from optparse import OptionParser usage = """Usage: %prog [site ...] If no site is specified, feedme will update all the feeds in ~/.config/feedme.conf.""" LongVersion = VersionString + "0.8: an RSS feed reader.\n\ Copyright 2011 by Akkana Peck; share and enjoy under the GPL v2 or later." optparser = OptionParser(usage=usage, version=LongVersion) optparser.add_option("-n", "--nocache", action="store_true", dest="nocache", help="Don't consult the cache, or update it") optparser.add_option("-s", "--show-sites", action="store_true", dest="show_sites", help="Show available sites") optparser.add_option("-l", "--log", metavar="logfile", action="store", dest="log_file_name", help="Save output to a log file") (options, args) = optparser.parse_args() config = read_config_file() # server = config.get('DEFAULT', 'master_server') msglog = MsgLog() sections = config.sections() if options.show_sites : for feedname in sections : print feedname sys.exit(0) if options.nocache : cache = None cachefile = None else : cache, cachefile = init_cache() logfilename = config.get('DEFAULT', 'logfile') if logfilename : logfilename = sub_tilde(logfilename) # Set up a tee to a log file, and redirect stderr there: print "teeing output to", logfilename stderrsav = sys.stderr outputlog = open(logfilename, "w") sys.stderr = tee(stderrsav, outputlog) try : if len(args) == 0 : for feedname in sections : get_feed(feedname, config, cache, cachefile, msglog) else : for arg in args : print >>sys.stderr, 'Getting feed for', arg get_feed(arg, config, cache, cachefile, msglog) # This causes a lot of premature exits. Not sure why we end up # here rather than in the inner KeyboardInterrupt section. except KeyboardInterrupt : print >>sys.stderr, "Caught keyboard interrupt at the wrong time!" print traceback.format_exc(sys.exc_info()[2]) #sys.exit(1) except OSError, e : print >>sys.stderr, "Caught an OSError" print >>sys.stderr, e sys.exit(e.errno) # Dump any errors we encountered. msgs = msglog.get_msgs() if msgs : print >>sys.stderr, "\n===== Messages ====" print >>sys.stderr, msgs.encode('utf-8', 'backslashreplace') msgs = msglog.get_errs() if msgs : print >>sys.stderr, "\n====== Errors =====" print >>sys.stderr, msgs.encode('utf-8', 'backslashreplace') # Clean up old directories: clean_up(config)