#!/usr/bin/env python

# feedme: read RSS/Atom feeds and convert to Plucker files.
# Copyright 2009,2011 Akkana Peck <akkana@shallowsky.com>
# Based on feedread, Copyright (C) 2009 Benjamin M. A'Lee <bma@subvert.org.uk>
# 
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details:
# <http://www.gnu.org/licenses/>.

#
# Important TODO:
# - Figure out why we get repeat stories on sites like BBC.

VersionString = "FeedMe 0.8"

import cPickle
import time  #, datetime
import os, sys
import re
#import types
import shutil
import traceback

from ConfigParser import ConfigParser

import feedparser
import urllib2

# ununicode is optional. has_ununicode records whether the import worked,
# so callers can check it before using the module.
try :
    import ununicode
    has_ununicode = True
except ImportError :
    has_ununicode = False

#
# Clean up old feed directories
#
def clean_up(config) :
    """Delete feed output older than the configured number of days.

    Reads 'save_days' and 'dir' from the DEFAULT section of config and
    removes every entry directly inside that directory whose ctime is
    more than save_days days in the past. Directories are removed
    recursively, anything else is unlinked.

    If the settings can't be read, or the feed directory doesn't exist,
    a message is printed and nothing is deleted. An entry that can't be
    examined or removed is reported and skipped.
    """
    try :
        days = int(config.get('DEFAULT', 'save_days'))
        feeddir = config.get('DEFAULT', 'dir')
        feeddir = sub_tilde(feeddir)
    except Exception :
        # Deliberately not a bare except: Ctrl-C and sys.exit() should
        # still be able to get through.
        print "Error trying to get save_days and feed dir; can't clean up"
        return

    if not os.path.isdir(feeddir) :
        print "Feed dir", feeddir, "doesn't exist; nothing to clean up"
        return

    print "Cleaning up anything older than", days, "days from", feeddir

    now = time.time()
    seconds_per_day = 60. * 60. * 24.
    for entry in os.listdir(feeddir) :
        d = os.path.join(feeddir, entry)
        try :
            howold = (now - os.path.getctime(d)) / seconds_per_day
            if howold <= days :
                continue
            print "Deleting", d
            if os.path.isdir(d) :
                shutil.rmtree(d)
            else :
                os.unlink(d)
        except OSError, e :
            # The entry may have vanished since listdir(), or we may
            # lack permission. Don't let one entry stop the cleanup.
            print "Couldn't clean up", d, ":", e

##################################################################
# OUTPUT GENERATING FUNCTIONS
# Define functions for each output format you need to support.
#

def run_conversion_cmd(appargs) :
    """Run an external conversion program and wait for it to finish.

    appargs is the full argument list, program name first; the program
    is looked up on PATH by os.spawnvp. The command line is always
    echoed to stderr before it runs.

    Raises OSError(status, message) if spawnvp returns anything other
    than 0.
    """
    cmd = " ".join(appargs)
    sys.stderr.write("Running: " + cmd + "\n")
    # Flush both streams so our output isn't interleaved with the child's.
    sys.stdout.flush()
    sys.stderr.flush()

    retval = os.spawnvp(os.P_WAIT, appargs[0], appargs)
    if retval != 0 :
        raise OSError(retval, "Couldn't run: " + cmd)

#
# Generate a Plucker file
#
def make_plucker_file(indexfile, feedname, levels, ascii) :
    """Build a Plucker document from indexfile using plucker-build.

    The document is named "<weekday> <feedname>" and is written as
    feedme/<weekday>_<feedname with underscores>; $HOME/.plucker/feedme
    is created first if it doesn't exist. levels is passed as
    --maxdepth, and --charset=utf-8 is added unless ascii is true.
    """
    plucker_dir = os.path.join(os.environ['HOME'], ".plucker", "feedme")
    weekday = time.strftime("%a")
    document_title = weekday + " " + feedname
    output_name = weekday + "_" + feedname.replace(" ", "_")

    # plucker-build needs its output directory to be there already:
    if not os.path.exists(plucker_dir) :
        os.makedirs(plucker_dir)

    # Running plucker should eventually be configurable --
    # but how, with arguments like these?
    #
    # Plucker mysteriously creates unbeamable files if the document
    # name has a colon in it; the underscored output_name would be a
    # less pretty but safer choice of title.
    charset_args = []
    if not ascii :
        charset_args = [ "--charset=utf-8" ]

    command = [ "plucker-build", "-N", document_title ]
    command += [ "-f", os.path.join("feedme", output_name) ]
    command += [ "--stayonhost", "--noimages" ]
    command += [ "--maxdepth", str(levels) ]
    command += [ "--zlib-compression", "--beamable" ]
    command += [ "-H", "file://" + indexfile ]
    command += charset_args

    run_conversion_cmd(command)

#
# http://calibre-ebook.com/user_manual/conversion.html
#
def make_calibre_file(indexfile, feedname, extension, levels, ascii,
                      author, flags) :
    """Convert indexfile to an ebook with calibre's ebook-convert.

    extension -- output suffix including the dot (e.g. ".epub"); the
                 part after the dot also names the output directory,
                 $HOME/feeds/<ext>, which is created if missing.
    author    -- value passed to ebook-convert's --authors option.
    flags     -- list of extra ebook-convert arguments, appended as is.

    levels and ascii are accepted so that the signature parallels
    make_plucker_file; they are not used here.

    The output file is <weekday>_<feedname with underscores><extension>.
    """
    home = os.environ['HOME']
    day = time.strftime("%a")
    # Prepend the day to the filename because fbreader can only
    # sort by filename.
    cleanfilename = day + "_" + feedname.replace(" ", "_")
    # The output directory should probably be configurable too.
    outdir = os.path.join(home, "feeds", extension[1:])
    if not os.path.isdir(outdir) :
        os.makedirs(outdir)

    appargs = [ "ebook-convert",
                indexfile,
                os.path.join(outdir, cleanfilename + extension),
                "--authors", author ]
    appargs.extend(flags)

    # run_conversion_cmd echoes the command line itself, so don't
    # print it a second time here.
    run_conversion_cmd(appargs)

#
# Generate a fictionbook2 file
#
def make_fb2_file(indexfile, feedname, levels, ascii) :
    """Convert indexfile to FictionBook2 via make_calibre_file."""
    fb2_flags = [ "--disable-font-rescaling" ]
    make_calibre_file(indexfile, feedname, ".fb2", levels, ascii,
                      "feedme", flags=fb2_flags)

#
# Generate an ePub file
# http://calibre-ebook.com/user_manual/cli/ebook-convert-3.html#html-input-to-epub-output
#
def make_epub_file(indexfile, feedname, levels, ascii) :
    """Convert indexfile to ePub via make_calibre_file.

    The author field is set to "<month>-<day> <weekday> feeds".
    """
    dated_author = time.strftime("%m-%d %a") + " feeds"
    epub_flags = [ '--no-default-epub-cover',
                   '--dont-split-on-page-breaks' ]
    make_calibre_file(indexfile, feedname, ".epub", levels, ascii,
                      dated_author, flags=epub_flags)

# END OUTPUT GENERATING FUNCTIONS
##################################################################

##################################################################
# MsgLog: Print messages and also batch them up to print at the end:
#
class MsgLog :
    """Print messages as they arrive and also batch them for a summary.

    Ordinary messages and errors are accumulated separately, each one
    preceded by a newline, and returned by get_msgs() and get_errs().
    Both accumulators are unicode strings.
    """
    def __init__(self) :
        self.msgstr = u""
        self.errstr = u""

    def _to_text(self, s) :
        # Callers pass unicode as well as already-encoded byte strings.
        # Calling .encode() on a non-ASCII byte string raises
        # UnicodeDecodeError, so normalize to unicode first,
        # presuming UTF-8.
        if isinstance(s, bytes) :
            return s.decode('utf-8', 'replace')
        return s

    def _show(self, prefix, text) :
        # Escape anything non-ASCII so writing can't fail whatever
        # the terminal encoding is.
        printable = text.encode('ascii', 'backslashreplace').decode('ascii')
        sys.stdout.write(prefix + " " + printable + "\n")

    def msg(self, s) :
        """Record an informational message and print it to stdout."""
        text = self._to_text(s)
        self.msgstr += u"\n" + text
        self._show("", text)

    def err(self, s) :
        """Record an error and print it to stdout prefixed by ERROR:."""
        text = self._to_text(s)
        self.errstr += u"\n" + text
        self._show("ERROR:", text)

    def get_msgs(self) :
        """Return all informational messages recorded so far."""
        return self.msgstr

    def get_errs(self) :
        """Return all errors recorded so far."""
        return self.errstr

import sys

# file-like class that can optionally send output to a log file. Inspired by
# http://www.redmountainsw.com/wordpress/archives/python-subclassing-file-types
# and with help from KirkMcDonald.
class tee(object) :
    """File-like object that duplicates every write to two streams.

    When the tee is deleted it closes the streams it was given, except
    for any that were a standard stream (sys.stdout, sys.stderr or the
    interpreter's original sys.__stdout__ / sys.__stderr__) at the time
    the tee was created.
    """
    def __init__(self, _fd1, _fd2) :
        self.fd1 = _fd1
        self.fd2 = _fd2
        # Decide now which streams are ours to close. By the time
        # __del__ runs, sys.stderr may have been rebound to this very
        # tee, so comparing against it then would wrongly close the
        # real stderr.
        standard = (sys.stdout, sys.stderr, sys.__stdout__, sys.__stderr__)
        self._owned = [ fd for fd in (_fd1, _fd2)
                        if not any(fd is std for std in standard) ]

    def __del__(self) :
        for fd in self._owned :
            fd.close()

    def write(self, text) :
        self.fd1.write(text)
        self.fd2.write(text)

    def flush(self) :
        self.fd1.flush()
        self.fd2.flush()

#
# Interrupt handler: prompt for what to do.
#
def handleKeyboardInterrupt(msg) :
    """Ask the user what to do after a keyboard interrupt.

    msg is the prompt to show. Returns the first character of the
    user's answer, or '\\0' if the answer was empty (so callers can
    always index the result and fall through to their default).
    Exits with status 1 if the answer starts with 'q', or if stdin
    isn't a terminal and so nobody can be asked.
    """
    # Merely having an isatty attribute says nothing (every file object
    # has one); it has to be called to find out if we're interactive.
    isatty = getattr(sys.stdin, "isatty", None)
    if isatty is None or not isatty() :
        sys.stdout.write("Interrupt, and not running interactively. Exiting.\n")
        sys.exit(1)

    try :
        response = raw_input(msg)
    except EOFError :
        # End of input at the prompt: treat it like an empty answer.
        response = ''
    if response == '' :
        return '\0'
    if response[0] == 'q' :
        sys.exit(1)
    return response[0]

def sub_tilde(name) :
    """Expand a leading "~/" or "$HOME/" in name to the HOME directory.

    config.get doesn't substitute either form itself. Any other name
    is returned unchanged.
    """
    for prefix in ("~/", "$HOME/") :
        if name.startswith(prefix) :
            remainder = name[len(prefix):]
            return os.path.join(os.environ['HOME'], remainder)
    return name

def get_config_multiline(config, feedname, configname) :
    """Return a multi-line config value as a list of its lines.

    Fetches option configname from section feedname and splits it on
    newlines; an empty value gives an empty list. The result is also
    printed to stdout.
    """
    raw_value = config.get(feedname, configname)
    if raw_value == '' :
        configlines = []
    else :
        configlines = raw_value.split('\n')
    sys.stdout.write("%s %s %s %s\n" % ("configlines for", configname,
                                        ":", configlines))
    return configlines

#
# Get a single feed
#
def get_feed(feedname, config, cache, cachefile, msglog) :
    # Mandatory arguments:
    try :
        sitefeedurl = config.get(feedname, 'url')
        feeddir = config.get(feedname, 'dir')
    except :
        msglog.err("Error reading feedme.conf entry for: " + feedname)
        return

    feeddir = sub_tilde(feeddir)
    feeddir = os.path.join(feeddir, time.strftime("%m-%d-%a"))

    formats = config.get(feedname, 'formats').split(',')

    encoding = config.get(feedname, 'encoding')

    skip_pats = get_config_multiline(config, feedname, 'skip_pat')

    # Skip images if requested
    if config.get(feedname, 'skip_images') == 'true':
        skip_pats.append('<img .*?>')

    feedfile = feedname.replace(" ", "_")
    outdir = os.path.join(feeddir,  feedfile)
    ascii = (config.get(feedname, 'ascii') != 'false')
    if ascii and not has_ununicode :
        ascii = False
        msglog.msg(feedname + ": Can't convert to ascii without ununicode")
    levels = int(config.get(feedname, 'levels'))
    #page_start = config.get(feedname, 'page_start')
    page_starts = get_config_multiline(config, feedname, 'page_start')
    page_ends = get_config_multiline(config, feedname, 'page_end')
    single_page_pats = get_config_multiline(config, feedname, 'single_page_pat')

    verbose = (config.get(feedname, 'verbose').lower() == 'true')
    if cache == None :
        nocache = True
    else :
        nocache = (config.get(feedname, 'nocache') == 'true')
    if verbose and nocache :
        msglog.msg(feedname + ": Ignoring cache")

    def output_encode(s, encoding) :
        if ascii and has_ununicode :
            #return unicodedata.normalize('NFKD', s).encode('ascii', 'ignore')
            # valid values in encode are replace and ignore
            return ununicode.toascii(s,
                                     in_encoding=encoding,
                                     errfilename=os.path.join(outdir,
                                                              "errors"))
        elif isinstance(s, unicode) :
            return s.encode('utf-8', 'backslashreplace')
        else :
            return s

    global VersionString
    downloaded_string ="\n<hr><i>(Downloaded by " + VersionString + ")</i>\n"

    feed = feedparser.parse(sitefeedurl)

    # feedparser has no error return! One way is to check len(feed.feed).
    if len(feed.feed) == 0 :
        msglog.err("Can't read " + sitefeedurl)
        return
    # XXX Sometimes feeds die a few lines later getting feed.feed.title.
    # Here's a braindead guard against it -- but why isn't this
    # whole clause inside a try? It should be.
    if not 'title' in feed.feed :
        msglog.msg(sitefeedurl + " lacks a title!")
        feed.feed.title = '[' + feedname + ']'
        #return

    if not nocache :
        if sitefeedurl not in cache:
            cache[sitefeedurl] = []
        feedcache = cache[sitefeedurl]
        newfeedcache = []

    # suburls: mapping of URLs we've encountered to local URLs.
    # Any anchors (#anchor) will be discarded.
    # This is for sites like WorldWideWords that make many links
    # to the same page.
    suburls = []

    # indexstr is the contents of the index.html file.
    # Kept as a string until we know whether there are new, non-cached
    # stories so it's worth updating the copy on disk.
    # The stylesheet is for FeedViewer and shouldn't bother plucker etc.
    day = time.strftime("%a")
    indexstr = u"""<html>\n<head>
<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">
<title>%s: %s</title>
<link rel="stylesheet" type="text/css" title="Feeds" href="../../feeds.css"/>
</head>

<body>\n<h1>%s: %s: %s</h1>
\n""" % (day, feedname, day, feedname, feed.feed.title)

    if verbose:
        print >>sys.stderr, "********* Reading", sitefeedurl

    itemnum = 0
    for item in feed.entries :
        try :
            #
            # Get the list of links (href) and a (hopefully) unique ID:
            #
            if 'links' in item :
                href = [i['href'].encode('utf-8') \
                            for i in item.links if i['rel'] == 'alternate']
            else:
                href = []

            if not 'id' in item :
                if len(href) > 0 :
                    item.id = href[0]
                    if verbose :
                        msglog.msg("Using URL " + href[0] + " for ID.")
                else:
                    if verbose :
                        msglog.msg("Item in " + href[0] + " had no ID or URL.")
                    next  # or return?

            # Filter out file types known not to work
            # XXX Only mp3 for now. Obviously, make this more general.
            if item.link.endswith("mp3") :
                print "Filtering out mp3 link", item.link
                continue

            # Make sure ids don't have named anchors appended:
            anchor_index = item.id.rfind('#')
            if anchor_index >= 0 :
                anchor = item.id[anchor_index:]
                item.id = item.id[0:anchor_index]
            else :
                anchor = ""

            # See if we've already seen this page:
            try :
                pagenum = suburls.index(item.id)
                # We've already seen a link to this URL. It's probably
                # a link to a different named anchor within the same file.
            except ValueError :
                # Haven't seen it before. But is it in the cache already?
                if not nocache :
                    # We want it in the cache, whether it's new or not:
                    newfeedcache.append(item.id)
                    if item.id in feedcache:
                        if verbose :
                            msglog.msg(item.id + " already cached -- skipping")
                        continue

                # Add it to the cache and suburls.
                suburls.append(item.id)
                pagenum = len(suburls) - 1

            itemnum += 1
            if verbose :
                print >>sys.stderr, "\nItem:", item.title.encode('utf-8')

            # Now itemnum is the number of the entry on the index page;
            # pagenum is the html file of the subentry, e.g. 3.html.

            #
            # Follow the link and make a file for it:
            #
            if levels > 1 or not 'content' in item :
                try :    # Try to trap keyboard interrupts, + others
                    if verbose :
                        print "Fetching link", item.link
                    # For the sub-pages, we're getting HTML, not RSS.
                    # Nobody seems to have RSS pointing to RSS.
                    response = urllib2.urlopen(item.link)

                    # At this point it would be lovely to check whether the
                    # mime type is HTML. Unfortunately, all we have is a
                    # httplib.HTTPMessage instance which is completely
                    # undocumented (see http://bugs.python.org/issue3428).

                    # It's not documented, but sometimes after urlopen
                    # we can actually get a content type. If it's not
                    # text/something, that's bad.
                    ctype = response.headers['content-type']
                    if ctype and ctype != '' and ctype[0:4] != 'text' :
                        msglog.error(item.link + " isn't text -- skipping")
                        continue

                    # Read the content of the link:
                    # This can die with socket.error, "connection reset by peer"
                    html = response.read()
                    link = response.geturl()

                    # urllib2 unfortunately doesn't read unicode,
                    # so try to figure out the current encoding:
                    if encoding == '' :
                        enctype = response.headers['content-type'].split('charset=')
                        if len(enctype) > 1 :
                            encoding = enctype[-1]
                        else :
                            encoding = 'utf-8'

                    # No docs say I should close this. I can only assume.
                    response.close()

                    # URL rewriting, so we can offer "Next page" and
                    # similar links.
                    # Do this *before* checking the single_page_pats
                    # since it might need to be rewritten too.

                    # Base URL which will be prepended to any relative links:
                    baseurl = re.sub('(.+)/.*', r'\1',
                                     link.encode('ascii', 'xmlcharrefreplace'))
                    siteurl = re.sub('([a-zA-Z]+://[^/]+)/.*', r'\1', baseurl)

                    # XXX Next two regexps are dicey -- e.g. they don't
                    # ensure that the end quote is the same as the start quote.

                    # Rewrite any relative URLs in terms of the base URL
                    html = re.sub('([hH][rR][eE][fF]\s*=\s*)(["\'])([^:/\'"]+?)(["\'])',
                                  r'\1\2' + baseurl + r'/\3\4',
                                  html)
                    # Rewrite URLs that start with / in terms of the site URL:
                    html = re.sub('([hH][rR][eE][fF]\s*=\s*)(["\'])/([^:\'"]+?)(["\'])',
                                  r'\1\2' + siteurl + r'/\3\4',
                                  html)

                    # See if the single page pattern exists and works
                    if len(single_page_pats) > 0 :
                        for single_page_pat in single_page_pats :
                            m = re.search(single_page_pat, html)
                            if m :
                                single_page = html[m.start():m.end()]
                                if verbose :
                                    print >>sys.stderr, \
                                        "\nFetching single-page pattern:", \
                                        single_page.encode('utf-8')
                                try :
                                    response = urllib2.urlopen(single_page)
                                    html2 = response.read()
                                    link = response.geturl()
                                    html = html2
                                    response.close()
                                    if verbose :
                                        print >>sys.stderr, \
                                            "Single page @", link
                                    break  # found a single-page, don't need 2
                                except Exception, e :
                                    print >>sys.stderr, \
                                        "Can't get single-page url", \
                                        single_page, \
                                        str(e)
                            elif verbose :
                                print >>sys.stderr, "single-page pattern", \
                                    single_page_pat, "not found in", link

                    # Throw out everything before the page_start pattern
                    # and after the page_end pattern
                    if len(page_starts) > 0 :
                        for page_start in page_starts :
                            #pat = re.compile(page_start)
                            #match = pat.search(html)
                            #if not match :
                            #    print >>sys.stderr, "Couldn't find", page_start
                            #else :
                            #    html = html[match.start() : ]
                            print "looking for page_start", page_start
                            match = html.find(page_start)
                            if match >= 0:
                                if verbose :
                                    print "Found page_start", page_start
                                html = html[match:]
                                break

                    if len(page_ends) > 0 :
                        for page_end in page_ends :
                            print "looking for page_end", page_end
                            match = html.find(page_end)
                            if match >= 0:
                                if verbose :
                                    print "Found page_end", page_end
                                html = html[0 : match]

                    # Skip anything matching any of the skip_pats
                    if len(skip_pats) > 0 :
                        print len(skip_pats), "skip pats"
                        for skip in skip_pats :
                            if verbose :
                                print >>sys.stderr, "Trying to skip", skip
                                #print >>sys.stderr, "in", html.encode('utf-8')
                                #sys.stderr.flush()
                            # flags=DOTALL doesn't exist in re.sub until 2.7,
                            #html = re.sub(skip, '', html, flags=re.DOTALL)
                            # but does exist in a compiled re expression:
                            regexp = re.compile(skip, flags=re.DOTALL)
                            html = regexp.sub('', html)
                            # Another way would be to use (.|\\n) in place of .
                            # For some reason [.\n] doesn't work.
                            #html = re.sub(skip, '', html, flags=re.DOTALL)
                    else : print "no skip pats"

                except KeyboardInterrupt :
                    response = handleKeyboardInterrupt("""
*** Caught keyboard interrupt reading a story! ***\n
Options:
q: Quit
c: Continue trying to read this story
s: Skip to next story
n: Skip to next site

Which (default = s): """)
                    if response[0] == 'n' :      # next site
                        return
                    elif response[0] != 'c' :    # next story (default)
                        continue

                    # If the response was 'c', we continue and just
                    # ignore the interrupt.

                except Exception, e :
                    # Collect info about what went wrong:
                    errmsg = "Couldnt read " + item.link + "\n"
                    errmsg += "Title: " + item.title.encode('utf-8')
                    if False and verbose :
                        errmsg += "Item summary was:\n------\n"
                        errmsg += item.summary + "\n------\n"
                        errmsg += str(e) + '<br>\n'
                        errmsg += str(sys.exc_info()[0]) + '<br>\n'
                        errmsg += str(sys.exc_info()[1]) + '<br>\n'
                        errmsg += traceback.format_exc(sys.exc_info()[2])

                    if verbose :
                        print >>sys.stderr, "=============="
                    msglog.err(errmsg)
                    if verbose :
                        print >>sys.stderr, "=============="
                    raise  # so this entry won't get stored or cached

            if not 'published_parsed' in item:
                if 'updated_parsed' in item:
                    item.published_parsed = item.updated_parsed
                else:
                    item.published_parsed = time.gmtime()

            def save_html_file(outdir, title, html, encoding) :
                # title is a unicode string, not yet encoded.
                # html is a string presumed to be in encoding (which may be '').

                if verbose :
                    print "Saving", title, "in", outdir
                utftitle = output_encode(title, encoding)
                fnam = str(pagenum) + ".html"

                # Make the parent directory if we haven't already
                # (don't do it before now since we didn't know whether
                # we had any content to save):
                if not os.access(outdir, os.W_OK) :
                    if verbose :
                        print "Making", outdir
                    os.makedirs(outdir)

                of = open(os.path.join(outdir, fnam), "w")
                of.write("""<html>\n<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"
<link rel="stylesheet" type="text/css" title="Feeds" href="../../feeds.css"/>
<title>%s</title>
</head>

<body>
<h1>%s</h1>\n
""" % (utftitle, utftitle))
                of.write(output_encode(html, encoding))

                # add a "next item" link.
                # XXX Unfortunately this itemnum check isn't necessarily reliable
                # XXX since we may have skipped items.
                #if itemnum < len(feed.entries) - 1 :
                if item != feed.entries[-1] :
                    of.write("<br><a href=\"" + str(pagenum+1) +
                             ".html\">&gt;-&gt;</a>\n")

                of.write(downloaded_string)

                of.write("</body>\n</html>\n")
                of.close()

                return fnam

            # Plucker named anchors don't work unless preceded by a <p>
     # http://www.mail-archive.com/plucker-list@rubberchicken.org/msg07314.html
            # and the previous message.
            indexstr += "<p><a name=\"" + str(itemnum) + "\">&nbsp;</a>"

            if levels > 1 :
                fnam = save_html_file(outdir, item.title, html, encoding)
                if verbose :
                    print >>sys.stderr, "Saved to file", fnam

                itemlink = '<a href=\"' + fnam + anchor + '\">'
                indexstr += itemlink + '<b>' + item.title + '</b></a>\n'
            else :
                # For a single-level site, don't put links over each entry.
                itemlink = '<a href=\"" + item.link + "\">'
                indexstr += "\n" + itemlink + item.title + "</a>\n"

            # Under the title, add a link to jump to the next entry
            # if it isn't the last entry.
            if item != feed.entries[-1] :
                indexstr += "<br> <i><a href=\"#" + str(itemnum+1) + \
                    "\">&gt;-&gt;</a></i>\n<br>\n"

            # Add either the content or the summary:
            if levels == 1 and 'content' in item :
                content = item.content[0].value + "\n"
            elif 'summary_detail' in item:
                content = item.summary_detail.value + "\n"
            else :
                content = "[No content]"

            # Remove images from index content too
            # XXX should do this only if skip_imgs is true!
            content = re.sub('<img .*?>', '', content)

            indexstr += content

            if 'author' in item :
                indexstr += "\n<br><i>by: " + item.author + "</i><br>"

            # After the content, add another link to the title,
            # in case the user wants to click through after reading
            # the content:
            sublen = 16
            if len(item.title) > sublen :
                # Truncate the title to sublen characters, and
                # temove any HTML tags, otherwise we'll likely have
                # tags like <i> that open but don't close
                short_title = re.sub('<.*?>', '', item.title[0:sublen]) \
                    + "..."

            else :
                short_title = item.title
            indexstr += "\n<br>[[" + itemlink + short_title + "</a>]]\n\n"

        # If there was an error parsing this entry, we won't save
        # a file so decrement the itemnum and loop to the next entry.
        except KeyboardInterrupt :
            sys.stderr.flush()
            response = handleKeyboardInterrupt("""
*** Caught keyboard interrupt while finishing a site! ***\n
Options:
q: Quit
c: Continue trying to finish this site
n: Skip to next site

Which (default = n): """)
            if response[0] == 'c' :
                continue
            if response[0] == 'q' :
                sys.exit(1)
            # Default is to skip to the next site:
            return
        except Exception, e :
            itemnum -= 1
            if verbose :
                print >>sys.stderr, "Skipping item", item.link.encode('utf-8')
                print >>sys.stderr, "error was", str(e).encode('utf-8')

                print >>sys.stderr, str(sys.exc_info()[0])
                print >>sys.stderr, str(sys.exc_info()[1])
                print >>sys.stderr, traceback.format_exc(sys.exc_info()[2])

    # Only write the index.html file if there was content that
    # wasn't already in the cache.
    sys.stdout.flush()
    if itemnum > 0 :
        indexfile = os.path.join(outdir, "index.html")
        if verbose :
            print  >>sys.stderr, "Writing", indexfile
        index = open(indexfile, "w")
        index.write(output_encode(indexstr, encoding))
        index.write(downloaded_string)
        index.write("\n</body>\n</html>\n")
        index.close()

        ####################################################
        # Generate the output files
        #
        if 'plucker' in formats :
            make_plucker_file(indexfile, feedname, levels, ascii)
        if 'fb2' in formats :
            make_fb2_file(indexfile, feedname, levels, ascii)
        if 'epub' in formats :
            make_epub_file(indexfile, feedname, levels, ascii)

        #
        # All done. Update the cache file.
        #
        if not nocache :
            if verbose :
                print >>sys.stderr, feedname, ": Updating cache file"
            # Dump the new cache, not the old one:
            # XXX Find out how long this is taking.
            # XXX Should we split the cache into per site?
            t = time.time()
            cache[sitefeedurl] = newfeedcache
            cPickle.dump(cache, open(cachefile, 'w'))
            print >>sys.stderr, "Writing cache took", time.time() - t, "seconds"
        elif verbose :
            print >>sys.stderr, feedname, ": Not updating cache file"

    else :
        print >>sys.stderr, feedname, ": no new content"

#
# Find the cache file and load it, but don't parse yet
#
def init_cache() :
    """Locate the cache file and load it.

    The file is feedme/feedme.dat under $XDG_CACHE_HOME if that is set,
    otherwise under $HOME/.cache. If it doesn't exist yet, its directory
    is created and an empty cache is used. If it exists, it is first
    copied to <cachefile>.bak and then unpickled.

    Returns (cache, cachefile). Exits with status 1 if the cache file
    exists but isn't writable.
    """
    if 'XDG_CACHE_HOME' in os.environ:
        cachefile = os.path.join(os.environ['XDG_CACHE_HOME'],
                                 'feedme', 'feedme.dat')
    else:
        cachefile = os.path.join(os.environ['HOME'], '.cache',
                                 'feedme', 'feedme.dat')

    if not os.path.exists(cachefile) :
        dirname = os.path.dirname(cachefile)
        if not os.path.exists(dirname):
            os.makedirs(dirname)
        cache = {}
    elif not os.access(cachefile, os.W_OK) :
        sys.stderr.write("Error: can't write cache file " + cachefile + "\n")
        sys.exit(1)
    else :
        # Make a backup of the cache file, in case something goes wrong:
        shutil.copy2(cachefile, cachefile + ".bak")
        cachefp = open(cachefile)
        try :
            # NOTE(review): unpickling runs whatever the file tells it
            # to, so the cache file must never come from anyone else.
            cache = cPickle.load(cachefp)
        finally :
            cachefp.close()

    return cache, cachefile

#
# Read the configuration file (don't act on it yet)
#
def read_config_file() :
    """Read feedme.conf and return the resulting ConfigParser.

    The file is feedme/feedme.conf under $XDG_CONFIG_HOME if that is
    set, otherwise under $HOME/.config. Exits with status 1 if it
    can't be read. Options a feed doesn't set fall back to the
    defaults listed below.
    """
    confhome = os.environ.get('XDG_CONFIG_HOME')
    if confhome is None :
        confhome = os.path.join(os.environ['HOME'], '.config')
    conffile = os.path.join(confhome, 'feedme', 'feedme.conf')

    readable = os.access(conffile, os.R_OK)
    if not readable :
        sys.stderr.write("%s %s\n" % ("Error: no config file in", conffile))
        sys.exit(1)

    defaults = {
        'verbose' : 'false',
        'levels' : '2',
        'encoding' : '',     # blank means try several
        'page_start' : '',
        'page_end' : '',
        'single_page_pat' : '',
        'skip_pat' : '',
        'nocache' : 'false',
        'logfile' : '',
        'save_days' : '7',
        'ascii' : 'false',
    }
    config = ConfigParser(defaults)
    config.read(conffile)
    return config

#
# Main -- read the config file and loop over sites.
#
if __name__ == '__main__':
    from optparse import OptionParser

    usage = """Usage: %prog [site ...]
If no site is specified, feedme will update all the feeds in
~/.config/feedme.conf."""
    LongVersion = VersionString + "0.8: an RSS feed reader.\n\
Copyright 2011 by Akkana Peck; share and enjoy under the GPL v2 or later."

    optparser = OptionParser(usage=usage, version=LongVersion)
    optparser.add_option("-n", "--nocache",
                         action="store_true", dest="nocache",
                         help="Don't consult the cache, or update it")
    optparser.add_option("-s", "--show-sites",
                         action="store_true", dest="show_sites",
                         help="Show available sites")
    optparser.add_option("-l", "--log", metavar="logfile",
                         action="store", dest="log_file_name",
                         help="Save output to a log file")
    (options, args) = optparser.parse_args()

    config = read_config_file()

    # server = config.get('DEFAULT', 'master_server')

    msglog = MsgLog()

    sections = config.sections()

    if options.show_sites :
        for feedname in sections :
            print feedname
        sys.exit(0)

    if options.nocache :
        cache = None
        cachefile = None
    else :
        cache, cachefile = init_cache()

    logfilename = config.get('DEFAULT', 'logfile')
    if logfilename :
        logfilename = sub_tilde(logfilename)
        # Set up a tee to a log file, and redirect stderr there:
        print "teeing output to", logfilename
        stderrsav = sys.stderr
        outputlog = open(logfilename, "w")
        sys.stderr = tee(stderrsav, outputlog)

    try :
        if len(args) == 0 :
            for feedname in sections :
                get_feed(feedname, config, cache, cachefile, msglog)
        else :
            for arg in args :
                print >>sys.stderr, 'Getting feed for', arg
                get_feed(arg, config, cache, cachefile, msglog)

    # This causes a lot of premature exits. Not sure why we end up
    # here rather than in the inner KeyboardInterrupt section.
    except KeyboardInterrupt :
        print >>sys.stderr, "Caught keyboard interrupt at the wrong time!"
        print traceback.format_exc(sys.exc_info()[2])
        #sys.exit(1)
    except OSError, e :
        print >>sys.stderr, "Caught an OSError"
        print >>sys.stderr, e
        sys.exit(e.errno)

    # Dump any errors we encountered.
    msgs = msglog.get_msgs()
    if msgs :
        print >>sys.stderr, "\n===== Messages ===="
        print >>sys.stderr, msgs.encode('utf-8', 'backslashreplace')
    msgs = msglog.get_errs()
    if msgs :
        print >>sys.stderr, "\n====== Errors ====="
        print >>sys.stderr, msgs.encode('utf-8', 'backslashreplace')

    # Clean up old directories:
    clean_up(config)