#!/usr/bin/env python

# feedme: read RSS/Atom feeds and convert to Plucker files.
# Copyright 2009,2011 Akkana Peck <akkana@shallowsky.com>
# Based on feedread, Copyright (C) 2009 Benjamin M. A'Lee <bma@subvert.org.uk>
# 
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details:
# <http://www.gnu.org/licenses/>.

# Goals for feedme 0.9: use real HTML parsing, not regexp; add image fetching

#
# Important TODO:
# - Figure out why we get repeat stories on sites like BBC.

VersionString = "FeedMe 0.9"

import cPickle
import time  #, datetime
import os, sys
import re
#import types
import shutil
import traceback

from ConfigParser import ConfigParser

import feedparser
import urllib2

# We now use a separate file for the parsing and such:
import feedmeparser

# ununicode is optional: remember whether it could be imported so that
# code using it can check has_ununicode first.
try:
    import ununicode
    has_ununicode = True
except ImportError:
    has_ununicode = False

#
# Clean up old feed directories
#
def clean_up(config):
    """Delete everything in the feed directory older than save_days days.

    Reads 'save_days' and 'dir' from the DEFAULT section of config.
    Each entry directly inside the feed directory whose ctime is more
    than save_days days in the past is removed: directories recursively,
    everything else (including symbolic links) with unlink.

    This is best-effort: a missing configuration value, an unreadable
    feed directory, or a failure to delete one entry is reported on
    stderr and does not raise.
    """
    try:
        days = int(config.get('DEFAULT', 'save_days'))
        feeddir = sub_tilde(config.get('DEFAULT', 'dir'))
    except Exception:
        # Deliberately broad (missing option, non-numeric value, no $HOME)
        # but, unlike a bare except, lets KeyboardInterrupt through.
        print >>sys.stderr, \
            "Error trying to get save_days and feed dir; can't clean up"
        return

    print >>sys.stderr, "Cleaning up anything older than", \
        days, "days from", feeddir

    # The feed directory may not exist yet (e.g. nothing was ever fetched).
    try:
        entries = os.listdir(feeddir)
    except OSError as e:
        print >>sys.stderr, "Can't list", feeddir, "so nothing to clean:", \
            str(e)
        return

    now = time.time()
    for entry in entries:
        path = os.path.join(feeddir, entry)
        try:
            # Age in days. NOTE: on Unix, ctime is the time of the last
            # metadata change, not the creation time.
            howold = (now - os.path.getctime(path)) / 60 / 60 / 24
            if howold > days:
                print >>sys.stderr, "Deleting", path
                # shutil.rmtree refuses to operate on a symlink, so a link
                # to a directory has to be unlinked like a plain file.
                if os.path.isdir(path) and not os.path.islink(path):
                    shutil.rmtree(path)
                else:
                    os.unlink(path)
        except Exception as e:
            print >>sys.stderr, "Couldn't unlink", path, str(e)

##################################################################
# OUTPUT GENERATING FUNCTIONS
# Define functions for each output format you need to support.
#

def run_conversion_cmd(appargs) :
    if True or verbose :
        cmd = " ".join(appargs)
        print >>sys.stderr, "Running:", cmd
        sys.stdout.flush()

    retval = os.spawnvp(os.P_WAIT, appargs[0], appargs)
    #retval = os.system(cmd)
    if retval != 0 :
        raise OSError(retval, "Couldn't run: " + ' '.join(appargs))

#
# Generate a Plucker file
#
def make_plucker_file(indexfile, feedname, levels, ascii):
    """Convert a feed's index.html into a Plucker document.

    indexfile -- path of the index.html to start from
    feedname  -- feed name; used (with the weekday) for the document
                 name and the output file name
    levels    -- passed to plucker-build as --maxdepth
    ascii     -- when false, --charset=utf-8 is added to the command

    Raises OSError (from run_conversion_cmd) if plucker-build fails.
    """
    weekday = time.strftime("%a")
    title = weekday + " " + feedname
    basename = weekday + "_" + feedname.replace(" ", "_")

    # Make sure the plucker directory exists:
    outdir = os.path.join(os.environ['HOME'], ".plucker", "feedme")
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # Run plucker. This should eventually be configurable --
    # but how, with arguments like these?
    #
    # Plucker mysteriously creates unbeamable files if the document
    # name has a colon in it; the underscored basename would be the
    # less pretty but safer choice for -N if that ever becomes a problem.
    cmd = ["plucker-build",
           "-N", title,
           "-f", os.path.join("feedme", basename),
           "--stayonhost", "--noimages",
           "--maxdepth", str(levels),
           "--zlib-compression", "--beamable",
           "-H", "file://" + indexfile]
    if not ascii:
        cmd += ["--charset=utf-8"]

    run_conversion_cmd(cmd)

#
# http://calibre-ebook.com/user_manual/conversion.html
#
def make_calibre_file(indexfile, feedname, extension, levels, ascii,
                      author, flags):
    """Convert a feed's index.html with calibre's ebook-convert.

    indexfile -- path of the index.html to convert
    feedname  -- feed name; the output file is <weekday>_<feedname>
                 with spaces replaced by underscores
    extension -- output extension including the dot, e.g. ".epub"; also
                 names the output subdirectory (without the dot)
    levels    -- unused here; kept so all converters share a signature
    ascii     -- unused here; kept so all converters share a signature
    author    -- value passed to --authors
    flags     -- extra ebook-convert arguments, appended in order

    The output goes to <DEFAULT dir>/<extension without dot>/.
    NOTE: this reads the module-level ``config`` object, which is only
    created when the file runs as a script.

    Raises OSError (from run_conversion_cmd) if ebook-convert fails.
    """
    # The weekday prefix keeps one file per feed per day of the week.
    cleanfilename = time.strftime("%a") + "_" + feedname.replace(" ", "_")

    # Expand ~/ or $HOME/ the same way clean_up and get_feed do;
    # otherwise "~/feeds" creates a literal "~" directory under the cwd.
    basedir = sub_tilde(config.get('DEFAULT', 'dir'))
    outdir = os.path.join(basedir, extension[1:])
    # Test for existence, not writability: makedirs on an existing but
    # unwritable directory would fail with a misleading "File exists".
    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    appargs = ["ebook-convert",
               indexfile,
               # directory should be configurable too, probably
               os.path.join(outdir, cleanfilename + extension),
               "--authors", author]
    appargs.extend(flags)

    # run_conversion_cmd echoes the command line itself.
    run_conversion_cmd(appargs)

#
# Generate a fictionbook2 file
#
def make_fb2_file(indexfile, feedname, levels, ascii):
    """Convert a feed's index.html to FictionBook2 (.fb2) via calibre."""
    fb2_flags = ["--disable-font-rescaling"]
    make_calibre_file(indexfile, feedname, ".fb2", levels, ascii,
                      "feedme", flags=fb2_flags)

#
# Generate an ePub file
# http://calibre-ebook.com/user_manual/cli/ebook-convert-3.html#html-input-to-epub-output
#
def make_epub_file(indexfile, feedname, levels, ascii):
    """Convert a feed's index.html to ePub via calibre.

    The author field is set to today's date plus " feeds".
    """
    author = time.strftime("%m-%d %a") + " feeds"
    epub_flags = ['--no-default-epub-cover',
                  '--dont-split-on-page-breaks']
    make_calibre_file(indexfile, feedname, ".epub", levels, ascii,
                      author, flags=epub_flags)

# END OUTPUT GENERATING FUNCTIONS
##################################################################

##################################################################
# MsgLog: Print messages and also batch them up to print at the end:
#
class MsgLog(object):
    """Print messages immediately and also save them for a final summary.

    msg() and err() accept either unicode text or a byte string; byte
    strings are decoded as UTF-8 (undecodable bytes are replaced) so
    that the accumulated log is always unicode and can never fail on a
    mixture of encoded and decoded text.
    """

    def __init__(self):
        # Accumulated text; every entry is preceded by a newline.
        self.msgstr = u""
        self.errstr = u""

    @staticmethod
    def _as_text(s):
        """Return s as unicode, decoding a byte string as UTF-8."""
        if isinstance(s, bytes):
            return s.decode('utf-8', 'replace')
        return s

    @staticmethod
    def _echo(prefix, text):
        """Print prefix, a space, then text with non-ASCII backslash-escaped."""
        escaped = text.encode('ascii', 'backslashreplace').decode('ascii')
        print(prefix + " " + escaped)

    def msg(self, s):
        """Record and print an informational message."""
        text = self._as_text(s)
        self.msgstr += u"\n" + text
        self._echo("", text)

    def err(self, s):
        """Record an error message and print it prefixed with 'ERROR:'."""
        text = self._as_text(s)
        self.errstr += u"\n" + text
        self._echo("ERROR:", text)

    def get_msgs(self):
        """Return all informational messages so far ('' if none)."""
        return self.msgstr

    def get_errs(self):
        """Return all error messages so far ('' if none)."""
        return self.errstr

import sys

# file-like class that can optionally send output to a log file. Inspired by
# http://www.redmountainsw.com/wordpress/archives/python-subclassing-file-types
# and with help from KirkMcDonald.
class tee(object):
    """File-like object that duplicates everything written to two files.

    Only write() and flush() are provided. When the tee is destroyed
    both underlying files are closed, except for the process's standard
    output and error streams, which are never closed.
    """

    def __init__(self, _fd1, _fd2):
        self.fd1 = _fd1
        self.fd2 = _fd2

    def __del__(self):
        try:
            # sys.stdout/sys.stderr may have been rebound (this file points
            # sys.stderr at a tee), so also protect the interpreter's
            # original streams, and compare by identity.
            protected = (sys.stdout, sys.stderr,
                         sys.__stdout__, sys.__stderr__)
        except Exception:
            # Module globals may already be torn down at interpreter exit.
            return
        for fd in (self.fd1, self.fd2):
            if any(fd is std for std in protected):
                continue
            try:
                fd.close()
            except Exception:
                # Nothing useful can be done about a failed close here.
                pass

    def write(self, text):
        """Write text to both files."""
        self.fd1.write(text)
        self.fd2.write(text)

    def flush(self):
        """Flush both files."""
        self.fd1.flush()
        self.fd2.flush()

#
# Interrupt handler: prompt for what to do.
#
def handleKeyboardInterrupt(msg):
    """Ask the user what to do after a keyboard interrupt.

    msg is the prompt to display. Returns the first character of the
    user's answer, or '\\0' if the answer was empty (so callers can
    always index the result). An answer starting with 'q' exits the
    program with status 1.

    If standard input is not a terminal there is nobody to ask, so the
    program exits with status 1 immediately.
    """
    # hasattr(sys.stdin, "isatty") is true for any file object, so the
    # method has to be called to learn whether we are interactive.
    isatty = getattr(sys.stdin, "isatty", None)
    if isatty is None or not isatty():
        print("Interrupt, and not running interactively. Exiting.")
        sys.exit(1)

    try:
        response = raw_input(msg)
    except EOFError:
        # End-of-file at the prompt: treat like an empty answer so the
        # caller applies its default action.
        return '\0'
    if response == '':
        return '\0'
    if response[0] == 'q':
        sys.exit(1)
    return response[0]

def sub_tilde(name):
    """Expand a leading "~/" or "$HOME/" in name to the HOME directory.

    config.get alas doesn't substitute $HOME or ~, so do it by hand.
    Any other name is returned unchanged.
    """
    for prefix in ("~/", "$HOME/"):
        if name[:len(prefix)] == prefix:
            return os.path.join(os.environ['HOME'], name[len(prefix):])
    return name

#
# Get a single feed
#
def get_feed(feedname, config, cache, cachefile, msglog):
    """Fetch one feed, write its index.html and run the output converters.

    feedname  -- name of the config section describing the feed
    config    -- configuration object; the options url, dir, verbose,
                 levels, formats, encoding, ascii, nocache and
                 skip_images are read from the feedname section
    cache     -- dict mapping feed URL to the list of item ids seen on
                 the previous run, or None to ignore the cache entirely
    cachefile -- path the updated cache is pickled to (only used when
                 the cache is in use)
    msglog    -- object with msg(text) and err(text) for reporting

    For every entry not already cached, the linked page is fetched
    through feedmeparser (when levels > 1) and an entry is added to the
    feed's index.html. If at least one entry was added, the index is
    written, each converter named in 'formats' is run, and the cache is
    saved. Returns None; an unreadable feed or a request to skip the
    site returns early.

    Raises OSError if a converter fails (see run_conversion_cmd).
    """
    # Mandatory arguments:
    try:
        sitefeedurl = config.get(feedname, 'url')
        feeddir = config.get(feedname, 'dir')
    except Exception:
        msglog.err("Error reading feedme.conf entry for: " + feedname)
        return

    verbose = (config.get(feedname, 'verbose').lower() == 'true')
    levels = int(config.get(feedname, 'levels'))

    feeddir = sub_tilde(feeddir)
    feeddir = os.path.join(feeddir, time.strftime("%m-%d-%a"))

    # Tolerate spaces after the commas in "plucker, epub".
    formats = [fmt.strip()
               for fmt in config.get(feedname, 'formats').split(',')]
    encoding = config.get(feedname, 'encoding')
    ascii = config.getboolean(feedname, 'ascii')

    print >>sys.stderr, "feedname:", feedname
    feedfile = feedname.replace(" ", "_")
    print >>sys.stderr, "feedfile:", feedfile
    outdir = os.path.join(feeddir, feedfile)
    print >>sys.stderr, "outdir:", outdir
    if cache is None:
        nocache = True
    else:
        # Case-insensitive, like 'verbose' above.
        nocache = (config.get(feedname, 'nocache').lower() == 'true')
    if verbose and nocache:
        msglog.msg(feedname + ": Ignoring cache")

    downloaded_string = "\n<hr><i>(Downloaded by " + VersionString + ")</i>\n"

    # feedparser.parse() can throw unexplained errors like
    # "xml.sax._exceptions.SAXException: Read failed (no details available)"
    # which will kill our whole process, so guard against that:
    try:
        feed = feedparser.parse(sitefeedurl)
    except Exception as e:
        # Report through msglog so it shows up in the final error summary.
        # repr() keeps the text ASCII whatever the exception contains.
        msglog.err("Couldn't parse feed: URL: " + sitefeedurl
                   + "\n" + repr(e))
        return

    # feedparser has no error return! One way is to check len(feed.feed).
    if len(feed.feed) == 0:
        msglog.err("Can't read " + sitefeedurl)
        return
    # Sometimes feeds have no title; substitute the feed name.
    if 'title' not in feed.feed:
        msglog.msg(sitefeedurl + " lacks a title!")
        feed.feed.title = '[' + feedname + ']'

    if not nocache:
        if sitefeedurl not in cache:
            cache[sitefeedurl] = []
        feedcache = cache[sitefeedurl]
        newfeedcache = []

    # suburls: list of URLs we've encountered; a URL's position in the
    # list is the number of its local page (e.g. 3.html).
    # Any anchors (#anchor) will be discarded.
    # This is for sites like WorldWideWords that make many links
    # to the same page.
    suburls = []

    # indexstr is the contents of the index.html file.
    # Kept as a string until we know whether there are new, non-cached
    # stories so it's worth updating the copy on disk.
    # The stylesheet is for FeedViewer and shouldn't bother plucker etc.
    day = time.strftime("%a")
    indexstr = u"""<html>\n<head>
<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">
<title>%s: %s</title>
<link rel="stylesheet" type="text/css" title="Feeds" href="../../feeds.css"/>
</head>

<body>\n<h1>%s: %s: %s</h1>
\n""" % (day, feedname, day, feedname, feed.feed.title)

    if verbose:
        print >>sys.stderr, "********* Reading", sitefeedurl

    itemnum = 0
    for item in feed.entries:
        # Remember where we were, so that an entry that fails part way
        # through leaves neither a stray count nor half-written HTML.
        saved_itemnum, saved_indexstr = itemnum, indexstr
        try:
            #
            # Get the list of links (href) and a (hopefully) unique ID:
            #
            if 'links' in item:
                href = [link['href'].encode('utf-8')
                        for link in item.links
                        if 'rel' in link and 'href' in link
                        and link['rel'] == 'alternate']
            else:
                href = []

            if 'id' not in item:
                if len(href) > 0:
                    item.id = href[0]
                    if verbose:
                        msglog.msg("Using URL " + href[0] + " for ID.")
                else:
                    # Nothing to identify this entry by: skip it.
                    if verbose:
                        msglog.msg("Item in " + sitefeedurl
                                   + " had no ID or URL.")
                    continue

            # Filter out file types known not to work
            # XXX Only mp3 for now. Obviously, make this more general.
            # Wish we could do this using the server's type rather than
            # file extension!
            if item.link.endswith("mp3"):
                print >>sys.stderr, "Filtering out mp3 link", item.link
                continue

            # Make sure ids don't have named anchors appended:
            anchor_index = item.id.rfind('#')
            if anchor_index >= 0:
                anchor = item.id[anchor_index:]
                item.id = item.id[0:anchor_index]
            else:
                anchor = ""

            # See if we've already seen this page:
            try:
                pagenum = suburls.index(item.id)
                # We've already seen a link to this URL. It's probably
                # a link to a different named anchor within the same file.
            except ValueError:
                # Haven't seen it before. But is it in the cache already?
                if not nocache:
                    # We want it in the cache, whether it's new or not:
                    newfeedcache.append(item.id)
                    if item.id in feedcache:
                        if verbose:
                            msglog.msg(item.id + " already cached -- skipping")
                        continue

                # Add it to suburls.
                suburls.append(item.id)
                pagenum = len(suburls) - 1

            itemnum += 1
            if verbose:
                print >>sys.stderr, "\nItem:", item.title.encode('utf-8',
                                                                 'replace')

            # Now itemnum is the number of the entry on the index page;
            # pagenum is the html file of the subentry, e.g. 3.html.

            # Make the parent directory if we haven't already
            if not os.access(outdir, os.W_OK):
                if verbose:
                    print >>sys.stderr, "Making", outdir
                os.makedirs(outdir)

            if 'author' in item:
                author = item.author
            else:
                author = None

            #
            # Follow the link and make a file for it:
            #
            if levels > 1:        # Normal multi-level site
                # Set before the try so it is defined even if the fetch
                # is interrupted and the user chooses to continue.
                fnam = str(pagenum) + ".html"
                try:    # Try to trap keyboard interrupts, + others
                    # For the sub-pages, we're getting HTML, not RSS.
                    # Nobody seems to have RSS pointing to RSS.
                    parser = feedmeparser.FeedmeHTMLParser(config, feedname)
                    parser.fetch_url(item.link,
                                     outdir, fnam,
                                     item.title, author,
                                     "<a href=\"%d.html\">&gt;-%d-&gt;</a>"
                                     % (itemnum, itemnum))

                except KeyboardInterrupt:
                    response = handleKeyboardInterrupt("""
*** Caught keyboard interrupt reading a story! ***\n
Options:
q: Quit
c: Continue trying to read this story
s: Skip to next story
n: Skip to next site

Which (default = s): """)
                    if response[0] == 'n':      # next site
                        return
                    elif response[0] != 'c':    # next story (default)
                        continue

                    # If the response was 'c', we continue and just
                    # ignore the interrupt.

                except (IOError, urllib2.HTTPError) as e:
                    # Collect info about what went wrong:
                    errmsg = "Couldn't read " + item.link + "\n"
                    if verbose:
                        errmsg += str(e) + '<br>\n'

                    if verbose:
                        print >>sys.stderr, "=============="
                    msglog.err(errmsg)
                    if verbose:
                        print >>sys.stderr, "=============="
                    # NOTE(review): the story stays counted in itemnum and
                    # in the new cache even though it was not fetched.
                    continue   # Move on to next story
                except Exception as e:
                    # An unknown error, so report it complete with traceback.
                    # Keep everything unicode: mixing an encoded title with
                    # a unicode summary would itself raise here.
                    errmsg = u"Unknown error reading " + item.link + u"\n"
                    errmsg += u"Title: " + item.title + u"\n"
                    if verbose:
                        errmsg += u"Item summary was:\n------\n"
                        errmsg += getattr(item, 'summary', u'') + u"\n------\n"
                        errmsg += str(e) + '<br>\n'
                        errmsg += str(sys.exc_info()[0]) + '<br>\n'
                        errmsg += str(sys.exc_info()[1]) + '<br>\n'
                        # format_exc() formats the exception being handled;
                        # its only argument is a depth limit.
                        errmsg += traceback.format_exc()

                    if verbose:
                        print >>sys.stderr, "=============="
                    msglog.err(errmsg)
                    if verbose:
                        print >>sys.stderr, "=============="
                    continue   # Move on to next story, ensure we get index

            if 'published_parsed' not in item:
                if 'updated_parsed' in item:
                    item.published_parsed = item.updated_parsed
                else:
                    item.published_parsed = time.gmtime()

            # Plucker named anchors don't work unless preceded by a <p>
            # http://www.mail-archive.com/plucker-list@rubberchicken.org/msg07314.html
            # and the previous message.
            indexstr += "<p><a name=\"%d\">&nbsp;</a>" % itemnum

            if levels > 1:
                # Link to the local copy of the story.
                itemlink = '<a href=\"' + fnam + anchor + '\">'
                indexstr += itemlink + '<b>' + item.title + '</b></a>\n'
            else:
                # For a single-level site there is no local copy,
                # so link to the original story.
                itemlink = '<a href="' + item.link + '">'
                indexstr += "\n" + itemlink + item.title + "</a>\n"

            # Under the title, add a link to jump to the next entry
            # if it isn't the last entry.
            if item != feed.entries[-1]:
                indexstr += \
                    "<br>\n<i><a href=\"#%d\">&gt;-&gt;</a></i>\n<br>\n" \
                    % (itemnum+1)

            # Add either the content or the summary:
            if levels == 1 and 'content' in item:
                content = item.content[0].value + "\n"
            elif 'summary_detail' in item:
                content = item.summary_detail.value + "\n"
            else:
                content = "[No content]"

            # There's an increasing trend to load up RSS pages with images.
            # Try to remove them.
            if config.getboolean(feedname, 'skip_images'):
                content = re.sub('<img .*?>', '', content)

            indexstr += content

            if author:
                indexstr += "\n<br><i>by: " + author + "</i><br>"

            # After the content, add another link to the title,
            # in case the user wants to click through after reading
            # the content:
            sublen = 16
            if len(item.title) > sublen:
                # Remove any HTML tags first and only then truncate to
                # sublen characters; truncating first can cut a tag in
                # half and leave the remains in the output.
                plain_title = re.sub('<.*?>', '', item.title)
                if len(plain_title) > sublen:
                    short_title = plain_title[0:sublen] + "..."
                else:
                    short_title = plain_title
            else:
                short_title = item.title
            indexstr += "\n<br>[[" + itemlink + short_title + "</a>]]\n\n"

        except KeyboardInterrupt:
            sys.stderr.flush()
            response = handleKeyboardInterrupt("""
*** Caught keyboard interrupt while finishing a site! ***\n
Options:
q: Quit
c: Continue trying to finish this site
n: Skip to next site

Which (default = n): """)
            if response[0] == 'c':
                continue
            if response[0] == 'q':
                sys.exit(1)
            # Default is to skip to the next site:
            return
        except Exception as e:    # probably an HTTPError, bad URL
            # This entry produced nothing usable: forget that we counted
            # it and drop whatever part of it reached the index.
            itemnum, indexstr = saved_itemnum, saved_indexstr
            if verbose:
                # The failure may be that the item has no link at all.
                print >>sys.stderr, "Skipping item", \
                    getattr(item, 'link', u'[no link]').encode('utf-8')
                print >>sys.stderr, "error was", str(e)

                print >>sys.stderr, str(sys.exc_info()[0])
                print >>sys.stderr, str(sys.exc_info()[1])
                print >>sys.stderr, traceback.format_exc()

    # Only write the index.html file if there was content that
    # wasn't already in the cache.
    sys.stdout.flush()
    if itemnum > 0:
        indexfile = os.path.join(outdir, "index.html")
        if verbose:
            print >>sys.stderr, "Writing", indexfile
        if ascii:
            out_encoding = 'ascii'
        else:
            out_encoding = encoding
        with open(indexfile, "w") as index:
            index.write(feedmeparser.output_encode(indexstr, out_encoding))
            index.write(downloaded_string)
            index.write("\n</body>\n</html>\n")

        ####################################################
        # Generate the output files
        #
        if 'plucker' in formats:
            make_plucker_file(indexfile, feedname, levels, ascii)
        if 'fb2' in formats:
            make_fb2_file(indexfile, feedname, levels, ascii)
        if 'epub' in formats:
            make_epub_file(indexfile, feedname, levels, ascii)

        #
        # All done. Update the cache file.
        #
        if not nocache:
            if verbose:
                print >>sys.stderr, feedname, ": Updating cache file"
            # Dump the new cache, not the old one:
            # XXX Find out how long this is taking.
            # XXX Should we split the cache into per site?
            t = time.time()
            cache[sitefeedurl] = newfeedcache
            with open(cachefile, 'w') as cachefp:
                cPickle.dump(cache, cachefp)
            print >>sys.stderr, "Writing cache took", time.time() - t, "seconds"
        elif verbose:
            print >>sys.stderr, feedname, ": Not updating cache file"

    else:
        print >>sys.stderr, feedname, ": no new content"

#
# Find the cache file and load it, but don't parse yet
#
def init_cache():
    """Locate the cache file and load it.

    The cache lives in $XDG_CACHE_HOME/feedme/feedme.dat, or in
    ~/.cache/feedme/feedme.dat when XDG_CACHE_HOME is unset or empty.

    Returns (cache, cachefile): the unpickled cache object and the path
    of the cache file. When no cache file exists yet, its directory is
    created and an empty dict is returned.

    Exits with status 1 if the cache file exists but is not writable or
    cannot be loaded. A cache that loads successfully is copied to
    cachefile + ".bak"; one that fails to load is not, so the previous
    backup survives.
    """
    cachehome = os.environ.get('XDG_CACHE_HOME')
    if cachehome:
        cachedir = os.path.join(cachehome, 'feedme')
    else:
        cachedir = os.path.join(os.environ['HOME'], '.cache', 'feedme')
    cachefile = os.path.join(cachedir, 'feedme.dat')

    if not os.path.exists(cachefile):
        # First run: just make sure there's somewhere to write it later.
        if not os.path.exists(cachedir):
            os.makedirs(cachedir)
        return {}, cachefile

    if not os.access(cachefile, os.W_OK):
        sys.stderr.write("Error: can't write cache file %s\n" % cachefile)
        sys.exit(1)

    # NOTE: unpickling can execute arbitrary code, so the cache file
    # must only ever be one that feedme itself wrote.
    try:
        with open(cachefile) as cachefp:
            cache = cPickle.load(cachefp)
    except Exception as e:
        # A damaged pickle can raise almost anything.
        sys.stderr.write("Error: can't load cache file %s: %s\n"
                         % (cachefile, e))
        sys.stderr.write("Any backup in %s.bak has been left alone.\n"
                         % cachefile)
        sys.exit(1)

    # Make a backup of the cache file, in case something goes wrong --
    # but only now that we know this copy is readable.
    shutil.copy2(cachefile, cachefile + ".bak")

    return cache, cachefile

#
# Main -- read the config file and loop over sites.
#
if __name__ == '__main__':
    # Script entry point: parse the command line, read the configuration,
    # fetch every requested feed, print a summary of the messages and
    # errors collected along the way, then remove old feed directories.
    from optparse import OptionParser

    usage = """Usage: %prog [site ...]
If no site is specified, feedme will update all the feeds in
~/.config/feedme.conf."""
    # VersionString already includes the version number.
    LongVersion = VersionString + ": an RSS feed reader.\n\
Copyright 2011 by Akkana Peck; share and enjoy under the GPL v2 or later."

    optparser = OptionParser(usage=usage, version=LongVersion)
    optparser.add_option("-n", "--nocache",
                         action="store_true", dest="nocache",
                         help="Don't consult the cache, or update it")
    optparser.add_option("-s", "--show-sites",
                         action="store_true", dest="show_sites",
                         help="Show available sites")
    optparser.add_option("-l", "--log", metavar="logfile",
                         action="store", dest="log_file_name",
                         help="Save output to a log file")
    (options, args) = optparser.parse_args()

    config = feedmeparser.read_config_file()

    msglog = MsgLog()

    sections = config.sections()

    if options.show_sites:
        for feedname in sections:
            print(feedname)
        sys.exit(0)

    if options.nocache:
        cache = None
        cachefile = None
    else:
        cache, cachefile = init_cache()

    # A log file named on the command line wins over the configured one.
    logfilename = options.log_file_name or config.get('DEFAULT', 'logfile')
    stderrsav = sys.stderr
    outputlog = None
    if logfilename:
        logfilename = sub_tilde(logfilename)
        # Set up a tee to a log file, and redirect stderr there:
        print("teeing output to " + logfilename)
        outputlog = open(logfilename, "w")
        sys.stderr = tee(stderrsav, outputlog)

    try:
        try:
            if len(args) == 0:
                for feedname in sections:
                    get_feed(feedname, config, cache, cachefile, msglog)
            else:
                for arg in args:
                    print >>sys.stderr, 'Getting feed for', arg
                    get_feed(arg, config, cache, cachefile, msglog)

        # This causes a lot of premature exits. Not sure why we end up
        # here rather than in the inner KeyboardInterrupt section.
        except KeyboardInterrupt:
            print >>sys.stderr, "Caught keyboard interrupt at the wrong time!"
            # To stderr like everything else, so it reaches the log file.
            print >>sys.stderr, traceback.format_exc()
        except OSError as e:
            print >>sys.stderr, "Caught an OSError"
            print >>sys.stderr, e

        # Dump any messages and errors we encountered.
        for heading, text in (("\n===== Messages ====", msglog.get_msgs()),
                              ("\n====== Errors =====", msglog.get_errs())):
            if not text:
                continue
            if isinstance(text, unicode):
                text = text.encode('utf-8', 'backslashreplace')
            print >>sys.stderr, heading
            print >>sys.stderr, text

        # Clean up old directories:
        clean_up(config)
    finally:
        # Put the real stderr back before the tee goes away, and make
        # sure everything written so far reaches the log file.
        if outputlog is not None:
            sys.stderr.flush()
            sys.stderr = stderrsav
            outputlog.close()

