User:Skagedal/Fafafa/Code

#!/usr/bin/python
#
# http://en.wikipedia.org/wiki/User:Skagedal/Fafafa
#
# This program generates RSS feeds of Wikipedia's Featured Articles, Picture of the Day, and Selected Anniversaries.
#
# Command line options:
#    --fa       generate featured articles feed
#    --potd     generate picture of the day feed
#    --sa       generate selected anniversaries feed
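#
# Example invocation (the script filename here is assumed, not given in the source):
#
#    python fafafa.py --fa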

import sys
import os
import datetime
import time
import urllib
import re
import cPickle
import xml.sax.saxutils

#
# Settings
#

# ...General
settings = {
        'rss_webmaster': 'simon@helgo.net',
        'program_name': 'Fafafa',
        'version': '0.8'
        }

# ...for Featured Articles
settings_fa = {
        'entries': 20,
        'output_filename': '/home/simon/public_html/wikipedia/fa.xml',
        'cache_filename': '/home/simon/projects/wikipedia/fafafa/fa_cache.pickle',
        'url': 'http://en.wikipedia.org/wiki/Wikipedia:Today%%27s_featured_article/%(month)s_%(day)d%%2C_%(year)d',
        'rss_title': 'Wikipedia Featured Articles',
        'rss_link': 'http://en.wikipedia.org/wiki/Wikipedia:Today%%27s_featured_article',
        'rss_description': 'RSS feed of the Wikipedia Featured Articles, generated from HTML by Fafafa: http://en.wikipedia.org/wiki/User:Skagedal/Fafafa'
        }

# ...for Picture of the Day
settings_potd = {
        'entries': 20,
        'output_filename': '/home/simon/public_html/wikipedia/potd.xml',
        'cache_filename': '/home/simon/projects/wikipedia/fafafa/potd_cache.pickle',
        'url': 'http://en.wikipedia.org/wiki/Wikipedia:Picture_of_the_day/%(month)s_%(day)d%%2C_%(year)d',
        'rss_title': 'Wikipedia Picture of the Day',
        'rss_link': 'http://en.wikipedia.org/wiki/Wikipedia:Picture_of_the_day',
        'rss_description': 'RSS feed of the Wikipedia Picture of the Day, generated from HTML by Fafafa: http://en.wikipedia.org/wiki/User:Skagedal/Fafafa'
}

# ...for Selected anniversaries
settings_sa = {
        'entries': 20,
        'output_filename': '/home/simon/public_html/wikipedia/sa.xml',
        'cache_filename': '/home/simon/projects/wikipedia/fafafa/sa_cache.pickle',
        'url': 'http://en.wikipedia.org/wiki/Wikipedia:Selected_anniversaries/%(month)s_%(day)d',
        'rss_title': 'Wikipedia: On This Day',
        'rss_link': 'http://en.wikipedia.org/wiki/Wikipedia:Selected_anniversaries',
        'rss_description': 'RSS feed of the Wikipedia Selected Anniversaries, generated from HTML by Fafafa: http://en.wikipedia.org/wiki/User:Skagedal/Fafafa',
        'no_title': True
}

# Find the URL of the day's article for a specific date
#
# ASSUMPTION: Featured articles for a specific day, say May 30, 2006, can be found at:
# [[Wikipedia:Today's featured article/May_30, 2006]]

months = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]

def get_url(date):
        return settings['url'] % \
                { 'month': months[date.month - 1], 'day': date.day, 'year': date.year }
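
# Illustrative example, assuming the Featured Articles settings are active:
#
#   >>> get_url(datetime.date(2006, 5, 30))
#   'http://en.wikipedia.org/wiki/Wikipedia:Today%27s_featured_article/May_30%2C_2006'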

# Subclass URLopener to set the "User-agent" header, which Wikipedia requires to be
# something other than the default "Python-urllib"

class MyURLopener(urllib.URLopener):
        version = settings['program_name'] + "/" + settings['version']

def too_old(date):
        return (datetime.date.today() - date).days > settings['entries']
        
# Caching of HTML from Wikipedia

class CacheItem:
        def __init__(self, html, fetchtime):
                self.html = html
                self.fetchtime = fetchtime
                        
class WPCache:

        def __init__(self, cachefilename):
                self.url_opener = MyURLopener()
                self.filename = cachefilename
                if os.path.exists(cachefilename):
                        f = open(cachefilename, 'rb')
                        self.cache = cPickle.load(f)
                        f.close()
                else:
                        self.cache = {}
        
        def get_html(self, date):
                if date in self.cache:
                        return self.cache[date].html
                else:
                        html = self.url_opener.open(get_url(date)).read()
                        cacheitem = CacheItem(html, time.gmtime())
                        self.cache[date] = cacheitem
                        return html
                        
        # Weed out old entries so the cache doesn't grow too large
        def weed_out_old(self):
                self.cache = dict([x for x in self.cache.items() if not too_old(x[0])])
                
        def save(self):
                self.weed_out_old()
                f = open(self.filename, "wb")
                cPickle.dump(self.cache, f)
                f.close()
                
# Get the content of the article
#
# ASSUMPTION: Content of article is between <!-- start content --> and <!-- end content -->

re_content = re.compile('<!--\s*start\s+content\s*-->(.*)<!--\s*end\s+content\s*-->', re.DOTALL)
def get_content(s):
        m = re_content.search(s)
        return m.group(1)
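
# Illustrative example of what the regex extracts:
#
#   >>> get_content('<html><!-- start content --><p>Hi</p><!-- end content --></html>')
#   '<p>Hi</p>'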

# Get title of article - expects html filtered by get_content
#
# ASSUMPTION: 
# * The text inside the first bolded a-tag is the title
# ** If that can't be found, the first bolded text is the title
# *** If that can't be found, the first a-tag is the title
# **** If all else fails, return '(unknown title)'

res_title = [re.compile('<b><a[^>]*>([^<]*)</a>'),
        re.compile('<b>([^<]*)</b>'),
        re.compile('<a[^>]*>([^<]*)</a>')]
def get_title(s):
        # Recursive helper function
        def get_title_r(res, s):
                if res == []:
                        return '(unknown title)'
                else:
                        try:
                                m = res[0].search(s)
                                s = m.group(1)
                                s = s[0].upper() + s[1:]
                                return s
                        except (AttributeError, IndexError):
                                # no match (m is None) or an empty match string
                                return get_title_r(res[1:], s)

        return get_title_r(res_title, s)
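
# Illustrative example, matching the first rule (a bolded link); note that the
# first letter is uppercased:
#
#   >>> get_title('<p><b><a href="/wiki/Foo">foo</a></b> is featured.</p>')
#   'Foo'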

# Create RSS item - expects html filtered by get_content

def rss_item(date, content):
        if 'no_title' in settings and settings['no_title']:
                title = "%s %d" % (months[date.month - 1], date.day)
        else:
                title = "%s %d: %s" % (months[date.month - 1], date.day, get_title(content))
        return """<item>

<title>%(title)s</title>

<link>%(url)s</link>

<description>%(escaped_content)s</description>

</item>
""" % { 
        'title': title, 
        'url': get_url(date), 
        'escaped_content': xml.sax.saxutils.escape(content)}
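
# The article HTML is escaped so it can be embedded as literal text in the
# <description> element; feed readers unescape and render it. For example:
#
#   >>> xml.sax.saxutils.escape('<b>Foo</b> & bar')
#   '&lt;b&gt;Foo&lt;/b&gt; &amp; bar'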

# Puts the final RSS together

def rss(items):
        return """<?xml version="1.0" encoding="UTF-8"?>

<rss version="2.0" xmlns:blogChannel="http://backend.userland.com/blogChannelModule">

<channel>
<title>%(rss_title)s</title>
<link>%(rss_link)s</link>
<description>%(rss_description)s</description>
<language>en-us</language>
<copyright>GNU Free Documentation License</copyright>
<lastBuildDate>%(build_date)s</lastBuildDate>
<docs>http://blogs.law.harvard.edu/tech/rss</docs>
<webMaster>%(webmaster)s</webMaster>
<generator>%(generator)s</generator>

%(items)s

</channel>
</rss>
""" % {
        'rss_title': settings['rss_title'], 
        'rss_link': settings['rss_link'],
        'rss_description': settings['rss_description'],
        'webmaster': settings['rss_webmaster'],
        'build_date': time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime()),
        'items': items, 
        'generator': settings['program_name'] + " " + settings['version'] }
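
# The <lastBuildDate> value uses the RFC 822 date format that RSS 2.0 expects,
# e.g. 'Tue, 30 May 2006 12:00:00 +0000'.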

# Main

def main():
        # Primitive command line parsing
        if '--potd' in sys.argv:
                settings.update(settings_potd)
        elif '--sa' in sys.argv:
                settings.update(settings_sa)
        elif '--fa' in sys.argv:
                settings.update(settings_fa)
        else:
                print "Usage: --potd, --sa or --fa" 
                sys.exit(1)

        today = datetime.date.today()
        one_day = datetime.timedelta(days = 1)

        cache = WPCache(settings['cache_filename'])
        
        dates = [today - one_day*x for x in range(settings['entries'])]

        def item(date):
                content = get_content(cache.get_html(date))
                return rss_item(date, content)

        # Build the feed items, newest first
        items = "".join([item(date) for date in dates])
        the_rss = rss(items)

        # Write to file
        f = open(settings['output_filename'], "w")
        f.write(the_rss)
        f.close()
        
        cache.save()

# Don't run if we're imported

if __name__ == '__main__':
        main()