User:Skagedal/Fafafa/Code
#!/usr/bin/python
#
# http://en.wikipedia.org/wiki/User:Skagedal/Fafafa
#
# This program generates RSS feeds of Wikipedia's Featured Articles,
# Picture of the Day and Selected Anniversaries.
#
# Command line options:
#   --fa    generate featured articles feed
#   --potd  generate picture of the day feed
#   --sa    generate selected anniversaries feed

import sys
import os
import string
import datetime
import time
import urllib
import re
import cPickle
import xml.sax.saxutils

#
# Settings
#

# ...General
settings = {
    'rss_webmaster': 'simon@helgo.net',
    'program_name': 'Fafafa',
    'version': '0.8.1'
}

# ...for Featured Articles
settings_fa = {
    'entries': 20,
    'output_filename': '/home/simon/public_html/wikipedia/fa.xml',
    'cache_filename': '/home/simon/projects/wikipedia/fafafa/fa_cache.pickle',
    'url': 'http://en.wikipedia.org/wiki/Wikipedia:Today%%27s_featured_article/%(month)s_%(day)d%%2C_%(year)d',
    'rss_title': 'Wikipedia Featured Articles',
    'rss_link': 'http://en.wikipedia.org/wiki/Wikipedia:Today%27s_featured_article',
    'rss_description': 'RSS feed of the Wikipedia Featured Articles, generated from HTML by Fafafa: http://en.wikipedia.org/wiki/User:Skagedal/Fafafa'
}

# ...for Picture of the Day
settings_potd = {
    'entries': 20,
    'output_filename': '/home/simon/public_html/wikipedia/potd.xml',
    'cache_filename': '/home/simon/projects/wikipedia/fafafa/potd_cache.pickle',
    'url': 'http://en.wikipedia.org/wiki/Template:Pic_of_the_day',
    'rss_title': 'Wikipedia Picture of the Day',
    'rss_link': 'http://en.wikipedia.org/wiki/Wikipedia:Picture_of_the_day',
    'rss_description': 'RSS feed of the Wikipedia Picture of the Day, generated from HTML by Fafafa: http://en.wikipedia.org/wiki/User:Skagedal/Fafafa',
    'potd': True
}

# ...for Selected Anniversaries
settings_sa = {
    'entries': 20,
    'output_filename': '/home/simon/public_html/wikipedia/sa.xml',
    'cache_filename': '/home/simon/projects/wikipedia/fafafa/sa_cache.pickle',
    'url': 'http://en.wikipedia.org/wiki/Wikipedia:Selected_anniversaries/%(month)s_%(day)d',
    'rss_title': 'Wikipedia: On This Day',
    'rss_link': 'http://en.wikipedia.org/wiki/Wikipedia:Selected_anniversaries',
    'rss_description': 'RSS feed of the Wikipedia Selected Anniversaries, generated from HTML by Fafafa: http://en.wikipedia.org/wiki/User:Skagedal/Fafafa',
    'no_title': True
}

# Find the URL of the FA article for a specific date.
#
# ASSUMPTION: Featured articles for a specific day, say May 30, 2006, can be found at:
# [[Wikipedia:Today's featured article/May_30, 2006]]

months = ["January", "February", "March", "April", "May", "June",
          "July", "August", "September", "October", "November", "December"]

def get_url(date):
    return settings['url'] % \
        {'month': months[date.month - 1], 'day': date.day, 'year': date.year}

# Subclass of URLopener - sets "User-agent:", which Wikipedia requires to be set
# to something other than the default "Python-urllib"
class MyURLopener(urllib.URLopener):
    version = settings['program_name'] + "/" + settings['version']

def too_old(date):
    return (datetime.date.today() - date).days > settings['entries']

# Caching of HTML from Wikipedia

class CacheItem:
    def __init__(self, html, fetchtime):
        self.html = html
        self.fetchtime = fetchtime

class WPCache:
    def __init__(self, cachefilename):
        self.url_opener = MyURLopener()
        self.filename = cachefilename
        if os.path.exists(cachefilename):
            file = open(cachefilename)
            self.cache = cPickle.load(file)
            file.close()
        else:
            self.cache = {}

    def get_html(self, date):
        if date in self.cache:
            return self.cache[date].html
        else:
            # The Picture of the Day page only shows the current day's
            # picture, so don't try to fetch it for any other date.
            if 'potd' in settings and settings['potd'] and date != datetime.date.today():
                return False
            html = self.url_opener.open(get_url(date)).read()
            cacheitem = CacheItem(html, time.gmtime())
            self.cache[date] = cacheitem
            return html

    # Weed out old entries, so cache doesn't get big
    def weed_out_old(self):
        self.cache = dict([x for x in self.cache.items() if not too_old(x[0])])

    def save(self):
        self.weed_out_old()
        file = open(self.filename, "w")
        p = cPickle.Pickler(file)
        p.dump(self.cache)
        file.close()

# Get the content of the article
#
# ASSUMPTION: Content of article is between <!-- start content --> and <!-- end content -->

re_content = re.compile(r'<!--\s*start\s+content\s*-->(.*)<!--\s*end\s+content\s*-->', re.DOTALL)

def get_content(s):
    m = re_content.search(s)
    return m.group(1)

# Get title of article - expects html filtered by get_content
#
# ASSUMPTION:
# * The text inside the first bolded a-tag is the title
# ** If that can't be found, the first bolded text is the title
# *** If that can't be found, the first a-tag is the title
# **** If all else fails, return '(unknown title)'

res_title = [re.compile('<b><a[^>]*>([^<]*)</a>'),
             re.compile('<b>([^<]*)</b>'),
             re.compile('<a[^>]*>([^<]*)</a>')]

def get_title(s):
    # Recursive helper function: try each regex in turn
    def get_title_r(res, s):
        if res == []:
            return '(unknown title)'
        else:
            try:
                m = res[0].search(s)
                s = m.group(1)
                s = s[0].upper() + s[1:]
                return s
            except AttributeError:  # search() returned None - try the next regex
                return get_title_r(res[1:], s)
    return get_title_r(res_title, s)

# Create RSS item - expects html filtered by get_content
def rss_item(date, content):
    if 'no_title' in settings and settings['no_title']:
        title = "%s %d" % (months[date.month - 1], date.day)
    else:
        title = "%s %d: %s" % (months[date.month - 1], date.day, get_title(content))
    return """<item>
<title>%(title)s</title>
<link>%(url)s</link>
<description>%(escaped_content)s</description>
</item>
""" % {'title': title,
       'url': get_url(date),
       'escaped_content': xml.sax.saxutils.escape(content)}

# Puts the final RSS together
def rss(items):
    return """<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:blogChannel="http://backend.userland.com/blogChannelModule">
<channel>
<title>%(rss_title)s</title>
<link>%(rss_link)s</link>
<description>%(rss_description)s</description>
<language>en-us</language>
<copyright>GNU Free Documentation License</copyright>
<lastBuildDate>%(build_date)s</lastBuildDate>
<docs>http://blogs.law.harvard.edu/tech/rss</docs>
<webMaster>%(webmaster)s</webMaster>
<generator>%(generator)s</generator>
%(items)s
</channel>
</rss>
""" % {'rss_title': settings['rss_title'],
       'rss_link': settings['rss_link'],
       'rss_description': settings['rss_description'],
       'webmaster': settings['rss_webmaster'],
       'build_date': time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime()),
       'items': items,
       'generator': settings['program_name'] + " " + settings['version']}

# Main
def main():
    # Primitive command line parsing
    if '--potd' in sys.argv:
        settings.update(settings_potd)
    elif '--sa' in sys.argv:
        settings.update(settings_sa)
    elif '--fa' in sys.argv:
        settings.update(settings_fa)
    else:
        print "Usage: --potd, --sa or --fa"
        sys.exit(1)

    today = datetime.date.today()
    one_day = datetime.timedelta(days=1)
    cache = WPCache(settings['cache_filename'])
    dates = [today - one_day * x for x in range(settings['entries'])]

    def item(date):
        html = cache.get_html(date)
        if html:
            content = get_content(html)
        else:
            content = ''
        return rss_item(date, content)

    # Iterate over the items
    items = string.join([item(date) for date in dates], "")
    the_rss = rss(items)

    # Write to file
    file = open(settings['output_filename'], "w")
    file.write(the_rss)
    file.close()
    cache.save()

# Don't run if we're imported
if __name__ == '__main__':
    main()
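
Usage sketch (not part of the original script; the file name fafafa.py is an assumption, and the hard-coded paths in the settings dictionaries above would need to be adjusted for another installation). Run under Python 2, each invocation generates one feed:

  python fafafa.py --fa      # Featured Articles feed -> fa.xml
  python fafafa.py --potd    # Picture of the Day feed -> potd.xml
  python fafafa.py --sa      # Selected Anniversaries feed -> sa.xml

Because each run fetches only days missing from the pickle cache and then rewrites the whole XML file, the script suits a daily cron job, for example:

  15 0 * * * python /home/simon/projects/wikipedia/fafafa/fafafa.py --fa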