User:Gdr/history.py
#!/usr/bin/python
#
#
# HISTORY.PY -- WIKIPEDIA PAGE HISTORY
# Gdr, 2005-05-12
#
#
# INTRODUCTION
#
# This Python library analyzes the history of articles on the English
# Wikipedia.
#
# You must have the Python Wikipedia Robot Framework
# (http://sourceforge.net/projects/pywikipediabot/).
#
#
# LICENCE
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at
# your option) any later version.

import calendar
import re
import time
import wikipedia

edit1_re = re.compile(r'name="oldid" value="([0-9]+)"'
                      r'.* title="[^\"]*">([^<]*[0-9][^<]*)</a>'
                      r'.* title="(?:(User:[^\"]+)|Special:Contributions)">')

edit2_re = re.compile(r'.* title="[^\"]*">([^<]*[0-9][^<]*)</a>'
                      r'.* title="(?:(User:[^\"]+)|Special:Contributions)">')

months = {
    'Jan':  1, 'January':   1,
    'Feb':  2, 'February':  2,
    'Mar':  3, 'March':     3,
    'Apr':  4, 'April':     4,
    'May':  5,
    'Jun':  6, 'June':      6,
    'Jul':  7, 'July':      7,
    'Aug':  8, 'August':    8,
    'Sep':  9, 'September': 9,
    'Oct': 10, 'October':  10,
    'Nov': 11, 'November': 11,
    'Dec': 12, 'December': 12,
}

def dateParse(date):
    # Current time supplies default values.
    tm = list(time.gmtime()[:5]) + [0]
    # Use slot-filling approach to guess fields.
    fields = re.compile(r'[^\w:]+', re.U).split(date)
    for field in fields:
        if re.compile(r'^[0-9][0-9][0-9][0-9]$').match(field):
            # Four digits is a year
            tm[0] = int(field)
        elif re.compile(r'^[0-9][0-9]$').match(field):
            # Two digits is a day
            tm[2] = int(field)
        elif re.compile(r'^[0-9]$').match(field):
            # One digit is a day
            tm[2] = int(field)
        elif re.compile(r'^[0-9][0-9]:[0-9][0-9]$').match(field):
            # 2:2 digits is a time
            tm[3] = int(field[0:2])
            tm[4] = int(field[3:5])
        elif months.has_key(field):
            # A month name
            tm[1] = months[field]
    return calendar.timegm(tm)

def historyParse(edit):
    m = edit1_re.search(edit)
    if m:
        return { 'oldid': m.group(1),
                 'date': dateParse(m.group(2)),
                 'user': m.group(3) }
    m = edit2_re.search(edit)
    if m:
        return { 'date': dateParse(m.group(1)),
                 'user': m.group(2) }
    raise wikipedia.Error("Can't parse edit:\n" + edit)
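# Illustrative example, not part of the original file: the slot-filling rules
# in dateParse() mean that a typical history timestamp such as
# "08:15, 12 May 2005" fills the time, day, month and year slots and is
# converted to seconds since the epoch, so that
#
#     time.gmtime(dateParse('08:15, 12 May 2005'))[:5]
#
# gives (2005, 5, 12, 8, 15). historyParse() feeds the date text captured by
# the regular expressions above through the same function.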
def historyPage(page, limit = None, offset = None):
    """historyPage(page, limit = None, offset = None)

    Get the history of the article given by 'page'. Optional arguments:
    'limit' specifies the maximum number of edits to return, and
    'offset' says where to start in the history.

    Returns the history as a list of dictionaries, one per edit in the
    history, with keys 'oldid' - the id of the revision following the
    edit, if known (in MediaWiki 1.4 the current revision has no id),
    'date' - the time of the edit as a number of seconds since the
    epoch, and 'user' - the user who made the edit."""
    # Wait if we are too soon after the previous request, so that we
    # keep to an acceptable interval between requests.
    wikipedia.get_throttle()
    # Which web-site host are we submitting to?
    host = page.site().hostname()
    # Get the address of the history page on that host.
    address = '/w/index.php?title=%s&action=%s' % (page.urlname(), 'history')
    if limit:
        address = address + '&limit=%d' % limit
    if offset:
        address = address + '&offset=%d' % offset
    # Get the page.
    wikipedia.output(u"Getting history for %s" % page.linkname())
    text, charset = wikipedia.getUrl(host, address)
    # Extract the edit items.
    m = re.compile(r'<ul id="pagehistory"><li>(.*)</li></ul>', re.M).search(text)
    if not m:
        raise wikipedia.Error("Can't find the list of edits:" + text)
    return map(historyParse, m.group(1).split('</li><li>'))

def getOldRevision(page, oldid):
    """getOldRevision(page, oldid)

    Returns revision 'oldid' of the article given by 'page'."""
    wikipedia.get_throttle()
    host = page.site().hostname()
    address = page.site().edit_address(page.urlname()) + '&oldid=%s' % oldid
    print "address = ", address
    text, charset = wikipedia.getUrl(host, address, page.site())
    return unicode(wikipedia.unescape(re.search('<textarea[^>]*>(.*)</textarea>',
                                                text, re.S).group(1)).rstrip(),
                   charset, errors = 'replace')
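# A minimal usage sketch, not part of the original library. It assumes 'page'
# is a page object created with the installed Python Wikipedia Robot Framework
# (using whatever constructor that framework version provides); only
# historyPage() and getOldRevision() above are defined in this file, and
# printRecentEdits is a hypothetical helper shown purely for illustration.
def printRecentEdits(page, limit = 5):
    # Report the most recent edits of 'page' and fetch the text of the
    # oldest listed revision that still has a revision id.
    history = historyPage(page, limit = limit)
    for edit in history:
        wikipedia.output(u"%s edited at %s"
                         % (edit['user'],
                            time.asctime(time.gmtime(edit['date']))))
    oldids = [e['oldid'] for e in history if e.has_key('oldid')]
    if oldids:
        wikipedia.output(getOldRevision(page, oldids[-1]))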