# User:Gdr/yearbot.py
# From Wikipedia, the free encyclopedia
# < User:Gdr
#!/usr/bin/python
#
#
#              YEARBOT.PY -- POPULATE BIRTHS/DEATHS IN YEAR
#
#                           Gdr, 2005-05-14
#                 Minor updates: User:Docu, 2006-12-17
#
#
# INTRODUCTION
#
# This script assists with the population of the "Births" and "Deaths"
# sections of an article about a year in the English wikipedia, using
# articles in [[Category:<year> births]] and [[Category:<year> deaths]].
#
#
# USAGE
#
# See [[User:Gdr/Yearbot]]
# requires [[User:Gdr/history.py]]
#
#
# DATA STRUCTURES
#
# An entry is a dictionary with these fields:
#
# article   Name of article.
# bdate     Date of birth, as a pair like ('April 17', '0417').
# byear     Birth year, as string like '1543'
# ddate     Date of death, as a pair like ('September 23', '0923').
# dyear     Death year, as string like '1602'
# exclude   1 if article is to be excluded from the page.
# intro     Introductory paragraph of article, if any is found.
# linktext  Text displayed for the article link, if it is piped.
# pagelink  wikipedia.Page object referring to article.
# post      String placed after the article link.
# pre       String placed before the article link.
# sort      Sort key, if any.
# sortkey   Sort key taken from the article's {{lived}} template or
#           births/deaths category link, if any.
# desc      Description extracted from article (used as text for 'post'
#           if entry is new).
#
#
# LICENCE
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at
# your option) any later version.

import catlib
import codecs
import getopt
import history
import re
import sys
import wikipedia


class Year:
    """Interactive helper that rebuilds the Births/Deaths lists of a year page.

    Entries already listed on the year page are merged with the members
    of the "<year> births" and "<year> deaths" categories, each article is
    scanned for dates and a short description, and the operator then
    edits the resulting lists before the page is saved.
    """

    site = wikipedia.Site('en')

    # List of regexp search-and-replace patterns that should be applied
    # to all descriptions.
    patterns = []

    # The year we are working on, its Page, and the original text.
    year = None
    year_pl = None
    year_orig = None
    year_text = None

    ignore = {
        'Special:Categories': 1,
        }

    # Matches a regexp pattern.
    pattern_re = r'/((?:[^\\]|\\.)*)/((?:[^\\]|\\.)*)/$'

    # File to store patterns.
    pattern_file = 'yearbot-patterns'

    # Matches a year in the range for which the script operates.
    year_re = r'1[0-9][0-9][0-9]'

    # Matches a trailing birth date.
    trail_born_re = re.compile(r'^(.*\S)\s*\(b(?:\.|orn)\s*\[?\[?('
                               + year_re + r')\]?\]?\)$')

    # Matches a trailing death date.
    trail_died_re = re.compile(r'^(.*\S)\s*\(d(?:\.|ied)\s*\[?\[?('
                               + year_re + r')\]?\]?\)$')

    # Matches a month name.
    month_re = (r'January|February|March|April|May|June|'
                r'July|August|September|October|November|December')

    # Matches a date.  Groups: (month, day) or (day, month).
    date_re = (r'\[?\[?(?:(' + month_re + r')\s+([0-9]+)|([0-9]+)\s*('
               + month_re + r'))\]?\]?')

    # Matches an entry starting with a date.  The dashes are written as
    # escapes in a unicode pattern so that they match the unicode page
    # text and the source stays pure ASCII; the HTML entity spellings of
    # the same dashes are accepted too.
    entry_date_re = re.compile(
        r'^\s*' + date_re
        + u'\\s*(?:-|\u2013|\u2014|&ndash;|&mdash;)?\\s*(.*)$')

    # Matches an entry: captures pre, article, linktext, post.
    entry_re = re.compile(r'([^\[]*)\[\[([^\]|]+)(?:\|([^|\]]*))?\]\](.*)')

    # Matches the introductory paragraph of an article, once filled in
    # with birth year and death year.
    intro1_re = r"^.*'''[^']+'''(.*?)\[?\[?%s\]?\]?(.*?)\[?\[?%s\]?\]?\W*(.*)$"
    intro2_re = r"^.*'''[^']+'''[^\(]*\([^\)]+\)(.*)$"

    # Matches description.
    desc_re = r'\s+(?:(?:the|an?)\s+)?(([^,.!?\[]|\[\[[^\]]+\]\])+)[,.!?]'
    desc1_re = re.compile(r'\)\s*was' + desc_re)
    desc2_re = re.compile(r'\),' + desc_re)
    desc3_re = re.compile(r'\s+was' + desc_re)
    desc4_re = re.compile(r',' + desc_re)

    # Matches wiki-link
    link1_re = re.compile(r'\[\[[^|\]]+\|([^|\]]+)\]\]')
    link2_re = re.compile(r'\[\[([^|\]]+)\]\]')

    # Approximate date?
    approx_re = re.compile(r'\bc(?:\.|a\.|irca)')

    # Matches {{lived|b=<year>|d=<year>|key=<sort key>}}.  The braces and
    # pipes are escaped: an unescaped '|' is regexp alternation, which
    # made the pattern match a bare "{{lived" with every group empty.
    lived_re = re.compile(r'\{\{[Ll]ived\|\s*b\s*=\s*(' + year_re
                          + r')\s*\|\s*d\s*=\s*(' + year_re
                          + r')\s*\|\s*key\s*=\s*(.*?)\s*\}\}')

    # Match the births/deaths category links; group 2 is the optional
    # sort key after the pipe.
    births_cat_re = re.compile(r'\[\[[Cc]ategory:(' + year_re
                               + r') births(?:\|([^|\]]+))?\]\]')
    deaths_cat_re = re.compile(r'\[\[[Cc]ategory:(' + year_re
                               + r') deaths(?:\|([^|\]]+))?\]\]')

    def __init__(self, year):
        """Set up the bot for `year` and load any saved patterns.

        `year` is the title of the year article, as a string.  A value
        that does not match `year_re` only produces a warning.  Saved
        patterns are read from `pattern_file`; a missing or unreadable
        file simply means there are no patterns yet.
        """
        if not re.match(r'^' + self.year_re + r'$', year):
            wikipedia.output(u"%s doesn't look like a year" % year)
        self.year = year
        self.year_pl = wikipedia.Page(self.site, self.year)
        self.patterns = []
        try:
            f = codecs.open(self.pattern_file, 'r', 'utf-8')
        except IOError:
            # No pattern file (for example on the first run).
            return
        try:
            for line in f:
                m = re.match(self.pattern_re, line)
                if m:
                    self.patterns.append(m.groups())
        finally:
            f.close()

    def save_patterns(self):
        """Write all patterns to `pattern_file`, one /from/to/ per line.

        The file is written as UTF-8 so that patterns containing
        non-ASCII characters can be saved.
        """
        try:
            f = codecs.open(self.pattern_file, 'w', 'utf-8')
        except IOError:
            wikipedia.output(u"Couldn't write %s" % self.pattern_file)
            return
        try:
            for p in self.patterns:
                f.write(u'/%s/%s/\n' % (p[0], p[1]))
        finally:
            f.close()

    def apply_patterns(self):
        """Apply every saved pattern to every entry of every topic.

        The substitution is made in the entry's 'post' text if it has
        one, otherwise in its 'desc' text.
        """
        for entries in self.topic_entries.values():
            for e in entries:
                for p in self.patterns:
                    if 'post' in e:
                        e['post'] = re.sub(p[0], p[1], e['post'])
                    elif 'desc' in e:
                        e['desc'] = re.sub(p[0], p[1], e['desc'])

    def unwikify(self, text):
        """Return `text` with [[a|b]] replaced by b and [[a]] by a."""
        text = self.link1_re.sub(r'\1', text)
        text = self.link2_re.sub(r'\1', text)
        return text

    def make_date(self, m):
        """Turn a `date_re` match into a date pair.

        Returns a pair like ('April 17', '0417'): the display form and
        a month/day sort key built from history.months.
        """
        month = m.group(1) or m.group(4)
        day = m.group(2) or m.group(3)
        return ('%s %s' % (month, day),
                '%02d%02d' % (history.months[month], int(day)))

    def _find_section(self, what, text):
        """Search `text` for the ==<What>== section; group 1 is its list."""
        return re.search(r'==\s*' + what.capitalize()
                         + r'\s*==\n((?:\s*\n|\*.*\n)*)', text)

    def parse_entries(self, what):
        """Parse the `what` section of the year page into entries.

        Returns a list of entry dictionaries, one for each list item
        that contains a wiki-link.  Non-blank lines that cannot be
        parsed are reported and skipped.  Returns [] if the page has no
        such section.
        """
        m = self._find_section(what, self.year_pl.get())
        if not m:
            wikipedia.output(u"No ==%s==" % what.capitalize())
            return []
        lines = re.split(r'\s*\n\s*', m.group(1))
        entries = []
        for line_orig in lines:
            entry = {}
            line = re.sub(r'^\*\s*', '', line_orig)

            # Leading date.
            m = self.entry_date_re.match(line)
            if m:
                date = self.make_date(m)
                if what == 'births':
                    entry['bdate'] = date
                elif what == 'deaths':
                    entry['ddate'] = date
                else:
                    entry['?date'] = date
                line = m.group(5)

            # Trailing "(born <year>)" or "(died <year>)".
            m = self.trail_born_re.match(line)
            if m:
                entry['byear'] = m.group(2)
                line = m.group(1)
            m = self.trail_died_re.match(line)
            if m:
                entry['dyear'] = m.group(2)
                line = m.group(1)

            # The link itself and the text around it.
            m = self.entry_re.match(line)
            if m:
                entry['pre'] = m.group(1)
                entry['article'] = m.group(2)
                if m.group(3):
                    entry['linktext'] = m.group(3)
                entry['post'] = m.group(4)
                entries.append(entry)
            elif not re.match(r'^\s*$', line_orig):
                wikipedia.output(u"Couldn't parse %s" % line_orig)
        return entries

    def check_entry(self, entry, key, what, value):
        """Store `value` under `key` in `entry`, unless `value` is None.

        If the entry already holds a different value it is overwritten
        and the mismatch is reported, with `what` naming the field.
        """
        if value is not None:
            if key in entry and entry[key] != value:
                wikipedia.output(u"%s '%s' fails to match '%s'; "
                                 u"discarding the former."
                                 % (what, entry[key], value))
            entry[key] = value

    def parse_article(self, entry, what, entries=None):
        """Fill in `entry` from the text of its article.

        Sets 'byear', 'dyear' and 'sortkey' from the {{lived}} template
        or, failing that, from the births/deaths category links; then
        'intro', 'bdate', 'ddate', 'exclude' and 'desc' from the
        introductory paragraph.  Does nothing if the article is missing
        or is a redirect.  `what` is 'births' or 'deaths'.  `entries` is
        unused and kept only so that existing callers still work.
        """
        intro = None
        try:
            text = entry['pagelink'].get()
        except (wikipedia.IsRedirectPage, wikipedia.NoPage):
            return

        # Look for {{lived}} template.
        m = self.lived_re.search(text)
        if m:
            self.check_entry(entry, 'byear', 'birth year', m.group(1))
            self.check_entry(entry, 'dyear', 'death year', m.group(2))
            self.check_entry(entry, 'sortkey', 'sort key', m.group(3))
        else:
            # Get birth year from category, if possible.
            m = self.births_cat_re.search(text)
            if m:
                self.check_entry(entry, 'byear', 'birth year', m.group(1))
                self.check_entry(entry, 'sortkey', 'sort key', m.group(2))
            else:
                wikipedia.output(u"%s has no Category:births"
                                 % entry['article'])

            # Get death year from category, if possible.
            m = self.deaths_cat_re.search(text)
            if m:
                self.check_entry(entry, 'dyear', 'death year', m.group(1))
                self.check_entry(entry, 'sortkey', 'sort key', m.group(2))
            else:
                wikipedia.output(u"%s has no Category:deaths"
                                 % entry['article'])

        # Find introductory paragraph.
        m = re.search(self.intro1_re % (entry.get('byear') or self.year_re,
                                        entry.get('dyear') or self.year_re),
                      text, re.M)
        if m:
            entry['intro'] = m.group(0)
            intro = m.group(3)

            # Birth date available in intro?
            mm = re.search(self.date_re, m.group(1))
            if mm:
                self.check_entry(entry, 'bdate', 'birth date',
                                 self.make_date(mm))

            # Birth date approximate?
            if self.approx_re.search(m.group(1)) and what == 'births':
                entry['exclude'] = True

            # Death date available in intro?
            mm = re.search(self.date_re, m.group(2))
            if mm:
                self.check_entry(entry, 'ddate', 'death date',
                                 self.make_date(mm))

            # Death date approximate?
            if self.approx_re.search(m.group(2)) and what == 'deaths':
                entry['exclude'] = True
        else:
            m = re.search(self.intro2_re, text, re.M)
            if m:
                entry['intro'] = m.group(0)
                intro = m.group(1)
            else:
                # Use first line instead.
                entry['intro'] = text.split('\n')[0]

        # Brief description available?
        mm = None
        if intro:
            mm = (self.desc3_re.match(intro)
                  or self.desc4_re.match(intro))
        mm = (mm
              or self.desc1_re.search(entry['intro'])
              or self.desc2_re.search(entry['intro'])
              or self.desc3_re.search(entry['intro'])
              or self.desc4_re.search(entry['intro']))
        if mm:
            entry['desc'] = self.unwikify(mm.group(1))

    def get_entries(self, what):
        """Collect and parse all entries for `what` ('births' or 'deaths').

        Combines the entries already on the year page with the articles
        in the "<year> <what>" category, fetches the articles, re-keys
        entries whose article is a redirect, and parses each article.
        Returns the list of entries.
        """
        # Get entries from the section of the year page.
        entries = self.parse_entries(what)
        article_entry = {}
        for entry in entries:
            article_entry[entry['article']] = entry

        # Get lists of births and deaths articles for this year.
        cl = catlib.Category(self.site, '%s %s' % (self.year, what))
        for a in cl.articles():
            if a.title() not in self.ignore and a.title() not in article_entry:
                article_entry[a.title()] = {'article': a.title()}

        # Get them all.
        for e in article_entry.values():
            e['pagelink'] = wikipedia.Page(self.site, e['article'])
        wikipedia.getall(self.site,
                         [e['pagelink'] for e in article_entry.values()])

        # Merge redirects.  Iterate over a snapshot because the
        # dictionary is modified inside the loop.
        for e in list(article_entry.values()):
            try:
                e['pagelink'].get()
            except wikipedia.IsRedirectPage:
                # The first exception argument is the redirect target.
                arg = sys.exc_info()[1]
                pl = wikipedia.Page(self.site, arg.args[0])
                redir = pl.title()
                wikipedia.output(u"%s redirects to %s"
                                 % (e['article'], redir))
                if redir in article_entry:
                    e['pagelink'] = article_entry[redir]['pagelink']
                    del article_entry[redir]
                else:
                    e['pagelink'] = pl
                del article_entry[e['article']]
                article_entry[redir] = e
                e['article'] = redir
            except wikipedia.NoPage:
                continue

        # Parse articles.
        for e in article_entry.values():
            self.parse_article(e, what)
        return list(article_entry.values())

    def guess_sortkey(self, article):
        """Guess a sort key: 'First Middle Last' becomes 'Last, First Middle'."""
        words = article.split(' ')
        if 1 < len(words):
            return words[-1] + u', ' + u' '.join(words[:-1])
        else:
            return article

    def sort_entries(self, entries, what):
        """Set each entry's 'sort' field and sort `entries` in place.

        The key is the month/day of the birth (or death) date if known,
        else the entry's 'sortkey', else a guess from the article name.
        """
        date_field = {'births': 'bdate', 'deaths': 'ddate'}.get(what)
        for e in entries:
            key = None
            if date_field in e:
                key = e[date_field][1]
            e['sort'] = (key or e.get('sortkey')
                         or self.guess_sortkey(e['article']))
        entries.sort(key=lambda e: e['sort'])

    def format_entry(self, entry, what):
        """Return the wiki-text list item for `entry`.

        Excluded entries start with '- ' instead of '* '.
        """
        if entry.get('exclude'):
            t = u'- '
        else:
            t = u'* '
        if what == 'births' and 'bdate' in entry:
            t = t + u'[[%s]] - ' % entry['bdate'][0]
        elif what == 'deaths' and 'ddate' in entry:
            t = t + u'[[%s]] - ' % entry['ddate'][0]
        t = t + (entry.get('pre') or u'')
        if 'linktext' in entry:
            t = t + u'[[%s|%s]]' % (entry['article'], entry['linktext'])
        elif entry['article'].endswith(')'):
            # Empty link text after the pipe for titles ending in ')'.
            t = t + u'[[%s|]]' % entry['article']
        else:
            t = t + u'[[%s]]' % entry['article']
        if 'post' in entry:
            t = t + entry['post']
        elif 'desc' in entry:
            t = t + u', ' + entry['desc']
        if what == 'births' and 'dyear' in entry:
            t = t + u' (died [[%s]])' % entry['dyear']
        elif what == 'deaths' and 'byear' in entry:
            t = t + u' (born [[%s]])' % entry['byear']
        return t

    def write_entries(self, entries, what):
        """Return the page text with the `what` section list replaced.

        Excluded entries are left out.  If the page has no such section
        the text is returned unchanged; returning an empty string here
        would make run() offer to save a blank page.
        """
        if not self.year_text:
            self.year_text = self.year_pl.get()
        text = self.year_text
        m = self._find_section(what, text)
        if not m:
            wikipedia.output(u"No ==%s==" % what.capitalize())
            return text
        kept = [self.format_entry(e, what)
                for e in entries if not e.get('exclude')]
        return (text[:m.start(1)]
                + u'\n'.join(kept)
                + u'\n\n'
                + text[m.end(1):])

    # NOTE(review): 'v' is listed below but interface() has no handler
    # for it, and 'w' is accepted as an undocumented alias of 's'.
    help_text = u"""
h - Help
l - List entries
v - Preview changes to the page
s - Save changes to the page
q - Quit
/<from>/<to>/ - Edit all entries and save pattern in file
<n>p - Print entry <n>
<n>i - Print introductory paragraph for entry <n>
<n>t - Print whole article text for entry <n>
<n>x - Exclude entry <n> (or include if already excluded)
<n>d:<desc> - Update description for entry <n>
<n>d<m> - Cut description for entry <n> to <m> words
<n>P:<desc> - Update prefix text for entry <n>
<n>/<from>/<to>/ - Edit entry <n> using regexp search-and-replace
"""

    def show_entries(self, title, entries, what):
        """Sort `entries` and print them, numbered from 1, under `title`."""
        wikipedia.output(u'------- %s -------' % title)
        self.sort_entries(entries, what)
        n = 0
        for e in entries:
            n = n + 1
            wikipedia.output(u"%d%s" % (n, self.format_entry(e, what)))

    def _show_entry(self, n, entry, what):
        """Print entry number `n` as it would appear in the list."""
        wikipedia.output(u"%d%s" % (n, self.format_entry(entry, what)))

    def _current_post(self, entry):
        """Return the text after the link: 'post', or ', ' + 'desc', or ''."""
        if entry.get('post'):
            return entry['post']
        if 'desc' in entry:
            return u', ' + entry['desc']
        return u''

    def interface(self, title, entries, what):
        """Run the interactive command loop on one batch of entries.

        Returns True when the operator accepts the batch ('s' or 'w')
        and False when the operator quits ('q').  The other commands
        are described in `help_text`.
        """
        self.show_entries(title, entries, what)
        while 1:
            inp = wikipedia.input(u"-- What now? [hlqs0-9pdtx]")
            m1 = re.match(r'^\s*([0-9]+)\s*([A-Za-z])\s*([0-9]+)$', inp)
            m2 = re.match(r'^\s*([0-9]+)\s*([A-Za-z])\s*(:.*)?$', inp)
            m3 = re.match(r'^\s*([0-9]+)\s*' + self.pattern_re, inp)
            m4 = re.match(r'^\s*' + self.pattern_re, inp)
            # The three numbered forms cannot match the same input.
            numbered = m1 or m2 or m3
            if inp == 'l':
                self.show_entries(title, entries, what)
            elif inp == 'q':
                return False
            elif inp == 's' or inp == 'w':
                return True
            elif inp == 'h':
                wikipedia.output(self.help_text)
            elif numbered:
                n = int(numbered.group(1))
                if n < 1 or len(entries) < n:
                    wikipedia.output(u"No entry %d (must be 1-%d)"
                                     % (n, len(entries)))
                    continue
                entry = entries[n - 1]
                if m1:
                    # <n>d<m>: cut description to <m> words.
                    if m1.group(2) == 'd':
                        n2 = int(m1.group(3))
                        desc = self._current_post(entry)
                        # The text starts with ', ', so the first piece
                        # of the split is the comma, not a word.
                        entry['post'] = ' '.join(desc.split(' ')[:n2 + 1])
                        self._show_entry(n, entry, what)
                    else:
                        wikipedia.output(u"Not understood: %s" % inp)
                elif m2:
                    op = m2.group(2)
                    # Text after the colon; empty if none was given.
                    arg = (m2.group(3) or u':')[1:]
                    if op == 'p':
                        for k, v in entry.items():
                            wikipedia.output(u'  %s: %s' % (k, v))
                    elif op == 'd':
                        if arg:
                            entry['post'] = u', ' + arg
                        else:
                            entry['post'] = ''
                        self._show_entry(n, entry, what)
                    elif op == 'P':
                        entry['pre'] = arg
                        self._show_entry(n, entry, what)
                    elif op == 't':
                        try:
                            wikipedia.output(entry['pagelink'].get())
                        except (wikipedia.NoPage, wikipedia.IsRedirectPage):
                            wikipedia.output(u"No page %s"
                                             % entry['pagelink'].title())
                    elif op == 'i':
                        wikipedia.output(entry.get('intro', u'No intro'))
                    elif op == 'x':
                        entry['exclude'] = not entry.get('exclude')
                        self._show_entry(n, entry, what)
                    else:
                        wikipedia.output(u"Not understood: %s" % inp)
                else:
                    # <n>/<from>/<to>/: edit one entry.
                    try:
                        entry['post'] = re.sub(m3.group(2), m3.group(3),
                                               self._current_post(entry))
                    except re.error:
                        wikipedia.output(u"Bad regexp: %s" % inp)
                    else:
                        self._show_entry(n, entry, what)
            elif m4:
                # /<from>/<to>/: check the pattern before saving it, so
                # that a typo is not written to the pattern file.
                try:
                    re.sub(m4.group(1), m4.group(2), u'')
                except re.error:
                    wikipedia.output(u"Bad regexp: %s" % inp)
                else:
                    self.patterns.append((m4.group(1), m4.group(2)))
                    self.save_patterns()
                    self.apply_patterns()
            else:
                wikipedia.output(u"Not understood: %s" % inp)

    comment = "yearbot - robot-assisted updating of births and deaths"

    topic_names = ['births', 'deaths']

    def run(self):
        """Gather the entries, let the operator edit them, and save.

        Entries are presented in batches of 20.  Quitting from any batch
        ends the run without saving.  After all batches a diff is shown;
        the page is saved if the operator answers 'y', otherwise the
        batches are presented again.
        """
        self.topic_entries = {}
        for what in self.topic_names:
            self.topic_entries[what] = self.get_entries(what)
            self.sort_entries(self.topic_entries[what], what)
        self.apply_patterns()

        while 1:
            for what in self.topic_names:
                entries = self.topic_entries[what]
                for i in range((len(entries) + 19) // 20):
                    efrom = i * 20
                    eto = min(len(entries), (i + 1) * 20)
                    # The batch is a new list holding the same entry
                    # dictionaries, so edits made to them are kept.
                    batch = entries[efrom:eto]
                    title = u'%s (%d-%d)' % (what.capitalize(),
                                             efrom + 1, eto)
                    if not self.interface(title, batch, what):
                        return
                self.sort_entries(entries, what)
                self.year_text = self.write_entries(entries, what)
            wikipedia.showDiff(self.year_pl.get(), self.year_text)
            if wikipedia.input(u"OK? [yN]") == 'y':
                self.year_pl.put(self.year_text, self.comment)
                return


if __name__ == '__main__':
    wikipedia.username = 'yearbot'
    try:
        if len(sys.argv) < 2:
            # String exceptions are not supported by current Pythons.
            raise SystemExit("No year specified")
        Year(sys.argv[1]).run()
    finally:
        wikipedia.stopme()