User:Gdr/yearbot.py

From Wikipedia, the free encyclopedia

#!/usr/bin/python
#
#
#             YEARBOT.PY -- POPULATE BIRTHS/DEATHS IN YEAR
#                           Gdr, 2005-05-14
#                           Minor updates: User:Docu, 2006-12-17 
#
# INTRODUCTION
#
# This script assists with the population of the "Births" and "Deaths"
# sections of an article about a year in the English wikipedia, using
# articles in [[Category:<year> births]] and [[Category:<year> deaths]].
#
#
# USAGE
#
# See [[User:Gdr/Yearbot]]
# requires [[User:Gdr/history.py]]
#
# DATA STRUCTURES
#
# An entry is a dictionary with these fields:
#
# article   Name of article.
# bdate     Date of birth, as a pair like ('April 17', '0417').
# byear     Birth year, as string like '1543'.
# ddate     Date of death, as a pair like ('September 23', '0923').
# dyear     Death year, as string like '1602'.
# exclude   True if article is to be excluded from the page.
# intro     Introductory paragraph of article, if any is found.
# linktext  Display text of the article link, if the link is piped.
# pagelink  wikipedia.Page object referring to article.
# post      String placed after the article link.
# pre       String placed before the article link.
# sort      Key the entry is sorted by: the 'MMDD' part of the date if
#           known, otherwise 'sortkey', otherwise a guess made from
#           the article name.
# sortkey   Sort key taken from the article's {{lived}} template or
#           from its births/deaths category link, if any.
# desc      Description extracted from article (used as text for 'post'
#           if entry is new).
#
#
# LICENCE
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at
# your option) any later version.

import catlib
import getopt
import history
import re
import sys
import wikipedia

class Year:
    """Interactive helper that rebuilds the "Births" and "Deaths"
    sections of one year article.

    The attributes below are class-level defaults; __init__ rebinds
    year, year_pl and patterns on each instance.
    """

    # Wiki the bot works on.  NOTE: this runs when the class body is
    # executed, i.e. as soon as the module is loaded.
    site = wikipedia.Site('en')

    # List of regexp search-and-replace patterns that should be applied
    # to all descriptions.  Each item is a (from, to) pair.
    patterns = []

    # The year we are working on (string), its Page, and the page text.
    # year_text is the working copy that write_entries() reads and
    # run() updates; year_orig is not referenced by the code in this
    # file.
    year = None
    year_pl = None
    year_orig = None
    year_text = None

    # Titles that are skipped when they turn up as members of a
    # births/deaths category (only the keys are looked at).
    ignore = {
        'Special:Categories': 1,
        }

    # Matches a search-and-replace command of the form /from/to/ and
    # captures the two parts; a backslash escapes the character that
    # follows it.
    pattern_re = r'/((?:[^\\]|\\.)*)/((?:[^\\]|\\.)*)/$'

    # File to store patterns (one /from/to/ per line), relative to the
    # current working directory.
    pattern_file = 'yearbot-patterns'

    def __init__(self, year):
        """Set up the bot for one year article and load saved patterns.

        year -- title of the year article, as a string like '1543'.

        A warning is printed if `year` does not match year_re, but
        processing continues.  Patterns are read from pattern_file,
        one /from/to/ per line; lines that do not match pattern_re are
        skipped.  A missing or unreadable pattern file is not an error
        (there is none before the first pattern has been saved): the
        bot then simply starts with no patterns.
        """
        if not re.match(r'^' + self.year_re + r'$', year):
            print("%s doesn't look like a year" % year)
        self.year = year
        self.year_pl = wikipedia.Page(self.site, self.year)
        self.patterns = []
        try:
            f = open(self.pattern_file)
        except IOError:
            # Nothing saved yet; keep the empty pattern list.
            return
        try:
            for line in f:
                m = re.match(self.pattern_re, line)
                if m:
                    self.patterns.append(m.groups())
        finally:
            # Release the handle even if reading fails part-way.
            f.close()

    # Matches a year in the range for which the script operates
    # (1000-1999).
    year_re = r'1[0-9][0-9][0-9]'

    # Matches an entry that ends in a birth year, e.g. "(b. 1543)" or
    # "(born [[1543]])".  Group 1 is the text before it, group 2 the
    # year.
    trail_born_re = re.compile(r'^(.*\S)\s*\(b(?:\.|orn)\s*\[?\[?('
                               + year_re + r')\]?\]?\)$')

    # Matches an entry that ends in a death year, e.g. "(d. 1602)" or
    # "(died [[1602]])".  Group 1 is the text before it, group 2 the
    # year.
    trail_died_re = re.compile(r'^(.*\S)\s*\(d(?:\.|ied)\s*\[?\[?('
                               + year_re + r')\]?\]?\)$')

    # Matches a month name.
    month_re = (r'January|February|March|April|May|June|'
                r'July|August|September|October|November|December')

    # Matches a date, optionally wiki-linked, written either as
    # "April 17" (groups 1 and 2) or as "17 April" (groups 3 and 4).
    date_re = (r'\[?\[?(?:(' + month_re + r')\s+([0-9]+)|([0-9]+)\s*('
               + month_re + r'))\]?\]?')

    # Matches an entry starting with a date, followed by an optional
    # hyphen, en dash or em dash; group 5 is the rest of the line.
    # The dashes are written as \u escapes in a unicode literal so that
    # the source stays pure ASCII (no coding declaration needed) and
    # the pattern matches the dash characters in unicode text rather
    # than their individual UTF-8 bytes.
    entry_date_re = re.compile(r'^\s*' + date_re
                               + u'\\s*(?:-|\u2013|\u2014)?\\s*(.*)$')

    # Matches an entry: captures pre, article, linktext, post.
    entry_re = re.compile(r'([^\[]*)\[\[([^\]|]+)(?:\|([^|\]]*))?\]\](.*)')

    # Matches the introductory paragraph of an article, once filled in
    # with birth year and death year (the two %s slots).  intro1_re
    # captures the text before the birth year, the text between the
    # two years, and the text after the death year.  intro2_re is the
    # fallback: a bold name followed by any parenthesised text; it
    # captures what follows the parentheses.
    intro1_re = r"^.*'''[^']+'''(.*?)\[?\[?%s\]?\]?(.*?)\[?\[?%s\]?\]?\W*(.*)$"
    intro2_re = r"^.*'''[^']+'''[^\(]*\([^\)]+\)(.*)$"

    # Matches description: after an optional article ("the", "a",
    # "an"), group 1 is the text up to the next ',', '.', '!' or '?',
    # where wiki-links count as single units.  The four variants
    # differ only in the text that must precede the description.
    desc_re = r'\s+(?:(?:the|an?)\s+)?(([^,.!?\[]|\[\[[^\]]+\]\])+)[,.!?]'
    desc1_re = re.compile(r'\)\s*was' + desc_re)
    desc2_re = re.compile(r'\),' + desc_re)
    desc3_re = re.compile(r'\s+was' + desc_re)
    desc4_re = re.compile(r',' + desc_re)

    # Matches wiki-link: link1_re a piped link (captures the display
    # text), link2_re a plain link (captures the target).
    link1_re = re.compile(r'\[\[[^|\]]+\|([^|\]]+)\]\]')
    link2_re = re.compile(r'\[\[([^|\]]+)\]\]')

    # Approximate date?  Matches "c.", "ca." or "circa".
    approx_re = re.compile(r'\bc(?:\.|a\.|irca)')

    def save_patterns(self):
        """Write every pattern in self.patterns to pattern_file.

        Each pattern is written on its own line as /from/to/, the
        format that __init__ reads back.  If the file cannot be opened
        for writing, a message is printed and the patterns are kept in
        memory only.
        """
        try:
            f = open(self.pattern_file, 'w')
        except IOError:
            # Opening raises rather than returning a false value, so
            # the failure has to be caught here to be reported.
            print("Couldn't write %s" % self.pattern_file)
            return
        try:
            for p in self.patterns:
                f.write(u'/%s/%s/\n' % (p[0], p[1]))
        finally:
            f.close()

    def apply_patterns(self):
        for entries in self.topic_entries.values():
            for e in entries:
                for p in self.patterns:
                    if e.has_key('post'):
                        e['post'] = re.sub(p[0], p[1], e['post'])
                    elif e.has_key('desc'):
                        e['desc'] = re.sub(p[0], p[1], e['desc'])

    def unwikify(self, text):
        text = self.link1_re.sub(r'\1', text)
        text = self.link2_re.sub(r'\1', text)
        return text

    def make_date(self, m):
        """Turn a date_re match into a (display, sort key) pair.

        m -- match object whose groups 1-4 are those of date_re:
             month and day for "April 17", or day and month (groups 3
             and 4) for "17 April".

        Returns a pair like ('April 17', '0417').  The month number is
        looked up in history.months (presumably a mapping from month
        name to number -- defined outside this file).
        """
        name, number = m.group(1, 2)
        if not name:
            name = m.group(4)
        if not number:
            number = m.group(3)
        display = '%s %s' % (name, number)
        sort_key = '%02d%02d' % (history.months[name], int(number))
        return (display, sort_key)

    def parse_entries(self, what):
        """Parse the bullet list under the ==<What>== heading of the
        year page into a list of entry dictionaries.

        what -- section name in lower case, e.g. 'births' or 'deaths'.

        Each line may start with a date and end with "(born <year>)"
        or "(died <year>)"; what remains must contain a wiki-link.
        Lines without a link are reported (unless blank) and skipped.
        Returns [] (after printing a message) if the page has no such
        section.
        """
        section = re.search(r'==\s*' + what.capitalize()
                            + '\s*==\n((?:\s*\n|\*.*\n)*)',
                            self.year_pl.get())
        if not section:
            print("No ==%s==" % what.capitalize())
            return []

        # Field that a leading date is stored under for this section.
        date_field = {'births': 'bdate', 'deaths': 'ddate'}.get(what, '?date')
        trailing_years = ((self.trail_born_re, 'byear'),
                          (self.trail_died_re, 'dyear'))

        parsed = []
        for raw in re.split(r'\s*\n\s*', section.group(1)):
            fields = {}
            rest = re.sub(r'^\*\s*', '', raw)

            # Strip a leading date ...
            dated = self.entry_date_re.match(rest)
            if dated:
                fields[date_field] = self.make_date(dated)
                rest = dated.group(5)

            # ... then a trailing birth year, then a trailing death
            # year.
            for trail_re, year_field in trailing_years:
                trailing = trail_re.match(rest)
                if trailing:
                    fields[year_field] = trailing.group(2)
                    rest = trailing.group(1)

            link = self.entry_re.match(rest)
            if not link:
                if not re.match(r'^\s*$', raw):
                    wikipedia.output(u"Couldn't parse %s" % raw)
                continue
            pre, article, linktext, post = link.groups()
            fields['pre'] = pre
            fields['article'] = article
            if linktext:
                fields['linktext'] = linktext
            fields['post'] = post
            parsed.append(fields)
        return parsed

    def check_entry(self, entry, key, what, value):
        """Store `value` under entry[key], reporting any conflict.

        entry -- entry dictionary to update.
        key   -- field name, e.g. 'byear'.
        what  -- human-readable name of the field, used in the message.
        value -- new value; None means "nothing found" and leaves the
                 entry untouched.

        If the entry already holds a different value, a message is
        output and the old value is replaced by the new one.
        """
        if value is None:
            return
        if key in entry and entry[key] != value:
            wikipedia.output(u"%s '%s' fails to match '%s'; "
                             u"discarding the former."
                             % (what, entry[key], value))
        entry[key] = value

    def parse_article(self, entry, what, entries=None):
        """Fill in `entry` with data extracted from the article text.

        entry   -- entry dictionary; entry['pagelink'] must be set.
        what    -- 'births' or 'deaths': the section being built.  An
                   approximate ("c.", "ca.", "circa") date of that kind
                   marks the entry as excluded.
        entries -- unused; kept so existing callers keep working.

        Sets, where they can be found: 'byear', 'dyear' and 'sortkey'
        (from a {{lived}} template, else from the births/deaths
        category links), 'bdate' and 'ddate' (from the introductory
        paragraph), 'intro' (always, falling back to the first line of
        the article) and 'desc'.  Does nothing if the page is a
        redirect or does not exist.
        """
        try:
            text = entry['pagelink'].get()
        except (wikipedia.IsRedirectPage, wikipedia.NoPage):
            return

        # Look for {{lived|b=<year>|d=<year>|key=<sort key>}}.  The
        # pipes and braces are escaped so that they are matched
        # literally; an unescaped '|' would be regexp alternation.
        m = re.search(r'\{\{[Ll]ived\|\s*b\s*=\s*(' + self.year_re
                      + r')\s*\|\s*d\s*=\s*(' + self.year_re
                      + r')\s*\|\s*key\s*=\s*(.*?)\s*\}\}', text)
        if m:
            self.check_entry(entry, 'byear', 'birth year', m.group(1))
            self.check_entry(entry, 'dyear', 'death year', m.group(2))
            self.check_entry(entry, 'sortkey', 'sort key', m.group(3))
        else:
            # Get birth and death year (and the category sort key)
            # from the category links, if possible.
            for topic, key, label in (('births', 'byear', 'birth year'),
                                      ('deaths', 'dyear', 'death year')):
                m = re.search(r'\[\[[Cc]ategory:(' + self.year_re
                              + r') ' + topic
                              + r'(?:\|([^|\]]+))?\]\]', text)
                if m:
                    self.check_entry(entry, key, label, m.group(1))
                    self.check_entry(entry, 'sortkey', 'sort key',
                                     m.group(2))
                else:
                    wikipedia.output(u"%s has no Category:%s"
                                     % (entry['article'], topic))

        # Find introductory paragraph.  Years that are not known yet
        # are matched by the generic year pattern.
        intro = None
        m = re.search(self.intro1_re % (entry.get('byear') or self.year_re,
                                        entry.get('dyear') or self.year_re),
                      text, re.M)
        if m:
            entry['intro'] = m.group(0)
            intro = m.group(3)
            # Group 1 is the text before the birth year, group 2 the
            # text between birth year and death year.
            for group, key, label, topic in (
                    (1, 'bdate', 'birth date', 'births'),
                    (2, 'ddate', 'death date', 'deaths')):
                # Date available in intro?
                mm = re.search(self.date_re, m.group(group))
                if mm:
                    self.check_entry(entry, key, label, self.make_date(mm))
                # Date approximate?
                if self.approx_re.search(m.group(group)) and what == topic:
                    entry['exclude'] = True
        else:
            m = re.search(self.intro2_re, text, re.M)
            if m:
                entry['intro'] = m.group(0)
                intro = m.group(1)
            else:
                # Use first line instead.
                entry['intro'] = text.split('\n')[0]

        # Brief description available?  Prefer one that starts right
        # after the dates; otherwise search the whole intro.
        mm = None
        if intro:
            mm = (self.desc3_re.match(intro)
                  or self.desc4_re.match(intro))
        mm = (mm or self.desc1_re.search(entry['intro'])
              or self.desc2_re.search(entry['intro'])
              or self.desc3_re.search(entry['intro'])
              or self.desc4_re.search(entry['intro']))
        if mm:
            entry['desc'] = self.unwikify(mm.group(1))

    def get_entries(self, what):
        """Collect and parse every entry for one section of the page.

        what -- 'births' or 'deaths'.

        Combines the entries already listed in the section with the
        members of [[Category:<year> <what>]], re-keys entries whose
        article is a redirect under the redirect target, and runs
        parse_article on each remaining entry.  Returns the resulting
        entry dictionaries.
        """
        # Get entries from the section of the year page.
        entries = self.parse_entries(what)
        # Index the entries by article title.
        article_entry = {}
        for entry in entries:
            article_entry[entry['article']] = entry

        # Get lists of births and deaths articles for this year.
        # Category members that are not on the page yet get a minimal
        # entry; titles in self.ignore are skipped.
        cl = catlib.Category(self.site, '%s %s' % (self.year, what))
        for a in cl.articles():
            if (not self.ignore.has_key(a.title())
                and not article_entry.has_key(a.title())):
                e = {'article': a.title()}
                article_entry[a.title()] = e

        # Get them all.
        # (presumably wikipedia.getall fetches the pages in bulk so
        # that the get() calls below are cheap -- defined elsewhere)
        for e in article_entry.values():
            e['pagelink'] = wikipedia.Page(self.site, e['article'])
        wikipedia.getall(self.site, map(lambda e: e['pagelink'],
                                        article_entry.values()))

        # Merge redirects.
        # NOTE: article_entry is modified inside this loop; that relies
        # on values() returning a list snapshot (Python 2 behaviour).
        for e in article_entry.values():
            try:
                text = e['pagelink'].get()
            except wikipedia.IsRedirectPage, arg:
                # The exception's first argument is used as the title
                # of the redirect target.
                pl = wikipedia.Page(self.site, arg.args[0])
                redir = pl.title()
                wikipedia.output("%s redirects to %s" % (e['article'], redir))
                if article_entry.has_key(redir):
                    # The target has an entry of its own: reuse its
                    # Page object and drop that entry in favour of
                    # this one.
                    e['pagelink'] = article_entry[redir]['pagelink']
                    del article_entry[redir]
                else:
                    e['pagelink'] = pl
                # Re-key this entry under the target title.
                del article_entry[e['article']]
                article_entry[redir] = e
                e['article'] = redir
            except wikipedia.NoPage:
                continue

        # Parse articles.
        for e in article_entry.values():
            self.parse_article(e, what)
        return article_entry.values()

    def guess_sortkey(self, article):
        """Guess a sort key for an article title.

        The last space-separated word is moved to the front, so
        'John Quincy Adams' becomes 'Adams, John Quincy'.  A title
        without a space is returned unchanged.
        """
        pieces = article.rsplit(' ', 1)
        if len(pieces) < 2:
            return article
        leading, last_word = pieces
        return last_word + u', ' + leading

    def sort_entries(self, entries, what):
        for e in entries:
            if what == 'births':
                e['sort'] = e.has_key('bdate') and e['bdate'][1] or e.get('sortkey') or self.guess_sortkey(e['article'])
            elif what == 'deaths':
                e['sort'] = e.has_key('ddate') and e['ddate'][1] or e.get('sortkey') or self.guess_sortkey(e['article'])
            else:
                e['sort'] = e.get('sortkey') or self.guess_sortkey(e['article'])
        entries.sort(lambda e,f: cmp(e['sort'], f['sort']))

    def format_entry(self, entry, what):
        """Return the wiki-text line for one entry.

        entry -- entry dictionary.
        what  -- 'births' or 'deaths'; selects which date is shown in
                 front and which year is appended.

        The line starts with '* ' ('- ' for an excluded entry),
        followed by the linked date if known, the 'pre' text, the
        article link, the 'post' text (or ', ' plus 'desc' when there
        is no 'post'), and "(died <year>)" for births or
        "(born <year>)" for deaths when that year is known.
        """
        if entry.get('exclude'):
            parts = [u'- ']
        else:
            parts = [u'* ']

        if what == 'births' and 'bdate' in entry:
            parts.append(u'[[%s]] - ' % entry['bdate'][0])
        elif what == 'deaths' and 'ddate' in entry:
            parts.append(u'[[%s]] - ' % entry['ddate'][0])

        parts.append(entry.get('pre') or u'')

        article = entry['article']
        if 'linktext' in entry:
            parts.append(u'[[%s|%s]]' % (article, entry['linktext']))
        elif article.endswith(')'):
            # Trailing '|' (the "pipe trick") hides the parenthesised
            # disambiguator when the page is saved.
            parts.append(u'[[%s|]]' % article)
        else:
            parts.append(u'[[%s]]' % article)

        if 'post' in entry:
            parts.append(entry['post'])
        elif 'desc' in entry:
            parts.append(u', ' + entry['desc'])

        if what == 'births' and 'dyear' in entry:
            parts.append(u' (died [[%s]])' % entry['dyear'])
        elif what == 'deaths' and 'byear' in entry:
            parts.append(u' (born [[%s]])' % entry['byear'])
        return u''.join(parts)

    def write_entries(self, entries, what):
        if not self.year_text:
            self.year_text = self.year_pl.get()
        text = self.year_text
        m = re.search(r'==\s*' + what.capitalize()
                      + '\s*==\n((?:\s*\n|\*.*\n)*)',
                      text)
        if not m:
            print "No ==%s==" % what.capitalize()
            return ""
        return (text[:m.start(1)]
                + u'\n'.join(map(lambda e: self.format_entry(e, what),
                                 filter(lambda e: not e.get('exclude'),
                                        entries)))
                + u'\n\n'
                + text[m.end(1):])

    # Command summary that interface() outputs for the 'h' command.
    # interface() also accepts 'w' as a synonym for 's'.
    help_text = u"""
    h - Help
    l - List entries
    v - Preview changes to the page
    s - Save changes to the page
    q - Quit
    /<from>/<to>/ - Edit all entries and save pattern in file
    <n>p - Print entry <n>
    <n>i - Print introductory paragraph for entry <n>
    <n>t - Print whole article text for entry <n>
    <n>x - Exclude entry <n> (or include if already excluded)
    <n>d:<desc> - Update description for entry <n>
    <n>d<m> - Cut description for entry <n> to <m> words
    <n>P:<desc> - Update prefix text for entry <n>
    <n>/<from>/<to>/ - Edit entry <n> using regexp search-and-replace
    """

    def show_entries(self, title, entries, what):
        """Output a heading and the numbered, formatted entries.

        The entries are sorted in place first, so the numbers shown
        (starting at 1) correspond to positions in `entries`.
        """
        wikipedia.output(u'------- %s -------' % title)
        self.sort_entries(entries, what)
        for index, e in enumerate(entries):
            wikipedia.output(u"%d%s" % (index + 1,
                                        self.format_entry(e, what)))

    def interface(self, title, entries, what):
        """Run the interactive command loop for one batch of entries.

        title   -- heading shown above the list.
        entries -- list of entry dictionaries; numbers typed by the
                   user refer to positions in this list, from 1.
        what    -- 'births' or 'deaths'.

        See help_text for the commands.  Returns True when the user
        asks to save ('s' or 'w') and False when the user quits ('q').
        Invalid regular expressions are reported instead of ending the
        session, and are not saved to the pattern file.
        """
        def show(n):
            # Echo entry n as it will appear on the page.
            wikipedia.output(u"%d%s"
                             % (n, self.format_entry(entries[n-1], what)))

        def in_range(n):
            # True if n is a valid entry number; complains otherwise.
            if 1 <= n <= len(entries):
                return True
            wikipedia.output(u"No entry %d (must be 1-%d)"
                             % (n, len(entries)))
            return False

        def shown_text(e):
            # Text currently displayed after the link: 'post' if any,
            # else the description the way format_entry renders it.
            if e.get('post'):
                return e['post']
            if 'desc' in e:
                return u', ' + e['desc']
            return ''

        self.show_entries(title, entries, what)
        while 1:
            inp = wikipedia.input(u"-- What now? [hlqs0-9pdtx]")
            m1 = re.match(r'^\s*([0-9]+)\s*([A-Za-z])\s*([0-9]+)$', inp)
            m2 = re.match(r'^\s*([0-9]+)\s*([A-Za-z])\s*(:.*)?$', inp)
            m3 = re.match(r'^\s*([0-9]+)\s*' + self.pattern_re, inp)
            m4 = re.match(r'^\s*' + self.pattern_re, inp)
            if inp == 'l':
                self.show_entries(title, entries, what)
            elif inp == 'q':
                return False
            elif inp == 's' or inp == 'w':
                return True
            elif inp == 'h':
                wikipedia.output(self.help_text)
            elif m1:
                # <n>d<m>: cut description to <m> words.
                n = int(m1.group(1))
                if not in_range(n):
                    continue
                if m1.group(2) != 'd':
                    wikipedia.output(u"Not understood: %s" % inp)
                    continue
                words = shown_text(entries[n-1]).split(' ')
                # The text starts with ', ', so the first "word" is
                # the comma; keep one item more than requested.
                entries[n-1]['post'] = ' '.join(words[:int(m1.group(3)) + 1])
                show(n)
            elif m2:
                # <n><op> or <n><op>:<text>
                n = int(m2.group(1))
                op = m2.group(2)
                if not in_range(n):
                    continue
                # Text after the colon; empty if no ':<text>' given.
                arg = (m2.group(3) or u':')[1:]
                if op == 'p':
                    for k, v in entries[n-1].items():
                        wikipedia.output(u'  %s: %s' % (k, v))
                elif op == 'd':
                    if arg:
                        entries[n-1]['post'] = u', ' + arg
                    else:
                        entries[n-1]['post'] = ''
                    show(n)
                elif op == 'P':
                    entries[n-1]['pre'] = arg
                    show(n)
                elif op == 't':
                    try:
                        wikipedia.output(entries[n-1]['pagelink'].get())
                    except (KeyboardInterrupt, SystemExit):
                        raise
                    except Exception:
                        wikipedia.output(u"No page %s"
                                         % entries[n-1]['pagelink'].title())
                elif op == 'i':
                    wikipedia.output(entries[n-1].get('intro', u'No intro'))
                elif op == 'x':
                    entries[n-1]['exclude'] = not entries[n-1].get('exclude')
                    show(n)
                else:
                    wikipedia.output(u"Not understood: %s" % inp)
            elif m3:
                # <n>/<from>/<to>/: edit one entry.
                n = int(m3.group(1))
                if not in_range(n):
                    continue
                try:
                    edited = re.sub(m3.group(2), m3.group(3),
                                    shown_text(entries[n-1]))
                except re.error:
                    wikipedia.output(u"Invalid pattern: %s" % inp)
                    continue
                entries[n-1]['post'] = edited
                show(n)
            elif m4:
                # /<from>/<to>/: edit all entries and remember the
                # pattern.  Try it on an empty string first so that a
                # broken pattern never reaches the pattern file.
                try:
                    re.sub(m4.group(1), m4.group(2), u'')
                except re.error:
                    wikipedia.output(u"Invalid pattern: %s" % inp)
                    continue
                self.patterns.append((m4.group(1), m4.group(2)))
                self.save_patterns()
                self.apply_patterns()
            else:
                wikipedia.output(u"Not understood: %s" % inp)

    # Edit summary passed to the page's put() call when run() saves.
    comment = "yearbot - robot-assisted updating of births and deaths"
    # Sections that run() processes, in order; each name is also the
    # suffix of the category that is read ("<year> births", ...).
    topic_names = ['births', 'deaths']

    def run(self):
        """Drive a whole editing session for the year page.

        Collects and sorts the entries of every topic, applies the
        saved patterns, and then repeatedly walks the user through the
        entries in batches, shows a diff of the resulting page and
        asks for confirmation.  Returns after the page has been saved,
        or as soon as the user quits from a batch.
        """
        batch_size = 20
        self.topic_entries = {}
        for what in self.topic_names:
            self.topic_entries[what] = self.get_entries(what)
            self.sort_entries(self.topic_entries[what], what)
        self.apply_patterns()
        while 1:
            for what in self.topic_names:
                entries = self.topic_entries[what]
                for efrom in range(0, len(entries), batch_size):
                    eto = min(len(entries), efrom + batch_size)
                    # A slice: the batch can be re-sorted on its own,
                    # while edits to the entry dictionaries are shared
                    # with `entries`.
                    batch = entries[efrom:eto]
                    title = u'%s (%d-%d)' % (what.capitalize(),
                                             efrom + 1, eto)
                    if not self.interface(title, batch, what):
                        return
                self.sort_entries(entries, what)
                self.year_text = self.write_entries(entries, what)
            wikipedia.showDiff(self.year_pl.get(), self.year_text)
            if wikipedia.input(u"OK? [yN]") == 'y':
                self.year_pl.put(self.year_text, self.comment)
                return

# Script entry point: yearbot.py <year>
if __name__ == '__main__':
    wikipedia.username = 'yearbot'
    try:
        if len(sys.argv) < 2:
            # A string is not a valid exception; SystemExit prints the
            # message to stderr and exits with a non-zero status.
            raise SystemExit("No year specified")
        Year(sys.argv[1]).run()
    finally:
        # Always tell the framework that this bot has finished.
        wikipedia.stopme()