User:Gdr/authority.py

From Wikipedia, the free encyclopedia

#!/usr/bin/python
#
#
#              AUTHORITY.PY -- ADD AUTHORITY TO TAXOBOX
#                           Gdr, 2005-07-05
#
#
# 1. INTRODUCTION
#
# This Python script adds an authority to a selected taxobox on the
# English Wikipedia.
#
#
# 1.1 USAGE
#
#    ./authority.py --rebuild         Rebuild abbreviation table
#    ./authority.py --query=ABBREV    Query abbreviation
#    ./authority.py TAXON             Find authority and add it to taxon
#    ./authority.py TAXON AUTHORITY   Add authority to taxon
#
#
# 1.2 OPTIONS
#
#    -r     --rebuild     Rebuild abbreviation table
#    -q X   --query=X     Query abbreviation
#    -a A   --article=A   Start at article A instead of TAXON
#    -n     --noexpand    Don't expand abbreviations
#    -d     --disambig    Solve disambiguations for abbrevs
#    -z     --noquery     Don't ask easy questions of the user
#
#
# 1.3 EXAMPLES
#
#    ./authority.py Magnolia
#    ./authority.py 'Boa constrictor'
#    ./authority.py Quercus L.
#    ./authority.py 'Passer domesticus' '(Linnaeus, 1758)'
#    ./authority.py 'Plasmodium vivax' 'Grassi & Feletti 1890'
#    ./authority.py -a 'Homo (genus)' Homo
#
#
# 1.4 LICENCE
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at
# your option) any later version.

import getopt
import htmlentitydefs
import os
import pickle
import re
import sys
import time
import unicodedata
import urllib
import wikipedia

class Error(Exception):
    """Fatal error raised by this script.

    The message is shown to the user with wikipedia.output() as soon as
    the exception is constructed, so a handler only needs to stop
    processing; it does not have to report the message again.
    """

    def __init__(self, s):
        # Initialise the base class so that str(err) and tracebacks carry
        # the message instead of being empty.
        Exception.__init__(self, s)
        wikipedia.output(unicode(s))
        self.s = s

    def __repr__(self):
        return self.s

class Authority:

    # 2. CONFIGURATION

    # 2.1 USER CONFIGURATION

    # Which Wikipedia we are editing.
    site = wikipedia.Site('en')

    # 'authfile' is the filename in which the tables of author names and
    # abbreviations will be saved.
    authfile = 'authority.dat'

    # A regular expression that matches an authority and abbreviation in
    # a Wikipedia article. (This is the default; you can override it for
    # particular sources; see below.)
    auth_re = re.compile(ur'^\*[ \']*([\w\'., -]+[\w.])[ \']*'
                         ur' +(?:[-\u2013]|&[nm]dash;) +'
                         ur'\[\[([^\]|]+).*\r?\n', re.M|re.U)

    # 'wiki_abbrev_sources' is a dictionary mapping a code letter to a
    # Wikipedia source for authority abbreviations.  Each source is a
    # dictionary with these keys:
    #
    # name ---- name of the Wikipedia article containing authorities and
    #           their abbreviations
    # re ------ a regular expression matching an authority and its
    #           abbreviation(s). There must be two groups, one for the
    #           abbreviation(s) for that authority and one for the name
    #           of the article about that authority. If omitted, auth_re
    #           is used as the default. Abbreviations are presumed to be
    #           separated by commas.
    # groups -- a tuple giving the group for the abbreviation(s) and the
    #           article; if omitted, (1,2) is the default.
    # fmt ----- format string for a new authority. Use %A for the
    #           abbreviation and %B for the authority.
    # sort ---- How to sort (by 'surname' or by 'abbrev').
    wiki_abbrev_sources = {
        'b': {'name': 'List of botanists by author abbreviation',
              'fmt': "* '''%A''' - [[%B]]\n",
              'sort': 'abbrev'},
        'z': {'name': 'List of zoologists by author abbreviation',
              'fmt': "* %A - [[%B]]\n",
              'sort': 'surname'},
        }

    # 'other_abbrev_sources' is a list of other (non-Wikipedia) sources
    # for abbreviations. Each entry is a dictionary with keys:
    #
    # taxon --- a regular expression matching a taxon; means that this
    #           entry is only appropriate for articles contained in taxa
    #           matching this regexp. For example 'Plant' for a source
    #           listing only botanists, or 'Arthropod' for a source
    #           listing only entomologists.
    # re ------ a regular expression matching the abbreviation and its
    #           expansion. %A will be replaced by the regexp-escaped
    #           form of the abbreviation we are looking for.  It should
    #           contain one group, matching the expansion.
    # url ----- the URL to visit to find the abbreviation. %A will be
    #           replaced by the URL-encoded form of the abbreviation we
    #           are looking for.
    other_abbrev_sources = [
        {'taxon': 'Plant',
         'url': 'http://www.ipni.org/ipni/authorsearch?find_abbreviation=%A&query_type=by_query',
         're': r'(?u)>%A</a> - (\w(?:&[a-z]+;|[\w.\' -]+)*(?!\d)\w) *[0-9\n]'},
        {'url': 'http://www.ipni.org/ipni/authorsearch?find_surname=%A&query_type=by_query',
         're': r'(?u)>%A</a> - (\w(?:&[a-z]+;|[\w.\' -]+)*(?!\d)\w) *[0-9\n]'},
        ]

    # 'auth_sources' is a list of sources to consult to find the
    # authority for a taxon. Each entry is a dictionary with these keys:
    #
    # taxon --- a regular expression matching a taxon; means that this
    #           entry is only appropriate for articles contained in taxa
    #           matching this regexp. For example 'Plant' for a source
    #           listing only plant names, or 'Coleoptera' for a source
    #           listing only beetles.
    # url ----- the URL to visit to find the taxon. %T will be replaced
    #           by the URL-encoded form of the taxon we are looking
    #           for, %S by the SN2000 "subject" area, and %R by the
    #           rank of the taxon.
    # re ------ a regexp for getting the authority.  %T will be replaced
    #           by a regexp matching the taxon we are looking for.  It
    #           should contain one group, matching the authority.
    auth_sources = [
        {'taxon': 'Plant',
         'url': ('http://www.ipni.org/ipni/plantsearch?'
                 'find_wholeName=%T&query_type=by_query'),
         're': r'<i>%T</i> (.*)</a>'},
        {'url': ('http://sn2000.taxonomy.nl/Taxonomicon/TaxonList.aspx?'
                 'searchBy=ScientificName&subject=%S&search=%T'),
         're': r'<i>%T</i>[^<]*<font size="-1"> *(\(?[^<,]+,? +[0-9]+\)?)'},
       # {'url': ('http://www.itis.usda.gov/servlet/SingleRpt/SingleRpt?' 'search_topic=Scientific_Name&search_value=%T'), 're': (r'(?i)<SPAN CLASS="taxon_head"><I>%T</I></SPAN>' r'[ \r\n]*<A.*[ \r\n]*<SPAN CLASS="taxon_head">[ \r\n]*' r' <B>([^<]+)</B></A>'),}
        ]


    # 2.2 OTHER CONFIGURATION

    # 'rank_to_subject' is a dictionary mapping Linnaean rank in Latin
    # (as used in Wikipedia taxobox template names) to the SN2000
    # "Subject area" in which a taxon can be looked up. Ranks not listed
    # here are looked up in the subject area "High".
    rank_to_subject = {
        'subspecies': 'Species',
        'species': 'Species',
        'subgenus': 'Genus',
        'genus': 'Genus',
        'tribus': 'Family',
        'subfamilia': 'Family',
        'familia': 'Family',
        'superfamilia': 'Family',
        }

    # Don't ask easy questions of the user?
    noquery = False

    def __init__(self):
        """Attach a Page object to each Wikipedia abbreviation source and
        load the saved abbreviation table.

        The 'page' entries are stored in the class-level
        'wiki_abbrev_sources' dictionary, so they are shared by every
        instance.
        """
        for source in self.wiki_abbrev_sources.itervalues():
            source['page'] = wikipedia.Page(self.site, source['name'])
        self.restore_abbreviations()


    # 3. ABBREVIATIONS
    #
    # We want to be able to find abbreviations and turn them into links
    # to the appropriate article. For example, given the abbreviation
    # 'L.' we need to generate the wikitext '[[Carolus Linnaeus|L.]]'.
    # This section includes the code for finding, storing, and updating
    # these abbreviations.


    # 3.1 LOADING AND SAVING ABBREVIATIONS

    # Load abbreviations from disk.
    def restore_abbreviations(self):
        """Load the abbreviation table from 'authfile' into self.abbrev.

        If the file does not exist, self.abbrev is left as an empty
        dictionary.
        """
        self.abbrev = {}
        if not os.path.isfile(self.authfile):
            return
        # NOTE: unpickling can run arbitrary code; only load an
        # 'authfile' that was written by this script.
        f = open(self.authfile, 'r')
        try:
            self.abbrev = pickle.load(f)
        finally:
            # Close the file even if the pickle cannot be read.
            f.close()

    # Save authorities to disk.
    def save_abbreviations(self):
        """Pickle self.abbrev into 'authfile', the same file that
        restore_abbreviations reads."""
        f = open(self.authfile, 'w')
        try:
            pickle.dump(self.abbrev, f)
        finally:
            # Close the file even if pickling fails part-way.
            f.close()

    def unhtmlify(self, s):
        """Return the ISO-8859-1 byte string 's' as a unicode string with
        named HTML entities (e.g. '&eacute;') replaced by the characters
        they stand for.

        Entities are expanded in a single pass, so text produced by one
        expansion is never expanded again ('&amp;lt;' becomes '&lt;', not
        '<').  Entity names that are not in htmlentitydefs are left
        unchanged rather than raising KeyError.
        """
        def expand(m):
            try:
                return unichr(htmlentitydefs.name2codepoint[m.group(1)])
            except KeyError:
                # Unknown entity: keep the original text.
                return m.group(0)

        # Entity names may contain capitals and digits ('Eacute',
        # 'frac12'), not just lower-case letters.
        return re.sub(r'&([A-Za-z][A-Za-z0-9]*);', expand,
                      s.decode('iso-8859-1'))


    # Normalize the unicode string 's' into ASCII. The idea is to store
    # the authority Lac'ep`ede under the key 'Lacepede' so that
    # inconsistent accentuation doesn't cause us to miss an
    # abbreviation. We decompose all composed characters and then ignore
    # everything non-ASCII. (This converts eacute->e etc.)
    def normalize(self, s):
        """Return 's' as an ASCII byte string: characters are decomposed
        (NFD) and everything that is not ASCII is then dropped, which
        strips accents from accented letters."""
        decomposed = unicodedata.normalize('NFD', unicode(s))
        return decomposed.encode('ascii', 'ignore')

    # Add an abbreviation to the table. 'abbrev' is the abbreviation;
    # 'article' is the title of the Wikipedia article on that authority;
    # 'code' is the code for the list from which it came, if any.
    def add_abbreviation(self, abbrev, article, code = None):
        """Record (abbrev, article, code) in self.abbrev under the
        normalized form of 'abbrev', unless the same abbreviation and
        article are already recorded there."""
        entries = self.abbrev.setdefault(self.normalize(abbrev), [])
        for known in entries:
            if abbrev == known[0] and article == known[1]:
                # Already have this authority under this abbreviation.
                return
        entries.append((abbrev, article, code))


    # 3.2 USER INTERFACE FOR ADDING A NEW ABBREVIATION

    # If we don't find an abbreviation in any of wiki_abbrev_sources, we can
    # prompt the user to tell us the article title corresponding to the
    # abbreviation; then we can add it to the relevant source.

    # Return the normalized surname of the abbreviation.
    def surname(self, abbrev):
        """Return the last word of the normalized abbreviation (with a
        preceding 'de ', 'von ' or "d'" and a trailing '.' if present).

        If nothing matches, a message is shown and 'a' is returned.
        """
        found = re.search(r'(?ui)(?:de |von |d\')?[\w-]+\.?$',
                          self.normalize(abbrev))
        if not found:
            wikipedia.output(u"No surname for %s" % abbrev)
            return 'a'
        return found.group(0)

    # 'abbrev' is the abbreviation for the authority described at
    # 'article'. Add this to the source given by 'code'.
    def add_abbreviation_to_source(self, abbrev, article, code):
        """Insert an entry for 'abbrev' = 'article' into the Wikipedia
        list identified by 'code' (a key of wiki_abbrev_sources).

        The entry is placed in sort order among the existing entries
        that start with the same character, or at the end of that run
        if it sorts after all of them.  The diff is shown and the page
        is saved only if the user answers 'y'.  If no suitable place is
        found a message is shown and nothing is changed.
        """
        source = self.wiki_abbrev_sources[code]
        text = source['page'].get()
        by_surname = source['sort'] == 'surname'
        if by_surname:
            sortkey = self.surname(abbrev)
        else:
            sortkey = abbrev
        groups = source.get('groups', (1,2))

        # Format authority for insertion into the source.  A title that
        # ends in a parenthesised disambiguator, such as
        # 'John Smith (botanist)', gets a trailing '|' so that MediaWiki's
        # pipe trick hides the disambiguator in the displayed link.
        if article.endswith(')'):
            target = article + '|'
        else:
            target = article
        # Plain string replacement: abbrev and article are literal text,
        # not regexp replacement templates.
        fmt = source['fmt'].replace('%A', abbrev).replace('%B', target)

        # Go through abbreviations in the source until we get to the
        # appropriate point in alphabetical order.
        for m in re.finditer(source.get('re', self.auth_re), text):
            existing = m.group(groups[0])
            if by_surname:
                existing = self.surname(existing)
            # Slices rather than [0] so that an empty key cannot raise
            # IndexError.
            if sortkey[:1] != existing[:1]:
                # Sort keys not in the same letter of the alphabet.
                continue
            if sortkey < existing:
                # New abbrev goes before this one.
                newtext = text[:m.start(0)] + fmt + text[m.start(0):]
            elif re.match(r'(?: *\r?\n)*==', text[m.end(0):]):
                # We've reached the end of the section for the right
                # letter, but not found anywhere to put the new
                # abbrev. So it goes at the end.
                newtext = text[:m.end(0)] + fmt + text[m.end(0):]
            else:
                continue
            # Found a place for it.
            wikipedia.showDiff(text, newtext)
            if wikipedia.input(u'OK? [yN]') == 'y':
                source['page'].put(newtext, 'nomialbot - adding %s = %s'
                                   % (abbrev, article))
            return
        wikipedia.output(u'Sorry, nowhere to put authority %s' % fmt)

    # 'abbrev' is the abbreviation for the authority described at
    # 'article'. Ask the user which source to add it to.
    def user_add_abbreviation(self, abbrev, article):
        """Store 'abbrev' = 'article' in the local table and, if the user
        picks one of the Wikipedia sources, add it to that source too.

        When 'noquery' is set the user is not asked and the abbreviation
        is only stored locally.
        """
        sources = self.wiki_abbrev_sources
        for code, source in sources.items():
            wikipedia.output(u'(%s) %s' % (code, source['name']))
        inp = None
        if not self.noquery:
            inp = wikipedia.input(u"Add abbreviation %s = %s to which source? [%s]"
                                  % (abbrev, article,
                                     ''.join(sources.keys())))
        if inp in sources:
            self.add_abbreviation(abbrev, article, inp)
            self.save_abbreviations()
            self.add_abbreviation_to_source(abbrev, article, inp)
            return
        # No (valid) source chosen: remember the abbreviation locally only.
        self.add_abbreviation(abbrev, article)
        self.save_abbreviations()


    # 3.3 FINDING EXPANSIONS FOR ABBREVIATIONS

    # Rebuild table of authorities from the Wikipedia articles listed in
    # 'wiki_abbrev_sources'.
    def rebuild_abbreviations(self):
        """Scan every page in 'wiki_abbrev_sources' for abbreviations,
        add them to self.abbrev and save the table.

        Existing entries in self.abbrev are kept; new ones are added to
        them.
        """
        sources = self.wiki_abbrev_sources
        wikipedia.getall(self.site,
                         [source['page'] for source in sources.values()])
        for code, source in sources.items():
            pattern = source.get('re', self.auth_re)
            groups = source.get('groups', (1,2))
            for m in re.finditer(pattern, source['page'].get()):
                pagename = m.group(groups[1])
                # One list entry may give several comma-separated
                # abbreviations for the same authority.
                for a in re.split(r', +', m.group(groups[0])):
                    self.add_abbreviation(a, pagename, code)
        self.save_abbreviations()

    # User interface for finding an abbreviation using the stored
    # abbrevs, returning the pair (abbrev, expansion) or None.
    def find_abbreviation_in_store(self, abbrev):
        """Look 'abbrev' up in self.abbrev.

        Returns None if it is not stored.  If exactly one authority is
        stored, its entry is returned as stored, i.e. the tuple
        (abbrev, article, code).  Otherwise the user is asked to choose
        and (abbrev, article) is returned for the chosen authority.
        """
        key = self.normalize(abbrev)
        if key not in self.abbrev:
            return None
        candidates = self.abbrev[key]
        if len(candidates) == 1:
            return candidates[0]
        for number, entry in enumerate(candidates):
            wikipedia.output(u'(%d) %s' % (number + 1, entry[1]))
        # Keep asking until the answer is a number in range.
        while 1:
            choice = wikipedia.input(u"Which authority? [1-%d]"
                                     % len(candidates))
            if (re.match(r'[0-9]+$', choice)
                and 0 <= int(choice) - 1 < len(candidates)):
                break
        return (abbrev, candidates[int(choice) - 1][1])

    # Find abbreviation using 'other_abbrev_sources', returning the pair
    # (abbrev, expansion) or None.
    def find_abbreviation_other(self, abbrev):
        """Look 'abbrev' up in each of 'other_abbrev_sources' in turn.

        On the first hit the expansion is offered to the user for
        storing (user_add_abbreviation) and (abbrev, expansion) is
        returned.  Returns None if no source has the abbreviation.
        """
        # TODO: check source[taxon]
        quoted = urllib.quote(abbrev)
        escaped = re.escape(abbrev)
        for source in self.other_abbrev_sources:
            # Plain string replacement: the substituted text must be
            # inserted literally, not read as a replacement template.
            url = source['url'].replace('%A', quoted)
            wikipedia.output(u'Trying %s' % url)
            f = urllib.urlopen(url)
            try:
                page = f.read()
            finally:
                # Close the connection even if reading fails.
                f.close()
            m = re.search(source['re'].replace('%A', escaped), page)
            if m:
                e = self.unhtmlify(m.group(1))
                self.user_add_abbreviation(abbrev, e)
                return (abbrev, e)
        return None

    # User interface for finding abbreviation using Wikipedia, returning
    # its expansion, or None.
    def find_abbreviation_wiki(self, abbrev):
        """Ask the user for the expansion of 'abbrev', offering candidate
        articles found on Wikipedia.

        Candidates are the redirect target if a page named 'abbrev' is a
        redirect, or the linked pages if it is a disambiguation page.
        Returns (abbrev, expansion) when the user picks or types an
        expansion, the result of find_abbreviation() when the user
        enters a different abbreviation, or None when the user chooses
        to continue without an expansion.  Raises Error when the user
        quits.
        """
        # See if there's a Wikipedia page for the abbrev.
        pl = wikipedia.Page(self.site, abbrev)
        if not pl.exists():
            expansions = []
        elif pl.isRedirectPage():
            expansions = [wikipedia.Page(self.site, pl.getRedirectTarget())]
        elif pl.isDisambig():
            expansions = pl.linkedPages()
        else:
            expansions = []

        def list_expansions():
            # Used for the initial listing and for the 'l' command, so
            # both show the candidates in the same way (by title).
            for i, page in enumerate(expansions):
                wikipedia.output(u'(%d) %s' % (i + 1, page.title()))

        list_expansions()
        while 1:
            if expansions:
                inp = wikipedia.input(u'Expansion for %s? [1-%d;aecq]'
                                      % (abbrev, len(expansions)))
            else:
                inp = wikipedia.input(u'Expansion for %s? [aecq]'
                                      % abbrev)
            if inp == 'a':
                abbrev = wikipedia.input(u'Enter new abbrev:')
                return self.find_abbreviation(abbrev)
            elif inp == 'e':
                expansion = wikipedia.input(u'Enter expansion for %s:'
                                            % abbrev)
                self.user_add_abbreviation(abbrev, expansion)
                return (abbrev, expansion)
            elif (re.match(r'[0-9]+$', inp)
                  and int(inp) - 1 in range(len(expansions))):
                expansion = expansions[int(inp) - 1].title()
                self.user_add_abbreviation(abbrev, expansion)
                return (abbrev, expansion)
            elif inp == 'c':
                return None
            elif inp == 'q':
                raise Error("Quit requested")
            elif inp == 'l':
                list_expansions()
            else:
                # Anything else: show the help text and ask again.
                wikipedia.output(
                    u'<number> = choose expansion;\n'
                    u'a = enter new abbreviation\n'
                    u'e = enter expansion\n'
                    u'c = continue (with no expansion for abbreviation)\n'
                    u'l = list expansions\n'
                    u'q = quit\n')

    # Find expansion for abbreviation using all available methods,
    # returning the pair (abbrev, expansion) or just abbrev if nothing
    # found.
    def find_abbreviation(self, abbrev):
        """Try the local store, then the other sources, then Wikipedia,
        and return the first result found.

        Returns the 1-tuple (abbrev,) if nothing is found, and ('',) for
        an empty abbreviation.
        """
        if not abbrev:
            return ('',)
        finders = (self.find_abbreviation_in_store,
                   self.find_abbreviation_other,
                   self.find_abbreviation_wiki)
        for finder in finders:
            found = finder(abbrev)
            if found:
                return found
        return (abbrev,)

    def wikify_abbreviation(self, expansion):
        """Turn a result of find_abbreviation into wikitext: a piped link
        '[[article|abbrev]]' if an article is known (the tuple has at
        least two items), otherwise just the abbreviation."""
        if len(expansion) < 2:
            return expansion[0]
        return u'[[%s|%s]]' % (expansion[1], expansion[0])


    # 4. FINDING THE AUTHORITY FOR A TAXON

    # 'format_authority' takes an 'authority', splits it into its
    # component authorities, makes wikilinks for those components, and
    # returns a wikitext string.
    def format_authority(self, authority):
        """Split 'authority' at its separators (brackets, years, 'in',
        'and', 'ex', '&', 'et al.'), look up and wikify each name
        between them, and return the pieces joined back together with
        the original separators."""
        # NOTE(review): ' +& +' appears twice in this pattern; one of
        # them may originally have matched '&amp;' -- confirm.
        separator = re.compile(r'^\(|, +[0-9]*| +[0-9]+| +in +| +and +|'
                               r' *\bex\.? +| +& +| +& +|\) *|'
                               r' +et al\.?')
        names = separator.split(authority)
        # split() yields one more item than findall(), so pad the
        # separators with an empty one for the last name.
        joins = separator.findall(authority) + ['']
        pieces = []
        for name, join in zip(names, joins):
            link = self.wikify_abbreviation(self.find_abbreviation(name))
            pieces.append(link + join)
        return ''.join(pieces)

    # 'find_authority' returns the authority for the given taxon. 'text'
    # is the text of the Wikipedia article about that taxon.
    def find_authority(self, taxon, text):
        """Look up the authority for 'taxon' in each of 'auth_sources'.

        'text' is the wikitext of the article about the taxon; it is
        used to work out the taxon's rank and to skip sources whose
        'taxon' restriction is not linked from the taxobox.  Returns the
        authority as a unicode string, or None (after showing a message)
        if no source has it.  Raises Error if the rank cannot be found.
        """
        rank = self.rank_of_taxon(taxon, text)
        subject = self.rank_to_subject.get(rank, 'High')
        quoted_taxon = urllib.quote(taxon)
        # Regexp for the taxon in which each run of spaces may also be a
        # break between two italic elements.
        taxon_re = re.sub(r'\\? +', r'(?: +|</i> +<i>)', re.escape(taxon))
        for source in self.auth_sources:
            if ('taxon' in source and not
                re.search(r'(?m)^\| [a-z_]+ *= *\[\[%s' % source['taxon'], text)):
                continue
            # Plain string replacement: the substituted values are
            # literal text, not regexp replacement templates.
            url = source['url'].replace('%T', quoted_taxon)
            url = url.replace('%S', subject)
            url = url.replace('%R', rank)
            wikipedia.output(u'Trying %s' % url)
            f = urllib.urlopen(url)
            try:
                page = f.read()
            finally:
                # Close the connection even if reading fails.
                f.close()
            m = re.search(source['re'].replace('%T', taxon_re), page)
            if m:
                return self.unhtmlify(m.group(1))
        wikipedia.output(u'No authority found for %s' % taxon)
        return None


    # 5. UPDATING THE AUTHORITY FOR AN ARTICLE

    # 'kingdom_map' maps the article linked from a taxobox 'regnum'
    # entry to the Latin name of the kingdom. kingdom() returns the
    # linked name unchanged when it is not listed here.
    kingdom_map = {
        'Plant': 'Plantae',
        'Animal': 'Animalia',
        'Bacterium': 'Bacteria',
        'Fungus': 'Fungi',
        'Protist': 'Protista',
        }

    def kingdom(self, text):
        """Return the kingdom linked from the 'regnum' entry in 'text',
        translated through 'kingdom_map' where possible.

        Raises Error if there is no linked 'regnum' entry.
        """
        found = re.search(r'(?m)^\| *regnum *= *\[\[([^\|\]]+)', text)
        if not found:
            raise Error("No kingdom found.")
        linked = found.group(1)
        return self.kingdom_map.get(linked, linked)

    def rank_of_taxon(self, taxon, text):
        """Return the rank of 'taxon'.

        A three-word name is a 'subspecies' and a two-word name a
        'species'. Otherwise the rank is the name of the taxobox
        parameter in 'text' (other than one starting with 'name') whose
        value is the taxon. Raises Error if there is no such parameter.
        """
        if re.match(r'^[\w-]+ [\w-]+ [\w-]+$', taxon):
            return 'subspecies'
        if re.match(r'^[\w-]+ [\w-]+$', taxon):
            return 'species'
        pattern = (r'(?m)^\| *((?!name)[a-z_]+) *= *'
                   r'[ \']*\[*%s[^\w]\]*[ \']*$' % re.escape(taxon))
        found = re.search(pattern, text)
        if found:
            return found.group(1)
        raise Error("Can't find taxon %s in taxobox" % taxon)

    # 'kingdom_to_color' maps a kingdom name to a colour name.
    # NOTE(review): nothing in this file reads this table; presumably
    # these are the taxobox colours for each kingdom -- confirm.
    kingdom_to_color = {
        'Animalia': 'pink',
        'Plantae': 'lightgreen',
        'Fungi': 'lightblue',
        'Archaea': 'darkgray',
        'Protista': 'khaki',
        'Bacteria': 'lightgrey',
        }

    # 'find_article' takes the name of an article to start looking at,
    # and returns a Page object.
    def find_article(self, article):
        """Follow 'article' to an ordinary page and return its Page.

        A missing page is created as a redirect to a title supplied by
        the user; redirects are followed; for a disambiguation page the
        user chooses one of the linked pages. Raises Error if the user
        gives an empty or invalid answer.
        """
        while 1:
            pl = wikipedia.Page(self.site, article)
            if not pl.exists():
                wikipedia.output(u"No page %s" % pl.title())
                target = wikipedia.input(u"Redirect to:")
                if not target:
                    raise Error("Quit requested")
                pl.put(u"#REDIRECT [[%s]]" % target,
                       u"nomialbot - redirecting scientific name %s to %s"
                       % (article, target))
                article = target
                continue
            if pl.isRedirectPage():
                article = pl.getRedirectTarget()
                continue
            if not pl.isDisambig():
                # An ordinary article: this is the one we want.
                return pl
            links = pl.linkedPages()
            for number, link in enumerate(links):
                wikipedia.output(u'(%d) %s' % (number + 1, link))
            inp = wikipedia.input(u'Choose which article? [1-%d]'
                                  % len(links))
            if not (re.match(r'[0-9]+$', inp)
                    and 0 <= int(inp) - 1 < len(links)):
                raise Error("Quit requested")
            article = links[int(inp) - 1].title()

    # 'add_authority_to_article' takes a Page object, a taxon and an
    # authority. It adds the authority to that page.
    def add_authority_to_article(self, pl, taxon, authority, expand = True):
        """Tidy the taxobox on page 'pl' and set the authority of 'taxon'.

        If 'expand' is true the authority is first turned into wikitext
        with format_authority(). The authority parameter is added on
        the line after the taxon's own parameter, or, if one is already
        present and the user agrees, replaced in place. The diff is
        shown and the page is saved if it changed and the user confirms
        (no confirmation is asked when 'noquery' is set).

        Raises Error if the taxobox has no kingdom, or if the taxon or
        its rank parameter cannot be found.
        """
        original = pl.get()
        text = self.tidy_taxobox(original)
        if expand:
            authority = self.format_authority(authority)
        rank = self.rank_of_taxon(taxon, text)
        # Called only for its check: raises Error when the taxobox has
        # no kingdom.
        self.kingdom(text)
        if rank == 'species':
            test_param = 'binomial'
            auth_param = 'binomial_authority'
        elif rank == 'subspecies':
            test_param = 'trinomial'
            auth_param = 'trinomial_authority'
        else:
            test_param = rank
            auth_param = rank + '_authority'
        m = re.search(r'(?m)^\| *%s *=.*$' % re.escape(test_param), text)
        if not m:
            raise Error("Can't find rank %s in %s" % (test_param, pl.title()))
        m1 = re.search(r'(?m)^\| *%s *= *(.*)' % re.escape(auth_param), text)
        auth_line = u'| %s = %s' % (auth_param, authority)
        if not m1:
            # m.end(0) is the end of the taxon's line, so the new
            # parameter needs a newline in front of it.
            text = text[:m.end(0)] + u'\n' + auth_line + text[m.end(0):]
        elif wikipedia.input(u'%s already has authority "%s". '
                             u'Replace? [yN]' % (taxon, m1.group(1))) == 'y':
            # m1 spans exactly the old line (from its start to its end),
            # so no extra newline is wanted here; adding one would leave
            # a blank line in the taxobox.
            text = text[:m1.start(0)] + auth_line + text[m1.end(0):]
        wikipedia.showDiff(original, text)
        if original != text and (self.noquery or (wikipedia.input(u"OK? [yN]") == 'y')):
            pl.put(text, u'nomialbot - adding authority for %s %s'
                   % (taxon, authority))

    def add_authority(self, article, taxon, authority, expand = True):
        """Find the page for 'article' and add 'authority' for 'taxon'
        to it."""
        pl = self.find_article(article)
        if not pl:
            return
        self.add_authority_to_article(pl, taxon, authority, expand)

    def find_and_add_authority(self, article, taxon, expand = True):
        """Find the page for 'article', look up the authority for 'taxon'
        and, if one is found, add it to the page."""
        pl = self.find_article(article)
        if pl:
            authority = self.find_authority(taxon, pl.get())
            if authority:
                self.add_authority_to_article(pl, taxon, authority, expand)


    # 7. GENERAL TIDYING

    # 'subs' is a list of (pattern, replacement) pairs. tidy_taxobox
    # applies each of them, in this order, to the whole article text
    # with re.sub.
    subs = [
        # Capitalize "Taxobox"
        (r'{{taxobox', '{{Taxobox'),

        # Italicise genus entry.
        (r'(?m)^\| * genus *=[ \']*\[\[([^\]]+)\]\][ \']*$',
         '| genus = \'\'[[\\1]]\'\''),

        # Abbreviate genus in species entry.
        (r'(?m)^\| *species *= *([\']*)([A-Z])[a-z]+ ([a-z]+)',
         r'| species = \1\2. \3'),

        # Supply missing genus abbrev in species entry.
        (r'(?m)^(\| *genus *=[ \'\[]*([A-Z])[a-z]+[\] \']* *\n'
         r'\| *species *=[ \']*)([a-z-]+[ \']*$)',
         r'\1\2. \3'),

        # Supply missing species entry.
        (r'(?m)(^\| *genus *=.*\n)'
         r'(\| * binomial *= *'
         r'([A-Z])[a-z]+ ([a-z-]+))',
         r"\1| species = '''''\3. \4'''''\n\2"),

        # Italicise genus or species if it appears as the title.
        (r'(?ms)^\| *name *= *([a-z -]+[a-z]) *(\n.*'
         r'^\| *(?:genus|species) *=[ \'\[]*\1[ \'\]]*$)',
         '| name = \'\'\\1\'\'\\2'),

        # Bold genus if unlinked.
        (r'(?m)^\| *genus *= *\'*(\w+)\'* *$',
         "| genus = '''''\\1'''''"),

        # Cut superfluous taxa.
        (r'(?m)(?:^\| *(?!(?:regnum|phylum|divisio|classis|ordo|familia|genus|species))'
         r'(?:super|sub|infra|nano)(?:regnum|phylum|divisio|classis|ordo|familia|genus|species) *=.*\n)+'
         r'(^\| *(?:regnum|phylum|divisio|classis|ordo|familia|genus|species)'
         r' *=.*\n)'
         r'(?=^\| *[a-z]+ *=.*$)',
         r'\1'),
        ]

    # 'conditional_subs' is a list of (conditions, pattern, replacement)
    # triples. tidy_taxobox applies re.sub(pattern, replacement) only
    # if every regexp in 'conditions' matches the text.
    conditional_subs = [
        # Bold species entry if subject of article.
        ([r'(?m)^\| *binomial *='],
         r'(?m)^\| *species *=[ \']*([^\]\'\}]+)[ \']*$',
         '| species = \'\'\'\'\'\\1\'\'\'\'\''),

        # Bold subspecies entry if subject of article.
        ([r'(?m)^\| *trinomial *='],
         r'(?m)^\| *subspecies *=[ \']*([^\]\'\}]+)[ \']*$',
         '| subspecies = \'\'\'\'\'\\1\'\'\'\'\''),
        ]

    # 'anticonditional_subs' is a list of (conditions, pattern,
    # replacement) triples. tidy_taxobox applies
    # re.sub(pattern, replacement) only if none of the regexps in
    # 'conditions' matches the text.
    anticonditional_subs = [
        # Supply missing binomial entry.
        ([r'(?m)^\| *binomial *=',
          r'(?m)^\| *subspecies *='],
         r'(?m)(^\| *genus *=[ \'\[]*([A-Z])([a-z]+)[ \'\]]*\n(?:.*\n)*'
         r'(?m)^\| *species *=[ \']*\2. ([a-z-]+)[ \']*\n)',
         r"\1| binomial = ''\2\3 \4''\n"),

        # Same, for a species entry that gives the genus in full.
        ([r'(?m)^\| *binomial *=',
          r'(?m)^\| *subspecies *='],
         r'(?m)(^\| *species *=[ \']*([A-Z][a-z]+ [a-z-]+)[ \']*\n)',
         r"\1| binomial = ''\2''\n"),
        ]

    def tidy_taxobox(self, text):
        """Return 'text' with the tidying substitutions applied.

        Applies 'subs', then each of 'conditional_subs' whose conditions
        all match, then each of 'anticonditional_subs' none of whose
        conditions match. Finally, for an article whose taxobox links
        to Actinopterygii or Chondrichthyes and which has no FishBase
        template yet, a FishBase reference is added.

        Raises Error if there is nowhere to put the FishBase reference.
        """
        for pattern, replacement in self.subs:
            text = re.sub(pattern, replacement, text)

        for conditions, pattern, replacement in self.conditional_subs:
            for condition in conditions:
                if not re.search(condition, text):
                    break
            else:
                # Every condition matched.
                text = re.sub(pattern, replacement, text)

        for conditions, pattern, replacement in self.anticonditional_subs:
            for condition in conditions:
                if re.search(condition, text):
                    break
            else:
                # No condition matched.
                text = re.sub(pattern, replacement, text)

        # Add FishBase reference.
        is_fish = re.search(r'(?m)^\| *[a-z_]+ *= *'
                            r'\[\[(?:Actinopterygii|Chondrichthyes)\]\]$', text)
        if not is_fish or re.search(r'{{FishBase', text):
            return text

        genus = re.search(r'(?m)^\| * genus *=[ \'\[]*'
                          r'([A-Z][a-z]+)[ \'\]]*$', text)
        species = re.search(r'(?m)^\| species *=[ \']*'
                            r'(?:[A-Z]\. )?([a-z-]+)[ \']*$', text)
        # The names are filled in first; strftime then supplies the
        # current month and year for the %B and %Y that remain.
        if genus and species:
            ref = time.strftime('{{FishBase species | genus = %s | '
                                'species = %s | month = %%B | year = %%Y}}'
                                % (genus.group(1), species.group(1)))
        elif genus:
            ref = time.strftime('{{FishBase genus | genus = %s | '
                                'month = %%B | year = %%Y}}'
                                % genus.group(1))
        else:
            # No genus found: nothing to refer to.
            return text

        heading = re.search(r'==+ *References? *==+ *\n+', text)
        tail = re.search(r'(?:(?:{{.*-stub}}|\[\[[a-z][a-z]:.*\]\]'
                         r'|\[\[Category:.*\]\])[ \n]*)*$',
                         text)
        if heading:
            # First item of the existing References section.
            return (text[:heading.end(0)]
                    + '* ' + ref + '\n'
                    + text[heading.end(0):])
        if tail:
            # New References section before the trailing stub
            # templates, interwiki links and categories.
            return (text[:tail.start(0)]
                    + '\n==References==\n* ' + ref + '\n'
                    + text[tail.start(0):])
        raise Error("Nowhere to put FishBase reference")


    # 6. DISAMBIGUATION

    # Run solve_disambiguation on all botanical abbreviations.
    def disambiguate(self):
        """Run a solve_disambiguation robot for every stored abbreviation
        that came from source 'b' and ends in a full stop."""
        import solve_disambiguation
        for entries in self.abbrev.values():
            for entry in entries:
                if entry[2] != 'b' or entry[0][-1] != '.':
                    continue
                bot = solve_disambiguation.DisambiguationRobot(
                    '0', [entry[1]], False, False, [entry[0]], False, True)
                bot.run()


def badusage():
    """Raise an Error whose message is the usage summary."""
    prog = sys.argv[0]
    message = ('Usage:\n'
               '%s --rebuild         Rebuild abbreviation table\n'
               '%s --query=abbrev    Query abbreviation\n'
               '%s taxon             Find authority and add it to taxon\n'
               '%s taxon authority   Add authority to taxon\n'
               % (prog, prog, prog, prog))
    raise Error(message)

def main():
    """Command-line entry point: parse the options and carry out the
    request.

    --query, --rebuild and --disambig are carried out as they are read.
    One positional argument (TAXON) looks up and adds the authority;
    two (TAXON AUTHORITY) add the given authority. With no positional
    arguments the usage message is shown only if no such option was
    given, so that e.g. './authority.py --rebuild' ends quietly.
    """
    wikipedia.username = 'nomialbot'
    try:
        auth = Authority()
        article = None
        expand = True
        # Set once an option that is a complete request in itself has
        # been carried out.
        acted = False
        try:
            opts, args = getopt.getopt(sys.argv[1:], 'zdnra:q:',
                                       ['noexpand', 'rebuild', 'article=',
                                        'query=', 'disambig', 'noquery'])
        except getopt.GetoptError:
            badusage()
            return
        for o, a in opts:
            if o in ('-q', '--query'):
                print(auth.find_abbreviation(a.decode()))
                acted = True
            elif o in ('-r', '--rebuild'):
                auth.rebuild_abbreviations()
                acted = True
            elif o in ('-d', '--disambig'):
                auth.disambiguate()
                acted = True
            elif o in ('-a', '--article'):
                article = a
            elif o in ('-n', '--noexpand'):
                expand = False
            elif o in ('-z', '--noquery'):
                auth.noquery = True
            else:
                badusage()
                return
        if len(args) == 1:
            auth.find_and_add_authority(article or args[0], args[0], expand)
        elif len(args) == 2:
            auth.add_authority(article or args[0], args[0], args[1], expand)
        elif args or not acted:
            # Too many arguments, or nothing at all to do.
            badusage()
            return
    except Error:
        # The message was already shown when the Error was created.
        return

# Run main() only when this file is executed as a script.
# wikipedia.stopme() is in a 'finally' clause so that it is called
# however main() exits.
if __name__ == '__main__':
    try:
        main()
    finally:
        wikipedia.stopme()
User:Gdr/authority.py

From Wikipedia, the free encyclopedia

Views

Navigation

Interaction

Search