User:Gdr/authority.py
From Wikipedia, the free encyclopedia
< User:Gdr
#!/usr/bin/python # # # AUTHORITY.PY -- ADD AUTHORITY TO TAXOBOX # Gdr, 2005-07-05 # # # 1. INTRODUCTION # # This Python script add an authority to a selected taxobox on the # English wikipedia. # # # 1.1 USAGE # # ./authority.py --rebuild Rebuild abbreviation table # ./authority.py --query=ABBREV Query abbreviation # ./authority.py TAXON Find authority and add it to taxon # ./authority.py TAXON AUTHORITY Add authority to taxon # # # 1.2 OPTIONS # # -r --rebuild Rebuild abbreviation table # -q X --query=X Query abbreviation # -a A --article=A Start at article A instead of TAXON # -n --noexpand Don't expand abbreviations # -d --disambig Solve disambiguations for abbrevs # # # 1.2 EXAMPLES # # ./authority.py Magnolia # ./authority.py 'Boa constrictor' # ./authority.py Quercus L. # ./authority.py 'Passer domesticus' '(Linnaeus, 1758)' # ./authority.py 'Plasmodium vivax' 'Grassi & Feletti 1890' # ./authority.py -a 'Homo (genus)' Homo # # # 1.3 LICENCE # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or (at # your option) any later version. import getopt import htmlentitydefs import os import pickle import re import sys import time import unicodedata import urllib import wikipedia class Error(Exception): def __init__(self, s): wikipedia.output(unicode(s)) self.s = s def __repr__(self): return self.s class Authority: # 2. CONFIGURATION # 2.1 USER CONFIGURATION # Which Wikipedia we are editing. site = wikipedia.Site('en') # 'authfile' is the filename in which the tables of author names and # abbreviations will be saved. authfile = 'authority.dat' # A regular expression that matches an authority and abbreviation in # a Wikipedia article. (This is the default; you can override it for # particular sources; see below.) 
    auth_re = re.compile(ur'^\*[ \']*([\w\'., -]+[\w.])[ \']*'
                         ur' +(?:[-\u2013]|&[nm]dash;) +'
                         ur'\[\[([^\]|]+).*\r?\n', re.M|re.U)

    # 'wiki_abbrev_sources' is a dictionary mapping a code letter to a
    # Wikipedia source for authority abbreviations.  Each source is a
    # dictionary with these keys:
    #
    # name ---- name of the Wikipedia article containing authorities and
    #           their abbreviations
    # re ------ a regular expression matching an authority and its
    #           abbreviation(s).  There must be two groups, one for the
    #           abbreviation(s) for that authority and one for the name
    #           of the article about that authority.  If omitted,
    #           auth_re is used as the default.  Abbreviations are
    #           presumed to be separated by commas.
    # groups -- a tuple giving the group for the abbreviation(s) and the
    #           article; if omitted, (1,2) is the default.
    # fmt ----- format string for a new authority.  Use %A for the
    #           abbreviation and %B for the authority.
    # sort ---- How to sort (by 'surname' or by 'abbrev').
    wiki_abbrev_sources = {
        'b': {'name': 'List of botanists by author abbreviation',
              'fmt': "* '''%A''' - [[%B]]\n",
              'sort': 'abbrev'},
        'z': {'name': 'List of zoologists by author abbreviation',
              'fmt': "* %A - [[%B]]\n",
              'sort': 'surname'},
    }

    # 'other_abbrev_sources' is a list of other (non-Wikipedia) sources
    # for abbreviations.  Each entry is a dictionary with keys:
    #
    # taxon --- a regular expression matching a taxon; means that this
    #           entry is only appropriate for articles contained in taxa
    #           matching this regexp.  For example 'Plant' for a source
    #           listing only botanists, or 'Arthropod' for a source
    #           listing only entomologists.
    # re ------ a regular expression matching the abbreviation and its
    #           expansion.  %A will be replaced by the regexp-escaped
    #           form of the abbreviation we are looking for.  It should
    #           contain one group, matching the expansion.
    # url ----- the URL to visit to find the abbreviation.  %A will be
    #           replaced by the URL-encoded form of the abbreviation we
    #           are looking for.
other_abbrev_sources = [ {'taxon': 'Plant', 'url': 'http://www.ipni.org/ipni/authorsearch?find_abbreviation=%A&query_type=by_query', 're': r'(?u)>%A</a> - (\w(?:&[a-z]+;|[\w.\' -]+)*(?!\d)\w) *[0-9\n]'}, {'url': 'http://www.ipni.org/ipni/authorsearch?find_surname=%A&query_type=by_query', 're': r'(?u)>%A</a> - (\w(?:&[a-z]+;|[\w.\' -]+)*(?!\d)\w) *[0-9\n]'}, ] # 'auth_sources' is a list of sources to consult to find the # authority for a taxon. Each entry is a dictionary with these keys: # # taxon --- a regular expression matching a taxon; means that this # entry is only appropriate for articles contained in taxa # matching this regexp. For example 'Plant' for a source # listing only plant names, or 'Coleoptera' for a source # listsing only beetles. # url ----- the URL to visit to find the taxon. %T will be replaced # by the URL-encoded form of the taxon we are looking # for, and %S by the SN2000 "subject" area. # re ------ a regexp for getting the authority. %A will be replaced # by the regexp-escaped form of the abbreviation we are # looking for. It should contain one group, matching the # expansion. auth_sources = [ {'taxon': 'Plant', 'url': ('http://www.ipni.org/ipni/plantsearch?' 'find_wholeName=%T&query_type=by_query'), 're': r'<i>%T</i> (.*)</a>'}, {'url': ('http://sn2000.taxonomy.nl/Taxonomicon/TaxonList.aspx?' 'searchBy=ScientificName&subject=%S&search=%T'), 're': r'<i>%T</i>[^<]*<font size="-1"> *(\(?[^<,]+,? +[0-9]+\)?)'}, # {'url': ('http://www.itis.usda.gov/servlet/SingleRpt/SingleRpt?' 'search_topic=Scientific_Name&search_value=%T'), 're': (r'(?i)<SPAN CLASS="taxon_head"><I>%T</I></SPAN>' r'[ \r\n]*<A.*[ \r\n]*<SPAN CLASS="taxon_head">[ \r\n]*' r' <B>([^<]+)</B></A>'),} ] # 2.2 OTHER CONFIGURATION # 'rank_to_subject' is a dictionary mapping Linnaean rank in Latin # (as used in Wikipedia taxobox template names) to the SN2000 # "Subject area" in which a taxon can be looked up. Ranks not listed # here are looked up in the subject area "High". 
rank_to_subject = { 'subspecies': 'Species', 'species': 'Species', 'subgenus': 'Genus', 'genus': 'Genus', 'tribus': 'Family', 'subfamilia': 'Family', 'familia': 'Family', 'superfamilia': 'Family', } # Don't ask easy questions of the user? noquery = False def __init__(self): for s in self.wiki_abbrev_sources.values(): s['page'] = wikipedia.Page(self.site, s['name']) self.restore_abbreviations() # 3. ABBREVIATIONS # # We want to be able to find abbreviations and turn them into links # to the appropriate article. For example, given the abbreviation # 'L.' we need to generate the wikitext '[[Carolus Linnaeus|L.]]'. # This section includes the code for finding, storing, and updating # these abbreviations. # 3.1 LOADING AND SAVING ABBREVIATIONS # Load abbreviations from disk. def restore_abbreviations(self): self.abbrev = {} if os.path.isfile(self.authfile): f = open(self.authfile, 'r') if f: self.abbrev = pickle.load(f) f.close() # Save authorities to disk. def save_abbreviations(self): f = file('authority.dat', 'w') pickle.dump(self.abbrev, f) f.close() def unhtmlify(self, s): s = s.decode('iso-8859-1') while 1: m = re.search(r'&([a-z]+);', s) if not m: break s = (s[:m.start(0)] + unichr(htmlentitydefs.name2codepoint[m.group(1)]) + s[m.end(0):]) return s # Normalize the unicode string 's' into ASCII. The idea is to store # the authority Lac'ep`ede under the key 'Lacepede' so that # inconsistent accentuation doesn't cause us to miss an # abbreviation. We decompose all composed characters and then ignore # everything non-ASCII. (This converts eacute->e etc.) def normalize(self, s): return unicodedata.normalize('NFD', unicode(s)).encode('ascii', 'ignore') # Add an abbreviation to the table. 'abbrev' is the abbreviation; # 'article' is the title of the Wikipedia article on that authority; # 'code' is the code for the list from which it came, if any. 
    def add_abbreviation(self, abbrev, article, code = None):
        # The table is keyed by the accent-stripped form of the abbrev
        # so inconsistent accentuation still finds the entry; each value
        # is a list of (abbrev, article, code) tuples.
        key = self.normalize(abbrev)
        if not self.abbrev.has_key(key):
            self.abbrev[key] = []
        for a in self.abbrev[key]:
            # Do we already have this authority under this abbreviation?
            if abbrev == a[0] and article == a[1]:
                return
        self.abbrev[key].append((abbrev, article, code))

    # 3.2 USER INTERFACE FOR ADDING A NEW ABBREVIATION

    # If we don't find an abbreviation in any of wiki_abbrev_sources, we
    # can prompt the user to tell us the article title corresponding to
    # the abbreviation; then we can add it to the relevant source.

    # Return the normalized surname of the abbreviation.
    def surname(self, abbrev):
        m = re.search(r'(?ui)(?:de |von |d\')?[\w-]+\.?$',
                      self.normalize(abbrev))
        if m:
            return m.group(0)
        else:
            wikipedia.output(u"No surname for %s" % abbrev)
            # Fall back to 'a' so callers still get a sortable key.
            return 'a'

    # 'abbrev' is the abbreviation for the authority described at
    # 'article'.  Add this to the source given by 'code'.
    def add_abbreviation_to_source(self, abbrev, article, code):
        source = self.wiki_abbrev_sources[code]
        text = source['page'].get()
        if source['sort'] == 'surname':
            sortkey = self.surname(abbrev)
        else:
            sortkey = abbrev
        groups = source.get('groups', (1,2))
        # Format authority for insertion into the source.
        fmt = source['fmt']
        fmt = re.sub('%A', abbrev, fmt)
        # NOTE(review): this test looks wrong -- a disambiguated title
        # ends in ')', not '(', so the piped-link branch (presumably
        # intended for titles like 'Smith (botanist)') can hardly ever
        # fire.  Confirm intent before changing.
        if article[-1] == '(':
            fmt = re.sub('%B', article + '|', fmt)
        else:
            fmt = re.sub('%B', article, fmt)
        # Go through abbreviations in the source until we get to the
        # appropriate point in alphabetical order by surname.
        for m in re.finditer(source.get('re', self.auth_re), text):
            newtext = None
            if source['sort'] == 'surname':
                s2 = self.surname(m.group(groups[0]))
            else:
                s2 = m.group(groups[0])
            if sortkey[0] != s2[0]:
                # Sort keys not in the same letter of the alphabet.
                continue
            elif sortkey < s2:
                # New abbrev goes before this one.
                newtext = text[:m.start(0)] + fmt + text[m.start(0):]
            elif re.match(r'(?: *\r?\n)*==', text[m.end(0):]):
                # We've reached the end of the section for the right
                # letter, but not found anywhere to put the new
                # abbrev.  So it goes at the end.
                newtext = text[:m.end(0)] + fmt + text[m.end(0):]
            else:
                continue
            # Found a place for it.
            wikipedia.showDiff(source['page'].get(), newtext)
            if wikipedia.input(u'OK? [yN]') == 'y':
                source['page'].put(newtext, 'nomialbot - adding %s = %s'
                                   % (abbrev, article))
            return
        wikipedia.output(u'Sorry, nowhere to put authority %s' % fmt)

    # 'abbrev' is the abbreviation for the authority described at
    # 'article'.  Ask the user which source to add it to.
    def user_add_abbreviation(self, abbrev, article):
        for code, source in self.wiki_abbrev_sources.items():
            wikipedia.output(u'(%s) %s' % (code, source['name']))
        if self.noquery:
            inp = None
        else:
            inp = wikipedia.input(
                u"Add abbreviation %s = %s to which source? [%s]"
                % (abbrev, article,
                   ''.join(self.wiki_abbrev_sources.keys())))
        if self.wiki_abbrev_sources.has_key(inp):
            # Known source: record locally, persist, and edit the
            # on-wiki list too.
            self.add_abbreviation(abbrev, article, inp)
            self.save_abbreviations()
            self.add_abbreviation_to_source(abbrev, article, inp)
        else:
            # No source chosen: record locally only.
            self.add_abbreviation(abbrev, article)
            self.save_abbreviations()

    # 3.3 FINDING EXPANSIONS FOR ABBREVIATIONS

    # Rebuild table of authorities from the Wikipedia articles listed in
    # 'wiki_abbrev_sources'.
    def rebuild_abbreviations(self):
        # Fetch all source pages in one batch, then parse each one.
        wikipedia.getall(self.site,
                         map(lambda l: l['page'],
                             self.wiki_abbrev_sources.values()))
        for code, s in self.wiki_abbrev_sources.items():
            for m in re.finditer(s.get('re', self.auth_re),
                                 s['page'].get()):
                groups = s.get('groups', (1,2))
                abbrevs = m.group(groups[0])
                pagename = m.group(groups[1])
                # One authority may have several comma-separated abbrevs.
                for a in re.split(r', +', abbrevs):
                    self.add_abbreviation(a, pagename, code)
        self.save_abbreviations()

    # User interface for finding an abbreviation using the stored
    # abbrevs, returning the pair (abbrev, expansion) or None.
    def find_abbreviation_in_store(self, abbrev):
        key = self.normalize(abbrev)
        if self.abbrev.has_key(key):
            if len(self.abbrev[key]) == 1:
                return self.abbrev[key][0]
            # Several candidates: ask the user to pick one by number.
            for i in range(len(self.abbrev[key])):
                wikipedia.output(u'(%d) %s'
                                 % (i + 1, self.abbrev[key][i][1]))
            while 1:
                i = wikipedia.input(u"Which authority? [1-%d]"
                                    % len(self.abbrev[key]))
                if (re.match(r'[0-9]+$', i)
                    and int(i) - 1 in range(len(self.abbrev[key]))):
                    break
            return (abbrev, self.abbrev[key][int(i) - 1][1])
        return None

    # Find abbreviation using 'other_abbrev_sources', returning the pair
    # (abbrev, expansion) or None.
    def find_abbreviation_other(self, abbrev):
        # TODO: check source[taxon]
        for source in self.other_abbrev_sources:
            url = re.sub('%A', urllib.quote(abbrev), source['url'])
            wikipedia.output(u'Trying %s' % url)
            f = urllib.urlopen(url)
            r = re.sub('%A', re.escape(abbrev), source['re'])
            m = re.search(r, f.read())
            f.close()
            if m:
                e = self.unhtmlify(m.group(1))
                self.user_add_abbreviation(abbrev, e)
                return (abbrev, e)
        return None

    # User interface for finding abbreviation using Wikipedia, returning
    # its expansion, or None.
    def find_abbreviation_wiki(self, abbrev):
        # See if there's a Wikipedia page for the abbrev.
        pl = wikipedia.Page(self.site, abbrev)
        if not pl.exists():
            expansions = []
        elif pl.isRedirectPage():
            expansions = [wikipedia.Page(self.site,
                                         pl.getRedirectTarget())]
        elif pl.isDisambig():
            expansions = pl.linkedPages()
        else:
            expansions = []
        for i in range(len(expansions)):
            wikipedia.output(u'(%d) %s' % (i + 1, expansions[i].title()))
        while 1:
            # NOTE(review): the prompt advertises [aecq] but 'l' (list
            # expansions) is also accepted below.
            if expansions:
                inp = wikipedia.input(u'Expansion for %s? [1-%d;aecq]'
                                      % (abbrev, len(expansions)))
            else:
                inp = wikipedia.input(u'Expansion for %s? [aecq]'
                                      % abbrev)
            if inp == 'a':
                abbrev = wikipedia.input(u'Enter new abbrev:')
                return self.find_abbreviation(abbrev)
            elif inp == 'e':
                expansion = wikipedia.input(u'Enter expansion for %s:'
                                            % abbrev)
                self.user_add_abbreviation(abbrev, expansion)
                return (abbrev, expansion)
            elif (re.match(r'[0-9]+$', inp)
                  and int(inp) - 1 in range(len(expansions))):
                expansion = expansions[int(inp) - 1].title()
                self.user_add_abbreviation(abbrev, expansion)
                return (abbrev, expansion)
            elif inp == 'c':
                return None
            elif inp == 'q':
                raise Error, "Quit requested"
            elif inp == 'l':
                for i in range(len(expansions)):
                    wikipedia.output(u'(%d) %s' % (i + 1, expansions[i]))
            else:
                wikipedia.output(
                    u'<number> = choose expansion;\n'
                    u'a = enter new abbreviation\n'
                    u'e = enter expansion\n'
                    u'c = continue (with no expansion for abbreviation)\n'
                    u'l = list expansions\n'
                    u'q = quit\n')

    # Find expansion for abbreviation using all available methods,
    # returning the pair (abbrev, expansion) or just abbrev if nothing
    # found.
    def find_abbreviation(self, abbrev):
        if abbrev:
            return (self.find_abbreviation_in_store(abbrev)
                    or self.find_abbreviation_other(abbrev)
                    or self.find_abbreviation_wiki(abbrev)
                    or (abbrev,))
        else:
            return ('',)

    # Turn an (abbrev, expansion) pair into a piped wikilink; a bare
    # 1-tuple (no expansion found) yields the plain abbreviation.
    def wikify_abbreviation(self, expansion):
        if 2 <= len(expansion):
            return u'[[%s|%s]]' % (expansion[1], expansion[0])
        else:
            return expansion[0]

    # 4. FINDING THE AUTHORITY FOR A TAXON

    # 'format_authority' takes an 'authority', splits it into its
    # component authorities, makes wikilinks for those components, and
    # returns a wikitext string.
    def format_authority(self, authority):
        # The separators (years, 'in', 'and', '&', 'ex', 'et al.',
        # parentheses) are captured and re-joined between the wikified
        # abbreviations.
        # NOTE(review): ' +& +' appears twice in this alternation; one
        # copy was probably ' +&amp; +' before the wiki page was
        # rendered to HTML -- confirm against the original source.
        r = re.compile(r'^\(|, +[0-9]*| +[0-9]+| +in +| +and +|'
                       r' *\bex\.? +| +& +| +& +|\) *|'
                       r' +et al\.?')
        abbrevs = r.split(authority)
        joins = r.findall(authority)
        expansions = map(self.wikify_abbreviation,
                         map(self.find_abbreviation, abbrevs))
        return reduce(lambda x,y: x+y,
                      map(lambda x: x[0]+x[1],
                          zip(expansions, joins + [''])))

    # 'find_authority' returns the authority for the given taxon. 'text'
'text' # is the text of the Wikipedia article about that taxon. def find_authority(self, taxon, text): rank = self.rank_of_taxon(taxon, text) subject = self.rank_to_subject.get(rank, 'High') for source in self.auth_sources: if (source.has_key('taxon') and not re.search(r'(?m)^\| [a-z_]+ *= *\[\[%s' % source['taxon'], text)): continue url = re.sub('%T', urllib.quote(taxon), source['url']) url = re.sub('%S', subject, url) url = re.sub('%R', rank, url) wikipedia.output(u'Trying %s' % url) f = urllib.urlopen(url) r = re.sub('%T', re.sub(r'\\? +', r'(?: +|</i> +<i>)', re.escape(taxon)), source['re']) m = re.search(r, f.read()) f.close() if m: return self.unhtmlify(m.group(1)) wikipedia.output(u'No authority found for %s' % taxon) return None # 5. UPDATING THE AUTHORITY FOR AN ARTICLE kingdom_map = { 'Plant': 'Plantae', 'Animal': 'Animalia', 'Bacterium': 'Bacteria', 'Fungus': 'Fungi', 'Protist': 'Protista', } def kingdom(self, text): m = re.search(r'(?m)^\| *regnum *= *\[\[([^\|\]]+)', text) if m: return self.kingdom_map.get(m.group(1), m.group(1)) else: raise Error, "No kingdom found." def rank_of_taxon(self, taxon, text): if re.match(r'^[\w-]+ [\w-]+ [\w-]+$', taxon): return 'subspecies' elif re.match(r'^[\w-]+ [\w-]+$', taxon): return 'species' m = re.search(r'(?m)^\| *((?!name)[a-z_]+) *= *' r'[ \']*\[*%s[^\w]\]*[ \']*$' % re.escape(taxon), text) if not m: raise Error, "Can't find taxon %s in taxobox" % taxon return m.group(1) kingdom_to_color = { 'Animalia': 'pink', 'Plantae': 'lightgreen', 'Fungi': 'lightblue', 'Archaea': 'darkgray', 'Protista': 'khaki', 'Bacteria': 'lightgrey', } # 'find_article' takes the name of an article to start looking at, # and returns a Page object. 
    def find_article(self, article):
        # Follow redirects and resolve disambiguation pages until we
        # reach a real article; offer to create a redirect when the page
        # doesn't exist at all.
        while 1:
            pl = wikipedia.Page(self.site, article)
            if not pl.exists():
                wikipedia.output(u"No page %s" % pl.title())
                i = wikipedia.input(u"Redirect to:")
                if not i:
                    raise Error, "Quit requested"
                pl.put(u"#REDIRECT [[%s]]" % i,
                       u"nomialbot - redirecting scientific name %s to %s"
                       % (article, i))
                article = i
            elif pl.isRedirectPage():
                article = pl.getRedirectTarget()
            elif pl.isDisambig():
                links = pl.linkedPages()
                for i in range(len(links)):
                    wikipedia.output(u'(%d) %s' % (i + 1, links[i]))
                inp = wikipedia.input(u'Choose which article? [1-%d]'
                                      % len(links))
                if (re.match(r'[0-9]+$', inp)
                    and int(inp) - 1 in range(len(links))):
                    article = links[int(inp) - 1].title()
                else:
                    raise Error, "Quit requested"
            else:
                return pl

    # 'add_authority_to_article' takes a Page object, a taxon and an
    # authority.  It adds the authority to that page.
    def add_authority_to_article(self, pl, taxon, authority, expand = True):
        text = pl.get()
        text = self.tidy_taxobox(text)
        if expand:
            authority = self.format_authority(authority)
        rank = self.rank_of_taxon(taxon, text)
        # NOTE(review): 'kingdom' is computed but never used below; the
        # call also validates that the taxobox has a regnum row, so it
        # is not a pure no-op.
        kingdom = self.kingdom(text)
        # The authority parameter goes next to the row naming the taxon:
        # binomial/trinomial for (sub)species, otherwise the rank row.
        if rank == 'species':
            test_param = 'binomial'
            auth_param = 'binomial_authority'
        elif rank == 'subspecies':
            test_param = 'trinomial'
            auth_param = 'trinomial_authority'
        else:
            test_param = rank
            auth_param = rank + '_authority'
        m = re.search('(?m)^\| *%s *=.*$' % re.escape(test_param), text)
        if not m:
            raise Error, "Can't find rank %s in %s" % (test_param,
                                                       pl.title())
        m1 = re.search(r'(?m)^\| *%s *= *(.*)' % re.escape(auth_param),
                       text)
        if not m1:
            # No authority row yet: insert one after the taxon row.
            text = (text[:m.end(0)]
                    + u'\n| %s = %s' % (auth_param, authority)
                    + text[m.end(0):])
        elif wikipedia.input(u'%s already has authority "%s". '
                             u'Replace? [yN]'
                             % (taxon, m1.group(1))) == 'y':
            # NOTE(review): the replacement starts with '\n' although
            # text[:m1.start(0)] already ends at a line boundary, so this
            # appears to leave an extra blank line -- confirm.
            text = (text[:m1.start(0)]
                    + u'\n| %s = %s' % (auth_param, authority)
                    + text[m1.end(0):])
        wikipedia.showDiff(pl.get(), text)
        if pl.get() != text and (self.noquery
                                 or (wikipedia.input(u"OK? [yN]")
                                     == 'y')):
            pl.put(text, u'nomialbot - adding authority for %s %s'
                   % (taxon, authority))

    # Add 'authority' to the taxobox for 'taxon' in 'article'.
    def add_authority(self, article, taxon, authority, expand = True):
        pl = self.find_article(article)
        if pl:
            self.add_authority_to_article(pl, taxon, authority, expand)

    # Look up the authority for 'taxon' and add it to 'article'.
    def find_and_add_authority(self, article, taxon, expand = True):
        pl = self.find_article(article)
        if not pl:
            return
        authority = self.find_authority(taxon, pl.get())
        if authority:
            self.add_authority_to_article(pl, taxon, authority, expand)

    # 7. GENERAL TIDYING

    # Unconditional (pattern, replacement) substitutions applied to the
    # article text before editing.
    subs = [
        # Capitalize "Taxobox"
        (r'{{taxobox', '{{Taxobox'),
        # Italicise genus entry.
        (r'(?m)^\| * genus *=[ \']*\[\[([^\]]+)\]\][ \']*$',
         '| genus = \'\'[[\\1]]\'\''),
        # Abbreviate genus in species entry.
        (r'(?m)^\| *species *= *([\']*)([A-Z])[a-z]+ ([a-z]+)',
         r'| species = \1\2. \3'),
        # Supply missing genus abbrev in species entry.
        (r'(?m)^(\| *genus *=[ \'\[]*([A-Z])[a-z]+[\] \']* *\n'
         r'\| *species *=[ \']*)([a-z-]+[ \']*$)',
         r'\1\2. \3'),
        # Supply missing species entry.
        (r'(?m)(^\| *genus *=.*\n)'
         r'(\| * binomial *= *'
         r'([A-Z])[a-z]+ ([a-z-]+))',
         r"\1| species = '''''\3. \4'''''\n\2"),
        # Italicise genus or species if it appears as the title.
        (r'(?ms)^\| *name *= *([a-z -]+[a-z]) *(\n.*'
         r'^\| *(?:genus|species) *=[ \'\[]*\1[ \'\]]*$)',
         '| name = \'\'\\1\'\'\\2'),
        # Bold genus if unlinked.
        (r'(?m)^\| *genus *= *\'*(\w+)\'* *$',
         "| genus = '''''\\1'''''"),
        # Cut superfluous taxa.
        (r'(?m)(?:^\| *(?!(?:regnum|phylum|divisio|classis|ordo|familia|genus|species))'
         r'(?:super|sub|infra|nano)(?:regnum|phylum|divisio|classis|ordo|familia|genus|species) *=.*\n)+'
         r'(^\| *(?:regnum|phylum|divisio|classis|ordo|familia|genus|species)'
         r' *=.*\n)'
         r'(?=^\| *[a-z]+ *=.*$)',
         r'\1'),
    ]

    # Substitutions applied only when every pattern in the first element
    # matches the text.
    conditional_subs = [
        # Bold species entry if subject of article.
        ([r'(?m)^\| *binomial *='],
         r'(?m)^\| *species *=[ \']*([^\]\'\}]+)[ \']*$',
         '| species = \'\'\'\'\'\\1\'\'\'\'\''),
        # Bold subspecies entry if subject of article.
        ([r'(?m)^\| *trinomial *='],
         r'(?m)^\| *subspecies *=[ \']*([^\]\'\}]+)[ \']*$',
         '| subspecies = \'\'\'\'\'\\1\'\'\'\'\''),
    ]

    # Substitutions applied only when none of the patterns in the first
    # element match the text.
    anticonditional_subs = [
        # Supply missing binomial entry.
        ([r'(?m)^\| *binomial *=', r'(?m)^\| *subspecies *='],
         r'(?m)(^\| *genus *=[ \'\[]*([A-Z])([a-z]+)[ \'\]]*\n(?:.*\n)*'
         r'(?m)^\| *species *=[ \']*\2. ([a-z-]+)[ \']*\n)',
         r"\1| binomial = ''\2\3 \4''\n"),
        ([r'(?m)^\| *binomial *=', r'(?m)^\| *subspecies *='],
         r'(?m)(^\| *species *=[ \']*([A-Z][a-z]+ [a-z-]+)[ \']*\n)',
         r"\1| binomial = ''\2''\n"),
    ]

    # Apply all of the substitution tables above to 'text', then add a
    # FishBase reference for fish articles that lack one.  Returns the
    # tidied text.
    def tidy_taxobox(self, text):
        for s in self.subs:
            text = re.sub(s[0], s[1], text)
        for s in self.conditional_subs:
            ok = True
            for c in s[0]:
                if not re.search(c, text):
                    ok = False
                    break
            if ok:
                text = re.sub(s[1], s[2], text)
        for s in self.anticonditional_subs:
            ok = True
            for c in s[0]:
                if re.search(c, text):
                    ok = False
                    break
            if ok:
                text = re.sub(s[1], s[2], text)
        # Add FishBase reference.
        if (re.search(r'(?m)^\| *[a-z_]+ *= *'
                      r'\[\[(?:Actinopterygii|Chondrichthyes)\]\]$', text)
            and not re.search(r'{{FishBase', text)):
            m1 = re.search(r'(?m)^\| * genus *=[ \'\[]*'
                           r'([A-Z][a-z]+)[ \'\]]*$', text)
            m2 = re.search(r'(?m)^\| species *=[ \']*'
                           r'(?:[A-Z]\. )?([a-z-]+)[ \']*$', text)
            if m1 and m2:
                ref = time.strftime('{{FishBase species | genus = %s | '
                                    'species = %s | month = %%B | year = %%Y}}'
                                    % (m1.group(1), m2.group(1)))
            elif m1:
                ref = time.strftime('{{FishBase genus | genus = %s | '
                                    'month = %%B | year = %%Y}}'
                                    % m1.group(1))
            else:
                ref = None
            if ref:
                # Put it under == References ==, or create that section
                # just before any stubs/interwiki/category footer.
                m1 = re.search(r'==+ *References? *==+ *\n+', text)
                m2 = re.search(r'(?:(?:{{.*-stub}}|\[\[[a-z][a-z]:.*\]\]'
                               r'|\[\[Category:.*\]\])[ \n]*)*$', text)
                if m1:
                    text = (text[:m1.end(0)] + '* ' + ref + '\n'
                            + text[m1.end(0):])
                elif m2:
                    text = (text[:m2.start(0)] + '\n==References==\n* '
                            + ref + '\n' + text[m2.start(0):])
                else:
                    raise Error, "Nowhere to put FishBase reference"
        return text

    # 6. DISAMBIGUATION

    # Run solve_disambiguation on all botanical abbreviations.
def disambiguate(self): import solve_disambiguation for a in self.abbrev.values(): for aa in a: if aa[2] == 'b' and aa[0][-1] == '.': bot = solve_disambiguation.DisambiguationRobot( '0', [aa[1]], False, False, [aa[0]], False, True) bot.run() def badusage(): raise Error, ('Usage:\n' '%s --rebuild Rebuild abbreviation table\n' '%s --query=abbrev Query abbreviation\n' '%s taxon Find authority and add it to taxon\n' '%s taxon authority Add authority to taxon\n' % (sys.argv[0], sys.argv[0], sys.argv[0], sys.argv[0])) def main(): wikipedia.username = 'nomialbot' try: auth = Authority() article = None expand = True try: opts, args = getopt.getopt(sys.argv[1:], 'zdnra:q:', ['noexpand', 'rebuild', 'article=', 'query=', 'disambig', 'noquery']) for o, a in opts: if o in ('-q', '--query'): print auth.find_abbreviation(a.decode()) elif o in ('-r', '--rebuild'): auth.rebuild_abbreviations() elif o in ('-d', '--disambig'): auth.disambiguate() elif o in ('-a', '--article'): article = a elif o in ('-n', '--noexpand'): expand = False elif o in ('-z', '--noquery'): auth.noquery = True else: badusage() return except getopt.GetoptError: badusage() return if len(args) == 1: auth.find_and_add_authority(article or args[0], args[0], expand) elif len(args) == 2: auth.add_authority(article or args[0], args[0], args[1], expand) else: badusage() return except Error: return if __name__ == '__main__': try: main() finally: wikipedia.stopme()