User:Gdr/taxoconvert.py
#!/usr/bin/python
# -*- encoding:utf-8 -*-
#
# taxoconvert.py -- convert multi-template taxoboxes to single template

import codecs
import getopt
import os
import pickle
import re
import sys
import tempfile
import wikipedia

global checks, edit, debug
site = wikipedia.Site('en')
checks = True
edit = False
debug = False

class Error(Exception):
    def __init__(self, text):
        self.text = text
    def __str__(self):
        return self.text

class NoError(Error):
    None

def edittext(s):
    fn = tempfile.mktemp()
    f = codecs.open(fn, 'w', 'utf-8')
    f.write(s)
    f.close()
    os.system('%s "%s"' % (os.getenv('EDITOR', 'vi'), fn))
    f = codecs.open(fn, 'r', 'utf-8')
    s = f.read()
    f.close()
    return s

def canonize(s):
    return filter(lambda c: c.isalnum(), s).lower()

def check(text, newtext):
    if not checks:
        return newtext
    while 1:
        wikipedia.showDiff(text, newtext)
        i = wikipedia.input(u'OK? [yNeq]')
        if i == 'q':
            raise IOError
        elif i == 'y':
            return newtext
        elif i == 'e':
            newtext = edittext(newtext)
        else:
            return None

def record(params, key, value):
    if debug:
        wikipedia.output(u"%s = %s" % (key, value))
    if params.has_key(key):
        raise Error(u"Duplicate key %s" % key)
    if value:
        params['sequence'].append(key)
        params[key] = value

def parse_nomial(suffix, n, lines, params):
    if debug:
        wikipedia.output(u"parse_nomial: suffix = '%s', lines[n] = %s"
                         % (suffix, lines[n]))
    orig_n = n
    found = False
    m = re.match(r'(?i){{taxobox[ _]+section[ _]+((?:b|tr)inomial)'
                 r'(?:[ _]+(?:simple|botany|parens))? *\| *'
                 r'color *= *[a-z]+ *\| *'
                 r'\1_name *= *([^\|\}]*[^\|\} ]) *}}$', lines[n])
    if m:
        record(params, m.group(1) + suffix, "''%s''" % m.group(2))
        n += 1
        found = True
    m = re.match(r'(?i){{taxobox[ _]+section[ _]+((?:b|tr)inomial)'
                 r'(?:[ _]+part)? *\| *'
                 r'(?:color *= *[a-z]+ *\| *)?'
                 r'\1_name *= *([^\|\}]*[^\|\} ]) *\| *'
                 r'author *= *([^\}]*[^\} ]) *\| *'
                 r'date *= *([^\|\}]*[^\|\} ]) *}}$', lines[n])
    if m and not found:
        record(params, m.group(1) + suffix, "''%s''" % m.group(2))
        record(params, '%s%s_authority' % (m.group(1), suffix),
               '%s, %s' % (m.group(3), m.group(4)))
        n += 1
        found = True
    m = re.match(r'(?i){{taxobox[ _]+section[ _]+((?:b|tr)inomial) *\| *'
                 r'color *= *[a-z]+ *\| *'
                 r'\1_name *= *([^\|\}]*[^\|\} ]) *\| *'
                 r'author *= *([^\}]*[^\} ]) *\| *'
                 r'date *= *}}$', lines[n])
    if m and not found:
        record(params, m.group(1) + suffix, "''%s''" % m.group(2))
        record(params, '%s%s_authority' % (m.group(1), suffix), m.group(3))
        n += 1
        found = True
    m = re.match(r'(?i){{taxobox[ _]+section[ _]+((?:b|tr)inomial)'
                 r'(?:[ _]+(?:parens|botany|simple))? *\| *'
                 r'color *= *[a-z]+ *\| *'
                 r'\1_name *= *([^\|\}]*[^\|\} ]) *\| *'
                 r'author *= *\| *'
                 r'date *= *}}$', lines[n])
    if m and not found:
        record(params, m.group(1) + suffix, "''%s''" % m.group(2))
        n += 1
        found = True
    m = re.match(r'(?i){{taxobox[ _]+section[ _]+((?:b|tr)inomial)'
                 r'[ _]+parens(?:[ _]+part)? *\| *'
                 r'(?:color *= *[a-z]+ *\| *)?'
                 r'\1_name *= *([^\|\}]*[^\|\} ]) *\| *'
                 r'author *= *([^\}]*[^\} ]) *\| *'
                 r'date *= *([^\|\}]*[^\|\} ]) *}}$', lines[n])
    if m and not found:
        record(params, m.group(1) + suffix, "''%s''" % m.group(2))
        record(params, '%s%s_authority' % (m.group(1), suffix),
               '(%s, %s)' % (m.group(3), m.group(4)))
        n += 1
        found = True
    m = re.match(r'(?i){{taxobox[ _]+section[ _]+((?:b|tr)inomial)'
                 r'[ _]+botany *\| *'
                 r'color *= *[a-z]+ *\| *'
                 r'\1_name *= *([^\|\}]*[^\|\} ]) *\| *'
                 r'author *= *([^\}]*[^\} ]|) *}}$', lines[n])
    if m and not found:
        record(params, m.group(1) + suffix, "''%s''" % m.group(2))
        record(params, '%s%s_authority' % (m.group(1), suffix), m.group(3))
        n += 1
        found = True
    if n + 1 < len(lines):
        m = re.match(r'(?i){{taxobox[ _]+section[ _]+((?:b|tr)inomial)'
                     r'[ _]+botany *\| *'
                     r'color *= *[a-z]+ *\| *'
                     r'\1_name *= *([^\|\}]*[^\|\} ]) *\| *'
                     r'author *= *([^\}]*[^\} ]|) *}}$', lines[n] + lines[n+1])
        if m and not found:
            record(params, m.group(1) + suffix, "''%s''" % m.group(2))
            record(params, '%s%s_authority' % (m.group(1), suffix), m.group(3))
            n += 2
            found = True
    m = re.match(r'(?i){{taxobox[ _]+image *\| *image *= *'
                 r'\[\[ *Image: *([^\|\]]*[^\|\] ]) *'
                 r'(?:\| *([0-9]+px))?(?:\|[^\]]*)?\]\] *\| *'
                 r'caption *= *([^\}]*[^\} ]|) *}}$', lines[n])
    if m and re.search(r'(?i)(?:range|distribution)', lines[n]):
        record(params, 'range_map%s' % suffix, m.group(1))
        record(params, 'range_map%s_width' % suffix, m.group(2))
        record(params, 'range_map%s_caption' % suffix, m.group(3))
        n += 1
    return (n, orig_n != n)

def parse(text, linkname):
    """parse(text, linkname) -- parse multi-template taxobox from 'text' and
    return it as a dictionary suitable for constructing a taxobox template."""
    params = {'sequence': []}
    text = re.sub(r'(?m)[ \t\r]+$', '', text)
    if 1 < len(re.findall(r'(?i){{taxobox[ _]+begin *\|', text)):
        raise Error(u"Two occurrences of {{taxobox begin}}.")
    if 1 < len(re.findall(r'(?i){{taxobox[ _]+end *}}', text)):
        raise Error(u"Two occurrences of {{taxobox end}}.")
    m = re.search(r'(?is){{taxobox[ _]+begin.*{{taxobox[ _]+end *}}', text)
    if not m:
        global done
        done[linkname] = True
        raise NoError(u"Can't find taxobox.")
    lines = re.split(r'(?: *(?:</?br */?>(?= *(?:{{|<))|\n) *)+', m.group(0))
    n = 0
    m1 = re.match(r'(?i){{taxobox[ _]+begin *\| *color *= *([a-z]+) *\| *'
                  'name *= *(.*[^ ]) *}}[ \t]*(?:<br */?> *)?$', lines[n])
    m2 = re.match(r'(?i){{taxobox[ _]+begin *\| *name *= *(.*[^ ]) *\| *'
                  'color *= *([a-z]+) *}}[ \t]*(?:<br */?> *)?$', lines[n])
    if m1:
        record(params, 'color', m1.group(1))
        record(params, 'name', m1.group(2))
        n += 1
    elif m2:
        record(params, 'color', m2.group(2))
        record(params, 'name', m2.group(1))
        n += 1
    else:
        raise Error(u"Can't find {{taxobox begin}}: %s" % lines[n])
    m = re.match(r'(?i){{(?:template:)?(status[^\}]+)}}', lines[n])
    if m:
        record(params, 'status', '{{%s}}' % m.group(1))
        n += 1
    m = re.match(r'(?i)(?:<small> *)?fossil +(?:range|record): +([^<\n]*[^<\n ]) *'
                 r'(?:</small>)?', lines[n])
    if m:
        record(params, 'fossil_range', m.group(1))
        n += 1
    if re.match(r'(?i)<!--.*-->', lines[n]):
        n += 1
    image_re = (r'(?i){{taxobox[ _]+image *\| *image *= *'
                r'\[\[ *Image: *([^\|\]]*[^\|\] ]) *'
                r'(?:\| *([0-9]+px))?(?:\|.*)?\]\] *\| *'
                r'caption *= *([^\}]*[^\} ]|) *}}$')
    m1 = re.match(image_re, lines[n])
    m2 = re.match(image_re, lines[n] + lines[n+1])
    m3 = re.match(r'(?i){{taxobox[ _]+image *\| *image *= *'
                  r'\[\[ *Image: *([^\|\]]*[^\|\] ]) *'
                  r'(?:\| *([0-9]+px))?(?:\|.*)?\]\] *}}$', lines[n])
    if m1:
        record(params, 'image', m1.group(1))
        record(params, 'image_width', m1.group(2))
        record(params, 'image_caption', m1.group(3))
        n += 1
    elif m2:
        record(params, 'image', m2.group(1))
        record(params, 'image_width', m2.group(2))
        record(params, 'image_caption', m2.group(3))
        n += 2
    elif m3:
        record(params, 'image', m3.group(1))
        record(params, 'image_width', m3.group(2))
        n += 1
    m = re.match(image_re, lines[n])
    if m:
        record(params, 'image2', m.group(1))
        record(params, 'image2_width', m.group(2))
        record(params, 'image2_caption', m.group(3))
        n += 1
    if re.match(r'(?i){{taxobox[ _]+image *\| *image *= *\| *caption *= *}}$',
                lines[n]):
        n += 1
    if re.match(r'(?i){{taxobox[ _]+image *\| *image *= *(?:|\|.*)}}$', lines[n]):
        n += 1
    if re.match(r'(?i){{taxobox[ _]+image.*(?:Image with unknown copyright status removed|Unsourced image removed)', lines[n]):
        n += 1
    if re.match(r'(?i)<!--.*-->', lines[n]):
        n += 1
    if re.match(r'(?is)<!--.*-->', lines[n] + lines[n+1]):
        n += 2
    m = re.match(r'(?i){{taxobox[ _]+begin[ _]+placement *\| *'
                 r'color *= *[a-z]+ *}}$', lines[n])
    if not m:
        raise Error(u"Can't find {{taxobox begin placement}}: %s" % lines[n])
    n += 1
    while n < len(lines):
        m0 = re.match(r'(?i){{taxobox[ _]+([a-z_]+)[ _]+entry[ _]*\| *'
                      r'taxon *= *([^\}]*[^\} ]) *'
                      r'<small>(.*)</?small>}}$', lines[n] + lines[n+1])
        if m0:
            record(params, m0.group(1), m0.group(2))
            record(params, m0.group(1) + '_authority', m0.group(3))
            n += 2
            continue
        m1 = re.match(r'(?i){{taxobox[ _]+([a-z_]+)[ _]+entry[ _]*\| *'
                      r'taxon *= *([^\}]*[^\} ]) *}}(?:<br */?>)?$', lines[n])
        if not m1:
            break
        record(params, m1.group(1), m1.group(2))
        n += 1
        m2 = re.match(r'(?i){{taxobox[ _]+authority *\| *'
                      r'author *= *([^\}]*[^\} ]) *\| *'
                      r'date *= *([^\|\}]*[^\|\} ]) *}}$', lines[n])
        if m2:
            record(params, m1.group(1) + '_authority',
                   '%s, %s' % (m2.group(1), m2.group(2)))
            n += 1
            continue
        m3 = re.match(r'(?i){{taxobox[ _]+authority[ _]+parens *\| *'
                      r'author *= *([^\}]*[^\} ]) *\| *'
                      r'date *= *([^\|\}]*[^\|\} ]) *}}$', lines[n])
        if m3:
            record(params, m1.group(1) + '_authority',
                   '(%s, %s)' % (m3.group(1), m3.group(2)))
            n += 1
            continue
        m4 = re.match(r'(?i){{taxobox[ _]+authority[ _]+(?:new|botany)? *\| *'
                      r'author(?:ity)? *= *([^\}]*[^\} ]) *}}$', lines[n])
        if m4:
            record(params, m1.group(1) + '_authority', m4.group(1))
            n += 1
            continue
        m5 = re.match(r'(?i)<small> *(.*[^ ]) *(?:</?small>)?', lines[n])
        if m5:
            record(params, m1.group(1) + '_authority', m5.group(1))
            n += 1
            continue
    m = re.match(r'(?i){{taxobox[ _]+end[ _]+placement(?: *\| *color *= *[a-z]+ *)?}}$',
                 lines[n])
    if not m:
        raise Error(u"Expected {{taxobox end placement}}: %s" % lines[n])
    n += 1
    n, found = parse_nomial('', n, lines, params)
    if found:
        n, found = parse_nomial('2', n, lines, params)
    if found:
        n, found = parse_nomial('3', n, lines, params)
    if found:
        n, found = parse_nomial('4', n, lines, params)
    m = re.match(r'(?i){{taxobox[ _]+section[ _]+type[ _]+species *\| *'
                 r'color *= *[a-z]+ *\| *'
                 r'species *= *([^\}]*[^\} ]) *\| *'
                 r'comment *= *([^\}]*[^\} ]|) *}}$', lines[n])
    if m:
        record(params, 'type_species', "''%s''" % m.group(1))
        record(params, 'type_species_authority', m.group(2))
        n += 1
    if re.match(r'(?i)<!--.*-->', lines[n]):
        n += 1
    m = re.match(r'(?i){{taxobox[ _]+begin[ _]+synonyms *\| *'
                 r'color *= *[a-z]+ *}}$', lines[n])
    if m:
        n += 1
        syn = []
        while 1:
            m1 = re.match(r'(?i){{taxobox[ _]+synonym[ _]+entry[ _]+simple'
                          r' *\| *binomial_name *= *([^\}]*[^\} ]) *}}$', lines[n])
            m2 = re.match(r'(?i){{taxobox[ _]+synonym[ _]+entry[ _]+botany'
                          r' *\| *binomial_name *= *([^\}]*[^\} ]) *\| *'
                          r'author *= *([^\}]*[^\} ]) *}}$', lines[n])
            m3 = re.match(r'(?i){{taxobox[ _]+synonym[ _]+entry *\| *'
                          r'binomial_name *= *([^\|\}]*[^\|\} ]) *\| *'
                          r'author *= *([^\}]*[^\} ]) *\| *'
                          r'date *= *([^\|\}]*[^\|\} ]) *}}$', lines[n])
            if m1:
                syn.append("''%s''" % m1.group(1))
            elif m2:
                syn.append("''%s'' <small>%s</small>"
                           % (m2.group(1), m2.group(2)))
            elif m3:
                syn.append("''%s'' <small>%s, %s</small>"
                           % (m3.group(1), m3.group(2), m3.group(3)))
            else:
                break
            n += 1
        record(params, 'synonyms', '<br/>'.join(syn))
        m = re.match(r'(?i){{taxobox[ _]+end[ _]+synonyms}}$', lines[n])
        if not m:
            raise Error(u"Expected {{taxobox synonyms end}} but found: %s"
                        % lines[n])
        n += 1
    if not params.has_key('binomial') and not params.has_key('trinomial'):
        n, found = parse_nomial('', n, lines, params)
    m = re.match(r'(?i){{taxobox[ _]+section[ _]+diversity *\| *'
                 r'color *= *[a-z]+ *\| *'
                 r'link *= *([^\}]*[^\} ]) *\| *'
                 r'diversity *= *([^\}]*[^\} ]) *}}$', lines[n])
    if m:
        record(params, 'diversity', m.group(2))
        record(params, 'diversity_link', m.group(1))
        n += 1
    m = re.match(r'(?i){{taxobox[ _]+section[ _]+(?:subdivision|list) *\| *'
                 r'color *= *[a-z]+ *\| *'
                 r'plural_taxon *= *([^\}]*[^\} ]) *}}$', lines[n])
    if not m:
        m = re.match(r'(?i){{taxobox[ _]+section[ _]+(?:subdivision|list) *\| *'
                     r'plural_taxon *= *([^\}]*[^\} ]) *\| *'
                     r'color *= *[a-z]+ *}}$', lines[n])
    if m:
        record(params, 'subdivision_ranks', m.group(1))
        n += 1
        m = n
        while not re.match(r'(?i){{taxobox', lines[n]):
            n += 1
        record(params, 'subdivision', '\n' + '\n'.join(lines[m:n]))
    if re.match(r'(?i)<!--.*-->', lines[n]):
        n += 1
    if n + 1 < len(lines) and re.match(r'(?i)<!--.*-->', lines[n] + lines[n+1]):
        n += 2
    m = re.match(r'(?i){{taxobox[ _]+end *}}$', lines[n])
    if not m:
        raise Error(u"Unrecognized line: %s" % lines[n])

    # Some other checks
    if params.has_key('norank'):
        raise Error(u"Can't handle {{taxobox norank entry}}, sorry.")
    if params.has_key('unranked'):
        raise Error(u"Can't handle {{taxobox unranked entry}}, sorry.")

    # Fix some simple mistakes.
    if (params.has_key('genus') and params.has_key('name')
            and params['genus'] == "'''''%s'''''" % params['name']):
        params['name'] = "''%s''" % params['name']
    if (params.has_key('binomial') and params.has_key('name')
            and params['binomial'] == "''%s''" % params['name']):
        params['name'] = "''%s''" % params['name']
    if (params.has_key('trinomial') and params.has_key('name')
            and params['trinomial'] == "''%s''" % params['name']):
        params['name'] = "''%s''" % params['name']
    if (params.has_key('image_caption')
            and canonize(params['image_caption'])
                in (canonize(params.get('name', '')),
                    canonize(params.get('binomial', '')),
                    canonize(params.get('trinomial', '')),
                    canonize(params.get('genus', '')) + 'sp',
                    canonize(params.get('name', '') + params.get('binomial', '')),
                    )):
        del params['image_caption']
    if params.has_key('binomial_authority'):
        params['binomial_authority'] = re.sub(r',,', ',',
                                              params['binomial_authority'])
    if params.has_key('trinomial_authority'):
        params['trinomial_authority'] = re.sub(r',,', ',',
                                               params['trinomial_authority'])
    if params.has_key('genus') and re.match(r"'''''[[.*]]'''''$", params['genus']):
        params['genus'] = params['genus'][3:-3]
    if params.has_key('name'):
        m = re.match(r"<center> *(.*[^ ]) *</center>$", params['name'])
        if m:
            params['name'] = m.group(1)
    if params.has_key('subdivision_ranks'):
        m = re.match(r"<center> *(.*[^ ]) *</center>$", params['subdivision_ranks'])
        if m:
            params['subdivision_ranks'] = m.group(1)
    if params.has_key('genus') and re.match(r"(''')?[^']+\1$", params['genus']):
        params['genus'] = "''%s''" % params['genus']
    if params.has_key('species') and re.match(r"(''')?[^']+\1$", params['species']):
        params['species'] = "''%s''" % params['species']
    if params.has_key('subspecies') and re.match(r"(''')?[^']+\1$", params['subspecies']):
        params['subspecies'] = "''%s''" % params['subspecies']
    if params.has_key('species') and params.has_key('binomial') and re.match(r"''[^']+''$", params['species']):
        params['species'] = "'''%s'''" % params['species']
    if params.has_key('subspecies') and params.has_key('trinomial') and re.match(r"''[^']+''$", params['subspecies']):
        params['subspecies'] = "'''%s'''" % params['subspecies']
    if params.has_key('subdivision') and canonize(params['subdivision']) == 'seetext':
        params['subdivision'] = '\nSee text.'
    if (params.has_key('binomial') and params.has_key('species')
            and re.match("'''''[^']*'''''$", params['species'])):
        m = re.match(r"'*([A-Z])[a-z-]* ([a-z-]*)'*", params['binomial'])
        if m:
            params['species'] = "'''''%s. %s'''''" % (m.group(1), m.group(2))
    if (params.has_key('trinomial') and params.has_key('subspecies')
            and re.match("'''''.*'''''$", params['subspecies'])):
        m = re.match(r"'*([A-Z])[a-z-]* ([a-z])[a-z-]* ([a-z][a-z-]*)'*",
                     params['trinomial'])
        if m:
            params['subspecies'] = ("'''''%s. %s. %s'''''"
                                    % (m.group(1), m.group(2), m.group(3)))
    return params

def convert(pl):
    text = pl.get()
    if edit:
        text = edittext(text)
    params = parse(text, pl.title())
    newtext = re.sub(r'(?is){{taxobox[ _]+begin *\|.*{{taxobox[ _]+end *}}',
                     '{{Taxobox\n'
                     + ''.join(map(lambda k: '| %s = %s\n' % (k, params[k]),
                                   filter(lambda s: params.has_key(s),
                                          params['sequence'])))
                     + '}}',
                     text)
    newtext = check(pl.get(), newtext)
    if newtext:
        status, reason, data = pl.put(newtext, u'nomialbot — converted multi-template taxobox to {{Taxobox}}')
        global done
        if data == '':
            done[pl.title()] = True

def convertmany():
    global site, n, linknames, done
    pages = map(lambda l: wikipedia.Page(site, l), linknames)
    fetched = []
    while n < len(linknames):
        try:
            if not done.get(linknames[n]):
                if linknames[n] not in fetched:
                    tofetch = filter(lambda p: not done.get(p.title()),
                                     pages[n:])[:50]
                    wikipedia.getall(site, tofetch)
                    fetched += map(lambda p: p.title(), tofetch)
                wikipedia.output("Trying %s" % linknames[n])
                # Only convert pages in the main (article) namespace.
                if pages[n].namespace() != 0:
                    done[pages[n].title()] = True
                    wikipedia.output(u"%s not in main namespace" % pages[n].title())
                else:
                    convert(pages[n])
        except wikipedia.LockedPage:
            wikipedia.output("%s is locked" % linknames[n])
        except wikipedia.IsRedirectPage:
            wikipedia.output("%s is redirect" % linknames[n])
            done[linknames[n]] = True
        except NoError:
            None
        except Error, e:
            wikipedia.output(u'***' + e.text)
        n += 1

def main():
    global checks, edit, debug
    offset = None
    reload = None
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'r:dneo:',
                                   ['reload=', 'debug', 'no-checks', 'edit',
                                    'offset='])
        for o, a in opts:
            if o in ('-n', '--no-checks'):
                checks = False
            elif o in ('-o', '--offset'):
                offset = int(a)
            elif o in ('-e', '--edit'):
                edit = True
            elif o in ('-d', '--debug'):
                debug = True
            elif o in ('-r', '--reload'):
                reload = a
            else:
                print "Bad option: %s" % o
                return
    except getopt.GetoptError:
        print "Bad command line"
        return
    global n, linknames, done
    done = {}
    try:
        f = file('taxoconvert.db', 'rb')
        n, linknames, done = pickle.load(f)
        f.close()
        if reload:
            raise IOError
    except IOError:
        tb = wikipedia.Page(site, reload)
        linknames = map(lambda p: p.title(), tb.getReferences())
        print len(linknames), "pages found"
        n = 0
    try:
        if offset != None:
            n = offset
        if args:
            for aa in args:
                convert(wikipedia.Page(site, aa))
        else:
            convertmany()
    finally:
        f = file('taxoconvert.db.new', 'wb')
        pickle.dump((n, linknames, done), f)
        f.close()
        os.rename('taxoconvert.db.new', 'taxoconvert.db')

if __name__ == '__main__':
    try:
        main()
    finally:
        wikipedia.stopme()
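A minimal sketch of how the script might be invoked, based on the options parsed in main() (-r/--reload, -o/--offset, -n/--no-checks, -e/--edit, -d/--debug). It assumes a configured pywikipedia installation; "Template:Taxobox begin" is only an illustrative page title for seeding the work list, and taxoconvert.db is the pickle file the script writes to resume between runs:

    python taxoconvert.py --reload "Template:Taxobox begin"    (seed the work list from pages linking to that page, then start converting)
    python taxoconvert.py --offset 100                          (resume batch conversion from index 100 of the saved work list)
    python taxoconvert.py "Some article"                        (convert a single, explicitly named page)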