User:MiszaBot/Source/archivebot.py

From Wikipedia, the free encyclopedia

[edit] archivebot.py

This is a bot that archives pages, with respect to a configuration file given in a separate XML file. Example config is available on User:MiszaBot/Source/archivemisza.xml. Bots running this framework include User:MiszaBot, User:EssjayBot II and User:EssjayBot II.

#!/usr/bin/env python

import sys, wikipedia, re, time, locale
from xml.parsers import expat

class Element(object):
  """A single node of a parsed XML tree.

  Holds the tag name, the attribute dictionary, the accumulated
  character data and an ordered list of child elements.
  """
  def __init__(self, name, attributes):
    self.name = name              # tag name
    self.attributes = attributes  # dict of XML attributes
    self.cdata = ''               # accumulated character data
    self.children = []            # child Element nodes, in document order
  def addChild(self, element):
    """Append a child element to this node."""
    self.children.append(element)
  def getAttribute(self, key):
    """Return the value of attribute key, or None if it is absent."""
    return self.attributes.get(key)
  def getData(self):
    """Return this element's character data."""
    return self.cdata
  def setData(self, newcdata):
    """Replace this element's character data."""
    self.cdata = newcdata
  def getElements(self, name=''):
    """Return the children; if name is given, only those with that tag."""
    if name:
      return [c for c in self.children if c.name == name]
    return list(self.children)
  def Print(self, handle, depth=0):
    """Write this element and its subtree as indented XML to handle.

    At depth 0 an XML declaration is emitted first.
    NOTE(review): attributes are not serialized and cdata is not
    escaped -- sufficient for this bot's simple config files, but not
    for general XML.
    """
    if depth == 0:
      handle.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n\n")
    if not self.children:
      handle.write("%s<%s>%s</%s>\n" % ("  "*depth, self.name, self.cdata, self.name))
    else:
      handle.write("%s<%s>\n" % ("  "*depth, self.name))
      for elem in self.children:
        elem.Print(handle, depth+1)
      handle.write("%s</%s>\n" % ("  "*depth, self.name))

class Xml2Obj(object):
  """Minimal expat-based parser that builds an Element tree from a file."""
  def __init__(self):
    self.root = None      # root Element once parsing has produced one
    self.nodeStack = []   # stack of currently open elements
  def StartElement(self, name, attributes):
    """expat callback: open a new element and attach it to its parent."""
    # .encode() keeps tag names as plain byte strings, matching the
    # str comparisons used by the rest of this (Python 2 era) script.
    element = Element(name.encode(), attributes)
    if self.nodeStack:
      self.nodeStack[-1].addChild(element)
    else:
      self.root = element
    self.nodeStack.append(element)
  def EndElement(self, name):
    """expat callback: close the innermost open element."""
    self.nodeStack.pop()
  def CharacterData(self, data):
    """expat callback: append non-whitespace character data to the open element."""
    if data.strip():
      self.nodeStack[-1].cdata += data.encode()
  def Parse(self, filename):
    """Parse the XML file at filename and return the root Element.

    Bug fix: the config file handle was previously never closed;
    it is now released via try/finally.
    """
    parser = expat.ParserCreate()
    parser.StartElementHandler = self.StartElement
    parser.EndElementHandler = self.EndElement
    parser.CharacterDataHandler = self.CharacterData
    handle = open(filename)
    try:
      parser.Parse(handle.read(), 1)
    finally:
      handle.close()
    return self.root

def AnalyzePage(pagetext, modes, age):
  """Split a talk page into a kept part and the threads to archive.

  Scans pagetext line by line: everything before the first level-2
  heading ("== ... ==") is the page header; each heading starts a new
  thread.  A thread is archived when its newest signature timestamp is
  older than `age` days; threads containing no recognizable timestamp
  are always kept.

  Parameters:
    pagetext -- full wikitext of the page
    modes    -- archiving mode flags (unused here; kept for interface
                compatibility with the caller)
    age      -- archiving threshold, in days

  Returns a tuple (archived, newpage, archive):
    archived -- number of threads moved to the archive
    newpage  -- header plus the threads that stay on the page
    archive  -- concatenated wikitext of the archived threads
  """
  heading_re = re.compile(r'^==[^=].*[^=]==$')
  # MediaWiki signature timestamp, e.g. "12:34, 5 January 2007 (UTC)".
  # NOTE: %B parsing below relies on an English LC_TIME locale.
  stamp_re = re.compile(r'(\d\d):(\d\d), (\d\d?) (January|February|March|April|May|June|July|August|September|October|November|December) (\d\d\d\d) \(UTC\)')
  header = ""
  archive = ""
  newpage = ""
  thread = ""
  thr_time = 0        # newest timestamp seen in the current thread (epoch seconds)
  in_thread = False   # False while still reading the pre-thread header
  archived = 0
  threshold = age * 24 * 60 * 60
  cur_time = time.time()
  for line in pagetext.split('\n'):
    if heading_re.match(line):  # a new thread starts here
      if in_thread:
        # Decide the fate of the previous thread.  Threads without any
        # parsable timestamp (thr_time == 0) are kept in place.
        if thr_time != 0 and cur_time - thr_time > threshold:
          archive = '\n'.join((archive, thread))
          archived += 1
        else:
          newpage = '\n'.join((newpage, thread))
      thread = line
      thr_time = 0
      in_thread = True
    elif not in_thread:  # still in the header
      header = '\n'.join((header, line))
    else:  # a regular line inside a thread
      thread = '\n'.join((thread, line))
      stamp = stamp_re.search(line)
      if stamp:
        parsed = time.strptime(stamp.group(0), "%H:%M, %d %B %Y (UTC)")
        thr_time = max(thr_time, time.mktime(parsed))
  # Decide the fate of the final thread.  Bug fix: previously a final
  # thread with no timestamp (thr_time == 0) was always archived,
  # inconsistent with the in-loop rule above; it is now kept.
  if thread != '' and thr_time != 0 and cur_time - thr_time > threshold:
    archive = '\n'.join((archive, thread))
    archived += 1
  else:
    newpage = '\n'.join((newpage, thread))
  # Normalize: strip leading blank lines, collapse trailing newlines to one.
  header  = re.sub(r'^\n*', '', re.sub(r'\n*$', '\n', header))
  newpage = re.sub(r'^\n*', '', re.sub(r'\n*$', '\n', newpage))
  archive = re.sub(r'^\n*', '', re.sub(r'\n*$', '\n', archive))
  newpage = '\n'.join((header, newpage))
  return (archived, newpage, archive)

# Script entry point: read the XML config file named on the command
# line, archive each configured target page on the wiki, then write
# the (possibly updated) config back to the same file.
if __name__ == '__main__':
  try:
    # NOTE(review): len(sys.argv) > 0 is always true (argv[0] is the
    # script name); presumably meant to be > 1, so a missing config
    # argument currently raises IndexError at Parse() below -- confirm.
    if len(sys.argv)>0:
      # English month names for strptime/strftime, matching MediaWiki's
      # "(UTC)" signature timestamps parsed in AnalyzePage.
      locale.setlocale(locale.LC_TIME,('en_US','utf-8'))
      parser = Xml2Obj()
      archivebot = parser.Parse(sys.argv[1])

      Site = wikipedia.getSite()

      # Each <target> element describes one page to archive.
      targets = archivebot.getElements('target')
      for target in targets:
        pagename = target.getElements('page')[0].getData()
        archivename = target.getElements('archive')[0].getData()
        # Comma-separated mode flags, e.g. "relative,autoincrement".
        modes = target.getElements('mode')[0].getData().split(',')
        if 'relative' in modes:
          # Archive name is a subpage suffix of the source page.
          archivename = ''.join((pagename,archivename))
        if 'autoincrement' in modes:
          # "$A" in the archive name is replaced by a numbered counter;
          # maxsize (KB) bounds each numbered archive.
          counter = target.getElements('counter')[0].getData()
          maxsize = int(target.getElements('maxsize')[0].getData())
          archivename = re.sub('\$A', counter, archivename)
        age = int(target.getElements('age')[0].getData())
        print "Archiving [[%s]] to [[%s]]..." % (pagename, archivename)
        Page = wikipedia.Page(Site,pagename)
        pagetext = Page.get()

        # archive is (count, remaining page text, archived text).
        archive = AnalyzePage(pagetext,modes,age)

        if archive[0] > 0:
          print "%d threads to archive." % archive[0]
          if archive[0] == 1:
            wikipedia.setAction("Archiving a thread older than %d days to [[%s]]" % (age,archivename))
          else:
            wikipedia.setAction("Archiving %d threads older than %d days to [[%s]]" % (archive[0],age,archivename))
          # Save the trimmed source page first, then append to the archive.
          Page.put(archive[1])
          Archive = wikipedia.Page(Site,archivename)
          oldarchive = ""
          try:
            oldarchive = Archive.get()
          except wikipedia.NoPage:
            # First archival to this page: start from empty text.
            print "Archive doesn't exist."
            oldarchive = ""
          newarchive = '\n\n'.join((re.sub('\n*$','',oldarchive),archive[2]))
          if archive[0] == 1:
            summary = "Archiving a thread older than %d days from [[%s]]" % (age,pagename)
          else:
            summary = "Archiving %d threads older than %d days from [[%s]]" % (archive[0],age,pagename)
          if 'autoincrement' in modes and len(newarchive) > maxsize*1024:
            # Archive exceeded maxsize KB: bump the counter in the config
            # so the next run writes to a fresh numbered archive.
            print "Archive is full."
            summary = "%s (ARCHIVE FULL)" % summary
            # NOTE(review): setData returns None, so this rebinds counter
            # to None; harmless as counter is not read again, but the
            # assignment is presumably unintended -- confirm.
            counter = target.getElements('counter')[0].setData(int(counter)+1)
          wikipedia.setAction(summary)
          Archive.put(newarchive)
        else:
          print "No need to archive."
      # Write the config (with any incremented counters) back to disk.
      # NOTE(review): this tests the length of the filename string, not
      # the argument count; always true when argv[1] exists -- confirm.
      if len(sys.argv[1])>0:
        cfg = open(sys.argv[1],"w+")
        archivebot.Print(cfg)
        cfg.close()

  finally:
    # Always release the pywikipedia runtime, even on error.
    wikipedia.stopme()


Public domain

I, the creator of this work, hereby release it into the public domain. This applies worldwide.
In case this is not legally possible,
I grant any entity the right to use this work for any purpose, without any conditions, unless such conditions are required by law.