User:MiszaBot/Source

From Wikipedia, the free encyclopedia

Contents

[edit] archivebot.py

This is a bot that archives pages, with respect to a configuration file given in a separate XML file. Example config is available on User:MiszaBot/Source/archivemisza.xml. Bots running this framework include User:MiszaBot, User:EssjayBot and User:EssjayBot II.

#!/usr/bin/env python

import sys, wikipedia, re, time, locale
from xml.parsers import expat

class Element(object):
  """A node in a simple XML document tree.

  Holds the tag name, its attribute dict, accumulated character data
  and the list of child Element nodes.
  """
  def __init__(self, name, attributes):
    self.name = name              # tag name
    self.attributes = attributes  # attribute dict from the parser
    self.cdata = ''               # accumulated character data
    self.children = []            # child Element nodes, in document order
  def addChild(self, element):
    """Append a child node."""
    self.children.append(element)
  def getAttribute(self, key):
    """Return the attribute value for key, or None if absent."""
    return self.attributes.get(key)
  def getData(self):
    """Return the element's character data."""
    return self.cdata
  def setData(self, newcdata):
    """Replace the element's character data."""
    self.cdata = newcdata
  def getElements(self, name=''):
    """Return children named name, or a copy of all children if name is ''."""
    if not name:
      return list(self.children)
    return [child for child in self.children if child.name == name]
  def Print(self, handle, depth=0):
    """Serialize this subtree as indented XML to a writable handle."""
    indent = "  " * depth
    if depth == 0:
      # Only the outermost call emits the XML declaration.
      handle.write('<?xml version="1.0" encoding="utf-8"?>\n\n')
    if self.children:
      handle.write("%s<%s>\n" % (indent, self.name))
      for child in self.children:
        child.Print(handle, depth + 1)
      handle.write("%s</%s>\n" % (indent, self.name))
    else:
      # Leaf: open tag, character data and close tag on one line.
      handle.write("%s<%s>%s</%s>\n" % (indent, self.name, self.cdata, self.name))

class Xml2Obj(object):
  """Expat-driven parser that builds an Element tree from an XML file."""
  def __init__(self):
    self.root = None     # top-level Element once parsing is done
    self.nodeStack = []  # stack of currently-open elements
  def StartElement(self, name, attributes):
    """Expat callback: open a new element under the current parent."""
    # Keep the parsed name as-is: the old name.encode() used the ASCII
    # codec (raising on non-ASCII tags) and on Python 3 yields bytes.
    element = Element(name, attributes)
    if self.nodeStack:
      self.nodeStack[-1].addChild(element)
    else:
      self.root = element
    self.nodeStack.append(element)
  def EndElement(self, name):
    """Expat callback: close the innermost open element."""
    self.nodeStack.pop()
  def CharacterData(self, data):
    """Expat callback: accumulate non-whitespace text on the open element."""
    if data.strip():
      # No .encode() here either -- concatenating bytes onto the str
      # cdata buffer would fail, and ASCII-encoding broke non-ASCII text.
      self.nodeStack[-1].cdata += data
  def Parse(self, filename):
    """Parse the XML file *filename* and return the root Element."""
    parser = expat.ParserCreate()
    parser.StartElementHandler = self.StartElement
    parser.EndElementHandler = self.EndElement
    parser.CharacterDataHandler = self.CharacterData
    # Close the config file instead of leaking the handle.
    with open(filename) as handle:
      parser.Parse(handle.read(), 1)
    return self.root

def AnalyzePage(pagetext, modes, age):
  """Split a talk page into its header, kept threads and archivable threads.

  pagetext -- full wikitext of the page; threads start at '== ... ==' headings
  modes    -- list of archiving mode strings (accepted for interface
              compatibility; not consulted here)
  age      -- threads whose newest '(UTC)' signature is older than this
              many days are archived

  Returns a tuple (archived, newpage, archive): the number of threads
  moved, the new page text (header + kept threads) and the archive text.
  Threads containing no recognizable signature timestamp are always kept.
  """
  header = ""
  archive = ""
  newpage = ""
  thread = ""
  thr_time = 0        # newest signature timestamp seen in current thread
  in_thread = False   # False while still reading the page header
  archived = 0
  threshold = age * 24 * 60 * 60
  cur_time = time.time()
  # Signature timestamps look like "12:34, 5 January 2006 (UTC)".
  timestamp_re = re.compile(
      r'(\d\d):(\d\d), (\d\d?) (January|February|March|April|May|June|July'
      r'|August|September|October|November|December) (\d\d\d\d) \(UTC\)')

  for line in pagetext.split('\n'):
    if re.match(r'^==[^=].*[^=]==$', line):  # level-2 heading => new thread
      if in_thread:
        # Decide the fate of the previous thread before starting a new one.
        if thr_time != 0 and cur_time - thr_time > threshold:
          archive = '\n'.join((archive, thread))
          archived += 1
        else:
          newpage = '\n'.join((newpage, thread))
      thread = line
      thr_time = 0
      in_thread = True
    elif not in_thread:
      header = '\n'.join((header, line))
    else:
      thread = '\n'.join((thread, line))
      TM = timestamp_re.search(line)
      if TM:
        TIME = time.strptime(TM.group(0), "%H:%M, %d %B %Y (UTC)")
        # A thread's age is that of its newest signature.
        thr_time = max(thr_time, time.mktime(TIME))

  # Decide on the final thread.  The thr_time != 0 guard matches the
  # in-loop rule: previously a last thread with NO timestamp was archived
  # (cur_time - 0 always exceeds the threshold) while identical earlier
  # threads were kept.
  if thread:
    if thr_time != 0 and cur_time - thr_time > threshold:
      archive = '\n'.join((archive, thread))
      archived += 1
    else:
      newpage = '\n'.join((newpage, thread))

  # Normalize: no leading blank lines, exactly one trailing newline.
  header = re.sub(r'^\n*', '', re.sub(r'\n*$', '\n', header))
  newpage = re.sub(r'^\n*', '', re.sub(r'\n*$', '\n', newpage))
  archive = re.sub(r'^\n*', '', re.sub(r'\n*$', '\n', archive))
  newpage = '\n'.join((header, newpage))
  return (archived, newpage, archive)

if __name__ == '__main__':
  try:
    if len(sys.argv)>0:
      locale.setlocale(locale.LC_TIME,('en_US','utf-8'))
      parser = Xml2Obj()
      archivebot = parser.Parse(sys.argv[1])

      Site = wikipedia.getSite()

      targets = archivebot.getElements('target')
      for target in targets:
        pagename = target.getElements('page')[0].getData()
        archivename = target.getElements('archive')[0].getData()
        modes = target.getElements('mode')[0].getData().split(',')
        if 'relative' in modes:
          archivename = ''.join((pagename,archivename))
        if 'autoincrement' in modes:
          counter = target.getElements('counter')[0].getData()
          maxsize = int(target.getElements('maxsize')[0].getData())
          archivename = re.sub('\$A', counter, archivename)
        age = int(target.getElements('age')[0].getData())
        print "Archiving [[%s]] to [[%s]]..." % (pagename, archivename)
        Page = wikipedia.Page(Site,pagename)
        pagetext = Page.get()

        archive = AnalyzePage(pagetext,modes,age)

        if archive[0] > 0:
          print "%d threads to archive." % archive[0]
          if archive[0] == 1:
            wikipedia.setAction("Archiving a thread older than %d days to [[%s]]" % (age,archivename))
          else:
            wikipedia.setAction("Archiving %d threads older than %d days to [[%s]]" % (archive[0],age,archivename))
          Page.put(archive[1])
          Archive = wikipedia.Page(Site,archivename)
          oldarchive = ""
          try:
            oldarchive = Archive.get()
          except wikipedia.NoPage:
            print "Archive doesn't exist."
            oldarchive = ""
          newarchive = '\n\n'.join((re.sub('\n*$','',oldarchive),archive[2]))
          if archive[0] == 1:
            summary = "Archiving a thread older than %d days from [[%s]]" % (age,pagename)
          else:
            summary = "Archiving %d threads older than %d days from [[%s]]" % (archive[0],age,pagename)
          if 'autoincrement' in modes and len(newarchive) > maxsize*1024:
            print "Archive is full."
            summary = "%s (ARCHIVE FULL)" % summary
            counter = target.getElements('counter')[0].setData(int(counter)+1)
          wikipedia.setAction(summary)
          Archive.put(newarchive)
        else:
          print "No need to archive."
      if len(sys.argv[1])>0:
        cfg = open(sys.argv[1],"w+")
        archivebot.Print(cfg)
        cfg.close()

  finally:
    wikipedia.stopme()


[edit] archivemisza.xml

This is an example configuration file for the archive bot. It archives the page User talk:Misza13. The mode "relative" specifies that the archive is a subpage of the page being archived (if not specified, it must be an absolute page name). Another possibility is the "autoincrement" mode (specify multiple modes in one <mode> tag, separated by commas), where you should also specify a <counter> tag. If you do, any occurrences of "$A" in the archive name will be replaced by the counter. You should also provide a <maxsize> tag, which will specify the maximum size of an archive page (in kilobytes), before it is considered full, the counter is incremented and the config file modified accordingly.

<?xml version="1.0" encoding="utf-8"?>

<archivebot>
  <target>
    <page>User talk:Misza13</page>
    <archive>/Archives/2006/07</archive>
    <mode>relative</mode>
    <age>7</age>
  </target>
</archivebot>


[edit] spambot.py

This code uses a recipe from the Python Cookbook to build an XML tree from the config file. Apart from that, it just traverses the list on given pages, filling them with given content, using the pywikipedia framework.

#!/usr/bin/python

import sys, wikipedia, re
from xml.parsers import expat

class Element(object):
  """A lightweight XML tree node: tag name, attributes, text and children."""
  def __init__(self, name, attributes):
    self.name = name              # tag name
    self.attributes = attributes  # attribute dict from the parser
    self.cdata = ''               # accumulated character data
    self.children = []            # child Element nodes, in document order
  def addChild(self, element):
    """Append a child node."""
    self.children.append(element)
  def getAttribute(self, key):
    """Return the attribute value for key, or None if absent."""
    return self.attributes.get(key)
  def getData(self):
    """Return the element's character data."""
    return self.cdata
  def getElements(self, name=''):
    """Return children named name, or a copy of all children if name is ''."""
    if not name:
      return list(self.children)
    return [child for child in self.children if child.name == name]

class Xml2Obj(object):
  """Expat-driven parser that builds an Element tree from an XML file."""
  def __init__(self):
    self.root = None     # top-level Element once parsing is done
    self.nodeStack = []  # stack of currently-open elements
  def StartElement(self, name, attributes):
    """Expat callback: open a new element under the current parent."""
    # Keep the parsed name as-is: the old name.encode() used the ASCII
    # codec (raising on non-ASCII tags) and on Python 3 yields bytes.
    element = Element(name, attributes)
    if self.nodeStack:
      self.nodeStack[-1].addChild(element)
    else:
      self.root = element
    self.nodeStack.append(element)
  def EndElement(self, name):
    """Expat callback: close the innermost open element."""
    self.nodeStack.pop()
  def CharacterData(self, data):
    """Expat callback: accumulate non-whitespace text on the open element."""
    if data.strip():
      # No .encode() here either -- concatenating bytes onto the str
      # cdata buffer would fail, and ASCII-encoding broke non-ASCII text.
      self.nodeStack[-1].cdata += data
  def Parse(self, filename):
    """Parse the XML file *filename* and return the root Element."""
    parser = expat.ParserCreate()
    parser.StartElementHandler = self.StartElement
    parser.EndElementHandler = self.EndElement
    parser.CharacterDataHandler = self.CharacterData
    # Close the config file instead of leaking the handle.
    with open(filename) as handle:
      parser.Parse(handle.read(), 1)
    return self.root

if __name__ == '__main__':
  try:
    # sys.argv[0] is the script name; require a config file argument.
    if len(sys.argv)>1:
      parser = Xml2Obj()
      spambot_cfg = parser.Parse(sys.argv[1])

      # Exactly one <header> tag is required; it becomes the section title.
      header = spambot_cfg.getElements('header')
      if len(header) != 1:
        print "ERROR: None or multiple headers!"
        sys.exit(0)
      header = header[0].getData()

      # At least one <content> tag; all are joined, one per line.
      content = spambot_cfg.getElements('content')
      if len(content) < 1:
        print "ERROR: No content!"
        sys.exit(0)
      content_ = ''
      for c in content:
        content_ += '\n' + c.getData()

      site = wikipedia.getSite()
      # Append the message as a new section to every <target> page.
      targets = spambot_cfg.getElements('target')
      for target in targets:
        print "Now spamming [[%s]]..." % target.getData()
        page = wikipedia.Page(site,target.getData())

        oldtext = page.get()
        wikipedia.setAction(header)
        # Strip trailing blank lines before appending the new section.
        # (The old call re.sub('','\n*$',oldtext) had the pattern and
        # replacement swapped AND discarded the result -- a no-op.)
        oldtext = re.sub('\n*$','',oldtext)
        newtext = oldtext + "\n\n== %s ==\n%s" % (header, content_)
        page.put(newtext)

    else:
      print "No configuration file specified!"

  finally:
    # Always release the pywikipedia framework cleanly.
    wikipedia.stopme()


[edit] config.xml

An example configuration file for the spambot:

<spambot>
  <header>Message from spambot</header>
  <content>Hello! This is a message from the spambot!</content>
    <target>Wikipedia:Sandbox</target>
    <target>User:Example/Sandbox</target>
    <!-- And so on... -->
</spambot>


Public domain

I, the creator of this work, hereby release it into the public domain. This applies worldwide.
In case this is not legally possible,
I grant any entity the right to use this work for any purpose, without any conditions, unless such conditions are required by law.