User:MiszaBot/Source
archivebot.py
This is a bot that archives talk pages according to a configuration given in a separate XML file. An example configuration is available at User:MiszaBot/Source/archivemisza.xml. Bots running this framework include User:MiszaBot and User:EssjayBot II.
#!/usr/bin/env python
import sys, wikipedia, re, time, locale
from xml.parsers import expat

class Element(object):
    def __init__(self, name, attributes):
        self.name = name
        self.attributes = attributes
        self.cdata = ''
        self.children = []

    def addChild(self, element):
        self.children.append(element)

    def getAttribute(self, key):
        return self.attributes.get(key)

    def getData(self):
        return self.cdata

    def setData(self, newcdata):
        self.cdata = newcdata

    def getElements(self, name=''):
        if name:
            return [c for c in self.children if c.name == name]
        else:
            return list(self.children)

    def Print(self, handle, depth=0):
        # Serialize the element tree back to XML (used to rewrite the config).
        if depth == 0:
            handle.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n\n")
        if len(self.children) == 0:
            handle.write("%s<%s>%s</%s>\n" % (" "*depth, self.name, self.cdata, self.name))
        else:
            handle.write("%s<%s>\n" % (" "*depth, self.name))
            for elem in list(self.children):
                elem.Print(handle, depth+1)
            handle.write("%s</%s>\n" % (" "*depth, self.name))

class Xml2Obj(object):
    def __init__(self):
        self.root = None
        self.nodeStack = []

    def StartElement(self, name, attributes):
        element = Element(name.encode(), attributes)
        if self.nodeStack:
            parent = self.nodeStack[-1]
            parent.addChild(element)
        else:
            self.root = element
        self.nodeStack.append(element)

    def EndElement(self, name):
        self.nodeStack.pop()

    def CharacterData(self, data):
        if data.strip():
            data = data.encode()
            element = self.nodeStack[-1]
            element.cdata += data

    def Parse(self, filename):
        Parser = expat.ParserCreate()
        Parser.StartElementHandler = self.StartElement
        Parser.EndElementHandler = self.EndElement
        Parser.CharacterDataHandler = self.CharacterData
        ParserStatus = Parser.Parse(open(filename).read(), 1)
        return self.root

def AnalyzePage(pagetext, modes, age):
    # Split the page into a header and ==level 2== threads, then sort each
    # thread into the new page or the archive based on its newest timestamp.
    lines = re.split('\n', pagetext)
    header = ""
    archive = ""
    newpage = ""
    thread = ""
    thr_time = 0
    state = 0  # Reading header
    archived = 0
    threshold = age * 24 * 60 * 60
    cur_time = time.time()
    if lines:
        for line in lines:
            if re.match('^==[^=].*[^=]==$', line):  # New header
                if state > 0:  # We must decide on a previous one
                    if thr_time != 0 and cur_time - thr_time > threshold:
                        archive = '\n'.join((archive, thread))
                        archived = archived + 1
                    else:
                        newpage = '\n'.join((newpage, thread))
                thread = line
                thr_time = 0
                state = state + 1
            else:  # A regular line
                if state == 0:  # We must add to the header
                    header = '\n'.join((header, line))
                else:
                    thread = '\n'.join((thread, line))
                    TM = re.search('(\d\d):(\d\d), (\d\d?) (January|February|March|April|May|June|July|August|September|October|November|December) (\d\d\d\d) \(UTC\)', line)
                    if TM:
                        TIME = time.strptime(TM.group(0), "%H:%M, %d %B %Y (UTC)")
                        thr_time = max(thr_time, time.mktime(TIME))
    # Do last thread:
    if cur_time - thr_time > threshold and thread != '':
        archive = '\n'.join((archive, thread))
        archived = archived + 1
    else:
        newpage = '\n'.join((newpage, thread))
    header = re.sub('^\n*', '', re.sub('\n*$', '\n', header))
    newpage = re.sub('^\n*', '', re.sub('\n*$', '\n', newpage))
    archive = re.sub('^\n*', '', re.sub('\n*$', '\n', archive))
    newpage = '\n'.join((header, newpage))
    return (archived, newpage, archive)

if __name__ == '__main__':
    try:
        if len(sys.argv) > 1:  # A configuration file must be given
            locale.setlocale(locale.LC_TIME, ('en_US', 'utf-8'))
            parser = Xml2Obj()
            archivebot = parser.Parse(sys.argv[1])
            Site = wikipedia.getSite()
            targets = archivebot.getElements('target')
            for target in targets:
                pagename = target.getElements('page')[0].getData()
                archivename = target.getElements('archive')[0].getData()
                modes = target.getElements('mode')[0].getData().split(',')
                if 'relative' in modes:
                    archivename = ''.join((pagename, archivename))
                if 'autoincrement' in modes:
                    counter = target.getElements('counter')[0].getData()
                    maxsize = int(target.getElements('maxsize')[0].getData())
                    archivename = re.sub('\$A', counter, archivename)
                age = int(target.getElements('age')[0].getData())
                print "Archiving [[%s]] to [[%s]]..." % (pagename, archivename)
                Page = wikipedia.Page(Site, pagename)
                pagetext = Page.get()
                archive = AnalyzePage(pagetext, modes, age)
                if archive[0] > 0:
                    print "%d threads to archive." % archive[0]
                    if archive[0] == 1:
                        wikipedia.setAction("Archiving a thread older than %d days to [[%s]]" % (age, archivename))
                    else:
                        wikipedia.setAction("Archiving %d threads older than %d days to [[%s]]" % (archive[0], age, archivename))
                    Page.put(archive[1])
                    Archive = wikipedia.Page(Site, archivename)
                    oldarchive = ""
                    try:
                        oldarchive = Archive.get()
                    except wikipedia.NoPage:
                        print "Archive doesn't exist."
                        oldarchive = ""
                    newarchive = '\n\n'.join((re.sub('\n*$', '', oldarchive), archive[2]))
                    if archive[0] == 1:
                        summary = "Archiving a thread older than %d days from [[%s]]" % (age, pagename)
                    else:
                        summary = "Archiving %d threads older than %d days from [[%s]]" % (archive[0], age, pagename)
                    if 'autoincrement' in modes and len(newarchive) > maxsize*1024:
                        print "Archive is full."
                        summary = "%s (ARCHIVE FULL)" % summary
                        # Bump the counter; the updated config is written back below.
                        target.getElements('counter')[0].setData(str(int(counter)+1))
                    wikipedia.setAction(summary)
                    Archive.put(newarchive)
                else:
                    print "No need to archive."
            if len(sys.argv[1]) > 0:
                cfg = open(sys.argv[1], "w+")
                archivebot.Print(cfg)
                cfg.close()
    finally:
        wikipedia.stopme()
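The script is invoked with the configuration file as its only argument (e.g. python archivebot.py archivemisza.xml). The thread-splitting logic can also be exercised offline; here is a minimal sketch, assuming the source above is saved as archivebot.py with the pywikipedia framework on the import path (the sample page text and timestamps are invented for illustration):

from archivebot import AnalyzePage

SAMPLE = """Intro text kept in the page header.

== Stale thread ==
An old comment. 12:34, 1 January 2006 (UTC)

== Fresh thread ==
A comment dated in the future, so never old enough. 12:34, 1 January 2030 (UTC)"""

# With a 7-day age limit, only the stale thread is moved to the archive.
archived, newpage, archive = AnalyzePage(SAMPLE, ['relative'], 7)
print "%d thread(s) archived" % archived
print archive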
archivemisza.xml
This is an example configuration file for the archive bot. It archives the page User talk:Misza13. The "relative" mode specifies that the archive is a subpage of the page being archived (if not specified, the archive must be given as an absolute page name). The other possibility is the "autoincrement" mode (specify multiple modes in one <mode> tag, separated by commas), in which case you should also specify a <counter> tag; any occurrences of "$A" in the archive name are then replaced by the counter's value. You should also provide a <maxsize> tag giving the maximum size of an archive page in kilobytes; once an archive grows past it, the archive is considered full, the counter is incremented, and the configuration file is rewritten accordingly. A sketch of such a configuration follows the example below.
<?xml version="1.0" encoding="utf-8"?>
<archivebot>
  <target>
    <page>User talk:Misza13</page>
    <archive>/Archives/2006/07</archive>
    <mode>relative</mode>
    <age>7</age>
  </target>
</archivebot>
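For comparison, a minimal sketch of what an autoincrement configuration might look like, assembled from the description above (the page name and values are made up, not taken from a real bot):

<?xml version="1.0" encoding="utf-8"?>
<archivebot>
  <target>
    <page>User talk:Example</page>
    <archive>/Archive $A</archive>
    <mode>relative,autoincrement</mode>
    <counter>1</counter>
    <maxsize>64</maxsize>
    <age>7</age>
  </target>
</archivebot>

With this configuration, "$A" expands to the current counter value, so threads are archived to User talk:Example/Archive 1 until that page exceeds 64 kilobytes, at which point the counter is bumped to 2 and the file is rewritten by the bot.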
spambot.py
This code uses a recipe from the Python Cookbook to build an XML tree from the configuration file. Apart from that, it simply traverses the list of given pages, appending the given content to each of them using the pywikipedia framework.
#!/usr/bin/python
import sys, wikipedia, re
from xml.parsers import expat

class Element(object):
    def __init__(self, name, attributes):
        self.name = name
        self.attributes = attributes
        self.cdata = ''
        self.children = []

    def addChild(self, element):
        self.children.append(element)

    def getAttribute(self, key):
        return self.attributes.get(key)

    def getData(self):
        return self.cdata

    def getElements(self, name=''):
        if name:
            return [c for c in self.children if c.name == name]
        else:
            return list(self.children)

class Xml2Obj(object):
    def __init__(self):
        self.root = None
        self.nodeStack = []

    def StartElement(self, name, attributes):
        element = Element(name.encode(), attributes)
        if self.nodeStack:
            parent = self.nodeStack[-1]
            parent.addChild(element)
        else:
            self.root = element
        self.nodeStack.append(element)

    def EndElement(self, name):
        self.nodeStack.pop()

    def CharacterData(self, data):
        if data.strip():
            data = data.encode()
            element = self.nodeStack[-1]
            element.cdata += data

    def Parse(self, filename):
        Parser = expat.ParserCreate()
        Parser.StartElementHandler = self.StartElement
        Parser.EndElementHandler = self.EndElement
        Parser.CharacterDataHandler = self.CharacterData
        ParserStatus = Parser.Parse(open(filename).read(), 1)
        return self.root

if __name__ == '__main__':
    try:
        if len(sys.argv) > 1:
            parser = Xml2Obj()
            spambot_cfg = parser.Parse(sys.argv[1])
            header = spambot_cfg.getElements('header')
            if len(header) != 1:
                print "ERROR: None or multiple headers!"
                sys.exit(1)
            header = header[0].getData()
            content = spambot_cfg.getElements('content')
            if len(content) < 1:
                print "ERROR: No content!"
                sys.exit(1)
            content_ = ''
            for c in content:
                content_ += '\n' + c.getData()
            site = wikipedia.getSite()
            targets = spambot_cfg.getElements('target')
            for target in targets:
                print "Now spamming [[%s]]..." % target.getData()
                page = wikipedia.Page(site, target.getData())
                oldtext = page.get()
                wikipedia.setAction(header)
                # Strip trailing blank lines before appending the new section.
                oldtext = re.sub('\n*$', '', oldtext)
                newtext = oldtext + "\n\n== %s ==\n%s" % (header, content_)
                page.put(newtext)
        else:
            print "No configuration file specified!"
    finally:
        wikipedia.stopme()
config.xml
An example configuration file for the spambot:
<spambot>
  <header>Message from spambot</header>
  <content>Hello! This is a message from the spambot!</content>
  <target>Wikipedia:Sandbox</target>
  <target>User:Example/Sandbox</target>
  <!-- And so on... -->
</spambot>
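Given that file, the Cookbook parser can be tried on its own. A minimal sketch, assuming the source above is saved as spambot.py (importing it also imports the pywikipedia wikipedia module, so the framework must be set up) and the configuration as config.xml:

from spambot import Xml2Obj

cfg = Xml2Obj().Parse('config.xml')
print "Header: %s" % cfg.getElements('header')[0].getData()
for target in cfg.getElements('target'):
    print "Target: %s" % target.getData()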
I, the creator of this work, hereby release it into the public domain. This applies worldwide. In case this is not legally possible, I grant any entity the right to use this work for any purpose, without any conditions, unless such conditions are required by law.