User:MiszaBot/Source/archivebot.py
archivebot.py
This is a bot that archives pages according to a configuration given in a separate XML file. An example configuration is available at User:MiszaBot/Source/archivemisza.xml. Bots running this framework include User:MiszaBot and User:EssjayBot II.
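The element names below are taken from what the script reads (page, archive, mode, counter, maxsize, age); the root element name and the sample values are illustrative guesses, not a copy of archivemisza.xml:

<?xml version="1.0" encoding="utf-8"?>
<archivebot>
 <target>
  <page>User talk:Example</page>
  <archive>/Archive $A</archive>
  <mode>relative,autoincrement</mode>
  <counter>1</counter>
  <maxsize>128</maxsize>
  <age>7</age>
 </target>
</archivebot>

In relative mode the archive value is appended to the page name; in autoincrement mode every $A in the archive name is replaced with the counter, and the counter is bumped once the archive grows past maxsize kilobytes.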
#!/usr/bin/env python
# archivebot.py -- archives discussion pages according to an XML config file.
# Requires the pywikipedia framework (the 'wikipedia' module); Python 2.

import sys, wikipedia, re, time, locale
from xml.parsers import expat

class Element(object):
    """A minimal XML element: name, attributes, character data, children."""
    def __init__(self, name, attributes):
        self.name = name
        self.attributes = attributes
        self.cdata = ''
        self.children = []

    def addChild(self, element):
        self.children.append(element)

    def getAttribute(self, key):
        return self.attributes.get(key)

    def getData(self):
        return self.cdata

    def setData(self, newcdata):
        self.cdata = newcdata

    def getElements(self, name=''):
        if name:
            return [c for c in self.children if c.name == name]
        else:
            return list(self.children)

    def Print(self, handle, depth=0):
        # Serialise the tree back to XML (used to persist an updated counter).
        if depth == 0:
            handle.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n\n")
        if len(self.children) == 0:
            handle.write("%s<%s>%s</%s>\n" % (" "*depth, self.name, self.cdata, self.name))
        else:
            handle.write("%s<%s>\n" % (" "*depth, self.name))
            for elem in list(self.children):
                elem.Print(handle, depth+1)
            handle.write("%s</%s>\n" % (" "*depth, self.name))

class Xml2Obj(object):
    """Parses an XML file into a tree of Element objects using expat."""
    def __init__(self):
        self.root = None
        self.nodeStack = []

    def StartElement(self, name, attributes):
        element = Element(name.encode(), attributes)
        if self.nodeStack:
            parent = self.nodeStack[-1]
            parent.addChild(element)
        else:
            self.root = element
        self.nodeStack.append(element)

    def EndElement(self, name):
        self.nodeStack.pop()

    def CharacterData(self, data):
        if data.strip():
            data = data.encode()
            element = self.nodeStack[-1]
            element.cdata += data

    def Parse(self, filename):
        Parser = expat.ParserCreate()
        Parser.StartElementHandler = self.StartElement
        Parser.EndElementHandler = self.EndElement
        Parser.CharacterDataHandler = self.CharacterData
        ParserStatus = Parser.Parse(open(filename).read(), 1)
        return self.root

def AnalyzePage(pagetext, modes, age):
    """Split a page into a header plus threads; return (number of threads
    archived, new page text, text to append to the archive)."""
    lines = re.split('\n', pagetext)
    header = ""
    archive = ""
    newpage = ""
    thread = ""
    thr_time = 0
    state = 0   # 0 = reading the header, >0 = inside the state-th thread
    archived = 0
    threshold = age * 24 * 60 * 60   # maximum thread age, in seconds
    cur_time = time.time()
    if lines:
        for line in lines:
            if re.match('^==[^=].*[^=]==$', line):
                # A new == section header == starts a new thread.
                if state > 0:
                    # Decide what to do with the previous thread.
                    if thr_time != 0 and cur_time - thr_time > threshold:
                        archive = '\n'.join((archive, thread))
                        archived = archived + 1
                    else:
                        newpage = '\n'.join((newpage, thread))
                thread = line
                thr_time = 0
                state = state + 1
            else:
                # A regular line.
                if state == 0:
                    # Still in the header.
                    header = '\n'.join((header, line))
                else:
                    thread = '\n'.join((thread, line))
                    # A thread's age is that of its newest (UTC) signature timestamp.
                    TM = re.search('(\d\d):(\d\d), (\d\d?) (January|February|March|April|May|June|July|August|September|October|November|December) (\d\d\d\d) \(UTC\)', line)
                    if TM:
                        TIME = time.strptime(TM.group(0), "%H:%M, %d %B %Y (UTC)")
                        thr_time = max(thr_time, time.mktime(TIME))
    # Do last thread:
    if cur_time - thr_time > threshold and thread != '':
        archive = '\n'.join((archive, thread))
        archived = archived + 1
    else:
        newpage = '\n'.join((newpage, thread))
    # Normalise leading/trailing newlines.
    header = re.sub('^\n*', '', re.sub('\n*$', '\n', header))
    newpage = re.sub('^\n*', '', re.sub('\n*$', '\n', newpage))
    archive = re.sub('^\n*', '', re.sub('\n*$', '\n', archive))
    newpage = '\n'.join((header, newpage))
    return (archived, newpage, archive)

if __name__ == '__main__':
    try:
        if len(sys.argv) > 1:   # a configuration file must be given
            # English month names are needed for strptime above.
            locale.setlocale(locale.LC_TIME, ('en_US', 'utf-8'))
            parser = Xml2Obj()
            archivebot = parser.Parse(sys.argv[1])
            Site = wikipedia.getSite()
            targets = archivebot.getElements('target')
            for target in targets:
                pagename = target.getElements('page')[0].getData()
                archivename = target.getElements('archive')[0].getData()
                modes = target.getElements('mode')[0].getData().split(',')
                if 'relative' in modes:
                    # The archive is a subpage suffix of the page itself.
                    archivename = ''.join((pagename, archivename))
                if 'autoincrement' in modes:
                    counter = target.getElements('counter')[0].getData()
                    maxsize = int(target.getElements('maxsize')[0].getData())
                    archivename = re.sub('\$A', counter, archivename)
                age = int(target.getElements('age')[0].getData())
                print "Archiving [[%s]] to [[%s]]..." % (pagename, archivename)
                Page = wikipedia.Page(Site, pagename)
                pagetext = Page.get()
                archive = AnalyzePage(pagetext, modes, age)
                if archive[0] > 0:
                    print "%d threads to archive." % archive[0]
                    if archive[0] == 1:
                        wikipedia.setAction("Archiving a thread older than %d days to [[%s]]" % (age, archivename))
                    else:
                        wikipedia.setAction("Archiving %d threads older than %d days to [[%s]]" % (archive[0], age, archivename))
                    Page.put(archive[1])
                    Archive = wikipedia.Page(Site, archivename)
                    oldarchive = ""
                    try:
                        oldarchive = Archive.get()
                    except wikipedia.NoPage:
                        print "Archive doesn't exist."
                        oldarchive = ""
                    newarchive = '\n\n'.join((re.sub('\n*$', '', oldarchive), archive[2]))
                    if archive[0] == 1:
                        summary = "Archiving a thread older than %d days from [[%s]]" % (age, pagename)
                    else:
                        summary = "Archiving %d threads older than %d days from [[%s]]" % (archive[0], age, pagename)
                    if 'autoincrement' in modes and len(newarchive) > maxsize*1024:
                        print "Archive is full."
                        summary = "%s (ARCHIVE FULL)" % summary
                        # Bump the counter so the next run opens a new archive.
                        target.getElements('counter')[0].setData(str(int(counter)+1))
                    wikipedia.setAction(summary)
                    Archive.put(newarchive)
                else:
                    print "No need to archive."
            # Write the configuration (with any updated counters) back to disk.
            cfg = open(sys.argv[1], "w+")
            archivebot.Print(cfg)
            cfg.close()
    finally:
        wikipedia.stopme()
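To run the bot (a sketch, assuming a working pywikipedia checkout with a configured user-config.py, and an installed en_US locale so strptime can parse the English month names):

 python archivebot.py archivemisza.xml

The script rewrites the configuration file in place after each run, so a counter bumped by autoincrement mode persists between runs.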
I, the creator of this work, hereby release it into the public domain. This applies worldwide. In case this is not legally possible, I grant any entity the right to use this work for any purpose, without any conditions, unless such conditions are required by law.