# User:Drinibot/ExtractWikilinks.py
#
# From Wikipedia, the free encyclopedia

import re
import subprocess
# Input: saved HTML page to scan.  Output: one "[[Title]]" wikilink per line.
rawfilename = "raw.html"
linksfilename = "links.txt"

# Captures the text that follows ".org/wiki/" up to the next double quote
# (normally the end of an href attribute).  The pattern is deliberately left
# unanchored: wrapping it in ".*" would make findall() return only the last
# link on each line.  "[^\"]+" also rejects empty titles, which would
# otherwise produce a meaningless "[[]]" entry.
regex = re.compile(r'\.org/wiki/(?P<title>[^"]+)"')


def extract_wikilinks(line):
    """Return the wikilinks found in one line of HTML.

    Every ``.org/wiki/<title>"`` occurrence in *line* yields ``[[<title>]]``,
    in order of appearance.  Titles containing ``Special:`` are skipped.
    The title text is copied verbatim from the URL; no decoding is applied.

    Args:
        line: A single line of HTML text.

    Returns:
        A list of ``[[title]]`` strings; empty if the line has no usable link.
    """
    return ["[[" + title + "]]"
            for title in regex.findall(line)
            if "Special:" not in title]


def main():
    """Write the wikilinks found in ``rawfilename`` to ``linksfilename``.

    Each link is echoed to standard output and written on its own line.
    The input is opened first, so a missing input file raises before the
    output file is created or truncated.
    """
    # "with" guarantees both handles are closed even if the loop raises.
    with open(rawfilename, 'r') as fi, open(linksfilename, 'w') as li:
        for line in fi:
            for wl in extract_wikilinks(line):
                # Single-argument print() behaves the same on Python 2 and 3.
                print(wl)
                li.write(wl + "\n")


if __name__ == "__main__":
    main()