# User:Benc/Scripts/what links out.py
#
# From Wikipedia, the free encyclopedia

# what_links_out.py
# Author:  Benc (http://en.wikipedia.org/wiki/User:Benc)
# Created: 21 September 2004
# Updated: 21 September 2004
# Purpose: Compile a list of articles that are linked to by a page. Sorted by
#          wiki and by namespace. Warning: may have unexpected results for
#          malformed wikicode.
#
# Usage:   Paste the page source between the triple quotes of pageSource
#          below and run the script. Alternatively, if you're a programmer
#          using this script as a module, call the reportLinks() function
#          instead.

# Wikicode to analyze when this file is run as a script: it is handed to
# reportLinks() by the __main__ guard at the bottom of the file. Note that
# reportLinks() and parseLinks() also rebind this global and use it as their
# working buffer while scanning.
pageSource="""

"""

# -----------------------------------------------------------------------------
import string

# Namespace names recognized by parseNamespace(). Each entry is spelled the
# way capNamespace() normalizes a prefix (first letter upper case, the rest
# lower case), which is why 'Mediawiki' is intentionally miscapitalized.
NAMESPACES = [
        'Talk',
        'Category',
        'Category talk',
        'Help',
        'Help talk',
        'Image',
        'Image talk',
        'Mediawiki',
        'Mediawiki talk',
        'Template',
        'Template talk',
        'User',
        'User talk',
        'Wikipedia',
        'Wikipedia talk',
]

# Shortcut prefixes (looked up in upper case) mapped to the real namespace
# they stand for.
PSEUDO = {
        'WP': 'Wikipedia',
        'CAT': 'Category',
        # Proposed WikiProject pseudo-namespace
        'P': 'Wikipedia',
}

# Interwiki prefixes (looked up in lower case) mapped to the text shown for
# the target wiki in the report. This list should probably be expanded.
INTERWIKI = {
        'b': '[[Wikibooks]]',
        'commons': 'Wikimedia Commons',
        'm': 'Meta-wiki',
        'q': '[[Wikiquote]]',
        'sep11': '9-11 Memorial',
        'wikt': '[[Wiktionary]]',
}

# Prefixes that parseNamespace() puts in front of interwiki and translation
# targets. Links are sorted by namespace string, so the 'ZZZ-' lead makes
# these groups sort after the ordinary namespace names, with interwiki
# groups ahead of translation groups.
INTERWIKI_PLACEHOLDER = "ZZZ-interwiki to: "
TRANSLATION_PLACEHOLDER = "ZZZ-translation to: "

# Module-level accumulator of parsed link tuples: parseLinks() appends to it
# and reportLinks() resets it on every call. (Global state - horrible, I know.)
outgoingLinks = list()

def uncapAll(txt):
        """Return txt with every character converted to lower case."""
        # Use the str method: string.lower() is a long-deprecated module
        # function that no longer exists in Python 3.
        return txt.lower()
def capAll(txt):
        """Return txt with every character converted to upper case."""
        # Use the str method: string.upper() is a long-deprecated module
        # function that no longer exists in Python 3.
        return txt.upper()
def capFirst(txt):
        """Return txt with its first character upper-cased.

        The remaining characters are left untouched. An empty string is
        returned unchanged instead of raising IndexError, so empty link
        targets such as [[]] or [[|label]] do not abort the whole run.
        """
        # Slicing (txt[:1]) instead of indexing (txt[0]) is what makes the
        # empty string safe.
        return txt[:1].upper() + txt[1:]
def capNamespace(txt):
        """Normalize a namespace prefix: first character upper, rest lower.

        This is the spelling used by the entries of NAMESPACES (for example
        'user TALK' becomes 'User talk'). An empty string is returned
        unchanged instead of raising IndexError, which previously happened
        for links that start with a colon, e.g. [[:Category:Foo]].
        """
        # Slices rather than txt[0] so that '' is handled; str methods rather
        # than the string-module functions, which were removed in Python 3.
        return txt[:1].upper() + txt[1:].lower()

def parsePipe(link):
        """Split a link body at its first '|'.

        Returns a (target, label) pair. target is the text before the first
        pipe, passed through capFirst(); label is everything after that pipe,
        or None when the link contains no pipe at all.
        """
        pieces = link.split('|', 1)
        target = capFirst(pieces[0])
        if len(pieces) == 2:
                return target, pieces[1]
        return target, None

def parseNamespace(link):
        """Classify which namespace (or other wiki) a link body points into.

        link is the text found between '[[' and ']]', optionally including a
        '|label' part. The return value is one of:

          ''                                  main article namespace, or the
                                              ':' is not a namespace separator
          an entry of NAMESPACES              e.g. 'User talk'
          a value of PSEUDO                   for shortcut prefixes like 'WP:'
          INTERWIKI_PLACEHOLDER + wiki name   for prefixes listed in INTERWIKI
          TRANSLATION_PLACEHOLDER + code      for two-letter lower-case
                                              prefixes (assumed language codes)
        """
        # Only the target part can carry a namespace; a ':' inside the
        # displayed label must not be mistaken for a namespace separator.
        target = link.split('|', 1)[0]
        # A leading ':' (as in [[:Category:Foo]]) only changes how the wiki
        # renders the link; the namespace is whatever follows it. Stripping it
        # also keeps an empty prefix away from capNamespace().
        target = target.lstrip(':')

        i = target.find(':')
        if i <= 0:
                return ''  # Main article namespace

        ns = target[:i]
        if capNamespace(ns) in NAMESPACES:
                return capNamespace(ns)
        if capAll(ns) in PSEUDO:
                return PSEUDO[capAll(ns)]
        if uncapAll(ns) in INTERWIKI:
                return INTERWIKI_PLACEHOLDER + INTERWIKI[uncapAll(ns)]
        # Two lower-case letters: fairly safe to assume a language code.
        # isalpha() keeps titles such as '12:00' out of the translations.
        if len(ns) == 2 and ns.isalpha() and ns == uncapAll(ns):
                return TRANSLATION_PLACEHOLDER + ns
        # Otherwise the ':' is not being used as a namespace indicator.
        return ''

def parseOneLink(link):
        """Build the record stored for one link body.

        Returns the 4-tuple (namespace, link, trueLink, displayedLink):
        the result of parseNamespace(link), the raw link text as given, and
        the two values returned by parsePipe(link).
        """
        ns = parseNamespace(link)
        target_and_label = tuple(parsePipe(link))
        return (ns, link) + target_and_label

def parseLinks(depth=0):
        """Scan the global pageSource for [[links]] and record each of them.

        Every link found is appended to the global outgoingLinks as the tuple
        returned by parseOneLink(). pageSource is consumed (re-bound to the
        not-yet-scanned remainder) as the scan proceeds.

        depth is 0 for the outermost call, which only looks for the next
        '[['. At depth > 0 pageSource starts immediately after a '[[' and the
        link that was just opened is handled first.

        Note: this function is recursive to handle wikicode such as
                [[Image:Example.gif|thumb|right|This is an [[example]] image]]

        An opening '[[' with no ']]' anywhere after it is skipped; it used to
        be recorded as a bogus link made of the rest of the page minus its
        last character.
        """
        global pageSource, outgoingLinks

        if depth > 0:
                i, j = pageSource.find('[['), pageSource.find(']]')
                if j < 0:
                        # Unterminated link: nothing to record. The caller's
                        # loop has already consumed the opening '[[', so the
                        # scan still makes progress and terminates.
                        return
                outgoingLinks.append(parseOneLink(pageSource[:j]))
                if i < 0 or j < i: # nothing deeper
                        pageSource = pageSource[j+2:]
                        return
                # else a '[[' opens before this link closes: recurse

        while True:
                i = pageSource.find('[[')
                if i < 0:
                        break
                pageSource = pageSource[i+2:]
                parseLinks(depth+1)

def reportLinks(src):
        """Main interface function: report every page that src links to.

        src is the wikicode of a page. The return value is a wikicode bullet
        list (a single string) with one '*' heading per namespace or target
        wiki and one '**' entry per link, followed by the displayed label in
        parentheses when the link has one. Links are sorted by namespace
        first, so the main article namespace comes first. Translation links
        only get their heading; the individual links are not listed.

        Side effect: the module-level pageSource and outgoingLinks are
        overwritten, because parseLinks() works on those globals.
        """
        global pageSource, outgoingLinks
        pageSource = src
        outgoingLinks = []

        parseLinks()
        outgoingLinks.sort()

        # Collect the pieces and join once instead of repeated string +=.
        pieces = ['The following pages are linked to by this page:\n']
        curNamespace = '?'  # sentinel: never equal to a real namespace value
        translation = False
        for namespace, _rawLink, trueLink, displayedLink in outgoingLinks:
                if namespace != curNamespace: # transitioning into new namespace
                        curNamespace = namespace
                        translation = False
                        if curNamespace == '':
                                pieces.append('*Main article namespace:\n')
                        elif curNamespace.startswith(INTERWIKI_PLACEHOLDER):
                                wiki = curNamespace[len(INTERWIKI_PLACEHOLDER):]
                                pieces.append('*Interwiki to %s:\n' % (wiki,))
                        elif curNamespace.startswith(TRANSLATION_PLACEHOLDER):
                                code = curNamespace[len(TRANSLATION_PLACEHOLDER):]
                                pieces.append('*Translation: %s\n' % (code,))
                                translation = True
                        else:
                                pieces.append('*%s namespace:\n' % (curNamespace,))

                if not translation:
                        if displayedLink is None:
                                pieces.append('**[[%s]]\n' % (trueLink,))
                        else:
                                pieces.append('**[[%s]] (%s)\n' % (trueLink, displayedLink))
        return ''.join(pieces)

# Script entry point: report the links of the wikicode pasted into pageSource.
if __name__ == "__main__":
        # Parenthesized single-argument form prints the same text under the
        # Python 2 print statement and the Python 3 print() function.
        print(reportLinks(pageSource))



# <- back to main page | talk