User:Dispenser/Link scraper

From Wikipedia, the free encyclopedia

#!/usr/bin/python
# -*- coding: utf-8  -*-
"""
This is a concept for a user-editable system for retrieving metadata from newspaper sites.
 
The following functions are available:
        hre()           - HTML regular expression match, case insensitive
        fmttime()       - Same as strftime(), but uses gmtime() instead of localtime()
        pageurl()       - returns the URL of the current link
        pagetitle()     - returns the title of the page as given by beautifulsoup
"""
sites = (
        nytime-abstract = {
                # New York Times Abstracts (pre-1987)
                urlmatch: r'http://.*nytimes\.com/.*abstract.*',
                template: "cite web", 
                parameters: {
                        url:        pageurl(),
                        title:      hre(r'<h1>(.*?)</h1>', '\1')
                        publisher:      "New York Times",
                        date:       hre(r'<meta name="WT.z_pud" content="\d{4}\d{2}\d{2}"[^<>]>', r'\1-\2-\3'),
                        page:       hre(r'<\w+[^<>]*note[^<>]*>Page (\d+).*?</p>', r'\1'),
                        author:     hre(r'<\w+[^<>]*note[^<>]*>By .*?</p>', r'\1'),
                        accessdate: fmttime("%Y-%m-%d"),
                }
        },
        nytime-metadata = {
                # New York Times with metadata
                # http://open.blogs.nytimes.com/2007/10/23/messing-around-with-metadata/
                urlmatch: r'http://(www\.)?nytimes\.com/.*\.html',
                template: "cite web", 
                parameters: {
                        url:        pageurl(),
                        title:      hre(r'<meta name="hdl" content="([^"]*)">', r'\1'),
                        publisher:      hre(r'<meta name="cre" content="([^"]*)">', r'\1'),
                        #location:    hre(r'<meta name="geo" content="([^"]*)"[^<>]>', r'\1'),
                        # Place of the news source... of the building or where it happend?
                        date:       hre(r'<meta name="pdate" content="(\d{4})(\d{2})(\d{2})">', r'\1-\2-\3'),
                        author:     hre(r'<meta name="byl" content="by *([^"]*)">', r'\1'),
                        accessdate: fmttime("%Y-%m-%d"),
                }
        },
)
#end