User:HBC Archive Indexerbot/source
From Wikipedia, the free encyclopedia
[edit] Main code
# This script is released under the GFDL license, see # http://en.wikipedia.org/w/index.php?title=User:HBC Archive Indexerbot/source&action=history # for a full list of contributors ### Configuration ### # Time to sleep between writes (in seconds) my $write_delay = 5; # Max pages to download at once my $download_max = 25; # Default template my $default_template = 'User:HBC Archive Indexerbot/default template'; # Cache paths my $cache = 'cache'; my $wiki_cache = "$cache/wiki"; my $obj_cache = "$cache/obj"; my $report_cache = "$cache/reports"; ### End Configuration ### use strict; use warnings; use Data::Dumper; use Date::Parse; use Digest::SHA1 qw(sha1_hex); use IO::Socket; use MediaWiki; use POSIX qw(strftime); use Storable; use Time::Duration; use Time::Local; use URI::Escape; use XML::Simple; my($log_file,$pages_watched,$pages_downloaded,$pages_attempted,$dl,$ul) = ('',0,0,0,0,0); my $nowiki = 'nowiki'; my $start_time = undef; ######################################### # Log into Wikipedia # ######################################### die "Cache directories must be created in advance\n" unless (-d $cache && -d $wiki_cache && -d $obj_cache); open(PASS,'password'); # A file with only the password, no carraige return sysread(PASS, my $password, -s(PASS)); # No password in sourcecode. close(PASS); writelog ('Connecting to Wikipedia'); my $c = MediaWiki->new; $c->setup ({ 'bot' => {'user' => 'HBC Archive Indexerbot','pass' => $password}, 'wiki' => {'host' => 'en.wikipedia.org','path' => 'w'} }) || die 'Failed to log in'; my $whoami = $c->user(); writelog ($whoami.' connected'); ######################################### # Update cache to modern state # ######################################### writelog ('Gathering jobs'); my @master_job_list = gather_jobs(); my @post_jobs = @master_job_list; writelog (scalar(@master_job_list).' jobs found'); writelog ('Parsing cache'); my $rh_cache_data = parse_cache(); writelog ('done'); writelog ('Parsing watchlist'); my $rh_modified_data = parse_watchlist(); writelog ('done'); download_pages(find_updated()); download_pages(find_templates(@master_job_list)); #push (@needed,&find_holes()); fetch_pages(@master_job_list); writelog ("$pages_watched pages added to watchlist."); writelog ("$pages_downloaded out of $pages_attempted downloaded."); ######################################### # Parse cached data and create reports # ######################################### writelog ('Creating reports'); foreach my $ra_job (@post_jobs) { my $page = $ra_job->{'page'}; my $dest = $ra_job->{'target'}; my $dest_escaped = _escape($dest); my $mask = join(', ', @{$ra_job->{'mask'}}); my $index_here = $ra_job->{'indexhere'}; unless (check_text(1000,$dest)) { writelog ('Not writing to [['.$dest.']] as I cannot find permission'); next; } my $report = create_report($ra_job); open(REPORT, ">$report_cache/$dest_escaped.$$"); use bytes; print REPORT $report; close(REPORT); if (-e "$report_cache/$dest_escaped") { my $result = `diff --brief "$report_cache/$dest_escaped" "$report_cache/$dest_escaped.$$"`; unless ($result) { writelog ('No change, skipping [['.$dest.']]'); unlink "$report_cache/$dest_escaped.$$"; next; } } writelog ('Writing report at [['.$dest.']]'); my $edit_summary = "Writing index of archives in $mask due to request from \[\[$page\]\] - Bot edit"; send_report($dest,$report,$edit_summary); rename "$report_cache/$dest_escaped.$$", "$report_cache/$dest_escaped"; } $ul += 120 + length($log_file); writelog ('Complete, downloaded {{formatnum:'.int($dl/1024).'}} kilobyte(s) and uploaded {{formatnum:'.int($ul/1024).'}} kilobyte(s) (figures approximate)'); &post_log(); exit; ######################################### # Subroutines # ######################################### sub check_text { my $bytes = shift; my $page = shift; my $host = 'en.wikipedia.org'; my $path = _underscore('/wiki/'.$page.'?action=raw'); my $sock = new IO::Socket::INET ( PeerAddr => $host, PeerPort => 80, Proto => 'tcp', ); return 0 unless ($sock); my $header = ('GET http://'.$host.$path.' HTTP/1.1'."\r\n".'User-Agent: HBC Archive Indexerbot 0.9a'."\r\n\r\n"); syswrite ($sock, $header); my($buf , $content, $done); while (!$done) { ($done = 1) unless sysread($sock, $buf, $bytes); $content .= $buf; if ((length($content) >= $bytes) || ($content =~ m|!-- HBC Archive Indexerbot can blank this --|)) { $done = 1; } } close($sock); $dl += length($content); return ($content =~ m|!-- HBC Archive Indexerbot can blank this --|); } sub create_report { my ($ra_job) = @_; my ($rh_index, $numbered_links) = index_headings($ra_job); my $template = get_template($ra_job->{'template'}); my $report = sprintf("%sThis report has been generated because of a request at [[%s]]. It covers the archives that match '''%s'''\n<br/>Report generated at ~~~~~ by ~~~\n----\n\n", $template->{'lead'}, $ra_job->{'page'}, join(', ', @{$ra_job->{'mask'}})); $report .= $template->{'header'}; my $i = 0; foreach my $key (sort {lc($a) cmp lc($b) || $rh_index->{$a}->{'root_path'} cmp $rh_index->{$b}->{'root_path'}} (keys(%{$rh_index}))) { $rh_index->{$key}->{'topic'} =~ s:({{.*?}}|[|!]{2}):<$nowiki>$1</$nowiki>:g; my $row = $template->{'row'}; if ($template->{'altrow'}) { unless ($i++ % 2 == 0) { $row = $template->{'altrow'} } } foreach ('topic','replies','link','first','last','duration', 'firstepoch','lastepoch','durationsecs') { $row =~ s:%%$_%%:${$rh_index}{$key}{$_}:gi; } $report .= $row; } $report .= sprintf("%s\n%s", $template->{'footer'}, $template->{'tail'}); return $report; } sub download_pages { my (@pages) = @_; return unless @pages; my $requests = scalar(@pages); my (@received_names); while (@pages) { my @batch; while ((scalar(@batch) < 50) && @pages) { my $item = shift(@pages) || last; $item = _underscore($item); push (@batch, $item); } $pages_attempted += scalar(@batch); my $xml_code = $c->special_export(@batch); $dl += length($xml_code); my $xml_result = XMLin($xml_code); next unless ($xml_result->{'page'}); if ($xml_result->{'page'}{'title'}) { push (@received_names, handle_chunk($xml_result->{'page'})); } else { foreach my $key (keys %{$xml_result->{'page'}}) { push (@received_names, handle_chunk($xml_result->{'page'}->{$key})); } } } writelog('Downloaded '.scalar(@received_names)." pages from $requests requests"); return (@received_names); } sub fetch_pages { my (@jobs) = @_; my (@cache_names) = keys(%$rh_cache_data); foreach my $ra_job (@jobs) { my @fetch; if ($ra_job->{'indexhere'}) { my $page = _underscore($ra_job->{'page'}); push(@fetch, $ra_job->{'page'}) unless (grep(/^\Q$page\E$/, @cache_names)); } my $fetch_size = 0; foreach my $mask (@{$ra_job->{'mask'}}) { if ($mask =~ m|<#>|) { $fetch_size += 10; my $pattern = _underscore($mask); my ($part1, $part2) = split(m|<#>|, $pattern, 2); $pattern = qr/\Q$part1\E(\d+)/; $pattern .= qr/\Q$part2\E/ if $part2; my $leading_zeros = $ra_job->{'leading_zeros'}+1; my $marker = '%d'; $marker = '%0'.$leading_zeros.'d' if ($leading_zeros > 1); my $printf_pattern = $mask; $printf_pattern =~ s|<#>|$marker|; my(@mask_pages) = grep(/$pattern/,@cache_names); my $largest = 0; foreach my $key (@mask_pages) { ($key =~ m|$pattern|) || next; $largest = $1 if ($1 > $largest); } my $count = $largest; my (@pages); until ($count >= ($largest + $fetch_size)) { $count++; my $page_name = sprintf($printf_pattern, $count); push(@fetch,$page_name); } } else { my $check = _underscore($mask); push (@fetch, $mask) unless (grep(/^\Q$check\E$/, @cache_names)); } } continue { if (scalar(@fetch)) { my (@received) = download_pages(@fetch); $rh_cache_data = parse_cache(); (@cache_names) = keys(%$rh_cache_data); if (scalar(@fetch) == scalar(@received)) { @fetch = (); redo; } else { @fetch = (); } } $fetch_size = 0; } } } sub find_holes # This sub will find gaps in the archive(mabye a page was deleted then restored) and { # adds them to the list of potentially needed pages return(); } sub find_templates { my (@jobs) = @_; my %templates; my @templates_needed; foreach my $ra_job (@jobs) { $templates{$ra_job->{'template'}}++; } foreach my $template (keys %templates) { $template = $default_template if $template eq 'default'; my $tmpl_under = _underscore($template); push(@templates_needed, $template) unless grep(/^\Q$tmpl_under\E$/, keys %{$rh_cache_data}); } writelog (scalar(@templates_needed).' templates needed'); return @templates_needed; } sub find_updated # Find items that have changed { my(@need_update); foreach my $page (keys(%{$rh_cache_data})) { if ($rh_modified_data->{$page}) { # If it's not on the watchlist, it hasn't # been modified in the past month, ignore if ($rh_cache_data->{$page} < ${$rh_modified_data}{$page}) { push(@need_update,$page); my $fname = ("$wiki_cache/".uri_escape($page).' '.$rh_cache_data->{$page}); unlink($fname); # Remove old item } } } writelog (scalar(@need_update).' pages need updating'); return @need_update; } sub gather_jobs { my (@jobs); my $html_list = $c->{ua}->get($c->{index}."?title=Special:Whatlinkshere/User:HBC Archive Indexerbot/OptIn&limit=5000")->content(); $dl += length($html_list); my @targets; while ($html_list =~ s|>([^<]*?)</a> \(transclusion\)||) { push(@targets,$1); } my $xml_source = XMLin($c->special_export(@targets)); my $xml = $xml_source; $dl += length($xml_source); my $rh_pages = ${$xml}{'page'}; my %targets; foreach my $key (keys(%{$rh_pages})) { my $content = ${$rh_pages}{$key}{'revision'}{'text'}{'content'}; if ($content =~ m"\Q{{User:HBC Archive Indexerbot/OptIn\E\s*\|(.+?)\s*\Q}}\E"s) { my @params = split(/\s*\|\s*/, $1); my %job = ( page => $rh_pages->{$key}{'title'}, leading_zeros => 0 ); foreach my $param (@params) { my ($key, $value) = split(/\s*=\s*/, $param); next unless ($key && defined($value)); $value =~ s:^\.?/:$job{'page'}/:; if ($key eq 'target') { $job{'target'} = $value; } elsif ($key eq 'mask') { push (@{$job{'mask'}}, $value); } elsif ($key =~ /leading_zeroe?s/) { $job{'leading_zeros'} = $value; } elsif ($key eq 'indexhere') { $job{'indexhere'} = (($value =~ m|ye?s?|i) ? ('1') : ('0')); } elsif ($key eq 'template') { $job{'template'} = $value; } } $job{'template'} = 'default' unless $job{'template'}; next unless ($job{'target'} && $job{'mask'}); if ($targets{$job{'target'}}) { writelog("Request on $job{'page'} duplicates target $job{'target'}; skipping"); next; } else { $targets{$job{'target'}}++; } push(@jobs,\%job); } } return @jobs; } sub get_template { my ($template) = (@_); if ($template eq 'default') { $template = $default_template; } my $tmpl_fn = _escape($template); my ($file) = glob("$wiki_cache/$tmpl_fn*"); unless ($file) { if ($template eq $default_template) { die "$template missing from cache\n"; } else { return get_template('default'); } } open(TMPL, $file); my @content = <TMPL>; close(TMPL); my %template = (lead => '', header => '', row => '', altrow => '', footer => '', tail => ''); my $section = ''; foreach my $line (@content) { chomp $line; if ($line =~ m:^<!--\s*(.*?)\s*-->$:) { $section = lc($1); $section =~ s/\s+//g; last if $section eq 'end'; } else { if ($section) { next unless $line; $template{$section} .= "$line\n"; } } } $template{'lead'} .= "\n" if $template{'lead'}; unless ($template{'row'}) { die "Default template missing 'row' parameter!\n" if $template eq $default_template; writelog("Invalid template: '$template', using default instead"); return get_template('default'); } return \%template; } sub handle_chunk { my $chunk = shift; my $name = _underscore(${$chunk}{'title'}); my $fname = "$wiki_cache/".uri_escape($name); ${$chunk}{'revision'}{'timestamp'} =~ m|(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2})Z|; my $time = timegm($6,$5,$4,$3,$2-1,$1); watch($name) unless (${$rh_cache_data}{$name}); open(OUT,">$fname $time"); binmode(OUT); use bytes; print OUT (${$chunk}{'revision'}{'text'}{'content'}); no bytes; close(OUT); $pages_downloaded++; return $name; } sub index_headings { my ($ra_job) = @_; my $mask_re = ''; foreach my $mask (@{$ra_job->{'mask'}}) { my $mask2 = _escape($mask); if ($mask2 =~ m|%3C%23%3E|) { my ($part1, $part2) = split(m|%3C%23%3E|, $mask2, 2); $mask_re .= '(?:'; $mask_re .= qr/\Q$part1\E\d+/; $mask_re .= qr/\Q$part2\E/ if $part2; $mask_re .= ')|'; } else { $mask_re .= qr/\Q$mask2\E/.'|'; } } chop($mask_re); opendir(CACHE,$wiki_cache); my(@cache) = readdir(CACHE); closedir(CACHE); my @files = grep(m|^(?:$mask_re)|,@cache); if ($ra_job->{'indexhere'}) { my $page = _escape($ra_job->{'page'}); push(@files, grep(m|^\Q$page\E \d+$|,@cache)); } my (%index, %used_headings); my $numbered_links = 0; foreach my $file (@files) { my (%used_names); next unless ($file =~ m|^(.*) (\d+)$|); my $root_path = uri_unescape($1); my $display_name = $root_path; $display_name =~ s/_/ /g; open(WIKI, "$wiki_cache/$file"); my @content = <WIKI>; close(WIKI); my $prev_heading = ''; my ($comment_count,$first,$last) = (0,0,0); foreach my $line (@content) { if ($line =~ m|^==\s*([^=].+?)\s*==|) { if ($prev_heading && $comment_count > 0) { ## WARNING: This code is duplicated below vvvvvv $index{$prev_heading}->{'replies'} = $comment_count; if ($first && $last) { $index{$prev_heading}->{'firstepoch'} = $first; $index{$prev_heading}->{'first'} = strftime('%F %T',gmtime($first)); $index{$prev_heading}->{'lastepoch'} = $last; $index{$prev_heading}->{'last'} = strftime('%F %T', gmtime($last)); $index{$prev_heading}->{'durationsecs'} = $last - $first; if ($comment_count > 1) { $index{$prev_heading}->{'duration'} = duration($last - $first); } else { $index{$prev_heading}->{'duration'} = 'None'; } } $comment_count = 0; $first = 0; $last = 0; } my $heading = $1; my $head_link; ($head_link, $numbered_links) = path_fix($heading, $numbered_links); $used_names{$head_link}++; my $suffix = (($used_names{$head_link} > 1) ? ('_'.$used_names{$head_link}) : ('')); $used_headings{$head_link.$suffix}++; $prev_heading = $head_link.$suffix.'_'.$used_headings{$head_link.$suffix}; $index{$prev_heading} = { topic => $heading, link => ("[[{{urlencode:$root_path}}#$head_link".$suffix."|$display_name]]"), root_path => $root_path, head_link => $head_link, replies => 'Unknown', first => 'Unknown', 'last' => 'Unknown', duration => 'Unknown', firstepoch => 0, lastepoch => 0, durationsecs => 0, }; } elsif ($line =~ m/\[\[User.*[\]>)}].*?\s+(.*\(UTC\))/) { $comment_count++; my $time = str2time($1); if ($time && (!$first || $time < $first)) { $first = $time; } if ($time && ($time > $last)) { $last = $time; } } } if ($prev_heading && $comment_count > 0) { ## WARNING: This code is duplicated from above ^^^^^^ $index{$prev_heading}->{'replies'} = $comment_count; if ($first && $last) { $index{$prev_heading}->{'firstepoch'} = $first; $index{$prev_heading}->{'first'} = strftime('%F %T', gmtime($first)); $index{$prev_heading}->{'lastepoch'} = $last; $index{$prev_heading}->{'last'} = strftime('%F %T', gmtime($last)); $index{$prev_heading}->{'durationsecs'} = $last - $first; if ($comment_count > 1) { $index{$prev_heading}->{'duration'} = duration($last - $first); } else { $index{$prev_heading}->{'duration'} = 'None'; } } } } return \%index; } sub parse_cache { my (@pages,$count); opendir(CACHE,$wiki_cache); my(@files) = readdir(CACHE); closedir(CACHE); my(%cache); foreach my $file (@files) { next unless ($file =~ m|^(.*) (\d+)$|); my $page_name = uri_unescape($1); my $time = $2; $cache{$page_name} = $time; } return \%cache; } sub parse_watchlist { my $watchlist = $c->{ua}->get($c->{index}."?title=Special:Watchlist&days=0")->content(); $dl += length($watchlist); my @lines = split("\n",$watchlist); my @date; my %watchlist; while (scalar(@lines)) { my $line = shift(@lines); if ($line =~ m|<h4>(\d{4})-(\d{2})-(\d{2})</h4>|i) { @date = ($1,$2,$3); } if ($line =~ m|title="([^"]*?)">hist</a>|i) # " { my $page_name = _underscore($1); $line =~ m|(\d{2}):(\d{2}):(\d{2})|; $watchlist{$page_name} = timegm($3,$2,$1,$date[2],$date[1]-1,$date[0]); } } return \%watchlist; } sub path_fix { my ($path,$numbered_links) = @_; ($path =~ s|'{2,4}||g); ($path =~ s|<.*?>||g); ($path =~ s/\[\[:?.*?\|(.*?)\]\]/$1/g); ($path =~ s|\[\[:?(.*?)\]\]|$1|g); while ($path =~ m|\[.*?\]|) { my $title; if ($path =~ m|\[[^ ]* (.*?)\]|) { $title = $1; } else { $numbered_links++; $title = ".5B$numbered_links.5D"; } $path =~ s|\[.*?\]|$title|; } ($path =~ s|\s|_|g); ($path =~ s| |.C2.A0|g); while ($path =~ m|([^/a-z0-9\.:_'-])|i) { my $bad = $1; my $fix = uc('.'.sprintf("%x",ord($bad))); ($path =~ s/\Q$bad/$fix/g); } return ($path,$numbered_links); } sub post_log { my $pg = $c->get('User:HBC Archive Indexerbot/logs', 'rw'); $pg->{summary} = ('Writing log file for '.$start_time).' - Bot edit'; $pg->{content} = $log_file; $pg->save(); } sub send_report { my $dest = shift; my $report = shift; my $edit_summary = shift; my $pg = $c->get($dest, 'w'); $pg->{summary} = $edit_summary; $pg->{content} = '<!-- HBC Archive Indexerbot can blank this -->'."\n".$report; $ul += length($report); $pg->save() || writelog("Failed to save report to $dest"); sleep($write_delay); } sub watch { my $page_name = shift; my $success = $c->{ua}->get($c->{index}."?title=$page_name&action=watch")->is_success; $pages_watched++ if ($success); return $success; } sub writelog { my $entry = shift; my @month_table = ( 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December', ); my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday) = gmtime(time); my $time = sprintf("%02d:%02d:%02d %02d %s %04d", $hour,$min,$sec,$mday,$month_table[$mon],($year+1900)); $start_time ||= $time; $log_file .= ('* '.$time.': '.$entry.' ~~~'."\n"); warn "$entry\n"; } sub _escape { my ($val) = @_; $val = _underscore($val); $val = uri_escape($val); return $val; } sub _hash { my ($val) = @_; $val = _escape($val); $val = sha1_hex($val); return $val; } sub _underscore { my ($val) = @_; $val =~ s|\s|_|g; return $val; }
[edit] MediaWiki Change
Fixing an error in MediaWiki/page.pm(fergotten &):
return $obj->{ua}->get($obj->_wiki_url . "&action=" . ($unwatch ? "un" : "") . "watch")->is_success;