User:Eloquence/Wikinfo import script

From Wikipedia, the free encyclopedia

Copy this script from the page's wiki source (the edit view), not from the rendered text shown here.

This is the first pre-release of the new import script. It runs as a web server on port 8450 and does some cool stuff:

  1. Filters Special:Newpages for new articles that are not "from Wikipedia"
  2. Does not require a cookies.txt - uses its own account. Works with Wikinfo's new login requirement
  3. Auto-generates diffs of new articles that exist in both Wikinfo and Wikipedia
  4. Strips signatures from titles when importing

Note that to do all this it has to load quite a few pages, which takes some time for a full set of 500 new pages.

To do:

  • Cache previous runs in wikinfo.db so we don't have to wait 60 seconds
  • Use Special:Export

If you want to use this in some way or another, make sure you install all the used modules first using CPAN.

#!/usr/bin/perl
use LWP::UserAgent;
use HTTP::Cookies;
use HTTP::Daemon;
use HTTP::Status;
use HTTP::Response;
use URI::Escape;
use Text::ParagraphDiff;
use GDBM_File ;
# ---------------------------------------------------------------------------
# Global setup: persistent store, signal handler, HTTP client and HTTP daemon.
# The variables below are package globals on purpose: the subroutines further
# down (and the request loop) refer to $browser, @ns_headers, $d and %storage
# by name.
# ---------------------------------------------------------------------------

# GDBM-backed hash. Nothing in the visible script reads or writes it yet; the
# to-do list at the top of the page plans to cache previous runs in it.
tie(our %storage, 'GDBM_File', "wikinfo.db", GDBM_WRCREAT(), 0640)
    or die "Cannot open wikinfo.db: $!\n";

# Untie the store cleanly when the server is interrupted with Ctrl-C.
$SIG{INT} = \&catch_zap;  # best strategy

# Not referenced anywhere else in the visible script; presumably key prefixes
# for the planned wikinfo.db cache -- TODO confirm before relying on them.
our $WKPREFIX  = "WIKINFO_";
our $WKSPREFIX = "WIKINFOSIZE_";
our $LCPREFIX  = "LASTCHECK_";
our $WPPREFIX  = "WIKIPEDIA_";
our $DIPREFIX  = "DIFF_";

# One shared user agent with an in-memory cookie jar, so the login cookies
# obtained below are sent with every later request.
our $browser = LWP::UserAgent->new();
$browser->cookie_jar( {} );

# Browser-like request headers passed to every outgoing request. The Accept
# value is kept on a single line so no newline ends up inside a header value.
our @ns_headers = (
   'User-Agent'      => 'Mozilla/4.76 [en] (Win98; U)',
   'Accept'          => 'image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, image/png, */*',
   'Accept-Charset'  => 'iso-8859-1,*,utf-8',
   'Accept-Language' => 'en-US',
);

# Log in to both wikis with the script's own account.
# NOTE(review): the credentials are hard-coded; presumably placeholders that
# are replaced before real use.
for my $login_url (
    "http://www.wikinfo.org/wiki.phtml?title=Special:Userlogin&action=submit",
    "http://en.wikipedia.org/w/wiki.phtml?title=Special:Userlogin&action=submit",
) {
    my $login = $browser->post(
        $login_url, @ns_headers,
        Content => [
            wpName         => "Testuser",
            wpPassword     => "testpass",
            wpRemember     => "1",
            wpLoginAttempt => "LOG IN",
        ],
    );

    # Only 4xx/5xx responses are reported: a redirect after the POST is not an
    # error, so is_success would be too strict a check here.
    warn "Login request to $login_url failed: " . $login->status_line . "\n"
        if $login->is_error;
}

# Local web front-end. Fail loudly if the port cannot be bound instead of
# dying later on a method call against undef.
our $d = HTTP::Daemon->new(LocalHost => 'localhost', LocalPort => 8450, Reuse => 1)
    or die "Cannot start HTTP daemon on localhost:8450: " . ($@ || $!) . "\n";
print "Please contact me at: ".$d->url. "\n";

# Debugging aid: uncomment to run a single listing pass without serving.
#get_wikinfo_new();
#exit 0;

# ---------------------------------------------------------------------------
# Request loop. One request is served per accepted connection:
#   GET /         -> table of new Wikinfo pages (built by get_wikinfo_new)
#   GET /<title>  -> import that Wikinfo page into Wikipedia (import_wikinfo)
#   anything else -> 403 Forbidden
# NOTE(review): the import is triggered by a plain GET and the title from the
# path is used unfiltered; the daemon is bound to localhost only.
# ---------------------------------------------------------------------------
while (my $conn = $d->accept) {
    my $request = $conn->get_request;

    # Page header shared by the listing page and the import result page.
    my $html = <<HTML ;
<html>
<head>
<style TYPE="text/css">
<!--
body { margin-left:2em;margin-right:2em;background:#eeeeee;}
a { text-decoration:none;color:blue;}
a.ext { color:green;cursor:help; }
-->
</STYLE>
</head>
<body>
<h1>Wikinfo Import Script</h1>
HTML

    # get_request yields nothing on a broken request; then nothing is sent.
    if ($request) {
        if ($request->method ne 'GET') {
            $conn->send_error(RC_FORBIDDEN);
        }
        elsif ($request->uri->path eq "/") {
            $html .= <<HTML ;
<table border="1" width="100%">
<tr><td><b>Wikinfo page</B></td><td><b>Corresponding Wikipedia page</B></td><td><b>Import</B></td></tr>
HTML

            # get_wikinfo_new returns a flat list, eight fields per article.
            my @fields = get_wikinfo_new();
            while (my ($linkopen, $linktitle, $linkclose, $bytes,
                       $comment, $wikipedia, $diff, $import) = splice(@fields, 0, 8)) {
                $html .= "<tr valign='top'><td>".$linkopen .$linktitle. $linkclose ." (".$bytes." bytes)";
                $html .= "<br><I>$comment</I>" if $comment;
                $html .= "</td><td>$wikipedia</td><td>$import</td></tr>";

                # "N/A" marks articles for which no diff was generated.
                if ($diff ne "N/A") {
                    $html .= "<tr><td colspan=3 bgcolor=\"#dddddd\"><b>Diff:</B><P><font size=-1>".
                             "$diff</font></td></tr>";
                }
            }

            $html .= <<HTML ;
</table>
</body>
</html>
HTML

            my $reply = HTTP::Response->new(RC_OK);
            $reply->header("Content-Type" => "text/html");
            $reply->content($html);
            $conn->send_response($reply);
        }
        else {
            # Everything after the leading slash is taken as the page title,
            # still in its URL-escaped form.
            my $page = substr($request->uri->path, 1);
            $html .= import_wikinfo($page);
            $html .= "</body></html>";

            my $reply = HTTP::Response->new(RC_OK);
            $reply->header("Content-Type" => "text/html");
            $reply->content($html);
            $conn->send_response($reply);
        }
    }

    $conn->close;    # one request per connection
    undef $conn;
}

# get_wikinfo_new()
#
# Fetches Wikinfo's Special:Newpages (limit 500) and reports every entry whose
# edit comment does not contain "from Wikipedia (note changes here)".
#
# Returns a flat list with eight consecutive elements per reported article:
#   1. opening <a ...> tag of the Wikinfo link
#   2. link title
#   3. closing </a> tag
#   4. size in bytes as shown in the listing
#   5. edit comment ("" when the entry has none)
#   6. HTML for the "Corresponding Wikipedia page" cell
#   7. diff HTML, or "N/A" when the article is not on Wikipedia
#   8. HTML import link, or "N/A" when the article already exists on Wikipedia
# Returns an empty list when the new-pages listing cannot be found.
#
# Uses the globals $browser, @ns_headers and $d.
sub get_wikinfo_new {

        my $response = $browser->get(
            "http://www.wikinfo.org/wiki.phtml?title=Special:Newpages&limit=500&offset=0",
            @ns_headers);

        # Capture through a checked match: after a failed match $1 would still
        # hold whatever an earlier successful match left behind.
        my ($listing) = $response->content =~ m/<ol start=.*?>(.*?)<\/ol>/s
            or return;

        my @entries = split(/<LI>/i, $listing);
        print "Newpages chunks fetched: ", scalar(@entries), "\n";

        my @checklines;

        for my $entry (@entries) {
                next unless $entry =~ m/.*?(<a href.*?>)(.*?)(<\/a>).*?\((.*?) bytes\)/i;
                my ($linkopen, $linktitle, $linkclose, $bytes) = ($1, $2, $3, $4);

                my $comment = $entry =~ m/<em>\((.*)\)<\/em>/i ? $1 : "";

                # Articles already marked as coming from Wikipedia are skipped.
                next if $comment =~ m/from wikipedia \(note changes here\)/i;

                my $wikititle  = to_url($linktitle);
                my $pediatitle = to_url(strip_sig($linktitle));

                push @checklines, $linkopen, $linktitle, $linkclose, $bytes, $comment;

                # A non-blank edit box on Wikipedia means the article exists there.
                my $pediaedit = "http://en.wikipedia.org/w/wiki.phtml?title=" . $pediatitle .
                                "&action=edit";
                my $pediasource = _fetch_textarea($pediaedit);

                if ($pediasource =~ m/\w+/) {

                        push @checklines, "<a href='$pediaedit'>$pediatitle</a>";

                        my $wikinfoedit = "http://www.wikinfo.org/wiki.phtml?title=" .
                                          $wikititle .
                                          "&action=edit";
                        my $wikinfosource = _fetch_textarea($wikinfoedit);

                        my $diff = text_diff($pediasource, $wikinfosource,
                                             {string=>1, plain=>1, escape=>1});

                        # Keep only the paragraph body when there is one; otherwise
                        # leave the diff output as it is rather than reuse a stale $1.
                        $diff = $1 if $diff =~ m/<p>(.*)<\/p>/si;
                        $diff =~ s/ size="\+1">/>/gi;

                        push @checklines, $diff;
                        push @checklines, "N/A"; # exists, no import possible

                } else {

                        push @checklines, "<a href='http://en.wikipedia.org/wiki/$pediatitle'>N/A</A>"; # no Wikipedia URL
                        push @checklines, "N/A"; # no diff
                        my $importurl = $d->url . $wikititle;
                        push @checklines, "<a href='$importurl'>Go!</a>";

                }
        }

        return @checklines;
}

# _fetch_textarea($url)
#
# Private helper: fetches $url with the shared user agent and returns the raw
# (still HTML-escaped) text between the first <textarea ...> tag and the last
# </textarea>, or "" when the response contains no textarea.
sub _fetch_textarea {
        my ($url) = @_;
        my $page = $browser->get($url, @ns_headers);
        return $page->content =~ m/<textarea.*?>(.*)<\/textarea>/is ? $1 : "";
}

# import_wikinfo($title)
#
# Copies one Wikinfo article to the English Wikipedia.
#
#   $title - URL-escaped Wikinfo page title (underscores instead of spaces),
#            as taken from the request path.
#
# Steps: fetch the article's wikitext from Wikinfo's edit form, append an
# attribution line, and post it to Wikipedia under the same title with any
# known author signature stripped -- but only when Wikipedia reports that the
# target page has no text yet.
#
# Returns an HTML fragment describing what happened.
#
# Uses the globals $browser and @ns_headers.
sub import_wikinfo {

        my $title     = shift;
        my $editurl   = "http://www.wikinfo.org/wiki.phtml?title=".$title."&action=edit";
        my $viewurl   = "http://www.wikinfo.org/wiki.phtml?title=".$title;
        my $pagetitle = to_wiki($title);
        my $pediaurl  = to_url(strip_sig($pagetitle));
        my $rv        = "";

        my $response = $browser->get($editurl,@ns_headers);

        # Checked match: a failed match must yield "" and not a stale $1.
        my $source = $response->content =~ m/<textarea.*?>(.*)<\/textarea>/is ? $1 : "";

        # The edit box content is HTML-escaped; turn it back into wikitext.
        # &amp; is decoded last so that "&amp;lt;" becomes "&lt;", not "<".
        $source =~ s/&quot;/"/gi;
        $source =~ s/&gt;/>/gi;
        $source =~ s/&lt;/</gi;
        $source =~ s/&amp;/&/gi;

        if ($source !~ m/\w+/) {
                $rv .= "The page with the specified title was not found: <A HREF='$viewurl'>$viewurl</A> (<a href='$editurl'>edit</a>)";
                return $rv;
        }

        $source.="\n\n''Adapted from the [[Wikinfo]] article [$viewurl $pagetitle], licensed under the [[GNU Free Documentation License]].''";

        $rv.="Checking for duplicate of <A HREF='$viewurl'>$viewurl</A>..<P>";

        my $tryurl = "http://en.wikipedia.org/wiki/".$pediaurl;
        $response = $browser->get($tryurl,@ns_headers);

        if ($response->content =~ m/There is currently no text in this page/) {

                my $wpurl = "http://en.wikipedia.org/w/wiki.phtml?title=".$pediaurl."&action=submit";
                my $post  = $browser->post($wpurl,@ns_headers,Content=>
                [
                wpTextbox1=>$source,
                wpSave=>"Save page",
                wpSummary=>"Imported from Wikinfo via [[User:Eloquence/Wikinfo import script]]"
                ]);

                # Report success only when the save request was not rejected.
                # A redirect answer to the POST is not treated as a failure.
                if ($post->is_error) {
                        $rv.="Posting to <a href='$tryurl'>$tryurl</A> failed: ".$post->status_line."<P>";
                } else {
                        $rv.="Posted new article to <a href='$tryurl'>$tryurl</A>!<P>";
                }

        } else {

                $rv.="Page already exists on Wikipedia: <a href='$tryurl'>$tryurl</A>! You have to merge by hand. :-(";

        }

        return $rv;

}

    # Signal handler (installed for SIGINT): unties the %storage hash and then
    # terminates the program with a message naming the received signal.
    sub catch_zap {
        my ($received) = @_;
        untie %storage;
        die "Program terminated: Received $received";
    }

    # strip_sig($title)
    #
    # Returns $title with a trailing " by <author>" signature removed, for each
    # of the known Wikinfo authors listed below (checked in that order).
    # Titles without such a suffix are returned unchanged.
    sub strip_sig {
        my $title = shift;

        # Lexical loop variable: the list of names is not left behind in (or
        # shifted out of) a package global between calls.
        for my $author ("Levan Urushadze", "Fred Bauder") {
            # \Q...\E keeps the name literal should it ever contain regex
            # metacharacters.
            $title =~ s/ by \Q$author\E$//;
        }

        return $title;
    }

    # to_url($title)
    #
    # Converts a page title into the form used in wiki URLs: spaces become
    # underscores, the result is URI-escaped, and any apostrophe that is still
    # present afterwards is written as %27.
    sub to_url {
        my ($page) = @_;
        $page =~ tr/ /_/;
        my $escaped = uri_escape($page);
        $escaped =~ s/'/%27/g;
        return $escaped;
    }

    # to_wiki($title)
    #
    # Inverse of to_url: URI-unescapes the title, turns any %27 sequence that
    # remains after unescaping into an apostrophe, and replaces underscores
    # with spaces.
    sub to_wiki {
        my ($encoded) = @_;
        my $page = uri_unescape($encoded);
        $page =~ s/%27/'/g;
        $page =~ tr/_/ /;
        return $page;
    }