User:Interiot/Tool/code
From Wikipedia, the free encyclopedia
< User:Interiot | Tool
<nowiki>
#!/usr/bin/perl
# License: [[Public domain]]
#
# "Wannabe Kate" — a screen-scraping substitute for Kate's edit-count tool.
# Fetches a user's Special:Contributions pages and renders edit-count
# statistics (per-namespace totals, monthly bar graphs, top-15 articles).

# regression test cases:
#   Markadet   fr.wikipedia.org   (11k edits)
#   Kolja21    de.wikipedia.org   (5.1k edits)
#   OldakQuill en.wikipedia.org   (12k edits)
#   Mxn        meta.wikimedia.org (1.7k edits)
#   Helios89   it.wikipedia.org   (7k edits)

# TODO:
# - regarding the "403 access denied" problem, contact noc@wikimedia.org or #wikimedia-tech on freenode
#   - ahh, they actively block screen-scrapers
#   - sweet-talk Hashar or Dom into unblocking, temporarily disable the tool or enable some form of rate limiting, etc.
# - add a starting-cutoff-date, so renominations for RfA could only include the most recent items
# - add a # edits per day
# - use something like this to retrieve the list of namespaces in real-time:
#   http://en.wikiquote.org/wiki/Special:Export/Main_Page
# - make "minor" actually work well for editcountitis:
#   - eg. for each namespace, present it like:   Category Talk:  23 (13)
#     where "23" is the non-minor edits, and "13" is the minor edits
# - get it to work with other mediawikis (example: http://brandt-watch.org/bwwiki/Main_Page)
# - include a date at the end of the top-15 breakdown
# - change the <div>s to <td>s on graph bars
# - don't count comments as having an edit summary when it's purely an autocomment
# - fix the issue where there's an "extra" first result when $offset > 0
#
# - REWRITE IN AJAX so we don't have to worry about it being a temporary solution or not
# - fix the sorting order on the output
#
# - ?? http://tools.wikimedia.de/~avar/cgi-bin/count

# Possible other analysis graphs:
# - monthly breakdowns
#     : have all the monthly breakdowns appear in one space on the page, but allow the user to
#       select between them with Javascript
# - monthly breakdown of major/minor edits (like current red/green... make major edits on left,
#   with minor edits trailing on right)
# - monthly breakdown of the number of edits with summaries of /^(rv|revert)/
# - monthly breakdown, one each for the separate namespaces
# - on monthly breakdowns, extrapolate the current month forward
# - allow the user to hit ''(more)'' at the bottom of the namespace breakdowns, allowing them to
#   see a more complete list of top-15
# - allow the user to restrict the metrics to some specific recent period... eg. this is
#   something that's sometimes discussed on RfA
# - any content-based analyses? (I suppose one would have to know which SQL thingies are quicker than others)
# semi-far-out:
# - allow the user to see JUST their edits from a specific page, when they click on that page on
#   the top-15 breakdown (furthermore, if structured right, it might let anybody's tool basically to
#   pop up the results of a $user && $page query)
# - allow the results to be the combination of multiple users (either logged-in-user + anon-IP,
#   and multiple logged-in-users from multiple sites, eg. meta)

use strict;
use warnings;

use CGI;
#use CGI::Carp qw(fatalsToBrowser);
use Date::Parse;
use LWP::Simple;
use HTML::Entities;
use Data::Dumper;

# Path of the append-only usage log (user, site, bandwidth, edit total).
sub LOGFILE {"/home/interiot/public_html/tmp/wannabe_kate.log"}

# "?code" — send our own source when requested.
if (defined $ENV{QUERY_STRING} && $ENV{QUERY_STRING} eq "code") {
    if (open my $fin, '<', $0) {
        print "Content-type: text/plain\n\n", <$fin>;
        close $fin;
    }
    exit;
}

# nmspc($text): parse a whitespace/comment-formatted namespace list (one
# namespace per line, "#" starts a comment) into a hash usable both as a
# set (namespace => 1) and, via the "\x00order" key, as an ordered list.
# ("\x00" can't appear in a namespace name, so the order key can't collide.)
# fill out using these documents:
#   http://meta.wikimedia.org/wiki/MediaWiki_localisation#Getting_latest_file
#   http://sourceforge.net/docs/E04/
sub nmspc {
    my @a = map  { s/#.*//; s/^\s+|\s+$//g; $_ }   # strip comments and edge whitespace
            grep { /\S/ }                          # drop blank lines
            split /[\n\r]+/, shift;
    return {
        "\x00order" => [@a],
        map { $_, 1 } @a,
    };
}

my %valid_namespaces = (
    'en.wikipedia.org' => nmspc(qq[
        Talk:
        Category talk:
        Category:
        Help:
        Help talk:
        Image:
        Image talk:
        MediaWiki:
        MediaWiki talk:
        Portal:
        Portal talk:
        Template:
        Template talk:
        User:
        User talk:
        Wikipedia:
        Wikipedia talk:
    ]),
    'de.wikipedia.org' => nmspc(qq[
        Diskussion:             # Talk
        Kategorie:              # Category:
        Kategorie Diskussion:   # Category Talk:
        Hilfe:                  # Help:
        Hilfe Diskussion:       # Help Talk:
        Bild:                   # Image:
        Bild Diskussion:        # Image Talk:
        MediaWiki:              # MediaWiki:
        MediaWiki Diskussion:   # MediaWiki Talk:
        Portal:                 # Portal:
        Portal Diskussion:      # Portal Talk:
        Vorlage:                # Template:
        Vorlage Diskussion:     # Template Talk:
        Benutzer:               # User:
        Benutzer Diskussion:    # User Talk:
        Wikipedia:              # Wikipedia:
        Wikipedia Diskussion:   # Wikipedia Talk:
    ]),
    # NOTE(review): these Italian entries have no trailing ":" while the
    # namespace-capture regex below only matches prefixes ending in ":",
    # so the exists-lookup may never hit them — verify against a live run.
    'it.wikipedia.org' => nmspc(qq[
        Discussione             # Talk:
        Categoria               # Category:
        Discussioni categoria   # Category Talk:
        Aiuto                   # Help:
        Discussioni aiuto       # Help Talk:
        Immagine                # Image:
        Discussioni immagine    # Image Talk:
        MediaWiki               # MediaWiki:
        Discussioni MediaWiki   # MediaWiki Talk:
        Template                # Template:
        Discussioni template    # Template Talk:
        Utente                  # User:
        Discussioni utente      # User Talk:
        Wikipedia               # Wikipedia:
        Discussioni Wikipedia   # Wikipedia Talk:
    ]),
);

my $query    = CGI->new;
my $site     = $query->param("site");
my $username = $query->param("username");
$site     = '' unless defined $site;        # avoid undef warnings on first visit
$username = '' unless defined $username;
$username = CGI::Util::escape($username);   # URL-encode so it can go straight into the fetch URL
$username =~ s/[\+\s]/_/g;                  # MediaWiki titles use "_" for spaces

my $isvalid = 0;
my $this_namespace;
if ($ENV{QUERY_STRING}) {
    $isvalid = 1;
    $isvalid = 0 unless ($site =~ /^[\w\.]*\.(org|com|net)$/i);
    #$isvalid = 0 unless ($username =~ /^[-\w\._]*$/);
    $isvalid = 0 if (length($username) == 0);
}

# data we generate by parsing the output from Wikipedia
my @urls;                         # every Special:Contributions URL we fetched
my $bandwidth_down = 0;           # bytes downloaded, for the usage log
my %namespace_totals;             # namespace (or summary stat) => edit count
my $xml_lang = "";                # the site's own <html ...> tag, echoed back
my $earliest_perldate;            # epoch time of the oldest parsed edit
my $latest_perldate;              # epoch time of the newest parsed edit
my %month_totals;                 # "YYYY/M" => edits that month
my %month_editsummary_totals;     # "YYYY/M" => edits with a summary
my %unique_articles;              # article URL fragment => edit count
my %namespace_unique_articles;    # namespace => { article => edit count }
my %article_titles;               # article URL fragment => display title

print "Content-type: text/html; charset=utf-8\n\n";

#cgi_dumper(\%valid_namespaces);

if (!$isvalid) {
    ## no (valid) query yet: show the input form and the caveats
    if ($ENV{QUERY_STRING}) {
        print "<font color=red><b>Invalid value</b></font>. <a href='http://en.wikipedia.org/wiki/Special:Emailuser/Interiot'>email Interiot</a> if this is incorrect.<p><br><br>\n";
    }
    print <<"EOF";
This is a slow substitute for <a href="http://en.wikipedia.org/wiki/Wikipedia:Kate%27s_Tool">Kate's Tool</a> when it's unavailable.

<form method=GET style="padding-top:1em">
<table>
<tr><td>username <td><input maxlength=128 name=username value="" title="username">
<tr><td>site     <td><input maxlength=128 name=site value="en.wikipedia.org" title="site">
<tr><td>         <td><input type=submit value="Submit">
</table>
</form>

Notes:
<ul>
<li>Green bars are for edit summaries, red bars are for edits with no summaries
<li>The statistics are real-time (it <a href="http://en.wikipedia.org/wiki/Screen_scraping">scrapes</a> data off of the <tt>Special:Contributions</tt> page while you wait).
<li>It's somewhat slow for edit counts over 5000
<li>It's unable to count deleted edits
<li>It should work with most wikis out there that use <a href="http://en.wikipedia.org/wiki/MediaWiki">MediaWiki</a>, since it doesn't need privileged access to the databases.
<!-- <li>This can't be more than a temporary solution for Wikipedia, as it wastes ~1GB/day of extra bandwidth compared to Kate's -->
<li>Source code is in the <a href="http://en.wikipedia.org/wiki/Public_domain">public domain</a> and available <a href="$ENV{SCRIPT_NAME}?code">here</a>
<li>Warning: <a href="http://www.bbc.co.uk/dna/h2g2/A1091350">metrics are evil</a>
</ul>

For bug reports/comments, see <a href="http://en.wikipedia.org/wiki/User_talk:Interiot">User talk:Interiot</a> or <a href="http://en.wikipedia.org/wiki/Special:Emailuser/Interiot">email him</a>.
EOF
} else {
    ## valid query: scrape the contributions and render the report
    $this_namespace = $valid_namespaces{lc $site};
    #cgi_dumper(\$this_namespace); exit;

    # strip leading/trailing underscore runs (was: s/^_+|_$//g, which only
    # removed a single trailing underscore)
    $username =~ s/^_+|_+$//g;
    #print "$site<br>$username\n";

    $namespace_totals{earliest} = get_5000($site, $username, 0);
    #cgi_dumper(\@urls, \%namespace_totals); exit;
    #cgi_dumper(\%unique_articles);
    $namespace_totals{"number of unique articles"} = scalar(keys %unique_articles);
    # guard against divide-by-zero when nothing was scraped
    $namespace_totals{"avg edits per article"} =
        $namespace_totals{"number of unique articles"}
            ? sprintf("%5.2f", ($namespace_totals{total} || 0)
                               / $namespace_totals{"number of unique articles"})
            : sprintf("%5.2f", 0);

    # echo the site's own <html ...> tag (captured in get_5000) so lang= etc. carry over
    print $xml_lang, <<'EOF';
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<style>
td {padding: .1em 1em .1em}
table.months {padding-top: 2em}
table.months td.date {font-weight: bold}
table.months td {font-size: 75%}
div.red, div.green {
    height:1em;
    float:left;
}
div.red {background-color: #f00}
div.green {background-color: #0f0}
div.topN {
    float: left;
    min-height: 30em;   /* otherwise, they get ALL jumbled up */
}
table.topN {
    float: left;
    border: 1px solid black;
}
table.topN th {
    background-color: #000;
    color: #fff;
}
table.topN td {
    /* override the above */
    padding: .1em .3em;
}
</style>
</head>

<body>
EOF
    print "<ul style='padding-left:10em'><a href='$ENV{SCRIPT_NAME}'><i>Go back</i></a> to see caveats or to check another user.</ul>\n";
    print "<h3>User:$username</h3>\n";

    #### output the overall namespace totals
    print "<table>\n";
    foreach my $key (sort keys %namespace_totals) {
        print "<tr><td>", $key, "<td>", $namespace_totals{$key}, "\n";
    }
    print "</table>\n";

    #### output the months stats
    #cgi_dumper(\%month_editsummary_totals);
    my @months = list_months();
    my $max_width = 0;
    $max_width = ($_ > $max_width ? $_ : $max_width) foreach (values %month_totals);
    if ($max_width > 0) {
        print "<table class=months>\n";
        foreach my $month (@months) {
            # list_months() includes gap months with no edits; default those to 0
            my $count      = $month_totals{$month}             || 0;
            my $summarized = $month_editsummary_totals{$month} || 0;
            my $no_summary = $count - $summarized;
            print "<tr><td class=date>$month <td>", $count, "\n";
            #print "<td><div class=red style='width:", int(500 * $count / $max_width), "px'></div>\n";
            print "<td><div class=green style='width:", int(500 * $summarized / $max_width), "px'></div>\n";
            print "<div class=red style='width:", int(500 * $no_summary / $max_width), "px'></div>\n";
        }
        print "</table>\n";
    }

    #### output the top-15 namespace stats
    my $num_to_present = 15;
    if ($this_namespace) {   # only do it if we're sure about the namespaces
        print "<p><br>\n";
        #print "<ul>NOTE: This section has a tendency to hilight a user's \"youthful indiscretions\". Please take the dates of the edits into account.</ul>\n";
        foreach my $nmspc ("Mainspace", @{$this_namespace->{"\x00order"}}) {
            next unless (scalar(keys %{$namespace_unique_articles{$nmspc}}));
            my @articles = sort { $namespace_unique_articles{$nmspc}{$b}
                                  <=> $namespace_unique_articles{$nmspc}{$a} }
                           grep { $namespace_unique_articles{$nmspc}{$_} > 1 }   # filter out items with only 1 edit
                           keys(%{$namespace_unique_articles{$nmspc}});
            next unless (@articles);

            #print "<div class=topN>\n";
            print "<table class=topN><tr><th colspan=2>$nmspc\n";
            my @present = splice(@articles, 0, $num_to_present);
            foreach my $article (@present) {
                my $artname = $article_titles{$article};
                defined $artname or $artname = '';   # title regex may not have matched this entry
                if ($nmspc ne 'Mainspace') {
                    $artname =~ s/^.*?://;           # drop the namespace prefix for display
                }
                # NOTE(review): this may originally have inserted &nbsp; — the
                # wiki copy this was recovered from collapses entities. Confirm.
                $artname =~ s/\s/ /g;
                my $url = "http://$site/w/index.php?title=$article&action=history";
                print "<tr><td>", $namespace_unique_articles{$nmspc}{$article}, "<td><a href='$url'>$artname</a>\n";
            }

            # fill it out so float:left doesn't jumble up
            # (was a hard-coded 15; use $num_to_present for consistency)
            for (my $ctr = @present; $ctr < $num_to_present; $ctr++) {
                print "<tr><td> <td> \n";
            }
            print "</table>\n";
            #print "</div>\n";
        }
    }

    #### output the bottom summary
    print "<p style='clear:left'><br><br>If there were any problems, please <a href='http://en.wikipedia.org/wiki/Special:Emailuser/Interiot'>email Interiot</a> or post at <a href='http://en.wikipedia.org/wiki/User_talk:Interiot'>User talk:Interiot</a>.\n";

    #print "<p>Based on these URLs:\n<ul>\n", join("\n", map {"<li><a href='$_>$_</a>"} @urls), "</ul>\n";
    print "<div style='padding:1em 3em; font-size: 60%'>Based directly on these URLs:\n";
    for (my $ctr = 0; $ctr < @urls; $ctr++) {
        print "<a href='$urls[$ctr]'>[", ($ctr + 1), "]</a>";
        print ", " unless ($ctr >= @urls - 1);
        print "\n";
    }
    print "</div>\n";

    #### log the bandwidth used
    open my $fout, '>>', LOGFILE() or die "can't append to " . LOGFILE() . ": $!";
    printf $fout "%s %-20s %-30s %5dK %7d\n",
        scalar(localtime), $username, $site,
        int($bandwidth_down / 1024), ($namespace_totals{total} || 0);
    close $fout;
}

# get_5000($site, $username, $offset): fetch one Special:Contributions page
# (up to 5000 entries), accumulate stats into the file-level hashes, and
# tail-recurse to the next page when a further "offset=" link is present.
# Returns the date string of the earliest edit seen.
sub get_5000 {
    my $site     = shift;
    my $username = shift;
    my $offset   = shift;

    my $earliest = "";

    my $url = "http://$site/w/index.php?title=Special:Contributions&target=$username&offset=${offset}&limit=5000";

    # LWP::Simple::_init_ua is a private API, but it gives us its shared
    # $ua object so we can override the user-agent string.
    if (! $LWP::Simple::ua) {
        LWP::Simple::_init_ua();
        #$LWP::Simple::ua->agent("Mozilla/4.0 WebTV/2.6 (compatible; MSIE 4.0)");   # apparently they're picky about useragent strings
        $LWP::Simple::ua->agent("Wget/1.9.1");   # apparently they're picky about useragent strings.  Use the same as wget.
    }

    push(@urls, $url);
    # hard cap: 10 pages x 5000 edits; beyond that, bail out rather than hammer the site
    if (@urls >= 10) {
        print "Too many pages fetched. Terminating.<br>\n";
        #cgi_dumper(\@urls);
        exit;
    }

    my $page;
    if (1) {
        my $request  = HTTP::Request->new(GET => $url);
        my $response = $LWP::Simple::ua->request($request);
        if (!$response->is_success) {
            print "While trying to fetch <a href='$url'>$url</a>, $site responded:<br><br>\n",
                  $response->status_line, "<br><br>", $response->content;
            exit;
        }
        $page = $response->content;
        $bandwidth_down += length($page);
        if (0) {
            # debug aid: snapshot the page we just fetched
            open my $foutout, '>', '/var/tmp/kate/tmp.out' or die "can't write snapshot: $!";
            print $foutout $page;
            close $foutout;
        }
    } else {
        # debug aid: replay a previously saved page instead of hitting the network
        open my $finin, '<', '/var/tmp/kate/tmp.out' or die "can't read snapshot: $!";
        local $/ = undef;
        $page = <$finin>;
        close $finin;
    }

    # capture the site's <html ...> tag so we can echo its attributes (lang, xmlns)
    if ($page =~ m#(<html [^>]+>)#i) {
        $xml_lang = $1;
    }

    ## parse each individual contribution
    #while ($page =~ /^<li>(\d\d:\d\d,.*)/igm) {
    while ($page =~ /^<li>([^(]+\(<a href="[^"]+action=history.*)/igm) {
        my $this_time;
        local $_ = $1;

        my $edit_summary;
        #$edit_summary++ if (m#<a href="/wiki/[^"]*"\s+title="[^"]*">[^<]*</a>\s*\(#is);
        $edit_summary++ if (/<span class='comment'>/si);

        # first /wiki/ link in the entry is the article itself
        my $article_url = "";   # default "" keeps entries countable even if the match fails
        if (m#<a href="/wiki/([^"]+)" title="[^"]+">([^<]+)#si) {
            $article_url = $1;
            $article_titles{$1} = $2;
        }
        $unique_articles{$article_url}++;

        ## strip out all the HTML tags
        s/<[^>]*>//gs;

        # everything before the first " (" is the timestamp
        if (/^(.*?) \(/) {
            my $date = $1;
            $earliest = $date;   # entries run newest-to-oldest, so the last one wins

            # translate months into english, so Date::Parse can handle them
            # languages believed to work here:  EN, DE, IT
            $date =~ s/\bgen\b/jan/gi;
            $date =~ s/\bmär\b/mar/gi;
            $date =~ s/\b(?:mai|mag)\b/may/gi;
            $date =~ s/\bgiu\b/jun/gi;
            $date =~ s/\blug\b/jul/gi;
            $date =~ s/\bago\b/aug/gi;
            $date =~ s/\bset\b/sep/gi;
            $date =~ s/\b(?:okt|ott)\b/oct/gi;
            $date =~ s/\b(?:dez|dic)\b/dec/gi;

            # str2time returns undef (not 0) when it can't parse
            $this_time = str2time($date);
            if (!$this_time) {
                # unparseable date: skip the monthly stats for this entry
                #print "XXXXXXXXXXXXXXXXXXXXXXXXX<br>\n";
            } else {
                #print scalar(gmtime($this_time)), "<br>\n";
                $earliest_perldate = $this_time;   # overwritten each entry; ends at the oldest

                # record the earliest and latest month we see
                $latest_perldate ||= $this_time;   # ||= keeps the first (newest) entry

                my $monthkey = monthkey(localtime($this_time));
                $month_totals{$monthkey}++;
                $edit_summary && $month_editsummary_totals{$monthkey}++;
            }
        }

        # strip "date (hist) (diff) m " so the line now starts with the page title
        s/^[^()]*\([^()]*\) \([^()]*\) (?:\S )? //;

        my $subspace = "Mainspace";
        # a one- or two-word prefix ending in ":" is a namespace candidate
        if (/^([^\s\d\/:]+(?:\s[^\s\d\/:]+)?:)/) {
            if (!$this_namespace || exists $this_namespace->{$1}) {
                $subspace = $1;
            }
        }
        $namespace_totals{$subspace}++;
        $namespace_totals{total}++;
        $namespace_unique_articles{$subspace}{$article_url}++;
        #print "$_<br>\n";
    }

    ## if they have more than 5000 contributions, go to the next page
    while ($page =~ m#href="[^"]+:Contributions[^"]+offset=(\d+)#ig) {
        #print "Trying again at offset $1<br>\n";
        next unless ($1 > 0 && ($offset == 0 || $1 < $offset));
        return get_5000($site, $username, $1);   # tail recursion until there are no more
    }

    return $earliest;
}

# list_months(): every "YYYY/M" key between the earliest and latest edit,
# inclusive, in chronological order (including months with no edits), e.g.
# ( "2003/10", "2003/11", "2003/12" ).
sub list_months {
    # nothing parseable was seen — don't iterate from undef
    return () unless defined $earliest_perldate && defined $latest_perldate;

    my $last_monthkey = '';
    my @ret;
    # yes, this is a fairly odd algorithm (step by 10 days so no month is
    # skipped, dedupe consecutive keys).  oh well.
    for (my $date = $earliest_perldate; $date <= $latest_perldate; $date += 10*24*60*60) {
        my $monthkey = monthkey(localtime($date));
        if ($monthkey ne $last_monthkey) {
            push(@ret, $monthkey);
            $last_monthkey = $monthkey;
        }
    }
    return @ret;
}

# monthkey(localtime(...)): "YYYY/M" from a localtime list (month not zero-padded).
sub monthkey { ($_[5] + 1900) . "/" . ($_[4] + 1) }

# cgi_dumper(@refs): debugging aid — HTML-escaped Data::Dumper output.
# (now closes the <pre> it opens)
sub cgi_dumper { print "<pre>", HTML::Entities::encode(Dumper(@_)), "</pre>\n" }
</nowiki>