User:SQLBot/Reflist.php

From Wikipedia, the free encyclopedia

<?php
// Path of the XML dump that is scanned line by line.
$dbdump = "enwiki.xml";
$dump = fopen($dbdump, "r");
if ($dump === FALSE) {
        // Without a readable dump there is nothing to scan; stop before the
        // read loop is handed an invalid handle.
        echo "Cannot open dump file ($dbdump)\r\n";
        exit(1);
}

// Matches an opening reference tag: "<ref " (with attributes) or "<ref>".
$reftag = "/(<ref |<ref>).*/";
// Matches a reference list: a template starting with {{reflist, {{reference,
// {{refs or {{footnotes, or a <references tag (case-insensitive).
$reflisttag = "/(\{\{(reflist|reference|refs|footnotes)|<references).*/i";
// Output list of matching articles; opened in append mode so earlier results are kept.
$foundarticles = fopen("xtfound", 'a');
if ($foundarticles === FALSE) {
        echo "Cannot open output file (xtfound)\r\n";
        fclose($dump);
        exit(1);
}
 
/**
 * Append one article to the result list as a wiki bullet with a link.
 *
 * The line written is "* [[<artname>]]" followed by CRLF. If the write
 * fails, a message is echoed and execution continues.
 *
 * @param string   $artname       Title of the article to record.
 * @param resource $foundarticles Writable stream receiving the list entries.
 * @return void
 */
function SaveArticle($artname, $foundarticles) {
        $entry = sprintf("* [[%s]]\r\n", $artname);
        $written = fwrite($foundarticles, $entry);
        if ($written !== FALSE) {
                return;
        }
        echo "Cannot write to file ($foundarticles)\r\n";
}
 
/**
 * Classify an article by the reference markup found in its text.
 *
 * The text is tested against the global $reftag and $reflisttag patterns
 * and one status line is echoed per article. Articles that contain
 * reference tags but no reference list are additionally recorded through
 * SaveArticle() on the global $foundarticles stream.
 *
 * @param array $article Article data; the 'title' and 'text' keys are read.
 *                       A missing key is treated as an empty string.
 * @return void
 */
function CheckArticle($article) {
        global $reftag;
        global $reflisttag;
        global $foundarticles;

        // Array keys must be quoted strings: the bare words text/title are
        // looked up as constants, which is a fatal error on PHP 8.
        $text = isset($article['text']) ? (string) $article['text'] : '';
        $artname = isset($article['title']) ? $article['title'] : '';

        // preg_match() yields 1 on a match, 0 on none and FALSE on error;
        // anything falsy counts as "not found" below.
        $reftags = preg_match($reftag, $text);
        $reflisttags = preg_match($reflisttag, $text);

        if ($reftags && $reflisttags) {
                echo "RefTags and RefListTags were both found in $artname\r\n";
        } elseif ($reftags) {
                // References without a list to display them: this is the
                // case the script collects.
                echo "Only RefTags were found in $artname\r\n";
                SaveArticle($artname, $foundarticles);
        } elseif ($reflisttags) {
                echo "Only RefListTags were found in $artname\r\n";
        } else {
                echo "Neither RefListTags nor RefTags were found in $artname\r\n";
        }
}
 
/**
 * Read the next line of the dump with leading and trailing whitespace removed.
 *
 * @param resource $dump Open handle on the dump file.
 * @return string|false The trimmed line, or FALSE when nothing more can be read.
 */
function ReadDumpLine($dump) {
        $line = fgets($dump);
        return ($line === FALSE) ? FALSE : trim($line);
}

/**
 * Extract the content of a one-line <tag>content</tag> element.
 *
 * @param string|false $line Trimmed dump line (FALSE if reading failed).
 * @param string       $tag  Element name without angle brackets.
 * @return string|null Element content, or NULL when the line does not hold the element.
 */
function ExtractElement($line, $tag) {
        if ($line === FALSE) {
                return NULL;
        }
        $name = preg_quote($tag, '/');
        if (preg_match('/<' . $name . '>(.*)<\/' . $name . '>/', $line, $matches)) {
                return $matches[1];
        }
        return NULL;
}

// Markers delimiting the article text inside a <page> element.
$opentext = '<text xml:space="preserve">';
$closetext = '</text>';

// Walk the dump page by page. The parser is line-oriented and relies on the
// fixed order of the lines that follow "<page>": title, id, four lines that
// are skipped, then the contributor's username.
// NOTE(review): MediaWiki dumps normally XML-escape page text (e.g. &lt;ref&gt;);
// the text is handed to CheckArticle() undecoded - confirm that the search
// patterns fit the dump being scanned.
while (is_resource($dump) && !feof($dump)) {
        if (ReadDumpLine($dump) !== "<page>") {
                continue;
        }

        $article = array();
        $article['title'] = ExtractElement(ReadDumpLine($dump), 'title');
        $article['id'] = ExtractElement(ReadDumpLine($dump), 'id');
        // Skip the four lines between the page id and the username line.
        for ($skip = 0; $skip < 4; $skip++) {
                fgets($dump);
        }
        // NULL when the line holds no <username> element.
        $article['username'] = ExtractElement(ReadDumpLine($dump), 'username');

        // Advance to the line carrying the opening text tag; stop at end of
        // file instead of spinning forever on a truncated dump.
        $dumpline = FALSE;
        $start = FALSE;
        while ($start === FALSE && ($dumpline = fgets($dump)) !== FALSE) {
                $start = strpos($dumpline, $opentext);
        }
        if ($start === FALSE) {
                echo "Unexpected end of dump while looking for the text of {$article['title']}\r\n";
                break;
        }

        // Collect the text: whatever follows the opening tag on its own line
        // (previously lost for multi-line articles), then every following
        // trimmed line, up to the last closing tag on the line that has one.
        // Lines are joined without a separator, as before.
        $remainder = rtrim((string) substr($dumpline, $start + strlen($opentext)));
        $article['text'] = '';
        $closed = FALSE;
        while ($remainder !== FALSE) {
                $end = strrpos($remainder, $closetext);
                if ($end !== FALSE) {
                        $article['text'] .= substr($remainder, 0, $end);
                        $closed = TRUE;
                        break;
                }
                $article['text'] .= $remainder;
                $remainder = ReadDumpLine($dump);
        }
        if (!$closed) {
                echo "Unexpected end of dump inside the text of {$article['title']}\r\n";
                break;
        }

        CheckArticle($article);
}

if (is_resource($dump)) {
        fclose($dump);
}
// Close the result list as well so buffered entries are flushed explicitly.
if (is_resource($foundarticles)) {
        fclose($foundarticles);
}
?>