From Wikipedia, the free encyclopedia
<?php
// Path of the XML dump that is scanned line by line.
$dbdump = "enwiki.xml";
// Opened without "@" and checked explicitly: a failed open used to leave
// $dump === false, which made the feof() read loop below never terminate
// (or throw a TypeError on PHP 8) without any hint about the cause.
$dump = fopen($dbdump, "r");
if ($dump === false) {
    echo "Cannot open dump file ($dbdump)\r\n";
    exit(1);
}
// Matches an opening reference tag, either "<ref " (with attributes) or "<ref>".
$reftag = "/(<ref |<ref>).*/";
// Matches a reference-list marker: "{{reflist", "{{reference", "{{refs",
// "{{footnotes" or "<references", case-insensitively.
$reflisttag = "/(\{\{(reflist|reference|refs|footnotes)|<references).*/i";
// Results file, opened in append mode so earlier runs are preserved.
$foundarticles = fopen("xtfound", 'a');
if ($foundarticles === false) {
    echo "Cannot open output file (xtfound)\r\n";
    fclose($dump);
    exit(1);
}
/**
 * Appends one article to the results list as a wiki-linked bullet.
 *
 * The line written is "* [[<artname>]]" followed by CRLF. A failed or
 * incomplete write is reported on standard output; nothing is thrown.
 *
 * @param string   $artname       Article title to record.
 * @param resource $foundarticles Writable stream handle of the results list.
 * @return void
 */
function SaveArticle($artname, $foundarticles) {
    $artct = "* [[" . $artname . "]]\r\n";

    // Guard against a closed/invalid handle so the failure is reported
    // through the same message instead of a warning or TypeError.
    $written = is_resource($foundarticles) ? fwrite($foundarticles, $artct) : false;

    // fwrite() signals failure with false, and some PHP versions return a
    // short byte count instead; treat both as a failed write.
    if ($written === false || $written < strlen($artct)) {
        // Name the underlying file: interpolating the handle itself would
        // only print "Resource id #N".
        $target = "unknown stream";
        if (is_resource($foundarticles)) {
            $meta = stream_get_meta_data($foundarticles);
            if (isset($meta['uri'])) {
                $target = $meta['uri'];
            }
        }
        echo "Cannot write to file ($target)\r\n";
    }
}
/**
 * Classifies one article by which reference markup its text contains.
 *
 * Prints one status line per article. An article that has reference tags
 * but no reference-list marker is additionally recorded via SaveArticle().
 *
 * Reads the globals $reftag and $reflisttag (PCRE patterns) and
 * $foundarticles (handle passed on to SaveArticle()).
 *
 * @param array $article Article data; the 'text' and 'title' keys are read.
 *                       A missing key is treated as an empty string.
 * @return void
 */
function CheckArticle($article) {
    global $reftag;
    global $reflisttag;
    global $foundarticles;

    // Keys are quoted: the bare form $article[text] relies on undefined
    // constants, which is a fatal Error on PHP 8.
    $text = isset($article['text']) ? (string) $article['text'] : '';
    $artname = isset($article['title']) ? (string) $article['title'] : '';

    // Search for reftag and reflisttag
    $reftags = preg_match($reftag, $text);
    $reflisttags = preg_match($reflisttag, $text);

    if ($reftags && $reflisttags) {
        echo "RefTags and RefListTags were both found in $artname\r\n";
    } elseif ($reftags) {
        // References without a list to render them: this is the case
        // the script collects.
        echo "Only RefTags were found in $artname\r\n";
        SaveArticle($artname, $foundarticles);
    } elseif ($reflisttags) {
        echo "Only RefListTags were found in $artname\r\n";
    } else {
        echo "Neither RefListTags nor RefTags were found in $artname\r\n";
    }
}
/**
 * Reads the next line of the dump with surrounding whitespace removed.
 *
 * @param resource $dump Open handle on the XML dump.
 * @return string|false The trimmed line, or false when nothing more can be read.
 */
function ReadDumpLine($dump) {
    $line = fgets($dump);
    if ($line === false) {
        return false;
    }
    return trim($line);
}

/**
 * Extracts the content of a <tag>...</tag> element that sits on a single line.
 *
 * @param string $tag  Element name, e.g. "title".
 * @param string $line Trimmed dump line.
 * @return string|null Entity-decoded content, or null if the element is not on this line.
 */
function ExtractTagValue($tag, $line) {
    $name = preg_quote($tag, '/');
    if (preg_match('/<' . $name . '>(.*)<\/' . $name . '>/', $line, $matches)) {
        return html_entity_decode($matches[1], ENT_QUOTES | ENT_XML1, 'UTF-8');
    }
    return null;
}

/**
 * Reads the remainder of a <page> element, starting just after its opening line.
 *
 * Collects the first <title>, <id> and <username> seen before the <text>
 * element, then the text itself. Text lines are trimmed and concatenated
 * without a separator.
 *
 * @param resource $dump Open handle on the XML dump.
 * @return array|null Array with keys 'title', 'id', 'username', 'text'
 *                    (unset fields are null), or null when the page ends
 *                    or the dump runs out before any <text> element.
 */
function ReadArticle($dump) {
    $article = array('title' => null, 'id' => null, 'username' => null, 'text' => null);
    $fields = array('title', 'id', 'username');

    while (($line = ReadDumpLine($dump)) !== false) {
        if ($line === '</page>') {
            // Page without text: never read on into the next page.
            return null;
        }

        // Accept any attribute list on <text ...>, not only the exact
        // string <text xml:space="preserve">.
        if (preg_match('/<text\b([^>]*)>/', $line, $open, PREG_OFFSET_CAPTURE)) {
            if (substr(rtrim($open[1][0]), -1) === '/') {
                // Self-closing <text ... /> carries no content.
                $article['text'] = '';
                return $article;
            }

            // Start with whatever follows the opening tag on this line; this
            // first fragment must be kept when the text spans several lines.
            $chunk = (string) substr($line, $open[0][1] + strlen($open[0][0]));
            $text = '';
            while ($chunk !== false) {
                $end = strrpos($chunk, '</text>');
                if ($end !== false) {
                    $text .= (string) substr($chunk, 0, $end);
                    break;
                }
                $text .= $chunk;
                // false here means the dump ended inside the text; stop
                // instead of looping forever and keep what was read.
                $chunk = ReadDumpLine($dump);
            }

            // Element content in XML has <, > and & escaped as entities, so it
            // must be decoded before it is matched against literal "<ref>".
            $article['text'] = html_entity_decode($text, ENT_QUOTES | ENT_XML1, 'UTF-8');
            return $article;
        }

        // Keep only the first occurrence of each field (page id, not revision id).
        foreach ($fields as $field) {
            if ($article[$field] === null) {
                $value = ExtractTagValue($field, $line);
                if ($value !== null) {
                    $article[$field] = $value;
                }
            }
        }
    }

    return null;
}

// Main scan: every <page> element in the dump is parsed and classified.
if (!is_resource($dump)) {
    echo "Cannot read dump file ($dbdump)\r\n";
    exit(1);
}
while (!feof($dump)) {
    if (ReadDumpLine($dump) !== "<page>") {
        continue;
    }
    $article = ReadArticle($dump);
    if ($article !== null) {
        CheckArticle($article);
    }
}
fclose($dump);
// Flush and release the results list as well.
if (is_resource($foundarticles)) {
    fclose($foundarticles);
}
?>