User:Ap/LaTeX conversion utility

From Wikipedia, the free encyclopedia

 #!/usr/bin/perl -w
 ##
 ## Copyright (C) 2003 Arno W. Peters.
 ## released under GNU GPL version 2 or higher.
 
 use strict;
 use DBI();
 
 # Print the LaTeX preamble on the currently selected output handle:
 # document class, input/font encoding and page-layout packages, the
 # \sup and \sub helper macros, an active "/" that permits a line break
 # after every slash, and finally \begin{document}.
 # Takes no arguments.
 sub texheader {
   my @preamble = (
     '\documentclass[10pt,english,a4paper,twocolumn]{book}',
     '\usepackage[latin1]{inputenc}',
     '\usepackage[T1]{fontenc}',
     '\usepackage{geometry}',
     '\usepackage[cm]{fullpage}',
     '\usepackage{babel}',
     '',
     # Superscript / subscript helpers used by article2tex.
     '\def\sup#1{\ensuremath{^#1}}',
     '\def\sub#1{\ensuremath{_#1}}',
     # \slash keeps the plain "/" glyph; "/" itself is then made active
     # and redefined to print \slash followed by an empty discretionary.
     '\def\slash{/}',
     '\catcode`\/=\active',
     '\def/{\slash\discretionary{}{}{}}',
     '',
     '\begin{document}',
   );
 
   print "$_\n" for @preamble;
 }
 
 # Print the line that closes the LaTeX document on the currently
 # selected output handle.  Takes no arguments.
 sub texfooter {
   my $closing = '\end{document}';
   print $closing . "\n";
 }
 
 # Return the symbol table used by article2tex as a list of rows
 #   [ ENTITY_NAME, LITERAL_CHARACTER_OR_UNDEF, LATEX_REPLACEMENT ].
 # A row matches the HTML entity "&ENTITY_NAME;" and, when a literal
 # character is given, that character as well.  Rows are applied in
 # order; "amp" must stay last so that "&amp;lt;" is not decoded twice.
 sub _symbol_table {
   return (
     # Accented Latin letters.
     [ 'euml',    'ë',   q{\"e} ],
     [ 'ouml',    'ö',   q{\"o} ],
     [ 'uuml',    'ü',   q{\"u} ],
     [ 'iuml',    'ï',   q{\"{\i}} ],
     [ 'aacute',  'á',   q{\'a} ],
     [ 'eacute',  'é',   q{\'e} ],
     [ 'egrave',  'è',   q{\`e} ],
     [ 'Eacute',  'É',   q{\'E} ],
 
     # Greek letters.  Capitals that look like a Latin capital have no
     # LaTeX macro and are printed as that Latin letter (so Eta is H,
     # Rho is P and Tau is T; \Tau does not exist).
     [ 'alpha',   'α',   q{\ensuremath{\alpha}} ],
     [ 'Alpha',   'Α',   q{A} ],
     [ 'beta',    'β',   q{\ensuremath{\beta}} ],
     [ 'Beta',    'Β',   q{B} ],
     [ 'gamma',   'γ',   q{\ensuremath{\gamma}} ],
     [ 'Gamma',   'Γ',   q{\ensuremath{\Gamma}} ],
     [ 'delta',   'δ',   q{\ensuremath{\delta}} ],
     [ 'Delta',   'Δ',   q{\ensuremath{\Delta}} ],
     [ 'epsilon', 'ε',   q{\ensuremath{\epsilon}} ],
     [ 'Epsilon', 'Ε',   q{E} ],
     [ 'zeta',    'ζ',   q{\ensuremath{\zeta}} ],
     [ 'Zeta',    'Ζ',   q{Z} ],
     [ 'eta',     'η',   q{\ensuremath{\eta}} ],
     [ 'Eta',     'Η',   q{H} ],
     [ 'theta',   'θ',   q{\ensuremath{\theta}} ],
     [ 'Theta',   'Θ',   q{\ensuremath{\Theta}} ],
     [ 'iota',    'ι',   q{\ensuremath{\iota}} ],
     [ 'Iota',    'Ι',   q{I} ],
     [ 'kappa',   'κ',   q{\ensuremath{\kappa}} ],
     [ 'Kappa',   'Κ',   q{K} ],
     [ 'lambda',  'λ',   q{\ensuremath{\lambda}} ],
     [ 'Lambda',  'Λ',   q{\ensuremath{\Lambda}} ],
     [ 'mu',      'μ',   q{\ensuremath{\mu}} ],
     [ 'Mu',      'Μ',   q{M} ],
     [ 'nu',      'ν',   q{\ensuremath{\nu}} ],
     [ 'Nu',      'Ν',   q{N} ],
     [ 'xi',      'ξ',   q{\ensuremath{\xi}} ],
     [ 'Xi',      'Ξ',   q{\ensuremath{\Xi}} ],
     [ 'pi',      'π',   q{\ensuremath{\pi}} ],
     [ 'Pi',      'Π',   q{\ensuremath{\Pi}} ],
     [ 'rho',     'ρ',   q{\ensuremath{\rho}} ],
     [ 'Rho',     'Ρ',   q{P} ],
     [ 'sigma',   'σ',   q{\ensuremath{\sigma}} ],
     [ 'Sigma',   'Σ',   q{\ensuremath{\Sigma}} ],
     [ 'tau',     'τ',   q{\ensuremath{\tau}} ],
     [ 'Tau',     'Τ',   q{T} ],
     [ 'upsilon', 'υ',   q{\ensuremath{\upsilon}} ],
     [ 'Upsilon', 'Υ',   q{\ensuremath{\Upsilon}} ],
     [ 'phi',     'φ',   q{\ensuremath{\phi}} ],
     [ 'Phi',     'Φ',   q{\ensuremath{\Phi}} ],
     [ 'chi',     'χ',   q{\ensuremath{\chi}} ],
     [ 'Chi',     'Χ',   q{X} ],
     [ 'psi',     'ψ',   q{\ensuremath{\psi}} ],
     [ 'Psi',     'Ψ',   q{\ensuremath{\Psi}} ],
     [ 'omega',   'ω',   q{\ensuremath{\omega}} ],
     [ 'Omega',   'Ω',   q{\ensuremath{\Omega}} ],
 
     # Superscripts, spacing, relations, arrows and other symbols.
     # "nbsp", "lt" and "gt" are matched as entities ONLY: a literal
     # space is ordinary text and literal < > delimit the HTML tags
     # that article2tex converts afterwards.
     [ 'sup2',    '²',   q{\sup{2}} ],
     [ 'sup3',    '³',   q{\sup{3}} ],
     [ 'nbsp',    undef, q{~} ],
     [ 'deg',     '°',   q{\ensuremath{^\circ}} ],
     [ 'lt',      undef, q{\ensuremath{<}} ],
     [ 'gt',      undef, q{\ensuremath{>}} ],
     [ 'le',      '≤',   q{\ensuremath{\leq}} ],
     [ 'ge',      '≥',   q{\ensuremath{\geq}} ],
     [ 'uarr',    '↑',   q{\ensuremath{\uparrow}} ],
     [ 'darr',    '↓',   q{\ensuremath{\downarrow}} ],
     [ 'rarr',    '→',   q{\ensuremath{\rightarrow}} ],
     [ 'larr',    '←',   q{\ensuremath{\leftarrow}} ],
     [ 'forall',  '∀',   q{\ensuremath{\forall}} ],
     [ 'isin',    '∈',   q{\ensuremath{\in}} ],
     [ 'cup',     '∪',   q{\ensuremath{\cup}} ],
     [ 'int',     '∫',   q{\ensuremath{\int}} ],
     [ 'mdash',   '—',   q{---} ],
     [ 'ndash',   '–',   q{--} ],
     [ 'clubs',   '♣',   q{\ensuremath{\clubsuit}} ],
     [ 'spades',  '♠',   q{\ensuremath{\spadesuit}} ],
     [ 'diams',   '♦',   q{\ensuremath{\diamondsuit}} ],
     [ 'hearts',  '♥',   q{\ensuremath{\heartsuit}} ],
 
     # The entity becomes the literal curly quote here; the literal is
     # turned into an apostrophe only after the ''emphasis'' markup has
     # been processed, so it can never be mistaken for wiki quotes.
     [ 'rsquo',   undef, '’' ],
 
     # Must be last.  The bare "&" is escaped for LaTeX by article2tex.
     [ 'amp',     undef, q{&} ],
   );
 }
 
 # Convert one article from wiki markup to LaTeX and print it on the
 # currently selected output handle.
 #
 #   $title - article title; underscores are shown as spaces.
 #   $text  - article body: wiki markup mixed with simple HTML.
 #
 # Output: a \section*{TITLE} heading followed by the converted body
 # wrapped in \begingroup ... \par\endgroup.  Nothing is returned.
 #
 # The order of the substitutions matters: TeX special characters are
 # escaped first, then entities/symbols are replaced by LaTeX code
 # (whose own braces must not be escaped again), then wiki markup and
 # finally the HTML tags are converted.
 #
 # Not handled: <math> sections, verbatim (space-indented) lines and
 # table layout (table tags are simply dropped).
 sub article2tex {
   my ($title, $text) = @_;
 
   $title =~ s/_/ /g;
   # Escape every character that would break \section*{...}.
   $title =~ s/([#\$%&])/\\$1/g;
 
   $text =~ s/\r//g;
   $text =~ s/\\/\//g;
   $text =~ s/\$/\\\$/g;
   # Braces are escaped BEFORE the circumflex so that the braces of the
   # inserted \^{} stay real TeX grouping braces.
   $text =~ s/([{}])/\\$1/g;
   $text =~ s/\^/\\^{}/g;
 
   # Entities and special characters -> LaTeX.
   for my $row (_symbol_table()) {
     my ($entity, $char, $tex) = @$row;
     my $pattern = defined $char ? qr/&${entity};|\Q$char\E/
                                 : qr/&${entity};/;
     $text =~ s/$pattern/$tex/g;
   }
 
   $text =~ s/^#redirect/See /i;
 
   # Remaining TeX specials (none of the replacements above add any).
   $text =~ s/([&#%_])/\\$1/g;
 
   # Wiki emphasis, longest quote run first.
   $text =~ s/''''''(.+?)''''''/\{\}$1\{\}/mg;
   $text =~ s/'''''(.+?)'''''/\\emph{\\textbf{$1}}/mg;
   $text =~ s/''''(.+?)''''/\\textbf{'$1'}/mg;
   $text =~ s/'''(.+?)'''/\\textbf{$1}/mg;
   $text =~ s/''(.+?)''/\\emph{$1}/mg;
 
   ## Ignore verbatim 
   ## $text =~ s/^ (.*)$/\\begin{verbatim}$1\n\\end{verbatim}/mg;
 
   # Wiki headings, deepest level first.
   $text =~ s/^=====(.*)=====/\\subparagraph*{$1}\n/mg;
   $text =~ s/^====(.*)====/\\paragraph*{$1}\n/mg;
   $text =~ s/^===(.*)===/\\subsubsection*{$1}\n/mg;
   $text =~ s/^==(.*)==/\\subsection*{$1}\n/mg;
   # Number ranges get an en dash.
   $text =~ s/(\d)\-(\d)/$1--$2/mg;
 
   # Wiki lists: bullets (*), indentation (:) and numbered items (#,
   # already escaped to \# above), each as a hanging paragraph.
   $text =~ s/^\*\*\*/\\par\\noindent\\hangindent=6em\\hskip5em\\llap{\\ensuremath{\\bullet}}\\quad /mg;
   $text =~ s/^\*\*/\\par\\noindent\\hangindent=4em\\hskip3em\\llap{\\ensuremath{\\bullet}}\\quad /mg;
   $text =~ s/^\*/\\par\\noindent\\hangindent=2em\\quad\\llap{\\ensuremath{\\bullet}}\\quad /mg;
   $text =~ s/^:::/\\par\\noindent\\hangindent=6em\\hskip6em /mg;
   $text =~ s/^::/\\par\\noindent\\hangindent=4em\\hskip4em /mg;
   $text =~ s/^:/\\par\\noindent\\hangindent=2em\\qquad /mg;
   $text =~ s/^\\#\\#/\\par\\noindent\\hangindent=4em\\hskip3em\\llap{\\#}\\quad /mg;
   $text =~ s/^\\#/\\par\\noindent\\hangindent=2em\\quad\\llap{\\#}\\quad /mg;
 
   # HTML list items and list/paragraph containers.
   $text =~ s/<li[^>]*?>/\\par\\noindent\\qquad /ig;
   $text =~ s/<\/li>/ /ig;
   $text =~ s/<dd[^>]*?>/\\par\\noindent\\qquad /ig;
   $text =~ s/<\/dd>/ /ig;
 
   $text =~ s/<ul[^>]*?>/\n\n/ig;
   $text =~ s/<\/ul>/ /ig;
   $text =~ s/<dl[^>]*?>/\n\n/ig;
   $text =~ s/<\/dl>/ /ig;
   $text =~ s/<ol[^>]*?>/\n\n/ig;
   $text =~ s/<\/ol>/ /ig;
   $text =~ s/<p[^>]*?>/\n\n/ig;
   $text =~ s/<\/p>/ /ig;
 
   # HTML headings.
   $text =~ s/<h1>/\\subsection*{/ig;
   $text =~ s/<h2>/\\subsubsection*{/ig;
   $text =~ s/<h3>/\\paragraph*{/ig;
   $text =~ s/<h4>/\\subparagraph*{/ig;
   $text =~ s/<\/h\d>/}\n/ig;
 
   $text =~ s/<blockquote>/\\begin{quotation}\n/ig;
   $text =~ s/<\/blockquote>/\\end{quotation}\n/ig;
 
   # Inline HTML formatting.
   $text =~ s/<strong>/\\textbf{/ig;
   $text =~ s/<\/strong>/}/ig;
   $text =~ s/<b>/\\textbf{/ig;
   $text =~ s/<\/b>/}/ig;
   $text =~ s/<i>/\\emph{/ig;
   $text =~ s/<\/i>/}/ig;
   $text =~ s/<var>/\\emph{/ig;
   $text =~ s/<\/var>/}/ig;
   $text =~ s/<emph>/\\emph{/ig;
   $text =~ s/<\/emph>/}/ig;
   $text =~ s/<sup>/\\sup{/ig;
   $text =~ s/<\/sup>/}/ig;
   $text =~ s/<sub>/\\sub{/ig;
   $text =~ s/<\/sub>/}/ig;
   $text =~ s/<small>/{\\small /ig;
   $text =~ s/<\/small>/}/ig;
 
   $text =~ s/<br[^>]*?>/\\\\[.5\\baselineskip]/ig;
 
   # Horizontal rules (wiki ---- and <hr>) become a short centred rule.
   $text =~ s/----+/\\vskip.25\\baselineskip \\hbox to\\hsize{\\hfil\\vrule width5cm height1pt\\hfil}\\vskip.25\\baselineskip /g;
   $text =~ s/<hr[^>]*?>/\\vskip.25\\baselineskip \\hbox to\\hsize{\\hfil\\vrule width5cm height1pt\\hfil}\\vskip.25\\baselineskip /ig;
 
   ## Ignore tables for now: every table tag is replaced by a space.
   $text =~ s/<table[^>]*?>/ /ig;
   $text =~ s/<\/table>/ /ig;
   $text =~ s/<tr[^>]*?>/ /ig;
   $text =~ s/<\/tr>/ /ig;
   $text =~ s/<td[^>]*?>/ /ig;
   $text =~ s/<\/td>/ /ig;
   $text =~ s/<th[^>]*?>/ /ig;
   $text =~ s/<\/th>/ /ig;
 
   $text =~ s/<center[^>]*?>/\\begin{center}\n/ig;
   $text =~ s/<\/center>/\\end{center}\n/ig;
 
   $text =~ s/<div[^>]*?>/ /ig;
   $text =~ s/<\/div>/ /ig;
   $text =~ s/<font[^>]*?>//ig;
   $text =~ s/<\/font>//ig;
 
   # Curly apostrophe -> plain apostrophe; done last, see _symbol_table.
   $text =~ s/’/'/g;
 
   print "\\section*{$title}\n\n";
   print "\\begingroup\n";
   print $text;
   print "\\par\\endgroup\n";
   print "\n\n";
 }
 
 # Main program.
 #
 # Takes one argument, a title prefix, and writes a complete LaTeX
 # document on standard output containing every article in namespace 0
 # whose title starts with that prefix.  A progress message goes to
 # standard error.
 
 # Validate the command line before opening a database connection.
 die "No argument specified" unless @ARGV;
 my $letter = shift @ARGV;
 print STDERR "Generating all articles starting with $letter.\n";
 
 # Connect to the database.  RaiseError makes every DBI failure fatal,
 # so the calls below need no explicit error checks.
 my $dbh = DBI->connect("DBI:mysql:database=wikipedia-en;host=localhost",
                        "arno", "",
                        {'RaiseError' => 1});
 
 # Now retrieve data from the table.  The prefix is passed as a bound
 # parameter instead of being pasted into the SQL text, so quotes in
 # the argument can neither break nor alter the statement.  LIKE
 # wildcards (% and _) in the argument keep working as before.
 my $sth = $dbh->prepare("SELECT cur_title, cur_text FROM cur " .
                         "WHERE cur_title LIKE ? AND cur_namespace = 0");
 $sth->execute($letter . '%');
 
 texheader();
 while (my ($title, $text) = $sth->fetchrow_array()) {
   article2tex($title, $text);
 }
 $sth->finish();
 texfooter();
 
 # Disconnect from the database.
 $dbh->disconnect();