User:Ap/LaTeX conversion utility
From Wikipedia, the free encyclopedia
< User:Ap
#!/usr/bin/perl
##
## Copyright (C) 2003 Arno W. Peters.
## released under GNU GPL version 2 or higher.
##
## Reads every row of the "cur" table whose cur_title starts with the prefix
## given on the command line (and whose cur_namespace is 0), converts the
## wiki text to LaTeX and prints one complete LaTeX document to STDOUT.
##
## Usage:  perl <this script> PREFIX > output.tex
##

use strict;
use warnings;
use DBI ();

# NOTE: there is deliberately no "use utf8" here.  The non-ASCII literals
# below are matched as the raw bytes they have in this source file, against
# whatever bytes the database driver hands back.
# NOTE(review): that only works when both sides use the same encoding (the
# literals look like UTF-8) -- confirm against the database encoding.

# Wrap a TeX math control-sequence name so it can be used in text mode,
# e.g. _math('alpha') returns \ensuremath{\alpha}.
sub _math {
    my ($name) = @_;
    return '\ensuremath{\\' . $name . '}';
}

# Literal symbol => TeX replacement.  Applied in a single pass AFTER the
# generic escaping of "\", "$", "{", "}" and "^", so the TeX inserted here is
# not escaped again.  Capital Greek letters that look like a Latin capital
# are mapped to that Latin letter, because LaTeX has no separate command for
# them (in particular there is no \Tau).
my %TEX_FOR_SYMBOL = (
    # accented Latin letters
    'ë' => q{\"e},
    'ö' => q{\"o},
    'ü' => q{\"u},
    'ï' => q{\"{\i}},
    'á' => q{\'a},
    'é' => q{\'e},
    'è' => q{\`e},
    'É' => q{\'E},

    # Greek alphabet
    'α' => _math('alpha'),   'Α' => 'A',
    'β' => _math('beta'),    'Β' => 'B',
    'γ' => _math('gamma'),   'Γ' => _math('Gamma'),
    'δ' => _math('delta'),   'Δ' => _math('Delta'),
    'ε' => _math('epsilon'), 'Ε' => 'E',
    'ζ' => _math('zeta'),    'Ζ' => 'Z',
    'η' => _math('eta'),     'Η' => 'H',
    'θ' => _math('theta'),   'Θ' => _math('Theta'),
    'ι' => _math('iota'),    'Ι' => 'I',
    'κ' => _math('kappa'),   'Κ' => 'K',
    'λ' => _math('lambda'),  'Λ' => _math('Lambda'),
    'μ' => _math('mu'),      'Μ' => 'M',
    'ν' => _math('nu'),      'Ν' => 'N',
    'ξ' => _math('xi'),      'Ξ' => _math('Xi'),
    'π' => _math('pi'),      'Π' => _math('Pi'),
    'ρ' => _math('rho'),     'Ρ' => 'P',
    'σ' => _math('sigma'),   'Σ' => _math('Sigma'),
    'τ' => _math('tau'),     'Τ' => 'T',
    'υ' => _math('upsilon'), 'Υ' => _math('Upsilon'),
    'φ' => _math('phi'),     'Φ' => _math('Phi'),
    'χ' => _math('chi'),     'Χ' => 'X',
    'ψ' => _math('psi'),     'Ψ' => _math('Psi'),
    'ω' => _math('omega'),   'Ω' => _math('Omega'),

    # miscellaneous symbols
    '²' => '\sup{2}',
    '³' => '\sup{3}',
    "\xC2\xA0" => '~',                  # UTF-8 bytes of U+00A0 NO-BREAK SPACE
    '°' => '\ensuremath{^\circ}',
    '≤' => _math('leq'),
    '≥' => _math('geq'),
    '↑' => _math('uparrow'),
    '↓' => _math('downarrow'),
    '→' => _math('rightarrow'),
    '←' => _math('leftarrow'),
    '∀' => _math('forall'),
    '∈' => _math('in'),
    '∪' => _math('cup'),
    '∫' => _math('int'),
    '—' => '---',
    '♣' => _math('clubsuit'),
    '♠' => _math('spadesuit'),
    '♦' => _math('diamondsuit'),
    '♥' => _math('heartsuit'),
);

# One alternation matching any key of %TEX_FOR_SYMBOL.
my $SYMBOL_RE = do {
    my $alternatives =
        join '|', map { quotemeta($_) } sort keys %TEX_FOR_SYMBOL;
    qr/$alternatives/;
};

# Short centred horizontal rule, used for both "----" and <hr>.
my $HRULE_TEX = '\vskip.25\baselineskip '
              . '\hbox to\hsize{\hfil\vrule width5cm height1pt\hfil}'
              . '\vskip.25\baselineskip ';

# Ordered [pattern, replacement] pairs for the HTML subset (plus the wiki
# "----" rule, kept at its original position between <br> and <hr>).
# Replacements are literal strings; none of them uses captures.
my @HTML_RULES = (
    # list items and list/paragraph containers
    [ qr{<li[^>]*?>}i,     '\par\noindent\qquad ' ],
    [ qr{</li>}i,          ' ' ],
    [ qr{<dd[^>]*?>}i,     '\par\noindent\qquad ' ],
    [ qr{</dd>}i,          ' ' ],
    [ qr{<ul[^>]*?>}i,     "\n\n" ],
    [ qr{</ul>}i,          ' ' ],
    [ qr{<dl[^>]*?>}i,     "\n\n" ],
    [ qr{</dl>}i,          ' ' ],
    [ qr{<ol[^>]*?>}i,     "\n\n" ],
    [ qr{</ol>}i,          ' ' ],
    [ qr{<p[^>]*?>}i,      "\n\n" ],
    [ qr{</p>}i,           ' ' ],

    # headings
    [ qr{<h1>}i,           '\subsection*{' ],
    [ qr{<h2>}i,           '\subsubsection*{' ],
    [ qr{<h3>}i,           '\paragraph*{' ],
    [ qr{<h4>}i,           '\subparagraph*{' ],
    [ qr{</h\d>}i,         "}\n" ],

    # block quotes
    [ qr{<blockquote>}i,   "\\begin{quotation}\n" ],
    [ qr{</blockquote>}i,  "\\end{quotation}\n" ],

    # inline formatting
    [ qr{<strong>}i,       '\textbf{' ],
    [ qr{</strong>}i,      '}' ],
    [ qr{<b>}i,            '\textbf{' ],
    [ qr{</b>}i,           '}' ],
    [ qr{<i>}i,            '\emph{' ],
    [ qr{</i>}i,           '}' ],
    [ qr{<var>}i,          '\emph{' ],
    [ qr{</var>}i,         '}' ],
    [ qr{<emph>}i,         '\emph{' ],
    [ qr{</emph>}i,        '}' ],
    [ qr{<sup>}i,          '\sup{' ],
    [ qr{</sup>}i,         '}' ],
    [ qr{<sub>}i,          '\sub{' ],
    [ qr{</sub>}i,         '}' ],
    [ qr{<small>}i,        '{\small ' ],
    [ qr{</small>}i,       '}' ],

    # line breaks and rules
    [ qr{<br[^>]*?>}i,     '\\\\[.5\baselineskip]' ],
    [ qr{----+},           $HRULE_TEX ],
    [ qr{<hr[^>]*?>}i,     $HRULE_TEX ],

    # Tables are ignored for now: the tags are dropped, the cell text stays.
    # Every table tag maps to a plain space so TeX braces remain balanced.
    [ qr{<table[^>]*?>}i,  ' ' ],
    [ qr{</table>}i,       ' ' ],
    [ qr{<tr[^>]*?>}i,     ' ' ],
    [ qr{</tr>}i,          ' ' ],
    [ qr{<td[^>]*?>}i,     ' ' ],
    [ qr{</td>}i,          ' ' ],
    [ qr{<th[^>]*?>}i,     ' ' ],
    [ qr{</th>}i,          ' ' ],

    # remaining containers
    [ qr{<center[^>]*?>}i, "\\begin{center}\n" ],
    [ qr{</center>}i,      "\\end{center}\n" ],
    [ qr{<div[^>]*?>}i,    ' ' ],
    [ qr{</div>}i,         ' ' ],
    [ qr{<font[^>]*?>}i,   '' ],
    [ qr{</font>}i,        '' ],
);

# Print the LaTeX preamble up to and including \begin{document}.
# It defines \sup and \sub (used by the conversion rules) and makes "/" an
# active character that allows a line break after every slash.
sub texheader {
    print <<'END_HEADER';
\documentclass[10pt,english,a4paper,twocolumn]{book}
\usepackage[latin1]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{geometry}
\usepackage[cm]{fullpage}
\usepackage{babel}

\def\sup#1{\ensuremath{^#1}}
\def\sub#1{\ensuremath{_#1}}
\def\slash{/}
\catcode`\/=\active
\def/{\slash\discretionary{}{}{}}

\begin{document}
END_HEADER
    return;
}

# Print the line that closes the LaTeX document.
sub texfooter {
    print "\\end{document}\n";
    return;
}

# Convert one article to LaTeX and print it to STDOUT.
#
#   $title - article title; underscores become spaces and the TeX specials
#            & # $ % are escaped.
#   $text  - article body in wiki markup with a small subset of HTML.
#
# An undefined title or text is treated as an empty string.  The output is
# an unnumbered \section*{} holding the converted text inside a TeX group.
#
# The order of the passes matters: generic escaping first, then symbols,
# then wiki markup, then HTML tags, and only then the leftover "<" and ">".
#
# NOTE(review): named HTML entities (&nbsp;, &alpha;, ...) are not decoded;
# their "&" is escaped like any other ampersand.
# TODO: <math>...</math> is not treated specially.  An extraction loop for it
# was sketched in an earlier revision but was never enabled.
sub article2tex {
    my ($title, $text) = @_;
    $title = '' unless defined $title;
    $text  = '' unless defined $text;

    $title =~ s/_/ /g;
    $title =~ s/([&#\$%])/\\$1/g;

    # --- generic escaping -------------------------------------------------
    $text =~ s/\r//g;
    $text =~ s{\\}{/}g;              # every backslash becomes a slash
    $text =~ s/\$/\\\$/g;
    # Braces are escaped BEFORE the caret so that the braces inserted by the
    # caret rule are not escaped themselves.
    $text =~ s/\{/\\{/g;
    $text =~ s/\}/\\}/g;
    $text =~ s/\^/\\^{ }/g;

    # --- accented letters, Greek and other symbols --------------------------
    $text =~ s/($SYMBOL_RE)/$TEX_FOR_SYMBOL{$1}/g;

    # --- remaining TeX specials ---------------------------------------------
    $text =~ s/^#redirect/See /i;    # no /m: only at the very start
    $text =~ s/([&#%_])/\\$1/g;

    # --- wiki quote markup, longest run of apostrophes first ----------------
    $text =~ s/''''''(.+?)''''''/\{\}$1\{\}/mg;
    $text =~ s/'''''(.+?)'''''/\\emph{\\textbf{$1}}/mg;
    $text =~ s/''''(.+?)''''/\\textbf{'$1'}/mg;
    $text =~ s/'''(.+?)'''/\\textbf{$1}/mg;
    $text =~ s/''(.+?)''/\\emph{$1}/mg;

    ## Ignore verbatim
    ## $text =~ s/^ (.*)$/\\begin{verbatim}$1\n\\end{verbatim}/mg;

    # --- wiki headings, deepest level first ---------------------------------
    $text =~ s/^=====(.*)=====/\\subparagraph*{$1}\n/mg;
    $text =~ s/^====(.*)====/\\paragraph*{$1}\n/mg;
    $text =~ s/^===(.*)===/\\subsubsection*{$1}\n/mg;
    $text =~ s/^==(.*)==/\\subsection*{$1}\n/mg;

    # A hyphen between two digits becomes an en dash.  Lookarounds keep the
    # digits unconsumed so chains such as 1-2-3 are converted completely.
    $text =~ s/(?<=\d)-(?=\d)/--/g;

    # --- wiki bullet lists, indented text and numbered lists ----------------
    $text =~ s/^\*\*\*/\\par\\noindent\\hangindent=6em\\hskip5em\\llap{\\ensuremath{\\bullet}}\\quad /mg;
    $text =~ s/^\*\*/\\par\\noindent\\hangindent=4em\\hskip3em\\llap{\\ensuremath{\\bullet}}\\quad /mg;
    $text =~ s/^\*/\\par\\noindent\\hangindent=2em\\quad\\llap{\\ensuremath{\\bullet}}\\quad /mg;
    $text =~ s/^:::/\\par\\noindent\\hangindent=6em\\hskip6em /mg;
    $text =~ s/^::/\\par\\noindent\\hangindent=4em\\hskip4em /mg;
    $text =~ s/^:/\\par\\noindent\\hangindent=2em\\qquad /mg;
    # "#" was already escaped above, so list markers now read "\#".
    $text =~ s/^\\#\\#/\\par\\noindent\\hangindent=4em\\hskip3em\\llap{\\#}\\quad /mg;
    $text =~ s/^\\#/\\par\\noindent\\hangindent=2em\\quad\\llap{\\#}\\quad /mg;

    # --- HTML subset --------------------------------------------------------
    for my $rule (@HTML_RULES) {
        my ($pattern, $tex) = @{$rule};
        $text =~ s/$pattern/$tex/g;
    }

    # Angle brackets that are still present did not belong to a known tag.
    # This must run after the tag rules, otherwise no tag could ever match.
    $text =~ s/([<>])/\\ensuremath{$1}/g;

    # --- late typographic fixes ---------------------------------------------
    # Done last so the generated "'" and "--" cannot be picked up by the
    # quote-markup or horizontal-rule patterns above.
    $text =~ s/’/'/g;
    $text =~ s/–/--/g;

    print "\\section*{$title}\n\n",
          "\\begingroup\n",
          $text,
          "\\par\\endgroup\n",
          "\n\n";
    return;
}

# Check the command line first, so a missing argument does not open a
# database connection.
die "No argument specified" unless @ARGV;
my $letter = shift @ARGV;

# Connect to the database.
my $dbh = DBI->connect(
    "DBI:mysql:database=wikipedia-en;host=localhost",
    "arno", "", { 'RaiseError' => 1 },
);

print STDERR "Generating all articles starting with $letter.\n";

# Now retrieve data from the table.  The prefix is passed as a bound value
# so quotes in the argument cannot alter the statement; "%" and "_" inside
# the argument still act as LIKE wildcards, as before.
my $sth = $dbh->prepare(
    'SELECT cur_title, cur_text FROM cur '
  . 'WHERE cur_title LIKE ? and cur_namespace = 0'
);
$sth->execute($letter . '%');

texheader();
while (my ($title, $text) = $sth->fetchrow_array()) {
    article2tex($title, $text);
}
$sth->finish();
texfooter();

# Disconnect from the database.
$dbh->disconnect();