From Wikipedia, the free encyclopedia
### IMPORTANT ###
# This code is released into the public domain.
### RECENT CHANGES ###
# 6 Aug 2007: Created
# 21 Aug 2007: Added comment folding/unfolding
# 21 Oct 2007: Fixed and tested comment folding
# 25 Oct 2007: Added link canonicalization
# Notes on editable markup:
# * Multi-character symbols are replaced with single-character placeholders
# from the Unicode "control symbols" set (U+0001 to U+001F).
# * Comments are replaced with single-character placeholders from the
# Unicode fifteenth-plane private-use area (U+F0000 to U+FFFFF).
package Pearle::WikiPage;
use strict;
use warnings;
use URI::Escape;
use Encode;
########## Constructor ###############################################
sub new
{
my $class = shift;
my %params = @_;
my $self = {
text => '', # Page text
title => '', # Page title
# Internal variables
editTime => undef, # editTime parameter used when editing a page
startTime => undef, # startTime parameter used when editing a page
editToken => undef, # editToken parameter used when editing a page
# Comment-folding
_comments_folded => 0, # Are comments presently folded?
_comment_fold_lookup => {}, # Lookup table of proxy,comment pairs
_comment_fold_proxy => 0xF0000, # Next proxy character to use
# Single-character markup representations
_linkstart => "\x01",
_linkend => "\x02",
_transclusionstart => "\x03",
_transclusionend => "\x04",
};
foreach my $key (keys(%params))
{
if($key eq 'text')
{
$self->{text} = $params{text};
}
elsif($key eq 'title')
{
$self->{title} = $params{title};
}
elsif($key eq 'editTime')
{
$self->{editTime} = $params{editTime};
}
elsif($key eq 'startTime')
{
$self->{startTime} = $params{startTime};
}
elsif($key eq 'editToken')
{
$self->{editToken} = $params{editToken};
}
}
bless($self, $class);
return $self;
}
########## Accessor functions ########################################
# Return the text with modifications to make it easier to operate on
#
# NOTE: Don't try to print this. In order to make editing easier,
# various multi-character markup sequences have been replaced with
# very non-printable characters.
sub getEditableText
{
my $self = shift;
$self->foldComments();
return $self->makeEditableMarkup($self->{text});
}
sub setEditableText
{
my $self = shift;
$self->{text} = shift;
}
# Return the text in WikiMarkup format
sub getWikiText
{
my $self = shift;
$self->unfoldComments();
return $self->makeWikiMarkup($self->{text});
}
sub getTitle
{
my $self = shift;
return $self->{title};
}
sub setTitle
{
die "Setting the title of a WikiPage is not supported.\n";
}
sub getEditToken
{
my $self = shift;
return $self->{editToken};
}
sub getStartTime
{
my $self = shift;
return $self->{startTime};
}
sub getEditTime
{
my $self = shift;
return $self->{editTime};
}
########## Verbs #####################################################
# Convert to editable representation
sub makeEditableMarkup
{
my $self = shift;
my $text = shift;
# $text =~ s/\[\[\[/\x01[/g; # Triple opening brackets: not valid wikimarkup
$text =~ s/\[\[/\x01/g; # Double opening brackets: the start of an internal link or inline image
$text =~ s/\]\]\]\]/\x02\x02/g; # Quadruple closing brackets: The end of an image caption containing an internal link
$text =~ s/\]\]\]/]\x02/g; # Triple closing brackets: an image caption containing an external link
$text =~ s/\]\]/\x02/g; # Double closing brackets: the end of an internal link or image
$text =~ s/\{\{/\x03/g; # Double opening braces: the start of a transclusion
$text =~ s/\}\}/\x04/g; # Double closing braces: the end of a transclusion
return $text;
}
# Convert to WikiMarkup representation
sub makeWikiMarkup
{
my $self = shift;
my $text = shift;
$text =~ s/\x01/[[/g;
$text =~ s/\x02/]]/g;
$text =~ s/\x03/{{/g;
$text =~ s/\x04/}}/g;
return $text;
}
# Replace comments with single-character proxies.
sub foldComments
{
my $self = shift;
my $text = $self->{text};
while($text =~ /(<!--.*?-->)/s)
{
my $proxy_char = chr $self->{_comment_fold_proxy};
$self->{_comment_fold_lookup}->{$proxy_char} = $1;
my $comment = escapeRegex($1);
$text =~ s/$comment/$proxy_char/;
$self->{_comment_fold_proxy} += 1;
die "Too many comments in page" if($self->{_comment_fold_proxy} > 0xFFFFF); # More than 65535 comments in the page
}
$self->{text} = $text;
return $text;
}
# Replace proxies with the original comments
sub unfoldComments
{
my $self = shift;
my $text = $self->{text};
foreach my $proxy_char (keys(%{$self->{_comment_fold_lookup}}))
{
$text =~ s/$proxy_char/$self->{_comment_fold_lookup}->{$proxy_char}/g;
}
$self->{text} = $text;
return $text;
}
sub canonicalizeLinks
{
my $self = shift;
my %link_lookup;
# NOTE: Order of the following two lines is important, since getEditableText modifies $self->{_comment_fold_proxy}
my $text = $self->getEditableText();
my $link_proxy = $self->{_comment_fold_proxy};
# Extract the links beginnings into a lookup table
while($text =~ /(\x01.*?[|\x02])/)
{
my $proxy_char = chr $link_proxy;
$link_lookup{$proxy_char} = $1;
my $link = escapeRegex($1);
$text =~ s/$link/$proxy_char/;
# print "$link_proxy $link_lookup{$proxy_char}\n";
$link_proxy += 1;
die "Too many links in page" if($link_proxy > 0xFFFFF);
}
# Canonicalize link beginnings
foreach my $proxy_char (keys %link_lookup)
{
my $link = $link_lookup{$proxy_char};
next if $link =~ /http:/; # Skip if it's a badly-formatted external link
$link = unescapeUTF8URL($link); # Convert URL-encoded UTF8 to Perl chars
$link =~ s/_/ /g; # Underscores to spaces
$link =~ s/ / /g; # Collapse multiple spaces
$link =~ s/[\x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}]//g; # Kill Unicode BiDi markers
# TODO: Decode HTML entities (E E Á
# Trim spaces
$link =~ s/^\x01 /\x01/;
$link =~ s/ \|$/|/;
$link =~ s/ \x02$/\x02/;
# TODO: Trim internal spaces for namespaced links
# print URI::Escape::uri_escape_utf8($link), "\n";
$link_lookup{$proxy_char} = $link;
}
# Put link beginnings back in the text
foreach my $proxy_char (keys %link_lookup)
{
$text =~ s/$proxy_char/$link_lookup{$proxy_char}/g;
}
$self->setEditableText($text);
# exit;
}
########## Utilities #################################################
# Escape a string so that it's a literal match in a regex
sub escapeRegex
{
my $string = shift;
$string =~ s/\\/\\\\/g;
$string =~ s/\./\\\./g;
$string =~ s/\(/\\\(/g;
$string =~ s/\)/\\\)/g;
$string =~ s/\[/\\\[/g;
$string =~ s/\{/\\\{/g;
$string =~ s/\+/\\\+/g;
$string =~ s/\*/\\\*/g;
$string =~ s/\?/\\\?/g;
$string =~ s/\^/\\\^/g;
$string =~ s/\$/\\\$/g;
$string =~ s/\|/\\\|/g;
return $string;
}
sub unescapeUTF8URL
{
# Since nobody seems to have a module to unescape a UTF8-encoded URL-escaped string...
my $string = shift;
my @chars = split //, $string;
my $result_string = '';
for(my $i = 0; $i < scalar(@chars); $i++)
{
my $partial_string = '';
if($chars[$i] eq '%')
{
my $done = 0;
while(!$done)
{
# If the next two chars are hex values, stuff them in $partial_string
if($chars[$i+1] =~ /[0-9a-fA-F]/ and $chars[$i+2] =~ /[0-9a-fA-F]/)
{
$partial_string .= $chars[$i] . $chars[$i+1] . $chars[$i+2];
$i += 3;
}
else
{
# Literal percent
$result_string .= $chars[$i];
$i += 1;
$done = 1;
}
if($chars[$i] ne '%')
{
$done = 1;
}
}
if($partial_string ne '')
{
$result_string .= decode("utf8", URI::Escape::uri_unescape($partial_string));
}
$i -= 1;
}
else
{
# Literal char, already in unicode
$result_string .= $chars[$i];
}
}
return $result_string;
}
1;