User:FairuseBot/Pearle/WikiPage.pm

From Wikipedia, the free encyclopedia

### IMPORTANT ###

# This code is released into the public domain.

### RECENT CHANGES ###
#  6 Aug 2007: Created
# 21 Aug 2007: Added comment folding/unfolding
# 21 Oct 2007: Fixed and tested comment folding
# 25 Oct 2007: Added link canonicalization

# Notes on editable markup:
#  * Multi-character symbols are replaced with single-character placeholders 
#    from the Unicode "control symbols" set (U+0001 to U+001F).
#  * Comments are replaced with single-character placeholders from the 
#    Unicode fifteenth-plane private-use area (U+F0000 to U+FFFFF).

package Pearle::WikiPage;

use strict;
use warnings;

use URI::Escape;
use Encode;

########## Constructor ###############################################
sub new
{
        my $class = shift;
        my %params = @_;
        
        my $self = {
                text => '',                          # Page text
                title => '',                 # Page title
                
                # Internal variables
                editTime => undef,           # editTime parameter used when editing a page
                startTime => undef,          # startTime parameter used when editing a page
                editToken => undef,          # editToken parameter used when editing a page
                
                # Comment-folding
                _comments_folded => 0,               # Are comments presently folded?
                _comment_fold_lookup => {},  # Lookup table of proxy,comment pairs
                _comment_fold_proxy => 0xF0000,      # Next proxy character to use
                
                # Single-character markup representations
                _linkstart => "\x01",
                _linkend => "\x02",
                _transclusionstart => "\x03",
                _transclusionend => "\x04",
                
        };
        
        foreach my $key (keys(%params))
        {
                if($key eq 'text')
                {
                        $self->{text} = $params{text};
                }
                elsif($key eq 'title')
                {
                        $self->{title} = $params{title};
                }
                elsif($key eq 'editTime')
                {
                        $self->{editTime} = $params{editTime};
                }
                elsif($key eq 'startTime')
                {
                        $self->{startTime} = $params{startTime};
                }
                elsif($key eq 'editToken')
                {
                        $self->{editToken} = $params{editToken};
                }
        }

        bless($self, $class);
        return $self;
}
########## Accessor functions ########################################

# Return the text with modifications to make it easier to operate on
#
# NOTE: Don't try to print this.  In order to make editing easier,
#  various multi-character markup sequences have been replaced with
#  very non-printable characters.
sub getEditableText
{
        my $self = shift;
        $self->foldComments();
        return $self->makeEditableMarkup($self->{text});
}

sub setEditableText
{
        my $self = shift;
        $self->{text} = shift;
}

# Return the text in WikiMarkup format
sub getWikiText
{
        my $self = shift;
        $self->unfoldComments();
        return $self->makeWikiMarkup($self->{text});
}

sub getTitle
{
        my $self = shift;
        return $self->{title};
}

sub setTitle
{
        die "Setting the title of a WikiPage is not supported.\n";
}

sub getEditToken
{
        my $self = shift;
        return $self->{editToken};
}

sub getStartTime
{
        my $self = shift;
        return $self->{startTime};
}

sub getEditTime
{
        my $self = shift;
        return $self->{editTime};
}

########## Verbs #####################################################

# Convert to editable representation
sub makeEditableMarkup
{
        my $self = shift;
        my $text = shift;

#       $text =~ s/\[\[\[/\x01[/g;                      # Triple opening brackets: not valid wikimarkup
        $text =~ s/\[\[/\x01/g;                         # Double opening brackets: the start of an internal link or inline image
        $text =~ s/\]\]\]\]/\x02\x02/g; # Quadruple closing brackets: The end of an image caption containing an internal link
        $text =~ s/\]\]\]/]\x02/g;                      # Triple closing brackets: an image caption containing an external link
        $text =~ s/\]\]/\x02/g;                         # Double closing brackets: the end of an internal link or image
        $text =~ s/\{\{/\x03/g;                         # Double opening braces: the start of a transclusion
        $text =~ s/\}\}/\x04/g;                         # Double closing braces: the end of a transclusion
        
        return $text;
}

# Convert to WikiMarkup representation
sub makeWikiMarkup
{
        my $self = shift;
        my $text = shift;

        $text =~ s/\x01/[[/g;
        $text =~ s/\x02/]]/g;
        $text =~ s/\x03/{{/g;
        $text =~ s/\x04/}}/g;
        
        return $text;
}

# Replace comments with single-character proxies.
sub foldComments
{
        my $self = shift;
        my $text = $self->{text};
        
        while($text =~ /(<!--.*?-->)/s)
        {
                my $proxy_char = chr $self->{_comment_fold_proxy};
                $self->{_comment_fold_lookup}->{$proxy_char} = $1;
                my $comment = escapeRegex($1);
                $text =~ s/$comment/$proxy_char/;

                $self->{_comment_fold_proxy} += 1;
                die "Too many comments in page" if($self->{_comment_fold_proxy} > 0xFFFFF);     # More than 65535 comments in the page
        }
        $self->{text} = $text;
        return $text;
}

# Replace proxies with the original comments
sub unfoldComments
{
        my $self = shift;
        my $text = $self->{text};

        foreach my $proxy_char (keys(%{$self->{_comment_fold_lookup}}))
        {
                $text =~ s/$proxy_char/$self->{_comment_fold_lookup}->{$proxy_char}/g;
        }
        $self->{text} = $text;
        return $text;
}

sub canonicalizeLinks
{
        my $self = shift;
        my %link_lookup;
        # NOTE: Order of the following two lines is important, since getEditableText modifies $self->{_comment_fold_proxy}
        my $text = $self->getEditableText();
        my $link_proxy = $self->{_comment_fold_proxy};
        
        # Extract the links beginnings into a lookup table
        while($text =~ /(\x01.*?[|\x02])/)
        {
                my $proxy_char = chr $link_proxy;
                $link_lookup{$proxy_char} = $1;
                my $link = escapeRegex($1);
                $text =~ s/$link/$proxy_char/;
#               print "$link_proxy $link_lookup{$proxy_char}\n";

                $link_proxy += 1;
                die "Too many links in page" if($link_proxy > 0xFFFFF);
        }

        # Canonicalize link beginnings
        foreach my $proxy_char (keys %link_lookup)
        {
                my $link = $link_lookup{$proxy_char};
                next if $link =~ /http:/;               # Skip if it's a badly-formatted external link
                $link = unescapeUTF8URL($link);                                                                                         # Convert URL-encoded UTF8 to Perl chars
                $link =~ s/_/ /g;                                                                                                                       # Underscores to spaces
                $link =~ s/  / /g;                                                                                                                      # Collapse multiple spaces
                $link =~ s/[\x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}]//g;       # Kill Unicode BiDi markers
                # TODO: Decode HTML entities (E E &Aacute
                # Trim spaces
                $link =~ s/^\x01 /\x01/;
                $link =~ s/ \|$/|/;
                $link =~ s/ \x02$/\x02/;
                # TODO: Trim internal spaces for namespaced links
#               print URI::Escape::uri_escape_utf8($link), "\n";
                $link_lookup{$proxy_char} = $link;
        }
        
        # Put link beginnings back in the text

        foreach my $proxy_char (keys %link_lookup)
        {
                $text =~ s/$proxy_char/$link_lookup{$proxy_char}/g;
        }

        $self->setEditableText($text);
#       exit;
}

########## Utilities #################################################

# Escape a string so that it's a literal match in a regex
sub escapeRegex
{
        my $string = shift;
        $string =~ s/\\/\\\\/g;
        $string =~ s/\./\\\./g;
        $string =~ s/\(/\\\(/g;
        $string =~ s/\)/\\\)/g;
        $string =~ s/\[/\\\[/g;
        $string =~ s/\{/\\\{/g;
        $string =~ s/\+/\\\+/g;
        $string =~ s/\*/\\\*/g;
        $string =~ s/\?/\\\?/g;
        $string =~ s/\^/\\\^/g;
        $string =~ s/\$/\\\$/g;
        $string =~ s/\|/\\\|/g;
        return $string;
}

sub unescapeUTF8URL
{
        # Since nobody seems to have a module to unescape a UTF8-encoded URL-escaped string...
        my $string = shift;
        my @chars = split //, $string;
        my $result_string = '';
        
        for(my $i = 0; $i < scalar(@chars); $i++)
        {
                my $partial_string = '';
                if($chars[$i] eq '%')
                {
                        my $done = 0;
                        while(!$done)
                        {
                                # If the next two chars are hex values, stuff them in $partial_string
                                if($chars[$i+1] =~ /[0-9a-fA-F]/ and $chars[$i+2] =~ /[0-9a-fA-F]/)
                                {
                                        $partial_string .= $chars[$i] . $chars[$i+1] . $chars[$i+2];
                                        $i += 3;
                                }
                                else
                                {
                                        # Literal percent
                                        $result_string .= $chars[$i];
                                        $i += 1;
                                        $done = 1;
                                }
                                if($chars[$i] ne '%')
                                {
                                        $done = 1;
                                }
                        }
                        if($partial_string ne '')
                        {
                                $result_string .= decode("utf8", URI::Escape::uri_unescape($partial_string));
                        }
                        $i -= 1;
                }
                else
                {
                        # Literal char, already in unicode
                        $result_string .= $chars[$i];
                }
        }
        return $result_string;
}

1;