From 86e2fbbe49cd0c7b0203b3f179cd9280b14ff404 Mon Sep 17 00:00:00 2001
From: Christopher Hall <chris.hall@catalyst.net.nz>
Date: Fri, 25 Feb 2011 16:38:49 +1300
Subject: [PATCH] Bug 5917 : final commit from catalyst

---
 misc/translator/TTParser.pm      |  24 +++--
 misc/translator/TmplTokenType.pm |   2 +-
 misc/translator/TmplTokenizer.pm | 147 ++++++++++++++++---------------
 misc/translator/xgettext.pl      |  60 ++++++-------
 4 files changed, 125 insertions(+), 108 deletions(-)

diff --git a/misc/translator/TTParser.pm b/misc/translator/TTParser.pm
index c86166d224..507474e67b 100755
--- a/misc/translator/TTParser.pm
+++ b/misc/translator/TTParser.pm
@@ -21,7 +21,7 @@ sub next_token{
 }
 
 #unshift token back on @tokens
-sub return_token{
+sub unshift_token{
     my $self = shift;
     unshift @tokens, shift;
 }
@@ -39,8 +39,9 @@ sub build_tokens{
     $self->{filename} = $filename;
     $self->handler(start => "start", "self, line, tagname, attr, text"); #signature is start( self, linenumber, tagname, hash of attributes, origional text )
     $self->handler(text => "text", "self, line, text, is_cdata"); #signature is text( self, linenumber, origional text, is_cdata )
-    $self->handler(end => ""); #ignore end tags
+    $self->handler(end => "end", "self, line, tag, text"); #signature is end( self, linenumber, tagname, original text )
     $self->marked_sections(1); #treat anything inside CDATA tags as text, should really make it a TmplTokenType::CDATA
+    $self->unbroken_text(1); #report each contiguous block of text as one text event instead of several chunks (a block can span multiple lines)
     $self->parse_file($filename);
     return $self;
 }
@@ -52,7 +53,7 @@ sub text{
     my $work = shift; # original text
     my $is_cdata = shift;
     while($work){
-        return if $work =~ m/^\s*$/;
+#        return if $work =~ m/^\s*$/;
         # if there is a template_toolkit tag
         if( $work =~ m/\[%.*?\]/ ){
             #everything before this tag is text (or possibly CDATA), add a text token to tokens if $`
@@ -81,13 +82,14 @@ sub start{
     my $self = shift;
     my $line = shift;
     my $tag = shift;
-    my $hash = shift;
-    my $text = shift; #unused atm...
+    my $hash = shift; #hash of attr/value pairs
+    my $text = shift; #original text
     #return if ! $interesting_tags{$tag};
     # was $hash->{$key}
     # print "#### " . $self->{filename}  . " " . $tag . "####\n";
-    my $t = TmplToken->new( $tag, TmplTokenType::TAG, $line, $self->{filename});
+    my $t = TmplToken->new( $text, TmplTokenType::TAG, $line, $self->{filename});
     my %attr;
+    # tags seem to be used in an 'interesting' way elsewhere..
     for my $key( %$hash ) {
         next unless defined $hash->{$key};
         $attr{+lc($key)} = [ $key, $hash->{$key}, $key."=".$hash->{$key}, 0 ];
@@ -96,4 +98,14 @@ sub start{
     push @tokens, $t;
 }
 
+# HTML::Parser "end" handler: queue a TAG token for every closing html tag.
+# Called with ( self, linenumber, tagname, original text ), as registered in build_tokens.
+sub end{
+  my ($self, $line, $tag, $text) = @_; # $tag is unused, the token keeps the original text
+  my $t = TmplToken->new( $text, TmplTokenType::TAG, $line, $self->{filename} );
+  # closing tags carry no attributes; an empty hash keeps consumers that test %$attr from dereferencing undef
+  $t->set_attributes( {} );
+  push @tokens, $t; # previously the token was built and then silently discarded
+}
+
 1;
diff --git a/misc/translator/TmplTokenType.pm b/misc/translator/TmplTokenType.pm
index 217cfbbab9..bfebebbe23 100644
--- a/misc/translator/TmplTokenType.pm
+++ b/misc/translator/TmplTokenType.pm
@@ -108,7 +108,7 @@ something that has the form of an SGML processing instruction
 
 =item DIRECTIVE
 
-a HTML::Template directive (whether or not embedded in an SGML comment)
+a Template Toolkit directive
 
 =item COMMENT
 
diff --git a/misc/translator/TmplTokenizer.pm b/misc/translator/TmplTokenizer.pm
index b4c69584ed..22c0a13328 100644
--- a/misc/translator/TmplTokenizer.pm
+++ b/misc/translator/TmplTokenizer.pm
@@ -14,15 +14,11 @@ use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
 
 =head1 NAME
 
-TmplTokenizer.pm - Simple-minded tokenizer class for HTML::Template .tmpl files
+TmplTokenizer.pm - Simple-minded wrapper class for TTParser
 
 =head1 DESCRIPTION
 
-Because .tmpl files contains HTML::Template directives
-that tend to confuse real parsers (e.g., HTML::Parse),
-it might be better to create a customized scanner
-to scan the template files for tokens.
-This module is a simple-minded attempt at such a scanner.
+A wrapper around the functionality found in TTParser, allowing an easier transition to Template Toolkit.
 
 =cut
 
@@ -148,44 +144,44 @@ sub _split_js ($) {
     my ($s0) = @_;
     my @it = ();
     while (length $s0) {
-	if ($s0 =~ /^\s+/s) {				# whitespace
-	    push @it, $&;
-	    $s0 = $';
-	} elsif ($s0 =~ /^\/\/[^\r\n]*(?:[\r\n]|$)/s) {	# C++-style comment
-	    push @it, $&;
-	    $s0 = $';
-	} elsif ($s0 =~ /^\/\*(?:(?!\*\/).)*\*\//s) {	# C-style comment
-	    push @it, $&;
-	    $s0 = $';
-	# Keyword or identifier, ECMA-262 p.13 (section 7.5)
-	} elsif ($s0 =~ /^[A-Z_\$][A-Z\d_\$]*/is) {	# IdentifierName
-	    push @it, $&;
-	    $s0 = $';
-	# Punctuator, ECMA-262 p.13 (section 7.6)
-	} elsif ($s0 =~ /^(?:[\(\){}\[\];]|>>>=|<<=|>>=|[-\+\*\/\&\|\^\%]=|>>>|<<|>>|--|\+\+|\|\||\&\&|==|<=|>=|!=|[=><,!~\?:\.\-\+\*\/\&\|\^\%])/s) {
-	    push @it, $&;
-	    $s0 = $';
-	# DecimalLiteral, ECMA-262 p.14 (section 7.7.3); note: bug in the spec
-	} elsif ($s0 =~ /^(?:0|[1-9]\d+(?:\.\d*(?:[eE][-\+]?\d+)?)?)/s) {
-	    push @it, $&;
-	    $s0 = $';
-	# HexIntegerLiteral, ECMA-262 p.15 (section 7.7.3)
-	} elsif ($s0 =~ /^0[xX][\da-fA-F]+/s) {
-	    push @it, $&;
-	    $s0 = $';
-	# OctalIntegerLiteral, ECMA-262 p.15 (section 7.7.3)
-	} elsif ($s0 =~ /^0[\da-fA-F]+/s) {
-	    push @it, $&;
-	    $s0 = $';
-	# StringLiteral, ECMA-262 p.17 (section 7.7.4)
-	# XXX SourceCharacter doesn't seem to be defined (?)
-	} elsif ($s0 =~ /^(?:"(?:(?!["\\\r\n]).|$js_EscapeSequence)*"|'(?:(?!['\\\r\n]).|$js_EscapeSequence)*')/os) {
-	    push @it, $&;
-	    $s0 = $';
-	} elsif ($s0 =~ /^./) {				# UNKNOWN TOKEN !!!
-	    push @it, $&;
-	    $s0 = $';
-	}
+        if ($s0 =~ /^\s+/s) {				# whitespace
+          push @it, $&;
+          $s0 = $';
+        } elsif ($s0 =~ /^\/\/[^\r\n]*(?:[\r\n]|$)/s) {	# C++-style comment
+        push @it, $&;
+        $s0 = $';
+        } elsif ($s0 =~ /^\/\*(?:(?!\*\/).)*\*\//s) {	# C-style comment
+            push @it, $&;
+            $s0 = $';
+        # Keyword or identifier, ECMA-262 p.13 (section 7.5)
+        } elsif ($s0 =~ /^[A-Z_\$][A-Z\d_\$]*/is) {	# IdentifierName
+            push @it, $&;
+            $s0 = $';
+        # Punctuator, ECMA-262 p.13 (section 7.6)
+        } elsif ($s0 =~ /^(?:[\(\){}\[\];]|>>>=|<<=|>>=|[-\+\*\/\&\|\^\%]=|>>>|<<|>>|--|\+\+|\|\||\&\&|==|<=|>=|!=|[=><,!~\?:\.\-\+\*\/\&\|\^\%])/s) {
+            push @it, $&;
+            $s0 = $';
+        # DecimalLiteral, ECMA-262 p.14 (section 7.7.3); note: bug in the spec
+        } elsif ($s0 =~ /^(?:0|[1-9]\d+(?:\.\d*(?:[eE][-\+]?\d+)?)?)/s) {
+            push @it, $&;
+            $s0 = $';
+        # HexIntegerLiteral, ECMA-262 p.15 (section 7.7.3)
+        } elsif ($s0 =~ /^0[xX][\da-fA-F]+/s) {
+            push @it, $&;
+            $s0 = $';
+        # OctalIntegerLiteral, ECMA-262 p.15 (section 7.7.3)
+        } elsif ($s0 =~ /^0[\da-fA-F]+/s) {
+            push @it, $&;
+            $s0 = $';
+        # StringLiteral, ECMA-262 p.17 (section 7.7.4)
+        # XXX SourceCharacter doesn't seem to be defined (?)
+        } elsif ($s0 =~ /^(?:"(?:(?!["\\\r\n]).|$js_EscapeSequence)*"|'(?:(?!['\\\r\n]).|$js_EscapeSequence)*')/os) {
+            push @it, $&;
+            $s0 = $';
+        } elsif ($s0 =~ /^./) {				# UNKNOWN TOKEN !!!
+            push @it, $&;
+            $s0 = $';
+        }
     }
     return @it;
 }
@@ -202,28 +198,28 @@ sub _identify_js_translatables (@) {
     # We mark a JavaScript translatable string as in C, i.e., _("literal")
     # For simplicity, we ONLY look for "_" "(" StringLiteral ")"
     for (my $i = 0, my $state = 0, my($j, $q, $s); $i <= $#input; $i += 1) {
-	my $reset_state_p = 0;
-	push @output, [0, $input[$i]];
-	if ($input[$i] !~ /\S/s) {
-	    ;
-	} elsif ($state == 0) {
-	    $state = STATE_UNDERSCORE if $input[$i] eq '_';
-	} elsif ($state == STATE_UNDERSCORE) {
-	    $state = $input[$i] eq parenleft ? STATE_PARENLEFT : 0;
-	} elsif ($state == STATE_PARENLEFT) {
-	    if ($input[$i] =~ /^(['"])(.*)\1$/s) {
-		($state, $j, $q, $s) = (STATE_STRING_LITERAL, $#output, $1, $2);
-	    } else {
-		$state = 0;
-	    }
-	} elsif ($state == STATE_STRING_LITERAL) {
-	    if ($input[$i] eq parenright) {
-		$output[$j] = [1, $output[$j]->[1], $q, $s];
-	    }
-	    $state = 0;
-	} else {
-	    die "identify_js_translatables internal error: Unknown state $state"
-	}
+        my $reset_state_p = 0;
+        push @output, [0, $input[$i]];
+        if ($input[$i] !~ /\S/s) {
+          ;
+        } elsif ($state == 0) {
+          $state = STATE_UNDERSCORE if $input[$i] eq '_';
+        } elsif ($state == STATE_UNDERSCORE) {
+          $state = $input[$i] eq parenleft ? STATE_PARENLEFT : 0;
+        } elsif ($state == STATE_PARENLEFT) {
+          if ($input[$i] =~ /^(['"])(.*)\1$/s) {
+            ($state, $j, $q, $s) = (STATE_STRING_LITERAL, $#output, $1, $2);
+          } else {
+            $state = 0;
+          }
+        } elsif ($state == STATE_STRING_LITERAL) {
+          if ($input[$i] eq parenright) {
+            $output[$j] = [1, $output[$j]->[1], $q, $s];
+          }
+          $state = 0;
+        } else {
+          die "identify_js_translatables internal error: Unknown state $state"
+        }
     }
     return \@output;
 }
@@ -233,10 +229,19 @@ sub _identify_js_translatables (@) {
 sub string_canon ($) {
   my $s = shift;
   # Fold all whitespace into single blanks
-  $s =~ s/\s+/ /gs;
+  $s =~ s/\s+/ /g; # \s also matches line breaks, so a multi-line string ends up on one line
+  return $s;
+}
+
+# Gentler canonicaliser for internal use: line breaks are left untouched.
+sub string_canon_safe ($) {
+  my ($s) = @_;
+  # squeeze every run of blanks and/or tabs down to one blank
+  $s =~ tr/ \t/ /s;
   return $s;
 }
 
+
 sub _quote_cformat{
   my $s = shift;
   $s =~ s/%/%%/g;
@@ -245,7 +250,7 @@ sub _quote_cformat{
 
 sub _formalize_string_cformat{
   my $s = shift;
-  return _quote_cformat( string_canon $s );
+  return _quote_cformat( string_canon_safe $s ); # fold blanks/tabs only (line breaks kept), then pass the result through _quote_cformat
 }
 
 sub _formalize{
@@ -312,7 +317,7 @@ sub next_token {
             # if there is nothing in parts, return this token
             return $next unless @parts;
             # OTHERWISE, put this token back and return the parametrized string of @parts
-            $self->{_parser}->return_token($next);
+            $self->{_parser}->unshift_token($next);
             return $self->_parametrize_internal(@parts);
         }
     }
@@ -320,7 +325,7 @@ sub next_token {
 
 ###############################################################################
 
-# ugly method taken from old version
+# function taken from old version
 # used by tmpl_process3
 sub parametrize ($$$$) {
     my($fmt_0, $cformat_p, $t, $f) = @_;
@@ -527,7 +532,7 @@ is different (replacing %s with %1$s, %2$s, etc.), or when certain
 words will require certain inflectional suffixes in sentences.
 
 Because this is an incompatible change, this mode must be explicitly
-turned on using the set_cformat(1) method call.
+turned on using the set_allow_cformat(1) method call.
 
 =head2 The flag characters
 
diff --git a/misc/translator/xgettext.pl b/misc/translator/xgettext.pl
index abe315e5d7..ef1fb3db4a 100755
--- a/misc/translator/xgettext.pl
+++ b/misc/translator/xgettext.pl
@@ -60,9 +60,9 @@ sub remember ($$) {
     my($token, $string) = @_;
     # If we determine that the string is negligible, don't bother to remember
     unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
-	my $key = TmplTokenizer::string_canon( $string );
-	$text{$key} = [] unless defined $text{$key};
-	push @{$text{$key}}, $token;
+        my $key = TmplTokenizer::string_canon( $string );
+        $text{$key} = [] unless defined $text{$key};
+        push @{$text{$key}}, $token;
     }
 }
 
@@ -83,37 +83,37 @@ sub string_list () {
     return @t;
 }
 
-###############################################################################
+  ###############################################################################
 
 sub text_extract (*) {
     my($h) = @_;
     for (;;) {
-	my $s = TmplTokenizer::next_token $h;
-    last unless defined $s;
-	my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
-	if ($kind eq TmplTokenType::TEXT) {
-	    remember( $s, $t ) if $t =~ /\S/s;
-	} elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
-	    remember( $s, $s->form ) if $s->form =~ /\S/s;
-	} elsif ($kind eq TmplTokenType::TAG && %$attr) {
-	    # value [tag=input], meta
-	    my $tag = lc($1) if $t =~ /^<(\S+)/s;
-	    for my $a ('alt', 'content', 'title', 'value','label') {
-		if ($attr->{$a}) {
-            next if $a eq 'label' && $tag ne 'optgroup';
-		    next if $a eq 'content' && $tag ne 'meta';
-		    next if $a eq 'value' && ($tag ne 'input'
-			|| (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME
-		    my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
-		    $val = TmplTokenizer::trim $val;
-		    remember( $s, $val ) if $val =~ /\S/s;
-		}
-	    }
-	} elsif ($s->has_js_data) {
-	    for my $t (@{$s->js_data}) {
-		remember( $s, $t->[3] ) if $t->[0]; # FIXME
-	    }
-	}
+        my $s = TmplTokenizer::next_token $h;
+        last unless defined $s;
+        my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
+        if ($kind eq TmplTokenType::TEXT) {
+            remember( $s, $t ) if $t =~ /\S/s;
+        } elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
+            remember( $s, $s->form ) if $s->form =~ /\S/s;
+        } elsif ($kind eq TmplTokenType::TAG && $attr && %$attr) {
+            # translatable attributes; label needs tag=optgroup, content tag=meta, value tag=input
+            # '' (never undef) when the token string does not start with a tag name
+            my $tag = $t =~ /^<(\S+)/s ? lc($1) : '';
+            for my $name ('alt', 'content', 'title', 'value','label') {
+                next unless $attr->{$name};
+                next if $name eq 'label' && $tag ne 'optgroup';
+                next if $name eq 'content' && $tag ne 'meta';
+                next if $name eq 'value' && ($tag ne 'input'
+                    || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME
+                # attribute entries are [ name, value, original text, order ]; only the value is needed
+                my $val = TmplTokenizer::trim $attr->{$name}->[1];
+                remember( $s, $val ) if $val =~ /\S/s;
+            }
+        } elsif ($s->has_js_data) {
+            for my $js (@{$s->js_data}) {
+                remember( $s, $js->[3] ) if $js->[0]; # FIXME
+            }
+        }
     }
 }
 
-- 
2.39.2