From ff532faaa627436241018737f7d4a5dcf8cf6556 Mon Sep 17 00:00:00 2001
From: Christopher Hall <chris.hall@catalyst.net.nz>
Date: Wed, 16 Feb 2011 14:00:10 +1300
Subject: [PATCH] Bug 5917 : initial work on translation tool conversion

---
 misc/translator/TTParser.pm      |  99 ++++
 misc/translator/TmplTokenizer.pm | 805 +++++--------------------------
 misc/translator/tmpl_process3.pl |   4 +-
 3 files changed, 226 insertions(+), 682 deletions(-)
 create mode 100755 misc/translator/TTParser.pm

diff --git a/misc/translator/TTParser.pm b/misc/translator/TTParser.pm
new file mode 100755
index 0000000000..c86166d224
--- /dev/null
+++ b/misc/translator/TTParser.pm
@@ -0,0 +1,99 @@
+#!/usr/bin/env perl
+#simple parser for HTML with Template Toolkit directives. Tokens are put into @tokens and are accesible via next_token and peep_token
+package TTParser;
+use base qw(HTML::Parser);
+use TmplToken;
+use strict;
+use warnings;
+
+#seems to be handled post tokenizer
+##hash where key is tag we are interested in and the value is a hash of the attributes we want
+#my %interesting_tags = (
+#    img => { alt => 1 },
+#);
+
+#tokens found so far (used like a stack)
+my ( @tokens );
+
+#shiftnext token or undef
+sub next_token{
+    return shift @tokens;
+}
+
+#unshift token back on @tokens
+sub return_token{
+    my $self = shift;
+    unshift @tokens, shift;
+}
+
+#have a peep at next token
+sub peep_token{
+    return $tokens[0];
+}
+
+#wrapper for parse
+#please use this method INSTEAD of the HTML::Parser->parse_file method (and HTML::Parser->parse)
+#signature build_tokens( self, filename)
+sub build_tokens{
+    my ($self, $filename) = @_;
+    $self->{filename} = $filename;
+    $self->handler(start => "start", "self, line, tagname, attr, text"); #signature is start( self, linenumber, tagname, hash of attributes, origional text )
+    $self->handler(text => "text", "self, line, text, is_cdata"); #signature is text( self, linenumber, origional text, is_cdata )
+    $self->handler(end => ""); #ignore end tags
+    $self->marked_sections(1); #treat anything inside CDATA tags as text, should really make it a TmplTokenType::CDATA
+    $self->parse_file($filename);
+    return $self;
+}
+
+#handle parsing of text
+sub text{
+    my $self = shift;
+    my $line = shift;
+    my $work = shift; # original text
+    my $is_cdata = shift;
+    while($work){
+        return if $work =~ m/^\s*$/;
+        # if there is a template_toolkit tag
+        if( $work =~ m/\[%.*?\]/ ){
+            #everything before this tag is text (or possibly CDATA), add a text token to tokens if $`
+            if( $` ){
+                my $t = TmplToken->new( $`, ($is_cdata? TmplTokenType::CDATA : TmplTokenType::TEXT), $line, $self->{filename} );
+                push @tokens, $t;
+            }
+
+            #the match itself is a DIRECTIVE $&
+            my $t = TmplToken->new( $&, TmplTokenType::DIRECTIVE, $line, $self->{filename} );
+            push @tokens, $t;
+
+            #put work still to do back into work
+            $work = $' ? $' : 0;
+        } else {
+            #If there is some left over work, treat it as text token
+            my $t = TmplToken->new( $work, ($is_cdata? TmplTokenType::CDATA : TmplTokenType::TEXT), $line, $self->{filename} );
+            push @tokens, $t;
+            last;
+        }
+    }
+}
+
+#handle opening html tags
+sub start{
+    my $self = shift;
+    my $line = shift;
+    my $tag = shift;
+    my $hash = shift;
+    my $text = shift; #unused atm...
+    #return if ! $interesting_tags{$tag};
+    # was $hash->{$key}
+    # print "#### " . $self->{filename}  . " " . $tag . "####\n";
+    my $t = TmplToken->new( $tag, TmplTokenType::TAG, $line, $self->{filename});
+    my %attr;
+    for my $key( %$hash ) {
+        next unless defined $hash->{$key};
+        $attr{+lc($key)} = [ $key, $hash->{$key}, $key."=".$hash->{$key}, 0 ];
+    }
+    $t->set_attributes( \%attr );
+    push @tokens, $t;
+}
+
+1;
diff --git a/misc/translator/TmplTokenizer.pm b/misc/translator/TmplTokenizer.pm
index 5c4f28ba5c..b4c69584ed 100644
--- a/misc/translator/TmplTokenizer.pm
+++ b/misc/translator/TmplTokenizer.pm
@@ -4,6 +4,7 @@ use strict;
 #use warnings; FIXME - Bug 2505
 use TmplTokenType;
 use TmplToken;
+use TTParser;
 use VerboseWarnings qw( pedantic_p error_normal warn_normal warn_pedantic );
 require Exporter;
 
@@ -39,44 +40,12 @@ use vars qw( $pedantic_error_markup_in_pcdata_p );
 ###############################################################################
 
 # Hideous stuff
-use vars qw( $re_directive $re_tmpl_var $re_tmpl_var_escaped $re_tmpl_include );
-use vars qw( $re_directive_control $re_tmpl_endif_endloop $re_xsl);
+use vars qw( $re_xsl $re_end_entity $re_tmpl_var);
 BEGIN {
-    # $re_directive must not do any backreferences
-    $re_directive = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:VAR|LOOP|INCLUDE|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
-    # TMPL_VAR or TMPL_INCLUDE
-    $re_tmpl_var = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
-    $re_tmpl_include = q{<(?:(?i)(?:!--\s*)?TMPL_(?:INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
-    # TMPL_VAR ESCAPE=1/HTML/URL
+    $re_tmpl_var = q{\[%\s*[get|set|default]?\s*[\w\.]+\s*[|.*?]?\s*%\]};
     $re_xsl = q{<\/?(?:xsl:)(?:[\s\-a-zA-Z0-9"'\/\.\[\]\@\(\):=,$]+)\/?>};
-    $re_tmpl_var_escaped = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR|INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))\s+ESCAPE=(?:1|HTML|URL)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
-    # Any control flow directive
-    $re_directive_control = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:LOOP|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
-    # /LOOP or /IF or /UNLESS
-    $re_tmpl_endif_endloop = q{<(?:(?i)(?:!--\s*)?\/TMPL_(?:LOOP|IF|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
-}
-
-# Hideous stuff from subst.pl, slightly modified to use the above hideous stuff
-# Note: The $re_tag's set $1 (<tag), $2 (>), and $3 (rest of string)
-use vars qw( $re_comment $re_entity_name $re_end_entity $re_etag );
-use vars qw( $re_tag_strict $re_tag_compat @re_tag );
-sub re_tag ($) {
-   my($compat) = @_;
-   my $etag = $compat? '>': '<>\/';
-   # This is no longer similar to the original regexp in subst.pl :-(
-   # Note that we don't want <> in compat mode; Mozilla knows about <
-   q{(<\/?(?:|(?:"(?:} . $re_directive . q{|[^"])*"|'(?:} . $re_directive . q{|[^'])*'|--(?:(?!--)(?:$re_directive)*.)*--|(?:}
-   . $re_directive
-   . q{|(?!--)[^"'<>} . $etag . q{]))+))([} . $etag . q{]|(?=<))(.*)};
-}
-BEGIN {
-    $re_comment = '(?:--(?:[^-]|-[^-])*--)';
-    $re_entity_name = '(?:[^&%#;<>\s]+)'; # NOTE: not really correct SGML
     $re_end_entity = '(?:;|$|(?=\s))'; # semicolon or before-whitespace
-    $re_etag = q{(?:<\/?(?:"[^"]*"|'[^']*'|[^"'>\/])*[>\/])}; # end-tag
-    @re_tag = ($re_tag_strict, $re_tag_compat) = (re_tag(0), re_tag(1));
 }
-
 # End of the hideous stuff
 
 use vars qw( $serial );
@@ -87,14 +56,14 @@ sub FATAL_P		() {'fatal-p'}
 sub SYNTAXERROR_P	() {'syntaxerror-p'}
 
 sub FILENAME		() {'input'}
-sub HANDLE		() {'handle'}
+#sub HANDLE		() {'handle'}
 
-sub READAHEAD		() {'readahead'}
+#sub READAHEAD		() {'readahead'}
 sub LINENUM_START	() {'lc_0'}
 sub LINENUM		() {'lc'}
 sub CDATA_MODE_P	() {'cdata-mode-p'}
 sub CDATA_CLOSE		() {'cdata-close'}
-sub PCDATA_MODE_P	() {'pcdata-mode-p'}	# additional submode for CDATA
+#sub PCDATA_MODE_P	() {'pcdata-mode-p'}	# additional submode for CDATA
 sub JS_MODE_P		() {'js-mode-p'}	# cdata-mode-p must also be true
 
 sub ALLOW_CFORMAT_P	() {'allow-cformat-p'}
@@ -102,11 +71,14 @@ sub ALLOW_CFORMAT_P	() {'allow-cformat-p'}
 sub new {
     shift;
     my ($filename) = @_;
-    open my $handle,$filename or die "can't open $filename";
+    #open my $handle,$filename or die "can't open $filename";
+    my $parser = TTParser->new;
+    $parser->build_tokens( $filename );
     bless {
-	    filename => $filename
-	    , handle => $handle
-	    , readahead => []
+      filename => $filename,
+      _parser => $parser
+#     , handle => $handle
+#     , readahead => []
     } , __PACKAGE__;
 }
 
@@ -119,49 +91,16 @@ sub filename {
     return $this->{filename};
 }
 
-sub _handle {
-    my $this = shift;
-    return $this->{handle};
-}
-
 sub fatal_p {
     my $this = shift;
     return $this->{+FATAL_P};
 }
 
+# work around, currently not implemented
 sub syntaxerror_p {
-    my $this = shift;
-    return $this->{+SYNTAXERROR_P};
-}
-
-sub has_readahead_p {
-    my $this = shift;
-    return @{$this->{readahead}};
-}
-
-sub _peek_readahead {
-    my $this = shift;
-    return $this->{readahead}->[$#{$this->{readahead}}];
-}
-
-sub line_number_start {
-    my $this = shift;
-    return $this->{+LINENUM_START};
-}
-
-sub line_number {
-    my $this = shift;
-    return $this->{+LINENUM};
-}
-
-sub cdata_mode_p {
-    my $this = shift;
-    return $this->{+CDATA_MODE_P};
-}
-
-sub pcdata_mode_p {
-    my $this = shift;
-    return $this->{+PCDATA_MODE_P};
+#    my $this = shift;
+#    return $this->{+SYNTAXERROR_P};
+    return 0;
 }
 
 sub js_mode_p {
@@ -169,11 +108,6 @@ sub js_mode_p {
     return $this->{+JS_MODE_P};
 }
 
-sub cdata_close {
-    my $this = shift;
-    return $this->{+CDATA_CLOSE};
-}
-
 sub allow_cformat_p {
     my $this = shift;
     return $this->{+ALLOW_CFORMAT_P};
@@ -187,71 +121,13 @@ sub _set_fatal {
     return $this;
 }
 
-sub _set_syntaxerror {
-    my $this = shift;
-    $this->{+SYNTAXERROR_P} = $_[0];
-    return $this;
-}
-
-sub _push_readahead {
-    my $this = shift;
-    push @{$this->{readahead}}, $_[0];
-    return $this;
-}
-
-sub _pop_readahead {
-    my $this = shift;
-    return pop @{$this->{readahead}};
-}
-
-sub _append_readahead {
-    my $this = shift;
-    $this->{readahead}->[$#{$this->{readahead}}] .= $_[0];
-    return $this;
-}
-
-sub _set_readahead {
-    my $this = shift;
-    $this->{readahead}->[$#{$this->{readahead}}] = $_[0];
-    return $this;
-}
-
-sub _increment_line_number {
-    my $this = shift;
-    $this->{+LINENUM} += 1;
-    return $this;
-}
-
-sub _set_line_number_start {
-    my $this = shift;
-    $this->{+LINENUM_START} = $_[0];
-    return $this;
-}
-
-sub _set_cdata_mode {
-    my $this = shift;
-    $this->{+CDATA_MODE_P} = $_[0];
-    return $this;
-}
-
-sub _set_pcdata_mode {
-    my $this = shift;
-    $this->{+PCDATA_MODE_P} = $_[0];
-    return $this;
-}
-
 sub _set_js_mode {
     my $this = shift;
     $this->{+JS_MODE_P} = $_[0];
     return $this;
 }
 
-sub _set_cdata_close {
-    my $this = shift;
-    $this->{+CDATA_CLOSE} = $_[0];
-    return $this;
-}
-
+#used in xgettext, tmpl_process3 and text-extract2
 sub set_allow_cformat {
     my $this = shift;
     $this->{+ALLOW_CFORMAT_P} = $_[0];
@@ -268,7 +144,7 @@ BEGIN {
 sub parenleft  () { '(' }
 sub parenright () { ')' }
 
-sub split_js ($) {
+sub _split_js ($) {
     my ($s0) = @_;
     my @it = ();
     while (length $s0) {
@@ -320,7 +196,7 @@ sub STATE_STRING_LITERAL () { 3 }
 
 # XXX This is a crazy hack. I don't want to write an ECMAScript parser.
 # XXX A scanner is one thing; a parser another thing.
-sub identify_js_translatables (@) {
+sub _identify_js_translatables (@) {
     my @input = @_;
     my @output = ();
     # We mark a JavaScript translatable string as in C, i.e., _("literal")
@@ -354,555 +230,98 @@ sub identify_js_translatables (@) {
 
 ###############################################################################
 
-sub _extract_attributes ($;$) {
-    my $this = shift;
-    my($s, $lc) = @_;
-    my %attr;
-    $s = $1 if $s =~ /^<(?:(?!$re_directive_control)\S)+(.*)\/\S$/s	# XML-style self-closing tags
-	    || $s =~ /^<(?:(?!$re_directive_control)\S)+(.*)\S$/s;	# SGML-style tags
-
-    for (my $i = 0; $s =~ /^(?:$re_directive_control)?\s+(?:$re_directive_control)?(?:([a-zA-Z][-a-zA-Z0-9]*)\s*=\s*)?('((?:$re_directive|[^'])*)'|"((?:$re_directive|[^"])*)"|((?:$re_directive|[^\s<>])+))/os;) {
-	my($key, $val, $val_orig, $rest)
-		= ($1, (defined $3? $3: defined $4? $4: $5), $2, $');
-	$i += 1;
-	$attr{+lc($key)} = [$key, $val, $val_orig, $i];
-	$s = $rest;
-	if ($val =~ /$re_tmpl_include/os) {
-	    warn_normal "TMPL_INCLUDE in attribute: $val_orig\n", $lc;
-	} elsif ($val =~ /$re_tmpl_var/os && $val !~ /$re_tmpl_var_escaped/os) {
-	    # XXX: we probably should not warn if key is "onclick" etc
-	    # XXX: there's just no reasonable thing to suggest
-	    my $suggest = ($key =~ /^(?:action|archive|background|cite|classid|codebase|data|datasrc|for|href|longdesc|profile|src|usemap)$/i? 'URL': 'HTML');
-	    undef $suggest if $key =~ /^(?:onblur|onchange|onclick|ondblclick|onfocus|onkeydown|onkeypress|onkeyup|onload|onmousedown|onmousemove|onmouseout|onmouseover|onmouseup|onreset|onselect|onsubmit|onunload)$/i;
-	    warn_pedantic
-		    "Suggest ESCAPE=$suggest for TMPL_VAR in attribute \"$key\""
-			. ": $val_orig",
-		    $lc, \$pedantic_tmpl_var_use_in_nonpedantic_mode_p
-		if defined $suggest && (pedantic_p || !$pedantic_tmpl_var_use_in_nonpedantic_mode_p);
-	} elsif ($val_orig !~ /^['"]/) {
-	    my $t = $val; $t =~ s/$re_directive_control//os;
-	    warn_pedantic
-		"Unquoted attribute contains character(s) that should be quoted"
-		    . ": $val_orig",
-		$lc, \$pedantic_attribute_error_in_nonpedantic_mode_p
-		if $t =~ /[^-\.A-Za-z0-9]/s;
-	}
-    }
-    my $s2 = $s; $s2 =~ s/$re_tmpl_endif_endloop//g; # for the next check
-    if ($s2 =~ /\S/s) { # should never happen
-	if ($s =~ /^([^\n]*)\n/s) { # this is even worse
-	    error_normal("Completely confused while extracting attributes: $1", $lc);
-	    error_normal((scalar(split(/\n/, $s)) - 1) . " more line(s) not shown.", undef);
-	    $this->_set_fatal( 1 );
-	} else {
-	    # There's something wrong with the attribute syntax.
-	    # We might be able to deduce a likely cause by looking more.
-	    if ($s =~ /^[a-z0-9]/is && "<foo $s>" =~ /^$re_tag_compat$/s) {
-		warn_normal "Probably missing whitespace before or missing quotation mark near: $s\n", $lc;
-	    } else {
-		warn_normal "Strange attribute syntax: $s\n", $lc;
-	    }
-	}
-    }
-    return \%attr;
-}
-
-sub _next_token_internal {
-    my $this = shift;
-    my($h) = @_;
-    my($it, $kind);
-    my $eof_p = 0;
-    $this->_pop_readahead if $this->has_readahead_p
-	    && !ref $this->_peek_readahead
-	    && !length $this->_peek_readahead;
-    if (!$this->has_readahead_p) {
-	my $next = scalar <$h>;
-	$eof_p = !defined $next;
-	if (!$eof_p) {
-	    $this->_increment_line_number;
-	    $this->_push_readahead( $next );
-	}
-    }
-    $this->_set_line_number_start( $this->line_number ); # remember 1st line num
-    if ($this->has_readahead_p && ref $this->_peek_readahead) {	# TmplToken obj.
-	($it, $kind) = ($this->_pop_readahead, undef);
-    } elsif ($eof_p && !$this->has_readahead_p) {	# nothing left to do
-	;
-    } elsif ($this->_peek_readahead =~ /^\s+/s) {	# whitespace
-	($kind, $it) = (TmplTokenType::TEXT, $&);
-	$this->_set_readahead( $' );
-    # FIXME the following (the [<\s] part) is an unreliable HACK :-(
-    } elsif ($this->_peek_readahead =~ /^(?:[^<]|<[<\s])*(?:[^<\s])/s) {	# non-space normal text
-	($kind, $it) = (TmplTokenType::TEXT, $&);
-	$this->_set_readahead( $' );
-	warn_normal "Unescaped < in $it\n", $this->line_number_start
-		if !$this->cdata_mode_p && $it =~ /</s;
-    } else {				# tag/declaration/processing instruction
-	my $ok_p = 0;
-	my $bad_comment_p = 0;
-	for (my $cdata_close = $this->cdata_close;;) {
-	    if ($this->cdata_mode_p) {
-		my $next = $this->_pop_readahead;
-		if ($next =~ /^$cdata_close/is) {
-		    ($kind, $it) = (TmplTokenType::TAG, $&);
-		    $this->_push_readahead( $' );
-		    $ok_p = 1;
-		} elsif ($next =~ /^((?:(?!$cdata_close).)+)($cdata_close)/is) {
-		    ($kind, $it) = (TmplTokenType::TEXT, $1);
-		    $this->_push_readahead( "$2$'" );
-		    $ok_p = 1;
-		} else {
-		    ($kind, $it) = (TmplTokenType::TEXT, $next);
-		    $ok_p = 1;
-		}
-	    } elsif ($this->_peek_readahead =~ /^$re_tag_compat/os) {
-		# If we detect a "closed start tag" but we know that the
-		# following token looks like a TMPL_VAR, don't stop
-		my($head, $tail, $post) = ($1, $2, $3);
-		if ($tail eq '' && $post =~ $re_tmpl_var) {
-		    # Don't bother to show the warning if we're too confused
-		    # FIXME. There's no method for _closed_start_tag_warning
-		    if (!defined $this->{'_closed_start_tag_warning'}
-			|| ($this->{'_closed_start_tag_warning'}->[0] eq $head
-			&& $this->{'_closed_start_tag_warning'}->[1] != $this->line_number - 1)) {
-		    warn_normal "Possible SGML \"closed start tag\" notation: $head<\n", $this->line_number
-			    if split(/\n/, $head) < 10;
-		    }
-		    $this->{'_closed_start_tag_warning'} = [$head, $this->line_number];
-		} else {
-		    ($kind, $it) = (TmplTokenType::TAG, "$head>");
-		    $this->_set_readahead( $post );
-		    $ok_p = 1;
-		    warn_normal "SGML \"closed start tag\" notation: $head<\n", $this->line_number if $tail eq '' 
-                and $head ne '<!DOCTYPE stylesheet ['; # another bit of temporary ugliness for bug 4472
-		}
-	    } elsif ($this->_peek_readahead =~ /^<!--(?:(?!-->)$re_directive*.)*-->/os) {
-		($kind, $it) = (TmplTokenType::COMMENT, $&);
-		$this->_set_readahead( $' );
-		$ok_p = 1;
-		$bad_comment_p = 1;
-	    }
-	last if $ok_p;
-	    my $next = scalar <$h>;
-	    $eof_p = !defined $next;
-	last if $eof_p;
-	    $this->_increment_line_number;
-	    $this->_append_readahead( $next );
-	}
-	if ($kind ne TmplTokenType::TAG) {
-	    ;
-	} elsif ($it =~ /^<!/) {
-	    $kind = TmplTokenType::DECL;
-	    $kind = TmplTokenType::COMMENT if $it =~ /^<!--(?:(?!-->).)*-->/;
-	    if ($kind == TmplTokenType::COMMENT && $it =~ /^<!--\s*#include/s) {
-		warn_normal "Apache #include directive found instead of HTML::Template <TMPL_INCLUDE> directive", $this->line_number_start;
-	    }
-	} elsif ($it =~ /^<\?/) {
-	    $kind = TmplTokenType::PI;
-	}
-	if ($it =~ /^$re_directive/ios && !$this->cdata_mode_p) {
-	    $kind = TmplTokenType::DIRECTIVE;
-	} elsif ($bad_comment_p) {
-	    warn_normal sprintf("Syntax error in comment: %s\n", $it),
-		    $this->line_number_start;
-	    $this->_set_syntaxerror( 1 );
-	}
-	if (!$ok_p && $eof_p) {
-	    ($kind, $it) = (TmplTokenType::UNKNOWN, $this->_peek_readahead);
-	    $this->_set_readahead, undef;
-	    $this->_set_syntaxerror( 1 );
-	}
-    }
-    warn_normal "Unrecognizable token found: "
-	    . (split(/\n/, $it) < 10? $it: '(too confused to show details)')
-	    . "\n", $this->line_number_start
-	if $kind == TmplTokenType::UNKNOWN;
-    return defined $it? (ref $it? $it: TmplToken->new($it, $kind, $this->line_number, $this->filename)): undef;
-}
-
-sub _next_token_intermediate {
-    my $this = shift;
-    my $h = $this->_handle;
-    my $it;
-    if (!$this->cdata_mode_p) {
-	$it = $this->_next_token_internal($h);
-	if (defined $it && $it->type == TmplTokenType::TAG) {
-	    if ($it->string =~ /^<(script|style|textarea)\b/is ||
-            ($this->filename =~ /(opensearch)|(opac-showreviews-rss)/ && $it->string =~ /^<(description)\b/) # FIXME special case to handle
-                                                                                    # a CDATA in opac-opensearch.tmpl and opac-showreviews-rss.tmpl
-           ) {
-		$this->_set_cdata_mode( 1 );
-		$this->_set_cdata_close( "</$1\\s*>" );
-		$this->_set_pcdata_mode( 0 );
-		$this->_set_js_mode( lc($1) eq 'script' );
-#	    } elsif ($it->string =~ /^<(title)\b/is) {
-#		$this->_set_cdata_mode( 1 );
-#		$this->_set_cdata_close( "</$1\\s*>" );
-#		$this->_set_pcdata_mode( 1 );
-	    }
-	    $it->set_attributes( $this->_extract_attributes($it->string, $it->line_number) );
-	}
-    } else {
-	my $eof_p = 0;
-	for ($it = '', my $cdata_close = $this->cdata_close;;) {
-	    my $next = $this->_next_token_internal($h);
-	    $eof_p = !defined $next;
-	last if $eof_p;
-	    if (defined $next && $next->string =~ /$cdata_close/is) {
-		$this->_push_readahead( $next ); # push entire TmplToken object
-		$this->_set_cdata_mode( 0 );
-	    }
-	last unless $this->cdata_mode_p;
-	    $it .= $next->string;
-	}
-	if ($eof_p) {
-	    $it = undef;
-	    error_normal "Unexpected end of file while looking for "
-		    . $this->cdata_close
-		    . "\n", $this->line_number_start;
-	    $this->_set_fatal( 1 );
-	    $this->_set_syntaxerror( 1 );
-	}
-	if ($this->pcdata_mode_p) {
-	    my $check = $it;
-	    $check =~ s/$re_directive//gos;
-	    warn_pedantic "Markup found in PCDATA\n", $this->line_number,
-			    \$pedantic_error_markup_in_pcdata_p
-		    if $check =~ /$re_tag_compat/s;
-	}
-	# PCDATA should be treated as text, not CDATA
-	# Actually it should be treated as TEXT_PARAMETRIZED :-(
-	$it = TmplToken->new( $it,
-			($this->pcdata_mode_p?
-			    TmplTokenType::TEXT: TmplTokenType::CDATA),
-			$this->line_number, $this->filename )
-		if defined $it;
-	if ($this->js_mode_p) {
-	    my $s0 = $it->string;
-	    my @head = ();
-	    my @tail = ();
-	    if ($s0 =~ /^(\s*<!--\s*)(.*)(\s*--\s*>\s*)$/s) {
-		push @head, $1;
-		push @tail, $3;
-		$s0 = $2;
-	    }
-	    push @head, split_js $s0;
-	    $it->set_js_data( identify_js_translatables(@head, @tail) );
-	}
-	$this->_set_pcdata_mode, 0;
-	$this->_set_cdata_close, undef unless !defined $it;
-    }
-    return $it;
-}
-
-sub _token_groupable1_p ($) { # as first token, groupable into TEXT_PARAMETRIZED
-    my($t) = @_;
-    return ($t->type == TmplTokenType::TEXT && $t->string !~ /^[,\.:\|\s]+$/is)
-	|| ($t->type == TmplTokenType::DIRECTIVE
-		&& $t->string =~ /^(?:$re_tmpl_var)$/os)
-	|| ($t->type == TmplTokenType::TAG
-		&& ($t->string =~ /^<(?:a|b|em|h[123456]|i|u)\b/is
-		|| ($t->string =~ /^<input\b/is
-		    && $t->attributes->{'type'}->[1] =~ /^(?:radio|text)$/is)
-		    ))
-}
-
-sub _token_groupable2_p ($) { # as other token, groupable into TEXT_PARAMETRIZED
-    my($t) = @_;
-    return ($t->type == TmplTokenType::TEXT && ($t->string =~ /^\s*$/s || $t->string !~ /^[\|\s]+$/is))
-	|| ($t->type == TmplTokenType::DIRECTIVE
-		&& $t->string =~ /^(?:$re_tmpl_var)$/os)
-	|| ($t->type == TmplTokenType::TAG
-		&& ($t->string =~ /^<\/?(?:a|b|em|h[123456]|i|u)\b/is
-		|| ($t->string =~ /^<input\b/is
-		    && $t->attributes->{'type'}->[1] =~ /^(?:radio|text)$/is)))
-}
-
-sub _quote_cformat ($) {
-    my($s) = @_;
-    $s =~ s/%/%%/g;
-    return $s;
-}
-
 sub string_canon ($) {
-    my($s) = @_;
-    if (1) { # FIXME
-	# Fold all whitespace into single blanks
-	$s =~ s/\s+/ /gs;
+  my $s = shift;
+  # Fold all whitespace into single blanks
+  $s =~ s/\s+/ /gs;
+  return $s;
+}
+
+sub _quote_cformat{
+  my $s = shift;
+  $s =~ s/%/%%/g;
+  return $s;
+}
+
+sub _formalize_string_cformat{
+  my $s = shift;
+  return _quote_cformat( string_canon $s );
+}
+
+sub _formalize{
+  my $t = shift;
+  if( $t->type == TmplTokenType::DIRECTIVE ){
+    return '%s';
+  } elsif( $t->type == TmplTokenType::TEXT ){
+    return _formalize_string_cformat( $t->string );
+  } elsif( $t->type == TmplTokenType::TAG ){
+    if( $t->string =~ m/^a\b/is ){
+      return '<a>';
+    } elsif( $t->string =~ m/^input\b/is ){
+      if( lc $t->attributes->{'type'}->[1] eq 'text' ){
+        return '%S';
+      } else{
+        return '%p';
+      }
+    } else{
+      return _quote_cformat $t->string;
     }
-    return $s;
+  } else{
+    return _quote_cformat $t->string;
+  }
 }
 
-sub _formalize_string_cformat ($) {
-    my($s) = @_;
-    return _quote_cformat string_canon $s;
-}
-
-sub _formalize ($) {
-    my($t) = @_;
-    return $t->type == TmplTokenType::DIRECTIVE? '%s':
-	   $t->type == TmplTokenType::TEXT?
-		   _formalize_string_cformat($t->string):
-	   $t->type == TmplTokenType::TAG?
-		   ($t->string =~ /^<a\b/is? '<a>':
-		    $t->string =~ /^<input\b/is? (
-			    lc $t->attributes->{'type'}->[1] eq 'text' ? '%S':
-			    '%p'):
-		    _quote_cformat($t->string)):
-	       _quote_cformat($t->string);
-}
-
-sub _optimize {
+# internal parametization, used within next_token
+# method that takes in an array of TEXT and DIRECTIVE tokens (DIRECTIVEs must be GET) and return a TmplTokenType::TEXT_PARAMETRIZED
+sub _parametrize_internal{
     my $this = shift;
-    my @structure = @_;
-    my $undo_trailing_blanks = sub {
-		for (my $i = $#structure; $i >= 0; $i -= 1) {
-		last unless ($structure[$i]->type == TmplTokenType::TEXT && blank_p($structure[$i]->string)) ;#|| ($structure[$i]->type == TmplTokenType::TAG && $structure[$i]->string =~ /^<br\b/is);
-		    # Queue element structure: [reanalysis-p, token]
-		    push @{$this->{_queue}}, [1, pop @structure];
-		}
-	    };
-    &$undo_trailing_blanks;
-    while (@structure >= 2) {
-	my $something_done_p = 0;
-	# FIXME: If the last token is a close tag but there are no tags
-	# FIXME: before it, drop the close tag back into the queue. This
-	# FIXME: is an ugly hack to get rid of "foo %s</h1>" type mess.
-	if (@structure >= 2
-		&& $structure[$#structure]->type == TmplTokenType::TAG
-		&& $structure[$#structure]->string =~ /^<\//s) {
-	    my $has_other_tags_p = 0;
-	    for (my $i = 0; $i < $#structure; $i += 1) {
-		$has_other_tags_p = 1
-			if $structure[$i]->type == TmplTokenType::TAG;
-	    last if $has_other_tags_p;
-	    }
-	    if (!$has_other_tags_p) {
-		push @{$this->{_queue}}, [0, pop @structure]
-		&$undo_trailing_blanks;
-		$something_done_p = 1;
-	    }
-	}
-	# FIXME: Do the same ugly hack for the last token being a ( or [
-	if (@structure >= 2
-		&& $structure[$#structure]->type == TmplTokenType::TEXT
-		&& $structure[$#structure]->string =~ /^[\(\[]$/) { # not )]
-	    push @{$this->{_queue}}, [1, pop @structure];
-	    &$undo_trailing_blanks;
-	    $something_done_p = 1;
-	}
-	# FIXME: If the first token is an open tag, but there is no
-	# FIXME: corresponding close tag, "drop the open tag", i.e.,
-	# FIXME: requeue everything for reanalysis, except the frist tag. :-(
-	if (@structure >= 2
-		&& $structure[0]->type == TmplTokenType::TAG
-		&& $structure[0]->string =~ /^<([a-z0-9]+)/is
-		&& (my $tag = $1) !~ /^(?:br|hr|img|input)\b/is
-	) {
-	    my $tag_open_count = 1;
-	    for (my $i = 1; $i <= $#structure; $i += 1) {
-		if ($structure[$i]->type == TmplTokenType::TAG) {
-		    if ($structure[$i]->string =~ /^<(\/?)$tag\b/is) {
-			$tag_open_count += ($1? -1: +1);
-		    }
-		}
-	    }
-	    if ($tag_open_count > 0) {
-		for (my $i = $#structure; $i; $i -= 1) {
-		    push @{$this->{_queue}}, [1, pop @structure];
-		}
-		$something_done_p = 1;
-	    }
-	}
-	# FIXME: If the first token is an open tag, the last token is the
-	# FIXME: corresponding close tag, and there are no other close tags 
-	# FIXME: inbetween, requeue the tokens from the second token on,
-	# FIXME: flagged as ok for re-analysis
-	if (@structure >= 3
-		&& $structure[0]->type == TmplTokenType::TAG
-		&& $structure[0]->string =~ /^<([a-z0-9]+)/is && (my $tag = $1)
-		&& $structure[$#structure]->type == TmplTokenType::TAG
-		&& $structure[$#structure]->string =~ /^<\/$1\s*>$/is) {
-	    my $has_other_open_or_close_tags_p = 0;
-	    for (my $i = 1; $i < $#structure; $i += 1) {
-		$has_other_open_or_close_tags_p = 1
-			if $structure[$i]->type == TmplTokenType::TAG
-			&& $structure[$i]->string =~ /^<\/?$tag\b/is;
-	    last if $has_other_open_or_close_tags_p;
-	    }
-	    if (!$has_other_open_or_close_tags_p) {
-		for (my $i = $#structure; $i; $i -= 1) {
-		    push @{$this->{_queue}}, [1, pop @structure];
-		}
-		$something_done_p = 1;
-	    }
-	}
-    last if !$something_done_p;
-    }
-    return @structure;
-}
-
-sub looks_plausibly_like_groupable_text_p (@) {
-    my @structure = @_;
-    # The text would look plausibly groupable if all open tags are also closed.
-    my @tags = ();
-    my $error_p = 0;
-    for (my $i = 0; $i <= $#structure; $i += 1) {
-	if ($structure[$i]->type == TmplTokenType::TAG) {
-	    my $form = $structure[$i]->string;
-	    if ($form =~ /^<([A-Z0-9]+)/is) {
-		my $tag = lc($1);
-		if ($tag !~ /^(?:br|input)$/is && $form !~ /\/>$/is) {
-		    push @tags, $tag;
-		}
-	    } elsif ($form =~ /^<\/([A-Z0-9]+)/is) {
-		if (@tags && lc($1) eq $tags[$#tags]) {
-		    pop @tags;
-		} else {
-		    $error_p = 1;
-		}
-	    }
-	} elsif ($structure[$i]->type != TmplTokenType::TEXT) {
-	    $error_p = 1;
-	}
-    last if $error_p;
-    }
-    return !$error_p && !@tags;
+    my @parts = @_;
+    # my $s = "";
+    # for my $item (@parts){
+    #     if( $item->type == TmplTokenType::TEXT ){
+    #         $s .= $item->string;
+    #     } else {
+    #         #must be a variable directive
+    #         $s .= "%s";
+    #     }
+    # }
+    my $s = join( "", map { _formalize $_ } @parts );
+    # should both the string and form be $s? maybe only the later? posibly the former....
+    # used line number from first token, should suffice
+    my $t = TmplToken->new( $s, TmplTokenType::TEXT_PARAMETRIZED, $parts[0]->line_number, $this->filename );
+    $t->set_children(@parts);
+    $t->set_form($s);
+    return $t;
 }
 
 sub next_token {
-    my $this = shift;
-    my $h = $this->_handle;
-    my $it;
-    $this->{_queue} = [] unless defined $this->{_queue};
-
-    # Elements in the queue are ordered pairs. The first in the ordered pair
-    # specifies whether we are allowed to reanalysis; the second is the token.
-    if (@{$this->{_queue}} && !$this->{_queue}->[$#{$this->{_queue}}]->[0]) {
-	$it = (pop @{$this->{_queue}})->[1];
-    } else {
-	if (@{$this->{_queue}}) {
-	    $it = (pop @{$this->{_queue}})->[1];
-	} else {
-	    $it = $this->_next_token_intermediate($h);
-	}
-	if (!$this->cdata_mode_p && $this->allow_cformat_p && defined $it
-	    && ($it->type == TmplTokenType::TEXT?
-		!blank_p( $it->string ): _token_groupable1_p( $it ))) {
-	    my @structure = ( $it );
-	    my @tags = ();
-	    my $next = undef;
-	    my($nonblank_text_p, $parametrized_p, $with_anchor_p, $with_input_p) = (0, 0, 0, 0);
-	    if ($it->type == TmplTokenType::TEXT) {
-		$nonblank_text_p = 1 if !blank_p( $it->string );
-	    } elsif ($it->type == TmplTokenType::DIRECTIVE) {
-		$parametrized_p = 1;
-	    } elsif ($it->type == TmplTokenType::TAG && $it->string =~ /^<([A-Z0-9]+)/is) {
-		my $tag = lc($1);
-		push @tags, $tag if $tag !~ /^(?:br|input)$/i;
-		$with_anchor_p = 1 if $tag eq 'a';
-		$with_input_p = 1 if $tag eq 'input';
-	    }
-	    # We hate | and || in msgid strings, so we try to avoid them
-	    for (my $i = 1, my $quit_p = 0, my $quit_next_p = ($it->type == TmplTokenType::TEXT && $it->string =~ /^\|+$/s);; $i += 1) {
-		if (@{$this->{_queue}}) {
-		    $next = (pop @{$this->{_queue}})->[1];
-		} else {
-		    $next = $this->_next_token_intermediate($h);
-		}
-		push @structure, $next; # for consistency (with initialization)
-	    last unless defined $next && _token_groupable2_p( $next );
-	    last if $quit_next_p;
-		if ($next->type == TmplTokenType::TEXT) {
-		    $nonblank_text_p = 1 if !blank_p( $next->string );
-		    $quit_p = 1 if $next->string =~ /^\|+$/s; # We hate | and ||
-		} elsif ($next->type == TmplTokenType::DIRECTIVE) {
-		    $parametrized_p = 1;
-		} elsif ($next->type == TmplTokenType::TAG) {
-		    if ($next->string =~ /^<([A-Z0-9]+)/is) {
-			my $tag = lc($1);
-			push @tags, $tag if $tag !~ /^(?:br|input)$/i;
-			$with_anchor_p = 1 if $tag eq 'a';
-			$with_input_p = 1 if $tag eq 'input';
-		    } elsif ($next->string =~ /^<\/([A-Z0-9]+)/is) {
-			my $close = lc($1);
-			$quit_p = 1 unless @tags && $close eq $tags[$#tags];
-			$quit_next_p = 1 if $close =~ /^h\d$/;
-			pop @tags;
-		    }
-		}
-	    last if $quit_p;
-	    }
-	    # Undo the last token, allowing reanalysis
-	    push @{$this->{_queue}}, [1, pop @structure];
-	    # Simply it a bit more
-	    @structure = $this->_optimize( @structure );
-	    if (@structure < 2) {
-		# Nothing to do
-		;
-	    } elsif ($nonblank_text_p && ($parametrized_p || $with_anchor_p || $with_input_p)) {
-		# Create the corresponding c-format string
-		my $string = join('', map { $_->string } @structure);
-		my $form = join('', map { _formalize $_ } @structure);
-		my($a_counter, $input_counter) = (0, 0);
-		$form =~ s/<a>/ $a_counter += 1, "<a$a_counter>" /egs;
-		$form =~ s/<input>/ $input_counter += 1, "<input$input_counter>" /egs;
-		$it = TmplToken->new($string, TmplTokenType::TEXT_PARAMETRIZED,
-			$it->line_number, $it->pathname);
-		$it->set_form( $form );
-		$it->set_children( @structure );
-	    } elsif ($nonblank_text_p
-		    && looks_plausibly_like_groupable_text_p( @structure )
-		    && $structure[$#structure]->type == TmplTokenType::TEXT) {
-		# Combine the strings
-		my $string = join('', map { $_->string } @structure);
-		$it = TmplToken->new($string, TmplTokenType::TEXT,
-			$it->line_number, $it->pathname);;
-	    } else {
-		# Requeue the tokens thus seen for re-emitting, allow reanalysis
-		for (;;) {
-		    push @{$this->{_queue}}, [1, pop @structure];
-		last if !@structure;
-		}
-		$it = (pop @{$this->{_queue}})->[1];
-	    }
-	}
+    my $self = shift;
+    my $next;
+    # parts that make up a text_parametrized (future children of the token)
+    my @parts = ();
+    while(1){
+        $next = $self->{_parser}->next_token;
+        return undef unless defined $next;
+        # if cformat mode is off, dont bother parametrizing, just return them as they come
+        return $next unless $self->allow_cformat_p;
+        if( $next->type == TmplTokenType::TEXT ){
+            push @parts, $next;
+        } elsif( $next->type == TmplTokenType::DIRECTIVE && $next->string =~ m/\[%\s*\w+\s*%\]/ ){
+            push @parts, $next;
+        } else{
+            # if there is nothing in parts, return this token
+            return $next unless @parts;
+            # OTHERWISE, put this token back and return the parametrized string of @parts
+            $self->{_parser}->return_token($next);
+            return $self->_parametrize_internal(@parts);
+        }
     }
-    if (defined $it && $it->type == TmplTokenType::TEXT) {
-	my $form = string_canon $it->string;
-	$it->set_form( $form );
-    }
-    return $it;
 }
 
 ###############################################################################
 
-# Other simple functions (These are not methods)
-
-sub blank_p ($) {
-    my($s) = @_;
-    return $s =~ /^(?:\s|\&nbsp$re_end_entity|$re_tmpl_var|$re_xsl)*$/os;
-}
-
-sub trim ($) {
-    my($s0) = @_;
-    my $l0 = length $s0;
-    my $s = $s0;
-    $s =~ s/^(\s|\&nbsp$re_end_entity)+//os; my $l1 = $l0 - length $s;
-    $s =~ s/(\s|\&nbsp$re_end_entity)+$//os; my $l2 = $l0 - $l1 - length $s;
-    return wantarray? (substr($s0, 0, $l1), $s, substr($s0, $l0 - $l2)): $s;
-}
-
-sub quote_po ($) {
-    my($s) = @_;
-    # Locale::PO->quote is buggy, it doesn't quote newlines :-/
-    $s =~ s/([\\"])/\\\1/gs;
-    $s =~ s/\n/\\n/g;
-    #$s =~ s/[\177-\377]/ sprintf("\\%03o", ord($&)) /egs;
-    return "\"$s\"";
-}
-
-# Some functions that shouldn't be here... should be moved out some time
+# ugly method taken from old version
+# used by tmpl_process3
 sub parametrize ($$$$) {
     my($fmt_0, $cformat_p, $t, $f) = @_;
     my $it = '';
@@ -993,6 +412,32 @@ sub parametrize ($$$$) {
     return $it;
 }
 
+
+# Other simple functions (These are not methods)
+
+sub blank_p ($) {
+    my($s) = @_;
+    return $s =~ /^(?:\s|\&nbsp$re_end_entity|$re_tmpl_var|$re_xsl)*$/osi;
+}
+
+sub trim ($) {
+    my($s0) = @_;
+    my $l0 = length $s0;
+    my $s = $s0;
+    $s =~ s/^(\s|\&nbsp$re_end_entity)+//os; my $l1 = $l0 - length $s;
+    $s =~ s/(\s|\&nbsp$re_end_entity)+$//os; my $l2 = $l0 - $l1 - length $s;
+    return wantarray? (substr($s0, 0, $l1), $s, substr($s0, $l0 - $l2)): $s;
+}
+
+sub quote_po ($) {
+    my($s) = @_;
+    # Locale::PO->quote is buggy, it doesn't quote newlines :-/
+    $s =~ s/([\\"])/\\\1/gs;
+    $s =~ s/\n/\\n/g;
+    #$s =~ s/[\177-\377]/ sprintf("\\%03o", ord($&)) /egs;
+    return "\"$s\"";
+}
+
 sub charset_canon ($) {
     my($charset) = @_;
     $charset = uc($charset);
diff --git a/misc/translator/tmpl_process3.pl b/misc/translator/tmpl_process3.pl
index b916cab649..dd6cf5cd66 100755
--- a/misc/translator/tmpl_process3.pl
+++ b/misc/translator/tmpl_process3.pl
@@ -227,8 +227,8 @@ my $action = shift or usage_error('You must specify an ACTION.');
 usage_error('You must at least specify input and string list filenames.')
     if !@in_files || !defined $str_file;
 
-# Type match defaults to *.tmpl plus *.inc if not specified
-$type = "tmpl|inc|xsl" if !defined($type);
+# Type match defaults to *.tt plus *.inc if not specified
+$type = "tt|inc|xsl" if !defined($type);
 
 # Check the inputs for being files or directories
 for my $input (@in_files) {
-- 
2.20.1