misc/translator/TmplTokenizer.pm

   1 package TmplTokenizer;
   2
   3 use strict;
   4 use TmplTokenType;
   5 use TmplToken;
   6 use VerboseWarnings qw( pedantic_p error_normal warn_normal warn_pedantic );
   7 require Exporter;
   8
   9 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  10
  11 ###############################################################################
  12
  13 =head1 NAME
  14
  15 TmplTokenizer.pm - Simple-minded tokenizer class for HTML::Template .tmpl files
  16
  17 =head1 DESCRIPTION
  18
  19 Because .tmpl files contains HTML::Template directives
  20 that tend to confuse real parsers (e.g., HTML::Parse),
  21 it might be better to create a customized scanner
  22 to scan the template files for tokens.
  23 This module is a simple-minded attempt at such a scanner.
  24
  25 =cut
  26
  27 ###############################################################################
  28
  29 $VERSION = 0.02;
  30
  31 @ISA = qw(Exporter);
  32 @EXPORT_OK = qw();
  33
  34 use vars qw( $pedantic_attribute_error_in_nonpedantic_mode_p );
  35 use vars qw( $pedantic_tmpl_var_use_in_nonpedantic_mode_p );
  36
  37 ###############################################################################
  38
  39 # Hideous stuff
  40 use vars qw( $re_directive $re_tmpl_var $re_tmpl_var_escaped $re_tmpl_include );
  41 use vars qw( $re_directive_control $re_tmpl_endif_endloop );
  42 BEGIN {
  43     # $re_directive must not do any backreferences
  44     $re_directive = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:VAR|LOOP|INCLUDE|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
  45     # TMPL_VAR or TMPL_INCLUDE
  46     $re_tmpl_var = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
  47     $re_tmpl_include = q{<(?:(?i)(?:!--\s*)?TMPL_(?:INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
  48     # TMPL_VAR ESCAPE=1/HTML/URL
  49     $re_tmpl_var_escaped = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR|INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))\s+ESCAPE=(?:1|HTML|URL)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
  50     # Any control flow directive
  51     $re_directive_control = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:LOOP|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
  52     # /LOOP or /IF or /UNLESS
  53     $re_tmpl_endif_endloop = q{<(?:(?i)(?:!--\s*)?\/TMPL_(?:LOOP|IF|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
  54 }
  55
  56 # Hideous stuff from subst.pl, slightly modified to use the above hideous stuff
  57 # Note: The $re_tag's set $1 (<tag), $2 (>), and $3 (rest of string)
  58 use vars qw( $re_comment $re_entity_name $re_end_entity $re_etag );
  59 use vars qw( $re_tag_strict $re_tag_compat @re_tag );
  60 sub re_tag ($) {
  61    my($compat) = @_;
  62    my $etag = $compat? '>': '<>\/';
  63    # This is no longer similar to the original regexp in subst.pl :-(
  64    # Note that we don't want <> in compat mode; Mozilla knows about <
  65    q{(<\/?(?:|(?:"(?:} . $re_directive . q{|[^"])*"|'(?:} . $re_directive . q{|[^'])*'|--(?:[^-]|-[^-])*--|(?:}
  66    . $re_directive
  67    . q{|(?!--)[^"'<>} . $etag . q{]))+))([} . $etag . q{]|(?=<))(.*)};
  68 }
  69 BEGIN {
  70     $re_comment = '(?:--(?:[^-]|-[^-])*--)';
  71     $re_entity_name = '(?:[^&%#;<>\s]+)'; # NOTE: not really correct SGML
  72     $re_end_entity = '(?:;|$|(?=\s))'; # semicolon or before-whitespace
  73     $re_etag = q{(?:<\/?(?:"[^"]*"|'[^']*'|[^"'>\/])*[>\/])}; # end-tag
  74     @re_tag = ($re_tag_strict, $re_tag_compat) = (re_tag(0), re_tag(1));
  75 }
  76
  77 # End of the hideous stuff
  78
  79 use vars qw( $serial );
  80
  81 ###############################################################################
  82
  83 sub FATAL_P             () {'fatal-p'}
  84 sub SYNTAXERROR_P       () {'syntaxerror-p'}
  85
  86 sub FILENAME            () {'input'}
  87 sub HANDLE              () {'handle'}
  88
  89 sub READAHEAD           () {'readahead'}
  90 sub LINENUM_START       () {'lc_0'}
  91 sub LINENUM             () {'lc'}
  92 sub CDATA_MODE_P        () {'cdata-mode-p'}
  93 sub CDATA_CLOSE         () {'cdata-close'}
  94
  95 sub ALLOW_CFORMAT_P     () {'allow-cformat-p'}
  96
  97 sub new {
  98     my $this = shift;
  99     my($input) = @_;
 100     my $class = ref($this) || $this;
 101     my $self = {};
 102     bless $self, $class;
 103
 104     my $handle = sprintf('TMPLTOKENIZER%d', $serial);
 105     $serial += 1;
 106
 107     no strict;
 108     open($handle, "<$input") || die "$input: $!\n";
 109     use strict;
 110     $self->{+FILENAME} = $input;
 111     $self->{+HANDLE} = $handle;
 112     $self->{+READAHEAD} = [];
 113     return $self;
 114 }
 115
 116 ###############################################################################
 117
 118 # Simple getters
 119
 120 sub filename {
 121     my $this = shift;
 122     return $this->{+FILENAME};
 123 }
 124
 125 sub _handle {
 126     my $this = shift;
 127     return $this->{+HANDLE};
 128 }
 129
 130 sub fatal_p {
 131     my $this = shift;
 132     return $this->{+FATAL_P};
 133 }
 134
 135 sub syntaxerror_p {
 136     my $this = shift;
 137     return $this->{+SYNTAXERROR_P};
 138 }
 139
 140 sub has_readahead_p {
 141     my $this = shift;
 142     return @{$this->{+READAHEAD}};
 143 }
 144
 145 sub _peek_readahead {
 146     my $this = shift;
 147     return $this->{+READAHEAD}->[$#{$this->{+READAHEAD}}];
 148 }
 149
 150 sub line_number_start {
 151     my $this = shift;
 152     return $this->{+LINENUM_START};
 153 }
 154
 155 sub line_number {
 156     my $this = shift;
 157     return $this->{+LINENUM};
 158 }
 159
 160 sub cdata_mode_p {
 161     my $this = shift;
 162     return $this->{+CDATA_MODE_P};
 163 }
 164
 165 sub cdata_close {
 166     my $this = shift;
 167     return $this->{+CDATA_CLOSE};
 168 }
 169
 170 sub allow_cformat_p {
 171     my $this = shift;
 172     return $this->{+ALLOW_CFORMAT_P};
 173 }
 174
 175 # Simple setters
 176
 177 sub _set_fatal {
 178     my $this = shift;
 179     $this->{+FATAL_P} = $_[0];
 180     return $this;
 181 }
 182
 183 sub _set_syntaxerror {
 184     my $this = shift;
 185     $this->{+SYNTAXERROR_P} = $_[0];
 186     return $this;
 187 }
 188
 189 sub _push_readahead {
 190     my $this = shift;
 191     push @{$this->{+READAHEAD}}, $_[0];
 192     return $this;
 193 }
 194
 195 sub _pop_readahead {
 196     my $this = shift;
 197     return pop @{$this->{+READAHEAD}};
 198 }
 199
 200 sub _append_readahead {
 201     my $this = shift;
 202     $this->{+READAHEAD}->[$#{$this->{+READAHEAD}}] .= $_[0];
 203     return $this;
 204 }
 205
 206 sub _set_readahead {
 207     my $this = shift;
 208     $this->{+READAHEAD}->[$#{$this->{+READAHEAD}}] = $_[0];
 209     return $this;
 210 }
 211
 212 sub _increment_line_number {
 213     my $this = shift;
 214     $this->{+LINENUM} += 1;
 215     return $this;
 216 }
 217
 218 sub _set_line_number_start {
 219     my $this = shift;
 220     $this->{+LINENUM_START} = $_[0];
 221     return $this;
 222 }
 223
 224 sub _set_cdata_mode {
 225     my $this = shift;
 226     $this->{+CDATA_MODE_P} = $_[0];
 227     return $this;
 228 }
 229
 230 sub _set_cdata_close {
 231     my $this = shift;
 232     $this->{+CDATA_CLOSE} = $_[0];
 233     return $this;
 234 }
 235
 236 sub set_allow_cformat {
 237     my $this = shift;
 238     $this->{+ALLOW_CFORMAT_P} = $_[0];
 239     return $this;
 240 }
 241
 242 ###############################################################################
 243
 244 sub _extract_attributes ($;$) {
 245     my $this = shift;
 246     my($s, $lc) = @_;
 247     my %attr;
 248     $s = $1 if $s =~ /^<\S+(.*)\/\S$/s  # XML-style self-closing tags
 249             || $s =~ /^<\S+(.*)\S$/s;   # SGML-style tags
 250
 251     for (my $i = 0; $s =~ /^(?:$re_directive_control)?\s+(?:$re_directive_control)?(?:([a-zA-Z][-a-zA-Z0-9]*)\s*=\s*)?('((?:$re_directive|[^'])*)'|"((?:$re_directive|[^"])*)"|((?:$re_directive|[^\s<>])+))/os;) {
 252         my($key, $val, $val_orig, $rest)
 253                 = ($1, (defined $3? $3: defined $4? $4: $5), $2, $');
 254         $i += 1;
 255         $attr{+lc($key)} = [$key, $val, $val_orig, $i];
 256         $s = $rest;
 257         if ($val =~ /$re_tmpl_include/os) {
 258             warn_normal "TMPL_INCLUDE in attribute: $val_orig\n", $lc;
 259         } elsif ($val =~ /$re_tmpl_var/os && $val !~ /$re_tmpl_var_escaped/os) {
 260             # XXX: we probably should not warn if key is "onclick" etc
 261             # XXX: there's just no reasonable thing to suggest
 262             my $suggest = ($key =~ /^(?:action|archive|background|cite|classid|codebase|data|datasrc|for|href|longdesc|profile|src|usemap)$/i? 'URL': 'HTML');
 263             undef $suggest if $key =~ /^(?:onblur|onchange|onclick|ondblclick|onfocus|onkeydown|onkeypress|onkeyup|onload|onmousedown|onmousemove|onmouseout|onmouseover|onmouseup|onreset|onselect|onsubmit|onunload)$/i;
 264             warn_pedantic
 265                     "Suggest ESCAPE=$suggest for TMPL_VAR in attribute \"$key\""
 266                         . ": $val_orig",
 267                     $lc, \$pedantic_tmpl_var_use_in_nonpedantic_mode_p
 268                 if defined $suggest && (pedantic_p || !$pedantic_tmpl_var_use_in_nonpedantic_mode_p);
 269         } elsif ($val_orig !~ /^['"]/) {
 270             my $t = $val; $t =~ s/$re_directive_control//os;
 271             warn_pedantic
 272                 "Unquoted attribute contains character(s) that should be quoted"
 273                     . ": $val_orig",
 274                 $lc, \$pedantic_attribute_error_in_nonpedantic_mode_p
 275                 if $t =~ /[^-\.A-Za-z0-9]/s;
 276         }
 277     }
 278     my $s2 = $s; $s2 =~ s/$re_tmpl_endif_endloop//g; # for the next check
 279     if ($s2 =~ /\S/s) { # should never happen
 280         if ($s =~ /^([^\n]*)\n/s) { # this is even worse
 281             error_normal("Completely confused while extracting attributes: $1", $lc);
 282             error_normal((scalar(split(/\n/, $s)) - 1) . " more line(s) not shown.", undef);
 283             $this->_set_fatal( 1 );
 284         } else {
 285             warn_normal "Strange attribute syntax: $s\n", $lc;
 286         }
 287     }
 288     return \%attr;
 289 }
 290
 291 sub _next_token_internal {
 292     my $this = shift;
 293     my($h) = @_;
 294     my($it, $kind);
 295     my $eof_p = 0;
 296     $this->_pop_readahead if $this->has_readahead_p
 297             && !ref $this->_peek_readahead
 298             && !length $this->_peek_readahead;
 299     if (!$this->has_readahead_p) {
 300         my $next = scalar <$h>;
 301         $eof_p = !defined $next;
 302         if (!$eof_p) {
 303             $this->_increment_line_number;
 304             $this->_push_readahead( $next );
 305         }
 306     }
 307     $this->_set_line_number_start( $this->line_number ); # remember 1st line num
 308     if ($this->has_readahead_p && ref $this->_peek_readahead) { # TmplToken obj.
 309         ($it, $kind) = ($this->_pop_readahead, undef);
 310     } elsif ($eof_p && !$this->has_readahead_p) {       # nothing left to do
 311         ;
 312     } elsif ($this->_peek_readahead =~ /^\s+/s) {       # whitespace
 313         ($kind, $it) = (TmplTokenType::TEXT, $&);
 314         $this->_set_readahead( $' );
 315     # FIXME the following (the [<\s] part) is an unreliable HACK :-(
 316     } elsif ($this->_peek_readahead =~ /^(?:[^<]|<[<\s])*(?:[^<\s])/s) {        # non-space normal text
 317         ($kind, $it) = (TmplTokenType::TEXT, $&);
 318         $this->_set_readahead( $' );
 319         warn_normal "Unescaped < in $it\n", $this->line_number_start
 320                 if !$this->cdata_mode_p && $it =~ /</s;
 321     } else {                            # tag/declaration/processing instruction
 322         my $ok_p = 0;
 323         for (my $cdata_close = $this->cdata_close;;) {
 324             if ($this->cdata_mode_p) {
 325                 my $next = $this->_pop_readahead;
 326                 if ($next =~ /^$cdata_close/is) {
 327                     ($kind, $it) = (TmplTokenType::TAG, $&);
 328                     $this->_push_readahead( $' );
 329                     $ok_p = 1;
 330                 } elsif ($next =~ /^((?:(?!$cdata_close).)+)($cdata_close)/is) {
 331                     ($kind, $it) = (TmplTokenType::TEXT, $1);
 332                     $this->_push_readahead( "$2$'" );
 333                     $ok_p = 1;
 334                 } else {
 335                     ($kind, $it) = (TmplTokenType::TEXT, $next);
 336                     $ok_p = 1;
 337                 }
 338             } elsif ($this->_peek_readahead =~ /^$re_tag_compat/os) {
 339                 # If we detect a "closed start tag" but we know that the
 340                 # following token looks like a TMPL_VAR, don't stop
 341                 my($head, $tail, $post) = ($1, $2, $3);
 342                 if ($tail eq '' && $post =~ $re_tmpl_var) {
 343                     # Don't bother to show the warning if we're too confused
 344                     warn_normal "Possible SGML \"closed start tag\" notation: $head<\n", $this->line_number
 345                             if split(/\n/, $head) < 10;
 346                 } else {
 347                     ($kind, $it) = (TmplTokenType::TAG, "$head>");
 348                     $this->_set_readahead( $post );
 349                     $ok_p = 1;
 350                     warn_normal "SGML \"closed start tag\" notation: $head<\n", $this->line_number if $tail eq '';
 351                 }
 352             } elsif ($this->_peek_readahead =~ /^<!--(?:(?!-->).)*-->/s) {
 353                 ($kind, $it) = (TmplTokenType::COMMENT, $&);
 354                 $this->_set_readahead( $' );
 355                 $ok_p = 1;
 356                 warn_normal "Syntax error in comment: $&\n", $this->line_number_start;
 357                 $this->_set_syntaxerror( 1 );
 358             }
 359         last if $ok_p;
 360             my $next = scalar <$h>;
 361             $eof_p = !defined $next;
 362         last if $eof_p;
 363             $this->_increment_line_number;
 364             $this->_append_readahead( $next );
 365         }
 366         if ($kind ne TmplTokenType::TAG) {
 367             ;
 368         } elsif ($it =~ /^<!/) {
 369             $kind = TmplTokenType::DECL;
 370             $kind = TmplTokenType::COMMENT if $it =~ /^<!--(?:(?!-->).)*-->/;
 371             if ($kind == TmplTokenType::COMMENT && $it =~ /^<!--\s*#include/s) {
 372                 warn_normal "Apache #include directive found instead of HTML::Template <TMPL_INCLUDE> directive", $this->line_number_start;
 373             }
 374         } elsif ($it =~ /^<\?/) {
 375             $kind = TmplTokenType::PI;
 376         }
 377         if ($it =~ /^$re_directive/ios && !$this->cdata_mode_p) {
 378             $kind = TmplTokenType::DIRECTIVE;
 379         }
 380         if (!$ok_p && $eof_p) {
 381             ($kind, $it) = (TmplTokenType::UNKNOWN, $this->_peek_readahead);
 382             $this->_set_readahead, undef;
 383             $this->_set_syntaxerror( 1 );
 384         }
 385     }
 386     warn_normal "Unrecognizable token found: "
 387             . (split(/\n/, $it) < 10? $it: '(too confused to show details)')
 388             . "\n", $this->line_number_start
 389         if $kind == TmplTokenType::UNKNOWN;
 390     return defined $it? (ref $it? $it: TmplToken->new($it, $kind, $this->line_number, $this->filename)): undef;
 391 }
 392
 393 sub _next_token_intermediate {
 394     my $this = shift;
 395     my $h = $this->_handle;
 396     my $it;
 397     if (!$this->cdata_mode_p) {
 398         $it = $this->_next_token_internal($h);
 399         if (defined $it && $it->type == TmplTokenType::TAG) {
 400             if ($it->string =~ /^<(script|style|textarea)\b/is) {
 401                 $this->_set_cdata_mode( 1 );
 402                 $this->_set_cdata_close( "</$1\\s*>" );
 403             }
 404             $it->set_attributes( $this->_extract_attributes($it->string, $it->line_number) );
 405         }
 406     } else {
 407         for ($it = '', my $cdata_close = $this->cdata_close;;) {
 408             my $next = $this->_next_token_internal($h);
 409         last if !defined $next;
 410             if (defined $next && $next->string =~ /$cdata_close/is) {
 411                 $this->_push_readahead( $next ); # push entire TmplToken object
 412                 $this->_set_cdata_mode( 0 );
 413             }
 414         last unless $this->cdata_mode_p;
 415             $it .= $next->string;
 416         }
 417         $it = TmplToken->new( $it, TmplTokenType::CDATA, $this->line_number );
 418         $this->_set_cdata_close, undef;
 419     }
 420     return $it;
 421 }
 422
 423 sub _token_groupable1_p ($) { # as first token, groupable into TEXT_PARAMETRIZED
 424     my($t) = @_;
 425     return ($t->type == TmplTokenType::TEXT && $t->string !~ /^[,\.:\|\s]+$/is)
 426         || ($t->type == TmplTokenType::DIRECTIVE
 427                 && $t->string =~ /^(?:$re_tmpl_var)$/os)
 428         || ($t->type == TmplTokenType::TAG
 429                 && ($t->string =~ /^<(?:b|em|h[123456]|i|u)\b/is
 430 #               || ($t->string =~ /^<input\b/is
 431 #                   && $t->attributes->{'type'}->[1] =~ /^(?:radio|text)$/is)
 432                     ))
 433 }
 434
 435 sub _token_groupable2_p ($) { # as other token, groupable into TEXT_PARAMETRIZED
 436     my($t) = @_;
 437     return ($t->type == TmplTokenType::TEXT && ($t->string =~ /^\s*$/s || $t->string !~ /^[\|\s]+$/is))
 438         || ($t->type == TmplTokenType::DIRECTIVE
 439                 && $t->string =~ /^(?:$re_tmpl_var)$/os)
 440         || ($t->type == TmplTokenType::TAG
 441                 && ($t->string =~ /^<\/?(?:a|b|em|h[123456]|i|u)\b/is
 442                 || ($t->string =~ /^<input\b/is
 443                     && $t->attributes->{'type'} =~ /^(?:radio|text)$/is)))
 444 }
 445
 446 sub _quote_cformat ($) {
 447     my($s) = @_;
 448     $s =~ s/%/%%/g;
 449     return $s;
 450 }
 451
 452 sub string_canon ($) {
 453     my($s) = @_;
 454     if (1) { # FIXME
 455         # Fold all whitespace into single blanks
 456         $s =~ s/\s+/ /gs;
 457     }
 458     return $s;
 459 }
 460
 461 sub _formalize_string_cformat ($) {
 462     my($s) = @_;
 463     return _quote_cformat string_canon $s;
 464 }
 465
 466 sub _formalize ($) {
 467     my($t) = @_;
 468     return $t->type == TmplTokenType::DIRECTIVE? '%s':
 469            $t->type == TmplTokenType::TEXT?
 470                    _formalize_string_cformat($t->string):
 471            $t->type == TmplTokenType::TAG?
 472                    ($t->string =~ /^<a\b/is? '<a>':
 473                     $t->string =~ /^<input\b/is? '<input>':
 474                     _quote_cformat($t->string)):
 475                _quote_cformat($t->string);
 476 }
 477
 478 sub _optimize {
 479     my $this = shift;
 480     my @structure = @_;
 481     my $undo_trailing_blanks = sub {
 482                 for (my $i = $#structure; $i >= 0; $i -= 1) {
 483                 last if $structure[$i]->type != TmplTokenType::TEXT;
 484                 last if !blank_p($structure[$i]->string);
 485                     push @{$this->{_queue}}, pop @structure;
 486                 }
 487             };
 488     &$undo_trailing_blanks;
 489     # FIXME: If the last token is a close tag but there are no tags
 490     # FIXME: before it, drop the close tag back into the queue. This
 491     # FIXME: is an ugly hack to get rid of "foo %s</h1>" type mess.
 492     if (@structure >= 2
 493             && $structure[$#structure]->type == TmplTokenType::TAG
 494             && $structure[$#structure]->string =~ /^<\//s) {
 495         my $has_other_tags_p = 0;
 496         for (my $i = 0; $i < $#structure; $i += 1) {
 497             $has_other_tags_p = 1 if $structure[$i]->type == TmplTokenType::TAG;
 498         last if $has_other_tags_p;
 499         }
 500         push @{$this->{_queue}}, pop @structure unless $has_other_tags_p;
 501         &$undo_trailing_blanks;
 502     }
 503     # FIXME: Do the same ugly hack for the last token being a ( or [
 504     if (@structure >= 2
 505             && $structure[$#structure]->type == TmplTokenType::TEXT
 506             && $structure[$#structure]->string =~ /^[\(\[]$/) { # not )]
 507         push @{$this->{_queue}}, pop @structure;
 508         &$undo_trailing_blanks;
 509     }
 510     return @structure;
 511 }
 512
 513 sub looks_plausibly_like_groupable_text_p (@) {
 514     my @structure = @_;
 515     # The text would look plausibly groupable if all open tags are also closed.
 516     my @tags = ();
 517     my $error_p = 0;
 518     for (my $i = 0; $i <= $#structure; $i += 1) {
 519         if ($structure[$i]->type == TmplTokenType::TAG) {
 520             if ($structure[$i]->string =~ /^<([A-Z0-9]+)/is) {
 521                 my $tag = lc($1);
 522                 push @tags, $tag unless $tag =~ /^<(?:input)/is
 523                         || $tag =~ /\/>$/is;
 524             } elsif ($structure[$i]->string =~ /^<\/([A-Z0-9]+)/is) {
 525                 if (@tags && lc($1) eq $tags[$#tags]) {
 526                     pop @tags;
 527                 } else {
 528                     $error_p = 1;
 529                 }
 530             }
 531         } elsif ($structure[$i]->type != TmplTokenType::TEXT) {
 532             $error_p = 1;
 533         }
 534     last if $error_p;
 535     }
 536     return !$error_p && !@tags;
 537 }
 538
 539 sub next_token {
 540     my $this = shift;
 541     my $h = $this->_handle;
 542     my $it;
 543     $this->{_queue} = [] unless defined $this->{_queue};
 544
 545     # Don't reparse anything in the queue. We can put a parametrized token
 546     # there if we need to, however.
 547     if (@{$this->{_queue}}) {
 548         $it = pop @{$this->{_queue}};
 549     } else {
 550         $it = $this->_next_token_intermediate($h);
 551         if (!$this->cdata_mode_p && $this->allow_cformat_p && defined $it
 552             && ($it->type == TmplTokenType::TEXT?
 553                 !blank_p( $it->string ): _token_groupable1_p( $it ))) {
 554             my @structure = ( $it );
 555             my @tags = ();
 556             my $next = undef;
 557             my($nonblank_text_p, $parametrized_p, $with_anchor_p, $with_input_p) = (0, 0, 0, 0);
 558             if ($it->type == TmplTokenType::TEXT) {
 559                 $nonblank_text_p = 1 if !blank_p( $it->string );
 560             } elsif ($it->type == TmplTokenType::DIRECTIVE) {
 561                 $parametrized_p = 1;
 562             } elsif ($it->type == TmplTokenType::TAG && $it->string =~ /^<([A-Z0-9]+)/is) {
 563                 push @tags, lc($1);
 564                 $with_anchor_p = 1 if lc($1) eq 'a';
 565                 $with_input_p = 1 if lc($1) eq 'input';
 566             }
 567             # We hate | and || in msgid strings, so we try to avoid them
 568             for (my $i = 1, my $quit_p = 0, my $quit_next_p = ($it->type == TmplTokenType::TEXT && $it->string =~ /^\|+$/s);; $i += 1) {
 569                 $next = $this->_next_token_intermediate($h);
 570                 push @structure, $next; # for consistency (with initialization)
 571             last unless defined $next && _token_groupable2_p( $next );
 572             last if $quit_next_p;
 573                 if ($next->type == TmplTokenType::TEXT) {
 574                     $nonblank_text_p = 1 if !blank_p( $next->string );
 575                     $quit_p = 1 if $next->string =~ /^\|+$/s; # We hate | and ||
 576                 } elsif ($next->type == TmplTokenType::DIRECTIVE) {
 577                     $parametrized_p = 1;
 578                 } elsif ($next->type == TmplTokenType::TAG) {
 579                     if ($next->string =~ /^<([A-Z0-9]+)/is) {
 580                         push @tags, lc($1);
 581                         $with_anchor_p = 1 if lc($1) eq 'a';
 582                         $with_input_p = 1 if lc($1) eq 'input';
 583                     } elsif ($next->string =~ /^<\/([A-Z0-9]+)/is) {
 584                         my $close = lc($1);
 585                         $quit_p = 1 unless @tags && $close eq $tags[$#tags];
 586                         $quit_next_p = 1 if $close =~ /^h\d$/;
 587                         pop @tags;
 588                     }
 589                 }
 590             last if $quit_p;
 591             }
 592             # Undo the last token
 593             push @{$this->{_queue}}, pop @structure;
 594             # Simply it a bit more
 595             @structure = $this->_optimize( @structure );
 596             if (@structure < 2) {
 597                 # Nothing to do
 598                 ;
 599             } elsif ($nonblank_text_p && ($parametrized_p || $with_anchor_p || $with_input_p)) {
 600                 # Create the corresponding c-format string
 601                 my $string = join('', map { $_->string } @structure);
 602                 my $form = join('', map { _formalize $_ } @structure);
 603                 my($a_counter, $input_counter) = (0, 0);
 604                 $form =~ s/<a>/ $a_counter += 1, "<a$a_counter>" /egs;
 605                 $form =~ s/<input>/ $input_counter += 1, "<input$input_counter>" /egs;
 606                 $it = TmplToken->new($string, TmplTokenType::TEXT_PARAMETRIZED,
 607                         $it->line_number, $it->pathname);
 608                 $it->set_form( $form );
 609                 $it->set_children( @structure );
 610             } elsif ($nonblank_text_p
 611                     && looks_plausibly_like_groupable_text_p( @structure )
 612                     && $structure[$#structure]->type == TmplTokenType::TEXT) {
 613                 # Combine the strings
 614                 my $string = join('', map { $_->string } @structure);
 615                 $it = TmplToken->new($string, TmplTokenType::TEXT,
 616                         $it->line_number, $it->pathname);;
 617             } else {
 618                 # Requeue the tokens thus seen for re-emitting
 619                 for (;;) {
 620                     push @{$this->{_queue}}, pop @structure;
 621                 last if !@structure;
 622                 }
 623                 $it = pop @{$this->{_queue}};
 624             }
 625         }
 626     }
 627     if (defined $it && $it->type == TmplTokenType::TEXT) {
 628         my $form = string_canon $it->string;
 629         $it->set_form( $form );
 630     }
 631     return $it;
 632 }
 633
 634 ###############################################################################
 635
 636 # Other simple functions (These are not methods)
 637
 638 sub blank_p ($) {
 639     my($s) = @_;
 640     return $s =~ /^(?:\s|\&nbsp$re_end_entity|$re_tmpl_var)*$/os;
 641 }
 642
 643 sub trim ($) {
 644     my($s0) = @_;
 645     my $l0 = length $s0;
 646     my $s = $s0;
 647     $s =~ s/^(\s|\&nbsp$re_end_entity)+//os; my $l1 = $l0 - length $s;
 648     $s =~ s/(\s|\&nbsp$re_end_entity)+$//os; my $l2 = $l0 - $l1 - length $s;
 649     return wantarray? (substr($s0, 0, $l1), $s, substr($s0, $l0 - $l2)): $s;
 650 }
 651
 652 sub quote_po ($) {
 653     my($s) = @_;
 654     # Locale::PO->quote is buggy, it doesn't quote newlines :-/
 655     $s =~ s/([\\"])/\\\1/gs;
 656     $s =~ s/\n/\\n/g;
 657     #$s =~ s/[\177-\377]/ sprintf("\\%03o", ord($&)) /egs;
 658     return "\"$s\"";
 659 }
 660
 661 # Some functions that shouldn't be here... should be moved out some time
 662 sub parametrize ($$$) {
 663     my($fmt_0, $params, $anchors) = @_;
 664     my $it = '';
 665     for (my $n = 0, my $fmt = $fmt_0; length $fmt;) {
 666         if ($fmt =~ /^[^%]+/) {
 667             $fmt = $';
 668             $it .= $&;
 669         } elsif ($fmt =~ /^%%/) {
 670             $fmt = $';
 671             $it .= '%';
 672         } elsif ($fmt =~ /^%(?:(\d+)\$)?(?:(\d+)(?:\.(\d+))?)?s/) {
 673             $n += 1;
 674             my($i, $width, $prec) = ((defined $1? $1: $n), $2, $3);
 675             $fmt = $';
 676             if (!defined $width && !defined $prec) {
 677                 my $param = $params->[$i - 1];
 678                 $it .= $param;
 679                 warn_normal "$&: Undefined parameter $i for msgid \"$fmt_0\"",
 680                             undef
 681                         unless defined $param;
 682             } elsif (defined $width && defined $prec && !$width && !$prec) {
 683                 ;
 684             } else {
 685                 die "Unsupported precision specification in format: $&\n"; #XXX
 686             }
 687         } elsif ($fmt =~ /^%[^%a-zA-Z]*[a-zA-Z]/) {
 688             $fmt = $';
 689             $it .= $&;
 690             die "Unknown or unsupported format specification: $&\n"; #XXX
 691         } else {
 692             die "Completely confused parametrizing: $fmt\n";#XXX
 693         }
 694     }
 695     for (my $n = 0, my $fmt = $it, $it = ''; length $fmt;) {
 696         if ($fmt =~ /^(?:(?!<a\d+>).)+/is) {
 697             $fmt = $';
 698             $it .= $&;
 699         } elsif ($fmt =~ /^<a(\d+)>/is) {
 700             $n += 1;
 701             my $i  = $1;
 702             $fmt = $';
 703             my $anchor = $anchors->[$i - 1];
 704             warn_normal "$&: Anchor $1 not found for msgid \"$fmt_0\"", undef #FIXME
 705                     unless defined $anchor;
 706             $it .= $anchor->string;
 707         } else {
 708             die "Completely confused decoding anchors: $fmt\n";#XXX
 709         }
 710     }
 711     return $it;
 712 }
 713
 714 sub charset_canon ($) {
 715     my($charset) = @_;
 716     $charset = uc($charset);
 717     $charset = "$1-$2" if $charset =~ /^(ISO|UTF)(\d.*)/i;
 718     $charset = 'Big5' if $charset eq 'BIG5'; # "Big5" must be in mixed case
 719     return $charset;
 720 }
 721
 722 use vars qw( @latin1_utf8 );
 723 @latin1_utf8 = (
 724     "\302\200", "\302\201", "\302\202", "\302\203", "\302\204", "\302\205",
 725     "\302\206", "\302\207", "\302\210", "\302\211", "\302\212", "\302\213",
 726     "\302\214", "\302\215",   undef,      undef,    "\302\220", "\302\221",
 727     "\302\222", "\302\223", "\302\224", "\302\225", "\302\226", "\302\227",
 728     "\302\230", "\302\231", "\302\232", "\302\233", "\302\234", "\302\235",
 729     "\302\236", "\302\237", "\302\240", "\302\241", "\302\242", "\302\243",
 730     "\302\244", "\302\245", "\302\246", "\302\247", "\302\250", "\302\251",
 731     "\302\252", "\302\253", "\302\254", "\302\255", "\302\256", "\302\257",
 732     "\302\260", "\302\261", "\302\262", "\302\263", "\302\264", "\302\265",
 733     "\302\266", "\302\267", "\302\270", "\302\271", "\302\272", "\302\273",
 734     "\302\274", "\302\275", "\302\276", "\302\277", "\303\200", "\303\201",
 735     "\303\202", "\303\203", "\303\204", "\303\205", "\303\206", "\303\207",
 736     "\303\210", "\303\211", "\303\212", "\303\213", "\303\214", "\303\215",
 737     "\303\216", "\303\217", "\303\220", "\303\221", "\303\222", "\303\223",
 738     "\303\224", "\303\225", "\303\226", "\303\227", "\303\230", "\303\231",
 739     "\303\232", "\303\233", "\303\234", "\303\235", "\303\236", "\303\237",
 740     "\303\240", "\303\241", "\303\242", "\303\243", "\303\244", "\303\245",
 741     "\303\246", "\303\247", "\303\250", "\303\251", "\303\252", "\303\253",
 742     "\303\254", "\303\255", "\303\256", "\303\257", "\303\260", "\303\261",
 743     "\303\262", "\303\263", "\303\264", "\303\265", "\303\266", "\303\267",
 744     "\303\270", "\303\271", "\303\272", "\303\273", "\303\274", "\303\275",
 745     "\303\276", "\303\277" );
 746
 747 sub charset_convert ($$$) {
 748     my($s, $charset_in, $charset_out) = @_;
 749     if ($s !~ /[\200-\377]/s) { # FIXME: don't worry about iso2022 for now
 750         ;
 751     } elsif ($charset_in eq 'ISO-8859-1' && $charset_out eq 'UTF-8') {
 752         $s =~ s/[\200-\377]/ $latin1_utf8[ord($&) - 128] /egs;
 753     } elsif ($charset_in ne $charset_out) {
 754         VerboseWarnings::warn_normal "conversion from $charset_in to $charset_out is not supported\n", undef;
 755     }
 756     return $s;
 757 }
 758
 759 ###############################################################################
 760
 761 =pod
 762
 763 In addition to the basic scanning, this class will also perform
 764 the following:
 765
 766 =over
 767
 768 =item -
 769
 770 Emulation of c-format strings (see below)
 771
 772 =item -
 773
 774 Display of warnings for certain things that affects either the
 775 ability of this class to yield correct output, or things that
 776 are known to cause the original template to cause trouble.
 777
 778 =item -
 779
 780 Automatic correction of some of the things warned about
 781 (e.g., SGML "closed start tag" notation).
 782
 783 =back
 784
 785 =head2 c-format strings emulation
 786
 787 Because English word order is not universal, a simple extraction
 788 of translatable strings may yield some strings like "Accounts for"
 789 or ambiguous strings like "in". This makes the resulting strings
 790 difficult to translate, but does not affect all languages alike.
 791 For example, Chinese (with a somewhat different word order) would
 792 be hit harder, but French would be relatively unaffected.
 793
 794 To overcome this problem, the scanner can be configured to detect
 795 patterns with <TMPL_VAR> directives (as well as certain HTML tags),
 796 and try to construct a larger pattern that will appear in the PO
 797 file as c-format strings with %s placeholders. This additional
 798 step allows the translator to deal with cases where word order
 799 is different (replacing %s with %1$s, %2$s, etc.), or when certain
 800 words will require certain inflectional suffixes in sentences.
 801
 802 Because this is an incompatible change, this mode must be explicitly
 803 turned on using the set_cformat(1) method call.
 804
 805 =head1 HISTORY
 806
 807 This tokenizer is mostly based
 808 on Ambrose's hideous Perl script known as subst.pl.
 809
 810 =cut
 811
 812 1;