misc/translator/TmplTokenizer.pm

   1 package TmplTokenizer;
   2
   3 use strict;
   4 use TmplTokenType;
   5 use TmplToken;
   6 use VerboseWarnings qw( pedantic_p error_normal warn_normal warn_pedantic );
   7 require Exporter;
   8
   9 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  10
  11 ###############################################################################
  12
  13 =head1 NAME
  14
  15 TmplTokenizer.pm - Simple-minded tokenizer class for HTML::Template .tmpl files
  16
  17 =head1 DESCRIPTION
  18
  19 Because .tmpl files contains HTML::Template directives
  20 that tend to confuse real parsers (e.g., HTML::Parse),
  21 it might be better to create a customized scanner
  22 to scan the template files for tokens.
  23 This module is a simple-minded attempt at such a scanner.
  24
  25 =cut
  26
  27 ###############################################################################
  28
  29 $VERSION = 0.02;
  30
  31 @ISA = qw(Exporter);
  32 @EXPORT_OK = qw();
  33
  34 use vars qw( $pedantic_attribute_error_in_nonpedantic_mode_p );
  35 use vars qw( $pedantic_tmpl_var_use_in_nonpedantic_mode_p );
  36 use vars qw( $pedantic_error_markup_in_pcdata_p );
  37
  38 ###############################################################################
  39
  40 # Hideous stuff
  41 use vars qw( $re_directive $re_tmpl_var $re_tmpl_var_escaped $re_tmpl_include );
  42 use vars qw( $re_directive_control $re_tmpl_endif_endloop $re_xsl);
  43 BEGIN {
  44     # $re_directive must not do any backreferences
  45     $re_directive = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:VAR|LOOP|INCLUDE|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
  46     # TMPL_VAR or TMPL_INCLUDE
  47     $re_tmpl_var = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
  48     $re_tmpl_include = q{<(?:(?i)(?:!--\s*)?TMPL_(?:INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
  49     # TMPL_VAR ESCAPE=1/HTML/URL
  50     $re_xsl = q{<\/?(?:xsl:)(?:[\s\-a-zA-Z0-9"'\/\.\[\]\@\(\):=,$]+)\/?>};
  51     $re_tmpl_var_escaped = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR|INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))\s+ESCAPE=(?:1|HTML|URL)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
  52     # Any control flow directive
  53     $re_directive_control = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:LOOP|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
  54     # /LOOP or /IF or /UNLESS
  55     $re_tmpl_endif_endloop = q{<(?:(?i)(?:!--\s*)?\/TMPL_(?:LOOP|IF|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
  56 }
  57
  58 # Hideous stuff from subst.pl, slightly modified to use the above hideous stuff
  59 # Note: The $re_tag's set $1 (<tag), $2 (>), and $3 (rest of string)
  60 use vars qw( $re_comment $re_entity_name $re_end_entity $re_etag );
  61 use vars qw( $re_tag_strict $re_tag_compat @re_tag );
  62 sub re_tag ($) {
  63    my($compat) = @_;
  64    my $etag = $compat? '>': '<>\/';
  65    # This is no longer similar to the original regexp in subst.pl :-(
  66    # Note that we don't want <> in compat mode; Mozilla knows about <
  67    q{(<\/?(?:|(?:"(?:} . $re_directive . q{|[^"])*"|'(?:} . $re_directive . q{|[^'])*'|--(?:(?!--)(?:$re_directive)*.)*--|(?:}
  68    . $re_directive
  69    . q{|(?!--)[^"'<>} . $etag . q{]))+))([} . $etag . q{]|(?=<))(.*)};
  70 }
  71 BEGIN {
  72     $re_comment = '(?:--(?:[^-]|-[^-])*--)';
  73     $re_entity_name = '(?:[^&%#;<>\s]+)'; # NOTE: not really correct SGML
  74     $re_end_entity = '(?:;|$|(?=\s))'; # semicolon or before-whitespace
  75     $re_etag = q{(?:<\/?(?:"[^"]*"|'[^']*'|[^"'>\/])*[>\/])}; # end-tag
  76     @re_tag = ($re_tag_strict, $re_tag_compat) = (re_tag(0), re_tag(1));
  77 }
  78
  79 # End of the hideous stuff
  80
  81 use vars qw( $serial );
  82
  83 ###############################################################################
  84
  85 sub FATAL_P             () {'fatal-p'}
  86 sub SYNTAXERROR_P       () {'syntaxerror-p'}
  87
  88 sub FILENAME            () {'input'}
  89 sub HANDLE              () {'handle'}
  90
  91 sub READAHEAD           () {'readahead'}
  92 sub LINENUM_START       () {'lc_0'}
  93 sub LINENUM             () {'lc'}
  94 sub CDATA_MODE_P        () {'cdata-mode-p'}
  95 sub CDATA_CLOSE         () {'cdata-close'}
  96 sub PCDATA_MODE_P       () {'pcdata-mode-p'}    # additional submode for CDATA
  97 sub JS_MODE_P           () {'js-mode-p'}        # cdata-mode-p must also be true
  98
  99 sub ALLOW_CFORMAT_P     () {'allow-cformat-p'}
 100
 101 sub new {
 102     shift;
 103     my ($filename) = @_;
 104     open my $handle,$filename or die "can't open $filename";
 105     bless {
 106             filename => $filename
 107             , handle => $handle
 108             , readahead => []
 109     } , __PACKAGE__;
 110 }
 111
 112 ###############################################################################
 113
 114 # Simple getters
 115
 116 sub filename {
 117     my $this = shift;
 118     return $this->{filename};
 119 }
 120
 121 sub _handle {
 122     my $this = shift;
 123     return $this->{handle};
 124 }
 125
 126 sub fatal_p {
 127     my $this = shift;
 128     return $this->{+FATAL_P};
 129 }
 130
 131 sub syntaxerror_p {
 132     my $this = shift;
 133     return $this->{+SYNTAXERROR_P};
 134 }
 135
 136 sub has_readahead_p {
 137     my $this = shift;
 138     return @{$this->{readahead}};
 139 }
 140
 141 sub _peek_readahead {
 142     my $this = shift;
 143     return $this->{readahead}->[$#{$this->{readahead}}];
 144 }
 145
 146 sub line_number_start {
 147     my $this = shift;
 148     return $this->{+LINENUM_START};
 149 }
 150
 151 sub line_number {
 152     my $this = shift;
 153     return $this->{+LINENUM};
 154 }
 155
 156 sub cdata_mode_p {
 157     my $this = shift;
 158     return $this->{+CDATA_MODE_P};
 159 }
 160
 161 sub pcdata_mode_p {
 162     my $this = shift;
 163     return $this->{+PCDATA_MODE_P};
 164 }
 165
 166 sub js_mode_p {
 167     my $this = shift;
 168     return $this->{+JS_MODE_P};
 169 }
 170
 171 sub cdata_close {
 172     my $this = shift;
 173     return $this->{+CDATA_CLOSE};
 174 }
 175
 176 sub allow_cformat_p {
 177     my $this = shift;
 178     return $this->{+ALLOW_CFORMAT_P};
 179 }
 180
 181 # Simple setters
 182
 183 sub _set_fatal {
 184     my $this = shift;
 185     $this->{+FATAL_P} = $_[0];
 186     return $this;
 187 }
 188
 189 sub _set_syntaxerror {
 190     my $this = shift;
 191     $this->{+SYNTAXERROR_P} = $_[0];
 192     return $this;
 193 }
 194
 195 sub _push_readahead {
 196     my $this = shift;
 197     push @{$this->{readahead}}, $_[0];
 198     return $this;
 199 }
 200
 201 sub _pop_readahead {
 202     my $this = shift;
 203     return pop @{$this->{readahead}};
 204 }
 205
 206 sub _append_readahead {
 207     my $this = shift;
 208     $this->{readahead}->[$#{$this->{readahead}}] .= $_[0];
 209     return $this;
 210 }
 211
 212 sub _set_readahead {
 213     my $this = shift;
 214     $this->{readahead}->[$#{$this->{readahead}}] = $_[0];
 215     return $this;
 216 }
 217
 218 sub _increment_line_number {
 219     my $this = shift;
 220     $this->{+LINENUM} += 1;
 221     return $this;
 222 }
 223
 224 sub _set_line_number_start {
 225     my $this = shift;
 226     $this->{+LINENUM_START} = $_[0];
 227     return $this;
 228 }
 229
 230 sub _set_cdata_mode {
 231     my $this = shift;
 232     $this->{+CDATA_MODE_P} = $_[0];
 233     return $this;
 234 }
 235
 236 sub _set_pcdata_mode {
 237     my $this = shift;
 238     $this->{+PCDATA_MODE_P} = $_[0];
 239     return $this;
 240 }
 241
 242 sub _set_js_mode {
 243     my $this = shift;
 244     $this->{+JS_MODE_P} = $_[0];
 245     return $this;
 246 }
 247
 248 sub _set_cdata_close {
 249     my $this = shift;
 250     $this->{+CDATA_CLOSE} = $_[0];
 251     return $this;
 252 }
 253
 254 sub set_allow_cformat {
 255     my $this = shift;
 256     $this->{+ALLOW_CFORMAT_P} = $_[0];
 257     return $this;
 258 }
 259
 260 ###############################################################################
 261
 262 use vars qw( $js_EscapeSequence );
 263 BEGIN {
 264     # Perl quoting is really screwed up, but this common subexp is way too long
 265     $js_EscapeSequence = q{\\\\(?:['"\\\\bfnrt]|[^0-7xu]|[0-3]?[0-7]{1,2}|x[\da-fA-F]{2}|u[\da-fA-F]{4})};
 266 }
 267 sub parenleft  () { '(' }
 268 sub parenright () { ')' }
 269
 270 sub split_js ($) {
 271     my ($s0) = @_;
 272     my @it = ();
 273     while (length $s0) {
 274         if ($s0 =~ /^\s+/s) {                           # whitespace
 275             push @it, $&;
 276             $s0 = $';
 277         } elsif ($s0 =~ /^\/\/[^\r\n]*(?:[\r\n]|$)/s) { # C++-style comment
 278             push @it, $&;
 279             $s0 = $';
 280         } elsif ($s0 =~ /^\/\*(?:(?!\*\/).)*\*\//s) {   # C-style comment
 281             push @it, $&;
 282             $s0 = $';
 283         # Keyword or identifier, ECMA-262 p.13 (section 7.5)
 284         } elsif ($s0 =~ /^[A-Z_\$][A-Z\d_\$]*/is) {     # IdentifierName
 285             push @it, $&;
 286             $s0 = $';
 287         # Punctuator, ECMA-262 p.13 (section 7.6)
 288         } elsif ($s0 =~ /^(?:[\(\){}\[\];]|>>>=|<<=|>>=|[-\+\*\/\&\|\^\%]=|>>>|<<|>>|--|\+\+|\|\||\&\&|==|<=|>=|!=|[=><,!~\?:\.\-\+\*\/\&\|\^\%])/s) {
 289             push @it, $&;
 290             $s0 = $';
 291         # DecimalLiteral, ECMA-262 p.14 (section 7.7.3); note: bug in the spec
 292         } elsif ($s0 =~ /^(?:0|[1-9]\d+(?:\.\d*(?:[eE][-\+]?\d+)?)?)/s) {
 293             push @it, $&;
 294             $s0 = $';
 295         # HexIntegerLiteral, ECMA-262 p.15 (section 7.7.3)
 296         } elsif ($s0 =~ /^0[xX][\da-fA-F]+/s) {
 297             push @it, $&;
 298             $s0 = $';
 299         # OctalIntegerLiteral, ECMA-262 p.15 (section 7.7.3)
 300         } elsif ($s0 =~ /^0[\da-fA-F]+/s) {
 301             push @it, $&;
 302             $s0 = $';
 303         # StringLiteral, ECMA-262 p.17 (section 7.7.4)
 304         # XXX SourceCharacter doesn't seem to be defined (?)
 305         } elsif ($s0 =~ /^(?:"(?:(?!["\\\r\n]).|$js_EscapeSequence)*"|'(?:(?!['\\\r\n]).|$js_EscapeSequence)*')/os) {
 306             push @it, $&;
 307             $s0 = $';
 308         } elsif ($s0 =~ /^./) {                         # UNKNOWN TOKEN !!!
 309             push @it, $&;
 310             $s0 = $';
 311         }
 312     }
 313     return @it;
 314 }
 315
 316 sub STATE_UNDERSCORE     () { 1 }
 317 sub STATE_PARENLEFT      () { 2 }
 318 sub STATE_STRING_LITERAL () { 3 }
 319
 320 # XXX This is a crazy hack. I don't want to write an ECMAScript parser.
 321 # XXX A scanner is one thing; a parser another thing.
 322 sub identify_js_translatables (@) {
 323     my @input = @_;
 324     my @output = ();
 325     # We mark a JavaScript translatable string as in C, i.e., _("literal")
 326     # For simplicity, we ONLY look for "_" "(" StringLiteral ")"
 327     for (my $i = 0, my $state = 0, my($j, $q, $s); $i <= $#input; $i += 1) {
 328         my $reset_state_p = 0;
 329         push @output, [0, $input[$i]];
 330         if ($input[$i] !~ /\S/s) {
 331             ;
 332         } elsif ($state == 0) {
 333             $state = STATE_UNDERSCORE if $input[$i] eq '_';
 334         } elsif ($state == STATE_UNDERSCORE) {
 335             $state = $input[$i] eq parenleft ? STATE_PARENLEFT : 0;
 336         } elsif ($state == STATE_PARENLEFT) {
 337             if ($input[$i] =~ /^(['"])(.*)\1$/s) {
 338                 ($state, $j, $q, $s) = (STATE_STRING_LITERAL, $#output, $1, $2);
 339             } else {
 340                 $state = 0;
 341             }
 342         } elsif ($state == STATE_STRING_LITERAL) {
 343             if ($input[$i] eq parenright) {
 344                 $output[$j] = [1, $output[$j]->[1], $q, $s];
 345             }
 346             $state = 0;
 347         } else {
 348             die "identify_js_translatables internal error: Unknown state $state"
 349         }
 350     }
 351     return \@output;
 352 }
 353
 354 ###############################################################################
 355
 356 sub _extract_attributes ($;$) {
 357     my $this = shift;
 358     my($s, $lc) = @_;
 359     my %attr;
 360     $s = $1 if $s =~ /^<(?:(?!$re_directive_control)\S)+(.*)\/\S$/s     # XML-style self-closing tags
 361             || $s =~ /^<(?:(?!$re_directive_control)\S)+(.*)\S$/s;      # SGML-style tags
 362
 363     for (my $i = 0; $s =~ /^(?:$re_directive_control)?\s+(?:$re_directive_control)?(?:([a-zA-Z][-a-zA-Z0-9]*)\s*=\s*)?('((?:$re_directive|[^'])*)'|"((?:$re_directive|[^"])*)"|((?:$re_directive|[^\s<>])+))/os;) {
 364         my($key, $val, $val_orig, $rest)
 365                 = ($1, (defined $3? $3: defined $4? $4: $5), $2, $');
 366         $i += 1;
 367         $attr{+lc($key)} = [$key, $val, $val_orig, $i];
 368         $s = $rest;
 369         if ($val =~ /$re_tmpl_include/os) {
 370             warn_normal "TMPL_INCLUDE in attribute: $val_orig\n", $lc;
 371         } elsif ($val =~ /$re_tmpl_var/os && $val !~ /$re_tmpl_var_escaped/os) {
 372             # XXX: we probably should not warn if key is "onclick" etc
 373             # XXX: there's just no reasonable thing to suggest
 374             my $suggest = ($key =~ /^(?:action|archive|background|cite|classid|codebase|data|datasrc|for|href|longdesc|profile|src|usemap)$/i? 'URL': 'HTML');
 375             undef $suggest if $key =~ /^(?:onblur|onchange|onclick|ondblclick|onfocus|onkeydown|onkeypress|onkeyup|onload|onmousedown|onmousemove|onmouseout|onmouseover|onmouseup|onreset|onselect|onsubmit|onunload)$/i;
 376             warn_pedantic
 377                     "Suggest ESCAPE=$suggest for TMPL_VAR in attribute \"$key\""
 378                         . ": $val_orig",
 379                     $lc, \$pedantic_tmpl_var_use_in_nonpedantic_mode_p
 380                 if defined $suggest && (pedantic_p || !$pedantic_tmpl_var_use_in_nonpedantic_mode_p);
 381         } elsif ($val_orig !~ /^['"]/) {
 382             my $t = $val; $t =~ s/$re_directive_control//os;
 383             warn_pedantic
 384                 "Unquoted attribute contains character(s) that should be quoted"
 385                     . ": $val_orig",
 386                 $lc, \$pedantic_attribute_error_in_nonpedantic_mode_p
 387                 if $t =~ /[^-\.A-Za-z0-9]/s;
 388         }
 389     }
 390     my $s2 = $s; $s2 =~ s/$re_tmpl_endif_endloop//g; # for the next check
 391     if ($s2 =~ /\S/s) { # should never happen
 392         if ($s =~ /^([^\n]*)\n/s) { # this is even worse
 393             error_normal("Completely confused while extracting attributes: $1", $lc);
 394             error_normal((scalar(split(/\n/, $s)) - 1) . " more line(s) not shown.", undef);
 395             $this->_set_fatal( 1 );
 396         } else {
 397             # There's something wrong with the attribute syntax.
 398             # We might be able to deduce a likely cause by looking more.
 399             if ($s =~ /^[a-z0-9]/is && "<foo $s>" =~ /^$re_tag_compat$/s) {
 400                 warn_normal "Probably missing whitespace before or missing quotation mark near: $s\n", $lc;
 401             } else {
 402                 warn_normal "Strange attribute syntax: $s\n", $lc;
 403             }
 404         }
 405     }
 406     return \%attr;
 407 }
 408
 409 sub _next_token_internal {
 410     my $this = shift;
 411     my($h) = @_;
 412     my($it, $kind);
 413     my $eof_p = 0;
 414     $this->_pop_readahead if $this->has_readahead_p
 415             && !ref $this->_peek_readahead
 416             && !length $this->_peek_readahead;
 417     if (!$this->has_readahead_p) {
 418         my $next = scalar <$h>;
 419         $eof_p = !defined $next;
 420         if (!$eof_p) {
 421             $this->_increment_line_number;
 422             $this->_push_readahead( $next );
 423         }
 424     }
 425     $this->_set_line_number_start( $this->line_number ); # remember 1st line num
 426     if ($this->has_readahead_p && ref $this->_peek_readahead) { # TmplToken obj.
 427         ($it, $kind) = ($this->_pop_readahead, undef);
 428     } elsif ($eof_p && !$this->has_readahead_p) {       # nothing left to do
 429         ;
 430     } elsif ($this->_peek_readahead =~ /^\s+/s) {       # whitespace
 431         ($kind, $it) = (TmplTokenType::TEXT, $&);
 432         $this->_set_readahead( $' );
 433     # FIXME the following (the [<\s] part) is an unreliable HACK :-(
 434     } elsif ($this->_peek_readahead =~ /^(?:[^<]|<[<\s])*(?:[^<\s])/s) {        # non-space normal text
 435         ($kind, $it) = (TmplTokenType::TEXT, $&);
 436         $this->_set_readahead( $' );
 437         warn_normal "Unescaped < in $it\n", $this->line_number_start
 438                 if !$this->cdata_mode_p && $it =~ /</s;
 439     } else {                            # tag/declaration/processing instruction
 440         my $ok_p = 0;
 441         my $bad_comment_p = 0;
 442         for (my $cdata_close = $this->cdata_close;;) {
 443             if ($this->cdata_mode_p) {
 444                 my $next = $this->_pop_readahead;
 445                 if ($next =~ /^$cdata_close/is) {
 446                     ($kind, $it) = (TmplTokenType::TAG, $&);
 447                     $this->_push_readahead( $' );
 448                     $ok_p = 1;
 449                 } elsif ($next =~ /^((?:(?!$cdata_close).)+)($cdata_close)/is) {
 450                     ($kind, $it) = (TmplTokenType::TEXT, $1);
 451                     $this->_push_readahead( "$2$'" );
 452                     $ok_p = 1;
 453                 } else {
 454                     ($kind, $it) = (TmplTokenType::TEXT, $next);
 455                     $ok_p = 1;
 456                 }
 457             } elsif ($this->_peek_readahead =~ /^$re_tag_compat/os) {
 458                 # If we detect a "closed start tag" but we know that the
 459                 # following token looks like a TMPL_VAR, don't stop
 460                 my($head, $tail, $post) = ($1, $2, $3);
 461                 if ($tail eq '' && $post =~ $re_tmpl_var) {
 462                     # Don't bother to show the warning if we're too confused
 463                     # FIXME. There's no method for _closed_start_tag_warning
 464                     if (!defined $this->{'_closed_start_tag_warning'}
 465                         || ($this->{'_closed_start_tag_warning'}->[0] eq $head
 466                         && $this->{'_closed_start_tag_warning'}->[1] != $this->line_number - 1)) {
 467                     warn_normal "Possible SGML \"closed start tag\" notation: $head<\n", $this->line_number
 468                             if split(/\n/, $head) < 10;
 469                     }
 470                     $this->{'_closed_start_tag_warning'} = [$head, $this->line_number];
 471                 } else {
 472                     ($kind, $it) = (TmplTokenType::TAG, "$head>");
 473                     $this->_set_readahead( $post );
 474                     $ok_p = 1;
 475                     warn_normal "SGML \"closed start tag\" notation: $head<\n", $this->line_number if $tail eq '';
 476                 }
 477             } elsif ($this->_peek_readahead =~ /^<!--(?:(?!-->)$re_directive*.)*-->/os) {
 478                 ($kind, $it) = (TmplTokenType::COMMENT, $&);
 479                 $this->_set_readahead( $' );
 480                 $ok_p = 1;
 481                 $bad_comment_p = 1;
 482             }
 483         last if $ok_p;
 484             my $next = scalar <$h>;
 485             $eof_p = !defined $next;
 486         last if $eof_p;
 487             $this->_increment_line_number;
 488             $this->_append_readahead( $next );
 489         }
 490         if ($kind ne TmplTokenType::TAG) {
 491             ;
 492         } elsif ($it =~ /^<!/) {
 493             $kind = TmplTokenType::DECL;
 494             $kind = TmplTokenType::COMMENT if $it =~ /^<!--(?:(?!-->).)*-->/;
 495             if ($kind == TmplTokenType::COMMENT && $it =~ /^<!--\s*#include/s) {
 496                 warn_normal "Apache #include directive found instead of HTML::Template <TMPL_INCLUDE> directive", $this->line_number_start;
 497             }
 498         } elsif ($it =~ /^<\?/) {
 499             $kind = TmplTokenType::PI;
 500         }
 501         if ($it =~ /^$re_directive/ios && !$this->cdata_mode_p) {
 502             $kind = TmplTokenType::DIRECTIVE;
 503         } elsif ($bad_comment_p) {
 504             warn_normal sprintf("Syntax error in comment: %s\n", $it),
 505                     $this->line_number_start;
 506             $this->_set_syntaxerror( 1 );
 507         }
 508         if (!$ok_p && $eof_p) {
 509             ($kind, $it) = (TmplTokenType::UNKNOWN, $this->_peek_readahead);
 510             $this->_set_readahead, undef;
 511             $this->_set_syntaxerror( 1 );
 512         }
 513     }
 514     warn_normal "Unrecognizable token found: "
 515             . (split(/\n/, $it) < 10? $it: '(too confused to show details)')
 516             . "\n", $this->line_number_start
 517         if $kind == TmplTokenType::UNKNOWN;
 518     return defined $it? (ref $it? $it: TmplToken->new($it, $kind, $this->line_number, $this->filename)): undef;
 519 }
 520
 521 sub _next_token_intermediate {
 522     my $this = shift;
 523     my $h = $this->_handle;
 524     my $it;
 525     if (!$this->cdata_mode_p) {
 526         $it = $this->_next_token_internal($h);
 527         if (defined $it && $it->type == TmplTokenType::TAG) {
 528             if ($it->string =~ /^<(script|style|textarea)\b/is) {
 529                 $this->_set_cdata_mode( 1 );
 530                 $this->_set_cdata_close( "</$1\\s*>" );
 531                 $this->_set_pcdata_mode( 0 );
 532                 $this->_set_js_mode( lc($1) eq 'script' );
 533 #           } elsif ($it->string =~ /^<(title)\b/is) {
 534 #               $this->_set_cdata_mode( 1 );
 535 #               $this->_set_cdata_close( "</$1\\s*>" );
 536 #               $this->_set_pcdata_mode( 1 );
 537             }
 538             $it->set_attributes( $this->_extract_attributes($it->string, $it->line_number) );
 539         }
 540     } else {
 541         my $eof_p = 0;
 542         for ($it = '', my $cdata_close = $this->cdata_close;;) {
 543             my $next = $this->_next_token_internal($h);
 544             $eof_p = !defined $next;
 545         last if $eof_p;
 546             if (defined $next && $next->string =~ /$cdata_close/is) {
 547                 $this->_push_readahead( $next ); # push entire TmplToken object
 548                 $this->_set_cdata_mode( 0 );
 549             }
 550         last unless $this->cdata_mode_p;
 551             $it .= $next->string;
 552         }
 553         if ($eof_p) {
 554             $it = undef;
 555             error_normal "Unexpected end of file while looking for "
 556                     . $this->cdata_close
 557                     . "\n", $this->line_number_start;
 558             $this->_set_fatal( 1 );
 559             $this->_set_syntaxerror( 1 );
 560         }
 561         if ($this->pcdata_mode_p) {
 562             my $check = $it;
 563             $check =~ s/$re_directive//gos;
 564             warn_pedantic "Markup found in PCDATA\n", $this->line_number,
 565                             \$pedantic_error_markup_in_pcdata_p
 566                     if $check =~ /$re_tag_compat/s;
 567         }
 568         # PCDATA should be treated as text, not CDATA
 569         # Actually it should be treated as TEXT_PARAMETRIZED :-(
 570         $it = TmplToken->new( $it,
 571                         ($this->pcdata_mode_p?
 572                             TmplTokenType::TEXT: TmplTokenType::CDATA),
 573                         $this->line_number, $this->filename )
 574                 if defined $it;
 575         if ($this->js_mode_p) {
 576             my $s0 = $it->string;
 577             my @head = ();
 578             my @tail = ();
 579             if ($s0 =~ /^(\s*<!--\s*)(.*)(\s*--\s*>\s*)$/s) {
 580                 push @head, $1;
 581                 push @tail, $3;
 582                 $s0 = $2;
 583             }
 584             push @head, split_js $s0;
 585             $it->set_js_data( identify_js_translatables(@head, @tail) );
 586         }
 587         $this->_set_pcdata_mode, 0;
 588         $this->_set_cdata_close, undef unless !defined $it;
 589     }
 590     return $it;
 591 }
 592
 593 sub _token_groupable1_p ($) { # as first token, groupable into TEXT_PARAMETRIZED
 594     my($t) = @_;
 595     return ($t->type == TmplTokenType::TEXT && $t->string !~ /^[,\.:\|\s]+$/is)
 596         || ($t->type == TmplTokenType::DIRECTIVE
 597                 && $t->string =~ /^(?:$re_tmpl_var)$/os)
 598         || ($t->type == TmplTokenType::TAG
 599                 && ($t->string =~ /^<(?:a|b|em|h[123456]|i|u)\b/is
 600                 || ($t->string =~ /^<input\b/is
 601                     && $t->attributes->{'type'}->[1] =~ /^(?:radio|text)$/is)
 602                     ))
 603 }
 604
 605 sub _token_groupable2_p ($) { # as other token, groupable into TEXT_PARAMETRIZED
 606     my($t) = @_;
 607     return ($t->type == TmplTokenType::TEXT && ($t->string =~ /^\s*$/s || $t->string !~ /^[\|\s]+$/is))
 608         || ($t->type == TmplTokenType::DIRECTIVE
 609                 && $t->string =~ /^(?:$re_tmpl_var)$/os)
 610         || ($t->type == TmplTokenType::TAG
 611                 && ($t->string =~ /^<\/?(?:a|b|em|h[123456]|i|u)\b/is
 612                 || ($t->string =~ /^<input\b/is
 613                     && $t->attributes->{'type'}->[1] =~ /^(?:radio|text)$/is)))
 614 }
 615
 616 sub _quote_cformat ($) {
 617     my($s) = @_;
 618     $s =~ s/%/%%/g;
 619     return $s;
 620 }
 621
 622 sub string_canon ($) {
 623     my($s) = @_;
 624     if (1) { # FIXME
 625         # Fold all whitespace into single blanks
 626         $s =~ s/\s+/ /gs;
 627     }
 628     return $s;
 629 }
 630
 631 sub _formalize_string_cformat ($) {
 632     my($s) = @_;
 633     return _quote_cformat string_canon $s;
 634 }
 635
 636 sub _formalize ($) {
 637     my($t) = @_;
 638     return $t->type == TmplTokenType::DIRECTIVE? '%s':
 639            $t->type == TmplTokenType::TEXT?
 640                    _formalize_string_cformat($t->string):
 641            $t->type == TmplTokenType::TAG?
 642                    ($t->string =~ /^<a\b/is? '<a>':
 643                     $t->string =~ /^<input\b/is? (
 644                             lc $t->attributes->{'type'}->[1] eq 'text' ? '%S':
 645                             '%p'):
 646                     _quote_cformat($t->string)):
 647                _quote_cformat($t->string);
 648 }
 649
 650 sub _optimize {
 651     my $this = shift;
 652     my @structure = @_;
 653     my $undo_trailing_blanks = sub {
 654                 for (my $i = $#structure; $i >= 0; $i -= 1) {
 655                 last unless ($structure[$i]->type == TmplTokenType::TEXT && blank_p($structure[$i]->string)) ;#|| ($structure[$i]->type == TmplTokenType::TAG && $structure[$i]->string =~ /^<br\b/is);
 656                     # Queue element structure: [reanalysis-p, token]
 657                     push @{$this->{_queue}}, [1, pop @structure];
 658                 }
 659             };
 660     &$undo_trailing_blanks;
 661     while (@structure >= 2) {
 662         my $something_done_p = 0;
 663         # FIXME: If the last token is a close tag but there are no tags
 664         # FIXME: before it, drop the close tag back into the queue. This
 665         # FIXME: is an ugly hack to get rid of "foo %s</h1>" type mess.
 666         if (@structure >= 2
 667                 && $structure[$#structure]->type == TmplTokenType::TAG
 668                 && $structure[$#structure]->string =~ /^<\//s) {
 669             my $has_other_tags_p = 0;
 670             for (my $i = 0; $i < $#structure; $i += 1) {
 671                 $has_other_tags_p = 1
 672                         if $structure[$i]->type == TmplTokenType::TAG;
 673             last if $has_other_tags_p;
 674             }
 675             if (!$has_other_tags_p) {
 676                 push @{$this->{_queue}}, [0, pop @structure]
 677                 &$undo_trailing_blanks;
 678                 $something_done_p = 1;
 679             }
 680         }
 681         # FIXME: Do the same ugly hack for the last token being a ( or [
 682         if (@structure >= 2
 683                 && $structure[$#structure]->type == TmplTokenType::TEXT
 684                 && $structure[$#structure]->string =~ /^[\(\[]$/) { # not )]
 685             push @{$this->{_queue}}, [1, pop @structure];
 686             &$undo_trailing_blanks;
 687             $something_done_p = 1;
 688         }
 689         # FIXME: If the first token is an open tag, but there is no
 690         # FIXME: corresponding close tag, "drop the open tag", i.e.,
 691         # FIXME: requeue everything for reanalysis, except the frist tag. :-(
 692         if (@structure >= 2
 693                 && $structure[0]->type == TmplTokenType::TAG
 694                 && $structure[0]->string =~ /^<([a-z0-9]+)/is
 695                 && (my $tag = $1) !~ /^(?:br|hr|img|input)\b/is
 696         ) {
 697             my $tag_open_count = 1;
 698             for (my $i = 1; $i <= $#structure; $i += 1) {
 699                 if ($structure[$i]->type == TmplTokenType::TAG) {
 700                     if ($structure[$i]->string =~ /^<(\/?)$tag\b/is) {
 701                         $tag_open_count += ($1? -1: +1);
 702                     }
 703                 }
 704             }
 705             if ($tag_open_count > 0) {
 706                 for (my $i = $#structure; $i; $i -= 1) {
 707                     push @{$this->{_queue}}, [1, pop @structure];
 708                 }
 709                 $something_done_p = 1;
 710             }
 711         }
 712         # FIXME: If the first token is an open tag, the last token is the
 713         # FIXME: corresponding close tag, and there are no other close tags
 714         # FIXME: inbetween, requeue the tokens from the second token on,
 715         # FIXME: flagged as ok for re-analysis
 716         if (@structure >= 3
 717                 && $structure[0]->type == TmplTokenType::TAG
 718                 && $structure[0]->string =~ /^<([a-z0-9]+)/is && (my $tag = $1)
 719                 && $structure[$#structure]->type == TmplTokenType::TAG
 720                 && $structure[$#structure]->string =~ /^<\/$1\s*>$/is) {
 721             my $has_other_open_or_close_tags_p = 0;
 722             for (my $i = 1; $i < $#structure; $i += 1) {
 723                 $has_other_open_or_close_tags_p = 1
 724                         if $structure[$i]->type == TmplTokenType::TAG
 725                         && $structure[$i]->string =~ /^<\/?$tag\b/is;
 726             last if $has_other_open_or_close_tags_p;
 727             }
 728             if (!$has_other_open_or_close_tags_p) {
 729                 for (my $i = $#structure; $i; $i -= 1) {
 730                     push @{$this->{_queue}}, [1, pop @structure];
 731                 }
 732                 $something_done_p = 1;
 733             }
 734         }
 735     last if !$something_done_p;
 736     }
 737     return @structure;
 738 }
 739
 740 sub looks_plausibly_like_groupable_text_p (@) {
 741     my @structure = @_;
 742     # The text would look plausibly groupable if all open tags are also closed.
 743     my @tags = ();
 744     my $error_p = 0;
 745     for (my $i = 0; $i <= $#structure; $i += 1) {
 746         if ($structure[$i]->type == TmplTokenType::TAG) {
 747             my $form = $structure[$i]->string;
 748             if ($form =~ /^<([A-Z0-9]+)/is) {
 749                 my $tag = lc($1);
 750                 if ($tag !~ /^(?:br|input)$/is && $form !~ /\/>$/is) {
 751                     push @tags, $tag;
 752                 }
 753             } elsif ($form =~ /^<\/([A-Z0-9]+)/is) {
 754                 if (@tags && lc($1) eq $tags[$#tags]) {
 755                     pop @tags;
 756                 } else {
 757                     $error_p = 1;
 758                 }
 759             }
 760         } elsif ($structure[$i]->type != TmplTokenType::TEXT) {
 761             $error_p = 1;
 762         }
 763     last if $error_p;
 764     }
 765     return !$error_p && !@tags;
 766 }
 767
 768 sub next_token {
 769     my $this = shift;
 770     my $h = $this->_handle;
 771     my $it;
 772     $this->{_queue} = [] unless defined $this->{_queue};
 773
 774     # Elements in the queue are ordered pairs. The first in the ordered pair
 775     # specifies whether we are allowed to reanalysis; the second is the token.
 776     if (@{$this->{_queue}} && !$this->{_queue}->[$#{$this->{_queue}}]->[0]) {
 777         $it = (pop @{$this->{_queue}})->[1];
 778     } else {
 779         if (@{$this->{_queue}}) {
 780             $it = (pop @{$this->{_queue}})->[1];
 781         } else {
 782             $it = $this->_next_token_intermediate($h);
 783         }
 784         if (!$this->cdata_mode_p && $this->allow_cformat_p && defined $it
 785             && ($it->type == TmplTokenType::TEXT?
 786                 !blank_p( $it->string ): _token_groupable1_p( $it ))) {
 787             my @structure = ( $it );
 788             my @tags = ();
 789             my $next = undef;
 790             my($nonblank_text_p, $parametrized_p, $with_anchor_p, $with_input_p) = (0, 0, 0, 0);
 791             if ($it->type == TmplTokenType::TEXT) {
 792                 $nonblank_text_p = 1 if !blank_p( $it->string );
 793             } elsif ($it->type == TmplTokenType::DIRECTIVE) {
 794                 $parametrized_p = 1;
 795             } elsif ($it->type == TmplTokenType::TAG && $it->string =~ /^<([A-Z0-9]+)/is) {
 796                 my $tag = lc($1);
 797                 push @tags, $tag if $tag !~ /^(?:br|input)$/i;
 798                 $with_anchor_p = 1 if $tag eq 'a';
 799                 $with_input_p = 1 if $tag eq 'input';
 800             }
 801             # We hate | and || in msgid strings, so we try to avoid them
 802             for (my $i = 1, my $quit_p = 0, my $quit_next_p = ($it->type == TmplTokenType::TEXT && $it->string =~ /^\|+$/s);; $i += 1) {
 803                 if (@{$this->{_queue}}) {
 804                     $next = (pop @{$this->{_queue}})->[1];
 805                 } else {
 806                     $next = $this->_next_token_intermediate($h);
 807                 }
 808                 push @structure, $next; # for consistency (with initialization)
 809             last unless defined $next && _token_groupable2_p( $next );
 810             last if $quit_next_p;
 811                 if ($next->type == TmplTokenType::TEXT) {
 812                     $nonblank_text_p = 1 if !blank_p( $next->string );
 813                     $quit_p = 1 if $next->string =~ /^\|+$/s; # We hate | and ||
 814                 } elsif ($next->type == TmplTokenType::DIRECTIVE) {
 815                     $parametrized_p = 1;
 816                 } elsif ($next->type == TmplTokenType::TAG) {
 817                     if ($next->string =~ /^<([A-Z0-9]+)/is) {
 818                         my $tag = lc($1);
 819                         push @tags, $tag if $tag !~ /^(?:br|input)$/i;
 820                         $with_anchor_p = 1 if $tag eq 'a';
 821                         $with_input_p = 1 if $tag eq 'input';
 822                     } elsif ($next->string =~ /^<\/([A-Z0-9]+)/is) {
 823                         my $close = lc($1);
 824                         $quit_p = 1 unless @tags && $close eq $tags[$#tags];
 825                         $quit_next_p = 1 if $close =~ /^h\d$/;
 826                         pop @tags;
 827                     }
 828                 }
 829             last if $quit_p;
 830             }
 831             # Undo the last token, allowing reanalysis
 832             push @{$this->{_queue}}, [1, pop @structure];
 833             # Simply it a bit more
 834             @structure = $this->_optimize( @structure );
 835             if (@structure < 2) {
 836                 # Nothing to do
 837                 ;
 838             } elsif ($nonblank_text_p && ($parametrized_p || $with_anchor_p || $with_input_p)) {
 839                 # Create the corresponding c-format string
 840                 my $string = join('', map { $_->string } @structure);
 841                 my $form = join('', map { _formalize $_ } @structure);
 842                 my($a_counter, $input_counter) = (0, 0);
 843                 $form =~ s/<a>/ $a_counter += 1, "<a$a_counter>" /egs;
 844                 $form =~ s/<input>/ $input_counter += 1, "<input$input_counter>" /egs;
 845                 $it = TmplToken->new($string, TmplTokenType::TEXT_PARAMETRIZED,
 846                         $it->line_number, $it->pathname);
 847                 $it->set_form( $form );
 848                 $it->set_children( @structure );
 849             } elsif ($nonblank_text_p
 850                     && looks_plausibly_like_groupable_text_p( @structure )
 851                     && $structure[$#structure]->type == TmplTokenType::TEXT) {
 852                 # Combine the strings
 853                 my $string = join('', map { $_->string } @structure);
 854                 $it = TmplToken->new($string, TmplTokenType::TEXT,
 855                         $it->line_number, $it->pathname);;
 856             } else {
 857                 # Requeue the tokens thus seen for re-emitting, allow reanalysis
 858                 for (;;) {
 859                     push @{$this->{_queue}}, [1, pop @structure];
 860                 last if !@structure;
 861                 }
 862                 $it = (pop @{$this->{_queue}})->[1];
 863             }
 864         }
 865     }
 866     if (defined $it && $it->type == TmplTokenType::TEXT) {
 867         my $form = string_canon $it->string;
 868         $it->set_form( $form );
 869     }
 870     return $it;
 871 }
 872
 873 ###############################################################################
 874
 875 # Other simple functions (These are not methods)
 876
 877 sub blank_p ($) {
 878     my($s) = @_;
 879     return $s =~ /^(?:\s|\&nbsp$re_end_entity|$re_tmpl_var|$re_xsl)*$/os;
 880 }
 881
 882 sub trim ($) {
 883     my($s0) = @_;
 884     my $l0 = length $s0;
 885     my $s = $s0;
 886     $s =~ s/^(\s|\&nbsp$re_end_entity)+//os; my $l1 = $l0 - length $s;
 887     $s =~ s/(\s|\&nbsp$re_end_entity)+$//os; my $l2 = $l0 - $l1 - length $s;
 888     return wantarray? (substr($s0, 0, $l1), $s, substr($s0, $l0 - $l2)): $s;
 889 }
 890
 891 sub quote_po ($) {
 892     my($s) = @_;
 893     # Locale::PO->quote is buggy, it doesn't quote newlines :-/
 894     $s =~ s/([\\"])/\\\1/gs;
 895     $s =~ s/\n/\\n/g;
 896     #$s =~ s/[\177-\377]/ sprintf("\\%03o", ord($&)) /egs;
 897     return "\"$s\"";
 898 }
 899
 900 # Some functions that shouldn't be here... should be moved out some time
 901 sub parametrize ($$$$) {
 902     my($fmt_0, $cformat_p, $t, $f) = @_;
 903     my $it = '';
 904     if ($cformat_p) {
 905         my @params = $t->parameters_and_fields;
 906         for (my $n = 0, my $fmt = $fmt_0; length $fmt;) {
 907             if ($fmt =~ /^[^%]+/) {
 908                 $fmt = $';
 909                 $it .= $&;
 910             } elsif ($fmt =~ /^%%/) {
 911                 $fmt = $';
 912                 $it .= '%';
 913             } elsif ($fmt =~ /^%(?:(\d+)\$)?(?:(\d+)(?:\.(\d+))?)?s/s) {
 914                 $n += 1;
 915                 my($i, $width, $prec) = ((defined $1? $1: $n), $2, $3);
 916                 $fmt = $';
 917                 if (defined $width && defined $prec && !$width && !$prec) {
 918                     ;
 919                 } elsif (defined $params[$i - 1]) {
 920                     my $param = $params[$i - 1];
 921                     warn_normal "$fmt_0: $&: Expected a TMPL_VAR, but found a "
 922                             . $param->type->to_string . "\n", undef
 923                             if $param->type != TmplTokenType::DIRECTIVE;
 924                     warn_normal "$fmt_0: $&: Unsupported "
 925                                 . "field width or precision\n", undef
 926                             if defined $width || defined $prec;
 927                     warn_normal "$fmt_0: $&: Parameter $i not known", undef
 928                             unless defined $param;
 929                     $it .= defined $f? &$f( $param ): $param->string;
 930                 }
 931             } elsif ($fmt =~ /^%(?:(\d+)\$)?(?:(\d+)(?:\.(\d+))?)?([pS])/s) {
 932                 $n += 1;
 933                 my($i, $width, $prec, $conv) = ((defined $1? $1: $n), $2, $3, $4);
 934                 $fmt = $';
 935
 936                 my $param = $params[$i - 1];
 937                 if (!defined $param) {
 938                     warn_normal "$fmt_0: $&: Parameter $i not known", undef;
 939                 } else {
 940                     if ($param->type == TmplTokenType::TAG
 941                             && $param->string =~ /^<input\b/is) {
 942                         my $type = defined $param->attributes?
 943                                 lc($param->attributes->{'type'}->[1]): undef;
 944                         if ($conv eq 'S') {
 945                             warn_normal "$fmt_0: $&: Expected type=text, "
 946                                         . "but found type=$type", undef
 947                                     unless $type eq 'text';
 948                         } elsif ($conv eq 'p') {
 949                             warn_normal "$fmt_0: $&: Expected type=radio, "
 950                                         . "but found type=$type", undef
 951                                     unless $type eq 'radio';
 952                         }
 953                     } else {
 954                         warn_normal "$&: Expected an INPUT, but found a "
 955                                 . $param->type->to_string . "\n", undef
 956                     }
 957                     warn_normal "$fmt_0: $&: Unsupported "
 958                                 . "field width or precision\n", undef
 959                             if defined $width || defined $prec;
 960                     $it .= defined $f? &$f( $param ): $param->string;
 961                 }
 962             } elsif ($fmt =~ /^%[^%a-zA-Z]*[a-zA-Z]/) {
 963                 $fmt = $';
 964                 $it .= $&;
 965                 die "$&: Unknown or unsupported format specification\n"; #XXX
 966             } else {
 967                 die "$&: Completely confused parametrizing\n";#XXX
 968             }
 969         }
 970     }
 971     my @anchors = $t->anchors;
 972     for (my $n = 0, my $fmt = $it, $it = ''; length $fmt;) {
 973         if ($fmt =~ /^(?:(?!<a\d+>).)+/is) {
 974             $fmt = $';
 975             $it .= $&;
 976         } elsif ($fmt =~ /^<a(\d+)>/is) {
 977             $n += 1;
 978             my $i  = $1;
 979             $fmt = $';
 980             my $anchor = $anchors[$i - 1];
 981             warn_normal "$&: Anchor $1 not found for msgid \"$fmt_0\"", undef #FIXME
 982                     unless defined $anchor;
 983             $it .= $anchor->string;
 984         } else {
 985             die "Completely confused decoding anchors: $fmt\n";#XXX
 986         }
 987     }
 988     return $it;
 989 }
 990
 991 sub charset_canon ($) {
 992     my($charset) = @_;
 993     $charset = uc($charset);
 994     $charset = "$1-$2" if $charset =~ /^(ISO|UTF)(\d.*)/i;
 995     $charset = 'Big5' if $charset eq 'BIG5'; # "Big5" must be in mixed case
 996     return $charset;
 997 }
 998
 999 use vars qw( @latin1_utf8 );
1000 @latin1_utf8 = (
1001     "\302\200", "\302\201", "\302\202", "\302\203", "\302\204", "\302\205",
1002     "\302\206", "\302\207", "\302\210", "\302\211", "\302\212", "\302\213",
1003     "\302\214", "\302\215",   undef,      undef,    "\302\220", "\302\221",
1004     "\302\222", "\302\223", "\302\224", "\302\225", "\302\226", "\302\227",
1005     "\302\230", "\302\231", "\302\232", "\302\233", "\302\234", "\302\235",
1006     "\302\236", "\302\237", "\302\240", "\302\241", "\302\242", "\302\243",
1007     "\302\244", "\302\245", "\302\246", "\302\247", "\302\250", "\302\251",
1008     "\302\252", "\302\253", "\302\254", "\302\255", "\302\256", "\302\257",
1009     "\302\260", "\302\261", "\302\262", "\302\263", "\302\264", "\302\265",
1010     "\302\266", "\302\267", "\302\270", "\302\271", "\302\272", "\302\273",
1011     "\302\274", "\302\275", "\302\276", "\302\277", "\303\200", "\303\201",
1012     "\303\202", "\303\203", "\303\204", "\303\205", "\303\206", "\303\207",
1013     "\303\210", "\303\211", "\303\212", "\303\213", "\303\214", "\303\215",
1014     "\303\216", "\303\217", "\303\220", "\303\221", "\303\222", "\303\223",
1015     "\303\224", "\303\225", "\303\226", "\303\227", "\303\230", "\303\231",
1016     "\303\232", "\303\233", "\303\234", "\303\235", "\303\236", "\303\237",
1017     "\303\240", "\303\241", "\303\242", "\303\243", "\303\244", "\303\245",
1018     "\303\246", "\303\247", "\303\250", "\303\251", "\303\252", "\303\253",
1019     "\303\254", "\303\255", "\303\256", "\303\257", "\303\260", "\303\261",
1020     "\303\262", "\303\263", "\303\264", "\303\265", "\303\266", "\303\267",
1021     "\303\270", "\303\271", "\303\272", "\303\273", "\303\274", "\303\275",
1022     "\303\276", "\303\277" );
1023
1024 sub charset_convert ($$$) {
1025     my($s, $charset_in, $charset_out) = @_;
1026     if ($s !~ /[\200-\377]/s) { # FIXME: don't worry about iso2022 for now
1027         ;
1028     } elsif ($charset_in eq 'ISO-8859-1' && $charset_out eq 'UTF-8') {
1029         $s =~ s/[\200-\377]/ $latin1_utf8[ord($&) - 128] /egs;
1030     } elsif ($charset_in ne $charset_out) {
1031         VerboseWarnings::warn_normal "conversion from $charset_in to $charset_out is not supported\n", undef;
1032     }
1033     return $s;
1034 }
1035
1036 ###############################################################################
1037
1038 =pod
1039
1040 In addition to the basic scanning, this class will also perform
1041 the following:
1042
1043 =over
1044
1045 =item -
1046
1047 Emulation of c-format strings (see below)
1048
1049 =item -
1050
1051 Display of warnings for certain things that affects either the
1052 ability of this class to yield correct output, or things that
1053 are known to cause the original template to cause trouble.
1054
1055 =item -
1056
1057 Automatic correction of some of the things warned about
1058 (e.g., SGML "closed start tag" notation).
1059
1060 =back
1061
1062 =head2 c-format strings emulation
1063
1064 Because English word order is not universal, a simple extraction
1065 of translatable strings may yield some strings like "Accounts for"
1066 or ambiguous strings like "in". This makes the resulting strings
1067 difficult to translate, but does not affect all languages alike.
1068 For example, Chinese (with a somewhat different word order) would
1069 be hit harder, but French would be relatively unaffected.
1070
1071 To overcome this problem, the scanner can be configured to detect
1072 patterns with <TMPL_VAR> directives (as well as certain HTML tags),
1073 and try to construct a larger pattern that will appear in the PO
1074 file as c-format strings with %s placeholders. This additional
1075 step allows the translator to deal with cases where word order
1076 is different (replacing %s with %1$s, %2$s, etc.), or when certain
1077 words will require certain inflectional suffixes in sentences.
1078
1079 Because this is an incompatible change, this mode must be explicitly
1080 turned on using the set_cformat(1) method call.
1081
1082 =head2 The flag characters
1083
1084 The character % is followed by zero or more of the following flags:
1085
1086 =over
1087
1088 =item #
1089
1090 The value comes from HTML <INPUT> elements.
1091 This abuse of the flag character is somewhat reasonable,
1092 since TMPL_VAR and INPUT are both variables, but of different kinds.
1093
1094 =back
1095
1096 =head2 The field width and precision
1097
1098 An optional 0.0 can be specified for %s to specify
1099 that the <TMPL_VAR> should be suppressed.
1100
1101 =head2 The conversion specifier
1102
1103 =over
1104
1105 =item p
1106
1107 Specifies any input field that is neither text nor hidden
1108 (which currently mean radio buttons).
1109 The p conversion specifier is chosen because this does not
1110 evoke any certain sensible data type.
1111
1112 =item S
1113
1114 Specifies a text input field (<INPUT TYPE=TEXT>).
1115 This use of the S conversion specifier is somewhat reasonable,
1116 since text input fields contain values of undeterminable type,
1117 which can be treated as strings.
1118
1119 =item s
1120
1121 Specifies a <TMPL_VAR>.
1122 This use of the o conversion specifier is somewhat reasonable,
1123 since <TMPL_VAR> denotes values of undeterminable type, which
1124 can be treated as strings.
1125
1126 =back
1127
1128 =head1 BUGS
1129
1130 There is no code to save the tag name anywhere in the scanned token.
1131
1132 The use of <AI<i>> to stand for the I<i>th anchor
1133 is not very well thought out.
1134 Some abuse of c-format specifies might have been more appropriate.
1135
1136 =head1 HISTORY
1137
1138 This tokenizer is mostly based
1139 on Ambrose's hideous Perl script known as subst.pl.
1140
1141 =cut
1142
1143 1;