misc/translator/TmplTokenizer.pm

   1 package TmplTokenizer;
   2
   3 use strict;
   4 use TmplTokenType;
   5 use TmplToken;
   6 use VerboseWarnings qw( pedantic_p error_normal warn_normal warn_pedantic );
   7 require Exporter;
   8
   9 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  10
  11 ###############################################################################
  12
  13 =head1 NAME
  14
  15 TmplTokenizer.pm - Simple-minded tokenizer class for HTML::Template .tmpl files
  16
  17 =head1 DESCRIPTION
  18
  19 Because .tmpl files contains HTML::Template directives
  20 that tend to confuse real parsers (e.g., HTML::Parse),
  21 it might be better to create a customized scanner
  22 to scan the template files for tokens.
  23 This module is a simple-minded attempt at such a scanner.
  24
  25 =cut
  26
  27 ###############################################################################
  28
  29 $VERSION = 0.02;
  30
  31 @ISA = qw(Exporter);
  32 @EXPORT_OK = qw();
  33
  34 use vars qw( $pedantic_attribute_error_in_nonpedantic_mode_p );
  35 use vars qw( $pedantic_tmpl_var_use_in_nonpedantic_mode_p );
  36 use vars qw( $pedantic_error_markup_in_pcdata_p );
  37
  38 ###############################################################################
  39
  40 # Hideous stuff
  41 use vars qw( $re_directive $re_tmpl_var $re_tmpl_var_escaped $re_tmpl_include );
  42 use vars qw( $re_directive_control $re_tmpl_endif_endloop );
  43 BEGIN {
  44     # $re_directive must not do any backreferences
  45     $re_directive = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:VAR|LOOP|INCLUDE|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
  46     # TMPL_VAR or TMPL_INCLUDE
  47     $re_tmpl_var = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
  48     $re_tmpl_include = q{<(?:(?i)(?:!--\s*)?TMPL_(?:INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
  49     # TMPL_VAR ESCAPE=1/HTML/URL
  50     $re_tmpl_var_escaped = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR|INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))\s+ESCAPE=(?:1|HTML|URL)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
  51     # Any control flow directive
  52     $re_directive_control = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:LOOP|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
  53     # /LOOP or /IF or /UNLESS
  54     $re_tmpl_endif_endloop = q{<(?:(?i)(?:!--\s*)?\/TMPL_(?:LOOP|IF|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
  55 }
  56
  57 # Hideous stuff from subst.pl, slightly modified to use the above hideous stuff
  58 # Note: The $re_tag's set $1 (<tag), $2 (>), and $3 (rest of string)
  59 use vars qw( $re_comment $re_entity_name $re_end_entity $re_etag );
  60 use vars qw( $re_tag_strict $re_tag_compat @re_tag );
  61 sub re_tag ($) {
  62    my($compat) = @_;
  63    my $etag = $compat? '>': '<>\/';
  64    # This is no longer similar to the original regexp in subst.pl :-(
  65    # Note that we don't want <> in compat mode; Mozilla knows about <
  66    q{(<\/?(?:|(?:"(?:} . $re_directive . q{|[^"])*"|'(?:} . $re_directive . q{|[^'])*'|--(?:(?!--)(?:$re_directive)*.)*--|(?:}
  67    . $re_directive
  68    . q{|(?!--)[^"'<>} . $etag . q{]))+))([} . $etag . q{]|(?=<))(.*)};
  69 }
  70 BEGIN {
  71     $re_comment = '(?:--(?:[^-]|-[^-])*--)';
  72     $re_entity_name = '(?:[^&%#;<>\s]+)'; # NOTE: not really correct SGML
  73     $re_end_entity = '(?:;|$|(?=\s))'; # semicolon or before-whitespace
  74     $re_etag = q{(?:<\/?(?:"[^"]*"|'[^']*'|[^"'>\/])*[>\/])}; # end-tag
  75     @re_tag = ($re_tag_strict, $re_tag_compat) = (re_tag(0), re_tag(1));
  76 }
  77
  78 # End of the hideous stuff
  79
  80 use vars qw( $serial );
  81
  82 ###############################################################################
  83
  84 sub FATAL_P             () {'fatal-p'}
  85 sub SYNTAXERROR_P       () {'syntaxerror-p'}
  86
  87 sub FILENAME            () {'input'}
  88 sub HANDLE              () {'handle'}
  89
  90 sub READAHEAD           () {'readahead'}
  91 sub LINENUM_START       () {'lc_0'}
  92 sub LINENUM             () {'lc'}
  93 sub CDATA_MODE_P        () {'cdata-mode-p'}
  94 sub CDATA_CLOSE         () {'cdata-close'}
  95 sub PCDATA_MODE_P       () {'pcdata-mode-p'}    # additional submode for CDATA
  96 sub JS_MODE_P           () {'js-mode-p'}        # cdata-mode-p must also be true
  97
  98 sub ALLOW_CFORMAT_P     () {'allow-cformat-p'}
  99
 100 sub new {
 101     my $this = shift;
 102     my($input) = @_;
 103     my $class = ref($this) || $this;
 104     my $self = {};
 105     bless $self, $class;
 106
 107     my $handle = sprintf('TMPLTOKENIZER%d', $serial);
 108     $serial += 1;
 109
 110     no strict;
 111     open($handle, "<$input") || die "$input: $!\n";
 112     use strict;
 113     $self->{+FILENAME} = $input;
 114     $self->{+HANDLE} = $handle;
 115     $self->{+READAHEAD} = [];
 116     return $self;
 117 }
 118
 119 ###############################################################################
 120
 121 # Simple getters
 122
 123 sub filename {
 124     my $this = shift;
 125     return $this->{+FILENAME};
 126 }
 127
 128 sub _handle {
 129     my $this = shift;
 130     return $this->{+HANDLE};
 131 }
 132
 133 sub fatal_p {
 134     my $this = shift;
 135     return $this->{+FATAL_P};
 136 }
 137
 138 sub syntaxerror_p {
 139     my $this = shift;
 140     return $this->{+SYNTAXERROR_P};
 141 }
 142
 143 sub has_readahead_p {
 144     my $this = shift;
 145     return @{$this->{+READAHEAD}};
 146 }
 147
 148 sub _peek_readahead {
 149     my $this = shift;
 150     return $this->{+READAHEAD}->[$#{$this->{+READAHEAD}}];
 151 }
 152
 153 sub line_number_start {
 154     my $this = shift;
 155     return $this->{+LINENUM_START};
 156 }
 157
 158 sub line_number {
 159     my $this = shift;
 160     return $this->{+LINENUM};
 161 }
 162
 163 sub cdata_mode_p {
 164     my $this = shift;
 165     return $this->{+CDATA_MODE_P};
 166 }
 167
 168 sub pcdata_mode_p {
 169     my $this = shift;
 170     return $this->{+PCDATA_MODE_P};
 171 }
 172
 173 sub js_mode_p {
 174     my $this = shift;
 175     return $this->{+JS_MODE_P};
 176 }
 177
 178 sub cdata_close {
 179     my $this = shift;
 180     return $this->{+CDATA_CLOSE};
 181 }
 182
 183 sub allow_cformat_p {
 184     my $this = shift;
 185     return $this->{+ALLOW_CFORMAT_P};
 186 }
 187
 188 # Simple setters
 189
 190 sub _set_fatal {
 191     my $this = shift;
 192     $this->{+FATAL_P} = $_[0];
 193     return $this;
 194 }
 195
 196 sub _set_syntaxerror {
 197     my $this = shift;
 198     $this->{+SYNTAXERROR_P} = $_[0];
 199     return $this;
 200 }
 201
 202 sub _push_readahead {
 203     my $this = shift;
 204     push @{$this->{+READAHEAD}}, $_[0];
 205     return $this;
 206 }
 207
 208 sub _pop_readahead {
 209     my $this = shift;
 210     return pop @{$this->{+READAHEAD}};
 211 }
 212
 213 sub _append_readahead {
 214     my $this = shift;
 215     $this->{+READAHEAD}->[$#{$this->{+READAHEAD}}] .= $_[0];
 216     return $this;
 217 }
 218
 219 sub _set_readahead {
 220     my $this = shift;
 221     $this->{+READAHEAD}->[$#{$this->{+READAHEAD}}] = $_[0];
 222     return $this;
 223 }
 224
 225 sub _increment_line_number {
 226     my $this = shift;
 227     $this->{+LINENUM} += 1;
 228     return $this;
 229 }
 230
 231 sub _set_line_number_start {
 232     my $this = shift;
 233     $this->{+LINENUM_START} = $_[0];
 234     return $this;
 235 }
 236
 237 sub _set_cdata_mode {
 238     my $this = shift;
 239     $this->{+CDATA_MODE_P} = $_[0];
 240     return $this;
 241 }
 242
 243 sub _set_pcdata_mode {
 244     my $this = shift;
 245     $this->{+PCDATA_MODE_P} = $_[0];
 246     return $this;
 247 }
 248
 249 sub _set_js_mode {
 250     my $this = shift;
 251     $this->{+JS_MODE_P} = $_[0];
 252     return $this;
 253 }
 254
 255 sub _set_cdata_close {
 256     my $this = shift;
 257     $this->{+CDATA_CLOSE} = $_[0];
 258     return $this;
 259 }
 260
 261 sub set_allow_cformat {
 262     my $this = shift;
 263     $this->{+ALLOW_CFORMAT_P} = $_[0];
 264     return $this;
 265 }
 266
 267 ###############################################################################
 268
 269 use vars qw( $js_EscapeSequence );
 270 BEGIN {
 271     # Perl quoting is really screwed up, but this common subexp is way too long
 272     $js_EscapeSequence = q{\\\\(?:['"\\\\bfnrt]|[^0-7xu]|[0-3]?[0-7]{1,2}|x[\da-fA-F]{2}|u[\da-fA-F]{4})};
 273 }
 274 sub parenleft  () { '(' }
 275 sub parenright () { ')' }
 276
 277 sub split_js ($) {
 278     my ($s0) = @_;
 279     my @it = ();
 280     while (length $s0) {
 281         if ($s0 =~ /^\s+/s) {                           # whitespace
 282             push @it, $&;
 283             $s0 = $';
 284         } elsif ($s0 =~ /^\/\/[^\r\n]*(?:[\r\n]|$)/s) { # C++-style comment
 285             push @it, $&;
 286             $s0 = $';
 287         } elsif ($s0 =~ /^\/\*(?:(?!\*\/).)*\*\//s) {   # C-style comment
 288             push @it, $&;
 289             $s0 = $';
 290         # Keyword or identifier, ECMA-262 p.13 (section 7.5)
 291         } elsif ($s0 =~ /^[A-Z_\$][A-Z\d_\$]*/is) {     # IdentifierName
 292             push @it, $&;
 293             $s0 = $';
 294         # Punctuator, ECMA-262 p.13 (section 7.6)
 295         } elsif ($s0 =~ /^(?:[\(\){}\[\];]|>>>=|<<=|>>=|[-\+\*\/\&\|\^\%]=|>>>|<<|>>|--|\+\+|\|\||\&\&|==|<=|>=|!=|[=><,!~\?:\.\-\+\*\/\&\|\^\%])/s) {
 296             push @it, $&;
 297             $s0 = $';
 298         # DecimalLiteral, ECMA-262 p.14 (section 7.7.3); note: bug in the spec
 299         } elsif ($s0 =~ /^(?:0|[1-9]\d+(?:\.\d*(?:[eE][-\+]?\d+)?)?)/s) {
 300             push @it, $&;
 301             $s0 = $';
 302         # HexIntegerLiteral, ECMA-262 p.15 (section 7.7.3)
 303         } elsif ($s0 =~ /^0[xX][\da-fA-F]+/s) {
 304             push @it, $&;
 305             $s0 = $';
 306         # OctalIntegerLiteral, ECMA-262 p.15 (section 7.7.3)
 307         } elsif ($s0 =~ /^0[\da-fA-F]+/s) {
 308             push @it, $&;
 309             $s0 = $';
 310         # StringLiteral, ECMA-262 p.17 (section 7.7.4)
 311         # XXX SourceCharacter doesn't seem to be defined (?)
 312         } elsif ($s0 =~ /^(?:"(?:(?!["\\\r\n]).|$js_EscapeSequence)*"|'(?:(?!['\\\r\n]).|$js_EscapeSequence)*')/os) {
 313             push @it, $&;
 314             $s0 = $';
 315         } elsif ($s0 =~ /^./) {                         # UNKNOWN TOKEN !!!
 316             push @it, $&;
 317             $s0 = $';
 318         }
 319     }
 320     return @it;
 321 }
 322
 323 sub STATE_UNDERSCORE     () { 1 }
 324 sub STATE_PARENLEFT      () { 2 }
 325 sub STATE_STRING_LITERAL () { 3 }
 326
 327 # XXX This is a crazy hack. I don't want to write an ECMAScript parser.
 328 # XXX A scanner is one thing; a parser another thing.
 329 sub identify_js_translatables (@) {
 330     my @input = @_;
 331     my @output = ();
 332     # We mark a JavaScript translatable string as in C, i.e., _("literal")
 333     # For simplicity, we ONLY look for "_" "(" StringLiteral ")"
 334     for (my $i = 0, my $state = 0, my($j, $q, $s); $i <= $#input; $i += 1) {
 335         my $reset_state_p = 0;
 336         push @output, [0, $input[$i]];
 337         if ($input[$i] !~ /\S/s) {
 338             ;
 339         } elsif ($state == 0) {
 340             $state = STATE_UNDERSCORE if $input[$i] eq '_';
 341         } elsif ($state == STATE_UNDERSCORE) {
 342             $state = $input[$i] eq parenleft ? STATE_PARENLEFT : 0;
 343         } elsif ($state == STATE_PARENLEFT) {
 344             if ($input[$i] =~ /^(['"])(.*)\1$/s) {
 345                 ($state, $j, $q, $s) = (STATE_STRING_LITERAL, $#output, $1, $2);
 346             } else {
 347                 $state = 0;
 348             }
 349         } elsif ($state == STATE_STRING_LITERAL) {
 350             if ($input[$i] eq parenright) {
 351                 $output[$j] = [1, $output[$j]->[1], $q, $s];
 352             }
 353             $state = 0;
 354         } else {
 355             die "identify_js_translatables internal error: Unknown state $state"
 356         }
 357     }
 358     return \@output;
 359 }
 360
 361 ###############################################################################
 362
 363 sub _extract_attributes ($;$) {
 364     my $this = shift;
 365     my($s, $lc) = @_;
 366     my %attr;
 367     $s = $1 if $s =~ /^<(?:(?!$re_directive_control)\S)+(.*)\/\S$/s     # XML-style self-closing tags
 368             || $s =~ /^<(?:(?!$re_directive_control)\S)+(.*)\S$/s;      # SGML-style tags
 369
 370     for (my $i = 0; $s =~ /^(?:$re_directive_control)?\s+(?:$re_directive_control)?(?:([a-zA-Z][-a-zA-Z0-9]*)\s*=\s*)?('((?:$re_directive|[^'])*)'|"((?:$re_directive|[^"])*)"|((?:$re_directive|[^\s<>])+))/os;) {
 371         my($key, $val, $val_orig, $rest)
 372                 = ($1, (defined $3? $3: defined $4? $4: $5), $2, $');
 373         $i += 1;
 374         $attr{+lc($key)} = [$key, $val, $val_orig, $i];
 375         $s = $rest;
 376         if ($val =~ /$re_tmpl_include/os) {
 377             warn_normal "TMPL_INCLUDE in attribute: $val_orig\n", $lc;
 378         } elsif ($val =~ /$re_tmpl_var/os && $val !~ /$re_tmpl_var_escaped/os) {
 379             # XXX: we probably should not warn if key is "onclick" etc
 380             # XXX: there's just no reasonable thing to suggest
 381             my $suggest = ($key =~ /^(?:action|archive|background|cite|classid|codebase|data|datasrc|for|href|longdesc|profile|src|usemap)$/i? 'URL': 'HTML');
 382             undef $suggest if $key =~ /^(?:onblur|onchange|onclick|ondblclick|onfocus|onkeydown|onkeypress|onkeyup|onload|onmousedown|onmousemove|onmouseout|onmouseover|onmouseup|onreset|onselect|onsubmit|onunload)$/i;
 383             warn_pedantic
 384                     "Suggest ESCAPE=$suggest for TMPL_VAR in attribute \"$key\""
 385                         . ": $val_orig",
 386                     $lc, \$pedantic_tmpl_var_use_in_nonpedantic_mode_p
 387                 if defined $suggest && (pedantic_p || !$pedantic_tmpl_var_use_in_nonpedantic_mode_p);
 388         } elsif ($val_orig !~ /^['"]/) {
 389             my $t = $val; $t =~ s/$re_directive_control//os;
 390             warn_pedantic
 391                 "Unquoted attribute contains character(s) that should be quoted"
 392                     . ": $val_orig",
 393                 $lc, \$pedantic_attribute_error_in_nonpedantic_mode_p
 394                 if $t =~ /[^-\.A-Za-z0-9]/s;
 395         }
 396     }
 397     my $s2 = $s; $s2 =~ s/$re_tmpl_endif_endloop//g; # for the next check
 398     if ($s2 =~ /\S/s) { # should never happen
 399         if ($s =~ /^([^\n]*)\n/s) { # this is even worse
 400             error_normal("Completely confused while extracting attributes: $1", $lc);
 401             error_normal((scalar(split(/\n/, $s)) - 1) . " more line(s) not shown.", undef);
 402             $this->_set_fatal( 1 );
 403         } else {
 404             # There's something wrong with the attribute syntax.
 405             # We might be able to deduce a likely cause by looking more.
 406             if ($s =~ /^[a-z0-9]/is && "<foo $s>" =~ /^$re_tag_compat$/s) {
 407                 warn_normal "Probably missing whitespace before or missing quotation mark near: $s\n", $lc;
 408             } else {
 409                 warn_normal "Strange attribute syntax: $s\n", $lc;
 410             }
 411         }
 412     }
 413     return \%attr;
 414 }
 415
 416 sub _next_token_internal {
 417     my $this = shift;
 418     my($h) = @_;
 419     my($it, $kind);
 420     my $eof_p = 0;
 421     $this->_pop_readahead if $this->has_readahead_p
 422             && !ref $this->_peek_readahead
 423             && !length $this->_peek_readahead;
 424     if (!$this->has_readahead_p) {
 425         my $next = scalar <$h>;
 426         $eof_p = !defined $next;
 427         if (!$eof_p) {
 428             $this->_increment_line_number;
 429             $this->_push_readahead( $next );
 430         }
 431     }
 432     $this->_set_line_number_start( $this->line_number ); # remember 1st line num
 433     if ($this->has_readahead_p && ref $this->_peek_readahead) { # TmplToken obj.
 434         ($it, $kind) = ($this->_pop_readahead, undef);
 435     } elsif ($eof_p && !$this->has_readahead_p) {       # nothing left to do
 436         ;
 437     } elsif ($this->_peek_readahead =~ /^\s+/s) {       # whitespace
 438         ($kind, $it) = (TmplTokenType::TEXT, $&);
 439         $this->_set_readahead( $' );
 440     # FIXME the following (the [<\s] part) is an unreliable HACK :-(
 441     } elsif ($this->_peek_readahead =~ /^(?:[^<]|<[<\s])*(?:[^<\s])/s) {        # non-space normal text
 442         ($kind, $it) = (TmplTokenType::TEXT, $&);
 443         $this->_set_readahead( $' );
 444         warn_normal "Unescaped < in $it\n", $this->line_number_start
 445                 if !$this->cdata_mode_p && $it =~ /</s;
 446     } else {                            # tag/declaration/processing instruction
 447         my $ok_p = 0;
 448         my $bad_comment_p = 0;
 449         for (my $cdata_close = $this->cdata_close;;) {
 450             if ($this->cdata_mode_p) {
 451                 my $next = $this->_pop_readahead;
 452                 if ($next =~ /^$cdata_close/is) {
 453                     ($kind, $it) = (TmplTokenType::TAG, $&);
 454                     $this->_push_readahead( $' );
 455                     $ok_p = 1;
 456                 } elsif ($next =~ /^((?:(?!$cdata_close).)+)($cdata_close)/is) {
 457                     ($kind, $it) = (TmplTokenType::TEXT, $1);
 458                     $this->_push_readahead( "$2$'" );
 459                     $ok_p = 1;
 460                 } else {
 461                     ($kind, $it) = (TmplTokenType::TEXT, $next);
 462                     $ok_p = 1;
 463                 }
 464             } elsif ($this->_peek_readahead =~ /^$re_tag_compat/os) {
 465                 # If we detect a "closed start tag" but we know that the
 466                 # following token looks like a TMPL_VAR, don't stop
 467                 my($head, $tail, $post) = ($1, $2, $3);
 468                 if ($tail eq '' && $post =~ $re_tmpl_var) {
 469                     # Don't bother to show the warning if we're too confused
 470                     # FIXME. There's no method for _closed_start_tag_warning
 471                     if (!defined $this->{'_closed_start_tag_warning'}
 472                         || ($this->{'_closed_start_tag_warning'}->[0] eq $head
 473                         && $this->{'_closed_start_tag_warning'}->[1] != $this->line_number - 1)) {
 474                     warn_normal "Possible SGML \"closed start tag\" notation: $head<\n", $this->line_number
 475                             if split(/\n/, $head) < 10;
 476                     }
 477                     $this->{'_closed_start_tag_warning'} = [$head, $this->line_number];
 478                 } else {
 479                     ($kind, $it) = (TmplTokenType::TAG, "$head>");
 480                     $this->_set_readahead( $post );
 481                     $ok_p = 1;
 482                     warn_normal "SGML \"closed start tag\" notation: $head<\n", $this->line_number if $tail eq '';
 483                 }
 484             } elsif ($this->_peek_readahead =~ /^<!--(?:(?!-->)$re_directive*.)*-->/os) {
 485                 ($kind, $it) = (TmplTokenType::COMMENT, $&);
 486                 $this->_set_readahead( $' );
 487                 $ok_p = 1;
 488                 $bad_comment_p = 1;
 489             }
 490         last if $ok_p;
 491             my $next = scalar <$h>;
 492             $eof_p = !defined $next;
 493         last if $eof_p;
 494             $this->_increment_line_number;
 495             $this->_append_readahead( $next );
 496         }
 497         if ($kind ne TmplTokenType::TAG) {
 498             ;
 499         } elsif ($it =~ /^<!/) {
 500             $kind = TmplTokenType::DECL;
 501             $kind = TmplTokenType::COMMENT if $it =~ /^<!--(?:(?!-->).)*-->/;
 502             if ($kind == TmplTokenType::COMMENT && $it =~ /^<!--\s*#include/s) {
 503                 warn_normal "Apache #include directive found instead of HTML::Template <TMPL_INCLUDE> directive", $this->line_number_start;
 504             }
 505         } elsif ($it =~ /^<\?/) {
 506             $kind = TmplTokenType::PI;
 507         }
 508         if ($it =~ /^$re_directive/ios && !$this->cdata_mode_p) {
 509             $kind = TmplTokenType::DIRECTIVE;
 510         } elsif ($bad_comment_p) {
 511             warn_normal sprintf("Syntax error in comment: %s\n", $it),
 512                     $this->line_number_start;
 513             $this->_set_syntaxerror( 1 );
 514         }
 515         if (!$ok_p && $eof_p) {
 516             ($kind, $it) = (TmplTokenType::UNKNOWN, $this->_peek_readahead);
 517             $this->_set_readahead, undef;
 518             $this->_set_syntaxerror( 1 );
 519         }
 520     }
 521     warn_normal "Unrecognizable token found: "
 522             . (split(/\n/, $it) < 10? $it: '(too confused to show details)')
 523             . "\n", $this->line_number_start
 524         if $kind == TmplTokenType::UNKNOWN;
 525     return defined $it? (ref $it? $it: TmplToken->new($it, $kind, $this->line_number, $this->filename)): undef;
 526 }
 527
 528 sub _next_token_intermediate {
 529     my $this = shift;
 530     my $h = $this->_handle;
 531     my $it;
 532     if (!$this->cdata_mode_p) {
 533         $it = $this->_next_token_internal($h);
 534         if (defined $it && $it->type == TmplTokenType::TAG) {
 535             if ($it->string =~ /^<(script|style|textarea)\b/is) {
 536                 $this->_set_cdata_mode( 1 );
 537                 $this->_set_cdata_close( "</$1\\s*>" );
 538                 $this->_set_pcdata_mode( 0 );
 539                 $this->_set_js_mode( lc($1) eq 'script' );
 540 #           } elsif ($it->string =~ /^<(title)\b/is) {
 541 #               $this->_set_cdata_mode( 1 );
 542 #               $this->_set_cdata_close( "</$1\\s*>" );
 543 #               $this->_set_pcdata_mode( 1 );
 544             }
 545             $it->set_attributes( $this->_extract_attributes($it->string, $it->line_number) );
 546         }
 547     } else {
 548         my $eof_p = 0;
 549         for ($it = '', my $cdata_close = $this->cdata_close;;) {
 550             my $next = $this->_next_token_internal($h);
 551             $eof_p = !defined $next;
 552         last if $eof_p;
 553             if (defined $next && $next->string =~ /$cdata_close/is) {
 554                 $this->_push_readahead( $next ); # push entire TmplToken object
 555                 $this->_set_cdata_mode( 0 );
 556             }
 557         last unless $this->cdata_mode_p;
 558             $it .= $next->string;
 559         }
 560         if ($eof_p) {
 561             $it = undef;
 562             error_normal "Unexpected end of file while looking for "
 563                     . $this->cdata_close
 564                     . "\n", $this->line_number_start;
 565             $this->_set_fatal( 1 );
 566             $this->_set_syntaxerror( 1 );
 567         }
 568         if ($this->pcdata_mode_p) {
 569             my $check = $it;
 570             $check =~ s/$re_directive//gos;
 571             warn_pedantic "Markup found in PCDATA\n", $this->line_number,
 572                             \$pedantic_error_markup_in_pcdata_p
 573                     if $check =~ /$re_tag_compat/s;
 574         }
 575         # PCDATA should be treated as text, not CDATA
 576         # Actually it should be treated as TEXT_PARAMETRIZED :-(
 577         $it = TmplToken->new( $it,
 578                         ($this->pcdata_mode_p?
 579                             TmplTokenType::TEXT: TmplTokenType::CDATA),
 580                         $this->line_number, $this->filename )
 581                 if defined $it;
 582         if ($this->js_mode_p) {
 583             my $s0 = $it->string;
 584             my @head = ();
 585             my @tail = ();
 586             if ($s0 =~ /^(\s*<!--\s*)(.*)(\s*--\s*>\s*)$/s) {
 587                 push @head, $1;
 588                 push @tail, $3;
 589                 $s0 = $2;
 590             }
 591             push @head, split_js $s0;
 592             $it->set_js_data( identify_js_translatables(@head, @tail) );
 593         }
 594         $this->_set_pcdata_mode, 0;
 595         $this->_set_cdata_close, undef unless !defined $it;
 596     }
 597     return $it;
 598 }
 599
 600 sub _token_groupable1_p ($) { # as first token, groupable into TEXT_PARAMETRIZED
 601     my($t) = @_;
 602     return ($t->type == TmplTokenType::TEXT && $t->string !~ /^[,\.:\|\s]+$/is)
 603         || ($t->type == TmplTokenType::DIRECTIVE
 604                 && $t->string =~ /^(?:$re_tmpl_var)$/os)
 605         || ($t->type == TmplTokenType::TAG
 606                 && ($t->string =~ /^<(?:a|b|em|h[123456]|i|u)\b/is
 607                 || ($t->string =~ /^<input\b/is
 608                     && $t->attributes->{'type'}->[1] =~ /^(?:radio|text)$/is)
 609                     ))
 610 }
 611
 612 sub _token_groupable2_p ($) { # as other token, groupable into TEXT_PARAMETRIZED
 613     my($t) = @_;
 614     return ($t->type == TmplTokenType::TEXT && ($t->string =~ /^\s*$/s || $t->string !~ /^[\|\s]+$/is))
 615         || ($t->type == TmplTokenType::DIRECTIVE
 616                 && $t->string =~ /^(?:$re_tmpl_var)$/os)
 617         || ($t->type == TmplTokenType::TAG
 618                 && ($t->string =~ /^<\/?(?:a|b|em|h[123456]|i|u)\b/is
 619                 || ($t->string =~ /^<input\b/is
 620                     && $t->attributes->{'type'}->[1] =~ /^(?:radio|text)$/is)))
 621 }
 622
 623 sub _quote_cformat ($) {
 624     my($s) = @_;
 625     $s =~ s/%/%%/g;
 626     return $s;
 627 }
 628
 629 sub string_canon ($) {
 630     my($s) = @_;
 631     if (1) { # FIXME
 632         # Fold all whitespace into single blanks
 633         $s =~ s/\s+/ /gs;
 634     }
 635     return $s;
 636 }
 637
 638 sub _formalize_string_cformat ($) {
 639     my($s) = @_;
 640     return _quote_cformat string_canon $s;
 641 }
 642
 643 sub _formalize ($) {
 644     my($t) = @_;
 645     return $t->type == TmplTokenType::DIRECTIVE? '%s':
 646            $t->type == TmplTokenType::TEXT?
 647                    _formalize_string_cformat($t->string):
 648            $t->type == TmplTokenType::TAG?
 649                    ($t->string =~ /^<a\b/is? '<a>':
 650                     $t->string =~ /^<input\b/is? (
 651                             lc $t->attributes->{'type'}->[1] eq 'text' ? '%S':
 652                             '%p'):
 653                     _quote_cformat($t->string)):
 654                _quote_cformat($t->string);
 655 }
 656
 657 sub _optimize {
 658     my $this = shift;
 659     my @structure = @_;
 660     my $undo_trailing_blanks = sub {
 661                 for (my $i = $#structure; $i >= 0; $i -= 1) {
 662                 last unless ($structure[$i]->type == TmplTokenType::TEXT && blank_p($structure[$i]->string)) ;#|| ($structure[$i]->type == TmplTokenType::TAG && $structure[$i]->string =~ /^<br\b/is);
 663                     # Queue element structure: [reanalysis-p, token]
 664                     push @{$this->{_queue}}, [1, pop @structure];
 665                 }
 666             };
 667     &$undo_trailing_blanks;
 668     while (@structure >= 2) {
 669         my $something_done_p = 0;
 670         # FIXME: If the last token is a close tag but there are no tags
 671         # FIXME: before it, drop the close tag back into the queue. This
 672         # FIXME: is an ugly hack to get rid of "foo %s</h1>" type mess.
 673         if (@structure >= 2
 674                 && $structure[$#structure]->type == TmplTokenType::TAG
 675                 && $structure[$#structure]->string =~ /^<\//s) {
 676             my $has_other_tags_p = 0;
 677             for (my $i = 0; $i < $#structure; $i += 1) {
 678                 $has_other_tags_p = 1
 679                         if $structure[$i]->type == TmplTokenType::TAG;
 680             last if $has_other_tags_p;
 681             }
 682             if (!$has_other_tags_p) {
 683                 push @{$this->{_queue}}, [0, pop @structure]
 684                 &$undo_trailing_blanks;
 685                 $something_done_p = 1;
 686             }
 687         }
 688         # FIXME: Do the same ugly hack for the last token being a ( or [
 689         if (@structure >= 2
 690                 && $structure[$#structure]->type == TmplTokenType::TEXT
 691                 && $structure[$#structure]->string =~ /^[\(\[]$/) { # not )]
 692             push @{$this->{_queue}}, [1, pop @structure];
 693             &$undo_trailing_blanks;
 694             $something_done_p = 1;
 695         }
 696         # FIXME: If the first token is an open tag, but there is no
 697         # FIXME: corresponding close tag, "drop the open tag", i.e.,
 698         # FIXME: requeue everything for reanalysis, except the frist tag. :-(
 699         if (@structure >= 2
 700                 && $structure[0]->type == TmplTokenType::TAG
 701                 && $structure[0]->string =~ /^<([a-z0-9]+)/is
 702                 && (my $tag = $1) !~ /^(?:br|hr|img|input)\b/is
 703         ) {
 704             my $tag_open_count = 1;
 705             for (my $i = 1; $i <= $#structure; $i += 1) {
 706                 if ($structure[$i]->type == TmplTokenType::TAG) {
 707                     if ($structure[$i]->string =~ /^<(\/?)$tag\b/is) {
 708                         $tag_open_count += ($1? -1: +1);
 709                     }
 710                 }
 711             }
 712             if ($tag_open_count > 0) {
 713                 for (my $i = $#structure; $i; $i -= 1) {
 714                     push @{$this->{_queue}}, [1, pop @structure];
 715                 }
 716                 $something_done_p = 1;
 717             }
 718         }
 719         # FIXME: If the first token is an open tag, the last token is the
 720         # FIXME: corresponding close tag, and there are no other close tags
 721         # FIXME: inbetween, requeue the tokens from the second token on,
 722         # FIXME: flagged as ok for re-analysis
 723         if (@structure >= 3
 724                 && $structure[0]->type == TmplTokenType::TAG
 725                 && $structure[0]->string =~ /^<([a-z0-9]+)/is && (my $tag = $1)
 726                 && $structure[$#structure]->type == TmplTokenType::TAG
 727                 && $structure[$#structure]->string =~ /^<\/$1\s*>$/is) {
 728             my $has_other_open_or_close_tags_p = 0;
 729             for (my $i = 1; $i < $#structure; $i += 1) {
 730                 $has_other_open_or_close_tags_p = 1
 731                         if $structure[$i]->type == TmplTokenType::TAG
 732                         && $structure[$i]->string =~ /^<\/?$tag\b/is;
 733             last if $has_other_open_or_close_tags_p;
 734             }
 735             if (!$has_other_open_or_close_tags_p) {
 736                 for (my $i = $#structure; $i; $i -= 1) {
 737                     push @{$this->{_queue}}, [1, pop @structure];
 738                 }
 739                 $something_done_p = 1;
 740             }
 741         }
 742     last if !$something_done_p;
 743     }
 744     return @structure;
 745 }
 746
 747 sub looks_plausibly_like_groupable_text_p (@) {
 748     my @structure = @_;
 749     # The text would look plausibly groupable if all open tags are also closed.
 750     my @tags = ();
 751     my $error_p = 0;
 752     for (my $i = 0; $i <= $#structure; $i += 1) {
 753         if ($structure[$i]->type == TmplTokenType::TAG) {
 754             my $form = $structure[$i]->string;
 755             if ($form =~ /^<([A-Z0-9]+)/is) {
 756                 my $tag = lc($1);
 757                 if ($tag !~ /^(?:br|input)$/is && $form !~ /\/>$/is) {
 758                     push @tags, $tag;
 759                 }
 760             } elsif ($form =~ /^<\/([A-Z0-9]+)/is) {
 761                 if (@tags && lc($1) eq $tags[$#tags]) {
 762                     pop @tags;
 763                 } else {
 764                     $error_p = 1;
 765                 }
 766             }
 767         } elsif ($structure[$i]->type != TmplTokenType::TEXT) {
 768             $error_p = 1;
 769         }
 770     last if $error_p;
 771     }
 772     return !$error_p && !@tags;
 773 }
 774
 775 sub next_token {
 776     my $this = shift;
 777     my $h = $this->_handle;
 778     my $it;
 779     $this->{_queue} = [] unless defined $this->{_queue};
 780
 781     # Elements in the queue are ordered pairs. The first in the ordered pair
 782     # specifies whether we are allowed to reanalysis; the second is the token.
 783     if (@{$this->{_queue}} && !$this->{_queue}->[$#{$this->{_queue}}]->[0]) {
 784         $it = (pop @{$this->{_queue}})->[1];
 785     } else {
 786         if (@{$this->{_queue}}) {
 787             $it = (pop @{$this->{_queue}})->[1];
 788         } else {
 789             $it = $this->_next_token_intermediate($h);
 790         }
 791         if (!$this->cdata_mode_p && $this->allow_cformat_p && defined $it
 792             && ($it->type == TmplTokenType::TEXT?
 793                 !blank_p( $it->string ): _token_groupable1_p( $it ))) {
 794             my @structure = ( $it );
 795             my @tags = ();
 796             my $next = undef;
 797             my($nonblank_text_p, $parametrized_p, $with_anchor_p, $with_input_p) = (0, 0, 0, 0);
 798             if ($it->type == TmplTokenType::TEXT) {
 799                 $nonblank_text_p = 1 if !blank_p( $it->string );
 800             } elsif ($it->type == TmplTokenType::DIRECTIVE) {
 801                 $parametrized_p = 1;
 802             } elsif ($it->type == TmplTokenType::TAG && $it->string =~ /^<([A-Z0-9]+)/is) {
 803                 my $tag = lc($1);
 804                 push @tags, $tag if $tag !~ /^(?:br|input)$/i;
 805                 $with_anchor_p = 1 if $tag eq 'a';
 806                 $with_input_p = 1 if $tag eq 'input';
 807             }
 808             # We hate | and || in msgid strings, so we try to avoid them
 809             for (my $i = 1, my $quit_p = 0, my $quit_next_p = ($it->type == TmplTokenType::TEXT && $it->string =~ /^\|+$/s);; $i += 1) {
 810                 if (@{$this->{_queue}}) {
 811                     $next = (pop @{$this->{_queue}})->[1];
 812                 } else {
 813                     $next = $this->_next_token_intermediate($h);
 814                 }
 815                 push @structure, $next; # for consistency (with initialization)
 816             last unless defined $next && _token_groupable2_p( $next );
 817             last if $quit_next_p;
 818                 if ($next->type == TmplTokenType::TEXT) {
 819                     $nonblank_text_p = 1 if !blank_p( $next->string );
 820                     $quit_p = 1 if $next->string =~ /^\|+$/s; # We hate | and ||
 821                 } elsif ($next->type == TmplTokenType::DIRECTIVE) {
 822                     $parametrized_p = 1;
 823                 } elsif ($next->type == TmplTokenType::TAG) {
 824                     if ($next->string =~ /^<([A-Z0-9]+)/is) {
 825                         my $tag = lc($1);
 826                         push @tags, $tag if $tag !~ /^(?:br|input)$/i;
 827                         $with_anchor_p = 1 if $tag eq 'a';
 828                         $with_input_p = 1 if $tag eq 'input';
 829                     } elsif ($next->string =~ /^<\/([A-Z0-9]+)/is) {
 830                         my $close = lc($1);
 831                         $quit_p = 1 unless @tags && $close eq $tags[$#tags];
 832                         $quit_next_p = 1 if $close =~ /^h\d$/;
 833                         pop @tags;
 834                     }
 835                 }
 836             last if $quit_p;
 837             }
 838             # Undo the last token, allowing reanalysis
 839             push @{$this->{_queue}}, [1, pop @structure];
 840             # Simply it a bit more
 841             @structure = $this->_optimize( @structure );
 842             if (@structure < 2) {
 843                 # Nothing to do
 844                 ;
 845             } elsif ($nonblank_text_p && ($parametrized_p || $with_anchor_p || $with_input_p)) {
 846                 # Create the corresponding c-format string
 847                 my $string = join('', map { $_->string } @structure);
 848                 my $form = join('', map { _formalize $_ } @structure);
 849                 my($a_counter, $input_counter) = (0, 0);
 850                 $form =~ s/<a>/ $a_counter += 1, "<a$a_counter>" /egs;
 851                 $form =~ s/<input>/ $input_counter += 1, "<input$input_counter>" /egs;
 852                 $it = TmplToken->new($string, TmplTokenType::TEXT_PARAMETRIZED,
 853                         $it->line_number, $it->pathname);
 854                 $it->set_form( $form );
 855                 $it->set_children( @structure );
 856             } elsif ($nonblank_text_p
 857                     && looks_plausibly_like_groupable_text_p( @structure )
 858                     && $structure[$#structure]->type == TmplTokenType::TEXT) {
 859                 # Combine the strings
 860                 my $string = join('', map { $_->string } @structure);
 861                 $it = TmplToken->new($string, TmplTokenType::TEXT,
 862                         $it->line_number, $it->pathname);;
 863             } else {
 864                 # Requeue the tokens thus seen for re-emitting, allow reanalysis
 865                 for (;;) {
 866                     push @{$this->{_queue}}, [1, pop @structure];
 867                 last if !@structure;
 868                 }
 869                 $it = (pop @{$this->{_queue}})->[1];
 870             }
 871         }
 872     }
 873     if (defined $it && $it->type == TmplTokenType::TEXT) {
 874         my $form = string_canon $it->string;
 875         $it->set_form( $form );
 876     }
 877     return $it;
 878 }
 879
 880 ###############################################################################
 881
 882 # Other simple functions (These are not methods)
 883
 884 sub blank_p ($) {
 885     my($s) = @_;
 886     return $s =~ /^(?:\s|\&nbsp$re_end_entity|$re_tmpl_var)*$/os;
 887 }
 888
 889 sub trim ($) {
 890     my($s0) = @_;
 891     my $l0 = length $s0;
 892     my $s = $s0;
 893     $s =~ s/^(\s|\&nbsp$re_end_entity)+//os; my $l1 = $l0 - length $s;
 894     $s =~ s/(\s|\&nbsp$re_end_entity)+$//os; my $l2 = $l0 - $l1 - length $s;
 895     return wantarray? (substr($s0, 0, $l1), $s, substr($s0, $l0 - $l2)): $s;
 896 }
 897
 898 sub quote_po ($) {
 899     my($s) = @_;
 900     # Locale::PO->quote is buggy, it doesn't quote newlines :-/
 901     $s =~ s/([\\"])/\\\1/gs;
 902     $s =~ s/\n/\\n/g;
 903     #$s =~ s/[\177-\377]/ sprintf("\\%03o", ord($&)) /egs;
 904     return "\"$s\"";
 905 }
 906
 907 # Some functions that shouldn't be here... should be moved out some time
 908 sub parametrize ($$$$) {
 909     my($fmt_0, $cformat_p, $t, $f) = @_;
 910     my $it = '';
 911     if ($cformat_p) {
 912         my @params = $t->parameters_and_fields;
 913         for (my $n = 0, my $fmt = $fmt_0; length $fmt;) {
 914             if ($fmt =~ /^[^%]+/) {
 915                 $fmt = $';
 916                 $it .= $&;
 917             } elsif ($fmt =~ /^%%/) {
 918                 $fmt = $';
 919                 $it .= '%';
 920             } elsif ($fmt =~ /^%(?:(\d+)\$)?(?:(\d+)(?:\.(\d+))?)?s/s) {
 921                 $n += 1;
 922                 my($i, $width, $prec) = ((defined $1? $1: $n), $2, $3);
 923                 $fmt = $';
 924                 if (defined $width && defined $prec && !$width && !$prec) {
 925                     ;
 926                 } elsif (defined $params[$i - 1]) {
 927                     my $param = $params[$i - 1];
 928                     warn_normal "$fmt_0: $&: Expected a TMPL_VAR, but found a "
 929                             . $param->type->to_string . "\n", undef
 930                             if $param->type != TmplTokenType::DIRECTIVE;
 931                     warn_normal "$fmt_0: $&: Unsupported "
 932                                 . "field width or precision\n", undef
 933                             if defined $width || defined $prec;
 934                     warn_normal "$fmt_0: $&: Parameter $i not known", undef
 935                             unless defined $param;
 936                     $it .= defined $f? &$f( $param ): $param->string;
 937                 }
 938             } elsif ($fmt =~ /^%(?:(\d+)\$)?(?:(\d+)(?:\.(\d+))?)?([pS])/s) {
 939                 $n += 1;
 940                 my($i, $width, $prec, $conv) = ((defined $1? $1: $n), $2, $3, $4);
 941                 $fmt = $';
 942
 943                 my $param = $params[$i - 1];
 944                 if (!defined $param) {
 945                     warn_normal "$fmt_0: $&: Parameter $i not known", undef;
 946                 } else {
 947                     if ($param->type == TmplTokenType::TAG
 948                             && $param->string =~ /^<input\b/is) {
 949                         my $type = defined $param->attributes?
 950                                 lc($param->attributes->{'type'}->[1]): undef;
 951                         if ($conv eq 'S') {
 952                             warn_normal "$fmt_0: $&: Expected type=text, "
 953                                         . "but found type=$type", undef
 954                                     unless $type eq 'text';
 955                         } elsif ($conv eq 'p') {
 956                             warn_normal "$fmt_0: $&: Expected type=radio, "
 957                                         . "but found type=$type", undef
 958                                     unless $type eq 'radio';
 959                         }
 960                     } else {
 961                         warn_normal "$&: Expected an INPUT, but found a "
 962                                 . $param->type->to_string . "\n", undef
 963                     }
 964                     warn_normal "$fmt_0: $&: Unsupported "
 965                                 . "field width or precision\n", undef
 966                             if defined $width || defined $prec;
 967                     $it .= defined $f? &$f( $param ): $param->string;
 968                 }
 969             } elsif ($fmt =~ /^%[^%a-zA-Z]*[a-zA-Z]/) {
 970                 $fmt = $';
 971                 $it .= $&;
 972                 die "$&: Unknown or unsupported format specification\n"; #XXX
 973             } else {
 974                 die "$&: Completely confused parametrizing\n";#XXX
 975             }
 976         }
 977     }
 978     my @anchors = $t->anchors;
 979     for (my $n = 0, my $fmt = $it, $it = ''; length $fmt;) {
 980         if ($fmt =~ /^(?:(?!<a\d+>).)+/is) {
 981             $fmt = $';
 982             $it .= $&;
 983         } elsif ($fmt =~ /^<a(\d+)>/is) {
 984             $n += 1;
 985             my $i  = $1;
 986             $fmt = $';
 987             my $anchor = $anchors[$i - 1];
 988             warn_normal "$&: Anchor $1 not found for msgid \"$fmt_0\"", undef #FIXME
 989                     unless defined $anchor;
 990             $it .= $anchor->string;
 991         } else {
 992             die "Completely confused decoding anchors: $fmt\n";#XXX
 993         }
 994     }
 995     return $it;
 996 }
 997
 998 sub charset_canon ($) {
 999     my($charset) = @_;
1000     $charset = uc($charset);
1001     $charset = "$1-$2" if $charset =~ /^(ISO|UTF)(\d.*)/i;
1002     $charset = 'Big5' if $charset eq 'BIG5'; # "Big5" must be in mixed case
1003     return $charset;
1004 }
1005
1006 use vars qw( @latin1_utf8 );
1007 @latin1_utf8 = (
1008     "\302\200", "\302\201", "\302\202", "\302\203", "\302\204", "\302\205",
1009     "\302\206", "\302\207", "\302\210", "\302\211", "\302\212", "\302\213",
1010     "\302\214", "\302\215",   undef,      undef,    "\302\220", "\302\221",
1011     "\302\222", "\302\223", "\302\224", "\302\225", "\302\226", "\302\227",
1012     "\302\230", "\302\231", "\302\232", "\302\233", "\302\234", "\302\235",
1013     "\302\236", "\302\237", "\302\240", "\302\241", "\302\242", "\302\243",
1014     "\302\244", "\302\245", "\302\246", "\302\247", "\302\250", "\302\251",
1015     "\302\252", "\302\253", "\302\254", "\302\255", "\302\256", "\302\257",
1016     "\302\260", "\302\261", "\302\262", "\302\263", "\302\264", "\302\265",
1017     "\302\266", "\302\267", "\302\270", "\302\271", "\302\272", "\302\273",
1018     "\302\274", "\302\275", "\302\276", "\302\277", "\303\200", "\303\201",
1019     "\303\202", "\303\203", "\303\204", "\303\205", "\303\206", "\303\207",
1020     "\303\210", "\303\211", "\303\212", "\303\213", "\303\214", "\303\215",
1021     "\303\216", "\303\217", "\303\220", "\303\221", "\303\222", "\303\223",
1022     "\303\224", "\303\225", "\303\226", "\303\227", "\303\230", "\303\231",
1023     "\303\232", "\303\233", "\303\234", "\303\235", "\303\236", "\303\237",
1024     "\303\240", "\303\241", "\303\242", "\303\243", "\303\244", "\303\245",
1025     "\303\246", "\303\247", "\303\250", "\303\251", "\303\252", "\303\253",
1026     "\303\254", "\303\255", "\303\256", "\303\257", "\303\260", "\303\261",
1027     "\303\262", "\303\263", "\303\264", "\303\265", "\303\266", "\303\267",
1028     "\303\270", "\303\271", "\303\272", "\303\273", "\303\274", "\303\275",
1029     "\303\276", "\303\277" );
1030
1031 sub charset_convert ($$$) {
1032     my($s, $charset_in, $charset_out) = @_;
1033     if ($s !~ /[\200-\377]/s) { # FIXME: don't worry about iso2022 for now
1034         ;
1035     } elsif ($charset_in eq 'ISO-8859-1' && $charset_out eq 'UTF-8') {
1036         $s =~ s/[\200-\377]/ $latin1_utf8[ord($&) - 128] /egs;
1037     } elsif ($charset_in ne $charset_out) {
1038         VerboseWarnings::warn_normal "conversion from $charset_in to $charset_out is not supported\n", undef;
1039     }
1040     return $s;
1041 }
1042
1043 ###############################################################################
1044
1045 =pod
1046
1047 In addition to the basic scanning, this class will also perform
1048 the following:
1049
1050 =over
1051
1052 =item -
1053
1054 Emulation of c-format strings (see below)
1055
1056 =item -
1057
1058 Display of warnings for certain things that affects either the
1059 ability of this class to yield correct output, or things that
1060 are known to cause the original template to cause trouble.
1061
1062 =item -
1063
1064 Automatic correction of some of the things warned about
1065 (e.g., SGML "closed start tag" notation).
1066
1067 =back
1068
1069 =head2 c-format strings emulation
1070
1071 Because English word order is not universal, a simple extraction
1072 of translatable strings may yield some strings like "Accounts for"
1073 or ambiguous strings like "in". This makes the resulting strings
1074 difficult to translate, but does not affect all languages alike.
1075 For example, Chinese (with a somewhat different word order) would
1076 be hit harder, but French would be relatively unaffected.
1077
1078 To overcome this problem, the scanner can be configured to detect
1079 patterns with <TMPL_VAR> directives (as well as certain HTML tags),
1080 and try to construct a larger pattern that will appear in the PO
1081 file as c-format strings with %s placeholders. This additional
1082 step allows the translator to deal with cases where word order
1083 is different (replacing %s with %1$s, %2$s, etc.), or when certain
1084 words will require certain inflectional suffixes in sentences.
1085
1086 Because this is an incompatible change, this mode must be explicitly
1087 turned on using the set_cformat(1) method call.
1088
1089 =head2 The flag characters
1090
1091 The character % is followed by zero or more of the following flags:
1092
1093 =over
1094
1095 =item #
1096
1097 The value comes from HTML <INPUT> elements.
1098 This abuse of the flag character is somewhat reasonable,
1099 since TMPL_VAR and INPUT are both variables, but of different kinds.
1100
1101 =back
1102
1103 =head2 The field width and precision
1104
1105 An optional 0.0 can be specified for %s to specify
1106 that the <TMPL_VAR> should be suppressed.
1107
1108 =head2 The conversion specifier
1109
1110 =over
1111
1112 =item p
1113
1114 Specifies any input field that is neither text nor hidden
1115 (which currently mean radio buttons).
1116 The p conversion specifier is chosen because this does not
1117 evoke any certain sensible data type.
1118
1119 =item S
1120
1121 Specifies a text input field (<INPUT TYPE=TEXT>).
1122 This use of the S conversion specifier is somewhat reasonable,
1123 since text input fields contain values of undeterminable type,
1124 which can be treated as strings.
1125
1126 =item s
1127
1128 Specifies a <TMPL_VAR>.
1129 This use of the o conversion specifier is somewhat reasonable,
1130 since <TMPL_VAR> denotes values of undeterminable type, which
1131 can be treated as strings.
1132
1133 =back
1134
1135 =head1 BUGS
1136
1137 There is no code to save the tag name anywhere in the scanned token.
1138
1139 The use of <AI<i>> to stand for the I<i>th anchor
1140 is not very well thought out.
1141 Some abuse of c-format specifies might have been more appropriate.
1142
1143 =head1 HISTORY
1144
1145 This tokenizer is mostly based
1146 on Ambrose's hideous Perl script known as subst.pl.
1147
1148 =cut
1149
1150 1;