6 use VerboseWarnings qw( pedantic_p error_normal warn_normal warn_pedantic );
9 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
11 ###############################################################################
15 TmplTokenizer.pm - Simple-minded tokenizer class for HTML::Template .tmpl files
19 Because .tmpl files contains HTML::Template directives
20 that tend to confuse real parsers (e.g., HTML::Parse),
21 it might be better to create a customized scanner
22 to scan the template files for tokens.
23 This module is a simple-minded attempt at such a scanner.
27 ###############################################################################
34 use vars qw( $pedantic_attribute_error_in_nonpedantic_mode_p );
35 use vars qw( $pedantic_tmpl_var_use_in_nonpedantic_mode_p );
36 use vars qw( $pedantic_error_markup_in_pcdata_p );
38 ###############################################################################
41 use vars qw( $re_directive $re_tmpl_var $re_tmpl_var_escaped $re_tmpl_include );
42 use vars qw( $re_directive_control $re_tmpl_endif_endloop $re_xsl);
44 # $re_directive must not do any backreferences
45 $re_directive = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:VAR|LOOP|INCLUDE|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
46 # TMPL_VAR or TMPL_INCLUDE
47 $re_tmpl_var = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
48 $re_tmpl_include = q{<(?:(?i)(?:!--\s*)?TMPL_(?:INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
49 # TMPL_VAR ESCAPE=1/HTML/URL
50 $re_xsl = q{<\/?(?:xsl:)(?:[\s\-a-zA-Z0-9"'\/\.\[\]\@\(\):=,$]+)\/?>};
51 $re_tmpl_var_escaped = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR|INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))\s+ESCAPE=(?:1|HTML|URL)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
52 # Any control flow directive
53 $re_directive_control = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:LOOP|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
54 # /LOOP or /IF or /UNLESS
55 $re_tmpl_endif_endloop = q{<(?:(?i)(?:!--\s*)?\/TMPL_(?:LOOP|IF|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
58 # Hideous stuff from subst.pl, slightly modified to use the above hideous stuff
59 # Note: The $re_tag's set $1 (<tag), $2 (>), and $3 (rest of string)
60 use vars qw( $re_comment $re_entity_name $re_end_entity $re_etag );
61 use vars qw( $re_tag_strict $re_tag_compat @re_tag );
64 my $etag = $compat? '>': '<>\/';
65 # This is no longer similar to the original regexp in subst.pl :-(
66 # Note that we don't want <> in compat mode; Mozilla knows about <
67 q{(<\/?(?:|(?:"(?:} . $re_directive . q{|[^"])*"|'(?:} . $re_directive . q{|[^'])*'|--(?:(?!--)(?:$re_directive)*.)*--|(?:}
69 . q{|(?!--)[^"'<>} . $etag . q{]))+))([} . $etag . q{]|(?=<))(.*)};
72 $re_comment = '(?:--(?:[^-]|-[^-])*--)';
73 $re_entity_name = '(?:[^&%#;<>\s]+)'; # NOTE: not really correct SGML
74 $re_end_entity = '(?:;|$|(?=\s))'; # semicolon or before-whitespace
75 $re_etag = q{(?:<\/?(?:"[^"]*"|'[^']*'|[^"'>\/])*[>\/])}; # end-tag
76 @re_tag = ($re_tag_strict, $re_tag_compat) = (re_tag(0), re_tag(1));
79 # End of the hideous stuff
81 use vars qw( $serial );
83 ###############################################################################
85 sub FATAL_P () {'fatal-p'}
86 sub SYNTAXERROR_P () {'syntaxerror-p'}
88 sub FILENAME () {'input'}
89 sub HANDLE () {'handle'}
91 sub READAHEAD () {'readahead'}
92 sub LINENUM_START () {'lc_0'}
94 sub CDATA_MODE_P () {'cdata-mode-p'}
95 sub CDATA_CLOSE () {'cdata-close'}
96 sub PCDATA_MODE_P () {'pcdata-mode-p'} # additional submode for CDATA
97 sub JS_MODE_P () {'js-mode-p'} # cdata-mode-p must also be true
99 sub ALLOW_CFORMAT_P () {'allow-cformat-p'}
104 open my $handle,$filename or die "can't open $filename";
106 filename => $filename
112 ###############################################################################
118 return $this->{filename};
123 return $this->{handle};
128 return $this->{+FATAL_P};
133 return $this->{+SYNTAXERROR_P};
136 sub has_readahead_p {
138 return @{$this->{readahead}};
141 sub _peek_readahead {
143 return $this->{readahead}->[$#{$this->{readahead}}];
146 sub line_number_start {
148 return $this->{+LINENUM_START};
153 return $this->{+LINENUM};
158 return $this->{+CDATA_MODE_P};
163 return $this->{+PCDATA_MODE_P};
168 return $this->{+JS_MODE_P};
173 return $this->{+CDATA_CLOSE};
176 sub allow_cformat_p {
178 return $this->{+ALLOW_CFORMAT_P};
185 $this->{+FATAL_P} = $_[0];
189 sub _set_syntaxerror {
191 $this->{+SYNTAXERROR_P} = $_[0];
195 sub _push_readahead {
197 push @{$this->{readahead}}, $_[0];
203 return pop @{$this->{readahead}};
206 sub _append_readahead {
208 $this->{readahead}->[$#{$this->{readahead}}] .= $_[0];
214 $this->{readahead}->[$#{$this->{readahead}}] = $_[0];
218 sub _increment_line_number {
220 $this->{+LINENUM} += 1;
224 sub _set_line_number_start {
226 $this->{+LINENUM_START} = $_[0];
230 sub _set_cdata_mode {
232 $this->{+CDATA_MODE_P} = $_[0];
236 sub _set_pcdata_mode {
238 $this->{+PCDATA_MODE_P} = $_[0];
244 $this->{+JS_MODE_P} = $_[0];
248 sub _set_cdata_close {
250 $this->{+CDATA_CLOSE} = $_[0];
254 sub set_allow_cformat {
256 $this->{+ALLOW_CFORMAT_P} = $_[0];
260 ###############################################################################
262 use vars qw( $js_EscapeSequence );
264 # Perl quoting is really screwed up, but this common subexp is way too long
265 $js_EscapeSequence = q{\\\\(?:['"\\\\bfnrt]|[^0-7xu]|[0-3]?[0-7]{1,2}|x[\da-fA-F]{2}|u[\da-fA-F]{4})};
267 sub parenleft () { '(' }
268 sub parenright () { ')' }
274 if ($s0 =~ /^\s+/s) { # whitespace
277 } elsif ($s0 =~ /^\/\/[^\r\n]*(?:[\r\n]|$)/s) { # C++-style comment
280 } elsif ($s0 =~ /^\/\*(?:(?!\*\/).)*\*\//s) { # C-style comment
283 # Keyword or identifier, ECMA-262 p.13 (section 7.5)
284 } elsif ($s0 =~ /^[A-Z_\$][A-Z\d_\$]*/is) { # IdentifierName
287 # Punctuator, ECMA-262 p.13 (section 7.6)
288 } elsif ($s0 =~ /^(?:[\(\){}\[\];]|>>>=|<<=|>>=|[-\+\*\/\&\|\^\%]=|>>>|<<|>>|--|\+\+|\|\||\&\&|==|<=|>=|!=|[=><,!~\?:\.\-\+\*\/\&\|\^\%])/s) {
291 # DecimalLiteral, ECMA-262 p.14 (section 7.7.3); note: bug in the spec
292 } elsif ($s0 =~ /^(?:0|[1-9]\d+(?:\.\d*(?:[eE][-\+]?\d+)?)?)/s) {
295 # HexIntegerLiteral, ECMA-262 p.15 (section 7.7.3)
296 } elsif ($s0 =~ /^0[xX][\da-fA-F]+/s) {
299 # OctalIntegerLiteral, ECMA-262 p.15 (section 7.7.3)
300 } elsif ($s0 =~ /^0[\da-fA-F]+/s) {
303 # StringLiteral, ECMA-262 p.17 (section 7.7.4)
304 # XXX SourceCharacter doesn't seem to be defined (?)
305 } elsif ($s0 =~ /^(?:"(?:(?!["\\\r\n]).|$js_EscapeSequence)*"|'(?:(?!['\\\r\n]).|$js_EscapeSequence)*')/os) {
308 } elsif ($s0 =~ /^./) { # UNKNOWN TOKEN !!!
316 sub STATE_UNDERSCORE () { 1 }
317 sub STATE_PARENLEFT () { 2 }
318 sub STATE_STRING_LITERAL () { 3 }
320 # XXX This is a crazy hack. I don't want to write an ECMAScript parser.
321 # XXX A scanner is one thing; a parser another thing.
322 sub identify_js_translatables (@) {
325 # We mark a JavaScript translatable string as in C, i.e., _("literal")
326 # For simplicity, we ONLY look for "_" "(" StringLiteral ")"
327 for (my $i = 0, my $state = 0, my($j, $q, $s); $i <= $#input; $i += 1) {
328 my $reset_state_p = 0;
329 push @output, [0, $input[$i]];
330 if ($input[$i] !~ /\S/s) {
332 } elsif ($state == 0) {
333 $state = STATE_UNDERSCORE if $input[$i] eq '_';
334 } elsif ($state == STATE_UNDERSCORE) {
335 $state = $input[$i] eq parenleft ? STATE_PARENLEFT : 0;
336 } elsif ($state == STATE_PARENLEFT) {
337 if ($input[$i] =~ /^(['"])(.*)\1$/s) {
338 ($state, $j, $q, $s) = (STATE_STRING_LITERAL, $#output, $1, $2);
342 } elsif ($state == STATE_STRING_LITERAL) {
343 if ($input[$i] eq parenright) {
344 $output[$j] = [1, $output[$j]->[1], $q, $s];
348 die "identify_js_translatables internal error: Unknown state $state"
354 ###############################################################################
356 sub _extract_attributes ($;$) {
360 $s = $1 if $s =~ /^<(?:(?!$re_directive_control)\S)+(.*)\/\S$/s # XML-style self-closing tags
361 || $s =~ /^<(?:(?!$re_directive_control)\S)+(.*)\S$/s; # SGML-style tags
363 for (my $i = 0; $s =~ /^(?:$re_directive_control)?\s+(?:$re_directive_control)?(?:([a-zA-Z][-a-zA-Z0-9]*)\s*=\s*)?('((?:$re_directive|[^'])*)'|"((?:$re_directive|[^"])*)"|((?:$re_directive|[^\s<>])+))/os;) {
364 my($key, $val, $val_orig, $rest)
365 = ($1, (defined $3? $3: defined $4? $4: $5), $2, $');
367 $attr{+lc($key)} = [$key, $val, $val_orig, $i];
369 if ($val =~ /$re_tmpl_include/os) {
370 warn_normal "TMPL_INCLUDE in attribute: $val_orig\n", $lc;
371 } elsif ($val =~ /$re_tmpl_var/os && $val !~ /$re_tmpl_var_escaped/os) {
372 # XXX: we probably should not warn if key is "onclick" etc
373 # XXX: there's just no reasonable thing to suggest
374 my $suggest = ($key =~ /^(?:action|archive|background|cite|classid|codebase|data|datasrc|for|href|longdesc|profile|src|usemap)$/i? 'URL': 'HTML');
375 undef $suggest if $key =~ /^(?:onblur|onchange|onclick|ondblclick|onfocus|onkeydown|onkeypress|onkeyup|onload|onmousedown|onmousemove|onmouseout|onmouseover|onmouseup|onreset|onselect|onsubmit|onunload)$/i;
377 "Suggest ESCAPE=$suggest for TMPL_VAR in attribute \"$key\""
379 $lc, \$pedantic_tmpl_var_use_in_nonpedantic_mode_p
380 if defined $suggest && (pedantic_p || !$pedantic_tmpl_var_use_in_nonpedantic_mode_p);
381 } elsif ($val_orig !~ /^['"]/) {
382 my $t = $val; $t =~ s/$re_directive_control//os;
384 "Unquoted attribute contains character(s) that should be quoted"
386 $lc, \$pedantic_attribute_error_in_nonpedantic_mode_p
387 if $t =~ /[^-\.A-Za-z0-9]/s;
390 my $s2 = $s; $s2 =~ s/$re_tmpl_endif_endloop//g; # for the next check
391 if ($s2 =~ /\S/s) { # should never happen
392 if ($s =~ /^([^\n]*)\n/s) { # this is even worse
393 error_normal("Completely confused while extracting attributes: $1", $lc);
394 error_normal((scalar(split(/\n/, $s)) - 1) . " more line(s) not shown.", undef);
395 $this->_set_fatal( 1 );
397 # There's something wrong with the attribute syntax.
398 # We might be able to deduce a likely cause by looking more.
399 if ($s =~ /^[a-z0-9]/is && "<foo $s>" =~ /^$re_tag_compat$/s) {
400 warn_normal "Probably missing whitespace before or missing quotation mark near: $s\n", $lc;
402 warn_normal "Strange attribute syntax: $s\n", $lc;
409 sub _next_token_internal {
414 $this->_pop_readahead if $this->has_readahead_p
415 && !ref $this->_peek_readahead
416 && !length $this->_peek_readahead;
417 if (!$this->has_readahead_p) {
418 my $next = scalar <$h>;
419 $eof_p = !defined $next;
421 $this->_increment_line_number;
422 $this->_push_readahead( $next );
425 $this->_set_line_number_start( $this->line_number ); # remember 1st line num
426 if ($this->has_readahead_p && ref $this->_peek_readahead) { # TmplToken obj.
427 ($it, $kind) = ($this->_pop_readahead, undef);
428 } elsif ($eof_p && !$this->has_readahead_p) { # nothing left to do
430 } elsif ($this->_peek_readahead =~ /^\s+/s) { # whitespace
431 ($kind, $it) = (TmplTokenType::TEXT, $&);
432 $this->_set_readahead( $' );
433 # FIXME the following (the [<\s] part) is an unreliable HACK :-(
434 } elsif ($this->_peek_readahead =~ /^(?:[^<]|<[<\s])*(?:[^<\s])/s) { # non-space normal text
435 ($kind, $it) = (TmplTokenType::TEXT, $&);
436 $this->_set_readahead( $' );
437 warn_normal "Unescaped < in $it\n", $this->line_number_start
438 if !$this->cdata_mode_p && $it =~ /</s;
439 } else { # tag/declaration/processing instruction
441 my $bad_comment_p = 0;
442 for (my $cdata_close = $this->cdata_close;;) {
443 if ($this->cdata_mode_p) {
444 my $next = $this->_pop_readahead;
445 if ($next =~ /^$cdata_close/is) {
446 ($kind, $it) = (TmplTokenType::TAG, $&);
447 $this->_push_readahead( $' );
449 } elsif ($next =~ /^((?:(?!$cdata_close).)+)($cdata_close)/is) {
450 ($kind, $it) = (TmplTokenType::TEXT, $1);
451 $this->_push_readahead( "$2$'" );
454 ($kind, $it) = (TmplTokenType::TEXT, $next);
457 } elsif ($this->_peek_readahead =~ /^$re_tag_compat/os) {
458 # If we detect a "closed start tag" but we know that the
459 # following token looks like a TMPL_VAR, don't stop
460 my($head, $tail, $post) = ($1, $2, $3);
461 if ($tail eq '' && $post =~ $re_tmpl_var) {
462 # Don't bother to show the warning if we're too confused
463 # FIXME. There's no method for _closed_start_tag_warning
464 if (!defined $this->{'_closed_start_tag_warning'}
465 || ($this->{'_closed_start_tag_warning'}->[0] eq $head
466 && $this->{'_closed_start_tag_warning'}->[1] != $this->line_number - 1)) {
467 warn_normal "Possible SGML \"closed start tag\" notation: $head<\n", $this->line_number
468 if split(/\n/, $head) < 10;
470 $this->{'_closed_start_tag_warning'} = [$head, $this->line_number];
472 ($kind, $it) = (TmplTokenType::TAG, "$head>");
473 $this->_set_readahead( $post );
475 warn_normal "SGML \"closed start tag\" notation: $head<\n", $this->line_number if $tail eq '';
477 } elsif ($this->_peek_readahead =~ /^<!--(?:(?!-->)$re_directive*.)*-->/os) {
478 ($kind, $it) = (TmplTokenType::COMMENT, $&);
479 $this->_set_readahead( $' );
484 my $next = scalar <$h>;
485 $eof_p = !defined $next;
487 $this->_increment_line_number;
488 $this->_append_readahead( $next );
490 if ($kind ne TmplTokenType::TAG) {
492 } elsif ($it =~ /^<!/) {
493 $kind = TmplTokenType::DECL;
494 $kind = TmplTokenType::COMMENT if $it =~ /^<!--(?:(?!-->).)*-->/;
495 if ($kind == TmplTokenType::COMMENT && $it =~ /^<!--\s*#include/s) {
496 warn_normal "Apache #include directive found instead of HTML::Template <TMPL_INCLUDE> directive", $this->line_number_start;
498 } elsif ($it =~ /^<\?/) {
499 $kind = TmplTokenType::PI;
501 if ($it =~ /^$re_directive/ios && !$this->cdata_mode_p) {
502 $kind = TmplTokenType::DIRECTIVE;
503 } elsif ($bad_comment_p) {
504 warn_normal sprintf("Syntax error in comment: %s\n", $it),
505 $this->line_number_start;
506 $this->_set_syntaxerror( 1 );
508 if (!$ok_p && $eof_p) {
509 ($kind, $it) = (TmplTokenType::UNKNOWN, $this->_peek_readahead);
510 $this->_set_readahead, undef;
511 $this->_set_syntaxerror( 1 );
514 warn_normal "Unrecognizable token found: "
515 . (split(/\n/, $it) < 10? $it: '(too confused to show details)')
516 . "\n", $this->line_number_start
517 if $kind == TmplTokenType::UNKNOWN;
518 return defined $it? (ref $it? $it: TmplToken->new($it, $kind, $this->line_number, $this->filename)): undef;
521 sub _next_token_intermediate {
523 my $h = $this->_handle;
525 if (!$this->cdata_mode_p) {
526 $it = $this->_next_token_internal($h);
527 if (defined $it && $it->type == TmplTokenType::TAG) {
528 if ($it->string =~ /^<(script|style|textarea)\b/is) {
529 $this->_set_cdata_mode( 1 );
530 $this->_set_cdata_close( "</$1\\s*>" );
531 $this->_set_pcdata_mode( 0 );
532 $this->_set_js_mode( lc($1) eq 'script' );
533 # } elsif ($it->string =~ /^<(title)\b/is) {
534 # $this->_set_cdata_mode( 1 );
535 # $this->_set_cdata_close( "</$1\\s*>" );
536 # $this->_set_pcdata_mode( 1 );
538 $it->set_attributes( $this->_extract_attributes($it->string, $it->line_number) );
542 for ($it = '', my $cdata_close = $this->cdata_close;;) {
543 my $next = $this->_next_token_internal($h);
544 $eof_p = !defined $next;
546 if (defined $next && $next->string =~ /$cdata_close/is) {
547 $this->_push_readahead( $next ); # push entire TmplToken object
548 $this->_set_cdata_mode( 0 );
550 last unless $this->cdata_mode_p;
551 $it .= $next->string;
555 error_normal "Unexpected end of file while looking for "
557 . "\n", $this->line_number_start;
558 $this->_set_fatal( 1 );
559 $this->_set_syntaxerror( 1 );
561 if ($this->pcdata_mode_p) {
563 $check =~ s/$re_directive//gos;
564 warn_pedantic "Markup found in PCDATA\n", $this->line_number,
565 \$pedantic_error_markup_in_pcdata_p
566 if $check =~ /$re_tag_compat/s;
568 # PCDATA should be treated as text, not CDATA
569 # Actually it should be treated as TEXT_PARAMETRIZED :-(
570 $it = TmplToken->new( $it,
571 ($this->pcdata_mode_p?
572 TmplTokenType::TEXT: TmplTokenType::CDATA),
573 $this->line_number, $this->filename )
575 if ($this->js_mode_p) {
576 my $s0 = $it->string;
579 if ($s0 =~ /^(\s*<!--\s*)(.*)(\s*--\s*>\s*)$/s) {
584 push @head, split_js $s0;
585 $it->set_js_data( identify_js_translatables(@head, @tail) );
587 $this->_set_pcdata_mode, 0;
588 $this->_set_cdata_close, undef unless !defined $it;
593 sub _token_groupable1_p ($) { # as first token, groupable into TEXT_PARAMETRIZED
595 return ($t->type == TmplTokenType::TEXT && $t->string !~ /^[,\.:\|\s]+$/is)
596 || ($t->type == TmplTokenType::DIRECTIVE
597 && $t->string =~ /^(?:$re_tmpl_var)$/os)
598 || ($t->type == TmplTokenType::TAG
599 && ($t->string =~ /^<(?:a|b|em|h[123456]|i|u)\b/is
600 || ($t->string =~ /^<input\b/is
601 && $t->attributes->{'type'}->[1] =~ /^(?:radio|text)$/is)
605 sub _token_groupable2_p ($) { # as other token, groupable into TEXT_PARAMETRIZED
607 return ($t->type == TmplTokenType::TEXT && ($t->string =~ /^\s*$/s || $t->string !~ /^[\|\s]+$/is))
608 || ($t->type == TmplTokenType::DIRECTIVE
609 && $t->string =~ /^(?:$re_tmpl_var)$/os)
610 || ($t->type == TmplTokenType::TAG
611 && ($t->string =~ /^<\/?(?:a|b|em|h[123456]|i|u)\b/is
612 || ($t->string =~ /^<input\b/is
613 && $t->attributes->{'type'}->[1] =~ /^(?:radio|text)$/is)))
616 sub _quote_cformat ($) {
622 sub string_canon ($) {
625 # Fold all whitespace into single blanks
631 sub _formalize_string_cformat ($) {
633 return _quote_cformat string_canon $s;
638 return $t->type == TmplTokenType::DIRECTIVE? '%s':
639 $t->type == TmplTokenType::TEXT?
640 _formalize_string_cformat($t->string):
641 $t->type == TmplTokenType::TAG?
642 ($t->string =~ /^<a\b/is? '<a>':
643 $t->string =~ /^<input\b/is? (
644 lc $t->attributes->{'type'}->[1] eq 'text' ? '%S':
646 _quote_cformat($t->string)):
647 _quote_cformat($t->string);
653 my $undo_trailing_blanks = sub {
654 for (my $i = $#structure; $i >= 0; $i -= 1) {
655 last unless ($structure[$i]->type == TmplTokenType::TEXT && blank_p($structure[$i]->string)) ;#|| ($structure[$i]->type == TmplTokenType::TAG && $structure[$i]->string =~ /^<br\b/is);
656 # Queue element structure: [reanalysis-p, token]
657 push @{$this->{_queue}}, [1, pop @structure];
660 &$undo_trailing_blanks;
661 while (@structure >= 2) {
662 my $something_done_p = 0;
663 # FIXME: If the last token is a close tag but there are no tags
664 # FIXME: before it, drop the close tag back into the queue. This
665 # FIXME: is an ugly hack to get rid of "foo %s</h1>" type mess.
667 && $structure[$#structure]->type == TmplTokenType::TAG
668 && $structure[$#structure]->string =~ /^<\//s) {
669 my $has_other_tags_p = 0;
670 for (my $i = 0; $i < $#structure; $i += 1) {
671 $has_other_tags_p = 1
672 if $structure[$i]->type == TmplTokenType::TAG;
673 last if $has_other_tags_p;
675 if (!$has_other_tags_p) {
676 push @{$this->{_queue}}, [0, pop @structure]
677 &$undo_trailing_blanks;
678 $something_done_p = 1;
681 # FIXME: Do the same ugly hack for the last token being a ( or [
683 && $structure[$#structure]->type == TmplTokenType::TEXT
684 && $structure[$#structure]->string =~ /^[\(\[]$/) { # not )]
685 push @{$this->{_queue}}, [1, pop @structure];
686 &$undo_trailing_blanks;
687 $something_done_p = 1;
689 # FIXME: If the first token is an open tag, but there is no
690 # FIXME: corresponding close tag, "drop the open tag", i.e.,
691 # FIXME: requeue everything for reanalysis, except the frist tag. :-(
693 && $structure[0]->type == TmplTokenType::TAG
694 && $structure[0]->string =~ /^<([a-z0-9]+)/is
695 && (my $tag = $1) !~ /^(?:br|hr|img|input)\b/is
697 my $tag_open_count = 1;
698 for (my $i = 1; $i <= $#structure; $i += 1) {
699 if ($structure[$i]->type == TmplTokenType::TAG) {
700 if ($structure[$i]->string =~ /^<(\/?)$tag\b/is) {
701 $tag_open_count += ($1? -1: +1);
705 if ($tag_open_count > 0) {
706 for (my $i = $#structure; $i; $i -= 1) {
707 push @{$this->{_queue}}, [1, pop @structure];
709 $something_done_p = 1;
712 # FIXME: If the first token is an open tag, the last token is the
713 # FIXME: corresponding close tag, and there are no other close tags
714 # FIXME: inbetween, requeue the tokens from the second token on,
715 # FIXME: flagged as ok for re-analysis
717 && $structure[0]->type == TmplTokenType::TAG
718 && $structure[0]->string =~ /^<([a-z0-9]+)/is && (my $tag = $1)
719 && $structure[$#structure]->type == TmplTokenType::TAG
720 && $structure[$#structure]->string =~ /^<\/$1\s*>$/is) {
721 my $has_other_open_or_close_tags_p = 0;
722 for (my $i = 1; $i < $#structure; $i += 1) {
723 $has_other_open_or_close_tags_p = 1
724 if $structure[$i]->type == TmplTokenType::TAG
725 && $structure[$i]->string =~ /^<\/?$tag\b/is;
726 last if $has_other_open_or_close_tags_p;
728 if (!$has_other_open_or_close_tags_p) {
729 for (my $i = $#structure; $i; $i -= 1) {
730 push @{$this->{_queue}}, [1, pop @structure];
732 $something_done_p = 1;
735 last if !$something_done_p;
740 sub looks_plausibly_like_groupable_text_p (@) {
742 # The text would look plausibly groupable if all open tags are also closed.
745 for (my $i = 0; $i <= $#structure; $i += 1) {
746 if ($structure[$i]->type == TmplTokenType::TAG) {
747 my $form = $structure[$i]->string;
748 if ($form =~ /^<([A-Z0-9]+)/is) {
750 if ($tag !~ /^(?:br|input)$/is && $form !~ /\/>$/is) {
753 } elsif ($form =~ /^<\/([A-Z0-9]+)/is) {
754 if (@tags && lc($1) eq $tags[$#tags]) {
760 } elsif ($structure[$i]->type != TmplTokenType::TEXT) {
765 return !$error_p && !@tags;
770 my $h = $this->_handle;
772 $this->{_queue} = [] unless defined $this->{_queue};
774 # Elements in the queue are ordered pairs. The first in the ordered pair
775 # specifies whether we are allowed to reanalysis; the second is the token.
776 if (@{$this->{_queue}} && !$this->{_queue}->[$#{$this->{_queue}}]->[0]) {
777 $it = (pop @{$this->{_queue}})->[1];
779 if (@{$this->{_queue}}) {
780 $it = (pop @{$this->{_queue}})->[1];
782 $it = $this->_next_token_intermediate($h);
784 if (!$this->cdata_mode_p && $this->allow_cformat_p && defined $it
785 && ($it->type == TmplTokenType::TEXT?
786 !blank_p( $it->string ): _token_groupable1_p( $it ))) {
787 my @structure = ( $it );
790 my($nonblank_text_p, $parametrized_p, $with_anchor_p, $with_input_p) = (0, 0, 0, 0);
791 if ($it->type == TmplTokenType::TEXT) {
792 $nonblank_text_p = 1 if !blank_p( $it->string );
793 } elsif ($it->type == TmplTokenType::DIRECTIVE) {
795 } elsif ($it->type == TmplTokenType::TAG && $it->string =~ /^<([A-Z0-9]+)/is) {
797 push @tags, $tag if $tag !~ /^(?:br|input)$/i;
798 $with_anchor_p = 1 if $tag eq 'a';
799 $with_input_p = 1 if $tag eq 'input';
801 # We hate | and || in msgid strings, so we try to avoid them
802 for (my $i = 1, my $quit_p = 0, my $quit_next_p = ($it->type == TmplTokenType::TEXT && $it->string =~ /^\|+$/s);; $i += 1) {
803 if (@{$this->{_queue}}) {
804 $next = (pop @{$this->{_queue}})->[1];
806 $next = $this->_next_token_intermediate($h);
808 push @structure, $next; # for consistency (with initialization)
809 last unless defined $next && _token_groupable2_p( $next );
810 last if $quit_next_p;
811 if ($next->type == TmplTokenType::TEXT) {
812 $nonblank_text_p = 1 if !blank_p( $next->string );
813 $quit_p = 1 if $next->string =~ /^\|+$/s; # We hate | and ||
814 } elsif ($next->type == TmplTokenType::DIRECTIVE) {
816 } elsif ($next->type == TmplTokenType::TAG) {
817 if ($next->string =~ /^<([A-Z0-9]+)/is) {
819 push @tags, $tag if $tag !~ /^(?:br|input)$/i;
820 $with_anchor_p = 1 if $tag eq 'a';
821 $with_input_p = 1 if $tag eq 'input';
822 } elsif ($next->string =~ /^<\/([A-Z0-9]+)/is) {
824 $quit_p = 1 unless @tags && $close eq $tags[$#tags];
825 $quit_next_p = 1 if $close =~ /^h\d$/;
831 # Undo the last token, allowing reanalysis
832 push @{$this->{_queue}}, [1, pop @structure];
833 # Simply it a bit more
834 @structure = $this->_optimize( @structure );
835 if (@structure < 2) {
838 } elsif ($nonblank_text_p && ($parametrized_p || $with_anchor_p || $with_input_p)) {
839 # Create the corresponding c-format string
840 my $string = join('', map { $_->string } @structure);
841 my $form = join('', map { _formalize $_ } @structure);
842 my($a_counter, $input_counter) = (0, 0);
843 $form =~ s/<a>/ $a_counter += 1, "<a$a_counter>" /egs;
844 $form =~ s/<input>/ $input_counter += 1, "<input$input_counter>" /egs;
845 $it = TmplToken->new($string, TmplTokenType::TEXT_PARAMETRIZED,
846 $it->line_number, $it->pathname);
847 $it->set_form( $form );
848 $it->set_children( @structure );
849 } elsif ($nonblank_text_p
850 && looks_plausibly_like_groupable_text_p( @structure )
851 && $structure[$#structure]->type == TmplTokenType::TEXT) {
852 # Combine the strings
853 my $string = join('', map { $_->string } @structure);
854 $it = TmplToken->new($string, TmplTokenType::TEXT,
855 $it->line_number, $it->pathname);;
857 # Requeue the tokens thus seen for re-emitting, allow reanalysis
859 push @{$this->{_queue}}, [1, pop @structure];
862 $it = (pop @{$this->{_queue}})->[1];
866 if (defined $it && $it->type == TmplTokenType::TEXT) {
867 my $form = string_canon $it->string;
868 $it->set_form( $form );
873 ###############################################################################
875 # Other simple functions (These are not methods)
879 return $s =~ /^(?:\s|\ $re_end_entity|$re_tmpl_var|$re_xsl)*$/os;
886 $s =~ s/^(\s|\ $re_end_entity)+//os; my $l1 = $l0 - length $s;
887 $s =~ s/(\s|\ $re_end_entity)+$//os; my $l2 = $l0 - $l1 - length $s;
888 return wantarray? (substr($s0, 0, $l1), $s, substr($s0, $l0 - $l2)): $s;
893 # Locale::PO->quote is buggy, it doesn't quote newlines :-/
894 $s =~ s/([\\"])/\\\1/gs;
896 #$s =~ s/[\177-\377]/ sprintf("\\%03o", ord($&)) /egs;
900 # Some functions that shouldn't be here... should be moved out some time
901 sub parametrize ($$$$) {
902 my($fmt_0, $cformat_p, $t, $f) = @_;
905 my @params = $t->parameters_and_fields;
906 for (my $n = 0, my $fmt = $fmt_0; length $fmt;) {
907 if ($fmt =~ /^[^%]+/) {
910 } elsif ($fmt =~ /^%%/) {
913 } elsif ($fmt =~ /^%(?:(\d+)\$)?(?:(\d+)(?:\.(\d+))?)?s/s) {
915 my($i, $width, $prec) = ((defined $1? $1: $n), $2, $3);
917 if (defined $width && defined $prec && !$width && !$prec) {
919 } elsif (defined $params[$i - 1]) {
920 my $param = $params[$i - 1];
921 warn_normal "$fmt_0: $&: Expected a TMPL_VAR, but found a "
922 . $param->type->to_string . "\n", undef
923 if $param->type != TmplTokenType::DIRECTIVE;
924 warn_normal "$fmt_0: $&: Unsupported "
925 . "field width or precision\n", undef
926 if defined $width || defined $prec;
927 warn_normal "$fmt_0: $&: Parameter $i not known", undef
928 unless defined $param;
929 $it .= defined $f? &$f( $param ): $param->string;
931 } elsif ($fmt =~ /^%(?:(\d+)\$)?(?:(\d+)(?:\.(\d+))?)?([pS])/s) {
933 my($i, $width, $prec, $conv) = ((defined $1? $1: $n), $2, $3, $4);
936 my $param = $params[$i - 1];
937 if (!defined $param) {
938 warn_normal "$fmt_0: $&: Parameter $i not known", undef;
940 if ($param->type == TmplTokenType::TAG
941 && $param->string =~ /^<input\b/is) {
942 my $type = defined $param->attributes?
943 lc($param->attributes->{'type'}->[1]): undef;
945 warn_normal "$fmt_0: $&: Expected type=text, "
946 . "but found type=$type", undef
947 unless $type eq 'text';
948 } elsif ($conv eq 'p') {
949 warn_normal "$fmt_0: $&: Expected type=radio, "
950 . "but found type=$type", undef
951 unless $type eq 'radio';
954 warn_normal "$&: Expected an INPUT, but found a "
955 . $param->type->to_string . "\n", undef
957 warn_normal "$fmt_0: $&: Unsupported "
958 . "field width or precision\n", undef
959 if defined $width || defined $prec;
960 $it .= defined $f? &$f( $param ): $param->string;
962 } elsif ($fmt =~ /^%[^%a-zA-Z]*[a-zA-Z]/) {
965 die "$&: Unknown or unsupported format specification\n"; #XXX
967 die "$&: Completely confused parametrizing\n";#XXX
971 my @anchors = $t->anchors;
972 for (my $n = 0, my $fmt = $it, $it = ''; length $fmt;) {
973 if ($fmt =~ /^(?:(?!<a\d+>).)+/is) {
976 } elsif ($fmt =~ /^<a(\d+)>/is) {
980 my $anchor = $anchors[$i - 1];
981 warn_normal "$&: Anchor $1 not found for msgid \"$fmt_0\"", undef #FIXME
982 unless defined $anchor;
983 $it .= $anchor->string;
985 die "Completely confused decoding anchors: $fmt\n";#XXX
991 sub charset_canon ($) {
993 $charset = uc($charset);
994 $charset = "$1-$2" if $charset =~ /^(ISO|UTF)(\d.*)/i;
995 $charset = 'Big5' if $charset eq 'BIG5'; # "Big5" must be in mixed case
999 use vars qw( @latin1_utf8 );
1001 "\302\200", "\302\201", "\302\202", "\302\203", "\302\204", "\302\205",
1002 "\302\206", "\302\207", "\302\210", "\302\211", "\302\212", "\302\213",
1003 "\302\214", "\302\215", undef, undef, "\302\220", "\302\221",
1004 "\302\222", "\302\223", "\302\224", "\302\225", "\302\226", "\302\227",
1005 "\302\230", "\302\231", "\302\232", "\302\233", "\302\234", "\302\235",
1006 "\302\236", "\302\237", "\302\240", "\302\241", "\302\242", "\302\243",
1007 "\302\244", "\302\245", "\302\246", "\302\247", "\302\250", "\302\251",
1008 "\302\252", "\302\253", "\302\254", "\302\255", "\302\256", "\302\257",
1009 "\302\260", "\302\261", "\302\262", "\302\263", "\302\264", "\302\265",
1010 "\302\266", "\302\267", "\302\270", "\302\271", "\302\272", "\302\273",
1011 "\302\274", "\302\275", "\302\276", "\302\277", "\303\200", "\303\201",
1012 "\303\202", "\303\203", "\303\204", "\303\205", "\303\206", "\303\207",
1013 "\303\210", "\303\211", "\303\212", "\303\213", "\303\214", "\303\215",
1014 "\303\216", "\303\217", "\303\220", "\303\221", "\303\222", "\303\223",
1015 "\303\224", "\303\225", "\303\226", "\303\227", "\303\230", "\303\231",
1016 "\303\232", "\303\233", "\303\234", "\303\235", "\303\236", "\303\237",
1017 "\303\240", "\303\241", "\303\242", "\303\243", "\303\244", "\303\245",
1018 "\303\246", "\303\247", "\303\250", "\303\251", "\303\252", "\303\253",
1019 "\303\254", "\303\255", "\303\256", "\303\257", "\303\260", "\303\261",
1020 "\303\262", "\303\263", "\303\264", "\303\265", "\303\266", "\303\267",
1021 "\303\270", "\303\271", "\303\272", "\303\273", "\303\274", "\303\275",
1022 "\303\276", "\303\277" );
1024 sub charset_convert ($$$) {
1025 my($s, $charset_in, $charset_out) = @_;
1026 if ($s !~ /[\200-\377]/s) { # FIXME: don't worry about iso2022 for now
1028 } elsif ($charset_in eq 'ISO-8859-1' && $charset_out eq 'UTF-8') {
1029 $s =~ s/[\200-\377]/ $latin1_utf8[ord($&) - 128] /egs;
1030 } elsif ($charset_in ne $charset_out) {
1031 VerboseWarnings::warn_normal "conversion from $charset_in to $charset_out is not supported\n", undef;
1036 ###############################################################################
1040 In addition to the basic scanning, this class will also perform
1047 Emulation of c-format strings (see below)
1051 Display of warnings for certain things that affects either the
1052 ability of this class to yield correct output, or things that
1053 are known to cause the original template to cause trouble.
1057 Automatic correction of some of the things warned about
1058 (e.g., SGML "closed start tag" notation).
1062 =head2 c-format strings emulation
1064 Because English word order is not universal, a simple extraction
1065 of translatable strings may yield some strings like "Accounts for"
1066 or ambiguous strings like "in". This makes the resulting strings
1067 difficult to translate, but does not affect all languages alike.
1068 For example, Chinese (with a somewhat different word order) would
1069 be hit harder, but French would be relatively unaffected.
1071 To overcome this problem, the scanner can be configured to detect
1072 patterns with <TMPL_VAR> directives (as well as certain HTML tags),
1073 and try to construct a larger pattern that will appear in the PO
1074 file as c-format strings with %s placeholders. This additional
1075 step allows the translator to deal with cases where word order
1076 is different (replacing %s with %1$s, %2$s, etc.), or when certain
1077 words will require certain inflectional suffixes in sentences.
1079 Because this is an incompatible change, this mode must be explicitly
1080 turned on using the set_cformat(1) method call.
1082 =head2 The flag characters
1084 The character % is followed by zero or more of the following flags:
1090 The value comes from HTML <INPUT> elements.
1091 This abuse of the flag character is somewhat reasonable,
1092 since TMPL_VAR and INPUT are both variables, but of different kinds.
1096 =head2 The field width and precision
1098 An optional 0.0 can be specified for %s to specify
1099 that the <TMPL_VAR> should be suppressed.
1101 =head2 The conversion specifier
1107 Specifies any input field that is neither text nor hidden
1108 (which currently mean radio buttons).
1109 The p conversion specifier is chosen because this does not
1110 evoke any certain sensible data type.
1114 Specifies a text input field (<INPUT TYPE=TEXT>).
1115 This use of the S conversion specifier is somewhat reasonable,
1116 since text input fields contain values of undeterminable type,
1117 which can be treated as strings.
1121 Specifies a <TMPL_VAR>.
1122 This use of the o conversion specifier is somewhat reasonable,
1123 since <TMPL_VAR> denotes values of undeterminable type, which
1124 can be treated as strings.
1130 There is no code to save the tag name anywhere in the scanned token.
1132 The use of <AI<i>> to stand for the I<i>th anchor
1133 is not very well thought out.
1134 Some abuse of c-format specifies might have been more appropriate.
1138 This tokenizer is mostly based
1139 on Ambrose's hideous Perl script known as subst.pl.