6 use VerboseWarnings qw( pedantic_p error_normal warn_normal warn_pedantic );
9 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
11 ###############################################################################
15 TmplTokenizer.pm - Simple-minded tokenizer class for HTML::Template .tmpl files
19 Because .tmpl files contains HTML::Template directives
20 that tend to confuse real parsers (e.g., HTML::Parse),
21 it might be better to create a customized scanner
22 to scan the template files for tokens.
23 This module is a simple-minded attempt at such a scanner.
27 ###############################################################################
34 use vars qw( $pedantic_attribute_error_in_nonpedantic_mode_p );
35 use vars qw( $pedantic_tmpl_var_use_in_nonpedantic_mode_p );
36 use vars qw( $pedantic_error_markup_in_pcdata_p );
38 ###############################################################################
41 use vars qw( $re_directive $re_tmpl_var $re_tmpl_var_escaped $re_tmpl_include );
42 use vars qw( $re_directive_control $re_tmpl_endif_endloop );
44 # $re_directive must not do any backreferences
45 $re_directive = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:VAR|LOOP|INCLUDE|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
46 # TMPL_VAR or TMPL_INCLUDE
47 $re_tmpl_var = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
48 $re_tmpl_include = q{<(?:(?i)(?:!--\s*)?TMPL_(?:INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
49 # TMPL_VAR ESCAPE=1/HTML/URL
50 $re_tmpl_var_escaped = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR|INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))\s+ESCAPE=(?:1|HTML|URL)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
51 # Any control flow directive
52 $re_directive_control = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:LOOP|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
53 # /LOOP or /IF or /UNLESS
54 $re_tmpl_endif_endloop = q{<(?:(?i)(?:!--\s*)?\/TMPL_(?:LOOP|IF|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
57 # Hideous stuff from subst.pl, slightly modified to use the above hideous stuff
58 # Note: The $re_tag's set $1 (<tag), $2 (>), and $3 (rest of string)
59 use vars qw( $re_comment $re_entity_name $re_end_entity $re_etag );
60 use vars qw( $re_tag_strict $re_tag_compat @re_tag );
63 my $etag = $compat? '>': '<>\/';
64 # This is no longer similar to the original regexp in subst.pl :-(
65 # Note that we don't want <> in compat mode; Mozilla knows about <
66 q{(<\/?(?:|(?:"(?:} . $re_directive . q{|[^"])*"|'(?:} . $re_directive . q{|[^'])*'|--(?:(?!--)(?:$re_directive)*.)*--|(?:}
68 . q{|(?!--)[^"'<>} . $etag . q{]))+))([} . $etag . q{]|(?=<))(.*)};
71 $re_comment = '(?:--(?:[^-]|-[^-])*--)';
72 $re_entity_name = '(?:[^&%#;<>\s]+)'; # NOTE: not really correct SGML
73 $re_end_entity = '(?:;|$|(?=\s))'; # semicolon or before-whitespace
74 $re_etag = q{(?:<\/?(?:"[^"]*"|'[^']*'|[^"'>\/])*[>\/])}; # end-tag
75 @re_tag = ($re_tag_strict, $re_tag_compat) = (re_tag(0), re_tag(1));
78 # End of the hideous stuff
80 use vars qw( $serial );
82 ###############################################################################
84 sub FATAL_P () {'fatal-p'}
85 sub SYNTAXERROR_P () {'syntaxerror-p'}
87 sub FILENAME () {'input'}
88 sub HANDLE () {'handle'}
90 sub READAHEAD () {'readahead'}
91 sub LINENUM_START () {'lc_0'}
93 sub CDATA_MODE_P () {'cdata-mode-p'}
94 sub CDATA_CLOSE () {'cdata-close'}
95 sub PCDATA_MODE_P () {'pcdata-mode-p'} # additional submode for CDATA
96 sub JS_MODE_P () {'js-mode-p'} # cdata-mode-p must also be true
98 sub ALLOW_CFORMAT_P () {'allow-cformat-p'}
103 my $class = ref($this) || $this;
107 my $handle = sprintf('TMPLTOKENIZER%d', $serial);
111 open($handle, "<$input") || die "$input: $!\n";
113 $self->{+FILENAME} = $input;
114 $self->{+HANDLE} = $handle;
115 $self->{+READAHEAD} = [];
119 ###############################################################################
125 return $this->{+FILENAME};
130 return $this->{+HANDLE};
135 return $this->{+FATAL_P};
140 return $this->{+SYNTAXERROR_P};
143 sub has_readahead_p {
145 return @{$this->{+READAHEAD}};
148 sub _peek_readahead {
150 return $this->{+READAHEAD}->[$#{$this->{+READAHEAD}}];
153 sub line_number_start {
155 return $this->{+LINENUM_START};
160 return $this->{+LINENUM};
165 return $this->{+CDATA_MODE_P};
170 return $this->{+PCDATA_MODE_P};
175 return $this->{+JS_MODE_P};
180 return $this->{+CDATA_CLOSE};
183 sub allow_cformat_p {
185 return $this->{+ALLOW_CFORMAT_P};
192 $this->{+FATAL_P} = $_[0];
196 sub _set_syntaxerror {
198 $this->{+SYNTAXERROR_P} = $_[0];
202 sub _push_readahead {
204 push @{$this->{+READAHEAD}}, $_[0];
210 return pop @{$this->{+READAHEAD}};
213 sub _append_readahead {
215 $this->{+READAHEAD}->[$#{$this->{+READAHEAD}}] .= $_[0];
221 $this->{+READAHEAD}->[$#{$this->{+READAHEAD}}] = $_[0];
225 sub _increment_line_number {
227 $this->{+LINENUM} += 1;
231 sub _set_line_number_start {
233 $this->{+LINENUM_START} = $_[0];
237 sub _set_cdata_mode {
239 $this->{+CDATA_MODE_P} = $_[0];
243 sub _set_pcdata_mode {
245 $this->{+PCDATA_MODE_P} = $_[0];
251 $this->{+JS_MODE_P} = $_[0];
255 sub _set_cdata_close {
257 $this->{+CDATA_CLOSE} = $_[0];
261 sub set_allow_cformat {
263 $this->{+ALLOW_CFORMAT_P} = $_[0];
267 ###############################################################################
269 use vars qw( $js_EscapeSequence );
271 # Perl quoting is really screwed up, but this common subexp is way too long
272 $js_EscapeSequence = q{\\\\(?:['"\\\\bfnrt]|[^0-7xu]|[0-3]?[0-7]{1,2}|x[\da-fA-F]{2}|u[\da-fA-F]{4})};
274 sub parenleft () { '(' }
275 sub parenright () { ')' }
281 if ($s0 =~ /^\s+/s) { # whitespace
284 } elsif ($s0 =~ /^\/\/[^\r\n]*(?:[\r\n]|$)/s) { # C++-style comment
287 } elsif ($s0 =~ /^\/\*(?:(?!\*\/).)*\*\//s) { # C-style comment
290 # Keyword or identifier, ECMA-262 p.13 (section 7.5)
291 } elsif ($s0 =~ /^[A-Z_\$][A-Z\d_\$]*/is) { # IdentifierName
294 # Punctuator, ECMA-262 p.13 (section 7.6)
295 } elsif ($s0 =~ /^(?:[\(\){}\[\];]|>>>=|<<=|>>=|[-\+\*\/\&\|\^\%]=|>>>|<<|>>|--|\+\+|\|\||\&\&|==|<=|>=|!=|[=><,!~\?:\.\-\+\*\/\&\|\^\%])/s) {
298 # DecimalLiteral, ECMA-262 p.14 (section 7.7.3); note: bug in the spec
299 } elsif ($s0 =~ /^(?:0|[1-9]\d+(?:\.\d*(?:[eE][-\+]?\d+)?)?)/s) {
302 # HexIntegerLiteral, ECMA-262 p.15 (section 7.7.3)
303 } elsif ($s0 =~ /^0[xX][\da-fA-F]+/s) {
306 # OctalIntegerLiteral, ECMA-262 p.15 (section 7.7.3)
307 } elsif ($s0 =~ /^0[\da-fA-F]+/s) {
310 # StringLiteral, ECMA-262 p.17 (section 7.7.4)
311 # XXX SourceCharacter doesn't seem to be defined (?)
312 } elsif ($s0 =~ /^(?:"(?:(?!["\\\r\n]).|$js_EscapeSequence)*"|'(?:(?!['\\\r\n]).|$js_EscapeSequence)*')/os) {
315 } elsif ($s0 =~ /^./) { # UNKNOWN TOKEN !!!
323 sub STATE_UNDERSCORE () { 1 }
324 sub STATE_PARENLEFT () { 2 }
325 sub STATE_STRING_LITERAL () { 3 }
327 # XXX This is a crazy hack. I don't want to write an ECMAScript parser.
328 # XXX A scanner is one thing; a parser another thing.
329 sub identify_js_translatables (@) {
332 # We mark a JavaScript translatable string as in C, i.e., _("literal")
333 # For simplicity, we ONLY look for "_" "(" StringLiteral ")"
334 for (my $i = 0, my $state = 0, my($j, $q, $s); $i <= $#input; $i += 1) {
335 my $reset_state_p = 0;
336 push @output, [0, $input[$i]];
337 if ($input[$i] !~ /\S/s) {
339 } elsif ($state == 0) {
340 $state = STATE_UNDERSCORE if $input[$i] eq '_';
341 } elsif ($state == STATE_UNDERSCORE) {
342 $state = $input[$i] eq parenleft ? STATE_PARENLEFT : 0;
343 } elsif ($state == STATE_PARENLEFT) {
344 if ($input[$i] =~ /^(['"])(.*)\1$/s) {
345 ($state, $j, $q, $s) = (STATE_STRING_LITERAL, $#output, $1, $2);
349 } elsif ($state == STATE_STRING_LITERAL) {
350 if ($input[$i] eq parenright) {
351 $output[$j] = [1, $output[$j]->[1], $q, $s];
355 die "identify_js_translatables internal error: Unknown state $state"
361 ###############################################################################
363 sub _extract_attributes ($;$) {
367 $s = $1 if $s =~ /^<(?:(?!$re_directive_control)\S)+(.*)\/\S$/s # XML-style self-closing tags
368 || $s =~ /^<(?:(?!$re_directive_control)\S)+(.*)\S$/s; # SGML-style tags
370 for (my $i = 0; $s =~ /^(?:$re_directive_control)?\s+(?:$re_directive_control)?(?:([a-zA-Z][-a-zA-Z0-9]*)\s*=\s*)?('((?:$re_directive|[^'])*)'|"((?:$re_directive|[^"])*)"|((?:$re_directive|[^\s<>])+))/os;) {
371 my($key, $val, $val_orig, $rest)
372 = ($1, (defined $3? $3: defined $4? $4: $5), $2, $');
374 $attr{+lc($key)} = [$key, $val, $val_orig, $i];
376 if ($val =~ /$re_tmpl_include/os) {
377 warn_normal "TMPL_INCLUDE in attribute: $val_orig\n", $lc;
378 } elsif ($val =~ /$re_tmpl_var/os && $val !~ /$re_tmpl_var_escaped/os) {
379 # XXX: we probably should not warn if key is "onclick" etc
380 # XXX: there's just no reasonable thing to suggest
381 my $suggest = ($key =~ /^(?:action|archive|background|cite|classid|codebase|data|datasrc|for|href|longdesc|profile|src|usemap)$/i? 'URL': 'HTML');
382 undef $suggest if $key =~ /^(?:onblur|onchange|onclick|ondblclick|onfocus|onkeydown|onkeypress|onkeyup|onload|onmousedown|onmousemove|onmouseout|onmouseover|onmouseup|onreset|onselect|onsubmit|onunload)$/i;
384 "Suggest ESCAPE=$suggest for TMPL_VAR in attribute \"$key\""
386 $lc, \$pedantic_tmpl_var_use_in_nonpedantic_mode_p
387 if defined $suggest && (pedantic_p || !$pedantic_tmpl_var_use_in_nonpedantic_mode_p);
388 } elsif ($val_orig !~ /^['"]/) {
389 my $t = $val; $t =~ s/$re_directive_control//os;
391 "Unquoted attribute contains character(s) that should be quoted"
393 $lc, \$pedantic_attribute_error_in_nonpedantic_mode_p
394 if $t =~ /[^-\.A-Za-z0-9]/s;
397 my $s2 = $s; $s2 =~ s/$re_tmpl_endif_endloop//g; # for the next check
398 if ($s2 =~ /\S/s) { # should never happen
399 if ($s =~ /^([^\n]*)\n/s) { # this is even worse
400 error_normal("Completely confused while extracting attributes: $1", $lc);
401 error_normal((scalar(split(/\n/, $s)) - 1) . " more line(s) not shown.", undef);
402 $this->_set_fatal( 1 );
404 # There's something wrong with the attribute syntax.
405 # We might be able to deduce a likely cause by looking more.
406 if ($s =~ /^[a-z0-9]/is && "<foo $s>" =~ /^$re_tag_compat$/s) {
407 warn_normal "Probably missing whitespace before or missing quotation mark near: $s\n", $lc;
409 warn_normal "Strange attribute syntax: $s\n", $lc;
416 sub _next_token_internal {
421 $this->_pop_readahead if $this->has_readahead_p
422 && !ref $this->_peek_readahead
423 && !length $this->_peek_readahead;
424 if (!$this->has_readahead_p) {
425 my $next = scalar <$h>;
426 $eof_p = !defined $next;
428 $this->_increment_line_number;
429 $this->_push_readahead( $next );
432 $this->_set_line_number_start( $this->line_number ); # remember 1st line num
433 if ($this->has_readahead_p && ref $this->_peek_readahead) { # TmplToken obj.
434 ($it, $kind) = ($this->_pop_readahead, undef);
435 } elsif ($eof_p && !$this->has_readahead_p) { # nothing left to do
437 } elsif ($this->_peek_readahead =~ /^\s+/s) { # whitespace
438 ($kind, $it) = (TmplTokenType::TEXT, $&);
439 $this->_set_readahead( $' );
440 # FIXME the following (the [<\s] part) is an unreliable HACK :-(
441 } elsif ($this->_peek_readahead =~ /^(?:[^<]|<[<\s])*(?:[^<\s])/s) { # non-space normal text
442 ($kind, $it) = (TmplTokenType::TEXT, $&);
443 $this->_set_readahead( $' );
444 warn_normal "Unescaped < in $it\n", $this->line_number_start
445 if !$this->cdata_mode_p && $it =~ /</s;
446 } else { # tag/declaration/processing instruction
448 my $bad_comment_p = 0;
449 for (my $cdata_close = $this->cdata_close;;) {
450 if ($this->cdata_mode_p) {
451 my $next = $this->_pop_readahead;
452 if ($next =~ /^$cdata_close/is) {
453 ($kind, $it) = (TmplTokenType::TAG, $&);
454 $this->_push_readahead( $' );
456 } elsif ($next =~ /^((?:(?!$cdata_close).)+)($cdata_close)/is) {
457 ($kind, $it) = (TmplTokenType::TEXT, $1);
458 $this->_push_readahead( "$2$'" );
461 ($kind, $it) = (TmplTokenType::TEXT, $next);
464 } elsif ($this->_peek_readahead =~ /^$re_tag_compat/os) {
465 # If we detect a "closed start tag" but we know that the
466 # following token looks like a TMPL_VAR, don't stop
467 my($head, $tail, $post) = ($1, $2, $3);
468 if ($tail eq '' && $post =~ $re_tmpl_var) {
469 # Don't bother to show the warning if we're too confused
470 # FIXME. There's no method for _closed_start_tag_warning
471 if (!defined $this->{'_closed_start_tag_warning'}
472 || ($this->{'_closed_start_tag_warning'}->[0] eq $head
473 && $this->{'_closed_start_tag_warning'}->[1] != $this->line_number - 1)) {
474 warn_normal "Possible SGML \"closed start tag\" notation: $head<\n", $this->line_number
475 if split(/\n/, $head) < 10;
477 $this->{'_closed_start_tag_warning'} = [$head, $this->line_number];
479 ($kind, $it) = (TmplTokenType::TAG, "$head>");
480 $this->_set_readahead( $post );
482 warn_normal "SGML \"closed start tag\" notation: $head<\n", $this->line_number if $tail eq '';
484 } elsif ($this->_peek_readahead =~ /^<!--(?:(?!-->)$re_directive*.)*-->/os) {
485 ($kind, $it) = (TmplTokenType::COMMENT, $&);
486 $this->_set_readahead( $' );
491 my $next = scalar <$h>;
492 $eof_p = !defined $next;
494 $this->_increment_line_number;
495 $this->_append_readahead( $next );
497 if ($kind ne TmplTokenType::TAG) {
499 } elsif ($it =~ /^<!/) {
500 $kind = TmplTokenType::DECL;
501 $kind = TmplTokenType::COMMENT if $it =~ /^<!--(?:(?!-->).)*-->/;
502 if ($kind == TmplTokenType::COMMENT && $it =~ /^<!--\s*#include/s) {
503 warn_normal "Apache #include directive found instead of HTML::Template <TMPL_INCLUDE> directive", $this->line_number_start;
505 } elsif ($it =~ /^<\?/) {
506 $kind = TmplTokenType::PI;
508 if ($it =~ /^$re_directive/ios && !$this->cdata_mode_p) {
509 $kind = TmplTokenType::DIRECTIVE;
510 } elsif ($bad_comment_p) {
511 warn_normal sprintf("Syntax error in comment: %s\n", $it),
512 $this->line_number_start;
513 $this->_set_syntaxerror( 1 );
515 if (!$ok_p && $eof_p) {
516 ($kind, $it) = (TmplTokenType::UNKNOWN, $this->_peek_readahead);
517 $this->_set_readahead, undef;
518 $this->_set_syntaxerror( 1 );
521 warn_normal "Unrecognizable token found: "
522 . (split(/\n/, $it) < 10? $it: '(too confused to show details)')
523 . "\n", $this->line_number_start
524 if $kind == TmplTokenType::UNKNOWN;
525 return defined $it? (ref $it? $it: TmplToken->new($it, $kind, $this->line_number, $this->filename)): undef;
528 sub _next_token_intermediate {
530 my $h = $this->_handle;
532 if (!$this->cdata_mode_p) {
533 $it = $this->_next_token_internal($h);
534 if (defined $it && $it->type == TmplTokenType::TAG) {
535 if ($it->string =~ /^<(script|style|textarea)\b/is) {
536 $this->_set_cdata_mode( 1 );
537 $this->_set_cdata_close( "</$1\\s*>" );
538 $this->_set_pcdata_mode( 0 );
539 $this->_set_js_mode( lc($1) eq 'script' );
540 # } elsif ($it->string =~ /^<(title)\b/is) {
541 # $this->_set_cdata_mode( 1 );
542 # $this->_set_cdata_close( "</$1\\s*>" );
543 # $this->_set_pcdata_mode( 1 );
545 $it->set_attributes( $this->_extract_attributes($it->string, $it->line_number) );
549 for ($it = '', my $cdata_close = $this->cdata_close;;) {
550 my $next = $this->_next_token_internal($h);
551 $eof_p = !defined $next;
553 if (defined $next && $next->string =~ /$cdata_close/is) {
554 $this->_push_readahead( $next ); # push entire TmplToken object
555 $this->_set_cdata_mode( 0 );
557 last unless $this->cdata_mode_p;
558 $it .= $next->string;
562 error_normal "Unexpected end of file while looking for "
564 . "\n", $this->line_number_start;
565 $this->_set_fatal( 1 );
566 $this->_set_syntaxerror( 1 );
568 if ($this->pcdata_mode_p) {
570 $check =~ s/$re_directive//gos;
571 warn_pedantic "Markup found in PCDATA\n", $this->line_number,
572 \$pedantic_error_markup_in_pcdata_p
573 if $check =~ /$re_tag_compat/s;
575 # PCDATA should be treated as text, not CDATA
576 # Actually it should be treated as TEXT_PARAMETRIZED :-(
577 $it = TmplToken->new( $it,
578 ($this->pcdata_mode_p?
579 TmplTokenType::TEXT: TmplTokenType::CDATA),
580 $this->line_number, $this->filename )
582 if ($this->js_mode_p) {
583 my $s0 = $it->string;
586 if ($s0 =~ /^(\s*<!--\s*)(.*)(\s*--\s*>\s*)$/s) {
591 push @head, split_js $s0;
592 $it->set_js_data( identify_js_translatables(@head, @tail) );
594 $this->_set_pcdata_mode, 0;
595 $this->_set_cdata_close, undef unless !defined $it;
600 sub _token_groupable1_p ($) { # as first token, groupable into TEXT_PARAMETRIZED
602 return ($t->type == TmplTokenType::TEXT && $t->string !~ /^[,\.:\|\s]+$/is)
603 || ($t->type == TmplTokenType::DIRECTIVE
604 && $t->string =~ /^(?:$re_tmpl_var)$/os)
605 || ($t->type == TmplTokenType::TAG
606 && ($t->string =~ /^<(?:a|b|em|h[123456]|i|u)\b/is
607 || ($t->string =~ /^<input\b/is
608 && $t->attributes->{'type'}->[1] =~ /^(?:radio|text)$/is)
612 sub _token_groupable2_p ($) { # as other token, groupable into TEXT_PARAMETRIZED
614 return ($t->type == TmplTokenType::TEXT && ($t->string =~ /^\s*$/s || $t->string !~ /^[\|\s]+$/is))
615 || ($t->type == TmplTokenType::DIRECTIVE
616 && $t->string =~ /^(?:$re_tmpl_var)$/os)
617 || ($t->type == TmplTokenType::TAG
618 && ($t->string =~ /^<\/?(?:a|b|em|h[123456]|i|u)\b/is
619 || ($t->string =~ /^<input\b/is
620 && $t->attributes->{'type'}->[1] =~ /^(?:radio|text)$/is)))
623 sub _quote_cformat ($) {
629 sub string_canon ($) {
632 # Fold all whitespace into single blanks
638 sub _formalize_string_cformat ($) {
640 return _quote_cformat string_canon $s;
645 return $t->type == TmplTokenType::DIRECTIVE? '%s':
646 $t->type == TmplTokenType::TEXT?
647 _formalize_string_cformat($t->string):
648 $t->type == TmplTokenType::TAG?
649 ($t->string =~ /^<a\b/is? '<a>':
650 $t->string =~ /^<input\b/is? (
651 lc $t->attributes->{'type'}->[1] eq 'text' ? '%S':
653 _quote_cformat($t->string)):
654 _quote_cformat($t->string);
660 my $undo_trailing_blanks = sub {
661 for (my $i = $#structure; $i >= 0; $i -= 1) {
662 last unless ($structure[$i]->type == TmplTokenType::TEXT && blank_p($structure[$i]->string)) ;#|| ($structure[$i]->type == TmplTokenType::TAG && $structure[$i]->string =~ /^<br\b/is);
663 # Queue element structure: [reanalysis-p, token]
664 push @{$this->{_queue}}, [1, pop @structure];
667 &$undo_trailing_blanks;
668 while (@structure >= 2) {
669 my $something_done_p = 0;
670 # FIXME: If the last token is a close tag but there are no tags
671 # FIXME: before it, drop the close tag back into the queue. This
672 # FIXME: is an ugly hack to get rid of "foo %s</h1>" type mess.
674 && $structure[$#structure]->type == TmplTokenType::TAG
675 && $structure[$#structure]->string =~ /^<\//s) {
676 my $has_other_tags_p = 0;
677 for (my $i = 0; $i < $#structure; $i += 1) {
678 $has_other_tags_p = 1
679 if $structure[$i]->type == TmplTokenType::TAG;
680 last if $has_other_tags_p;
682 if (!$has_other_tags_p) {
683 push @{$this->{_queue}}, [0, pop @structure]
684 &$undo_trailing_blanks;
685 $something_done_p = 1;
688 # FIXME: Do the same ugly hack for the last token being a ( or [
690 && $structure[$#structure]->type == TmplTokenType::TEXT
691 && $structure[$#structure]->string =~ /^[\(\[]$/) { # not )]
692 push @{$this->{_queue}}, [1, pop @structure];
693 &$undo_trailing_blanks;
694 $something_done_p = 1;
696 # FIXME: If the first token is an open tag, but there is no
697 # FIXME: corresponding close tag, "drop the open tag", i.e.,
698 # FIXME: requeue everything for reanalysis, except the frist tag. :-(
700 && $structure[0]->type == TmplTokenType::TAG
701 && $structure[0]->string =~ /^<([a-z0-9]+)/is
702 && (my $tag = $1) !~ /^(?:br|hr|img|input)\b/is
704 my $tag_open_count = 1;
705 for (my $i = 1; $i <= $#structure; $i += 1) {
706 if ($structure[$i]->type == TmplTokenType::TAG) {
707 if ($structure[$i]->string =~ /^<(\/?)$tag\b/is) {
708 $tag_open_count += ($1? -1: +1);
712 if ($tag_open_count > 0) {
713 for (my $i = $#structure; $i; $i -= 1) {
714 push @{$this->{_queue}}, [1, pop @structure];
716 $something_done_p = 1;
719 # FIXME: If the first token is an open tag, the last token is the
720 # FIXME: corresponding close tag, and there are no other close tags
721 # FIXME: inbetween, requeue the tokens from the second token on,
722 # FIXME: flagged as ok for re-analysis
724 && $structure[0]->type == TmplTokenType::TAG
725 && $structure[0]->string =~ /^<([a-z0-9]+)/is && (my $tag = $1)
726 && $structure[$#structure]->type == TmplTokenType::TAG
727 && $structure[$#structure]->string =~ /^<\/$1\s*>$/is) {
728 my $has_other_open_or_close_tags_p = 0;
729 for (my $i = 1; $i < $#structure; $i += 1) {
730 $has_other_open_or_close_tags_p = 1
731 if $structure[$i]->type == TmplTokenType::TAG
732 && $structure[$i]->string =~ /^<\/?$tag\b/is;
733 last if $has_other_open_or_close_tags_p;
735 if (!$has_other_open_or_close_tags_p) {
736 for (my $i = $#structure; $i; $i -= 1) {
737 push @{$this->{_queue}}, [1, pop @structure];
739 $something_done_p = 1;
742 last if !$something_done_p;
747 sub looks_plausibly_like_groupable_text_p (@) {
749 # The text would look plausibly groupable if all open tags are also closed.
752 for (my $i = 0; $i <= $#structure; $i += 1) {
753 if ($structure[$i]->type == TmplTokenType::TAG) {
754 my $form = $structure[$i]->string;
755 if ($form =~ /^<([A-Z0-9]+)/is) {
757 if ($tag !~ /^(?:br|input)$/is && $form !~ /\/>$/is) {
760 } elsif ($form =~ /^<\/([A-Z0-9]+)/is) {
761 if (@tags && lc($1) eq $tags[$#tags]) {
767 } elsif ($structure[$i]->type != TmplTokenType::TEXT) {
772 return !$error_p && !@tags;
777 my $h = $this->_handle;
779 $this->{_queue} = [] unless defined $this->{_queue};
781 # Elements in the queue are ordered pairs. The first in the ordered pair
782 # specifies whether we are allowed to reanalysis; the second is the token.
783 if (@{$this->{_queue}} && !$this->{_queue}->[$#{$this->{_queue}}]->[0]) {
784 $it = (pop @{$this->{_queue}})->[1];
786 if (@{$this->{_queue}}) {
787 $it = (pop @{$this->{_queue}})->[1];
789 $it = $this->_next_token_intermediate($h);
791 if (!$this->cdata_mode_p && $this->allow_cformat_p && defined $it
792 && ($it->type == TmplTokenType::TEXT?
793 !blank_p( $it->string ): _token_groupable1_p( $it ))) {
794 my @structure = ( $it );
797 my($nonblank_text_p, $parametrized_p, $with_anchor_p, $with_input_p) = (0, 0, 0, 0);
798 if ($it->type == TmplTokenType::TEXT) {
799 $nonblank_text_p = 1 if !blank_p( $it->string );
800 } elsif ($it->type == TmplTokenType::DIRECTIVE) {
802 } elsif ($it->type == TmplTokenType::TAG && $it->string =~ /^<([A-Z0-9]+)/is) {
804 push @tags, $tag if $tag !~ /^(?:br|input)$/i;
805 $with_anchor_p = 1 if $tag eq 'a';
806 $with_input_p = 1 if $tag eq 'input';
808 # We hate | and || in msgid strings, so we try to avoid them
809 for (my $i = 1, my $quit_p = 0, my $quit_next_p = ($it->type == TmplTokenType::TEXT && $it->string =~ /^\|+$/s);; $i += 1) {
810 if (@{$this->{_queue}}) {
811 $next = (pop @{$this->{_queue}})->[1];
813 $next = $this->_next_token_intermediate($h);
815 push @structure, $next; # for consistency (with initialization)
816 last unless defined $next && _token_groupable2_p( $next );
817 last if $quit_next_p;
818 if ($next->type == TmplTokenType::TEXT) {
819 $nonblank_text_p = 1 if !blank_p( $next->string );
820 $quit_p = 1 if $next->string =~ /^\|+$/s; # We hate | and ||
821 } elsif ($next->type == TmplTokenType::DIRECTIVE) {
823 } elsif ($next->type == TmplTokenType::TAG) {
824 if ($next->string =~ /^<([A-Z0-9]+)/is) {
826 push @tags, $tag if $tag !~ /^(?:br|input)$/i;
827 $with_anchor_p = 1 if $tag eq 'a';
828 $with_input_p = 1 if $tag eq 'input';
829 } elsif ($next->string =~ /^<\/([A-Z0-9]+)/is) {
831 $quit_p = 1 unless @tags && $close eq $tags[$#tags];
832 $quit_next_p = 1 if $close =~ /^h\d$/;
838 # Undo the last token, allowing reanalysis
839 push @{$this->{_queue}}, [1, pop @structure];
840 # Simply it a bit more
841 @structure = $this->_optimize( @structure );
842 if (@structure < 2) {
845 } elsif ($nonblank_text_p && ($parametrized_p || $with_anchor_p || $with_input_p)) {
846 # Create the corresponding c-format string
847 my $string = join('', map { $_->string } @structure);
848 my $form = join('', map { _formalize $_ } @structure);
849 my($a_counter, $input_counter) = (0, 0);
850 $form =~ s/<a>/ $a_counter += 1, "<a$a_counter>" /egs;
851 $form =~ s/<input>/ $input_counter += 1, "<input$input_counter>" /egs;
852 $it = TmplToken->new($string, TmplTokenType::TEXT_PARAMETRIZED,
853 $it->line_number, $it->pathname);
854 $it->set_form( $form );
855 $it->set_children( @structure );
856 } elsif ($nonblank_text_p
857 && looks_plausibly_like_groupable_text_p( @structure )
858 && $structure[$#structure]->type == TmplTokenType::TEXT) {
859 # Combine the strings
860 my $string = join('', map { $_->string } @structure);
861 $it = TmplToken->new($string, TmplTokenType::TEXT,
862 $it->line_number, $it->pathname);;
864 # Requeue the tokens thus seen for re-emitting, allow reanalysis
866 push @{$this->{_queue}}, [1, pop @structure];
869 $it = (pop @{$this->{_queue}})->[1];
873 if (defined $it && $it->type == TmplTokenType::TEXT) {
874 my $form = string_canon $it->string;
875 $it->set_form( $form );
880 ###############################################################################
882 # Other simple functions (These are not methods)
886 return $s =~ /^(?:\s|\ $re_end_entity|$re_tmpl_var)*$/os;
893 $s =~ s/^(\s|\ $re_end_entity)+//os; my $l1 = $l0 - length $s;
894 $s =~ s/(\s|\ $re_end_entity)+$//os; my $l2 = $l0 - $l1 - length $s;
895 return wantarray? (substr($s0, 0, $l1), $s, substr($s0, $l0 - $l2)): $s;
900 # Locale::PO->quote is buggy, it doesn't quote newlines :-/
901 $s =~ s/([\\"])/\\\1/gs;
903 #$s =~ s/[\177-\377]/ sprintf("\\%03o", ord($&)) /egs;
907 # Some functions that shouldn't be here... should be moved out some time
908 sub parametrize ($$$$) {
909 my($fmt_0, $cformat_p, $t, $f) = @_;
912 my @params = $t->parameters_and_fields;
913 for (my $n = 0, my $fmt = $fmt_0; length $fmt;) {
914 if ($fmt =~ /^[^%]+/) {
917 } elsif ($fmt =~ /^%%/) {
920 } elsif ($fmt =~ /^%(?:(\d+)\$)?(?:(\d+)(?:\.(\d+))?)?s/s) {
922 my($i, $width, $prec) = ((defined $1? $1: $n), $2, $3);
924 if (defined $width && defined $prec && !$width && !$prec) {
926 } elsif (defined $params[$i - 1]) {
927 my $param = $params[$i - 1];
928 warn_normal "$fmt_0: $&: Expected a TMPL_VAR, but found a "
929 . $param->type->to_string . "\n", undef
930 if $param->type != TmplTokenType::DIRECTIVE;
931 warn_normal "$fmt_0: $&: Unsupported "
932 . "field width or precision\n", undef
933 if defined $width || defined $prec;
934 warn_normal "$fmt_0: $&: Parameter $i not known", undef
935 unless defined $param;
936 $it .= defined $f? &$f( $param ): $param->string;
938 } elsif ($fmt =~ /^%(?:(\d+)\$)?(?:(\d+)(?:\.(\d+))?)?([pS])/s) {
940 my($i, $width, $prec, $conv) = ((defined $1? $1: $n), $2, $3, $4);
943 my $param = $params[$i - 1];
944 if (!defined $param) {
945 warn_normal "$fmt_0: $&: Parameter $i not known", undef;
947 if ($param->type == TmplTokenType::TAG
948 && $param->string =~ /^<input\b/is) {
949 my $type = defined $param->attributes?
950 lc($param->attributes->{'type'}->[1]): undef;
952 warn_normal "$fmt_0: $&: Expected type=text, "
953 . "but found type=$type", undef
954 unless $type eq 'text';
955 } elsif ($conv eq 'p') {
956 warn_normal "$fmt_0: $&: Expected type=radio, "
957 . "but found type=$type", undef
958 unless $type eq 'radio';
961 warn_normal "$&: Expected an INPUT, but found a "
962 . $param->type->to_string . "\n", undef
964 warn_normal "$fmt_0: $&: Unsupported "
965 . "field width or precision\n", undef
966 if defined $width || defined $prec;
967 $it .= defined $f? &$f( $param ): $param->string;
969 } elsif ($fmt =~ /^%[^%a-zA-Z]*[a-zA-Z]/) {
972 die "$&: Unknown or unsupported format specification\n"; #XXX
974 die "$&: Completely confused parametrizing\n";#XXX
978 my @anchors = $t->anchors;
979 for (my $n = 0, my $fmt = $it, $it = ''; length $fmt;) {
980 if ($fmt =~ /^(?:(?!<a\d+>).)+/is) {
983 } elsif ($fmt =~ /^<a(\d+)>/is) {
987 my $anchor = $anchors[$i - 1];
988 warn_normal "$&: Anchor $1 not found for msgid \"$fmt_0\"", undef #FIXME
989 unless defined $anchor;
990 $it .= $anchor->string;
992 die "Completely confused decoding anchors: $fmt\n";#XXX
998 sub charset_canon ($) {
1000 $charset = uc($charset);
1001 $charset = "$1-$2" if $charset =~ /^(ISO|UTF)(\d.*)/i;
1002 $charset = 'Big5' if $charset eq 'BIG5'; # "Big5" must be in mixed case
1006 use vars qw( @latin1_utf8 );
1008 "\302\200", "\302\201", "\302\202", "\302\203", "\302\204", "\302\205",
1009 "\302\206", "\302\207", "\302\210", "\302\211", "\302\212", "\302\213",
1010 "\302\214", "\302\215", undef, undef, "\302\220", "\302\221",
1011 "\302\222", "\302\223", "\302\224", "\302\225", "\302\226", "\302\227",
1012 "\302\230", "\302\231", "\302\232", "\302\233", "\302\234", "\302\235",
1013 "\302\236", "\302\237", "\302\240", "\302\241", "\302\242", "\302\243",
1014 "\302\244", "\302\245", "\302\246", "\302\247", "\302\250", "\302\251",
1015 "\302\252", "\302\253", "\302\254", "\302\255", "\302\256", "\302\257",
1016 "\302\260", "\302\261", "\302\262", "\302\263", "\302\264", "\302\265",
1017 "\302\266", "\302\267", "\302\270", "\302\271", "\302\272", "\302\273",
1018 "\302\274", "\302\275", "\302\276", "\302\277", "\303\200", "\303\201",
1019 "\303\202", "\303\203", "\303\204", "\303\205", "\303\206", "\303\207",
1020 "\303\210", "\303\211", "\303\212", "\303\213", "\303\214", "\303\215",
1021 "\303\216", "\303\217", "\303\220", "\303\221", "\303\222", "\303\223",
1022 "\303\224", "\303\225", "\303\226", "\303\227", "\303\230", "\303\231",
1023 "\303\232", "\303\233", "\303\234", "\303\235", "\303\236", "\303\237",
1024 "\303\240", "\303\241", "\303\242", "\303\243", "\303\244", "\303\245",
1025 "\303\246", "\303\247", "\303\250", "\303\251", "\303\252", "\303\253",
1026 "\303\254", "\303\255", "\303\256", "\303\257", "\303\260", "\303\261",
1027 "\303\262", "\303\263", "\303\264", "\303\265", "\303\266", "\303\267",
1028 "\303\270", "\303\271", "\303\272", "\303\273", "\303\274", "\303\275",
1029 "\303\276", "\303\277" );
1031 sub charset_convert ($$$) {
1032 my($s, $charset_in, $charset_out) = @_;
1033 if ($s !~ /[\200-\377]/s) { # FIXME: don't worry about iso2022 for now
1035 } elsif ($charset_in eq 'ISO-8859-1' && $charset_out eq 'UTF-8') {
1036 $s =~ s/[\200-\377]/ $latin1_utf8[ord($&) - 128] /egs;
1037 } elsif ($charset_in ne $charset_out) {
1038 VerboseWarnings::warn_normal "conversion from $charset_in to $charset_out is not supported\n", undef;
1043 ###############################################################################
1047 In addition to the basic scanning, this class will also perform
1054 Emulation of c-format strings (see below)
1058 Display of warnings for certain things that affects either the
1059 ability of this class to yield correct output, or things that
1060 are known to cause the original template to cause trouble.
1064 Automatic correction of some of the things warned about
1065 (e.g., SGML "closed start tag" notation).
1069 =head2 c-format strings emulation
1071 Because English word order is not universal, a simple extraction
1072 of translatable strings may yield some strings like "Accounts for"
1073 or ambiguous strings like "in". This makes the resulting strings
1074 difficult to translate, but does not affect all languages alike.
1075 For example, Chinese (with a somewhat different word order) would
1076 be hit harder, but French would be relatively unaffected.
1078 To overcome this problem, the scanner can be configured to detect
1079 patterns with <TMPL_VAR> directives (as well as certain HTML tags),
1080 and try to construct a larger pattern that will appear in the PO
1081 file as c-format strings with %s placeholders. This additional
1082 step allows the translator to deal with cases where word order
1083 is different (replacing %s with %1$s, %2$s, etc.), or when certain
1084 words will require certain inflectional suffixes in sentences.
1086 Because this is an incompatible change, this mode must be explicitly
1087 turned on using the set_cformat(1) method call.
1089 =head2 The flag characters
1091 The character % is followed by zero or more of the following flags:
1097 The value comes from HTML <INPUT> elements.
1098 This abuse of the flag character is somewhat reasonable,
1099 since TMPL_VAR and INPUT are both variables, but of different kinds.
1103 =head2 The field width and precision
1105 An optional 0.0 can be specified for %s to specify
1106 that the <TMPL_VAR> should be suppressed.
1108 =head2 The conversion specifier
1114 Specifies any input field that is neither text nor hidden
1115 (which currently mean radio buttons).
1116 The p conversion specifier is chosen because this does not
1117 evoke any certain sensible data type.
1121 Specifies a text input field (<INPUT TYPE=TEXT>).
1122 This use of the S conversion specifier is somewhat reasonable,
1123 since text input fields contain values of undeterminable type,
1124 which can be treated as strings.
1128 Specifies a <TMPL_VAR>.
1129 This use of the o conversion specifier is somewhat reasonable,
1130 since <TMPL_VAR> denotes values of undeterminable type, which
1131 can be treated as strings.
1137 There is no code to save the tag name anywhere in the scanned token.
1139 The use of <AI<i>> to stand for the I<i>th anchor
1140 is not very well thought out.
1141 Some abuse of c-format specifies might have been more appropriate.
1145 This tokenizer is mostly based
1146 on Ambrose's hideous Perl script known as subst.pl.