6 use VerboseWarnings qw( pedantic_p error_normal warn_normal warn_pedantic );
9 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
11 ###############################################################################
15 TmplTokenizer.pm - Simple-minded tokenizer class for HTML::Template .tmpl files
19 Because .tmpl files contains HTML::Template directives
20 that tend to confuse real parsers (e.g., HTML::Parse),
21 it might be better to create a customized scanner
22 to scan the template files for tokens.
23 This module is a simple-minded attempt at such a scanner.
27 ###############################################################################
34 use vars qw( $pedantic_attribute_error_in_nonpedantic_mode_p );
35 use vars qw( $pedantic_tmpl_var_use_in_nonpedantic_mode_p );
36 use vars qw( $pedantic_error_markup_in_pcdata_p );
38 ###############################################################################
41 use vars qw( $re_directive $re_tmpl_var $re_tmpl_var_escaped $re_tmpl_include );
42 use vars qw( $re_directive_control $re_tmpl_endif_endloop );
44 # $re_directive must not do any backreferences
45 $re_directive = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:VAR|LOOP|INCLUDE|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
46 # TMPL_VAR or TMPL_INCLUDE
47 $re_tmpl_var = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
48 $re_tmpl_include = q{<(?:(?i)(?:!--\s*)?TMPL_(?:INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
49 # TMPL_VAR ESCAPE=1/HTML/URL
50 $re_tmpl_var_escaped = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR|INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))\s+ESCAPE=(?:1|HTML|URL)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
51 # Any control flow directive
52 $re_directive_control = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:LOOP|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
53 # /LOOP or /IF or /UNLESS
54 $re_tmpl_endif_endloop = q{<(?:(?i)(?:!--\s*)?\/TMPL_(?:LOOP|IF|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
57 # Hideous stuff from subst.pl, slightly modified to use the above hideous stuff
58 # Note: The $re_tag's set $1 (<tag), $2 (>), and $3 (rest of string)
59 use vars qw( $re_comment $re_entity_name $re_end_entity $re_etag );
60 use vars qw( $re_tag_strict $re_tag_compat @re_tag );
63 my $etag = $compat? '>': '<>\/';
64 # This is no longer similar to the original regexp in subst.pl :-(
65 # Note that we don't want <> in compat mode; Mozilla knows about <
66 q{(<\/?(?:|(?:"(?:} . $re_directive . q{|[^"])*"|'(?:} . $re_directive . q{|[^'])*'|--(?:(?!--)(?:$re_directive)*.)*--|(?:}
68 . q{|(?!--)[^"'<>} . $etag . q{]))+))([} . $etag . q{]|(?=<))(.*)};
71 $re_comment = '(?:--(?:[^-]|-[^-])*--)';
72 $re_entity_name = '(?:[^&%#;<>\s]+)'; # NOTE: not really correct SGML
73 $re_end_entity = '(?:;|$|(?=\s))'; # semicolon or before-whitespace
74 $re_etag = q{(?:<\/?(?:"[^"]*"|'[^']*'|[^"'>\/])*[>\/])}; # end-tag
75 @re_tag = ($re_tag_strict, $re_tag_compat) = (re_tag(0), re_tag(1));
78 # End of the hideous stuff
80 use vars qw( $serial );
82 ###############################################################################
84 sub FATAL_P () {'fatal-p'}
85 sub SYNTAXERROR_P () {'syntaxerror-p'}
87 sub FILENAME () {'input'}
88 sub HANDLE () {'handle'}
90 sub READAHEAD () {'readahead'}
91 sub LINENUM_START () {'lc_0'}
93 sub CDATA_MODE_P () {'cdata-mode-p'}
94 sub CDATA_CLOSE () {'cdata-close'}
95 sub PCDATA_MODE_P () {'pcdata-mode-p'} # additional submode for CDATA
97 sub ALLOW_CFORMAT_P () {'allow-cformat-p'}
102 my $class = ref($this) || $this;
106 my $handle = sprintf('TMPLTOKENIZER%d', $serial);
110 open($handle, "<$input") || die "$input: $!\n";
112 $self->{+FILENAME} = $input;
113 $self->{+HANDLE} = $handle;
114 $self->{+READAHEAD} = [];
118 ###############################################################################
124 return $this->{+FILENAME};
129 return $this->{+HANDLE};
134 return $this->{+FATAL_P};
139 return $this->{+SYNTAXERROR_P};
142 sub has_readahead_p {
144 return @{$this->{+READAHEAD}};
147 sub _peek_readahead {
149 return $this->{+READAHEAD}->[$#{$this->{+READAHEAD}}];
152 sub line_number_start {
154 return $this->{+LINENUM_START};
159 return $this->{+LINENUM};
164 return $this->{+CDATA_MODE_P};
169 return $this->{+PCDATA_MODE_P};
174 return $this->{+CDATA_CLOSE};
177 sub allow_cformat_p {
179 return $this->{+ALLOW_CFORMAT_P};
186 $this->{+FATAL_P} = $_[0];
190 sub _set_syntaxerror {
192 $this->{+SYNTAXERROR_P} = $_[0];
196 sub _push_readahead {
198 push @{$this->{+READAHEAD}}, $_[0];
204 return pop @{$this->{+READAHEAD}};
207 sub _append_readahead {
209 $this->{+READAHEAD}->[$#{$this->{+READAHEAD}}] .= $_[0];
215 $this->{+READAHEAD}->[$#{$this->{+READAHEAD}}] = $_[0];
219 sub _increment_line_number {
221 $this->{+LINENUM} += 1;
225 sub _set_line_number_start {
227 $this->{+LINENUM_START} = $_[0];
231 sub _set_cdata_mode {
233 $this->{+CDATA_MODE_P} = $_[0];
237 sub _set_pcdata_mode {
239 $this->{+PCDATA_MODE_P} = $_[0];
243 sub _set_cdata_close {
245 $this->{+CDATA_CLOSE} = $_[0];
249 sub set_allow_cformat {
251 $this->{+ALLOW_CFORMAT_P} = $_[0];
255 ###############################################################################
257 sub _extract_attributes ($;$) {
261 $s = $1 if $s =~ /^<(?:(?!$re_directive_control)\S)+(.*)\/\S$/s # XML-style self-closing tags
262 || $s =~ /^<(?:(?!$re_directive_control)\S)+(.*)\S$/s; # SGML-style tags
264 for (my $i = 0; $s =~ /^(?:$re_directive_control)?\s+(?:$re_directive_control)?(?:([a-zA-Z][-a-zA-Z0-9]*)\s*=\s*)?('((?:$re_directive|[^'])*)'|"((?:$re_directive|[^"])*)"|((?:$re_directive|[^\s<>])+))/os;) {
265 my($key, $val, $val_orig, $rest)
266 = ($1, (defined $3? $3: defined $4? $4: $5), $2, $');
268 $attr{+lc($key)} = [$key, $val, $val_orig, $i];
270 if ($val =~ /$re_tmpl_include/os) {
271 warn_normal "TMPL_INCLUDE in attribute: $val_orig\n", $lc;
272 } elsif ($val =~ /$re_tmpl_var/os && $val !~ /$re_tmpl_var_escaped/os) {
273 # XXX: we probably should not warn if key is "onclick" etc
274 # XXX: there's just no reasonable thing to suggest
275 my $suggest = ($key =~ /^(?:action|archive|background|cite|classid|codebase|data|datasrc|for|href|longdesc|profile|src|usemap)$/i? 'URL': 'HTML');
276 undef $suggest if $key =~ /^(?:onblur|onchange|onclick|ondblclick|onfocus|onkeydown|onkeypress|onkeyup|onload|onmousedown|onmousemove|onmouseout|onmouseover|onmouseup|onreset|onselect|onsubmit|onunload)$/i;
278 "Suggest ESCAPE=$suggest for TMPL_VAR in attribute \"$key\""
280 $lc, \$pedantic_tmpl_var_use_in_nonpedantic_mode_p
281 if defined $suggest && (pedantic_p || !$pedantic_tmpl_var_use_in_nonpedantic_mode_p);
282 } elsif ($val_orig !~ /^['"]/) {
283 my $t = $val; $t =~ s/$re_directive_control//os;
285 "Unquoted attribute contains character(s) that should be quoted"
287 $lc, \$pedantic_attribute_error_in_nonpedantic_mode_p
288 if $t =~ /[^-\.A-Za-z0-9]/s;
291 my $s2 = $s; $s2 =~ s/$re_tmpl_endif_endloop//g; # for the next check
292 if ($s2 =~ /\S/s) { # should never happen
293 if ($s =~ /^([^\n]*)\n/s) { # this is even worse
294 error_normal("Completely confused while extracting attributes: $1", $lc);
295 error_normal((scalar(split(/\n/, $s)) - 1) . " more line(s) not shown.", undef);
296 $this->_set_fatal( 1 );
298 # There's something wrong with the attribute syntax.
299 # We might be able to deduce a likely cause by looking more.
300 if ($s =~ /^[a-z0-9]/is && "<foo $s>" =~ /^$re_tag_compat$/s) {
301 warn_normal "Probably missing whitespace before or missing quotation mark near: $s\n", $lc;
303 warn_normal "Strange attribute syntax: $s\n", $lc;
310 sub _next_token_internal {
315 $this->_pop_readahead if $this->has_readahead_p
316 && !ref $this->_peek_readahead
317 && !length $this->_peek_readahead;
318 if (!$this->has_readahead_p) {
319 my $next = scalar <$h>;
320 $eof_p = !defined $next;
322 $this->_increment_line_number;
323 $this->_push_readahead( $next );
326 $this->_set_line_number_start( $this->line_number ); # remember 1st line num
327 if ($this->has_readahead_p && ref $this->_peek_readahead) { # TmplToken obj.
328 ($it, $kind) = ($this->_pop_readahead, undef);
329 } elsif ($eof_p && !$this->has_readahead_p) { # nothing left to do
331 } elsif ($this->_peek_readahead =~ /^\s+/s) { # whitespace
332 ($kind, $it) = (TmplTokenType::TEXT, $&);
333 $this->_set_readahead( $' );
334 # FIXME the following (the [<\s] part) is an unreliable HACK :-(
335 } elsif ($this->_peek_readahead =~ /^(?:[^<]|<[<\s])*(?:[^<\s])/s) { # non-space normal text
336 ($kind, $it) = (TmplTokenType::TEXT, $&);
337 $this->_set_readahead( $' );
338 warn_normal "Unescaped < in $it\n", $this->line_number_start
339 if !$this->cdata_mode_p && $it =~ /</s;
340 } else { # tag/declaration/processing instruction
342 my $bad_comment_p = 0;
343 for (my $cdata_close = $this->cdata_close;;) {
344 if ($this->cdata_mode_p) {
345 my $next = $this->_pop_readahead;
346 if ($next =~ /^$cdata_close/is) {
347 ($kind, $it) = (TmplTokenType::TAG, $&);
348 $this->_push_readahead( $' );
350 } elsif ($next =~ /^((?:(?!$cdata_close).)+)($cdata_close)/is) {
351 ($kind, $it) = (TmplTokenType::TEXT, $1);
352 $this->_push_readahead( "$2$'" );
355 ($kind, $it) = (TmplTokenType::TEXT, $next);
358 } elsif ($this->_peek_readahead =~ /^$re_tag_compat/os) {
359 # If we detect a "closed start tag" but we know that the
360 # following token looks like a TMPL_VAR, don't stop
361 my($head, $tail, $post) = ($1, $2, $3);
362 if ($tail eq '' && $post =~ $re_tmpl_var) {
363 # Don't bother to show the warning if we're too confused
364 # FIXME. There's no method for _closed_start_tag_warning
365 if (!defined $this->{'_closed_start_tag_warning'}
366 || ($this->{'_closed_start_tag_warning'}->[0] eq $head
367 && $this->{'_closed_start_tag_warning'}->[1] != $this->line_number - 1)) {
368 warn_normal "Possible SGML \"closed start tag\" notation: $head<\n", $this->line_number
369 if split(/\n/, $head) < 10;
371 $this->{'_closed_start_tag_warning'} = [$head, $this->line_number];
373 ($kind, $it) = (TmplTokenType::TAG, "$head>");
374 $this->_set_readahead( $post );
376 warn_normal "SGML \"closed start tag\" notation: $head<\n", $this->line_number if $tail eq '';
378 } elsif ($this->_peek_readahead =~ /^<!--(?:(?!-->)$re_directive*.)*-->/os) {
379 ($kind, $it) = (TmplTokenType::COMMENT, $&);
380 $this->_set_readahead( $' );
385 my $next = scalar <$h>;
386 $eof_p = !defined $next;
388 $this->_increment_line_number;
389 $this->_append_readahead( $next );
391 if ($kind ne TmplTokenType::TAG) {
393 } elsif ($it =~ /^<!/) {
394 $kind = TmplTokenType::DECL;
395 $kind = TmplTokenType::COMMENT if $it =~ /^<!--(?:(?!-->).)*-->/;
396 if ($kind == TmplTokenType::COMMENT && $it =~ /^<!--\s*#include/s) {
397 warn_normal "Apache #include directive found instead of HTML::Template <TMPL_INCLUDE> directive", $this->line_number_start;
399 } elsif ($it =~ /^<\?/) {
400 $kind = TmplTokenType::PI;
402 if ($it =~ /^$re_directive/ios && !$this->cdata_mode_p) {
403 $kind = TmplTokenType::DIRECTIVE;
404 } elsif ($bad_comment_p) {
405 warn_normal sprintf("Syntax error in comment: %s\n", $it),
406 $this->line_number_start;
407 $this->_set_syntaxerror( 1 );
409 if (!$ok_p && $eof_p) {
410 ($kind, $it) = (TmplTokenType::UNKNOWN, $this->_peek_readahead);
411 $this->_set_readahead, undef;
412 $this->_set_syntaxerror( 1 );
415 warn_normal "Unrecognizable token found: "
416 . (split(/\n/, $it) < 10? $it: '(too confused to show details)')
417 . "\n", $this->line_number_start
418 if $kind == TmplTokenType::UNKNOWN;
419 return defined $it? (ref $it? $it: TmplToken->new($it, $kind, $this->line_number, $this->filename)): undef;
422 sub _next_token_intermediate {
424 my $h = $this->_handle;
426 if (!$this->cdata_mode_p) {
427 $it = $this->_next_token_internal($h);
428 if (defined $it && $it->type == TmplTokenType::TAG) {
429 if ($it->string =~ /^<(script|style|textarea)\b/is) {
430 $this->_set_cdata_mode( 1 );
431 $this->_set_cdata_close( "</$1\\s*>" );
432 $this->_set_pcdata_mode( 0 );
433 # } elsif ($it->string =~ /^<(title)\b/is) {
434 # $this->_set_cdata_mode( 1 );
435 # $this->_set_cdata_close( "</$1\\s*>" );
436 # $this->_set_pcdata_mode( 1 );
438 $it->set_attributes( $this->_extract_attributes($it->string, $it->line_number) );
442 for ($it = '', my $cdata_close = $this->cdata_close;;) {
443 my $next = $this->_next_token_internal($h);
444 $eof_p = !defined $next;
446 if (defined $next && $next->string =~ /$cdata_close/is) {
447 $this->_push_readahead( $next ); # push entire TmplToken object
448 $this->_set_cdata_mode( 0 );
450 last unless $this->cdata_mode_p;
451 $it .= $next->string;
455 error_normal "Unexpected end of file while looking for "
457 . "\n", $this->line_number_start;
458 $this->_set_fatal( 1 );
459 $this->_set_syntaxerror( 1 );
461 if ($this->pcdata_mode_p) {
463 $check =~ s/$re_directive//gos;
464 warn_pedantic "Markup found in PCDATA\n", $this->line_number,
465 \$pedantic_error_markup_in_pcdata_p
466 if $check =~ /$re_tag_compat/s;
468 # PCDATA should be treated as text, not CDATA
469 # Actually it should be treated as TEXT_PARAMETRIZED :-(
470 $it = TmplToken->new( $it,
471 ($this->pcdata_mode_p?
472 TmplTokenType::TEXT: TmplTokenType::CDATA),
475 $this->_set_pcdata_mode, 0;
476 $this->_set_cdata_close, undef unless !defined $it;
481 sub _token_groupable1_p ($) { # as first token, groupable into TEXT_PARAMETRIZED
483 return ($t->type == TmplTokenType::TEXT && $t->string !~ /^[,\.:\|\s]+$/is)
484 || ($t->type == TmplTokenType::DIRECTIVE
485 && $t->string =~ /^(?:$re_tmpl_var)$/os)
486 || ($t->type == TmplTokenType::TAG
487 && ($t->string =~ /^<(?:a|b|em|h[123456]|i|u)\b/is
488 || ($t->string =~ /^<input\b/is
489 && $t->attributes->{'type'}->[1] =~ /^(?:radio|text)$/is)
493 sub _token_groupable2_p ($) { # as other token, groupable into TEXT_PARAMETRIZED
495 return ($t->type == TmplTokenType::TEXT && ($t->string =~ /^\s*$/s || $t->string !~ /^[\|\s]+$/is))
496 || ($t->type == TmplTokenType::DIRECTIVE
497 && $t->string =~ /^(?:$re_tmpl_var)$/os)
498 || ($t->type == TmplTokenType::TAG
499 && ($t->string =~ /^<\/?(?:a|b|em|h[123456]|i|u)\b/is
500 || ($t->string =~ /^<input\b/is
501 && $t->attributes->{'type'}->[1] =~ /^(?:radio|text)$/is)))
504 sub _quote_cformat ($) {
510 sub string_canon ($) {
513 # Fold all whitespace into single blanks
519 sub _formalize_string_cformat ($) {
521 return _quote_cformat string_canon $s;
526 return $t->type == TmplTokenType::DIRECTIVE? '%s':
527 $t->type == TmplTokenType::TEXT?
528 _formalize_string_cformat($t->string):
529 $t->type == TmplTokenType::TAG?
530 ($t->string =~ /^<a\b/is? '<a>':
531 $t->string =~ /^<input\b/is? (
532 lc $t->attributes->{'type'}->[1] eq 'text' ? '%S':
534 _quote_cformat($t->string)):
535 _quote_cformat($t->string);
541 my $undo_trailing_blanks = sub {
542 for (my $i = $#structure; $i >= 0; $i -= 1) {
543 last unless ($structure[$i]->type == TmplTokenType::TEXT && blank_p($structure[$i]->string)) ;#|| ($structure[$i]->type == TmplTokenType::TAG && $structure[$i]->string =~ /^<br\b/is);
544 # Queue element structure: [reanalysis-p, token]
545 push @{$this->{_queue}}, [1, pop @structure];
548 &$undo_trailing_blanks;
549 while (@structure >= 2) {
550 my $something_done_p = 0;
551 # FIXME: If the last token is a close tag but there are no tags
552 # FIXME: before it, drop the close tag back into the queue. This
553 # FIXME: is an ugly hack to get rid of "foo %s</h1>" type mess.
555 && $structure[$#structure]->type == TmplTokenType::TAG
556 && $structure[$#structure]->string =~ /^<\//s) {
557 my $has_other_tags_p = 0;
558 for (my $i = 0; $i < $#structure; $i += 1) {
559 $has_other_tags_p = 1
560 if $structure[$i]->type == TmplTokenType::TAG;
561 last if $has_other_tags_p;
563 if (!$has_other_tags_p) {
564 push @{$this->{_queue}}, [0, pop @structure]
565 &$undo_trailing_blanks;
566 $something_done_p = 1;
569 # FIXME: Do the same ugly hack for the last token being a ( or [
571 && $structure[$#structure]->type == TmplTokenType::TEXT
572 && $structure[$#structure]->string =~ /^[\(\[]$/) { # not )]
573 push @{$this->{_queue}}, [1, pop @structure];
574 &$undo_trailing_blanks;
575 $something_done_p = 1;
577 # FIXME: If the first token is an open tag, but there is no
578 # FIXME: corresponding close tag, "drop the open tag", i.e.,
579 # FIXME: requeue everything for reanalysis, except the frist tag. :-(
581 && $structure[0]->type == TmplTokenType::TAG
582 && $structure[0]->string =~ /^<([a-z0-9]+)/is
583 && (my $tag = $1) !~ /^(?:br|hr|img|input)\b/is
585 my $tag_open_count = 1;
586 for (my $i = 1; $i <= $#structure; $i += 1) {
587 if ($structure[$i]->type == TmplTokenType::TAG) {
588 if ($structure[$i]->string =~ /^<(\/?)$tag\b/is) {
589 $tag_open_count += ($1? -1: +1);
593 if ($tag_open_count > 0) {
594 for (my $i = $#structure; $i; $i -= 1) {
595 push @{$this->{_queue}}, [1, pop @structure];
597 $something_done_p = 1;
600 # FIXME: If the first token is an open tag, the last token is the
601 # FIXME: corresponding close tag, and there are no other close tags
602 # FIXME: inbetween, requeue the tokens from the second token on,
603 # FIXME: flagged as ok for re-analysis
605 && $structure[0]->type == TmplTokenType::TAG
606 && $structure[0]->string =~ /^<([a-z0-9]+)/is && (my $tag = $1)
607 && $structure[$#structure]->type == TmplTokenType::TAG
608 && $structure[$#structure]->string =~ /^<\/$1\s*>$/is) {
609 my $has_other_open_or_close_tags_p = 0;
610 for (my $i = 1; $i < $#structure; $i += 1) {
611 $has_other_open_or_close_tags_p = 1
612 if $structure[$i]->type == TmplTokenType::TAG
613 && $structure[$i]->string =~ /^<\/?$tag\b/is;
614 last if $has_other_open_or_close_tags_p;
616 if (!$has_other_open_or_close_tags_p) {
617 for (my $i = $#structure; $i; $i -= 1) {
618 push @{$this->{_queue}}, [1, pop @structure];
620 $something_done_p = 1;
623 last if !$something_done_p;
628 sub looks_plausibly_like_groupable_text_p (@) {
630 # The text would look plausibly groupable if all open tags are also closed.
633 for (my $i = 0; $i <= $#structure; $i += 1) {
634 if ($structure[$i]->type == TmplTokenType::TAG) {
635 my $form = $structure[$i]->string;
636 if ($form =~ /^<([A-Z0-9]+)/is) {
638 if ($tag !~ /^(?:br|input)$/is && $form !~ /\/>$/is) {
641 } elsif ($form =~ /^<\/([A-Z0-9]+)/is) {
642 if (@tags && lc($1) eq $tags[$#tags]) {
648 } elsif ($structure[$i]->type != TmplTokenType::TEXT) {
653 return !$error_p && !@tags;
658 my $h = $this->_handle;
660 $this->{_queue} = [] unless defined $this->{_queue};
662 # Elements in the queue are ordered pairs. The first in the ordered pair
663 # specifies whether we are allowed to reanalysis; the second is the token.
664 if (@{$this->{_queue}} && !$this->{_queue}->[$#{$this->{_queue}}]->[0]) {
665 $it = (pop @{$this->{_queue}})->[1];
667 if (@{$this->{_queue}}) {
668 $it = (pop @{$this->{_queue}})->[1];
670 $it = $this->_next_token_intermediate($h);
672 if (!$this->cdata_mode_p && $this->allow_cformat_p && defined $it
673 && ($it->type == TmplTokenType::TEXT?
674 !blank_p( $it->string ): _token_groupable1_p( $it ))) {
675 my @structure = ( $it );
678 my($nonblank_text_p, $parametrized_p, $with_anchor_p, $with_input_p) = (0, 0, 0, 0);
679 if ($it->type == TmplTokenType::TEXT) {
680 $nonblank_text_p = 1 if !blank_p( $it->string );
681 } elsif ($it->type == TmplTokenType::DIRECTIVE) {
683 } elsif ($it->type == TmplTokenType::TAG && $it->string =~ /^<([A-Z0-9]+)/is) {
685 push @tags, $tag if $tag !~ /^(?:br|input)$/i;
686 $with_anchor_p = 1 if $tag eq 'a';
687 $with_input_p = 1 if $tag eq 'input';
689 # We hate | and || in msgid strings, so we try to avoid them
690 for (my $i = 1, my $quit_p = 0, my $quit_next_p = ($it->type == TmplTokenType::TEXT && $it->string =~ /^\|+$/s);; $i += 1) {
691 if (@{$this->{_queue}}) {
692 $next = (pop @{$this->{_queue}})->[1];
694 $next = $this->_next_token_intermediate($h);
696 push @structure, $next; # for consistency (with initialization)
697 last unless defined $next && _token_groupable2_p( $next );
698 last if $quit_next_p;
699 if ($next->type == TmplTokenType::TEXT) {
700 $nonblank_text_p = 1 if !blank_p( $next->string );
701 $quit_p = 1 if $next->string =~ /^\|+$/s; # We hate | and ||
702 } elsif ($next->type == TmplTokenType::DIRECTIVE) {
704 } elsif ($next->type == TmplTokenType::TAG) {
705 if ($next->string =~ /^<([A-Z0-9]+)/is) {
707 push @tags, $tag if $tag !~ /^(?:br|input)$/i;
708 $with_anchor_p = 1 if $tag eq 'a';
709 $with_input_p = 1 if $tag eq 'input';
710 } elsif ($next->string =~ /^<\/([A-Z0-9]+)/is) {
712 $quit_p = 1 unless @tags && $close eq $tags[$#tags];
713 $quit_next_p = 1 if $close =~ /^h\d$/;
719 # Undo the last token, allowing reanalysis
720 push @{$this->{_queue}}, [1, pop @structure];
721 # Simply it a bit more
722 @structure = $this->_optimize( @structure );
723 if (@structure < 2) {
726 } elsif ($nonblank_text_p && ($parametrized_p || $with_anchor_p || $with_input_p)) {
727 # Create the corresponding c-format string
728 my $string = join('', map { $_->string } @structure);
729 my $form = join('', map { _formalize $_ } @structure);
730 my($a_counter, $input_counter) = (0, 0);
731 $form =~ s/<a>/ $a_counter += 1, "<a$a_counter>" /egs;
732 $form =~ s/<input>/ $input_counter += 1, "<input$input_counter>" /egs;
733 $it = TmplToken->new($string, TmplTokenType::TEXT_PARAMETRIZED,
734 $it->line_number, $it->pathname);
735 $it->set_form( $form );
736 $it->set_children( @structure );
737 } elsif ($nonblank_text_p
738 && looks_plausibly_like_groupable_text_p( @structure )
739 && $structure[$#structure]->type == TmplTokenType::TEXT) {
740 # Combine the strings
741 my $string = join('', map { $_->string } @structure);
742 $it = TmplToken->new($string, TmplTokenType::TEXT,
743 $it->line_number, $it->pathname);;
745 # Requeue the tokens thus seen for re-emitting, allow reanalysis
747 push @{$this->{_queue}}, [1, pop @structure];
750 $it = (pop @{$this->{_queue}})->[1];
754 if (defined $it && $it->type == TmplTokenType::TEXT) {
755 my $form = string_canon $it->string;
756 $it->set_form( $form );
761 ###############################################################################
763 # Other simple functions (These are not methods)
767 return $s =~ /^(?:\s|\ $re_end_entity|$re_tmpl_var)*$/os;
774 $s =~ s/^(\s|\ $re_end_entity)+//os; my $l1 = $l0 - length $s;
775 $s =~ s/(\s|\ $re_end_entity)+$//os; my $l2 = $l0 - $l1 - length $s;
776 return wantarray? (substr($s0, 0, $l1), $s, substr($s0, $l0 - $l2)): $s;
781 # Locale::PO->quote is buggy, it doesn't quote newlines :-/
782 $s =~ s/([\\"])/\\\1/gs;
784 #$s =~ s/[\177-\377]/ sprintf("\\%03o", ord($&)) /egs;
788 # Some functions that shouldn't be here... should be moved out some time
789 sub parametrize ($$$$) {
790 my($fmt_0, $cformat_p, $t, $f) = @_;
793 my @params = $t->parameters_and_fields;
794 for (my $n = 0, my $fmt = $fmt_0; length $fmt;) {
795 if ($fmt =~ /^[^%]+/) {
798 } elsif ($fmt =~ /^%%/) {
801 } elsif ($fmt =~ /^%(?:(\d+)\$)?(?:(\d+)(?:\.(\d+))?)?s/s) {
803 my($i, $width, $prec) = ((defined $1? $1: $n), $2, $3);
805 if (defined $width && defined $prec && !$width && !$prec) {
807 } elsif (defined $params[$i - 1]) {
808 my $param = $params[$i - 1];
809 warn_normal "$fmt_0: $&: Expected a TMPL_VAR, but found a "
810 . $param->type->to_string . "\n", undef
811 if $param->type != TmplTokenType::DIRECTIVE;
812 warn_normal "$fmt_0: $&: Unsupported "
813 . "field width or precision\n", undef
814 if defined $width || defined $prec;
815 warn_normal "$fmt_0: $&: Parameter $i not known", undef
816 unless defined $param;
817 $it .= defined $f? &$f( $param ): $param->string;
819 } elsif ($fmt =~ /^%(?:(\d+)\$)?(?:(\d+)(?:\.(\d+))?)?([pS])/s) {
821 my($i, $width, $prec, $conv) = ((defined $1? $1: $n), $2, $3, $4);
824 my $param = $params[$i - 1];
825 if (!defined $param) {
826 warn_normal "$fmt_0: $&: Parameter $i not known", undef;
828 if ($param->type == TmplTokenType::TAG
829 && $param->string =~ /^<input\b/is) {
830 my $type = defined $param->attributes?
831 lc($param->attributes->{'type'}->[1]): undef;
833 warn_normal "$fmt_0: $&: Expected type=text, "
834 . "but found type=$type", undef
835 unless $type eq 'text';
836 } elsif ($conv eq 'p') {
837 warn_normal "$fmt_0: $&: Expected type=radio, "
838 . "but found type=$type", undef
839 unless $type eq 'radio';
842 warn_normal "$&: Expected an INPUT, but found a "
843 . $param->type->to_string . "\n", undef
845 warn_normal "$fmt_0: $&: Unsupported "
846 . "field width or precision\n", undef
847 if defined $width || defined $prec;
848 $it .= defined $f? &$f( $param ): $param->string;
850 } elsif ($fmt =~ /^%[^%a-zA-Z]*[a-zA-Z]/) {
853 die "$&: Unknown or unsupported format specification\n"; #XXX
855 die "$&: Completely confused parametrizing\n";#XXX
859 my @anchors = $t->anchors;
860 for (my $n = 0, my $fmt = $it, $it = ''; length $fmt;) {
861 if ($fmt =~ /^(?:(?!<a\d+>).)+/is) {
864 } elsif ($fmt =~ /^<a(\d+)>/is) {
868 my $anchor = $anchors[$i - 1];
869 warn_normal "$&: Anchor $1 not found for msgid \"$fmt_0\"", undef #FIXME
870 unless defined $anchor;
871 $it .= $anchor->string;
873 die "Completely confused decoding anchors: $fmt\n";#XXX
879 sub charset_canon ($) {
881 $charset = uc($charset);
882 $charset = "$1-$2" if $charset =~ /^(ISO|UTF)(\d.*)/i;
883 $charset = 'Big5' if $charset eq 'BIG5'; # "Big5" must be in mixed case
887 use vars qw( @latin1_utf8 );
889 "\302\200", "\302\201", "\302\202", "\302\203", "\302\204", "\302\205",
890 "\302\206", "\302\207", "\302\210", "\302\211", "\302\212", "\302\213",
891 "\302\214", "\302\215", undef, undef, "\302\220", "\302\221",
892 "\302\222", "\302\223", "\302\224", "\302\225", "\302\226", "\302\227",
893 "\302\230", "\302\231", "\302\232", "\302\233", "\302\234", "\302\235",
894 "\302\236", "\302\237", "\302\240", "\302\241", "\302\242", "\302\243",
895 "\302\244", "\302\245", "\302\246", "\302\247", "\302\250", "\302\251",
896 "\302\252", "\302\253", "\302\254", "\302\255", "\302\256", "\302\257",
897 "\302\260", "\302\261", "\302\262", "\302\263", "\302\264", "\302\265",
898 "\302\266", "\302\267", "\302\270", "\302\271", "\302\272", "\302\273",
899 "\302\274", "\302\275", "\302\276", "\302\277", "\303\200", "\303\201",
900 "\303\202", "\303\203", "\303\204", "\303\205", "\303\206", "\303\207",
901 "\303\210", "\303\211", "\303\212", "\303\213", "\303\214", "\303\215",
902 "\303\216", "\303\217", "\303\220", "\303\221", "\303\222", "\303\223",
903 "\303\224", "\303\225", "\303\226", "\303\227", "\303\230", "\303\231",
904 "\303\232", "\303\233", "\303\234", "\303\235", "\303\236", "\303\237",
905 "\303\240", "\303\241", "\303\242", "\303\243", "\303\244", "\303\245",
906 "\303\246", "\303\247", "\303\250", "\303\251", "\303\252", "\303\253",
907 "\303\254", "\303\255", "\303\256", "\303\257", "\303\260", "\303\261",
908 "\303\262", "\303\263", "\303\264", "\303\265", "\303\266", "\303\267",
909 "\303\270", "\303\271", "\303\272", "\303\273", "\303\274", "\303\275",
910 "\303\276", "\303\277" );
912 sub charset_convert ($$$) {
913 my($s, $charset_in, $charset_out) = @_;
914 if ($s !~ /[\200-\377]/s) { # FIXME: don't worry about iso2022 for now
916 } elsif ($charset_in eq 'ISO-8859-1' && $charset_out eq 'UTF-8') {
917 $s =~ s/[\200-\377]/ $latin1_utf8[ord($&) - 128] /egs;
918 } elsif ($charset_in ne $charset_out) {
919 VerboseWarnings::warn_normal "conversion from $charset_in to $charset_out is not supported\n", undef;
924 ###############################################################################
928 In addition to the basic scanning, this class will also perform
935 Emulation of c-format strings (see below)
939 Display of warnings for certain things that affects either the
940 ability of this class to yield correct output, or things that
941 are known to cause the original template to cause trouble.
945 Automatic correction of some of the things warned about
946 (e.g., SGML "closed start tag" notation).
950 =head2 c-format strings emulation
952 Because English word order is not universal, a simple extraction
953 of translatable strings may yield some strings like "Accounts for"
954 or ambiguous strings like "in". This makes the resulting strings
955 difficult to translate, but does not affect all languages alike.
956 For example, Chinese (with a somewhat different word order) would
957 be hit harder, but French would be relatively unaffected.
959 To overcome this problem, the scanner can be configured to detect
960 patterns with <TMPL_VAR> directives (as well as certain HTML tags),
961 and try to construct a larger pattern that will appear in the PO
962 file as c-format strings with %s placeholders. This additional
963 step allows the translator to deal with cases where word order
964 is different (replacing %s with %1$s, %2$s, etc.), or when certain
965 words will require certain inflectional suffixes in sentences.
967 Because this is an incompatible change, this mode must be explicitly
968 turned on using the set_cformat(1) method call.
970 =head2 The flag characters
972 The character % is followed by zero or more of the following flags:
978 The value comes from HTML <INPUT> elements.
979 This abuse of the flag character is somewhat reasonable,
980 since TMPL_VAR and INPUT are both variables, but of different kinds.
984 =head2 The field width and precision
986 An optional 0.0 can be specified for %s to specify
987 that the <TMPL_VAR> should be suppressed.
989 =head2 The conversion specifier
995 Specifies any input field that is neither text nor hidden
996 (which currently mean radio buttons).
997 The p conversion specifier is chosen because this does not
998 evoke any certain sensible data type.
1002 Specifies a text input field (<INPUT TYPE=TEXT>).
1003 This use of the o conversion specifier is somewhat reasonable,
1004 since text input fields contain values of undeterminable type,
1005 which can be treated as strings.
1009 Specifies a <TMPL_VAR>.
1010 This use of the o conversion specifier is somewhat reasonable,
1011 since <TMPL_VAR> denotes values of undeterminable type, which
1012 can be treated as strings.
1018 There is no code to save the tag name anywhere in the scanned token.
1020 The use of <AI<i>> to stand for the I<i>th anchor
1021 is not very well thought out.
1022 Some abuse of c-format specifies might have been more appropriate.
1026 This tokenizer is mostly based
1027 on Ambrose's hideous Perl script known as subst.pl.