6 use VerboseWarnings qw( pedantic_p error_normal warn_normal warn_pedantic );
9 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
11 ###############################################################################
15 TmplTokenizer.pm - Simple-minded tokenizer class for HTML::Template .tmpl files
19 Because .tmpl files contains HTML::Template directives
20 that tend to confuse real parsers (e.g., HTML::Parse),
21 it might be better to create a customized scanner
22 to scan the template files for tokens.
23 This module is a simple-minded attempt at such a scanner.
27 ###############################################################################
34 use vars qw( $pedantic_attribute_error_in_nonpedantic_mode_p );
35 use vars qw( $pedantic_tmpl_var_use_in_nonpedantic_mode_p );
36 use vars qw( $pedantic_error_markup_in_pcdata_p );
38 ###############################################################################
41 use vars qw( $re_directive $re_tmpl_var $re_tmpl_var_escaped $re_tmpl_include );
42 use vars qw( $re_directive_control $re_tmpl_endif_endloop );
44 # $re_directive must not do any backreferences
45 $re_directive = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:VAR|LOOP|INCLUDE|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
46 # TMPL_VAR or TMPL_INCLUDE
47 $re_tmpl_var = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
48 $re_tmpl_include = q{<(?:(?i)(?:!--\s*)?TMPL_(?:INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
49 # TMPL_VAR ESCAPE=1/HTML/URL
50 $re_tmpl_var_escaped = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR|INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))\s+ESCAPE=(?:1|HTML|URL)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
51 # Any control flow directive
52 $re_directive_control = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:LOOP|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
53 # /LOOP or /IF or /UNLESS
54 $re_tmpl_endif_endloop = q{<(?:(?i)(?:!--\s*)?\/TMPL_(?:LOOP|IF|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
57 # Hideous stuff from subst.pl, slightly modified to use the above hideous stuff
58 # Note: The $re_tag's set $1 (<tag), $2 (>), and $3 (rest of string)
59 use vars qw( $re_comment $re_entity_name $re_end_entity $re_etag );
60 use vars qw( $re_tag_strict $re_tag_compat @re_tag );
63 my $etag = $compat? '>': '<>\/';
64 # This is no longer similar to the original regexp in subst.pl :-(
65 # Note that we don't want <> in compat mode; Mozilla knows about <
66 q{(<\/?(?:|(?:"(?:} . $re_directive . q{|[^"])*"|'(?:} . $re_directive . q{|[^'])*'|--(?:(?!--)(?:$re_directive)*.)*--|(?:}
68 . q{|(?!--)[^"'<>} . $etag . q{]))+))([} . $etag . q{]|(?=<))(.*)};
71 $re_comment = '(?:--(?:[^-]|-[^-])*--)';
72 $re_entity_name = '(?:[^&%#;<>\s]+)'; # NOTE: not really correct SGML
73 $re_end_entity = '(?:;|$|(?=\s))'; # semicolon or before-whitespace
74 $re_etag = q{(?:<\/?(?:"[^"]*"|'[^']*'|[^"'>\/])*[>\/])}; # end-tag
75 @re_tag = ($re_tag_strict, $re_tag_compat) = (re_tag(0), re_tag(1));
78 # End of the hideous stuff
80 use vars qw( $serial );
82 ###############################################################################
84 sub FATAL_P () {'fatal-p'}
85 sub SYNTAXERROR_P () {'syntaxerror-p'}
87 sub FILENAME () {'input'}
88 sub HANDLE () {'handle'}
90 sub READAHEAD () {'readahead'}
91 sub LINENUM_START () {'lc_0'}
93 sub CDATA_MODE_P () {'cdata-mode-p'}
94 sub CDATA_CLOSE () {'cdata-close'}
95 sub PCDATA_MODE_P () {'pcdata-mode-p'} # additional submode for CDATA
97 sub ALLOW_CFORMAT_P () {'allow-cformat-p'}
102 my $class = ref($this) || $this;
106 my $handle = sprintf('TMPLTOKENIZER%d', $serial);
110 open($handle, "<$input") || die "$input: $!\n";
112 $self->{+FILENAME} = $input;
113 $self->{+HANDLE} = $handle;
114 $self->{+READAHEAD} = [];
118 ###############################################################################
124 return $this->{+FILENAME};
129 return $this->{+HANDLE};
134 return $this->{+FATAL_P};
139 return $this->{+SYNTAXERROR_P};
142 sub has_readahead_p {
144 return @{$this->{+READAHEAD}};
147 sub _peek_readahead {
149 return $this->{+READAHEAD}->[$#{$this->{+READAHEAD}}];
152 sub line_number_start {
154 return $this->{+LINENUM_START};
159 return $this->{+LINENUM};
164 return $this->{+CDATA_MODE_P};
169 return $this->{+PCDATA_MODE_P};
174 return $this->{+CDATA_CLOSE};
177 sub allow_cformat_p {
179 return $this->{+ALLOW_CFORMAT_P};
186 $this->{+FATAL_P} = $_[0];
190 sub _set_syntaxerror {
192 $this->{+SYNTAXERROR_P} = $_[0];
196 sub _push_readahead {
198 push @{$this->{+READAHEAD}}, $_[0];
204 return pop @{$this->{+READAHEAD}};
207 sub _append_readahead {
209 $this->{+READAHEAD}->[$#{$this->{+READAHEAD}}] .= $_[0];
215 $this->{+READAHEAD}->[$#{$this->{+READAHEAD}}] = $_[0];
219 sub _increment_line_number {
221 $this->{+LINENUM} += 1;
225 sub _set_line_number_start {
227 $this->{+LINENUM_START} = $_[0];
231 sub _set_cdata_mode {
233 $this->{+CDATA_MODE_P} = $_[0];
237 sub _set_pcdata_mode {
239 $this->{+PCDATA_MODE_P} = $_[0];
243 sub _set_cdata_close {
245 $this->{+CDATA_CLOSE} = $_[0];
249 sub set_allow_cformat {
251 $this->{+ALLOW_CFORMAT_P} = $_[0];
255 ###############################################################################
257 sub _extract_attributes ($;$) {
261 $s = $1 if $s =~ /^<\S+(.*)\/\S$/s # XML-style self-closing tags
262 || $s =~ /^<\S+(.*)\S$/s; # SGML-style tags
264 for (my $i = 0; $s =~ /^(?:$re_directive_control)?\s+(?:$re_directive_control)?(?:([a-zA-Z][-a-zA-Z0-9]*)\s*=\s*)?('((?:$re_directive|[^'])*)'|"((?:$re_directive|[^"])*)"|((?:$re_directive|[^\s<>])+))/os;) {
265 my($key, $val, $val_orig, $rest)
266 = ($1, (defined $3? $3: defined $4? $4: $5), $2, $');
268 $attr{+lc($key)} = [$key, $val, $val_orig, $i];
270 if ($val =~ /$re_tmpl_include/os) {
271 warn_normal "TMPL_INCLUDE in attribute: $val_orig\n", $lc;
272 } elsif ($val =~ /$re_tmpl_var/os && $val !~ /$re_tmpl_var_escaped/os) {
273 # XXX: we probably should not warn if key is "onclick" etc
274 # XXX: there's just no reasonable thing to suggest
275 my $suggest = ($key =~ /^(?:action|archive|background|cite|classid|codebase|data|datasrc|for|href|longdesc|profile|src|usemap)$/i? 'URL': 'HTML');
276 undef $suggest if $key =~ /^(?:onblur|onchange|onclick|ondblclick|onfocus|onkeydown|onkeypress|onkeyup|onload|onmousedown|onmousemove|onmouseout|onmouseover|onmouseup|onreset|onselect|onsubmit|onunload)$/i;
278 "Suggest ESCAPE=$suggest for TMPL_VAR in attribute \"$key\""
280 $lc, \$pedantic_tmpl_var_use_in_nonpedantic_mode_p
281 if defined $suggest && (pedantic_p || !$pedantic_tmpl_var_use_in_nonpedantic_mode_p);
282 } elsif ($val_orig !~ /^['"]/) {
283 my $t = $val; $t =~ s/$re_directive_control//os;
285 "Unquoted attribute contains character(s) that should be quoted"
287 $lc, \$pedantic_attribute_error_in_nonpedantic_mode_p
288 if $t =~ /[^-\.A-Za-z0-9]/s;
291 my $s2 = $s; $s2 =~ s/$re_tmpl_endif_endloop//g; # for the next check
292 if ($s2 =~ /\S/s) { # should never happen
293 if ($s =~ /^([^\n]*)\n/s) { # this is even worse
294 error_normal("Completely confused while extracting attributes: $1", $lc);
295 error_normal((scalar(split(/\n/, $s)) - 1) . " more line(s) not shown.", undef);
296 $this->_set_fatal( 1 );
298 # There's something wrong with the attribute syntax.
299 # We might be able to deduce a likely cause by looking more.
300 if ($s =~ /^[a-z0-9]/is && "<foo $s>" =~ /^$re_tag_compat$/s) {
301 warn_normal "Probably missing whitespace before or missing quotation mark near: $s\n", $lc;
303 warn_normal "Strange attribute syntax: $s\n", $lc;
310 sub _next_token_internal {
315 $this->_pop_readahead if $this->has_readahead_p
316 && !ref $this->_peek_readahead
317 && !length $this->_peek_readahead;
318 if (!$this->has_readahead_p) {
319 my $next = scalar <$h>;
320 $eof_p = !defined $next;
322 $this->_increment_line_number;
323 $this->_push_readahead( $next );
326 $this->_set_line_number_start( $this->line_number ); # remember 1st line num
327 if ($this->has_readahead_p && ref $this->_peek_readahead) { # TmplToken obj.
328 ($it, $kind) = ($this->_pop_readahead, undef);
329 } elsif ($eof_p && !$this->has_readahead_p) { # nothing left to do
331 } elsif ($this->_peek_readahead =~ /^\s+/s) { # whitespace
332 ($kind, $it) = (TmplTokenType::TEXT, $&);
333 $this->_set_readahead( $' );
334 # FIXME the following (the [<\s] part) is an unreliable HACK :-(
335 } elsif ($this->_peek_readahead =~ /^(?:[^<]|<[<\s])*(?:[^<\s])/s) { # non-space normal text
336 ($kind, $it) = (TmplTokenType::TEXT, $&);
337 $this->_set_readahead( $' );
338 warn_normal "Unescaped < in $it\n", $this->line_number_start
339 if !$this->cdata_mode_p && $it =~ /</s;
340 } else { # tag/declaration/processing instruction
342 my $bad_comment_p = 0;
343 for (my $cdata_close = $this->cdata_close;;) {
344 if ($this->cdata_mode_p) {
345 my $next = $this->_pop_readahead;
346 if ($next =~ /^$cdata_close/is) {
347 ($kind, $it) = (TmplTokenType::TAG, $&);
348 $this->_push_readahead( $' );
350 } elsif ($next =~ /^((?:(?!$cdata_close).)+)($cdata_close)/is) {
351 ($kind, $it) = (TmplTokenType::TEXT, $1);
352 $this->_push_readahead( "$2$'" );
355 ($kind, $it) = (TmplTokenType::TEXT, $next);
358 } elsif ($this->_peek_readahead =~ /^$re_tag_compat/os) {
359 # If we detect a "closed start tag" but we know that the
360 # following token looks like a TMPL_VAR, don't stop
361 my($head, $tail, $post) = ($1, $2, $3);
362 if ($tail eq '' && $post =~ $re_tmpl_var) {
363 # Don't bother to show the warning if we're too confused
364 warn_normal "Possible SGML \"closed start tag\" notation: $head<\n", $this->line_number
365 if split(/\n/, $head) < 10;
367 ($kind, $it) = (TmplTokenType::TAG, "$head>");
368 $this->_set_readahead( $post );
370 warn_normal "SGML \"closed start tag\" notation: $head<\n", $this->line_number if $tail eq '';
372 } elsif ($this->_peek_readahead =~ /^<!--(?:(?!-->)$re_directive*.)*-->/os) {
373 ($kind, $it) = (TmplTokenType::COMMENT, $&);
374 $this->_set_readahead( $' );
379 my $next = scalar <$h>;
380 $eof_p = !defined $next;
382 $this->_increment_line_number;
383 $this->_append_readahead( $next );
385 if ($kind ne TmplTokenType::TAG) {
387 } elsif ($it =~ /^<!/) {
388 $kind = TmplTokenType::DECL;
389 $kind = TmplTokenType::COMMENT if $it =~ /^<!--(?:(?!-->).)*-->/;
390 if ($kind == TmplTokenType::COMMENT && $it =~ /^<!--\s*#include/s) {
391 warn_normal "Apache #include directive found instead of HTML::Template <TMPL_INCLUDE> directive", $this->line_number_start;
393 } elsif ($it =~ /^<\?/) {
394 $kind = TmplTokenType::PI;
396 if ($it =~ /^$re_directive/ios && !$this->cdata_mode_p) {
397 $kind = TmplTokenType::DIRECTIVE;
398 } elsif ($bad_comment_p) {
399 warn_normal sprintf("Syntax error in comment: %s\n", $it),
400 $this->line_number_start;
401 $this->_set_syntaxerror( 1 );
403 if (!$ok_p && $eof_p) {
404 ($kind, $it) = (TmplTokenType::UNKNOWN, $this->_peek_readahead);
405 $this->_set_readahead, undef;
406 $this->_set_syntaxerror( 1 );
409 warn_normal "Unrecognizable token found: "
410 . (split(/\n/, $it) < 10? $it: '(too confused to show details)')
411 . "\n", $this->line_number_start
412 if $kind == TmplTokenType::UNKNOWN;
413 return defined $it? (ref $it? $it: TmplToken->new($it, $kind, $this->line_number, $this->filename)): undef;
416 sub _next_token_intermediate {
418 my $h = $this->_handle;
420 if (!$this->cdata_mode_p) {
421 $it = $this->_next_token_internal($h);
422 if (defined $it && $it->type == TmplTokenType::TAG) {
423 if ($it->string =~ /^<(script|style|textarea)\b/is) {
424 $this->_set_cdata_mode( 1 );
425 $this->_set_cdata_close( "</$1\\s*>" );
426 $this->_set_pcdata_mode( 0 );
427 } elsif ($it->string =~ /^<(title)\b/is) {
428 $this->_set_cdata_mode( 1 );
429 $this->_set_cdata_close( "</$1\\s*>" );
430 $this->_set_pcdata_mode( 1 );
432 $it->set_attributes( $this->_extract_attributes($it->string, $it->line_number) );
436 for ($it = '', my $cdata_close = $this->cdata_close;;) {
437 my $next = $this->_next_token_internal($h);
438 $eof_p = !defined $next;
440 if (defined $next && $next->string =~ /$cdata_close/is) {
441 $this->_push_readahead( $next ); # push entire TmplToken object
442 $this->_set_cdata_mode( 0 );
444 last unless $this->cdata_mode_p;
445 $it .= $next->string;
449 error_normal "Unexpected end of file while looking for "
451 . "\n", $this->line_number_start;
452 $this->_set_fatal( 1 );
453 $this->_set_syntaxerror( 1 );
455 if ($this->pcdata_mode_p) {
457 $check =~ s/$re_directive//gos;
458 warn_pedantic "Markup found in PCDATA\n", $this->line_number,
459 \$pedantic_error_markup_in_pcdata_p
460 if $check =~ /$re_tag_compat/s;
462 $it = TmplToken->new( $it, TmplTokenType::CDATA, $this->line_number )
464 $this->_set_pcdata_mode, 0;
465 $this->_set_cdata_close, undef unless !defined $it;
470 sub _token_groupable1_p ($) { # as first token, groupable into TEXT_PARAMETRIZED
472 return ($t->type == TmplTokenType::TEXT && $t->string !~ /^[,\.:\|\s]+$/is)
473 || ($t->type == TmplTokenType::DIRECTIVE
474 && $t->string =~ /^(?:$re_tmpl_var)$/os)
475 || ($t->type == TmplTokenType::TAG
476 && ($t->string =~ /^<(?:b|em|h[123456]|i|u)\b/is
477 # || ($t->string =~ /^<input\b/is
478 # && $t->attributes->{'type'}->[1] =~ /^(?:radio|text)$/is)
482 sub _token_groupable2_p ($) { # as other token, groupable into TEXT_PARAMETRIZED
484 return ($t->type == TmplTokenType::TEXT && ($t->string =~ /^\s*$/s || $t->string !~ /^[\|\s]+$/is))
485 || ($t->type == TmplTokenType::DIRECTIVE
486 && $t->string =~ /^(?:$re_tmpl_var)$/os)
487 || ($t->type == TmplTokenType::TAG
488 && ($t->string =~ /^<\/?(?:a|b|em|h[123456]|i|u)\b/is
489 || ($t->string =~ /^<input\b/is
490 && $t->attributes->{'type'} =~ /^(?:radio|text)$/is)))
493 sub _quote_cformat ($) {
499 sub string_canon ($) {
502 # Fold all whitespace into single blanks
508 sub _formalize_string_cformat ($) {
510 return _quote_cformat string_canon $s;
515 return $t->type == TmplTokenType::DIRECTIVE? '%s':
516 $t->type == TmplTokenType::TEXT?
517 _formalize_string_cformat($t->string):
518 $t->type == TmplTokenType::TAG?
519 ($t->string =~ /^<a\b/is? '<a>':
520 $t->string =~ /^<input\b/is? '<input>':
521 _quote_cformat($t->string)):
522 _quote_cformat($t->string);
528 my $undo_trailing_blanks = sub {
529 for (my $i = $#structure; $i >= 0; $i -= 1) {
530 last if $structure[$i]->type != TmplTokenType::TEXT;
531 last if !blank_p($structure[$i]->string);
532 push @{$this->{_queue}}, pop @structure;
535 &$undo_trailing_blanks;
536 # FIXME: If the last token is a close tag but there are no tags
537 # FIXME: before it, drop the close tag back into the queue. This
538 # FIXME: is an ugly hack to get rid of "foo %s</h1>" type mess.
540 && $structure[$#structure]->type == TmplTokenType::TAG
541 && $structure[$#structure]->string =~ /^<\//s) {
542 my $has_other_tags_p = 0;
543 for (my $i = 0; $i < $#structure; $i += 1) {
544 $has_other_tags_p = 1 if $structure[$i]->type == TmplTokenType::TAG;
545 last if $has_other_tags_p;
547 push @{$this->{_queue}}, pop @structure unless $has_other_tags_p;
548 &$undo_trailing_blanks;
550 # FIXME: Do the same ugly hack for the last token being a ( or [
552 && $structure[$#structure]->type == TmplTokenType::TEXT
553 && $structure[$#structure]->string =~ /^[\(\[]$/) { # not )]
554 push @{$this->{_queue}}, pop @structure;
555 &$undo_trailing_blanks;
560 sub looks_plausibly_like_groupable_text_p (@) {
562 # The text would look plausibly groupable if all open tags are also closed.
565 for (my $i = 0; $i <= $#structure; $i += 1) {
566 if ($structure[$i]->type == TmplTokenType::TAG) {
567 if ($structure[$i]->string =~ /^<([A-Z0-9]+)/is) {
569 push @tags, $tag unless $tag =~ /^<(?:input)/is
571 } elsif ($structure[$i]->string =~ /^<\/([A-Z0-9]+)/is) {
572 if (@tags && lc($1) eq $tags[$#tags]) {
578 } elsif ($structure[$i]->type != TmplTokenType::TEXT) {
583 return !$error_p && !@tags;
588 my $h = $this->_handle;
590 $this->{_queue} = [] unless defined $this->{_queue};
592 # Don't reparse anything in the queue. We can put a parametrized token
593 # there if we need to, however.
594 if (@{$this->{_queue}}) {
595 $it = pop @{$this->{_queue}};
597 $it = $this->_next_token_intermediate($h);
598 if (!$this->cdata_mode_p && $this->allow_cformat_p && defined $it
599 && ($it->type == TmplTokenType::TEXT?
600 !blank_p( $it->string ): _token_groupable1_p( $it ))) {
601 my @structure = ( $it );
604 my($nonblank_text_p, $parametrized_p, $with_anchor_p, $with_input_p) = (0, 0, 0, 0);
605 if ($it->type == TmplTokenType::TEXT) {
606 $nonblank_text_p = 1 if !blank_p( $it->string );
607 } elsif ($it->type == TmplTokenType::DIRECTIVE) {
609 } elsif ($it->type == TmplTokenType::TAG && $it->string =~ /^<([A-Z0-9]+)/is) {
611 $with_anchor_p = 1 if lc($1) eq 'a';
612 $with_input_p = 1 if lc($1) eq 'input';
614 # We hate | and || in msgid strings, so we try to avoid them
615 for (my $i = 1, my $quit_p = 0, my $quit_next_p = ($it->type == TmplTokenType::TEXT && $it->string =~ /^\|+$/s);; $i += 1) {
616 $next = $this->_next_token_intermediate($h);
617 push @structure, $next; # for consistency (with initialization)
618 last unless defined $next && _token_groupable2_p( $next );
619 last if $quit_next_p;
620 if ($next->type == TmplTokenType::TEXT) {
621 $nonblank_text_p = 1 if !blank_p( $next->string );
622 $quit_p = 1 if $next->string =~ /^\|+$/s; # We hate | and ||
623 } elsif ($next->type == TmplTokenType::DIRECTIVE) {
625 } elsif ($next->type == TmplTokenType::TAG) {
626 if ($next->string =~ /^<([A-Z0-9]+)/is) {
628 $with_anchor_p = 1 if lc($1) eq 'a';
629 $with_input_p = 1 if lc($1) eq 'input';
630 } elsif ($next->string =~ /^<\/([A-Z0-9]+)/is) {
632 $quit_p = 1 unless @tags && $close eq $tags[$#tags];
633 $quit_next_p = 1 if $close =~ /^h\d$/;
639 # Undo the last token
640 push @{$this->{_queue}}, pop @structure;
641 # Simply it a bit more
642 @structure = $this->_optimize( @structure );
643 if (@structure < 2) {
646 } elsif ($nonblank_text_p && ($parametrized_p || $with_anchor_p || $with_input_p)) {
647 # Create the corresponding c-format string
648 my $string = join('', map { $_->string } @structure);
649 my $form = join('', map { _formalize $_ } @structure);
650 my($a_counter, $input_counter) = (0, 0);
651 $form =~ s/<a>/ $a_counter += 1, "<a$a_counter>" /egs;
652 $form =~ s/<input>/ $input_counter += 1, "<input$input_counter>" /egs;
653 $it = TmplToken->new($string, TmplTokenType::TEXT_PARAMETRIZED,
654 $it->line_number, $it->pathname);
655 $it->set_form( $form );
656 $it->set_children( @structure );
657 } elsif ($nonblank_text_p
658 && looks_plausibly_like_groupable_text_p( @structure )
659 && $structure[$#structure]->type == TmplTokenType::TEXT) {
660 # Combine the strings
661 my $string = join('', map { $_->string } @structure);
662 $it = TmplToken->new($string, TmplTokenType::TEXT,
663 $it->line_number, $it->pathname);;
665 # Requeue the tokens thus seen for re-emitting
667 push @{$this->{_queue}}, pop @structure;
670 $it = pop @{$this->{_queue}};
674 if (defined $it && $it->type == TmplTokenType::TEXT) {
675 my $form = string_canon $it->string;
676 $it->set_form( $form );
681 ###############################################################################
683 # Other simple functions (These are not methods)
687 return $s =~ /^(?:\s|\ $re_end_entity|$re_tmpl_var)*$/os;
694 $s =~ s/^(\s|\ $re_end_entity)+//os; my $l1 = $l0 - length $s;
695 $s =~ s/(\s|\ $re_end_entity)+$//os; my $l2 = $l0 - $l1 - length $s;
696 return wantarray? (substr($s0, 0, $l1), $s, substr($s0, $l0 - $l2)): $s;
701 # Locale::PO->quote is buggy, it doesn't quote newlines :-/
702 $s =~ s/([\\"])/\\\1/gs;
704 #$s =~ s/[\177-\377]/ sprintf("\\%03o", ord($&)) /egs;
708 # Some functions that shouldn't be here... should be moved out some time
709 sub parametrize ($$$) {
710 my($fmt_0, $params, $anchors) = @_;
712 for (my $n = 0, my $fmt = $fmt_0; length $fmt;) {
713 if ($fmt =~ /^[^%]+/) {
716 } elsif ($fmt =~ /^%%/) {
719 } elsif ($fmt =~ /^%(?:(\d+)\$)?(?:(\d+)(?:\.(\d+))?)?s/) {
721 my($i, $width, $prec) = ((defined $1? $1: $n), $2, $3);
723 if (!defined $width && !defined $prec) {
724 my $param = $params->[$i - 1];
726 warn_normal "$&: Undefined parameter $i for msgid \"$fmt_0\"",
728 unless defined $param;
729 } elsif (defined $width && defined $prec && !$width && !$prec) {
732 die "Unsupported precision specification in format: $&\n"; #XXX
734 } elsif ($fmt =~ /^%[^%a-zA-Z]*[a-zA-Z]/) {
737 die "Unknown or unsupported format specification: $&\n"; #XXX
739 die "Completely confused parametrizing: $fmt\n";#XXX
742 for (my $n = 0, my $fmt = $it, $it = ''; length $fmt;) {
743 if ($fmt =~ /^(?:(?!<a\d+>).)+/is) {
746 } elsif ($fmt =~ /^<a(\d+)>/is) {
750 my $anchor = $anchors->[$i - 1];
751 warn_normal "$&: Anchor $1 not found for msgid \"$fmt_0\"", undef #FIXME
752 unless defined $anchor;
753 $it .= $anchor->string;
755 die "Completely confused decoding anchors: $fmt\n";#XXX
761 sub charset_canon ($) {
763 $charset = uc($charset);
764 $charset = "$1-$2" if $charset =~ /^(ISO|UTF)(\d.*)/i;
765 $charset = 'Big5' if $charset eq 'BIG5'; # "Big5" must be in mixed case
769 use vars qw( @latin1_utf8 );
771 "\302\200", "\302\201", "\302\202", "\302\203", "\302\204", "\302\205",
772 "\302\206", "\302\207", "\302\210", "\302\211", "\302\212", "\302\213",
773 "\302\214", "\302\215", undef, undef, "\302\220", "\302\221",
774 "\302\222", "\302\223", "\302\224", "\302\225", "\302\226", "\302\227",
775 "\302\230", "\302\231", "\302\232", "\302\233", "\302\234", "\302\235",
776 "\302\236", "\302\237", "\302\240", "\302\241", "\302\242", "\302\243",
777 "\302\244", "\302\245", "\302\246", "\302\247", "\302\250", "\302\251",
778 "\302\252", "\302\253", "\302\254", "\302\255", "\302\256", "\302\257",
779 "\302\260", "\302\261", "\302\262", "\302\263", "\302\264", "\302\265",
780 "\302\266", "\302\267", "\302\270", "\302\271", "\302\272", "\302\273",
781 "\302\274", "\302\275", "\302\276", "\302\277", "\303\200", "\303\201",
782 "\303\202", "\303\203", "\303\204", "\303\205", "\303\206", "\303\207",
783 "\303\210", "\303\211", "\303\212", "\303\213", "\303\214", "\303\215",
784 "\303\216", "\303\217", "\303\220", "\303\221", "\303\222", "\303\223",
785 "\303\224", "\303\225", "\303\226", "\303\227", "\303\230", "\303\231",
786 "\303\232", "\303\233", "\303\234", "\303\235", "\303\236", "\303\237",
787 "\303\240", "\303\241", "\303\242", "\303\243", "\303\244", "\303\245",
788 "\303\246", "\303\247", "\303\250", "\303\251", "\303\252", "\303\253",
789 "\303\254", "\303\255", "\303\256", "\303\257", "\303\260", "\303\261",
790 "\303\262", "\303\263", "\303\264", "\303\265", "\303\266", "\303\267",
791 "\303\270", "\303\271", "\303\272", "\303\273", "\303\274", "\303\275",
792 "\303\276", "\303\277" );
794 sub charset_convert ($$$) {
795 my($s, $charset_in, $charset_out) = @_;
796 if ($s !~ /[\200-\377]/s) { # FIXME: don't worry about iso2022 for now
798 } elsif ($charset_in eq 'ISO-8859-1' && $charset_out eq 'UTF-8') {
799 $s =~ s/[\200-\377]/ $latin1_utf8[ord($&) - 128] /egs;
800 } elsif ($charset_in ne $charset_out) {
801 VerboseWarnings::warn_normal "conversion from $charset_in to $charset_out is not supported\n", undef;
806 ###############################################################################
810 In addition to the basic scanning, this class will also perform
817 Emulation of c-format strings (see below)
821 Display of warnings for certain things that affects either the
822 ability of this class to yield correct output, or things that
823 are known to cause the original template to cause trouble.
827 Automatic correction of some of the things warned about
828 (e.g., SGML "closed start tag" notation).
832 =head2 c-format strings emulation
834 Because English word order is not universal, a simple extraction
835 of translatable strings may yield some strings like "Accounts for"
836 or ambiguous strings like "in". This makes the resulting strings
837 difficult to translate, but does not affect all languages alike.
838 For example, Chinese (with a somewhat different word order) would
839 be hit harder, but French would be relatively unaffected.
841 To overcome this problem, the scanner can be configured to detect
842 patterns with <TMPL_VAR> directives (as well as certain HTML tags),
843 and try to construct a larger pattern that will appear in the PO
844 file as c-format strings with %s placeholders. This additional
845 step allows the translator to deal with cases where word order
846 is different (replacing %s with %1$s, %2$s, etc.), or when certain
847 words will require certain inflectional suffixes in sentences.
849 Because this is an incompatible change, this mode must be explicitly
850 turned on using the set_cformat(1) method call.
854 This tokenizer is mostly based
855 on Ambrose's hideous Perl script known as subst.pl.