6 use VerboseWarnings qw( pedantic_p error_normal warn_normal warn_pedantic );
9 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
11 ###############################################################################
15 TmplTokenizer.pm - Simple-minded tokenizer class for HTML::Template .tmpl files
19 Because .tmpl files contains HTML::Template directives
20 that tend to confuse real parsers (e.g., HTML::Parse),
21 it might be better to create a customized scanner
22 to scan the template files for tokens.
23 This module is a simple-minded attempt at such a scanner.
27 ###############################################################################
34 use vars qw( $pedantic_attribute_error_in_nonpedantic_mode_p );
35 use vars qw( $pedantic_tmpl_var_use_in_nonpedantic_mode_p );
37 ###############################################################################
40 use vars qw( $re_directive $re_tmpl_var $re_tmpl_var_escaped $re_tmpl_include );
41 use vars qw( $re_directive_control $re_tmpl_endif_endloop );
43 # $re_directive must not do any backreferences
44 $re_directive = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:VAR|LOOP|INCLUDE|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
45 # TMPL_VAR or TMPL_INCLUDE
46 $re_tmpl_var = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
47 $re_tmpl_include = q{<(?:(?i)(?:!--\s*)?TMPL_(?:INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
48 # TMPL_VAR ESCAPE=1/HTML/URL
49 $re_tmpl_var_escaped = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR|INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))\s+ESCAPE=(?:1|HTML|URL)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
50 # Any control flow directive
51 $re_directive_control = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:LOOP|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
52 # /LOOP or /IF or /UNLESS
53 $re_tmpl_endif_endloop = q{<(?:(?i)(?:!--\s*)?\/TMPL_(?:LOOP|IF|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
56 # Hideous stuff from subst.pl, slightly modified to use the above hideous stuff
57 # Note: The $re_tag's set $1 (<tag), $2 (>), and $3 (rest of string)
58 use vars qw( $re_comment $re_entity_name $re_end_entity $re_etag );
59 use vars qw( $re_tag_strict $re_tag_compat @re_tag );
62 my $etag = $compat? '>': '<>\/';
63 # This is no longer similar to the original regexp in subst.pl :-(
64 # Note that we don't want <> in compat mode; Mozilla knows about <
65 q{(<\/?(?:|(?:"(?:} . $re_directive . q{|[^"])*"|'(?:} . $re_directive . q{|[^'])*'|--(?:[^-]|-[^-])*--|(?:}
67 . q{|(?!--)[^"'<>} . $etag . q{]))+))([} . $etag . q{]|(?=<))(.*)};
70 $re_comment = '(?:--(?:[^-]|-[^-])*--)';
71 $re_entity_name = '(?:[^&%#;<>\s]+)'; # NOTE: not really correct SGML
72 $re_end_entity = '(?:;|$|(?=\s))'; # semicolon or before-whitespace
73 $re_etag = q{(?:<\/?(?:"[^"]*"|'[^']*'|[^"'>\/])*[>\/])}; # end-tag
74 @re_tag = ($re_tag_strict, $re_tag_compat) = (re_tag(0), re_tag(1));
77 # End of the hideous stuff
79 use vars qw( $serial );
81 ###############################################################################
83 sub FATAL_P () {'fatal-p'}
84 sub SYNTAXERROR_P () {'syntaxerror-p'}
86 sub FILENAME () {'input'}
87 sub HANDLE () {'handle'}
89 sub READAHEAD () {'readahead'}
90 sub LINENUM_START () {'lc_0'}
92 sub CDATA_MODE_P () {'cdata-mode-p'}
93 sub CDATA_CLOSE () {'cdata-close'}
95 sub ALLOW_CFORMAT_P () {'allow-cformat-p'}
100 my $class = ref($this) || $this;
104 my $handle = sprintf('TMPLTOKENIZER%d', $serial);
108 open($handle, "<$input") || die "$input: $!\n";
110 $self->{+FILENAME} = $input;
111 $self->{+HANDLE} = $handle;
112 $self->{+READAHEAD} = [];
116 ###############################################################################
122 return $this->{+FILENAME};
127 return $this->{+HANDLE};
132 return $this->{+FATAL_P};
137 return $this->{+SYNTAXERROR_P};
140 sub has_readahead_p {
142 return @{$this->{+READAHEAD}};
145 sub _peek_readahead {
147 return $this->{+READAHEAD}->[$#{$this->{+READAHEAD}}];
150 sub line_number_start {
152 return $this->{+LINENUM_START};
157 return $this->{+LINENUM};
162 return $this->{+CDATA_MODE_P};
167 return $this->{+CDATA_CLOSE};
170 sub allow_cformat_p {
172 return $this->{+ALLOW_CFORMAT_P};
179 $this->{+FATAL_P} = $_[0];
183 sub _set_syntaxerror {
185 $this->{+SYNTAXERROR_P} = $_[0];
189 sub _push_readahead {
191 push @{$this->{+READAHEAD}}, $_[0];
197 return pop @{$this->{+READAHEAD}};
200 sub _append_readahead {
202 $this->{+READAHEAD}->[$#{$this->{+READAHEAD}}] .= $_[0];
208 $this->{+READAHEAD}->[$#{$this->{+READAHEAD}}] = $_[0];
212 sub _increment_line_number {
214 $this->{+LINENUM} += 1;
218 sub _set_line_number_start {
220 $this->{+LINENUM_START} = $_[0];
224 sub _set_cdata_mode {
226 $this->{+CDATA_MODE_P} = $_[0];
230 sub _set_cdata_close {
232 $this->{+CDATA_CLOSE} = $_[0];
236 sub set_allow_cformat {
238 $this->{+ALLOW_CFORMAT_P} = $_[0];
242 ###############################################################################
244 sub _extract_attributes ($;$) {
248 $s = $1 if $s =~ /^<\S+(.*)\/\S$/s # XML-style self-closing tags
249 || $s =~ /^<\S+(.*)\S$/s; # SGML-style tags
251 for (my $i = 0; $s =~ /^(?:$re_directive_control)?\s+(?:$re_directive_control)?(?:([a-zA-Z][-a-zA-Z0-9]*)\s*=\s*)?('((?:$re_directive|[^'])*)'|"((?:$re_directive|[^"])*)"|((?:$re_directive|[^\s<>])+))/os;) {
252 my($key, $val, $val_orig, $rest)
253 = ($1, (defined $3? $3: defined $4? $4: $5), $2, $');
255 $attr{+lc($key)} = [$key, $val, $val_orig, $i];
257 if ($val =~ /$re_tmpl_include/os) {
258 warn_normal "TMPL_INCLUDE in attribute: $val_orig\n", $lc;
259 } elsif ($val =~ /$re_tmpl_var/os && $val !~ /$re_tmpl_var_escaped/os) {
260 # XXX: we probably should not warn if key is "onclick" etc
261 # XXX: there's just no reasonable thing to suggest
262 my $suggest = ($key =~ /^(?:action|archive|background|cite|classid|codebase|data|datasrc|for|href|longdesc|profile|src|usemap)$/i? 'URL': 'HTML');
263 undef $suggest if $key =~ /^(?:onblur|onchange|onclick|ondblclick|onfocus|onkeydown|onkeypress|onkeyup|onload|onmousedown|onmousemove|onmouseout|onmouseover|onmouseup|onreset|onselect|onsubmit|onunload)$/i;
265 "Suggest ESCAPE=$suggest for TMPL_VAR in attribute \"$key\""
267 $lc, \$pedantic_tmpl_var_use_in_nonpedantic_mode_p
268 if defined $suggest && (pedantic_p || !$pedantic_tmpl_var_use_in_nonpedantic_mode_p);
269 } elsif ($val_orig !~ /^['"]/) {
270 my $t = $val; $t =~ s/$re_directive_control//os;
272 "Unquoted attribute contains character(s) that should be quoted"
274 $lc, \$pedantic_attribute_error_in_nonpedantic_mode_p
275 if $t =~ /[^-\.A-Za-z0-9]/s;
278 my $s2 = $s; $s2 =~ s/$re_tmpl_endif_endloop//g; # for the next check
279 if ($s2 =~ /\S/s) { # should never happen
280 if ($s =~ /^([^\n]*)\n/s) { # this is even worse
281 error_normal("Completely confused while extracting attributes: $1", $lc);
282 error_normal((scalar(split(/\n/, $s)) - 1) . " more line(s) not shown.", undef);
283 $this->_set_fatal( 1 );
285 warn_normal "Strange attribute syntax: $s\n", $lc;
291 sub _next_token_internal {
296 $this->_pop_readahead if $this->has_readahead_p
297 && !ref $this->_peek_readahead
298 && !length $this->_peek_readahead;
299 if (!$this->has_readahead_p) {
300 my $next = scalar <$h>;
301 $eof_p = !defined $next;
303 $this->_increment_line_number;
304 $this->_push_readahead( $next );
307 $this->_set_line_number_start( $this->line_number ); # remember 1st line num
308 if ($this->has_readahead_p && ref $this->_peek_readahead) { # TmplToken obj.
309 ($it, $kind) = ($this->_pop_readahead, undef);
310 } elsif ($eof_p && !$this->has_readahead_p) { # nothing left to do
312 } elsif ($this->_peek_readahead =~ /^\s+/s) { # whitespace
313 ($kind, $it) = (TmplTokenType::TEXT, $&);
314 $this->_set_readahead( $' );
315 # FIXME the following (the [<\s] part) is an unreliable HACK :-(
316 } elsif ($this->_peek_readahead =~ /^(?:[^<]|<[<\s])*(?:[^<\s])/s) { # non-space normal text
317 ($kind, $it) = (TmplTokenType::TEXT, $&);
318 $this->_set_readahead( $' );
319 warn_normal "Unescaped < in $it\n", $this->line_number_start
320 if !$this->cdata_mode_p && $it =~ /</s;
321 } else { # tag/declaration/processing instruction
323 for (my $cdata_close = $this->cdata_close;;) {
324 if ($this->cdata_mode_p) {
325 my $next = $this->_pop_readahead;
326 if ($next =~ /^$cdata_close/) {
327 ($kind, $it) = (TmplTokenType::TAG, $&);
328 $this->_push_readahead( $' );
330 } elsif ($next =~ /^((?:(?!$cdata_close).)+)($cdata_close)/) {
331 ($kind, $it) = (TmplTokenType::TEXT, $1);
332 $this->_push_readahead( "$2$'" );
335 ($kind, $it) = (TmplTokenType::TEXT, $next);
338 } elsif ($this->_peek_readahead =~ /^$re_tag_compat/os) {
339 # If we detect a "closed start tag" but we know that the
340 # following token looks like a TMPL_VAR, don't stop
341 my($head, $tail, $post) = ($1, $2, $3);
342 if ($tail eq '' && $post =~ $re_tmpl_var) {
343 # Don't bother to show the warning if we're too confused
344 warn_normal "Possible SGML \"closed start tag\" notation: $head<\n", $this->line_number
345 if split(/\n/, $head) < 10;
347 ($kind, $it) = (TmplTokenType::TAG, "$head>");
348 $this->_set_readahead( $post );
350 warn_normal "SGML \"closed start tag\" notation: $head<\n", $this->line_number if $tail eq '';
352 } elsif ($this->_peek_readahead =~ /^<!--(?:(?!-->).)*-->/s) {
353 ($kind, $it) = (TmplTokenType::COMMENT, $&);
354 $this->_set_readahead( $' );
356 warn_normal "Syntax error in comment: $&\n", $this->line_number_start;
357 $this->_set_syntaxerror( 1 );
360 my $next = scalar <$h>;
361 $eof_p = !defined $next;
363 $this->_increment_line_number;
364 $this->_append_readahead( $next );
366 if ($kind ne TmplTokenType::TAG) {
368 } elsif ($it =~ /^<!/) {
369 $kind = TmplTokenType::DECL;
370 $kind = TmplTokenType::COMMENT if $it =~ /^<!--(?:(?!-->).)*-->/;
371 if ($kind == TmplTokenType::COMMENT && $it =~ /^<!--\s*#include/s) {
372 warn_normal "Apache #include directive found instead of HTML::Template <TMPL_INCLUDE> directive", $this->line_number_start;
374 } elsif ($it =~ /^<\?/) {
375 $kind = TmplTokenType::PI;
377 if ($it =~ /^$re_directive/ios && !$this->cdata_mode_p) {
378 $kind = TmplTokenType::DIRECTIVE;
380 if (!$ok_p && $eof_p) {
381 ($kind, $it) = (TmplTokenType::UNKNOWN, $this->_peek_readahead);
382 $this->_set_readahead, undef;
383 $this->_set_syntaxerror( 1 );
386 warn_normal "Unrecognizable token found: "
387 . (split(/\n/, $it) < 10? $it: '(too confused to show details)')
388 . "\n", $this->line_number_start
389 if $kind == TmplTokenType::UNKNOWN;
390 return defined $it? (ref $it? $it: TmplToken->new($it, $kind, $this->line_number, $this->filename)): undef;
393 sub _next_token_intermediate {
395 my $h = $this->_handle;
397 if (!$this->cdata_mode_p) {
398 $it = $this->_next_token_internal($h);
399 if (defined $it && $it->type == TmplTokenType::TAG) {
400 if ($it->string =~ /^<(script|style|textarea)\b/i) {
401 $this->_set_cdata_mode( 1 );
402 $this->_set_cdata_close( "</$1\\s*>" );
404 $it->set_attributes( $this->_extract_attributes($it->string, $it->line_number) );
407 for ($it = '', my $cdata_close = $this->cdata_close;;) {
408 my $next = $this->_next_token_internal($h);
409 last if !defined $next;
410 if (defined $next && $next->string =~ /$cdata_close/i) {
411 $this->_push_readahead( $next ); # push entire TmplToken object
412 $this->_set_cdata_mode( 0 );
414 last unless $this->cdata_mode_p;
415 $it .= $next->string;
417 $it = TmplToken->new( $it, TmplTokenType::CDATA, $this->line_number );
418 $this->_set_cdata_close, undef;
423 sub _token_groupable1_p ($) { # as first token, groupable into TEXT_PARAMETRIZED
425 return ($t->type == TmplTokenType::TEXT && $t->string !~ /^[,\.:\|\s]+$/s)
426 || ($t->type == TmplTokenType::DIRECTIVE
427 && $t->string =~ /^(?:$re_tmpl_var)$/os)
428 || ($t->type == TmplTokenType::TAG
429 && ($t->string =~ /^<(?:b|em|h[123456]|i|u)\b/is
430 || ($t->string =~ /^<input/i
431 && $t->attributes->{'type'} =~ /^(?:text)$/i)))
434 sub _token_groupable2_p ($) { # as other token, groupable into TEXT_PARAMETRIZED
436 return ($t->type == TmplTokenType::TEXT && ($t->string =~ /^\s*$/s || $t->string !~ /^[\|\s]+$/s))
437 || ($t->type == TmplTokenType::DIRECTIVE
438 && $t->string =~ /^(?:$re_tmpl_var)$/os)
439 || ($t->type == TmplTokenType::TAG
440 && ($t->string =~ /^<\/?(?:a|b|em|h[123456]|i|u)\b/is
441 || ($t->string =~ /^<input/i
442 && $t->attributes->{'type'} =~ /^(?:text)$/i)))
445 sub _quote_cformat ($) {
451 sub string_canon ($) {
454 # Fold all whitespace into single blanks
460 sub _formalize_string_cformat ($) {
462 return _quote_cformat string_canon $s;
467 return $t->type == TmplTokenType::DIRECTIVE? '%s':
468 $t->type == TmplTokenType::TEXT?
469 _formalize_string_cformat($t->string):
470 $t->type == TmplTokenType::TAG?
471 ($t->string =~ /^<a\b/is? '<a>': _quote_cformat($t->string)):
472 _quote_cformat($t->string);
478 my $undo_trailing_blanks = sub {
479 for (my $i = $#structure; $i >= 0; $i -= 1) {
480 last if $structure[$i]->type != TmplTokenType::TEXT;
481 last if !blank_p($structure[$i]->string);
482 push @{$this->{_queue}}, pop @structure;
485 &$undo_trailing_blanks;
486 # FIXME: If the last token is a close tag but there are no tags
487 # FIXME: before it, drop the close tag back into the queue. This
488 # FIXME: is an ugly hack to get rid of "foo %s</h1>" type mess.
490 && $structure[$#structure]->type == TmplTokenType::TAG
491 && $structure[$#structure]->string =~ /^<\//s) {
492 my $has_other_tags_p = 0;
493 for (my $i = 0; $i < $#structure; $i += 1) {
494 $has_other_tags_p = 1 if $structure[$i]->type == TmplTokenType::TAG;
495 last if $has_other_tags_p;
497 push @{$this->{_queue}}, pop @structure unless $has_other_tags_p;
498 &$undo_trailing_blanks;
500 # FIXME: Do the same ugly hack for the last token being a ( or [
502 && $structure[$#structure]->type == TmplTokenType::TEXT
503 && $structure[$#structure]->string =~ /^[\(\[]$/) { # not )]
504 push @{$this->{_queue}}, pop @structure;
505 &$undo_trailing_blanks;
510 sub looks_plausibly_like_groupable_text_p (@) {
512 # The text would look plausibly groupable if all open tags are also closed.
515 for (my $i = 0; $i <= $#structure; $i += 1) {
516 if ($structure[$i]->type == TmplTokenType::TAG) {
517 if ($structure[$i]->string =~ /^<([A-Z0-9]+)/i) {
519 } elsif ($structure[$i]->string =~ /^<\/([A-Z0-9]+)/i) {
520 if (@tags && lc($1) eq $tags[$#tags]) {
526 } elsif ($structure[$i]->type != TmplTokenType::TEXT) {
531 return !$error_p && !@tags;
536 my $h = $this->_handle;
538 $this->{_queue} = [] unless defined $this->{_queue};
540 # Don't reparse anything in the queue. We can put a parametrized token
541 # there if we need to, however.
542 if (@{$this->{_queue}}) {
543 $it = pop @{$this->{_queue}};
545 $it = $this->_next_token_intermediate($h);
546 if (!$this->cdata_mode_p && $this->allow_cformat_p && defined $it
547 && ($it->type == TmplTokenType::TEXT?
548 !blank_p( $it->string ): _token_groupable1_p( $it ))) {
549 my @structure = ( $it );
552 my($nonblank_text_p, $parametrized_p, $with_anchor_p) = (0, 0, 0);
553 if ($it->type == TmplTokenType::TEXT) {
554 $nonblank_text_p = 1 if !blank_p( $it->string );
555 } elsif ($it->type == TmplTokenType::DIRECTIVE) {
557 } elsif ($it->type == TmplTokenType::TAG && $it->string =~ /^<([A-Z0-9]+)/is) {
559 $with_anchor_p = 1 if lc($1) eq 'a';
561 # We hate | and || in msgid strings, so we try to avoid them
562 for (my $i = 1, my $quit_p = 0, my $quit_next_p = ($it->type == TmplTokenType::TEXT && $it->string =~ /^\|+$/s);; $i += 1) {
563 $next = $this->_next_token_intermediate($h);
564 push @structure, $next; # for consistency (with initialization)
565 last unless defined $next && _token_groupable2_p( $next );
566 last if $quit_next_p;
567 if ($next->type == TmplTokenType::TEXT) {
568 $nonblank_text_p = 1 if !blank_p( $next->string );
569 $quit_p = 1 if $next->string =~ /^\|+$/s; # We hate | and ||
570 } elsif ($next->type == TmplTokenType::DIRECTIVE) {
572 } elsif ($next->type == TmplTokenType::TAG) {
573 if ($next->string =~ /^<([A-Z0-9]+)/is) {
575 $with_anchor_p = 1 if lc($1) eq 'a';
576 } elsif ($next->string =~ /^<\/([A-Z0-9]+)/is) {
578 $quit_p = 1 unless @tags && $close eq $tags[$#tags];
579 $quit_next_p = 1 if $close =~ /^h\d$/;
585 # Undo the last token
586 push @{$this->{_queue}}, pop @structure;
587 # Simply it a bit more
588 @structure = $this->_optimize( @structure );
589 if (@structure < 2) {
592 } elsif ($nonblank_text_p && ($parametrized_p || $with_anchor_p)) {
593 # Create the corresponding c-format string
594 my $string = join('', map { $_->string } @structure);
595 my $form = join('', map { _formalize $_ } @structure);
597 $form =~ s/<a>/ $a_counter += 1, "<a$a_counter>" /egs;
598 $it = TmplToken->new($string, TmplTokenType::TEXT_PARAMETRIZED, $it->line_number, $it->pathname);
599 $it->set_form( $form );
600 $it->set_children( @structure );
601 } elsif ($nonblank_text_p
602 && looks_plausibly_like_groupable_text_p( @structure )
603 && $structure[$#structure]->type == TmplTokenType::TEXT) {
604 # Combine the strings
605 my $string = join('', map { $_->string } @structure);
606 $it = TmplToken->new($string, TmplTokenType::TEXT, $it->line_number, $it->pathname);;
608 # Requeue the tokens thus seen for re-emitting
610 push @{$this->{_queue}}, pop @structure;
613 $it = pop @{$this->{_queue}};
617 if (defined $it && $it->type == TmplTokenType::TEXT) {
618 my $form = string_canon $it->string;
619 $it->set_form( $form );
624 ###############################################################################
626 # Other simple functions (These are not methods)
630 return $s =~ /^(?:\s|\ $re_end_entity|$re_tmpl_var)*$/os;
637 $s =~ s/^(\s|\ $re_end_entity)+//os; my $l1 = $l0 - length $s;
638 $s =~ s/(\s|\ $re_end_entity)+$//os; my $l2 = $l0 - $l1 - length $s;
639 return wantarray? (substr($s0, 0, $l1), $s, substr($s0, $l0 - $l2)): $s;
644 # Locale::PO->quote is buggy, it doesn't quote newlines :-/
645 $s =~ s/([\\"])/\\\1/gs;
647 #$s =~ s/[\177-\377]/ sprintf("\\%03o", ord($&)) /egs;
651 # Some functions that shouldn't be here... should be moved out some time
652 sub parametrize ($$$) {
653 my($fmt_0, $params, $anchors) = @_;
655 for (my $n = 0, my $fmt = $fmt_0; length $fmt;) {
656 if ($fmt =~ /^[^%]+/) {
659 } elsif ($fmt =~ /^%%/) {
662 } elsif ($fmt =~ /^%(?:(\d+)\$)?(?:(\d+)(?:\.(\d+))?)?s/) {
664 my($i, $width, $prec) = ((defined $1? $1: $n), $2, $3);
666 if (!defined $width && !defined $prec) {
668 } elsif (defined $width && defined $prec && !$width && !$prec) {
671 die "Unsupported precision specification in format: $&\n"; #XXX
673 } elsif ($fmt =~ /^%[^%a-zA-Z]*[a-zA-Z]/) {
676 die "Unknown or unsupported format specification: $&\n"; #XXX
678 die "Completely confused parametrizing: $fmt\n";#XXX
681 for (my $n = 0, my $fmt = $it, $it = ''; length $fmt;) {
682 if ($fmt =~ /^(?:(?!<a\d+>).)+/is) {
685 } elsif ($fmt =~ /^<a(\d+)>/is) {
689 my $anchor = $anchors->[$i - 1];
690 warn_normal "$&: Anchor $1 not found for msgid \"$fmt_0\"", undef #FIXME
691 unless defined $anchor;
692 $it .= $anchor->string;
694 die "Completely confused decoding anchors: $fmt\n";#XXX
700 sub charset_canon ($) {
702 $charset = uc($charset);
703 $charset = "$1-$2" if $charset =~ /^(ISO|UTF)(\d.*)/i;
704 $charset = 'Big5' if $charset eq 'BIG5'; # "Big5" must be in mixed case
708 use vars qw( @latin1_utf8 );
710 "\302\200", "\302\201", "\302\202", "\302\203", "\302\204", "\302\205",
711 "\302\206", "\302\207", "\302\210", "\302\211", "\302\212", "\302\213",
712 "\302\214", "\302\215", undef, undef, "\302\220", "\302\221",
713 "\302\222", "\302\223", "\302\224", "\302\225", "\302\226", "\302\227",
714 "\302\230", "\302\231", "\302\232", "\302\233", "\302\234", "\302\235",
715 "\302\236", "\302\237", "\302\240", "\302\241", "\302\242", "\302\243",
716 "\302\244", "\302\245", "\302\246", "\302\247", "\302\250", "\302\251",
717 "\302\252", "\302\253", "\302\254", "\302\255", "\302\256", "\302\257",
718 "\302\260", "\302\261", "\302\262", "\302\263", "\302\264", "\302\265",
719 "\302\266", "\302\267", "\302\270", "\302\271", "\302\272", "\302\273",
720 "\302\274", "\302\275", "\302\276", "\302\277", "\303\200", "\303\201",
721 "\303\202", "\303\203", "\303\204", "\303\205", "\303\206", "\303\207",
722 "\303\210", "\303\211", "\303\212", "\303\213", "\303\214", "\303\215",
723 "\303\216", "\303\217", "\303\220", "\303\221", "\303\222", "\303\223",
724 "\303\224", "\303\225", "\303\226", "\303\227", "\303\230", "\303\231",
725 "\303\232", "\303\233", "\303\234", "\303\235", "\303\236", "\303\237",
726 "\303\240", "\303\241", "\303\242", "\303\243", "\303\244", "\303\245",
727 "\303\246", "\303\247", "\303\250", "\303\251", "\303\252", "\303\253",
728 "\303\254", "\303\255", "\303\256", "\303\257", "\303\260", "\303\261",
729 "\303\262", "\303\263", "\303\264", "\303\265", "\303\266", "\303\267",
730 "\303\270", "\303\271", "\303\272", "\303\273", "\303\274", "\303\275",
731 "\303\276", "\303\277" );
733 sub charset_convert ($$$) {
734 my($s, $charset_in, $charset_out) = @_;
735 if ($s !~ /[\200-\377]/s) { # FIXME: don't worry about iso2022 for now
737 } elsif ($charset_in eq 'ISO-8859-1' && $charset_out eq 'UTF-8') {
738 $s =~ s/[\200-\377]/ $latin1_utf8[ord($&) - 128] /egs;
739 } elsif ($charset_in ne $charset_out) {
740 VerboseWarnings::warn_normal "conversion from $charset_in to $charset_out is not supported\n", undef;
745 ###############################################################################
749 In addition to the basic scanning, this class will also perform
756 Emulation of c-format strings (see below)
760 Display of warnings for certain things that affects either the
761 ability of this class to yield correct output, or things that
762 are known to cause the original template to cause trouble.
766 Automatic correction of some of the things warned about
767 (e.g., SGML "closed start tag" notation).
771 =head2 c-format strings emulation
773 Because English word order is not universal, a simple extraction
774 of translatable strings may yield some strings like "Accounts for"
775 or ambiguous strings like "in". This makes the resulting strings
776 difficult to translate, but does not affect all languages alike.
777 For example, Chinese (with a somewhat different word order) would
778 be hit harder, but French would be relatively unaffected.
780 To overcome this problem, the scanner can be configured to detect
781 patterns with <TMPL_VAR> directives (as well as certain HTML tags),
782 and try to construct a larger pattern that will appear in the PO
783 file as c-format strings with %s placeholders. This additional
784 step allows the translator to deal with cases where word order
785 is different (replacing %s with %1$s, %2$s, etc.), or when certain
786 words will require certain inflectional suffixes in sentences.
788 Because this is an incompatible change, this mode must be explicitly
789 turned on using the set_cformat(1) method call.
793 This tokenizer is mostly based
794 on Ambrose's hideous Perl script known as subst.pl.