misc/translator/TmplTokenizer.pm

   1 package TmplTokenizer;
   2
   3 use Modern::Perl;
   4 use C4::TmplTokenType;
   5 use C4::TmplToken;
   6 use C4::TTParser;
   7 use VerboseWarnings qw( pedantic_p error_normal warn_normal warn_pedantic );
   8 require Exporter;
   9
  10 ###############################################################################
  11
  12 =head1 NAME
  13
  14 TmplTokenizer.pm - Simple-minded wrapper class for TTParser
  15
  16 =head1 DESCRIPTION
  17
  18 A wrapper for the functionality found in TTParser to allow an easier transition to Template Toolkit
  19
  20 =cut
  21
  22 ###############################################################################
  23
  24
  25 use vars qw( $pedantic_attribute_error_in_nonpedantic_mode_p );
  26 use vars qw( $pedantic_tmpl_var_use_in_nonpedantic_mode_p );
  27 use vars qw( $pedantic_error_markup_in_pcdata_p );
  28
  29 ###############################################################################
  30
  31 # Hideous stuff
  32 use vars qw( $re_xsl $re_end_entity $re_tmpl_var);
  33 BEGIN {
  34     $re_tmpl_var = q{\[%\s*[get|set|default]?\s*[\w\.]+\s*[|.*?]?\s*%\]};
  35     $re_xsl = q{<\/?(?:xsl:)(?:[\s\-a-zA-Z0-9"'\/\.\[\]\@\(\):=,$]+)\/?>};
  36     $re_end_entity = '(?:;|$|(?=\s))'; # semicolon or before-whitespace
  37 }
  38 # End of the hideous stuff
  39
  40 use vars qw( $serial );
  41
  42 ###############################################################################
  43
  44 sub FATAL_P             () {'fatal-p'}
  45 sub SYNTAXERROR_P       () {'syntaxerror-p'}
  46
  47 sub FILENAME            () {'input'}
  48 #sub HANDLE             () {'handle'}
  49
  50 #sub READAHEAD          () {'readahead'}
  51 sub LINENUM_START       () {'lc_0'}
  52 sub LINENUM             () {'lc'}
  53 sub CDATA_MODE_P        () {'cdata-mode-p'}
  54 sub CDATA_CLOSE         () {'cdata-close'}
  55 #sub PCDATA_MODE_P      () {'pcdata-mode-p'}    # additional submode for CDATA
  56 sub JS_MODE_P           () {'js-mode-p'}        # cdata-mode-p must also be true
  57
  58 sub ALLOW_CFORMAT_P     () {'allow-cformat-p'}
  59
  60 sub new {
  61     shift;
  62     my ($filename) = @_;
  63     #open my $handle,$filename or die "can't open $filename";
  64     my $parser = C4::TTParser->new;
  65     $parser->build_tokens( $filename );
  66     bless {
  67       filename => $filename,
  68       _parser => $parser
  69 #     , handle => $handle
  70 #     , readahead => []
  71     } , __PACKAGE__;
  72 }
  73
  74 ###############################################################################
  75
  76 # Simple getters
  77
  78 sub filename {
  79     my $this = shift;
  80     return $this->{filename};
  81 }
  82
  83 sub fatal_p {
  84     my $this = shift;
  85     return $this->{+FATAL_P};
  86 }
  87
  88 # work around, currently not implemented
  89 sub syntaxerror_p {
  90 #    my $this = shift;
  91 #    return $this->{+SYNTAXERROR_P};
  92     return 0;
  93 }
  94
  95 sub js_mode_p {
  96     my $this = shift;
  97     return $this->{+JS_MODE_P};
  98 }
  99
 100 sub allow_cformat_p {
 101     my $this = shift;
 102     return $this->{+ALLOW_CFORMAT_P};
 103 }
 104
 105 # Simple setters
 106
 107 sub _set_fatal {
 108     my $this = shift;
 109     $this->{+FATAL_P} = $_[0];
 110     return $this;
 111 }
 112
 113 sub _set_js_mode {
 114     my $this = shift;
 115     $this->{+JS_MODE_P} = $_[0];
 116     return $this;
 117 }
 118
 119 #used in xgettext, tmpl_process3
 120 sub set_allow_cformat {
 121     my $this = shift;
 122     $this->{+ALLOW_CFORMAT_P} = $_[0];
 123     return $this;
 124 }
 125
 126 ###############################################################################
 127
 128 use vars qw( $js_EscapeSequence );
 129 BEGIN {
 130     # Perl quoting is really screwed up, but this common subexp is way too long
 131     $js_EscapeSequence = q{\\\\(?:['"\\\\bfnrt]|[^0-7xu]|[0-3]?[0-7]{1,2}|x[\da-fA-F]{2}|u[\da-fA-F]{4})};
 132 }
 133 sub parenleft  () { '(' }
 134 sub parenright () { ')' }
 135
 136 sub _split_js {
 137     my ($s0) = @_;
 138     my @it = ();
 139     while (length $s0) {
 140         if ($s0 =~ /^\s+/s) {                           # whitespace
 141           push @it, $&;
 142           $s0 = $';
 143         } elsif ($s0 =~ /^\/\/[^\r\n]*(?:[\r\n]|$)/s) { # C++-style comment
 144         push @it, $&;
 145         $s0 = $';
 146         } elsif ($s0 =~ /^\/\*(?:(?!\*\/).)*\*\//s) {   # C-style comment
 147             push @it, $&;
 148             $s0 = $';
 149         # Keyword or identifier, ECMA-262 p.13 (section 7.5)
 150         } elsif ($s0 =~ /^[A-Z_\$][A-Z\d_\$]*/is) {     # IdentifierName
 151             push @it, $&;
 152             $s0 = $';
 153         # Punctuator, ECMA-262 p.13 (section 7.6)
 154         } elsif ($s0 =~ /^(?:[\(\){}\[\];]|>>>=|<<=|>>=|[-\+\*\/\&\|\^\%]=|>>>|<<|>>|--|\+\+|\|\||\&\&|==|<=|>=|!=|[=><,!~\?:\.\-\+\*\/\&\|\^\%])/s) {
 155             push @it, $&;
 156             $s0 = $';
 157         # DecimalLiteral, ECMA-262 p.14 (section 7.7.3); note: bug in the spec
 158         } elsif ($s0 =~ /^(?:0|[1-9]\d+(?:\.\d*(?:[eE][-\+]?\d+)?)?)/s) {
 159             push @it, $&;
 160             $s0 = $';
 161         # HexIntegerLiteral, ECMA-262 p.15 (section 7.7.3)
 162         } elsif ($s0 =~ /^0[xX][\da-fA-F]+/s) {
 163             push @it, $&;
 164             $s0 = $';
 165         # OctalIntegerLiteral, ECMA-262 p.15 (section 7.7.3)
 166         } elsif ($s0 =~ /^0[\da-fA-F]+/s) {
 167             push @it, $&;
 168             $s0 = $';
 169         # StringLiteral, ECMA-262 p.17 (section 7.7.4)
 170         # XXX SourceCharacter doesn't seem to be defined (?)
 171         } elsif ($s0 =~ /^(?:"(?:(?!["\\\r\n]).|$js_EscapeSequence)*"|'(?:(?!['\\\r\n]).|$js_EscapeSequence)*')/os) {
 172             push @it, $&;
 173             $s0 = $';
 174         } elsif ($s0 =~ /^./) {                         # UNKNOWN TOKEN !!!
 175             push @it, $&;
 176             $s0 = $';
 177         }
 178     }
 179     return @it;
 180 }
 181
 182 sub STATE_UNDERSCORE     () { 1 }
 183 sub STATE_PARENLEFT      () { 2 }
 184 sub STATE_STRING_LITERAL () { 3 }
 185
 186 # XXX This is a crazy hack. I don't want to write an ECMAScript parser.
 187 # XXX A scanner is one thing; a parser another thing.
 188 sub _identify_js_translatables {
 189     my @input = @_;
 190     my @output = ();
 191     # We mark a JavaScript translatable string as in C, i.e., _("literal")
 192     # For simplicity, we ONLY look for "_" "(" StringLiteral ")"
 193     for (my $i = 0, my $state = 0, my($j, $q, $s); $i <= $#input; $i += 1) {
 194 #        warn $input[$i];
 195         my $reset_state_p = 0;
 196         push @output, [0, $input[$i]];
 197         if ($input[$i] !~ /\S/s) {
 198           ;
 199         } elsif ($state == 0) {
 200           $state = STATE_UNDERSCORE if $input[$i] eq '_';
 201         } elsif ($state == STATE_UNDERSCORE) {
 202           $state = $input[$i] eq parenleft ? STATE_PARENLEFT : 0;
 203         } elsif ($state == STATE_PARENLEFT) {
 204           if ($input[$i] =~ /^(['"])(.*)\1$/s) {
 205             ($state, $j, $q, $s) = (STATE_STRING_LITERAL, $#output, $1, $2);
 206           } else {
 207             $state = 0;
 208           }
 209         } elsif ($state == STATE_STRING_LITERAL) {
 210           if ($input[$i] eq parenright) {
 211             $output[$j] = [1, $output[$j]->[1], $q, $s];
 212           }
 213           $state = 0;
 214         } else {
 215           die "identify_js_translatables internal error: Unknown state $state"
 216         }
 217     }
 218 #    use Data::Dumper;
 219 #    warn Dumper \@output;
 220     return \@output;
 221 }
 222
 223 ###############################################################################
 224
 225 sub string_canon {
 226   my $s = shift;
 227   # Fold all whitespace into single blanks
 228   $s =~ s/\s+/ /g;
 229   $s =~ s/^\s+//g;
 230   return $s;
 231 }
 232
 233 # safer version used internally, preserves new lines
 234 sub string_canon_safe {
 235   my $s = shift;
 236   # fold tabs and spaces into single spaces
 237   $s =~ s/[\ \t]+/ /gs;
 238   return $s;
 239 }
 240
 241
 242 sub _quote_cformat{
 243   my $s = shift;
 244   $s =~ s/%/%%/g;
 245   return $s;
 246 }
 247
 248 sub _formalize_string_cformat{
 249   my $s = shift;
 250   return _quote_cformat( string_canon_safe($s) );
 251 }
 252
 253 sub _formalize{
 254   my $t = shift;
 255   if( $t->type == C4::TmplTokenType::DIRECTIVE ){
 256     return '%s';
 257   } elsif( $t->type == C4::TmplTokenType::TEXT ){
 258     return _formalize_string_cformat( $t->string );
 259   } elsif( $t->type == C4::TmplTokenType::TAG ){
 260     if( $t->string =~ m/^a\b/is ){
 261       return '<a>';
 262     } elsif( $t->string =~ m/^input\b/is ){
 263       if( lc $t->attributes->{'type'}->[1] eq 'text' ){
 264         return '%S';
 265       } else{
 266         return '%p';
 267       }
 268     } else{
 269       return _quote_cformat $t->string;
 270     }
 271   } else{
 272     return _quote_cformat $t->string;
 273   }
 274 }
 275
 276 # internal parametization, used within next_token
 277 # method that takes in an array of TEXT and DIRECTIVE tokens (DIRECTIVEs must be GET) and return a C4::TmplTokenType::TEXT_PARAMETRIZED
 278 sub _parametrize_internal{
 279     my $this = shift;
 280     my @parts = @_;
 281     # my $s = "";
 282     # for my $item (@parts){
 283     #     if( $item->type == C4::TmplTokenType::TEXT ){
 284     #         $s .= $item->string;
 285     #     } else {
 286     #         #must be a variable directive
 287     #         $s .= "%s";
 288     #     }
 289     # }
 290     my $s = join( "", map { _formalize $_ } @parts );
 291     # should both the string and form be $s? maybe only the later? posibly the former....
 292     # used line number from first token, should suffice
 293     my $t = C4::TmplToken->new( $s, C4::TmplTokenType::TEXT_PARAMETRIZED, $parts[0]->line_number, $this->filename );
 294     $t->set_children(@parts);
 295     $t->set_form($s);
 296     return $t;
 297 }
 298
 299 sub next_token {
 300     my $self = shift;
 301     my $next;
 302 #    warn "in next_token";
 303     # parts that make up a text_parametrized (future children of the token)
 304     my @parts = ();
 305     while(1){
 306         $next = $self->{_parser}->next_token;
 307         if (! $next){
 308             if (@parts){
 309                 return $self->_parametrize_internal(@parts);
 310             }
 311             else {
 312                 return;
 313             }
 314         }
 315         # if cformat mode is off, dont bother parametrizing, just return them as they come
 316         return $next unless $self->allow_cformat_p;
 317         if( $next->type == C4::TmplTokenType::TEXT ){
 318             push @parts, $next;
 319         }
 320 #        elsif( $next->type == C4::TmplTokenType::DIRECTIVE && $next->string =~ m/\[%\s*\w+\s*%\]/ ){
 321         elsif( $next->type == C4::TmplTokenType::DIRECTIVE ){
 322             push @parts, $next;
 323         }
 324         elsif ( $next->type == C4::TmplTokenType::CDATA){
 325             $self->_set_js_mode(1);
 326             my $s0 = $next->string;
 327             my @head = ();
 328             my @tail = ();
 329
 330             if ($s0 =~ /^(\s*\[%\s*)(.*)(\s%=]\s*)$/s) {
 331                 push @head, $1;
 332                  push @tail, $3;
 333                 $s0 = $2;
 334             }
 335             push @head, _split_js($s0);
 336             $next->set_js_data(_identify_js_translatables(@head, @tail) );
 337             return $next unless @parts;
 338             $self->{_parser}->unshift_token($next);
 339             return $self->_parametrize_internal(@parts);
 340         }
 341         else {
 342             # if there is nothing in parts, return this token
 343             return $next unless @parts;
 344
 345             # OTHERWISE, put this token back and return the parametrized string of @parts
 346             $self->{_parser}->unshift_token($next);
 347             return $self->_parametrize_internal(@parts);
 348         }
 349
 350     }
 351 }
 352
 353 ###############################################################################
 354
 355 # function taken from old version
 356 # used by tmpl_process3
 357 sub parametrize {
 358     my($fmt_0, $cformat_p, $t, $f) = @_;
 359     my $it = '';
 360     if ($cformat_p) {
 361         my @params = $t->parameters_and_fields;
 362         for (my $n = 0, my $fmt = $fmt_0; length $fmt;) {
 363             if ($fmt =~ /^[^%]+/) {
 364                 $fmt = $';
 365                 $it .= $&;
 366             } elsif ($fmt =~ /^%%/) {
 367                 $fmt = $';
 368                 $it .= '%';
 369             } elsif ($fmt =~ /^%(?:(\d+)\$)?(?:(\d+)(?:\.(\d+))?)?s/s) {
 370                 $n += 1;
 371                 my($i, $width, $prec) = ((defined $1? $1: $n), $2, $3);
 372                 $fmt = $';
 373                 if (defined $width && defined $prec && !$width && !$prec) {
 374                     ;
 375                 } elsif (defined $params[$i - 1]) {
 376                     my $param = $params[$i - 1];
 377             warn_normal("$fmt_0: $&: Expected a TMPL_VAR, but found a "
 378                 . $param->type->to_string . "\n", undef)
 379                 if $param->type != C4::TmplTokenType::DIRECTIVE;
 380             warn_normal("$fmt_0: $&: Unsupported "
 381                 . "field width or precision\n", undef)
 382                             if defined $width || defined $prec;
 383             warn_normal("$fmt_0: $&: Parameter $i not known", undef)
 384                             unless defined $param;
 385                     $it .= defined $f? &$f( $param ): $param->string;
 386                 }
 387             } elsif ($fmt =~ /^%(?:(\d+)\$)?(?:(\d+)(?:\.(\d+))?)?([pS])/s) {
 388                 $n += 1;
 389                 my($i, $width, $prec, $conv) = ((defined $1? $1: $n), $2, $3, $4);
 390                 $fmt = $';
 391
 392                 my $param = $params[$i - 1];
 393                 if (!defined $param) {
 394             warn_normal("$fmt_0: $&: Parameter $i not known", undef);
 395                 } else {
 396                     if ($param->type == C4::TmplTokenType::TAG
 397                             && $param->string =~ /^<input\b/is) {
 398                         my $type = defined $param->attributes?
 399                                 lc($param->attributes->{'type'}->[1]): undef;
 400                         if ($conv eq 'S') {
 401                 warn_normal("$fmt_0: $&: Expected type=text, "
 402                     . "but found type=$type", undef)
 403                                     unless $type eq 'text';
 404                         } elsif ($conv eq 'p') {
 405                 warn_normal("$fmt_0: $&: Expected type=radio, "
 406                     . "but found type=$type", undef)
 407                                     unless $type eq 'radio';
 408                         }
 409                     } else {
 410                 warn_normal("$&: Expected an INPUT, but found a "
 411                     . $param->type->to_string . "\n", undef)
 412                     }
 413             warn_normal("$fmt_0: $&: Unsupported "
 414                 . "field width or precision\n", undef)
 415                             if defined $width || defined $prec;
 416                     $it .= defined $f? &$f( $param ): $param->string;
 417                 }
 418             } elsif ($fmt =~ /^%[^%a-zA-Z]*[a-zA-Z]/) {
 419                 $fmt = $';
 420                 $it .= $&;
 421                 die "$&: Unknown or unsupported format specification\n"; #XXX
 422             } else {
 423                 die "$&: Completely confused parametrizing -- msgid: $fmt_0\n";#XXX
 424             }
 425         }
 426     }
 427     my @anchors = $t->anchors;
 428     for (my $n = 0, my $fmt = $it, $it = ''; length $fmt;) {
 429         if ($fmt =~ /^(?:(?!<a\d+>).)+/is) {
 430             $fmt = $';
 431             $it .= $&;
 432         } elsif ($fmt =~ /^<a(\d+)>/is) {
 433             $n += 1;
 434             my $i  = $1;
 435             $fmt = $';
 436             my $anchor = $anchors[$i - 1];
 437         warn_normal("$&: Anchor $1 not found for msgid \"$fmt_0\"", undef) #FIXME
 438                     unless defined $anchor;
 439             $it .= $anchor->string;
 440         } else {
 441             die "Completely confused decoding anchors: $fmt\n";#XXX
 442         }
 443     }
 444     return $it;
 445 }
 446
 447
 448 # Other simple functions (These are not methods)
 449
 450 sub blank_p {
 451     my($s) = @_;
 452     return $s =~ /^(?:\s|\&nbsp$re_end_entity|$re_tmpl_var|$re_xsl)*$/osi;
 453 }
 454
 455 sub trim {
 456     my($s0) = @_;
 457     my $l0 = length $s0;
 458     my $s = $s0;
 459     $s =~ s/^(\s|\&nbsp$re_end_entity)+//os; my $l1 = $l0 - length $s;
 460     $s =~ s/(\s|\&nbsp$re_end_entity)+$//os; my $l2 = $l0 - $l1 - length $s;
 461     return wantarray? (substr($s0, 0, $l1), $s, substr($s0, $l0 - $l2)): $s;
 462 }
 463
 464 sub charset_canon {
 465     my($charset) = @_;
 466     $charset = uc($charset);
 467     $charset = "$1-$2" if $charset =~ /^(ISO|UTF)(\d.*)/i;
 468     $charset = 'Big5' if $charset eq 'BIG5'; # "Big5" must be in mixed case
 469     return $charset;
 470 }
 471
 472 use vars qw( @latin1_utf8 );
 473 @latin1_utf8 = (
 474     "\302\200", "\302\201", "\302\202", "\302\203", "\302\204", "\302\205",
 475     "\302\206", "\302\207", "\302\210", "\302\211", "\302\212", "\302\213",
 476     "\302\214", "\302\215",   undef,      undef,    "\302\220", "\302\221",
 477     "\302\222", "\302\223", "\302\224", "\302\225", "\302\226", "\302\227",
 478     "\302\230", "\302\231", "\302\232", "\302\233", "\302\234", "\302\235",
 479     "\302\236", "\302\237", "\302\240", "\302\241", "\302\242", "\302\243",
 480     "\302\244", "\302\245", "\302\246", "\302\247", "\302\250", "\302\251",
 481     "\302\252", "\302\253", "\302\254", "\302\255", "\302\256", "\302\257",
 482     "\302\260", "\302\261", "\302\262", "\302\263", "\302\264", "\302\265",
 483     "\302\266", "\302\267", "\302\270", "\302\271", "\302\272", "\302\273",
 484     "\302\274", "\302\275", "\302\276", "\302\277", "\303\200", "\303\201",
 485     "\303\202", "\303\203", "\303\204", "\303\205", "\303\206", "\303\207",
 486     "\303\210", "\303\211", "\303\212", "\303\213", "\303\214", "\303\215",
 487     "\303\216", "\303\217", "\303\220", "\303\221", "\303\222", "\303\223",
 488     "\303\224", "\303\225", "\303\226", "\303\227", "\303\230", "\303\231",
 489     "\303\232", "\303\233", "\303\234", "\303\235", "\303\236", "\303\237",
 490     "\303\240", "\303\241", "\303\242", "\303\243", "\303\244", "\303\245",
 491     "\303\246", "\303\247", "\303\250", "\303\251", "\303\252", "\303\253",
 492     "\303\254", "\303\255", "\303\256", "\303\257", "\303\260", "\303\261",
 493     "\303\262", "\303\263", "\303\264", "\303\265", "\303\266", "\303\267",
 494     "\303\270", "\303\271", "\303\272", "\303\273", "\303\274", "\303\275",
 495     "\303\276", "\303\277" );
 496
 497 sub charset_convert {
 498     my($s, $charset_in, $charset_out) = @_;
 499     if ($s !~ /[\200-\377]/s) { # FIXME: don't worry about iso2022 for now
 500         ;
 501     } elsif ($charset_in eq 'ISO-8859-1' && $charset_out eq 'UTF-8') {
 502         $s =~ s/[\200-\377]/ $latin1_utf8[ord($&) - 128] /egs;
 503     } elsif ($charset_in ne $charset_out) {
 504         VerboseWarnings::warn_normal "conversion from $charset_in to $charset_out is not supported\n", undef;
 505     }
 506     return $s;
 507 }
 508
 509 ###############################################################################
 510
 511 =pod
 512
 513 In addition to the basic scanning, this class will also perform
 514 the following:
 515
 516 =over
 517
 518 =item -
 519
 520 Emulation of c-format strings (see below)
 521
 522 =item -
 523
 524 Display of warnings for certain things that affects either the
 525 ability of this class to yield correct output, or things that
 526 are known to cause the original template to cause trouble.
 527
 528 =item -
 529
 530 Automatic correction of some of the things warned about
 531 (e.g., SGML "closed start tag" notation).
 532
 533 =back
 534
 535 =head2 c-format strings emulation
 536
 537 Because English word order is not universal, a simple extraction
 538 of translatable strings may yield some strings like "Accounts for"
 539 or ambiguous strings like "in". This makes the resulting strings
 540 difficult to translate, but does not affect all languages alike.
 541 For example, Chinese (with a somewhat different word order) would
 542 be hit harder, but French would be relatively unaffected.
 543
 544 To overcome this problem, the scanner can be configured to detect
 545 patterns with <TMPL_VAR> directives (as well as certain HTML tags),
 546 and try to construct a larger pattern that will appear in the PO
 547 file as c-format strings with %s placeholders. This additional
 548 step allows the translator to deal with cases where word order
 549 is different (replacing %s with %1$s, %2$s, etc.), or when certain
 550 words will require certain inflectional suffixes in sentences.
 551
 552 Because this is an incompatible change, this mode must be explicitly
 553 turned on using the set_allow_cformat(1) method call.
 554
 555 =head2 The flag characters
 556
 557 The character % is followed by zero or more of the following flags:
 558
 559 =over
 560
 561 =item #
 562
 563 The value comes from HTML <INPUT> elements.
 564 This abuse of the flag character is somewhat reasonable,
 565 since TMPL_VAR and INPUT are both variables, but of different kinds.
 566
 567 =back
 568
 569 =head2 The field width and precision
 570
 571 An optional 0.0 can be specified for %s to specify
 572 that the <TMPL_VAR> should be suppressed.
 573
 574 =head2 The conversion specifier
 575
 576 =over
 577
 578 =item p
 579
 580 Specifies any input field that is neither text nor hidden
 581 (which currently mean radio buttons).
 582 The p conversion specifier is chosen because this does not
 583 evoke any certain sensible data type.
 584
 585 =item S
 586
 587 Specifies a text input field (<INPUT TYPE=TEXT>).
 588 This use of the S conversion specifier is somewhat reasonable,
 589 since text input fields contain values of undeterminable type,
 590 which can be treated as strings.
 591
 592 =item s
 593
 594 Specifies a <TMPL_VAR>.
 595 This use of the o conversion specifier is somewhat reasonable,
 596 since <TMPL_VAR> denotes values of undeterminable type, which
 597 can be treated as strings.
 598
 599 =back
 600
 601 =head1 BUGS
 602
 603 There is no code to save the tag name anywhere in the scanned token.
 604
 605 The use of <AI<i>> to stand for the I<i>th anchor
 606 is not very well thought out.
 607 Some abuse of c-format specifies might have been more appropriate.
 608
 609 =head1 HISTORY
 610
 611 This tokenizer is mostly based
 612 on Ambrose's hideous Perl script known as subst.pl.
 613
 614 =cut
 615
 616 1;