From 09c348bd9c3dc42e5ae27c3eeb674d01e70d6917 Mon Sep 17 00:00:00 2001 From: acli Date: Tue, 17 Feb 2004 02:45:27 +0000 Subject: [PATCH] Further breaking up of the TmplTokenizer module. A couple of minor fixes. --- misc/translator/TmplToken.pm | 67 ++++++++++++++++++++++++ misc/translator/TmplTokenType.pm | 75 +++++++++++++++++++++++++++ misc/translator/TmplTokenizer.pm | 81 ++++++++++++++---------------- misc/translator/VerboseWarnings.pm | 5 +- misc/translator/text-extract2.pl | 10 ++-- 5 files changed, 186 insertions(+), 52 deletions(-) create mode 100644 misc/translator/TmplToken.pm create mode 100644 misc/translator/TmplTokenType.pm diff --git a/misc/translator/TmplToken.pm b/misc/translator/TmplToken.pm new file mode 100644 index 0000000000..3a20aa825d --- /dev/null +++ b/misc/translator/TmplToken.pm @@ -0,0 +1,67 @@ +package TmplToken; + +use strict; +use TmplTokenType; +require Exporter; + +use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); + +############################################################################### + +=head1 NAME + +TmplToken.pm - Object representing a scanner token for .tmpl files + +=head1 DESCRIPTION + +This is a class representing a token scanned from an HTML::Template .tmpl file. + +=cut + +############################################################################### + +$VERSION = 0.01; + +@ISA = qw(Exporter); +@EXPORT_OK = qw(); + +############################################################################### + +sub new { + my $this = shift; + my $class = ref($this) || $this; + my $self = {}; + bless $self, $class; + ($self->{'_string'}, $self->{'_type'}, $self->{'_lc'}) = @_; + return $self; +} + +sub string { + my $this = shift; + return $this->{'_string'} +} + +sub type { + my $this = shift; + return $this->{'_type'} +} + +sub line_number { + my $this = shift; + return $this->{'_lc'} +} + +sub attributes { + my $this = shift; + return $this->{'_attr'}; +} + +sub set_attributes { + my $this = shift; + $this->{'_attr'} = ref $_[0] eq 'HASH'? $_[0]: \@_; + return $this; +} + +############################################################################### + +1; diff --git a/misc/translator/TmplTokenType.pm b/misc/translator/TmplTokenType.pm new file mode 100644 index 0000000000..c5115846a1 --- /dev/null +++ b/misc/translator/TmplTokenType.pm @@ -0,0 +1,75 @@ +package TmplTokenType; + +use strict; +require Exporter; + +use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); + +############################################################################### + +=head1 NAME + +TmplTokenType.pm - Types of TmplToken objects + +=head1 DESCRIPTION + +This is a Java-style "safe enum" singleton class for types of TmplToken objects. + +=cut + +############################################################################### + +$VERSION = 0.01; + +@ISA = qw(Exporter); +@EXPORT_OK = qw( + &TEXT + &CDATA + &TAG + &DECL + &PI + &DIRECTIVE + &COMMENT + &UNKNOWN +); + +############################################################################### + +use vars qw( $_text $_cdata $_tag $_decl $_pi $_directive $_comment $_unknown ); + +BEGIN { + my $new = sub { + my $this = shift; + my $class = ref($this) || $this; + my $self = {}; + bless $self, $class; + ($self->{'id'}, $self->{'name'}, $self->{'desc'}) = @_; + return $self; + }; + $_text = &$new(0, 'TEXT'); + $_cdata = &$new(1, 'CDATA'); + $_tag = &$new(2, 'TAG'); + $_decl = &$new(3, 'DECL'); + $_pi = &$new(4, 'PI'); + $_directive = &$new(5, 'DIRECTIVE'); + $_comment = &$new(6, 'COMMENT'); + $_unknown = &$new(7, 'UNKNOWN'); +} + +sub to_string { + my $this = shift; + return $this->{'name'} +} + +sub TEXT () { $_text } +sub CDATA () { $_cdata } +sub TAG () { $_tag } +sub DECL () { $_decl } +sub PI () { $_pi } +sub DIRECTIVE () { $_directive } +sub COMMENT () { $_comment } +sub UNKNOWN () { $_unknown } + +############################################################################### + +1; diff --git a/misc/translator/TmplTokenizer.pm b/misc/translator/TmplTokenizer.pm index 3bc746a92b..c3d1fb54bc 100644 --- a/misc/translator/TmplTokenizer.pm +++ b/misc/translator/TmplTokenizer.pm @@ -1,6 +1,8 @@ package TmplTokenizer; use strict; +use TmplTokenType; +use TmplToken; use VerboseWarnings qw( pedantic_p error_normal warn_normal warn_pedantic ); require Exporter; @@ -32,16 +34,7 @@ on Ambrose's hideous Perl script known as subst.pl. $VERSION = 0.01; @ISA = qw(Exporter); -@EXPORT_OK = qw( - &KIND_TEXT - &KIND_CDATA - &KIND_TAG - &KIND_DECL - &KIND_PI - &KIND_DIRECTIVE - &KIND_COMMENT - &KIND_UNKNOWN -); +@EXPORT_OK = qw(); use vars qw( $input ); use vars qw( $debug_dump_only_p ); @@ -91,15 +84,6 @@ BEGIN { # End of the hideous stuff -sub KIND_TEXT () { 'TEXT' } -sub KIND_CDATA () { 'CDATA' } -sub KIND_TAG () { 'TAG' } -sub KIND_DECL () { 'DECL' } -sub KIND_PI () { 'PI' } -sub KIND_DIRECTIVE () { 'HTML::Template' } -sub KIND_COMMENT () { 'COMMENT' } # empty DECL with exactly one SGML comment -sub KIND_UNKNOWN () { 'ERROR' } - use vars qw( $readahead $lc_0 $lc $syntaxerror_p ); use vars qw( $cdata_mode_p $cdata_close ); @@ -179,29 +163,29 @@ sub next_token_internal (*) { if ($eof_p && !length $readahead) { # nothing left to do ; } elsif ($readahead =~ /^\s+/s) { # whitespace - ($kind, $it, $readahead) = (KIND_TEXT, $&, $'); + ($kind, $it, $readahead) = (TmplTokenType::TEXT, $&, $'); # FIXME the following (the [<\s] part) is an unreliable HACK :-( } elsif ($readahead =~ /^(?:[^<]|<[<\s])+/s) { # non-space normal text - ($kind, $it, $readahead) = (KIND_TEXT, $&, $'); - warn_normal "Warning: Unescaped < $it\n", $lc_0 + ($kind, $it, $readahead) = (TmplTokenType::TEXT, $&, $'); + warn_normal "Unescaped < in $it\n", $lc_0 if !$cdata_mode_p && $it =~ /", $3); + ($kind, $it, $readahead) = (TmplTokenType::TAG, "$1>", $3); $ok_p = 1; warn_normal "SGML \"closed start tag\" notation: $1<\n", $lc_0 if $2 eq ''; } elsif ($readahead =~ /^).)*-->/s) { - ($kind, $it, $readahead) = (KIND_COMMENT, $&, $'); + ($kind, $it, $readahead) = (TmplTokenType::COMMENT, $&, $'); $ok_p = 1; warn_normal "Syntax error in comment: $&\n", $lc_0; $syntaxerror_p = 1; @@ -213,26 +197,25 @@ sub next_token_internal (*) { $lc += 1; $readahead .= $next; } - if ($kind ne KIND_TAG) { + if ($kind ne TmplTokenType::TAG) { ; } elsif ($it =~ /^).)*-->/; + $kind = TmplTokenType::DECL; + $kind = TmplTokenType::COMMENT if $it =~ /^).)*-->/; } elsif ($it =~ /^<\?/) { - $kind = KIND_PI; + $kind = TmplTokenType::PI; } if ($it =~ /^$re_directive/ios && !$cdata_mode_p) { - $kind = KIND_DIRECTIVE; + $kind = TmplTokenType::DIRECTIVE; } if (!$ok_p && $eof_p) { - ($kind, $it, $readahead) = (KIND_UNKNOWN, $readahead, undef); + ($kind, $it, $readahead) = (TmplTokenType::UNKNOWN, $readahead, undef); $syntaxerror_p = 1; } } warn_normal "Unrecognizable token found: $it\n", $lc_0 - if $kind eq KIND_UNKNOWN; - return defined $it? (wantarray? ($kind, $it): - [$kind, $it]): undef; + if $kind eq TmplTokenType::UNKNOWN; + return defined $it? TmplToken->new($it, $kind, $lc): undef; } sub next_token (*) { @@ -240,27 +223,27 @@ sub next_token (*) { my $it; if (!$cdata_mode_p) { $it = next_token_internal($h); - if (defined $it && $it->[0] eq KIND_TAG) { # FIXME + if (defined $it && $it->type eq TmplTokenType::TAG) { ($cdata_mode_p, $cdata_close) = (1, "") - if $it->[1] =~ /^<(script|style|textarea)\b/i; #FIXME - push @$it, extract_attributes($it->[1], $lc_0); #FIXME + if $it->string =~ /^<(script|style|textarea)\b/i; + $it->set_attributes( extract_attributes($it->string, $lc_0) ); } } else { for ($it = '';;) { my $lc_prev = $lc; my $next = next_token_internal($h); last if !defined $next; - if (defined $next && $next->[1] =~ /$cdata_close/i) { #FIXME - ($lc, $readahead) = ($lc_prev, $next->[1] . $readahead); #FIXME + if (defined $next && $next->string =~ /$cdata_close/i) { + ($lc, $readahead) = ($lc_prev, $next->string . $readahead); $cdata_mode_p = 0; } last unless $cdata_mode_p; - $it .= $next->[1]; #FIXME + $it .= $next->string; } - $it = [KIND_CDATA, $it]; #FIXME + $it = TmplToken->new( $it, TmplTokenType::CDATA, $lc ); $cdata_close = undef; } - return defined $it? (wantarray? @$it: $it): undef; + return $it; } ############################################################################### @@ -290,4 +273,16 @@ to languages where word order is very unlike English word order. This will be relatively major rework, requiring corresponding rework in tmpl_process.pl +Gettext-style line number references would also be very helpful in +disambiguating the strings. Ultimately, we should generate and work +with gettext-style po files, so that translators are able to use +tools designed for gettext. + +An example of a string untranslatable to Chinese is "Accounts for"; +"Accounts for %s", however, would be translatable. Short words like +"in" would also be untranslatable, not only to Chinese, but also to +languages requiring declension of nouns. + =cut + +1; diff --git a/misc/translator/VerboseWarnings.pm b/misc/translator/VerboseWarnings.pm index 2b79fb030e..7589176cb6 100644 --- a/misc/translator/VerboseWarnings.pm +++ b/misc/translator/VerboseWarnings.pm @@ -24,9 +24,6 @@ $VERSION = 0.01; @ISA = qw(Exporter); @EXPORT_OK = qw( - &set_application_name - &set_input_file_name - &set_pedantic_mode &pedantic_p &warn_normal &warn_pedantic @@ -45,7 +42,7 @@ sub set_application_name ($) { sub set_input_file_name ($) { my($s) = @_; $input = $s; - $input_abbr = $& if !defined $input && defined $s && $s =~ /[^\/]+$/; + $input_abbr = $& if defined $s && $s =~ /[^\/]+$/; } sub set_pedantic_mode ($) { diff --git a/misc/translator/text-extract2.pl b/misc/translator/text-extract2.pl index 7bbfc7721e..db568df741 100755 --- a/misc/translator/text-extract2.pl +++ b/misc/translator/text-extract2.pl @@ -32,11 +32,11 @@ sub debug_dump (*) { # for testing only my $s = TmplTokenizer::next_token $h; last unless defined $s; printf "%s\n", ('-' x 79); - my($kind, $t, $attr) = @$s; # FIXME + my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes); printf "%s:\n", $kind; printf "%4dH%s\n", length($t), join('', map {/[\0-\37]/? $_: "$_\b$_"} split(//, $t)); - if ($kind eq TmplTokenizer::KIND_TAG && %$attr) { + if ($kind eq TmplTokenType::TAG && %$attr) { printf "Attributes:\n"; for my $a (keys %$attr) { my($key, $val, $val_orig, $order) = @{$attr->{$a}}; @@ -56,11 +56,11 @@ sub text_extract (*) { for (;;) { my $s = TmplTokenizer::next_token $h; last unless defined $s; - my($kind, $t, $attr) = @$s; # FIXME - if ($kind eq TmplTokenizer::KIND_TEXT) { + my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes); + if ($kind eq TmplTokenType::TEXT) { $t = TmplTokenizer::trim $t; $text{$t} = 1 if $t =~ /\S/s; - } elsif ($kind eq TmplTokenizer::KIND_TAG && %$attr) { + } elsif ($kind eq TmplTokenType::TAG && %$attr) { # value [tag=input], meta my $tag = lc($1) if $t =~ /^<(\S+)/s; for my $a ('alt', 'content', 'title', 'value') { -- 2.39.5