From 09c348bd9c3dc42e5ae27c3eeb674d01e70d6917 Mon Sep 17 00:00:00 2001
From: acli <acli>
Date: Tue, 17 Feb 2004 02:45:27 +0000
Subject: [PATCH] Further breaking up of the TmplTokenizer module. A couple of
 minor fixes.

---
 misc/translator/TmplToken.pm       | 67 ++++++++++++++++++++++++
 misc/translator/TmplTokenType.pm   | 75 +++++++++++++++++++++++++++
 misc/translator/TmplTokenizer.pm   | 81 ++++++++++++++----------------
 misc/translator/VerboseWarnings.pm |  5 +-
 misc/translator/text-extract2.pl   | 10 ++--
 5 files changed, 186 insertions(+), 52 deletions(-)
 create mode 100644 misc/translator/TmplToken.pm
 create mode 100644 misc/translator/TmplTokenType.pm
diff --git a/misc/translator/TmplToken.pm b/misc/translator/TmplToken.pm
new file mode 100644
index 0000000000..3a20aa825d
--- /dev/null
+++ b/misc/translator/TmplToken.pm
@@ -0,0 +1,67 @@
+package TmplToken;
+
+use strict;
+use TmplTokenType;
+require Exporter;
+
+use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
+
+###############################################################################
+
+=head1 NAME
+
+TmplToken.pm - Object representing a scanner token for .tmpl files
+
+=head1 DESCRIPTION
+
+This is a class representing a token scanned from an HTML::Template .tmpl file.
+
+=cut
+
+###############################################################################
+
+$VERSION = 0.01;
+
+@ISA = qw(Exporter);
+@EXPORT_OK = qw();
+
+###############################################################################
+
+sub new {
+    my $this = shift;
+    my $class = ref($this) || $this;
+    my $self = {};
+    bless $self, $class;
+    ($self->{'_string'}, $self->{'_type'}, $self->{'_lc'}) = @_;
+    return $self;
+}
+
+sub string {
+    my $this = shift;
+    return $this->{'_string'}
+}
+
+sub type {
+    my $this = shift;
+    return $this->{'_type'}
+}
+
+sub line_number {
+    my $this = shift;
+    return $this->{'_lc'}
+}
+
+sub attributes {
+    my $this = shift;
+    return $this->{'_attr'};
+}
+
+sub set_attributes {
+    my $this = shift;
+    $this->{'_attr'} = ref $_[0] eq 'HASH'? $_[0]: \@_;
+    return $this;
+}
+
+###############################################################################
+
+1;
diff --git a/misc/translator/TmplTokenType.pm b/misc/translator/TmplTokenType.pm
new file mode 100644
index 0000000000..c5115846a1
--- /dev/null
+++ b/misc/translator/TmplTokenType.pm
@@ -0,0 +1,75 @@
+package TmplTokenType;
+
+use strict;
+require Exporter;
+
+use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
+
+###############################################################################
+
+=head1 NAME
+
+TmplTokenType.pm - Types of TmplToken objects
+
+=head1 DESCRIPTION
+
+This is a Java-style "safe enum" singleton class for types of TmplToken objects.
+
+=cut
+
+###############################################################################
+
+$VERSION = 0.01;
+
+@ISA = qw(Exporter);
+@EXPORT_OK = qw(
+    &TEXT
+    &CDATA
+    &TAG
+    &DECL
+    &PI
+    &DIRECTIVE
+    &COMMENT
+    &UNKNOWN
+);
+
+###############################################################################
+
+use vars qw( $_text $_cdata $_tag $_decl $_pi $_directive $_comment $_unknown );
+
+BEGIN {
+    my $new = sub {
+	my $this = shift;
+	my $class = ref($this) || $this;
+	my $self = {};
+	bless $self, $class;
+	($self->{'id'}, $self->{'name'}, $self->{'desc'}) = @_;
+	return $self;
+    };
+    $_text	= &$new(0, 'TEXT');
+    $_cdata	= &$new(1, 'CDATA');
+    $_tag	= &$new(2, 'TAG');
+    $_decl	= &$new(3, 'DECL');
+    $_pi	= &$new(4, 'PI');
+    $_directive	= &$new(5, 'DIRECTIVE');
+    $_comment	= &$new(6, 'COMMENT');
+    $_unknown	= &$new(7, 'UNKNOWN');
+}
+
+sub to_string {
+    my $this = shift;
+    return $this->{'name'}
+}
+
+sub TEXT	() { $_text }
+sub CDATA	() { $_cdata }
+sub TAG		() { $_tag }
+sub DECL	() { $_decl }
+sub PI		() { $_pi }
+sub DIRECTIVE	() { $_directive }
+sub COMMENT	() { $_comment }
+sub UNKNOWN	() { $_unknown }
+
+###############################################################################
+
+1;
diff --git a/misc/translator/TmplTokenizer.pm b/misc/translator/TmplTokenizer.pm
index 3bc746a92b..c3d1fb54bc 100644
--- a/misc/translator/TmplTokenizer.pm
+++ b/misc/translator/TmplTokenizer.pm
@@ -1,6 +1,8 @@
 package TmplTokenizer;
 
 use strict;
+use TmplTokenType;
+use TmplToken;
 use VerboseWarnings qw( pedantic_p error_normal warn_normal warn_pedantic );
 require Exporter;
 
@@ -32,16 +34,7 @@ on Ambrose's hideous Perl script known as subst.pl.
 $VERSION = 0.01;
 
 @ISA = qw(Exporter);
-@EXPORT_OK = qw(
-    &KIND_TEXT
-    &KIND_CDATA
-    &KIND_TAG
-    &KIND_DECL
-    &KIND_PI
-    &KIND_DIRECTIVE
-    &KIND_COMMENT
-    &KIND_UNKNOWN
-);
+@EXPORT_OK = qw();
 
 use vars qw( $input );
 use vars qw( $debug_dump_only_p );
@@ -91,15 +84,6 @@ BEGIN {
 
 # End of the hideous stuff
 
-sub KIND_TEXT      () { 'TEXT' }
-sub KIND_CDATA     () { 'CDATA' }
-sub KIND_TAG       () { 'TAG' }
-sub KIND_DECL      () { 'DECL' }
-sub KIND_PI        () { 'PI' }
-sub KIND_DIRECTIVE () { 'HTML::Template' }
-sub KIND_COMMENT   () { 'COMMENT' }   # empty DECL with exactly one SGML comment
-sub KIND_UNKNOWN   () { 'ERROR' }
-
 use vars qw( $readahead $lc_0 $lc $syntaxerror_p );
 use vars qw( $cdata_mode_p $cdata_close );
 
@@ -179,29 +163,29 @@ sub next_token_internal (*) {
     if ($eof_p && !length $readahead) {	# nothing left to do
 	;
     } elsif ($readahead =~ /^\s+/s) {	# whitespace
-	($kind, $it, $readahead) = (KIND_TEXT, $&, $');
+	($kind, $it, $readahead) = (TmplTokenType::TEXT, $&, $');
     # FIXME the following (the [<\s] part) is an unreliable HACK :-(
     } elsif ($readahead =~ /^(?:[^<]|<[<\s])+/s) {	# non-space normal text
-	($kind, $it, $readahead) = (KIND_TEXT, $&, $');
-	warn_normal "Warning: Unescaped < $it\n", $lc_0
+	($kind, $it, $readahead) = (TmplTokenType::TEXT, $&, $');
+	warn_normal "Unescaped < in $it\n", $lc_0
 		if !$cdata_mode_p && $it =~ /</s;
     } else {				# tag/declaration/processing instruction
 	my $ok_p = 0;
 	for (;;) {
 	    if ($cdata_mode_p) {
 		if ($readahead =~ /^$cdata_close/) {
-		    ($kind, $it, $readahead) = (KIND_TAG, $&, $');
+		    ($kind, $it, $readahead) = (TmplTokenType::TAG, $&, $');
 		    $ok_p = 1;
 		} else {
-		    ($kind, $it, $readahead) = (KIND_TEXT, $readahead, undef);
+		    ($kind, $it, $readahead) = (TmplTokenType::TEXT, $readahead, undef);
 		    $ok_p = 1;
 		}
 	    } elsif ($readahead =~ /^$re_tag_compat/os) {
-		($kind, $it, $readahead) = (KIND_TAG, "$1>", $3);
+		($kind, $it, $readahead) = (TmplTokenType::TAG, "$1>", $3);
 		$ok_p = 1;
 		warn_normal "SGML \"closed start tag\" notation: $1<\n", $lc_0 if $2 eq '';
 	    } elsif ($readahead =~ /^<!--(?:(?!-->).)*-->/s) {
-		($kind, $it, $readahead) = (KIND_COMMENT, $&, $');
+		($kind, $it, $readahead) = (TmplTokenType::COMMENT, $&, $');
 		$ok_p = 1;
 		warn_normal "Syntax error in comment: $&\n", $lc_0;
 		$syntaxerror_p = 1;
@@ -213,26 +197,25 @@ sub next_token_internal (*) {
 	    $lc += 1;
 	    $readahead .= $next;
 	}
-	if ($kind ne KIND_TAG) {
+	if ($kind ne TmplTokenType::TAG) {
 	    ;
 	} elsif ($it =~ /^<!/) {
-	    $kind = KIND_DECL;
-	    $kind = KIND_COMMENT if $it =~ /^<!--(?:(?!-->).)*-->/;
+	    $kind = TmplTokenType::DECL;
+	    $kind = TmplTokenType::COMMENT if $it =~ /^<!--(?:(?!-->).)*-->/;
 	} elsif ($it =~ /^<\?/) {
-	    $kind = KIND_PI;
+	    $kind = TmplTokenType::PI;
 	}
 	if ($it =~ /^$re_directive/ios && !$cdata_mode_p) {
-	    $kind = KIND_DIRECTIVE;
+	    $kind = TmplTokenType::DIRECTIVE;
 	}
 	if (!$ok_p && $eof_p) {
-	    ($kind, $it, $readahead) = (KIND_UNKNOWN, $readahead, undef);
+	    ($kind, $it, $readahead) = (TmplTokenType::UNKNOWN, $readahead, undef);
 	    $syntaxerror_p = 1;
 	}
     }
     warn_normal "Unrecognizable token found: $it\n", $lc_0
-	    if $kind eq KIND_UNKNOWN;
-    return defined $it? (wantarray? ($kind, $it):
-				    [$kind, $it]): undef;
+	    if $kind eq TmplTokenType::UNKNOWN;
+    return defined $it? TmplToken->new($it, $kind, $lc): undef;
 }
 
 sub next_token (*) {
@@ -240,27 +223,27 @@ sub next_token (*) {
     my $it;
     if (!$cdata_mode_p) {
 	$it = next_token_internal($h);
-	if (defined $it && $it->[0] eq KIND_TAG) { # FIXME
+	if (defined $it && $it->type eq TmplTokenType::TAG) {
 	    ($cdata_mode_p, $cdata_close) = (1, "</$1\\s*>")
-		    if $it->[1] =~ /^<(script|style|textarea)\b/i; #FIXME
-	    push @$it, extract_attributes($it->[1], $lc_0); #FIXME
+		    if $it->string =~ /^<(script|style|textarea)\b/i;
+	    $it->set_attributes( extract_attributes($it->string, $lc_0) );
 	}
     } else {
 	for ($it = '';;) {
 	    my $lc_prev = $lc;
 	    my $next = next_token_internal($h);
 	last if !defined $next;
-	    if (defined $next && $next->[1] =~ /$cdata_close/i) { #FIXME
-		($lc, $readahead) = ($lc_prev, $next->[1] . $readahead); #FIXME
+	    if (defined $next && $next->string =~ /$cdata_close/i) {
+		($lc, $readahead) = ($lc_prev, $next->string . $readahead);
 		$cdata_mode_p = 0;
 	    }
 	last unless $cdata_mode_p;
-	    $it .= $next->[1]; #FIXME
+	    $it .= $next->string;
 	}
-	$it = [KIND_CDATA, $it]; #FIXME
+	$it = TmplToken->new( $it, TmplTokenType::CDATA, $lc );
 	$cdata_close = undef;
     }
-    return defined $it? (wantarray? @$it: $it): undef;
+    return $it;
 }
 
 ###############################################################################
@@ -290,4 +273,16 @@ to languages where word order is very unlike English word order.
 This will be relatively major rework, requiring corresponding
 rework in tmpl_process.pl
 
+Gettext-style line number references would also be very helpful in
+disambiguating the strings. Ultimately, we should generate and work
+with gettext-style po files, so that translators are able to use
+tools designed for gettext.
+
+An example of a string untranslatable to Chinese is "Accounts for";
+"Accounts for %s", however, would be translatable. Short words like
+"in" would also be untranslatable, not only to Chinese, but also to
+languages requiring declension of nouns.
+
 =cut
+
+1;
diff --git a/misc/translator/VerboseWarnings.pm b/misc/translator/VerboseWarnings.pm
index 2b79fb030e..7589176cb6 100644
--- a/misc/translator/VerboseWarnings.pm
+++ b/misc/translator/VerboseWarnings.pm
@@ -24,9 +24,6 @@ $VERSION = 0.01;
 
 @ISA = qw(Exporter);
 @EXPORT_OK = qw(
-    &set_application_name
-    &set_input_file_name
-    &set_pedantic_mode
     &pedantic_p
     &warn_normal
     &warn_pedantic
@@ -45,7 +42,7 @@ sub set_application_name ($) {
 sub set_input_file_name ($) {
     my($s) = @_;
     $input = $s;
-    $input_abbr = $& if !defined $input && defined $s && $s =~ /[^\/]+$/;
+    $input_abbr = $& if defined $s && $s =~ /[^\/]+$/;
 }
 
 sub set_pedantic_mode ($) {
diff --git a/misc/translator/text-extract2.pl b/misc/translator/text-extract2.pl
index 7bbfc7721e..db568df741 100755
--- a/misc/translator/text-extract2.pl
+++ b/misc/translator/text-extract2.pl
@@ -32,11 +32,11 @@ sub debug_dump (*) { # for testing only
 	my $s = TmplTokenizer::next_token $h;
     last unless defined $s;
 	printf "%s\n", ('-' x 79);
-	my($kind, $t, $attr) = @$s; # FIXME
+	my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
 	printf "%s:\n", $kind;
 	printf "%4dH%s\n", length($t),
 		join('', map {/[\0-\37]/? $_: "$_\b$_"} split(//, $t));
-	if ($kind eq TmplTokenizer::KIND_TAG && %$attr) {
+	if ($kind eq TmplTokenType::TAG && %$attr) {
 	    printf "Attributes:\n";
 	    for my $a (keys %$attr) {
 		my($key, $val, $val_orig, $order) = @{$attr->{$a}};
@@ -56,11 +56,11 @@ sub text_extract (*) {
     for (;;) {
 	my $s = TmplTokenizer::next_token $h;
     last unless defined $s;
-	my($kind, $t, $attr) = @$s; # FIXME
-	if ($kind eq TmplTokenizer::KIND_TEXT) {
+	my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
+	if ($kind eq TmplTokenType::TEXT) {
 	    $t = TmplTokenizer::trim $t;
 	    $text{$t} = 1 if $t =~ /\S/s;
-	} elsif ($kind eq TmplTokenizer::KIND_TAG && %$attr) {
+	} elsif ($kind eq TmplTokenType::TAG && %$attr) {
 	    # value [tag=input], meta
 	    my $tag = lc($1) if $t =~ /^<(\S+)/s;
 	    for my $a ('alt', 'content', 'title', 'value') {
-- 
2.39.2