From 8b57901d85938321f635f30bd6aaaa257846d162 Mon Sep 17 00:00:00 2001 From: acli Date: Thu, 19 Feb 2004 21:24:30 +0000 Subject: [PATCH] New scripts for translation into Chinese and other languages where English word order is too different than the word order of the target language to yield meaningful translations. The new scripts use a different translation file format (namely standard gettext-style PO files). This seems to reasonably work (e.g., producing an empty en_GB translation then installing seems to not corrupt the "translated" files), but it likely will still contain some bugs. There is also little documentation, but try to run perldoc on the .p[lm] files to see what's there. There are also some spurious warnings (both from bugs in the new scripts and from buggy third- party Locale::PO module). --- misc/translator/TmplToken.pm | 33 ++++ misc/translator/TmplTokenType.pm | 87 +++++++-- misc/translator/TmplTokenizer.pm | 232 ++++++++++++++++++++---- misc/translator/text-extract2.pl | 31 +++- misc/translator/tmpl_process3.pl | 300 +++++++++++++++++++++++++++++++ misc/translator/xgettext.pl | 289 +++++++++++++++++++++++++++++ 6 files changed, 918 insertions(+), 54 deletions(-) create mode 100755 misc/translator/tmpl_process3.pl create mode 100755 misc/translator/xgettext.pl diff --git a/misc/translator/TmplToken.pm b/misc/translator/TmplToken.pm index fb04019165..854a70a304 100644 --- a/misc/translator/TmplToken.pm +++ b/misc/translator/TmplToken.pm @@ -67,6 +67,39 @@ sub set_attributes { return $this; } +# only meaningful for TEXT_PARAMETRIZED tokens +sub children { + my $this = shift; + return $this->{'_kids'}; +} + +# only meaningful for TEXT_PARAMETRIZED tokens +sub set_children { + my $this = shift; + $this->{'_kids'} = ref $_[0] eq 'ARRAY'? $_[0]: \@_; + return $this; +} + +# only meaningful for TEXT_PARAMETRIZED tokens +# FIXME: DIRECTIVE is not necessarily TMPL_VAR !! +sub parameters { + my $this = shift; + return map { $_->type == TmplTokenType::DIRECTIVE? $_: ()} @{$this->{'_kids'}}; +} + +# only meaningful for TEXT_PARAMETRIZED tokens +sub form { + my $this = shift; + return $this->{'_form'}; +} + +# only meaningful for TEXT_PARAMETRIZED tokens +sub set_form { + my $this = shift; + $this->{'_form'} = $_[0]; + return $this; +} + ############################################################################### 1; diff --git a/misc/translator/TmplTokenType.pm b/misc/translator/TmplTokenType.pm index 9616f1379a..75d4f5256a 100644 --- a/misc/translator/TmplTokenType.pm +++ b/misc/translator/TmplTokenType.pm @@ -14,6 +14,7 @@ TmplTokenType.pm - Types of TmplToken objects =head1 DESCRIPTION This is a Java-style "safe enum" singleton class for types of TmplToken objects. +The predefined constants are =cut @@ -24,6 +25,7 @@ $VERSION = 0.01; @ISA = qw(Exporter); @EXPORT_OK = qw( &TEXT + &TEXT_PARAMETRIZED &CDATA &TAG &DECL @@ -35,7 +37,8 @@ $VERSION = 0.01; ############################################################################### -use vars qw( $_text $_cdata $_tag $_decl $_pi $_directive $_comment $_unknown ); +use vars qw( $_text $_text_parametrized $_cdata + $_tag $_decl $_pi $_directive $_comment $_null $_unknown ); BEGIN { my $new = sub { @@ -46,14 +49,15 @@ BEGIN { ($self->{'id'}, $self->{'name'}, $self->{'desc'}) = @_; return $self; }; - $_text = &$new(0, 'TEXT'); - $_cdata = &$new(1, 'CDATA'); - $_tag = &$new(2, 'TAG'); - $_decl = &$new(3, 'DECL'); - $_pi = &$new(4, 'PI'); - $_directive = &$new(5, 'DIRECTIVE'); - $_comment = &$new(6, 'COMMENT'); - $_unknown = &$new(7, 'UNKNOWN'); + $_text = &$new(0, 'TEXT'); + $_text_parametrized = &$new(8, 'TEXT-PARAMETRIZED'); + $_cdata = &$new(1, 'CDATA'); + $_tag = &$new(2, 'TAG'); + $_decl = &$new(3, 'DECL'); + $_pi = &$new(4, 'PI'); + $_directive = &$new(5, 'DIRECTIVE'); + $_comment = &$new(6, 'COMMENT'); + $_unknown = &$new(7, 'UNKNOWN'); } sub to_string { @@ -61,15 +65,64 @@ sub to_string { return $this->{'name'} } -sub TEXT () { $_text } -sub CDATA () { $_cdata } -sub TAG () { $_tag } -sub DECL () { $_decl } -sub PI () { $_pi } -sub DIRECTIVE () { $_directive } -sub COMMENT () { $_comment } -sub UNKNOWN () { $_unknown } +sub TEXT () { $_text } +sub TEXT_PARAMETRIZED () { $_text_parametrized } +sub CDATA () { $_cdata } +sub TAG () { $_tag } +sub DECL () { $_decl } +sub PI () { $_pi } +sub DIRECTIVE () { $_directive } +sub COMMENT () { $_comment } +sub UNKNOWN () { $_unknown } ############################################################################### +=over + +=item TEXT + +normal text (#text in the DTD) + +=item TEXT_PARAMETRIZED + +parametrized normal text +(result of simple recognition of text interspersed with directives; +this has to be explicitly enabled in the scanner) + +=item CDATA + +normal text (CDATA in the DTD) + +=item TAG + +something that has the form of an HTML tag + +=item DECL + +something that has the form of an SGML declaration + +=item PI + +something that has the form of an SGML processing instruction + +=item DIRECTIVE + +a HTML::Template directive (whether or not embedded in an SGML comment) + +=item COMMENT + +something that has the form of an HTML comment +(and is not recognized as an HTML::Template directive) + +=item UNKNOWN + +something that is not recognized at all by the scanner + +=back + +Note that end of file is currently represented by undef, +instead of a constant predefined by this module. + +=cut + 1; diff --git a/misc/translator/TmplTokenizer.pm b/misc/translator/TmplTokenizer.pm index bb37eaa10e..f3ff5b62a9 100644 --- a/misc/translator/TmplTokenizer.pm +++ b/misc/translator/TmplTokenizer.pm @@ -22,11 +22,6 @@ it might be better to create a customized scanner to scan the template files for tokens. This module is a simple-minded attempt at such a scanner. -=head1 HISTORY - -This tokenizer is mostly based -on Ambrose's hideous Perl script known as subst.pl. - =cut ############################################################################### @@ -97,6 +92,8 @@ sub LINENUM () {'lc'} sub CDATA_MODE_P () {'cdata-mode-p'} sub CDATA_CLOSE () {'cdata-close'} +sub ALLOW_CFORMAT_P () {'allow-cformat-p'} + sub new { my $this = shift; my($input) = @_; @@ -170,6 +167,11 @@ sub cdata_close { return $this->{+CDATA_CLOSE}; } +sub allow_cformat_p { + my $this = shift; + return $this->{+ALLOW_CFORMAT_P}; +} + # Simple setters sub _set_fatal { @@ -231,6 +233,12 @@ sub _set_cdata_close { return $this; } +sub set_allow_cformat { + my $this = shift; + $this->{+ALLOW_CFORMAT_P} = $_[0]; + return $this; +} + ############################################################################### sub _extract_attributes ($;$) { @@ -305,7 +313,7 @@ sub _next_token_internal { ($kind, $it) = (TmplTokenType::TEXT, $&); $this->_set_readahead( $' ); # FIXME the following (the [<\s] part) is an unreliable HACK :-( - } elsif ($this->_peek_readahead =~ /^(?:[^<]|<[<\s])+/s) { # non-space normal text + } elsif ($this->_peek_readahead =~ /^(?:[^<]|<[<\s])*(?:[^<\s])/s) { # non-space normal text ($kind, $it) = (TmplTokenType::TEXT, $&); $this->_set_readahead( $' ); warn_normal "Unescaped < in $it\n", $this->line_number_start @@ -323,10 +331,17 @@ sub _next_token_internal { $ok_p = 1; } } elsif ($this->_peek_readahead =~ /^$re_tag_compat/os) { - ($kind, $it) = (TmplTokenType::TAG, "$1>"); - $this->_set_readahead( $3 ); - $ok_p = 1; - warn_normal "SGML \"closed start tag\" notation: $1<\n", $this->line_number_start if $2 eq ''; + # If we detect a "closed start tag" but we know that the + # following token looks like a TMPL_VAR, don't stop + my($head, $tail, $post) = ($1, $2, $3); + if ($tail eq '' && $post =~ $re_tmpl_var) { + warn_normal "Possible SGML \"closed start tag\" notation: $head<\n", $this->line_number; + } else { + ($kind, $it) = (TmplTokenType::TAG, "$head>"); + $this->_set_readahead( $post ); + $ok_p = 1; + warn_normal "SGML \"closed start tag\" notation: $head<\n", $this->line_number if $tail eq ''; + } } elsif ($this->_peek_readahead =~ /^).)*-->/s) { ($kind, $it) = (TmplTokenType::COMMENT, $&); $this->_set_readahead( $' ); @@ -347,7 +362,7 @@ sub _next_token_internal { $kind = TmplTokenType::DECL; $kind = TmplTokenType::COMMENT if $it =~ /^).)*-->/; if ($kind == TmplTokenType::COMMENT && $it =~ /^