From 2d132d2b6bcf0fb47194e13bb252d51a55747937 Mon Sep 17 00:00:00 2001 From: acli Date: Wed, 10 Mar 2004 07:00:27 +0000 Subject: [PATCH] Added hack to extract and translate strings inside JavaScript CDATA blocks, using C-like _("some translatable string") notation. English templates will need to be modified. --- misc/translator/TmplToken.pm | 43 +++++++++++ misc/translator/TmplTokenizer.pm | 121 ++++++++++++++++++++++++++++++- misc/translator/text-extract2.pl | 10 +++ misc/translator/tmpl_process3.pl | 10 +++ misc/translator/xgettext.pl | 22 +++++- 5 files changed, 204 insertions(+), 2 deletions(-) diff --git a/misc/translator/TmplToken.pm b/misc/translator/TmplToken.pm index 17ee316117..0a54d0b52c 100644 --- a/misc/translator/TmplToken.pm +++ b/misc/translator/TmplToken.pm @@ -109,6 +109,49 @@ sub set_form { return $this; } +sub has_js_data { + my $this = shift; + return defined $this->{'_js_data'} && ref($this->{'_js_data'}) eq 'ARRAY'; +} + +sub js_data { + my $this = shift; + return $this->{'_js_data'}; +} + +sub set_js_data { + my $this = shift; + $this->{'_js_data'} = $_[0]; + return $this; +} + +# predefined tests + +sub tag_p { + my $this = shift; + return $this->type == TmplTokenType::TAG; +} + +sub cdata_p { + my $this = shift; + return $this->type == TmplTokenType::CDATA; +} + +sub text_p { + my $this = shift; + return $this->type == TmplTokenType::TEXT; +} + +sub text_parametrized_p { + my $this = shift; + return $this->type == TmplTokenType::TEXT_PARAMETRIZED; +} + +sub directive_p { + my $this = shift; + return $this->type == TmplTokenType::DIRECTIVE; +} + ############################################################################### 1; diff --git a/misc/translator/TmplTokenizer.pm b/misc/translator/TmplTokenizer.pm index 7ea81570a9..9e1c7629d2 100644 --- a/misc/translator/TmplTokenizer.pm +++ b/misc/translator/TmplTokenizer.pm @@ -93,6 +93,7 @@ sub LINENUM () {'lc'} sub CDATA_MODE_P () {'cdata-mode-p'} sub CDATA_CLOSE () {'cdata-close'} sub PCDATA_MODE_P () {'pcdata-mode-p'} # additional submode for CDATA +sub JS_MODE_P () {'js-mode-p'} # cdata-mode-p must also be true sub ALLOW_CFORMAT_P () {'allow-cformat-p'} @@ -169,6 +170,11 @@ sub pcdata_mode_p { return $this->{+PCDATA_MODE_P}; } +sub js_mode_p { + my $this = shift; + return $this->{+JS_MODE_P}; +} + sub cdata_close { my $this = shift; return $this->{+CDATA_CLOSE}; @@ -240,6 +246,12 @@ sub _set_pcdata_mode { return $this; } +sub _set_js_mode { + my $this = shift; + $this->{+JS_MODE_P} = $_[0]; + return $this; +} + sub _set_cdata_close { my $this = shift; $this->{+CDATA_CLOSE} = $_[0]; @@ -254,6 +266,100 @@ sub set_allow_cformat { ############################################################################### +use vars qw( $js_EscapeSequence ); +BEGIN { + # Perl quoting is really screwed up, but this common subexp is way too long + $js_EscapeSequence = q{\\\\(?:['"\\\\bfnrt]|[^0-7xu]|[0-3]?[0-7]{1,2}|x[\da-fA-F]{2}|u[\da-fA-F]{4})}; +} +sub parenleft () { '(' } +sub parenright () { ')' } + +sub split_js ($) { + my ($s0) = @_; + my @it = (); + while (length $s0) { + if ($s0 =~ /^\s+/s) { # whitespace + push @it, $&; + $s0 = $'; + } elsif ($s0 =~ /^\/\/[^\r\n]*(?:[\r\n]|$)/s) { # C++-style comment + push @it, $&; + $s0 = $'; + } elsif ($s0 =~ /^\/\*(?:(?!\*\/).)*\*\//s) { # C-style comment + push @it, $&; + $s0 = $'; + # Keyword or identifier, ECMA-262 p.13 (section 7.5) + } elsif ($s0 =~ /^[A-Z_\$][A-Z\d_\$]*/is) { # IdentifierName + push @it, $&; + $s0 = $'; + # Punctuator, ECMA-262 p.13 (section 7.6) + } elsif ($s0 =~ /^(?:[\(\){}\[\];]|>>>=|<<=|>>=|[-\+\*\/\&\|\^\%]=|>>>|<<|>>|--|\+\+|\|\||\&\&|==|<=|>=|!=|[=><,!~\?:\.\-\+\*\/\&\|\^\%])/s) { + push @it, $&; + $s0 = $'; + # DecimalLiteral, ECMA-262 p.14 (section 7.7.3); note: bug in the spec + } elsif ($s0 =~ /^(?:0|[1-9]\d+(?:\.\d*(?:[eE][-\+]?\d+)?)?)/s) { + push @it, $&; + $s0 = $'; + # HexIntegerLiteral, ECMA-262 p.15 (section 7.7.3) + } elsif ($s0 =~ /^0[xX][\da-fA-F]+/s) { + push @it, $&; + $s0 = $'; + # OctalIntegerLiteral, ECMA-262 p.15 (section 7.7.3) + } elsif ($s0 =~ /^0[\da-fA-F]+/s) { + push @it, $&; + $s0 = $'; + # StringLiteral, ECMA-262 p.17 (section 7.7.4) + # XXX SourceCharacter doesn't seem to be defined (?) + } elsif ($s0 =~ /^(?:"(?:(?!["\\\r\n]).|$js_EscapeSequence)*"|'(?:(?!['\\\r\n]).|$js_EscapeSequence)*')/os) { + push @it, $&; + $s0 = $'; + } elsif ($s0 =~ /^./) { # UNKNOWN TOKEN !!! + push @it, $&; + $s0 = $'; + } + } + return @it; +} + +sub STATE_UNDERSCORE () { 1 } +sub STATE_PARENLEFT () { 2 } +sub STATE_STRING_LITERAL () { 3 } + +# XXX This is a crazy hack. I don't want to write an ECMAScript parser. +# XXX A scanner is one thing; a parser another thing. +sub identify_js_translatables (@) { + my @input = @_; + my @output = (); + # We mark a JavaScript translatable string as in C, i.e., _("literal") + # For simplicity, we ONLY look for "_" "(" StringLiteral ")" + for (my $i = 0, my $state = 0, my($j, $q, $s); $i <= $#input; $i += 1) { + my $reset_state_p = 0; + push @output, [0, $input[$i]]; + if ($input[$i] !~ /\S/s) { + ; + } elsif ($state == 0) { + $state = STATE_UNDERSCORE if $input[$i] eq '_'; + } elsif ($state == STATE_UNDERSCORE) { + $state = $input[$i] eq parenleft ? STATE_PARENLEFT : 0; + } elsif ($state == STATE_PARENLEFT) { + if ($input[$i] =~ /^(['"])(.*)\1$/s) { + ($state, $j, $q, $s) = (STATE_STRING_LITERAL, $#output, $1, $2); + } else { + $state = 0; + } + } elsif ($state == STATE_STRING_LITERAL) { + if ($input[$i] eq parenright) { + $output[$j] = [1, $output[$j]->[1], $q, $s]; + } + $state = 0; + } else { + die "identify_js_translatables internal error: Unknown state $state" + } + } + return \@output; +} + +############################################################################### + sub _extract_attributes ($;$) { my $this = shift; my($s, $lc) = @_; @@ -430,6 +536,7 @@ sub _next_token_intermediate { $this->_set_cdata_mode( 1 ); $this->_set_cdata_close( "" ); $this->_set_pcdata_mode( 0 ); + $this->_set_js_mode( lc($1) eq 'script' ); # } elsif ($it->string =~ /^<(title)\b/is) { # $this->_set_cdata_mode( 1 ); # $this->_set_cdata_close( "" ); @@ -470,8 +577,20 @@ sub _next_token_intermediate { $it = TmplToken->new( $it, ($this->pcdata_mode_p? TmplTokenType::TEXT: TmplTokenType::CDATA), - $this->line_number ) + $this->line_number, $this->filename ) if defined $it; + if ($this->js_mode_p) { + my $s0 = $it->string; + my @head = (); + my @tail = (); + if ($s0 =~ /^(\s*