Added hack to extract and translate strings inside JavaScript CDATA blocks,

using C-like _("some translatable string") notation. English templates will need to be modified.
2004-03-10 07:00:27 +00:00 · 2004-03-10 07:00:27 +00:00 · 2d132d2b6b
commit 2d132d2b6b
parent 424149c742
5 changed files with 204 additions and 2 deletions
--- a/misc/translator/TmplToken.pm
+++ b/misc/translator/TmplToken.pm
@ -109,6 +109,49 @@ sub set_form {
    return $this;
 }

+# Report whether JavaScript scanner data is attached to this token, i.e.
+# whether an array reference is stored under the '_js_data' key.
+sub has_js_data {
+    my ($self) = @_;
+    my $js = $self->{'_js_data'};
+    return defined $js && ref($js) eq 'ARRAY';
+}
+
+# Accessor: hand back whatever is stored under '_js_data' (undef if unset).
+sub js_data {
+    my ($self) = @_;
+    my $stored = $self->{'_js_data'};
+    return $stored;
+}
+
+# Mutator: store the first argument under '_js_data' and return the
+# object itself so that calls can be chained.
+sub set_js_data {
+    my ($self, $data) = @_;
+    $self->{'_js_data'} = $data;
+    return $self;
+}
+
+# predefined tests
+
+# Predicate: is this token's type TmplTokenType::TAG?
+sub tag_p {
+    my ($self) = @_;
+    my $kind = $self->type;
+    return $kind == TmplTokenType::TAG;
+}
+
+# Predicate: is this token's type TmplTokenType::CDATA?
+sub cdata_p {
+    my ($self) = @_;
+    my $kind = $self->type;
+    return $kind == TmplTokenType::CDATA;
+}
+
+# Predicate: is this token's type TmplTokenType::TEXT?
+sub text_p {
+    my ($self) = @_;
+    my $kind = $self->type;
+    return $kind == TmplTokenType::TEXT;
+}
+
+# Predicate: is this token's type TmplTokenType::TEXT_PARAMETRIZED?
+sub text_parametrized_p {
+    my ($self) = @_;
+    my $kind = $self->type;
+    return $kind == TmplTokenType::TEXT_PARAMETRIZED;
+}
+
+# Predicate: is this token's type TmplTokenType::DIRECTIVE?
+sub directive_p {
+    my ($self) = @_;
+    my $kind = $self->type;
+    return $kind == TmplTokenType::DIRECTIVE;
+}
+
 ###############################################################################

 1;
--- a/misc/translator/TmplTokenizer.pm
+++ b/misc/translator/TmplTokenizer.pm
@ -93,6 +93,7 @@ sub LINENUM		() {'lc'}
 sub CDATA_MODE_P	() {'cdata-mode-p'}
 sub CDATA_CLOSE		() {'cdata-close'}
 sub PCDATA_MODE_P	() {'pcdata-mode-p'}	# additional submode for CDATA
+sub JS_MODE_P		() {'js-mode-p'}	# hash key: true when the element that opened CDATA mode is SCRIPT; cdata-mode-p must also be true

 sub ALLOW_CFORMAT_P	() {'allow-cformat-p'}

@ -169,6 +170,11 @@ sub pcdata_mode_p {
    return $this->{+PCDATA_MODE_P};
 }

+# Accessor for the JavaScript-mode flag stored under the JS_MODE_P key.
+sub js_mode_p {
+    my ($self) = @_;
+    my $flag = $self->{+JS_MODE_P};
+    return $flag;
+}
+
 sub cdata_close {
    my $this = shift;
    return $this->{+CDATA_CLOSE};
@ -240,6 +246,12 @@ sub _set_pcdata_mode {
    return $this;
 }

+# Private mutator: record the JavaScript-mode flag (first argument) under
+# the JS_MODE_P key and return the object for chaining.
+sub _set_js_mode {
+    my ($self, $flag) = @_;
+    $self->{+JS_MODE_P} = $flag;
+    return $self;
+}
+
 sub _set_cdata_close {
    my $this = shift;
    $this->{+CDATA_CLOSE} = $_[0];
@ -254,6 +266,100 @@ sub set_allow_cformat {

 ###############################################################################

+use vars qw( $js_EscapeSequence );
+BEGIN {
+    # Perl quoting is really screwed up, but this common subexp is way too long
+    #
+    # Regex source text for one escape sequence inside a JavaScript string
+    # literal; it is interpolated into the StringLiteral pattern of split_js.
+    # Inside q{} every doubled backslash collapses to one, so the regex
+    # engine sees "\\" (a literal backslash) followed by one of: a quote,
+    # backslash or one of b f n r t; any character that is not an octal
+    # digit, "x" or "u"; an octal escape; xHH; or uHHHH.
+    $js_EscapeSequence = q{\\\\(?:['"\\\\bfnrt]|[^0-7xu]|[0-3]?[0-7]{1,2}|x[\da-fA-F]{2}|u[\da-fA-F]{4})};
+}
+sub parenleft  () { '(' }	# the "(" token that identify_js_translatables expects right after "_"
+sub parenright () { ')' }	# the ")" token that identify_js_translatables expects after the string literal
+
+# Split a chunk of JavaScript source into a flat list of lexical tokens.
+#
+# Takes one string and returns a list of strings: whitespace runs,
+# comments, identifiers, numeric literals, string literals, punctuators,
+# and (as a last resort) single unrecognized characters.  No input is
+# dropped, so join('', split_js($s)) reproduces $s exactly.
+#
+# Ordering notes:
+#   - Hex and octal literals are tried before decimal ones; otherwise the
+#     leading "0" of "0x1F" or "017" would be consumed on its own.
+#   - Numeric literals are tried before punctuators so that ".5" stays a
+#     single token instead of "." followed by "5".
+#
+# Limitation: regular-expression literals are not recognized; they are
+# broken up into punctuator/identifier/unknown tokens.
+#
+# The scanner walks the string with \G and /gc instead of $& and $', so
+# the remaining input is not copied after every token.
+sub split_js ($) {
+    my ($s0) = @_;
+    my @it = ();
+    return @it unless defined $s0 && length $s0;
+    pos($s0) = 0;
+    while (pos($s0) < length $s0) {
+        if ($s0 =~ /\G(\s+)/gcs) {                              # whitespace
+            push @it, $1;
+        } elsif ($s0 =~ /\G(\/\/[^\r\n]*(?:[\r\n]|$))/gcs) {    # C++-style comment
+            push @it, $1;
+        } elsif ($s0 =~ /\G(\/\*(?:(?!\*\/).)*\*\/)/gcs) {      # C-style comment
+            push @it, $1;
+        # Keyword or identifier, ECMA-262 p.13 (section 7.5)
+        } elsif ($s0 =~ /\G([A-Z_\$][A-Z\d_\$]*)/gcis) {        # IdentifierName
+            push @it, $1;
+        # HexIntegerLiteral, ECMA-262 p.15 (section 7.7.3)
+        } elsif ($s0 =~ /\G(0[xX][\da-fA-F]+)/gcs) {
+            push @it, $1;
+        # OctalIntegerLiteral, ECMA-262 p.15 (section 7.7.3)
+        } elsif ($s0 =~ /\G(0[0-7]+)/gcs) {
+            push @it, $1;
+        # DecimalLiteral, ECMA-262 p.14 (section 7.7.3): integer part with
+        # optional fraction, or a bare fraction, then an optional exponent
+        } elsif ($s0 =~ /\G((?:(?:0|[1-9]\d*)(?:\.\d*)?|\.\d+)(?:[eE][-\+]?\d+)?)/gcs) {
+            push @it, $1;
+        # StringLiteral, ECMA-262 p.17 (section 7.7.4)
+        # XXX SourceCharacter doesn't seem to be defined (?)
+        } elsif ($s0 =~ /\G("(?:(?!["\\\r\n]).|$js_EscapeSequence)*"|'(?:(?!['\\\r\n]).|$js_EscapeSequence)*')/gcos) {
+            push @it, $1;
+        # Punctuator, ECMA-262 p.13 (section 7.6)
+        } elsif ($s0 =~ /\G([\(\){}\[\];]|>>>=|<<=|>>=|[-\+\*\/\&\|\^\%]=|>>>|<<|>>|--|\+\+|\|\||\&\&|==|<=|>=|!=|[=><,!~\?:\.\-\+\*\/\&\|\^\%])/gcs) {
+            push @it, $1;
+        } elsif ($s0 =~ /\G(.)/gcs) {                           # UNKNOWN TOKEN !!!
+            push @it, $1;
+        } else {
+            last;       # cannot happen (the pattern above always matches); guards against looping
+        }
+    }
+    return @it;
+}
+
+sub STATE_UNDERSCORE     () { 1 }	# identify_js_translatables: last significant token was "_"
+sub STATE_PARENLEFT      () { 2 }	# identify_js_translatables: seen "_" "("
+sub STATE_STRING_LITERAL () { 3 }	# identify_js_translatables: seen "_" "(" StringLiteral, ")" expected
+
+# XXX This is a crazy hack. I don't want to write an ECMAScript parser.
+# XXX A scanner is one thing; a parser another thing.
+#
+# Mark the translatable string literals in a list of JavaScript tokens
+# (as produced by split_js).
+#
+# We mark a JavaScript translatable string as in C, i.e., _("literal")
+# For simplicity, we ONLY look for "_" "(" StringLiteral ")"
+# Whitespace-only tokens between those four tokens are ignored; any other
+# token (including a comment) aborts the match.
+#
+# Returns a reference to an array with one entry per input token:
+#   [0, $token]                  ordinary token
+#   [1, $token, $quote, $inner]  translatable string literal, where $quote
+#                                is the quote character used and $inner is
+#                                the text between the quotes, exactly as
+#                                written (escape sequences are not decoded)
+sub identify_js_translatables (@) {
+    my @input = @_;
+    my @output = ();
+    my $state = 0;              # 0 = idle, otherwise one of the STATE_* values
+    my ($j, $q, $s);            # pending literal: index in @output, quote, inner text
+    for my $token (@input) {
+        push @output, [0, $token];
+        next if $token !~ /\S/s;        # whitespace never affects the state
+        if ($state == STATE_UNDERSCORE) {
+            $state = $token eq parenleft ? STATE_PARENLEFT : 0;
+        } elsif ($state == STATE_PARENLEFT) {
+            if ($token =~ /^(['"])(.*)\1$/s) {
+                ($state, $j, $q, $s) = (STATE_STRING_LITERAL, $#output, $1, $2);
+            } else {
+                $state = 0;
+            }
+        } elsif ($state == STATE_STRING_LITERAL) {
+            # Only now, with the closing parenthesis seen, is the literal
+            # known to be translatable; upgrade its entry in place.
+            if ($token eq parenright) {
+                $output[$j] = [1, $output[$j]->[1], $q, $s];
+            }
+            $state = 0;
+        } elsif ($state != 0) {
+            die "identify_js_translatables internal error: Unknown state $state";
+        }
+        # A "_" that was seen while idle, or that just aborted a partial
+        # match, may itself start a new _( ... ) sequence.
+        $state = STATE_UNDERSCORE if $state == 0 && $token eq '_';
+    }
+    return \@output;
+}
+
+###############################################################################
+
 sub _extract_attributes ($;$) {
    my $this = shift;
    my($s, $lc) = @_;
@ -430,6 +536,7 @@ sub _next_token_intermediate {
 		$this->_set_cdata_mode( 1 );
 		$this->_set_cdata_close( "</$1\\s*>" );
 		$this->_set_pcdata_mode( 0 );
+		$this->_set_js_mode( lc($1) eq 'script' );
 #	    } elsif ($it->string =~ /^<(title)\b/is) {
 #		$this->_set_cdata_mode( 1 );
 #		$this->_set_cdata_close( "</$1\\s*>" );
@ -470,8 +577,20 @@ sub _next_token_intermediate {
 	$it = TmplToken->new( $it,
 			($this->pcdata_mode_p?
 			    TmplTokenType::TEXT: TmplTokenType::CDATA),
-			$this->line_number )
+			$this->line_number, $this->filename )
 		if defined $it;
+	if ($this->js_mode_p) {
+	    my $s0 = $it->string;
+	    my @head = ();
+	    my @tail = ();
+	    if ($s0 =~ /^(\s*<!--\s*)(.*)(\s*--\s*>\s*)$/s) {
+		push @head, $1;
+		push @tail, $3;
+		$s0 = $2;
+	    }
+	    push @head, split_js $s0;
+	    $it->set_js_data( identify_js_translatables(@head, @tail) );
+	}
 	$this->_set_pcdata_mode, 0;
 	$this->_set_cdata_close, undef unless !defined $it;
    }
--- a/misc/translator/text-extract2.pl
+++ b/misc/translator/text-extract2.pl
@ -60,6 +60,12 @@ sub debug_dump ($) { # for testing only
 		$i += 1;
 	    }
 	}
+	if ($s->has_js_data) {
+	    printf "JavaScript translatable strings:\n";
+	    for my $t (@{$s->js_data}) {
+		printf "%dH%s\n", length $t->[3], underline $t->[3] if $t->[0]; # FIXME
+	    }
+	}
    }
 }

@ -88,6 +94,10 @@ sub text_extract ($) {
 		    $text{$val} = 1 if $val =~ /\S/s;
 		}
 	    }
+	} elsif ($s->has_js_data) {
+	    for my $t (@{$s->js_data}) {
+		remember( $s, $t->[3] ) if $t->[0]; # FIXME
+	    }
 	}
    }
    # Emit all extracted strings.
--- a/misc/translator/tmpl_process3.pl
+++ b/misc/translator/tmpl_process3.pl
@ -97,6 +97,16 @@ sub text_replace (**) {
 		    text_replace_tag($t, $attr): $t });
 	} elsif ($kind eq TmplTokenType::TAG && %$attr) {
 	    print $output text_replace_tag($t, $attr);
+	} elsif ($s->has_js_data) {
+	    for my $t (@{$s->js_data}) {
+		# FIXME for this whole block
+		if ($t->[0]) {
+		    printf $output "%s%s%s", $t->[2], find_translation $t->[3],
+			    $t->[2];
+		} else {
+		    print $output $t->[1];
+		}
+	    }
 	} elsif (defined $t) {
 	    print $output $t;
 	}
--- a/misc/translator/xgettext.pl
+++ b/misc/translator/xgettext.pl
@ -107,6 +107,10 @@ sub text_extract (*) {
 		    remember( $s, $val ) if $val =~ /\S/s;
 		}
 	    }
+	} elsif ($s->has_js_data) {
+	    for my $t (@{$s->js_data}) {
+		remember( $s, $t->[3] ) if $t->[0]; # FIXME
+	    }
 	}
    }
 }
@ -198,6 +202,9 @@ EOF
 		    . (defined $type? " type=$type->[1]": '')
 		    . (defined $name? " name=$name->[1]": '');
 	    }
+	} elsif ($text{$t}->[0]->has_js_data) {
+	    printf OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1;
+	    printf OUTPUT "#. SCRIPT\n";
 	}
 	my $cformat_p;
 	for my $token (@{$text{$t}}) {
@ -376,7 +383,6 @@ A gettext-like format provides the following advantages:

 =item -

-(Future goal)
 Translation to non-English-like languages with different word
 order:  gettext's c-format strings can theoretically be
 emulated if we are able to do some analysis on the .tmpl input
@ -417,6 +423,20 @@ files (passed to -f) can be generated thus:
 This is, however, quite pointless, because the "create" and
 "update" actions have already been implemented in tmpl_process3.pl.

+=head2 Strings inside JavaScript
+
+Inside SCRIPT elements, xgettext.pl scans for
+_("I<string literal>") patterns and extracts each I<string literal>
+as a translatable string.
+The literal may be enclosed in either double or single quotes.
+
+Note that the C-like _(...) notation is required.
+
+The JavaScript must actually define a _ function
+so that the code remains correct JavaScript.
+A suitable definition of such a function can be
+
+	function _(s) { return s } // dummy function for gettext
+
 =head1 SEE ALSO

 tmpl_process.pl,