Added hack to extract and translate strings inside JavaScript CDATA blocks,

using C-like _("some translatable string") notation. English templates will
need to be modified.
This commit is contained in:
acli 2004-03-10 07:00:27 +00:00
parent 424149c742
commit 2d132d2b6b
5 changed files with 204 additions and 2 deletions

View file

@ -109,6 +109,49 @@ sub set_form {
return $this;
}
# Predicate: true when this token carries JavaScript scanner data, i.e. when
# its '_js_data' slot is defined and holds an ARRAY reference.
sub has_js_data {
    my ($this) = @_;
    my $js_data = $this->{'_js_data'};
    return defined($js_data) && ref($js_data) eq 'ARRAY';
}
# Accessor: returns whatever is stored in the '_js_data' slot (undef when
# nothing has been stored).  Use has_js_data to check that it is an array
# reference before dereferencing it.
sub js_data {
    my ($this) = @_;
    return $this->{'_js_data'};
}
# Mutator: stores its argument in the '_js_data' slot and returns the
# object itself, so that calls can be chained.
sub set_js_data {
    my ($this, $js_data) = @_;
    $this->{'_js_data'} = $js_data;
    return $this;
}
# predefined tests
# Predicate: true when this token's type is TmplTokenType::TAG.
sub tag_p {
    my ($this) = @_;
    return $this->type == TmplTokenType::TAG;
}
# Predicate: true when this token's type is TmplTokenType::CDATA.
sub cdata_p {
    my ($this) = @_;
    return $this->type == TmplTokenType::CDATA;
}
# Predicate: true when this token's type is TmplTokenType::TEXT.
sub text_p {
    my ($this) = @_;
    return $this->type == TmplTokenType::TEXT;
}
# Predicate: true when this token's type is TmplTokenType::TEXT_PARAMETRIZED.
sub text_parametrized_p {
    my ($this) = @_;
    return $this->type == TmplTokenType::TEXT_PARAMETRIZED;
}
# Predicate: true when this token's type is TmplTokenType::DIRECTIVE.
sub directive_p {
    my ($this) = @_;
    return $this->type == TmplTokenType::DIRECTIVE;
}
###############################################################################
1;

View file

@ -93,6 +93,7 @@ sub LINENUM () {'lc'}
# Constant subs naming hash slots of the tokenizer object.  The accessors
# and setters in this file read and write them as $this->{+NAME}; the unary
# plus makes Perl call the constant sub instead of treating NAME as a
# bareword hash key.
sub CDATA_MODE_P () {'cdata-mode-p'}
sub CDATA_CLOSE () {'cdata-close'}
sub PCDATA_MODE_P () {'pcdata-mode-p'} # additional submode for CDATA
# JS_MODE_P is set when the tokenizer enters a <script> element.
sub JS_MODE_P () {'js-mode-p'} # cdata-mode-p must also be true
sub ALLOW_CFORMAT_P () {'allow-cformat-p'}
@ -169,6 +170,11 @@ sub pcdata_mode_p {
return $this->{+PCDATA_MODE_P};
}
# Accessor: returns the value stored in the JS_MODE_P slot, i.e. the flag
# recorded by _set_js_mode.
sub js_mode_p {
    my ($this) = @_;
    return $this->{+JS_MODE_P};
}
sub cdata_close {
my $this = shift;
return $this->{+CDATA_CLOSE};
@ -240,6 +246,12 @@ sub _set_pcdata_mode {
return $this;
}
# Private mutator: records the given flag in the JS_MODE_P slot and returns
# the object itself, like the other _set_* methods in this file.
sub _set_js_mode {
    my ($this, $flag) = @_;
    $this->{+JS_MODE_P} = $flag;
    return $this;
}
sub _set_cdata_close {
my $this = shift;
$this->{+CDATA_CLOSE} = $_[0];
@ -254,6 +266,100 @@ sub set_allow_cformat {
###############################################################################
use vars qw( $js_EscapeSequence );
BEGIN {
    # Regex source text (a plain string, interpolated into the StringLiteral
    # pattern of split_js) matching ONE JavaScript escape sequence: a
    # backslash followed by a single-character escape, an octal escape,
    # \xHH or \uHHHH.
    # Perl quoting is really screwed up, but this common subexp is way too long
    $js_EscapeSequence = q{\\\\(?:['"\\\\bfnrt]|[^0-7xu]|[0-3]?[0-7]{1,2}|x[\da-fA-F]{2}|u[\da-fA-F]{4})};
}

# Parentheses as constants, so that they can be compared against tokens
# without writing bare paren literals in the code.
sub parenleft () { '(' }
sub parenright () { ')' }

# split_js($source)
#
# Scanner (NOT a parser) that cuts a piece of JavaScript source into a flat
# list of lexical tokens: runs of whitespace, comments, identifiers/keywords,
# numeric literals, punctuators and string literals.  Any character that
# fits none of these becomes a one-character token of its own.
#
# Parameter: $source - the JavaScript text; undef or '' yields an empty list.
# Returns:   the list of token strings, in order.  Nothing is dropped, so
#            join('', split_js($source)) eq $source always holds.
#
# Notes:
# - Numeric literals are tried before punctuators so that ".5" is a single
#   token, and hex/octal are tried before decimal so that the leading "0"
#   of "0x1F" or "017" is not consumed on its own.
# - Regular expression literals are not recognised; their pieces come out
#   as punctuator/identifier/unknown tokens.
sub split_js ($) {
    my ($s0) = @_;
    my @it = ();
    return @it unless defined $s0;
    my $len = length $s0;
    # Every pattern is anchored at the current scan position with \G, and
    # /gc advances pos($s0) on success while leaving it untouched on
    # failure.  This avoids $& and $' and the repeated copying of the
    # remaining input.
    while ((pos($s0) || 0) < $len) {
        my $matched =
            # whitespace
               $s0 =~ /\G(\s+)/gcs
            # C++-style comment, up to and including the line terminator
            || $s0 =~ m{\G(//[^\r\n]*(?:[\r\n]|\z))}gcs
            # C-style comment
            || $s0 =~ m{\G(/\*(?:(?!\*/).)*\*/)}gcs
            # Keyword or identifier (ECMA-262 IdentifierName)
            || $s0 =~ /\G([A-Z_\$][A-Z\d_\$]*)/gcis
            # HexIntegerLiteral
            || $s0 =~ /\G(0[xX][\da-fA-F]+)/gcs
            # OctalIntegerLiteral
            || $s0 =~ /\G(0[0-7]+)/gcs
            # DecimalLiteral: 0, 7, 42, 1.5, 3., .5, 1e10, 2.5E-3
            || $s0 =~ /\G((?:(?:0|[1-9]\d*)(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?)/gcs
            # Punctuator; longer operators must come before their prefixes
            || $s0 =~ /\G([\(\){}\[\];]|>>>=|===|!==|<<=|>>=|[-\+\*\/\&\|\^\%]=|>>>|<<|>>|--|\+\+|\|\||\&\&|==|<=|>=|!=|[=><,!~\?:\.\-\+\*\/\&\|\^\%])/gcs
            # StringLiteral, double- or single-quoted
            # XXX SourceCharacter doesn't seem to be defined (?)
            || $s0 =~ /\G("(?:(?!["\\\r\n]).|$js_EscapeSequence)*"|'(?:(?!['\\\r\n]).|$js_EscapeSequence)*')/gcos
            # UNKNOWN TOKEN !!! -- consume one character so we always advance
            || $s0 =~ /\G(.)/gcs;
        last unless $matched; # cannot happen; guards against an endless loop
        push @it, $1;
    }
    return @it;
}
# States of the little recogniser in identify_js_translatables; state 0
# means "nothing recognised yet".
sub STATE_UNDERSCORE () { 1 }       # just saw "_"
sub STATE_PARENLEFT () { 2 }        # just saw "_" "("
sub STATE_STRING_LITERAL () { 3 }   # just saw "_" "(" StringLiteral

# XXX This is a crazy hack. I don't want to write an ECMAScript parser.
# XXX A scanner is one thing; a parser another thing.
#
# identify_js_translatables(@tokens)
#
# Takes a list of JavaScript tokens (as produced by split_js) and marks the
# string literals that are wrapped in the C-like gettext notation
# _("literal").
#
# Returns a reference to an array with exactly one entry per input token:
#   [0, $token]                   for an ordinary token
#   [1, $token, $quote, $text]    for a translatable string literal, where
#                                 $quote is the quote character used and
#                                 $text is the raw text between the quotes
#                                 (escape sequences are NOT decoded).
sub identify_js_translatables (@) {
    my @input = @_;
    my @output = ();
    # We mark a JavaScript translatable string as in C, i.e., _("literal")
    # For simplicity, we ONLY look for "_" "(" StringLiteral ")"
    my $state = 0;
    my ($j, $q, $s); # index, quote char and text of the pending literal
    for my $token (@input) {
        push @output, [0, $token];
        # Whitespace-only tokens never affect the recogniser.
        next if $token !~ /\S/s;
        if ($state == STATE_UNDERSCORE() && $token eq '(') {
            $state = STATE_PARENLEFT();
        } elsif ($state == STATE_PARENLEFT() && $token =~ /^(['"])(.*)\1$/s) {
            ($state, $j, $q, $s) = (STATE_STRING_LITERAL(), $#output, $1, $2);
        } else {
            # Either the pattern has just been completed by ")" or the
            # token does not continue it.
            if ($state == STATE_STRING_LITERAL() && $token eq ')') {
                $output[$j] = [1, $output[$j]->[1], $q, $s];
            }
            # Start over.  The current token may itself open a new pattern:
            # without this check the inner call in _(_("x")) is missed.
            $state = $token eq '_' ? STATE_UNDERSCORE() : 0;
        }
    }
    return \@output;
}
###############################################################################
sub _extract_attributes ($;$) {
my $this = shift;
my($s, $lc) = @_;
@ -430,6 +536,7 @@ sub _next_token_intermediate {
$this->_set_cdata_mode( 1 );
$this->_set_cdata_close( "</$1\\s*>" );
$this->_set_pcdata_mode( 0 );
$this->_set_js_mode( lc($1) eq 'script' );
# } elsif ($it->string =~ /^<(title)\b/is) {
# $this->_set_cdata_mode( 1 );
# $this->_set_cdata_close( "</$1\\s*>" );
@ -470,8 +577,20 @@ sub _next_token_intermediate {
$it = TmplToken->new( $it,
($this->pcdata_mode_p?
TmplTokenType::TEXT: TmplTokenType::CDATA),
$this->line_number )
$this->line_number, $this->filename )
if defined $it;
if ($this->js_mode_p) {
my $s0 = $it->string;
my @head = ();
my @tail = ();
if ($s0 =~ /^(\s*<!--\s*)(.*)(\s*--\s*>\s*)$/s) {
push @head, $1;
push @tail, $3;
$s0 = $2;
}
push @head, split_js $s0;
$it->set_js_data( identify_js_translatables(@head, @tail) );
}
$this->_set_pcdata_mode, 0;
$this->_set_cdata_close, undef unless !defined $it;
}

View file

@ -60,6 +60,12 @@ sub debug_dump ($) { # for testing only
$i += 1;
}
}
if ($s->has_js_data) {
printf "JavaScript translatable strings:\n";
for my $t (@{$s->js_data}) {
printf "%dH%s\n", length $t->[3], underline $t->[3] if $t->[0]; # FIXME
}
}
}
}
@ -88,6 +94,10 @@ sub text_extract ($) {
$text{$val} = 1 if $val =~ /\S/s;
}
}
} elsif ($s->has_js_data) {
for my $t (@{$s->js_data}) {
remember( $s, $t->[3] ) if $t->[0]; # FIXME
}
}
}
# Emit all extracted strings.

View file

@ -97,6 +97,16 @@ sub text_replace (**) {
text_replace_tag($t, $attr): $t });
} elsif ($kind eq TmplTokenType::TAG && %$attr) {
print $output text_replace_tag($t, $attr);
} elsif ($s->has_js_data) {
for my $t (@{$s->js_data}) {
# FIXME for this whole block
if ($t->[0]) {
printf $output "%s%s%s", $t->[2], find_translation $t->[3],
$t->[2];
} else {
print $output $t->[1];
}
}
} elsif (defined $t) {
print $output $t;
}

View file

@ -107,6 +107,10 @@ sub text_extract (*) {
remember( $s, $val ) if $val =~ /\S/s;
}
}
} elsif ($s->has_js_data) {
for my $t (@{$s->js_data}) {
remember( $s, $t->[3] ) if $t->[0]; # FIXME
}
}
}
}
@ -198,6 +202,9 @@ EOF
. (defined $type? " type=$type->[1]": '')
. (defined $name? " name=$name->[1]": '');
}
} elsif ($text{$t}->[0]->has_js_data) {
printf OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1;
printf OUTPUT "#. SCRIPT\n";
}
my $cformat_p;
for my $token (@{$text{$t}}) {
@ -376,7 +383,6 @@ A gettext-like format provides the following advantages:
=item -
(Future goal)
Translation to non-English-like languages with different word
order: gettext's c-format strings can theoretically be
emulated if we are able to do some analysis on the .tmpl input
@ -417,6 +423,20 @@ files (passed to -f) can be generated thus:
This is, however, quite pointless, because the "create" and
"update" actions have already been implemented in tmpl_process3.pl.
=head2 Strings inside JavaScript
Inside SCRIPT elements, this program scans the JavaScript code for
_("I<string literal>") patterns and extracts each I<string literal>
as a translatable string.
Note that the C-like _(...) notation is required.
The JavaScript must actually define a _ function
so that the code remains correct JavaScript.
A suitable definition of such a function can be
function _(s) { return s } // dummy function for gettext
=head1 SEE ALSO
tmpl_process.pl,