Added hack to extract and translate strings inside JavaScript CDATA blocks,
using C-like _("some translatable string") notation. English templates will need to be modified.
This commit is contained in:
parent
424149c742
commit
2d132d2b6b
5 changed files with 204 additions and 2 deletions
|
@ -109,6 +109,49 @@ sub set_form {
|
|||
return $this;
|
||||
}
|
||||
|
||||
# Tell whether this token carries JavaScript scan data.
# Returns true only when the '_js_data' slot is defined and holds an
# (unblessed) array reference; returns false otherwise.
sub has_js_data {
    my ($this) = @_;
    my $js_data = $this->{'_js_data'};
    return defined $js_data && ref($js_data) eq 'ARRAY';
}
|
||||
|
||||
# Accessor: return the contents of the '_js_data' slot exactly as stored
# (undef when nothing has been stored).
sub js_data {
    my ($this) = @_;
    return $this->{'_js_data'};
}
|
||||
|
||||
# Mutator: store the given value in the '_js_data' slot.
# Returns the object itself so that calls can be chained.
sub set_js_data {
    my ($this, $js_data) = @_;
    $this->{'_js_data'} = $js_data;
    return $this;
}
|
||||
|
||||
# predefined tests
|
||||
|
||||
# Predicate: true when this token's type equals TmplTokenType::TAG.
sub tag_p {
    my ($this) = @_;
    return $this->type == TmplTokenType::TAG;
}
|
||||
|
||||
# Predicate: true when this token's type equals TmplTokenType::CDATA.
sub cdata_p {
    my ($this) = @_;
    return $this->type == TmplTokenType::CDATA;
}
|
||||
|
||||
# Predicate: true when this token's type equals TmplTokenType::TEXT.
sub text_p {
    my ($this) = @_;
    return $this->type == TmplTokenType::TEXT;
}
|
||||
|
||||
# Predicate: true when this token's type equals
# TmplTokenType::TEXT_PARAMETRIZED.
sub text_parametrized_p {
    my ($this) = @_;
    return $this->type == TmplTokenType::TEXT_PARAMETRIZED;
}
|
||||
|
||||
# Predicate: true when this token's type equals TmplTokenType::DIRECTIVE.
sub directive_p {
    my ($this) = @_;
    return $this->type == TmplTokenType::DIRECTIVE;
}
|
||||
|
||||
###############################################################################
|
||||
|
||||
1;
|
||||
|
|
|
@ -93,6 +93,7 @@ sub LINENUM () {'lc'}
|
|||
# Constant names for string keys.  Three of them (CDATA_CLOSE,
# PCDATA_MODE_P, JS_MODE_P) are used in this file as hash keys on the
# tokenizer object, e.g. $this->{+JS_MODE_P}.
sub CDATA_MODE_P    () { 'cdata-mode-p' }
sub CDATA_CLOSE     () { 'cdata-close' }
sub PCDATA_MODE_P   () { 'pcdata-mode-p' }    # additional submode for CDATA
sub JS_MODE_P       () { 'js-mode-p' }        # cdata-mode-p must also be true

sub ALLOW_CFORMAT_P () { 'allow-cformat-p' }
|
||||
|
||||
|
@ -169,6 +170,11 @@ sub pcdata_mode_p {
|
|||
return $this->{+PCDATA_MODE_P};
|
||||
}
|
||||
|
||||
# Accessor: return the value stored under the JS_MODE_P key of this object.
sub js_mode_p {
    my ($this) = @_;
    return $this->{+JS_MODE_P};
}
|
||||
|
||||
sub cdata_close {
|
||||
my $this = shift;
|
||||
return $this->{+CDATA_CLOSE};
|
||||
|
@ -240,6 +246,12 @@ sub _set_pcdata_mode {
|
|||
return $this;
|
||||
}
|
||||
|
||||
# Private mutator: store the given flag under the JS_MODE_P key.
# Returns the object itself so that calls can be chained.
sub _set_js_mode {
    my ($this, $flag) = @_;
    $this->{+JS_MODE_P} = $flag;
    return $this;
}
|
||||
|
||||
sub _set_cdata_close {
|
||||
my $this = shift;
|
||||
$this->{+CDATA_CLOSE} = $_[0];
|
||||
|
@ -254,6 +266,100 @@ sub set_allow_cformat {
|
|||
|
||||
###############################################################################
|
||||
|
||||
use vars qw( $js_EscapeSequence );
BEGIN {
    # Regex *source text* (interpolated into the string-literal pattern in
    # split_js) matching one backslash escape inside a JavaScript string:
    # a backslash followed by a simple escape character, any character other
    # than 0-7/x/u, an octal escape, \xHH, or \uHHHH.
    # Kept as a q{} string because the common subexpression is long and is
    # used twice; every "\\\\" below ends up as one literal backslash match.
    $js_EscapeSequence = q{\\\\(?:['"\\\\bfnrt]|[^0-7xu]|[0-3]?[0-7]{1,2}|x[\da-fA-F]{2}|u[\da-fA-F]{4})};
}

# Named constants for the two parenthesis tokens.
sub parenleft  () { '(' }
sub parenright () { ')' }

# split_js STRING
#
# Split a piece of JavaScript source into a flat list of lexical tokens:
# whitespace runs, comments, identifiers, numeric literals, punctuators and
# string literals.  Nothing is discarded: join('', split_js($s)) eq $s.
# A character that starts no known token is returned as a one-character
# token of its own.  Returns the empty list for an empty or undefined
# argument.
#
# Numeric literals are tried before punctuators, and hex/octal before
# decimal, so that "0x1F", "017", "0.5", "5.25", ".5" and "1e3" each come
# back as a single token.
#
# The input is scanned in place with \G ... /gc, which avoids $& and $'
# and the re-copying of the remaining input after every token.
sub split_js ($) {
    my ($s0) = @_;
    my @it = ();
    return @it unless defined $s0 && length $s0;

    pos($s0) = 0;
    while (pos($s0) < length $s0) {
        # Each pattern is anchored at the current position; /c keeps that
        # position when a pattern fails so the next alternative can try.
        $s0 =~ /\G(\s+)/gc                                  # whitespace
            # C++-style comment, including one terminating CR or LF if any
            or $s0 =~ /\G(\/\/[^\r\n]*[\r\n]?)/gc
            # C-style comment (must be terminated)
            or $s0 =~ /\G(\/\*.*?\*\/)/gcs
            # Keyword or identifier, ECMA-262 p.13 (section 7.5)
            or $s0 =~ /\G([A-Z_\$][A-Z\d_\$]*)/gci
            # HexIntegerLiteral, ECMA-262 p.15 (section 7.7.3)
            or $s0 =~ /\G(0[xX][\da-fA-F]+)/gc
            # OctalIntegerLiteral, ECMA-262 p.15 (section 7.7.3)
            or $s0 =~ /\G(0[0-7]+)/gc
            # DecimalLiteral, ECMA-262 p.14 (section 7.7.3)
            or $s0 =~ /\G((?:(?:0|[1-9]\d*)(?:\.\d*)?|\.\d+)(?:[eE][-\+]?\d+)?)/gc
            # Punctuator, ECMA-262 p.13 (section 7.6)
            or $s0 =~ /\G([\(\){}\[\];]|>>>=|<<=|>>=|[-\+\*\/\&\|\^\%]=|>>>|<<|>>|--|\+\+|\|\||\&\&|==|<=|>=|!=|[=><,!~\?:\.\-\+\*\/\&\|\^\%])/gc
            # StringLiteral, ECMA-262 p.17 (section 7.7.4)
            # XXX SourceCharacter doesn't seem to be defined (?)
            or $s0 =~ /\G("(?:[^"\\\r\n]|$js_EscapeSequence)*"|'(?:[^'\\\r\n]|$js_EscapeSequence)*')/gcos
            # UNKNOWN TOKEN !!!  Pass a single character through unchanged.
            or $s0 =~ /\G(.)/gcs
            # Cannot happen (the previous pattern matches any character),
            # but never loop forever if it somehow does.
            or last;
        push @it, $1;
    }
    return @it;
}
|
||||
|
||||
# States of the recogniser in identify_js_translatables; 0 is the idle state.
sub STATE_UNDERSCORE     () { 1 }    # just saw "_"
sub STATE_PARENLEFT      () { 2 }    # just saw "_" "("
sub STATE_STRING_LITERAL () { 3 }    # just saw "_" "(" StringLiteral

# XXX This is a crazy hack. I don't want to write an ECMAScript parser.
# XXX A scanner is one thing; a parser another thing.
#
# identify_js_translatables TOKENS
#
# Takes a list of JavaScript tokens and returns a reference to an array with
# exactly one entry per input token, in the same order:
#
#   [0, TOKEN]                 for an ordinary token
#   [1, TOKEN, QUOTE, TEXT]    for a translatable string literal, where
#                              QUOTE is its quote character (' or ") and
#                              TEXT is everything between the quotes
#
# We mark a JavaScript translatable string as in C, i.e., _("literal").
# For simplicity, we ONLY look for "_" "(" StringLiteral ")"; tokens that
# consist solely of whitespace (or are empty) are ignored while matching.
#
# Whenever a partial match is abandoned, the offending token is itself
# re-examined as a possible "_", so that input such as  _(_("x"))  or
# _ _("x")  still finds the inner literal.
sub identify_js_translatables (@) {
    my @input  = @_;
    my @output = ();
    my $state  = 0;
    my ($j, $q, $s);    # index, quote and text of the candidate literal

    for my $token (@input) {
        push @output, [0, $token];
        next if $token !~ /\S/s;    # whitespace never changes the state

        if ($state == STATE_UNDERSCORE && $token eq '(') {
            $state = STATE_PARENLEFT;
        } elsif ($state == STATE_PARENLEFT && $token =~ /^(['"])(.*)\1$/s) {
            ($state, $j, $q, $s) = (STATE_STRING_LITERAL, $#output, $1, $2);
        } else {
            # A closing parenthesis right after the literal completes the
            # pattern: upgrade the remembered entry in place.
            if ($state == STATE_STRING_LITERAL && $token eq ')') {
                $output[$j] = [1, $output[$j]->[1], $q, $s];
            }
            # In every other case start over, possibly on this very token.
            $state = $token eq '_' ? STATE_UNDERSCORE : 0;
        }
    }
    return \@output;
}
|
||||
|
||||
###############################################################################
|
||||
|
||||
sub _extract_attributes ($;$) {
|
||||
my $this = shift;
|
||||
my($s, $lc) = @_;
|
||||
|
@ -430,6 +536,7 @@ sub _next_token_intermediate {
|
|||
$this->_set_cdata_mode( 1 );
|
||||
$this->_set_cdata_close( "</$1\\s*>" );
|
||||
$this->_set_pcdata_mode( 0 );
|
||||
$this->_set_js_mode( lc($1) eq 'script' );
|
||||
# } elsif ($it->string =~ /^<(title)\b/is) {
|
||||
# $this->_set_cdata_mode( 1 );
|
||||
# $this->_set_cdata_close( "</$1\\s*>" );
|
||||
|
@ -470,8 +577,20 @@ sub _next_token_intermediate {
|
|||
$it = TmplToken->new( $it,
|
||||
($this->pcdata_mode_p?
|
||||
TmplTokenType::TEXT: TmplTokenType::CDATA),
|
||||
$this->line_number )
|
||||
$this->line_number, $this->filename )
|
||||
if defined $it;
|
||||
if ($this->js_mode_p) {
|
||||
my $s0 = $it->string;
|
||||
my @head = ();
|
||||
my @tail = ();
|
||||
if ($s0 =~ /^(\s*<!--\s*)(.*)(\s*--\s*>\s*)$/s) {
|
||||
push @head, $1;
|
||||
push @tail, $3;
|
||||
$s0 = $2;
|
||||
}
|
||||
push @head, split_js $s0;
|
||||
$it->set_js_data( identify_js_translatables(@head, @tail) );
|
||||
}
|
||||
$this->_set_pcdata_mode, 0;
|
||||
$this->_set_cdata_close, undef unless !defined $it;
|
||||
}
|
||||
|
|
|
@ -60,6 +60,12 @@ sub debug_dump ($) { # for testing only
|
|||
$i += 1;
|
||||
}
|
||||
}
|
||||
if ($s->has_js_data) {
|
||||
printf "JavaScript translatable strings:\n";
|
||||
for my $t (@{$s->js_data}) {
|
||||
printf "%dH%s\n", length $t->[3], underline $t->[3] if $t->[0]; # FIXME
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -88,6 +94,10 @@ sub text_extract ($) {
|
|||
$text{$val} = 1 if $val =~ /\S/s;
|
||||
}
|
||||
}
|
||||
} elsif ($s->has_js_data) {
|
||||
for my $t (@{$s->js_data}) {
|
||||
remember( $s, $t->[3] ) if $t->[0]; # FIXME
|
||||
}
|
||||
}
|
||||
}
|
||||
# Emit all extracted strings.
|
||||
|
|
|
@ -97,6 +97,16 @@ sub text_replace (**) {
|
|||
text_replace_tag($t, $attr): $t });
|
||||
} elsif ($kind eq TmplTokenType::TAG && %$attr) {
|
||||
print $output text_replace_tag($t, $attr);
|
||||
} elsif ($s->has_js_data) {
|
||||
for my $t (@{$s->js_data}) {
|
||||
# FIXME for this whole block
|
||||
if ($t->[0]) {
|
||||
printf $output "%s%s%s", $t->[2], find_translation $t->[3],
|
||||
$t->[2];
|
||||
} else {
|
||||
print $output $t->[1];
|
||||
}
|
||||
}
|
||||
} elsif (defined $t) {
|
||||
print $output $t;
|
||||
}
|
||||
|
|
|
@ -107,6 +107,10 @@ sub text_extract (*) {
|
|||
remember( $s, $val ) if $val =~ /\S/s;
|
||||
}
|
||||
}
|
||||
} elsif ($s->has_js_data) {
|
||||
for my $t (@{$s->js_data}) {
|
||||
remember( $s, $t->[3] ) if $t->[0]; # FIXME
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -198,6 +202,9 @@ EOF
|
|||
. (defined $type? " type=$type->[1]": '')
|
||||
. (defined $name? " name=$name->[1]": '');
|
||||
}
|
||||
} elsif ($text{$t}->[0]->has_js_data) {
|
||||
printf OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1;
|
||||
printf OUTPUT "#. SCRIPT\n";
|
||||
}
|
||||
my $cformat_p;
|
||||
for my $token (@{$text{$t}}) {
|
||||
|
@ -376,7 +383,6 @@ A gettext-like format provides the following advantages:
|
|||
|
||||
=item -
|
||||
|
||||
(Future goal)
|
||||
Translation to non-English-like languages with different word
|
||||
order: gettext's c-format strings can theoretically be
|
||||
emulated if we are able to do some analysis on the .tmpl input
|
||||
|
@ -417,6 +423,20 @@ files (passed to -f) can be generated thus:
|
|||
This is, however, quite pointless, because the "create" and
|
||||
"update" actions have already been implemented in tmpl_process3.pl.
|
||||
|
||||
=head2 Strings inside JavaScript
|
||||
|
||||
Inside SCRIPT elements, this program scans the JavaScript code for
_("I<string literal>") patterns (the literal may be single- or
double-quoted) and extracts the I<string literal>
as a translatable string.
|
||||
|
||||
Note that the C-like _(...) notation is required.
|
||||
|
||||
The JavaScript must actually define a _ function
|
||||
so that the code remains correct JavaScript.
|
||||
A suitable definition of such a function can be
|
||||
|
||||
function _(s) { return s } // dummy function for gettext
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
tmpl_process.pl,
|
||||
|
|
Loading…
Reference in a new issue