5 xgettext.pl - xgettext(1)-like interface for .tmpl strings extraction
17 use vars qw( $convert_from );
18 use vars qw( $files_from $directory $output $sort );
19 use vars qw( $extract_all_p );
20 use vars qw( $pedantic_p );
21 use vars qw( %text %translation );
22 use vars qw( $charset_in $charset_out );
23 use vars qw( $disable_fuzzy_p );
24 use vars qw( $verbose_p );
25 use vars qw( $po_mode_p );
27 ###############################################################################
# string_negligible_p( $t ): true when the plain string $t is not worth
# extracting for translation. Always false when $extract_all_p is set,
# because then every string is kept.
29 sub string_negligible_p ($) {
30 my($t) = @_; # a string
31 # Don't emit pure whitespace, pure numbers, pure punctuation,
32 # single letters, or TMPL_VAR's.
33 # Punctuation should arguably be translated, but without context
34 # it is untranslatable. Note that $t is a string, not a token object.
35 return !$extract_all_p && (
36 TmplTokenizer::blank_p($t) # blank or TMPL_VAR
37 || $t =~ /^\d+$/ # purely digits
38 || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
39 || $t =~ /^[A-Za-z]$/ # single letters
# token_negligible_p( token ): token-level counterpart of string_negligible_p.
# $x and $t are assigned on lines not shown here -- presumably the token
# argument and its type; confirm against the full file.
# Always false when $extract_all_p is set. Otherwise, by token type:
#   TEXT              - decided by string_negligible_p on the token's string
#   DIRECTIVE         - always negligible
#   TEXT_PARAMETRIZED - negligible only when every child is a TAG or is itself
#                       negligible; a DIRECTIVE child contributes '1' to the
#                       joined string and so makes the token non-negligible
#   anything else     - not negligible
43 sub token_negligible_p( $ ) {
46 return !$extract_all_p && (
47 $t == TmplTokenType::TEXT? string_negligible_p( $x->string ):
48 $t == TmplTokenType::DIRECTIVE? 1:
49 $t == TmplTokenType::TEXT_PARAMETRIZED
50 && join( '', map { my $t = $_->type;
51 $t == TmplTokenType::DIRECTIVE?
52 '1': $t == TmplTokenType::TAG?
53 '': token_negligible_p( $_ )?
54 '': '1' } @{$x->children} ) eq '' );
57 ###############################################################################
60 my($token, $string) = @_;
61 # If we determine that the string is negligible, don't bother to remember
62 unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
63 my $key = TmplTokenizer::string_canon( $string );
64 $text{$key} = [] unless defined $text{$key};
65 push @{$text{$key}}, $token;
69 ###############################################################################
73 # The real gettext tools seem to sort case sensitively; I don't know why
74 @t = sort { $a cmp $b } @t if $sort eq 's';
76 my @aa = sort { $a->pathname cmp $b->pathname
77 || $a->line_number <=> $b->line_number } @{$text{$a}};
78 my @bb = sort { $a->pathname cmp $b->pathname
79 || $a->line_number <=> $b->line_number } @{$text{$b}};
80 $aa[0]->pathname cmp $bb[0]->pathname
81 || $aa[0]->line_number <=> $bb[0]->line_number;
86 ###############################################################################
# text_extract( HANDLE ): read tokens from the tokenizer handle $h and pass
# every translatable string found to remember(). $h is set on a line not
# shown here -- presumably from the argument; confirm against the full file.
# Strings are taken from four places:
#   - TEXT tokens (the token string),
#   - TEXT_PARAMETRIZED tokens (the token's ->form),
#   - selected attributes of TAG tokens,
#   - JavaScript data attached to a token (->js_data).
# Only values containing at least one non-whitespace character are remembered
# for the first three; js_data entries are remembered when their element [0]
# is true.
88 sub text_extract (*) {
91 my $s = TmplTokenizer::next_token $h;
# next_token returning undef ends the scan.
92 last unless defined $s;
93 my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
# NOTE(review): the type is compared with 'eq' here, while token_negligible_p
# and generate_po_file compare token types with '=='; confirm which is intended.
94 if ($kind eq TmplTokenType::TEXT) {
95 remember( $s, $t ) if $t =~ /\S/s;
96 } elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
97 remember( $s, $s->form ) if $s->form =~ /\S/s;
98 } elsif ($kind eq TmplTokenType::TAG && %$attr) {
99 # value [tag=input], meta
# NOTE(review): 'my $x = ... if COND' has undefined behaviour per perlsyn;
# $tag is only assigned when $t starts with '<' followed by non-space.
100 my $tag = lc($1) if $t =~ /^<(\S+)/s;
101 for my $a ('alt', 'content', 'title', 'value','label') {
# Attribute filters: 'label' only on <optgroup>, 'content' only on <meta>,
# 'value' only on <input> whose type is not hidden, radio or checkbox.
103 next if $a eq 'label' && $tag ne 'optgroup';
104 next if $a eq 'content' && $tag ne 'meta';
105 next if $a eq 'value' && ($tag ne 'input'
106 || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME
# Each attribute entry is an array ref unpacked as (key, value, original
# value, order); only the value is used below.
107 my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
108 $val = TmplTokenizer::trim $val;
109 remember( $s, $val ) if $val =~ /\S/s;
112 } elsif ($s->has_js_data) {
113 for my $t (@{$s->js_data}) {
114 remember( $s, $t->[3] ) if $t->[0]; # FIXME
120 ###############################################################################
# generate_strings_list(): write every string returned by string_list to the
# OUTPUT filehandle, one per line, with no PO markup.
122 sub generate_strings_list () {
123 # Emit all extracted strings.
124 for my $t (string_list) {
125 printf OUTPUT "%s\n", $t;
129 ###############################################################################
# generate_po_file(): write the extracted strings to OUTPUT in gettext PO/POT
# format. A header block is printed first (the part starting at
# "Project-Id-Version" is skipped when $disable_fuzzy_p is set). Then, for
# each string from string_list, it prints:
#   "#."  extracted comments describing the first occurrence (parameters of a
#         TEXT_PARAMETRIZED token, the tag of a TAG token, or SCRIPT),
#   "#:"  one source location (pathname:line) per occurrence,
#   "#, c-format" when any occurrence is TEXT_PARAMETRIZED,
#   msgid / msgstr, with msgstr taken from %translation when defined.
131 sub generate_po_file () {
132 # We don't emit the Plural-Forms header; it's meaningless for us
# 'CHARSET' is the placeholder used when no output charset is known.
133 my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
134 $pot_charset = TmplTokenizer::charset_canon $pot_charset;
135 # Time stamps aren't exactly right semantically. I don't know how to fix it.
136 my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
137 my $time_pot = $time;
138 my $time_po = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
140 # SOME DESCRIPTIVE TITLE.
141 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
142 # This file is distributed under the same license as the PACKAGE package.
143 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
146 print OUTPUT <<EOF unless $disable_fuzzy_p;
152 "Project-Id-Version: PACKAGE VERSION\\n"
153 "POT-Creation-Date: $time_pot\\n"
154 "PO-Revision-Date: $time_po\\n"
155 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
156 "Language-Team: LANGUAGE <LL\@li.org>\\n"
157 "MIME-Version: 1.0\\n"
158 "Content-Type: text/plain; charset=$pot_charset\\n"
159 "Content-Transfer-Encoding: 8bit\\n"
162 my $directory_re = quotemeta("$directory/");
# $directory_re is used further down to strip the search directory prefix
# from pathnames in "#:" location lines.
163 for my $t (string_list) {
# Only the first recorded occurrence of a string drives the "#." comments.
164 if ($text{$t}->[0]->type == TmplTokenType::TEXT_PARAMETRIZED) {
165 my($token, $n) = ($text{$t}->[0], 0);
166 printf OUTPUT "#. For the first occurrence,\n"
167 if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
168 for my $param ($token->parameters_and_fields) {
170 my $type = $param->type;
# $subtype is the 'type' attribute value for <input> tags, undef otherwise.
171 my $subtype = ($type == TmplTokenType::TAG
172 && $param->string =~ /^<input\b/is?
173 $param->attributes->{'type'}->[1]: undef);
174 my $fmt = TmplTokenizer::_formalize( $param );
176 if ($type == TmplTokenType::DIRECTIVE) {
177 $type = $param->string =~ /(TMPL_[A-Z]+)+/is? $1: 'ERROR';
178 my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
180 printf OUTPUT "#. %s: %s\n", $fmt,
181 "$type" . (defined $name? " name=$name": '');
183 my $name = $param->attributes->{'name'};
# NOTE(review): 'my ... unless COND' has undefined behaviour per perlsyn, and
# $subtype may be undef here when the parameter is not an <input> tag.
184 my $value = $param->attributes->{'value'}
185 unless $subtype =~ /^(?:text)$/;
186 printf OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
187 . (defined $name? " name=$name->[1]": '')
188 . (defined $value? " value=$value->[1]": '');
191 } elsif ($text{$t}->[0]->type == TmplTokenType::TAG) {
192 my($token) = ($text{$t}->[0]);
193 printf OUTPUT "#. For the first occurrence,\n"
194 if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
195 if ($token->string =~ /^<meta\b/is) {
196 my $type = $token->attributes->{'http-equiv'}->[1];
197 print OUTPUT "#. META http-equiv=$type\n" if defined $type;
198 } elsif ($token->string =~ /^<([a-z0-9]+)/is) {
# $tag is assigned on a line not shown here -- presumably from the capture
# above; confirm against the full file.
200 my $type = (lc($tag) eq 'input'?
201 $token->attributes->{'type'}: undef);
202 my $name = $token->attributes->{'name'};
203 printf OUTPUT "#. %s\n", $tag
204 . (defined $type? " type=$type->[1]": '')
205 . (defined $name? " name=$name->[1]": '');
207 } elsif ($text{$t}->[0]->has_js_data) {
208 printf OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1;
209 printf OUTPUT "#. SCRIPT\n";
# One "#:" reference per occurrence; the pathname is shortened by removing the
# directory prefix and anything up to and including "/koha-tmpl/".
212 for my $token (@{$text{$t}}) {
213 my $pathname = $token->pathname;
214 $pathname =~ s/^$directory_re//os;
215 $pathname =~ s/^.*\/koha-tmpl\/(.*)$/$1/;
216 printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number
217 if defined $pathname && defined $token->line_number;
218 $cformat_p = 1 if $token->type == TmplTokenType::TEXT_PARAMETRIZED;
220 printf OUTPUT "#, c-format\n" if $cformat_p;
# The msgid is charset-converted, canonicalized, then PO-quoted (the calls
# nest right to left).
221 printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po
222 TmplTokenizer::string_canon
223 TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
224 printf OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
225 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
229 ###############################################################################
# convert_translation_file(): read the tab-separated "msgid<TAB>msgstr" file
# named by $convert_from and load it into the extraction tables. Each msgid is
# passed to remember() with a synthetic UNKNOWN token, and its msgstr is stored
# in %translation unless it is the placeholder '*****'.
# Input and output charsets are picked up from "charset=..." found in a msgid
# or msgstr; a value conflicting with one already known is fatal. If no input
# charset is found, both charsets default to utf-8 with a warning.
# Dies if the file cannot be opened or a line has no tab.
# (The line-reading loop header is on lines not shown here.)
231 sub convert_translation_file () {
# NOTE(review): two-argument open on a bareword handle; the mode is part of
# the interpolated string.
232 open(INPUT, "<$convert_from") || die "$convert_from: $!\n";
233 VerboseWarnings::set_input_file_name $convert_from;
236 my($msgid, $msgstr) = split(/\t/);
237 die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
238 unless defined $msgstr;
240 # Fixup some of the bad strings
241 $msgid =~ s/^SELECTED>//;
244 my $token = TmplToken->new( $msgid, TmplTokenType::UNKNOWN, undef, undef );
245 remember( $token, $msgid );
246 $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
247 $translation{$msgid} = $msgstr unless $msgstr eq '*****';
# A charset declared in the original string sets the input charset.
249 if ($msgid =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
250 my $candidate = TmplTokenizer::charset_canon $2;
251 die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
252 if defined $charset_in && $charset_in ne $candidate;
253 $charset_in = $candidate;
# A charset declared in the translation sets the output charset.
255 if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
256 my $candidate = TmplTokenizer::charset_canon $2;
# NOTE(review): this check is on $msgstr / $charset_out but the message says
# "msgid" -- looks like a copy-paste slip; confirm and correct the wording.
257 die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
258 if defined $charset_out && $charset_out ne $candidate;
259 $charset_out = $candidate;
262 # The following assumption is correct; that's what HTML::Template assumes
263 if (!defined $charset_in) {
264 $charset_in = $charset_out = TmplTokenizer::charset_canon 'utf-8';
265 warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
269 ###############################################################################
273 my $h = $exitcode? *STDERR: *STDOUT;
276 Extract translatable strings from given HTML::Template input files.
279 -f, --files-from=FILE Get list of input files from FILE
280 -D, --directory=DIRECTORY Add DIRECTORY to list for input files search
282 Output file location:
283 -o, --output=FILE Write output to specified file
285 HTML::Template options:
286 -a, --extract-all Extract all strings
287 --pedantic-warnings Issue warnings even for detected problems
288 which are likely to be harmless
291 -s, --sort-output generate sorted output
292 -F, --sort-by-file sort output by file location
293 -v, --verbose explain what is being done
296 --help Display this help and exit
298 Try `perldoc $0' for perhaps more information.
303 ###############################################################################
# usage_error( [$message] ): report a command-line usage problem on STDERR.
# Prints the optional message (when an argument was passed), followed by a
# hint to run the script with --help.
305 sub usage_error (;$) {
306 print STDERR "$_[0]\n" if @_;
307 print STDERR "Try `$0 --help' for more information.\n";
311 ###############################################################################
# Main program: parse the command line, open the output, then either extract
# strings from the listed template files (--files-from) or convert an
# existing translation file (--convert-from).
# Options marked INTERNAL below are not listed in the usage text.
313 Getopt::Long::config qw( bundling no_auto_abbrev );
315 'a|extract-all' => \$extract_all_p,
316 'charset=s' => sub { $charset_in = $charset_out = $_[1] }, # INTERNAL
317 'convert-from=s' => \$convert_from,
318 'D|directory=s' => \$directory,
319 'disable-fuzzy' => \$disable_fuzzy_p, # INTERNAL
320 'f|files-from=s' => \$files_from,
321 'I|input-charset=s' => \$charset_in, # INTERNAL
322 'pedantic-warnings|pedantic' => sub { $pedantic_p = 1 },
323 'O|output-charset=s' => \$charset_out, # INTERNAL
324 'output|o=s' => \$output,
325 'po-mode' => \$po_mode_p, # INTERNAL
326 's|sort-output' => sub { $sort = 's' },
327 'F|sort-by-file' => sub { $sort = 'F' },
328 'v|verbose' => \$verbose_p,
329 'help' => sub { usage(0) },
332 VerboseWarnings::set_application_name $0;
333 VerboseWarnings::set_pedantic_mode $pedantic_p;
# Exactly one of --files-from / --convert-from must be given.
335 usage_error('Missing mandatory option -f')
336 unless defined $files_from || defined $convert_from;
337 $directory = '.' unless defined $directory;
339 usage_error('You cannot specify both --convert-from and --files-from')
340 if defined $convert_from && defined $files_from;
# An output name of '-' (or no --output at all) means standard output.
342 if (defined $output && $output ne '-') {
343 print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
# NOTE(review): two-argument open; the mode is part of the interpolated string.
344 open(OUTPUT, ">$output") || die "$output: $!\n";
345 binmode( OUTPUT, ":utf8" );
347 print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
# NOTE(review): the result of this open is not checked, and no :utf8 layer is
# applied on this path, unlike the file branch above.
348 open(OUTPUT, ">&STDOUT");
351 if (defined $files_from) {
352 print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
353 open(INPUT, "<$files_from") || die "$files_from: $!\n";
# Names starting with '/' are used as given; others are taken relative to
# $directory.
356 my $input = /^\//? $_: "$directory/$_";
357 my $h = TmplTokenizer->new( $input );
358 $h->set_allow_cformat( 1 );
359 VerboseWarnings::set_input_file_name $input;
360 print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
365 print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
366 convert_translation_file;
370 warn "This input will not work with Mozilla standards-compliant mode\n", undef
371 if TmplTokenizer::syntaxerror_p;
# Exit with a failure status if the tokenizer reported a fatal problem.
374 exit(-1) if TmplTokenizer::fatal_p;
376 ###############################################################################
380 This is an experimental script based on the modularized
381 text-extract2.pl script. It has behaviour similar to
382 xgettext(1), and generates gettext-compatible output files.
384 A gettext-like format provides the following advantages:
390 Translation to non-English-like languages with different word
391 order: gettext's c-format strings can theoretically be
392 emulated if we are able to do some analysis on the .tmpl input
393 and treat <TMPL_VAR> in a way similar to %s.
397 Context for the extracted strings: the gettext format provides
398 the filenames and line numbers where each string can be found.
399 The translator can read the source file and see the context,
400 in case the string by itself can mean several different things.
404 Place for the translator to add comments about the translations.
408 Gettext-compatible tools, if any, might be usable if we adopt
413 This script has already been in use for over a year and should
414 be reasonably stable. Nevertheless, it is still somewhat
415 experimental and there are still some issues.
417 Please refer to the explanation in tmpl_process3 for further
420 If you want to generate GNOME-style POTFILES.in files, such
421 files (passed to -f) can be generated thus:
423 (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
424 -name \*.inc -o -name \*.tmpl) > opac/POTFILES.in
425 (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
426 -name \*.inc -o -name \*.tmpl) > intranet/POTFILES.in
428 This is, however, quite pointless, because the "create" and
429 "update" actions have already been implemented in tmpl_process3.pl.
431 =head2 Strings inside JavaScript
433 In the SCRIPT elements, the script will attempt to scan for
434 _("I<string literal>") patterns, and extract the I<string literal>
435 as a translatable string.
437 Note that the C-like _(...) notation is required.
439 The JavaScript must actually define a _ function
440 so that the code remains correct JavaScript.
441 A suitable definition of such a function can be
443 function _(s) { return s } // dummy function for gettext
454 There probably are some. Bugs related to scanning of <INPUT>
455 tags seem to be especially likely to be present.
457 Its diagnostics are probably too verbose.
459 When a <TMPL_VAR> within a JavaScript-related attribute is
460 detected, the script currently displays no warnings at all.
461 It might be good to display some kind of warning.
463 Its sort order (-s option) seems to be different than the real
464 xgettext(1)'s sort option. This will result in translation
465 strings inside the generated PO file spuriously moving about
466 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
468 If a JavaScript string has leading spaces, it will
469 generate strings with spurious leading spaces,
470 leading to failure to match the strings when actually generating