5 xgettext.pl - xgettext(1)-like interface for .tmpl string extraction
17 use vars qw( $convert_from );
18 use vars qw( $files_from $directory $output $sort );
19 use vars qw( $extract_all_p );
20 use vars qw( $pedantic_p );
21 use vars qw( %text %translation );
22 use vars qw( $charset_in $charset_out );
23 use vars qw( $disable_fuzzy_p );
24 use vars qw( $verbose_p );
25 use vars qw( $po_mode_p );
27 ###############################################################################
29 sub string_negligible_p ($) {
30 my($t) = @_; # a string
31 # Don't emit pure whitespace, pure numbers, pure punctuation,
32 # single letters, or TMPL_VAR's.
33 # Punctuation should arguably be translated, but without context
34 # it is untranslatable. Note that $t is a string, not a token object.
35 return !$extract_all_p && (
36 TmplTokenizer::blank_p($t) # blank or TMPL_VAR
37 || $t =~ /^\d+$/ # purely digits
38 || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
39 || $t =~ /^[A-Za-z]$/ # single letters
43 sub token_negligible_p( $ ) {
46 return !$extract_all_p && (
47 $t == TmplTokenType::TEXT? string_negligible_p( $x->string ):
48 $t == TmplTokenType::DIRECTIVE? 1:
49 $t == TmplTokenType::TEXT_PARAMETRIZED
50 && join( '', map { my $t = $_->type;
51 $t == TmplTokenType::DIRECTIVE?
52 '1': $t == TmplTokenType::TAG?
53 '': token_negligible_p( $_ )?
54 '': '1' } @{$x->children} ) eq '' );
57 ###############################################################################
60 my($token, $string) = @_;
61 # If we determine that the string is negligible, don't bother to remember it
62 unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
63 my $key = TmplTokenizer::string_canon( $string );
64 $text{$key} = [] unless defined $text{$key};
65 push @{$text{$key}}, $token;
69 ###############################################################################
73 # The real gettext tools seem to sort case-sensitively; I don't know why
74 @t = sort { $a cmp $b } @t if $sort eq 's';
76 my @aa = sort { $a->pathname cmp $b->pathname
77 || $a->line_number <=> $b->line_number } @{$text{$a}};
78 my @bb = sort { $a->pathname cmp $b->pathname
79 || $a->line_number <=> $b->line_number } @{$text{$b}};
80 $aa[0]->pathname cmp $bb[0]->pathname
81 || $aa[0]->line_number <=> $bb[0]->line_number;
86 ###############################################################################
88 sub text_extract (*) {
91 my $s = TmplTokenizer::next_token $h;
92 last unless defined $s;
93 my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
94 if ($kind eq TmplTokenType::TEXT) {
95 remember( $s, $t ) if $t =~ /\S/s;
96 } elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
97 remember( $s, $s->form ) if $s->form =~ /\S/s;
98 } elsif ($kind eq TmplTokenType::TAG && %$attr) {
99 # value [tag=input], meta
100 my $tag = lc($1) if $t =~ /^<(\S+)/s;
101 for my $a ('alt', 'content', 'title', 'value','label') {
103 next if $a eq 'label' && $tag ne 'optgroup';
104 next if $a eq 'content' && $tag ne 'meta';
105 next if $a eq 'value' && ($tag ne 'input'
106 || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME
107 my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
108 $val = TmplTokenizer::trim $val;
109 remember( $s, $val ) if $val =~ /\S/s;
112 } elsif ($s->has_js_data) {
113 for my $t (@{$s->js_data}) {
114 remember( $s, $t->[3] ) if $t->[0]; # FIXME
120 ###############################################################################
122 sub generate_strings_list () {
123 # Emit all extracted strings.
124 for my $t (string_list) {
125 printf OUTPUT "%s\n", $t;
129 ###############################################################################
131 sub generate_po_file () {
132 # We don't emit the Plural-Forms header; it's meaningless for us
133 my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
134 $pot_charset = TmplTokenizer::charset_canon $pot_charset;
135 # Time stamps aren't exactly right semantically. I don't know how to fix it.
136 my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
137 my $time_pot = $time;
138 my $time_po = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
140 # SOME DESCRIPTIVE TITLE.
141 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
142 # This file is distributed under the same license as the PACKAGE package.
143 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
146 print OUTPUT <<EOF unless $disable_fuzzy_p;
152 "Project-Id-Version: PACKAGE VERSION\\n"
153 "POT-Creation-Date: $time_pot\\n"
154 "PO-Revision-Date: $time_po\\n"
155 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
156 "Language-Team: LANGUAGE <LL\@li.org>\\n"
157 "MIME-Version: 1.0\\n"
158 "Content-Type: text/plain; charset=$pot_charset\\n"
159 "Content-Transfer-Encoding: 8bit\\n"
162 my $directory_re = quotemeta("$directory/");
163 for my $t (string_list) {
164 if ($text{$t}->[0]->type == TmplTokenType::TEXT_PARAMETRIZED) {
165 my($token, $n) = ($text{$t}->[0], 0);
166 printf OUTPUT "#. For the first occurrence,\n"
167 if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
168 for my $param ($token->parameters_and_fields) {
170 my $type = $param->type;
171 my $subtype = ($type == TmplTokenType::TAG
172 && $param->string =~ /^<input\b/is?
173 $param->attributes->{'type'}->[1]: undef);
174 my $fmt = TmplTokenizer::_formalize( $param );
176 if ($type == TmplTokenType::DIRECTIVE) {
177 $type = "Template::Toolkit Directive";
178 # $type = $param->string =~ /[%()+/is? $1: 'ERROR';
179 my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
181 printf OUTPUT "#. %s: %s\n", $fmt,
182 "$type" . (defined $name? " name=$name": '');
184 my $name = $param->attributes->{'name'};
185 my $value = $param->attributes->{'value'}
186 unless $subtype =~ /^(?:text)$/;
187 printf OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
188 . (defined $name? " name=$name->[1]": '')
189 . (defined $value? " value=$value->[1]": '');
192 } elsif ($text{$t}->[0]->type == TmplTokenType::TAG) {
193 my($token) = ($text{$t}->[0]);
194 printf OUTPUT "#. For the first occurrence,\n"
195 if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
196 if ($token->string =~ /^<meta\b/is) {
197 my $type = $token->attributes->{'http-equiv'}->[1];
198 print OUTPUT "#. META http-equiv=$type\n" if defined $type;
199 } elsif ($token->string =~ /^<([a-z0-9]+)/is) {
201 my $type = (lc($tag) eq 'input'?
202 $token->attributes->{'type'}: undef);
203 my $name = $token->attributes->{'name'};
204 printf OUTPUT "#. %s\n", $tag
205 . (defined $type? " type=$type->[1]": '')
206 . (defined $name? " name=$name->[1]": '');
208 } elsif ($text{$t}->[0]->has_js_data) {
209 printf OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1;
210 printf OUTPUT "#. SCRIPT\n";
213 for my $token (@{$text{$t}}) {
214 my $pathname = $token->pathname;
215 $pathname =~ s/^$directory_re//os;
216 $pathname =~ s/^.*\/koha-tmpl\/(.*)$/$1/;
217 printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number
218 if defined $pathname && defined $token->line_number;
219 $cformat_p = 1 if $token->type == TmplTokenType::TEXT_PARAMETRIZED;
221 printf OUTPUT "#, c-format\n" if $cformat_p;
222 printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po
223 TmplTokenizer::string_canon
224 TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
225 printf OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
226 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
230 ###############################################################################
232 sub convert_translation_file () {
233 open(INPUT, "<$convert_from") || die "$convert_from: $!\n";
234 VerboseWarnings::set_input_file_name $convert_from;
237 my($msgid, $msgstr) = split(/\t/);
238 die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
239 unless defined $msgstr;
241 # Fixup some of the bad strings
242 $msgid =~ s/^SELECTED>//;
245 my $token = TmplToken->new( $msgid, TmplTokenType::UNKNOWN, undef, undef );
246 remember( $token, $msgid );
247 $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
248 $translation{$msgid} = $msgstr unless $msgstr eq '*****';
250 if ($msgid =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
251 my $candidate = TmplTokenizer::charset_canon $2;
252 die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
253 if defined $charset_in && $charset_in ne $candidate;
254 $charset_in = $candidate;
256 if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
257 my $candidate = TmplTokenizer::charset_canon $2;
258 die "Conflicting charsets in msgstr: $candidate vs $charset_out\n"
259 if defined $charset_out && $charset_out ne $candidate; # charset came from $msgstr, so the message names msgstr
260 $charset_out = $candidate;
263 # The following assumption is correct; that's what HTML::Template assumes
264 if (!defined $charset_in) {
265 $charset_in = $charset_out = TmplTokenizer::charset_canon 'utf-8';
266 warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
270 ###############################################################################
274 my $h = $exitcode? *STDERR: *STDOUT;
277 Extract translatable strings from given HTML::Template input files.
280 -f, --files-from=FILE Get list of input files from FILE
281 -D, --directory=DIRECTORY Add DIRECTORY to list for input files search
283 Output file location:
284 -o, --output=FILE Write output to specified file
286 HTML::Template options:
287 -a, --extract-all Extract all strings
288 --pedantic-warnings Issue warnings even for detected problems
289 which are likely to be harmless
292 -s, --sort-output generate sorted output
293 -F, --sort-by-file sort output by file location
294 -v, --verbose explain what is being done
297 --help Display this help and exit
299 Try `perldoc $0' for perhaps more information.
304 ###############################################################################
306 sub usage_error (;$) {
307 print STDERR "$_[0]\n" if @_;
308 print STDERR "Try `$0 --help' for more information.\n";
312 ###############################################################################
314 Getopt::Long::config qw( bundling no_auto_abbrev );
316 'a|extract-all' => \$extract_all_p,
317 'charset=s' => sub { $charset_in = $charset_out = $_[1] }, # INTERNAL
318 'convert-from=s' => \$convert_from,
319 'D|directory=s' => \$directory,
320 'disable-fuzzy' => \$disable_fuzzy_p, # INTERNAL
321 'f|files-from=s' => \$files_from,
322 'I|input-charset=s' => \$charset_in, # INTERNAL
323 'pedantic-warnings|pedantic' => sub { $pedantic_p = 1 },
324 'O|output-charset=s' => \$charset_out, # INTERNAL
325 'output|o=s' => \$output,
326 'po-mode' => \$po_mode_p, # INTERNAL
327 's|sort-output' => sub { $sort = 's' },
328 'F|sort-by-file' => sub { $sort = 'F' },
329 'v|verbose' => \$verbose_p,
330 'help' => sub { usage(0) },
333 VerboseWarnings::set_application_name $0;
334 VerboseWarnings::set_pedantic_mode $pedantic_p;
336 usage_error('Missing mandatory option -f')
337 unless defined $files_from || defined $convert_from;
338 $directory = '.' unless defined $directory;
340 usage_error('You cannot specify both --convert-from and --files-from')
341 if defined $convert_from && defined $files_from;
343 if (defined $output && $output ne '-') {
344 print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
345 open(OUTPUT, ">$output") || die "$output: $!\n";
346 binmode( OUTPUT, ":utf8" );
348 print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
349 open(OUTPUT, ">&STDOUT");
352 if (defined $files_from) {
353 print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
354 open(INPUT, "<$files_from") || die "$files_from: $!\n";
357 my $input = /^\//? $_: "$directory/$_";
358 my $h = TmplTokenizer->new( $input );
359 $h->set_allow_cformat( 1 );
360 VerboseWarnings::set_input_file_name $input;
361 print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
366 print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
367 convert_translation_file;
371 warn "This input will not work with Mozilla standards-compliant mode\n", undef
372 if TmplTokenizer::syntaxerror_p;
375 exit(-1) if TmplTokenizer::fatal_p;
377 ###############################################################################
381 This is an experimental script based on the modularized
382 text-extract2.pl script. It has behaviour similar to
383 xgettext(1), and generates gettext-compatible output files.
385 A gettext-like format provides the following advantages:
391 Translation to non-English-like languages with different word
392 order: gettext's c-format strings can theoretically be
393 emulated if we are able to do some analysis on the .tmpl input
394 and treat <TMPL_VAR> in a way similar to %s.
398 Context for the extracted strings: the gettext format provides
399 the filenames and line numbers where each string can be found.
400 The translator can read the source file and see the context,
401 in case the string by itself can mean several different things.
405 Place for the translator to add comments about the translations.
409 Gettext-compatible tools, if any, might be usable if we adopt
414 This script has already been in use for over a year and should
415 be reasonably stable. Nevertheless, it is still somewhat
416 experimental and there are still some issues.
418 Please refer to the explanation in tmpl_process3 for further
421 If you want to generate GNOME-style POTFILES.in files, such
422 files (passed to -f) can be generated thus:
424 (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
425 -name \*.inc -o -name \*.tmpl) > opac/POTFILES.in
426 (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
427 -name \*.inc -o -name \*.tmpl) > intranet/POTFILES.in
429 This is, however, quite pointless, because the "create" and
430 "update" actions have already been implemented in tmpl_process3.pl.
432 =head2 Strings inside JavaScript
434 In the SCRIPT elements, the script will attempt to scan for
435 _("I<string literal>") patterns, and extract the I<string literal>
436 as a translatable string.
438 Note that the C-like _(...) notation is required.
440 The JavaScript must actually define a _ function
441 so that the code remains correct JavaScript.
442 A suitable definition of such a function can be
444 function _(s) { return s } // dummy function for gettext
455 There probably are some. Bugs related to scanning of <INPUT>
456 tags seem to be especially likely to be present.
458 Its diagnostics are probably too verbose.
460 When a <TMPL_VAR> within a JavaScript-related attribute is
461 detected, the script currently displays no warnings at all.
462 It might be good to display some kind of warning.
464 Its sort order (-s option) seems to be different than the real
465 xgettext(1)'s sort option. This will result in translation
466 strings inside the generated PO file spuriously moving about
467 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
469 If a JavaScript string has leading spaces, it will
470 generate strings with spurious leading spaces,
471 leading to failure to match the strings when actually generating