5 xgettext.pl - xgettext(1)-like interface for .tmpl strings extraction
17 use vars qw( $convert_from );
18 use vars qw( $files_from $directory $output $sort );
19 use vars qw( $extract_all_p );
20 use vars qw( $pedantic_p );
21 use vars qw( %text %translation );
22 use vars qw( $charset_in $charset_out );
23 use vars qw( $disable_fuzzy_p );
24 use vars qw( $verbose_p );
25 use vars qw( $po_mode_p );
27 ###############################################################################
29 sub string_negligible_p ($) {
30 my($t) = @_; # a string
31 # Don't emit pure whitespace, pure numbers, pure punctuation,
32 # single letters, or TMPL_VAR's.
33 # Punctuation should arguably be translated. But without context
34 # they are untranslatable. Note that $t is a string, not a token object.
35 return !$extract_all_p && (
36 TmplTokenizer::blank_p($t) # blank or TMPL_VAR
37 || $t =~ /^\d+$/ # purely digits
38 || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
39 || $t =~ /^[A-Za-z]$/ # single letters
40 || $t =~ /^(&[a-z]+;|&#\d+;|&#x[0-9a-fA-F]+;|%%|%s|\s|[[:punct:]])*$/ # html entities,placeholder,punct, ...
44 sub token_negligible_p( $ ) {
47 return !$extract_all_p && (
48 $t == C4::TmplTokenType::TEXT? string_negligible_p( $x->string ):
49 $t == C4::TmplTokenType::DIRECTIVE? 1:
50 $t == C4::TmplTokenType::TEXT_PARAMETRIZED
51 && join( '', map { my $t = $_->type;
52 $t == C4::TmplTokenType::DIRECTIVE?
53 '1': $t == C4::TmplTokenType::TAG?
54 '': token_negligible_p( $_ )?
55 '': '1' } @{$x->children} ) eq '' );
58 ###############################################################################
61 my($token, $string) = @_;
62 # If we determine that the string is negligible, don't bother to remember
63 unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
64 my $key = TmplTokenizer::string_canon( $string );
65 $text{$key} = [] unless defined $text{$key};
66 push @{$text{$key}}, $token;
70 ###############################################################################
74 # The real gettext tools seems to sort case sensitively; I don't know why
75 @t = sort { $a cmp $b } @t if $sort eq 's';
77 my @aa = sort { $a->pathname cmp $b->pathname
78 || $a->line_number <=> $b->line_number } @{$text{$a}};
79 my @bb = sort { $a->pathname cmp $b->pathname
80 || $a->line_number <=> $b->line_number } @{$text{$b}};
81 $aa[0]->pathname cmp $bb[0]->pathname
82 || $aa[0]->line_number <=> $bb[0]->line_number;
87 ###############################################################################
89 sub text_extract (*) {
92 my $s = TmplTokenizer::next_token $h;
93 last unless defined $s;
94 my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
95 if ($kind eq C4::TmplTokenType::TEXT) {
96 if ($t =~ /\S/s && $t !~ /<!/){
99 } elsif ($kind eq C4::TmplTokenType::TEXT_PARAMETRIZED) {
100 if ($s->form =~ /\S/s && $s->form !~ /<!/){
101 remember( $s, $s->form );
103 } elsif ($kind eq C4::TmplTokenType::TAG && %$attr) {
104 # value [tag=input], meta
105 my $tag = lc($1) if $t =~ /^<(\S+)/s;
106 for my $a ('alt', 'content', 'title', 'value', 'label', 'placeholder') {
108 next if $a eq 'label' && $tag ne 'optgroup';
109 next if $a eq 'content' && $tag ne 'meta';
110 next if $a eq 'value' && ($tag ne 'input'
111 || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME
112 my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
113 $val = TmplTokenizer::trim $val;
114 remember( $s, $val ) if $val =~ /\S/s;
117 } elsif ($s->has_js_data) {
118 for my $t (@{$s->js_data}) {
119 remember( $s, $t->[3] ) if $t->[0]; # FIXME
125 ###############################################################################
127 sub generate_strings_list () {
128 # Emit all extracted strings.
129 for my $t (string_list) {
130 printf OUTPUT "%s\n", $t;
134 ###############################################################################
136 sub generate_po_file () {
137 # We don't emit the Plural-Forms header; it's meaningless for us
138 my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
139 $pot_charset = TmplTokenizer::charset_canon $pot_charset;
140 # Time stamps aren't exactly right semantically. I don't know how to fix it.
141 my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
142 my $time_pot = $time;
143 my $time_po = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
145 # SOME DESCRIPTIVE TITLE.
146 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
147 # This file is distributed under the same license as the PACKAGE package.
148 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
151 print OUTPUT <<EOF unless $disable_fuzzy_p;
157 "Project-Id-Version: PACKAGE VERSION\\n"
158 "POT-Creation-Date: $time_pot\\n"
159 "PO-Revision-Date: $time_po\\n"
160 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
161 "Language-Team: LANGUAGE <LL\@li.org>\\n"
162 "MIME-Version: 1.0\\n"
163 "Content-Type: text/plain; charset=$pot_charset\\n"
164 "Content-Transfer-Encoding: 8bit\\n"
167 my $directory_re = quotemeta("$directory/");
168 for my $t (string_list) {
169 if ($text{$t}->[0]->type == C4::TmplTokenType::TEXT_PARAMETRIZED) {
170 my($token, $n) = ($text{$t}->[0], 0);
171 printf OUTPUT "#. For the first occurrence,\n"
172 if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
173 for my $param ($token->parameters_and_fields) {
175 my $type = $param->type;
176 my $subtype = ($type == C4::TmplTokenType::TAG
177 && $param->string =~ /^<input\b/is?
178 $param->attributes->{'type'}->[1]: undef);
179 my $fmt = TmplTokenizer::_formalize( $param );
181 if ($type == C4::TmplTokenType::DIRECTIVE) {
182 # $type = "Template::Toolkit Directive";
183 $type = $param->string =~ /\[%(.*?)%\]/is? $1: 'ERROR';
184 my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
186 printf OUTPUT "#. %s: %s\n", $fmt,
187 "$type" . (defined $name? " name=$name": '');
189 my $name = $param->attributes->{'name'};
190 my $value = $param->attributes->{'value'}
191 unless $subtype =~ /^(?:text)$/;
192 printf OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
193 . (defined $name? " name=$name->[1]": '')
194 . (defined $value? " value=$value->[1]": '');
197 } elsif ($text{$t}->[0]->type == C4::TmplTokenType::TAG) {
198 my($token) = ($text{$t}->[0]);
199 printf OUTPUT "#. For the first occurrence,\n"
200 if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
201 if ($token->string =~ /^<meta\b/is) {
202 my $type = $token->attributes->{'http-equiv'}->[1];
203 print OUTPUT "#. META http-equiv=$type\n" if defined $type;
204 } elsif ($token->string =~ /^<([a-z0-9]+)/is) {
206 my $type = (lc($tag) eq 'input'?
207 $token->attributes->{'type'}: undef);
208 my $name = $token->attributes->{'name'};
209 printf OUTPUT "#. %s\n", $tag
210 . (defined $type? " type=$type->[1]": '')
211 . (defined $name? " name=$name->[1]": '');
213 } elsif ($text{$t}->[0]->has_js_data) {
214 printf OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1;
215 printf OUTPUT "#. SCRIPT\n";
218 for my $token (@{$text{$t}}) {
219 my $pathname = $token->pathname;
220 $pathname =~ s/^$directory_re//os;
221 $pathname =~ s/^.*\/koha-tmpl\/(.*)$/$1/;
222 printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number
223 if defined $pathname && defined $token->line_number;
224 $cformat_p = 1 if $token->type == C4::TmplTokenType::TEXT_PARAMETRIZED;
226 printf OUTPUT "#, c-format\n" if $cformat_p;
227 printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po
228 TmplTokenizer::string_canon
229 TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
230 printf OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
231 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
235 ###############################################################################
237 sub convert_translation_file () {
238 open(INPUT, "<$convert_from") || die "$convert_from: $!\n";
239 VerboseWarnings::set_input_file_name $convert_from;
242 my($msgid, $msgstr) = split(/\t/);
243 die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
244 unless defined $msgstr;
246 # Fixup some of the bad strings
247 $msgid =~ s/^SELECTED>//;
250 my $token = TmplToken->new( $msgid, C4::TmplTokenType::UNKNOWN, undef, undef );
251 remember( $token, $msgid );
252 $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
253 $translation{$msgid} = $msgstr unless $msgstr eq '*****';
255 if ($msgid =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
256 my $candidate = TmplTokenizer::charset_canon $2;
257 die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
258 if defined $charset_in && $charset_in ne $candidate;
259 $charset_in = $candidate;
261 if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
262 my $candidate = TmplTokenizer::charset_canon $2;
263 die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
264 if defined $charset_out && $charset_out ne $candidate;
265 $charset_out = $candidate;
268 # The following assumption is correct; that's what HTML::Template assumes
269 if (!defined $charset_in) {
270 $charset_in = $charset_out = TmplTokenizer::charset_canon 'utf-8';
271 warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
275 ###############################################################################
279 my $h = $exitcode? *STDERR: *STDOUT;
282 Extract translatable strings from given HTML::Template input files.
285 -f, --files-from=FILE Get list of input files from FILE
286 -D, --directory=DIRECTORY Add DIRECTORY to list for input files search
288 Output file location:
289 -o, --output=FILE Write output to specified file
291 HTML::Template options:
292 -a, --extract-all Extract all strings
293 --pedantic-warnings Issue warnings even for detected problems
294 which are likely to be harmless
297 -s, --sort-output generate sorted output
298 -F, --sort-by-file sort output by file location
299 -v, --verbose explain what is being done
302 --help Display this help and exit
304 Try `perldoc $0' for perhaps more information.
309 ###############################################################################
311 sub usage_error (;$) {
312 print STDERR "$_[0]\n" if @_;
313 print STDERR "Try `$0 --help' for more information.\n";
317 ###############################################################################
319 Getopt::Long::config qw( bundling no_auto_abbrev );
321 'a|extract-all' => \$extract_all_p,
322 'charset=s' => sub { $charset_in = $charset_out = $_[1] }, # INTERNAL
323 'convert-from=s' => \$convert_from,
324 'D|directory=s' => \$directory,
325 'disable-fuzzy' => \$disable_fuzzy_p, # INTERNAL
326 'f|files-from=s' => \$files_from,
327 'I|input-charset=s' => \$charset_in, # INTERNAL
328 'pedantic-warnings|pedantic' => sub { $pedantic_p = 1 },
329 'O|output-charset=s' => \$charset_out, # INTERNAL
330 'output|o=s' => \$output,
331 'po-mode' => \$po_mode_p, # INTERNAL
332 's|sort-output' => sub { $sort = 's' },
333 'F|sort-by-file' => sub { $sort = 'F' },
334 'v|verbose' => \$verbose_p,
335 'help' => sub { usage(0) },
338 VerboseWarnings::set_application_name $0;
339 VerboseWarnings::set_pedantic_mode $pedantic_p;
341 usage_error('Missing mandatory option -f')
342 unless defined $files_from || defined $convert_from;
343 $directory = '.' unless defined $directory;
345 usage_error('You cannot specify both --convert-from and --files-from')
346 if defined $convert_from && defined $files_from;
348 if (defined $output && $output ne '-') {
349 print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
350 open(OUTPUT, ">$output") || die "$output: $!\n";
352 print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
353 open(OUTPUT, ">&STDOUT");
355 #binmode OUTPUT, ':encoding(UTF-8)';
357 if (defined $files_from) {
358 print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
359 open(INPUT, "<$files_from") || die "$files_from: $!\n";
362 my $input = /^\//? $_: "$directory/$_";
363 my $h = TmplTokenizer->new( $input );
364 $h->set_allow_cformat( 1 );
365 VerboseWarnings::set_input_file_name $input;
366 print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
371 print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
372 convert_translation_file;
376 warn "This input will not work with Mozilla standards-compliant mode\n", undef
377 if TmplTokenizer::syntaxerror_p;
380 exit(-1) if TmplTokenizer::fatal_p;
382 ###############################################################################
386 This is an experimental script based on the modularized
387 text-extract2.pl script. It has behaviour similar to
388 xgettext(1), and generates gettext-compatible output files.
390 A gettext-like format provides the following advantages:
396 Translation to non-English-like languages with different word
397 order: gettext's c-format strings can theoretically be
398 emulated if we are able to do some analysis on the .tmpl input
399 and treat <TMPL_VAR> in a way similar to %s.
403 Context for the extracted strings: the gettext format provides
404 the filenames and line numbers where each string can be found.
405 The translator can read the source file and see the context,
406 in case the string by itself can mean several different things.
410 Place for the translator to add comments about the translations.
414 Gettext-compatible tools, if any, might be usable if we adopt
419 This script has already been in use for over a year and should
420 be reasonable stable. Nevertheless, it is still somewhat
421 experimental and there are still some issues.
423 Please refer to the explanation in tmpl_process3 for further
426 If you want to generate GNOME-style POTFILES.in files, such
427 files (passed to -f) can be generated thus:
429 (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
430 -name \*.inc -o -name \*.tmpl) > opac/POTFILES.in
431 (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
432 -name \*.inc -o -name \*.tmpl) > intranet/POTFILES.in
434 This is, however, quite pointless, because the "create" and
435 "update" actions have already been implemented in tmpl_process3.pl.
437 =head2 Strings inside JavaScript
439 In the SCRIPT elements, the script will attempt to scan for
440 _("I<string literal>") patterns, and extract the I<string literal>
441 as a translatable string.
443 Note that the C-like _(...) notation is required.
445 The JavaScript must actually define a _ function
446 so that the code remains correct JavaScript.
447 A suitable definition of such a function can be
449 function _(s) { return s } // dummy function for gettext
460 There probably are some. Bugs related to scanning of <INPUT>
461 tags seem to be especially likely to be present.
463 Its diagnostics are probably too verbose.
465 When a <TMPL_VAR> within a JavaScript-related attribute is
466 detected, the script currently displays no warnings at all.
467 It might be good to display some kind of warning.
469 Its sort order (-s option) seems to be different than the real
470 xgettext(1)'s sort option. This will result in translation
471 strings inside the generated PO file spuriously moving about
472 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
474 If a Javascript string has leading spaces, it will
475 generate strings with spurious leading spaces,
476 leading to failure to match the strings when actually generating