5 xgettext.pl - xgettext(1)-like interface for .tmpl strings extraction
17 use vars qw( $convert_from );
18 use vars qw( $files_from $directory $output $sort );
19 use vars qw( $extract_all_p );
20 use vars qw( $pedantic_p );
21 use vars qw( %text %translation );
22 use vars qw( $charset_in $charset_out );
23 use vars qw( $disable_fuzzy_p );
24 use vars qw( $verbose_p );
25 use vars qw( $po_mode_p );
27 ###############################################################################
29 sub string_negligible_p ($) {
30 my($t) = @_; # a string
31 # Don't emit pure whitespace, pure numbers, pure punctuation,
32 # single letters, or TMPL_VAR's.
33 # Punctuation should arguably be translated. But without context
34 # they are untranslatable. Note that $t is a string, not a token object.
35 return !$extract_all_p && (
36 TmplTokenizer::blank_p($t) # blank or TMPL_VAR
37 || $t =~ /^\d+$/ # purely digits
38 || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
39 || $t =~ /^[A-Za-z]$/ # single letters
43 sub token_negligible_p( $ ) {
46 return !$extract_all_p && (
47 $t == TmplTokenType::TEXT? string_negligible_p( $x->string ):
48 $t == TmplTokenType::DIRECTIVE? 1:
49 $t == TmplTokenType::TEXT_PARAMETRIZED
50 && join( '', map { my $t = $_->type;
51 $t == TmplTokenType::DIRECTIVE?
52 '1': $t == TmplTokenType::TAG?
53 '': token_negligible_p( $_ )?
54 '': '1' } @{$x->children} ) eq '' );
57 ###############################################################################
60 my($token, $string) = @_;
61 # If we determine that the string is negligible, don't bother to remember
62 unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
63 my $key = TmplTokenizer::string_canon( $string );
64 $text{$key} = [] unless defined $text{$key};
65 push @{$text{$key}}, $token;
69 ###############################################################################
73 # The real gettext tools seems to sort case sensitively; I don't know why
74 @t = sort { $a cmp $b } @t if $sort eq 's';
76 my @aa = sort { $a->pathname cmp $b->pathname
77 || $a->line_number <=> $b->line_number } @{$text{$a}};
78 my @bb = sort { $a->pathname cmp $b->pathname
79 || $a->line_number <=> $b->line_number } @{$text{$b}};
80 $aa[0]->pathname cmp $bb[0]->pathname
81 || $aa[0]->line_number <=> $bb[0]->line_number;
86 ###############################################################################
88 sub text_extract (*) {
91 my $s = TmplTokenizer::next_token $h;
92 last unless defined $s;
93 my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
94 if ($kind eq TmplTokenType::TEXT) {
95 remember( $s, $t ) if $t =~ /\S/s;
96 } elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
97 remember( $s, $s->form ) if $s->form =~ /\S/s;
98 } elsif ($kind eq TmplTokenType::TAG && %$attr) {
99 # value [tag=input], meta
100 my $tag = lc($1) if $t =~ /^<(\S+)/s;
101 for my $a ('alt', 'content', 'title', 'value') {
103 next if $a eq 'content' && $tag ne 'meta';
104 next if $a eq 'value' && ($tag ne 'input'
105 || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|submit)$/)); # FIXME
106 my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
107 $val = TmplTokenizer::trim $val;
108 remember( $s, $val ) if $val =~ /\S/s;
111 } elsif ($s->has_js_data) {
112 for my $t (@{$s->js_data}) {
113 remember( $s, $t->[3] ) if $t->[0]; # FIXME
119 ###############################################################################
121 sub generate_strings_list () {
122 # Emit all extracted strings.
123 for my $t (string_list) {
124 printf OUTPUT "%s\n", $t;
128 ###############################################################################
130 sub generate_po_file () {
131 # We don't emit the Plural-Forms header; it's meaningless for us
132 my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
133 $pot_charset = TmplTokenizer::charset_canon $pot_charset;
134 # Time stamps aren't exactly right semantically. I don't know how to fix it.
135 my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
136 my $time_pot = $time;
137 my $time_po = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
139 # SOME DESCRIPTIVE TITLE.
140 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
141 # This file is distributed under the same license as the PACKAGE package.
142 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
145 print OUTPUT <<EOF unless $disable_fuzzy_p;
151 "Project-Id-Version: PACKAGE VERSION\\n"
152 "POT-Creation-Date: $time_pot\\n"
153 "PO-Revision-Date: $time_po\\n"
154 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
155 "Language-Team: LANGUAGE <LL\@li.org>\\n"
156 "MIME-Version: 1.0\\n"
157 "Content-Type: text/plain; charset=$pot_charset\\n"
158 "Content-Transfer-Encoding: 8bit\\n"
161 my $directory_re = quotemeta("$directory/");
162 for my $t (string_list) {
163 if ($text{$t}->[0]->type == TmplTokenType::TEXT_PARAMETRIZED) {
164 my($token, $n) = ($text{$t}->[0], 0);
165 printf OUTPUT "#. For the first occurrence,\n"
166 if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
167 for my $param ($token->parameters_and_fields) {
169 my $type = $param->type;
170 my $subtype = ($type == TmplTokenType::TAG
171 && $param->string =~ /^<input\b/is?
172 $param->attributes->{'type'}->[1]: undef);
173 my $fmt = TmplTokenizer::_formalize( $param );
175 if ($type == TmplTokenType::DIRECTIVE) {
176 $type = $param->string =~ /(TMPL_[A-Z]+)+/is? $1: 'ERROR';
177 my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
179 printf OUTPUT "#. %s: %s\n", $fmt,
180 "$type" . (defined $name? " name=$name": '');
182 my $name = $param->attributes->{'name'};
183 my $value = $param->attributes->{'value'}
184 unless $subtype =~ /^(?:text)$/;
185 printf OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
186 . (defined $name? " name=$name->[1]": '')
187 . (defined $value? " value=$value->[1]": '');
190 } elsif ($text{$t}->[0]->type == TmplTokenType::TAG) {
191 my($token) = ($text{$t}->[0]);
192 printf OUTPUT "#. For the first occurrence,\n"
193 if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
194 if ($token->string =~ /^<meta\b/is) {
195 my $type = $token->attributes->{'http-equiv'}->[1];
196 print OUTPUT "#. META http-equiv=$type\n" if defined $type;
197 } elsif ($token->string =~ /^<([a-z0-9]+)/is) {
199 my $type = (lc($tag) eq 'input'?
200 $token->attributes->{'type'}: undef);
201 my $name = $token->attributes->{'name'};
202 printf OUTPUT "#. %s\n", $tag
203 . (defined $type? " type=$type->[1]": '')
204 . (defined $name? " name=$name->[1]": '');
206 } elsif ($text{$t}->[0]->has_js_data) {
207 printf OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1;
208 printf OUTPUT "#. SCRIPT\n";
211 for my $token (@{$text{$t}}) {
212 my $pathname = $token->pathname;
213 $pathname =~ s/^$directory_re//os;
214 printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number
215 if defined $pathname && defined $token->line_number;
216 $cformat_p = 1 if $token->type == TmplTokenType::TEXT_PARAMETRIZED;
218 printf OUTPUT "#, c-format\n" if $cformat_p;
219 printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po
220 TmplTokenizer::string_canon
221 TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
222 printf OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
223 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
227 ###############################################################################
229 sub convert_translation_file () {
230 open(INPUT, "<$convert_from") || die "$convert_from: $!\n";
231 VerboseWarnings::set_input_file_name $convert_from;
234 my($msgid, $msgstr) = split(/\t/);
235 die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
236 unless defined $msgstr;
238 # Fixup some of the bad strings
239 $msgid =~ s/^SELECTED>//;
242 my $token = TmplToken->new( $msgid, TmplTokenType::UNKNOWN, undef, undef );
243 remember( $token, $msgid );
244 $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
245 $translation{$msgid} = $msgstr unless $msgstr eq '*****';
247 if ($msgid =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
248 my $candidate = TmplTokenizer::charset_canon $2;
249 die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
250 if defined $charset_in && $charset_in ne $candidate;
251 $charset_in = $candidate;
253 if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
254 my $candidate = TmplTokenizer::charset_canon $2;
255 die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
256 if defined $charset_out && $charset_out ne $candidate;
257 $charset_out = $candidate;
260 # The following assumption is correct; that's what HTML::Template assumes
261 if (!defined $charset_in) {
262 $charset_in = $charset_out = TmplTokenizer::charset_canon 'utf-8';
263 warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
267 ###############################################################################
271 my $h = $exitcode? *STDERR: *STDOUT;
274 Extract translatable strings from given HTML::Template input files.
277 -f, --files-from=FILE Get list of input files from FILE
278 -D, --directory=DIRECTORY Add DIRECTORY to list for input files search
280 Output file location:
281 -o, --output=FILE Write output to specified file
283 HTML::Template options:
284 -a, --extract-all Extract all strings
285 --pedantic-warnings Issue warnings even for detected problems
286 which are likely to be harmless
289 -s, --sort-output generate sorted output
290 -F, --sort-by-file sort output by file location
291 -v, --verbose explain what is being done
294 --help Display this help and exit
296 Try `perldoc $0' for perhaps more information.
301 ###############################################################################
303 sub usage_error (;$) {
304 print STDERR "$_[0]\n" if @_;
305 print STDERR "Try `$0 --help' for more information.\n";
309 ###############################################################################
311 Getopt::Long::config qw( bundling no_auto_abbrev );
313 'a|extract-all' => \$extract_all_p,
314 'charset=s' => sub { $charset_in = $charset_out = $_[1] }, # INTERNAL
315 'convert-from=s' => \$convert_from,
316 'D|directory=s' => \$directory,
317 'disable-fuzzy' => \$disable_fuzzy_p, # INTERNAL
318 'f|files-from=s' => \$files_from,
319 'I|input-charset=s' => \$charset_in, # INTERNAL
320 'pedantic-warnings|pedantic' => sub { $pedantic_p = 1 },
321 'O|output-charset=s' => \$charset_out, # INTERNAL
322 'output|o=s' => \$output,
323 'po-mode' => \$po_mode_p, # INTERNAL
324 's|sort-output' => sub { $sort = 's' },
325 'F|sort-by-file' => sub { $sort = 'F' },
326 'v|verbose' => \$verbose_p,
327 'help' => sub { usage(0) },
330 VerboseWarnings::set_application_name $0;
331 VerboseWarnings::set_pedantic_mode $pedantic_p;
333 usage_error('Missing mandatory option -f')
334 unless defined $files_from || defined $convert_from;
335 $directory = '.' unless defined $directory;
337 usage_error('You cannot specify both --convert-from and --files-from')
338 if defined $convert_from && defined $files_from;
340 if (defined $output && $output ne '-') {
341 print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
342 open(OUTPUT, ">$output") || die "$output: $!\n";
344 print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
345 open(OUTPUT, ">&STDOUT");
348 if (defined $files_from) {
349 print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
350 open(INPUT, "<$files_from") || die "$files_from: $!\n";
353 my $input = /^\//? $_: "$directory/$_";
354 my $h = TmplTokenizer->new( $input );
355 $h->set_allow_cformat( 1 );
356 VerboseWarnings::set_input_file_name $input;
357 print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
362 print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
363 convert_translation_file;
367 warn "This input will not work with Mozilla standards-compliant mode\n", undef
368 if TmplTokenizer::syntaxerror_p;
371 exit(-1) if TmplTokenizer::fatal_p;
373 ###############################################################################
377 This is an experimental script based on the modularized
378 text-extract2.pl script. It has behaviour similar to
379 xgettext(1), and generates gettext-compatible output files.
381 A gettext-like format provides the following advantages:
387 Translation to non-English-like languages with different word
388 order: gettext's c-format strings can theoretically be
389 emulated if we are able to do some analysis on the .tmpl input
390 and treat <TMPL_VAR> in a way similar to %s.
394 Context for the extracted strings: the gettext format provides
395 the filenames and line numbers where each string can be found.
396 The translator can read the source file and see the context,
397 in case the string by itself can mean several different things.
401 Place for the translator to add comments about the translations.
405 Gettext-compatible tools, if any, might be usable if we adopt
410 This script has already been in use for over a year and should
411 be reasonable stable. Nevertheless, it is still somewhat
412 experimental and there are still some issues.
414 Please refer to the explanation in tmpl_process3 for further
417 If you want to generate GNOME-style POTFILES.in files, such
418 files (passed to -f) can be generated thus:
420 (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
421 -name \*.inc -o -name \*.tmpl) > opac/POTFILES.in
422 (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
423 -name \*.inc -o -name \*.tmpl) > intranet/POTFILES.in
425 This is, however, quite pointless, because the "create" and
426 "update" actions have already been implemented in tmpl_process3.pl.
428 =head2 Strings inside JavaScript
430 In the SCRIPT elements, the script will attempt to scan for
431 _("I<string literal>") patterns, and extract the I<string literal>
432 as a translatable string.
434 Note that the C-like _(...) notation is required.
436 The JavaScript must actually define a _ function
437 so that the code remains correct JavaScript.
438 A suitable definition of such a function can be
440 function _(s) { return s } // dummy function for gettext
451 There probably are some. Bugs related to scanning of <INPUT>
452 tags seem to be especially likely to be present.
454 Its diagnostics are probably too verbose.
456 When a <TMPL_VAR> within a JavaScript-related attribute is
457 detected, the script currently displays no warnings at all.
458 It might be good to display some kind of warning.
460 Its sort order (-s option) seems to be different than the real
461 xgettext(1)'s sort option. This will result in translation
462 strings inside the generated PO file spuriously moving about
463 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
465 If a Javascript string has leading spaces, it will
466 generate strings with spurious leading spaces,
467 leading to failure to match the strings when actually generating