5 xgettext.pl - xgettext(1)-like interface for .tmpl strings extraction
17 use vars qw( $convert_from );
18 use vars qw( $files_from $directory $output $sort );
19 use vars qw( $extract_all_p );
20 use vars qw( $pedantic_p );
21 use vars qw( %text %translation );
22 use vars qw( $charset_in $charset_out );
23 use vars qw( $disable_fuzzy_p );
24 use vars qw( $verbose_p );
25 use vars qw( $po_mode_p );
27 ###############################################################################
29 sub string_negligible_p ($) {
30 my($t) = @_; # a string
31 # Don't emit pure whitespace, pure numbers, pure punctuation,
32 # single letters, or TMPL_VAR's.
33 # Punctuation should arguably be translated. But without context
34 # they are untranslatable. Note that $t is a string, not a token object.
35 return !$extract_all_p && (
36 TmplTokenizer::blank_p($t) # blank or TMPL_VAR
37 || $t =~ /^\d+$/ # purely digits
38 || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
39 || $t =~ /^[A-Za-z]$/ # single letters
43 sub token_negligible_p( $ ) {
46 return !$extract_all_p && (
47 $t == C4::TmplTokenType::TEXT? string_negligible_p( $x->string ):
48 $t == C4::TmplTokenType::DIRECTIVE? 1:
49 $t == C4::TmplTokenType::TEXT_PARAMETRIZED
50 && join( '', map { my $t = $_->type;
51 $t == C4::TmplTokenType::DIRECTIVE?
52 '1': $t == C4::TmplTokenType::TAG?
53 '': token_negligible_p( $_ )?
54 '': '1' } @{$x->children} ) eq '' );
57 ###############################################################################
60 my($token, $string) = @_;
61 # If we determine that the string is negligible, don't bother to remember
62 unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
63 my $key = TmplTokenizer::string_canon( $string );
64 $text{$key} = [] unless defined $text{$key};
65 push @{$text{$key}}, $token;
69 ###############################################################################
73 # The real gettext tools seems to sort case sensitively; I don't know why
74 @t = sort { $a cmp $b } @t if $sort eq 's';
76 my @aa = sort { $a->pathname cmp $b->pathname
77 || $a->line_number <=> $b->line_number } @{$text{$a}};
78 my @bb = sort { $a->pathname cmp $b->pathname
79 || $a->line_number <=> $b->line_number } @{$text{$b}};
80 $aa[0]->pathname cmp $bb[0]->pathname
81 || $aa[0]->line_number <=> $bb[0]->line_number;
86 ###############################################################################
88 sub text_extract (*) {
91 my $s = TmplTokenizer::next_token $h;
92 last unless defined $s;
93 my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
94 if ($kind eq C4::TmplTokenType::TEXT) {
95 if ($t =~ /\S/s && $t !~ /<!/){
98 } elsif ($kind eq C4::TmplTokenType::TEXT_PARAMETRIZED) {
99 if ($s->form =~ /\S/s && $s->form !~ /<!/){
100 remember( $s, $s->form );
102 } elsif ($kind eq C4::TmplTokenType::TAG && %$attr) {
103 # value [tag=input], meta
104 my $tag = lc($1) if $t =~ /^<(\S+)/s;
105 for my $a ('alt', 'content', 'title', 'value','label') {
107 next if $a eq 'label' && $tag ne 'optgroup';
108 next if $a eq 'content' && $tag ne 'meta';
109 next if $a eq 'value' && ($tag ne 'input'
110 || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME
111 my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
112 $val = TmplTokenizer::trim $val;
113 remember( $s, $val ) if $val =~ /\S/s;
116 } elsif ($s->has_js_data) {
117 for my $t (@{$s->js_data}) {
118 remember( $s, $t->[3] ) if $t->[0]; # FIXME
124 ###############################################################################
126 sub generate_strings_list () {
127 # Emit all extracted strings.
128 for my $t (string_list) {
129 printf OUTPUT "%s\n", $t;
133 ###############################################################################
135 sub generate_po_file () {
136 # We don't emit the Plural-Forms header; it's meaningless for us
137 my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
138 $pot_charset = TmplTokenizer::charset_canon $pot_charset;
139 # Time stamps aren't exactly right semantically. I don't know how to fix it.
140 my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
141 my $time_pot = $time;
142 my $time_po = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
144 # SOME DESCRIPTIVE TITLE.
145 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
146 # This file is distributed under the same license as the PACKAGE package.
147 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
150 print OUTPUT <<EOF unless $disable_fuzzy_p;
156 "Project-Id-Version: PACKAGE VERSION\\n"
157 "POT-Creation-Date: $time_pot\\n"
158 "PO-Revision-Date: $time_po\\n"
159 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
160 "Language-Team: LANGUAGE <LL\@li.org>\\n"
161 "MIME-Version: 1.0\\n"
162 "Content-Type: text/plain; charset=$pot_charset\\n"
163 "Content-Transfer-Encoding: 8bit\\n"
166 my $directory_re = quotemeta("$directory/");
167 for my $t (string_list) {
168 if ($text{$t}->[0]->type == C4::TmplTokenType::TEXT_PARAMETRIZED) {
169 my($token, $n) = ($text{$t}->[0], 0);
170 printf OUTPUT "#. For the first occurrence,\n"
171 if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
172 for my $param ($token->parameters_and_fields) {
174 my $type = $param->type;
175 my $subtype = ($type == C4::TmplTokenType::TAG
176 && $param->string =~ /^<input\b/is?
177 $param->attributes->{'type'}->[1]: undef);
178 my $fmt = TmplTokenizer::_formalize( $param );
180 if ($type == C4::TmplTokenType::DIRECTIVE) {
181 # $type = "Template::Toolkit Directive";
182 $type = $param->string =~ /\[%(.*?)%\]/is? $1: 'ERROR';
183 my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
185 printf OUTPUT "#. %s: %s\n", $fmt,
186 "$type" . (defined $name? " name=$name": '');
188 my $name = $param->attributes->{'name'};
189 my $value = $param->attributes->{'value'}
190 unless $subtype =~ /^(?:text)$/;
191 printf OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
192 . (defined $name? " name=$name->[1]": '')
193 . (defined $value? " value=$value->[1]": '');
196 } elsif ($text{$t}->[0]->type == C4::TmplTokenType::TAG) {
197 my($token) = ($text{$t}->[0]);
198 printf OUTPUT "#. For the first occurrence,\n"
199 if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
200 if ($token->string =~ /^<meta\b/is) {
201 my $type = $token->attributes->{'http-equiv'}->[1];
202 print OUTPUT "#. META http-equiv=$type\n" if defined $type;
203 } elsif ($token->string =~ /^<([a-z0-9]+)/is) {
205 my $type = (lc($tag) eq 'input'?
206 $token->attributes->{'type'}: undef);
207 my $name = $token->attributes->{'name'};
208 printf OUTPUT "#. %s\n", $tag
209 . (defined $type? " type=$type->[1]": '')
210 . (defined $name? " name=$name->[1]": '');
212 } elsif ($text{$t}->[0]->has_js_data) {
213 printf OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1;
214 printf OUTPUT "#. SCRIPT\n";
217 for my $token (@{$text{$t}}) {
218 my $pathname = $token->pathname;
219 $pathname =~ s/^$directory_re//os;
220 $pathname =~ s/^.*\/koha-tmpl\/(.*)$/$1/;
221 printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number
222 if defined $pathname && defined $token->line_number;
223 $cformat_p = 1 if $token->type == C4::TmplTokenType::TEXT_PARAMETRIZED;
225 printf OUTPUT "#, c-format\n" if $cformat_p;
226 printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po
227 TmplTokenizer::string_canon
228 TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
229 printf OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
230 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
234 ###############################################################################
236 sub convert_translation_file () {
237 open(INPUT, "<$convert_from") || die "$convert_from: $!\n";
238 VerboseWarnings::set_input_file_name $convert_from;
241 my($msgid, $msgstr) = split(/\t/);
242 die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
243 unless defined $msgstr;
245 # Fixup some of the bad strings
246 $msgid =~ s/^SELECTED>//;
249 my $token = TmplToken->new( $msgid, C4::TmplTokenType::UNKNOWN, undef, undef );
250 remember( $token, $msgid );
251 $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
252 $translation{$msgid} = $msgstr unless $msgstr eq '*****';
254 if ($msgid =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
255 my $candidate = TmplTokenizer::charset_canon $2;
256 die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
257 if defined $charset_in && $charset_in ne $candidate;
258 $charset_in = $candidate;
260 if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
261 my $candidate = TmplTokenizer::charset_canon $2;
262 die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
263 if defined $charset_out && $charset_out ne $candidate;
264 $charset_out = $candidate;
267 # The following assumption is correct; that's what HTML::Template assumes
268 if (!defined $charset_in) {
269 $charset_in = $charset_out = TmplTokenizer::charset_canon 'utf-8';
270 warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
274 ###############################################################################
278 my $h = $exitcode? *STDERR: *STDOUT;
281 Extract translatable strings from given HTML::Template input files.
284 -f, --files-from=FILE Get list of input files from FILE
285 -D, --directory=DIRECTORY Add DIRECTORY to list for input files search
287 Output file location:
288 -o, --output=FILE Write output to specified file
290 HTML::Template options:
291 -a, --extract-all Extract all strings
292 --pedantic-warnings Issue warnings even for detected problems
293 which are likely to be harmless
296 -s, --sort-output generate sorted output
297 -F, --sort-by-file sort output by file location
298 -v, --verbose explain what is being done
301 --help Display this help and exit
303 Try `perldoc $0' for perhaps more information.
308 ###############################################################################
310 sub usage_error (;$) {
311 print STDERR "$_[0]\n" if @_;
312 print STDERR "Try `$0 --help' for more information.\n";
316 ###############################################################################
318 Getopt::Long::config qw( bundling no_auto_abbrev );
320 'a|extract-all' => \$extract_all_p,
321 'charset=s' => sub { $charset_in = $charset_out = $_[1] }, # INTERNAL
322 'convert-from=s' => \$convert_from,
323 'D|directory=s' => \$directory,
324 'disable-fuzzy' => \$disable_fuzzy_p, # INTERNAL
325 'f|files-from=s' => \$files_from,
326 'I|input-charset=s' => \$charset_in, # INTERNAL
327 'pedantic-warnings|pedantic' => sub { $pedantic_p = 1 },
328 'O|output-charset=s' => \$charset_out, # INTERNAL
329 'output|o=s' => \$output,
330 'po-mode' => \$po_mode_p, # INTERNAL
331 's|sort-output' => sub { $sort = 's' },
332 'F|sort-by-file' => sub { $sort = 'F' },
333 'v|verbose' => \$verbose_p,
334 'help' => sub { usage(0) },
337 VerboseWarnings::set_application_name $0;
338 VerboseWarnings::set_pedantic_mode $pedantic_p;
340 usage_error('Missing mandatory option -f')
341 unless defined $files_from || defined $convert_from;
342 $directory = '.' unless defined $directory;
344 usage_error('You cannot specify both --convert-from and --files-from')
345 if defined $convert_from && defined $files_from;
347 if (defined $output && $output ne '-') {
348 print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
349 open(OUTPUT, ">$output") || die "$output: $!\n";
351 print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
352 open(OUTPUT, ">&STDOUT");
354 binmode OUTPUT, ':encoding(UTF-8)';
356 if (defined $files_from) {
357 print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
358 open(INPUT, "<$files_from") || die "$files_from: $!\n";
361 my $input = /^\//? $_: "$directory/$_";
362 my $h = TmplTokenizer->new( $input );
363 $h->set_allow_cformat( 1 );
364 VerboseWarnings::set_input_file_name $input;
365 print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
370 print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
371 convert_translation_file;
375 warn "This input will not work with Mozilla standards-compliant mode\n", undef
376 if TmplTokenizer::syntaxerror_p;
379 exit(-1) if TmplTokenizer::fatal_p;
381 ###############################################################################
385 This is an experimental script based on the modularized
386 text-extract2.pl script. It has behaviour similar to
387 xgettext(1), and generates gettext-compatible output files.
389 A gettext-like format provides the following advantages:
395 Translation to non-English-like languages with different word
396 order: gettext's c-format strings can theoretically be
397 emulated if we are able to do some analysis on the .tmpl input
398 and treat <TMPL_VAR> in a way similar to %s.
402 Context for the extracted strings: the gettext format provides
403 the filenames and line numbers where each string can be found.
404 The translator can read the source file and see the context,
405 in case the string by itself can mean several different things.
409 Place for the translator to add comments about the translations.
413 Gettext-compatible tools, if any, might be usable if we adopt
418 This script has already been in use for over a year and should
419 be reasonable stable. Nevertheless, it is still somewhat
420 experimental and there are still some issues.
422 Please refer to the explanation in tmpl_process3 for further
425 If you want to generate GNOME-style POTFILES.in files, such
426 files (passed to -f) can be generated thus:
428 (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
429 -name \*.inc -o -name \*.tmpl) > opac/POTFILES.in
430 (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
431 -name \*.inc -o -name \*.tmpl) > intranet/POTFILES.in
433 This is, however, quite pointless, because the "create" and
434 "update" actions have already been implemented in tmpl_process3.pl.
436 =head2 Strings inside JavaScript
438 In the SCRIPT elements, the script will attempt to scan for
439 _("I<string literal>") patterns, and extract the I<string literal>
440 as a translatable string.
442 Note that the C-like _(...) notation is required.
444 The JavaScript must actually define a _ function
445 so that the code remains correct JavaScript.
446 A suitable definition of such a function can be
448 function _(s) { return s } // dummy function for gettext
459 There probably are some. Bugs related to scanning of <INPUT>
460 tags seem to be especially likely to be present.
462 Its diagnostics are probably too verbose.
464 When a <TMPL_VAR> within a JavaScript-related attribute is
465 detected, the script currently displays no warnings at all.
466 It might be good to display some kind of warning.
468 Its sort order (-s option) seems to be different than the real
469 xgettext(1)'s sort option. This will result in translation
470 strings inside the generated PO file spuriously moving about
471 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
473 If a Javascript string has leading spaces, it will
474 generate strings with spurious leading spaces,
475 leading to failure to match the strings when actually generating