5 xgettext.pl - xgettext(1)-like interface for .tmpl string extraction
15 use vars qw( $convert_from );
16 use vars qw( $files_from $directory $output $sort );
17 use vars qw( $pedantic_p );
18 use vars qw( %text %translation );
19 use vars qw( $charset_in $charset_out );
21 ###############################################################################
23 use vars qw( @latin1_utf8 );
25 "\302\200", "\302\201", "\302\202", "\302\203", "\302\204", "\302\205",
26 "\302\206", "\302\207", "\302\210", "\302\211", "\302\212", "\302\213",
27 "\302\214", "\302\215", undef, undef, "\302\220", "\302\221",
28 "\302\222", "\302\223", "\302\224", "\302\225", "\302\226", "\302\227",
29 "\302\230", "\302\231", "\302\232", "\302\233", "\302\234", "\302\235",
30 "\302\236", "\302\237", "\302\240", "\302\241", "\302\242", "\302\243",
31 "\302\244", "\302\245", "\302\246", "\302\247", "\302\250", "\302\251",
32 "\302\252", "\302\253", "\302\254", "\302\255", "\302\256", "\302\257",
33 "\302\260", "\302\261", "\302\262", "\302\263", "\302\264", "\302\265",
34 "\302\266", "\302\267", "\302\270", "\302\271", "\302\272", "\302\273",
35 "\302\274", "\302\275", "\302\276", "\302\277", "\303\200", "\303\201",
36 "\303\202", "\303\203", "\303\204", "\303\205", "\303\206", "\303\207",
37 "\303\210", "\303\211", "\303\212", "\303\213", "\303\214", "\303\215",
38 "\303\216", "\303\217", "\303\220", "\303\221", "\303\222", "\303\223",
39 "\303\224", "\303\225", "\303\226", "\303\227", "\303\230", "\303\231",
40 "\303\232", "\303\233", "\303\234", "\303\235", "\303\236", "\303\237",
41 "\303\240", "\303\241", "\303\242", "\303\243", "\303\244", "\303\245",
42 "\303\246", "\303\247", "\303\250", "\303\251", "\303\252", "\303\253",
43 "\303\254", "\303\255", "\303\256", "\303\257", "\303\260", "\303\261",
44 "\303\262", "\303\263", "\303\264", "\303\265", "\303\266", "\303\267",
45 "\303\270", "\303\271", "\303\272", "\303\273", "\303\274", "\303\275",
46 "\303\276", "\303\277" );
48 sub charset_convert ($) {
50 if ($s !~ /[\200-\377]/s) { # FIXME: don't worry about iso2022 for now
52 } elsif ($charset_in eq 'ISO-8859-1' && $charset_out eq 'UTF-8') {
53 $s =~ s/[\200-\377]/ $latin1_utf8[ord($&) - 128] /egs;
54 } elsif ($charset_in ne $charset_out) {
55 VerboseWarnings::warn_normal "conversion from $charset_in to $charset_out is not supported\n", undef;
60 ###############################################################################
63 my($token, $string) = @_;
64 $text{$string} = [] unless defined $text{$string};
65 push @{$text{$string}}, $token;
68 ###############################################################################
72 # The real gettext tools seem to sort case sensitively; I don't know why
73 @t = sort { $a cmp $b } @t if $sort eq 's';
75 my @aa = sort { $a->pathname cmp $b->pathname
76 || $a->line_number <=> $b->line_number } @{$text{$a}};
77 my @bb = sort { $a->pathname cmp $b->pathname
78 || $a->line_number <=> $b->line_number } @{$text{$b}};
79 $aa[0]->pathname cmp $bb[0]->pathname
80 || $aa[0]->line_number <=> $bb[0]->line_number;
85 ###############################################################################
87 sub text_extract (*) {
90 my $s = TmplTokenizer::next_token $h;
91 last unless defined $s;
92 my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
93 if ($kind eq TmplTokenType::TEXT) {
94 #$t = TmplTokenizer::trim $t;
95 remember( $s, $t ) if $t =~ /\S/s;
96 } elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
97 #$t = TmplTokenizer::trim $t;
98 remember( $s, $s->form ) if $s->form =~ /\S/s;
99 } elsif ($kind eq TmplTokenType::TAG && %$attr) {
100 # value [tag=input], meta
101 my $tag = lc($1) if $t =~ /^<(\S+)/s;
102 for my $a ('alt', 'content', 'title', 'value') {
104 next if $a eq 'content' && $tag ne 'meta';
105 next if $a eq 'value' && ($tag ne 'input'
106 || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio)$/)); # FIXME
107 my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
108 $val = TmplTokenizer::trim $val;
109 remember( $s, $val ) if $val =~ /\S/s;
116 ###############################################################################
118 sub generate_strings_list () {
119 # Emit all extracted strings.
120 # Don't emit pure whitespace, pure numbers, or TMPL_VAR's.
121 for my $t (string_list) {
122 printf OUTPUT "%s\n", $t
123 unless TmplTokenizer::blank_p($t) || $t =~ /^\d+$/;
127 ###############################################################################
129 sub generate_po_file () {
130 # We don't emit the Plural-Forms header; it's meaningless for us
131 my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
132 $pot_charset = TmplTokenizer::charset_canon $pot_charset;
134 # SOME DESCRIPTIVE TITLE.
135 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
136 # This file is distributed under the same license as the PACKAGE package.
137 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
142 "Project-Id-Version: PACKAGE VERSION\\n"
143 "POT-Creation-Date: 2004-02-05 20:55-0500\\n"
144 "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
145 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
146 "Language-Team: LANGUAGE <LL\@li.org>\\n"
147 "MIME-Version: 1.0\\n"
148 "Content-Type: text/plain; charset=$pot_charset\\n"
149 "Content-Transfer-Encoding: 8bit\\n"
152 my $directory_re = quotemeta("$directory/");
153 for my $t (string_list) {
154 next if TmplTokenizer::blank_p($t) || $t =~ /^\d+$/;
156 for my $token (@{$text{$t}}) {
157 my $pathname = $token->pathname;
158 $pathname =~ s/^$directory_re//os;
159 printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number
160 if defined $pathname && defined $token->line_number;
161 $cformat_p = 1 if $token->type == TmplTokenType::TEXT_PARAMETRIZED;
163 printf OUTPUT "#, c-format\n" if $cformat_p;
164 printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po( charset_convert $t );
165 printf OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
166 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
170 ###############################################################################
172 sub convert_translation_file () {
173 open(INPUT, "<$convert_from") || die "$convert_from: $!\n";
174 VerboseWarnings::set_input_file_name $convert_from;
177 my($msgid, $msgstr) = split(/\t/);
178 die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
179 unless defined $msgstr;
181 # Fixup some of the bad strings
182 $msgid =~ s/^SELECTED>//;
185 my $token = TmplToken->new( $msgid, TmplTokenType::UNKNOWN, undef, undef );
186 remember( $token, $msgid );
187 $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
188 $translation{$msgid} = $msgstr unless $msgstr eq '*****';
190 if ($msgid =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
191 my $candidate = TmplTokenizer::charset_canon $2;
192 die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
193 if defined $charset_in && $charset_in ne $candidate;
194 $charset_in = $candidate;
196 if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
197 my $candidate = TmplTokenizer::charset_canon $2;
198 die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
199 if defined $charset_out && $charset_out ne $candidate;
200 $charset_out = $candidate;
203 if (!defined $charset_in) {
204 $charset_in = $charset_out = TmplTokenizer::charset_canon 'iso8859-1';
205 warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
209 ###############################################################################
213 my $h = $exitcode? *STDERR: *STDOUT;
216 Extract translatable strings from given HTML::Template input files.
219 -f, --files-from=FILE Get list of input files from FILE
220 -D, --directory=DIRECTORY Add DIRECTORY to list for input files search
222 Output file location:
223 -o, --output=FILE Write output to specified file
225 HTML::Template options:
226 --pedantic-warnings Issue warnings even for detected problems
227 which are likely to be harmless
230 -s, --sort-output generate sorted output
231 -F, --sort-by-file sort output by file location
234 --help Display this help and exit
239 ###############################################################################
241 sub usage_error (;$) {
242 print STDERR "$_[0]\n" if @_;
243 print STDERR "Try `$0 --help' for more information.\n";
247 ###############################################################################
249 Getopt::Long::config qw( bundling no_auto_abbrev );
251 'charset=s' => sub { $charset_in = $charset_out = $_[1] }, # INTERNAL
252 'convert-from=s' => \$convert_from,
253 'D|directory=s' => \$directory,
254 'f|files-from=s' => \$files_from,
255 'I|input-charset=s' => \$charset_in, # INTERNAL
256 'pedantic-warnings|pedantic' => sub { $pedantic_p = 1 },
257 'O|output-charset=s' => \$charset_out, # INTERNAL
258 'output|o=s' => \$output,
259 's|sort-output' => sub { $sort = 's' },
260 'F|sort-by-file' => sub { $sort = 'F' },
261 'help' => sub { usage(0) },
264 VerboseWarnings::set_application_name $0;
265 VerboseWarnings::set_pedantic_mode $pedantic_p;
267 usage_error('Missing mandatory option -f')
268 unless defined $files_from || defined $convert_from;
269 $directory = '.' unless defined $directory;
271 usage_error('You cannot specify both --convert-from and --files-from')
272 if defined $convert_from && defined $files_from;
274 if (defined $output && $output ne '-') {
275 open(OUTPUT, ">$output") || die "$output: $!\n";
277 open(OUTPUT, ">&STDOUT");
280 if (defined $files_from) {
281 open(INPUT, "<$files_from") || die "$files_from: $!\n";
284 my $h = TmplTokenizer->new( "$directory/$_" );
285 $h->set_allow_cformat( 1 );
286 VerboseWarnings::set_input_file_name "$directory/$_";
291 convert_translation_file;
295 warn "This input will not work with Mozilla standards-compliant mode\n", undef
296 if TmplTokenizer::syntaxerror_p;
299 exit(-1) if TmplTokenizer::fatal_p;
301 ###############################################################################
305 This is an experimental script based on the modularized
306 text-extract2.pl script. It has behaviour similar to
307 xgettext(1), and generates gettext-compatible output files.
309 A gettext-like format provides the following advantages:
316 Translation to non-English-like languages with different word
317 order: gettext's c-format strings can theoretically be
318 emulated if we are able to do some analysis on the .tmpl input
319 and treat <TMPL_VAR> in a way similar to %s.
323 Context for the extracted strings: the gettext format provides
324 the filenames and line numbers where each string can be found.
325 The translator can read the source file and see the context,
326 in case the string by itself can mean several different things.
330 Place for the translator to add comments about the translations.
334 Gettext-compatible tools, if any, might be usable if we adopt
339 Right now it does about the same thing as text-extract2.pl but
340 generates gettext-style output; however, because it is scanner-
341 instead of parser-based, it is able to address the 4 weaknesses
342 listed in translator_doc.txt. Ultimately, the goal is to make
343 this able to do some kind of simple analysis on the input to
344 produce gettext-style output with c-format strings, in order to
345 facilitate translation to languages with a different word order
348 When the above is finished, the generated PO file may contain
349 some HTML tags in addition to %s strings.
351 If you want to generate GNOME-style POTFILES.in files, such
352 files (passed to -f) can be generated thus:
354 (cd ../.. && find koha-tmpl/opac-tmpl/default/en
355 -name \*.inc -o -name \*.tmpl) > opac/POTFILES.in
356 (cd ../.. && find koha-tmpl/intranet-tmpl/default/en
357 -name \*.inc -o -name \*.tmpl) > intranet/POTFILES.in
359 This is, however, quite pointless, because the "create" and
360 "update" actions have already been implemented in tmpl_process3.pl.
371 There probably are some. Bugs related to scanning of <INPUT>
372 tags seem to be especially likely to be present.
374 Its diagnostics are probably too verbose.
376 When a <TMPL_VAR> within a JavaScript-related attribute is
377 detected, the script currently displays no warnings at all.
378 It might be good to display some kind of warning.
380 Its sort order (-s option) seems to be different from the real
381 xgettext(1)'s sort option. This will result in translation
382 strings inside the generated PO file spuriously moving about
383 when tmpl_process3.pl calls msgmerge(1) to update the PO file.