5 xgettext.pl - xgettext(1)-like interface for .tmpl strings extraction
15 use vars qw( $convert_from );
16 use vars qw( $files_from $directory $output $sort );
17 use vars qw( $extract_all_p );
18 use vars qw( $pedantic_p );
19 use vars qw( %text %translation );
20 use vars qw( $charset_in $charset_out );
21 use vars qw( $verbose_p );
23 ###############################################################################
25 sub string_negligible_p ($) {
26 my($t) = @_; # a string
27 # Don't emit pure whitespace, pure numbers, pure punctuation,
28 # single letters, or TMPL_VAR's.
29 # Punctuation should arguably be translated. But without context
30 # they are untranslatable. Note that $t is a string, not a token object.
31 return !$extract_all_p && (
32 TmplTokenizer::blank_p($t) # blank or TMPL_VAR
33 || $t =~ /^\d+$/ # purely digits
34 || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
35 || $t =~ /^[A-Za-z]$/ # single letters
39 sub token_negligible_p( $ ) {
42 return !$extract_all_p && (
43 $t == TmplTokenType::TEXT? string_negligible_p( $x->string ):
44 $t == TmplTokenType::DIRECTIVE? 1:
45 $t == TmplTokenType::TEXT_PARAMETRIZED
46 && join( '', map { my $t = $_->type;
47 $t == TmplTokenType::DIRECTIVE?
48 '1': $t == TmplTokenType::TAG?
49 '': token_negligible_p( $_ )?
50 '': '1' } @{$x->children} ) eq '' );
53 ###############################################################################
56 my($token, $string) = @_;
57 # If we determine that the string is negligible, don't bother to remember
58 unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
59 my $key = TmplTokenizer::string_canon( $string );
60 $text{$key} = [] unless defined $text{$key};
61 push @{$text{$key}}, $token;
65 ###############################################################################
69 # The real gettext tools seems to sort case sensitively; I don't know why
70 @t = sort { $a cmp $b } @t if $sort eq 's';
72 my @aa = sort { $a->pathname cmp $b->pathname
73 || $a->line_number <=> $b->line_number } @{$text{$a}};
74 my @bb = sort { $a->pathname cmp $b->pathname
75 || $a->line_number <=> $b->line_number } @{$text{$b}};
76 $aa[0]->pathname cmp $bb[0]->pathname
77 || $aa[0]->line_number <=> $bb[0]->line_number;
82 ###############################################################################
84 sub text_extract (*) {
87 my $s = TmplTokenizer::next_token $h;
88 last unless defined $s;
89 my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
90 if ($kind eq TmplTokenType::TEXT) {
91 remember( $s, $t ) if $t =~ /\S/s;
92 } elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
93 remember( $s, $s->form ) if $s->form =~ /\S/s;
94 } elsif ($kind eq TmplTokenType::TAG && %$attr) {
95 # value [tag=input], meta
96 my $tag = lc($1) if $t =~ /^<(\S+)/s;
97 for my $a ('alt', 'content', 'title', 'value') {
99 next if $a eq 'content' && $tag ne 'meta';
100 next if $a eq 'value' && ($tag ne 'input'
101 || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio)$/)); # FIXME
102 my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
103 $val = TmplTokenizer::trim $val;
104 remember( $s, $val ) if $val =~ /\S/s;
111 ###############################################################################
113 sub generate_strings_list () {
114 # Emit all extracted strings.
115 for my $t (string_list) {
116 printf OUTPUT "%s\n", $t # unless negligible_p($t);
120 ###############################################################################
122 sub generate_po_file () {
123 # We don't emit the Plural-Forms header; it's meaningless for us
124 my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
125 $pot_charset = TmplTokenizer::charset_canon $pot_charset;
127 # SOME DESCRIPTIVE TITLE.
128 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
129 # This file is distributed under the same license as the PACKAGE package.
130 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
135 "Project-Id-Version: PACKAGE VERSION\\n"
136 "POT-Creation-Date: 2004-02-05 20:55-0500\\n"
137 "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
138 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
139 "Language-Team: LANGUAGE <LL\@li.org>\\n"
140 "MIME-Version: 1.0\\n"
141 "Content-Type: text/plain; charset=$pot_charset\\n"
142 "Content-Transfer-Encoding: 8bit\\n"
145 my $directory_re = quotemeta("$directory/");
146 for my $t (string_list) {
147 #next if negligible_p($t);
149 for my $token (@{$text{$t}}) {
150 my $pathname = $token->pathname;
151 $pathname =~ s/^$directory_re//os;
152 printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number
153 if defined $pathname && defined $token->line_number;
154 $cformat_p = 1 if $token->type == TmplTokenType::TEXT_PARAMETRIZED;
156 printf OUTPUT "#, c-format\n" if $cformat_p;
157 printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po
158 TmplTokenizer::string_canon
159 TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
160 printf OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
161 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
165 ###############################################################################
167 sub convert_translation_file () {
168 open(INPUT, "<$convert_from") || die "$convert_from: $!\n";
169 VerboseWarnings::set_input_file_name $convert_from;
172 my($msgid, $msgstr) = split(/\t/);
173 die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
174 unless defined $msgstr;
176 # Fixup some of the bad strings
177 $msgid =~ s/^SELECTED>//;
180 my $token = TmplToken->new( $msgid, TmplTokenType::UNKNOWN, undef, undef );
181 remember( $token, $msgid );
182 $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
183 $translation{$msgid} = $msgstr unless $msgstr eq '*****';
185 if ($msgid =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
186 my $candidate = TmplTokenizer::charset_canon $2;
187 die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
188 if defined $charset_in && $charset_in ne $candidate;
189 $charset_in = $candidate;
191 if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
192 my $candidate = TmplTokenizer::charset_canon $2;
193 die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
194 if defined $charset_out && $charset_out ne $candidate;
195 $charset_out = $candidate;
198 # The following assumption is correct; that's what HTML::Template assumes
199 if (!defined $charset_in) {
200 $charset_in = $charset_out = TmplTokenizer::charset_canon 'iso8859-1';
201 warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
205 ###############################################################################
209 my $h = $exitcode? *STDERR: *STDOUT;
212 Extract translatable strings from given HTML::Template input files.
215 -f, --files-from=FILE Get list of input files from FILE
216 -D, --directory=DIRECTORY Add DIRECTORY to list for input files search
218 Output file location:
219 -o, --output=FILE Write output to specified file
221 HTML::Template options:
222 -a, --extract-all Extract all strings
223 --pedantic-warnings Issue warnings even for detected problems
224 which are likely to be harmless
227 -s, --sort-output generate sorted output
228 -F, --sort-by-file sort output by file location
229 -v, --verbose explain what is being done
232 --help Display this help and exit
237 ###############################################################################
239 sub usage_error (;$) {
240 print STDERR "$_[0]\n" if @_;
241 print STDERR "Try `$0 --help' for more information.\n";
245 ###############################################################################
247 Getopt::Long::config qw( bundling no_auto_abbrev );
249 'a|extract-all' => \$extract_all_p,
250 'charset=s' => sub { $charset_in = $charset_out = $_[1] }, # INTERNAL
251 'convert-from=s' => \$convert_from,
252 'D|directory=s' => \$directory,
253 'f|files-from=s' => \$files_from,
254 'I|input-charset=s' => \$charset_in, # INTERNAL
255 'pedantic-warnings|pedantic' => sub { $pedantic_p = 1 },
256 'O|output-charset=s' => \$charset_out, # INTERNAL
257 'output|o=s' => \$output,
258 's|sort-output' => sub { $sort = 's' },
259 'F|sort-by-file' => sub { $sort = 'F' },
260 'v|verbose' => \$verbose_p,
261 'help' => sub { usage(0) },
264 VerboseWarnings::set_application_name $0;
265 VerboseWarnings::set_pedantic_mode $pedantic_p;
267 usage_error('Missing mandatory option -f')
268 unless defined $files_from || defined $convert_from;
269 $directory = '.' unless defined $directory;
271 usage_error('You cannot specify both --convert-from and --files-from')
272 if defined $convert_from && defined $files_from;
274 if (defined $output && $output ne '-') {
275 print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
276 open(OUTPUT, ">$output") || die "$output: $!\n";
278 print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
279 open(OUTPUT, ">&STDOUT");
282 if (defined $files_from) {
283 print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
284 open(INPUT, "<$files_from") || die "$files_from: $!\n";
287 my $h = TmplTokenizer->new( "$directory/$_" );
288 $h->set_allow_cformat( 1 );
289 VerboseWarnings::set_input_file_name "$directory/$_";
290 print STDERR "$0: Processing file \"$directory/$_\"\n" if $verbose_p;
295 print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
296 convert_translation_file;
300 warn "This input will not work with Mozilla standards-compliant mode\n", undef
301 if TmplTokenizer::syntaxerror_p;
304 exit(-1) if TmplTokenizer::fatal_p;
306 ###############################################################################
310 This is an experimental script based on the modularized
311 text-extract2.pl script. It has behaviour similar to
312 xgettext(1), and generates gettext-compatible output files.
314 A gettext-like format provides the following advantages:
321 Translation to non-English-like languages with different word
322 order: gettext's c-format strings can theoretically be
323 emulated if we are able to do some analysis on the .tmpl input
324 and treat <TMPL_VAR> in a way similar to %s.
328 Context for the extracted strings: the gettext format provides
329 the filenames and line numbers where each string can be found.
330 The translator can read the source file and see the context,
331 in case the string by itself can mean several different things.
335 Place for the translator to add comments about the translations.
339 Gettext-compatible tools, if any, might be usable if we adopt
344 Note that this script is experimental and should still be
347 Please refer to the explanation in tmpl_process3 for further
350 If you want to generate GNOME-style POTFILES.in files, such
351 files (passed to -f) can be generated thus:
353 (cd ../.. && find koha-tmpl/opac-tmpl/default/en
354 -name \*.inc -o -name \*.tmpl) > opac/POTFILES.in
355 (cd ../.. && find koha-tmpl/intranet-tmpl/default/en
356 -name \*.inc -o -name \*.tmpl) > intranet/POTFILES.in
358 This is, however, quite pointless, because the "create" and
359 "update" actions have already been implemented in tmpl_process3.pl.
370 There probably are some. Bugs related to scanning of <INPUT>
371 tags seem to be especially likely to be present.
373 Its diagnostics are probably too verbose.
375 When a <TMPL_VAR> within a JavaScript-related attribute is
376 detected, the script currently displays no warnings at all.
377 It might be good to display some kind of warning.
379 Its sort order (-s option) seems to be different than the real
380 xgettext(1)'s sort option. This will result in translation
381 strings inside the generated PO file spuriously moving about
382 when tmpl_process3.pl calls msgmerge(1) to update the PO file.