5 xgettext.pl - xgettext(1)-like interface for .tmpl strings extraction
15 use vars qw( $convert_from );
16 use vars qw( $files_from $directory $output $sort );
17 use vars qw( $extract_all_p );
18 use vars qw( $pedantic_p );
19 use vars qw( %text %translation );
20 use vars qw( $charset_in $charset_out );
22 ###############################################################################
24 sub string_negligible_p ($) {
25 my($t) = @_; # a string
26 # Don't emit pure whitespace, pure numbers, pure punctuation,
27 # single letters, or TMPL_VAR's.
28 # Punctuation should arguably be translated. But without context
29 # they are untranslatable. Note that $t is a string, not a token object.
30 return !$extract_all_p && (
31 TmplTokenizer::blank_p($t) # blank or TMPL_VAR
32 || $t =~ /^\d+$/ # purely digits
33 || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
34 || $t =~ /^[A-Za-z]$/ # single letters
38 sub token_negligible_p( $ ) {
41 return !$extract_all_p && (
42 $t == TmplTokenType::TEXT? string_negligible_p( $x->string ):
43 $t == TmplTokenType::DIRECTIVE? 1:
44 $t == TmplTokenType::TEXT_PARAMETRIZED
45 && join( '', map { my $t = $_->type;
46 $t == TmplTokenType::DIRECTIVE?
47 '1': $t == TmplTokenType::TAG?
48 '': token_negligible_p( $_ )?
49 '': '1' } @{$x->children} ) eq '' );
52 ###############################################################################
55 my($token, $string) = @_;
56 # If we determine that the string is negligible, don't bother to remember
57 unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
58 my $key = TmplTokenizer::string_canon( $string );
59 $text{$key} = [] unless defined $text{$key};
60 push @{$text{$key}}, $token;
64 ###############################################################################
68 # The real gettext tools seems to sort case sensitively; I don't know why
69 @t = sort { $a cmp $b } @t if $sort eq 's';
71 my @aa = sort { $a->pathname cmp $b->pathname
72 || $a->line_number <=> $b->line_number } @{$text{$a}};
73 my @bb = sort { $a->pathname cmp $b->pathname
74 || $a->line_number <=> $b->line_number } @{$text{$b}};
75 $aa[0]->pathname cmp $bb[0]->pathname
76 || $aa[0]->line_number <=> $bb[0]->line_number;
81 ###############################################################################
83 sub text_extract (*) {
86 my $s = TmplTokenizer::next_token $h;
87 last unless defined $s;
88 my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
89 if ($kind eq TmplTokenType::TEXT) {
90 remember( $s, $t ) if $t =~ /\S/s;
91 } elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
92 remember( $s, $s->form ) if $s->form =~ /\S/s;
93 } elsif ($kind eq TmplTokenType::TAG && %$attr) {
94 # value [tag=input], meta
95 my $tag = lc($1) if $t =~ /^<(\S+)/s;
96 for my $a ('alt', 'content', 'title', 'value') {
98 next if $a eq 'content' && $tag ne 'meta';
99 next if $a eq 'value' && ($tag ne 'input'
100 || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio)$/)); # FIXME
101 my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
102 $val = TmplTokenizer::trim $val;
103 remember( $s, $val ) if $val =~ /\S/s;
110 ###############################################################################
112 sub generate_strings_list () {
113 # Emit all extracted strings.
114 for my $t (string_list) {
115 printf OUTPUT "%s\n", $t # unless negligible_p($t);
119 ###############################################################################
121 sub generate_po_file () {
122 # We don't emit the Plural-Forms header; it's meaningless for us
123 my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
124 $pot_charset = TmplTokenizer::charset_canon $pot_charset;
126 # SOME DESCRIPTIVE TITLE.
127 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
128 # This file is distributed under the same license as the PACKAGE package.
129 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
134 "Project-Id-Version: PACKAGE VERSION\\n"
135 "POT-Creation-Date: 2004-02-05 20:55-0500\\n"
136 "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
137 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
138 "Language-Team: LANGUAGE <LL\@li.org>\\n"
139 "MIME-Version: 1.0\\n"
140 "Content-Type: text/plain; charset=$pot_charset\\n"
141 "Content-Transfer-Encoding: 8bit\\n"
144 my $directory_re = quotemeta("$directory/");
145 for my $t (string_list) {
146 #next if negligible_p($t);
148 for my $token (@{$text{$t}}) {
149 my $pathname = $token->pathname;
150 $pathname =~ s/^$directory_re//os;
151 printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number
152 if defined $pathname && defined $token->line_number;
153 $cformat_p = 1 if $token->type == TmplTokenType::TEXT_PARAMETRIZED;
155 printf OUTPUT "#, c-format\n" if $cformat_p;
156 printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po
157 TmplTokenizer::string_canon
158 TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
159 printf OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
160 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
164 ###############################################################################
166 sub convert_translation_file () {
167 open(INPUT, "<$convert_from") || die "$convert_from: $!\n";
168 VerboseWarnings::set_input_file_name $convert_from;
171 my($msgid, $msgstr) = split(/\t/);
172 die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
173 unless defined $msgstr;
175 # Fixup some of the bad strings
176 $msgid =~ s/^SELECTED>//;
179 my $token = TmplToken->new( $msgid, TmplTokenType::UNKNOWN, undef, undef );
180 remember( $token, $msgid );
181 $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
182 $translation{$msgid} = $msgstr unless $msgstr eq '*****';
184 if ($msgid =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
185 my $candidate = TmplTokenizer::charset_canon $2;
186 die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
187 if defined $charset_in && $charset_in ne $candidate;
188 $charset_in = $candidate;
190 if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
191 my $candidate = TmplTokenizer::charset_canon $2;
192 die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
193 if defined $charset_out && $charset_out ne $candidate;
194 $charset_out = $candidate;
197 # The following assumption is correct; that's what HTML::Template assumes
198 if (!defined $charset_in) {
199 $charset_in = $charset_out = TmplTokenizer::charset_canon 'iso8859-1';
200 warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
204 ###############################################################################
208 my $h = $exitcode? *STDERR: *STDOUT;
211 Extract translatable strings from given HTML::Template input files.
214 -f, --files-from=FILE Get list of input files from FILE
215 -D, --directory=DIRECTORY Add DIRECTORY to list for input files search
217 Output file location:
218 -o, --output=FILE Write output to specified file
220 HTML::Template options:
221 -a, --extract-all Extract all strings
222 --pedantic-warnings Issue warnings even for detected problems
223 which are likely to be harmless
226 -s, --sort-output generate sorted output
227 -F, --sort-by-file sort output by file location
230 --help Display this help and exit
235 ###############################################################################
237 sub usage_error (;$) {
238 print STDERR "$_[0]\n" if @_;
239 print STDERR "Try `$0 --help' for more information.\n";
243 ###############################################################################
245 Getopt::Long::config qw( bundling no_auto_abbrev );
247 'a|extract-all' => \$extract_all_p,
248 'charset=s' => sub { $charset_in = $charset_out = $_[1] }, # INTERNAL
249 'convert-from=s' => \$convert_from,
250 'D|directory=s' => \$directory,
251 'f|files-from=s' => \$files_from,
252 'I|input-charset=s' => \$charset_in, # INTERNAL
253 'pedantic-warnings|pedantic' => sub { $pedantic_p = 1 },
254 'O|output-charset=s' => \$charset_out, # INTERNAL
255 'output|o=s' => \$output,
256 's|sort-output' => sub { $sort = 's' },
257 'F|sort-by-file' => sub { $sort = 'F' },
258 'help' => sub { usage(0) },
261 VerboseWarnings::set_application_name $0;
262 VerboseWarnings::set_pedantic_mode $pedantic_p;
264 usage_error('Missing mandatory option -f')
265 unless defined $files_from || defined $convert_from;
266 $directory = '.' unless defined $directory;
268 usage_error('You cannot specify both --convert-from and --files-from')
269 if defined $convert_from && defined $files_from;
271 if (defined $output && $output ne '-') {
272 open(OUTPUT, ">$output") || die "$output: $!\n";
274 open(OUTPUT, ">&STDOUT");
277 if (defined $files_from) {
278 open(INPUT, "<$files_from") || die "$files_from: $!\n";
281 my $h = TmplTokenizer->new( "$directory/$_" );
282 $h->set_allow_cformat( 1 );
283 VerboseWarnings::set_input_file_name "$directory/$_";
288 convert_translation_file;
292 warn "This input will not work with Mozilla standards-compliant mode\n", undef
293 if TmplTokenizer::syntaxerror_p;
296 exit(-1) if TmplTokenizer::fatal_p;
298 ###############################################################################
302 This is an experimental script based on the modularized
303 text-extract2.pl script. It has behaviour similar to
304 xgettext(1), and generates gettext-compatible output files.
306 A gettext-like format provides the following advantages:
313 Translation to non-English-like languages with different word
314 order: gettext's c-format strings can theoretically be
315 emulated if we are able to do some analysis on the .tmpl input
316 and treat <TMPL_VAR> in a way similar to %s.
320 Context for the extracted strings: the gettext format provides
321 the filenames and line numbers where each string can be found.
322 The translator can read the source file and see the context,
323 in case the string by itself can mean several different things.
327 Place for the translator to add comments about the translations.
331 Gettext-compatible tools, if any, might be usable if we adopt
336 Note that this script is experimental and should still be
339 Please refer to the explanation in tmpl_process3 for further
342 If you want to generate GNOME-style POTFILES.in files, such
343 files (passed to -f) can be generated thus:
345 (cd ../.. && find koha-tmpl/opac-tmpl/default/en
346 -name \*.inc -o -name \*.tmpl) > opac/POTFILES.in
347 (cd ../.. && find koha-tmpl/intranet-tmpl/default/en
348 -name \*.inc -o -name \*.tmpl) > intranet/POTFILES.in
350 This is, however, quite pointless, because the "create" and
351 "update" actions have already been implemented in tmpl_process3.pl.
362 There probably are some. Bugs related to scanning of <INPUT>
363 tags seem to be especially likely to be present.
365 Its diagnostics are probably too verbose.
367 When a <TMPL_VAR> within a JavaScript-related attribute is
368 detected, the script currently displays no warnings at all.
369 It might be good to display some kind of warning.
371 Its sort order (-s option) seems to be different than the real
372 xgettext(1)'s sort option. This will result in translation
373 strings inside the generated PO file spuriously moving about
374 when tmpl_process3.pl calls msgmerge(1) to update the PO file.