5 xgettext.pl - xgettext(1)-like interface for .tmpl strings extraction
15 use vars qw( $convert_from );
16 use vars qw( $files_from $directory $output $sort );
17 use vars qw( $extract_all_p );
18 use vars qw( $pedantic_p );
19 use vars qw( %text %translation );
20 use vars qw( $charset_in $charset_out );
22 ###############################################################################
24 sub string_negligible_p ($) {
25 my($t) = @_; # a string
26 # Don't emit pure whitespace, pure numbers, pure punctuation,
27 # single letters, or TMPL_VAR's.
28 # Punctuation should arguably be translated. But without context
29 # they are untranslatable. Note that $t is a string, not a token object.
30 return !$extract_all_p && (
31 TmplTokenizer::blank_p($t) # blank or TMPL_VAR
32 || $t =~ /^\d+$/ # purely digits
33 || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
34 || $t =~ /^[A-Za-z]$/ # single letters
38 sub token_negligible_p( $ ) {
41 return !$extract_all_p && (
42 $t == TmplTokenType::TEXT? string_negligible_p( $x->string ):
43 $t == TmplTokenType::DIRECTIVE? 1:
44 $t == TmplTokenType::TEXT_PARAMETRIZED
45 && join( '', map { my $t = $_->type;
46 $t == TmplTokenType::DIRECTIVE?
47 '1': $t == TmplTokenType::TAG?
48 '': token_negligible_p( $_ )?
49 '': '1' } @{$x->children} ) eq '' );
52 ###############################################################################
55 my($token, $string) = @_;
56 # If we determine that the string is negligible, don't bother to remember
57 unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
58 $text{$string} = [] unless defined $text{$string};
59 push @{$text{$string}}, $token;
63 ###############################################################################
67 # The real gettext tools seems to sort case sensitively; I don't know why
68 @t = sort { $a cmp $b } @t if $sort eq 's';
70 my @aa = sort { $a->pathname cmp $b->pathname
71 || $a->line_number <=> $b->line_number } @{$text{$a}};
72 my @bb = sort { $a->pathname cmp $b->pathname
73 || $a->line_number <=> $b->line_number } @{$text{$b}};
74 $aa[0]->pathname cmp $bb[0]->pathname
75 || $aa[0]->line_number <=> $bb[0]->line_number;
80 ###############################################################################
82 sub text_extract (*) {
85 my $s = TmplTokenizer::next_token $h;
86 last unless defined $s;
87 my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
88 if ($kind eq TmplTokenType::TEXT) {
89 remember( $s, $t ) if $t =~ /\S/s;
90 } elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
91 remember( $s, $s->form ) if $s->form =~ /\S/s;
92 } elsif ($kind eq TmplTokenType::TAG && %$attr) {
93 # value [tag=input], meta
94 my $tag = lc($1) if $t =~ /^<(\S+)/s;
95 for my $a ('alt', 'content', 'title', 'value') {
97 next if $a eq 'content' && $tag ne 'meta';
98 next if $a eq 'value' && ($tag ne 'input'
99 || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio)$/)); # FIXME
100 my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
101 $val = TmplTokenizer::trim $val;
102 remember( $s, $val ) if $val =~ /\S/s;
109 ###############################################################################
111 sub generate_strings_list () {
112 # Emit all extracted strings.
113 for my $t (string_list) {
114 printf OUTPUT "%s\n", $t # unless negligible_p($t);
118 ###############################################################################
120 sub generate_po_file () {
121 # We don't emit the Plural-Forms header; it's meaningless for us
122 my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
123 $pot_charset = TmplTokenizer::charset_canon $pot_charset;
125 # SOME DESCRIPTIVE TITLE.
126 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
127 # This file is distributed under the same license as the PACKAGE package.
128 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
133 "Project-Id-Version: PACKAGE VERSION\\n"
134 "POT-Creation-Date: 2004-02-05 20:55-0500\\n"
135 "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
136 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
137 "Language-Team: LANGUAGE <LL\@li.org>\\n"
138 "MIME-Version: 1.0\\n"
139 "Content-Type: text/plain; charset=$pot_charset\\n"
140 "Content-Transfer-Encoding: 8bit\\n"
143 my $directory_re = quotemeta("$directory/");
144 for my $t (string_list) {
145 #next if negligible_p($t);
147 for my $token (@{$text{$t}}) {
148 my $pathname = $token->pathname;
149 $pathname =~ s/^$directory_re//os;
150 printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number
151 if defined $pathname && defined $token->line_number;
152 $cformat_p = 1 if $token->type == TmplTokenType::TEXT_PARAMETRIZED;
154 printf OUTPUT "#, c-format\n" if $cformat_p;
155 printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po
156 TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
157 printf OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
158 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
162 ###############################################################################
164 sub convert_translation_file () {
165 open(INPUT, "<$convert_from") || die "$convert_from: $!\n";
166 VerboseWarnings::set_input_file_name $convert_from;
169 my($msgid, $msgstr) = split(/\t/);
170 die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
171 unless defined $msgstr;
173 # Fixup some of the bad strings
174 $msgid =~ s/^SELECTED>//;
177 my $token = TmplToken->new( $msgid, TmplTokenType::UNKNOWN, undef, undef );
178 remember( $token, $msgid );
179 $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
180 $translation{$msgid} = $msgstr unless $msgstr eq '*****';
182 if ($msgid =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
183 my $candidate = TmplTokenizer::charset_canon $2;
184 die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
185 if defined $charset_in && $charset_in ne $candidate;
186 $charset_in = $candidate;
188 if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
189 my $candidate = TmplTokenizer::charset_canon $2;
190 die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
191 if defined $charset_out && $charset_out ne $candidate;
192 $charset_out = $candidate;
195 # The following assumption is correct; that's what HTML::Template assumes
196 if (!defined $charset_in) {
197 $charset_in = $charset_out = TmplTokenizer::charset_canon 'iso8859-1';
198 warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
202 ###############################################################################
206 my $h = $exitcode? *STDERR: *STDOUT;
209 Extract translatable strings from given HTML::Template input files.
212 -f, --files-from=FILE Get list of input files from FILE
213 -D, --directory=DIRECTORY Add DIRECTORY to list for input files search
215 Output file location:
216 -o, --output=FILE Write output to specified file
218 HTML::Template options:
219 -a, --extract-all Extract all strings
220 --pedantic-warnings Issue warnings even for detected problems
221 which are likely to be harmless
224 -s, --sort-output generate sorted output
225 -F, --sort-by-file sort output by file location
228 --help Display this help and exit
233 ###############################################################################
235 sub usage_error (;$) {
236 print STDERR "$_[0]\n" if @_;
237 print STDERR "Try `$0 --help' for more information.\n";
241 ###############################################################################
243 Getopt::Long::config qw( bundling no_auto_abbrev );
245 'a|extract-all' => \$extract_all_p,
246 'charset=s' => sub { $charset_in = $charset_out = $_[1] }, # INTERNAL
247 'convert-from=s' => \$convert_from,
248 'D|directory=s' => \$directory,
249 'f|files-from=s' => \$files_from,
250 'I|input-charset=s' => \$charset_in, # INTERNAL
251 'pedantic-warnings|pedantic' => sub { $pedantic_p = 1 },
252 'O|output-charset=s' => \$charset_out, # INTERNAL
253 'output|o=s' => \$output,
254 's|sort-output' => sub { $sort = 's' },
255 'F|sort-by-file' => sub { $sort = 'F' },
256 'help' => sub { usage(0) },
259 VerboseWarnings::set_application_name $0;
260 VerboseWarnings::set_pedantic_mode $pedantic_p;
262 usage_error('Missing mandatory option -f')
263 unless defined $files_from || defined $convert_from;
264 $directory = '.' unless defined $directory;
266 usage_error('You cannot specify both --convert-from and --files-from')
267 if defined $convert_from && defined $files_from;
269 if (defined $output && $output ne '-') {
270 open(OUTPUT, ">$output") || die "$output: $!\n";
272 open(OUTPUT, ">&STDOUT");
275 if (defined $files_from) {
276 open(INPUT, "<$files_from") || die "$files_from: $!\n";
279 my $h = TmplTokenizer->new( "$directory/$_" );
280 $h->set_allow_cformat( 1 );
281 VerboseWarnings::set_input_file_name "$directory/$_";
286 convert_translation_file;
290 warn "This input will not work with Mozilla standards-compliant mode\n", undef
291 if TmplTokenizer::syntaxerror_p;
294 exit(-1) if TmplTokenizer::fatal_p;
296 ###############################################################################
300 This is an experimental script based on the modularized
301 text-extract2.pl script. It has behaviour similar to
302 xgettext(1), and generates gettext-compatible output files.
304 A gettext-like format provides the following advantages:
311 Translation to non-English-like languages with different word
312 order: gettext's c-format strings can theoretically be
313 emulated if we are able to do some analysis on the .tmpl input
314 and treat <TMPL_VAR> in a way similar to %s.
318 Context for the extracted strings: the gettext format provides
319 the filenames and line numbers where each string can be found.
320 The translator can read the source file and see the context,
321 in case the string by itself can mean several different things.
325 Place for the translator to add comments about the translations.
329 Gettext-compatible tools, if any, might be usable if we adopt
334 Note that this script is experimental and should still be
337 Please refer to the explanation in tmpl_process3 for further
340 If you want to generate GNOME-style POTFILES.in files, such
341 files (passed to -f) can be generated thus:
343 (cd ../.. && find koha-tmpl/opac-tmpl/default/en
344 -name \*.inc -o -name \*.tmpl) > opac/POTFILES.in
345 (cd ../.. && find koha-tmpl/intranet-tmpl/default/en
346 -name \*.inc -o -name \*.tmpl) > intranet/POTFILES.in
348 This is, however, quite pointless, because the "create" and
349 "update" actions have already been implemented in tmpl_process3.pl.
360 There probably are some. Bugs related to scanning of <INPUT>
361 tags seem to be especially likely to be present.
363 Its diagnostics are probably too verbose.
365 When a <TMPL_VAR> within a JavaScript-related attribute is
366 detected, the script currently displays no warnings at all.
367 It might be good to display some kind of warning.
369 Its sort order (-s option) seems to be different than the real
370 xgettext(1)'s sort option. This will result in translation
371 strings inside the generated PO file spuriously moving about
372 when tmpl_process3.pl calls msgmerge(1) to update the PO file.