5 xgettext.pl - xgettext(1)-like interface for .tmpl strings extraction
16 use vars qw( $convert_from );
17 use vars qw( $files_from $directory $output $sort );
18 use vars qw( $extract_all_p );
19 use vars qw( $pedantic_p );
20 use vars qw( %text %translation );
21 use vars qw( $charset_in $charset_out );
22 use vars qw( $verbose_p );
23 use vars qw( $po_mode_p );
25 ###############################################################################
27 sub string_negligible_p ($) {
28 my($t) = @_; # a string
29 # Don't emit pure whitespace, pure numbers, pure punctuation,
30 # single letters, or TMPL_VAR's.
31 # Punctuation should arguably be translated. But without context
32 # they are untranslatable. Note that $t is a string, not a token object.
33 return !$extract_all_p && (
34 TmplTokenizer::blank_p($t) # blank or TMPL_VAR
35 || $t =~ /^\d+$/ # purely digits
36 || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
37 || $t =~ /^[A-Za-z]$/ # single letters
41 sub token_negligible_p( $ ) {
44 return !$extract_all_p && (
45 $t == TmplTokenType::TEXT? string_negligible_p( $x->string ):
46 $t == TmplTokenType::DIRECTIVE? 1:
47 $t == TmplTokenType::TEXT_PARAMETRIZED
48 && join( '', map { my $t = $_->type;
49 $t == TmplTokenType::DIRECTIVE?
50 '1': $t == TmplTokenType::TAG?
51 '': token_negligible_p( $_ )?
52 '': '1' } @{$x->children} ) eq '' );
55 ###############################################################################
58 my($token, $string) = @_;
59 # If we determine that the string is negligible, don't bother to remember
60 unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
61 my $key = TmplTokenizer::string_canon( $string );
62 $text{$key} = [] unless defined $text{$key};
63 push @{$text{$key}}, $token;
67 ###############################################################################
71 # The real gettext tools seems to sort case sensitively; I don't know why
72 @t = sort { $a cmp $b } @t if $sort eq 's';
74 my @aa = sort { $a->pathname cmp $b->pathname
75 || $a->line_number <=> $b->line_number } @{$text{$a}};
76 my @bb = sort { $a->pathname cmp $b->pathname
77 || $a->line_number <=> $b->line_number } @{$text{$b}};
78 $aa[0]->pathname cmp $bb[0]->pathname
79 || $aa[0]->line_number <=> $bb[0]->line_number;
84 ###############################################################################
86 sub text_extract (*) {
89 my $s = TmplTokenizer::next_token $h;
90 last unless defined $s;
91 my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
92 if ($kind eq TmplTokenType::TEXT) {
93 remember( $s, $t ) if $t =~ /\S/s;
94 } elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
95 remember( $s, $s->form ) if $s->form =~ /\S/s;
96 } elsif ($kind eq TmplTokenType::TAG && %$attr) {
97 # value [tag=input], meta
98 my $tag = lc($1) if $t =~ /^<(\S+)/s;
99 for my $a ('alt', 'content', 'title', 'value') {
101 next if $a eq 'content' && $tag ne 'meta';
102 next if $a eq 'value' && ($tag ne 'input'
103 || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio)$/)); # FIXME
104 my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
105 $val = TmplTokenizer::trim $val;
106 remember( $s, $val ) if $val =~ /\S/s;
113 ###############################################################################
115 sub generate_strings_list () {
116 # Emit all extracted strings.
117 for my $t (string_list) {
118 printf OUTPUT "%s\n", $t # unless negligible_p($t);
122 ###############################################################################
124 sub generate_po_file () {
125 # We don't emit the Plural-Forms header; it's meaningless for us
126 my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
127 $pot_charset = TmplTokenizer::charset_canon $pot_charset;
128 # Time stamps aren't exactly right semantically. I don't know how to fix it.
129 my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
130 my $time_pot = $time;
131 my $time_po = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
133 # SOME DESCRIPTIVE TITLE.
134 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
135 # This file is distributed under the same license as the PACKAGE package.
136 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
141 "Project-Id-Version: PACKAGE VERSION\\n"
142 "POT-Creation-Date: $time_pot\\n"
143 "PO-Revision-Date: $time_po\\n"
144 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
145 "Language-Team: LANGUAGE <LL\@li.org>\\n"
146 "MIME-Version: 1.0\\n"
147 "Content-Type: text/plain; charset=$pot_charset\\n"
148 "Content-Transfer-Encoding: 8bit\\n"
151 my $directory_re = quotemeta("$directory/");
152 for my $t (string_list) {
153 #next if negligible_p($t);
155 for my $token (@{$text{$t}}) {
156 my $pathname = $token->pathname;
157 $pathname =~ s/^$directory_re//os;
158 printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number
159 if defined $pathname && defined $token->line_number;
160 $cformat_p = 1 if $token->type == TmplTokenType::TEXT_PARAMETRIZED;
162 printf OUTPUT "#, c-format\n" if $cformat_p;
163 printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po
164 TmplTokenizer::string_canon
165 TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
166 printf OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
167 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
171 ###############################################################################
173 sub convert_translation_file () {
174 open(INPUT, "<$convert_from") || die "$convert_from: $!\n";
175 VerboseWarnings::set_input_file_name $convert_from;
178 my($msgid, $msgstr) = split(/\t/);
179 die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
180 unless defined $msgstr;
182 # Fixup some of the bad strings
183 $msgid =~ s/^SELECTED>//;
186 my $token = TmplToken->new( $msgid, TmplTokenType::UNKNOWN, undef, undef );
187 remember( $token, $msgid );
188 $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
189 $translation{$msgid} = $msgstr unless $msgstr eq '*****';
191 if ($msgid =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
192 my $candidate = TmplTokenizer::charset_canon $2;
193 die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
194 if defined $charset_in && $charset_in ne $candidate;
195 $charset_in = $candidate;
197 if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
198 my $candidate = TmplTokenizer::charset_canon $2;
199 die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
200 if defined $charset_out && $charset_out ne $candidate;
201 $charset_out = $candidate;
204 # The following assumption is correct; that's what HTML::Template assumes
205 if (!defined $charset_in) {
206 $charset_in = $charset_out = TmplTokenizer::charset_canon 'iso8859-1';
207 warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
211 ###############################################################################
215 my $h = $exitcode? *STDERR: *STDOUT;
218 Extract translatable strings from given HTML::Template input files.
221 -f, --files-from=FILE Get list of input files from FILE
222 -D, --directory=DIRECTORY Add DIRECTORY to list for input files search
224 Output file location:
225 -o, --output=FILE Write output to specified file
227 HTML::Template options:
228 -a, --extract-all Extract all strings
229 --pedantic-warnings Issue warnings even for detected problems
230 which are likely to be harmless
233 -s, --sort-output generate sorted output
234 -F, --sort-by-file sort output by file location
235 -v, --verbose explain what is being done
238 --help Display this help and exit
243 ###############################################################################
245 sub usage_error (;$) {
246 print STDERR "$_[0]\n" if @_;
247 print STDERR "Try `$0 --help' for more information.\n";
251 ###############################################################################
253 Getopt::Long::config qw( bundling no_auto_abbrev );
255 'a|extract-all' => \$extract_all_p,
256 'charset=s' => sub { $charset_in = $charset_out = $_[1] }, # INTERNAL
257 'convert-from=s' => \$convert_from,
258 'D|directory=s' => \$directory,
259 'f|files-from=s' => \$files_from,
260 'I|input-charset=s' => \$charset_in, # INTERNAL
261 'pedantic-warnings|pedantic' => sub { $pedantic_p = 1 },
262 'O|output-charset=s' => \$charset_out, # INTERNAL
263 'output|o=s' => \$output,
264 'po-mode' => \$po_mode_p, # INTERNAL
265 's|sort-output' => sub { $sort = 's' },
266 'F|sort-by-file' => sub { $sort = 'F' },
267 'v|verbose' => \$verbose_p,
268 'help' => sub { usage(0) },
271 VerboseWarnings::set_application_name $0;
272 VerboseWarnings::set_pedantic_mode $pedantic_p;
274 usage_error('Missing mandatory option -f')
275 unless defined $files_from || defined $convert_from;
276 $directory = '.' unless defined $directory;
278 usage_error('You cannot specify both --convert-from and --files-from')
279 if defined $convert_from && defined $files_from;
281 if (defined $output && $output ne '-') {
282 print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
283 open(OUTPUT, ">$output") || die "$output: $!\n";
285 print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
286 open(OUTPUT, ">&STDOUT");
289 if (defined $files_from) {
290 print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
291 open(INPUT, "<$files_from") || die "$files_from: $!\n";
294 my $input = /^\//? $_: "$directory/$_";
295 my $h = TmplTokenizer->new( $input );
296 $h->set_allow_cformat( 1 );
297 VerboseWarnings::set_input_file_name $input;
298 print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
303 print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
304 convert_translation_file;
308 warn "This input will not work with Mozilla standards-compliant mode\n", undef
309 if TmplTokenizer::syntaxerror_p;
312 exit(-1) if TmplTokenizer::fatal_p;
314 ###############################################################################
318 This is an experimental script based on the modularized
319 text-extract2.pl script. It has behaviour similar to
320 xgettext(1), and generates gettext-compatible output files.
322 A gettext-like format provides the following advantages:
329 Translation to non-English-like languages with different word
330 order: gettext's c-format strings can theoretically be
331 emulated if we are able to do some analysis on the .tmpl input
332 and treat <TMPL_VAR> in a way similar to %s.
336 Context for the extracted strings: the gettext format provides
337 the filenames and line numbers where each string can be found.
338 The translator can read the source file and see the context,
339 in case the string by itself can mean several different things.
343 Place for the translator to add comments about the translations.
347 Gettext-compatible tools, if any, might be usable if we adopt
352 Note that this script is experimental and should still be
355 Please refer to the explanation in tmpl_process3 for further
358 If you want to generate GNOME-style POTFILES.in files, such
359 files (passed to -f) can be generated thus:
361 (cd ../.. && find koha-tmpl/opac-tmpl/default/en
362 -name \*.inc -o -name \*.tmpl) > opac/POTFILES.in
363 (cd ../.. && find koha-tmpl/intranet-tmpl/default/en
364 -name \*.inc -o -name \*.tmpl) > intranet/POTFILES.in
366 This is, however, quite pointless, because the "create" and
367 "update" actions have already been implemented in tmpl_process3.pl.
378 There probably are some. Bugs related to scanning of <INPUT>
379 tags seem to be especially likely to be present.
381 Its diagnostics are probably too verbose.
383 When a <TMPL_VAR> within a JavaScript-related attribute is
384 detected, the script currently displays no warnings at all.
385 It might be good to display some kind of warning.
387 Its sort order (-s option) seems to be different than the real
388 xgettext(1)'s sort option. This will result in translation
389 strings inside the generated PO file spuriously moving about
390 when tmpl_process3.pl calls msgmerge(1) to update the PO file.