5 xgettext.pl - xgettext(1)-like interface for .tmpl strings extraction
16 use vars qw( $convert_from );
17 use vars qw( $files_from $directory $output $sort );
18 use vars qw( $extract_all_p );
19 use vars qw( $pedantic_p );
20 use vars qw( %text %translation );
21 use vars qw( $charset_in $charset_out );
22 use vars qw( $verbose_p );
23 use vars qw( $po_mode_p );
25 ###############################################################################
27 sub string_negligible_p ($) {
28 my($t) = @_; # a string
29 # Don't emit pure whitespace, pure numbers, pure punctuation,
30 # single letters, or TMPL_VAR's.
31 # Punctuation should arguably be translated. But without context
32 # they are untranslatable. Note that $t is a string, not a token object.
33 return !$extract_all_p && (
34 TmplTokenizer::blank_p($t) # blank or TMPL_VAR
35 || $t =~ /^\d+$/ # purely digits
36 || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
37 || $t =~ /^[A-Za-z]$/ # single letters
41 sub token_negligible_p( $ ) {
44 return !$extract_all_p && (
45 $t == TmplTokenType::TEXT? string_negligible_p( $x->string ):
46 $t == TmplTokenType::DIRECTIVE? 1:
47 $t == TmplTokenType::TEXT_PARAMETRIZED
48 && join( '', map { my $t = $_->type;
49 $t == TmplTokenType::DIRECTIVE?
50 '1': $t == TmplTokenType::TAG?
51 '': token_negligible_p( $_ )?
52 '': '1' } @{$x->children} ) eq '' );
55 ###############################################################################
58 my($token, $string) = @_;
59 # If we determine that the string is negligible, don't bother to remember
60 unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
61 my $key = TmplTokenizer::string_canon( $string );
62 $text{$key} = [] unless defined $text{$key};
63 push @{$text{$key}}, $token;
67 ###############################################################################
71 # The real gettext tools seems to sort case sensitively; I don't know why
72 @t = sort { $a cmp $b } @t if $sort eq 's';
74 my @aa = sort { $a->pathname cmp $b->pathname
75 || $a->line_number <=> $b->line_number } @{$text{$a}};
76 my @bb = sort { $a->pathname cmp $b->pathname
77 || $a->line_number <=> $b->line_number } @{$text{$b}};
78 $aa[0]->pathname cmp $bb[0]->pathname
79 || $aa[0]->line_number <=> $bb[0]->line_number;
84 ###############################################################################
86 sub text_extract (*) {
89 my $s = TmplTokenizer::next_token $h;
90 last unless defined $s;
91 my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
92 if ($kind eq TmplTokenType::TEXT) {
93 remember( $s, $t ) if $t =~ /\S/s;
94 } elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
95 remember( $s, $s->form ) if $s->form =~ /\S/s;
96 } elsif ($kind eq TmplTokenType::TAG && %$attr) {
97 # value [tag=input], meta
98 my $tag = lc($1) if $t =~ /^<(\S+)/s;
99 for my $a ('alt', 'content', 'title', 'value') {
101 next if $a eq 'content' && $tag ne 'meta';
102 next if $a eq 'value' && ($tag ne 'input'
103 || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio)$/)); # FIXME
104 my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
105 $val = TmplTokenizer::trim $val;
106 remember( $s, $val ) if $val =~ /\S/s;
113 ###############################################################################
115 sub generate_strings_list () {
116 # Emit all extracted strings.
117 for my $t (string_list) {
118 printf OUTPUT "%s\n", $t # unless negligible_p($t);
122 ###############################################################################
124 sub generate_po_file () {
125 # We don't emit the Plural-Forms header; it's meaningless for us
126 my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
127 $pot_charset = TmplTokenizer::charset_canon $pot_charset;
128 # Time stamps aren't exactly right semantically. I don't know how to fix it.
129 my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
130 my $time_pot = $time;
131 my $time_po = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
133 # SOME DESCRIPTIVE TITLE.
134 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
135 # This file is distributed under the same license as the PACKAGE package.
136 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
141 "Project-Id-Version: PACKAGE VERSION\\n"
142 "POT-Creation-Date: $time_pot\\n"
143 "PO-Revision-Date: $time_po\\n"
144 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
145 "Language-Team: LANGUAGE <LL\@li.org>\\n"
146 "MIME-Version: 1.0\\n"
147 "Content-Type: text/plain; charset=$pot_charset\\n"
148 "Content-Transfer-Encoding: 8bit\\n"
151 my $directory_re = quotemeta("$directory/");
152 for my $t (string_list) {
153 #next if negligible_p($t);
155 for my $token (@{$text{$t}}) {
156 my $pathname = $token->pathname;
157 $pathname =~ s/^$directory_re//os;
158 printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number
159 if defined $pathname && defined $token->line_number;
160 $cformat_p = 1 if $token->type == TmplTokenType::TEXT_PARAMETRIZED;
162 printf OUTPUT "#, c-format\n" if $cformat_p;
163 printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po
164 TmplTokenizer::string_canon
165 TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
166 printf OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
167 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
171 ###############################################################################
173 sub convert_translation_file () {
174 open(INPUT, "<$convert_from") || die "$convert_from: $!\n";
175 VerboseWarnings::set_input_file_name $convert_from;
178 my($msgid, $msgstr) = split(/\t/);
179 die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
180 unless defined $msgstr;
182 # Fixup some of the bad strings
183 $msgid =~ s/^SELECTED>//;
186 my $token = TmplToken->new( $msgid, TmplTokenType::UNKNOWN, undef, undef );
187 remember( $token, $msgid );
188 $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
189 $translation{$msgid} = $msgstr unless $msgstr eq '*****';
191 if ($msgid =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
192 my $candidate = TmplTokenizer::charset_canon $2;
193 die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
194 if defined $charset_in && $charset_in ne $candidate;
195 $charset_in = $candidate;
197 if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
198 my $candidate = TmplTokenizer::charset_canon $2;
199 die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
200 if defined $charset_out && $charset_out ne $candidate;
201 $charset_out = $candidate;
204 # The following assumption is correct; that's what HTML::Template assumes
205 if (!defined $charset_in) {
206 $charset_in = $charset_out = TmplTokenizer::charset_canon 'iso8859-1';
207 warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
211 ###############################################################################
215 my $h = $exitcode? *STDERR: *STDOUT;
218 Extract translatable strings from given HTML::Template input files.
221 -f, --files-from=FILE Get list of input files from FILE
222 -D, --directory=DIRECTORY Add DIRECTORY to list for input files search
224 Output file location:
225 -o, --output=FILE Write output to specified file
227 HTML::Template options:
228 -a, --extract-all Extract all strings
229 --pedantic-warnings Issue warnings even for detected problems
230 which are likely to be harmless
233 -s, --sort-output generate sorted output
234 -F, --sort-by-file sort output by file location
235 -v, --verbose explain what is being done
238 --help Display this help and exit
240 Try `perldoc $0' for perhaps more information.
245 ###############################################################################
247 sub usage_error (;$) {
248 print STDERR "$_[0]\n" if @_;
249 print STDERR "Try `$0 --help' for more information.\n";
253 ###############################################################################
255 Getopt::Long::config qw( bundling no_auto_abbrev );
257 'a|extract-all' => \$extract_all_p,
258 'charset=s' => sub { $charset_in = $charset_out = $_[1] }, # INTERNAL
259 'convert-from=s' => \$convert_from,
260 'D|directory=s' => \$directory,
261 'f|files-from=s' => \$files_from,
262 'I|input-charset=s' => \$charset_in, # INTERNAL
263 'pedantic-warnings|pedantic' => sub { $pedantic_p = 1 },
264 'O|output-charset=s' => \$charset_out, # INTERNAL
265 'output|o=s' => \$output,
266 'po-mode' => \$po_mode_p, # INTERNAL
267 's|sort-output' => sub { $sort = 's' },
268 'F|sort-by-file' => sub { $sort = 'F' },
269 'v|verbose' => \$verbose_p,
270 'help' => sub { usage(0) },
273 VerboseWarnings::set_application_name $0;
274 VerboseWarnings::set_pedantic_mode $pedantic_p;
276 usage_error('Missing mandatory option -f')
277 unless defined $files_from || defined $convert_from;
278 $directory = '.' unless defined $directory;
280 usage_error('You cannot specify both --convert-from and --files-from')
281 if defined $convert_from && defined $files_from;
283 if (defined $output && $output ne '-') {
284 print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
285 open(OUTPUT, ">$output") || die "$output: $!\n";
287 print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
288 open(OUTPUT, ">&STDOUT");
291 if (defined $files_from) {
292 print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
293 open(INPUT, "<$files_from") || die "$files_from: $!\n";
296 my $input = /^\//? $_: "$directory/$_";
297 my $h = TmplTokenizer->new( $input );
298 $h->set_allow_cformat( 1 );
299 VerboseWarnings::set_input_file_name $input;
300 print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
305 print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
306 convert_translation_file;
310 warn "This input will not work with Mozilla standards-compliant mode\n", undef
311 if TmplTokenizer::syntaxerror_p;
314 exit(-1) if TmplTokenizer::fatal_p;
316 ###############################################################################
320 This is an experimental script based on the modularized
321 text-extract2.pl script. It has behaviour similar to
322 xgettext(1), and generates gettext-compatible output files.
324 A gettext-like format provides the following advantages:
331 Translation to non-English-like languages with different word
332 order: gettext's c-format strings can theoretically be
333 emulated if we are able to do some analysis on the .tmpl input
334 and treat <TMPL_VAR> in a way similar to %s.
338 Context for the extracted strings: the gettext format provides
339 the filenames and line numbers where each string can be found.
340 The translator can read the source file and see the context,
341 in case the string by itself can mean several different things.
345 Place for the translator to add comments about the translations.
349 Gettext-compatible tools, if any, might be usable if we adopt
354 Note that this script is experimental and should still be
357 Please refer to the explanation in tmpl_process3 for further
360 If you want to generate GNOME-style POTFILES.in files, such
361 files (passed to -f) can be generated thus:
363 (cd ../.. && find koha-tmpl/opac-tmpl/default/en
364 -name \*.inc -o -name \*.tmpl) > opac/POTFILES.in
365 (cd ../.. && find koha-tmpl/intranet-tmpl/default/en
366 -name \*.inc -o -name \*.tmpl) > intranet/POTFILES.in
368 This is, however, quite pointless, because the "create" and
369 "update" actions have already been implemented in tmpl_process3.pl.
380 There probably are some. Bugs related to scanning of <INPUT>
381 tags seem to be especially likely to be present.
383 Its diagnostics are probably too verbose.
385 When a <TMPL_VAR> within a JavaScript-related attribute is
386 detected, the script currently displays no warnings at all.
387 It might be good to display some kind of warning.
389 Its sort order (-s option) seems to be different than the real
390 xgettext(1)'s sort option. This will result in translation
391 strings inside the generated PO file spuriously moving about
392 when tmpl_process3.pl calls msgmerge(1) to update the PO file.