misc/translator/xgettext.pl

   1 #!/usr/bin/perl
   2
   3 =head1 NAME
   4
   5 xgettext.pl - xgettext(1)-like interface for .tmpl strings extraction
   6
   7 =cut
   8
   9 use strict;
  10 use Getopt::Long;
  11 use Locale::PO;
  12 use TmplTokenizer;
  13 use VerboseWarnings;
  14
  15 use vars qw( $convert_from );
  16 use vars qw( $files_from $directory $output $sort );
  17 use vars qw( $pedantic_p );
  18 use vars qw( %text %translation );
  19 use vars qw( $charset_in $charset_out );
  20
  21 ###############################################################################
  22
  23 use vars qw( @latin1_utf8 );
  24 @latin1_utf8 = (
  25     "\302\200", "\302\201", "\302\202", "\302\203", "\302\204", "\302\205",
  26     "\302\206", "\302\207", "\302\210", "\302\211", "\302\212", "\302\213",
  27     "\302\214", "\302\215",   undef,      undef,    "\302\220", "\302\221",
  28     "\302\222", "\302\223", "\302\224", "\302\225", "\302\226", "\302\227",
  29     "\302\230", "\302\231", "\302\232", "\302\233", "\302\234", "\302\235",
  30     "\302\236", "\302\237", "\302\240", "\302\241", "\302\242", "\302\243",
  31     "\302\244", "\302\245", "\302\246", "\302\247", "\302\250", "\302\251",
  32     "\302\252", "\302\253", "\302\254", "\302\255", "\302\256", "\302\257",
  33     "\302\260", "\302\261", "\302\262", "\302\263", "\302\264", "\302\265",
  34     "\302\266", "\302\267", "\302\270", "\302\271", "\302\272", "\302\273",
  35     "\302\274", "\302\275", "\302\276", "\302\277", "\303\200", "\303\201",
  36     "\303\202", "\303\203", "\303\204", "\303\205", "\303\206", "\303\207",
  37     "\303\210", "\303\211", "\303\212", "\303\213", "\303\214", "\303\215",
  38     "\303\216", "\303\217", "\303\220", "\303\221", "\303\222", "\303\223",
  39     "\303\224", "\303\225", "\303\226", "\303\227", "\303\230", "\303\231",
  40     "\303\232", "\303\233", "\303\234", "\303\235", "\303\236", "\303\237",
  41     "\303\240", "\303\241", "\303\242", "\303\243", "\303\244", "\303\245",
  42     "\303\246", "\303\247", "\303\250", "\303\251", "\303\252", "\303\253",
  43     "\303\254", "\303\255", "\303\256", "\303\257", "\303\260", "\303\261",
  44     "\303\262", "\303\263", "\303\264", "\303\265", "\303\266", "\303\267",
  45     "\303\270", "\303\271", "\303\272", "\303\273", "\303\274", "\303\275",
  46     "\303\276", "\303\277" );
  47
  48 sub charset_convert ($) {
  49     my($s) = @_;
  50     if ($s !~ /[\200-\377]/s) { # FIXME: don't worry about iso2022 for now
  51         ;
  52     } elsif ($charset_in eq 'ISO-8859-1' && $charset_out eq 'UTF-8') {
  53         $s =~ s/[\200-\377]/ $latin1_utf8[ord($&) - 128] /egs;
  54     } elsif ($charset_in ne $charset_out) {
  55         VerboseWarnings::warn_normal "conversion from $charset_in to $charset_out is not supported\n", undef;
  56     }
  57     return $s;
  58 }
  59
  60 ###############################################################################
  61
  62 sub remember ($$) {
  63     my($token, $string) = @_;
  64     $text{$string} = [] unless defined $text{$string};
  65     push @{$text{$string}}, $token;
  66 }
  67
  68 ###############################################################################
  69
  70 sub string_list () {
  71     my @t = keys %text;
  72     # The real gettext tools seems to sort case sensitively; I don't know why
  73     @t = sort { $a cmp $b } @t if $sort eq 's';
  74     @t = sort {
  75             my @aa = sort { $a->pathname cmp $b->pathname
  76                     || $a->line_number <=> $b->line_number } @{$text{$a}};
  77             my @bb = sort { $a->pathname cmp $b->pathname
  78                     || $a->line_number <=> $b->line_number } @{$text{$b}};
  79             $aa[0]->pathname cmp $bb[0]->pathname
  80                     || $aa[0]->line_number <=> $bb[0]->line_number;
  81         } @t if $sort eq 'F';
  82     return @t;
  83 }
  84
  85 ###############################################################################
  86
  87 sub text_extract (*) {
  88     my($h) = @_;
  89     for (;;) {
  90         my $s = TmplTokenizer::next_token $h;
  91     last unless defined $s;
  92         my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
  93         if ($kind eq TmplTokenType::TEXT) {
  94             #$t = TmplTokenizer::trim $t;
  95             remember( $s, $t ) if $t =~ /\S/s;
  96         } elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
  97             #$t = TmplTokenizer::trim $t;
  98             remember( $s, $s->form ) if $s->form =~ /\S/s;
  99         } elsif ($kind eq TmplTokenType::TAG && %$attr) {
 100             # value [tag=input], meta
 101             my $tag = lc($1) if $t =~ /^<(\S+)/s;
 102             for my $a ('alt', 'content', 'title', 'value') {
 103                 if ($attr->{$a}) {
 104                     next if $a eq 'content' && $tag ne 'meta';
 105                     next if $a eq 'value' && ($tag ne 'input'
 106                         || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio)$/)); # FIXME
 107                     my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
 108                     $val = TmplTokenizer::trim $val;
 109                     remember( $s, $val ) if $val =~ /\S/s;
 110                 }
 111             }
 112         }
 113     }
 114 }
 115
 116 ###############################################################################
 117
 118 sub generate_strings_list () {
 119     # Emit all extracted strings.
 120     # Don't emit pure whitespace, pure numbers, or TMPL_VAR's.
 121     for my $t (string_list) {
 122         printf OUTPUT "%s\n", $t
 123             unless TmplTokenizer::blank_p($t) || $t =~ /^\d+$/;
 124     }
 125 }
 126
 127 ###############################################################################
 128
 129 sub generate_po_file () {
 130     # We don't emit the Plural-Forms header; it's meaningless for us
 131     my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
 132     $pot_charset = TmplTokenizer::charset_canon $pot_charset;
 133     print OUTPUT <<EOF;
 134 # SOME DESCRIPTIVE TITLE.
 135 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
 136 # This file is distributed under the same license as the PACKAGE package.
 137 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
 138 #
 139 #, fuzzy
 140 msgid ""
 141 msgstr ""
 142 "Project-Id-Version: PACKAGE VERSION\\n"
 143 "POT-Creation-Date: 2004-02-05 20:55-0500\\n"
 144 "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
 145 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
 146 "Language-Team: LANGUAGE <LL\@li.org>\\n"
 147 "MIME-Version: 1.0\\n"
 148 "Content-Type: text/plain; charset=$pot_charset\\n"
 149 "Content-Transfer-Encoding: 8bit\\n"
 150
 151 EOF
 152     my $directory_re = quotemeta("$directory/");
 153     for my $t (string_list) {
 154         next if TmplTokenizer::blank_p($t) || $t =~ /^\d+$/;
 155         my $cformat_p;
 156         for my $token (@{$text{$t}}) {
 157             my $pathname = $token->pathname;
 158             $pathname =~ s/^$directory_re//os;
 159             printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number
 160                     if defined $pathname && defined $token->line_number;
 161             $cformat_p = 1 if $token->type == TmplTokenType::TEXT_PARAMETRIZED;
 162         }
 163         printf OUTPUT "#, c-format\n" if $cformat_p;
 164         printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po( charset_convert $t );
 165         printf OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
 166                 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
 167     }
 168 }
 169
 170 ###############################################################################
 171
 172 sub convert_translation_file () {
 173     open(INPUT, "<$convert_from") || die "$convert_from: $!\n";
 174     VerboseWarnings::set_input_file_name $convert_from;
 175     while (<INPUT>) {
 176         chomp;
 177         my($msgid, $msgstr) = split(/\t/);
 178         die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
 179                 unless defined $msgstr;
 180
 181         # Fixup some of the bad strings
 182         $msgid =~ s/^SELECTED>//;
 183
 184         # Create dummy token
 185         my $token = TmplToken->new( $msgid, TmplTokenType::UNKNOWN, undef, undef );
 186         remember( $token, $msgid );
 187         $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
 188         $translation{$msgid} = $msgstr unless $msgstr eq '*****';
 189
 190         if ($msgid  =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 191             my $candidate = TmplTokenizer::charset_canon $2;
 192             die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
 193                     if defined $charset_in && $charset_in ne $candidate;
 194             $charset_in = $candidate;
 195         }
 196         if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 197             my $candidate = TmplTokenizer::charset_canon $2;
 198             die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
 199                     if defined $charset_out && $charset_out ne $candidate;
 200             $charset_out = $candidate;
 201         }
 202     }
 203     if (!defined $charset_in) {
 204         $charset_in = $charset_out = TmplTokenizer::charset_canon 'iso8859-1';
 205         warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
 206     }
 207 }
 208
 209 ###############################################################################
 210
 211 sub usage ($) {
 212     my($exitcode) = @_;
 213     my $h = $exitcode? *STDERR: *STDOUT;
 214     print $h <<EOF;
 215 Usage: $0 [OPTIONS]
 216 Extract translatable strings from given HTML::Template input files.
 217
 218 Input file location:
 219   -f, --files-from=FILE          Get list of input files from FILE
 220   -D, --directory=DIRECTORY      Add DIRECTORY to list for input files search
 221
 222 Output file location:
 223   -o, --output=FILE              Write output to specified file
 224
 225 HTML::Template options:
 226       --pedantic-warnings        Issue warnings even for detected problems
 227                                  which are likely to be harmless
 228
 229 Output details:
 230   -s, --sort-output              generate sorted output
 231   -F, --sort-by-file             sort output by file location
 232
 233 Informative output:
 234       --help                     Display this help and exit
 235 EOF
 236     exit($exitcode);
 237 }
 238
 239 ###############################################################################
 240
 241 sub usage_error (;$) {
 242     print STDERR "$_[0]\n" if @_;
 243     print STDERR "Try `$0 --help' for more information.\n";
 244     exit(-1);
 245 }
 246
 247 ###############################################################################
 248
 249 Getopt::Long::config qw( bundling no_auto_abbrev );
 250 GetOptions(
 251     'charset=s' => sub { $charset_in = $charset_out = $_[1] },  # INTERNAL
 252     'convert-from=s'                    => \$convert_from,
 253     'D|directory=s'                     => \$directory,
 254     'f|files-from=s'                    => \$files_from,
 255     'I|input-charset=s'                 => \$charset_in,        # INTERNAL
 256     'pedantic-warnings|pedantic'        => sub { $pedantic_p = 1 },
 257     'O|output-charset=s'                => \$charset_out,       # INTERNAL
 258     'output|o=s'                        => \$output,
 259     's|sort-output'                     => sub { $sort = 's' },
 260     'F|sort-by-file'                    => sub { $sort = 'F' },
 261     'help'                              => sub { usage(0) },
 262 ) || usage_error;
 263
 264 VerboseWarnings::set_application_name $0;
 265 VerboseWarnings::set_pedantic_mode $pedantic_p;
 266
 267 usage_error('Missing mandatory option -f')
 268         unless defined $files_from || defined $convert_from;
 269 $directory = '.' unless defined $directory;
 270
 271 usage_error('You cannot specify both --convert-from and --files-from')
 272         if defined $convert_from && defined $files_from;
 273
 274 if (defined $output && $output ne '-') {
 275     open(OUTPUT, ">$output") || die "$output: $!\n";
 276 } else {
 277     open(OUTPUT, ">&STDOUT");
 278 }
 279
 280 if (defined $files_from) {
 281     open(INPUT, "<$files_from") || die "$files_from: $!\n";
 282     while (<INPUT>) {
 283         chomp;
 284         my $h = TmplTokenizer->new( "$directory/$_" );
 285         $h->set_allow_cformat( 1 );
 286         VerboseWarnings::set_input_file_name "$directory/$_";
 287         text_extract( $h );
 288     }
 289     close INPUT;
 290 } else {
 291     convert_translation_file;
 292 }
 293 generate_po_file;
 294
 295 warn "This input will not work with Mozilla standards-compliant mode\n", undef
 296         if TmplTokenizer::syntaxerror_p;
 297
 298
 299 exit(-1) if TmplTokenizer::fatal_p;
 300
 301 ###############################################################################
 302
 303 =head1 DESCRIPTION
 304
 305 This is an experimental script based on the modularized
 306 text-extract2.pl script.  It has behaviour similar to
 307 xgettext(1), and generates gettext-compatible output files.
 308
 309 A gettext-like format provides the following advantages:
 310
 311 =over
 312
 313 =item -
 314
 315 (Future goal)
 316 Translation to non-English-like languages with different word
 317 order:  gettext's c-format strings can theoretically be
 318 emulated if we are able to do some analysis on the .tmpl input
 319 and treat <TMPL_VAR> in a way similar to %s.
 320
 321 =item -
 322
 323 Context for the extracted strings:  the gettext format provides
 324 the filenames and line numbers where each string can be found.
 325 The translator can read the source file and see the context,
 326 in case the string by itself can mean several different things.
 327
 328 =item -
 329
 330 Place for the translator to add comments about the translations.
 331
 332 =item -
 333
 334 Gettext-compatible tools, if any, might be usable if we adopt
 335 the gettext format.
 336
 337 =back
 338
 339 Right now it does about the same thing as text-extract2.pl but
 340 generates gettext-style output; however, because it is scanner-
 341 instead of parser-based, it is able to address the 4 weaknesses
 342 listed in translator_doc.txt.  Ultimately, the goal is to make
 343 this able to do some kind of simple analysis on the input to
 344 produce gettext-style output with c-format strings, in order to
 345 facilitate translation to languages with a different word order
 346 than English.
 347
 348 When the above is finished, the generated po file may contain
 349 some HTML tags in addition to %s strings.
 350
 351 If you want to generate GNOME-style POTFILES.in files, such
 352 files (passed to -f) can be generated thus:
 353
 354         (cd ../.. && find koha-tmpl/opac-tmpl/default/en
 355                 -name \*.inc -o -name \*.tmpl) > opac/POTFILES.in
 356         (cd ../.. && find koha-tmpl/intranet-tmpl/default/en
 357                 -name \*.inc -o -name \*.tmpl) > intranet/POTFILES.in
 358
 359 This is, however, quite pointless, because the "create" and
 360 "update" actions have already been implemented in tmpl_process3.pl.
 361
 362 =head1 SEE ALSO
 363
 364 tmpl_process.pl,
 365 xgettext(1),
 366 Locale::PO(3),
 367 translator_doc.txt
 368
 369 =head1 BUGS
 370
 371 There probably are some. Bugs related to scanning of <INPUT>
 372 tags seem to be especially likely to be present.
 373
 374 Its diagnostics are probably too verbose.
 375
 376 When a <TMPL_VAR> within a JavaScript-related attribute is
 377 detected, the script currently displays no warnings at all.
 378 It might be good to display some kind of warning.
 379
 380 Its sort order (-s option) seems to be different than the real
 381 xgettext(1)'s sort option. This will result in translation
 382 strings inside the generated PO file spuriously moving about
 383 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
 384
 385 =cut