misc/translator/xgettext.pl

   1 #!/usr/bin/perl
   2
   3 =head1 NAME
   4
   5 xgettext.pl - xgettext(1)-like interface for .tmpl strings extraction
   6
   7 =cut
   8
   9 use strict;
  10 use warnings;
  11 use Getopt::Long;
  12 use POSIX;
  13 use Locale::PO;
  14 use TmplTokenizer;
  15 use VerboseWarnings;
  16
  17 use vars qw( $convert_from );
  18 use vars qw( $files_from $directory $output $sort );
  19 use vars qw( $extract_all_p );
  20 use vars qw( $pedantic_p );
  21 use vars qw( %text %translation );
  22 use vars qw( $charset_in $charset_out );
  23 use vars qw( $disable_fuzzy_p );
  24 use vars qw( $verbose_p );
  25 use vars qw( $po_mode_p );
  26
  27 ###############################################################################
  28
  29 sub string_negligible_p ($) {
  30     my($t) = @_;                                # a string
  31     # Don't emit pure whitespace, pure numbers, pure punctuation,
  32     # single letters, or TMPL_VAR's.
  33     # Punctuation should arguably be translated. But without context
  34     # they are untranslatable. Note that $t is a string, not a token object.
  35     return !$extract_all_p && (
  36                TmplTokenizer::blank_p($t)       # blank or TMPL_VAR
  37             || $t =~ /^\d+$/                    # purely digits
  38             || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
  39             || $t =~ /^[A-Za-z]$/               # single letters
  40         )
  41 }
  42
  43 sub token_negligible_p( $ ) {
  44     my($x) = @_;
  45     my $t = $x->type;
  46     return !$extract_all_p && (
  47             $t == TmplTokenType::TEXT? string_negligible_p( $x->string ):
  48             $t == TmplTokenType::DIRECTIVE? 1:
  49             $t == TmplTokenType::TEXT_PARAMETRIZED
  50                 && join( '', map { my $t = $_->type;
  51                         $t == TmplTokenType::DIRECTIVE?
  52                                 '1': $t == TmplTokenType::TAG?
  53                                         '': token_negligible_p( $_ )?
  54                                         '': '1' } @{$x->children} ) eq '' );
  55 }
  56
  57 ###############################################################################
  58
  59 sub remember ($$) {
  60     my($token, $string) = @_;
  61     # If we determine that the string is negligible, don't bother to remember
  62     unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
  63         my $key = TmplTokenizer::string_canon( $string );
  64         $text{$key} = [] unless defined $text{$key};
  65         push @{$text{$key}}, $token;
  66     }
  67 }
  68
  69 ###############################################################################
  70
  71 sub string_list () {
  72     my @t = keys %text;
  73     # The real gettext tools seems to sort case sensitively; I don't know why
  74     @t = sort { $a cmp $b } @t if $sort eq 's';
  75     @t = sort {
  76             my @aa = sort { $a->pathname cmp $b->pathname
  77                     || $a->line_number <=> $b->line_number } @{$text{$a}};
  78             my @bb = sort { $a->pathname cmp $b->pathname
  79                     || $a->line_number <=> $b->line_number } @{$text{$b}};
  80             $aa[0]->pathname cmp $bb[0]->pathname
  81                     || $aa[0]->line_number <=> $bb[0]->line_number;
  82         } @t if $sort eq 'F';
  83     return @t;
  84 }
  85
  86   ###############################################################################
  87
  88 sub text_extract (*) {
  89     my($h) = @_;
  90     for (;;) {
  91         my $s = TmplTokenizer::next_token $h;
  92         last unless defined $s;
  93         my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
  94         if ($kind eq TmplTokenType::TEXT) {
  95             remember( $s, $t ) if $t =~ /\S/s;
  96         } elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
  97             remember( $s, $s->form ) if $s->form =~ /\S/s;
  98         } elsif ($kind eq TmplTokenType::TAG && %$attr) {
  99             # value [tag=input], meta
 100             my $tag = lc($1) if $t =~ /^<(\S+)/s;
 101             for my $a ('alt', 'content', 'title', 'value','label') {
 102                 if ($attr->{$a}) {
 103                     next if $a eq 'label' && $tag ne 'optgroup';
 104                     next if $a eq 'content' && $tag ne 'meta';
 105                     next if $a eq 'value' && ($tag ne 'input'
 106                         || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME
 107                     my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
 108                     $val = TmplTokenizer::trim $val;
 109                     remember( $s, $val ) if $val =~ /\S/s;
 110                 }
 111             }
 112         } elsif ($s->has_js_data) {
 113             for my $t (@{$s->js_data}) {
 114               remember( $s, $t->[3] ) if $t->[0]; # FIXME
 115             }
 116         }
 117     }
 118 }
 119
 120 ###############################################################################
 121
 122 sub generate_strings_list () {
 123     # Emit all extracted strings.
 124     for my $t (string_list) {
 125         printf OUTPUT "%s\n", $t;
 126     }
 127 }
 128
 129 ###############################################################################
 130
 131 sub generate_po_file () {
 132     # We don't emit the Plural-Forms header; it's meaningless for us
 133     my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
 134     $pot_charset = TmplTokenizer::charset_canon $pot_charset;
 135     # Time stamps aren't exactly right semantically. I don't know how to fix it.
 136     my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
 137     my $time_pot = $time;
 138     my $time_po  = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
 139     print OUTPUT <<EOF;
 140 # SOME DESCRIPTIVE TITLE.
 141 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
 142 # This file is distributed under the same license as the PACKAGE package.
 143 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
 144 #
 145 EOF
 146     print OUTPUT <<EOF unless $disable_fuzzy_p;
 147 #, fuzzy
 148 EOF
 149     print OUTPUT <<EOF;
 150 msgid ""
 151 msgstr ""
 152 "Project-Id-Version: PACKAGE VERSION\\n"
 153 "POT-Creation-Date: $time_pot\\n"
 154 "PO-Revision-Date: $time_po\\n"
 155 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
 156 "Language-Team: LANGUAGE <LL\@li.org>\\n"
 157 "MIME-Version: 1.0\\n"
 158 "Content-Type: text/plain; charset=$pot_charset\\n"
 159 "Content-Transfer-Encoding: 8bit\\n"
 160
 161 EOF
 162     my $directory_re = quotemeta("$directory/");
 163     for my $t (string_list) {
 164         if ($text{$t}->[0]->type == TmplTokenType::TEXT_PARAMETRIZED) {
 165             my($token, $n) = ($text{$t}->[0], 0);
 166             printf OUTPUT "#. For the first occurrence,\n"
 167                     if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
 168             for my $param ($token->parameters_and_fields) {
 169                 $n += 1;
 170                 my $type = $param->type;
 171                 my $subtype = ($type == TmplTokenType::TAG
 172                         && $param->string =~ /^<input\b/is?
 173                                 $param->attributes->{'type'}->[1]: undef);
 174                 my $fmt = TmplTokenizer::_formalize( $param );
 175                 $fmt =~ s/^%/%$n\$/;
 176                 if ($type == TmplTokenType::DIRECTIVE) {
 177                     $type = $param->string =~ /(TMPL_[A-Z]+)+/is? $1: 'ERROR';
 178                     my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
 179                             $2: undef;
 180                     printf OUTPUT "#. %s: %s\n", $fmt,
 181                         "$type" . (defined $name? " name=$name": '');
 182                 } else {
 183                     my $name = $param->attributes->{'name'};
 184                     my $value = $param->attributes->{'value'}
 185                             unless $subtype =~ /^(?:text)$/;
 186                     printf OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
 187                             . (defined $name?  " name=$name->[1]": '')
 188                             . (defined $value? " value=$value->[1]": '');
 189                 }
 190             }
 191         } elsif ($text{$t}->[0]->type == TmplTokenType::TAG) {
 192             my($token) = ($text{$t}->[0]);
 193             printf OUTPUT "#. For the first occurrence,\n"
 194                     if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
 195             if ($token->string =~ /^<meta\b/is) {
 196                 my $type = $token->attributes->{'http-equiv'}->[1];
 197                 print OUTPUT "#. META http-equiv=$type\n" if defined $type;
 198             } elsif ($token->string =~ /^<([a-z0-9]+)/is) {
 199                 my $tag = uc($1);
 200                 my $type = (lc($tag) eq 'input'?
 201                         $token->attributes->{'type'}: undef);
 202                 my $name = $token->attributes->{'name'};
 203                 printf OUTPUT "#. %s\n", $tag
 204                     . (defined $type? " type=$type->[1]": '')
 205                     . (defined $name? " name=$name->[1]": '');
 206             }
 207         } elsif ($text{$t}->[0]->has_js_data) {
 208             printf OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1;
 209             printf OUTPUT "#. SCRIPT\n";
 210         }
 211         my $cformat_p;
 212         for my $token (@{$text{$t}}) {
 213             my $pathname = $token->pathname;
 214             $pathname =~ s/^$directory_re//os;
 215         $pathname =~ s/^.*\/koha-tmpl\/(.*)$/$1/;
 216             printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number
 217                     if defined $pathname && defined $token->line_number;
 218             $cformat_p = 1 if $token->type == TmplTokenType::TEXT_PARAMETRIZED;
 219         }
 220         printf OUTPUT "#, c-format\n" if $cformat_p;
 221         printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po
 222                 TmplTokenizer::string_canon
 223                 TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
 224         printf OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
 225                 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
 226     }
 227 }
 228
 229 ###############################################################################
 230
 231 sub convert_translation_file () {
 232     open(INPUT, "<$convert_from") || die "$convert_from: $!\n";
 233     VerboseWarnings::set_input_file_name $convert_from;
 234     while (<INPUT>) {
 235         chomp;
 236         my($msgid, $msgstr) = split(/\t/);
 237         die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
 238                 unless defined $msgstr;
 239
 240         # Fixup some of the bad strings
 241         $msgid =~ s/^SELECTED>//;
 242
 243         # Create dummy token
 244         my $token = TmplToken->new( $msgid, TmplTokenType::UNKNOWN, undef, undef );
 245         remember( $token, $msgid );
 246         $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
 247         $translation{$msgid} = $msgstr unless $msgstr eq '*****';
 248
 249         if ($msgid  =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 250             my $candidate = TmplTokenizer::charset_canon $2;
 251             die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
 252                     if defined $charset_in && $charset_in ne $candidate;
 253             $charset_in = $candidate;
 254         }
 255         if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 256             my $candidate = TmplTokenizer::charset_canon $2;
 257             die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
 258                     if defined $charset_out && $charset_out ne $candidate;
 259             $charset_out = $candidate;
 260         }
 261     }
 262     # The following assumption is correct; that's what HTML::Template assumes
 263     if (!defined $charset_in) {
 264         $charset_in = $charset_out = TmplTokenizer::charset_canon 'utf-8';
 265         warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
 266     }
 267 }
 268
 269 ###############################################################################
 270
 271 sub usage ($) {
 272     my($exitcode) = @_;
 273     my $h = $exitcode? *STDERR: *STDOUT;
 274     print $h <<EOF;
 275 Usage: $0 [OPTIONS]
 276 Extract translatable strings from given HTML::Template input files.
 277
 278 Input file location:
 279   -f, --files-from=FILE          Get list of input files from FILE
 280   -D, --directory=DIRECTORY      Add DIRECTORY to list for input files search
 281
 282 Output file location:
 283   -o, --output=FILE              Write output to specified file
 284
 285 HTML::Template options:
 286   -a, --extract-all              Extract all strings
 287       --pedantic-warnings        Issue warnings even for detected problems
 288                                  which are likely to be harmless
 289
 290 Output details:
 291   -s, --sort-output              generate sorted output
 292   -F, --sort-by-file             sort output by file location
 293   -v, --verbose                  explain what is being done
 294
 295 Informative output:
 296       --help                     Display this help and exit
 297
 298 Try `perldoc $0' for perhaps more information.
 299 EOF
 300     exit($exitcode);
 301 }
 302
 303 ###############################################################################
 304
 305 sub usage_error (;$) {
 306     print STDERR "$_[0]\n" if @_;
 307     print STDERR "Try `$0 --help' for more information.\n";
 308     exit(-1);
 309 }
 310
 311 ###############################################################################
 312
 313 Getopt::Long::config qw( bundling no_auto_abbrev );
 314 GetOptions(
 315     'a|extract-all'                     => \$extract_all_p,
 316     'charset=s' => sub { $charset_in = $charset_out = $_[1] },  # INTERNAL
 317     'convert-from=s'                    => \$convert_from,
 318     'D|directory=s'                     => \$directory,
 319     'disable-fuzzy'                     => \$disable_fuzzy_p,   # INTERNAL
 320     'f|files-from=s'                    => \$files_from,
 321     'I|input-charset=s'                 => \$charset_in,        # INTERNAL
 322     'pedantic-warnings|pedantic'        => sub { $pedantic_p = 1 },
 323     'O|output-charset=s'                => \$charset_out,       # INTERNAL
 324     'output|o=s'                        => \$output,
 325     'po-mode'                           => \$po_mode_p,         # INTERNAL
 326     's|sort-output'                     => sub { $sort = 's' },
 327     'F|sort-by-file'                    => sub { $sort = 'F' },
 328     'v|verbose'                         => \$verbose_p,
 329     'help'                              => sub { usage(0) },
 330 ) || usage_error;
 331
 332 VerboseWarnings::set_application_name $0;
 333 VerboseWarnings::set_pedantic_mode $pedantic_p;
 334
 335 usage_error('Missing mandatory option -f')
 336         unless defined $files_from || defined $convert_from;
 337 $directory = '.' unless defined $directory;
 338
 339 usage_error('You cannot specify both --convert-from and --files-from')
 340         if defined $convert_from && defined $files_from;
 341
 342 if (defined $output && $output ne '-') {
 343     print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
 344         open(OUTPUT, ">$output") || die "$output: $!\n";
 345     binmode( OUTPUT, ":utf8" );
 346 } else {
 347     print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
 348     open(OUTPUT, ">&STDOUT");
 349 }
 350
 351 if (defined $files_from) {
 352     print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
 353     open(INPUT, "<$files_from") || die "$files_from: $!\n";
 354     while (<INPUT>) {
 355         chomp;
 356         my $input = /^\//? $_: "$directory/$_";
 357         my $h = TmplTokenizer->new( $input );
 358         $h->set_allow_cformat( 1 );
 359         VerboseWarnings::set_input_file_name $input;
 360         print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
 361         text_extract( $h );
 362     }
 363     close INPUT;
 364 } else {
 365     print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
 366     convert_translation_file;
 367 }
 368 generate_po_file;
 369
 370 warn "This input will not work with Mozilla standards-compliant mode\n", undef
 371         if TmplTokenizer::syntaxerror_p;
 372
 373
 374 exit(-1) if TmplTokenizer::fatal_p;
 375
 376 ###############################################################################
 377
 378 =head1 DESCRIPTION
 379
 380 This is an experimental script based on the modularized
 381 text-extract2.pl script.  It has behaviour similar to
 382 xgettext(1), and generates gettext-compatible output files.
 383
 384 A gettext-like format provides the following advantages:
 385
 386 =over
 387
 388 =item -
 389
 390 Translation to non-English-like languages with different word
 391 order:  gettext's c-format strings can theoretically be
 392 emulated if we are able to do some analysis on the .tmpl input
 393 and treat <TMPL_VAR> in a way similar to %s.
 394
 395 =item -
 396
 397 Context for the extracted strings:  the gettext format provides
 398 the filenames and line numbers where each string can be found.
 399 The translator can read the source file and see the context,
 400 in case the string by itself can mean several different things.
 401
 402 =item -
 403
 404 Place for the translator to add comments about the translations.
 405
 406 =item -
 407
 408 Gettext-compatible tools, if any, might be usable if we adopt
 409 the gettext format.
 410
 411 =back
 412
 413 This script has already been in use for over a year and should
 414 be reasonable stable. Nevertheless, it is still somewhat
 415 experimental and there are still some issues.
 416
 417 Please refer to the explanation in tmpl_process3 for further
 418 details.
 419
 420 If you want to generate GNOME-style POTFILES.in files, such
 421 files (passed to -f) can be generated thus:
 422
 423         (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
 424                 -name \*.inc -o -name \*.tmpl) > opac/POTFILES.in
 425         (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
 426                 -name \*.inc -o -name \*.tmpl) > intranet/POTFILES.in
 427
 428 This is, however, quite pointless, because the "create" and
 429 "update" actions have already been implemented in tmpl_process3.pl.
 430
 431 =head2 Strings inside JavaScript
 432
 433 In the SCRIPT elements, the script will attempt to scan for
 434 _("I<string literal>") patterns, and extract the I<string literal>
 435 as a translatable string.
 436
 437 Note that the C-like _(...) notation is required.
 438
 439 The JavaScript must actually define a _ function
 440 so that the code remains correct JavaScript.
 441 A suitable definition of such a function can be
 442
 443         function _(s) { return s } // dummy function for gettext
 444
 445 =head1 SEE ALSO
 446
 447 tmpl_process3.pl,
 448 xgettext(1),
 449 Locale::PO(3),
 450 translator_doc.txt
 451
 452 =head1 BUGS
 453
 454 There probably are some. Bugs related to scanning of <INPUT>
 455 tags seem to be especially likely to be present.
 456
 457 Its diagnostics are probably too verbose.
 458
 459 When a <TMPL_VAR> within a JavaScript-related attribute is
 460 detected, the script currently displays no warnings at all.
 461 It might be good to display some kind of warning.
 462
 463 Its sort order (-s option) seems to be different than the real
 464 xgettext(1)'s sort option. This will result in translation
 465 strings inside the generated PO file spuriously moving about
 466 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
 467
 468 If a Javascript string has leading spaces, it will
 469 generate strings with spurious leading spaces,
 470 leading to failure to match the strings when actually generating
 471 translated files.
 472
 473 =cut