misc/translator/xgettext.pl

   1 #!/usr/bin/perl
   2
   3 =head1 NAME
   4
   5 xgettext.pl - xgettext(1)-like interface for .tmpl strings extraction
   6
   7 =cut
   8
   9 use strict;
  10 use warnings;
  11 use Getopt::Long;
  12 use POSIX;
  13 use Locale::PO;
  14 use TmplTokenizer;
  15 use VerboseWarnings;
  16
  17 use vars qw( $convert_from );
  18 use vars qw( $files_from $directory $output $sort );
  19 use vars qw( $extract_all_p );
  20 use vars qw( $pedantic_p );
  21 use vars qw( %text %translation );
  22 use vars qw( $charset_in $charset_out );
  23 use vars qw( $disable_fuzzy_p );
  24 use vars qw( $verbose_p );
  25 use vars qw( $po_mode_p );
  26
  27 ###############################################################################
  28
  29 sub string_negligible_p ($) {
  30     my($t) = @_;                                # a string
  31     # Don't emit pure whitespace, pure numbers, pure punctuation,
  32     # single letters, or TMPL_VAR's.
  33     # Punctuation should arguably be translated. But without context
  34     # they are untranslatable. Note that $t is a string, not a token object.
  35     return !$extract_all_p && (
  36                TmplTokenizer::blank_p($t)       # blank or TMPL_VAR
  37             || $t =~ /^\d+$/                    # purely digits
  38             || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
  39             || $t =~ /^[A-Za-z]$/               # single letters
  40             || $t =~ /^(&[a-z]+;|&#\d+;|&#x[0-9a-fA-F]+;|%%|%s|\s|[[:punct:]])*$/ # html entities,placeholder,punct, ...
  41         )
  42 }
  43
  44 sub token_negligible_p( $ ) {
  45     my($x) = @_;
  46     my $t = $x->type;
  47     return !$extract_all_p && (
  48             $t == C4::TmplTokenType::TEXT? string_negligible_p( $x->string ):
  49             $t == C4::TmplTokenType::DIRECTIVE? 1:
  50             $t == C4::TmplTokenType::TEXT_PARAMETRIZED
  51                 && join( '', map { my $t = $_->type;
  52                         $t == C4::TmplTokenType::DIRECTIVE?
  53                                 '1': $t == C4::TmplTokenType::TAG?
  54                                         '': token_negligible_p( $_ )?
  55                                         '': '1' } @{$x->children} ) eq '' );
  56 }
  57
  58 ###############################################################################
  59
  60 sub remember ($$) {
  61     my($token, $string) = @_;
  62     # If we determine that the string is negligible, don't bother to remember
  63     unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
  64         my $key = TmplTokenizer::string_canon( $string );
  65         $text{$key} = [] unless defined $text{$key};
  66         push @{$text{$key}}, $token;
  67     }
  68 }
  69
  70 ###############################################################################
  71
  72 sub string_list () {
  73     my @t = keys %text;
  74     # The real gettext tools seems to sort case sensitively; I don't know why
  75     @t = sort { $a cmp $b } @t if $sort eq 's';
  76     @t = sort {
  77             my @aa = sort { $a->pathname cmp $b->pathname
  78                     || $a->line_number <=> $b->line_number } @{$text{$a}};
  79             my @bb = sort { $a->pathname cmp $b->pathname
  80                     || $a->line_number <=> $b->line_number } @{$text{$b}};
  81             $aa[0]->pathname cmp $bb[0]->pathname
  82                     || $aa[0]->line_number <=> $bb[0]->line_number;
  83         } @t if $sort eq 'F';
  84     return @t;
  85 }
  86
  87   ###############################################################################
  88
  89 sub text_extract (*) {
  90     my($h) = @_;
  91     for (;;) {
  92         my $s = TmplTokenizer::next_token $h;
  93         last unless defined $s;
  94         my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
  95         if ($kind eq C4::TmplTokenType::TEXT) {
  96             if ($t =~ /\S/s && $t !~ /<!/){
  97                 remember( $s, $t );
  98             }
  99         } elsif ($kind eq C4::TmplTokenType::TEXT_PARAMETRIZED) {
 100             if ($s->form =~ /\S/s && $s->form !~ /<!/){
 101                 remember( $s, $s->form );
 102             }
 103         } elsif ($kind eq C4::TmplTokenType::TAG && %$attr) {
 104             # value [tag=input], meta
 105             my $tag = lc($1) if $t =~ /^<(\S+)/s;
 106             for my $a ('alt', 'content', 'title', 'value', 'label', 'placeholder') {
 107                 if ($attr->{$a}) {
 108                     next if $a eq 'label' && $tag ne 'optgroup';
 109                     next if $a eq 'content' && $tag ne 'meta';
 110                     next if $a eq 'value' && ($tag ne 'input'
 111                         || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME
 112                     my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
 113                     $val = TmplTokenizer::trim $val;
 114                     remember( $s, $val ) if $val =~ /\S/s;
 115                 }
 116             }
 117         } elsif ($s->has_js_data) {
 118             for my $t (@{$s->js_data}) {
 119               remember( $s, $t->[3] ) if $t->[0]; # FIXME
 120             }
 121         }
 122     }
 123 }
 124
 125 ###############################################################################
 126
 127 sub generate_strings_list () {
 128     # Emit all extracted strings.
 129     for my $t (string_list) {
 130         printf OUTPUT "%s\n", $t;
 131     }
 132 }
 133
 134 ###############################################################################
 135
 136 sub generate_po_file () {
 137     # We don't emit the Plural-Forms header; it's meaningless for us
 138     my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
 139     $pot_charset = TmplTokenizer::charset_canon $pot_charset;
 140     # Time stamps aren't exactly right semantically. I don't know how to fix it.
 141     my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
 142     my $time_pot = $time;
 143     my $time_po  = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
 144     print OUTPUT <<EOF;
 145 # SOME DESCRIPTIVE TITLE.
 146 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
 147 # This file is distributed under the same license as the PACKAGE package.
 148 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
 149 #
 150 EOF
 151     print OUTPUT <<EOF unless $disable_fuzzy_p;
 152 #, fuzzy
 153 EOF
 154     print OUTPUT <<EOF;
 155 msgid ""
 156 msgstr ""
 157 "Project-Id-Version: PACKAGE VERSION\\n"
 158 "POT-Creation-Date: $time_pot\\n"
 159 "PO-Revision-Date: $time_po\\n"
 160 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
 161 "Language-Team: LANGUAGE <LL\@li.org>\\n"
 162 "MIME-Version: 1.0\\n"
 163 "Content-Type: text/plain; charset=$pot_charset\\n"
 164 "Content-Transfer-Encoding: 8bit\\n"
 165
 166 EOF
 167     my $directory_re = quotemeta("$directory/");
 168     for my $t (string_list) {
 169         if ($text{$t}->[0]->type == C4::TmplTokenType::TEXT_PARAMETRIZED) {
 170             my($token, $n) = ($text{$t}->[0], 0);
 171             printf OUTPUT "#. For the first occurrence,\n"
 172                     if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
 173             for my $param ($token->parameters_and_fields) {
 174                 $n += 1;
 175                 my $type = $param->type;
 176                 my $subtype = ($type == C4::TmplTokenType::TAG
 177                         && $param->string =~ /^<input\b/is?
 178                                 $param->attributes->{'type'}->[1]: undef);
 179                 my $fmt = TmplTokenizer::_formalize( $param );
 180                 $fmt =~ s/^%/%$n\$/;
 181                 if ($type == C4::TmplTokenType::DIRECTIVE) {
 182 #                   $type = "Template::Toolkit Directive";
 183                     $type = $param->string =~ /\[%(.*?)%\]/is? $1: 'ERROR';
 184                     my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
 185                             $2: undef;
 186                     printf OUTPUT "#. %s: %s\n", $fmt,
 187                         "$type" . (defined $name? " name=$name": '');
 188                 } else {
 189                     my $name = $param->attributes->{'name'};
 190                     my $value = $param->attributes->{'value'}
 191                             unless $subtype =~ /^(?:text)$/;
 192                     printf OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
 193                             . (defined $name?  " name=$name->[1]": '')
 194                             . (defined $value? " value=$value->[1]": '');
 195                 }
 196             }
 197         } elsif ($text{$t}->[0]->type == C4::TmplTokenType::TAG) {
 198             my($token) = ($text{$t}->[0]);
 199             printf OUTPUT "#. For the first occurrence,\n"
 200                     if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
 201             if ($token->string =~ /^<meta\b/is) {
 202                 my $type = $token->attributes->{'http-equiv'}->[1];
 203                 print OUTPUT "#. META http-equiv=$type\n" if defined $type;
 204             } elsif ($token->string =~ /^<([a-z0-9]+)/is) {
 205                 my $tag = uc($1);
 206                 my $type = (lc($tag) eq 'input'?
 207                         $token->attributes->{'type'}: undef);
 208                 my $name = $token->attributes->{'name'};
 209                 printf OUTPUT "#. %s\n", $tag
 210                     . (defined $type? " type=$type->[1]": '')
 211                     . (defined $name? " name=$name->[1]": '');
 212             }
 213         } elsif ($text{$t}->[0]->has_js_data) {
 214             printf OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1;
 215             printf OUTPUT "#. SCRIPT\n";
 216         }
 217         my $cformat_p;
 218         for my $token (@{$text{$t}}) {
 219             my $pathname = $token->pathname;
 220             $pathname =~ s/^$directory_re//os;
 221         $pathname =~ s/^.*\/koha-tmpl\/(.*)$/$1/;
 222             printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number
 223                     if defined $pathname && defined $token->line_number;
 224             $cformat_p = 1 if $token->type == C4::TmplTokenType::TEXT_PARAMETRIZED;
 225         }
 226         printf OUTPUT "#, c-format\n" if $cformat_p;
 227         printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po
 228                 TmplTokenizer::string_canon
 229                 TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
 230         printf OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
 231                 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
 232     }
 233 }
 234
 235 ###############################################################################
 236
 237 sub convert_translation_file () {
 238     open(INPUT, "<$convert_from") || die "$convert_from: $!\n";
 239     VerboseWarnings::set_input_file_name $convert_from;
 240     while (<INPUT>) {
 241         chomp;
 242         my($msgid, $msgstr) = split(/\t/);
 243         die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
 244                 unless defined $msgstr;
 245
 246         # Fixup some of the bad strings
 247         $msgid =~ s/^SELECTED>//;
 248
 249         # Create dummy token
 250         my $token = TmplToken->new( $msgid, C4::TmplTokenType::UNKNOWN, undef, undef );
 251         remember( $token, $msgid );
 252         $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
 253         $translation{$msgid} = $msgstr unless $msgstr eq '*****';
 254
 255         if ($msgid  =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 256             my $candidate = TmplTokenizer::charset_canon $2;
 257             die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
 258                     if defined $charset_in && $charset_in ne $candidate;
 259             $charset_in = $candidate;
 260         }
 261         if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 262             my $candidate = TmplTokenizer::charset_canon $2;
 263             die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
 264                     if defined $charset_out && $charset_out ne $candidate;
 265             $charset_out = $candidate;
 266         }
 267     }
 268     # The following assumption is correct; that's what HTML::Template assumes
 269     if (!defined $charset_in) {
 270         $charset_in = $charset_out = TmplTokenizer::charset_canon 'utf-8';
 271         warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
 272     }
 273 }
 274
 275 ###############################################################################
 276
 277 sub usage ($) {
 278     my($exitcode) = @_;
 279     my $h = $exitcode? *STDERR: *STDOUT;
 280     print $h <<EOF;
 281 Usage: $0 [OPTIONS]
 282 Extract translatable strings from given HTML::Template input files.
 283
 284 Input file location:
 285   -f, --files-from=FILE          Get list of input files from FILE
 286   -D, --directory=DIRECTORY      Add DIRECTORY to list for input files search
 287
 288 Output file location:
 289   -o, --output=FILE              Write output to specified file
 290
 291 HTML::Template options:
 292   -a, --extract-all              Extract all strings
 293       --pedantic-warnings        Issue warnings even for detected problems
 294                                  which are likely to be harmless
 295
 296 Output details:
 297   -s, --sort-output              generate sorted output
 298   -F, --sort-by-file             sort output by file location
 299   -v, --verbose                  explain what is being done
 300
 301 Informative output:
 302       --help                     Display this help and exit
 303
 304 Try `perldoc $0' for perhaps more information.
 305 EOF
 306     exit($exitcode);
 307 }
 308
 309 ###############################################################################
 310
 311 sub usage_error (;$) {
 312     print STDERR "$_[0]\n" if @_;
 313     print STDERR "Try `$0 --help' for more information.\n";
 314     exit(-1);
 315 }
 316
 317 ###############################################################################
 318
 319 Getopt::Long::config qw( bundling no_auto_abbrev );
 320 GetOptions(
 321     'a|extract-all'                     => \$extract_all_p,
 322     'charset=s' => sub { $charset_in = $charset_out = $_[1] },  # INTERNAL
 323     'convert-from=s'                    => \$convert_from,
 324     'D|directory=s'                     => \$directory,
 325     'disable-fuzzy'                     => \$disable_fuzzy_p,   # INTERNAL
 326     'f|files-from=s'                    => \$files_from,
 327     'I|input-charset=s'                 => \$charset_in,        # INTERNAL
 328     'pedantic-warnings|pedantic'        => sub { $pedantic_p = 1 },
 329     'O|output-charset=s'                => \$charset_out,       # INTERNAL
 330     'output|o=s'                        => \$output,
 331     'po-mode'                           => \$po_mode_p,         # INTERNAL
 332     's|sort-output'                     => sub { $sort = 's' },
 333     'F|sort-by-file'                    => sub { $sort = 'F' },
 334     'v|verbose'                         => \$verbose_p,
 335     'help'                              => sub { usage(0) },
 336 ) || usage_error;
 337
 338 VerboseWarnings::set_application_name $0;
 339 VerboseWarnings::set_pedantic_mode $pedantic_p;
 340
 341 usage_error('Missing mandatory option -f')
 342         unless defined $files_from || defined $convert_from;
 343 $directory = '.' unless defined $directory;
 344
 345 usage_error('You cannot specify both --convert-from and --files-from')
 346         if defined $convert_from && defined $files_from;
 347
 348 if (defined $output && $output ne '-') {
 349     print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
 350         open(OUTPUT, ">$output") || die "$output: $!\n";
 351 } else {
 352     print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
 353     open(OUTPUT, ">&STDOUT");
 354 }
 355 #binmode OUTPUT, ':encoding(UTF-8)';
 356
 357 if (defined $files_from) {
 358     print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
 359     open(INPUT, "<$files_from") || die "$files_from: $!\n";
 360     while (<INPUT>) {
 361         chomp;
 362         my $input = /^\//? $_: "$directory/$_";
 363         my $h = TmplTokenizer->new( $input );
 364         $h->set_allow_cformat( 1 );
 365         VerboseWarnings::set_input_file_name $input;
 366         print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
 367         text_extract( $h );
 368     }
 369     close INPUT;
 370 } else {
 371     print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
 372     convert_translation_file;
 373 }
 374 generate_po_file;
 375
 376 warn "This input will not work with Mozilla standards-compliant mode\n", undef
 377         if TmplTokenizer::syntaxerror_p;
 378
 379
 380 exit(-1) if TmplTokenizer::fatal_p;
 381
 382 ###############################################################################
 383
 384 =head1 DESCRIPTION
 385
 386 This is an experimental script based on the modularized
 387 text-extract2.pl script.  It has behaviour similar to
 388 xgettext(1), and generates gettext-compatible output files.
 389
 390 A gettext-like format provides the following advantages:
 391
 392 =over
 393
 394 =item -
 395
 396 Translation to non-English-like languages with different word
 397 order:  gettext's c-format strings can theoretically be
 398 emulated if we are able to do some analysis on the .tmpl input
 399 and treat <TMPL_VAR> in a way similar to %s.
 400
 401 =item -
 402
 403 Context for the extracted strings:  the gettext format provides
 404 the filenames and line numbers where each string can be found.
 405 The translator can read the source file and see the context,
 406 in case the string by itself can mean several different things.
 407
 408 =item -
 409
 410 Place for the translator to add comments about the translations.
 411
 412 =item -
 413
 414 Gettext-compatible tools, if any, might be usable if we adopt
 415 the gettext format.
 416
 417 =back
 418
 419 This script has already been in use for over a year and should
 420 be reasonable stable. Nevertheless, it is still somewhat
 421 experimental and there are still some issues.
 422
 423 Please refer to the explanation in tmpl_process3 for further
 424 details.
 425
 426 If you want to generate GNOME-style POTFILES.in files, such
 427 files (passed to -f) can be generated thus:
 428
 429         (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
 430                 -name \*.inc -o -name \*.tmpl) > opac/POTFILES.in
 431         (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
 432                 -name \*.inc -o -name \*.tmpl) > intranet/POTFILES.in
 433
 434 This is, however, quite pointless, because the "create" and
 435 "update" actions have already been implemented in tmpl_process3.pl.
 436
 437 =head2 Strings inside JavaScript
 438
 439 In the SCRIPT elements, the script will attempt to scan for
 440 _("I<string literal>") patterns, and extract the I<string literal>
 441 as a translatable string.
 442
 443 Note that the C-like _(...) notation is required.
 444
 445 The JavaScript must actually define a _ function
 446 so that the code remains correct JavaScript.
 447 A suitable definition of such a function can be
 448
 449         function _(s) { return s } // dummy function for gettext
 450
 451 =head1 SEE ALSO
 452
 453 tmpl_process3.pl,
 454 xgettext(1),
 455 Locale::PO(3),
 456 translator_doc.txt
 457
 458 =head1 BUGS
 459
 460 There probably are some. Bugs related to scanning of <INPUT>
 461 tags seem to be especially likely to be present.
 462
 463 Its diagnostics are probably too verbose.
 464
 465 When a <TMPL_VAR> within a JavaScript-related attribute is
 466 detected, the script currently displays no warnings at all.
 467 It might be good to display some kind of warning.
 468
 469 Its sort order (-s option) seems to be different than the real
 470 xgettext(1)'s sort option. This will result in translation
 471 strings inside the generated PO file spuriously moving about
 472 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
 473
 474 If a Javascript string has leading spaces, it will
 475 generate strings with spurious leading spaces,
 476 leading to failure to match the strings when actually generating
 477 translated files.
 478
 479 =cut