misc/translator/xgettext.pl

   1 #!/usr/bin/perl
   2
   3 =head1 NAME
   4
   5 xgettext.pl - xgettext(1)-like interface for .tmpl strings extraction
   6
   7 =cut
   8
   9 use strict;
  10 use Getopt::Long;
  11 use Locale::PO;
  12 use TmplTokenizer;
  13 use VerboseWarnings;
  14
  15 use vars qw( $convert_from );
  16 use vars qw( $files_from $directory $output $sort );
  17 use vars qw( $extract_all_p );
  18 use vars qw( $pedantic_p );
  19 use vars qw( %text %translation );
  20 use vars qw( $charset_in $charset_out );
  21 use vars qw( $verbose_p );
  22
  23 ###############################################################################
  24
  25 sub string_negligible_p ($) {
  26     my($t) = @_;                                # a string
  27     # Don't emit pure whitespace, pure numbers, pure punctuation,
  28     # single letters, or TMPL_VAR's.
  29     # Punctuation should arguably be translated. But without context
  30     # they are untranslatable. Note that $t is a string, not a token object.
  31     return !$extract_all_p && (
  32                TmplTokenizer::blank_p($t)       # blank or TMPL_VAR
  33             || $t =~ /^\d+$/                    # purely digits
  34             || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
  35             || $t =~ /^[A-Za-z]$/               # single letters
  36         )
  37 }
  38
  39 sub token_negligible_p( $ ) {
  40     my($x) = @_;
  41     my $t = $x->type;
  42     return !$extract_all_p && (
  43             $t == TmplTokenType::TEXT? string_negligible_p( $x->string ):
  44             $t == TmplTokenType::DIRECTIVE? 1:
  45             $t == TmplTokenType::TEXT_PARAMETRIZED
  46                 && join( '', map { my $t = $_->type;
  47                         $t == TmplTokenType::DIRECTIVE?
  48                                 '1': $t == TmplTokenType::TAG?
  49                                         '': token_negligible_p( $_ )?
  50                                         '': '1' } @{$x->children} ) eq '' );
  51 }
  52
  53 ###############################################################################
  54
  55 sub remember ($$) {
  56     my($token, $string) = @_;
  57     # If we determine that the string is negligible, don't bother to remember
  58     unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
  59         my $key = TmplTokenizer::string_canon( $string );
  60         $text{$key} = [] unless defined $text{$key};
  61         push @{$text{$key}}, $token;
  62     }
  63 }
  64
  65 ###############################################################################
  66
  67 sub string_list () {
  68     my @t = keys %text;
  69     # The real gettext tools seems to sort case sensitively; I don't know why
  70     @t = sort { $a cmp $b } @t if $sort eq 's';
  71     @t = sort {
  72             my @aa = sort { $a->pathname cmp $b->pathname
  73                     || $a->line_number <=> $b->line_number } @{$text{$a}};
  74             my @bb = sort { $a->pathname cmp $b->pathname
  75                     || $a->line_number <=> $b->line_number } @{$text{$b}};
  76             $aa[0]->pathname cmp $bb[0]->pathname
  77                     || $aa[0]->line_number <=> $bb[0]->line_number;
  78         } @t if $sort eq 'F';
  79     return @t;
  80 }
  81
  82 ###############################################################################
  83
  84 sub text_extract (*) {
  85     my($h) = @_;
  86     for (;;) {
  87         my $s = TmplTokenizer::next_token $h;
  88     last unless defined $s;
  89         my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
  90         if ($kind eq TmplTokenType::TEXT) {
  91             remember( $s, $t ) if $t =~ /\S/s;
  92         } elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
  93             remember( $s, $s->form ) if $s->form =~ /\S/s;
  94         } elsif ($kind eq TmplTokenType::TAG && %$attr) {
  95             # value [tag=input], meta
  96             my $tag = lc($1) if $t =~ /^<(\S+)/s;
  97             for my $a ('alt', 'content', 'title', 'value') {
  98                 if ($attr->{$a}) {
  99                     next if $a eq 'content' && $tag ne 'meta';
 100                     next if $a eq 'value' && ($tag ne 'input'
 101                         || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio)$/)); # FIXME
 102                     my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
 103                     $val = TmplTokenizer::trim $val;
 104                     remember( $s, $val ) if $val =~ /\S/s;
 105                 }
 106             }
 107         }
 108     }
 109 }
 110
 111 ###############################################################################
 112
 113 sub generate_strings_list () {
 114     # Emit all extracted strings.
 115     for my $t (string_list) {
 116         printf OUTPUT "%s\n", $t # unless negligible_p($t);
 117     }
 118 }
 119
 120 ###############################################################################
 121
 122 sub generate_po_file () {
 123     # We don't emit the Plural-Forms header; it's meaningless for us
 124     my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
 125     $pot_charset = TmplTokenizer::charset_canon $pot_charset;
 126     print OUTPUT <<EOF;
 127 # SOME DESCRIPTIVE TITLE.
 128 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
 129 # This file is distributed under the same license as the PACKAGE package.
 130 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
 131 #
 132 #, fuzzy
 133 msgid ""
 134 msgstr ""
 135 "Project-Id-Version: PACKAGE VERSION\\n"
 136 "POT-Creation-Date: 2004-02-05 20:55-0500\\n"
 137 "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
 138 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
 139 "Language-Team: LANGUAGE <LL\@li.org>\\n"
 140 "MIME-Version: 1.0\\n"
 141 "Content-Type: text/plain; charset=$pot_charset\\n"
 142 "Content-Transfer-Encoding: 8bit\\n"
 143
 144 EOF
 145     my $directory_re = quotemeta("$directory/");
 146     for my $t (string_list) {
 147         #next if negligible_p($t);
 148         my $cformat_p;
 149         for my $token (@{$text{$t}}) {
 150             my $pathname = $token->pathname;
 151             $pathname =~ s/^$directory_re//os;
 152             printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number
 153                     if defined $pathname && defined $token->line_number;
 154             $cformat_p = 1 if $token->type == TmplTokenType::TEXT_PARAMETRIZED;
 155         }
 156         printf OUTPUT "#, c-format\n" if $cformat_p;
 157         printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po
 158                 TmplTokenizer::string_canon
 159                 TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
 160         printf OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
 161                 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
 162     }
 163 }
 164
 165 ###############################################################################
 166
 167 sub convert_translation_file () {
 168     open(INPUT, "<$convert_from") || die "$convert_from: $!\n";
 169     VerboseWarnings::set_input_file_name $convert_from;
 170     while (<INPUT>) {
 171         chomp;
 172         my($msgid, $msgstr) = split(/\t/);
 173         die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
 174                 unless defined $msgstr;
 175
 176         # Fixup some of the bad strings
 177         $msgid =~ s/^SELECTED>//;
 178
 179         # Create dummy token
 180         my $token = TmplToken->new( $msgid, TmplTokenType::UNKNOWN, undef, undef );
 181         remember( $token, $msgid );
 182         $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
 183         $translation{$msgid} = $msgstr unless $msgstr eq '*****';
 184
 185         if ($msgid  =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 186             my $candidate = TmplTokenizer::charset_canon $2;
 187             die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
 188                     if defined $charset_in && $charset_in ne $candidate;
 189             $charset_in = $candidate;
 190         }
 191         if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 192             my $candidate = TmplTokenizer::charset_canon $2;
 193             die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
 194                     if defined $charset_out && $charset_out ne $candidate;
 195             $charset_out = $candidate;
 196         }
 197     }
 198     # The following assumption is correct; that's what HTML::Template assumes
 199     if (!defined $charset_in) {
 200         $charset_in = $charset_out = TmplTokenizer::charset_canon 'iso8859-1';
 201         warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
 202     }
 203 }
 204
 205 ###############################################################################
 206
 207 sub usage ($) {
 208     my($exitcode) = @_;
 209     my $h = $exitcode? *STDERR: *STDOUT;
 210     print $h <<EOF;
 211 Usage: $0 [OPTIONS]
 212 Extract translatable strings from given HTML::Template input files.
 213
 214 Input file location:
 215   -f, --files-from=FILE          Get list of input files from FILE
 216   -D, --directory=DIRECTORY      Add DIRECTORY to list for input files search
 217
 218 Output file location:
 219   -o, --output=FILE              Write output to specified file
 220
 221 HTML::Template options:
 222   -a, --extract-all              Extract all strings
 223       --pedantic-warnings        Issue warnings even for detected problems
 224                                  which are likely to be harmless
 225
 226 Output details:
 227   -s, --sort-output              generate sorted output
 228   -F, --sort-by-file             sort output by file location
 229   -v, --verbose                  explain what is being done
 230
 231 Informative output:
 232       --help                     Display this help and exit
 233 EOF
 234     exit($exitcode);
 235 }
 236
 237 ###############################################################################
 238
 239 sub usage_error (;$) {
 240     print STDERR "$_[0]\n" if @_;
 241     print STDERR "Try `$0 --help' for more information.\n";
 242     exit(-1);
 243 }
 244
 245 ###############################################################################
 246
 247 Getopt::Long::config qw( bundling no_auto_abbrev );
 248 GetOptions(
 249     'a|extract-all'                     => \$extract_all_p,
 250     'charset=s' => sub { $charset_in = $charset_out = $_[1] },  # INTERNAL
 251     'convert-from=s'                    => \$convert_from,
 252     'D|directory=s'                     => \$directory,
 253     'f|files-from=s'                    => \$files_from,
 254     'I|input-charset=s'                 => \$charset_in,        # INTERNAL
 255     'pedantic-warnings|pedantic'        => sub { $pedantic_p = 1 },
 256     'O|output-charset=s'                => \$charset_out,       # INTERNAL
 257     'output|o=s'                        => \$output,
 258     's|sort-output'                     => sub { $sort = 's' },
 259     'F|sort-by-file'                    => sub { $sort = 'F' },
 260     'v|verbose'                         => \$verbose_p,
 261     'help'                              => sub { usage(0) },
 262 ) || usage_error;
 263
 264 VerboseWarnings::set_application_name $0;
 265 VerboseWarnings::set_pedantic_mode $pedantic_p;
 266
 267 usage_error('Missing mandatory option -f')
 268         unless defined $files_from || defined $convert_from;
 269 $directory = '.' unless defined $directory;
 270
 271 usage_error('You cannot specify both --convert-from and --files-from')
 272         if defined $convert_from && defined $files_from;
 273
 274 if (defined $output && $output ne '-') {
 275     print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
 276     open(OUTPUT, ">$output") || die "$output: $!\n";
 277 } else {
 278     print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
 279     open(OUTPUT, ">&STDOUT");
 280 }
 281
 282 if (defined $files_from) {
 283     print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
 284     open(INPUT, "<$files_from") || die "$files_from: $!\n";
 285     while (<INPUT>) {
 286         chomp;
 287         my $h = TmplTokenizer->new( "$directory/$_" );
 288         $h->set_allow_cformat( 1 );
 289         VerboseWarnings::set_input_file_name "$directory/$_";
 290         print STDERR "$0: Processing file \"$directory/$_\"\n" if $verbose_p;
 291         text_extract( $h );
 292     }
 293     close INPUT;
 294 } else {
 295     print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
 296     convert_translation_file;
 297 }
 298 generate_po_file;
 299
 300 warn "This input will not work with Mozilla standards-compliant mode\n", undef
 301         if TmplTokenizer::syntaxerror_p;
 302
 303
 304 exit(-1) if TmplTokenizer::fatal_p;
 305
 306 ###############################################################################
 307
 308 =head1 DESCRIPTION
 309
 310 This is an experimental script based on the modularized
 311 text-extract2.pl script.  It has behaviour similar to
 312 xgettext(1), and generates gettext-compatible output files.
 313
 314 A gettext-like format provides the following advantages:
 315
 316 =over
 317
 318 =item -
 319
 320 (Future goal)
 321 Translation to non-English-like languages with different word
 322 order:  gettext's c-format strings can theoretically be
 323 emulated if we are able to do some analysis on the .tmpl input
 324 and treat <TMPL_VAR> in a way similar to %s.
 325
 326 =item -
 327
 328 Context for the extracted strings:  the gettext format provides
 329 the filenames and line numbers where each string can be found.
 330 The translator can read the source file and see the context,
 331 in case the string by itself can mean several different things.
 332
 333 =item -
 334
 335 Place for the translator to add comments about the translations.
 336
 337 =item -
 338
 339 Gettext-compatible tools, if any, might be usable if we adopt
 340 the gettext format.
 341
 342 =back
 343
 344 Note that this script is experimental and should still be
 345 considered unstable.
 346
 347 Please refer to the explanation in tmpl_process3 for further
 348 details.
 349
 350 If you want to generate GNOME-style POTFILES.in files, such
 351 files (passed to -f) can be generated thus:
 352
 353         (cd ../.. && find koha-tmpl/opac-tmpl/default/en
 354                 -name \*.inc -o -name \*.tmpl) > opac/POTFILES.in
 355         (cd ../.. && find koha-tmpl/intranet-tmpl/default/en
 356                 -name \*.inc -o -name \*.tmpl) > intranet/POTFILES.in
 357
 358 This is, however, quite pointless, because the "create" and
 359 "update" actions have already been implemented in tmpl_process3.pl.
 360
 361 =head1 SEE ALSO
 362
 363 tmpl_process.pl,
 364 xgettext(1),
 365 Locale::PO(3),
 366 translator_doc.txt
 367
 368 =head1 BUGS
 369
 370 There probably are some. Bugs related to scanning of <INPUT>
 371 tags seem to be especially likely to be present.
 372
 373 Its diagnostics are probably too verbose.
 374
 375 When a <TMPL_VAR> within a JavaScript-related attribute is
 376 detected, the script currently displays no warnings at all.
 377 It might be good to display some kind of warning.
 378
 379 Its sort order (-s option) seems to be different than the real
 380 xgettext(1)'s sort option. This will result in translation
 381 strings inside the generated PO file spuriously moving about
 382 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
 383
 384 =cut