misc/translator/xgettext.pl

   1 #!/usr/bin/perl
   2
   3 =head1 NAME
   4
   5 xgettext.pl - xgettext(1)-like interface for .tmpl strings extraction
   6
   7 =cut
   8
   9 use strict;
  10 use Getopt::Long;
  11 use Locale::PO;
  12 use TmplTokenizer;
  13 use VerboseWarnings;
  14
  15 use vars qw( $convert_from );
  16 use vars qw( $files_from $directory $output $sort );
  17 use vars qw( $extract_all_p );
  18 use vars qw( $pedantic_p );
  19 use vars qw( %text %translation );
  20 use vars qw( $charset_in $charset_out );
  21
  22 ###############################################################################
  23
  24 sub string_negligible_p ($) {
  25     my($t) = @_;                                # a string
  26     # Don't emit pure whitespace, pure numbers, pure punctuation,
  27     # single letters, or TMPL_VAR's.
  28     # Punctuation should arguably be translated. But without context
  29     # they are untranslatable. Note that $t is a string, not a token object.
  30     return !$extract_all_p && (
  31                TmplTokenizer::blank_p($t)       # blank or TMPL_VAR
  32             || $t =~ /^\d+$/                    # purely digits
  33             || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
  34             || $t =~ /^[A-Za-z]$/               # single letters
  35         )
  36 }
  37
  38 sub token_negligible_p( $ ) {
  39     my($x) = @_;
  40     my $t = $x->type;
  41     return !$extract_all_p && (
  42             $t == TmplTokenType::TEXT? string_negligible_p( $x->string ):
  43             $t == TmplTokenType::DIRECTIVE? 1:
  44             $t == TmplTokenType::TEXT_PARAMETRIZED
  45                 && join( '', map { my $t = $_->type;
  46                         $t == TmplTokenType::DIRECTIVE?
  47                                 '1': $t == TmplTokenType::TAG?
  48                                         '': token_negligible_p( $_ )?
  49                                         '': '1' } @{$x->children} ) eq '' );
  50 }
  51
  52 ###############################################################################
  53
  54 sub remember ($$) {
  55     my($token, $string) = @_;
  56     # If we determine that the string is negligible, don't bother to remember
  57     unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
  58         $text{$string} = [] unless defined $text{$string};
  59         push @{$text{$string}}, $token;
  60     }
  61 }
  62
  63 ###############################################################################
  64
  65 sub string_list () {
  66     my @t = keys %text;
  67     # The real gettext tools seems to sort case sensitively; I don't know why
  68     @t = sort { $a cmp $b } @t if $sort eq 's';
  69     @t = sort {
  70             my @aa = sort { $a->pathname cmp $b->pathname
  71                     || $a->line_number <=> $b->line_number } @{$text{$a}};
  72             my @bb = sort { $a->pathname cmp $b->pathname
  73                     || $a->line_number <=> $b->line_number } @{$text{$b}};
  74             $aa[0]->pathname cmp $bb[0]->pathname
  75                     || $aa[0]->line_number <=> $bb[0]->line_number;
  76         } @t if $sort eq 'F';
  77     return @t;
  78 }
  79
  80 ###############################################################################
  81
  82 sub text_extract (*) {
  83     my($h) = @_;
  84     for (;;) {
  85         my $s = TmplTokenizer::next_token $h;
  86     last unless defined $s;
  87         my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
  88         if ($kind eq TmplTokenType::TEXT) {
  89             remember( $s, $t ) if $t =~ /\S/s;
  90         } elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
  91             remember( $s, $s->form ) if $s->form =~ /\S/s;
  92         } elsif ($kind eq TmplTokenType::TAG && %$attr) {
  93             # value [tag=input], meta
  94             my $tag = lc($1) if $t =~ /^<(\S+)/s;
  95             for my $a ('alt', 'content', 'title', 'value') {
  96                 if ($attr->{$a}) {
  97                     next if $a eq 'content' && $tag ne 'meta';
  98                     next if $a eq 'value' && ($tag ne 'input'
  99                         || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio)$/)); # FIXME
 100                     my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
 101                     $val = TmplTokenizer::trim $val;
 102                     remember( $s, $val ) if $val =~ /\S/s;
 103                 }
 104             }
 105         }
 106     }
 107 }
 108
 109 ###############################################################################
 110
 111 sub generate_strings_list () {
 112     # Emit all extracted strings.
 113     for my $t (string_list) {
 114         printf OUTPUT "%s\n", $t # unless negligible_p($t);
 115     }
 116 }
 117
 118 ###############################################################################
 119
 120 sub generate_po_file () {
 121     # We don't emit the Plural-Forms header; it's meaningless for us
 122     my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
 123     $pot_charset = TmplTokenizer::charset_canon $pot_charset;
 124     print OUTPUT <<EOF;
 125 # SOME DESCRIPTIVE TITLE.
 126 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
 127 # This file is distributed under the same license as the PACKAGE package.
 128 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
 129 #
 130 #, fuzzy
 131 msgid ""
 132 msgstr ""
 133 "Project-Id-Version: PACKAGE VERSION\\n"
 134 "POT-Creation-Date: 2004-02-05 20:55-0500\\n"
 135 "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
 136 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
 137 "Language-Team: LANGUAGE <LL\@li.org>\\n"
 138 "MIME-Version: 1.0\\n"
 139 "Content-Type: text/plain; charset=$pot_charset\\n"
 140 "Content-Transfer-Encoding: 8bit\\n"
 141
 142 EOF
 143     my $directory_re = quotemeta("$directory/");
 144     for my $t (string_list) {
 145         #next if negligible_p($t);
 146         my $cformat_p;
 147         for my $token (@{$text{$t}}) {
 148             my $pathname = $token->pathname;
 149             $pathname =~ s/^$directory_re//os;
 150             printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number
 151                     if defined $pathname && defined $token->line_number;
 152             $cformat_p = 1 if $token->type == TmplTokenType::TEXT_PARAMETRIZED;
 153         }
 154         printf OUTPUT "#, c-format\n" if $cformat_p;
 155         printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po
 156                 TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
 157         printf OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
 158                 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
 159     }
 160 }
 161
 162 ###############################################################################
 163
 164 sub convert_translation_file () {
 165     open(INPUT, "<$convert_from") || die "$convert_from: $!\n";
 166     VerboseWarnings::set_input_file_name $convert_from;
 167     while (<INPUT>) {
 168         chomp;
 169         my($msgid, $msgstr) = split(/\t/);
 170         die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
 171                 unless defined $msgstr;
 172
 173         # Fixup some of the bad strings
 174         $msgid =~ s/^SELECTED>//;
 175
 176         # Create dummy token
 177         my $token = TmplToken->new( $msgid, TmplTokenType::UNKNOWN, undef, undef );
 178         remember( $token, $msgid );
 179         $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
 180         $translation{$msgid} = $msgstr unless $msgstr eq '*****';
 181
 182         if ($msgid  =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 183             my $candidate = TmplTokenizer::charset_canon $2;
 184             die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
 185                     if defined $charset_in && $charset_in ne $candidate;
 186             $charset_in = $candidate;
 187         }
 188         if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 189             my $candidate = TmplTokenizer::charset_canon $2;
 190             die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
 191                     if defined $charset_out && $charset_out ne $candidate;
 192             $charset_out = $candidate;
 193         }
 194     }
 195     # The following assumption is correct; that's what HTML::Template assumes
 196     if (!defined $charset_in) {
 197         $charset_in = $charset_out = TmplTokenizer::charset_canon 'iso8859-1';
 198         warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
 199     }
 200 }
 201
 202 ###############################################################################
 203
 204 sub usage ($) {
 205     my($exitcode) = @_;
 206     my $h = $exitcode? *STDERR: *STDOUT;
 207     print $h <<EOF;
 208 Usage: $0 [OPTIONS]
 209 Extract translatable strings from given HTML::Template input files.
 210
 211 Input file location:
 212   -f, --files-from=FILE          Get list of input files from FILE
 213   -D, --directory=DIRECTORY      Add DIRECTORY to list for input files search
 214
 215 Output file location:
 216   -o, --output=FILE              Write output to specified file
 217
 218 HTML::Template options:
 219   -a, --extract-all              Extract all strings
 220       --pedantic-warnings        Issue warnings even for detected problems
 221                                  which are likely to be harmless
 222
 223 Output details:
 224   -s, --sort-output              generate sorted output
 225   -F, --sort-by-file             sort output by file location
 226
 227 Informative output:
 228       --help                     Display this help and exit
 229 EOF
 230     exit($exitcode);
 231 }
 232
 233 ###############################################################################
 234
 235 sub usage_error (;$) {
 236     print STDERR "$_[0]\n" if @_;
 237     print STDERR "Try `$0 --help' for more information.\n";
 238     exit(-1);
 239 }
 240
 241 ###############################################################################
 242
 243 Getopt::Long::config qw( bundling no_auto_abbrev );
 244 GetOptions(
 245     'a|extract-all'                     => \$extract_all_p,
 246     'charset=s' => sub { $charset_in = $charset_out = $_[1] },  # INTERNAL
 247     'convert-from=s'                    => \$convert_from,
 248     'D|directory=s'                     => \$directory,
 249     'f|files-from=s'                    => \$files_from,
 250     'I|input-charset=s'                 => \$charset_in,        # INTERNAL
 251     'pedantic-warnings|pedantic'        => sub { $pedantic_p = 1 },
 252     'O|output-charset=s'                => \$charset_out,       # INTERNAL
 253     'output|o=s'                        => \$output,
 254     's|sort-output'                     => sub { $sort = 's' },
 255     'F|sort-by-file'                    => sub { $sort = 'F' },
 256     'help'                              => sub { usage(0) },
 257 ) || usage_error;
 258
 259 VerboseWarnings::set_application_name $0;
 260 VerboseWarnings::set_pedantic_mode $pedantic_p;
 261
 262 usage_error('Missing mandatory option -f')
 263         unless defined $files_from || defined $convert_from;
 264 $directory = '.' unless defined $directory;
 265
 266 usage_error('You cannot specify both --convert-from and --files-from')
 267         if defined $convert_from && defined $files_from;
 268
 269 if (defined $output && $output ne '-') {
 270     open(OUTPUT, ">$output") || die "$output: $!\n";
 271 } else {
 272     open(OUTPUT, ">&STDOUT");
 273 }
 274
 275 if (defined $files_from) {
 276     open(INPUT, "<$files_from") || die "$files_from: $!\n";
 277     while (<INPUT>) {
 278         chomp;
 279         my $h = TmplTokenizer->new( "$directory/$_" );
 280         $h->set_allow_cformat( 1 );
 281         VerboseWarnings::set_input_file_name "$directory/$_";
 282         text_extract( $h );
 283     }
 284     close INPUT;
 285 } else {
 286     convert_translation_file;
 287 }
 288 generate_po_file;
 289
 290 warn "This input will not work with Mozilla standards-compliant mode\n", undef
 291         if TmplTokenizer::syntaxerror_p;
 292
 293
 294 exit(-1) if TmplTokenizer::fatal_p;
 295
 296 ###############################################################################
 297
 298 =head1 DESCRIPTION
 299
 300 This is an experimental script based on the modularized
 301 text-extract2.pl script.  It has behaviour similar to
 302 xgettext(1), and generates gettext-compatible output files.
 303
 304 A gettext-like format provides the following advantages:
 305
 306 =over
 307
 308 =item -
 309
 310 (Future goal)
 311 Translation to non-English-like languages with different word
 312 order:  gettext's c-format strings can theoretically be
 313 emulated if we are able to do some analysis on the .tmpl input
 314 and treat <TMPL_VAR> in a way similar to %s.
 315
 316 =item -
 317
 318 Context for the extracted strings:  the gettext format provides
 319 the filenames and line numbers where each string can be found.
 320 The translator can read the source file and see the context,
 321 in case the string by itself can mean several different things.
 322
 323 =item -
 324
 325 Place for the translator to add comments about the translations.
 326
 327 =item -
 328
 329 Gettext-compatible tools, if any, might be usable if we adopt
 330 the gettext format.
 331
 332 =back
 333
 334 Note that this script is experimental and should still be
 335 considered unstable.
 336
 337 Please refer to the explanation in tmpl_process3 for further
 338 details.
 339
 340 If you want to generate GNOME-style POTFILES.in files, such
 341 files (passed to -f) can be generated thus:
 342
 343         (cd ../.. && find koha-tmpl/opac-tmpl/default/en
 344                 -name \*.inc -o -name \*.tmpl) > opac/POTFILES.in
 345         (cd ../.. && find koha-tmpl/intranet-tmpl/default/en
 346                 -name \*.inc -o -name \*.tmpl) > intranet/POTFILES.in
 347
 348 This is, however, quite pointless, because the "create" and
 349 "update" actions have already been implemented in tmpl_process3.pl.
 350
 351 =head1 SEE ALSO
 352
 353 tmpl_process.pl,
 354 xgettext(1),
 355 Locale::PO(3),
 356 translator_doc.txt
 357
 358 =head1 BUGS
 359
 360 There probably are some. Bugs related to scanning of <INPUT>
 361 tags seem to be especially likely to be present.
 362
 363 Its diagnostics are probably too verbose.
 364
 365 When a <TMPL_VAR> within a JavaScript-related attribute is
 366 detected, the script currently displays no warnings at all.
 367 It might be good to display some kind of warning.
 368
 369 Its sort order (-s option) seems to be different than the real
 370 xgettext(1)'s sort option. This will result in translation
 371 strings inside the generated PO file spuriously moving about
 372 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
 373
 374 =cut