[Koha-cvs] CVS: koha/misc/translator TmplTokenizer.pm,1.16,1.17 tmpl_process3.pl,1.2,1.3 xgettext.pl,1.1,1.2

Sun Feb 22 06:18:55 CET 2004

Update of /cvsroot/koha/koha/misc/translator
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv31961

Modified Files:
	TmplTokenizer.pm tmpl_process3.pl xgettext.pl 
Log Message:
Handle the iso8859-1 charset somewhat, so that when the po file is in
either iso8859-1 or utf8, msgmerge(1) won't crap out. The code is ugly;
the conversion table is hard-coded, and in some places not very appropriate.

However, this does fix the case where a few strings containing French
characters can't be translated. As a side effect, tmpl_process3 can now
also be used for French or other languages using iso8859-1.

Index: TmplTokenizer.pm
===================================================================
RCS file: /cvsroot/koha/koha/misc/translator/TmplTokenizer.pm,v
retrieving revision 1.16
retrieving revision 1.17
diff -C2 -r1.16 -r1.17
*** TmplTokenizer.pm	20 Feb 2004 07:52:32 -0000	1.16
--- TmplTokenizer.pm	22 Feb 2004 05:18:52 -0000	1.17
***************
*** 540,544 ****
  }

! # Complication function that shouldn't be here
  sub parametrize ($@) {
      my($fmt, @params) = @_;
--- 540,544 ----
  }

! # Some functions that shouldn't be here... should be moved out some time
  sub parametrize ($@) {
      my($fmt, @params) = @_;
***************
*** 573,576 ****
--- 573,584 ----
  }

+ sub charset_canon ($) {
+     my($charset) = @_;
+     $charset = uc($charset);
+     $charset = "$1-$2" if $charset =~ /^(ISO|UTF)(\d.*)/i;
+     $charset = 'Big5' if $charset eq 'BIG5'; # "Big5" must be in mixed case
+     return $charset;
+ }
+ 
  ###############################################################################

Index: tmpl_process3.pl
===================================================================
RCS file: /cvsroot/koha/koha/misc/translator/tmpl_process3.pl,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -r1.2 -r1.3
*** tmpl_process3.pl	19 Feb 2004 21:28:14 -0000	1.2
--- tmpl_process3.pl	22 Feb 2004 05:18:52 -0000	1.3
***************
*** 27,30 ****
--- 27,31 ----
  use vars qw( $href );
  use vars qw( $type );	# file extension (DOS form without the dot) to match
+ use vars qw( $charset_in $charset_out );

  ###############################################################################
***************
*** 99,103 ****
  }

- # FIXME: Should we use the GNOME convention of using POTFILES.in instead?
  sub listfiles ($$) {
      my($dir, $type) = @_;
--- 100,103 ----
***************
*** 147,150 ****
--- 147,156 ----
  VerboseWarnings::set_pedantic_mode $pedantic_p;

+ # keep the buggy Locale::PO quiet if it says stupid things
+ $SIG{__WARN__} = sub {
+ 	my($s) = @_;
+ 	print STDERR $s unless $s =~ /^Strange line in [^:]+: #~/s
+     };
+ 
  my $action = shift or usage_error('You must specify an ACTION.');
  usage_error('You must at least specify input and string list filenames.')
***************
*** 180,192 ****
  }

  if ($action eq 'create')  {
      # updates the list. As the list is empty, every entry will be added
      die "$str_file: Output file already exists" if -f $str_file;
      my($tmph, $tmpfile) = tmpnam();
      for my $input (@in_files) {
  	print $tmph "$input\n";
      }
      close $tmph;
!     system {'./xgettext.pl'} ('xgettext.pl', '-s', '-f', $tmpfile, '-o', $str_file);
      unlink $tmpfile || warn_normal "$tmpfile: unlink failed: $!\n", undef;

--- 186,221 ----
  }

+ # restores the string list from file
+ $href = Locale::PO->load_file_ashash($str_file);
+ 
+ # guess the charsets. HTML::Templates defaults to iso-8859-1
+ if (defined $href) {
+     $charset_out = TmplTokenizer::charset_canon $2
+ 	    if $href->{'""'}->msgstr =~ /\bcharset=(["']?)([^;\s"'\\]+)\1/;
+     for my $msgid (keys %$href) {
+ 	if ($msgid =~ /\bcharset=(["']?)([^;\s"'\\]+)\1/) {
+ 	    my $candidate = TmplTokenizer::charset_canon $2;
+ 	    die "Conflicting charsets in msgid: $charset_in vs $candidate\n"
+ 		    if defined $charset_in && $charset_in ne $candidate;
+ 	    $charset_in = $2;
+ 	}
+     }
+ }
+ if (!defined $charset_in) {
+     $charset_in = TmplTokenizer::charset_canon 'iso8859-1';
+     warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
+ }
+ 
  if ($action eq 'create')  {
      # updates the list. As the list is empty, every entry will be added
      die "$str_file: Output file already exists" if -f $str_file;
      my($tmph, $tmpfile) = tmpnam();
+     # Generate the temporary file that acts as <MODULE>/POTFILES.in
      for my $input (@in_files) {
  	print $tmph "$input\n";
      }
      close $tmph;
!     # Generate the specified po file ($str_file)
!     system ('xgettext.pl', '-s', '-f', $tmpfile, '-o', $str_file);
      unlink $tmpfile || warn_normal "$tmpfile: unlink failed: $!\n", undef;

***************
*** 195,203 ****
      my($tmph2, $tmpfile2) = tmpnam();
      close $tmph2; # We just want a name
      for my $input (@in_files) {
  	print $tmph1 "$input\n";
      }
      close $tmph1;
!     system('./xgettext.pl', '-s', '-f', $tmpfile1, '-o', $tmpfile2);
      system('msgmerge', '-U', '-s', $str_file, $tmpfile2);
      unlink $tmpfile1 || warn_normal "$tmpfile1: unlink failed: $!\n", undef;
--- 224,239 ----
      my($tmph2, $tmpfile2) = tmpnam();
      close $tmph2; # We just want a name
+     # Generate the temporary file that acts as <MODULE>/POTFILES.in
      for my $input (@in_files) {
  	print $tmph1 "$input\n";
      }
      close $tmph1;
!     # Generate the temporary file that acts as <MODULE>/<LANG>.pot
!     system('./xgettext.pl', '-s', '-f', $tmpfile1, '-o', $tmpfile2,
! 	    (defined $charset_in? ('-I', $charset_in): ()),
! 	    (defined $charset_out? ('-O', $charset_out): ()));
!     # Merge the temporary "pot file" with the specified po file ($str_file)
!     # FIXME: msgmerge(1) is a Unix dependency
!     # FIXME: need to check the return value
      system('msgmerge', '-U', '-s', $str_file, $tmpfile2);
      unlink $tmpfile1 || warn_normal "$tmpfile1: unlink failed: $!\n", undef;
***************
*** 222,228 ****
      close INPUT;

-     # restores the string list from file
-     $href = Locale::PO->load_file_ashash($str_file);
- 
      # creates the new tmpl file using the new translation
      for my $input (@in_files) {
--- 258,261 ----

Index: xgettext.pl
===================================================================
RCS file: /cvsroot/koha/koha/misc/translator/xgettext.pl,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -r1.1 -r1.2
*** xgettext.pl	19 Feb 2004 21:24:30 -0000	1.1
--- xgettext.pl	22 Feb 2004 05:18:52 -0000	1.2
***************
*** 13,19 ****
  use VerboseWarnings;

  use vars qw( $files_from $directory $output $sort );
  use vars qw( $pedantic_p );
! use vars qw( %text );

  ###############################################################################
--- 13,60 ----
  use VerboseWarnings;

+ use vars qw( $convert_from );
  use vars qw( $files_from $directory $output $sort );
  use vars qw( $pedantic_p );
! use vars qw( %text %translation );
! use vars qw( $charset_in $charset_out );
! 
! ###############################################################################
! 
! use vars qw( @latin1_utf8 );
! @latin1_utf8 = (
!     "\302\200", "\302\201", "\302\202", "\302\203", "\302\204", "\302\205",
!     "\302\206", "\302\207", "\302\210", "\302\211", "\302\212", "\302\213",
!     "\302\214", "\302\215",   undef,      undef,    "\302\220", "\302\221",
!     "\302\222", "\302\223", "\302\224", "\302\225", "\302\226", "\302\227",
!     "\302\230", "\302\231", "\302\232", "\302\233", "\302\234", "\302\235",
!     "\302\236", "\302\237", "\302\240", "\302\241", "\302\242", "\302\243",
!     "\302\244", "\302\245", "\302\246", "\302\247", "\302\250", "\302\251",
!     "\302\252", "\302\253", "\302\254", "\302\255", "\302\256", "\302\257",
!     "\302\260", "\302\261", "\302\262", "\302\263", "\302\264", "\302\265",
!     "\302\266", "\302\267", "\302\270", "\302\271", "\302\272", "\302\273",
!     "\302\274", "\302\275", "\302\276", "\302\277", "\303\200", "\303\201",
!     "\303\202", "\303\203", "\303\204", "\303\205", "\303\206", "\303\207",
!     "\303\210", "\303\211", "\303\212", "\303\213", "\303\214", "\303\215",
!     "\303\216", "\303\217", "\303\220", "\303\221", "\303\222", "\303\223",
!     "\303\224", "\303\225", "\303\226", "\303\227", "\303\230", "\303\231",
!     "\303\232", "\303\233", "\303\234", "\303\235", "\303\236", "\303\237",
!     "\303\240", "\303\241", "\303\242", "\303\243", "\303\244", "\303\245",
!     "\303\246", "\303\247", "\303\250", "\303\251", "\303\252", "\303\253",
!     "\303\254", "\303\255", "\303\256", "\303\257", "\303\260", "\303\261",
!     "\303\262", "\303\263", "\303\264", "\303\265", "\303\266", "\303\267",
!     "\303\270", "\303\271", "\303\272", "\303\273", "\303\274", "\303\275",
!     "\303\276", "\303\277" );
! 
! sub charset_convert ($) {
!     my($s) = @_;
!     if ($s !~ /[\200-\377]/s) { # FIXME: don't worry about iso2022 for now
! 	;
!     } elsif ($charset_in eq 'ISO-8859-1' && $charset_out eq 'UTF-8') {
! 	$s =~ s/[\200-\377]/ $latin1_utf8[ord($&) - 128] /egs;
!     } elsif ($charset_in ne $charset_out) {
! 	VerboseWarnings::warn_normal "conversion from $charset_in to $charset_out is not supported\n", undef;
!     }
!     return $s;
! }

  ###############################################################################
***************
*** 88,91 ****
--- 129,134 ----
  sub generate_po_file () {
      # We don't emit the Plural-Forms header; it's meaningless for us
+     my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
+     $pot_charset = TmplTokenizer::charset_canon $pot_charset;
      print OUTPUT <<EOF;
  # SOME DESCRIPTIVE TITLE.
***************
*** 103,107 ****
  "Language-Team: LANGUAGE <LL\@li.org>\\n"
  "MIME-Version: 1.0\\n"
! "Content-Type: text/plain; charset=CHARSET\\n"
  "Content-Transfer-Encoding: 8bit\\n"

--- 146,150 ----
  "Language-Team: LANGUAGE <LL\@li.org>\\n"
  "MIME-Version: 1.0\\n"
! "Content-Type: text/plain; charset=$pot_charset\\n"
  "Content-Transfer-Encoding: 8bit\\n"

***************
*** 114,123 ****
  	    my $pathname = $token->pathname;
  	    $pathname =~ s/^$directory_re//os;
! 	    printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number;
  	    $cformat_p = 1 if $token->type == TmplTokenType::TEXT_PARAMETRIZED;
  	}
  	printf OUTPUT "#, c-format\n" if $cformat_p;
! 	printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po( $t );
! 	printf OUTPUT "msgstr \"\"\n\n";
      }
  }
--- 157,207 ----
  	    my $pathname = $token->pathname;
  	    $pathname =~ s/^$directory_re//os;
! 	    printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number
! 		    if defined $pathname && defined $token->line_number;
  	    $cformat_p = 1 if $token->type == TmplTokenType::TEXT_PARAMETRIZED;
  	}
  	printf OUTPUT "#, c-format\n" if $cformat_p;
! 	printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po( charset_convert $t );
! 	printf OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
! 		TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
!     }
! }
! 
! ###############################################################################
! 
! sub convert_translation_file () {
!     open(INPUT, "<$convert_from") || die "$convert_from: $!\n";
!     VerboseWarnings::set_input_file_name $convert_from;
!     while (<INPUT>) {
! 	chomp;
! 	my($msgid, $msgstr) = split(/\t/);
! 	die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
! 		unless defined $msgstr;
! 
! 	# Fixup some of the bad strings
! 	$msgid =~ s/^SELECTED>//;
! 
! 	# Create dummy token
! 	my $token = TmplToken->new( $msgid, TmplTokenType::UNKNOWN, undef, undef );
! 	remember( $token, $msgid );
! 	$msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
! 	$translation{$msgid} = $msgstr unless $msgstr eq '*****';
! 
! 	if ($msgid  =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
! 	    my $candidate = TmplTokenizer::charset_canon $2;
! 	    die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
! 		    if defined $charset_in && $charset_in ne $candidate;
! 	    $charset_in = $candidate;
! 	}
! 	if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
! 	    my $candidate = TmplTokenizer::charset_canon $2;
! 	    die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
! 		    if defined $charset_out && $charset_out ne $candidate;
! 	    $charset_out = $candidate;
! 	}
!     }
!     if (!defined $charset_in) {
! 	$charset_in = $charset_out = TmplTokenizer::charset_canon 'iso8859-1';
! 	warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
      }
  }
***************
*** 165,171 ****
--- 249,259 ----
  Getopt::Long::config qw( bundling no_auto_abbrev );
  GetOptions(
+     'charset=s'	=> sub { $charset_in = $charset_out = $_[1] },	# INTERNAL
+     'convert-from=s'			=> \$convert_from,
      'D|directory=s'			=> \$directory,
      'f|files-from=s'			=> \$files_from,
+     'I|input-charset=s'			=> \$charset_in,	# INTERNAL
      'pedantic-warnings|pedantic'	=> sub { $pedantic_p = 1 },
+     'O|output-charset=s'		=> \$charset_out,	# INTERNAL
      'output|o=s'			=> \$output,
      's|sort-output'			=> sub { $sort = 's' },
***************
*** 177,183 ****
  VerboseWarnings::set_pedantic_mode $pedantic_p;

! usage_error('Missing mandatory option -f') unless defined $files_from;
  $directory = '.' unless defined $directory;

  if (defined $output && $output ne '-') {
      open(OUTPUT, ">$output") || die "$output: $!\n";
--- 265,275 ----
  VerboseWarnings::set_pedantic_mode $pedantic_p;

! usage_error('Missing mandatory option -f')
! 	unless defined $files_from || defined $convert_from;
  $directory = '.' unless defined $directory;

+ usage_error('You cannot specify both --convert-from and --files-from')
+ 	if defined $convert_from && defined $files_from;
+ 
  if (defined $output && $output ne '-') {
      open(OUTPUT, ">$output") || die "$output: $!\n";
***************
*** 186,198 ****
  }

! open(INPUT, "<$files_from") || die "$files_from: $!\n";
! while (<INPUT>) {
!     chomp;
!     my $h = TmplTokenizer->new( "$directory/$_" );
!     $h->set_allow_cformat( 1 );
!     VerboseWarnings::set_input_file_name "$directory/$_";
!     text_extract( $h );
  }
- close INPUT;
  generate_po_file;

--- 278,294 ----
  }

! if (defined $files_from) {
!     open(INPUT, "<$files_from") || die "$files_from: $!\n";
!     while (<INPUT>) {
! 	chomp;
! 	my $h = TmplTokenizer->new( "$directory/$_" );
! 	$h->set_allow_cformat( 1 );
! 	VerboseWarnings::set_input_file_name "$directory/$_";
! 	text_extract( $h );
!     }
!     close INPUT;
! } else {
!     convert_translation_file;
  }
  generate_po_file;