[Koha-cvs] CVS: koha/misc/translator tmpl_process3.pl,NONE,1.1 xgettext.pl,NONE,1.1 TmplToken.pm,1.2,1.3 TmplTokenType.pm,1.2,1.3 TmplTokenizer.pm,1.10,1.11 text-extract2.pl,1.40,1.41

Thu Feb 19 22:24:33 CET 2004

Update of /cvsroot/koha/koha/misc/translator
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv27741

Modified Files:
	TmplToken.pm TmplTokenType.pm TmplTokenizer.pm 
	text-extract2.pl 
Added Files:
	tmpl_process3.pl xgettext.pl 
Log Message:
New scripts for translation into Chinese and other languages where English
word order is too different than the word order of the target language to
yield meaningful translations.

The new scripts use a different translation file format (namely standard
gettext-style PO files).

This seems to reasonably work (e.g., producing an empty en_GB translation
then installing seems to not corrupt the "translated" files), but it likely
will still contain some bugs. There is also little documentation, but try
to run perldoc on the .p[lm] files to see what's there. There are also some
spurious warnings (both from bugs in the new scripts and from buggy third-
party Locale::PO module).

--- NEW FILE ---
#!/usr/bin/perl
# This file is part of Koha
# Parts copyright 2003-2004 Paul Poulain
# Parts copyright 2003-2004 Jerome Vizcaino
# Parts copyright 2004 Ambrose Li

=head1 NAME

tmpl_process3.pl - Experimental version of tmpl_process.pl
using gettext-compatible translation files

=cut

use strict;
use Getopt::Long;
use Locale::PO;
use File::Temp qw( :POSIX );
use TmplTokenizer;
use VerboseWarnings qw( error_normal warn_normal );

###############################################################################

use vars qw( @in_files $in_dir $str_file $out_dir );
use vars qw( @excludes $exclude_regex );
use vars qw( $recursive_p );
use vars qw( $pedantic_p );
use vars qw( $href );
use vars qw( $type );	# file extension (DOS form without the dot) to match

###############################################################################

sub find_translation ($) {
    my($s) = @_;
    my $key = TmplTokenizer::quote_po($s) if $s =~ /\S/;
    return defined $href->{$key}
		&& length Locale::PO->dequote($href->{$key}->msgstr)?
	   Locale::PO->dequote($href->{$key}->msgstr): $s;
}

sub text_replace_tag ($$) {
    my($t, $attr) = @_;
    my $it;
    # value [tag=input], meta
    my $tag = lc($1) if $t =~ /^<(\S+)/s;
    my $translated_p = 0;
    for my $a ('alt', 'content', 'title', 'value') {
	if ($attr->{$a}) {
	    next if $a eq 'content' && $tag ne 'meta';
	    next if $a eq 'value' && ($tag ne 'input'
		|| (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio)$/)); # FIXME
	    my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
	    my($pre, $trimmed, $post) = TmplTokenizer::trim $val;
	    if ($val =~ /\S/s) {
		my $s = $pre . find_translation($trimmed) . $post;
		if ($attr->{$a}->[1] ne $s) { #FIXME
		    $attr->{$a}->[1] = $s; # FIXME
		    $attr->{$a}->[2] = ($s =~ /"/s)? "'$s'": "\"$s\""; #FIXME
		    $translated_p = 1;
		}
	    }
	}
    }
    if ($translated_p) {
	$it = "<$tag"
	    . join('', map {
		    sprintf(' %s=%s', $_, $attr->{$_}->[2]) #FIXME
		} sort {
		    $attr->{$a}->[3] <=> $attr->{$b}->[3] #FIXME
		} keys %$attr)
	    . '>';
    } else {
	$it = $t;
    }
    return $it;
}

sub text_replace (**) {
    my($h, $output) = @_;
    for (;;) {
	my $s = TmplTokenizer::next_token $h;
    last unless defined $s;
	my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
	if ($kind eq TmplTokenType::TEXT) {
	    my($pre, $trimmed, $post) = TmplTokenizer::trim $t;
	    print $output $pre, find_translation($trimmed), $post;
	} elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
	    my $fmt = find_translation($s->form);
	    print $output TmplTokenizer::parametrize($fmt, map {
		my($kind, $t, $attr) = ($_->type, $_->string, $_->attributes);
		$kind == TmplTokenType::TAG && %$attr?
		    text_replace_tag($t, $attr): $t } $s->parameters);
	} elsif ($kind eq TmplTokenType::TAG && %$attr) {
	    print $output text_replace_tag($t, $attr);
	} elsif (defined $t) {
	    print $output $t;
	}
    }
}

# FIXME: Should we use the GNOME convention of using POTFILES.in instead?
sub listfiles ($$) {
    my($dir, $type) = @_;
    my @it = ();
    if (opendir(DIR, $dir)) {
	my @dirent = readdir DIR;	# because DIR is shared when recursing
	closedir DIR;
	for my $dirent (@dirent) {
	    my $path = "$dir/$dirent";
	    if ($dirent =~ /^\./ || $dirent eq 'CVS' || $dirent eq 'RCS'
	    || (defined $exclude_regex && $dirent =~ /^(?:$exclude_regex)$/)) {
		;
	    } elsif (-f $path) {
		push @it, $path if !defined $type || $dirent =~ /\.(?:$type)$/;
	    } elsif (-d $path && $recursive_p) {
		push @it, listfiles($path, $type);
	    }
	}
    } else {
	warn_normal "$dir: $!", undef;
    }
    return @it;
}

###############################################################################

sub usage_error (;$) {
    for my $msg (split(/\n/, $_[0])) {
	print STDERR "$msg\n";
    }
    print STDERR "Try `$0 --help' for more information.\n";
    exit(-1);
}

###############################################################################

GetOptions(
    'input|i=s'				=> \@in_files,
    'outputdir|o=s'			=> \$out_dir,
    'recursive|r'			=> \$recursive_p,
    'str-file|s=s'			=> \$str_file,
    'exclude|x=s'			=> \@excludes,
    'pedantic-warnings|pedantic'	=> sub { $pedantic_p = 1 },
) || usage_error;

VerboseWarnings::set_application_name $0;
VerboseWarnings::set_pedantic_mode $pedantic_p;

my $action = shift or usage_error('You must specify an ACTION.');
usage_error('You must at least specify input and string list filenames.')
    if !@in_files || !defined $str_file;

# Type match defaults to *.tmpl plus *.inc if not specified
$type = "tmpl|inc" if !defined($type);

# Check the inputs for being files or directories
for my $input (@in_files) {
    usage_error("$input: Input must be a file or directory.\n"
	    . "(Symbolic links are not supported at the moment)")
	unless -d $input || -f $input;;
}

# Generates the global exclude regular expression
$exclude_regex =  '(?:'.join('|', @excludes).')' if @excludes;

# Generate the list of input files if a directory is specified
if (-d $in_files[0]) {
    die "If you specify a directory as input, you must specify only it.\n"
	    if @in_files > 1;

    # input is a directory, generates list of files to process
    $in_dir = $in_files[0];
    $in_dir =~ s/\/$//; # strips the trailing / if any
    @in_files = listfiles($in_dir, $type);
} else {
    for my $input (@in_files) {
	die "You cannot specify input files and directories at the same time.\n"
		unless -f $input;
    }
}

if ($action eq 'create')  {
    # updates the list. As the list is empty, every entry will be added
    die "$str_file: Output file already exists" if -f $str_file;
    my($tmph, $tmpfile) = tmpnam();
    for my $input (@in_files) {
	print $tmph "$input\n";
    }
    close $tmph;
    system {'./xgettext.pl'} ('xgettext.pl', '-s', '-f', $tmpfile, '-o', $str_file);
    unlink $tmpfile || warn_normal "$tmpfile: unlink failed: $!\n", undef;

} elsif ($action eq 'update') {
    my($tmph1, $tmpfile1) = tmpnam();
    my($tmph2, $tmpfile2) = tmpnam();
    close $tmph2; # We just want a name
    for my $input (@in_files) {
	print $tmph1 "$input\n";
    }
    close $tmph1;
    system('./xgettext.pl', '-s', '-f', $tmpfile1, '-o', $tmpfile2);
    system('msgmerge', '-U', '-s', $str_file, $tmpfile2);
    unlink $tmpfile1 || warn_normal "$tmpfile1: unlink failed: $!\n", undef;
    unlink $tmpfile2 || warn_normal "$tmpfile2: unlink failed: $!\n", undef;

} elsif ($action eq 'install') {
    if(!defined($out_dir)) {
	usage_error("You must specify an output directory when using the install method.");
    }

    if ($in_dir eq $out_dir) {
	warn "You must specify a different input and output directory.\n";
	exit -1;
    }

    # Make sure the output directory exists
    # (It will auto-create it, but for compatibility we should not)
    -d $out_dir || die "$out_dir: The directory does not exist\n";

    # Try to open the file, because Locale::PO doesn't check :-/
    open(INPUT, "<$str_file") || die "$str_file: $!\n";
    close INPUT;

    # restores the string list from file
    $href = Locale::PO->load_file_ashash($str_file);

    # creates the new tmpl file using the new translation
    for my $input (@in_files) {
	die "Assertion failed"
		unless substr($input, 0, length($in_dir) + 1) eq "$in_dir/";

	my $h = TmplTokenizer->new( $input );
	$h->set_allow_cformat( 1 );
	VerboseWarnings::set_input_file_name $input;

	my $target = $out_dir . substr($input, length($in_dir));
	my $targetdir = $` if $target =~ /[^\/]+$/s;
	if (!-d $targetdir) {
	    print STDERR "Making directory $targetdir...";
	    # creates with rwxrwxr-x permissions
	    mkdir($targetdir, 0775) || warn_normal "$targetdir: $!", undef;
	}
	print STDERR "Creating $target...\n";
	open( OUTPUT, ">$target" ) || die "$target: $!\n";
	text_replace( $h, *OUTPUT );
	close OUTPUT;
    }

} else {
    usage_error('Unknown action specified.');
}
exit 0;

###############################################################################

=head1 SYNOPSIS

./tmpl_process3.pl [ I<tmpl_process.pl options> ]

=head1 DESCRIPTION

This is an experimental version of the tmpl_process.pl script,
using standard gettext-style PO files.

Currently, the create, update, and install actions have all been
reimplemented and seem to work.

The create action calls xgettext.pl to do the actual work;
the update action calls xgettext.pl and msgmerge(1) to do the
actual work.

The script can detect <TMPL_VAR> directives embedded inside what
appears to be a full sentence (this actual work being done by
TmplTokenizer(3)); these larger patterns appear in the translation
file as c-format strings with %s.

=head1 BUGS

The --help option has not been implemented yet.

There are probably some real bugs too, since this has not been
tested very much.

xgettext.pl must be present in the current directory; the
msgmerge(1) command must also be present in the search path.
The script currently does not check carefully whether these
dependent commands are present.

Locale::PO(3) has a lot of bugs. It can neither parse nor
generate GNU PO files properly; a couple of workarounds have
been written in TmplTokenizer and more is likely to be needed
(e.g., to get rid of the "Strange line" warning for #~).

=head1 SEE ALSO

xgettext.pl,
msgmerge(1),
Locale::PO(3),
translator_doc.txt

=cut

--- NEW FILE ---
#!/usr/bin/perl

=head1 NAME

xgettext.pl - xgettext(1)-like interface for .tmpl strings extraction

=cut

use strict;
use Getopt::Long;
use Locale::PO;
use TmplTokenizer;
use VerboseWarnings;

use vars qw( $files_from $directory $output $sort );
use vars qw( $pedantic_p );
use vars qw( %text );

###############################################################################

sub remember ($$) {
    my($token, $string) = @_;
    $text{$string} = [] unless defined $text{$string};
    push @{$text{$string}}, $token;
}

###############################################################################

sub string_list () {
    my @t = keys %text;
    # The real gettext tools seems to sort case sensitively; I don't know why
    @t = sort { $a cmp $b } @t if $sort eq 's';
    @t = sort {
	    my @aa = sort { $a->pathname cmp $b->pathname
		    || $a->line_number <=> $b->line_number } @{$text{$a}};
	    my @bb = sort { $a->pathname cmp $b->pathname
		    || $a->line_number <=> $b->line_number } @{$text{$b}};
	    $aa[0]->pathname cmp $bb[0]->pathname
		    || $aa[0]->line_number <=> $bb[0]->line_number;
	} @t if $sort eq 'F';
    return @t;
}

###############################################################################

sub text_extract (*) {
    my($h) = @_;
    for (;;) {
	my $s = TmplTokenizer::next_token $h;
    last unless defined $s;
	my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
	if ($kind eq TmplTokenType::TEXT) {
	    #$t = TmplTokenizer::trim $t;
	    remember( $s, $t ) if $t =~ /\S/s;
	} elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
	    #$t = TmplTokenizer::trim $t;
	    remember( $s, $s->form ) if $s->form =~ /\S/s;
	} elsif ($kind eq TmplTokenType::TAG && %$attr) {
	    # value [tag=input], meta
	    my $tag = lc($1) if $t =~ /^<(\S+)/s;
	    for my $a ('alt', 'content', 'title', 'value') {
		if ($attr->{$a}) {
		    next if $a eq 'content' && $tag ne 'meta';
		    next if $a eq 'value' && ($tag ne 'input'
			|| (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio)$/)); # FIXME
		    my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
		    $val = TmplTokenizer::trim $val;
		    remember( $s, $val ) if $val =~ /\S/s;
		}
	    }
	}
    }
}

###############################################################################

sub generate_strings_list () {
    # Emit all extracted strings.
    # Don't emit pure whitespace, pure numbers, or TMPL_VAR's.
    for my $t (string_list) {
	printf OUTPUT "%s\n", $t
	    unless TmplTokenizer::blank_p($t) || $t =~ /^\d+$/;
    }
}

###############################################################################

sub generate_po_file () {
    # We don't emit the Plural-Forms header; it's meaningless for us
    print OUTPUT <<EOF;
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
# This file is distributed under the same license as the PACKAGE package.
# FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
#
#, fuzzy
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"POT-Creation-Date: 2004-02-05 20:55-0500\\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
"Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL\@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=CHARSET\\n"
"Content-Transfer-Encoding: 8bit\\n"

EOF
    my $directory_re = quotemeta("$directory/");
    for my $t (string_list) {
	next if TmplTokenizer::blank_p($t) || $t =~ /^\d+$/;
	my $cformat_p;
	for my $token (@{$text{$t}}) {
	    my $pathname = $token->pathname;
	    $pathname =~ s/^$directory_re//os;
	    printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number;
	    $cformat_p = 1 if $token->type == TmplTokenType::TEXT_PARAMETRIZED;
	}
	printf OUTPUT "#, c-format\n" if $cformat_p;
	printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po( $t );
	printf OUTPUT "msgstr \"\"\n\n";
    }
}

###############################################################################

sub usage ($) {
    my($exitcode) = @_;
    my $h = $exitcode? *STDERR: *STDOUT;
    print $h <<EOF;
Usage: $0 [OPTIONS]
Extract translatable strings from given HTML::Template input files.

Input file location:
  -f, --files-from=FILE          Get list of input files from FILE
  -D, --directory=DIRECTORY      Add DIRECTORY to list for input files search

Output file location:
  -o, --output=FILE              Write output to specified file

HTML::Template options:
      --pedantic-warnings        Issue warnings even for detected problems
			         which are likely to be harmless

Output details:
  -s, --sort-output              generate sorted output
  -F, --sort-by-file             sort output by file location

Informative output:
      --help                     Display this help and exit
EOF
    exit($exitcode);
}

###############################################################################

sub usage_error (;$) {
    print STDERR "$_[0]\n" if @_;
    print STDERR "Try `$0 --help' for more information.\n";
    exit(-1);
}

###############################################################################

Getopt::Long::config qw( bundling no_auto_abbrev );
GetOptions(
    'D|directory=s'			=> \$directory,
    'f|files-from=s'			=> \$files_from,
    'pedantic-warnings|pedantic'	=> sub { $pedantic_p = 1 },
    'output|o=s'			=> \$output,
    's|sort-output'			=> sub { $sort = 's' },
    'F|sort-by-file'			=> sub { $sort = 'F' },
    'help'				=> sub { usage(0) },
) || usage_error;

VerboseWarnings::set_application_name $0;
VerboseWarnings::set_pedantic_mode $pedantic_p;

usage_error('Missing mandatory option -f') unless defined $files_from;
$directory = '.' unless defined $directory;

if (defined $output && $output ne '-') {
    open(OUTPUT, ">$output") || die "$output: $!\n";
} else {
    open(OUTPUT, ">&STDOUT");
}

open(INPUT, "<$files_from") || die "$files_from: $!\n";
while (<INPUT>) {
    chomp;
    my $h = TmplTokenizer->new( "$directory/$_" );
    $h->set_allow_cformat( 1 );
    VerboseWarnings::set_input_file_name "$directory/$_";
    text_extract( $h );
}
close INPUT;
generate_po_file;

warn "This input will not work with Mozilla standards-compliant mode\n", undef
	if TmplTokenizer::syntaxerror_p;

exit(-1) if TmplTokenizer::fatal_p;

###############################################################################

=head1 DESCRIPTION

This is an experimental script based on the modularized
text-extract2.pl script.  It has behaviour similar to
xgettext(1), and generates gettext-compatible output files.

A gettext-like format provides the following advantages:

=over

=item -

(Future goal)
Translation to non-English-like languages with different word
order:  gettext's c-format strings can theoretically be
emulated if we are able to do some analysis on the .tmpl input
and treat <TMPL_VAR> in a way similar to %s.

=item - 

Context for the extracted strings:  the gettext format provides
the filenames and line numbers where each string can be found.
The translator can read the source file and see the context,
in case the string by itself can mean several different things.

=item - 

Place for the translator to add comments about the translations.

=item -

Gettext-compatible tools, if any, might be usable if we adopt
the gettext format.

=back

Right now it does about the same thing as text-extract2.pl but
generates gettext-style output; however, because it is scanner-
instead of parser-based, it is able to address the 4 weaknesses
listed in translator_doc.txt.  Ultimately, the goal is to make
this able to do some kind of simple analysis on the input to
produce gettext-style output with c-format strings, in order to
facilitate translation to languages with a different word order
than English.

When the above is finished, the generated po file may contain
some HTML tags in addition to %s strings.

If you want to generate GNOME-style POTFILES.in files, such
files (passed to -f) can be generated thus:

	(cd ../.. && find koha-tmpl/opac-tmpl/default/en
		-name \*.inc -o -name \*.tmpl) > opac/POTFILES.in
	(cd ../.. && find koha-tmpl/intranet-tmpl/default/en
		-name \*.inc -o -name \*.tmpl) > intranet/POTFILES.in

This is, however, quite pointless, because the "create" and
"update" actions have already been implemented in tmpl_process3.pl.

=head1 SEE ALSO

tmpl_process.pl,
xgettext(1),
Locale::PO(3),
translator_doc.txt

=head1 BUGS

There probably are some. Bugs related to scanning of <INPUT>
tags seem to be especially likely to be present.

Its diagnostics are probably too verbose.

When a <TMPL_VAR> within a JavaScript-related attribute is
detected, the script currently displays no warnings at all.
It might be good to display some kind of warning.

Its sort order (-s option) seems to be different than the real
xgettext(1)'s sort option. This will result in translation
strings inside the generated PO file spuriously moving about
when tmpl_process3.pl calls msgmerge(1) to update the PO file.

=cut

Index: TmplToken.pm
===================================================================
RCS file: /cvsroot/koha/koha/misc/translator/TmplToken.pm,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -r1.2 -r1.3
*** TmplToken.pm	17 Feb 2004 05:42:27 -0000	1.2
--- TmplToken.pm	19 Feb 2004 21:24:30 -0000	1.3
***************
*** 68,71 ****
--- 68,104 ----
  }

+ # only meaningful for TEXT_PARAMETRIZED tokens
+ sub children {
+     my $this = shift;
+     return $this->{'_kids'};
+ }
+ 
+ # only meaningful for TEXT_PARAMETRIZED tokens
+ sub set_children {
+     my $this = shift;
+     $this->{'_kids'} = ref $_[0] eq 'ARRAY'? $_[0]: \@_;
+     return $this;
+ }
+ 
+ # only meaningful for TEXT_PARAMETRIZED tokens
+ # FIXME: DIRECTIVE is not necessarily TMPL_VAR !!
+ sub parameters {
+     my $this = shift;
+     return map { $_->type == TmplTokenType::DIRECTIVE? $_: ()} @{$this->{'_kids'}};
+ }
+ 
+ # only meaningful for TEXT_PARAMETRIZED tokens
+ sub form {
+     my $this = shift;
+     return $this->{'_form'};
+ }
+ 
+ # only meaningful for TEXT_PARAMETRIZED tokens
+ sub set_form {
+     my $this = shift;
+     $this->{'_form'} = $_[0];
+     return $this;
+ }
+ 
  ###############################################################################

Index: TmplTokenType.pm
===================================================================
RCS file: /cvsroot/koha/koha/misc/translator/TmplTokenType.pm,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -r1.2 -r1.3
*** TmplTokenType.pm	17 Feb 2004 03:02:39 -0000	1.2
--- TmplTokenType.pm	19 Feb 2004 21:24:30 -0000	1.3
***************
*** 15,18 ****
--- 15,19 ----

  This is a Java-style "safe enum" singleton class for types of TmplToken objects.
+ The predefined constants are

  =cut
***************
*** 25,28 ****
--- 26,30 ----
  @EXPORT_OK = qw(
      &TEXT
+     &TEXT_PARAMETRIZED
      &CDATA
      &TAG
***************
*** 36,40 ****
  ###############################################################################

! use vars qw( $_text $_cdata $_tag $_decl $_pi $_directive $_comment $_unknown );

  BEGIN {
--- 38,43 ----
  ###############################################################################

! use vars qw( $_text $_text_parametrized $_cdata
!     $_tag $_decl $_pi $_directive $_comment $_null $_unknown );

  BEGIN {
***************
*** 47,58 ****
  	return $self;
      };
!     $_text	= &$new(0, 'TEXT');
!     $_cdata	= &$new(1, 'CDATA');
!     $_tag	= &$new(2, 'TAG');
!     $_decl	= &$new(3, 'DECL');
!     $_pi	= &$new(4, 'PI');
!     $_directive	= &$new(5, 'DIRECTIVE');
!     $_comment	= &$new(6, 'COMMENT');
!     $_unknown	= &$new(7, 'UNKNOWN');
  }

--- 50,62 ----
  	return $self;
      };
!     $_text		= &$new(0, 'TEXT');
!     $_text_parametrized	= &$new(8, 'TEXT-PARAMETRIZED');
!     $_cdata		= &$new(1, 'CDATA');
!     $_tag		= &$new(2, 'TAG');
!     $_decl		= &$new(3, 'DECL');
!     $_pi		= &$new(4, 'PI');
!     $_directive		= &$new(5, 'DIRECTIVE');
!     $_comment		= &$new(6, 'COMMENT');
!     $_unknown		= &$new(7, 'UNKNOWN');
  }

***************
*** 62,75 ****
  }

! sub TEXT	() { $_text }
! sub CDATA	() { $_cdata }
! sub TAG		() { $_tag }
! sub DECL	() { $_decl }
! sub PI		() { $_pi }
! sub DIRECTIVE	() { $_directive }
! sub COMMENT	() { $_comment }
! sub UNKNOWN	() { $_unknown }

  ###############################################################################

  1;
--- 66,128 ----
  }

! sub TEXT		() { $_text }
! sub TEXT_PARAMETRIZED	() { $_text_parametrized }
! sub CDATA		() { $_cdata }
! sub TAG			() { $_tag }
! sub DECL		() { $_decl }
! sub PI			() { $_pi }
! sub DIRECTIVE		() { $_directive }
! sub COMMENT		() { $_comment }
! sub UNKNOWN		() { $_unknown }

  ###############################################################################

+ =over
+ 
+ =item TEXT
+ 
+ normal text (#text in the DTD)
+ 
+ =item TEXT_PARAMETRIZED
+ 
+ parametrized normal text
+ (result of simple recognition of text interspersed with <TMPL_VAR> directives;
+ this has to be explicitly enabled in the scanner)
+ 
+ =item CDATA
+ 
+ normal text (CDATA in the DTD)
+ 
+ =item TAG
+ 
+ something that has the form of an HTML tag
+ 
+ =item DECL
+ 
+ something that has the form of an SGML declaration
+ 
+ =item PI
+ 
+ something that has the form of an SGML processing instruction
+ 
+ =item DIRECTIVE
+ 
+ a HTML::Template directive (whether or not embedded in an SGML comment)
+ 
+ =item COMMENT
+ 
+ something that has the form of an HTML comment
+ (and is not recognized as an HTML::Template directive)
+ 
+ =item UNKNOWN
+ 
+ something that is not recognized at all by the scanner
+ 
+ =back
+ 
+ Note that end of file is currently represented by undef,
+ instead of a constant predefined by this module.
+ 
+ =cut
+ 
  1;

Index: TmplTokenizer.pm
===================================================================
RCS file: /cvsroot/koha/koha/misc/translator/TmplTokenizer.pm,v
retrieving revision 1.10
retrieving revision 1.11
diff -C2 -r1.10 -r1.11
*** TmplTokenizer.pm	18 Feb 2004 06:56:19 -0000	1.10
--- TmplTokenizer.pm	19 Feb 2004 21:24:30 -0000	1.11
***************
*** 23,31 ****
  This module is a simple-minded attempt at such a scanner.

- =head1 HISTORY
- 
- This tokenizer is mostly based
- on Ambrose's hideous Perl script known as subst.pl.
- 
  =cut

--- 23,26 ----
***************
*** 98,101 ****
--- 93,98 ----
  sub CDATA_CLOSE		() {'cdata-close'}

+ sub ALLOW_CFORMAT_P	() {'allow-cformat-p'}
+ 
  sub new {
      my $this = shift;
***************
*** 171,174 ****
--- 168,176 ----
  }

+ sub allow_cformat_p {
+     my $this = shift;
+     return $this->{+ALLOW_CFORMAT_P};
+ }
+ 
  # Simple setters

***************
*** 232,235 ****
--- 234,243 ----
  }

+ sub set_allow_cformat {
+     my $this = shift;
+     $this->{+ALLOW_CFORMAT_P} = $_[0];
+     return $this;
+ }
+ 
  ###############################################################################

***************
*** 306,310 ****
  	$this->_set_readahead( $' );
      # FIXME the following (the [<\s] part) is an unreliable HACK :-(
!     } elsif ($this->_peek_readahead =~ /^(?:[^<]|<[<\s])+/s) {	# non-space normal text
  	($kind, $it) = (TmplTokenType::TEXT, $&);
  	$this->_set_readahead( $' );
--- 314,318 ----
  	$this->_set_readahead( $' );
      # FIXME the following (the [<\s] part) is an unreliable HACK :-(
!     } elsif ($this->_peek_readahead =~ /^(?:[^<]|<[<\s])*(?:[^<\s])/s) {	# non-space normal text
  	($kind, $it) = (TmplTokenType::TEXT, $&);
  	$this->_set_readahead( $' );
***************
*** 324,331 ****
  		}
  	    } elsif ($this->_peek_readahead =~ /^$re_tag_compat/os) {
! 		($kind, $it) = (TmplTokenType::TAG, "$1>");
! 		$this->_set_readahead( $3 );
! 		$ok_p = 1;
! 		warn_normal "SGML \"closed start tag\" notation: $1<\n", $this->line_number_start if $2 eq '';
  	    } elsif ($this->_peek_readahead =~ /^<!--(?:(?!-->).)*-->/s) {
  		($kind, $it) = (TmplTokenType::COMMENT, $&);
--- 332,346 ----
  		}
  	    } elsif ($this->_peek_readahead =~ /^$re_tag_compat/os) {
! 		# If we detect a "closed start tag" but we know that the
! 		# following token looks like a TMPL_VAR, don't stop
! 		my($head, $tail, $post) = ($1, $2, $3);
! 		if ($tail eq '' && $post =~ $re_tmpl_var) {
! 		    warn_normal "Possible SGML \"closed start tag\" notation: $head<\n", $this->line_number;
! 		} else {
! 		    ($kind, $it) = (TmplTokenType::TAG, "$head>");
! 		    $this->_set_readahead( $post );
! 		    $ok_p = 1;
! 		    warn_normal "SGML \"closed start tag\" notation: $head<\n", $this->line_number if $tail eq '';
! 		}
  	    } elsif ($this->_peek_readahead =~ /^<!--(?:(?!-->).)*-->/s) {
  		($kind, $it) = (TmplTokenType::COMMENT, $&);
***************
*** 348,352 ****
  	    $kind = TmplTokenType::COMMENT if $it =~ /^<!--(?:(?!-->).)*-->/;
  	    if ($kind == TmplTokenType::COMMENT && $it =~ /^<!--\s*#include/s) {
! 		warn_normal "Apache #include directive found instead of HTML::Template directive <TMPL_INCLUDE>", $this->line_number_start;
  	    }
  	} elsif ($it =~ /^<\?/) {
--- 363,367 ----
  	    $kind = TmplTokenType::COMMENT if $it =~ /^<!--(?:(?!-->).)*-->/;
  	    if ($kind == TmplTokenType::COMMENT && $it =~ /^<!--\s*#include/s) {
! 		warn_normal "Apache #include directive found instead of HTML::Template <TMPL_INCLUDE> directive", $this->line_number_start;
  	    }
  	} elsif ($it =~ /^<\?/) {
***************
*** 363,371 ****
      }
      warn_normal "Unrecognizable token found: $it\n", $this->line_number_start
! 	    if $kind eq TmplTokenType::UNKNOWN;
      return defined $it? (ref $it? $it: TmplToken->new($it, $kind, $this->line_number, $this->filename)): undef;
  }

! sub next_token {
      my $this = shift;
      my $h = $this->_handle;
--- 378,386 ----
      }
      warn_normal "Unrecognizable token found: $it\n", $this->line_number_start
! 	    if $kind == TmplTokenType::UNKNOWN;
      return defined $it? (ref $it? $it: TmplToken->new($it, $kind, $this->line_number, $this->filename)): undef;
  }

! sub _next_token_intermediate {
      my $this = shift;
      my $h = $this->_handle;
***************
*** 373,377 ****
      if (!$this->cdata_mode_p) {
  	$it = $this->_next_token_internal($h);
! 	if (defined $it && $it->type eq TmplTokenType::TAG) {
  	    if ($it->string =~ /^<(script|style|textarea)\b/i) {
  		$this->_set_cdata_mode( 1 );
--- 388,392 ----
      if (!$this->cdata_mode_p) {
  	$it = $this->_next_token_internal($h);
! 	if (defined $it && $it->type == TmplTokenType::TAG) {
  	    if ($it->string =~ /^<(script|style|textarea)\b/i) {
  		$this->_set_cdata_mode( 1 );
***************
*** 397,403 ****
  }

  ###############################################################################

! # Other easy functions

  sub blank_p ($) {
--- 412,510 ----
  }

+ sub _token_groupable_p ($) { # groupable into a TEXT_PARAMETRIZED token
+     my($t) = @_;
+     return $t->type == TmplTokenType::TEXT
+ 	|| ($t->type == TmplTokenType::DIRECTIVE
+ 		&& $t->string =~ /^(?:$re_tmpl_var)$/os)
+ 	|| ($t->type == TmplTokenType::TAG
+ 		&& ($t->string =~ /^<\/?(?:b|em|h[123456]|i|u)\b/is
+ 		|| ($t->string =~ /^<input/i
+ 		    && $t->attributes->{'type'} =~ /^(?:text)$/i)))
+ }
+ 
+ sub _quote_cformat ($) {
+     my($s) = @_;
+     $s =~ s/%/%%/g;
+     return $s;
+ }
+ 
+ sub _formalize ($) {
+     my($t) = @_;
+     return $t->type == TmplTokenType::DIRECTIVE? '%s': _quote_cformat($t->string);
+ }
+ 
+ sub next_token {
+     my $this = shift;
+     my $h = $this->_handle;
+     my $it;
+     $this->{_queue} = [] unless defined $this->{_queue};
+     if (@{$this->{_queue}}) {
+ 	$it = pop @{$this->{_queue}};
+     } else {
+ 	$it = $this->_next_token_intermediate($h);
+ 	if ($this->allow_cformat_p && defined $it
+ 	    && ($it->type == TmplTokenType::TEXT?
+ 		!blank_p( $it->string ): _token_groupable_p( $it ))) {
+ 	    my @structure = ( $it );
+ 	    my($n_trailing_spaces, $next) = (0, undef);
+ 	    my($nonblank_text_p, $parametrized_p, $next) = (0, 0);
+ 	    if ($it->type == TmplTokenType::TEXT) {
+ 		$nonblank_text_p = 1 if !blank_p( $it->string );
+ 	    } elsif ($it->type == TmplTokenType::DIRECTIVE) {
+ 		$parametrized_p = 1;
+ 	    }
+ 	    for (my $i = 1, $n_trailing_spaces = 0;; $i += 1) {
+ 		$next = $this->_next_token_intermediate($h);
+ 		push @structure, $next; # for consistency (with initialization)
+ 	    last unless defined $next && _token_groupable_p( $next );
+ 		if ($next->type == TmplTokenType::TEXT) {
+ 		    if (blank_p( $next->string )) {
+ 			$n_trailing_spaces += 1;
+ 		    } else {
+ 			($n_trailing_spaces, $nonblank_text_p) = (0, 1);
+ 		    }
+ 		} elsif ($next->type == TmplTokenType::DIRECTIVE) {
+ 		    $n_trailing_spaces = 0;
+ 		    $parametrized_p = 1;
+ 		} else {
+ 		    $n_trailing_spaces = 0;
+ 		}
+ 	    }
+ 	    # Undo the last token
+ 	    push @{$this->{_queue}}, pop @structure;
+ 	    # Undo trailing blank tokens
+ 	    for (my $i = 0; $i < $n_trailing_spaces; $i += 1) {
+ 		push @{$this->{_queue}}, pop @structure;
+ 	    }
+ 	    if (@structure < 2) {
+ 		# Nothing to do
+ 		;
+ 	    } elsif ($nonblank_text_p && $parametrized_p) {
+ 		# Create the corresponding c-format string
+ 		my $string = join('', map { $_->string } @structure);
+ 		my $form = join('', map { _formalize $_ } @structure);
+ 		$it = TmplToken->new($string, TmplTokenType::TEXT_PARAMETRIZED, $it->line_number, $it->pathname);
+ 		$it->set_form( $form );
+ 		$it->set_children( @structure );
+ #	    } elsif ($nonblank_text_p) {
+ #		# Combine the strings
+ #		my $string = join('', map { $_->string } @structure);
+ #		;
+ 	    } else {
+ 		# Requeue the tokens thus seen for re-emitting
+ 		for (;;) {
+ 		    push @{$this->{_queue}}, pop @structure;
+ 		last if !@structure;
+ 		}
+ 		$it = pop @{$this->{_queue}};
+ 	    }
+ 	}
+     }
+     return $it;
+ }
+ 
  ###############################################################################

! # Other simple functions (These are not methods)

  sub blank_p ($) {
***************
*** 415,438 ****
  }

  ###############################################################################

! =head1 FUTURE PLANS

! Code could be written to detect template variables and
! construct gettext-c-format-string-like meta-strings (e.g., "Results %s
! through %s of %s records" that will be more likely to be translatable
! to languages where word order is very unlike English word order.
! This will be relatively major rework, requiring corresponding
! rework in tmpl_process.pl
! 
! Gettext-style line number references would also be very helpful in
! disambiguating the strings. Ultimately, we should generate and work
! with gettext-style po files, so that translators are able to use
! tools designed for gettext.
! 
! An example of a string untranslatable to Chinese is "Accounts for";
! "Accounts for %s", however, would be translatable. Short words like
! "in" would also be untranslatable, not only to Chinese, but also to
! languages requiring declension of nouns.

  =cut
--- 522,610 ----
  }

+ sub quote_po ($) {
+     my($s) = @_;
+     # Locale::PO->quote is buggy, it doesn't quote newlines :-/
+     $s =~ s/([\\"])/\\\1/gs;
+     $s =~ s/\n/\\n/g;
+     return "\"$s\"";
+ }
+ 
+ # Complication function that shouldn't be here
+ sub parametrize ($@) {
+     my($fmt, @params) = @_;
+     my $it = '';
+     for (my $n = 0; length $fmt;) {
+ 	if ($fmt =~ /^[^%]+/) {
+ 	    $fmt = $';
+ 	    $it .= $&;
+ 	} elsif ($fmt =~ /^%%/) {
+ 	    $fmt = $';
+ 	    $it .= '%';
+ 	} elsif ($fmt =~ /^%(?:(\d+)\$)?s/) {
+ 	    $n += 1;
+ 	    my $i = (defined $1? $1: $n) - 1;
+ 	    $fmt = $';
+ 	    $it .= $params[$i]
+ 	} elsif ($fmt =~ /^%[^%a-zA-Z]*[a-zA-Z]/) {
+ 	    $fmt = $';
+ 	    $it .= $&;
+ 	    die "Unknown or unsupported format specification: $&\n"; #XXX
+ 	} else {
+ 	    die "Completely confused parametrizing: $fmt\n";#XXX
+ 	}
+     }
+     return $it;
+ }
+ 
  ###############################################################################

! =pod
! 
! In addition to the basic scanning, this class will also perform
! the following:

! =over
! 
! =item -
! 
! Emulation of c-format strings (see below)
! 
! =item -
! 
! Display of warnings for certain things that affects either the
! ability of this class to yield correct output, or things that
! are known to cause the original template to cause trouble.
! 
! =item -
! 
! Automatic correction of some of the things warned about
! (e.g., SGML "closed start tag" notation).
! 
! =back
! 
! =head2 c-format strings emulation
! 
! Because English word order is not universal, a simple extraction
! of translatable strings may yield some strings like "Accounts for"
! or ambiguous strings like "in". This makes the resulting strings
! difficult to translate, but does not affect all languages alike.
! For example, Chinese (with a somewhat different word order) would
! be hit harder, but French would be relatively unaffected.
! 
! To overcome this problem, the scanner can be configured to detect
! patterns with <TMPL_VAR> directives (as well as certain HTML tags),
! and try to construct a larger pattern that will appear in the PO
! file as c-format strings with %s placeholders. This additional
! step allows the translator to deal with cases where word order
! is different (replacing %s with %1$s, %2$s, etc.), or when certain
! words will require certain inflectional suffixes in sentences.
! 
! Because this is an incompatible change, this mode must be explicitly
! turned on using the set_cformat(1) method call.
! 
! =head1 HISTORY
! 
! This tokenizer is mostly based
! on Ambrose's hideous Perl script known as subst.pl.

  =cut

Index: text-extract2.pl
===================================================================
RCS file: /cvsroot/koha/koha/misc/translator/text-extract2.pl,v
retrieving revision 1.40
retrieving revision 1.41
diff -C2 -r1.40 -r1.41
*** text-extract2.pl	17 Feb 2004 06:30:38 -0000	1.40
--- text-extract2.pl	19 Feb 2004 21:24:30 -0000	1.41
***************
*** 23,29 ****
--- 23,35 ----
  use vars qw( $debug_dump_only_p );
  use vars qw( $pedantic_p );
+ use vars qw( $allow_cformat_p ); # FOR TESTING PURPOSES ONLY!!

  ###############################################################################

+ sub underline ($) { # for testing only
+     my($s) = @_;
+     join('', map {/[\0-\37]/? $_: "$_\b$_"} split(//, $s));
+ }
+ 
  sub debug_dump ($) { # for testing only
      my($h) = @_;
***************
*** 35,49 ****
  	my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
  	printf "%s [line %d]:\n", $kind->to_string, $s->line_number;
! 	printf "%4dH%s\n", length($t),
! 		join('', map {/[\0-\37]/? $_: "$_\b$_"} split(//, $t));
! 	if ($kind eq TmplTokenType::TAG && %$attr) {
  	    printf "Attributes:\n";
  	    for my $a (keys %$attr) {
  		my($key, $val, $val_orig, $order) = @{$attr->{$a}};
! 		printf "%s = %dH%s -- %s\n", $a, length $val,
! 		join('', map {/[\0-\37]/? $_: "$_\b$_"} split(//, $val)),
  		$val_orig;
  	    }
  	}
      }
  }
--- 41,64 ----
  	my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
  	printf "%s [line %d]:\n", $kind->to_string, $s->line_number;
! 	printf "%4dH%s\n", length($t), underline($t);
! 	if ($kind == TmplTokenType::TAG && %$attr) {
  	    printf "Attributes:\n";
  	    for my $a (keys %$attr) {
  		my($key, $val, $val_orig, $order) = @{$attr->{$a}};
! 		printf "%s = %dH%s -- %s\n", $a, length $val, underline $val,
  		$val_orig;
  	    }
  	}
+ 	if ($kind == TmplTokenType::TEXT_PARAMETRIZED) {
+ 	    printf "Form (c-format string):\n";
+ 	    printf "%dH%s\n", length $s->form, underline $s->form;
+ 	    printf "Parameters:\n";
+ 	    my $i = 1;
+ 	    for my $a ($s->parameters) {
+ 		my $t = $a->string;
+ 		printf "%%%d\$s = %dH%s\n", $i, length $t, underline $t;
+ 		$i += 1;
+ 	    }
+ 	}
      }
  }
***************
*** 58,65 ****
      last unless defined $s;
  	my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
! 	if ($kind eq TmplTokenType::TEXT) {
  	    $t = TmplTokenizer::trim $t;
  	    $text{$t} = 1 if $t =~ /\S/s;
! 	} elsif ($kind eq TmplTokenType::TAG && %$attr) {
  	    # value [tag=input], meta
  	    my $tag = lc($1) if $t =~ /^<(\S+)/s;
--- 73,80 ----
      last unless defined $s;
  	my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
! 	if ($kind == TmplTokenType::TEXT) {
  	    $t = TmplTokenizer::trim $t;
  	    $text{$t} = 1 if $t =~ /\S/s;
! 	} elsif ($kind == TmplTokenType::TAG && %$attr) {
  	    # value [tag=input], meta
  	    my $tag = lc($1) if $t =~ /^<(\S+)/s;
***************
*** 113,116 ****
--- 128,132 ----

  GetOptions(
+     'enable-cformat'	=> \$allow_cformat_p,
      'f|file=s'		=> \$input,
      'debug-dump-only'	=> \$debug_dump_only_p,
***************
*** 126,129 ****
--- 142,146 ----

  my $h = TmplTokenizer->new( $input );
+ $h->set_allow_cformat( 1 ) if $allow_cformat_p;
  if ($debug_dump_only_p) {
      debug_dump( $h );