[Koha-patches] [PATCH] (bug #4020) XSLT unimarc display

Henri-Damien LAURENT henridamien.laurent at biblibre.com
Thu Dec 31 10:19:58 CET 2009


From: Paul Poulain <paul.poulain at biblibre.com>

When using the XSLT display with UNIMARC, as_xml fails on some subfields
when the data is true UTF-8, because marcflavour is not taken into account
when encoding the data.

Moreover, because transformMARCXML4XSLT edits some values in the MARC record
and MARC::File::USMARC does not set the Perl UTF8 flag, the data ends up
double-encoded.
This patch fixes both issues.

This patch adds
    - two functions in  C4/Charset.pm
        NormalizeString (uses Unicode::Normalize)
        SetUTF8Flag (This function in my opinion belongs to MARC::Record, or at least MARC::File::USMARC)
    - edits C4::XSLT in order to cope with the correct marcflavour
    - edits C4::Search searchResults to use SetUTF8Flag
---
 C4/Charset.pm |   82 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 C4/Search.pm  |    5 +++
 C4/XSLT.pm    |    5 ++-
 3 files changed, 90 insertions(+), 2 deletions(-)

diff --git a/C4/Charset.pm b/C4/Charset.pm
index e1b6c96..e39637a 100644
--- a/C4/Charset.pm
+++ b/C4/Charset.pm
@@ -23,6 +23,7 @@ use warnings;
 use MARC::Charset qw/marc8_to_utf8/;
 use Text::Iconv;
 use C4::Debug;
+use Unicode::Normalize;
 
 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
 
@@ -34,6 +35,7 @@ BEGIN {
     @EXPORT = qw(
         IsStringUTF8ish
         MarcToUTF8Record
+        SetUTF8Flag
         SetMarcUnicodeFlag
         StripNonXmlChars
     );
@@ -111,6 +113,86 @@ sub IsStringUTF8ish {
     return utf8::decode($str);
 }
 
+=head2 SetUTF8Flag
+
+=over 4
+
+my $marc_record = SetUTF8Flag($marc_record);
+
+=back
+
+Passes the value of every subfield of the data fields (tag 010 and
+above) of a MARC::Record through NormalizeString, which sets the Perl
+UTF8 flag on it. The record is modified in place and returned.
+Returns nothing if the record is missing or has no fields.
+
+It is required after new_from_usmarc, since MARC::File::USMARC does
+not set the Perl UTF8 flag: editing fields or subfields of a unicode
+record would otherwise end up in double encoding.
+
+FIXME
+In my opinion, this function belongs to MARC::Record and not to this
+package, but since it handles charset and MARC::Record it lives here.
+
+=cut
+
+sub SetUTF8Flag {
+    my ($record) = @_;
+    return unless ( $record && $record->fields() );
+    foreach my $field ( $record->fields() ) {
+        next if $field->tag() < 10;    # control fields (00X) carry no subfields
+        # Rebuild the subfield list as (code, value) pairs with normalized values.
+        my @subfields = map { ( $_->[0], NormalizeString( $_->[1] ) ) } $field->subfields();
+        # MARC::Field->new croaks when given no subfield: leave such fields alone.
+        next unless @subfields;
+        my $newfield = MARC::Field->new(
+            $field->tag(), $field->indicator(1), $field->indicator(2),
+            @subfields
+        );
+        $field->replace_with($newfield);
+    }
+    return $record;
+}
+
+=head2 NormalizeString
+
+=over 4
+
+    my $normalized_string = NormalizeString($string, $nfd, $transform);
+
+=back
+
+Sets the Perl UTF8 flag on $string if need be, normalizes it and returns it.
+If $nfd is true the string is normalized to NFD, otherwise to NFC.
+If $transform is true, punctuation signs are replaced by spaces, isolated
+single characters (such as the "l" left over from "l'") are removed and
+trailing whitespace is stripped.
+
+Sample code :
+
+    my $string = NormalizeString("l'ornithoptère", 0, 1);
+    # results in " ornithoptère" (NFC, UTF8 flag set, leading space kept)
+
+=cut
+
+sub NormalizeString {
+    my ( $string, $nfd, $transform ) = @_;
+    # Byte strings are decoded in place, which also sets the UTF8 flag.
+    utf8::decode($string) unless utf8::is_utf8($string);
+    # The second argument was formerly named "nfc" although a true value
+    # has always selected NFD: the name now matches the behaviour.
+    $string = $nfd ? NFD($string) : NFC($string);
+    if ($transform) {
+        # Punctuation signs become spaces.
+        $string =~ s/\<|\>|\^|\;|\.|\?|,|\-|\(|\)|\[|\]|\{|\}|\$|\%|\!|\*|\:|\\|\/|\&|\"|\'/ /g;
+        # Remove one-letter words: "d'" and "l'" were changed into "d " and "l ".
+        $string =~ s/\b\S\b//g;
+        # Strip the trailing whitespace this may leave behind.
+        $string =~ s/\s+$//g;
+    }
+    return $string;
+}
+
 =head2 MarcToUTF8Record
 
 =over 4
diff --git a/C4/Search.pm b/C4/Search.pm
index 5605ede..cf1309e 100644
--- a/C4/Search.pm
+++ b/C4/Search.pm
@@ -448,6 +448,7 @@ sub getRecords {
                     # not an index scan
                     else {
                         $record = $results[ $i - 1 ]->record($j)->raw();
+            		warn $results[$i-1]->record($j)->render() ;
 
                         # warn "RECORD $j:".$record;
                         $results_hash->{'RECORDS'}[$j] = $record;
@@ -1648,6 +1649,10 @@ sub searchResults {
         }
 
         # XSLT processing of some stuff
+	my $debug=1;
+	use C4::Charset;
+	SetUTF8Flag($marcrecord);
+	$debug && warn $marcrecord->as_formatted;
         if (C4::Context->preference("XSLTResultsDisplay") && !$scan) {
             $oldbiblio->{XSLTResultsRecord} = XSLTParse4Display(
                 $oldbiblio->{biblionumber}, $marcrecord, 'Results' );
diff --git a/C4/XSLT.pm b/C4/XSLT.pm
index 8a67e04..1c7184a 100644
--- a/C4/XSLT.pm
+++ b/C4/XSLT.pm
@@ -124,8 +124,9 @@ sub XSLTParse4Display {
     my $record = transformMARCXML4XSLT($biblionumber, $orig_record);
     #return $record->as_formatted();
     my $itemsxml  = buildKohaItemsNamespace($biblionumber);
-    my $xmlrecord = $record->as_xml();
+    my $xmlrecord = $record->as_xml(C4::Context->preference('marcflavour'));
     my $sysxml = "<sysprefs>\n";
+    warn $xmlrecord;
     foreach my $syspref ( qw/OPACURLOpenInNewWindow DisplayOPACiconsXSLT URLLinkText/ ) {
         $sysxml .= "<syspref name=\"$syspref\">" .
                    C4::Context->preference( $syspref ) .
@@ -137,7 +138,7 @@ sub XSLTParse4Display {
 
     my $parser = XML::LibXML->new();
     # don't die when you find &, >, etc
-    $parser->recover_silently(1);
+    $parser->recover_silently(0);
     my $source = $parser->parse_string($xmlrecord);
     unless ( $stylesheet ) {
         my $xslt = XML::LibXSLT->new();
-- 
1.6.3.3




More information about the Koha-patches mailing list