[Koha-patches] [PATCH] Amending SetUNIMARCencoding Adding SetUTF8Flag

Thu Jul 23 18:47:42 CEST 2009

SetUTF8Flag sets the PERL flag for utf8 data : this is really helpful to deal with diacritics.
If you donot use that, you may end up when you append field or insert field to a marc record having all data you previously had double encoded.
See :     http://rt.cpan.org/Ticket/Display.html?id=48120

SetUNIMARCencoding was not correct for Authority data 100$a for authorities is shorter than biblio 100$a
---
 C4/Charset.pm |  100 ++++++++++++++++++++++++++++++++++++++++++++++++++------
 1 files changed, 89 insertions(+), 11 deletions(-)

diff --git a/C4/Charset.pm b/C4/Charset.pm
index fb62e27..20ebf7c 100644
--- a/C4/Charset.pm
+++ b/C4/Charset.pm
@@ -1,6 +1,7 @@
 package C4::Charset;
 
 # Copyright (C) 2008 LibLime
+# Copyright (C) 2009 BibLibre
 #
 # This file is part of Koha.
 #
@@ -22,6 +23,8 @@ use warnings;
 
 use MARC::Charset qw/marc8_to_utf8/;
 use Text::Iconv;
+use Unicode::Normalize;
+use C4::Debug;
 
 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
 
@@ -32,8 +35,10 @@ BEGIN {
     @ISA    = qw(Exporter);
     @EXPORT = qw(
         IsStringUTF8ish
+		normalize_string
         MarcToUTF8Record
         SetMarcUnicodeFlag
+		SetUTF8Flag
         StripNonXmlChars
     );
 }
@@ -110,6 +115,28 @@ sub IsStringUTF8ish {
     return utf8::decode($str);
 }
 
+=head2 normalize_string
+	Given 
+	    a string
+	    a code 0 or undef => NFC 1 => NFD
+	Returns a utf8 normalized string
+	
+	Sample code :
+	my $string=normalize_string ("PÃ©taudiÃ¨re");
+=cut
+
+sub normalize_string{
+	my ($string,$code)=@_;
+	utf8::decode($string) unless (utf8::is_utf8($string));
+	if ($code){
+		$string=NFC($string);
+	}
+	else {
+		$string=NFD($string);
+	}
+    return $string; 
+}
+
 =head2 MarcToUTF8Record
 
 =over 4
@@ -215,7 +242,7 @@ sub MarcToUTF8Record {
             @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
         } else {
             # assume any other character encoding is for Text::Iconv
-            @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, 'iso-8859-1');
+            @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
         }
 
         if (@errors) {
@@ -247,6 +274,8 @@ any actual character conversion.
 sub SetMarcUnicodeFlag {
     my $marc_record = shift;
     my $marc_flavour = shift; # || C4::Context->preference("marcflavour");
+    my $record_type = shift ||"biblio"; # || C4::Context->preference("marcflavour");
+    my $cataloging_language = shift ||"fre"; # || C4::Context->preference("marcflavour");
 
     $marc_record->encoding('UTF-8');
     if ($marc_flavour eq 'MARC21') {
@@ -254,21 +283,70 @@ sub SetMarcUnicodeFlag {
         substr($leader, 9, 1) = 'a';
         $marc_record->leader($leader); 
     } elsif ($marc_flavour=~/UNIMARC/) {
-        if (my $field = $marc_record->field('100')) {
-            my $sfa = $field->subfield('a');
-            my $subflength = 36;
-            # fix the length of the field
-            $sfa = substr $sfa, 0, $subflength if (length($sfa) > $subflength);
-            $sfa = sprintf( "%-*s", 35, $sfa ) if (length($sfa) < $subflength);
-            
-            substr($sfa, 26, 4) = '50  ';
-            $field->update('a' => $sfa);
-        }
+		SetUNIMARC_utf8_encoding($marc_record,$record_type,$cataloging_language);
     } else {
         warn "Unrecognized marcflavour: $marc_flavour";
     }
 }
 
+sub SetUNIMARC_utf8_encoding{
+	my ($record,$recordtype,$lang)=@_;
+	$recordtype||="biblio";
+	$lang||="fre";
+    my $string; 
+	my $subflength=($recordtype=~/biblio/?36:21);
+	my $encodingposition=($recordtype=~/biblio/?22:8);
+	$string=$record->subfield( 100, "a" );
+    if (length($string)>=$subflength) { 
+		$string = substr $string, 0,$subflength if (length($string)>$subflength);
+	}
+    else { 
+        $string = POSIX::strftime( "%Y%m%d", localtime ); 
+        $string =~ s/\-//g; 
+        $string = sprintf( "%-*s", $subflength-1, $string ); 
+    } 
+    substr( $string, $encodingposition, 8, $lang."y50  " ); 
+    if ( $record->subfield( 100, "a" ) ) { 
+		$record->field('100')->update(a=>$string);
+	}
+	else {
+        $record->insert_grouped_field( 
+        MARC::Field->new( 100, '', '', "a" => $string ) ); 
+    }
+	$debug && warn "encoding: ", substr( $record->subfield(100, 'a'), $encodingposition, 8 );
+}
+
+=over 4
+
+SetUTF8Flag($marc_record);
+
+=back
+
+Set both the PERL internal UTF8 Flag on all subfields of record
+So that data woud not be double encoded by PERL
+
+=cut
+
+sub SetUTF8Flag{
+	my ($record)=@_;
+	return unless ($record && $record->fields());
+	foreach my $field ($record->fields()){
+		if ($field->tag()>=10){
+			my @subfields;
+			foreach my $subfield ($field->subfields()){
+				push @subfields,($$subfield[0],normalize_string($$subfield[1]));
+			}
+			my $newfield=MARC::Field->new(
+							$field->tag(),
+							$field->indicator(1),
+							$field->indicator(2),
+							@subfields
+						);
+			$field->replace_with($newfield);
+		}
+	}
+}
+
 =head2 StripNonXmlChars
 
 =over 4
-- 
1.6.0.4