[Koha-patches] [PATCH] Amending SetUNIMARCencoding Adding SetUTF8Flag
Henri-Damien LAURENT
henridamien.laurent at biblibre.com
Thu Jul 23 18:47:42 CEST 2009
SetUTF8Flag sets the PERL flag for utf8 data : this is really helpful to deal with diacritics.
If you donot use that, you may end up when you append field or insert field to a marc record having all data you previously had double encoded.
See : http://rt.cpan.org/Ticket/Display.html?id=48120
SetUNIMARCencoding was not correct for Authority data 100$a for authorities is shorter than biblio 100$a
---
C4/Charset.pm | 100 ++++++++++++++++++++++++++++++++++++++++++++++++++------
1 files changed, 89 insertions(+), 11 deletions(-)
diff --git a/C4/Charset.pm b/C4/Charset.pm
index fb62e27..20ebf7c 100644
--- a/C4/Charset.pm
+++ b/C4/Charset.pm
@@ -1,6 +1,7 @@
package C4::Charset;
# Copyright (C) 2008 LibLime
+# Copyright (C) 2009 BibLibre
#
# This file is part of Koha.
#
@@ -22,6 +23,8 @@ use warnings;
use MARC::Charset qw/marc8_to_utf8/;
use Text::Iconv;
+use Unicode::Normalize;
+use C4::Debug;
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
@@ -32,8 +35,10 @@ BEGIN {
@ISA = qw(Exporter);
@EXPORT = qw(
IsStringUTF8ish
+ normalize_string
MarcToUTF8Record
SetMarcUnicodeFlag
+ SetUTF8Flag
StripNonXmlChars
);
}
@@ -110,6 +115,28 @@ sub IsStringUTF8ish {
return utf8::decode($str);
}
+=head2 normalize_string
+ Given
+ a string
+ a code 0 or undef => NFC 1 => NFD
+ Returns a utf8 normalized string
+
+ Sample code :
+ my $string=normalize_string ("Pétaudière");
+=cut
+
+sub normalize_string{
+ my ($string,$code)=@_;
+ utf8::decode($string) unless (utf8::is_utf8($string));
+ if ($code){
+ $string=NFC($string);
+ }
+ else {
+ $string=NFD($string);
+ }
+ return $string;
+}
+
=head2 MarcToUTF8Record
=over 4
@@ -215,7 +242,7 @@ sub MarcToUTF8Record {
@errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
} else {
# assume any other character encoding is for Text::Iconv
- @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, 'iso-8859-1');
+ @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
}
if (@errors) {
@@ -247,6 +274,8 @@ any actual character conversion.
sub SetMarcUnicodeFlag {
my $marc_record = shift;
my $marc_flavour = shift; # || C4::Context->preference("marcflavour");
+ my $record_type = shift ||"biblio"; # || C4::Context->preference("marcflavour");
+ my $cataloging_language = shift ||"fre"; # || C4::Context->preference("marcflavour");
$marc_record->encoding('UTF-8');
if ($marc_flavour eq 'MARC21') {
@@ -254,21 +283,70 @@ sub SetMarcUnicodeFlag {
substr($leader, 9, 1) = 'a';
$marc_record->leader($leader);
} elsif ($marc_flavour=~/UNIMARC/) {
- if (my $field = $marc_record->field('100')) {
- my $sfa = $field->subfield('a');
- my $subflength = 36;
- # fix the length of the field
- $sfa = substr $sfa, 0, $subflength if (length($sfa) > $subflength);
- $sfa = sprintf( "%-*s", 35, $sfa ) if (length($sfa) < $subflength);
-
- substr($sfa, 26, 4) = '50 ';
- $field->update('a' => $sfa);
- }
+ SetUNIMARC_utf8_encoding($marc_record,$record_type,$cataloging_language);
} else {
warn "Unrecognized marcflavour: $marc_flavour";
}
}
+sub SetUNIMARC_utf8_encoding{
+ my ($record,$recordtype,$lang)=@_;
+ $recordtype||="biblio";
+ $lang||="fre";
+ my $string;
+ my $subflength=($recordtype=~/biblio/?36:21);
+ my $encodingposition=($recordtype=~/biblio/?22:8);
+ $string=$record->subfield( 100, "a" );
+ if (length($string)>=$subflength) {
+ $string = substr $string, 0,$subflength if (length($string)>$subflength);
+ }
+ else {
+ $string = POSIX::strftime( "%Y%m%d", localtime );
+ $string =~ s/\-//g;
+ $string = sprintf( "%-*s", $subflength-1, $string );
+ }
+ substr( $string, $encodingposition, 8, $lang."y50 " );
+ if ( $record->subfield( 100, "a" ) ) {
+ $record->field('100')->update(a=>$string);
+ }
+ else {
+ $record->insert_grouped_field(
+ MARC::Field->new( 100, '', '', "a" => $string ) );
+ }
+ $debug && warn "encoding: ", substr( $record->subfield(100, 'a'), $encodingposition, 8 );
+}
+
+=over 4
+
+SetUTF8Flag($marc_record);
+
+=back
+
+Set both the PERL internal UTF8 Flag on all subfields of record
+So that data woud not be double encoded by PERL
+
+=cut
+
+sub SetUTF8Flag{
+ my ($record)=@_;
+ return unless ($record && $record->fields());
+ foreach my $field ($record->fields()){
+ if ($field->tag()>=10){
+ my @subfields;
+ foreach my $subfield ($field->subfields()){
+ push @subfields,($$subfield[0],normalize_string($$subfield[1]));
+ }
+ my $newfield=MARC::Field->new(
+ $field->tag(),
+ $field->indicator(1),
+ $field->indicator(2),
+ @subfields
+ );
+ $field->replace_with($newfield);
+ }
+ }
+}
+
=head2 StripNonXmlChars
=over 4
--
1.6.0.4
More information about the Koha-patches
mailing list