[Koha-cvs] koha/misc/migration_tools bulkmarcimport.pl [rel_2_2]

Wed Apr 26 09:36:12 CEST 2006

CVSROOT:	/sources/koha
Module name:	koha
Branch: 	rel_2_2
Changes by:	Thomas D <thd at savannah.gnu.org>	06/04/26 07:36:12

Modified files:
	misc/migration_tools: bulkmarcimport.pl 

Log message:
	MARC8 to UTF-8 support added without XML problems.  Rationalised the variable
	name used for designating MARC flavour to avoid confusion with character set
	encoding.

CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/koha/koha/misc/migration_tools/bulkmarcimport.pl.diff?only_with_tag=rel_2_2&tr1=1.1.2.2&tr2=1.1.2.3&r1=text&r2=text

Patches:
Index: koha/misc/migration_tools/bulkmarcimport.pl
diff -u koha/misc/migration_tools/bulkmarcimport.pl:1.1.2.2 koha/misc/migration_tools/bulkmarcimport.pl:1.1.2.3

--- koha/misc/migration_tools/bulkmarcimport.pl:1.1.2.2	Fri Mar 10 04:11:23 2006
+++ koha/misc/migration_tools/bulkmarcimport.pl	Wed Apr 26 07:36:12 2006
@@ -2,28 +2,139 @@
 # small script that import an iso2709 file into koha 2.0
 
 use strict;
+use warnings;
 
 # Koha modules used
 use MARC::File::USMARC;
+# Uncomment the line below and use MARC::File::XML again when it works better.
+# -- thd
+# use MARC::File::XML;
 use MARC::Record;
 use MARC::Batch;
+use MARC::Charset;
 use C4::Context;
 use C4::Biblio;
 use Time::HiRes qw(gettimeofday);
-
 use Getopt::Long;
+binmode(STDOUT, ":utf8");
+
 my ( $input_marc_file, $number) = ('',0);
-my ($version, $delete, $test_parameter,$char_encoding, $verbose);
+my ($version, $delete, $test_parameter,$marcFlavour, $verbose);
+
 GetOptions(
-    'file:s'    => \$input_marc_file,
-    'n' => \$number,
-    'h' => \$version,
-    'd' => \$delete,
-    't' => \$test_parameter,
-    'c:s' => \$char_encoding,
-    'v:s' => \$verbose,
+	'file:s'    => \$input_marc_file,
+	'n' => \$number,
+	'h' => \$version,
+	'd' => \$delete,
+	't' => \$test_parameter,
+	'c:s' => \$marcFlavour,
+	'v:s' => \$verbose,
 );
 
+# FIXME:  Management of error conditions needed for record parsing problems
+# and MARC8 character sets with mappings to Unicode not yet included in 
+# MARC::Charset.  The real world rarity of these problems is not fully tested.
+# Unmapped character sets will throw a warning currently and processing will 
+# continue with the error condition.  A fairly trivial correction should 
+# address some record parsing and unmapped character set problems but I need 
+# time to implement a test and correction for undef subfields and revert to 
+# MARC8 if mappings are missing. -- thd
+sub fMARC8ToUTF8($$) {
+	my ($record) = shift;
+	my ($verbose) = shift;
+	if ($verbose) {
+		if ($verbose >= 2) {
+			my $leader = $record->leader();
+			$leader =~ s/ /#/g;
+			print "\n000 " . $leader;
+		}
+	}
+	foreach my $field ($record->fields()) {
+		if ($field->is_control_field()) {
+			if ($verbose) {
+				if ($verbose >= 2) {
+					my $fieldName = $field->tag();
+					my $fieldValue = $field->data();
+					$fieldValue =~ s/ /#/g;
+					print "\n" . $fieldName;
+					print ' ' . $fieldValue;
+				}
+			}
+		} else {
+			my @subfieldsArray;
+			my $fieldName = $field->tag();
+			my $indicator1Value = $field->indicator(1);
+			my $indicator2Value = $field->indicator(2);
+			if ($verbose) {
+				if ($verbose >= 2) {
+					$indicator1Value =~ s/ /#/;
+					$indicator2Value =~ s/ /#/;
+					print "\n" . $fieldName . ' ' . 
+							$indicator1Value . 
+					$indicator2Value;
+				}
+			}
+			foreach my $subfield ($field->subfields()) {
+				my $subfieldName = $subfield->[0];
+				my $subfieldValue = $subfield->[1];
+				$subfieldValue = MARC::Charset::marc8_to_utf8($subfieldValue);
+				
+				# Alas, MARC::Field::update() does not work correctly.
+				## push (@subfieldsArray, $subfieldName, $subfieldValue);
+				
+				push @subfieldsArray, [$subfieldName, $subfieldValue];
+				if ($verbose) {
+					if ($verbose >= 2) {
+						print " \$" . $subfieldName . ' ' . $subfieldValue;
+					}
+				}
+			}
+			
+			# Alas, MARC::Field::update() does not work correctly.
+			# 
+			# The first instance in the field of a of a repeated subfield 
+			# overwrites the content from later instances with the content 
+			# from the first instance.
+			## $field->update(@subfieldsArray);
+			
+			foreach my $subfieldRow(@subfieldsArray) {
+				my $subfieldName = $subfieldRow->[0];
+				$field->delete_subfields($subfieldName);
+			}
+			foreach my $subfieldRow(@subfieldsArray) {
+				$field->add_subfields(@$subfieldRow);
+			}
+			
+			if ($verbose) {
+				if ($verbose >= 2) {
+					# Reading the indicator values again is not necessary.  
+					# They were not converted.
+					# $indicator1Value = $field->indicator(1);
+					# $indicator2Value = $field->indicator(2);
+					# $indicator1Value =~ s/ /#/;
+					# $indicator2Value =~ s/ /#/;
+					print "\nCONVERTED TO UTF-8:\n" . $fieldName . ' ' . 
+							$indicator1Value . 
+					$indicator2Value;
+					foreach my $subfield ($field->subfields()) {
+						my $subfieldName = $subfield->[0];
+						my $subfieldValue = $subfield->[1];
+						print " \$" . $subfieldName . ' ' . $subfieldValue;
+					}
+				}
+			}
+			if ($verbose) {
+				if ($verbose >= 2) {
+					print "\n" if $verbose;
+				}
+			}
+		}
+	}
+	$record->encoding('UTF-8');
+	return $record;
+}
+
+
 if ($version || ($input_marc_file eq '')) {
 	print <<EOF
 small script to import an iso2709 file into Koha.
@@ -33,8 +144,9 @@
 \tv : verbose mode. 1 means "some infos", 2 means "MARC dumping"
 \tn : the number of the record to import. If missing, all the file is imported
 \tt : test mode : parses the file, saying what he would do, but doing nothing.
-\tc : the char encoding. At the moment, only MARC21 and UNIMARC supported. MARC21 by default.
-\d : delete EVERYTHING related to biblio in koha-DB before import  :tables :
+\tc : the characteristic MARC flavour. At the moment, only MARC21 and UNIMARC 
+\tsupported. MARC21 by default.
+\td : delete EVERYTHING related to biblio in koha-DB before import  :tables :
 \t\tbiblio, \t\tbiblioitems, \t\tsubjects,\titems
 \t\tadditionalauthors, \tbibliosubtitles, \tmarc_biblio,
 \t\tmarc_subfield_table, \tmarc_word, \t\tmarc_blob_subfield
@@ -66,8 +178,8 @@
 	print "TESTING MODE ONLY\n    DOING NOTHING\n===============\n";
 }
 
-$char_encoding = 'MARC21' unless ($char_encoding);
-print "CHAR : $char_encoding\n" if $verbose;
+$marcFlavour = 'MARC21' unless ($marcFlavour);
+print "Characteristic MARC flavour: $marcFlavour\n" if $verbose;
 my $starttime = gettimeofday;
 my $batch = MARC::Batch->new( 'USMARC', $input_marc_file );
 $batch->warnings_off();
@@ -77,23 +189,64 @@
 my ($tagfield,$tagsubfield) = &MARCfind_marc_from_kohafield($dbh,"items.itemnumber",'');
 # $dbh->do("lock tables biblio write, biblioitems write, items write, marc_biblio write, marc_subfield_table write, marc_blob_subfield write, marc_word write, marc_subfield_structure write, stopwords write");
 while ( my $record = $batch->next() ) {
-    #FIXME: it's kind of silly to go from MARC::Record to MARC::File::XML and then back again just to fix the encoding
-    my $uxml = $record->as_xml;
-    $record = MARC::Record::new_from_xml($uxml, 'UTF-8');
 	$i++;
+#FIXME: it's kind of silly to go from MARC::Record to MARC::File::XML and 
+	# then back again just to fix the encoding
+	#
+	# It is even sillier when the conversion too frequently produces errors 
+	# instead of fixing the encoding.  Hence, the following MARC::File::XML 
+	# lines are now commented out until character set conversion in XML 
+	# works better. -- thd
+	## my $uxml = $record->as_xml;
+	## $record = MARC::Record::new_from_xml($uxml, 'UTF-8');
+	
+	# Check record encoding and convert encoding if necessary.
+	
+	if ($marcFlavour eq 'MARC21') {
+		my $tag000_pos09;
+		if ($record->encoding() eq 'UTF-8') {
+			if ($verbose) {
+				print "\nRecord $i encoding is UTF-8\n";
+				$tag000_pos09 = substr ($record->leader, 9, 1);
+				$tag000_pos09 =~ s/ /#/;
+				print "\nUTF-8 LEADER/09: " . $tag000_pos09 ."\n";
+			}
+		} elsif ($record->encoding() eq 'MARC-8') {
+			print "\nConverting record $i encoding from MARC8 to UTF-8\n";
+			# Convert MARC-8 to UTF-8
+			$record = fMARC8ToUTF8($record, $verbose);
+			if ($verbose) {
+				print "\nRecord $i encoding has been converted to UTF-8\n";
+				$tag000_pos09 = substr ($record->leader, 9, 1);
+				$tag000_pos09 =~ s/ /#/;
+				print "\nUTF-8 LEADER/09: " . $tag000_pos09 ."\n";
+			}
+		}
+	} elsif ($marcFlavour eq 'UNIMARC') {
+		# I have not developed a UNIMARC character encoding conversion script 
+		# yet.  Common encodings should be easy.  Less comon and multiple 
+		# encodings will need extra work.  I am happy to work on this if there 
+		# is some interest. -- thd
+	}
+	
 	#now, parse the record, extract the item fields, and store them in somewhere else.
 
-    ## create an empty record object to populate
-    my $newRecord = MARC::Record->new();
+	## create an empty record object to populate
+	my $newRecord = MARC::Record->new();
 	$newRecord->leader($record->leader());
 
-    # go through each field in the existing record
-    foreach my $oldField ( $record->fields() ) {
+	# go through each field in the existing record
+	foreach my $oldField ( $record->fields() ) {
 
 	# just reproduce tags < 010 in our new record
-	if ( $oldField->tag() < 10 ) {
-	    $newRecord->append_fields( $oldField );
-	    next();
+	# 
+	# Fields are not necessarily only numeric in the actual world of records 
+	# nor in what I would recommend for additonal safe non-interfering local
+	# use fields.  The following regular expression match is much safer than 
+	# a numeric evaluation. -- thd
+	if ( $oldField->tag() =~ m/^00/ ) {
+		$newRecord->append_fields( $oldField );
+		next();
 	}
 
 	# store our new subfield data in this list
@@ -103,23 +256,25 @@
 	foreach my $pair ( $oldField->subfields() ) { 
 		$pair->[1] =~ s/\<//g;
 		$pair->[1] =~ s/\>//g;
-		push( @newSubfields, $pair->[0], char_decode($pair->[1],$char_encoding) );
+		push( @newSubfields, $pair->[0], char_decode($pair->[1],$marcFlavour) );
 	}
 
 	# add the new field to our new record
 	my $newField = MARC::Field->new(
-	    $oldField->tag(),
-	    $oldField->indicator(1),
-	    $oldField->indicator(2),
-	    @newSubfields
+		$oldField->tag(),
+		$oldField->indicator(1),
+		$oldField->indicator(2),
+		@newSubfields
 	);
 
 	$newRecord->append_fields( $newField );
 
-    }
+	}
 
 
-	warn "$i ==>".$newRecord->as_formatted() if $verbose eq 2;
+	if ($verbose) {
+#		warn "$i ==>".$newRecord->as_formatted() if $verbose eq 2;
+	}
 	my @fields = $newRecord->field($tagfield);
 	my @items;
 	my $nbitems=0;