[Koha-patches] [PATCH] Bug 5635 changes for bulkmarcimport

Chris Cormack chrisc at catalyst.net.nz
Wed Mar 30 22:51:27 CEST 2011


Squashed commit of the following:

commit e335b5a9c9deedd81ad2a1ef00ea81d1c4d1b692
Author: Paul Poulain <paul.poulain at biblibre.com>
Date:   Wed Jan 19 21:56:02 2011 +0100

    Bug 5635 : merge conflict solved bulkmarcimport

commit e9de4d73b009928139dfc9a68ca6144437f14d0e
Author: Henri-Damien LAURENT <henridamien.laurent at biblibre.com>
Date:   Wed Jan 19 21:56:01 2011 +0100

    Bug 5635 : bulkmarcimport.pl Minor fixes

    Testing existence of isbn before using it
    Testing existence of data before printing in printlog
    Adding debugs

commit 445954ffb00b3d75990299eee6b21aa6a3358d7e
Author: Paul Poulain <paul.poulain at biblibre.com>
Date:   Wed Jan 19 21:56:00 2011 +0100

    Bug 5635 : Fixing and improving YAML for authorities

    There was one case where the heading was not exactly right

commit ae2b0abd962205ecc85ec9cf625bbdd1af65c50f
Author: Henri-Damien LAURENT <henridamien.laurent at biblibre.com>
Date:   Wed Jan 19 21:55:59 2011 +0100

    Bug 5635 : bulkmarcimport log correction

    The "insert" op was logged both for biblio inserts and for item inserts;
    items are now logged with the "insertitem" op instead

commit cc718b71a30ce5cd54db2bd36a99f1d947820409
Author: Henri-Damien LAURENT <henridamien.laurent at biblibre.com>
Date:   Wed Jan 19 21:55:58 2011 +0100

    Bug 5635 : Enhancement bulkmarcimport for Agates

    Adding test parameter
    Adding insert to allow INSERT only
    Adding update parameter to allow biblios to be only updated

    Adding filter on tags/subfields
    Examples:
        --filter 995    removes all 995 fields
        --filter 98.    removes all 98x fields
        --filter 981aSmad removes all 981$a with value Smad

commit 9fa051837242d7d2411d3fcdcf207235ad1946d0
Author: Henri-Damien LAURENT <henridamien.laurent at biblibre.com>
Date:   Wed Jan 19 21:55:57 2011 +0100

    Bug 5635 : (MT 2256)Followup bulkmarcimport enhancement

    yaml file as output in order to be able to create a mapping

commit 6d9af0e85da17e94563f5e388cc0e28d4e4402a9
Author: Henri-Damien LAURENT <henridamien.laurent at biblibre.com>
Date:   Wed Jan 19 21:55:56 2011 +0100

    Bug 5635 : bulkmarcimport : Adding checks on last update date

    For UNIMARC authorities at import, import only those whose edition date is more recent than the data in database

    Adding management for multiple heading fields
    Especially useful to manage multi-script headings

    UTF8 management and encoding for all input/output

commit e7be7631ba4359f755866d6eea94105d3863fe21
Author: Henri-Damien LAURENT <henridamien.laurent at biblibre.com>
Date:   Wed Jan 19 21:55:55 2011 +0100

    Bug 5635 : Take care of encoding in bulkmarc import
---
 misc/migration_tools/bulkmarcimport.pl |  206 ++++++++++++++++++++++++++------
 1 files changed, 169 insertions(+), 37 deletions(-)

diff --git a/misc/migration_tools/bulkmarcimport.pl b/misc/migration_tools/bulkmarcimport.pl
index 3bb9ed8..e783d17 100755
--- a/misc/migration_tools/bulkmarcimport.pl
+++ b/misc/migration_tools/bulkmarcimport.pl
@@ -24,16 +24,20 @@ use C4::Koha;
 use C4::Debug;
 use C4::Charset;
 use C4::Items;
+use YAML;
 use Unicode::Normalize;
 use Time::HiRes qw(gettimeofday);
 use Getopt::Long;
 use IO::File;
 use Pod::Usage;
 
+use open qw( :std :utf8 );
 binmode(STDOUT, ":utf8");
+
 my ( $input_marc_file, $number, $offset) = ('',0,0);
-my ($version, $delete, $test_parameter, $skip_marc8_conversion, $char_encoding, $verbose, $commit, $fk_off,$format,$biblios,$authorities,$keepids,$match, $isbn_check, $logfile);
+my ($version, $delete, $skip_marc8_conversion, $char_encoding, $verbose, $commit, $fk_off,$format,$biblios,$authorities,$keepids,$match, $isbn_check, $logfile,$yamlfile);
 my ($sourcetag,$sourcesubfield,$idmapfl);
+my ($insert,$filters,$update,$all,$test_parameter);
 
 $|=1;
 
@@ -44,7 +48,7 @@ GetOptions(
     'o|offset:f' => \$offset,
     'h' => \$version,
     'd' => \$delete,
-    't' => \$test_parameter,
+    't|test' => \$test_parameter,
     's' => \$skip_marc8_conversion,
     'c:s' => \$char_encoding,
     'v:s' => \$verbose,
@@ -54,17 +58,73 @@ GetOptions(
     'k|keepids:s' => \$keepids,
     'b|biblios' => \$biblios,
     'a|authorities' => \$authorities,
+    'filter=s@' => \$filters,
+    'insert' => \$insert,
+    'update' => \$update,
+    'all' => \$all,
     'match=s@'    => \$match,
     'i|isbn' => \$isbn_check,
     'x:s' => \$sourcetag,
     'y:s' => \$sourcesubfield,
     'idmap:s' => \$idmapfl,
+    'yaml:s' => \$yamlfile,
 );
-$biblios=!$authorities||$biblios;
+$biblios||= !$authorities;
+$insert ||= !$update;
+if ($all){
+    $insert=1;
+    $update=1;
+}
 
 if ($version || ($input_marc_file eq '')) {
-    pod2usage( -verbose => 2 );
-    exit;
+    print <<EOF ;
+Small script to import bibliographic records into Koha.
+
+Parameters:
+  h      this version/help screen
+  file   /path/to/file/to/dump: the file to import
+  v      verbose mode. 1 means "some infos", 2 means "MARC dumping"
+  fk     Turn off foreign key checks during import.
+  n      the number of records to import. If missing, all the file is imported
+  o      file offset before importing, ie number of records to skip.
+  commit the number of records to wait before performing a 'commit' operation
+  l file logs actions done for each record and their status into file
+  t      test mode: parses the file, saying what he would do, but doing nothing.
+  s      skip automatic conversion of MARC-8 to UTF-8.  This option is 
+         provided for debugging.
+  c      the characteristic MARC flavour. At the moment, only MARC21 and 
+         UNIMARC are supported. MARC21 by default.
+  d      delete EVERYTHING related to biblio in koha-DB before import. Tables:
+         biblio, biblioitems, titems
+  m      format, MARCXML or ISO2709 (defaults to ISO2709)
+  yaml file  format a yaml file with ids
+  keepids field store ids in field (usefull for authorities, where 001 contains the authid for Koha, that can contain a very valuable info for authorities coming from LOC or BNF. useless for biblios probably)
+  x      source bib tag for reporting the source bib number
+  y      source subfield for reporting the source bib number
+  idmap  file for the koha bib and source id
+  keepids store ids in 009 (usefull for authorities, where 001 contains the authid for Koha, that can contain a very valuable info for authorities coming from LOC or BNF. useless for biblios probably)
+  b|biblios type of import : bibliographic records
+  a|authorities type of import : authority records
+  match  matchindex,fieldtomatch matchpoint to use to deduplicate
+          fieldtomatch can be either 001 to 999 
+                       or field and list of subfields as such 100abcde
+  test   if set, test mode only, donot add anything in database
+  insert if set, only insert when possible
+  update if set, only updates (any biblio should have a matching record)
+  all    if set, do whatever is required
+  i|isbn if set, a search will be done on isbn, and, if the same isbn is found, the biblio is not added. It's another
+         method to deduplicate. 
+         match & i can be both set.
+IMPORTANT: don't use this script before you've entered and checked your MARC 
+           parameters tables twice (or more!). Otherwise, the import won't work 
+           correctly and you will get invalid data.
+
+SAMPLE: 
+  \$ export KOHA_CONF=/etc/koha.conf
+  \$ perl misc/migration_tools/bulkmarcimport.pl -d -commit 1000 \\
+    -file /home/jmf/koha.mrc -n 3000
+EOF
+exit;
 }
 
 if (defined $idmapfl) {
@@ -137,7 +197,7 @@ $batch->warnings_off();
 $batch->strict_off();
 my $i=0;
 my $commitnum = $commit ? $commit : 50;
-
+my $yamlhash;
 
 # Skip file offset
 if ( $offset ) {
@@ -195,14 +255,17 @@ RECORD: while (  ) {
             next RECORD;            
         }
     }
+    SetUTF8Flag($record);
     my $isbn;
     # remove trailing - in isbn (only for biblios, of course)
     if ($biblios) {
         if ($marcFlavour eq 'UNIMARC') {
             if (my $f010 = $record->field('010')) {
-                $isbn = $f010->subfield('a');
-                $isbn =~ s/-//g;
-                $f010->update('a' => $isbn);
+                if ($f010->subfield('a')){
+                    $isbn = $f010->subfield('a');
+                    $isbn =~ s/-//g;
+                    $f010->update('a' => $isbn);
+                }
             }
         } else {
             if (my $f020 = $record->field('020')) {
@@ -215,27 +278,46 @@ RECORD: while (  ) {
     }
     my $id;
     # search for duplicates (based on Local-number)
+    my $originalid;
+    $originalid=GetRecordId($record,$tagid,$subfieldid);
     if ($match){
        require C4::Search;
        my $query=build_query($match,$record);
        my $server=($authorities?'authorityserver':'biblioserver');
+       $debug && warn $query;
        my ($error, $results,$totalhits)=C4::Search::SimpleSearch( $query, 0, 3, [$server] );
        die "unable to search the database for duplicates : $error" if (defined $error);
-       #warn "$query $server : $totalhits";
+       $debug && warn "$query $server : $totalhits";
        if ($results && scalar(@$results)==1){
            my $marcrecord = MARC::File::USMARC::decode($results->[0]);
+           SetUTF8Flag($marcrecord);
 	   	   $id=GetRecordId($marcrecord,$tagid,$subfieldid);
-       } 
+           if ($authorities && $marcFlavour ) {
+                # Skip if the authority in database is at least as recent (field 005) as the one being imported
+				if ($marcrecord->field('005')->data >= $record->field('005')->data){
+					if ($yamlfile){
+						$yamlhash->{$originalid}->{'authid'}=$id;
+						# On récupère tous les souschamps des champs vedettes d'autorités
+						my @subfields;
+						foreach my $field ($marcrecord->field("2..")){
+						    push @subfields, map{
+									($_->[0]=~/[a-z]/?$_->[1]:())
+						    		       }  $field->subfields();
+						}
+						$yamlhash->{$originalid}->{'subfields'}=\@subfields;
+					}
+					next;
+				}
+			}
+       }
        elsif  ($results && scalar(@$results)>1){
-       $debug && warn "more than one match for $query";
-       } 
+          $debug && warn "more than one match for $query";
+       }
        else {
-       $debug && warn "nomatch for $query";
+          $debug && warn "nomatch for $query";
        }
     }
-	my $originalid;
     if ($keepids){
-	  $originalid=GetRecordId($record,$tagid,$subfieldid);
       if ($originalid){
 		 my $storeidfield;
 		 if (length($keepids)==3){
@@ -248,14 +330,31 @@ RECORD: while (  ) {
 	     $record->delete_field($record->field($tagid));
       }
     }
-    unless ($test_parameter) {
+    foreach my $stringfilter (@$filters){
+        if (length($stringfilter)==3){
+            foreach my $field ($record->field($stringfilter)){
+                $record->delete_field($field);
+                $debug && warn "removed : ",$field->as_string;
+            }
+        }
+        else {
+                my ($removetag,$removesubfield,$removematch)=($1,$2,$3) 
+                    if $stringfilter=~/([0-9]{3})([a-z0-9])(.*)/;
+                if (($removetag >"010")&& $removesubfield){
+                    foreach my $field ($record->field($removetag)){
+                        $field->delete_subfield(code=>"$removesubfield",match=>$removematch);
+                        $debug && warn "Potentially removed : ",$field->subfield($removesubfield);
+                    }
+                }
+        }
+    }
         if ($authorities){
             use C4::AuthoritiesMarc;
             my $authtypecode=GuessAuthTypeCode($record);
             my $authid= ($id?$id:GuessAuthId($record));
-            if ($authid && GetAuthority($authid)){
+            if ($authid && GetAuthority($authid) && $update){
             ## Authority has an id and is in database : Replace
-                eval { ( $authid ) = ModAuthority($authid,$record, $authtypecode) };
+                (! $test_parameter) and eval { ( $authid ) = ModAuthority($authid,$record, $authtypecode) };
                 if ($@){
                     warn "Problem with authority $authid Cannot Modify";
 					printlog({id=>$originalid||$id||$authid, op=>"edit",status=>"ERROR"}) if ($logfile);
@@ -266,7 +365,7 @@ RECORD: while (  ) {
             }  
             elsif (defined $authid) {
             ## An authid is defined but no authority in database : add
-                eval { ( $authid ) = AddAuthority($record,$authid, $authtypecode) };
+                (! $test_parameter) and eval { ( $authid ) = AddAuthority($record,$authid, $authtypecode) };
                 if ($@){
                     warn "Problem with authority $authid Cannot Add ".$@;
 					printlog({id=>$originalid||$id||$authid, op=>"insert",status=>"ERROR"}) if ($logfile);
@@ -277,7 +376,7 @@ RECORD: while (  ) {
             }
 	        else {
             ## True insert in database
-                eval { ( $authid ) = AddAuthority($record,"", $authtypecode) };
+                (! $test_parameter) and eval { ( $authid ) = AddAuthority($record,"", $authtypecode) };
                 if ($@){
                     warn "Problem with authority $authid Cannot Add".$@;
 					printlog({id=>$originalid||$id||$authid, op=>"insert",status=>"ERROR"}) if ($logfile);
@@ -286,6 +385,16 @@ RECORD: while (  ) {
 					printlog({id=>$originalid||$id||$authid, op=>"insert",status=>"ok"}) if ($logfile);
 				}
  	        }
+	        if ($yamlfile){
+              	$yamlhash->{$originalid}->{'authid'}=$authid;
+				my @subfields;
+				foreach my $field ($record->field("2..")){
+		    		push @subfields, map{
+							($_->[0]=~/[a-z]/?$_->[1]:())
+		    		       	}  $field->subfields();
+				}
+	      		$yamlhash->{$originalid}->{'subfields'}=\@subfields;
+            }
         }
         else {
             my ( $biblionumber, $biblioitemnumber, $itemnumbers_ref, $errors_ref );
@@ -309,21 +418,40 @@ RECORD: while (  ) {
 			}
 					# create biblio, unless we already have it ( either match or isbn )
             if ($biblionumber) {
-				eval{$biblioitemnumber=GetBiblioData($biblionumber)->{biblioitemnumber};}
+				eval{$biblioitemnumber=GetBiblioData($biblionumber)->{biblioitemnumber};};
+                if ($update) {
+                    (! $test_parameter) and eval { ( $biblionumber, $biblioitemnumber ) = ModBiblio($record, $biblionumber,GetFrameworkcode($biblionumber)) };
+                    if ( $@ ) {
+                        warn "ERROR: Edit biblio $biblionumber failed: $@\n";
+                        printlog({id=>$id||$originalid||$biblionumber, op=>"update",status=>"ERROR"}) if ($logfile);
+                        next RECORD;
+                    }
+                    else{
+                        printlog({id=>$id||$originalid||$biblionumber, op=>"update",status=>"ok"}) if ($logfile);
+                    }
+                }
+                else {
+                   printlog({id=>$id||$originalid||$biblionumber, op=>"insert",status=>"warning : already in database"}) if ($logfile);
+                }
 			}
 			else 
 			{
-                eval { ( $biblionumber, $biblioitemnumber ) = AddBiblio($record, '', { defer_marc_save => 1 }) };
+                if ($insert){
+                    (! $test_parameter) and eval { ( $biblionumber, $biblioitemnumber ) = AddBiblio($record, '', { defer_marc_save => 1 }) };
+                    if ( $@ ) {
+                        warn "ERROR: Adding biblio $biblionumber failed: $@\n";
+                        printlog({id=>$id||$originalid||$biblionumber, op=>"insert",status=>"ERROR"}) if ($logfile);
+                        next RECORD;
+                    }
+                    else{
+                        printlog({id=>$id||$originalid||$biblionumber, op=>"insert",status=>"ok"}) if ($logfile);
+                    }
+                }
+                else {
+                   printlog({id=>$id||$originalid||$biblionumber, op=>"update",status=>"warning : not in database"}) if ($logfile);
+                }
             }
-            if ( $@ ) {
-                warn "ERROR: Adding biblio $biblionumber failed: $@\n";
-				printlog({id=>$id||$originalid||$biblionumber, op=>"insert",status=>"ERROR"}) if ($logfile);
-                next RECORD;
-            } 
- 			else{
-				printlog({id=>$id||$originalid||$biblionumber, op=>"insert",status=>"ok"}) if ($logfile);
-			}
-            eval { ( $itemnumbers_ref, $errors_ref ) = AddItemBatchFromMarc( $record, $biblionumber, $biblioitemnumber, '' ); };
+            (! $test_parameter) and eval { ( $itemnumbers_ref, $errors_ref ) = AddItemBatchFromMarc( $record, $biblionumber, $biblioitemnumber, '' ); };
             if ( $@ ) {
                 warn "ERROR: Adding items to bib $biblionumber failed: $@\n";
 				printlog({id=>$id||$originalid||$biblionumber, op=>"insertitem",status=>"ERROR"}) if ($logfile);
@@ -331,16 +459,16 @@ RECORD: while (  ) {
                 # the MARC columns in biblioitems were not set.
                 ModBiblioMarc( $record, $biblionumber, '' );
                 next RECORD;
-            } 
+            }
  			else{
-				printlog({id=>$id||$originalid||$biblionumber, op=>"insert",status=>"ok"}) if ($logfile);
+				printlog({id=>$id||$originalid||$biblionumber, op=>"insertitem",status=>"ok"}) if ($logfile);
 			}
             if ($#{ $errors_ref } > -1) { 
                 report_item_errors($biblionumber, $errors_ref);
             }
+            $yamlhash->{$originalid}=$biblionumber if ($yamlfile);
         }
         $dbh->commit() if (0 == $i % $commitnum);
-    }
     last if $i == $number;
 }
 $dbh->commit();
@@ -361,6 +489,10 @@ if ($logfile){
   print $loghandle "$i MARC records done in $timeneeded seconds\n";
   $loghandle->close;
 }
+if ($yamlfile){
+    open YAML, "> $yamlfile" or die "cannot open $yamlfile \n"; 
+    print YAML Dump($yamlhash);
+}
 exit 0;
 
 sub GetRecordId{
@@ -370,7 +502,7 @@ sub GetRecordId{
 	my $id;
 	if ($tag lt "010"){
 		return $marcrecord->field($tag)->data() if $marcrecord->field($tag);
-	} 
+	}
 	elsif ($subfield){
 		if ($marcrecord->field($tag)){
 			return $marcrecord->subfield($tag,$subfield);
@@ -415,7 +547,7 @@ sub report_item_errors {
 }
 sub printlog{
 	my $logelements=shift;
-	print $loghandle join (";",@$logelements{qw<id op status>}),"\n";
+	print $loghandle join (";",map{defined $_?$_:""}@$logelements{qw<id op status>}),"\n";
 }
 
 
-- 
1.7.1



More information about the Koha-patches mailing list