[Koha-cvs] koha/misc/migration_tools build_marc21_authorit... [rel_2_2]

Joshua Ferraro jmf at kados.org
Thu Mar 9 00:39:11 CET 2006


CVSROOT:	/sources/koha
Module name:	koha
Branch: 	rel_2_2
Changes by:	Joshua Ferraro <kados at savannah.gnu.org>	06/03/08 23:39:11

Added files:
	misc/migration_tools: build_marc21_authorities.pl 

Log message:
	An example for building marc21 authorities from existing bib data.

CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/koha/koha/misc/migration_tools/build_marc21_authorities.pl?only_with_tag=rel_2_2&rev=1.1.2.1

Patches:
Index: koha/misc/migration_tools/build_marc21_authorities.pl
diff -u /dev/null koha/misc/migration_tools/build_marc21_authorities.pl:1.1.2.1
--- /dev/null	Wed Mar  8 23:39:11 2006
+++ koha/misc/migration_tools/build_marc21_authorities.pl	Wed Mar  8 23:39:11 2006
@@ -0,0 +1,274 @@
+#!/usr/bin/perl
+# script that rebuild thesaurus from biblio table.
+
+use strict;
+
+# Koha modules used
+use MARC::File::USMARC;
+use MARC::Record;
+use MARC::Batch;
+use C4::Context;
+use C4::Biblio;
+use C4::AuthoritiesMarc;
+use Time::HiRes qw(gettimeofday);
+
+use Getopt::Long;
+my ( $input_marc_file, $number) = ('',0);
+my ($version, $verbose, $delete, $confirm, $howmany);
+GetOptions(
+    'h' => \$version,
+    'd' => \$delete,
+    'v' => \$verbose,
+    'c' => \$confirm,
+# this $howmany parameter & other commented code was here to enable incremental building of the authorities, but it does not work well.
+# 	'n:s' => \$howmany,
+);
+
+if ($version || (!$confirm)) {
+	print <<EOF
+Script to recreate a authority tables into Koha from biblios
+parameters :
+\th : this version/help screen
+\tc : confirm. this script run without -c shows this help, pls run it with -c to execute it
+\tv : verbose mode.
+\td : delete the thesaurus before doing work. This deleting is smart enough to delete only the categories to rebuild. However, it is quite slow. Don''t be surprised...
+\tn X : do only X entries, then stop. As the script is incremental, you can create authorities X by X until done.
+
+BEFORE RUNNING this script, you MUST edit it & adapt the %whattodo hash to fit your needs. It contains :
+* as key, the code of the authority to be created. It's the one you've choosen (or will choose) in Koha >> parameters >> thesaurus structure >> add). It can be whatever you want. NP/CO/NG/TI/NC in CVS refers to UNIMARC french RAMEAU category codes.
+* in values a sub-hash with the following values :
+\ttaglist : the list of MARC tags using this authority
+\tkey : the list of MARC subfields used as key for authority. 2 entries in biblio having the same key will be considered as the same.
+\tother : the list of MARC subfields not used as key, but to be copied in authority.
+\tauthtag : the field in authority that will be reported in biblio. Remember that all subfields in tag "authtag" will be reported in the same subfield of the biblio (in MARC tags that are in "taglist")
+
+don't forget to define the itemfield. In UNIMARC, it should be 995, in MARC21, probably 852
+
+Any warning will be stored in the warnings.log file.
+EOF
+;#'
+die;
+}
+
+my $dbh = C4::Context->dbh;
+
+my $itemfield = '852'; # enter the TAG number where your items are stored.
+
+my %whattodo = (
+	# authority code (the one you've choosen (or will choose) in Koha >> parameters >> thesaurus structure >> add)
+	PNAME 	=>	{	taglist	=> "100|600|700|800|900",
+				key	=> "a|b|c|q|d",
+				other	=> "1|2|4|5|6|7|8|0|e|f|g|h|i|j|k|l|m|n|o|p|r|s|t|u|v|w|x|y|z",
+				authtag => "100",
+			},
+	CNAME   =>      {       taglist => "110|610|710|810|910",
+                                key	=> "a|b|c|d",
+                                other   => "1|2|4|5|6|7|8|0|e|f|g|h|i|j|k|l|m|n|o|p|r|s|t|u|v|w|x|y|z",
+                                authtag => "110",
+                        },
+	MNAME   =>      {       taglist => "111|611|711|811|911",
+                                key     => "a|b|c|q|d",
+                                other   => "1|2|4|5|6|7|8|0|e|f|g|h|i|j|k|l|m|n|o|p|r|s|t|u|v|w|x|y|z",
+                                authtag => "111",
+                        },
+	GNAME   =>      {       taglist => "151|651|751|851",
+                                key	=> "a|b|c|q|d",
+                                other   => "1|2|4|5|6|7|8|0|e|f|g|h|i|j|k|l|m|n|o|p|r|s|t|u|v|w|x|y|z",
+                                authtag => "151",
+                        },
+	#TITLE   =>      {       taglist => "130|210|211|212|222|240|243|400|410|411|440|490|630|730|830|930|940|980|981|982|983",
+	TITLE   =>      {       taglist => "130|210|211|212|222|240|243|440|490|630|730|830|930|940|980|981|982|983",
+                                key	=> "a",
+                                other   => "0|1|2|4|5|6|7|8|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z",
+                                authtag => "130",
+                        },
+	GENRE   =>      {       taglist => "655|656|657|658",
+                                key	=> "a|z|x|y|v",
+                                other   => "0|1|2|4|5|7|8|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|w",
+                                authtag => "155",
+                        },
+	TTERM   =>      {       taglist => "650",
+                                key	=> "a|z|x|y|v",
+                                other   => "0|1|2|4|5|6|7|8|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|w",
+                                authtag => "150",
+                        },
+	CTERM   =>      {       taglist => "648",
+                                key	=> "a|z|x|y|v",
+                                other   => "0|1|2|4|5|6|7|8|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|w",
+                                authtag => "148",
+                        },
+);
+my %authorities;
+
+open WARNING_FILE,">","warnings.log";
+
+my $field_list;
+my $category_list;
+foreach (keys %whattodo) {
+	$field_list .= $whattodo{$_}->{taglist}.'|';
+	$category_list.= "'".$_."',"
+}
+chop $field_list;
+
+if ($delete) {
+	print "deleting thesaurus step 1\n";
+	chop $category_list;
+	my $del1 = $dbh->prepare("delete from auth_subfield_table where authid=?");
+	my $del2 = $dbh->prepare("delete from auth_word where authid=?");
+	my $sth = $dbh->prepare("select authid from auth_header where authtypecode in ($category_list)");
+	$sth->execute;
+	while (my ($authid) = $sth->fetchrow) {
+		$del1->execute($authid);
+		$del2->execute($authid);
+	}
+	print "deleting thesaurus step 2\n";
+	$dbh->do("delete from auth_header where authtypecode in ($category_list)");
+	$dbh->do("delete from marc_subfield_table where tag in ('".join("','",split('\|',$field_list))."') and subfieldcode='9'");
+	$dbh->do("delete from marc_word where tagsubfield in ('".join("9','",split('\|',$field_list))."9')");
+# 	die;
+}
+
+$|=1; # flushes output
+my $starttime = gettimeofday;
+my $sth = $dbh->prepare("select bibid from marc_biblio");
+$sth->execute;
+my $i=1;
+
+my $modified;
+my $alreadydone;
+my $totalskipped;
+while (my ($bibid) = $sth->fetchrow) {
+	my $record = MARCgetbiblio($dbh,$bibid);
+	$modified=0;
+	$i++;
+# 	print "i : $i / howmany : $howmany\n";
+# 	exit if $i>$howmany;
+	
+	# skip what has already been done...
+# 	$alreadydone=0;
+# 	foreach my $field ($record->fields) {
+# 		if ($field->tag() =~ /$field_list/) {
+# # 			print "F : $field_list ".$field->tag()." => ".$field->as_formatted."\n";
+# 			if ($field->subfield('9')) {
+# 				$alreadydone++;
+# 			} else {
+# 				if ($alreadydone) {
+# 					print "ERROR : biblio partially done, some \$9 (authority link) missing : ".$record->as_formatted."\n======= You should run the script again using -d to delete everything";
+# 					die;
+# 				}
+# 			}
+# 		}
+# 	}
+# 	$totalskipped++ if $alreadydone;
+# 	next if $alreadydone;
+# 	my $timeneeded = gettimeofday - $starttime;
+	print " $i in ".(gettimeofday-$starttime)." s\n" unless ($i % 100);
+	# be careful, as the last entry may have been 
+	# delete ITEM field, we only deal with BIBLIOS
+	foreach my $field ($record->field($itemfield)) {
+		$record->delete_field($field);
+	}
+	my $totdone=0;
+# 	my $authid;
+	# on passe tous les champs
+	foreach my $field ($record->fields) {
+		foreach my $DOauthtype (keys %whattodo) {
+			my $DOtaglist = $whattodo{$DOauthtype}->{taglist};
+			my $DOkey = $whattodo{$DOauthtype}->{key};
+			my $DOother = $whattodo{$DOauthtype}->{other};
+			my $DOauthtag = $whattodo{$DOauthtype}->{authtag};
+			if ($field->tag() =~ /$DOtaglist/) {
+				# try to find the authority in %NP ...
+				# build the "key"
+				my $authPrimaryKey;
+				foreach (split '\|',$DOkey) {
+					$authPrimaryKey .= join('|',$field->subfield($_))."|" if $field->subfield($_);
+				}
+				# if authority exist, check it can't be completed by subfields not previously seen.
+				# otherwise, create if with whatever available.
+				if ($authorities{$DOauthtype}->{$authPrimaryKey}) {
+					# check that the existing authority has all the datas. Otherwise, add them, but don't modify already parsed biblios.
+					# at the end of the script, all authorities will be updated. So, the "merge_authority.pl" tool can be used to update all biblios.
+					foreach my $subfieldtotest (split '\|',$DOother) {
+						if ($field->subfield($subfieldtotest)) {
+							if ($authorities{$DOauthtype}->{$authPrimaryKey}->{record}->field($DOauthtag)->subfield($subfieldtotest) 
+												ne
+													$field->subfield($subfieldtotest)) {
+								print WARNING_FILE "========\nERROR ON $i $subfieldtotest authorities seems to differ, can't choose between : \n".$authorities{$DOauthtype}->{$authPrimaryKey}->{record}->field($DOauthtag)->as_formatted()." \n====== AND ======\n ".$field->as_formatted()."\n=======\n";
+								print "W";
+							}
+							# $c was not here, add it...
+							unless ($authorities{$DOauthtype}->{$authPrimaryKey}->{record}->field($DOauthtag)->subfield($subfieldtotest)) {
+								my $fieldA= $authorities{$DOauthtype}->{$authPrimaryKey}->{record}->field($DOauthtag)->clone();
+								$fieldA->add_subfields($subfieldtotest => $field->subfield($subfieldtotest));
+								$authorities{$DOauthtype}->{$authPrimaryKey}->{record}->field($DOauthtag)->replace_with($fieldA);
+								$authorities{$DOauthtype}->{$authPrimaryKey}->{modified} = 1;
+							}
+						}
+					}
+				} else {
+					my $authrecord = MARC::Record->new();
+					my $authfield;
+					foreach (split '\|',$DOkey) {
+						if ($authfield) {
+							$authfield->add_subfields($_ => join ('|',$field->subfield($_))) if $field->subfield($_);
+						} else {
+							$authfield = MARC::Field->new( $DOauthtag,'','',$_ => join ('|',$field->subfield($_)));
+						}
+					}
+					foreach (split '\|',$DOother) {
+						if ($authfield) {
+							$authfield->add_subfields($_ => join ('|',$field->subfield($_))) if $field->subfield($_);
+						} else {
+							$authfield = MARC::Field->new( $DOauthtag,'','',$_ => join ('|',$field->subfield($_)));
+						}
+					}
+					$authrecord->insert_fields_ordered($authfield);
+					my $authid = AUTHaddauthority($dbh,$authrecord,'',$DOauthtype);
+					$authorities{$DOauthtype}->{$authPrimaryKey}->{authid} = $authid;
+					$authorities{$DOauthtype}->{$authPrimaryKey}->{record} = $authrecord->clone;
+					$authorities{$DOauthtype}->{$authPrimaryKey}->{modified} = 0;
+				}
+				print "ERROR !!!! \$9 already exists in $authPrimaryKey / ".$field->as_formatted."\n" if $field->subfield('9');
+				my $fieldC = $field->clone();
+				$fieldC->add_subfields('9' => $authorities{$DOauthtype}->{$authPrimaryKey}->{authid});
+				$field->replace_with($fieldC);
+	# 			print $NP{$keyNP}->{authid}." => ".$record->as_formatted."\n";
+				$modified++;
+			}
+		}
+	}
+#
+# NC
+#
+# OK, done, now store modified biblio if it has been modified
+	if ($modified) {
+		my $frameworkcode=MARCfind_frameworkcode($dbh,$bibid);
+		NEWmodbiblio($dbh,$record,$bibid,$frameworkcode);
+# 		print "skipped $totalskipped biblios in ".(gettimeofday-$starttime)." s\n" if $totalskipped;
+# 		$totalskipped=0;
+		print "$modified";
+	} else {
+		# if $totalskipped is not null, we are in a biblio that has no authorities entry, but inside an already done part of the job
+		# ++ totalskipped & don't show a useless *
+# 		if ($totalskipped) {
+# 			$totalskipped++;
+# 		} else {
+			print "*";
+# 		}
+	}
+}
+
+#
+# now, parse authorities & modify them if they have been modified/completed by a subfield not existing on the 1st biblio using this authority.
+#
+foreach my $authtype (keys %whattodo) {
+	foreach my $authentry (keys %{$authorities{$authtype}}) {
+# 		print "AUTH : $authentry\n" if $authorities{$authtype}->{$authentry}->{modified};
+		AUTHmodauthority($dbh,$authorities{$authtype}->{$authentry}->{authid},$authorities{$authtype}->{$authentry}->{record}) if $authorities{$authtype}->{$authentry}->{modified};
+	}
+}
+#
+my $timeneeded = gettimeofday - $starttime;
+print "$i entries done in $timeneeded seconds (".($i/$timeneeded)." per second)\n";
+close WARNING_FILE;





More information about the Koha-cvs mailing list