[Koha-patches] [PATCH 1/2] [Bug 5166] Libraries and zebraqueue daemon scripts

Tomas Cohen Arazi tomascohen at gmail.com
Mon Mar 21 16:20:20 CET 2011


Something went wrong with the previous patch. For some reason the system preference
updates cause problems when applying it, so I have split it into two separate patches.
---
 C4/Catalog/Zebra.pm |  474 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 474 insertions(+), 0 deletions(-)
 create mode 100644 C4/Catalog/Zebra.pm

diff --git a/C4/Catalog/Zebra.pm b/C4/Catalog/Zebra.pm
new file mode 100644
index 0000000..9c7dc4e
--- /dev/null
+++ b/C4/Catalog/Zebra.pm
@@ -0,0 +1,474 @@
+package C4::Catalog::Zebra;
+#
+# This file is part of Koha.
+#
+# Koha is free software; you can redistribute it and/or modify it under the
+# terms of the GNU General Public License as published by the Free Software
+# Foundation; either version 2 of the License, or (at your option) any later
+# version.
+#
+# Koha is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
+# A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
+# Suite 330, Boston, MA  02111-1307 USA
+
+# Derived from rebuild_zebra.pl (2005-08-11) Paul Poulain and others
+# Rewritten 02/03/2011 by Tomas Cohen Arazi (tomascohen at gmail.com)
+#                      Universidad Nacional de Cordoba / Argentina
+
+# Library for managing updates in zebra, usually from zebraqueue
+
+use strict;
+use warnings;
+use C4::Context;
+use Getopt::Long;
+use File::Temp qw/ tempdir /;
+use File::Path;
+use Time::HiRes qw(time);
+use C4::Biblio;
+use C4::AuthoritiesMarc;
+
+use vars qw($VERSION @ISA @EXPORT);
+
+BEGIN {
+	# Module version, available to Perl's "use MODULE VERSION" checks.
+	$VERSION = 0.01;
+
+	# Inherit import() from Exporter and list the subs exported by default.
+	require Exporter;
+	@ISA    = ('Exporter');
+	@EXPORT = qw(
+		UpdateAuths
+		UpdateBiblios
+		UpdateAuthsAndBiblios
+		IndexZebraqueueRecords
+	);
+}
+
+
+=head1 NAME
+
+C4::Catalog::Zebra - Library for managing updates in zebra, usually from zebraqueue
+
+Comment:
+	The following line should be used when merging the rest of the rebuild_zebra.pl indexing logic:
+	my $nosanitize				= (C4::Context->preference('ZebraNoSanitize')) ? 1 : 0;
+
+
+=head2 UpdateAuths
+
+  ( $num_records_updated ) = &UpdateAuths ();
+
+returns the number of updated+deleted authority records 
+
+=cut
+
+sub UpdateAuths {
+	# Process the pending zebraqueue entries for authority records and
+	# hand back whatever IndexZebraqueueRecords reports.
+	my $record_type = 'authority';
+	return IndexZebraqueueRecords($record_type);
+}
+
+=head2 UpdateBiblios
+
+  ( $num_records_updated ) = &UpdateBiblios ();
+
+returns the number of updated+deleted biblio records 
+
+=cut
+
+sub UpdateBiblios {
+	# Process the pending zebraqueue entries for biblio records and
+	# hand back whatever IndexZebraqueueRecords reports.
+	my $record_type = 'biblio';
+	return IndexZebraqueueRecords($record_type);
+}
+
+=head2 UpdateAuthsAndBiblios
+
+  ( $num_records_updated ) = &UpdateAuthsAndBiblios ();
+
+returns the number of updated+deleted authority and biblio records 
+
+=cut
+
+sub UpdateAuthsAndBiblios {
+	# Authorities are processed first, then biblios; the result is the sum
+	# of what the two passes report.
+	my $authority_count = UpdateAuths();
+	my $biblio_count    = UpdateBiblios();
+
+	return $authority_count + $biblio_count;
+}
+
+=head2 IndexZebraqueueRecords
+
+  ( $num_records_updated ) = &IndexZebraqueueRecords ($record_type);
+
+returns the number of updated+deleted $record_type records 
+
+Comment :
+$record_type can be either 'biblio' or 'authority'
+
+=cut
+
+sub IndexZebraqueueRecords {
+	my ($record_type) = @_;
+
+	# The three settings below all derive from the ZebraUseXml preference.
+	my ($as_xml, $noxml, $record_format) = (0, 1, 'iso2709');
+	if (C4::Context->preference('ZebraUseXml')) {
+		($as_xml, $noxml, $record_format) = (1, 0, 'marcxml');
+	}
+
+	# Deletions are processed before updates; a false result counts as zero.
+	my $total = 0;
+	foreach my $action ('deleted', 'updated') {
+		$total += (IndexZebraqueueByAction($action,$record_type,$record_format,$as_xml,$noxml)||0);
+	}
+
+	return $total;
+}
+
+=head2 IndexZebraqueueByAction
+
+  ( $num_records_updated ) = &IndexZebraqueueByAction ($action,$record_type,
+														$record_format,$as_xml,$noxml);
+
+returns the number of updated+deleted $record_type records 
+
+Comment :
+$record_type can be 'biblio' or 'authority'
+$record_format can be 'marcxml' or 'iso2709'
+$action can be 'updated' or 'deleted'
+$as_xml and $noxml are maintained for legacy reasons, one is enough. They
+indicate whether to use marcxml for indexing in zebra or iso2709. They should
+all be deduced from C4::Context->preference('ZebraUseXml').
+
+=cut
+
+sub IndexZebraqueueByAction
+{
+	# Export the pending zebraqueue entries for one action to a temporary
+	# directory, run the indexer on them and mark the entries as done.
+	#
+	# Returns the number of records exported and indexed, a false value when
+	# nothing was exported or DoIndexing reported failure, and -1 when
+	# $action is neither 'updated' nor 'deleted'.
+	my ($action,$record_type,$record_format,$as_xml,$noxml) = @_;
+
+	# Wrong action
+	return -1 unless ($action eq 'updated' or $action eq 'deleted');
+
+	# get records by action
+	my $entries = select_zebraqueue_records($record_type, $action);
+	# Create tmp dir; the object is stringified below to obtain its path
+	my $directory = File::Temp->newdir();
+
+	# get records from zebraqueue, export to file for zebraidx
+	my ($zaction, $num_records_exported);
+	if ($action eq 'updated') {
+		$zaction = 'update';
+		$num_records_exported = export_marc_records_from_list($record_type,
+											$entries, "$directory", $as_xml, $noxml);
+	} else {
+		# $action eq 'deleted'
+		$zaction = 'delete';
+		$num_records_exported = generate_deleted_marc_records($record_type,
+											$entries, "$directory", $as_xml);
+	}
+
+	my $ret;
+	if ($num_records_exported) {
+		# log export
+		my $time = localtime(time);
+		print "$time $num_records_exported $record_type record(s) exported for $zaction\n";
+		# TODO error handling / and better logging
+		$ret = DoIndexing($record_type,$zaction,"$directory",$record_format);
+		if ($ret) {
+			# Take a fresh timestamp so the log shows when indexing finished
+			$time = localtime(time);
+			print "$time $num_records_exported $record_type record(s) $action\n";
+			mark_zebraqueue_batch_done($entries);
+			print "$time $num_records_exported $record_type record(s) marked done in zebraqueue\n";
+			# Report the record count rather than DoIndexing's success flag
+			$ret = $num_records_exported;
+		}
+		# /TODO
+	}
+
+	return $ret;
+}
+
+
+sub select_zebraqueue_records {
+	my ($record_type, $update_type) = @_;
+
+	# Translate the caller's vocabulary into the values stored in zebraqueue:
+	# anything other than 'biblio' selects the authority server, and anything
+	# other than 'deleted' selects the update operation.
+	my $zebra_server = 'authorityserver';
+	$zebra_server = 'biblioserver' if $record_type eq 'biblio';
+	my $operation = 'specialUpdate';
+	$operation = 'recordDelete' if $update_type eq 'deleted';
+
+	my $dbh = C4::Context->dbh;
+	my $sth = $dbh->prepare(<<'SQL');
+		SELECT id, biblio_auth_number 
+		FROM zebraqueue
+		WHERE server = ?
+		AND   operation = ?
+		AND   done = 0
+		ORDER BY id DESC;
+SQL
+
+	$sth->execute($zebra_server, $operation);
+
+	# One hashref per pending row, highest queue id first.
+	my $rows = $sth->fetchall_arrayref({});
+	return $rows;
+}
+
+sub mark_zebraqueue_batch_done {
+	# Flag every zebraqueue row listed in $entries (an arrayref of hashrefs
+	# carrying an 'id' key) as done, inside a single transaction.
+	#
+	# Returns 1. If an update or the commit throws, the transaction is rolled
+	# back, AutoCommit is switched back on and the error is re-thrown.
+	my ($entries) = @_;
+
+	my $dbh = C4::Context->dbh;
+	my $sth = $dbh->prepare("UPDATE zebraqueue SET done = 1 WHERE id = ?");
+
+	$dbh->{AutoCommit} = 0;
+	eval {
+		foreach my $id (map { $_->{id} } @$entries) {
+			$sth->execute($id);
+		}
+		# Commit only after all the updates have been issued
+		$dbh->commit();
+		1;
+	} or do {
+		my $error = $@ || 'unknown error';
+		# Best effort: the rollback itself must not mask the original error
+		eval { $dbh->rollback(); };
+		$dbh->{AutoCommit} = 1;
+		die $error;
+	};
+	$dbh->{AutoCommit} = 1;
+
+	return 1;
+}
+
+sub export_marc_records_from_list {
+	# Write the MARC records referenced by $entries to
+	# "$directory/exported_records" and return how many were written.
+	#
+	# $record_type and $noxml are passed on to get_corrected_marc_record;
+	# $as_xml selects as_xml_record() over as_usmarc() for serialisation.
+	# Dies if the export file cannot be opened or closed.
+	my ($record_type, $entries, $directory, $as_xml, $noxml) = @_;
+	my $verbose_logging = (C4::Context->preference('ZebraqueueVerboseLogging')) ? 1 : 0;
+
+	# Lexical filehandle instead of a package-global bareword handle
+	my $export_file = "$directory/exported_records";
+	open (my $out, '>:utf8', $export_file)
+		or die "cannot open $export_file for writing: $!";
+
+	my $num_exported = 0;
+	my $i = 0;
+	my %found = ();
+	foreach my $entry (@$entries) {
+		my $record_number = $entry->{biblio_auth_number};
+		# A record queued several times is exported only once
+		next if $found{$record_number}++;
+
+		print "." if ( $verbose_logging );
+		print "\r$i" unless ($i++ %100 or !$verbose_logging);
+		my ($marc) = get_corrected_marc_record($record_type, $record_number, $noxml);
+		if (defined $marc) {
+			# FIXME - when more than one record is exported and $as_xml is true,
+			# the output file is not valid XML - it's just multiple <record> elements
+			# strung together with no single root element.  zebraidx doesn't seem
+			# to care, though, at least if you're using the GRS-1 filter.  It does
+			# care if you're using the DOM filter, which requires valid XML file(s).
+			my $serialized = ($as_xml) ? $marc->as_xml_record() : $marc->as_usmarc();
+			print {$out} $serialized;
+			$num_exported++;
+		}
+	}
+	print "\nRecords exported: $num_exported\n" if ( $verbose_logging );
+
+	# Buffered write errors only surface when the handle is closed
+	close ($out) or die "cannot close $export_file: $!";
+	return $num_exported;
+}
+
+sub generate_deleted_marc_records {
+	# Write one stub MARC record per entry in $entries to
+	# "$directory/exported_records" and return how many were written.
+	#
+	# Each stub is a new MARC::Record that only goes through the id fix-ups
+	# (and fix_unimarc_100 when the marcflavour preference is UNIMARC).
+	# $as_xml selects as_xml_record() over as_usmarc() for serialisation.
+	# Dies if the export file cannot be opened or closed.
+	my ($record_type, $entries, $directory, $as_xml) = @_;
+	my $verbose_logging = (C4::Context->preference('ZebraqueueVerboseLogging')) ? 1 : 0;
+	# Looked up once instead of once per record
+	my $is_unimarc = (C4::Context->preference("marcflavour") eq "UNIMARC") ? 1 : 0;
+
+	# Lexical filehandle instead of a package-global bareword handle
+	my $export_file = "$directory/exported_records";
+	open (my $out, '>:utf8', $export_file)
+		or die "cannot open $export_file for writing: $!";
+
+	my $num_exported = 0;
+	my $i = 0;
+	foreach my $entry (@$entries) {
+		my $record_number = $entry->{biblio_auth_number};
+		print "\r$i" unless ($i++ %100 or !$verbose_logging);
+		print "." if ( $verbose_logging );
+
+		my $marc = MARC::Record->new();
+		if ($record_type eq 'biblio') {
+			# The record number is passed as both biblionumber and
+			# biblioitemnumber, so fix_biblio_ids does no database lookup
+			fix_biblio_ids($marc, $record_number, $record_number);
+		} else {
+			fix_authority_id($marc, $record_number);
+		}
+		fix_unimarc_100($marc) if $is_unimarc;
+
+		my $serialized = ($as_xml) ? $marc->as_xml_record() : $marc->as_usmarc();
+		print {$out} $serialized;
+		$num_exported++;
+	}
+	print "\nRecords exported: $num_exported\n" if ( $verbose_logging );
+
+	# Buffered write errors only surface when the handle is closed
+	close ($out) or die "cannot close $export_file: $!";
+	return $num_exported;
+}
+
+sub get_corrected_marc_record {
+	my ($record_type, $record_number, $noxml) = @_;
+
+	my $record = get_raw_marc_record($record_type, $record_number, $noxml);
+
+	# Nothing was retrieved: pass the undefined value straight back.
+	return $record unless defined $record;
+
+	fix_leader($record);
+
+	if ($record_type ne 'biblio') {
+		fix_authority_id($record, $record_number);
+	} elsif (!fix_biblio_ids($record, $record_number)) {
+		# fix_biblio_ids reported failure, so no record is returned.
+		return;
+	}
+
+	fix_unimarc_100($record) if (C4::Context->preference("marcflavour") eq "UNIMARC");
+
+	return $record;
+}
+
+sub get_raw_marc_record {
+	my ($record_type, $record_number, $noxml) = @_;
+	my $dbh = C4::Context->dbh;
+
+	# Anything that is not a biblio is fetched with GetAuthority.
+	if ($record_type ne 'biblio') {
+		my $authority;
+		eval { $authority = GetAuthority($record_number); };
+		if ($@) {
+			warn "error retrieving authority $record_number";
+			return;
+		}
+		return $authority;
+	}
+
+	# Biblio, $noxml false: fetched with GetMarcBiblio.
+	unless ($noxml) {
+		my $biblio;
+		eval { $biblio = GetMarcBiblio($record_number); };
+		if ($@) {
+			# here we do warn since catching an exception
+			# means that the bib was found but failed
+			# to be parsed
+			warn "error retrieving biblio $record_number";
+			return;
+		}
+		return $biblio;
+	}
+
+	# Biblio, $noxml true: decode the blob stored in biblioitems.marc.
+	my $fetch_sth = $dbh->prepare_cached("SELECT marc FROM biblioitems WHERE biblionumber = ?");
+	$fetch_sth->execute($record_number);
+	my @row = $fetch_sth->fetchrow_array;
+
+	# failure to find a bib is not a problem -
+	# a delete could have been done before
+	# trying to process a record update
+	return unless @row;
+
+	my $marc = MARC::Record->new_from_usmarc($row[0]);
+	$fetch_sth->finish();
+	return $marc;
+}
+
+sub fix_leader {
+    # FIXME - this routine is suspect
+    # It overwrites leader positions 0-4 with blanks and positions 10-16
+    # with '22' followed by five blanks, then stores the first 24
+    # characters back, so that the values get recalculated when
+    # $marc->as_usmarc() or $marc->as_xml() is called.
+    # But why is this necessary?  It would be a serious bug
+    # in MARC::Record (definitely) and MARC::File::XML (arguably)
+    # if they are emitting incorrect leader values.
+    my ($marc) = @_;
+
+    my $leader = $marc->leader;
+    substr($leader,  0, 5, '     ');
+    substr($leader, 10, 7, '22     ');
+    return $marc->leader(substr($leader, 0, 24));
+}
+
+sub fix_biblio_ids {
+	# FIXME - it is essential to ensure that the biblionumber is present,
+	#         otherwise, Zebra will choke on the record.  However, this
+	#         logic belongs in the relevant C4::Biblio APIs.
+	#
+	# Arguments: the MARC record, the biblionumber and, optionally, the
+	# biblioitemnumber.  Returns 1 on success, 0 when the biblioitemnumber
+	# had to be looked up and could not be found.
+	my ($marc, $biblionumber, @optional) = @_;
+	my $dbh = C4::Context->dbh;
+
+	my $biblioitemnumber;
+	if (@optional) {
+		# The caller supplied the biblioitemnumber
+		$biblioitemnumber = shift @optional;
+	} else {
+		# Otherwise read it from biblioitems
+		my $sth = $dbh->prepare(
+			"SELECT biblioitemnumber FROM biblioitems WHERE biblionumber=?");
+		$sth->execute($biblionumber);
+		($biblioitemnumber) = $sth->fetchrow_array;
+		$sth->finish;
+		if (!$biblioitemnumber) {
+			warn "failed to get biblioitemnumber for biblio $biblionumber";
+			return 0;
+		}
+	}
+
+	# FIXME - this is cheating on two levels
+	# 1. C4::Biblio::_koha_marc_update_bib_ids is meant to be an internal function
+	# 2. Making sure that the biblionumber and biblioitemnumber are correct and
+	#    present in the MARC::Record object ought to be part of GetMarcBiblio.
+	#
+	# On the other hand, this is better for now than what rebuild_zebra.pl used
+	# to do, which was to duplicate the code for inserting the biblionumber
+	# and biblioitemnumber
+	C4::Biblio::_koha_marc_update_bib_ids($marc, '', $biblionumber, $biblioitemnumber);
+
+	return 1;
+}
+
+sub fix_authority_id {
+	# FIXME - as with fix_biblio_ids, the authid must be present
+	#         for Zebra's sake.  However, this really belongs
+	#         in C4::AuthoritiesMarc.
+	my ($marc, $authid) = @_;
+
+	# Leave the record alone when its 001 field already holds $authid
+	my $control_field = $marc->field('001');
+	my $already_set   = ($control_field and $control_field->data() eq $authid);
+
+	unless ($already_set) {
+		# field() is called in list context here, so nothing is passed to
+		# delete_field when the record has no 001 field
+		$marc->delete_field($marc->field('001'));
+		$marc->insert_fields_ordered(MARC::Field->new('001',$authid));
+	}
+}
+
+sub fix_unimarc_100 {
+	# FIXME - again, if this is necessary, it belongs in C4::AuthoritiesMarc.
+	#
+	# Replace the record's 100 field(s) with a single 100 field whose
+	# subfield a is 35 characters long and carries "frey50" at offsets 22-27.
+	# An existing 35-character 100$a is reused as the starting value;
+	# otherwise the value starts from today's date (YYYYMMDD) padded with
+	# blanks to 35 characters.
+	my $marc = shift;
+
+	# This file does not load POSIX itself; make sure strftime is available
+	require POSIX;
+
+	my $current = $marc->subfield( 100, "a" );
+	my $string;
+	# The defined() test avoids an "uninitialized" warning when 100$a is absent
+	if ( defined $current and length($current) == 35 ) {
+		$string = $current;
+	}
+	else {
+		$string = sprintf( "%-*s", 35, POSIX::strftime( "%Y%m%d", localtime ) );
+	}
+	substr( $string, 22, 6, "frey50" );
+
+	# field() is called in list context, so every existing 100 field is
+	# handed to delete_field (and nothing when there is none)
+	$marc->delete_field($marc->field(100));
+	$marc->insert_grouped_field(MARC::Field->new( 100, "", "", "a" => $string ));
+}
+
+=head2 DoIndexing
+
+  ( $success ) = &DoIndexing($record_type,$zaction,$record_dir,$record_format);
+
+returns 1 when every zebraidx invocation exited with status 0, and 0 otherwise
+
+Comment :
+$record_type can be 'biblio' or 'authority'
+$zaction can be 'delete' or 'update'
+$record_dir is the directory where the exported records are
+$record_format can be 'marcxml' or 'iso2709'
+When the ZebraNoshadow preference is not set, a second zebraidx invocation
+runs the 'commit' action after the first one succeeded.
+
+=cut
+
+sub DoIndexing {
+	my ($record_type, $zaction, $record_dir, $record_format) = @_;
+	my $zebra_server	= ($record_type eq 'biblio') ? 'biblioserver' : 'authorityserver';
+	my $zebra_db_name	= ($record_type eq 'biblio') ? 'biblios' : 'authorities';
+	my $zebra_config	= C4::Context->zebraconfig($zebra_server)->{'config'};
+	my $noshadow		= (C4::Context->preference('ZebraNoshadow')) ? 1 : 0;
+
+	# Options shared by both invocations, in the same order as before
+	my @config_opts	= ('-c', $zebra_config, '-v', 'none,fatal');
+	my @target_opts	= ('-g', $record_format, '-d', $zebra_db_name);
+
+	# Index (or delete) the exported records; '-n' is added when
+	# ZebraNoshadow is set
+	my @index_cmd = ('zebraidx', @config_opts, ($noshadow ? ('-n') : ()),
+						@target_opts, $zaction, $record_dir);
+	return 0 unless _run_zebraidx(@index_cmd);
+
+	# Without ZebraNoshadow a separate commit invocation is required
+	unless ($noshadow) {
+		my @commit_cmd = ('zebraidx', @config_opts, @target_opts, 'commit');
+		return 0 unless _run_zebraidx(@commit_cmd);
+	}
+
+	return 1;
+}
+
+# Run one zebraidx command without going through a shell; returns 1 when
+# its wait status is 0, otherwise warns and returns 0.
+sub _run_zebraidx {
+	my @command = @_;
+
+	# List-form system() passes the arguments as-is, so paths containing
+	# spaces or shell metacharacters are not re-interpreted
+	my $status = system(@command);
+	return 1 if $status == 0;
+
+	warn "zebraidx failed (wait status $status): @command\n";
+	return 0;
+}
+
+
+# Intentionally empty: this block runs no code at interpreter shutdown.
+END { }
+
+1;
+__END__
+
+=head1 AUTHOR
+
+Koha Development Team <http://koha-community.org/>
+
+Tomas Cohen Arazi tomascohen at gmail.com
+
+=cut
-- 
1.7.1



More information about the Koha-patches mailing list