[Koha-patches] [PATCH] Add skip-commit option to rebuild_zebra.pl

Jesse Weaver jesse.weaver at liblime.com
Fri Oct 3 02:32:36 CEST 2008


This adds an option -c to rebuild_zebra.pl that causes it to leave the shadow
files in place and skip the commit to the main index, which improves
performance. The option is only allowed when processing the zebraqueue (-z).
The script can also recover automatically from failed updates (which break
commits) by marking every zebraqueue entry processed since the last commit as
not done.
---
 misc/migration_tools/rebuild_zebra.pl |  186 ++++++++++++++++++++++++++-------
 1 files changed, 148 insertions(+), 38 deletions(-)

diff --git a/misc/migration_tools/rebuild_zebra.pl b/misc/migration_tools/rebuild_zebra.pl
index 38054c3..7cb366d 100755
--- a/misc/migration_tools/rebuild_zebra.pl
+++ b/misc/migration_tools/rebuild_zebra.pl
@@ -6,6 +6,7 @@ use C4::Context;
 use Getopt::Long;
 use File::Temp qw/ tempdir /;
 use File::Path;
+use File::Spec;
 use C4::Biblio;
 use C4::AuthoritiesMarc;
 
@@ -16,14 +17,16 @@ use C4::AuthoritiesMarc;
 
 $|=1; # flushes output
 
-my $directory;
+our $directory;
+our $keep_export;
+our $noshadow;
+our $use_tempdir;
 my $skip_export;
-my $keep_export;
+my $skip_commit;
 my $reset;
 my $biblios;
 my $authorities;
 my $noxml;
-my $noshadow;
 my $do_munge;
 my $want_help;
 my $as_xml;
@@ -33,6 +36,7 @@ my $result = GetOptions(
     'd:s'           => \$directory,
     'reset'         => \$reset,
     's'             => \$skip_export,
+    'c'             => \$skip_commit,
     'k'             => \$keep_export,
     'b'             => \$biblios,
     'noxml'         => \$noxml,
@@ -40,7 +44,7 @@ my $result = GetOptions(
     'munge-config'  => \$do_munge,
     'a'             => \$authorities,
     'h|help'        => \$want_help,
-	'x'				=> \$as_xml,
+    'x'             => \$as_xml,
     'y'             => \$do_not_clear_zebraqueue,
     'z'             => \$process_zebraqueue,
 );
@@ -63,6 +67,12 @@ if ($authorities and $as_xml) {
     die $msg;
 }
 
+if ($noshadow and $skip_commit) {
+    my $msg = "Cannot specify both -c and -w\n";
+    $msg   .= "Please do '$0 --help' to see usage.\n";
+    die $msg;
+}
+
 if ($process_zebraqueue and ($skip_export or $reset)) {
     my $msg = "Cannot specify -r or -s if -z is specified\n";
     $msg   .= "Please do '$0 --help' to see usage.\n";
@@ -75,10 +85,15 @@ if ($process_zebraqueue and $do_not_clear_zebraqueue) {
     die $msg;
 }
 
+if (not $process_zebraqueue and $skip_commit) {
+    my $msg = "Must specify -z if -c is specified\n";
+    $msg   .= "Please do '$0 --help' to see usage.\n";
+    die $msg;
+}
+
 if ($noshadow) {
     $noshadow = ' -n ';
 }
-my $use_tempdir = 0;
 unless ($directory) {
     $use_tempdir = 1;
     $directory = tempdir(CLEANUP => ($keep_export ? 0 : 1));
@@ -118,30 +133,7 @@ if ($biblios) {
     print "skipping biblios\n";
 }
 
-
-print "====================\n";
-print "CLEANING\n";
-print "====================\n";
-if ($keep_export) {
-    print "NOTHING cleaned : the export $directory has been kept.\n";
-    print "You can re-run this script with the -s ";
-    if ($use_tempdir) {
-        print " and -d $directory parameters";
-    } else {
-        print "parameter";
-    }
-    print "\n";
-    print "if you just want to rebuild zebra after changing the record.abs\n";
-    print "or another zebra config file\n";
-} else {
-    unless ($use_tempdir) {
-        # if we're using a temporary directory
-        # created by File::Temp, it will be removed
-        # automatically.
-        rmtree($directory, 0, 1);
-        print "directory $directory deleted\n";
-    }
-}
+cleanup();
 
 sub index_records {
     my ($record_type, $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $do_not_clear_zebraqueue) = @_;
@@ -183,16 +175,38 @@ sub index_records {
     print "====================\n";
     print "REINDEXING zebra\n";
     print "====================\n";
-	my $record_fmt = ($as_xml) ? 'marcxml' : 'iso2709' ;
+    my $record_fmt = ($as_xml) ? 'marcxml' : 'iso2709' ;
+    my $zebra_server  = ($record_type eq 'biblio') ? 'biblioserver' : 'authorityserver';
+    my $del_success = 1;
+    my $upd_success = 1;
+
     if ($process_zebraqueue) {
-        do_indexing($record_type, 'delete', "$directory/del_$record_type", $reset, $noshadow, $record_fmt) 
+        my $index_age = get_index_age($zebra_server);
+        my $dbh = C4::Context->dbh;
+
+        $del_success = do_indexing($record_type, 'delete', "$directory/del_$record_type", $reset, $noshadow, $skip_commit, $record_fmt) 
             if $num_records_deleted;
-        do_indexing($record_type, 'update', "$directory/upd_$record_type", $reset, $noshadow, $record_fmt)
-            if $num_records_exported;
+        $upd_success = do_indexing($record_type, 'update', "$directory/upd_$record_type", $reset, $noshadow, $skip_commit, $record_fmt)
+            if ($num_records_exported);
     } else {
-        do_indexing($record_type, 'update', "$directory/$record_type", $reset, $noshadow, $record_fmt)
+        $upd_success = do_indexing($record_type, 'update', "$directory/$record_type", $reset, $noshadow, $skip_commit, $record_fmt)
             if $num_records_exported;
     }
+
+    if (!$del_success || !$upd_success) {
+        my $sth = $dbh->prepare('
+            UPDATE zebraqueue
+              SET done = 0
+              WHERE done = 1 AND time >= ?
+        ');
+
+        $sth->execute($index_age);
+
+        cleanup();
+        die "Indexing failed";
+    } elsif (!$skip_commit) {
+        set_index_age($zebra_server);
+    }
 }
 
 sub select_zebraqueue_records {
@@ -474,8 +488,66 @@ sub fix_unimarc_100 {
     }
 }
 
+sub set_index_age {
+    my ($zebra_server, $last_done) = @_;
+
+    if (!$last_done) {
+        $last_done = $dbh->selectrow_array('
+            SELECT MAX(time)
+            FROM zebraqueue
+        ');
+    }
+
+    my $index_age_file = File::Spec->catfile(
+        C4::Context->zebraconfig($zebra_server)->{'directory'},
+        'index_age'
+    );
+
+    open INDEX_AGE, '>', $index_age_file;
+    print INDEX_AGE "$last_done\n";
+    close INDEX_AGE;
+}
+
+sub get_index_age {
+    my ($zebra_server) = @_;
+
+    my $dbh = C4::Context->dbh;
+
+    my $index_age_file = File::Spec->catfile(
+        C4::Context->zebraconfig($zebra_server)->{'directory'},
+        'index_age'
+    );
+
+    if (!-f $index_age_file) {
+        my $last_done = $dbh->selectrow_array('
+            SELECT MAX(time)
+            FROM zebraqueue
+            WHERE done = 1
+        ');
+
+        if (!$last_done) {
+            # If zebraqueue has not yet been processed, default to today at midnight
+            my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time);
+
+            $last_done = strftime('%Y-%m-%d %H:%M:%S', 0, 0, 0, $mday, $mon, $year);
+        }
+
+        set_index_age($zebra_server, $last_done);
+
+        return $last_done;
+    }
+
+    open INDEX_AGE, $index_age_file or die $!;
+
+    my $age = <INDEX_AGE>;
+    chomp $age;
+    close INDEX_AGE;
+
+    return $age;
+}
+
 sub do_indexing {
-    my ($record_type, $op, $record_dir, $reset_index, $noshadow, $record_format) = @_;
+    my ($record_type, $op, $record_dir, $reset_index, $noshadow, $skip_commit, $record_format) = @_;
 
     my $zebra_server  = ($record_type eq 'biblio') ? 'biblioserver' : 'authorityserver';
     my $zebra_db_name = ($record_type eq 'biblio') ? 'biblios' : 'authorities';
@@ -483,9 +555,17 @@ sub do_indexing {
     my $zebra_db_dir  = C4::Context->zebraconfig($zebra_server)->{'directory'};
 
     system("zebraidx -c $zebra_config -g $record_format -d $zebra_db_name init") if $reset_index;
-    system("zebraidx -c $zebra_config $noshadow -g $record_format -d $zebra_db_name $op $record_dir");
-    system("zebraidx -c $zebra_config -g $record_format -d $zebra_db_name commit") unless $noshadow;
+    my $index_error = system("zebraidx -c $zebra_config $noshadow -g $record_format -d $zebra_db_name $op $record_dir");
+
+    return if ($index_error);
 
+    my $commit_error = system("zebraidx -c $zebra_config -g $record_format -d $zebra_db_name commit") unless ($noshadow || $skip_commit);
+
+    return if ($commit_error);
+
+    set_index_age($zebra_server, strftime('%Y-%m-%d %H:%M:%S', localtime(time))) unless ($skip_commit); # Indexing errors handled above
+
+    return 1;
 }
 
 sub print_usage {
@@ -522,6 +602,10 @@ Parameters:
                             already exported the records 
                             in a previous run.
 
+    -c                      Skip commit.  Skips committing the
+                            Zebra shadow files to the index, helping
+                            speed.
+
     -noxml                  index from ISO MARC blob
                             instead of MARC XML.  This
                             option is recommended only
@@ -529,7 +613,7 @@ Parameters:
 
     -x                      export and index as xml instead of is02709 (biblios only).
                             use this if you might have records > 99,999 chars,
-							
+                            
     -w                      skip shadow indexing for this batch
 
     -y                      do NOT clear zebraqueue after indexing; normally,
@@ -914,3 +998,29 @@ rank:rank-1
     
 }
 }
+
+sub cleanup {
+    print "====================\n";
+    print "CLEANING\n";
+    print "====================\n";
+    if ($keep_export) {
+        print "NOTHING cleaned : the export $directory has been kept.\n";
+        print "You can re-run this script with the -s ";
+        if ($use_tempdir) {
+            print " and -d $directory parameters";
+        } else {
+            print "parameter";
+        }
+        print "\n";
+        print "if you just want to rebuild zebra after changing the record.abs\n";
+        print "or another zebra config file\n";
+    } else {
+        unless ($use_tempdir) {
+            # if we're using a temporary directory
+            # created by File::Temp, it will be removed
+            # automatically.
+            rmtree($directory, 0, 1);
+            print "directory $directory deleted\n";
+        }
+    }
+}
-- 
1.5.5.GIT




More information about the Koha-patches mailing list