[Koha-patches] [PATCH] Script to check for corruption of marcxml, and optionally to attempt fix.

Joe Atzberger joe.atzberger at liblime.com
Mon May 11 23:44:29 CEST 2009


When marcxml cannot be parsed into a MARC::Record object, the biblio is
undisplayable and it obviously breaks many features in Koha.  This script
can test to parse every marcxml, and alert on failures.  Optionally, the
marcxml can be replaced from the marc field.

See extensive perldoc for details.
---
 misc/cronjobs/MARC21_parse_test.pl |  330 ++++++++++++++++++++++++++++++++++++
 1 files changed, 330 insertions(+), 0 deletions(-)
 create mode 100755 misc/cronjobs/MARC21_parse_test.pl

diff --git a/misc/cronjobs/MARC21_parse_test.pl b/misc/cronjobs/MARC21_parse_test.pl
new file mode 100755
index 0000000..2903e33
--- /dev/null
+++ b/misc/cronjobs/MARC21_parse_test.pl
@@ -0,0 +1,330 @@
+#!/usr/bin/perl
+#
+# Copyright 2009 Liblime
+#
+# This file is part of Koha.
+#
+# Koha is free software; you can redistribute it and/or modify it under the
+# terms of the GNU General Public License as published by the Free Software
+# Foundation; either version 2 of the License, or (at your option) any later
+# version.
+#
+# Koha is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
+# A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
+# Suite 330, Boston, MA  02111-1307 USA
+
+use strict;
+use warnings;
+
+use MARC::Record;
+use MARC::File::XML;
+use MARC::File::USMARC;
+
+use open OUT => ':utf8';
+
+use Getopt::Long qw(:config auto_help auto_version);
+use Pod::Usage;
+
+use C4::Biblio;
+use C4::Charset;
+use C4::Context;
+use C4::Debug;
+
+use vars qw($VERSION);
+
+BEGIN {
+    # find Koha's Perl modules
+    # test carefully before changing this
+    use FindBin;
+    eval { require "$FindBin::Bin/../kohalib.pl" };
+    $VERSION = 0.03;
+}
+
+our $debug;
+
+## OPTIONS
+my $help    = 0;
+my $man     = 0;
+my $verbose = 0;
+
+my $limit;      # undef, not zero.
+my $offset  = 0;
+my $dump    = 0;
+my $all     = 0;
+my $summary = 1;
+my $lint    = 0;
+my $fix     = 0;
+my $filename = "/tmp/MARC21_parse_test.$$.marc";
+
+GetOptions(
+       'help|?' => \$help,
+          'man' => \$man,
+      'verbose' => \$verbose,
+      'limit=i' => \$limit,
+     'offset=i' => \$offset,
+     'filename' => \$filename,
+         'All!' => \$all,
+        'Lint!' => \$lint,
+        'dump!' => \$dump,
+     'summary!' => \$summary,
+         'fix!' => \$fix,
+) or pod2usage(2);
+pod2usage( -verbose => 2 ) if ($man);
+pod2usage( -verbose => 2 ) if ($help and $verbose);
+pod2usage(1) if $help;
+
+if ($debug) {
+    $summary++;
+    $verbose++;
+    $lint++;
+}
+
+my $lint_object;
+if ($lint) {
+    require MARC::Lint;
+    $lint_object = new MARC::Lint;
+}
+my $marcflavour = C4::Context->preference('marcflavour') or die "No marcflavour (MARC21 or UNIMARC) set in syspref";
+(uc($marcflavour) eq 'MARC21') or die "Only marcflavour MARC21, not '$marcflavour'";
+
+# my $countq = C4::Context->dbh->prepare("SELECT COUNT(*) FROM biblioitems");    # Too SLOW on large systems
+# $countq->execute; $countq->fetchrow();
+my $max = 999999;   # arbitrary ceiling
+
+$limit or $limit = $max;       # limit becomes max if unspecified
+
+if ($summary) {
+    printf "# Examining marcxml from %s\n", ($all ? 'ALL biblioitems' : 'SELECT biblionumbers');
+    printf "# limit %d, offset %d:\n", $limit, $offset;
+    printf "# MARC::Lint warnings: %s\n", ($lint ? 'ON' : 'OFF');
+    $verbose and print "# Using temp file: $filename\n"
+}
+
+MARC::File::XML->default_record_format($marcflavour) or die "FAILED MARC::File::XML->default_record_format($marcflavour)";
+
+my $query = "SELECT  *  FROM biblioitems ";
+my $recs;
+if ($all) {
+    if ($limit or $offset) {
+        my $limit_clause = sprintf "LIMIT %d, %d", ($offset || 0), ($limit || $max);
+        $query .= $limit_clause;
+    }
+    $verbose and print "# Query: $query\n";
+    $recs = C4::Context->dbh->prepare($query);
+    $recs->execute();
+} else {
+    $query .= "WHERE biblionumber=?";
+    $verbose and print "# Query: $query\n";
+    $recs = C4::Context->dbh->prepare($query);
+    # no execute, we execute per biblionumber
+    print "# Reading biblionumbers from STDIN\n";
+}
+
+sub next_row {
+    $all and return $recs->fetchrow_hashref();  # no WHERE clause, just get it
+    while (my $biblionumber = <>) {
+        chomp($biblionumber);
+        unless (defined $biblionumber) {
+            print "Skipping blank line $.\n";
+            next;
+        } 
+        unless ($biblionumber =~ s/^\s*(\d+)\s*$/$1/ and $biblionumber != 0) {
+            print "Skipping illegal biblionumber: $biblionumber  (line $.)\n";
+            next;
+        }
+        ($verbose > 1) and printf("(%9d) plausible biblionumber\n", $biblionumber);
+        $recs->execute($biblionumber);
+        return $recs->fetchrow_hashref();
+    }
+    return undef;   # just in case
+}
+
+my $ilimit = $limit;
+$ilimit += $offset unless $all;    # increase ilimit for offset.  if $all, then offset is built into query.
+my $i = 0;
+my $found  = 0;
+my $fixed  = 0;
+my $fine   = 0;
+my $failed = 0;
+my $warns  = 0;
+my $printline = 0;
+while ( my $row = next_row() ) {
+    ++$i;
+    unless ($all) {
+        ($i > $ilimit) and last;  # controls for user-input data/files
+        ($i > $offset) or next;
+    }
+    my $xml = $row->{marcxml};
+    my $bibnum_prefix = sprintf "(%9d)", $row->{biblionumber};
+    # $xml now pared down to just the <leader> element
+    $verbose and printf "# %4d of %4d: biblionumber %s\n", ++$printline, $limit, $row->{biblionumber};
+    my $stripped = StripNonXmlChars($xml);
+    ($stripped eq $xml) or printf "$bibnum_prefix: %d NON-XML Characters removed!!\n", (length($xml) - length($stripped));
+    my $record = eval { MARC::Record::new_from_xml( $stripped, 'utf8', $marcflavour ) };
+    if (not $record) {
+        $found++;
+    	my $msg = $@ || '';
+        $verbose or $msg =~ s# at /usr/.*$##gs;    # shorten common error message
+        print "$bibnum_prefix ERROR: $msg\n";
+    } else {
+        $fine++;
+    }
+    if ($lint) {
+        open (FILE, ">$filename") or die "Cannot write to temp file: $filename";
+        print FILE $xml;
+        close FILE;
+        my $file = MARC::File::XML->in( $filename );
+        while ( my $marc = $file->next() ) {    # should be only 1
+            # $marc->field("245") or print "pre check_record 245 check 1: FAIL\n"; use Data::Dumper;  print Dumper($marc);
+            $lint_object->check_record( $marc );
+            if ($lint_object->warnings) {
+                $warns++;
+                print join("\n", map {"$bibnum_prefix $_"} $lint_object->warnings), "\n";
+            }
+        }
+    }
+    if ($fix and not $record) {
+        my $record_from_blob = MARC::Record->new_from_usmarc($row->{marc});
+        unless ($record_from_blob) {
+            print "$bibnum_prefix ERROR: Cannot recover from biblioitems.marc\n";
+            $failed++;
+        } else {
+            my $mod = ModBiblioMarc($record_from_blob, $row->{biblionumber}, '');
+            if ($mod) {
+                $fixed++;  print "$bibnum_prefix FIXED\n";
+            } else {
+                $failed++; print "$bibnum_prefix FAILED from marc.  Manual intervention required.\n";
+            }
+        }
+    }
+    $dump and print $row->{marcxml}, "\n";
+}
+
+(-f $filename) and unlink ($filename);  # remove tempfile
+
+if ($summary) {
+    printf "# Examining marcxml from %s\n", ($all ? 'ALL biblioitems' : 'SELECT biblionumbers');
+    printf "# limit %d, offset %d:\n", $limit, $offset;
+    print "\nRESULTS (number of records)...\n";
+    printf "  %6d -- OK              \n",  $fine;
+    printf "  %6d -- w/ bad marcxml  \n",  $found;
+    printf "  %6d -- w/ MARC::Lint warnings\n", $warns;
+    printf "  %6d -- fixed from marc \n",  $fixed;
+    printf "  %6d -- failed to fix   \n",  $failed;
+}
+
+
+__END__
+
+=head1 NAME
+
+MARC21_parse_test.pl - Try parsing and optionally fixing biblioitems.marcxml, report errors
+
+=head1 SYNOPSIS
+
+MARC21_parse_test.pl [ -h | -m ] [ -v ] [ -d ] [ -s ] [ -l=N ] [ -o=N ] [ -l ] [ -f ] [ -A | filename ...]
+
+ Help Options:
+   -h --help -?   Brief help message
+   -m --man       Full documentation, same as --help --verbose
+      --version   Prints version info
+
+ Feedback Options:
+   -d --dump      Dump MARCXML of biblioitems processed, default OFF
+   -s --summary   Print initial and closing summary of good and bad biblioitems counted, default ON
+   -L --Lint      Show any warnings from MARC::Lint, default OFF
+   -v --verbose   Increase verbosity of output, default OFF
+
+ Run Options:
+   -f --fix       Replace biblioitems.marcxml from data in marc field, default OFF
+   -A --All       Use the whole biblioitems table as target set, default OFF
+   -l --limit     Number of biblioitems to display or fix
+   -o --offset    Number of biblioitems to skip (not displayed or fixed)
+
+=head1 OPTIONS
+
+=over 8
+
+=item B<--All>
+
+Target the entire biblioitems table.
+Beware, on a large table B<--All> can be very costly to performance.
+
+=item B<--fix>
+
+Without this option, no changes to any records are made.  With <--fix>, the script attempts to reconstruct
+biblioitems.marcxml from biblioitems.marc.  
+
+=item B<--limit=N>
+
+Like a LIMIT statement in SQL, this constrains the number of records targeted by the script to an integer N.  
+This applies whether the target records are determined by user input, filenames or <--All>.
+
+=item B<--offset=N>
+
+Like an OFFSET statement in SQL, this tells the script to skip N of the targetted records.
+The default is 0, i.e. skip none of them.
+
+=back
+
+The binary ON/OFF options can be negated like:
+   B<--nosummary>   Do not display summary.
+   B<--nodump>      Do not dump MARCXML.
+   B<--noLint>      Do not show MARC::Lint warnings.
+   B<--nofix>       Do not change any records.  This is the default mode.
+
+=head1 ARGUMENTS
+
+Any number of filepath arguments can be referenced.  They will be read in order and used to select the target
+set of biblioitems.  The file format should be simply one biblionumber per line.  The B<--limit> and B<--offset>
+options can still be used with biblionumbers specified from file.  Files will be ignored under the B<--All> option.
+
+=head1 DESCRIPTION
+
+This checks for data corruption or otherwise unparsable data in biblioitems.marcxml.  
+As the name suggests, this script is only useful for MARC21 and will die for marcflavour UNIMARC.
+
+Run MARC21_parse_test.pl the first time with no options and type in individual biblionumbers to test.
+Or run with B<--All> to go through the entire table.
+Run the script again with B<--fix> to attempt repair of the same target set.
+
+After fixing any records, you will need to rebuild your index, e.g. B<rebuild_zebra -b -r -x>.
+
+=head1 USAGE EXAMPLES
+
+B<MARC21_parse_test.pl>
+
+In the most basic form, allows you to input biblionumbers and checks them individually.
+
+B<MARC21_parse_test.pl --fix>
+
+Same thing but fixes them if they fail to parse.
+
+B<MARC21_parse_test.pl --fix --limit=15 bibnumbers1.txt>
+
+Fixes biblioitems from the first 15 biblionumbers in file bibnumbers1.txt.  Multiple file arguments can be used.
+
+B<MARC21_parse_test.pl --All --limit=3 --offset=15 --nosummary --dump>
+
+Dumps MARCXML from the 16th, 17th and 18th records found in the database.
+
+B<MARC21_parse_test.pl -A -l=3 -o=15 -s=0 -d>
+
+Same thing as previous example in terse form.
+
+=head1 TODO
+
+Add more documentation for OPTIONS.
+
+Update zebra status so rebuild of index is not necessary.
+
+=head1 SEE ALSO
+
+MARC::Lint
+C4::Biblio
+
+=cut
-- 
1.5.6.5




More information about the Koha-patches mailing list