[Koha-patches] [PATCH 1/1] Bug 8218 : A script for maintenance to clean any &

Christophe Croullebois christophe.croullebois at biblibre.com
Mon Nov 26 14:03:04 CET 2012


Two subs written by Alex Arnaud in C4/Charset.pm, SanitizeEntity and entity_clean
I had created two scripts for Biblibre, one to be used in cron, it reindexes all transformed records.
Sophie Meynieux brought together the two scripts, she has created a batch mode, usefull but without the reindexing
So I have rewrited it and I have integrated "Fixing copyright headers and license and description"
written by Chris Cormack and "Followup: Perltidy and QA FIX" written by Jonathan Druart
Thx to all
---
 C4/Charset.pm                           |   48 +++++++++
 misc/maintenance/batchSanitizeEntity.pl |  165 +++++++++++++++++++++++++++++++
 2 files changed, 213 insertions(+), 0 deletions(-)
 create mode 100644 misc/maintenance/batchSanitizeEntity.pl

diff --git a/C4/Charset.pm b/C4/Charset.pm
index c294b36..4641bd1 100644
--- a/C4/Charset.pm
+++ b/C4/Charset.pm
@@ -40,6 +40,7 @@ BEGIN {
         SetMarcUnicodeFlag
         StripNonXmlChars
         nsb_clean
+	SanitizeEntity
     );
 }
 
@@ -1158,6 +1159,53 @@ sub char_decode5426 {
   return $result;
 }
 
+=head2 SanitizeEntity
+
+=over 4
+
+SanitizeEntity($marcrecord);
+
+=back
+
+Sanitize Entity
+
+=cut
+
+sub SanitizeEntity {
+    my $record = shift;
+
+    foreach my $field ($record->fields()) {
+        if ($field->is_control_field()) {
+            $field->update(entity_clean($field->data()));
+        } else {
+            my @subfields = $field->subfields();
+            my @new_subfields;
+            foreach my $subfield (@subfields) {
+                push @new_subfields, $subfield->[0] => entity_clean($subfield->[1]);
+            }
+            if (scalar(@new_subfields) > 0) {
+                my $new_field;
+                eval {
+                    $new_field = MARC::Field->new($field->tag(), $field->indicator(1), $field->indicator(2), @new_subfields);
+                };
+                if ($@) {
+                    warn "error : $@";
+                } else {
+                    $field->replace_with($new_field);
+                }
+
+            }
+        }
+    }
+    return $record;
+}
+
+sub entity_clean {
+    my $string=shift;
+    $string=~s/(&)(amp;)+/$1/g;
+    return $string;
+}
+
 1;
 
 
diff --git a/misc/maintenance/batchSanitizeEntity.pl b/misc/maintenance/batchSanitizeEntity.pl
new file mode 100644
index 0000000..7bd5e5f
--- /dev/null
+++ b/misc/maintenance/batchSanitizeEntity.pl
@@ -0,0 +1,165 @@
+#!/usr/bin/perl
+# small script that replaces '&etc.' by '&' in a record
+
+# Copyright 2012 Biblibre SARL
+#
+# This file is part of Koha.
+#
+# Koha is free software; you can redistribute it and/or modify it under the
+# terms of the GNU General Public License as published by the Free Software
+# Foundation; either version 2 of the License, or (at your option) any later
+# version.
+#
+# Koha is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
+# A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with Koha; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+use Modern::Perl;
+use C4::Charset;
+use C4::Context;
+use DBI;
+use C4::Biblio;
+use Getopt::Long;
+use Pod::Usage;
+
+my ( $biblios,$run,$want_help,$batch,$cron );
+my $result = GetOptions(
+    'where=s' => \$biblios,
+    '--run'   => \$run,
+    'help|h'  => \$want_help,
+    'batch|b' => \$batch,
+    'cron|c'  => \$cron,
+);
+
+# We check if required entries are given :
+if ( not $result or $want_help or not $biblios or not $run ) {
+    print_usage();
+    exit 0;
+}
+
+# We initialise some tools :
+my $count;
+my $bibliocount;
+my @bibliofile;
+my @biblionum;
+my @biblios;
+my $record;
+my $cleanrecord;
+
+# We first detect if we have a file or biblos directly entered by command line
+#or if we want to use findAmp() sub
+if ( $biblios eq "search" ) {
+    @biblios = findAmp();
+}
+else {
+    if ( $biblios =~ m|/| ) {
+        open( FILE, "$biblios" ) || die("Can't open $biblios");
+        @bibliofile = <FILE>;
+        close(FILE);
+    }
+    else {
+        @biblionum = split ',', $biblios;
+    }
+    
+    # We take the biblios
+    @biblios = @bibliofile ? @bibliofile : @biblionum;
+}
+
+# We remove spaces
+s/(^\s*|\s*$)//g for @biblios;
+# Remove empty lines
+ at biblios = grep {!/^$/} @biblios;
+
+# We valid the input
+foreach my $biblio (@biblios) {
+    if ( $biblio !~ /^\d+$/ ) {
+        print
+"=============== \"$biblio\" is not a biblionumber !!! ===============\n";
+        print "=============== please verify \"$biblio\" !!! ===============\n";
+        $count++;
+    }
+    exit(1) if $count;
+}
+
+$bibliocount = scalar @biblios;
+
+print
+"=============== $bibliocount Biblio(s) ready to be cleaned ===============\n";
+if ( !$batch or !$cron ) {
+    my $confirm = 0;
+    print "=============== You are ok ? Types yes/no :\n";
+    while ( $confirm == 0 ) {
+        my $prout = <STDIN>;
+        if ($prout eq "yes\n") {
+            $confirm = 1;
+        }
+        elsif ($prout eq "no\n") {
+            print "=============== Ok, bye ===============\n";
+            exit(0);
+        }
+        else {
+            print "======= Bad answer please types 'yes' or 'no' !!!!!!!!!!! ===============\n";
+        }
+            }
+}
+
+foreach my $biblio ( @biblios ) {
+    print "=============== N° $biblio selected ===============\n";
+    $record      = GetMarcBiblio($biblio);
+    $cleanrecord = SanitizeEntity($record);
+    my $frameworkcode = GetFrameworkCode($biblio);
+    ModBiblio( $cleanrecord, $biblio, $frameworkcode );
+    print "=============== Biblio n° $biblio cleaning done ===============\n";
+    $count++;
+}
+
+print "==============================================================\n";
+print "=============== $count Biblios cleaned ===============\n";
+
+if ( $cron ) {
+    print "==============================================================\n";
+    print
+"========= Now we are reindexing -b -v -where \"biblionumber=xxx\" =========\n";
+    print "==============================================================\n";
+    $count = 0;
+    my $kohapath = C4::Context->config('intranetdir');
+    foreach my $biblio ( @biblios ) {
+        system("$kohapath/misc/migration_tools/rebuild_zebra.pl
+        -b -v -where \"biblionumber=$biblio\"");
+        print "========= Biblio n° $biblio re-indexing done =========\n";
+        $count ++;
+}
+    print "=============== $count Biblios re-indexed ===============\n";
+}
+
+sub print_usage {
+    print <<_USAGE_;
+$0: replaces '&' by '&' in a record, you can either give some biblionumbers or a file with biblionumbers or
+
+Parameters:
+    -where                  use this to give biblionumbers in a string with "" (separated by coma)
+                            or an absolute path to a file containing biblionumbers (1 by row)
+                            or the command 'search' that creates an array with biblionumbers with "&amp;..."
+    --run                   run the command
+    --batch or -b           run in batch mode
+    --cron or -c            run in cron mode, it reindexes the changed records (you can redirect the output to a file)
+    --help or -h            show this message.
+_USAGE_
+}
+
+sub findAmp {
+    my @bibliosearch;
+    my $dbh = C4::Context->dbh;
+    my $strsth =
+      qq{SELECT biblionumber FROM biblioitems WHERE marcxml LIKE "%amp;amp;%"};
+    my $sth = $dbh->prepare($strsth);
+    $sth->execute();
+    while ( my $bib = $sth-> fetchrow_array() ) {
+        push @bibliosearch, $bib;
+    }
+    return @bibliosearch;
+}
-- 
1.7.0.4



More information about the Koha-patches mailing list