[Koha-cvs] koha C4/Biblio.pm C4/Search.pm updater/updateda...

Wed May 2 18:44:31 CEST 2007

CVSROOT:	/sources/koha
Module name:	koha
Changes by:	paul poulain <tipaul>	07/05/02 16:44:31

Modified files:
	C4             : Biblio.pm Search.pm 
	updater        : updatedatabase 
	misc/migration_tools: rebuild_nozebra.pl 

Log message:
	NoZebra SQL index management : 
	* adding 3 subs in Biblio.pm
	- GetNoZebraIndexes, that get the index structure in a new systempreference (added with this commit)
	- _DelBiblioNoZebra, that retrieve all index entries for a biblio and remove in a variable the biblio reference
	- _AddBiblioNoZebra, that add index entries for a biblio.
	Note that the 2 _Add and _Del subs work only in a hash variable, to speed up things in case of a modif (ie : delete+add). The effective SQL update is done in the ModZebra sub (that existed before, and dealed with zebra index).
	I think the code has to be more deeply tested, but it works at least partially.

CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/koha/C4/Biblio.pm?cvsroot=koha&r1=1.201&r2=1.202
http://cvs.savannah.gnu.org/viewcvs/koha/C4/Search.pm?cvsroot=koha&r1=1.134&r2=1.135
http://cvs.savannah.gnu.org/viewcvs/koha/updater/updatedatabase?cvsroot=koha&r1=1.162&r2=1.163
http://cvs.savannah.gnu.org/viewcvs/koha/misc/migration_tools/rebuild_nozebra.pl?cvsroot=koha&r1=1.2&r2=1.3

Patches:
Index: C4/Biblio.pm
===================================================================
RCS file: /sources/koha/koha/C4/Biblio.pm,v
retrieving revision 1.201
retrieving revision 1.202
diff -u -b -r1.201 -r1.202

--- C4/Biblio.pm	27 Apr 2007 14:00:49 -0000	1.201
+++ C4/Biblio.pm	2 May 2007 16:44:31 -0000	1.202
@@ -33,7 +33,7 @@
 use vars qw($VERSION @ISA @EXPORT);
 
 # set the version for version checking
-$VERSION = do { my @v = '$Revision: 1.201 $' =~ /\d+/g; shift(@v).".".join( "_", map { sprintf "%03d", $_ } @v ); };
+$VERSION = do { my @v = '$Revision: 1.202 $' =~ /\d+/g; shift(@v).".".join( "_", map { sprintf "%03d", $_ } @v ); };
 
 @ISA = qw( Exporter );
 
@@ -113,6 +113,7 @@
   &TransformHtmlToXml
   &PrepareItemrecordDisplay
   &char_decode
+  &GetNoZebraIndexes
 );
 
 =head1 NAME
@@ -607,7 +608,11 @@
     }
     return $error if $error;
 
-    # Delete in Zebra
+    # Delete in Zebra. Be careful NOT to move this line after _koha_delete_biblio
+    # for at least 2 reasons :
+    # - we need to read the biblio if NoZebra is set (to remove it from the indexes
+    # - if something goes wrong, the biblio may be deleted from Koha but not from zebra
+    #   and we would have no way to remove it (except manually in zebra, but I bet it would be very hard to handle the problem)
     ModZebra($biblionumber,"delete_record","biblioserver");
 
     # delete biblio from Koha tables and save in deletedbiblio
@@ -2705,97 +2710,254 @@
 # replaced by a zebraqueue table, that is filled with ModZebra to run.
 # the table is emptied by misc/cronjobs/zebraqueue_start.pl script
 
-my $sth=$dbh->prepare("insert into zebraqueue  (biblio_auth_number ,server,operation) values(?,?,?)");
-$sth->execute($biblionumber,$server,$op);
-$sth->finish;
-
-#
-#     my @Zconnbiblio;
-#     my $tried     = 0;
-#     my $recon     = 0;
-#     my $reconnect = 0;
-#     my $record;
-#     my $shadow;
-# 
-#   reconnect:
-#     $Zconnbiblio[0] = C4::Context->Zconn( $server, 0, 1 );
-# 
-#     if ( $server eq "biblioserver" ) {
-# 
-#         # it's unclear to me whether this should be in xml or MARC format
-#         # but it is clear it should be nabbed from zebra rather than from
-#         # the koha tables
-#         $record = GetMarcBiblio($biblionumber);
-#         $record = $record->as_xml_record() if $record;
-# #            warn "RECORD $biblionumber => ".$record;
-#         $shadow="biblioservershadow";
-# 
-#         #           warn "RECORD $biblionumber => ".$record;
-#         $shadow = "biblioservershadow";
-# 
-#     }
-#     elsif ( $server eq "authorityserver" ) {
-#         $record = C4::AuthoritiesMarc::XMLgetauthority( $dbh, $biblionumber );
-#         $shadow = "authorityservershadow";
-#     }    ## Add other servers as necessary
-# 
-#     my $Zpackage = $Zconnbiblio[0]->package();
-#     $Zpackage->option( action => $op );
-#     $Zpackage->option( record => $record );
-# 
-#   retry:
-#     $Zpackage->send("update");
-#     my $i;
-#     my $event;
-# 
-#     while ( ( $i = ZOOM::event( \@Zconnbiblio ) ) != 0 ) {
-#         $event = $Zconnbiblio[0]->last_event();
-#         last if $event == ZOOM::Event::ZEND;
-#     }
-# 
-#     my ( $error, $errmsg, $addinfo, $diagset ) = $Zconnbiblio[0]->error_x();
-#     if ( $error == 10000 && $reconnect == 0 )
-#     {    ## This is serious ZEBRA server is not available -reconnect
-#         warn "problem with zebra server connection";
-#         $reconnect = 1;
-#         my $res = system('sc start "Z39.50 Server" >c:/zebraserver/error.log');
-# 
-#         #warn "Trying to restart ZEBRA Server";
-#         #goto "reconnect";
-#     }
-#     elsif ( $error == 10007 && $tried < 2 )
-#     {    ## timeout --another 30 looonng seconds for this update
-#         $tried = $tried + 1;
-#         warn "warn: timeout, trying again";
-#         goto "retry";
-#     }
-#     elsif ( $error == 10004 && $recon == 0 ) {    ##Lost connection -reconnect
-#         $recon = 1;
-#         warn "error: reconnecting to zebra";
-#         goto "reconnect";
-# 
-#    # as a last resort, we save the data to the filesystem to be indexed in batch
-#     }
-#     elsif ($error) {
-#         warn
-# "Error-$server   $op $biblionumber /errcode:, $error, /MSG:,$errmsg,$addinfo \n";
-#         $Zpackage->destroy();
-#         $Zconnbiblio[0]->destroy();
-#         ModZebrafiles( $dbh, $biblionumber, $record, $op, $server );
-#         return;
-#     }
-#     if ( C4::Context->$shadow ) {
-#         $Zpackage->send('commit');
-#         while ( ( $i = ZOOM::event( \@Zconnbiblio ) ) != 0 ) {
-# 
-#             #waiting zebra to finish;
-#          }
-#     }
-#     $Zpackage->destroy();
+    if (C4::Context->preference("NoZebra")) {
+        # lock the nozebra table : we will read index lines, update them in Perl process
+        # and write everything in 1 transaction.
+        # lock the table to avoid someone else overwriting what we are doing
+        $dbh->do('LOCK TABLES nozebra WRITE,biblio WRITE,biblioitems WRITE, systempreferences WRITE');
+        my %result; # the result hash that will be builded by deletion / add, and written on mySQL at the end, to improve speed
+        my $record= GetMarcBiblio($biblionumber);
+        if ($op eq 'specialUpdate') {
+            # OK, we have to add or update the record
+            # 1st delete (virtually, in indexes) ...
+            %result = _DelBiblioNoZebra($biblionumber,$record);
+            # ... add the record
+            %result=_AddBiblioNoZebra($biblionumber,$record, %result);
+        } else {
+            # it's a deletion, delete the record...
+            %result=_DelBiblioNoZebra($biblionumber,$record);
+        }
+        # ok, now update the database...
+        my $sth = $dbh->prepare("UPDATE nozebra SET biblionumbers=? WHERE indexname=? AND value=?");
+        foreach my $key (keys %result) {
+            foreach my $index (keys %{$result{$key}}) {
+                $sth->execute($result{$key}->{$index},$key,$index);
+            }
+        }
+    $dbh->do('UNLOCK TABLES');
+
+    } else {
+    #
+    # we use zebra, just fill zebraqueue table
+    #
+    my $sth=$dbh->prepare("insert into zebraqueue  (biblio_auth_number ,server,operation) values(?,?,?)");
+    $sth->execute($biblionumber,$server,$op);
+    $sth->finish;
+    }
+}
+
+=head2 GetNoZebraIndexes
+
+=cut
+
+sub GetNoZebraIndexes {
+    my $index = C4::Context->preference('NoZebraIndexes');
+    my %indexes;
+    foreach my $line (split /('|"),/,$index) {
+        $line =~ /(.*)=>(.*)/;
+        my $index = substr($1,1); # get the index, don't forget to remove initial ' or "
+        my $fields = $2;
+        $index =~ s/'|"| //g;
+        $fields =~ s/'|"| //g;
+        $indexes{$index}=$fields;
+    }
+    return %indexes;
 }
 
 =head1 INTERNAL FUNCTIONS
 
+=head2 _DelBiblioNoZebra($biblionumber,$record);
+
+    function to delete a biblio in NoZebra indexes
+    This function does NOT delete anything in database : it reads all the indexes entries
+    that have to be deleted & delete them in the hash
+    The SQL part is done either :
+    - after the Add if we are modifying a biblio (delete + add again)
+    - immediatly after this sub if we are doing a true deletion.
+
+=cut
+
+
+sub _DelBiblioNoZebra {
+    my ($biblionumber,$record)=@_;
+    # Get the indexes
+    my $dbh = C4::Context->dbh;
+    # Get the indexes
+    my %index=GetNoZebraIndexes;
+    # get title of the record (to store the 10 first letters with the index)
+    my ($titletag,$titlesubfield) = GetMarcFromKohaField('biblio.title');
+    my $title = lc($record->subfield($titletag,$titlesubfield));
+    
+    my %result;
+    # remove blancks comma (that could cause problem when decoding the string for CQL retrieval) and regexp specific values
+    $title =~ s/ |,|;|\[|\]|\(|\)|\*|-|'|=//g;
+    # limit to 10 char, should be enough, and limit the DB size
+    $title = substr($title,0,10);
+    #parse each field
+    my $sth2=$dbh->prepare('SELECT biblionumbers FROM nozebra WHERE indexname=? AND value=?');
+    foreach my $field ($record->fields()) {
+        #parse each subfield
+        next if $field->tag <10;
+        foreach my $subfield ($field->subfields()) {
+            my $tag = $field->tag();
+            my $subfieldcode = $subfield->[0];
+            my $indexed=0;
+            # check each index to see if the subfield is stored somewhere
+            # otherwise, store it in __RAW__ index
+            foreach my $key (keys %index) {
+#                 warn "examining $key index : ".$index{$key}." for $tag $subfieldcode";
+                if ($index{$key} =~ /$tag\*/ or $index{$key} =~ /$tag$subfieldcode/) {
+                    $indexed=1;
+                    my $line= lc $subfield->[1];
+                    # remove meaningless value in the field...
+                    $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\/|=/ /g;
+                    # ... and split in words
+                    foreach (split / /,$line) {
+                        next unless $_; # skip  empty values (multiple spaces)
+                        # if the entry is already here, do nothing, the biblionumber has already be removed
+                        unless ($result{$key}->{$_} =~ /$biblionumber,$title\-(\d);/) {
+                            # get the index value if it exist in the nozebra table and remove the entry, otherwise, do nothing
+                            $sth2->execute($key,$_);
+                            my $existing_biblionumbers = $sth2->fetchrow;
+                            # it exists
+                            if ($existing_biblionumbers) {
+                                warn " existing for $key $_: $existing_biblionumbers";
+                                $result{$key}->{$_} =$existing_biblionumbers;
+                                $result{$key}->{$_} =~ s/$biblionumber,$title\-(\d);//;
+                            }
+                        }
+                    }
+                }
+            }
+            # the subfield is not indexed, store it in __RAW__ index anyway
+            unless ($indexed) {
+                my $line= lc $subfield->[1];
+                $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\/|=/ /g;
+                # ... and split in words
+                foreach (split / /,$line) {
+                    next unless $_; # skip  empty values (multiple spaces)
+                    # if the entry is already here, do nothing, the biblionumber has already be removed
+                    unless ($result{'__RAW__'}->{$_} =~ /$biblionumber,$title\-(\d);/) {
+                        # get the index value if it exist in the nozebra table and remove the entry, otherwise, do nothing
+                        $sth2->execute('__RAW__',$_);
+                        my $existing_biblionumbers = $sth2->fetchrow;
+                        # it exists
+                        if ($existing_biblionumbers) {
+                            warn " existing for __RAW__ $_ : $existing_biblionumbers";
+                            $result{'__RAW__'}->{$_} =$existing_biblionumbers;
+                            $result{'__RAW__'}->{$_} =~ s/$biblionumber,$title\-(\d);//;
+                        }
+                    }
+                }
+            }
+        }
+    }
+    return %result;
+}
+
+=head2 _DelBiblioNoZebra($biblionumber,$record);
+
+    function to delete a biblio in NoZebra indexes
+
+=cut
+
+
+sub _AddBiblioNoZebra {
+    my ($biblionumber,$record,%result)=@_;
+    my $dbh = C4::Context->dbh;
+    # Get the indexes
+    my %index=GetNoZebraIndexes;
+    # get title of the record (to store the 10 first letters with the index)
+    my ($titletag,$titlesubfield) = GetMarcFromKohaField('biblio.title');
+    my $title = lc($record->subfield($titletag,$titlesubfield));
+
+    # remove blancks comma (that could cause problem when decoding the string for CQL retrieval) and regexp specific values
+    $title =~ s/ |,|;|\[|\]|\(|\)|\*|-|'|=//g;
+    # limit to 10 char, should be enough, and limit the DB size
+    $title = substr($title,0,10);
+    #parse each field
+    my $sth2=$dbh->prepare('SELECT biblionumbers FROM nozebra WHERE indexname=? AND value=?');
+    foreach my $field ($record->fields()) {
+        #parse each subfield
+        next if $field->tag <10;
+        foreach my $subfield ($field->subfields()) {
+            my $tag = $field->tag();
+            my $subfieldcode = $subfield->[0];
+            my $indexed=0;
+            # check each index to see if the subfield is stored somewhere
+            # otherwise, store it in __RAW__ index
+            foreach my $key (keys %index) {
+#                 warn "examining $key index : ".$index{$key}." for $tag $subfieldcode";
+                if ($index{$key} =~ /$tag\*/ or $index{$key} =~ /$tag$subfieldcode/) {
+                    $indexed=1;
+                    my $line= lc $subfield->[1];
+                    # remove meaningless value in the field...
+                    $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\/|=/ /g;
+                    # ... and split in words
+                    foreach (split / /,$line) {
+                        next unless $_; # skip  empty values (multiple spaces)
+                        # if the entry is already here, improve weight
+#                         warn "managing $_";
+                        if ($result{$key}->{$_} =~ /$biblionumber,$title\-(\d);/) {
+                            my $weight=$1+1;
+                            $result{$key}->{$_} =~ s/$biblionumber,$title\-(\d);//;
+                            $result{$key}->{$_} .= "$biblionumber,$title-$weight;";
+                        } else {
+                            # get the value if it exist in the nozebra table, otherwise, create it
+                            $sth2->execute($key,$_);
+                            my $existing_biblionumbers = $sth2->fetchrow;
+                            # it exists
+                            if ($existing_biblionumbers) {
+                                warn" existing : $existing_biblionumbers";
+                                $result{$key}->{$_} =$existing_biblionumbers;
+                                my $weight=$1+1;
+                                $result{$key}->{$_} =~ s/$biblionumber,$title\-(\d);//;
+                                $result{$key}->{$_} .= "$biblionumber,$title-$weight;";
+                            # create a new ligne for this entry
+                            } else {
+                                $dbh->do('INSERT INTO nozebra SET indexname='.$dbh->quote($key).',value='.$dbh->quote($_));
+                                $result{$key}->{$_}.="$biblionumber,$title-1;";
+                            }
+                        }
+                    }
+                }
+            }
+            # the subfield is not indexed, store it in __RAW__ index anyway
+            unless ($indexed) {
+                my $line= lc $subfield->[1];
+                $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\/|=/ /g;
+                # ... and split in words
+                foreach (split / /,$line) {
+                    next unless $_; # skip  empty values (multiple spaces)
+                    # if the entry is already here, improve weight
+                    if ($result{'__RAW__'}->{$_} =~ /$biblionumber,$title\-(\d);/) {
+                        my $weight=$1+1;
+                        $result{'__RAW__'}->{$_} =~ s/$biblionumber,$title\-(\d);//;
+                        $result{'__RAW__'}->{$_} .= "$biblionumber,$title-$weight;";
+                    } else {
+                        # get the value if it exist in the nozebra table, otherwise, create it
+                        $sth2->execute('__RAW__',$_);
+                        my $existing_biblionumbers = $sth2->fetchrow;
+                        # it exists
+                        if ($existing_biblionumbers) {
+                            $result{'__RAW__'}->{$_} =$existing_biblionumbers;
+                            my $weight=$1+1;
+                            $result{'__RAW__'}->{$_} =~ s/$biblionumber,$title\-(\d);//;
+                            $result{'__RAW__'}->{$_} .= "$biblionumber,$title-$weight;";
+                        # create a new ligne for this entry
+                        } else {
+                            $dbh->do('INSERT INTO nozebra SET indexname="__RAW__",value='.$dbh->quote($_));
+                            $result{'__RAW__'}->{$_}.="$biblionumber,$title-1;";
+                        }
+                    }
+                }
+            }
+        }
+    }
+    return %result;
+}
+
+
 =head2 MARCitemchange
 
 =over 4
@@ -3505,7 +3667,7 @@
 # deal with UNIMARC field 100 (encoding) : create it if needed & set encoding to unicode
     if ( $encoding eq "UNIMARC" ) {
         my $string;
-        if ( $record->subfield( 100, "a" ) ) {
+        if ( length($record->subfield( 100, "a" )) == 35 ) {
             $string = $record->subfield( 100, "a" );
             my $f100 = $record->field(100);
             $record->delete_field($f100);
@@ -3689,8 +3851,17 @@
 
 =cut
 
-# $Id: Biblio.pm,v 1.201 2007/04/27 14:00:49 hdl Exp $
+# $Id: Biblio.pm,v 1.202 2007/05/02 16:44:31 tipaul Exp $
 # $Log: Biblio.pm,v $
+# Revision 1.202  2007/05/02 16:44:31  tipaul
+# NoZebra SQL index management :
+# * adding 3 subs in Biblio.pm
+# - GetNoZebraIndexes, that get the index structure in a new systempreference (added with this commit)
+# - _DelBiblioNoZebra, that retrieve all index entries for a biblio and remove in a variable the biblio reference
+# - _AddBiblioNoZebra, that add index entries for a biblio.
+# Note that the 2 _Add and _Del subs work only in a hash variable, to speed up things in case of a modif (ie : delete+add). The effective SQL update is done in the ModZebra sub (that existed before, and dealed with zebra index).
+# I think the code has to be more deeply tested, but it works at least partially.
+#
 # Revision 1.201  2007/04/27 14:00:49  hdl
 # Removing $dbh from GetMarcFromKohaField (dbh is not used in this function.)
 #

Index: C4/Search.pm
===================================================================
RCS file: /sources/koha/koha/C4/Search.pm,v
retrieving revision 1.134
retrieving revision 1.135
diff -u -b -r1.134 -r1.135
--- C4/Search.pm	2 May 2007 11:57:11 -0000	1.134
+++ C4/Search.pm	2 May 2007 16:44:31 -0000	1.135
@@ -25,7 +25,7 @@
 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
 
 # set the version for version checking
-$VERSION = do { my @v = '$Revision: 1.134 $' =~ /\d+/g;
+$VERSION = do { my @v = '$Revision: 1.135 $' =~ /\d+/g;
     shift(@v) . "." . join( "_", map { sprintf "%03d", $_ } @v );
 };
 
@@ -1454,26 +1454,21 @@
             $title =~ /(.*)-(\d)/;
             # get weight 
             my $ranking =$2;
-            # hint : the result is sorted by title.biblionumber because we can have X biblios with the same title
-            # and we don't want to get only 1 result for each of them !!!
             # note that we + the ranking because ranking is calculated on weight of EACH term requested.
             # if we ask for "two towers", and "two" has weight 2 in biblio N, and "towers" has weight 4 in biblio N
             # biblio N has ranking = 6
-            $count_ranking{$biblionumber}=0 unless $count_ranking{$biblionumber};
             $count_ranking{$biblionumber} =+ $ranking;
         }
         # build the result by "inverting" the count_ranking hash
         # hing : as usual, we don't order by ranking only, to avoid having only 1 result for each rank. We build an hash on concat(ranking,biblionumber) instead
 #         warn "counting";
         foreach (keys %count_ranking) {
-            warn "$_ =".sprintf("%10d",$count_ranking{$_}).'-'.$_;
             $result{sprintf("%10d",$count_ranking{$_}).'-'.$_} = $_;
         }
         # sort the hash and return the same structure as GetRecords (Zebra querying)
         my $result_hash;
         my $numbers=0;
             foreach my $key (sort {$b <=> $a} (keys %result)) {
-            warn "KEY : $key = ".$result{$key};
                 $result_hash->{'RECORDS'}[$numbers++] = $result{$key};
             }
         # for the requested page, replace biblionumber by the complete record

Index: updater/updatedatabase
===================================================================
RCS file: /sources/koha/koha/updater/updatedatabase,v
retrieving revision 1.162
retrieving revision 1.163
diff -u -b -r1.162 -r1.163
--- updater/updatedatabase	30 Apr 2007 16:16:50 -0000	1.162
+++ updater/updatedatabase	2 May 2007 16:44:31 -0000	1.163
@@ -1,6 +1,6 @@
 #!/usr/bin/perl
 
-# $Id: updatedatabase,v 1.162 2007/04/30 16:16:50 tipaul Exp $
+# $Id: updatedatabase,v 1.163 2007/05/02 16:44:31 tipaul Exp $
 
 # Database Updater
 # This script checks for required updates to the database.
@@ -530,6 +530,14 @@
             explanation         => 'Active this if you want NOT to use zebra (large libraries should avoid this parameters)',
             type                => 'YesNo',
         },
+        {
+            uniquefieldrequired => 'variable',
+            variable            => 'NoZebraIndexes',
+            value               => '0',
+            forceupdate         => {'explanation' => 1, 'type' => 1},
+            explanation         => "Enter a specific hash for NoZebra indexes. Enter : 'indexname' => '100a,245a,500*','index2' => '...'",
+            type                => 'Free',
+        },
     ],
     userflags => [
         {
@@ -1993,6 +2001,15 @@
 exit;
 
 # $Log: updatedatabase,v $
+# Revision 1.163  2007/05/02 16:44:31  tipaul
+# NoZebra SQL index management :
+# * adding 3 subs in Biblio.pm
+# - GetNoZebraIndexes, that get the index structure in a new systempreference (added with this commit)
+# - _DelBiblioNoZebra, that retrieve all index entries for a biblio and remove in a variable the biblio reference
+# - _AddBiblioNoZebra, that add index entries for a biblio.
+# Note that the 2 _Add and _Del subs work only in a hash variable, to speed up things in case of a modif (ie : delete+add). The effective SQL update is done in the ModZebra sub (that existed before, and dealed with zebra index).
+# I think the code has to be more deeply tested, but it works at least partially.
+#
 # Revision 1.162  2007/04/30 16:16:50  tipaul
 # bugfix for updatedatabase : when there is no default value (NULL fields) + removing bibliothesaurus table+adding NoZebra systempref (False by default)
 #

Index: misc/migration_tools/rebuild_nozebra.pl
===================================================================
RCS file: /sources/koha/koha/misc/migration_tools/rebuild_nozebra.pl,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -b -r1.2 -r1.3
--- misc/migration_tools/rebuild_nozebra.pl	2 May 2007 11:57:11 -0000	1.2
+++ misc/migration_tools/rebuild_nozebra.pl	2 May 2007 16:44:31 -0000	1.3
@@ -14,7 +14,7 @@
 $|=1; # flushes output
 
 # limit for database dumping
-my $limit;# = "LIMIT 1000";
+my $limit = "LIMIT 100";
 my $directory;
 my $skip_export;
 my $keep_export;
@@ -47,22 +47,7 @@
 my $i=0;
 my %result;
 
-my %index = (
-    'title' => '200a,200c,200d,200e,225a,225d,225e,225f,225h,225i,225v,500*,501*,503*,510*,512*,513*,514*,515*,516*,517*,518*,519*,520*,530*,531*,532*,540*,541*,545*,604t,610t,605a',
-    'author' =>'200f,600a,601a,604a,700a,700b,700c,700d,700a,701b,701c,701d,702a,702b,702c,702d,710a,710b,710c,710d,711a,711b,711c,711d,712a,712b,712c,712d',
-    'isbn' => '010a',
-    'issn' => '011a',
-    'biblionumber' =>'0909',
-    'itemtype' => '200b',
-    'language' => '010a',
-    'publisher' => '210x',
-    'date' => '210d',
-    'note' => '300a,301a,302a,303a,304a,305a,306az,307a,308a,309a,310a,311a,312a,313a,314a,315a,316a,317a,318a,319a,320a,321a,322a,323a,324a,325a,326a,327a,328a,330a,332a,333a,336a,337a,345a',
-    'Koha-Auth-Number' => '6009,6019,6029,6039,6049,6059,6069,6109',
-    'subject' => '600*,601*,606*,610*',
-    'dewey' => '676a',
-    'host-item' => '995a,995c',
-    );
+my %index = GetNoZebraIndexes();
 
 $|=1;
 while (my ($biblionumber) = $sth->fetchrow) {
@@ -71,14 +56,11 @@
     my $record = GetMarcBiblio($biblionumber);
 
     # get title of the record (to store the 10 first letters with the index)
-    my $title;
-    if (C4::Context->preference('marcflavour') eq 'UNIMARC') {
-        $title = lc($record->subfield('200','a'));
-    } else {
-        $title = lc($record->subfield('245','a'));
-    }
+    my ($titletag,$titlesubfield) = GetMarcFromKohaField('biblio.title');
+    my $title = lc($record->subfield($titletag,$titlesubfield));
+
     # remove blancks comma (that could cause problem when decoding the string for CQL retrieval) and regexp specific values
-    $title =~ s/ |,|;|\[|\]|\(|\)|\*//g;
+    $title =~ s/ |,|;|\[|\]|\(|\)|\*|-|'|=//g;
     # limit to 10 char, should be enough, and limit the DB size
     $title = substr($title,0,10);
     #parse each field
@@ -92,16 +74,20 @@
             # check each index to see if the subfield is stored somewhere
             # otherwise, store it in __RAW__ index
             foreach my $key (keys %index) {
-                if ($index{$key} =~ /$tag\*/ or $index{$key} =~ /$tag$subfield/) {
+                if ($index{$key} =~ /$tag\*/ or $index{$key} =~ /$tag$subfieldcode/) {
                     $indexed=1;
                     my $line= lc $subfield->[1];
-                    $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\// /g;
+                    # remove meaningless value in the field...
+                    $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\/|=/ /g;
+                    # ... and split in words
                     foreach (split / /,$line) {
-                        # see if the entry is already here
+                        next unless $_; # skip  empty values (multiple spaces)
+                        # if the entry is already here, improve weight
                         if ($result{$key}->{$_} =~ /$biblionumber,$title\-(\d);/) {
                             my $weight=$1+1;
                             $result{$key}->{$_} =~ s/$biblionumber,$title\-(\d);//;
                             $result{$key}->{$_} .= "$biblionumber,$title-$weight;";
+                        # otherwise, create it, with weight=1
                         } else {
                             $result{$key}->{$_}.="$biblionumber,$title-1;";
                         }
@@ -111,8 +97,9 @@
             # the subfield is not indexed, store it in __RAW__ index anyway
             unless ($indexed) {
                 my $line= lc $subfield->[1];
-                $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\// /g;
+                $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\/|=/ /g;
                 foreach (split / /,$line) {
+                        next unless $_;
 #                     warn $record->as_formatted."$_ =>".$title;
                         if ($result{__RAW__}->{$_} =~ /$biblionumber,$title\-(\d);/) {
                             my $weight=$1+1;
@@ -131,8 +118,8 @@
 foreach my $key (keys %result) {
     foreach my $index (keys %{$result{$key}}) {
         $sth->execute($key,$index,$result{$key}->{$index});
-        if (length($result{$key}->{$index}) >40000) {
-            print length($result{$key}->{$index})." for $key / $index";
+        if (length($result{$key}->{$index}) > 40000) {
+            print length($result{$key}->{$index})."\n for $key / $index\n";
         }
     }
 }