[Koha-cvs] koha C4/Search.pm misc/migration_tools/rebuild_...

Wed May 2 13:57:11 CEST 2007

CVSROOT:	/sources/koha
Module name:	koha
Changes by:	paul poulain <tipaul>	07/05/02 11:57:11

Modified files:
	C4             : Search.pm 
	misc/migration_tools: rebuild_nozebra.pl 

Log message:
	improving NOzebra search : 
	- changing the nozebra table to store entries as biblionumber,title-ranking; (";" is the entry separator). Now, if a value occurs several times in an index, it is stored only once, with a higher ranking (the ranking is the number of times the word appeared for this index)
	- improving search to use a ranking value (now the default order). The ranking of a biblio is the sum of its rankings for all the search terms. The list is ordered by ranking+title, from highest to lowest

CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/koha/C4/Search.pm?cvsroot=koha&r1=1.133&r2=1.134
http://cvs.savannah.gnu.org/viewcvs/koha/misc/migration_tools/rebuild_nozebra.pl?cvsroot=koha&r1=1.1&r2=1.2

Patches:
Index: C4/Search.pm
===================================================================
RCS file: /sources/koha/koha/C4/Search.pm,v
retrieving revision 1.133
retrieving revision 1.134
diff -u -b -r1.133 -r1.134

--- C4/Search.pm	30 Apr 2007 14:29:21 -0000	1.133
+++ C4/Search.pm	2 May 2007 11:57:11 -0000	1.134
@@ -25,7 +25,7 @@
 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
 
 # set the version for version checking
-$VERSION = do { my @v = '$Revision: 1.133 $' =~ /\d+/g;
+$VERSION = do { my @v = '$Revision: 1.134 $' =~ /\d+/g;
     shift(@v) . "." . join( "_", map { sprintf "%03d", $_ } @v );
 };
 
@@ -1225,11 +1225,11 @@
                 }
                 # do a AND with existing list if there is one, otherwise, use the biblionumbers list as 1st result list
                 if ($results) {
-                    my @leftresult = split /,/, $biblionumbers;
+                    my @leftresult = split /;/, $biblionumbers;
                     my $temp;
                     foreach (@leftresult) {
-                        if ($results =~ "$_,") {
-                            $temp .= "$_,$_,";
+                        if ($results =~ "$_;") {
+                            $temp .= "$_;$_;";
                         }
                     }
                     $results = $temp;
@@ -1253,8 +1253,8 @@
                     my @leftresult = split /,/, $biblionumbers;
                     my $temp;
                     foreach (@leftresult) {
-                        if ($results =~ "$_,") {
-                            $temp .= "$_,$_,";
+                        if ($results =~ "$_;") {
+                            $temp .= "$_;$_;";
                         }
                     }
                     $results = $temp;
@@ -1270,7 +1270,7 @@
 sub NZorder {
     my ($biblionumbers, $ordering,$results_per_page,$offset) = @_;
     # order title asc by default
-    $ordering = '1=36 <i' unless $ordering;
+#     $ordering = '1=36 <i' unless $ordering;
     $results_per_page=20 unless $results_per_page;
     $offset = 0 unless $offset;
     my $dbh = C4::Context->dbh;
@@ -1282,8 +1282,8 @@
         my %popularity;
         # popularity is not in MARC record, it's builded from a specific query
         my $sth = $dbh->prepare("select sum(issues) from items where biblionumber=?");
-        foreach (split /,/,$biblionumbers) {
-            my ($biblionumber,$title) = split /;/,$_;
+        foreach (split /;/,$biblionumbers) {
+            my ($biblionumber,$title) = split /,/,$_;
             $result{$biblionumber}=GetMarcBiblio($biblionumber);
             $sth->execute($biblionumber);
             my $popularity= $sth->fetchrow ||0;
@@ -1314,8 +1314,8 @@
     #
     } elsif ($ordering eq '1=1003 <i'){
         my %result;
-        foreach (split /,/,$biblionumbers) {
-            my ($biblionumber,$title) = split /;/,$_;
+        foreach (split /;/,$biblionumbers) {
+            my ($biblionumber,$title) = split /,/,$_;
             my $record=GetMarcBiblio($biblionumber);
             my $author;
             if (C4::Context->preference('marcflavour') eq 'UNIMARC') {
@@ -1349,8 +1349,8 @@
     #
     } elsif ($ordering eq '1=20 <i'){
         my %result;
-        foreach (split /,/,$biblionumbers) {
-            my ($biblionumber,$title) = split /;/,$_;
+        foreach (split /;/,$biblionumbers) {
+            my ($biblionumber,$title) = split /,/,$_;
             my $record=GetMarcBiblio($biblionumber);
             my $callnumber;
             my ($callnumber_tag,$callnumber_subfield)=GetMarcFromKohaField($dbh,'items.itemcallnumber');
@@ -1382,8 +1382,8 @@
         return $finalresult;
     } elsif ($ordering =~ /1=31/){ #pub year
         my %result;
-        foreach (split /,/,$biblionumbers) {
-            my ($biblionumber,$title) = split /;/,$_;
+        foreach (split /;/,$biblionumbers) {
+            my ($biblionumber,$title) = split /,/,$_;
             my $record=GetMarcBiblio($biblionumber);
             my ($publicationyear_tag,$publicationyear_subfield)=GetMarcFromKohaField($dbh,'biblioitems.publicationyear');
             my $publicationyear=$record->subfield($publicationyear_tag,$publicationyear_subfield);
@@ -1410,13 +1410,11 @@
     #
     # ORDER BY title
     #
-    } else { 
+    } elsif ($ordering =~ /1=36/) { 
         # the title is in the biblionumbers string, so we just need to build a hash, sort it and return
         my %result;
-#         splice(@X,$results_per_page*(1+$offset));
-#         splice(@X,0,$results_per_page*$offset);
-        foreach (split /,/,$biblionumbers) {
-            my ($biblionumber,$title) = split /;/,$_;
+        foreach (split /;/,$biblionumbers) {
+            my ($biblionumber,$title) = split /,/,$_;
             # hint : the result is sorted by title.biblionumber because we can have X biblios with the same title
             # and we don't want to get only 1 result for each of them !!!
             # hint & speed improvement : we can order without reading the record
@@ -1444,8 +1442,52 @@
         $result_hash->{'hits'} = $numbers;
         $finalresult->{'biblioserver'} = $result_hash;
         return $finalresult;
+    } else {
+    #
+    # order by ranking
+    #
+        # we need 2 hashes to order by ranking : the 1st one to count the ranking, the 2nd to order by ranking
+        my %result;
+        my %count_ranking;
+        foreach (split /;/,$biblionumbers) {
+            my ($biblionumber,$title) = split /,/,$_;
+            $title =~ /(.*)-(\d)/;
+            # get weight 
+            my $ranking =$2;
+            # hint : the result is sorted by title.biblionumber because we can have X biblios with the same title
+            # and we don't want to get only 1 result for each of them !!!
+            # note that we + the ranking because ranking is calculated on weight of EACH term requested.
+            # if we ask for "two towers", and "two" has weight 2 in biblio N, and "towers" has weight 4 in biblio N
+            # biblio N has ranking = 6
+            $count_ranking{$biblionumber}=0 unless $count_ranking{$biblionumber};
+            $count_ranking{$biblionumber} =+ $ranking;
+        }
+        # build the result by "inverting" the count_ranking hash
+        # hing : as usual, we don't order by ranking only, to avoid having only 1 result for each rank. We build an hash on concat(ranking,biblionumber) instead
+#         warn "counting";
+        foreach (keys %count_ranking) {
+            warn "$_ =".sprintf("%10d",$count_ranking{$_}).'-'.$_;
+            $result{sprintf("%10d",$count_ranking{$_}).'-'.$_} = $_;
+        }
+        # sort the hash and return the same structure as GetRecords (Zebra querying)
+        my $result_hash;
+        my $numbers=0;
+            foreach my $key (sort {$b <=> $a} (keys %result)) {
+            warn "KEY : $key = ".$result{$key};
+                $result_hash->{'RECORDS'}[$numbers++] = $result{$key};
+            }
+        # for the requested page, replace biblionumber by the complete record
+        # speed improvement : avoid reading too much things
+        for (my $counter=$offset;$counter<=$offset+$results_per_page;$counter++) {
+            $result_hash->{'RECORDS'}[$counter] = GetMarcBiblio($result_hash->{'RECORDS'}[$counter])->as_usmarc;
+        }
+        my $finalresult=();
+        $result_hash->{'hits'} = $numbers;
+        $finalresult->{'biblioserver'} = $result_hash;
+        return $finalresult;
     }
 }
+
 END { }    # module clean-up code here (global destructor)
 
 1;

Index: misc/migration_tools/rebuild_nozebra.pl
===================================================================
RCS file: /sources/koha/koha/misc/migration_tools/rebuild_nozebra.pl,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -b -r1.1 -r1.2
--- misc/migration_tools/rebuild_nozebra.pl	25 Apr 2007 16:26:42 -0000	1.1
+++ misc/migration_tools/rebuild_nozebra.pl	2 May 2007 11:57:11 -0000	1.2
@@ -14,7 +14,7 @@
 $|=1; # flushes output
 
 # limit for database dumping
-my $limit = "LIMIT 1000";
+my $limit;# = "LIMIT 1000";
 my $directory;
 my $skip_export;
 my $keep_export;
@@ -32,6 +32,14 @@
 
 $directory = "export" unless $directory;
 my $dbh=C4::Context->dbh;
+$dbh->do("update systempreferences set value=1 where variable='NoZebra'");
+$dbh->do("CREATE TABLE `nozebra` (
+                `indexname` varchar(40) character set latin1 NOT NULL,
+                `value` varchar(250) character set latin1 NOT NULL,
+                `biblionumbers` longtext character set latin1 NOT NULL,
+                KEY `indexname` (`indexname`),
+                KEY `value` (`value`))
+                ENGINE=InnoDB DEFAULT CHARSET=utf8");
 $dbh->do("truncate nozebra");
 my $sth;
 $sth=$dbh->prepare("select biblionumber from biblioitems order by biblionumber $limit");
@@ -40,8 +48,20 @@
 my %result;
 
 my %index = (
-    'title' => '200a,200c,200d',
-    'author' =>'200f,700*,701*,702*'
+    'title' => '200a,200c,200d,200e,225a,225d,225e,225f,225h,225i,225v,500*,501*,503*,510*,512*,513*,514*,515*,516*,517*,518*,519*,520*,530*,531*,532*,540*,541*,545*,604t,610t,605a',
+    'author' =>'200f,600a,601a,604a,700a,700b,700c,700d,700a,701b,701c,701d,702a,702b,702c,702d,710a,710b,710c,710d,711a,711b,711c,711d,712a,712b,712c,712d',
+    'isbn' => '010a',
+    'issn' => '011a',
+    'biblionumber' =>'0909',
+    'itemtype' => '200b',
+    'language' => '010a',
+    'publisher' => '210x',
+    'date' => '210d',
+    'note' => '300a,301a,302a,303a,304a,305a,306az,307a,308a,309a,310a,311a,312a,313a,314a,315a,316a,317a,318a,319a,320a,321a,322a,323a,324a,325a,326a,327a,328a,330a,332a,333a,336a,337a,345a',
+    'Koha-Auth-Number' => '6009,6019,6029,6039,6049,6059,6069,6109',
+    'subject' => '600*,601*,606*,610*',
+    'dewey' => '676a',
+    'host-item' => '995a,995c',
     );
 
 $|=1;
@@ -57,8 +77,8 @@
     } else {
         $title = lc($record->subfield('245','a'));
     }
-    # remove blancks and comma (that could cause problem when decoding the string for CQL retrieval
-    $title =~ s/ |,|;//g;
+    # remove blancks comma (that could cause problem when decoding the string for CQL retrieval) and regexp specific values
+    $title =~ s/ |,|;|\[|\]|\(|\)|\*//g;
     # limit to 10 char, should be enough, and limit the DB size
     $title = substr($title,0,10);
     #parse each field
@@ -77,7 +97,14 @@
                     my $line= lc $subfield->[1];
                     $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\// /g;
                     foreach (split / /,$line) {
-                        $result{$key}->{$_}.="$biblionumber;$title," unless $subfield->[0] eq '9';
+                        # see if the entry is already here
+                        if ($result{$key}->{$_} =~ /$biblionumber,$title\-(\d);/) {
+                            my $weight=$1+1;
+                            $result{$key}->{$_} =~ s/$biblionumber,$title\-(\d);//;
+                            $result{$key}->{$_} .= "$biblionumber,$title-$weight;";
+                        } else {
+                            $result{$key}->{$_}.="$biblionumber,$title-1;";
+                        }
                     }
                 }
             }
@@ -86,7 +113,15 @@
                 my $line= lc $subfield->[1];
                 $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\// /g;
                 foreach (split / /,$line) {
-                    $result{'__RAW__'}->{$_}.="$biblionumber;$title," unless $subfield->[0] eq '9';
+#                     warn $record->as_formatted."$_ =>".$title;
+                        if ($result{__RAW__}->{$_} =~ /$biblionumber,$title\-(\d);/) {
+                            my $weight=$1+1;
+#                             $weight++;
+                            $result{__RAW__}->{$_} =~ s/$biblionumber,$title\-(\d);//;
+                            $result{__RAW__}->{$_} .= "$biblionumber,$title-$weight;";
+                        } else {
+                            $result{__RAW__}->{$_}.="$biblionumber,$title-1;";
+                        }
                 }
             }
         }
@@ -96,5 +131,8 @@
 foreach my $key (keys %result) {
     foreach my $index (keys %{$result{$key}}) {
         $sth->execute($key,$index,$result{$key}->{$index});
+        if (length($result{$key}->{$index}) >40000) {
+            print length($result{$key}->{$index})." for $key / $index";
+        }
     }
 }