[Koha-patches] [PATCH] [3.0.x] (bug #3172) support stemming for all languages using lingua::stem::snowball

Nahuel ANGELINETTI nahuel.angelinetti at biblibre.com
Tue May 19 17:17:38 CEST 2009


This patch adds a function to get the user's language, and adds support for Lingua::Stem::Snowball in place of Lingua::Stem; Snowball supports a lot of languages and is better.
Stemming is now dynamic, based on the user's language.
---
 C4/Output.pm        |   18 ++++++++++++++++--
 C4/Search.pm        |   26 ++++++++++----------------
 Makefile.PL         |    2 +-
 about.pl            |    2 +-
 catalogue/search.pl |    3 ++-
 opac/opac-search.pl |    3 ++-
 6 files changed, 32 insertions(+), 22 deletions(-)

diff --git a/C4/Output.pm b/C4/Output.pm
index fd32541..53cb6e0 100644
--- a/C4/Output.pm
+++ b/C4/Output.pm
@@ -39,13 +39,13 @@ BEGIN {
     require Exporter;
     @ISA    = qw(Exporter);
 	@EXPORT_OK = qw(&output_ajax_with_http_headers &is_ajax); # More stuff should go here instead
-	%EXPORT_TAGS = ( all =>[qw(&themelanguage &gettemplate setlanguagecookie pagination_bar
+	%EXPORT_TAGS = ( all =>[qw(&themelanguage &gettemplate setlanguagecookie getlanguagecookie pagination_bar
 								&output_ajax_with_http_headers &output_html_with_http_headers)],
 					ajax =>[qw(&output_ajax_with_http_headers is_ajax)],
 					html =>[qw(&output_html_with_http_headers)]
 				);
     push @EXPORT, qw(
-        &themelanguage &gettemplate setlanguagecookie pagination_bar
+        &themelanguage &gettemplate setlanguagecookie getlanguagecookie pagination_bar
     );
     push @EXPORT, qw(
         &output_html_with_http_headers
@@ -201,6 +201,20 @@ sub setlanguagecookie {
     );
 }
 
+# Return the two-letter language prefix for this request, read from the
+# 'KohaOpacLanguage' cookie or, failing that, the Accept-Language header.
+# Returns an empty string when neither source provides a value.
+sub getlanguagecookie {
+    my ($query) = @_;
+
+    # Query the cookie once; fall back to the browser header when it is unset.
+    my $lang = $query->cookie('KohaOpacLanguage') || $ENV{HTTP_ACCEPT_LANGUAGE};
+
+    # Avoid substr() on undef, which would emit an "uninitialized" warning.
+    return '' unless defined $lang;
+    return substr( $lang, 0, 2 );
+}
+
 =item pagination_bar
 
    pagination_bar($base_url, $nb_pages, $current_page, $startfrom_name)
diff --git a/C4/Search.pm b/C4/Search.pm
index 6b3c632..99c8ac0 100644
--- a/C4/Search.pm
+++ b/C4/Search.pm
@@ -20,7 +20,6 @@ require Exporter;
 use C4::Context;
 use C4::Biblio;    # GetMarcFromKohaField
 use C4::Koha;      # getFacets
-use Lingua::Stem;
 use C4::Search::PazPar2;
 use XML::Simple;
 use C4::Dates qw(format_date);
@@ -733,9 +732,11 @@ sub _detect_truncation {
 
 # STEMMING
 sub _build_stemmed_operand {
-    my ($operand) = @_;
+    my ($operand, $lang) = @_;
     my $stemmed_operand;
 
+    require Lingua::Stem::Snowball;
+
     # If operand contains a digit, it is almost certainly an identifier, and should
     # not be stemmed.  This is particularly relevant for ISBNs and ISSNs, which
     # can contain the letter "X" - for example, _build_stemmend_operand would reduce 
@@ -743,20 +744,13 @@ sub _build_stemmed_operand {
     # results (e.g., "23 x 29 cm." from the 300$c).  Bug 2098.
     return $operand if $operand =~ /\d/;
 
-# FIXME: the locale should be set based on the user's language and/or search choice
-    my $stemmer = Lingua::Stem->new( -locale => 'EN-US' );
+    my $stemmer = Lingua::Stem::Snowball->new( lang => $lang,
+                                                encoding => "UTF-8" );
 
-# FIXME: these should be stored in the db so the librarian can modify the behavior
-    $stemmer->add_exceptions(
-        {
-            'and' => 'and',
-            'or'  => 'or',
-            'not' => 'not',
-        }
-    );
     my @words = split( / /, $operand );
-    my $stems = $stemmer->stem(@words);
-    for my $stem (@$stems) {
+    
+    for my $word (@words) {
+        my $stem = $stemmer->stem($word);
         $stemmed_operand .= "$stem";
         $stemmed_operand .= "?"
           unless ( $stem =~ /(and$|or$|not$)/ ) || ( length($stem) < 3 );
@@ -844,7 +838,7 @@ See verbose embedded documentation.
 =cut
 
 sub buildQuery {
-    my ( $operators, $operands, $indexes, $limits, $sort_by, $scan ) = @_;
+    my ( $operators, $operands, $indexes, $limits, $sort_by, $scan, $lang ) = @_;
 
     warn "---------\nEnter buildQuery\n---------" if $DEBUG;
 
@@ -1028,7 +1022,7 @@ sub buildQuery {
 
                 # Handle Stemming
                 my $stemmed_operand;
-                $stemmed_operand = _build_stemmed_operand($operand)
+                $stemmed_operand = _build_stemmed_operand($operand, $lang)
                   if $stemming;
                 warn "STEMMED OPERAND: >$stemmed_operand<" if $DEBUG;
 
diff --git a/Makefile.PL b/Makefile.PL
index a69f2a1..2a651b9 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -559,7 +559,7 @@ WriteMakefile(
                             'HTTP::Request::Common'            => 1.26,
                             'LWP::Simple'                      => 1.41,
                             'LWP::UserAgent'                   => 2.033,
-                            'Lingua::Stem'                     => 0.82,
+                            'Lingua::Stem::Snowball'           => 0.952, # optional
                             'List::Util'                       => 1.18,
                             'List::MoreUtils'                  => 0.21,
                             'Locale::Language'                 => 2.07,
diff --git a/about.pl b/about.pl
index 6e23540..bd68b91 100755
--- a/about.pl
+++ b/about.pl
@@ -89,7 +89,7 @@ HTTP::Request::Common
 HTML::Scrubber
 LWP::Simple
 LWP::UserAgent
-Lingua::Stem
+Lingua::Stem::Snowball
 List::Util
 List::MoreUtils
 Locale::Language
diff --git a/catalogue/search.pl b/catalogue/search.pl
index 902318a..d770557 100755
--- a/catalogue/search.pl
+++ b/catalogue/search.pl
@@ -419,7 +419,8 @@ my ( $error,$query,$simple_query,$query_cgi,$query_desc,$limit,$limit_cgi,$limit
 my @results;
 
 ## I. BUILD THE QUERY
-( $error,$query,$simple_query,$query_cgi,$query_desc,$limit,$limit_cgi,$limit_desc,$stopwords_removed,$query_type) = buildQuery(\@operators,\@operands,\@indexes,\@limits,\@sort_by,$scan);
+my $lang = C4::Output::getlanguagecookie($cgi);    # user language, forwarded to buildQuery for stemming
+( $error,$query,$simple_query,$query_cgi,$query_desc,$limit,$limit_cgi,$limit_desc,$stopwords_removed,$query_type) = buildQuery(\@operators,\@operands,\@indexes,\@limits,\@sort_by,$scan,$lang);
 
 ## parse the query_cgi string and put it into a form suitable for <input>s
 my @query_inputs;
diff --git a/opac/opac-search.pl b/opac/opac-search.pl
index fa26b57..627635a 100755
--- a/opac/opac-search.pl
+++ b/opac/opac-search.pl
@@ -333,7 +333,8 @@ my ($error,$query,$simple_query,$query_cgi,$query_desc,$limit,$limit_cgi,$limit_
 my @results;
 
 ## I. BUILD THE QUERY
-( $error,$query,$simple_query,$query_cgi,$query_desc,$limit,$limit_cgi,$limit_desc,$stopwords_removed,$query_type) = buildQuery(\@operators,\@operands,\@indexes,\@limits,\@sort_by);
+my $lang = C4::Output::getlanguagecookie($cgi);
+( $error,$query,$simple_query,$query_cgi,$query_desc,$limit,$limit_cgi,$limit_desc,$stopwords_removed,$query_type) = buildQuery(\@operators,\@operands,\@indexes,\@limits,\@sort_by, 0, $lang);
 
 sub _input_cgi_parse ($) { 
     my @elements;
-- 
1.6.0.4




More information about the Koha-patches mailing list