[Koha-patches] [PATCH] [3.0.x] (bug #3172) support stemming for all languages using lingua::stem::snowball
Nahuel ANGELINETTI
nahuel.angelinetti at biblibre.com
Tue May 19 17:17:38 CEST 2009
This patch adds a function to get the user's language, and adds support for Lingua::Stem::Snowball in place of Lingua::Stem; Snowball supports many more languages and is better.
Stemming is now chosen dynamically, based on the user's language.
---
C4/Output.pm | 18 ++++++++++++++++--
C4/Search.pm | 26 ++++++++++----------------
Makefile.PL | 2 +-
about.pl | 2 +-
catalogue/search.pl | 3 ++-
opac/opac-search.pl | 3 ++-
6 files changed, 32 insertions(+), 22 deletions(-)
diff --git a/C4/Output.pm b/C4/Output.pm
index fd32541..53cb6e0 100644
--- a/C4/Output.pm
+++ b/C4/Output.pm
@@ -39,13 +39,13 @@ BEGIN {
require Exporter;
@ISA = qw(Exporter);
@EXPORT_OK = qw(&output_ajax_with_http_headers &is_ajax); # More stuff should go here instead
- %EXPORT_TAGS = ( all =>[qw(&themelanguage &gettemplate setlanguagecookie pagination_bar
+ %EXPORT_TAGS = ( all =>[qw(&themelanguage &gettemplate setlanguagecookie getlanguagecookie pagination_bar
&output_ajax_with_http_headers &output_html_with_http_headers)],
ajax =>[qw(&output_ajax_with_http_headers is_ajax)],
html =>[qw(&output_html_with_http_headers)]
);
push @EXPORT, qw(
- &themelanguage &gettemplate setlanguagecookie pagination_bar
+ &themelanguage &gettemplate setlanguagecookie getlanguagecookie pagination_bar
);
push @EXPORT, qw(
&output_html_with_http_headers
@@ -201,6 +201,20 @@ sub setlanguagecookie {
);
}
+# getlanguagecookie($query)
+#
+# Returns the first two characters of the user's language preference.
+# The 'KohaOpacLanguage' cookie takes precedence; when it is missing or
+# empty, the HTTP_ACCEPT_LANGUAGE environment variable is used instead.
+# Returns an empty string when neither source is available.
+#
+# $query must provide a cookie() method (presumably a CGI query object --
+# confirm against callers).
+sub getlanguagecookie {
+    my ($query) = @_;
+
+    # Look the cookie up once instead of once to test and once to read.
+    my $lang = $query->cookie('KohaOpacLanguage');
+
+    # A missing or empty cookie falls through to the browser's header.
+    $lang = $ENV{HTTP_ACCEPT_LANGUAGE} unless $lang;
+
+    # Neither source may be set; do not call substr() on undef, which
+    # would emit an "uninitialized value" warning.
+    return '' unless defined $lang;
+
+    return substr( $lang, 0, 2 );
+}
+
=item pagination_bar
pagination_bar($base_url, $nb_pages, $current_page, $startfrom_name)
diff --git a/C4/Search.pm b/C4/Search.pm
index 6b3c632..99c8ac0 100644
--- a/C4/Search.pm
+++ b/C4/Search.pm
@@ -20,7 +20,6 @@ require Exporter;
use C4::Context;
use C4::Biblio; # GetMarcFromKohaField
use C4::Koha; # getFacets
-use Lingua::Stem;
use C4::Search::PazPar2;
use XML::Simple;
use C4::Dates qw(format_date);
@@ -733,9 +732,11 @@ sub _detect_truncation {
# STEMMING
sub _build_stemmed_operand {
- my ($operand) = @_;
+ my ($operand, $lang) = @_;
my $stemmed_operand;
+ require Lingua::Stem::Snowball;
+
# If operand contains a digit, it is almost certainly an identifier, and should
# not be stemmed. This is particularly relevant for ISBNs and ISSNs, which
# can contain the letter "X" - for example, _build_stemmend_operand would reduce
@@ -743,20 +744,13 @@ sub _build_stemmed_operand {
# results (e.g., "23 x 29 cm." from the 300$c). Bug 2098.
return $operand if $operand =~ /\d/;
-# FIXME: the locale should be set based on the user's language and/or search choice
- my $stemmer = Lingua::Stem->new( -locale => 'EN-US' );
+ my $stemmer = Lingua::Stem::Snowball->new( lang => $lang,
+ encoding => "UTF-8" );
-# FIXME: these should be stored in the db so the librarian can modify the behavior
- $stemmer->add_exceptions(
- {
- 'and' => 'and',
- 'or' => 'or',
- 'not' => 'not',
- }
- );
my @words = split( / /, $operand );
- my $stems = $stemmer->stem(@words);
- for my $stem (@$stems) {
+
+ for my $word (@words) {
+ my $stem = $stemmer->stem($word);
$stemmed_operand .= "$stem";
$stemmed_operand .= "?"
unless ( $stem =~ /(and$|or$|not$)/ ) || ( length($stem) < 3 );
@@ -844,7 +838,7 @@ See verbose embedded documentation.
=cut
sub buildQuery {
- my ( $operators, $operands, $indexes, $limits, $sort_by, $scan ) = @_;
+ my ( $operators, $operands, $indexes, $limits, $sort_by, $scan, $lang ) = @_;
warn "---------\nEnter buildQuery\n---------" if $DEBUG;
@@ -1028,7 +1022,7 @@ sub buildQuery {
# Handle Stemming
my $stemmed_operand;
- $stemmed_operand = _build_stemmed_operand($operand)
+ $stemmed_operand = _build_stemmed_operand($operand, $lang)
if $stemming;
warn "STEMMED OPERAND: >$stemmed_operand<" if $DEBUG;
diff --git a/Makefile.PL b/Makefile.PL
index a69f2a1..2a651b9 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -559,7 +559,7 @@ WriteMakefile(
'HTTP::Request::Common' => 1.26,
'LWP::Simple' => 1.41,
'LWP::UserAgent' => 2.033,
- 'Lingua::Stem' => 0.82,
+ 'Lingua::Stem::Snowball' => 0.952, # optional
'List::Util' => 1.18,
'List::MoreUtils' => 0.21,
'Locale::Language' => 2.07,
diff --git a/about.pl b/about.pl
index 6e23540..bd68b91 100755
--- a/about.pl
+++ b/about.pl
@@ -89,7 +89,7 @@ HTTP::Request::Common
HTML::Scrubber
LWP::Simple
LWP::UserAgent
-Lingua::Stem
+Lingua::Stem::Snowball
List::Util
List::MoreUtils
Locale::Language
diff --git a/catalogue/search.pl b/catalogue/search.pl
index 902318a..d770557 100755
--- a/catalogue/search.pl
+++ b/catalogue/search.pl
@@ -419,7 +419,8 @@ my ( $error,$query,$simple_query,$query_cgi,$query_desc,$limit,$limit_cgi,$limit
my @results;
## I. BUILD THE QUERY
-( $error,$query,$simple_query,$query_cgi,$query_desc,$limit,$limit_cgi,$limit_desc,$stopwords_removed,$query_type) = buildQuery(\@operators,\@operands,\@indexes,\@limits,\@sort_by,$scan);
+# Resolve the user's language; buildQuery hands it to the stemmer.
+my $lang = C4::Output::getlanguagecookie($cgi);
+( $error,$query,$simple_query,$query_cgi,$query_desc,$limit,$limit_cgi,$limit_desc,$stopwords_removed,$query_type) = buildQuery(\@operators,\@operands,\@indexes,\@limits,\@sort_by,$scan,$lang);
## parse the query_cgi string and put it into a form suitable for <input>s
my @query_inputs;
diff --git a/opac/opac-search.pl b/opac/opac-search.pl
index fa26b57..627635a 100755
--- a/opac/opac-search.pl
+++ b/opac/opac-search.pl
@@ -333,7 +333,8 @@ my ($error,$query,$simple_query,$query_cgi,$query_desc,$limit,$limit_cgi,$limit_
my @results;
## I. BUILD THE QUERY
-( $error,$query,$simple_query,$query_cgi,$query_desc,$limit,$limit_cgi,$limit_desc,$stopwords_removed,$query_type) = buildQuery(\@operators,\@operands,\@indexes,\@limits,\@sort_by);
+my $lang = C4::Output::getlanguagecookie($cgi);
+( $error,$query,$simple_query,$query_cgi,$query_desc,$limit,$limit_cgi,$limit_desc,$stopwords_removed,$query_type) = buildQuery(\@operators,\@operands,\@indexes,\@limits,\@sort_by, 0, $lang);
sub _input_cgi_parse ($) {
my @elements;
--
1.6.0.4
More information about the Koha-patches
mailing list