[Koha-patches] [PATCH] (bug #3172) support stemming for all languages using lingua::stem::snowball
Nahuel ANGELINETTI
nahuel.angelinetti at biblibre.com
Wed Apr 29 16:38:11 CEST 2009
This patch adds a function to get the user's language, and adds support for Lingua::Stem::Snowball in place of Lingua::Stem; Snowball supports many more languages and is a better stemmer.
Stemming is now selected dynamically, based on the user's language.
---
C4/Output.pm | 18 ++++++++++++++++--
C4/Search.pm | 27 ++++++++++-----------------
Makefile.PL | 2 +-
about.pl | 2 +-
catalogue/search.pl | 3 ++-
opac/opac-search.pl | 3 ++-
6 files changed, 32 insertions(+), 23 deletions(-)
diff --git a/C4/Output.pm b/C4/Output.pm
index c8404dc..9c70b8a 100644
--- a/C4/Output.pm
+++ b/C4/Output.pm
@@ -39,13 +39,13 @@ BEGIN {
require Exporter;
@ISA = qw(Exporter);
@EXPORT_OK = qw(&output_ajax_with_http_headers &is_ajax); # More stuff should go here instead
- %EXPORT_TAGS = ( all =>[qw(&themelanguage &gettemplate setlanguagecookie pagination_bar
+ %EXPORT_TAGS = ( all =>[qw(&themelanguage &gettemplate setlanguagecookie getlanguagecookie pagination_bar
&output_ajax_with_http_headers &output_html_with_http_headers)],
ajax =>[qw(&output_ajax_with_http_headers is_ajax)],
html =>[qw(&output_html_with_http_headers)]
);
push @EXPORT, qw(
- &themelanguage &gettemplate setlanguagecookie pagination_bar
+ &themelanguage &gettemplate setlanguagecookie getlanguagecookie pagination_bar
);
push @EXPORT, qw(
&output_html_with_http_headers
@@ -208,6 +208,20 @@ sub setlanguagecookie {
);
}
+sub getlanguagecookie {
+ my ($query) = @_;
+ my $lang;
+ if ($query->cookie('KohaOpacLanguage')){
+ $lang = $query->cookie('KohaOpacLanguage') ;
+ }else{
+ $lang = $ENV{HTTP_ACCEPT_LANGUAGE};
+
+ }
+ $lang = substr($lang, 0, 2);
+
+ return $lang;
+}
+
=item pagination_bar
pagination_bar($base_url, $nb_pages, $current_page, $startfrom_name)
diff --git a/C4/Search.pm b/C4/Search.pm
index eab50b7..2070542 100644
--- a/C4/Search.pm
+++ b/C4/Search.pm
@@ -21,12 +21,12 @@ require Exporter;
use C4::Context;
use C4::Biblio; # GetMarcFromKohaField
use C4::Koha; # getFacets
-use Lingua::Stem;
use C4::Search::PazPar2;
use XML::Simple;
use C4::Dates qw(format_date);
use C4::XSLT;
use C4::Branch;
+require Lingua::Stem::Snowball;
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $DEBUG);
@@ -736,7 +736,7 @@ sub _detect_truncation {
# STEMMING
sub _build_stemmed_operand {
- my ($operand) = @_;
+ my ($operand, $lang) = @_;
my $stemmed_operand;
# If operand contains a digit, it is almost certainly an identifier, and should
@@ -745,21 +745,14 @@ sub _build_stemmed_operand {
# "014100018X" to "x ", which for a MARC21 database would bring up irrelevant
# results (e.g., "23 x 29 cm." from the 300$c). Bug 2098.
return $operand if $operand =~ /\d/;
+
+ my $stemmer = Lingua::Stem::Snowball->new( lang => $lang,
+ encoding => "UTF-8" );
-# FIXME: the locale should be set based on the user's language and/or search choice
- my $stemmer = Lingua::Stem->new( -locale => 'EN-US' );
-
-# FIXME: these should be stored in the db so the librarian can modify the behavior
- $stemmer->add_exceptions(
- {
- 'and' => 'and',
- 'or' => 'or',
- 'not' => 'not',
- }
- );
my @words = split( / /, $operand );
- my $stems = $stemmer->stem(@words);
- for my $stem (@$stems) {
+
+ for my $word (@words) {
+ my $stem = $stemmer->stem($word);
$stemmed_operand .= "$stem";
$stemmed_operand .= "?"
unless ( $stem =~ /(and$|or$|not$)/ ) || ( length($stem) < 3 );
@@ -847,7 +840,7 @@ See verbose embedded documentation.
=cut
sub buildQuery {
- my ( $operators, $operands, $indexes, $limits, $sort_by, $scan ) = @_;
+ my ( $operators, $operands, $indexes, $limits, $sort_by, $scan, $lang ) = @_;
warn "---------\nEnter buildQuery\n---------" if $DEBUG;
@@ -1038,7 +1031,7 @@ sub buildQuery {
# Handle Stemming
my $stemmed_operand;
- $stemmed_operand = _build_stemmed_operand($operand)
+ $stemmed_operand = _build_stemmed_operand($operand, $lang)
if $stemming;
warn "STEMMED OPERAND: >$stemmed_operand<" if $DEBUG;
diff --git a/Makefile.PL b/Makefile.PL
index 1445690..b3d0e1c 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -562,7 +562,7 @@ WriteMakefile(
'JSON' => 2.07, # Needed by admin/item_circulation_alerts.pl
'LWP::Simple' => 1.41,
'LWP::UserAgent' => 2.033,
- 'Lingua::Stem' => 0.82,
+ 'Lingua::Stem::Snowball' => 0.952, # optional
'List::Util' => 1.18,
'List::MoreUtils' => 0.21,
'Locale::Language' => 2.07,
diff --git a/about.pl b/about.pl
index c106db2..ff42c51 100755
--- a/about.pl
+++ b/about.pl
@@ -94,7 +94,7 @@ HTML::Scrubber
JSON
LWP::Simple
LWP::UserAgent
-Lingua::Stem
+Lingua::Stem::Snowball
List::Util
List::MoreUtils
Locale::Language
diff --git a/catalogue/search.pl b/catalogue/search.pl
index a5beb6d..503ed4f 100755
--- a/catalogue/search.pl
+++ b/catalogue/search.pl
@@ -419,7 +419,8 @@ my ( $error,$query,$simple_query,$query_cgi,$query_desc,$limit,$limit_cgi,$limit
my @results;
## I. BUILD THE QUERY
-( $error,$query,$simple_query,$query_cgi,$query_desc,$limit,$limit_cgi,$limit_desc,$stopwords_removed,$query_type) = buildQuery(\@operators,\@operands,\@indexes,\@limits,\@sort_by,$scan);
+my $lang = C4::Output::getlanguagecookie($cgi);
+( $error,$query,$simple_query,$query_cgi,$query_desc,$limit,$limit_cgi,$limit_desc,$stopwords_removed,$query_type) = buildQuery(\@operators,\@operands,\@indexes,\@limits,\@sort_by,$scan,$lang);
## parse the query_cgi string and put it into a form suitable for <input>s
my @query_inputs;
diff --git a/opac/opac-search.pl b/opac/opac-search.pl
index 4e70857..30641a2 100755
--- a/opac/opac-search.pl
+++ b/opac/opac-search.pl
@@ -334,7 +334,8 @@ my ($error,$query,$simple_query,$query_cgi,$query_desc,$limit,$limit_cgi,$limit_
my @results;
## I. BUILD THE QUERY
-( $error,$query,$simple_query,$query_cgi,$query_desc,$limit,$limit_cgi,$limit_desc,$stopwords_removed,$query_type) = buildQuery(\@operators,\@operands,\@indexes,\@limits,\@sort_by);
+my $lang = C4::Output::getlanguagecookie($cgi);
+( $error,$query,$simple_query,$query_cgi,$query_desc,$limit,$limit_cgi,$limit_desc,$stopwords_removed,$query_type) = buildQuery(\@operators,\@operands,\@indexes,\@limits,\@sort_by, 0, $lang);
sub _input_cgi_parse ($) {
my @elements;
--
1.6.0.4
More information about the Koha-patches
mailing list