[Koha-patches] [PATCH] Remove Stopwords bug fixing

henridamien.laurent at biblibre.com henridamien.laurent at biblibre.com
Fri Oct 9 13:49:14 CEST 2009


From: Henri-Damien LAURENT <henridamien.laurent at biblibre.com>

In French, "les" is a stopword.

"Modèles" would wrongly match that stopword because of the combining character in "è", which \P{IsAlnum} did not detect.
---
 C4/Search.pm            |    6 +++++-
 t/db_dependent/Search.t |   28 ++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+), 1 deletions(-)
 create mode 100644 t/db_dependent/Search.t

diff --git a/C4/Search.pm b/C4/Search.pm
index c233931..f600685 100644
--- a/C4/Search.pm
+++ b/C4/Search.pm
@@ -27,6 +27,8 @@ use XML::Simple;
 use C4::Dates qw(format_date);
 use C4::XSLT;
 use C4::Branch;
+use C4::Debug;
+use YAML;
 use URI::Escape;
 
 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $DEBUG);
@@ -633,10 +635,12 @@ sub _remove_stopwords {
 #       we use IsAlpha unicode definition, to deal correctly with diacritics.
 #       otherwise, a French word like "leçon" woudl be split into "le" "çon", "le"
 #       is a stopword, we'd get "çon" and wouldn't find anything...
+#       
 		foreach ( keys %{ C4::Context->stopwords } ) {
 			next if ( $_ =~ /(and|or|not)/ );    # don't remove operators
+			$debug && warn "$_ Dump($operand)";
 			if ( my ($matched) = ($operand =~
-				/(\P{IsAlnum}\Q$_\E\P{IsAlnum}|^\Q$_\E\P{IsAlnum}|\P{IsAlnum}\Q$_\E$|^\Q$_\E$)/gi) )
+				/([^\X\p{isAlnum}]\Q$_\E[^\X\p{isAlnum}]|[^\X\p{isAlnum}]\Q$_\E$|^\Q$_\E[^\X\p{isAlnum}])/gi))
 			{
 				$operand =~ s/\Q$matched\E/ /gi;
 				push @stopwords_removed, $_;
diff --git a/t/db_dependent/Search.t b/t/db_dependent/Search.t
new file mode 100644
index 0000000..00d5b7c
--- /dev/null
+++ b/t/db_dependent/Search.t
@@ -0,0 +1,28 @@
+#!/usr/bin/perl
+#
+# Checks C4::Search::_remove_stopwords (expects French stopwords such as "les"):
+# "Leçon" and "modèles" must come back unchanged; a phrase starting with "Les" must not.
+
+use strict;
+use warnings;
+use YAML;
+
+use C4::Debug;
+use C4::Context;
+use C4::Search;
+
+use Test::More tests => 4;    # use_ok + 2 unchanged strings + 1 changed phrase
+
+BEGIN {
+    use_ok('C4::Search');
+}
+foreach my $string ("Leçon","mod\xc3\xa8les"){
+    my @results=C4::Search::_remove_stopwords($string,"kw");
+    $debug && warn "$string ",Dump(@results);
+    is($results[0],$string,"$string is not modified");
+}
+foreach my $string ("Les chaussettes de l'archiduchesse"){
+    my @results=C4::Search::_remove_stopwords($string,"kw");
+    $debug && warn "$string ",Dump(@results);
+    isnt($results[0],$string,"$results[0] from $string");
+}
-- 
1.6.0.4




More information about the Koha-patches mailing list