[Koha-patches] [PATCH] bug 10729 Add phrases configuration for icu

Colin Campbell colin.campbell at ptfs-europe.com
Wed Aug 14 17:36:42 CEST 2013


Add a separate phrases-icu.xml for phrase indexes.
The file is based on the one distributed with Zebra,
with a couple of additions to reflect Koha usage.

This patch adds a separate tokenizer variable
(ZEBRA_PTOKENIZER_STMT) for phrase indexes so that
default.idx is correctly rewritten for sites using
ICU indexing.
---
 Makefile.PL                     |  4 ++++
 etc/zebradb/etc/default.idx     |  2 +-
 etc/zebradb/etc/phrases-icu.xml | 10 ++++++++++
 rewrite-config.PL               |  1 +
 4 files changed, 16 insertions(+), 1 deletion(-)
 create mode 100644 etc/zebradb/etc/phrases-icu.xml

diff --git a/Makefile.PL b/Makefile.PL
index a61072f..8340e8f 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -558,6 +558,10 @@ $config{ZEBRA_TOKENIZER_STMT} = $config{ZEBRA_TOKENIZER} eq 'icu'
     ? 'icuchain words-icu.xml'
     : 'charmap word-phrase-utf.chr';
 
+$config{ZEBRA_PTOKENIZER_STMT} = $config{ZEBRA_TOKENIZER} eq 'icu'
+    ? 'icuchain phrases-icu.xml'
+    : 'charmap word-phrase-utf.chr';
+
 my %test_suite_override_dirs = (
     KOHA_CONF_DIR  => ['etc'],
     ZEBRA_CONF_DIR => ['etc', 'zebradb'],
diff --git a/etc/zebradb/etc/default.idx b/etc/zebradb/etc/default.idx
index d6314c6..3a70392 100644
--- a/etc/zebradb/etc/default.idx
+++ b/etc/zebradb/etc/default.idx
@@ -18,7 +18,7 @@ __ZEBRA_TOKENIZER_STMT__
 index p
 completeness 1
 firstinfield 1
-__ZEBRA_TOKENIZER_STMT__
+__ZEBRA_PTOKENIZER_STMT__
 
 # URX (URL) index
 # Used if structure=urx (@attr 4=104)
diff --git a/etc/zebradb/etc/phrases-icu.xml b/etc/zebradb/etc/phrases-icu.xml
new file mode 100644
index 0000000..59d415c
--- /dev/null
+++ b/etc/zebradb/etc/phrases-icu.xml
@@ -0,0 +1,10 @@
+<icu_chain locale="">
+  <transform rule="[:Control:] Any-Remove"/>
+  <tokenize rule="s"/>
+  <transform rule="[:Punctuation:] Remove"/>
+  <transform rule="NFD"/>
+  <transform rule="[:Nonspacing Mark:] Remove"/>
+  <transform rule="NFC"/>
+  <display/>
+  <casemap rule="l"/>
+</icu_chain>
diff --git a/rewrite-config.PL b/rewrite-config.PL
index e903d49..f53d402 100644
--- a/rewrite-config.PL
+++ b/rewrite-config.PL
@@ -124,6 +124,7 @@ $prefix = $ENV{'INSTALL_BASE'} || "/usr";
   '__ZEBRA_LANGUAGE__' => 'en',
   '__ZEBRA_TOKENIZER__' => 'chr',
   '__ZEBRA_TOKENIZER_STMT__' => 'charmap word-phrase-utf.chr',
+  '__ZEBRA_PTOKENIZER_STMT__' => 'charmap word-phrase-utf.chr',
   '__ZEBRA_AUTH_CFG__' => 'zebra-authorities.cfg',
   '__ZEBRA_BIB_CFG__' => 'zebra-biblios.cfg',
   '__AUTH_RETRIEVAL_CFG__' => 'retrieval-info-auth-grs1.xml',
-- 
1.8.4.rc2.15.g96cb27a



More information about the Koha-patches mailing list