[Koha-patches] [PATCH 3/6] bug 7818: utility to generate DOM indexing configs

Galen Charlton gmc at esilibrary.com
Mon Mar 26 07:44:44 CEST 2012


misc/maintenance/make_zebra_dom_cfg_from_record_abs:
  generate a DOM filter Zebra index config from a GRS-1 config

Given a Zebra record.abs file containing a set of index definitions for
Zebra's GRS-1 filter, write an equivalent DOM filter configuration.

To generate the XSLT that is to be used by Zebra, run something like
the following on the output of this utility:

xsltproc ZEBRA_CFG_DIR/xsl/koha-indexdefs-to-zebra.xsl \
  biblio-koha-indexdefs.xml \
  > ZEBRA_CFG_DIR/marc_defs/marc21/biblios/biblio-zebra-indexdefs.xsl

The above example assumes that the output of the program was named
biblio-koha-indexdefs.xml.

This commit also introduces Koha::Indexer::Utils, a new package for
miscellaneous routines that support Koha's indexing definitions.

Signed-off-by: Galen Charlton <gmc at esilibrary.com>
---
 Koha/Indexer/Utils.pm                              |  222 ++++++++++++++++++++
 .../maintenance/make_zebra_dom_cfg_from_record_abs |   72 +++++++
 2 files changed, 294 insertions(+), 0 deletions(-)
 create mode 100644 Koha/Indexer/Utils.pm
 create mode 100755 misc/maintenance/make_zebra_dom_cfg_from_record_abs

diff --git a/Koha/Indexer/Utils.pm b/Koha/Indexer/Utils.pm
new file mode 100644
index 0000000..a422104
--- /dev/null
+++ b/Koha/Indexer/Utils.pm
@@ -0,0 +1,222 @@
+package Koha::Indexer::Utils;
+
+# Copyright (c) 2012 Equinox Software, Inc.
+# This file is part of Koha.
+#
+# Koha is free software; you can redistribute it and/or modify it under the
+# terms of the GNU General Public License as published by the Free Software
+# Foundation; either version 2 of the License, or (at your option) any later
+# version.
+#
+# Koha is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
+# A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
+# Suite 330, Boston, MA  02111-1307 USA
+
+use strict;
+use warnings;
+use 5.010;
+
+use XML::LibXML;
+
+=head1 NAME
+
+Koha::Indexer::Utils - utility functions for managing search indexes
+
+=head1 DESCRIPTION
+
+This module contains utility functions for managing various aspects
+of Koha's bibliographic and authority search indexes.
+
+=head1 FUNCTIONS
+
+=cut
+
+=head2 zebra_record_abs_to_dom
+
+$dom_config = Koha::Indexer::Utils::zebra_record_abs_to_dom($record_abs_config, $marcflavour);
+
+Given a string containing the contents of a record.abs configuration file as
+used by Zebra's GRS-1 filter, emit an equivalent DOM configuration.
+
+=cut
+
+our $idxNS = 'http://www.koha-community.org/schemas/index-defs';
+
+sub zebra_record_abs_to_dom {
+    my ($grs1_cfg, $marcflavour) = @_;
+
+    # Split into lines; the -1 limit keeps trailing empty fields so every
+    # input line keeps its position and line numbers stay accurate.
+    chomp $grs1_cfg;
+    my @cfg_lines = split /\n/, $grs1_cfg, -1;
+
+    # Build one hashref per significant record.abs line, remembering the
+    # original text and its 1-based line number.
+    my @grs1_defs;
+    my $lineno = 0;
+    foreach my $cfg_line (@cfg_lines) {
+        $lineno++;
+        next if _can_ignore_grs1_cfg_line($cfg_line);
+        my $def = _parse_grs1_cfg_line($cfg_line);
+        $def->{orig_def} = $cfg_line;
+        $def->{lineno}   = $lineno;
+        push @grs1_defs, $def;
+    }
+
+    # Build the XML document: an index_defs root placed in the $idxNS
+    # namespace, with each definition's converted form appended in order.
+    my $doc = XML::LibXML::Document->new('1.0', 'utf-8');
+    my $root = $doc->createElement('index_defs');
+    $root->setNamespace($idxNS, 'kohaidx');
+    foreach my $def (@grs1_defs) {
+        _append_grs1_def_to_dom_cfg($doc, $root, $def, $marcflavour);
+    }
+
+    # attach the root element and return the document as a string
+    $doc->setDocumentElement($root);
+    return $doc->toString(1);
+}
+
+#
+# bunch of utility functions for zebra_record_abs_to_dom
+#
+sub _can_ignore_grs1_cfg_line {
+    my ($line) = @_;
+    # Blank lines, comments and these other directives are not translated.
+    return 1 if $line =~ /^\s*$/ || $line =~ /^#/;
+    return 1 if $line =~ /^(encoding|name|attset|esetname|marc|systag|xpath)/;
+    # The DOM filter automatically indexes all tokens, so 'all any' lines
+    # in record.abs need no equivalent.
+    return $line =~ /^all/ ? 1 : 0;
+}
+
+sub _parse_grs1_cfg_line {
+    my ($line) = @_;
+
+    # Only melm lines and xelm lines rooted at /record/ are understood;
+    # anything else yields undef.
+    my $spec = $line =~ /^melm\s+(.*)/        ? $1
+             : $line =~ m!^xelm /record/(.*)! ? $1 : undef;
+    return defined $spec ? _parse_xelm_melm($spec) : undef;
+}
+
+sub _parse_xelm_melm {
+    my ($line) = @_;
+
+    # $line is a field spec (tag, optionally "$subfields"), whitespace, then
+    # a comma-separated list of index definitions.
+    my ($field, $index_defs) = split /\s+/, $line, 2;
+    # Either part may be missing; default to '' so nothing below warns.
+    $_ //= '' for $field, $index_defs;
+    # munge fixed field range indicators so their commas survive the split
+    $index_defs =~ s/range\(data,(\d+),(\d+)\)/$1:$2/g;
+    $index_defs =~ s/\s+$//;    # trailing blanks / CR must not leak into index types
+    my ($tag, $subfield) = split /\$/, $field, 2;
+    my @parsed = map { _parse_grs1_index_def($_) } split /\s*,\s*/, $index_defs;
+    return { tag => $tag, subfield => $subfield, index_defs => \@parsed };
+}
+
+sub _parse_grs1_index_def {
+    my ($index_def) = @_;
+
+    # Format is name[:type[:offset[:length]]]; the -1 limit keeps trailing
+    # empty fields, and parts that are absent come back undef.
+    my ($name, $type, $offset, $length) = split /:/, $index_def, -1;
+    return {
+        name       => $name,
+        # if the original index definition didn't specify an index type,
+        # use 'w' -- the DOM filter needs the type specified explicitly
+        index_type => $type // 'w',
+        offset     => $offset,
+        length     => $length,
+    };
+}
+
+sub _append_grs1_def_to_dom_cfg {
+    my ($dom_cfg, $root, $grs1_def, $marcflavour) = @_;
+    my $tag = $grs1_def->{tag};
+
+    # Always record which record.abs line the following output came from.
+    my $comment = $dom_cfg->createComment('record.abs line ' .
+                                          $grs1_def->{lineno} . ': ' .
+                                          $grs1_def->{orig_def});
+    $root->appendChild($comment);
+    return unless defined $tag;    # line was not a recognised melm/xelm
+
+    if (defined $grs1_def->{subfield}) {
+        my $dom_def = $dom_cfg->createElementNS($idxNS, 'index_subfields');
+        $dom_def->setAttribute('tag', $tag);
+        $dom_def->setAttribute('subfields', $grs1_def->{subfield});
+        _append_target_indexes($dom_cfg, $dom_def, $grs1_def);
+        $root->appendChild($dom_def);
+    } elsif ($tag eq 'leader') {
+        _append_grs1_defs_for_leader($dom_cfg, $root, $grs1_def);
+    } elsif ($tag =~ /\A\d+\z/ and $tag < 10) {
+        # control field (00X); the digits-only test keeps tags such as
+        # '9xx' from being numified and mistaken for one
+        _append_grs1_defs_for_control_field($dom_cfg, $root, $grs1_def);
+    } else {
+        # we're indexing an entire variable data field
+        my $dom_def = $dom_cfg->createElementNS($idxNS, 'index_data_field');
+        $dom_def->setAttribute('tag', $tag);
+        _append_target_indexes($dom_cfg, $dom_def, $grs1_def);
+        $root->appendChild($dom_def);
+    }
+}
+
+sub _append_target_indexes {
+    my ($dom_cfg, $dom_def, $grs1_def) = @_;
+
+    # Add one target_index child to $dom_def for every index definition
+    # parsed from the record.abs line, in the order they were listed.
+    my @index_defs = @{ $grs1_def->{index_defs} };
+
+    _append_one_target_index($dom_cfg, $dom_def, $_) for @index_defs;
+}
+
+sub _append_one_target_index {
+    my ($dom_cfg, $dom_def, $index_def) = @_;
+
+    # The element text is "<index name>:<index type>".
+    my $index_name = "$index_def->{name}:$index_def->{index_type}";
+    my $tgt_idx = $dom_cfg->createElementNS($idxNS, 'target_index');
+    $tgt_idx->appendText($index_name);
+    $dom_def->appendChild($tgt_idx);
+}
+
+sub _append_grs1_defs_for_leader {
+    my ($dom_cfg, $root, $grs1_def) = @_;
+
+    # One index_leader element per index; the offset/length attributes are
+    # written only when the definition supplied both.
+    for my $index_def (@{ $grs1_def->{index_defs} }) {
+        my $dom_def = $dom_cfg->createElementNS($idxNS, 'index_leader');
+        if (2 == grep { defined $index_def->{$_} } qw(offset length)) {
+            $dom_def->setAttribute($_, $index_def->{$_}) for 'offset', 'length';
+        }
+        _append_one_target_index($dom_cfg, $dom_def, $index_def);
+        $root->appendChild($dom_def);
+    }
+}
+
+sub _append_grs1_defs_for_control_field {
+    my ($dom_cfg, $root, $grs1_def) = @_;
+
+    # One index_control_field element per index, each carrying the field's
+    # tag; offset/length are written only when the definition supplied both.
+    for my $index_def (@{ $grs1_def->{index_defs} }) {
+        my $dom_def = $dom_cfg->createElementNS($idxNS, 'index_control_field');
+        $dom_def->setAttribute('tag', $grs1_def->{tag});
+        if (2 == grep { defined $index_def->{$_} } qw(offset length)) {
+            $dom_def->setAttribute($_, $index_def->{$_}) for 'offset', 'length';
+        }
+        _append_one_target_index($dom_cfg, $dom_def, $index_def);
+        $root->appendChild($dom_def);
+    }
+}
+
+1;
diff --git a/misc/maintenance/make_zebra_dom_cfg_from_record_abs b/misc/maintenance/make_zebra_dom_cfg_from_record_abs
new file mode 100755
index 0000000..bd954ef
--- /dev/null
+++ b/misc/maintenance/make_zebra_dom_cfg_from_record_abs
@@ -0,0 +1,72 @@
+#!/usr/bin/perl
+
+# Copyright (c) 2012 Equinox Software, Inc.
+# This file is part of Koha.
+#
+# Koha is free software; you can redistribute it and/or modify it under the
+# terms of the GNU General Public License as published by the Free Software
+# Foundation; either version 2 of the License, or (at your option) any later
+# version.
+#
+# Koha is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
+# A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
+# Suite 330, Boston, MA  02111-1307 USA
+
+use strict;
+use warnings;
+use 5.010;
+
+use Koha::Indexer::Utils;
+
+use Getopt::Long;
+
+my ($input_file, $output_file, $want_help);
+my $result = GetOptions(
+    'input:s'      => \$input_file,
+    'output:s'     => \$output_file,
+    'help|h'       => \$want_help,
+);
+
+if ( not $result or $want_help or not defined $input_file or not defined $output_file ) {
+    print_usage();
+    # Only a well-formed request for help is a success; bad or missing options are not.
+    exit( ( $result and $want_help ) ? 0 : 1 );
+}
+
+# Read and convert first, so that a failure leaves any existing output untouched.
+open my $infh,  '<', $input_file or die "$0: cannot open input file $input_file: $!\n";
+my $grs1_cfg = join('', <$infh>);
+close $infh;
+my $dom_cfg = Koha::Indexer::Utils::zebra_record_abs_to_dom($grs1_cfg);
+open my $outfh, '>', $output_file or die "$0: cannot open output file $output_file: $!\n";
+print $outfh $dom_cfg;
+# buffered write errors only surface at close, so check it
+close $outfh or die "$0: cannot write output file $output_file: $!\n";
+
+sub print_usage {    # writes the help text to STDOUT; does not exit
+    print <<_USAGE_;
+$0: generate a DOM filter Zebra index config from a GRS-1 config
+
+Given a Zebra record.abs file containing a set of index definitions for
+Zebra's GRS-1 filter, write an equivalent DOM filter configuration.
+
+To generate the XSLT that is to be used by Zebra, run something like
+the following on the output of this utility:
+
+xsltproc ZEBRA_CFG_DIR/xsl/koha-indexdefs-to-zebra.xsl \\
+  biblio-koha-indexdefs.xml \\
+  > ZEBRA_CFG_DIR/marc_defs/marc21/biblios/biblio-zebra-indexdefs.xsl
+
+The above example assumes that the output of the program was named
+biblio-koha-indexdefs.xml.
+
+Parameters:
+    --input                 input file name
+    --output                output file name
+    --help or -h            show this message
+_USAGE_
+}
-- 
1.7.2.5



More information about the Koha-patches mailing list