[Koha-patches] [PATCH] DDCN callnumber splitting with test.

Fri Mar 13 15:57:14 CET 2009

Similar to previous patch for LCCN splitting, this patch incorporates
changes to split_ddcn and supplies a test file for verifying proper
operation.  Note that the only previously documented example for intended
operation is included as one of the tests.

This regexps are created to be rather forgiving.  For example, the function
will not choke if two spaces were included where the "spec" (such as it is)
expects one.  Obviously this is because for CN splitting purposes, it doesn't
matter, we're not going to ever split in the middle of whitespace.
---
 C4/Labels.pm          |   48 +++++++++++++++++++++++++++++++++++-------------
 t/Labels_split_ddcn.t |   36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 71 insertions(+), 13 deletions(-)
 create mode 100755 t/Labels_split_ddcn.t

diff --git a/C4/Labels.pm b/C4/Labels.pm
index 36c8809..827ce91 100644
--- a/C4/Labels.pm
+++ b/C4/Labels.pm
@@ -905,6 +905,8 @@ sub deduplicate_batch {
 	return $killed, undef;
 }
 
+our $possible_decimal = qr/\d+(?:\.\d+)?/;
+
 sub split_lccn {
     my ($lccn) = @_;    
     $_ = $lccn;
@@ -915,9 +917,13 @@ sub split_lccn {
         \s*
         (\.*\D+\d*)       # .P6         # .E8
         \s*
-        (.*)              # T44 1983    # H39 1996   # everything else (except any trailing spaces)
+        (.*)              # T44 1983    # H39 1996   # everything else (except any bracketing spaces)
         \s*
         /x;
+    unless (scalar @parts)  {
+        $debug and print STDERR "split_lccn regexp failed to match string: $_\n";
+        push @parts, $_;     # if no match, just push the whole string.
+    }
     push @parts, split /\s+/, pop @parts;   # split the last piece into an arbitrary number of pieces at spaces
     $debug and print STDERR "split_lccn array: ", join(" | ", @parts), "\n";
     return @parts;
@@ -925,19 +931,35 @@ sub split_lccn {
 
 sub split_ddcn {
     my ($ddcn) = @_;
-    $ddcn =~ s/\///g;   # in theory we should be able to simply remove all segmentation markers and arrive at the correct call number...
     $_ = $ddcn;
-    # ddcn example R220.3 H2793Z H32 c.2
-    my @splits = m/^([A-Z]{0,3})                # R (OS, REF, etc. up do three letters)
-                    ([0-9]+\.[0-9]*)            # 220.3
-                    \s?                         # space (not requiring anything beyond the call number)
-                    ([a-zA-Z0-9]*\.?[a-zA-Z0-9])# cutter number... maybe, but if so it is in this position (Z indicates literary criticism)
-                    \s?                         # space if it exists
-                    ([a-zA-Z]*\.?[0-9]*)        # other indicators such as cutter for author of literary criticism in this example if it exists
-                    \s?                         # space if ie exists
-                    ([a-zA-Z]*\.?[0-9]*)        # other indicators such as volume number, copy number, edition date, etc. if it exists
-                    /x;
-    return @splits;
+    s/\///g;   # in theory we should be able to simply remove all segmentation markers and arrive at the correct call number...
+    # ddcn examples: 'R220.3 H2793Z H32 c.2', 'BIO JP2 R5c.1'
+
+    my (@parts) = m/
+        ^([a-zA-Z]+(?:$possible_decimal)?) # R220.3            # BIO   # first example will require extra splitting
+        \s*
+        (.+)                               # H2793Z H32 c.2   # R5c.1   # everything else (except bracketing spaces)
+        \s*
+        /x;
+    unless (scalar @parts)  {
+        $debug and print STDERR "split_ddcn regexp failed to match string: $_\n";
+        push @parts, $_;     # if no match, just push the whole string.
+    }
+
+    if ($parts[ 0] =~ /^([a-zA-Z]+)($possible_decimal)$/) {
+          shift @parts;         # pull off the mathching first element, like example 1
+        unshift @parts, $1, $2; # replace it with the two pieces
+    }
+
+    push @parts, split /\s+/, pop @parts;   # split the last piece into an arbitrary number of pieces at spaces
+
+    if ($parts[-1] =~ /^(.*\d+)(\D.*)$/) {
+         pop @parts;            # pull off the mathching last element, like example 2
+        push @parts, $1, $2;    # replace it with the two pieces
+    }
+
+    $debug and print STDERR "split_ddcn array: ", join(" | ", @parts), "\n";
+    return @parts;
 }
 
 sub split_fcn {
diff --git a/t/Labels_split_ddcn.t b/t/Labels_split_ddcn.t
new file mode 100755
index 0000000..ca879f3
--- /dev/null
+++ b/t/Labels_split_ddcn.t
@@ -0,0 +1,36 @@
+#!/usr/bin/perl
+#
+# for context, see http://bugs.koha.org
+
+use strict;
+use warnings;
+
+use Test::More tests => 52;
+
+BEGIN {
+    use_ok('C4::Labels');
+}
+ok(defined C4::Labels::split_ddcn, 'C4::Labels::split_ddcn defined');
+
+my $ddcns = {
+    'BIO JP2 R5c.1'         => [qw(BIO JP2 R5 c.1 )],
+    'FIC GIR J5c.1'         => [qw(FIC GIR J5 c.1 )],
+    'J DAR G7c.11'          => [qw( J  DAR G7 c.11)],
+    'R220.3 H2793Z H32 c.2' => [qw(R 220.3 H2793Z H32 c.2)],
+};
+
+foreach my $ddcn (sort keys %$ddcns) {
+    my (@parts, @expected);
+    ok($ddcn, "ddcn: $ddcn");
+    ok(@expected = @{$ddcns->{$ddcn}}, "split expected to produce " . scalar(@expected) . " pieces");
+    ok(@parts = C4::Labels::split_ddcn($ddcn), "C4::Labels::split_ddcn($ddcn)");
+    ok(scalar(@expected) == scalar(@parts), sprintf("%d of %d pieces produced", scalar(@parts), scalar(@expected)));
+    my $i = 0;
+    foreach my $unit (@expected) {
+        my $part;
+        ok($part = $parts[$i], "($ddcn)[$i] populated: " . (defined($part) ? $part : 'UNDEF'));
+        ok((defined($part) and $part eq $unit),     "($ddcn)[$i]   matches: $unit");
+        $i++;
+    }
+}
+
-- 
1.5.6.5