From 6215956dffa4ee8490ae0dfa49a58aafa057a7c2 Mon Sep 17 00:00:00 2001
From: Joe Atzberger <joe.atzberger@liblime.com>
Date: Fri, 13 Mar 2009 09:57:14 -0500
Subject: [PATCH] DDCN callnumber splitting with test.

Similar to previous patch for LCCN splitting, this patch incorporates
changes to split_ddcn and supplies a test file for verifying proper
operation.  Note that the only previously documented example for intended
operation is included as one of the tests.

This regexps are created to be rather forgiving.  For example, the function
will not choke if two spaces were included where the "spec" (such as it is)
expects one.  Obviously this is because for CN splitting purposes, it doesn't
matter, we're not going to ever split in the middle of whitespace.

Signed-off-by: Galen Charlton <galen.charlton@liblime.com>
---
 C4/Labels.pm          | 48 +++++++++++++++++++++++++++++++------------
 t/Labels_split_ddcn.t | 36 ++++++++++++++++++++++++++++++++
 2 files changed, 71 insertions(+), 13 deletions(-)
 create mode 100755 t/Labels_split_ddcn.t

diff --git a/C4/Labels.pm b/C4/Labels.pm
index 2eb7b41b21..5fc15d678f 100644
--- a/C4/Labels.pm
+++ b/C4/Labels.pm
@@ -949,6 +949,8 @@ sub deduplicate_batch {
 	return $killed, undef;
 }
 
+our $possible_decimal = qr/\d+(?:\.\d+)?/;
+
 sub split_lccn {
     my ($lccn) = @_;    
     $_ = $lccn;
@@ -959,9 +961,13 @@ sub split_lccn {
         \s*
         (\.*\D+\d*)       # .P6         # .E8
         \s*
-        (.*)              # T44 1983    # H39 1996   # everything else (except any trailing spaces)
+        (.*)              # T44 1983    # H39 1996   # everything else (except any bracketing spaces)
         \s*
         /x;
+    unless (scalar @parts)  {
+        $debug and print STDERR "split_lccn regexp failed to match string: $_\n";
+        push @parts, $_;     # if no match, just push the whole string.
+    }
     push @parts, split /\s+/, pop @parts;   # split the last piece into an arbitrary number of pieces at spaces
     $debug and print STDERR "split_lccn array: ", join(" | ", @parts), "\n";
     return @parts;
@@ -969,19 +975,35 @@ sub split_lccn {
 
 sub split_ddcn {
     my ($ddcn) = @_;
-    $ddcn =~ s/\///g;   # in theory we should be able to simply remove all segmentation markers and arrive at the correct call number...
     $_ = $ddcn;
-    # ddcn example R220.3 H2793Z H32 c.2
-    my @splits = m/^([A-Z]{0,3})                # R (OS, REF, etc. up do three letters)
-                    ([0-9]+\.[0-9]*)            # 220.3
-                    \s?                         # space (not requiring anything beyond the call number)
-                    ([a-zA-Z0-9]*\.?[a-zA-Z0-9])# cutter number... maybe, but if so it is in this position (Z indicates literary criticism)
-                    \s?                         # space if it exists
-                    ([a-zA-Z]*\.?[0-9]*)        # other indicators such as cutter for author of literary criticism in this example if it exists
-                    \s?                         # space if ie exists
-                    ([a-zA-Z]*\.?[0-9]*)        # other indicators such as volume number, copy number, edition date, etc. if it exists
-                    /x;
-    return @splits;
+    s/\///g;   # in theory we should be able to simply remove all segmentation markers and arrive at the correct call number...
+    # ddcn examples: 'R220.3 H2793Z H32 c.2', 'BIO JP2 R5c.1'
+
+    my (@parts) = m/
+        ^([a-zA-Z]+(?:$possible_decimal)?) # R220.3            # BIO   # first example will require extra splitting
+        \s*
+        (.+)                               # H2793Z H32 c.2   # R5c.1   # everything else (except bracketing spaces)
+        \s*
+        /x;
+    unless (scalar @parts)  {
+        $debug and print STDERR "split_ddcn regexp failed to match string: $_\n";
+        push @parts, $_;     # if no match, just push the whole string.
+    }
+
+    if ($parts[ 0] =~ /^([a-zA-Z]+)($possible_decimal)$/) {
+          shift @parts;         # pull off the mathching first element, like example 1
+        unshift @parts, $1, $2; # replace it with the two pieces
+    }
+
+    push @parts, split /\s+/, pop @parts;   # split the last piece into an arbitrary number of pieces at spaces
+
+    if ($parts[-1] =~ /^(.*\d+)(\D.*)$/) {
+         pop @parts;            # pull off the mathching last element, like example 2
+        push @parts, $1, $2;    # replace it with the two pieces
+    }
+
+    $debug and print STDERR "split_ddcn array: ", join(" | ", @parts), "\n";
+    return @parts;
 }
 
 sub split_fcn {
diff --git a/t/Labels_split_ddcn.t b/t/Labels_split_ddcn.t
new file mode 100755
index 0000000000..ca879f372e
--- /dev/null
+++ b/t/Labels_split_ddcn.t
@@ -0,0 +1,36 @@
+#!/usr/bin/perl
+#
+# for context, see http://bugs.koha.org
+
+use strict;
+use warnings;
+
+use Test::More tests => 52;
+
+BEGIN {
+    use_ok('C4::Labels');
+}
+ok(defined C4::Labels::split_ddcn, 'C4::Labels::split_ddcn defined');
+
+my $ddcns = {
+    'BIO JP2 R5c.1'         => [qw(BIO JP2 R5 c.1 )],
+    'FIC GIR J5c.1'         => [qw(FIC GIR J5 c.1 )],
+    'J DAR G7c.11'          => [qw( J  DAR G7 c.11)],
+    'R220.3 H2793Z H32 c.2' => [qw(R 220.3 H2793Z H32 c.2)],
+};
+
+foreach my $ddcn (sort keys %$ddcns) {
+    my (@parts, @expected);
+    ok($ddcn, "ddcn: $ddcn");
+    ok(@expected = @{$ddcns->{$ddcn}}, "split expected to produce " . scalar(@expected) . " pieces");
+    ok(@parts = C4::Labels::split_ddcn($ddcn), "C4::Labels::split_ddcn($ddcn)");
+    ok(scalar(@expected) == scalar(@parts), sprintf("%d of %d pieces produced", scalar(@parts), scalar(@expected)));
+    my $i = 0;
+    foreach my $unit (@expected) {
+        my $part;
+        ok($part = $parts[$i], "($ddcn)[$i] populated: " . (defined($part) ? $part : 'UNDEF'));
+        ok((defined($part) and $part eq $unit),     "($ddcn)[$i]   matches: $unit");
+        $i++;
+    }
+}
+
-- 
2.39.5