C4/Matcher.pm

   1 package C4::Matcher;
   2
   3 # Copyright (C) 2007 LibLime
   4 #
   5 # This file is part of Koha.
   6 #
   7 # Koha is free software; you can redistribute it and/or modify it under the
   8 # terms of the GNU General Public License as published by the Free Software
   9 # Foundation; either version 2 of the License, or (at your option) any later
  10 # version.
  11 #
  12 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
  13 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  14 # A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License along with
  17 # Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
  18 # Suite 330, Boston, MA  02111-1307 USA
  19
  20 use strict;
  21 use C4::Context;
  22 use MARC::Record;
  23 use C4::Search;
  24 use C4::Biblio;
  25
  26 use vars qw($VERSION);
  27
  28 # set the version for version checking
  29 $VERSION = 3.00;
  30
  31 =head1 NAME
  32
  33 C4::Matcher - find MARC records matching another one
  34
  35 =head1 SYNOPSIS
  36
  37 =over 4
  38
  39 my $matcher = C4::Matcher->new($record_type);
  40 $matcher->threshold($threshold);
  41 $matcher->add_matchpoint($source_tag, $source_subfields, $source_normalizer,
  42                          $index, $score);
  43 $matcher->add_required_check($check_name, $source_tag, $source_subfields, $source_normalizer,
  44                              $target_tag, $target_subfields, $target_normalizer);
  45
  46 my @matches = $matcher->get_matches($marc_record, $max_matches);
  47
  48 foreach $match (@matches) {
  49
  50     # matches already sorted in order of
  51     # decreasing score
  52     print "record ID: $match->{'record_id'};
  53     print "score:     $match->{'score'};
  54
  55 }
  56
  57 =back
  58
  59 =head1 METHODS
  60
  61 =cut
  62
  63 =head2 new
  64
  65 =over 4
  66
  67 my $matcher = C4::Matcher->new($record_type, $threshold);
  68
  69 =back
  70
  71 Creates a new Matcher.  C<$record_type> indicates which search
  72 database to use, e.g., 'biblio' or 'authority' and defaults to
  73 'biblio', while C<$threshold> is the minimum score required for a match
  74 and defaults to 1000.
  75
  76 =cut
  77
  78 sub new {
  79     my $class = shift;
  80     my $self = {};
  81
  82     if ($#_ > -1) {
  83         $self->{'record_type'} = shift;
  84     } else {
  85         $self->{'record_type'} = 'biblio';
  86     }
  87
  88     if ($#_ > -1) {
  89         $self->{'threshold'} = shift;
  90     } else {
  91         $self->{'threshold'} = 1000;
  92     }
  93
  94     $self->{'matchpoints'} = [];
  95     $self->{'required_checks'} = [];
  96
  97     bless $self, $class;
  98     return $self;
  99 }
 100
 101 =head2 threshold
 102
 103 =over 4
 104
 105 $matcher->threshold(1000);
 106 my $threshhold = $matcher->threshhold();
 107
 108 =back
 109
 110 Accessor method.
 111
 112 =cut
 113
 114 sub threshold {
 115     my $self = shift;
 116     @_ ? $self->{'threshold'} = shift : $self->{'threshold'};
 117 }
 118
 119 =head2 add_matchpoint
 120
 121 =over 4
 122
 123 $matcher->add_matchpoint($source_tag, $source_subfields, $source_normalizer,
 124                          $index, $score);
 125
 126 =back
 127
 128 Adds a matchpoint rule -- after composing a key based on the source tag and subfields,
 129 normalized per the normalization fuction, search the index.  All records retrieved
 130 will receive the assigned score.
 131
 132 =cut
 133
 134 sub add_matchpoint {
 135     my $self = shift;
 136     my ($source_tag, $source_subfields, $source_normalizer, $index, $score) = @_;
 137
 138     # FIXME - $source_normalizer not used yet
 139     my $matchpoint = {
 140         'source_tag'        => $source_tag,
 141         'source_subfields'  => { map { $_ => 1 } split(//, $source_subfields) },
 142         'source_normalizer' => $source_normalizer,
 143         'index'             => $index,
 144         'score'             => $score
 145     };
 146     push @{ $self->{'matchpoints'} }, $matchpoint;
 147 }
 148
 149 =head2 add_required_check
 150
 151 $matcher->add_required_check($check_name, $source_tag, $source_subfields, $source_normalizer,
 152                              $target_tag, $target_subfields, $target_normalizer);
 153
 154 =over 4
 155
 156 Adds a required check, which requires that the normalized keys made from the source and targets
 157 must match for a match to be considered valid.
 158
 159 =back
 160
 161 =cut
 162
 163 sub add_required_check {
 164     my $self = shift;
 165     my ($check_name, $source_tag, $source_subfields, $source_normalizer, $target_tag, $target_subfields, $target_normalizer) = @_;
 166
 167     my $check = {
 168         'check_name'        => $check_name,
 169         'source_tag'        => $source_tag,
 170         'source_subfields'  => { map { $_ => 1 } split(//, $source_subfields) },
 171         'source_normalizer' => $source_normalizer,
 172         'target_tag'        => $target_tag,
 173         'target_subfields'  => { map { $_ => 1 } split(//, $target_subfields) },
 174         'target_normalizer' => $target_normalizer
 175     };
 176
 177     push @{ $self->{'required_checks'} }, $check;
 178 }
 179
 180 =head2 find_matches
 181
 182 my @matches = $matcher->get_matches($marc_record, $max_matches);
 183 foreach $match (@matches) {
 184   # matches already sorted in order of
 185   # decreasing score
 186   print "record ID: $match->{'record_id'};
 187   print "score:     $match->{'score'};
 188 }
 189
 190 =back
 191
 192 Identifies all of the records matching the given MARC record.  For a record already
 193 in the database to be considered a match, it must meet the following criteria:
 194
 195 =over 2
 196
 197 =item 1. Total score from its matching field must exceed the supplied threshold.
 198
 199 =item 2. It must pass all required checks.
 200
 201 =back
 202
 203 Only the top $max_matches matches are returned.  The returned array is sorted
 204 in order of decreasing score, i.e., the best match is first.
 205
 206 =cut
 207
 208 sub get_matches {
 209     my $self = shift;
 210     my ($source_record, $max_matches) = @_;
 211
 212     my %matches = ();
 213
 214     foreach my $matchpoint (@{ $self->{'matchpoints'} }) {
 215         my @source_keys = _get_match_keys($source_record, $matchpoint->{'source_tag'},
 216                                           $matchpoint->{'source_subfields'}, $matchpoint->{'source_normalizer'});
 217         next if scalar(@source_keys) == 0;
 218         # build query
 219         my $query = join(" or ", map { "$matchpoint->{'index'}=$_" } @source_keys);
 220         # FIXME only searching biblio index at the moment
 221         my ($error, $searchresults) = SimpleSearch($query);
 222
 223         warn "search failed ($query) $error" if $error;
 224         foreach my $matched (@$searchresults) {
 225             $matches{$matched} += $matchpoint->{'score'};
 226         }
 227     }
 228
 229     # get rid of any that don't meet the threshold
 230     %matches = map { ($matches{$_} >= $self->{'threshold'}) ? ($_ => $matches{$_}) : () } keys %matches;
 231
 232     # FIXME - implement record checks
 233     my @results = ();
 234     foreach my $marcblob (keys %matches) {
 235         my $target_record = MARC::Record->new_from_usmarc($marcblob);
 236         my $result = TransformMarcToKoha(C4::Context->dbh, $target_record, '');
 237         # FIXME - again, bibliospecific
 238         # also, can search engine be induced to give just the number in the first place?
 239         my $record_number = $result->{'biblionumber'};
 240         push @results, { 'record_id' => $record_number, 'score' => $matches{$marcblob} };
 241     }
 242     @results = sort { $b->{'score'} cmp $a->{'score'} } @results;
 243     if (scalar(@results) > $max_matches) {
 244         @results = @results[0..$max_matches-1];
 245     }
 246     return @results;
 247
 248 }
 249
 250 sub _get_match_keys {
 251     my ($source_record, $source_tag, $source_subfields, $source_normalizer) = @_;
 252
 253     use Data::Dumper;
 254     my @keys = ();
 255     foreach my $field ($source_record->field($source_tag)) {
 256         if ($field->is_control_field()) {
 257             push @keys, _normalize($field->data());
 258         } else {
 259             my $key = "";
 260             foreach my $subfield ($field->subfields()) {
 261                 if (exists $source_subfields->{$subfield->[0]}) {
 262                     $key .= " " . $subfield->[1];
 263                 }
 264             }
 265             $key = _normalize($key);
 266
 267             push @keys, $key if $key;
 268         }
 269     }
 270     return @keys;
 271
 272 }
 273
 274 # FIXME - default normalizer
 275 sub _normalize {
 276     my $value = uc shift;
 277     $value =~ s/^\s+//;
 278     $value =~ s/^\s+$//;
 279     $value =~ s/\s+/ /g;
 280     $value =~ s/[.;,\]\[\)\(\/"']//g;
 281     return $value;
 282 }
 283
 284 1;
 285
 286 =head1 AUTHOR
 287
 288 Koha Development Team <info@koha.org>
 289
 290 Galen Charlton <galen.charlton@liblime.com>
 291
 292 =cut