3 # Copyright (C) 2007 LibLime
5 # This file is part of Koha.
7 # Koha is free software; you can redistribute it and/or modify it under the
8 # terms of the GNU General Public License as published by the Free Software
9 # Foundation; either version 2 of the License, or (at your option) any later
12 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
13 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License along with
17 # Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
18 # Suite 330, Boston, MA 02111-1307 USA
26 use vars qw($VERSION);
28 # set the version for version checking
33 C4::Matcher - find MARC records matching another one
39 my $matcher = C4::Matcher->new($record_type);
40 $matcher->threshold($threshold);
41 $matcher->add_matchpoint($source_tag, $source_subfields, $source_normalizer,
43 $matcher->add_required_check($check_name, $source_tag, $source_subfields, $source_normalizer,
44 $target_tag, $target_subfields, $target_normalizer);
46 my @matches = $matcher->get_matches($marc_record, $max_matches);
48 foreach $match (@matches) {
50 # matches already sorted in order of
52 print "record ID: $match->{'record_id'};
53 print "score: $match->{'score'};
67 my $matcher = C4::Matcher->new($record_type, $threshold);
71 Creates a new Matcher. C<$record_type> indicates which search
72 database to use, e.g., 'biblio' or 'authority' and defaults to
73 'biblio', while C<$threshold> is the minimum score required for a match
83 $self->{'record_type'} = shift;
85 $self->{'record_type'} = 'biblio';
89 $self->{'threshold'} = shift;
91 $self->{'threshold'} = 1000;
94 $self->{'matchpoints'} = [];
95 $self->{'required_checks'} = [];
105 $matcher->threshold(1000);
106 my $threshhold = $matcher->threshhold();
116 @_ ? $self->{'threshold'} = shift : $self->{'threshold'};
119 =head2 add_matchpoint
123 $matcher->add_matchpoint($source_tag, $source_subfields, $source_normalizer,
128 Adds a matchpoint rule -- after composing a key based on the source tag and subfields,
129 normalized per the normalization fuction, search the index. All records retrieved
130 will receive the assigned score.
136 my ($source_tag, $source_subfields, $source_normalizer, $index, $score) = @_;
138 # FIXME - $source_normalizer not used yet
140 'source_tag' => $source_tag,
141 'source_subfields' => { map { $_ => 1 } split(//, $source_subfields) },
142 'source_normalizer' => $source_normalizer,
146 push @{ $self->{'matchpoints'} }, $matchpoint;
149 =head2 add_required_check
151 $matcher->add_required_check($check_name, $source_tag, $source_subfields, $source_normalizer,
152 $target_tag, $target_subfields, $target_normalizer);
156 Adds a required check, which requires that the normalized keys made from the source and targets
157 must match for a match to be considered valid.
163 sub add_required_check {
165 my ($check_name, $source_tag, $source_subfields, $source_normalizer, $target_tag, $target_subfields, $target_normalizer) = @_;
168 'check_name' => $check_name,
169 'source_tag' => $source_tag,
170 'source_subfields' => { map { $_ => 1 } split(//, $source_subfields) },
171 'source_normalizer' => $source_normalizer,
172 'target_tag' => $target_tag,
173 'target_subfields' => { map { $_ => 1 } split(//, $target_subfields) },
174 'target_normalizer' => $target_normalizer
177 push @{ $self->{'required_checks'} }, $check;
182 my @matches = $matcher->get_matches($marc_record, $max_matches);
183 foreach $match (@matches) {
184 # matches already sorted in order of
186 print "record ID: $match->{'record_id'};
187 print "score: $match->{'score'};
192 Identifies all of the records matching the given MARC record. For a record already
193 in the database to be considered a match, it must meet the following criteria:
197 =item 1. Total score from its matching field must exceed the supplied threshold.
199 =item 2. It must pass all required checks.
203 Only the top $max_matches matches are returned. The returned array is sorted
204 in order of decreasing score, i.e., the best match is first.
210 my ($source_record, $max_matches) = @_;
214 foreach my $matchpoint (@{ $self->{'matchpoints'} }) {
215 my @source_keys = _get_match_keys($source_record, $matchpoint->{'source_tag'},
216 $matchpoint->{'source_subfields'}, $matchpoint->{'source_normalizer'});
217 next if scalar(@source_keys) == 0;
219 my $query = join(" or ", map { "$matchpoint->{'index'}=$_" } @source_keys);
220 # FIXME only searching biblio index at the moment
221 my ($error, $searchresults) = SimpleSearch($query);
223 warn "search failed ($query) $error" if $error;
224 foreach my $matched (@$searchresults) {
225 $matches{$matched} += $matchpoint->{'score'};
229 # get rid of any that don't meet the threshold
230 %matches = map { ($matches{$_} >= $self->{'threshold'}) ? ($_ => $matches{$_}) : () } keys %matches;
232 # FIXME - implement record checks
234 foreach my $marcblob (keys %matches) {
235 my $target_record = MARC::Record->new_from_usmarc($marcblob);
236 my $result = TransformMarcToKoha(C4::Context->dbh, $target_record, '');
237 # FIXME - again, bibliospecific
238 # also, can search engine be induced to give just the number in the first place?
239 my $record_number = $result->{'biblionumber'};
240 push @results, { 'record_id' => $record_number, 'score' => $matches{$marcblob} };
242 @results = sort { $b->{'score'} cmp $a->{'score'} } @results;
243 if (scalar(@results) > $max_matches) {
244 @results = @results[0..$max_matches-1];
250 sub _get_match_keys {
251 my ($source_record, $source_tag, $source_subfields, $source_normalizer) = @_;
255 foreach my $field ($source_record->field($source_tag)) {
256 if ($field->is_control_field()) {
257 push @keys, _normalize($field->data());
260 foreach my $subfield ($field->subfields()) {
261 if (exists $source_subfields->{$subfield->[0]}) {
262 $key .= " " . $subfield->[1];
265 $key = _normalize($key);
267 push @keys, $key if $key;
274 # FIXME - default normalizer
276 my $value = uc shift;
280 $value =~ s/[.;,\]\[\)\(\/"']//g;
288 Koha Development Team <info@koha.org>
290 Galen Charlton <galen.charlton@liblime.com>