Koha/C4/Matcher.pm

package C4::Matcher;

# Copyright (C) 2007 LibLime
#
# This file is part of Koha.
#
# Koha is free software; you can redistribute it and/or modify it under the
# terms of the GNU General Public License as published by the Free Software
# Foundation; either version 2 of the License, or (at your option) any later
# version.
#
# Koha is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
# A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with
# Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
# Suite 330, Boston, MA  02111-1307 USA

use strict;
use C4::Context;
use MARC::Record;
use C4::Search;
use C4::Biblio;

use vars qw($VERSION);

# set the version for version checking
$VERSION = 3.00;

=head1 NAME

C4::Matcher - find MARC records matching another one

=head1 SYNOPSIS

=over 4

my $matcher = C4::Matcher->new($record_type);
$matcher->threshold($threshold);
$matcher->add_matchpoint($source_tag, $source_subfields, $source_normalizer,
                         $index, $score);
$matcher->add_required_check($check_name, $source_tag, $source_subfields, $source_normalizer,
                             $target_tag, $target_subfields, $target_normalizer);

my @matches = $matcher->get_matches($marc_record, $max_matches);

foreach $match (@matches) {

    # matches already sorted in order of
    # decreasing score
    print "record ID: $match->{'record_id'};
    print "score:     $match->{'score'};

}

=back

=head1 METHODS

=cut

=head2 new

=over 4

my $matcher = C4::Matcher->new($record_type, $threshold);

=back

Creates a new Matcher.  C<$record_type> indicates which search
database to use, e.g., 'biblio' or 'authority' and defaults to
'biblio', while C<$threshold> is the minimum score required for a match
and defaults to 1000.

=cut

sub new {
    my $class = shift;
    my $self = {};

    if ($#_ > -1) {
        $self->{'record_type'} = shift;
    } else {
        $self->{'record_type'} = 'biblio';
    }

    if ($#_ > -1) {
        $self->{'threshold'} = shift;
    } else {
        $self->{'threshold'} = 1000;
    }

    $self->{'matchpoints'} = [];
    $self->{'required_checks'} = [];

    bless $self, $class;
    return $self;
}

=head2 threshold

=over 4

$matcher->threshold(1000);
my $threshhold = $matcher->threshhold();

=back

Accessor method.

=cut

sub threshold {
    my $self = shift;
    @_ ? $self->{'threshold'} = shift : $self->{'threshold'};
}

=head2 add_matchpoint

=over 4

$matcher->add_matchpoint($source_tag, $source_subfields, $source_normalizer,
                         $index, $score);

=back

Adds a matchpoint rule -- after composing a key based on the source tag and subfields,
normalized per the normalization fuction, search the index.  All records retrieved
will receive the assigned score.

=cut

sub add_matchpoint {
    my $self = shift;
    my ($source_tag, $source_subfields, $source_normalizer, $index, $score) = @_;

    # FIXME - $source_normalizer not used yet
    my $matchpoint = {
        'source_tag'        => $source_tag,
        'source_subfields'  => { map { $_ => 1 } split(//, $source_subfields) },
        'source_normalizer' => $source_normalizer,
        'index'             => $index,
        'score'             => $score
    };
    push @{ $self->{'matchpoints'} }, $matchpoint;
}

=head2 add_required_check

$matcher->add_required_check($check_name, $source_tag, $source_subfields, $source_normalizer,
                             $target_tag, $target_subfields, $target_normalizer);

=over 4

Adds a required check, which requires that the normalized keys made from the source and targets
must match for a match to be considered valid.

=back

=cut

sub add_required_check {
    my $self = shift;
    my ($check_name, $source_tag, $source_subfields, $source_normalizer, $target_tag, $target_subfields, $target_normalizer) = @_;

    my $check = {
        'check_name'        => $check_name,
        'source_tag'        => $source_tag,
        'source_subfields'  => { map { $_ => 1 } split(//, $source_subfields) },
        'source_normalizer' => $source_normalizer,
        'target_tag'        => $target_tag,
        'target_subfields'  => { map { $_ => 1 } split(//, $target_subfields) },
        'target_normalizer' => $target_normalizer
    };

    push @{ $self->{'required_checks'} }, $check;
}

=head2 find_matches

my @matches = $matcher->get_matches($marc_record, $max_matches);
foreach $match (@matches) {
  # matches already sorted in order of
  # decreasing score
  print "record ID: $match->{'record_id'};
  print "score:     $match->{'score'};
}

=back

Identifies all of the records matching the given MARC record.  For a record already
in the database to be considered a match, it must meet the following criteria:

=over 2

=item 1. Total score from its matching field must exceed the supplied threshold.

=item 2. It must pass all required checks.

=back

Only the top $max_matches matches are returned.  The returned array is sorted
in order of decreasing score, i.e., the best match is first.

=cut

sub get_matches {
    my $self = shift;
    my ($source_record, $max_matches) = @_;

    my %matches = ();

    foreach my $matchpoint (@{ $self->{'matchpoints'} }) {
        my @source_keys = _get_match_keys($source_record, $matchpoint->{'source_tag'},
                                          $matchpoint->{'source_subfields'}, $matchpoint->{'source_normalizer'});
        next if scalar(@source_keys) == 0;
        # build query
        my $query = join(" or ", map { "$matchpoint->{'index'}=$_" } @source_keys);
        # FIXME only searching biblio index at the moment
        my ($error, $searchresults) = SimpleSearch($query);

        warn "search failed ($query) $error" if $error;
        foreach my $matched (@$searchresults) {
            $matches{$matched} += $matchpoint->{'score'};
        }
    }

    # get rid of any that don't meet the threshold
    %matches = map { ($matches{$_} >= $self->{'threshold'}) ? ($_ => $matches{$_}) : () } keys %matches;

    # FIXME - implement record checks
    my @results = ();
    foreach my $marcblob (keys %matches) {
        my $target_record = MARC::Record->new_from_usmarc($marcblob);
        my $result = TransformMarcToKoha(C4::Context->dbh, $target_record, '');
        # FIXME - again, bibliospecific
        # also, can search engine be induced to give just the number in the first place?
        my $record_number = $result->{'biblionumber'};
        push @results, { 'record_id' => $record_number, 'score' => $matches{$marcblob} };
    }
    @results = sort { $b->{'score'} cmp $a->{'score'} } @results;
    if (scalar(@results) > $max_matches) {
        @results = @results[0..$max_matches-1];
    }
    return @results;

}

sub _get_match_keys {
    my ($source_record, $source_tag, $source_subfields, $source_normalizer) = @_;

    use Data::Dumper;
    my @keys = ();
    foreach my $field ($source_record->field($source_tag)) {
        if ($field->is_control_field()) {
            push @keys, _normalize($field->data());
        } else {
            my $key = "";
            foreach my $subfield ($field->subfields()) {
                if (exists $source_subfields->{$subfield->[0]}) {
                    $key .= " " . $subfield->[1];
                }
            }
            $key = _normalize($key);

            push @keys, $key if $key;
        }
    }
    return @keys;

}

# FIXME - default normalizer
sub _normalize {
    my $value = uc shift;
    $value =~ s/^\s+//;
    $value =~ s/^\s+$//;
    $value =~ s/\s+/ /g;
    $value =~ s/[.;,\]\[\)\(\/"']//g;
    return $value;
}

1;

=head1 AUTHOR

Koha Development Team <info@koha.org>

Galen Charlton <galen.charlton@liblime.com>

=cut