Koha/C4/Matcher.pm
Aleisha Amohia 33856898fd
Bug 26611: Make authority record matching use required match checks
This patch fixes record matching for authorities to correctly apply required match checks.

To test:

1) Create a record matching rule:

Threshold: 100
Record type: Authority record

Match point 1
Search index: subject-topical
Score: 100
Tag: 150
Subfields: a

Match check 1
Both source and target record check fields:
Tag: 040
Subfields: a

This rule says we want to match on the topical term heading, then confirm the match with 040$a.

2) Create two topical term authority records

Authority 1:
150$a: Test
040$a: A

Authority 2:
150$a: Test
040$a: B

3) Export authority 2 and save

4) Go to Cataloging -> Stage records for import

5) Upload your downloaded authority 2 file. Change the record type to Authority. Choose your new record matching rule.

6) Stage for import.

7) Confirm that with the record matching rule applied, both authorities 1 and 2 show as possible matches, even though only authority 2 has a matching 040$a.

8) Apply the patch and restart services

9) Change the matching rule to "Do not look for matching records" and apply. This is to essentially refresh the page.

10) Change the matching rule to your new matching rule and apply. Confirm only the matching authority 2 shows and is selected.

Sponsored-by: Waikato Institute of Technology
Signed-off-by: David Nind <david@davidnind.com>
Signed-off-by: Nick Clemens <nick@bywatersolutions.com>
Signed-off-by: Tomas Cohen Arazi <tomascohen@theke.io>
2023-05-24 11:21:31 -03:00

935 lines
30 KiB
Perl

package C4::Matcher;
# Copyright (C) 2007 LibLime, 2012 C & P Bibliography Services
#
# This file is part of Koha.
#
# Koha is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Koha is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Koha; if not, see <http://www.gnu.org/licenses>.
use Modern::Perl;
use Koha::SearchEngine;
use Koha::SearchEngine::Search;
use Koha::SearchEngine::QueryBuilder;
use Koha::Util::Normalize qw(
ISBN
legacy_default
lower_case
remove_spaces
upper_case
);
=head1 NAME
C4::Matcher - find MARC records matching another one
=head1 SYNOPSIS
my @matchers = C4::Matcher::GetMatcherList();
my $matcher = C4::Matcher->new($record_type);
$matcher->threshold($threshold);
$matcher->code($code);
$matcher->description($description);
$matcher->add_simple_matchpoint('isbn', 1000, '020', 'a', -1, 0, '');
$matcher->add_simple_matchpoint('Date', 1000, '008', '', 7, 4, '');
$matcher->add_matchpoint('isbn', 1000, [ { tag => '020', subfields => 'a', norms => [] } ]);
$matcher->add_simple_required_check('245', 'a', -1, 0, '', '245', 'a', -1, 0, '');
$matcher->add_required_check([ { tag => '245', subfields => 'a', norms => [] } ],
[ { tag => '245', subfields => 'a', norms => [] } ]);
my @matches = $matcher->get_matches($marc_record, $max_matches);
foreach $match (@matches) {
# matches already sorted in order of
# decreasing score
print "record ID: $match->{'record_id'};
print "score: $match->{'score'};
}
my $matcher_description = $matcher->dump();
=head1 FUNCTIONS
=cut
=head2 GetMatcherList
my @matchers = C4::Matcher::GetMatcherList();
Returns an array of hashrefs list all matchers
present in the database. Each hashref includes:
* matcher_id
* code
* description
=cut
sub GetMatcherList {
my $dbh = C4::Context->dbh;
my $sth = $dbh->prepare_cached("SELECT matcher_id, code, description FROM marc_matchers ORDER BY matcher_id");
$sth->execute();
my @results = ();
while (my $row = $sth->fetchrow_hashref) {
push @results, $row;
}
return @results;
}
=head2 GetMatcherId
my $matcher_id = C4::Matcher::GetMatcherId($code);
Returns the matcher_id of a code.
=cut
sub GetMatcherId {
my ($code) = @_;
my $dbh = C4::Context->dbh;
my $matcher_id = $dbh->selectrow_array("SELECT matcher_id FROM marc_matchers WHERE code = ?", undef, $code);
return $matcher_id;
}
=head1 METHODS
=head2 new
my $matcher = C4::Matcher->new($record_type, $threshold);
Creates a new Matcher. C<$record_type> indicates which search
database to use, e.g., 'biblio' or 'authority' and defaults to
'biblio', while C<$threshold> is the minimum score required for a match
and defaults to 1000.
=cut
sub new {
my $class = shift;
my $self = {};
$self->{'id'} = undef;
if ($#_ > -1) {
$self->{'record_type'} = shift;
} else {
$self->{'record_type'} = 'biblio';
}
if ($#_ > -1) {
$self->{'threshold'} = shift;
} else {
$self->{'threshold'} = 1000;
}
$self->{'code'} = '';
$self->{'description'} = '';
$self->{'matchpoints'} = [];
$self->{'required_checks'} = [];
bless $self, $class;
return $self;
}
=head2 fetch
my $matcher = C4::Matcher->fetch($id);
Creates a matcher object from the version stored
in the database. If a matcher with the given
id does not exist, returns undef.
=cut
sub fetch {
my $class = shift;
my $id = shift;
my $dbh = C4::Context->dbh();
my $sth = $dbh->prepare_cached("SELECT * FROM marc_matchers WHERE matcher_id = ?");
$sth->execute($id);
my $row = $sth->fetchrow_hashref;
$sth->finish();
return unless defined $row;
my $self = {};
$self->{'id'} = $row->{'matcher_id'};
$self->{'record_type'} = $row->{'record_type'};
$self->{'code'} = $row->{'code'};
$self->{'description'} = $row->{'description'};
$self->{'threshold'} = int($row->{'threshold'});
bless $self, $class;
# matchpoints
$self->{'matchpoints'} = [];
$sth = $dbh->prepare_cached("SELECT * FROM matcher_matchpoints WHERE matcher_id = ? ORDER BY matchpoint_id");
$sth->execute($self->{'id'});
while (my $row = $sth->fetchrow_hashref) {
my $matchpoint = $self->_fetch_matchpoint($row->{'matchpoint_id'});
push @{ $self->{'matchpoints'} }, $matchpoint;
}
# required checks
$self->{'required_checks'} = [];
$sth = $dbh->prepare_cached("SELECT * FROM matchchecks WHERE matcher_id = ? ORDER BY matchcheck_id");
$sth->execute($self->{'id'});
while (my $row = $sth->fetchrow_hashref) {
my $source_matchpoint = $self->_fetch_matchpoint($row->{'source_matchpoint_id'});
my $target_matchpoint = $self->_fetch_matchpoint($row->{'target_matchpoint_id'});
my $matchcheck = {};
$matchcheck->{'source_matchpoint'} = $source_matchpoint;
$matchcheck->{'target_matchpoint'} = $target_matchpoint;
push @{ $self->{'required_checks'} }, $matchcheck;
}
return $self;
}
sub _fetch_matchpoint {
my $self = shift;
my $matchpoint_id = shift;
my $dbh = C4::Context->dbh;
my $sth = $dbh->prepare_cached("SELECT * FROM matchpoints WHERE matchpoint_id = ?");
$sth->execute($matchpoint_id);
my $row = $sth->fetchrow_hashref;
my $matchpoint = {};
$matchpoint->{'index'} = $row->{'search_index'};
$matchpoint->{'score'} = int($row->{'score'});
$sth->finish();
$matchpoint->{'components'} = [];
$sth = $dbh->prepare_cached("SELECT * FROM matchpoint_components WHERE matchpoint_id = ? ORDER BY sequence");
$sth->execute($matchpoint_id);
while ($row = $sth->fetchrow_hashref) {
my $component = {};
$component->{'tag'} = $row->{'tag'};
$component->{'subfields'} = { map { $_ => 1 } split(//, $row->{'subfields'}) };
$component->{'offset'} = int($row->{'offset'});
$component->{'length'} = int($row->{'length'});
$component->{'norms'} = [];
my $sth2 = $dbh->prepare_cached("SELECT *
FROM matchpoint_component_norms
WHERE matchpoint_component_id = ? ORDER BY sequence");
$sth2->execute($row->{'matchpoint_component_id'});
while (my $row2 = $sth2->fetchrow_hashref) {
push @{ $component->{'norms'} }, $row2->{'norm_routine'};
}
push @{ $matchpoint->{'components'} }, $component;
}
return $matchpoint;
}
=head2 store
my $id = $matcher->store();
Stores matcher in database. The return value is the ID
of the marc_matchers row. If the matcher was
previously retrieved from the database via the fetch()
method, the DB representation of the matcher
is replaced.
=cut
sub store {
my $self = shift;
if (defined $self->{'id'}) {
# update
$self->_del_matcher_components();
$self->_update_marc_matchers();
} else {
# create new
$self->_new_marc_matchers();
}
$self->_store_matcher_components();
return $self->{'id'};
}
sub _del_matcher_components {
my $self = shift;
my $dbh = C4::Context->dbh();
my $sth = $dbh->prepare_cached("DELETE FROM matchpoints WHERE matcher_id = ?");
$sth->execute($self->{'id'});
$sth = $dbh->prepare_cached("DELETE FROM matchchecks WHERE matcher_id = ?");
$sth->execute($self->{'id'});
# foreign key delete cascades take care of deleting relevant rows
# from matcher_matchpoints, matchpoint_components, and
# matchpoint_component_norms
}
sub _update_marc_matchers {
my $self = shift;
my $dbh = C4::Context->dbh();
my $sth = $dbh->prepare_cached("UPDATE marc_matchers
SET code = ?,
description = ?,
record_type = ?,
threshold = ?
WHERE matcher_id = ?");
$sth->execute($self->{'code'}, $self->{'description'}, $self->{'record_type'}, $self->{'threshold'}, $self->{'id'});
}
sub _new_marc_matchers {
my $self = shift;
my $dbh = C4::Context->dbh();
my $sth = $dbh->prepare_cached("INSERT INTO marc_matchers
(code, description, record_type, threshold)
VALUES (?, ?, ?, ?)");
$sth->execute($self->{'code'}, $self->{'description'}, $self->{'record_type'}, $self->{'threshold'});
$self->{'id'} = $dbh->{'mysql_insertid'};
}
sub _store_matcher_components {
my $self = shift;
my $dbh = C4::Context->dbh();
my $sth;
my $matcher_id = $self->{'id'};
foreach my $matchpoint (@{ $self->{'matchpoints'}}) {
my $matchpoint_id = $self->_store_matchpoint($matchpoint);
$sth = $dbh->prepare_cached("INSERT INTO matcher_matchpoints (matcher_id, matchpoint_id)
VALUES (?, ?)");
$sth->execute($matcher_id, $matchpoint_id);
}
foreach my $matchcheck (@{ $self->{'required_checks'} }) {
my $source_matchpoint_id = $self->_store_matchpoint($matchcheck->{'source_matchpoint'});
my $target_matchpoint_id = $self->_store_matchpoint($matchcheck->{'target_matchpoint'});
$sth = $dbh->prepare_cached("INSERT INTO matchchecks
(matcher_id, source_matchpoint_id, target_matchpoint_id)
VALUES (?, ?, ?)");
$sth->execute($matcher_id, $source_matchpoint_id, $target_matchpoint_id);
}
}
sub _store_matchpoint {
my $self = shift;
my $matchpoint = shift;
my $dbh = C4::Context->dbh();
my $sth;
my $matcher_id = $self->{'id'};
$sth = $dbh->prepare_cached("INSERT INTO matchpoints (matcher_id, search_index, score)
VALUES (?, ?, ?)");
$sth->execute($matcher_id, $matchpoint->{'index'}, $matchpoint->{'score'}||0);
my $matchpoint_id = $dbh->{'mysql_insertid'};
my $seqnum = 0;
foreach my $component (@{ $matchpoint->{'components'} }) {
$seqnum++;
$sth = $dbh->prepare_cached("INSERT INTO matchpoint_components
(matchpoint_id, sequence, tag, subfields, `offset`, length)
VALUES (?, ?, ?, ?, ?, ?)");
$sth->bind_param(1, $matchpoint_id);
$sth->bind_param(2, $seqnum);
$sth->bind_param(3, $component->{'tag'});
$sth->bind_param(4, join "", sort keys %{ $component->{'subfields'} });
$sth->bind_param(5, $component->{'offset'}||0);
$sth->bind_param(6, $component->{'length'});
$sth->execute();
my $matchpoint_component_id = $dbh->{'mysql_insertid'};
my $normseq = 0;
foreach my $norm (@{ $component->{'norms'} }) {
$normseq++;
$sth = $dbh->prepare_cached("INSERT INTO matchpoint_component_norms
(matchpoint_component_id, sequence, norm_routine)
VALUES (?, ?, ?)");
$sth->execute($matchpoint_component_id, $normseq, $norm);
}
}
return $matchpoint_id;
}
=head2 delete
C4::Matcher->delete($id);
Deletes the matcher of the specified ID
from the database.
=cut
sub delete {
my $class = shift;
my $matcher_id = shift;
my $dbh = C4::Context->dbh;
my $sth = $dbh->prepare("DELETE FROM marc_matchers WHERE matcher_id = ?");
$sth->execute($matcher_id); # relying on cascading deletes to clean up everything
}
=head2 record_type
$matcher->record_type('biblio');
my $record_type = $matcher->record_type();
Accessor method.
=cut
sub record_type {
my $self = shift;
@_ ? $self->{'record_type'} = shift : $self->{'record_type'};
}
=head2 threshold
$matcher->threshold(1000);
my $threshold = $matcher->threshold();
Accessor method.
=cut
sub threshold {
my $self = shift;
@_ ? $self->{'threshold'} = shift : $self->{'threshold'};
}
=head2 _id
$matcher->_id(123);
my $id = $matcher->_id();
Accessor method. Note that using this method
to set the DB ID of the matcher should not be
done outside of the editing CGI.
=cut
sub _id {
my $self = shift;
@_ ? $self->{'id'} = shift : $self->{'id'};
}
=head2 code
$matcher->code('ISBN');
my $code = $matcher->code();
Accessor method.
=cut
sub code {
my $self = shift;
@_ ? $self->{'code'} = shift : $self->{'code'};
}
=head2 description
$matcher->description('match on ISBN');
my $description = $matcher->description();
Accessor method.
=cut
sub description {
my $self = shift;
@_ ? $self->{'description'} = shift : $self->{'description'};
}
=head2 add_matchpoint
$matcher->add_matchpoint($index, $score, $matchcomponents);
Adds a matchpoint that may include multiple components. The $index
parameter identifies the index that will be searched, while $score
is the weight that will be added if a match is found.
$matchcomponents should be a reference to an array of matchpoint
compoents, each of which should be a hash containing the following
keys:
tag
subfields
offset
length
norms
The normalization_rules value should in turn be a reference to an
array, each element of which should be a reference to a
normalization subroutine (under C4::Normalize) to be applied
to the source string.
=cut
sub add_matchpoint {
my $self = shift;
my ($index, $score, $matchcomponents) = @_;
my $matchpoint = {};
$matchpoint->{'index'} = $index;
$matchpoint->{'score'} = $score;
$matchpoint->{'components'} = [];
foreach my $input_component (@{ $matchcomponents }) {
push @{ $matchpoint->{'components'} }, _parse_match_component($input_component);
}
push @{ $self->{'matchpoints'} }, $matchpoint;
}
=head2 add_simple_matchpoint
$matcher->add_simple_matchpoint($index, $score, $source_tag,
$source_subfields, $source_offset,
$source_length, $source_normalizer);
Adds a simple matchpoint rule -- after composing a key based on the source tag and subfields,
normalized per the normalization fuction, search the index. All records retrieved
will receive the assigned score.
=cut
sub add_simple_matchpoint {
my $self = shift;
my ($index, $score, $source_tag, $source_subfields, $source_offset, $source_length, $source_normalizer) = @_;
$self->add_matchpoint($index, $score, [
{ tag => $source_tag, subfields => $source_subfields,
offset => $source_offset, 'length' => $source_length,
norms => [ $source_normalizer ]
}
]);
}
=head2 add_required_check
$match->add_required_check($source_matchpoint, $target_matchpoint);
Adds a required check definition. A required check means that in
order for a match to be considered valid, the key derived from the
source (incoming) record must match the key derived from the target
(already in DB) record.
Unlike a regular matchpoint, only the first repeat of each tag
in the source and target match criteria are considered.
A typical example of a required check would be verifying that the
titles and publication dates match.
$source_matchpoint and $target_matchpoint are each a reference to
an array of hashes, where each hash follows the same definition
as the matchpoint component specification in add_matchpoint, i.e.,
tag
subfields
offset
length
norms
The normalization_rules value should in turn be a reference to an
array, each element of which should be a reference to a
normalization subroutine (under C4::Normalize) to be applied
to the source string.
=cut
sub add_required_check {
my $self = shift;
my ($source_matchpoint, $target_matchpoint) = @_;
my $matchcheck = {};
$matchcheck->{'source_matchpoint'}->{'index'} = '';
$matchcheck->{'source_matchpoint'}->{'score'} = 0;
$matchcheck->{'source_matchpoint'}->{'components'} = [];
$matchcheck->{'target_matchpoint'}->{'index'} = '';
$matchcheck->{'target_matchpoint'}->{'score'} = 0;
$matchcheck->{'target_matchpoint'}->{'components'} = [];
foreach my $input_component (@{ $source_matchpoint }) {
push @{ $matchcheck->{'source_matchpoint'}->{'components'} }, _parse_match_component($input_component);
}
foreach my $input_component (@{ $target_matchpoint }) {
push @{ $matchcheck->{'target_matchpoint'}->{'components'} }, _parse_match_component($input_component);
}
push @{ $self->{'required_checks'} }, $matchcheck;
}
=head2 add_simple_required_check
$matcher->add_simple_required_check($source_tag, $source_subfields,
$source_offset, $source_length, $source_normalizer,
$target_tag, $target_subfields, $target_offset,
$target_length, $target_normalizer);
Adds a required check, which requires that the normalized keys made from the source and targets
must match for a match to be considered valid.
=cut
sub add_simple_required_check {
my $self = shift;
my ($source_tag, $source_subfields, $source_offset, $source_length, $source_normalizer,
$target_tag, $target_subfields, $target_offset, $target_length, $target_normalizer) = @_;
$self->add_required_check(
[ { tag => $source_tag, subfields => $source_subfields, offset => $source_offset, 'length' => $source_length,
norms => [ $source_normalizer ] } ],
[ { tag => $target_tag, subfields => $target_subfields, offset => $target_offset, 'length' => $target_length,
norms => [ $target_normalizer ] } ]
);
}
=head2 get_matches
my @matches = $matcher->get_matches($marc_record, $max_matches);
foreach $match (@matches) {
# matches already sorted in order of
# decreasing score
print "record ID: $match->{'record_id'};
print "score: $match->{'score'};
}
Identifies all of the records matching the given MARC record. For a record already
in the database to be considered a match, it must meet the following criteria:
=over 2
=item 1. Total score from its matching field must exceed the supplied threshold.
=item 2. It must pass all required checks.
=back
Only the top $max_matches matches are returned. The returned array is sorted
in order of decreasing score, i.e., the best match is first.
=cut
sub get_matches {
my $self = shift;
my ($source_record, $max_matches) = @_;
my $matches = {};
foreach my $matchpoint ( @{ $self->{'matchpoints'} } ) {
my @source_keys = _get_match_keys( $source_record, $matchpoint );
next if scalar(@source_keys) == 0;
@source_keys = C4::Koha::GetVariationsOfISBNs(@source_keys)
if ( $matchpoint->{index} =~ /^isbn$/i
&& C4::Context->preference('AggressiveMatchOnISBN') );
@source_keys = C4::Koha::GetVariationsOfISSNs(@source_keys)
if ( $matchpoint->{index} =~ /^issn$/i
&& C4::Context->preference('AggressiveMatchOnISSN') );
# build query
my $query;
my $error;
my $searchresults;
my $total_hits;
if ( $self->{'record_type'} eq 'biblio' ) {
my $phr = ( C4::Context->preference('AggressiveMatchOnISBN') || C4::Context->preference('AggressiveMatchOnISSN') ) ? ',phr' : q{};
$query = join( " OR ",
map { "$matchpoint->{'index'}$phr=\"$_\"" } @source_keys );
#NOTE: double-quote the values so you don't get a "Embedded truncation not supported" error when a term has a ? in it.
# Use state variables to avoid recreating the objects every time.
# With Elasticsearch this also avoids creating a massive amount of
# ES connectors that would eventually run out of file descriptors.
state $searcher = Koha::SearchEngine::Search->new({index => $Koha::SearchEngine::BIBLIOS_INDEX});
( $error, $searchresults, $total_hits ) =
$searcher->simple_search_compat( $query, 0, $max_matches, undef, skip_normalize => 1 );
if ( defined $error ) {
warn "search failed ($query) $error";
}
else {
foreach my $matched ( @{$searchresults} ) {
my $target_record = C4::Search::new_record_from_zebra( 'biblioserver', $matched );
my ( $biblionumber_tag, $biblionumber_subfield ) = C4::Biblio::GetMarcFromKohaField( "biblio.biblionumber" );
my $id = ( $biblionumber_tag > 10 ) ?
$target_record->field($biblionumber_tag)->subfield($biblionumber_subfield) :
$target_record->field($biblionumber_tag)->data();
$matches->{$id}->{score} += $matchpoint->{score};
$matches->{$id}->{record} = $target_record;
}
}
}
elsif ( $self->{'record_type'} eq 'authority' ) {
my @marclist;
my @and_or;
my @excluding = [];
my @operator;
my @value;
foreach my $key (@source_keys) {
push @marclist, $matchpoint->{'index'};
push @and_or, 'or';
push @operator, 'exact';
push @value, $key;
}
# Use state variables to avoid recreating the objects every time.
# With Elasticsearch this also avoids creating a massive amount of
# ES connectors that would eventually run out of file descriptors.
state $builder = Koha::SearchEngine::QueryBuilder->new({index => $Koha::SearchEngine::AUTHORITIES_INDEX});
state $searcher = Koha::SearchEngine::Search->new({index => $Koha::SearchEngine::AUTHORITIES_INDEX});
my $search_query = $builder->build_authorities_query_compat(
\@marclist, \@and_or, \@excluding, \@operator,
\@value, undef, 'AuthidAsc'
);
my ( $authresults, $total ) = $searcher->search_auth_compat( $search_query, 0, 20 );
foreach my $result (@$authresults) {
my $id = $result->{authid};
my $target_record = Koha::Authorities->find( $id )->record;
$matches->{$id}->{score} += $matchpoint->{'score'};
$matches->{$id}->{record} = $target_record;
}
}
}
# get rid of any that don't meet the threshold
$matches = { map { ($matches->{$_}->{score} >= $self->{'threshold'}) ? ($_ => $matches->{$_}) : () } keys %$matches };
my @results = ();
if ($self->{'record_type'} eq 'biblio') {
require C4::Biblio;
# get rid of any that don't meet the required checks
$matches = {
map {
_passes_required_checks( $source_record, $matches->{$_}->{'record'}, $self->{'required_checks'} )
? ( $_ => $matches->{$_} )
: ()
} keys %$matches
};
foreach my $id ( keys %$matches ) {
push @results, {
record_id => $id,
score => $matches->{$id}->{score}
};
}
} elsif ($self->{'record_type'} eq 'authority') {
require C4::AuthoritiesMarc;
# get rid of any that don't meet the required checks
$matches = {
map {
_passes_required_checks( $source_record, $matches->{$_}->{'record'}, $self->{'required_checks'} )
? ( $_ => $matches->{$_} )
: ()
} keys %$matches
};
foreach my $id (keys %$matches) {
push @results, {
record_id => $id,
score => $matches->{$id}->{score}
};
}
}
@results = sort {
$b->{'score'} cmp $a->{'score'} or
$b->{'record_id'} cmp $a->{'record_id'}
} @results;
if (scalar(@results) > $max_matches) {
@results = @results[0..$max_matches-1];
}
return @results;
}
=head2 dump
$description = $matcher->dump();
Returns a reference to a structure containing all of the information
in the matcher object. This is mainly a convenience method to
aid setting up a HTML editing form.
=cut
sub dump {
my $self = shift;
my $result = {};
$result->{'matcher_id'} = $self->{'id'};
$result->{'code'} = $self->{'code'};
$result->{'description'} = $self->{'description'};
$result->{'record_type'} = $self->{'record_type'};
$result->{'matchpoints'} = [];
foreach my $matchpoint (@{ $self->{'matchpoints'} }) {
push @{ $result->{'matchpoints'} }, $matchpoint;
}
$result->{'matchchecks'} = [];
foreach my $matchcheck (@{ $self->{'required_checks'} }) {
push @{ $result->{'matchchecks'} }, $matchcheck;
}
return $result;
}
sub _passes_required_checks {
my ($source_record, $target_record, $matchchecks) = @_;
# no checks supplied == automatic pass
return 1 if $#{ $matchchecks } == -1;
foreach my $matchcheck (@{ $matchchecks }) {
my $source_key = join "", _get_match_keys($source_record, $matchcheck->{'source_matchpoint'});
my $target_key = join "", _get_match_keys($target_record, $matchcheck->{'target_matchpoint'});
return 0 unless $source_key eq $target_key;
}
return 1;
}
sub _get_match_keys {
my $source_record = shift;
my $matchpoint = shift;
my $check_only_first_repeat = @_ ? shift : 0;
# If there is more than one component to the matchpoint (e.g.,
# matchpoint includes both 003 and 001), any repeats
# of the first component's tag are identified; repeats
# of the subsequent components' tags are appended to
# each parallel key dervied from the first component,
# up to the number of repeats of the first component's tag.
#
# For example, if the record has one 003 and two 001s, only
# one key is retrieved because there is only one 003. The key
# will consist of the contents of the first 003 and first 001.
#
# If there are two 003s and two 001s, there will be two keys:
# first 003 + first 001
# second 003 + second 001
my @keys = ();
for (my $i = 0; $i <= $#{ $matchpoint->{'components'} }; $i++) {
my $component = $matchpoint->{'components'}->[$i];
my $j = -1;
my @fields = ();
my $tag = $component->{'tag'};
if ($tag && $tag eq 'LDR'){
$fields[0] = $source_record->leader();
}
else {
@fields = $source_record->field($tag);
}
FIELD: foreach my $field (@fields) {
$j++;
last FIELD if $j > 0 and $check_only_first_repeat;
last FIELD if $i > 0 and $j > $#keys;
my $string;
if ( ! ref $field ){
$string = "$field";
}
elsif ( $field->is_control_field() ) {
$string = $field->data();
} elsif ( defined $component->{subfields} && keys %{$component->{subfields}} ){
$string = $field->as_string(
join('', keys %{ $component->{ subfields } }), ' ' # ' ' as separator
);
} else {
$string = $field->as_string();
}
if ($component->{'length'}>0) {
$string= substr($string, $component->{'offset'}, $component->{'length'});
} elsif ($component->{'offset'}) {
$string= substr($string, $component->{'offset'});
}
my $norms = $component->{'norms'};
my $key = $string;
foreach my $norm ( @{ $norms } ) {
if ( grep { $norm eq $_ } valid_normalization_routines() ) {
if ( $norm eq 'remove_spaces' ) {
$key = remove_spaces($key);
}
elsif ( $norm eq 'upper_case' ) {
$key = upper_case($key);
}
elsif ( $norm eq 'lower_case' ) {
$key = lower_case($key);
}
elsif ( $norm eq 'legacy_default' ) {
$key = legacy_default($key);
}
elsif ( $norm eq 'ISBN' ) {
$key = ISBN($key);
}
} else {
warn "Invalid normalization routine required ($norm)"
unless $norm eq 'none';
}
}
if ($i == 0) {
push @keys, $key if $key;
} else {
$keys[$j] .= " $key" if $key;
}
}
}
return @keys;
}
sub _parse_match_component {
my $input_component = shift;
my $component = {};
$component->{'tag'} = $input_component->{'tag'};
$component->{'subfields'} = { map { $_ => 1 } split(//, $input_component->{'subfields'}) };
$component->{'offset'} = exists($input_component->{'offset'}) ? $input_component->{'offset'} : -1;
$component->{'length'} = $input_component->{'length'} ? $input_component->{'length'} : 0;
$component->{'norms'} = $input_component->{'norms'} ? $input_component->{'norms'} : [];
return $component;
}
sub valid_normalization_routines {
return (
'remove_spaces',
'upper_case',
'lower_case',
'legacy_default',
'ISBN'
);
}
1;
__END__
=head1 AUTHOR
Koha Development Team <http://koha-community.org/>
Galen Charlton <galen.charlton@liblime.com>
=cut