From 75bd4b2dacbe00bca5d90fe689f546efd8be4cde Mon Sep 17 00:00:00 2001 From: Galen Charlton Date: Wed, 14 Nov 2007 16:10:10 -0600 Subject: [PATCH] bibliographic matching enhancements * serialization of matching rules to database * added two default matching rules for MARC21 * update import staging CGI to choose matching rule Signed-off-by: Chris Cormack Signed-off-by: Joshua Ferraro --- C4/Matcher.pm | 562 ++++++++++++++++-- .../marc21_default_matching_rules.sql | 38 ++ .../marc21_default_matching_rules.txt | 5 + .../en/modules/tools/stage-marc-import.tmpl | 27 +- misc/stage_biblios_file.pl | 4 +- tools/stage-marc-import.pl | 24 +- 6 files changed, 593 insertions(+), 67 deletions(-) create mode 100644 installer/data/mysql/en/marcflavour/marc21/optional/marc21_default_matching_rules.sql create mode 100644 installer/data/mysql/en/marcflavour/marc21/optional/marc21_default_matching_rules.txt diff --git a/C4/Matcher.pm b/C4/Matcher.pm index b374fc1e25..16f42c268f 100644 --- a/C4/Matcher.pm +++ b/C4/Matcher.pm @@ -36,12 +36,20 @@ C4::Matcher - find MARC records matching another one =over 4 +my @matchers = C4::Matcher::GetMatcherList(); + my $matcher = C4::Matcher->new($record_type); $matcher->threshold($threshold); -$matcher->add_matchpoint($source_tag, $source_subfields, $source_normalizer, - $index, $score); -$matcher->add_required_check($check_name, $source_tag, $source_subfields, $source_normalizer, - $target_tag, $target_subfields, $target_normalizer); +$matcher->code($code); +$matcher->description($description); + +$matcher->add_simple_matchpoint('isbn', 1000, '020', 'a', -1, 0, ''); +$matcher->add_simple_matchpoint('Date', 1000, '008', '', 7, 4, ''); +$matcher->add_matchpoint('isbn', 1000, [ { tag => '020', subfields => 'a', norms => [] } ]); + +$matcher->add_simple_required_check('245', 'a', -1, 0, '', '245', 'a', -1, 0, ''); +$matcher->add_required_check([ { tag => '245', subfields => 'a', norms => [] } ], + [ { tag => '245', subfields => 'a', norms => [] } ]); my @matches = $matcher->get_matches($marc_record, $max_matches); @@ -56,6 +64,39 @@ foreach $match (@matches) { =back +=head1 FUNCTIONS + +=cut + +=head2 GetMatcherList + +=over 4 + +my @matchers = C4::Matcher::GetMatcherList(); + +=back + +Returns an array of hashrefs list all matchers +present in the database. Each hashref includes: + +matcher_id +code +description + +=cut + +sub GetMatcherList { + my $dbh = C4::Context->dbh; + + my $sth = $dbh->prepare_cached("SELECT matcher_id, code, description FROM marc_matchers ORDER BY matcher_id"); + $sth->execute(); + my @results = (); + while (my $row = $sth->fetchrow_hashref) { + push @results, $row; + } + return @results; +} + =head1 METHODS =cut @@ -79,6 +120,8 @@ sub new { my $class = shift; my $self = {}; + $self->{'id'} = undef; + if ($#_ > -1) { $self->{'record_type'} = shift; } else { @@ -91,6 +134,9 @@ sub new { $self->{'threshold'} = 1000; } + $self->{'code'} = ''; + $self->{'description'} = ''; + $self->{'matchpoints'} = []; $self->{'required_checks'} = []; @@ -98,12 +144,232 @@ sub new { return $self; } +=head2 fetch + +=over 4 + +my $matcher = C4::Matcher->fetch($id); + +=back + +Creates a matcher object from the version stored +in the database. If a matcher with the given +id does not exist, returns undef. + +=cut + +sub fetch { + my $class = shift; + my $id = shift; + my $dbh = C4::Context->dbh(); + + my $sth = $dbh->prepare_cached("SELECT * FROM marc_matchers WHERE matcher_id = ?"); + $sth->execute($id); + my $row = $sth->fetchrow_hashref; + return undef unless defined $row; + + my $self = {}; + $self->{'id'} = $row->{'matcher_id'}; + $self->{'record_type'} = $row->{'record_type'}; + $self->{'code'} = $row->{'code'}; + $self->{'description'} = $row->{'description'}; + $self->{'threshold'} = int($row->{'threshold'}); + bless $self, $class; + + # matchpoints + $self->{'matchpoints'} = []; + $sth = $dbh->prepare_cached("SELECT * FROM matcher_matchpoints WHERE matcher_id = ? ORDER BY matchpoint_id"); + $sth->execute($self->{'id'}); + while (my $row = $sth->fetchrow_hashref) { + my $matchpoint = $self->_fetch_matchpoint($row->{'matchpoint_id'}); + push @{ $self->{'matchpoints'} }, $matchpoint; + } + + # required checks + $self->{'required_checks'} = []; + $sth = $dbh->prepare_cached("SELECT * FROM matchchecks WHERE matcher_id = ? ORDER BY matchcheck_id"); + $sth->execute($self->{'id'}); + while (my $row = $sth->fetchrow_hashref) { + my $source_matchpoint = $self->_fetch_matchpoint($row->{'source_matchpoint_id'}); + my $target_matchpoint = $self->_fetch_matchpoint($row->{'target_matchpoint_id'}); + my $matchcheck = {}; + $matchcheck->{'source_matchpoint'} = $source_matchpoint; + $matchcheck->{'target_matchpoint'} = $target_matchpoint; + push @{ $self->{'required_checks'} }, $matchcheck; + } + + return $self; +} + +sub _fetch_matchpoint { + my $self = shift; + my $matchpoint_id = shift; + + my $dbh = C4::Context->dbh; + my $sth = $dbh->prepare_cached("SELECT * FROM matchpoints WHERE matchpoint_id = ?"); + $sth->execute($matchpoint_id); + my $row = $sth->fetchrow_hashref; + my $matchpoint = {}; + $matchpoint->{'index'} = $row->{'search_index'}; + $matchpoint->{'score'} = int($row->{'score'}); + $sth->finish(); + + $matchpoint->{'components'} = []; + $sth = $dbh->prepare_cached("SELECT * FROM matchpoint_components WHERE matchpoint_id = ? ORDER BY sequence"); + $sth->execute($matchpoint_id); + while ($row = $sth->fetchrow_hashref) { + my $component = {}; + $component->{'tag'} = $row->{'tag'}; + $component->{'subfields'} = { map { $_ => 1 } split(//, $row->{'subfields'}) }; + $component->{'offset'} = int($row->{'offset'}); + $component->{'length'} = int($row->{'length'}); + $component->{'norms'} = []; + my $sth2 = $dbh->prepare_cached("SELECT * + FROM matchpoint_component_norms + WHERE matchpoint_component_id = ? ORDER BY sequence"); + $sth2->execute($row->{'matchpoint_component_id'}); + while (my $row2 = $sth2->fetchrow_hashref) { + push @{ $component->{'norms'} }, $row2->{'norm_routine'}; + } + push @{ $matchpoint->{'components'} }, $component; + } + return $matchpoint; +} + +=head2 store + +=over 4 + +my $id = $matcher->store(); + +=back + +Stores matcher in database. The return value is the ID +of the marc_matchers row. If the matcher was +previously retrieved from the database via the fetch() +method, the DB representation of the matcher +is replaced. + +=cut + +sub store { + my $self = shift; + + if (defined $self->{'id'}) { + # update + $self->_del_matcher_components(); + $self->_update_marc_matchers(); + } else { + # create new + $self->_new_marc_matchers(); + } + $self->_store_matcher_components(); + return $self->{'id'}; +} + +sub _del_matcher_components { + my $self = shift; + + my $dbh = C4::Context->dbh(); + my $sth = $dbh->prepare_cached("DELETE FROM matchpoints WHERE matcher_id = ?"); + $sth->execute($self->{'id'}); + $sth = $dbh->prepare_cached("DELETE FROM matchchecks WHERE matcher_id = ?"); + $sth->execute($self->{'id'}); + # foreign key delete cascades take care of deleting relevant rows + # from matcher_matchpoints, matchpoint_components, and + # matchpoint_component_norms +} + +sub _update_marc_matchers { + my $self = shift; + + my $dbh = C4::Context->dbh(); + my $sth = $dbh->prepare_cached("UPDATE marc_matchers + SET code = ?, + description = ?, + record_type = ?, + threshold = ? + WHERE matcher_id = ?"); + $sth->execute($self->{'code'}, $self->{'description'}, $self->{'record_type'}, $self->{'threshold'}, $self->{'id'}); +} + +sub _new_marc_matchers { + my $self = shift; + + my $dbh = C4::Context->dbh(); + my $sth = $dbh->prepare_cached("INSERT INTO marc_matchers + (code, description, record_type, threshold) + VALUES (?, ?, ?, ?)"); + $sth->execute($self->{'code'}, $self->{'description'}, $self->{'record_type'}, $self->{'threshold'}); + $self->{'id'} = $dbh->{'mysql_insertid'}; +} + +sub _store_matcher_components { + my $self = shift; + + my $dbh = C4::Context->dbh(); + my $sth; + my $matcher_id = $self->{'id'}; + foreach my $matchpoint (@{ $self->{'matchpoints'}}) { + my $matchpoint_id = $self->_store_matchpoint($matchpoint); + $sth = $dbh->prepare_cached("INSERT INTO matcher_matchpoints (matcher_id, matchpoint_id) + VALUES (?, ?)"); + $sth->execute($matcher_id, $matchpoint_id); + } + foreach my $matchcheck (@{ $self->{'required_checks'} }) { + my $source_matchpoint_id = $self->_store_matchpoint($matchcheck->{'source_matchpoint'}); + my $target_matchpoint_id = $self->_store_matchpoint($matchcheck->{'target_matchpoint'}); + $sth = $dbh->prepare_cached("INSERT INTO matchchecks + (matcher_id, source_matchpoint_id, target_matchpoint_id) + VALUES (?, ?, ?)"); + $sth->execute($matcher_id, $source_matchpoint_id, $target_matchpoint_id); + } + +} + +sub _store_matchpoint { + my $self = shift; + my $matchpoint = shift; + + my $dbh = C4::Context->dbh(); + my $sth; + my $matcher_id = $self->{'id'}; + $sth = $dbh->prepare_cached("INSERT INTO matchpoints (matcher_id, search_index, score) + VALUES (?, ?, ?)"); + $sth->execute($matcher_id, $matchpoint->{'index'}, $matchpoint->{'score'}); + my $matchpoint_id = $dbh->{'mysql_insertid'}; + my $seqnum = 0; + foreach my $component (@{ $matchpoint->{'components'} }) { + $seqnum++; + $sth = $dbh->prepare_cached("INSERT INTO matchpoint_components + (matchpoint_id, sequence, tag, subfields, offset, length) + VALUES (?, ?, ?, ?, ?, ?)"); + $sth->bind_param(1, $matchpoint_id); + $sth->bind_param(2, $seqnum); + $sth->bind_param(3, $component->{'tag'}); + $sth->bind_param(4, join "", sort keys %{ $component->{'subfields'} }); + $sth->bind_param(5, $component->{'offset'}); + $sth->bind_param(6, $component->{'length'}); + $sth->execute(); + my $matchpoint_component_id = $dbh->{'mysql_insertid'}; + my $normseq = 0; + foreach my $norm (@{ $component->{'norms'} }) { + $normseq++; + $sth = $dbh->prepare_cached("INSERT INTO matchpoint_component_norms + (matchpoint_component_id, sequence, norm_routine) + VALUES (?, ?, ?)"); + $sth->execute($matchpoint_component_id, $normseq, $norm); + } + } + return $matchpoint_id; +} + =head2 threshold =over 4 $matcher->threshold(1000); -my $threshhold = $matcher->threshhold(); +my $threshold = $matcher->threshold(); =back @@ -116,40 +382,172 @@ sub threshold { @_ ? $self->{'threshold'} = shift : $self->{'threshold'}; } +=head2 code + +=over 4 + +$matcher->code('ISBN'); +my $code = $matcher->code(); + +=back + +Accessor method. + +=cut + +sub code { + my $self = shift; + @_ ? $self->{'code'} = shift : $self->{'code'}; +} + +=head2 description + +=over 4 + +$matcher->description('match on ISBN'); +my $description = $matcher->description(); + +=back + +Accessor method. + +=cut + +sub description { + my $self = shift; + @_ ? $self->{'description'} = shift : $self->{'description'}; +} + =head2 add_matchpoint =over 4 -$matcher->add_matchpoint($source_tag, $source_subfields, $source_normalizer, - $index, $score); +$matcher->add_matchpoint($index, $score, $matchcomponents); + +=back + +Adds a matchpoint that may include multiple components. The $index +parameter identifies the index that will be searched, while $score +is the weight that will be added if a match is found. + +$matchcomponents should be a reference to an array of matchpoint +compoents, each of which should be a hash containing the following +keys: + tag + subfields + offset + length + norms + +The normalization_rules value should in turn be a reference to an +array, each element of which should be a reference to a +normalization subroutine (under C4::Normalize) to be applied +to the source string. + +=cut + +sub add_matchpoint { + my $self = shift; + my ($index, $score, $matchcomponents) = @_; + + my $matchpoint = {}; + $matchpoint->{'index'} = $index; + $matchpoint->{'score'} = $score; + $matchpoint->{'components'} = []; + foreach my $input_component (@{ $matchcomponents }) { + push @{ $matchpoint->{'components'} }, _parse_match_component($input_component); + } + push @{ $self->{'matchpoints'} }, $matchpoint; +} + +=head2 add_simple_matchpoint + +=over 4 + +$matcher->add_simple_matchpoint($index, $score, $source_tag, $source_subfields, + $source_offset, $source_length, + $source_normalizer); =back -Adds a matchpoint rule -- after composing a key based on the source tag and subfields, +Adds a simple matchpoint rule -- after composing a key based on the source tag and subfields, normalized per the normalization fuction, search the index. All records retrieved will receive the assigned score. =cut -sub add_matchpoint { +sub add_simple_matchpoint { my $self = shift; - my ($source_tag, $source_subfields, $source_normalizer, $index, $score) = @_; - - # FIXME - $source_normalizer not used yet - my $matchpoint = { - 'source_tag' => $source_tag, - 'source_subfields' => { map { $_ => 1 } split(//, $source_subfields) }, - 'source_normalizer' => $source_normalizer, - 'index' => $index, - 'score' => $score - }; - push @{ $self->{'matchpoints'} }, $matchpoint; + my ($index, $score, $source_tag, $source_subfields, $source_offset, $source_length, $source_normalizer) = @_; + + $self->add_matchpoint($index, $score, [ + { tag => $source_tag, subfields => $source_subfields, + offset => $source_offset, length => $source_length, + norms => [ $source_normalizer ] + } + ]); } =head2 add_required_check -$matcher->add_required_check($check_name, $source_tag, $source_subfields, $source_normalizer, - $target_tag, $target_subfields, $target_normalizer); +=over 4 + +$match->add_required_check($source_matchpoint, $target_matchpoint); + +=back + +Adds a required check definition. A required check means that in +order for a match to be considered valid, the key derived from the +source (incoming) record must match the key derived from the target +(already in DB) record. + +Unlike a regular matchpoint, only the first repeat of each tag +in the source and target match criteria are considered. + +A typical example of a required check would be verifying that the +titles and publication dates match. + +$source_matchpoint and $target_matchpoint are each a reference to +an array of hashes, where each hash follows the same definition +as the matchpoint component specification in add_matchpoint, i.e., + + tag + subfields + offset + length + norms + +The normalization_rules value should in turn be a reference to an +array, each element of which should be a reference to a +normalization subroutine (under C4::Normalize) to be applied +to the source string. + +=cut + +sub add_required_check { + my $self = shift; + my ($source_matchpoint, $target_matchpoint) = @_; + + my $matchcheck = {}; + $matchcheck->{'source_matchpoint'}->{'index'} = ''; + $matchcheck->{'source_matchpoint'}->{'score'} = 0; + $matchcheck->{'source_matchpoint'}->{'components'} = []; + $matchcheck->{'target_matchpoint'}->{'index'} = ''; + $matchcheck->{'target_matchpoint'}->{'score'} = 0; + $matchcheck->{'target_matchpoint'}->{'components'} = []; + foreach my $input_component (@{ $source_matchpoint }) { + push @{ $matchcheck->{'source_matchpoint'}->{'components'} }, _parse_match_component($input_component); + } + foreach my $input_component (@{ $target_matchpoint }) { + push @{ $matchcheck->{'target_matchpoint'}->{'components'} }, _parse_match_component($input_component); + } + push @{ $self->{'required_checks'} }, $matchcheck; +} + +=head2 add_simple_required_check + +$matcher->add_simple_required_check($source_tag, $source_subfields, $source_offset, $source_length, $source_normalizer, + $target_tag, $target_subfields, $target_offset, $target_length, $target_normalizer); =over 4 @@ -160,25 +558,23 @@ must match for a match to be considered valid. =cut -sub add_required_check { +sub add_simple_required_check { my $self = shift; - my ($check_name, $source_tag, $source_subfields, $source_normalizer, $target_tag, $target_subfields, $target_normalizer) = @_; - - my $check = { - 'check_name' => $check_name, - 'source_tag' => $source_tag, - 'source_subfields' => { map { $_ => 1 } split(//, $source_subfields) }, - 'source_normalizer' => $source_normalizer, - 'target_tag' => $target_tag, - 'target_subfields' => { map { $_ => 1 } split(//, $target_subfields) }, - 'target_normalizer' => $target_normalizer - }; - - push @{ $self->{'required_checks'} }, $check; + my ($source_tag, $source_subfields, $source_offset, $source_length, $source_normalizer, + $target_tag, $target_subfields, $target_offset, $target_length, $target_normalizer) = @_; + + $self->add_required_check( + [ { tag => $source_tag, subfields => $source_subfields, offset => $source_offset, length => $source_length, + norms => [ $source_normalizer ] } ], + [ { tag => $target_tag, subfields => $target_subfields, offset => $target_offset, length => $target_length, + norms => [ $target_normalizer ] } ] + ); } =head2 find_matches +=over 4 + my @matches = $matcher->get_matches($marc_record, $max_matches); foreach $match (@matches) { # matches already sorted in order of @@ -212,8 +608,7 @@ sub get_matches { my %matches = (); foreach my $matchpoint (@{ $self->{'matchpoints'} }) { - my @source_keys = _get_match_keys($source_record, $matchpoint->{'source_tag'}, - $matchpoint->{'source_subfields'}, $matchpoint->{'source_normalizer'}); + my @source_keys = _get_match_keys($source_record, $matchpoint); next if scalar(@source_keys) == 0; # build query my $query = join(" or ", map { "$matchpoint->{'index'}=$_" } @source_keys); @@ -229,7 +624,10 @@ sub get_matches { # get rid of any that don't meet the threshold %matches = map { ($matches{$_} >= $self->{'threshold'}) ? ($_ => $matches{$_}) : () } keys %matches; - # FIXME - implement record checks + # get rid of any that don't meet the required checks + %matches = map { _passes_required_checks($source_record, $_, $self->{'required_checks'}) ? ($_ => $matches{$_}) : () } + keys %matches; + my @results = (); foreach my $marcblob (keys %matches) { my $target_record = MARC::Record->new_from_usmarc($marcblob); @@ -247,30 +645,90 @@ sub get_matches { } -sub _get_match_keys { - my ($source_record, $source_tag, $source_subfields, $source_normalizer) = @_; +sub _passes_required_checks { + my ($source_record, $target_blob, $matchchecks) = @_; + my $target_record = MARC::Record->new_from_usmarc($target_blob); # FIXME -- need to avoid parsing record twice + + # no checks supplied == automatic pass + return 1 if $#{ $matchchecks } == -1; + + foreach my $matchcheck (@{ $matchchecks }) { + my $source_key = join "", _get_match_keys($source_record, $matchcheck->{'source_matchpoint'}); + my $target_key = join "", _get_match_keys($target_record, $matchcheck->{'target_matchpoint'}); + return 0 unless $source_key eq $target_key; + } + return 1; +} - use Data::Dumper; +sub _get_match_keys { + my $source_record = shift; + my $matchpoint = shift; + my $check_only_first_repeat = @_ ? shift : 0; + + # If there is more than one component to the matchpoint (e.g., + # matchpoint includes both 003 and 001), any repeats + # of the first component's tag are identified; repeats + # of the subsequent components' tags are appended to + # each parallel key dervied from the first component, + # up to the number of repeats of the first component's tag. + # + # For example, if the record has one 003 and two 001s, only + # one key is retrieved because there is only one 003. The key + # will consist of the contents of the first 003 and first 001. + # + # If there are two 003s and two 001s, there will be two keys: + # first 003 + first 001 + # second 003 + second 001 + my @keys = (); - foreach my $field ($source_record->field($source_tag)) { - if ($field->is_control_field()) { - push @keys, _normalize($field->data()); - } else { + for (my $i = 0; $i <= $#{ $matchpoint->{'components'} }; $i++) { + my $component = $matchpoint->{'components'}->[$i]; + my $j = -1; + FIELD: foreach my $field ($source_record->field($component->{'tag'})) { + $j++; + last FIELD if $j > 0 and $check_only_first_repeat; + last FIELD if $i > 0 and $j > $#keys; my $key = ""; - foreach my $subfield ($field->subfields()) { - if (exists $source_subfields->{$subfield->[0]}) { - $key .= " " . $subfield->[1]; + if ($field->is_control_field()) { + if ($component->{'length'}) { + $key = _normalize(substr($field->data(), $component->{'offset'}, $component->{'length'})) + # FIXME normalize, substr + } else { + $key = _normalize($field->data()); } + } else { + foreach my $subfield ($field->subfields()) { + if (exists $component->{'subfields'}->{$subfield->[0]}) { + $key .= " " . $subfield->[1]; + } + } + $key = _normalize($key); + } + if ($i == 0) { + push @keys, $key if $key; + } else { + $keys[$j] .= " $key" if $key; } - $key = _normalize($key); - - push @keys, $key if $key; } } return @keys; } + +sub _parse_match_component { + my $input_component = shift; + + my $component = {}; + $component->{'tag'} = $input_component->{'tag'}; + $component->{'subfields'} = { map { $_ => 1 } split(//, $input_component->{'subfields'}) }; + $component->{'offset'} = exists($input_component->{'offset'}) ? $input_component->{'offset'} : -1; + $component->{'length'} = $input_component->{'length'} ? $input_component->{'length'} : 0; + $component->{'norms'} = $input_component->{'norms'} ? $input_component->{'norms'} : []; + + return $component; +} + # FIXME - default normalizer sub _normalize { my $value = uc shift; diff --git a/installer/data/mysql/en/marcflavour/marc21/optional/marc21_default_matching_rules.sql b/installer/data/mysql/en/marcflavour/marc21/optional/marc21_default_matching_rules.sql new file mode 100644 index 0000000000..e592b391af --- /dev/null +++ b/installer/data/mysql/en/marcflavour/marc21/optional/marc21_default_matching_rules.sql @@ -0,0 +1,38 @@ +-- +-- Default MARC matching rules for Koha +-- +-- Copyright (C) 2007 LiblimeA +-- +-- This file is part of Koha. +-- +-- Koha is free software; you can redistribute it and/or modify it under the +-- terms of the GNU General Public License as published by the Free Software +-- Foundation; either version 2 of the License, or (at your option) any later +-- version. +-- +-- Koha is distributed in the hope that it will be useful, but WITHOUT ANY +-- WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR +-- A PARTICULAR PURPOSE. See the GNU General Public License for more details. +-- +-- You should have received a copy of the GNU General Public License along with +-- Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place, +-- Suite 330, Boston, MA 02111-1307 USA + +INSERT INTO marc_matchers (code, description, record_type, threshold) + VALUES ('ISBN', '020$a', 'biblio', 1000); +INSERT INTO matchpoints (matcher_id, search_index, score) SELECT MAX(matcher_id), 'isbn', 1000 FROM marc_matchers; +INSERT INTO matcher_matchpoints SELECT MAX(matcher_id), MAX(matchpoint_id) FROM matchpoints; +INSERT INTO matchpoint_components (matchpoint_id, sequence, tag, subfields) + SELECT MAX(matchpoint_id), 1, '020', 'a' FROM matchpoints; +INSERT INTO matchpoint_component_norms (matchpoint_component_id, sequence, norm_routine) + SELECT MAX(matchpoint_component_id), 1, 'ISBN' FROM matchpoint_components; + +INSERT INTO marc_matchers (code, description, record_type, threshold) + VALUES ('ISSN', '022$a', 'biblio', 1000); +INSERT INTO matchpoints (matcher_id, search_index, score) SELECT MAX(matcher_id), 'isbn', 1000 FROM marc_matchers; +INSERT INTO matcher_matchpoints SELECT MAX(matcher_id), MAX(matchpoint_id) FROM matchpoints; +INSERT INTO matchpoint_components (matchpoint_id, sequence, tag, subfields) + SELECT MAX(matchpoint_id), 1, '022', 'a' FROM matchpoints; +INSERT INTO matchpoint_component_norms (matchpoint_component_id, sequence, norm_routine) + SELECT MAX(matchpoint_component_id), 1, 'ISSN' FROM matchpoint_components; + diff --git a/installer/data/mysql/en/marcflavour/marc21/optional/marc21_default_matching_rules.txt b/installer/data/mysql/en/marcflavour/marc21/optional/marc21_default_matching_rules.txt new file mode 100644 index 0000000000..471db24b3e --- /dev/null +++ b/installer/data/mysql/en/marcflavour/marc21/optional/marc21_default_matching_rules.txt @@ -0,0 +1,5 @@ +Selected matching rules for MARC 21 bibliographic +records, including: + +ISBN +ISSN diff --git a/koha-tmpl/intranet-tmpl/prog/en/modules/tools/stage-marc-import.tmpl b/koha-tmpl/intranet-tmpl/prog/en/modules/tools/stage-marc-import.tmpl index 57b298d8d2..71ca8a6c80 100644 --- a/koha-tmpl/intranet-tmpl/prog/en/modules/tools/stage-marc-import.tmpl +++ b/koha-tmpl/intranet-tmpl/prog/en/modules/tools/stage-marc-import.tmpl @@ -23,9 +23,14 @@
  • records not staged because of MARC error
  • records staged
  • -
  • records with at least one match in catalogue
  • +
  • records with at least one match in catalogue per matching rule + ""
  • -
  • Did not check for matches with existing records in catalogue
  • + +
  • Record matching failed -- unable to retrieve selected matching rule.
  • + +
  • Did not check for matches with existing records in catalogue
  • +
  • item records found and staged
  • ">Manage staged records
  • @@ -55,12 +60,18 @@ -
    - Look for existing records in catalogue?
    1. -
    2. -
    3. - -
    +
    + Look for existing records in catalogue? +
    1. + +
    +
    Check for embedded item record data?
      diff --git a/misc/stage_biblios_file.pl b/misc/stage_biblios_file.pl index 0c70139545..a741990ce9 100755 --- a/misc/stage_biblios_file.pl +++ b/misc/stage_biblios_file.pl @@ -62,7 +62,9 @@ sub process_batch { my $num_with_matches = 0; if ($match_bibs) { my $matcher = C4::Matcher->new('biblio'); - $matcher->add_matchpoint("020", "a", '', 'isbn', 1000); + $matcher->add_simple_matchpoint('isbn', 1000, '020', 'a', -1, 0, ''); + $matcher->add_simple_required_check('245', 'a', -1, 0, '', + '245', 'a', -1, 0, ''); print "... looking for matches with records already in database\n"; $num_with_matches = BatchFindBibDuplicates($batch_id, $matcher, 10, 100, \&print_progress); print "... finished looking for matches\n"; diff --git a/tools/stage-marc-import.pl b/tools/stage-marc-import.pl index 625362631d..7f8063d3d6 100755 --- a/tools/stage-marc-import.pl +++ b/tools/stage-marc-import.pl @@ -59,7 +59,7 @@ my $input = new CGI; my $dbh = C4::Context->dbh; my $uploadmarc=$input->param('uploadmarc'); -my $check_for_matches = $input->param('check_for_matches'); +my $matcher_id = $input->param('matcher'); my $parse_items = $input->param('parse_items'); my $comments = $input->param('comments'); my $syntax = $input->param('syntax'); @@ -84,13 +84,19 @@ if ($uploadmarc && length($uploadmarc)>0) { # FIXME branch code my ($batch_id, $num_valid, $num_items, @import_errors) = BatchStageMarcRecords($syntax, $marcrecord, $filename, $comments, '', $parse_items, 0); - my $matcher = C4::Matcher->new('biblio'); - $matcher->add_matchpoint("020", "a", '', 'isbn', 1000); my $num_with_matches = 0; my $checked_matches = 0; - if ($check_for_matches) { - $checked_matches = 1; - $num_with_matches = BatchFindBibDuplicates($batch_id, $matcher); + my $matcher_failed = 0; + my $matcher_code = ""; + if ($matcher_id ne "") { + my $matcher = C4::Matcher->fetch($matcher_id); + if (defined $matcher) { + $checked_matches = 1; + $matcher_code = $matcher->code(); + $num_with_matches = BatchFindBibDuplicates($batch_id, $matcher); + } else { + $matcher_failed = 1; + } } $template->param(staged => $num_valid, @@ -99,9 +105,15 @@ if ($uploadmarc && length($uploadmarc)>0) { import_errors => scalar(@import_errors), total => $num_valid + scalar(@import_errors), checked_matches => $checked_matches, + matcher_failed => $matcher_failed, + matcher_code => $matcher_code, import_batch_id => $batch_id ); +} else { + # initial form + my @matchers = C4::Matcher::GetMatcherList(); + $template->param(available_matchers => \@matchers); } output_html_with_http_headers $input, $cookie, $template->output; -- 2.39.5