Koha/misc/maintenance/dedup_authorities.pl
Tomas Cohen Arazi 5d8f6a76ec
Bug 13706: Minor tidy/spelling issues
Signed-off-by: Tomas Cohen Arazi <tomascohen@theke.io>
Signed-off-by: Katrin Fischer <katrin.fischer@bsz-bw.de>
2024-04-26 14:14:54 +02:00

330 lines
11 KiB
Perl
Executable file

#!/usr/bin/perl
use Modern::Perl;
use C4::Context;
use C4::AuthoritiesMarc;
use C4::Biblio;
use C4::Search;
use C4::Charset;
use C4::Heading;
use Koha::SearchEngine;
use Koha::SearchEngine::QueryBuilder;
use Koha::Logger;
use Koha::Authorities;
use Getopt::Long;
use YAML;
use List::MoreUtils qw/uniq/;
use Pod::Usage qw( pod2usage );
=head1 NAME
misc/migration_tools/dedup_authorities.pl - Deduping authorities script
=head1 SYNOPSIS
dedup_authorities.pl [ -h ] [ -where="authid < 5000" ] -c [ -v ] [ -m d ] [ -a PERSO_NAME ]
Options:
-h --help display usage statement
-v --verbose increase verbosity, can be repeated for greater verbosity
-m --method method for choosing the reference authority, can be: date, used, or ppn (UNIMARC)
can be repeated
-w --where a SQL WHERE statement to limit the authority records checked
-c --confirm without this parameter no changes will be made, script will run in test mode
-a --authtypecode check only specified auth type, repeatable
=head1 OPTIONS
=over
=item B<--method>
Method(s) used to choose which authority to keep in case we found
duplicates.
<methods> is a string composed of letters describing what methods to use
and in which order.
Letters can be:
date: keep the most recent authority (based on 005 field)
used: keep the most used authority
ppn: PPN (UNIMARC only), keep the authority with a ppn (when some
authorities don't have one, based on 009 field)
Example:
-m ppn -m date -m used
Among the authorities that have a PPN, keep the most recent,
and if two (or more) have the same date in 005, keep the
most used.
Default is 'used'
=item B<--where>
limit the deduplication to SOME authorities only
Example:
-where="authid < 5000"
will only auths with a low auth_id (old records)
=item B<--verbose>
display verbose logging, can be repeated twice for more info
=item B<--help>
show usage information.
=back
=cut
my @methods;
my @authtypecodes;
my $help = 0;
my $confirm = 0;
my $verbose = 0;
my $wherestring = "";
my $debug = 0;
my $result = GetOptions(
"d|debug" => \$debug,
"v|verbose+" => \$verbose,
"c|confirm" => \$confirm,
"h|help" => \$help,
"w|where=s" => \$wherestring,
"m|method=s" => \@methods,
"a|authtypecode=s" => \@authtypecodes
);
pod2usage( -verbose => 2 ) if ($help);
print "RUNNING IN TEST MODE, NO CHANGES WILL BE MADE\n" unless $confirm;
$verbose = 1 unless ( $confirm || $verbose );
my @choose_subs;
@methods = ('used') unless @methods;
foreach my $method (@methods) {
if ( $method eq 'date' ) {
push @choose_subs, \&_get_date;
} elsif ( $method eq 'ppn' ) {
die 'PPN method is only valid for UNIMARC'
unless ( C4::Context->preference('marcflavour') eq 'UNIMARC' );
push @choose_subs, \&_has_ppn;
} elsif ( $method eq 'used' ) {
push @choose_subs, \&_get_usage;
} else {
warn "Choose method '$method' is not supported";
}
}
my $dbh = C4::Context->dbh;
$verbose and print "Fetching authtypecodes...\n";
my $params = undef;
if (@authtypecodes) {
$params = { authtypecode => { -in => \@authtypecodes } };
}
my @auth_types = Koha::Authority::Types->search($params)->as_list;
my %auth_match_headings =
map { $_->authtypecode => $_->auth_tag_to_report } @auth_types;
$verbose and print "Fetching authtypecodes done.\n";
my %biblios;
my $seen;
for my $authtype (@auth_types) {
my $authtypecode = $authtype->authtypecode;
my %duplicated;
my $deleted = 0;
my $updated_bibs = 0;
my $i = 0;
$verbose and print "Deduping authtype '$authtypecode' \n";
$verbose and print "Fetching authorities for '$authtypecode'... ";
my $authorities = Koha::Authorities->search( { authtypecode => $authtypecode } );
$authorities = $authorities->search( \$wherestring ) if $wherestring;
my $size = $authorities->count;
$verbose and print "$size authorities found\n";
while ( my $authority = $authorities->next ) {
next if defined $seen->{ $authority->authid };
$seen->{ $authority->authid } = 1;
$i++;
if ( $verbose >= 2 ) {
my $percentage = sprintf( "%.2f", $i * 100 / $size );
print "Processing authority " . $authority->authid . " ($i/$size $percentage%)\n";
} elsif ( $verbose and ( $i % 100 ) == 0 ) {
my $percentage = sprintf( "%.2f", $i * 100 / $size );
print "Progression for authtype '$authtypecode': $i/$size ($percentage%)\n";
}
#authority was marked as duplicate
next if defined $duplicated{ $authority->authid };
my $authrecord = C4::AuthoritiesMarc::GetAuthority( $authority->authid );
next unless $authrecord;
C4::Charset::SetUTF8Flag($authrecord);
$debug and print " Building query...\n";
my $field = $authrecord->field( $auth_match_headings{$authtypecode} );
unless ($field) {
warn " Malformed authority record, no heading";
next;
}
unless ( $field->as_string ) {
warn " Malformed authority record, blank heading";
next;
}
my $heading = C4::Heading->new_from_field( $field, undef, 1 ); #new auth heading
my $search_term = $heading->search_form;
$debug and print " Building query done\n";
$debug and print " $search_term\n";
$debug and print " Searching...";
my $builder = Koha::SearchEngine::QueryBuilder->new( { index => $Koha::SearchEngine::AUTHORITIES_INDEX } );
my $searcher = Koha::SearchEngine::Search->new( { index => $Koha::SearchEngine::AUTHORITIES_INDEX } );
my $query = $builder->build_authorities_query_compat(
['match-heading'], [''],
[''], ['exact'], [$search_term], $authtypecode, ''
);
my ( $results, $total ) = $searcher->search_auth_compat( $query, 0, 50, undef );
if ( !$results ) {
$debug and warn " " . $@;
$debug and warn " " . YAML::Dump($search_term);
$debug and warn " " . $field->as_string;
next;
}
$debug and warn " " . YAML::Dump($results);
my @recordids =
map { $_->{authid} != $authority->authid ? $_->{authid} : () } @$results;
if ( !$results || scalar(@$results) < 1 || scalar @recordids < 1 ) {
( $verbose >= 2 )
and print ' No duplicates found for ' . $heading->display_form . "\n";
next;
}
map { $seen->{$_} = 1 } @recordids;
$debug and print " Searching done.\n";
$debug and print " Choosing records...";
my ( $recordid_to_keep, @recordids_to_merge ) = _choose_records( $authority->authid, @recordids );
$debug and print " Choosing records done.\n";
unless ( !$confirm or @recordids_to_merge == 0 ) {
( $verbose >= 2 )
and print " Merging " . join( ',', @recordids_to_merge ) . " into $recordid_to_keep.\n";
for my $localauthid (@recordids_to_merge) {
next if $recordid_to_keep == $localauthid;
my $MARCto = C4::AuthoritiesMarc::GetAuthority($recordid_to_keep);
my $editedbiblios = 0;
eval {
$editedbiblios = C4::AuthoritiesMarc::merge(
{
mergefrom => $localauthid,
MARCfrom => undef,
mergeto => $recordid_to_keep,
MARCto => $MARCto
}
);
};
if ($@) {
warn " Merging $localauthid into $recordid_to_keep failed :",
$@;
} else {
print " Updated " . $editedbiblios . " biblios\n";
$updated_bibs += $editedbiblios;
$duplicated{$localauthid} = 2;
print " Deleting $localauthid\n";
C4::AuthoritiesMarc::DelAuthority( { authid => $localauthid, skip_merge => 1 } );
$deleted++;
}
}
( $verbose >= 2 ) and print " Merge done.\n";
$duplicated{$recordid_to_keep} = 1;
} elsif ( $verbose >= 2 ) {
if ( @recordids_to_merge > 0 ) {
print ' Would merge '
. join( ',', @recordids_to_merge )
. " into $recordid_to_keep ("
. $heading->display_form . ")\n";
} else {
print " No duplicates found for $recordid_to_keep\n";
}
}
}
$verbose and print "End of deduping for authtype '$authtypecode'\n";
$verbose and print "Updated $updated_bibs biblios\n";
$verbose and print "Deleted $deleted authorities\n";
}
# Update biblios
my @biblios_to_update = grep { defined $biblios{$_} and $biblios{$_} == 1 }
keys %biblios;
if ( @biblios_to_update > 0 ) {
} else {
print "No biblios to update\n";
}
exit 0;
sub _get_id {
my $record = shift;
if ( $record and ( my $field = $record->field('001') ) ) {
return $field->data();
}
return 0;
}
sub _has_ppn {
my $record = shift;
if ( $record and ( my $field = $record->field('009') ) ) {
return $field->data() ? 1 : 0;
}
return 0;
}
sub _get_date {
my $record = shift;
if ( $record and ( my $field = $record->field('005') ) ) {
return $field->data();
}
return 0;
}
sub _get_usage {
my $record = shift;
if ( $record and ( my $field = $record->field('001') ) ) {
return Koha::Authorities->get_usage_count( { authid => $field->data() } );
}
return 0;
}
=head2 _choose_records
this function takes input of candidate record ids to merging
and returns
first the record to merge to
and list of records to merge from
=cut
sub _choose_records {
my @recordids = @_;
my @records = map { C4::AuthoritiesMarc::GetAuthority($_) } @recordids;
my @candidate_auths = @records;
# See http://www.sysarch.com/Perl/sort_paper.html Schwartzian transform
my @candidate_authids =
map $_->[0] => sort { $b->[1] <=> $a->[1] || $b->[2] <=> $a->[2] || $b->[3] <=> $a->[3] }
map [ _get_id($_),
$choose_subs[0] ? $choose_subs[0]->($_) : 0,
$choose_subs[1] ? $choose_subs[1]->($_) : 0,
$choose_subs[2] ? $choose_subs[2]->($_) : 0 ] => ( $records[0], @candidate_auths );
return @candidate_authids;
}