From d9b71ff26d6cdccea4c1d61142c5254dbaff10c1 Mon Sep 17 00:00:00 2001 From: Julian Maurice Date: Fri, 27 Jul 2012 12:30:41 +0200 Subject: [PATCH] Bug 13706: New script dedup_authorities.pl This script allow to deduplicate authorities automatically. Script is in misc/maintenance/ It works this way: 1) authorities are fetched from the database. You can limit fetched results by authtypecode, or directly by specifying WHERE clause 2) for each authority: 2.1) build a Zebra query using the 'search_form' for the heading 2.2) run the query, retrieve the results 2.3) among duplicates, choose the one we want to keep (use --choose-method option). 2.5) use C4::Authorities::merge to merge authorities 3) delete the merged authorities Use --help for more informations on options To be done: 1 - Move to module and cover with tests 2 - Add option to only merge unused authorities 3 - Expand 'ppn' option to be 'control-number' option and allow specifying field 4 - More? 1 & 2 I will attempt - 3 & 4 may be future enhancements Signed-off-by: David Nind Signed-off-by: Tomas Cohen Arazi Signed-off-by: Katrin Fischer --- misc/maintenance/dedup_authorities.pl | 330 ++++++++++++++++++++++++++ 1 file changed, 330 insertions(+) create mode 100755 misc/maintenance/dedup_authorities.pl diff --git a/misc/maintenance/dedup_authorities.pl b/misc/maintenance/dedup_authorities.pl new file mode 100755 index 0000000000..b5e25db06b --- /dev/null +++ b/misc/maintenance/dedup_authorities.pl @@ -0,0 +1,330 @@ +#!/usr/bin/perl +use Modern::Perl; +use C4::Context; +use C4::AuthoritiesMarc; +use C4::Biblio; +use C4::Search; +use C4::Charset; +use C4::Heading; +use Koha::SearchEngine; +use Koha::SearchEngine::QueryBuilder; +use Koha::Logger; + +use Koha::Authorities; + +use Getopt::Long; +use YAML; +use List::MoreUtils qw/uniq/; +use Pod::Usage qw( pod2usage ); + +=head1 NAME + +misc/migration_tools/dedup_authorities.pl - Deduping authorities script + +=head1 SYNOPSIS + +dedup_authorities.pl [ -h ] [ -where="authid < 5000" ] -c [ -v ] [ -m d ] [ -a PERSO_NAME ] + + Options: + -h --help display usage statement + -v --verbose increase verbosity, can be repeated for greater verbosity + -m --method method for choosing the reference authority, can be: date, used, or ppn (UNIMARC) + can be repeated + -w --where a SQL WHERE statement to limit the authority records checked + -c --confirm without this parameter no changes will be made, script will run in test mode + -a --authtypecode check only specified auth type, repeatable + +=head1 OPTIONS + +=over + +=item B<--method> + +Method(s) used to choose which authority to keep in case we found +duplicates. + is a string composed of letters describing what methods to use +and in which order. +Letters can be: + date: keep the most recent authority (based on 005 field) + used: keep the most used authority + ppn: PPN (UNIMARC only), keep the authority with a ppn (when some + authorities don't have one, based on 009 field) + +Example: +-m ppn -m date -m used +Among the authorities that have a PPN, keep the most recent, +and if two (or more) have the same date in 005, keep the +most used. + +Default is 'used' + +=item B<--where> + +limit the deduplication to SOME authorities only + +Example: +-where="authid < 5000" +will only auths with a low auth_id (old records) + +=item B<--verbose> + +display verbose loggin, can be repeated twice for more info + + +=item B<--help> + +show usage information. + +=back + +=cut + +my @methods; +my @authtypecodes; +my $help = 0; +my $confirm = 0; +my $verbose = 0; +my $wherestring = ""; +my $debug = 0; + +my $result = GetOptions( + "d|debug" => \$debug, + "v|verbose+" => \$verbose, + "c|confirm" => \$confirm, + "h|help" => \$help, + "w|where=s" => \$wherestring, + "m|method=s" => \@methods, + "a|authtypecode=s" => \@authtypecodes +); + +pod2usage( -verbose => 2 ) if ($help); + +print "RUNNING IN TEST MODE, NO CHANGES WILL BE MADE\n" unless $confirm; +$verbose = 1 unless ( $confirm || $verbose ); + +my @choose_subs; +@methods = ('used') unless @methods; +foreach my $method (@methods) { + if ( $method eq 'date' ) { + push @choose_subs, \&_get_date; + } elsif ( $method eq 'ppn' ) { + die 'PPN method is only valid for UNIMARC' + unless ( C4::Context->preference('marcflavour') eq 'UNIMARC' ); + push @choose_subs, \&_has_ppn; + } elsif ( $method eq 'used' ) { + push @choose_subs, \&_get_usage; + } else { + warn "Choose method '$method' is not supported"; + } +} + +my $dbh = C4::Context->dbh; + +$verbose and print "Fetching authtypecodes...\n"; +my $params = undef; +if (@authtypecodes) { + $params = { authtypecode => { -in => \@authtypecodes } }; +} +my @auth_types = Koha::Authority::Types->search($params)->as_list; +my %auth_match_headings = + map { $_->authtypecode => $_->auth_tag_to_report } @auth_types; +$verbose and print "Fetching authtypecodes done.\n"; + +my %biblios; +my $seen; + +for my $authtype (@auth_types) { + my $authtypecode = $authtype->authtypecode; + my %duplicated; + my $deleted = 0; + my $updated_bibs = 0; + my $i = 0; + $verbose and print "Deduping authtype '$authtypecode' \n"; + + $verbose and print "Fetching authorities for '$authtypecode'... "; + my $authorities = Koha::Authorities->search( { authtypecode => $authtypecode } ); + $authorities = $authorities->search( \$wherestring ) if $wherestring; + my $size = $authorities->count; + $verbose and print "$size authorities found\n"; + + while ( my $authority = $authorities->next ) { + next if defined $seen->{ $authority->authid }; + $seen->{ $authority->authid } = 1; + $i++; + if ( $verbose >= 2 ) { + my $percentage = sprintf( "%.2f", $i * 100 / $size ); + print "Processing authority " . $authority->authid . " ($i/$size $percentage%)\n"; + } elsif ( $verbose and ( $i % 100 ) == 0 ) { + my $percentage = sprintf( "%.2f", $i * 100 / $size ); + print "Progression for authtype '$authtypecode': $i/$size ($percentage%)\n"; + } + + #authority was marked as duplicate + next if defined $duplicated{ $authority->authid }; + my $authrecord = C4::AuthoritiesMarc::GetAuthority( $authority->authid ); + + next unless $authrecord; + C4::Charset::SetUTF8Flag($authrecord); + + $debug and print " Building query...\n"; + my $field = $authrecord->field( $auth_match_headings{$authtypecode} ); + unless ($field) { + warn " Malformed authority record, no heading"; + next; + } + unless ( $field->as_string ) { + warn " Malformed authority record, blank heading"; + next; + } + my $heading = C4::Heading->new_from_field( $field, undef, 1 ); #new auth heading + my $search_term = $heading->search_form; + $debug and print " Building query done\n"; + $debug and print " $search_term\n"; + + $debug and print " Searching..."; + + my $builder = Koha::SearchEngine::QueryBuilder->new( { index => $Koha::SearchEngine::AUTHORITIES_INDEX } ); + my $searcher = Koha::SearchEngine::Search->new( { index => $Koha::SearchEngine::AUTHORITIES_INDEX } ); + my $query = $builder->build_authorities_query_compat( + ['match-heading'], [''], + [''], ['exact'], [$search_term], $authtypecode, '' + ); + my ( $results, $total ) = $searcher->search_auth_compat( $query, 0, 50, undef ); + if ( !$results ) { + $debug and warn " " . $@; + $debug and warn " " . YAML::Dump($search_term); + $debug and warn " " . $field->as_string; + next; + } + + $debug and warn " " . YAML::Dump($results); + + my @recordids = + map { $_->{authid} != $authority->authid ? $_->{authid} : () } @$results; + if ( !$results || scalar(@$results) < 1 || scalar @recordids < 1 ) { + ( $verbose >= 2 ) + and print ' No duplicates found for ' . $heading->display_form . "\n"; + next; + } + map { $seen->{$_} = 1 } @recordids; + $debug and print " Searching done.\n"; + + $debug and print " Choosing records..."; + my ( $recordid_to_keep, @recordids_to_merge ) = _choose_records( $authority->authid, @recordids ); + $debug and print " Choosing records done.\n"; + unless ( !$confirm or @recordids_to_merge == 0 ) { + ( $verbose >= 2 ) + and print " Merging " . join( ',', @recordids_to_merge ) . " into $recordid_to_keep.\n"; + for my $localauthid (@recordids_to_merge) { + next if $recordid_to_keep == $localauthid; + my $MARCto = C4::AuthoritiesMarc::GetAuthority($recordid_to_keep); + my $editedbiblios = 0; + eval { + $editedbiblios = C4::AuthoritiesMarc::merge( + { + mergefrom => $localauthid, + MARCfrom => undef, + mergeto => $recordid_to_keep, + MARCto => $MARCto + } + ); + }; + if ($@) { + warn " Merging $localauthid into $recordid_to_keep failed :", + $@; + } else { + print " Updated " . $editedbiblios . " biblios\n"; + $updated_bibs += $editedbiblios; + $duplicated{$localauthid} = 2; + print " Deleting $localauthid\n"; + C4::AuthoritiesMarc::DelAuthority( { authid => $localauthid, skip_merge => 1 } ); + $deleted++; + } + } + ( $verbose >= 2 ) and print " Merge done.\n"; + $duplicated{$recordid_to_keep} = 1; + } elsif ( $verbose >= 2 ) { + if ( @recordids_to_merge > 0 ) { + print ' Would merge ' + . join( ',', @recordids_to_merge ) + . " into $recordid_to_keep (" + . $heading->display_form . ")\n"; + } else { + print " No duplicates found for $recordid_to_keep\n"; + } + } + } + $verbose and print "End of deduping for authtype '$authtypecode'\n"; + $verbose and print "Updated $updated_bibs biblios\n"; + $verbose and print "Deleted $deleted authorities\n"; +} + +# Update biblios +my @biblios_to_update = grep { defined $biblios{$_} and $biblios{$_} == 1 } + keys %biblios; +if ( @biblios_to_update > 0 ) { +} else { + print "No biblios to update\n"; +} + +exit 0; + +sub _get_id { + my $record = shift; + + if ( $record and ( my $field = $record->field('001') ) ) { + return $field->data(); + } + return 0; +} + +sub _has_ppn { + my $record = shift; + + if ( $record and ( my $field = $record->field('009') ) ) { + return $field->data() ? 1 : 0; + } + return 0; +} + +sub _get_date { + my $record = shift; + + if ( $record and ( my $field = $record->field('005') ) ) { + return $field->data(); + } + return 0; +} + +sub _get_usage { + my $record = shift; + + if ( $record and ( my $field = $record->field('001') ) ) { + return Koha::Authorities->get_usage_count( { authid => $field->data() } ); + } + return 0; +} + +=head2 _choose_records + this function takes input of candidate record ids to merging + and returns + first the record to merge to + and list of records to merge from +=cut + +sub _choose_records { + my @recordids = @_; + + my @records = map { C4::AuthoritiesMarc::GetAuthority($_) } @recordids; + my @candidate_auths = @records; + + # See http://www.sysarch.com/Perl/sort_paper.html Schwartzian transform + my @candidate_authids = + map $_->[0] => sort { $b->[1] <=> $a->[1] || $b->[2] <=> $a->[2] || $b->[3] <=> $a->[3] } + map [ _get_id($_), + $choose_subs[0] ? $choose_subs[0]->($_) : 0, + $choose_subs[1] ? $choose_subs[1]->($_) : 0, + $choose_subs[2] ? $choose_subs[2]->($_) : 0 ] => ( $records[0], @candidate_auths ); + + return @candidate_authids; +} -- 2.39.5