Browse Source

Bug 8218 : Add a maintenance script to sanitize biblio records

This patch adds:
- a new maintenance script batch_sanitize_records
- a new subroutine C4::Charset::SanitizeRecord
- new unit tests for the new subroutine

Test plan:
1/ prove t/db_dependent/Charset.t
2/ Create a record containing "&" (which may be followed by as many
'amp;' as you want) in one of its fields, and do the same for the field
linked to biblioitems.url.
The url should not be sanitized, as it may contain "&".
3/ Launch the maintenance script with the -h parameter to see how to use
it.
4/ Launch the script using the different parameters:
 --filename=FILENAME
 --biblionumbers='XXX'
 --auto-search

The auto-search option sanitizes all records containing "&" in
the marcxml field.

Use the verbose flag for testing.
Without the --confirm flag, nothing is done.

5/ Use the --confirm flag and verify in the biblioitems.marcxml field
that the record has been sanitized.

6/ Try the --reindex flag to reindex records which have been modified.

Signed-off-by: Marcel de Rooy <m.de.rooy@rijksmuseum.nl>

Signed-off-by: Kyle M Hall <kyle@bywatersolutions.com>
Signed-off-by: Tomas Cohen Arazi <tomascohen@gmail.com>
3.18.x
Jonathan Druart 10 years ago
committed by Tomas Cohen Arazi
parent
commit
299a8a6997
  1. 64
      C4/Charset.pm
  2. 222
      misc/maintenance/batch_sanitize_records.pl
  3. 51
      t/db_dependent/Charset.t

64
C4/Charset.pm

@ -41,6 +41,7 @@ BEGIN {
SetMarcUnicodeFlag
StripNonXmlChars
nsb_clean
SanitizeRecord
);
}
@ -423,6 +424,69 @@ sub nsb_clean {
}
=head2 SanitizeRecord

    my ( $record, $record_modified ) = SanitizeRecord( $marcrecord, $biblionumber );

Sanitize a record.

This routine is called in the maintenance script misc/maintenance/batch_sanitize_records.pl.
It cleans any string with '&amp;amp;...', replacing it by '&'.

C<$biblionumber> is used to look up the framework of the record and, from it,
the field/subfield mapped to biblioitems.url. That subfield is kept exactly as
it is (it is neither cleaned nor removed), because a url may contain "&amp;".

The record is modified in place. The routine returns the record and a flag
which is 1 if at least one field has been changed, 0 otherwise.

=cut

sub SanitizeRecord {
    my ( $record, $biblionumber ) = @_;
    my $record_modified = 0;
    my $frameworkcode   = C4::Biblio::GetFrameworkCode($biblionumber);
    my ( $url_field, $url_subfield ) =
      C4::Biblio::GetMarcFromKohaField( 'biblioitems.url', $frameworkcode );

    foreach my $field ( $record->fields() ) {

        # Control fields carry a single data string and no subfields.
        if ( $field->is_control_field() ) {
            my $value           = $field->data();
            my $sanitized_value = _entity_clean($value);
            if ( $sanitized_value ne $value ) {
                $field->update($sanitized_value);
                $record_modified = 1;
            }
            next;
        }

        # The url mapping may be undefined for this framework; guard the
        # comparison so that no "uninitialized" warning is emitted.
        my $is_url_field = defined $url_field && $url_field eq $field->tag();

        my $field_modified = 0;
        my @new_subfields;
        foreach my $subfield ( $field->subfields() ) {
            my ( $code, $value ) = @{$subfield};

            # The url subfield must be copied untouched: leaving it out of
            # @new_subfields would delete it when the field is replaced.
            my $is_url_subfield =
                 $is_url_field
              && defined $url_subfield
              && $url_subfield eq $code;

            my $sanitized_value =
              $is_url_subfield ? $value : _entity_clean($value);
            push @new_subfields, $code => $sanitized_value;
            $field_modified = 1 if $sanitized_value ne $value;
        }

        # Nothing to clean in this field: leave it as it is.
        next unless $field_modified;

        my $new_field = eval {
            MARC::Field->new(
                $field->tag(),        $field->indicator(1),
                $field->indicator(2), @new_subfields
            );
        };
        if ($@) {
            warn "error : $@";
        }
        else {
            $field->replace_with($new_field);
            $record_modified = 1;
        }
    }

    return $record, $record_modified;
}
# Internal helper: return a copy of the given string in which every '&'
# followed by one or more 'amp;' is reduced to the '&' alone.
sub _entity_clean {
    ( my $cleaned = shift ) =~ s/(&)(amp;)+/$1/g;
    return $cleaned;
}
=head1 INTERNAL FUNCTIONS
=head2 _default_marc21_charconv_to_utf8

222
misc/maintenance/batch_sanitize_records.pl

@ -0,0 +1,222 @@
#!/usr/bin/perl
# This file is part of Koha.
#
# Copyright 2014 BibLibre
#
# Koha is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Koha is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Koha; if not, see <http://www.gnu.org/licenses>.
use Modern::Perl;
use C4::Charset qw( SanitizeRecord );
use C4::Context;
use DBI;
use C4::Biblio;
use Getopt::Long;
use Pod::Usage;
# Command-line switches (see the POD at the end of this file).
my ( $help, $verbose, $confirm, $biblionumbers, $reindex, $filename,
    $auto_search );

my $result = GetOptions(
    'h|help'          => \$help,
    'v|verbose'       => \$verbose,
    'c|confirm'       => \$confirm,
    'biblionumbers:s' => \$biblionumbers,
    'reindex'         => \$reindex,
    'f|filename:s'    => \$filename,
    'auto-search'     => \$auto_search,
) || pod2usage(1);

pod2usage(0) if $help;

# Exactly one source of record numbers has to be selected: a file, a list
# given on the command line, or the automatic search.
my $selected_sources = grep { $_ } ( $filename, $biblionumbers, $auto_search );

if ( $selected_sources == 0 ) {
    pod2usage(
        -exitval => 1,
        -message =>
          qq{\n\tAt least one record number source should be provided.\n}
    );
}

if ( $selected_sources > 1 ) {
    pod2usage(
        -exitval => 1,
        -message => qq{\n\tOnly one record number source should be provided.\n}
    );
}
my @biblionumbers;

# Collect the biblionumbers from the selected source: the automatic search
# (biblios_to_sanitize), a file, or the list given on the command line.
if ($auto_search) {
    @biblionumbers = biblios_to_sanitize();
}
elsif ($filename) {
    if ( -e $filename ) {
        open( my $fh, '<', $filename ) || die("Can't open $filename ($!)");

        # A named loop variable is used so that the global $_ is not clobbered.
        while ( my $line = <$fh> ) {
            chomp $line;

            # Numbers may be separated by commas or by any whitespace
            # (spaces, tabs, ...), as the documentation promises.
            push @biblionumbers, split( /[\s,]+/, $line );
        }
        close $fh;
    }
    else {
        pod2usage(
            -exitval => 1,
            -message =>
qq{\n\tThis filename does not exist. Please verify the path is correct.\n}
        );
    }
}
else {
    @biblionumbers = split m|,|, $biblionumbers if $biblionumbers;
}

# We remove spaces
s/(^\s*|\s*$)//g for @biblionumbers;

# Remove empty entries (blank lines, leading/trailing/doubled separators)
@biblionumbers = grep { !/^$/ } @biblionumbers;

say scalar(@biblionumbers) . " records to process" if $verbose;
# Biblionumbers of the records in which something had to be cleaned.
my @changes;

for my $biblionumber (@biblionumbers) {
    print "processing record $biblionumber..." if $verbose;

    unless ( $biblionumber =~ m|^\d+$| ) {
        say " skipping. ERROR: Invalid biblionumber." if $verbose;
        next;
    }

    my $record = C4::Biblio::GetMarcBiblio($biblionumber);
    unless ($record) {
        say " skipping. ERROR: Invalid record." if $verbose;
        next;
    }

    my ( $cleaned_record, $has_been_modified ) =
      C4::Charset::SanitizeRecord( $record, $biblionumber );

    if ($has_been_modified) {

        # The record is only written back when --confirm is given; without it
        # the biblionumber is still counted so the report shows what would be
        # cleaned.
        if ($confirm) {

            # GetFrameworkCode expects the biblionumber, not the MARC record.
            my $frameworkcode = C4::Biblio::GetFrameworkCode($biblionumber);
            C4::Biblio::ModBiblio( $cleaned_record, $biblionumber,
                $frameworkcode );
        }
        push @changes, $biblionumber;
        say " Done!" if $verbose;
    }
    else {
        say " Nothing todo." if $verbose;
    }
}

if ($verbose) {
    say "Total: "
      . @changes
      . " records "
      . ( $confirm ? "cleaned!" : "to clean." );
}
# Reindex the records that have actually been written back (--confirm).
if ( $reindex and $confirm and @changes ) {
    say "Now, reindexing using -b -v" if $verbose;
    my $kohapath = C4::Context->config('intranetdir');

    # List form of system(): no shell is involved, so a path containing
    # spaces or shell metacharacters cannot break the command line.
    my @cmd = (
        "$kohapath/misc/migration_tools/rebuild_zebra.pl",
        '-b', '-v',
        '-where', 'biblionumber IN ( ' . join( ',', @changes ) . ' )',
    );

    if ( system(@cmd) != 0 ) {
        my $reason =
          $? == -1
          ? "could not be executed: $!"
          : 'exited with status ' . ( $? >> 8 );
        warn "Reindexing failed: $cmd[0] $reason\n";
    }
}
# Return the list of biblionumbers selected by the query below, i.e. the
# biblioitems rows whose marcxml matches the LIKE pattern.
sub biblios_to_sanitize {
    my $sql = q{
SELECT biblionumber
FROM biblioitems
WHERE marcxml
LIKE "%&amp;amp;%"
};
    my $biblionumbers_ref =
      C4::Context->dbh->selectcol_arrayref( $sql, { Slice => {} } );
    return @{$biblionumbers_ref};
}
=head1 NAME
batch_sanitize_records - This script sanitizes biblio records, replacing '&amp;amp;amp;etc.' with '&amp;' in them.
=head1 SYNOPSIS
batch_sanitize_records.pl [-h|--help] [-v|--verbose] [-c|--confirm] [--biblionumbers=BIBLIONUMBER_LIST] [-f|--filename=FILENAME] [--auto-search] [--reindex]
Replace '&amp;' by '&' in a record. You can either give some biblionumbers, give a file containing biblionumbers, or ask for an auto-search.
=head1 OPTIONS
=over
=item B<-h|--help>
Print a brief help message
=item B<-v|--verbose>
Verbose mode.
=item B<-c|--confirm>
This flag must be provided in order for the script to actually
sanitize records. If it is not supplied, the script will
only report on the record list to process.
=item B<--biblionumbers=BIBLIONUMBER_LIST>
Give a biblionumber list using this parameter. They must be separated by comma.
=item B<-f|--filename=FILENAME>
Give a biblionumber list using a filename. One biblionumber per line, or several per line separated by a space or a comma.
=item B<--auto-search>
Automatically search records containing "&amp;" in biblioitems.marcxml or in the specified fields.
=item B<--reindex>
Reindex the modified records.
=back
=head1 AUTHOR
Alex Arnaud <alex.arnaud@biblibre.com>
Christophe Croullebois <christophe.croullebois@biblibre.com>
Jonathan Druart <jonathan.druart@biblibre.com>
=head1 COPYRIGHT
Copyright 2014 BibLibre
=head1 LICENSE
This file is part of Koha.
Koha is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later version.
You should have received a copy of the GNU General Public License along
with Koha; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
=cut

51
t/db_dependent/Charset.t

@ -0,0 +1,51 @@
use Modern::Perl;
use Test::More tests => 4;
use MARC::Record;
use C4::Biblio qw( AddBiblio SetFieldMapping GetMarcFromKohaField );
use C4::Context;
use C4::Charset qw( SanitizeRecord );
# All database changes made by this test are rolled back at the end.
my $dbh = C4::Context->dbh;
$dbh->{RaiseError} = 1;
$dbh->{AutoCommit} = 0;

my $frameworkcode = q||;

# Map biblioitems.url to 856$u for the framework used by the test.
$dbh->do(q|
    DELETE FROM marc_subfield_structure WHERE kohafield='biblioitems.url'
|);
$dbh->do(
    q|
    INSERT INTO marc_subfield_structure(frameworkcode,kohafield,tagfield,tagsubfield)
    VALUES (?, 'biblioitems.url', '856', 'u')
|, undef, $frameworkcode
);
my ( $url_field, $url_subfield ) =
  C4::Biblio::GetMarcFromKohaField( 'biblioitems.url', $frameworkcode );

my $url = q|http://www.example.org/index.pl?arg1=val1&amp;arg2=val2|;

# Build a record with an author, the given title and the test url.
my $build_record = sub {
    my ($title) = @_;
    my $new_record = MARC::Record->new();
    $new_record->append_fields(
        MARC::Field->new( '100', ' ', ' ', a => 'my author' ),
        MARC::Field->new( '245', ' ', ' ', a => $title ),
        MARC::Field->new( $url_field, ' ', ' ', $url_subfield => $url ),
    );
    return $new_record;
};

# A title with plain ampersands: nothing has to be cleaned.
my $record = $build_record->(q|My title & a word & another word|);
my ( $biblionumber, $biblioitemnumber ) = AddBiblio( $record, $frameworkcode );
my ( $sanitized_record, $has_been_modified ) =
  C4::Charset::SanitizeRecord( $record, $biblionumber );
is( $has_been_modified, 0,
    'SanitizeRecord: the record has not been modified' );
is( $sanitized_record->subfield( $url_field, $url_subfield ),
    $url, 'SanitizeRecord: the url has not been modified' );

# A title with multi-encoded ampersands: the record has to be cleaned, but
# the url must stay as it is.
$record =
  $build_record->(q|My title &amp;amp;amp; a word &amp;amp; another word|);
( $biblionumber, $biblioitemnumber ) = AddBiblio( $record, $frameworkcode );
( $sanitized_record, $has_been_modified ) =
  C4::Charset::SanitizeRecord( $record, $biblionumber );
is( $has_been_modified, 1, 'SanitizeRecord: the record has been modified' );
is( $sanitized_record->subfield( $url_field, $url_subfield ),
    $url, 'SanitizeRecord: the url has not been modified' );

$dbh->rollback;
Loading…
Cancel
Save