Koha/misc/cronjobs/MARC21_parse_test.pl
Colin Campbell 263dded818 Bug 6752: Be stricter with utf-8 encoding of output
use encoding(UTF-8) rather than utf-8 for stricter
encoding
Marking output as ':utf8' only flags the data as utf8
using :encoding(UTF-8) also checks it as valid utf-8
see binmode in perlfunc for more details
In accordance with the robustness principle input
filehandles have not been changed as code may make
the undocumented assumption that invalid utf-8 is present
in the imput
Fixes errors reported by t/00-testcritic.t
Where feasable some filehandles have been made lexical rather than
reusing global filehandle vars

Signed-off-by: Jonathan Druart <jonathan.druart@biblibre.com>
Signed-off-by: Paul Poulain <paul.poulain@biblibre.com>
2012-01-27 12:11:06 +01:00

330 lines
10 KiB
Perl
Executable file

#!/usr/bin/perl
#
# Copyright 2009 Liblime
#
# This file is part of Koha.
#
# Koha is free software; you can redistribute it and/or modify it under the
# terms of the GNU General Public License as published by the Free Software
# Foundation; either version 2 of the License, or (at your option) any later
# version.
#
# Koha is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with Koha; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
use strict;
use warnings;
use MARC::Record;
use MARC::File::XML;
use MARC::File::USMARC;
use open OUT => ':encoding(UTF-8)';
use Getopt::Long qw(:config auto_help auto_version);
use Pod::Usage;
use C4::Biblio;
use C4::Charset;
use C4::Context;
use C4::Debug;
use vars qw($VERSION);
BEGIN {
# find Koha's Perl modules
# test carefully before changing this
use FindBin;
eval { require "$FindBin::Bin/../kohalib.pl" };
$VERSION = 0.03;
}
our $debug;
## OPTIONS
my $help = 0;
my $man = 0;
my $verbose = 0;
my $limit; # undef, not zero.
my $offset = 0;
my $dump = 0;
my $all = 0;
my $summary = 1;
my $lint = 0;
my $fix = 0;
my $filename = "/tmp/MARC21_parse_test.$$.marc";
GetOptions(
'help|?' => \$help,
'man' => \$man,
'verbose' => \$verbose,
'limit=i' => \$limit,
'offset=i' => \$offset,
'filename' => \$filename,
'All!' => \$all,
'Lint!' => \$lint,
'dump!' => \$dump,
'summary!' => \$summary,
'fix!' => \$fix,
) or pod2usage(2);
pod2usage( -verbose => 2 ) if ($man);
pod2usage( -verbose => 2 ) if ($help and $verbose);
pod2usage(1) if $help;
if ($debug) {
$summary++;
$verbose++;
$lint++;
}
my $lint_object;
if ($lint) {
require MARC::Lint;
$lint_object = new MARC::Lint;
}
my $marcflavour = C4::Context->preference('marcflavour') or die "No marcflavour (MARC21 or UNIMARC) set in syspref";
(uc($marcflavour) eq 'MARC21') or die "Only marcflavour MARC21, not '$marcflavour'";
# my $countq = C4::Context->dbh->prepare("SELECT COUNT(*) FROM biblioitems"); # Too SLOW on large systems
# $countq->execute; $countq->fetchrow();
my $max = 999999; # arbitrary ceiling
$limit or $limit = $max; # limit becomes max if unspecified
if ($summary) {
printf "# Examining marcxml from %s\n", ($all ? 'ALL biblioitems' : 'SELECT biblionumbers');
printf "# limit %d, offset %d:\n", $limit, $offset;
printf "# MARC::Lint warnings: %s\n", ($lint ? 'ON' : 'OFF');
$verbose and print "# Using temp file: $filename\n"
}
MARC::File::XML->default_record_format($marcflavour) or die "FAILED MARC::File::XML->default_record_format($marcflavour)";
my $query = "SELECT * FROM biblioitems ";
my $recs;
if ($all) {
if ($limit or $offset) {
my $limit_clause = sprintf "LIMIT %d, %d", ($offset || 0), ($limit || $max);
$query .= $limit_clause;
}
$verbose and print "# Query: $query\n";
$recs = C4::Context->dbh->prepare($query);
$recs->execute();
} else {
$query .= "WHERE biblionumber=?";
$verbose and print "# Query: $query\n";
$recs = C4::Context->dbh->prepare($query);
# no execute, we execute per biblionumber
print "# Reading biblionumbers from STDIN\n";
}
sub next_row {
$all and return $recs->fetchrow_hashref(); # no WHERE clause, just get it
while (my $biblionumber = <>) {
chomp($biblionumber);
unless (defined $biblionumber) {
print "Skipping blank line $.\n";
next;
}
unless ($biblionumber =~ s/^\s*(\d+)\s*$/$1/ and $biblionumber != 0) {
print "Skipping illegal biblionumber: $biblionumber (line $.)\n";
next;
}
($verbose > 1) and printf("(%9d) plausible biblionumber\n", $biblionumber);
$recs->execute($biblionumber);
return $recs->fetchrow_hashref();
}
return undef; # just in case
}
my $ilimit = $limit;
$ilimit += $offset unless $all; # increase ilimit for offset. if $all, then offset is built into query.
my $i = 0;
my $found = 0;
my $fixed = 0;
my $fine = 0;
my $failed = 0;
my $warns = 0;
my $printline = 0;
while ( my $row = next_row() ) {
++$i;
unless ($all) {
($i > $ilimit) and last; # controls for user-input data/files
($i > $offset) or next;
}
my $xml = $row->{marcxml};
my $bibnum_prefix = sprintf "(%9d)", $row->{biblionumber};
# $xml now pared down to just the <leader> element
$verbose and printf "# %4d of %4d: biblionumber %s\n", ++$printline, $limit, $row->{biblionumber};
my $stripped = StripNonXmlChars($xml);
($stripped eq $xml) or printf "$bibnum_prefix: %d NON-XML Characters removed!!\n", (length($xml) - length($stripped));
my $record = eval { MARC::Record::new_from_xml( $stripped, 'utf8', $marcflavour ) };
if (not $record) {
$found++;
my $msg = $@ || '';
$verbose or $msg =~ s# at /usr/.*$##gs; # shorten common error message
print "$bibnum_prefix ERROR: $msg\n";
} else {
$fine++;
}
if ($lint) {
open (FILE, ">$filename") or die "Cannot write to temp file: $filename";
print FILE $xml;
close FILE;
my $file = MARC::File::XML->in( $filename );
while ( my $marc = $file->next() ) { # should be only 1
# $marc->field("245") or print "pre check_record 245 check 1: FAIL\n"; use Data::Dumper; print Dumper($marc);
$lint_object->check_record( $marc );
if ($lint_object->warnings) {
$warns++;
print join("\n", map {"$bibnum_prefix $_"} $lint_object->warnings), "\n";
}
}
}
if ($fix and not $record) {
my $record_from_blob = MARC::Record->new_from_usmarc($row->{marc});
unless ($record_from_blob) {
print "$bibnum_prefix ERROR: Cannot recover from biblioitems.marc\n";
$failed++;
} else {
my $mod = ModBiblioMarc($record_from_blob, $row->{biblionumber}, '');
if ($mod) {
$fixed++; print "$bibnum_prefix FIXED\n";
} else {
$failed++; print "$bibnum_prefix FAILED from marc. Manual intervention required.\n";
}
}
}
$dump and print $row->{marcxml}, "\n";
}
(-f $filename) and unlink ($filename); # remove tempfile
if ($summary) {
printf "# Examining marcxml from %s\n", ($all ? 'ALL biblioitems' : 'SELECT biblionumbers');
printf "# limit %d, offset %d:\n", $limit, $offset;
print "\nRESULTS (number of records)...\n";
printf " %6d -- OK \n", $fine;
printf " %6d -- w/ bad marcxml \n", $found;
printf " %6d -- w/ MARC::Lint warnings\n", $warns;
printf " %6d -- fixed from marc \n", $fixed;
printf " %6d -- failed to fix \n", $failed;
}
__END__
=head1 NAME
MARC21_parse_test.pl - Try parsing and optionally fixing biblioitems.marcxml, report errors
=head1 SYNOPSIS
MARC21_parse_test.pl [ -h | -m ] [ -v ] [ -d ] [ -s ] [ -l=N ] [ -o=N ] [ -l ] [ -f ] [ -A | filename ...]
Help Options:
-h --help -? Brief help message
-m --man Full documentation, same as --help --verbose
--version Prints version info
Feedback Options:
-d --dump Dump MARCXML of biblioitems processed, default OFF
-s --summary Print initial and closing summary of good and bad biblioitems counted, default ON
-L --Lint Show any warnings from MARC::Lint, default OFF
-v --verbose Increase verbosity of output, default OFF
Run Options:
-f --fix Replace biblioitems.marcxml from data in marc field, default OFF
-A --All Use the whole biblioitems table as target set, default OFF
-l --limit Number of biblioitems to display or fix
-o --offset Number of biblioitems to skip (not displayed or fixed)
=head1 OPTIONS
=over 8
=item B<--All>
Target the entire biblioitems table.
Beware, on a large table B<--All> can be very costly to performance.
=item B<--fix>
Without this option, no changes to any records are made. With <--fix>, the script attempts to reconstruct
biblioitems.marcxml from biblioitems.marc.
=item B<--limit=N>
Like a LIMIT statement in SQL, this constrains the number of records targeted by the script to an integer N.
This applies whether the target records are determined by user input, filenames or <--All>.
=item B<--offset=N>
Like an OFFSET statement in SQL, this tells the script to skip N of the targetted records.
The default is 0, i.e. skip none of them.
=back
The binary ON/OFF options can be negated like:
B<--nosummary> Do not display summary.
B<--nodump> Do not dump MARCXML.
B<--noLint> Do not show MARC::Lint warnings.
B<--nofix> Do not change any records. This is the default mode.
=head1 ARGUMENTS
Any number of filepath arguments can be referenced. They will be read in order and used to select the target
set of biblioitems. The file format should be simply one biblionumber per line. The B<--limit> and B<--offset>
options can still be used with biblionumbers specified from file. Files will be ignored under the B<--All> option.
=head1 DESCRIPTION
This checks for data corruption or otherwise unparsable data in biblioitems.marcxml.
As the name suggests, this script is only useful for MARC21 and will die for marcflavour UNIMARC.
Run MARC21_parse_test.pl the first time with no options and type in individual biblionumbers to test.
Or run with B<--All> to go through the entire table.
Run the script again with B<--fix> to attempt repair of the same target set.
After fixing any records, you will need to rebuild your index, e.g. B<rebuild_zebra -b -r -x>.
=head1 USAGE EXAMPLES
B<MARC21_parse_test.pl>
In the most basic form, allows you to input biblionumbers and checks them individually.
B<MARC21_parse_test.pl --fix>
Same thing but fixes them if they fail to parse.
B<MARC21_parse_test.pl --fix --limit=15 bibnumbers1.txt>
Fixes biblioitems from the first 15 biblionumbers in file bibnumbers1.txt. Multiple file arguments can be used.
B<MARC21_parse_test.pl --All --limit=3 --offset=15 --nosummary --dump>
Dumps MARCXML from the 16th, 17th and 18th records found in the database.
B<MARC21_parse_test.pl -A -l=3 -o=15 -s=0 -d>
Same thing as previous example in terse form.
=head1 TODO
Add more documentation for OPTIONS.
Update zebra status so rebuild of index is not necessary.
=head1 SEE ALSO
MARC::Lint
C4::Biblio
=cut