From 9cf40a72ba2f8b206717e19e703506970cedfef2 Mon Sep 17 00:00:00 2001 From: Galen Charlton Date: Mon, 7 Jan 2013 19:12:57 -0500 Subject: [PATCH] bug 9496: improve error checking in rebuild_zebra.pl When using rebuild_zebra to index all records, skip over bibliographic or authority records that don't come out as valid XML. Also, strip extraneous XML declarations when using --nosanitize. Test plans ---------- Note that both plans assume that DOM indexing is turned on. Test plan #1 ============ [1] Run rebuild_zebra.pl with the -x --nosanitize options. Without the patch, zebraidx should terminate early and complain about invalid XML. [2] With the patch, rebuild_zebra.pl should work without error. Test plan #2 ============ [1] Intentionally make a MARCXML record invalid, e.g., by running the following SQL: UPDATE biblioitems SET marcxml = CONCAT(marcxml, 'junk') WHERE biblionumber = 123; [2] Run rebuild_zebra.pl -b -x -r [3] Without the patch, only part of the database will be indexed. [4] With the patch, rebuild_zebra.pl will not export the bad record and will give an error message saying so, but will successfully index the rest of the records. 
Signed-off-by: Galen Charlton Signed-off-by: Larry Baerveldt Signed-off-by: Mason James Signed-off-by: Paul Poulain Signed-off-by: Jared Camins-Esakov Signed-off-by: Chris Cormack --- misc/migration_tools/rebuild_zebra.pl | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/misc/migration_tools/rebuild_zebra.pl b/misc/migration_tools/rebuild_zebra.pl index 0e24df5b54..205539c0a2 100755 --- a/misc/migration_tools/rebuild_zebra.pl +++ b/misc/migration_tools/rebuild_zebra.pl @@ -11,6 +11,7 @@ use C4::Biblio; use C4::AuthoritiesMarc; use C4::Items; use Koha::RecordProcessor; +use XML::LibXML; # # script that checks zebradir structure & create directories & mandatory files if needed @@ -136,6 +137,8 @@ if ($do_munge) { munge_config(); } +my $tester = XML::LibXML->new(); + if ($authorities) { index_records('authority', $directory, $skip_export, $skip_index, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt, $authorityserverdir); } else { @@ -373,8 +376,18 @@ sub export_marc_records_from_sth { substr($itemsxml, index($itemsxml, "\n", 0) + 10); } } + # extra test to ensure that result is valid XML; otherwise + # Zebra won't parse it in DOM mode + eval { + my $doc = $tester->parse_string($marcxml); + }; + if ($@) { + warn "Error exporting record $record_number ($record_type): $@\n"; + next; + } if ( $marcxml ) { - print {$fh} $marcxml if $marcxml; + $marcxml =~ s!<\?xml version="1.0" encoding="UTF-8"\?>\n!!; + print {$fh} $marcxml; $num_exported++; } next; @@ -385,6 +398,12 @@ sub export_marc_records_from_sth { my $rec; if ($as_xml) { $rec = $marc->as_xml_record(C4::Context->preference('marcflavour')); + eval { + my $doc = $tester->parse_string($rec); + }; + if ($@) { + die "invalid XML: $@"; + } $rec =~ s!<\?xml version="1.0" encoding="UTF-8"\?>\n!!; } else { $rec = $marc->as_usmarc(); @@ -393,7 +412,8 @@ sub export_marc_records_from_sth { $num_exported++; }; if ($@) { - 
warn "Error exporting record $record_number ($record_type) ".($noxml ? "not XML" : "XML"); + warn "Error exporting record $record_number ($record_type) ".($noxml ? "not XML" : "XML"); + warn "... specific error is $@" if $verbose_logging; } } } -- 2.39.5