From 459d732180e14ed9f081d8a61f7a248c9761a43f Mon Sep 17 00:00:00 2001 From: =?utf8?q?Fr=C3=A9d=C3=A9ric=20Demians?= Date: Sun, 14 Jun 2009 07:17:40 +0200 Subject: [PATCH] Bug 3301 - Speed up rebuild_zebra script With this patch, rebuild_zebra can re-index a whole Koha DB quickly: rebuild_zebra -r -b -nosanitize Biblio (authority) records are dumped directly to a file from the marcxml field without being transformed into MARC::Record objects and corrected. DOCUMENTATION: rebuild_zebra.pl new parameter: -nosanitize export biblio/authority records directly from DB marcxml field without sanitizing records. It speeds up the dump process but could fail if the DB contains badly encoded records. Currently works only with -x and -b Signed-off-by: Galen Charlton --- misc/migration_tools/rebuild_zebra.pl | 33 +++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/misc/migration_tools/rebuild_zebra.pl b/misc/migration_tools/rebuild_zebra.pl index e6ad84e9dc..0f4b4a0b13 100755 --- a/misc/migration_tools/rebuild_zebra.pl +++ b/misc/migration_tools/rebuild_zebra.pl @@ -17,6 +17,7 @@ use C4::AuthoritiesMarc; $|=1; # flushes output my $directory; +my $nosanitize; my $skip_export; my $keep_export; my $reset; @@ -36,6 +37,7 @@ my $result = GetOptions( 'reset' => \$reset, 's' => \$skip_export, 'k' => \$keep_export, + 'nosanitize' => \$nosanitize, 'b' => \$biblios, 'noxml' => \$noxml, 'w' => \$noshadow, @@ -66,6 +68,12 @@ if ($authorities and $as_xml) { die $msg; } +if ( !$as_xml and $nosanitize ) { + my $msg = "Cannot specify both -no_xml and -nosanitize\n"; + $msg .= "Please do '$0 --help' to see usage.\n"; + die $msg; +} + if ($process_zebraqueue and ($skip_export or $reset)) { my $msg = "Cannot specify -r or -s if -z is specified\n"; $msg .= "Please do '$0 --help' to see usage.\n"; @@ -119,13 +127,13 @@ if ($do_munge) { } if ($authorities) { - index_records('authority', $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $do_not_clear_zebraqueue, 
$verbose_logging, $zebraidx_log_opt); + index_records('authority', $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt); } else { print "skipping authorities\n" if ( $verbose_logging ); } if ($biblios) { - index_records('biblio', $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt); + index_records('biblio', $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt); } else { print "skipping biblios\n" if ( $verbose_logging ); } @@ -158,7 +166,7 @@ if ($keep_export) { } sub index_records { - my ($record_type, $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt) = @_; + my ($record_type, $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt) = @_; my $num_records_exported = 0; my $num_records_deleted = 0; @@ -186,7 +194,7 @@ sub index_records { mark_zebraqueue_batch_done($entries); } else { my $sth = select_all_records($record_type); - $num_records_exported = export_marc_records_from_sth($record_type, $sth, "$directory/$record_type", $as_xml, $noxml); + $num_records_exported = export_marc_records_from_sth($record_type, $sth, "$directory/$record_type", $as_xml, $noxml, $nosanitize); unless ($do_not_clear_zebraqueue) { mark_all_zebraqueue_done($record_type); } @@ -270,7 +278,7 @@ sub select_all_biblios { } sub export_marc_records_from_sth { - my ($record_type, $sth, $directory, $as_xml, $noxml) = @_; + my ($record_type, $sth, $directory, $as_xml, $noxml, $nosanitize) = @_; my $num_exported = 0; open (OUT, ">:utf8 ", "$directory/exported_records") or die $!; @@ -278,6 +286,16 @@ sub export_marc_records_from_sth { while (my ($record_number) = $sth->fetchrow_array) { print "." 
if ( $verbose_logging ); print "\r$i" unless ($i++ %100 or !$verbose_logging); + if ( $nosanitize ) { + my $marcxml = $record_type eq 'biblio' + ? GetXmlBiblio( $record_number ) + : GetAuthorityXML( $record_number ); + if ( $marcxml ) { + print OUT $marcxml if $marcxml; + $num_exported++; + } + next; + } my ($marc) = get_corrected_marc_record($record_type, $record_number, $noxml); if (defined $marc) { # FIXME - when more than one record is exported and $as_xml is true, @@ -548,6 +566,11 @@ Parameters: -x export and index as xml instead of is02709 (biblios only). use this if you might have records > 99,999 chars, + -nosanitize export biblio/authority records directly from DB marcxml + field without sanitizing records. It speed up + dump process but could fail if DB contains badly + encoded records. Works only with -x, + -w skip shadow indexing for this batch -y do NOT clear zebraqueue after indexing; normally, -- 2.39.5