From 57424a9fdc269e205d3783735cfdb02cba5e2430 Mon Sep 17 00:00:00 2001 From: Julian Maurice Date: Mon, 2 Jul 2012 13:57:31 +0200 Subject: [PATCH] Bug 7286: rebuild_zebra_sliced for biblios and authorities Complete rewrite of rebuild_zebra_sliced.zsh (renamed to .sh). Main improvements are: - both biblio and authority records are handled - records are exported only once It also adds an option --skip-index to rebuild_zebra.pl that permits using rebuild_zebra.pl as an 'export only' script. Description: Index Koha records by chunks. It is useful when some record causes errors and stops the indexing process. With this script, if indexing of one chunk fails, the chunk is split into 2 (or 3) chunks, and indexing continues on these chunks. rebuild_zebra.pl is called only once to export records. Splitting and indexing are handled by this script (using yaz-marcdump and zebraidx). Signed-off-by: Martin Renvoize Signed-off-by: Paul Poulain --- misc/migration_tools/rebuild_zebra.pl | 44 ++-- misc/migration_tools/rebuild_zebra_sliced.sh | 223 +++++++++++++++++++ 2 files changed, 250 insertions(+), 17 deletions(-) create mode 100755 misc/migration_tools/rebuild_zebra_sliced.sh diff --git a/misc/migration_tools/rebuild_zebra.pl b/misc/migration_tools/rebuild_zebra.pl index dbbb14010f..1621e84b9b 100755 --- a/misc/migration_tools/rebuild_zebra.pl +++ b/misc/migration_tools/rebuild_zebra.pl @@ -24,6 +24,7 @@ my $directory; my $nosanitize; my $skip_export; my $keep_export; +my $skip_index; my $reset; my $biblios; my $authorities; @@ -44,6 +45,7 @@ my $result = GetOptions( 'r|reset' => \$reset, 's' => \$skip_export, 'k' => \$keep_export, + 'I|skip-index' => \$skip_index, 'nosanitize' => \$nosanitize, 'b' => \$biblios, 'noxml' => \$noxml, @@ -134,13 +136,13 @@ if ($do_munge) { } if ($authorities) { - index_records('authority', $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt, $authorityserverdir); 
+ index_records('authority', $directory, $skip_export, $skip_index, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt, $authorityserverdir); } else { print "skipping authorities\n" if ( $verbose_logging ); } if ($biblios) { - index_records('biblio', $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt, $biblioserverdir); + index_records('biblio', $directory, $skip_export, $skip_index, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt, $biblioserverdir); } else { print "skipping biblios\n" if ( $verbose_logging ); } @@ -191,7 +193,7 @@ sub check_zebra_dirs { } # ---------- end of subroutine check_zebra_dirs ---------- sub index_records { - my ($record_type, $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt, $server_dir) = @_; + my ($record_type, $directory, $skip_export, $skip_index, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt, $server_dir) = @_; my $num_records_exported = 0; my $records_deleted; @@ -230,24 +232,32 @@ sub index_records { } } } - + # # and reindexing everything # - if ( $verbose_logging ) { - print "====================\n"; - print "REINDEXING zebra\n"; - print "====================\n"; - } - my $record_fmt = ($as_xml) ? 
'marcxml' : 'iso2709' ; - if ($process_zebraqueue) { - do_indexing($record_type, 'delete', "$directory/del_$record_type", $reset, $noshadow, $record_fmt, $zebraidx_log_opt) - if %$records_deleted; - do_indexing($record_type, 'update', "$directory/upd_$record_type", $reset, $noshadow, $record_fmt, $zebraidx_log_opt) - if $num_records_exported; + if ($skip_index) { + if ($verbose_logging) { + print "====================\n"; + print "SKIPPING $record_type indexing\n"; + print "====================\n"; + } } else { - do_indexing($record_type, 'update', "$directory/$record_type", $reset, $noshadow, $record_fmt, $zebraidx_log_opt) - if ($num_records_exported or $skip_export); + if ( $verbose_logging ) { + print "====================\n"; + print "REINDEXING zebra\n"; + print "====================\n"; + } + my $record_fmt = ($as_xml) ? 'marcxml' : 'iso2709' ; + if ($process_zebraqueue) { + do_indexing($record_type, 'delete', "$directory/del_$record_type", $reset, $noshadow, $record_fmt, $zebraidx_log_opt) + if %$records_deleted; + do_indexing($record_type, 'update', "$directory/upd_$record_type", $reset, $noshadow, $record_fmt, $zebraidx_log_opt) + if $num_records_exported; + } else { + do_indexing($record_type, 'update', "$directory/$record_type", $reset, $noshadow, $record_fmt, $zebraidx_log_opt) + if ($num_records_exported or $skip_export); + } } } diff --git a/misc/migration_tools/rebuild_zebra_sliced.sh b/misc/migration_tools/rebuild_zebra_sliced.sh new file mode 100755 index 0000000000..f1b73f99d9 --- /dev/null +++ b/misc/migration_tools/rebuild_zebra_sliced.sh @@ -0,0 +1,223 @@ +#!/bin/sh + +usage() { + local scriptname=$(basename $0) + cat <' | wc -l) + logfile="$LOGDIR/zebraidx.$(basename $chunkfile).log" + ZEBRAIDX_CMD="$ZEBRAIDX -c $CONFIGFILE -d $TYPE -g iso2709 update $chunkfile" + $ZEBRAIDX_CMD >$logfile 2>&1 + grep "Records: $size" $logfile >/dev/null 2>&1 + if [ $? -ne 0 ]; then + echo "Indexing failed. Split file and continue..." 
+ indexfile $chunkfile $(($chunkssize/2)) + else + ZEBRAIDX_CMD="$ZEBRAIDX -c $CONFIGFILE -d $TYPE -g iso2709 commit" + $ZEBRAIDX_CMD >> $logfile 2>&1 + fi + done + fi +} + +OFFSET=0 +LENGTH= +CHUNKSSIZE=10000 +EXPORTDIR=/tmp/rebuild/export +LOGDIR=/tmp/rebuild/logs +RMLOGS=no +NOCONFIRM=no +TYPE=biblios +HELP=no + +# Get parameters +while [ $1 ]; do + case $1 in + -o | --offset ) + shift + OFFSET=$1 + ;; + -l | --length ) + shift + LENGTH=$1 + ;; + -s | --chunks-size ) + shift + CHUNKSSIZE=$1 + ;; + -d | --export-dir ) + shift + EXPORTDIR=$1 + ;; + -L | --log-dir ) + shift + LOGDIR=$1 + ;; + -r | --remove-logs ) + RMLOGS=yes + ;; + -t | --type ) + shift + TYPE=$1 + ;; + -f | --force ) + NOCONFIRM=yes + ;; + -h | --help) + HELP=yes + ;; + * ) + usage + exit 1 + esac + shift +done + +if [ $HELP = "yes" ]; then + usage + exit 0 +fi + +if [ -z $LENGTH ]; then + echo "--length parameter is mandatory" + exit 1 +fi + +TYPESWITCH= +case $TYPE in + biblios ) + TYPESWITCH=-b + ;; + authorities ) + TYPESWITCH=-a + ;; + * ) + echo "'$TYPE' is an unknown type. Defaulting to 'biblios'" + TYPESWITCH=-b + TYPE=biblios +esac + +ZEBRAIDX=`which zebraidx` +if [ -z $ZEBRAIDX ]; then + echo "zebraidx not found" + exit 1 +fi + +YAZMARCDUMP=`which yaz-marcdump` +if [ -z $YAZMARCDUMP ]; then + echo "yaz-marcdump not found" + exit 1 +fi + +REBUILDZEBRA="`dirname $0`/rebuild_zebra.pl" +if [ ! -f $REBUILDZEBRA ]; then + echo "$REBUILDZEBRA: file not found" + exit 1 +fi + +echo "" +echo "Configuration" +echo "=========================================================================" +echo "Start at offset: $OFFSET" +echo "Total number of records to index: $LENGTH" +echo "Initial chunk size: $CHUNKSSIZE" +echo "Export directory: $EXPORTDIR" +echo "Log directory: $LOGDIR" +echo "Remove logs before start? 
$RMLOGS" +echo "Type of record: $TYPE" +echo "-------------------------------------------------------------------------" +echo "zebraidx path: $ZEBRAIDX" +echo "yaz-marcdump path: $YAZMARCDUMP" +echo "rebuild_zebra path: $REBUILDZEBRA" +echo "=========================================================================" + +if [ $NOCONFIRM != "yes" ]; then + confirm=y + echo -n "Confirm ? [Y/n] " + read response + if [ $response ] && [ $response != "yes" ] && [ $response != "y" ]; then + confirm=n + fi + + if [ $confirm = "n" ]; then + exit 0 + fi +fi + +mkdir -p $EXPORTDIR +if [ $? -ne 0 ]; then + echo "Failed to create directory $EXPORTDIR. Aborting." + exit 1 +fi + +mkdir -p $LOGDIR +if [ $? -ne 0 ]; then + echo "Failed to create directory $LOGDIR. Aborting." + exit 1 +fi + +if [ $RMLOGS = "yes" ]; then + rm -f $LOGDIR/*.log +fi + +REBUILDZEBRA_CMD="$REBUILDZEBRA $TYPESWITCH -v -k -d $EXPORTDIR --offset $OFFSET --length $LENGTH --skip-index" +echo "\n$REBUILDZEBRA_CMD" +$REBUILDZEBRA_CMD + +EXPORTFILE= +case $TYPE in + biblios ) + EXPORTFILE="$EXPORTDIR/biblio/exported_records" + ;; + authorities ) + EXPORTFILE="$EXPORTDIR/authority/exported_records" + ;; + * ) + echo "Error: TYPE '$TYPE' is not supported" + exit 1 +esac + +CONFIGFILE="$(dirname $KOHA_CONF)/zebradb/zebra-$TYPE.cfg" + + +indexfile $EXPORTFILE $CHUNKSSIZE -- 2.39.5