From 055c09fc2de20bbf0f5c2600e7dbeb32e491ae7e Mon Sep 17 00:00:00 2001 From: Julian Maurice Date: Thu, 20 Sep 2012 12:12:31 +0200 Subject: [PATCH] Bug 8746: rebuild_zebra_sliced.sh now exports/indexes records as MARCXML This avoids indexing failures due to "bad offset" or "bad length" errors with the ISO2709 format + minor improvements: - --length parameter is optional. If not given, the script runs the appropriate SQL query to find the number of records to index - new parameter --reset-index. If set, the index is reset before indexing Signed-off-by: Bernardo Gonzalez Kriegel Comment: Works as described. No errors. Test: Edit record to make it longer than 9999. Without the patch, rebuild_sliced fails. With the patch, it works. Signed-off-by: Katrin Fischer Signed-off-by: Jared Camins-Esakov (cherry picked from commit eef4b3f23c0b4558bd2a87af009f39012393994c) Signed-off-by: Jared Camins-Esakov --- misc/migration_tools/rebuild_zebra_sliced.sh | 110 +++++++++++++++---- 1 file changed, 91 insertions(+), 19 deletions(-) diff --git a/misc/migration_tools/rebuild_zebra_sliced.sh b/misc/migration_tools/rebuild_zebra_sliced.sh index 35752e089e..799e941d70 100755 --- a/misc/migration_tools/rebuild_zebra_sliced.sh +++ b/misc/migration_tools/rebuild_zebra_sliced.sh @@ -12,21 +12,60 @@ rebuild_zebra.pl is called only once to export records. Splitting and indexing is handled by this script (using yaz-marcdump and zebraidx). Usage: -$scriptname -t type -l X [-o X] [-s X] [-d /export/dir] [-L /log/dir] [-r] [-f] +$scriptname [-t type] [-l X] [-o X] [-s X] [-d /export/dir] [-L /log/dir] [-r] [-f] [--reset-index] $scriptname -h - -o | --offset Offset parameter of rebuild_zebra.pl - -l | --length Length parameter of rebuild_zebra.pl + -o | --offset Offset parameter of rebuild_zebra.pl. + Default: $OFFSET + -l | --length Length parameter of rebuild_zebra.pl. 
If omitted, the + length is automatically calculated to index all + records -s | --chunks-size Initial chunk size (number of records indexed at once) + Default: $CHUNKSSIZE -d | --export-dir Where rebuild_zebra.pl will export data + Default: $EXPORTDIR -L | --log-dir Log directory + Default: $LOGDIR -r | --remove-logs Clean log directory before start + Default: $RMLOGS -t | --type Record type ('biblios' or 'authorities') + Default: $TYPE -f | --force Don't ask for confirmation before start -h | --help Display this help message + --reset-index Reset Zebra index for 'type' EOF } +splitfile() { + local file=$1 + local prefix=$2 + local size=$3 + local script=' + my $prefix = '"\"$prefix\""'; + my $size = '"$size"'; + my ($i,$count) = (0,0); + open(my $fh, "<", '"\"$file\""'); + open(my $out, ">", sprintf("$prefix%02d", $i)); + my $closed = 0; + while (<$fh>) { + my $line = $_; + if ($closed) { + open($out, ">", sprintf("$prefix%02d", $i)); + $closed = 0; + } + print $out $line; + $count++ if ($line =~ m|^|); + if ($count == $size) { + $count = 0; + $i++; + close($out); + $closed = 1; + } + } + ' + $PERL -e "$script" +} + indexfile() { local file=$1 local chunkssize=$2 @@ -37,23 +76,22 @@ indexfile() { local prefix="${file}_${chunkssize}_" echo "Splitting file in chunks of $chunkssize records" - YAZMARCDUMP_CMD="$YAZMARCDUMP -n -s $prefix -C $chunkssize $file" - $YAZMARCDUMP_CMD + splitfile $file $prefix $chunkssize dir=$(dirname $prefix) local files="$(find $dir -regex $prefix[0-9]+ | sort | tr '\n' ' ')" for chunkfile in $files; do echo "Indexing $chunkfile" - size=$($YAZMARCDUMP -p $chunkfile | grep '' | wc -l) + size=$(grep '^' $chunkfile | wc -l) logfile="$LOGDIR/zebraidx.$(basename $chunkfile).log" - ZEBRAIDX_CMD="$ZEBRAIDX -c $CONFIGFILE -d $TYPE -g iso2709 update $chunkfile" + ZEBRAIDX_CMD="$ZEBRAIDX -c $CONFIGFILE -d $TYPE -g marcxml update $chunkfile" $ZEBRAIDX_CMD >$logfile 2>&1 grep "Records: $size" $logfile >/dev/null 2>&1 if [ $? 
-ne 0 ]; then echo "Indexing failed. Split file and continue..." indexfile $chunkfile $(($chunkssize/2)) else - ZEBRAIDX_CMD="$ZEBRAIDX -c $CONFIGFILE -d $TYPE -g iso2709 commit" + ZEBRAIDX_CMD="$ZEBRAIDX -c $CONFIGFILE -d $TYPE -g marcxml commit" $ZEBRAIDX_CMD >> $logfile 2>&1 fi done @@ -69,6 +107,7 @@ RMLOGS=no NOCONFIRM=no TYPE=biblios HELP=no +RESETINDEX=no # Get parameters while [ $1 ]; do @@ -103,9 +142,12 @@ while [ $1 ]; do -f | --force ) NOCONFIRM=yes ;; - -h | --help) + -h | --help ) HELP=yes ;; + --reset-index ) + RESETINDEX=yes + ;; * ) usage exit 1 @@ -118,34 +160,54 @@ if [ $HELP = "yes" ]; then exit 0 fi -if [ -z $LENGTH ]; then - echo "--length parameter is mandatory" +if [ -z $KOHA_CONF ]; then + echo "KOHA_CONF is not set" exit 1 fi +if [ -z $PERL5LIB ]; then + echo "PERL5LIB is not set" + exit 1 +fi + + TYPESWITCH= +SQLTABLE= case $TYPE in biblios ) TYPESWITCH=-b + SQLTABLE="biblio" ;; authorities ) TYPESWITCH=-a + SQLTABLE="auth_header" ;; * ) echo "'$TYPE' is an unknown type. 
Defaulting to 'biblios'" TYPESWITCH=-b TYPE=biblios + SQLTABLE="biblio" esac -ZEBRAIDX=`which zebraidx` -if [ -z $ZEBRAIDX ]; then - echo "zebraidx not found" +PERL=`which perl` +if [ -z $PERL ]; then + echo "perl not found" exit 1 fi -YAZMARCDUMP=`which yaz-marcdump` -if [ -z $YAZMARCDUMP ]; then - echo "yaz-marcdump not found" +if [ -z $LENGTH ]; then + LENGTH=$($PERL -e ' + use C4::Context; + my ($count) = C4::Context->dbh->selectrow_array(qq{ + SELECT COUNT(*) FROM '"$SQLTABLE"' + }); + print $count; + ') +fi + +ZEBRAIDX=`which zebraidx` +if [ -z $ZEBRAIDX ]; then + echo "zebraidx not found" exit 1 fi @@ -158,6 +220,9 @@ fi echo "" echo "Configuration" echo "=========================================================================" +echo "KOHA_CONF: $KOHA_CONF" +echo "PERL5LIB: $PERL5LIB" +echo "-------------------------------------------------------------------------" echo "Start at offset: $OFFSET" echo "Total number of records to index: $LENGTH" echo "Initial chunk size: $CHUNKSSIZE" @@ -165,10 +230,11 @@ echo "Export directory: $EXPORTDIR" echo "Log directory: $LOGDIR" echo "Remove logs before start? $RMLOGS" echo "Type of record: $TYPE" +echo "Reset index before start? 
$RESETINDEX" echo "-------------------------------------------------------------------------" echo "zebraidx path: $ZEBRAIDX" -echo "yaz-marcdump path: $YAZMARCDUMP" echo "rebuild_zebra path: $REBUILDZEBRA" +echo "perl path: $PERL" echo "=========================================================================" if [ $NOCONFIRM != "yes" ]; then @@ -200,7 +266,7 @@ if [ $RMLOGS = "yes" ]; then rm -f $LOGDIR/*.log fi -REBUILDZEBRA_CMD="$REBUILDZEBRA $TYPESWITCH -v -k -d $EXPORTDIR --offset $OFFSET --length $LENGTH --skip-index" +REBUILDZEBRA_CMD="$REBUILDZEBRA $TYPESWITCH -v -x -k -d $EXPORTDIR --offset $OFFSET --length $LENGTH --skip-index" echo "\n$REBUILDZEBRA_CMD" $REBUILDZEBRA_CMD @@ -219,5 +285,11 @@ esac CONFIGFILE="$(dirname $KOHA_CONF)/zebradb/zebra-$TYPE.cfg" +if [ $RESETINDEX = "yes" ]; then + RESETINDEX_CMD="$ZEBRAIDX -c $CONFIGFILE init" + echo "\n$RESETINDEX_CMD" + $RESETINDEX_CMD + echo "" +fi indexfile $EXPORTFILE $CHUNKSSIZE -- 2.39.5