Koha/misc/migration_tools/rebuild_zebra_sliced.sh
Julian Maurice eef4b3f23c Bug 8746: rebuild_zebra_sliced.sh now export/index records as MARCXML
This avoid indexing failures due to "bad offset" or "bad length" error
with ISO2709 format

+ minor improvements:
  -  --length parameter is optional. If not given, it will execute the
     right sql query to find the number of records to index
  -  new parameter --reset-index. If set, index is reset before indexing

Signed-off-by: Bernardo Gonzalez Kriegel <bgkriegel@gmail.com>
Comment: Work as described. No errors.

Test: Edit record to make it longer than 9999. Without patch rebuild_sliced
fails. With patches works.

Signed-off-by: Katrin Fischer <Katrin.Fischer.83@web.de>
Signed-off-by: Jared Camins-Esakov <jcamins@cpbibliography.com>
2013-04-21 09:23:22 -04:00

295 lines
7.4 KiB
Bash
Executable file

#!/bin/sh
usage() {
local scriptname=$(basename $0)
cat <<EOF
$scriptname
Index Koha records by chunks. It is useful when a record causes errors and
stops the indexing process. With this script, if indexing of one chunk fails,
that chunk is split into two or more chunks, and indexing continues on these chunks.
rebuild_zebra.pl is called only once to export records. Splitting and indexing
is handled by this script (using yaz-marcdump and zebraidx).
Usage:
$scriptname [-t type] [-l X] [-o X] [-s X] [-d /export/dir] [-L /log/dir] [-r] [-f] [--reset-index]
$scriptname -h
-o | --offset Offset parameter of rebuild_zebra.pl.
Default: $OFFSET
-l | --length Length parameter of rebuild_zebra.pl. If omitted, the
length is automatically calculated to index all
records
-s | --chunks-size Initial chunk size (number of records indexed at once)
Default: $CHUNKSSIZE
-d | --export-dir Where rebuild_zebra.pl will export data
Default: $EXPORTDIR
-L | --log-dir Log directory
Default: $LOGDIR
-r | --remove-logs Clean log directory before start
Default: $RMLOGS
-t | --type Record type ('biblios' or 'authorities')
Default: $TYPE
-f | --force Don't ask for confirmation before start
-h | --help Display this help message
--reset-index Reset Zebra index for 'type'
EOF
}
splitfile() {
local file=$1
local prefix=$2
local size=$3
local script='
my $prefix = '"\"$prefix\""';
my $size = '"$size"';
my ($i,$count) = (0,0);
open(my $fh, "<", '"\"$file\""');
open(my $out, ">", sprintf("$prefix%02d", $i));
my $closed = 0;
while (<$fh>) {
my $line = $_;
if ($closed) {
open($out, ">", sprintf("$prefix%02d", $i));
$closed = 0;
}
print $out $line;
$count++ if ($line =~ m|^</record>|);
if ($count == $size) {
$count = 0;
$i++;
close($out);
$closed = 1;
}
}
'
$PERL -e "$script"
}
indexfile() {
local file=$1
local chunkssize=$2
if [ $chunkssize -lt 1 ]; then
echo "Fail on file $file"
else
local prefix="${file}_${chunkssize}_"
echo "Splitting file in chunks of $chunkssize records"
splitfile $file $prefix $chunkssize
dir=$(dirname $prefix)
local files="$(find $dir -regex $prefix[0-9]+ | sort | tr '\n' ' ')"
for chunkfile in $files; do
echo "Indexing $chunkfile"
size=$(grep '^</record>' $chunkfile | wc -l)
logfile="$LOGDIR/zebraidx.$(basename $chunkfile).log"
ZEBRAIDX_CMD="$ZEBRAIDX -c $CONFIGFILE -d $TYPE -g marcxml update $chunkfile"
$ZEBRAIDX_CMD >$logfile 2>&1
grep "Records: $size" $logfile >/dev/null 2>&1
if [ $? -ne 0 ]; then
echo "Indexing failed. Split file and continue..."
indexfile $chunkfile $(($chunkssize/2))
else
ZEBRAIDX_CMD="$ZEBRAIDX -c $CONFIGFILE -d $TYPE -g marcxml commit"
$ZEBRAIDX_CMD >> $logfile 2>&1
fi
done
fi
}
OFFSET=0
LENGTH=
CHUNKSSIZE=10000
EXPORTDIR=/tmp/rebuild/export
LOGDIR=/tmp/rebuild/logs
RMLOGS=no
NOCONFIRM=no
TYPE=biblios
HELP=no
RESETINDEX=no
# Get parameters
while [ $1 ]; do
case $1 in
-o | --offset )
shift
OFFSET=$1
;;
-l | --length )
shift
LENGTH=$1
;;
-s | --chunks-size )
shift
CHUNKSSIZE=$1
;;
-d | --export-dir )
shift
EXPORTDIR=$1
;;
-L | --log-dir )
shift
LOGDIR=$1
;;
-r | --remove-logs )
RMLOGS=yes
;;
-t | --type )
shift
TYPE=$1
;;
-f | --force )
NOCONFIRM=yes
;;
-h | --help )
HELP=yes
;;
--reset-index )
RESETINDEX=yes
;;
* )
usage
exit 1
esac
shift
done
if [ $HELP = "yes" ]; then
usage
exit 0
fi
if [ -z $KOHA_CONF ]; then
echo "KOHA_CONF is not set"
exit 1
fi
if [ -z $PERL5LIB ]; then
echo "PERL5LIB is not set"
exit 1
fi
TYPESWITCH=
SQLTABLE=
case $TYPE in
biblios )
TYPESWITCH=-b
SQLTABLE="biblio"
;;
authorities )
TYPESWITCH=-a
SQLTABLE="auth_header"
;;
* )
echo "'$TYPE' is an unknown type. Defaulting to 'biblios'"
TYPESWITCH=-b
TYPE=biblios
SQLTABLE="biblio"
esac
PERL=`which perl`
if [ -z $PERL ]; then
echo "perl not found"
exit 1
fi
if [ -z $LENGTH ]; then
LENGTH=$($PERL -e '
use C4::Context;
my ($count) = C4::Context->dbh->selectrow_array(qq{
SELECT COUNT(*) FROM '"$SQLTABLE"'
});
print $count;
')
fi
ZEBRAIDX=`which zebraidx`
if [ -z $ZEBRAIDX ]; then
echo "zebraidx not found"
exit 1
fi
REBUILDZEBRA="`dirname $0`/rebuild_zebra.pl"
if [ ! -f $REBUILDZEBRA ]; then
echo "$REBUILDZEBRA: file not found"
exit 1
fi
echo ""
echo "Configuration"
echo "========================================================================="
echo "KOHA_CONF: $KOHA_CONF"
echo "PERL5LIB: $PERL5LIB"
echo "-------------------------------------------------------------------------"
echo "Start at offset: $OFFSET"
echo "Total number of records to index: $LENGTH"
echo "Initial chunk size: $CHUNKSSIZE"
echo "Export directory: $EXPORTDIR"
echo "Log directory: $LOGDIR"
echo "Remove logs before start? $RMLOGS"
echo "Type of record: $TYPE"
echo "Reset index before start? $RESETINDEX"
echo "-------------------------------------------------------------------------"
echo "zebraidx path: $ZEBRAIDX"
echo "rebuild_zebra path: $REBUILDZEBRA"
echo "perl path: $PERL"
echo "========================================================================="
if [ $NOCONFIRM != "yes" ]; then
confirm=y
echo -n "Confirm ? [Y/n] "
read response
if [ $response ] && [ $response != "yes" ] && [ $response != "y" ]; then
confirm=n
fi
if [ $confirm = "n" ]; then
exit 0
fi
fi
mkdir -p $EXPORTDIR
if [ $? -ne 0 ]; then
echo "Failed to create directory $EXPORTDIR. Aborting."
exit 1
fi
mkdir -p $LOGDIR
if [ $? -ne 0 ]; then
echo "Failed to create directory $LOGDIR. Aborting."
exit 1
fi
if [ $RMLOGS = "yes" ]; then
rm -f $LOGDIR/*.log
fi
REBUILDZEBRA_CMD="$REBUILDZEBRA $TYPESWITCH -v -x -k -d $EXPORTDIR --offset $OFFSET --length $LENGTH --skip-index"
echo "\n$REBUILDZEBRA_CMD"
$REBUILDZEBRA_CMD
EXPORTFILE=
case $TYPE in
biblios )
EXPORTFILE="$EXPORTDIR/biblio/exported_records"
;;
authorities )
EXPORTFILE="$EXPORTDIR/authority/exported_records"
;;
* )
echo "Error: TYPE '$TYPE' is not supported"
exit 1
esac
CONFIGFILE="$(dirname $KOHA_CONF)/zebradb/zebra-$TYPE.cfg"
if [ $RESETINDEX = "yes" ]; then
RESETINDEX_CMD="$ZEBRAIDX -c $CONFIGFILE init"
echo "\n$RESETINDEX_CMD"
$RESETINDEX_CMD
echo ""
fi
indexfile $EXPORTFILE $CHUNKSSIZE