Koha/misc/migration_tools/rebuild_zebra_sliced.sh
Julian Maurice 57424a9fdc Bug 7286: rebuild_zebra_sliced for biblios and authorities
Complete rewrite of rebuild_zebra_sliced.zsh (renamed to .sh). Main
improvements are:
  - both biblio and authority records are handled
  - records are exported only once

It also adds a --skip-index option to rebuild_zebra.pl, which allows
rebuild_zebra.pl to be used as an 'export only' script.

Description:
Index Koha records by chunks. This is useful when some records cause
errors and stop the indexing process. With this script, if indexing
of one chunk fails, the chunk is split into 2 (or 3) chunks, and
indexing continues on these chunks.
rebuild_zebra.pl is called only once to export records.
Splitting and indexing is handled by this script (using yaz-marcdump and
zebraidx).

Signed-off-by: Martin Renvoize <martin.renvoize@ptfs-europe.com>
Signed-off-by: Paul Poulain <paul.poulain@biblibre.com>
2012-07-06 15:06:40 +02:00

223 lines
5.6 KiB
Bash
Executable file

#!/bin/sh
# Print the help text (description, synopsis and option list) on stdout.
# Globals:   $0 (read) - only its basename appears in the text.
# Arguments: none
# Outputs:   the help text on stdout
usage() {
    local scriptname
    # Declaration and assignment are separated and "$0" is quoted so that a
    # script path containing whitespace is not word-split, neither by basename
    # nor by shells that split the arguments of `local`.
    scriptname=$(basename "$0")
    # Unquoted delimiter: $scriptname is expanded inside the here-document.
    cat <<EOF
$scriptname
Index Koha records by chunks. It is useful when some record causes errors and
stop the indexation process. With this script, if indexation of one chunk fails,
chunk is splitted in two or more chunks, and indexation continue on these chunks.
rebuild_zebra.pl is called only once to export records. Splitting and indexing
is handled by this script (using yaz-marcdump and zebraidx).
Usage:
$scriptname -t type -l X [-o X] [-s X] [-d /export/dir] [-L /log/dir] [-r] [-f]
$scriptname -h
-o | --offset Offset parameter of rebuild_zebra.pl
-l | --length Length parameter of rebuild_zebra.pl
-s | --chunks-size Initial chunk size (number of records indexed at once)
-d | --export-dir Where rebuild_zebra.pl will export data
-L | --log-dir Log directory
-r | --remove-logs Clean log directory before start
-t | --type Record type ('biblios' or 'authorities')
-f | --force Don't ask for confirmation before start
-h | --help Display this help message
EOF
}
# Index a MARC file chunk by chunk, recursively halving the chunk size for
# every chunk that zebraidx fails to index completely.
#
# Globals (read): YAZMARCDUMP, ZEBRAIDX - commands to run
#                 CONFIGFILE            - passed to zebraidx -c
#                 TYPE                  - passed to zebraidx -d
#                 LOGDIR                - directory receiving one log per chunk
# Arguments: $1 - file holding the records to index
#            $2 - number of records per chunk
# Outputs:   progress messages on stdout; "Fail on file <file>" once the
#            chunk size drops below 1 (the file cannot be split any further).
# Note:      the exit status of the yaz-marcdump split is not checked; if no
#            chunk file is produced the function simply indexes nothing.
indexfile() {
    local file="$1"
    local chunkssize="$2"
    # All scratch variables are local: the function calls itself from inside
    # its own loop and must not overwrite the caller's values.
    local prefix dir chunkfile size logfile saved_ifs

    if [ "$chunkssize" -lt 1 ]; then
        echo "Fail on file $file"
        return
    fi

    prefix="${file}_${chunkssize}_"
    echo "Splitting file in chunks of $chunkssize records"
    "$YAZMARCDUMP" -n -s "$prefix" -C "$chunkssize" "$file"
    dir=$(dirname "$prefix")

    # Load the sorted chunk list into the positional parameters, splitting on
    # newlines only and with globbing disabled, so that paths containing
    # spaces or glob characters survive intact. $1/$2 were saved above.
    saved_ifs=$IFS
    IFS='
'
    set -f
    # The regex is quoted so the shell cannot expand "[0-9]" as a glob; it
    # only matches chunks of this level (prefix followed by digits only).
    set -- $(find "$dir" -regex "${prefix}[0-9]+" | sort)
    set +f
    IFS=$saved_ifs

    for chunkfile in "$@"; do
        echo "Indexing $chunkfile"
        # Number of records in the chunk, counted from the per-record
        # comments printed by `yaz-marcdump -p`.
        size=$("$YAZMARCDUMP" -p "$chunkfile" \
            | grep -c '<!-- Record [0-9][0-9]* offset .* -->')
        logfile="$LOGDIR/zebraidx.$(basename "$chunkfile").log"
        "$ZEBRAIDX" -c "$CONFIGFILE" -d "$TYPE" -g iso2709 update "$chunkfile" \
            >"$logfile" 2>&1
        # The chunk counts as indexed only when the zebraidx log reports the
        # same number of records as the chunk contains.
        if grep -q "Records: $size" "$logfile" 2>/dev/null; then
            "$ZEBRAIDX" -c "$CONFIGFILE" -d "$TYPE" -g iso2709 commit \
                >>"$logfile" 2>&1
        else
            echo "Indexing failed. Split file and continue..."
            indexfile "$chunkfile" $(($chunkssize / 2))
        fi
    done
}
# Default values of the command line parameters.
OFFSET=0
LENGTH=
CHUNKSSIZE=10000
EXPORTDIR=/tmp/rebuild/export
LOGDIR=/tmp/rebuild/logs
RMLOGS=no
NOCONFIRM=no
TYPE=biblios
HELP=no

# Get parameters
# Parse the command line into the globals initialised above.
# Arguments: the script's command line ("$@")
# Outputs:   on an unknown option, the usage text on stdout; on a missing
#            option value, an error on stderr followed by the usage text.
# Exits:     with status 1 in both of those cases.
parse_args() {
    # Loop on the argument count rather than on the value of $1, so that an
    # empty argument or one containing whitespace does not end or break the
    # loop.
    while [ $# -gt 0 ]; do
        # Options taking a value must be followed by one; otherwise the
        # shifts below would run past the end of the argument list.
        case "$1" in
            -o | --offset | -l | --length | -s | --chunks-size | -d | --export-dir | -L | --log-dir | -t | --type )
                if [ $# -lt 2 ]; then
                    echo "Option $1 requires a value" >&2
                    usage
                    exit 1
                fi
                ;;
        esac
        case "$1" in
            -o | --offset )
                shift
                OFFSET=$1
                ;;
            -l | --length )
                shift
                LENGTH=$1
                ;;
            -s | --chunks-size )
                shift
                CHUNKSSIZE=$1
                ;;
            -d | --export-dir )
                shift
                EXPORTDIR=$1
                ;;
            -L | --log-dir )
                shift
                LOGDIR=$1
                ;;
            -r | --remove-logs )
                RMLOGS=yes
                ;;
            -t | --type )
                shift
                TYPE=$1
                ;;
            -f | --force )
                NOCONFIRM=yes
                ;;
            -h | --help )
                HELP=yes
                ;;
            * )
                usage
                exit 1
                ;;
        esac
        shift
    done
}
parse_args "$@"
# -h/--help: print the usage text and stop successfully.
if [ "$HELP" = "yes" ]; then
    usage
    exit 0
fi

# --length has no default value, so it must be given on the command line.
if [ -z "$LENGTH" ]; then
    echo "--length parameter is mandatory"
    exit 1
fi

# OFFSET, LENGTH and CHUNKSSIZE end up in arithmetic tests and on the
# rebuild_zebra.pl command line; reject anything that is not a plain
# non-negative integer before any work starts.
for numeric_opt in "--offset:$OFFSET" "--length:$LENGTH" "--chunks-size:$CHUNKSSIZE"; do
    case "${numeric_opt#*:}" in
        "" | *[!0-9]* )
            echo "${numeric_opt%%:*} must be a non-negative integer" >&2
            exit 1
            ;;
    esac
done
# Derive the rebuild_zebra.pl switch from the record type:
# -a for 'authorities', -b for 'biblios'. Any other value is reported and
# replaced by 'biblios'.
if [ "$TYPE" = "authorities" ]; then
    TYPESWITCH=-a
elif [ "$TYPE" = "biblios" ]; then
    TYPESWITCH=-b
else
    echo "'$TYPE' is an unknown type. Defaulting to 'biblios'"
    TYPE=biblios
    TYPESWITCH=-b
fi
# Locate the external programs this script drives.
# `command -v` is used instead of `which`: it is specified by POSIX and prints
# nothing on stdout when the command is missing, which the -z tests rely on.
ZEBRAIDX=$(command -v zebraidx)
if [ -z "$ZEBRAIDX" ]; then
    echo "zebraidx not found"
    exit 1
fi

YAZMARCDUMP=$(command -v yaz-marcdump)
if [ -z "$YAZMARCDUMP" ]; then
    echo "yaz-marcdump not found"
    exit 1
fi

# rebuild_zebra.pl is looked up in the directory of this script. The path is
# quoted so that an installation directory containing spaces still works.
REBUILDZEBRA="$(dirname "$0")/rebuild_zebra.pl"
if [ ! -f "$REBUILDZEBRA" ]; then
    echo "$REBUILDZEBRA: file not found"
    exit 1
fi
# Show the effective configuration before anything is modified.
echo ""
echo "Configuration"
echo "========================================================================="
echo "Start at offset: $OFFSET"
echo "Total number of records to index: $LENGTH"
echo "Initial chunk size: $CHUNKSSIZE"
echo "Export directory: $EXPORTDIR"
echo "Log directory: $LOGDIR"
echo "Remove logs before start? $RMLOGS"
echo "Type of record: $TYPE"
echo "-------------------------------------------------------------------------"
echo "zebraidx path: $ZEBRAIDX"
echo "yaz-marcdump path: $YAZMARCDUMP"
echo "rebuild_zebra path: $REBUILDZEBRA"
echo "========================================================================="

# Ask for confirmation unless -f/--force was given.
if [ "$NOCONFIRM" != "yes" ]; then
    # printf instead of `echo -n`, whose behaviour is not portable under sh.
    printf '%s' "Confirm ? [Y/n] "
    response=
    read -r response
    # An empty answer (the advertised default) and any spelling of y/yes
    # confirm; every other answer, including multi-word ones, aborts.
    case "$response" in
        "" | [yY] | [yY][eE][sS] )
            ;;
        * )
            exit 0
            ;;
    esac
fi
# Create the export and log directories (no error if they already exist).
# Paths are quoted so that directories containing spaces are handled, and
# `--` keeps a path starting with '-' from being parsed as an option.
if ! mkdir -p -- "$EXPORTDIR"; then
    echo "Failed to create directory $EXPORTDIR. Aborting."
    exit 1
fi

if ! mkdir -p -- "$LOGDIR"; then
    echo "Failed to create directory $LOGDIR. Aborting."
    exit 1
fi

# -r/--remove-logs: delete the *.log files directly inside the log directory.
if [ "$RMLOGS" = "yes" ]; then
    # ${LOGDIR:?} aborts instead of expanding to "/*.log" should LOGDIR ever
    # be empty.
    rm -f -- "${LOGDIR:?}"/*.log
fi
# The zebra configuration file is looked up next to the Koha configuration
# file, so KOHA_CONF must be set; check it before the export is started.
if [ -z "$KOHA_CONF" ]; then
    echo "KOHA_CONF environment variable is not set. Aborting." >&2
    exit 1
fi
CONFIGFILE="$(dirname "$KOHA_CONF")/zebradb/zebra-$TYPE.cfg"

# File in which rebuild_zebra.pl leaves the exported records for each type.
EXPORTFILE=
case "$TYPE" in
    biblios )
        EXPORTFILE="$EXPORTDIR/biblio/exported_records"
        ;;
    authorities )
        EXPORTFILE="$EXPORTDIR/authority/exported_records"
        ;;
    * )
        echo "Error: TYPE '$TYPE' is not supported"
        exit 1
        ;;
esac

# Export the records once, without indexing them (--skip-index).
# REBUILDZEBRA_CMD is kept for display only; the command itself is run with
# individually quoted arguments so that paths containing spaces are passed
# intact. printf is used because the handling of "\n" by echo differs between
# shells.
REBUILDZEBRA_CMD="$REBUILDZEBRA $TYPESWITCH -v -k -d $EXPORTDIR --offset $OFFSET --length $LENGTH --skip-index"
printf '\n%s\n' "$REBUILDZEBRA_CMD"
if ! "$REBUILDZEBRA" "$TYPESWITCH" -v -k -d "$EXPORTDIR" \
    --offset "$OFFSET" --length "$LENGTH" --skip-index; then
    echo "$REBUILDZEBRA failed. Aborting." >&2
    exit 1
fi

# Without an export file there is nothing to index; say so instead of
# finishing silently.
if [ ! -f "$EXPORTFILE" ]; then
    echo "Export file $EXPORTFILE not found. Aborting." >&2
    exit 1
fi

# Split the export into chunks and index them.
indexfile "$EXPORTFILE" "$CHUNKSSIZE"