Bug 8746: rebuild_zebra_sliced.sh now exports/indexes records as MARCXML

This avoids indexing failures due to "bad offset" or "bad length" errors
with the ISO2709 format

+ minor improvements:
  -  The --length parameter is now optional. If it is not given, the script
     runs the appropriate SQL query to find the number of records to index
  -  New parameter --reset-index. If set, the index is reset before indexing

Signed-off-by: Bernardo Gonzalez Kriegel <bgkriegel@gmail.com>
Comment: Works as described. No errors.

Test: Edit a record to make it longer than 9999. Without the patch,
rebuild_sliced fails. With the patch, it works.

Signed-off-by: Katrin Fischer <Katrin.Fischer.83@web.de>
Signed-off-by: Jared Camins-Esakov <jcamins@cpbibliography.com>
This commit is contained in:
Julian Maurice 2012-09-20 12:12:31 +02:00 committed by Jared Camins-Esakov
parent 8524994eda
commit eef4b3f23c

View file

@ -12,21 +12,60 @@ rebuild_zebra.pl is called only once to export records. Splitting and indexing
is handled by this script (using yaz-marcdump and zebraidx).
Usage:
$scriptname -t type -l X [-o X] [-s X] [-d /export/dir] [-L /log/dir] [-r] [-f]
$scriptname [-t type] [-l X] [-o X] [-s X] [-d /export/dir] [-L /log/dir] [-r] [-f] [--reset-index]
$scriptname -h
-o | --offset Offset parameter of rebuild_zebra.pl
-l | --length Length parameter of rebuild_zebra.pl
-o | --offset Offset parameter of rebuild_zebra.pl.
Default: $OFFSET
-l | --length Length parameter of rebuild_zebra.pl. If omitted, the
length is automatically calculated to index all
records
-s | --chunks-size Initial chunk size (number of records indexed at once)
Default: $CHUNKSSIZE
-d | --export-dir Where rebuild_zebra.pl will export data
Default: $EXPORTDIR
-L | --log-dir Log directory
Default: $LOGDIR
-r | --remove-logs Clean log directory before start
Default: $RMLOGS
-t | --type Record type ('biblios' or 'authorities')
Default: $TYPE
-f | --force Don't ask for confirmation before start
-h | --help Display this help message
--reset-index Reset Zebra index for 'type'
EOF
}
splitfile() {
local file=$1
local prefix=$2
local size=$3
local script='
my $prefix = '"\"$prefix\""';
my $size = '"$size"';
my ($i,$count) = (0,0);
open(my $fh, "<", '"\"$file\""');
open(my $out, ">", sprintf("$prefix%02d", $i));
my $closed = 0;
while (<$fh>) {
my $line = $_;
if ($closed) {
open($out, ">", sprintf("$prefix%02d", $i));
$closed = 0;
}
print $out $line;
$count++ if ($line =~ m|^</record>|);
if ($count == $size) {
$count = 0;
$i++;
close($out);
$closed = 1;
}
}
'
$PERL -e "$script"
}
indexfile() {
local file=$1
local chunkssize=$2
@ -37,23 +76,22 @@ indexfile() {
local prefix="${file}_${chunkssize}_"
echo "Splitting file in chunks of $chunkssize records"
YAZMARCDUMP_CMD="$YAZMARCDUMP -n -s $prefix -C $chunkssize $file"
$YAZMARCDUMP_CMD
splitfile $file $prefix $chunkssize
dir=$(dirname $prefix)
local files="$(find $dir -regex $prefix[0-9]+ | sort | tr '\n' ' ')"
for chunkfile in $files; do
echo "Indexing $chunkfile"
size=$($YAZMARCDUMP -p $chunkfile | grep '<!-- Record [0-9]\+ offset .* -->' | wc -l)
size=$(grep '^</record>' $chunkfile | wc -l)
logfile="$LOGDIR/zebraidx.$(basename $chunkfile).log"
ZEBRAIDX_CMD="$ZEBRAIDX -c $CONFIGFILE -d $TYPE -g iso2709 update $chunkfile"
ZEBRAIDX_CMD="$ZEBRAIDX -c $CONFIGFILE -d $TYPE -g marcxml update $chunkfile"
$ZEBRAIDX_CMD >$logfile 2>&1
grep "Records: $size" $logfile >/dev/null 2>&1
if [ $? -ne 0 ]; then
echo "Indexing failed. Split file and continue..."
indexfile $chunkfile $(($chunkssize/2))
else
ZEBRAIDX_CMD="$ZEBRAIDX -c $CONFIGFILE -d $TYPE -g iso2709 commit"
ZEBRAIDX_CMD="$ZEBRAIDX -c $CONFIGFILE -d $TYPE -g marcxml commit"
$ZEBRAIDX_CMD >> $logfile 2>&1
fi
done
@ -69,6 +107,7 @@ RMLOGS=no
NOCONFIRM=no
TYPE=biblios
HELP=no
RESETINDEX=no
# Get parameters
while [ $1 ]; do
@ -103,9 +142,12 @@ while [ $1 ]; do
-f | --force )
NOCONFIRM=yes
;;
-h | --help)
-h | --help )
HELP=yes
;;
--reset-index )
RESETINDEX=yes
;;
* )
usage
exit 1
@ -118,37 +160,57 @@ if [ $HELP = "yes" ]; then
exit 0
fi
if [ -z $LENGTH ]; then
echo "--length parameter is mandatory"
if [ -z $KOHA_CONF ]; then
echo "KOHA_CONF is not set"
exit 1
fi
if [ -z $PERL5LIB ]; then
echo "PERL5LIB is not set"
exit 1
fi
# Translate the record type into the switch passed to rebuild_zebra.pl
# (TYPESWITCH) and the SQL table used to count records (SQLTABLE).
TYPESWITCH=
SQLTABLE=
case $TYPE in
biblios )
TYPESWITCH=-b
SQLTABLE="biblio"
;;
authorities )
TYPESWITCH=-a
SQLTABLE="auth_header"
;;
* )
# Unrecognised type: warn, then continue as if 'biblios' had been requested.
echo "'$TYPE' is an unknown type. Defaulting to 'biblios'"
TYPESWITCH=-b
TYPE=biblios
SQLTABLE="biblio"
esac
PERL=`which perl`
if [ -z $PERL ]; then
echo "perl not found"
exit 1
fi
# When --length was not given, count the rows of the table matching the
# record type so that every record gets indexed.
if [ -z "$LENGTH" ]; then
    LENGTH=$($PERL -e '
        use C4::Context;
        my ($count) = C4::Context->dbh->selectrow_array(qq{
            SELECT COUNT(*) FROM '"$SQLTABLE"'
        });
        print $count;
    ')
    # If the query failed, LENGTH is empty or not a number; stop here rather
    # than build a rebuild_zebra.pl command line with a broken --length.
    case $LENGTH in
        '' | *[!0-9]* )
            echo "Unable to determine the number of records in '$SQLTABLE'"
            exit 1
            ;;
    esac
fi
ZEBRAIDX=`which zebraidx`
if [ -z $ZEBRAIDX ]; then
echo "zebraidx not found"
exit 1
fi
YAZMARCDUMP=`which yaz-marcdump`
if [ -z $YAZMARCDUMP ]; then
echo "yaz-marcdump not found"
exit 1
fi
REBUILDZEBRA="`dirname $0`/rebuild_zebra.pl"
if [ ! -f $REBUILDZEBRA ]; then
echo "$REBUILDZEBRA: file not found"
@ -158,6 +220,9 @@ fi
echo ""
echo "Configuration"
echo "========================================================================="
echo "KOHA_CONF: $KOHA_CONF"
echo "PERL5LIB: $PERL5LIB"
echo "-------------------------------------------------------------------------"
echo "Start at offset: $OFFSET"
echo "Total number of records to index: $LENGTH"
echo "Initial chunk size: $CHUNKSSIZE"
@ -165,10 +230,11 @@ echo "Export directory: $EXPORTDIR"
echo "Log directory: $LOGDIR"
echo "Remove logs before start? $RMLOGS"
echo "Type of record: $TYPE"
echo "Reset index before start? $RESETINDEX"
echo "-------------------------------------------------------------------------"
echo "zebraidx path: $ZEBRAIDX"
echo "yaz-marcdump path: $YAZMARCDUMP"
echo "rebuild_zebra path: $REBUILDZEBRA"
echo "perl path: $PERL"
echo "========================================================================="
if [ $NOCONFIRM != "yes" ]; then
@ -200,7 +266,7 @@ if [ $RMLOGS = "yes" ]; then
rm -f $LOGDIR/*.log
fi
REBUILDZEBRA_CMD="$REBUILDZEBRA $TYPESWITCH -v -k -d $EXPORTDIR --offset $OFFSET --length $LENGTH --skip-index"
REBUILDZEBRA_CMD="$REBUILDZEBRA $TYPESWITCH -v -x -k -d $EXPORTDIR --offset $OFFSET --length $LENGTH --skip-index"
echo "\n$REBUILDZEBRA_CMD"
$REBUILDZEBRA_CMD
@ -219,5 +285,11 @@ esac
CONFIGFILE="$(dirname $KOHA_CONF)/zebradb/zebra-$TYPE.cfg"
# When --reset-index was given, run "zebraidx init" with the configuration
# file of the selected record type before any chunk is indexed.
if [ $RESETINDEX = "yes" ]; then
RESETINDEX_CMD="$ZEBRAIDX -c $CONFIGFILE init"
# NOTE(review): whether "\n" is printed as a newline depends on the shell's echo implementation — confirm.
echo "\n$RESETINDEX_CMD"
$RESETINDEX_CMD
echo ""
fi
indexfile $EXPORTFILE $CHUNKSSIZE