Bug 8746: rebuild_zebra_sliced.sh now exports/indexes records as MARCXML
This avoids indexing failures due to "bad offset" or "bad length" errors with the ISO2709 format + minor improvements: - the --length parameter is now optional. If not given, the script executes the appropriate SQL query to find the number of records to index - new parameter --reset-index. If set, the index is reset before indexing Signed-off-by: Bernardo Gonzalez Kriegel <bgkriegel@gmail.com> Comment: Works as described. No errors. Test: Edit a record to make it longer than 9999. Without the patch rebuild_sliced fails. With the patch it works. Signed-off-by: Katrin Fischer <Katrin.Fischer.83@web.de> Signed-off-by: Jared Camins-Esakov <jcamins@cpbibliography.com>
This commit is contained in:
parent
8524994eda
commit
eef4b3f23c
1 changed files with 91 additions and 19 deletions
|
@ -12,21 +12,60 @@ rebuild_zebra.pl is called only once to export records. Splitting and indexing
|
|||
is handled by this script (using yaz-marcdump and zebraidx).
|
||||
|
||||
Usage:
|
||||
$scriptname -t type -l X [-o X] [-s X] [-d /export/dir] [-L /log/dir] [-r] [-f]
|
||||
$scriptname [-t type] [-l X] [-o X] [-s X] [-d /export/dir] [-L /log/dir] [-r] [-f] [--reset-index]
|
||||
$scriptname -h
|
||||
|
||||
-o | --offset Offset parameter of rebuild_zebra.pl
|
||||
-l | --length Length parameter of rebuild_zebra.pl
|
||||
-o | --offset Offset parameter of rebuild_zebra.pl.
|
||||
Default: $OFFSET
|
||||
-l | --length Length parameter of rebuild_zebra.pl. If omitted, the
|
||||
length is automatically calculated to index all
|
||||
records
|
||||
-s | --chunks-size Initial chunk size (number of records indexed at once)
|
||||
Default: $CHUNKSSIZE
|
||||
-d | --export-dir Where rebuild_zebra.pl will export data
|
||||
Default: $EXPORTDIR
|
||||
-L | --log-dir Log directory
|
||||
Default: $LOGDIR
|
||||
-r | --remove-logs Clean log directory before start
|
||||
Default: $RMLOGS
|
||||
-t | --type Record type ('biblios' or 'authorities')
|
||||
Default: $TYPE
|
||||
-f | --force Don't ask for confirmation before start
|
||||
-h | --help Display this help message
|
||||
--reset-index Reset Zebra index for 'type'
|
||||
EOF
|
||||
}
|
||||
|
||||
# splitfile FILE PREFIX SIZE
#
# Split the MARCXML export FILE into chunk files named PREFIX00, PREFIX01, ...
# A record is considered complete when a line starting with "</record>" has
# been copied; after SIZE complete records the current chunk is closed and
# the next input line (if any) starts a new chunk.
#
#   FILE    path of the MARCXML file to split
#   PREFIX  path prefix of the chunk files (a two-digit, zero-padded counter
#           is appended; it grows wider past 99)
#   SIZE    number of records per chunk, must be a positive integer
#
# Returns the exit status of perl: non-zero if SIZE is invalid or if any
# file cannot be opened or written.
splitfile() {
    local file=$1
    local prefix=$2
    local size=$3

    # The values are handed to perl through @ARGV instead of being spliced
    # into the program text, so paths containing quotes, "$", "@" or "%"
    # cannot corrupt the perl source or the sprintf format.
    local script='
        use strict;
        use warnings;

        my ($file, $prefix, $size) = @ARGV;

        # A size of 0 would put every input line in its own chunk, and a
        # missing size used to be a syntax error: refuse both explicitly.
        die "splitfile: chunk size must be a positive integer\n"
            unless defined $size && $size =~ m/\A[1-9][0-9]*\z/;

        sub open_chunk {
            my ($name) = @_;
            open(my $handle, ">", $name)
                or die "splitfile: cannot create $name: $!\n";
            return $handle;
        }

        open(my $fh, "<", $file)
            or die "splitfile: cannot open $file: $!\n";

        my ($i, $count) = (0, 0);

        # The first chunk is created up front, so an empty input still
        # yields an (empty) PREFIX00 file.
        my $out = open_chunk(sprintf("%s%02d", $prefix, $i));

        while (my $line = <$fh>) {
            # Chunks after the first are opened lazily, only once there is
            # a line to put in them.
            $out = open_chunk(sprintf("%s%02d", $prefix, $i))
                unless defined $out;

            print {$out} $line
                or die "splitfile: write failed: $!\n";

            $count++ if $line =~ m|^</record>|;

            if ($count == $size) {
                # Buffered write errors only surface at close time.
                close($out)
                    or die "splitfile: close failed: $!\n";
                undef $out;
                $count = 0;
                $i++;
            }
        }

        if (defined $out) {
            close($out)
                or die "splitfile: close failed: $!\n";
        }
        close($fh);
    '

    # "--" ends perl switch processing so a FILE starting with "-" is not
    # mistaken for a perl option.
    "$PERL" -e "$script" -- "$file" "$prefix" "$size"
}
|
||||
|
||||
indexfile() {
|
||||
local file=$1
|
||||
local chunkssize=$2
|
||||
|
@ -37,23 +76,22 @@ indexfile() {
|
|||
|
||||
local prefix="${file}_${chunkssize}_"
|
||||
echo "Splitting file in chunks of $chunkssize records"
|
||||
YAZMARCDUMP_CMD="$YAZMARCDUMP -n -s $prefix -C $chunkssize $file"
|
||||
$YAZMARCDUMP_CMD
|
||||
splitfile $file $prefix $chunkssize
|
||||
|
||||
dir=$(dirname $prefix)
|
||||
local files="$(find $dir -regex $prefix[0-9]+ | sort | tr '\n' ' ')"
|
||||
for chunkfile in $files; do
|
||||
echo "Indexing $chunkfile"
|
||||
size=$($YAZMARCDUMP -p $chunkfile | grep '<!-- Record [0-9]\+ offset .* -->' | wc -l)
|
||||
size=$(grep '^</record>' $chunkfile | wc -l)
|
||||
logfile="$LOGDIR/zebraidx.$(basename $chunkfile).log"
|
||||
ZEBRAIDX_CMD="$ZEBRAIDX -c $CONFIGFILE -d $TYPE -g iso2709 update $chunkfile"
|
||||
ZEBRAIDX_CMD="$ZEBRAIDX -c $CONFIGFILE -d $TYPE -g marcxml update $chunkfile"
|
||||
$ZEBRAIDX_CMD >$logfile 2>&1
|
||||
grep "Records: $size" $logfile >/dev/null 2>&1
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Indexing failed. Split file and continue..."
|
||||
indexfile $chunkfile $(($chunkssize/2))
|
||||
else
|
||||
ZEBRAIDX_CMD="$ZEBRAIDX -c $CONFIGFILE -d $TYPE -g iso2709 commit"
|
||||
ZEBRAIDX_CMD="$ZEBRAIDX -c $CONFIGFILE -d $TYPE -g marcxml commit"
|
||||
$ZEBRAIDX_CMD >> $logfile 2>&1
|
||||
fi
|
||||
done
|
||||
|
@ -69,6 +107,7 @@ RMLOGS=no
|
|||
NOCONFIRM=no
|
||||
TYPE=biblios
|
||||
HELP=no
|
||||
RESETINDEX=no
|
||||
|
||||
# Get parameters
|
||||
while [ $1 ]; do
|
||||
|
@ -103,9 +142,12 @@ while [ $1 ]; do
|
|||
-f | --force )
|
||||
NOCONFIRM=yes
|
||||
;;
|
||||
-h | --help)
|
||||
-h | --help )
|
||||
HELP=yes
|
||||
;;
|
||||
--reset-index )
|
||||
RESETINDEX=yes
|
||||
;;
|
||||
* )
|
||||
usage
|
||||
exit 1
|
||||
|
@ -118,37 +160,57 @@ if [ $HELP = "yes" ]; then
|
|||
exit 0
|
||||
fi
|
||||
|
||||
if [ -z $LENGTH ]; then
|
||||
echo "--length parameter is mandatory"
|
||||
if [ -z $KOHA_CONF ]; then
|
||||
echo "KOHA_CONF is not set"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ -z $PERL5LIB ]; then
|
||||
echo "PERL5LIB is not set"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
||||
TYPESWITCH=
|
||||
SQLTABLE=
|
||||
case $TYPE in
|
||||
biblios )
|
||||
TYPESWITCH=-b
|
||||
SQLTABLE="biblio"
|
||||
;;
|
||||
authorities )
|
||||
TYPESWITCH=-a
|
||||
SQLTABLE="auth_header"
|
||||
;;
|
||||
* )
|
||||
echo "'$TYPE' is an unknown type. Defaulting to 'biblios'"
|
||||
TYPESWITCH=-b
|
||||
TYPE=biblios
|
||||
SQLTABLE="biblio"
|
||||
esac
|
||||
|
||||
PERL=`which perl`
|
||||
if [ -z $PERL ]; then
|
||||
echo "perl not found"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ -z $LENGTH ]; then
|
||||
LENGTH=$($PERL -e '
|
||||
use C4::Context;
|
||||
my ($count) = C4::Context->dbh->selectrow_array(qq{
|
||||
SELECT COUNT(*) FROM '"$SQLTABLE"'
|
||||
});
|
||||
print $count;
|
||||
')
|
||||
fi
|
||||
|
||||
ZEBRAIDX=`which zebraidx`
|
||||
if [ -z $ZEBRAIDX ]; then
|
||||
echo "zebraidx not found"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
YAZMARCDUMP=`which yaz-marcdump`
|
||||
if [ -z $YAZMARCDUMP ]; then
|
||||
echo "yaz-marcdump not found"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
REBUILDZEBRA="`dirname $0`/rebuild_zebra.pl"
|
||||
if [ ! -f $REBUILDZEBRA ]; then
|
||||
echo "$REBUILDZEBRA: file not found"
|
||||
|
@ -158,6 +220,9 @@ fi
|
|||
echo ""
|
||||
echo "Configuration"
|
||||
echo "========================================================================="
|
||||
echo "KOHA_CONF: $KOHA_CONF"
|
||||
echo "PERL5LIB: $PERL5LIB"
|
||||
echo "-------------------------------------------------------------------------"
|
||||
echo "Start at offset: $OFFSET"
|
||||
echo "Total number of records to index: $LENGTH"
|
||||
echo "Initial chunk size: $CHUNKSSIZE"
|
||||
|
@ -165,10 +230,11 @@ echo "Export directory: $EXPORTDIR"
|
|||
echo "Log directory: $LOGDIR"
|
||||
echo "Remove logs before start? $RMLOGS"
|
||||
echo "Type of record: $TYPE"
|
||||
echo "Reset index before start? $RESETINDEX"
|
||||
echo "-------------------------------------------------------------------------"
|
||||
echo "zebraidx path: $ZEBRAIDX"
|
||||
echo "yaz-marcdump path: $YAZMARCDUMP"
|
||||
echo "rebuild_zebra path: $REBUILDZEBRA"
|
||||
echo "perl path: $PERL"
|
||||
echo "========================================================================="
|
||||
|
||||
if [ $NOCONFIRM != "yes" ]; then
|
||||
|
@ -200,7 +266,7 @@ if [ $RMLOGS = "yes" ]; then
|
|||
rm -f $LOGDIR/*.log
|
||||
fi
|
||||
|
||||
REBUILDZEBRA_CMD="$REBUILDZEBRA $TYPESWITCH -v -k -d $EXPORTDIR --offset $OFFSET --length $LENGTH --skip-index"
|
||||
REBUILDZEBRA_CMD="$REBUILDZEBRA $TYPESWITCH -v -x -k -d $EXPORTDIR --offset $OFFSET --length $LENGTH --skip-index"
|
||||
echo "\n$REBUILDZEBRA_CMD"
|
||||
$REBUILDZEBRA_CMD
|
||||
|
||||
|
@ -219,5 +285,11 @@ esac
|
|||
|
||||
CONFIGFILE="$(dirname $KOHA_CONF)/zebradb/zebra-$TYPE.cfg"
|
||||
|
||||
if [ $RESETINDEX = "yes" ]; then
|
||||
RESETINDEX_CMD="$ZEBRAIDX -c $CONFIGFILE init"
|
||||
echo "\n$RESETINDEX_CMD"
|
||||
$RESETINDEX_CMD
|
||||
echo ""
|
||||
fi
|
||||
|
||||
indexfile $EXPORTFILE $CHUNKSSIZE
|
||||
|
|
Loading…
Reference in a new issue