Bug 38408: Add parallel exporting in rebuild_zebra.pl
The first part of the Zebra rebuild is the export; this part is made faster. The second part, zebraidx, is not changed. A new command-line parameter -forks is added to the rebuild_zebra.pl script. A subroutine export_marc_records is added between index_records and export_marc_records_from_sth. The latter routine gets a new parameter: the sequence number of the export file.

NOTE: This report does not touch koha-rebuild-zebra yet! This will be done in a follow-up.

Test plan:
Note that the number of forks/records below can be adjusted according to your server and database setup.
[1] Reindex a subset of 100 records without forks:
    su [YOUR_KOHA_USER]
    misc/migration_tools/rebuild_zebra.pl -a -b -r -d /tmp/rebuild01 -k --length 100
    Check if /tmp/rebuild01/biblio contains one export file for auth/bib.
    Verify that at most 100 auth and bib records were indexed (check Auth search, Cataloguing).
[2] Reindex an additional subset of 100 recs with forks (remove -r, add -forks):
    su [YOUR_KOHA_USER]
    misc/migration_tools/rebuild_zebra.pl -a -b -d /tmp/rebuild02 -k --length 100 --offset 100 -forks 3
    Check if /tmp/rebuild02/biblio contains 3 export files for auth/bib.
    Verify that at most 200 auth and bib records were indexed (check Auth search, Cataloguing).
[3] Run a full reindex with forks:
    su [YOUR_KOHA_USER]
    misc/migration_tools/rebuild_zebra.pl -a -b -d /tmp/rebuild03 -k -forks 3
    Check both searches again.
[4] Bonus: To get a feeling for the improved speed, reindex a larger production db with and without -forks. (Use commands like the above.) You may add -I to skip indexing in order to better compare the two exports.

Signed-off-by: Marcel de Rooy <m.de.rooy@rijksmuseum.nl>
Reindexed a prod db in 96 mins instead of 150 mins (3 forks, 4 cores). Main gain is in the biblio export; the complete export took 35 mins, zebraidx 61 mins.
Signed-off-by: Paul Derscheid <paul.derscheid@lmscloud.de>
Signed-off-by: Martin Renvoize <martin.renvoize@ptfs-europe.com>
Signed-off-by: Katrin Fischer <katrin.fischer@bsz-bw.de>
This commit is contained in:
parent 09801ed302
commit 72d0f23367
1 changed file with 58 additions and 18 deletions
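For readers skimming the diff below: the core technique is to split the record set into chunks and let Parallel::ForkManager export each chunk from a child process, so that several export files are written concurrently. A minimal, self-contained sketch of that pattern follows; the counts, paths, and file names are illustrative stand-ins, not the patch's own code.

    #!/usr/bin/perl
    use Modern::Perl;
    use Parallel::ForkManager;
    use POSIX qw( ceil );

    # Illustrative values; the real script derives these from the database and -forks.
    my $forks      = 3;
    my $total      = 1000;
    my $chunk_size = $forks ? ceil( $total / $forks ) : $total;
    my $pm         = Parallel::ForkManager->new($forks);

    my ( $offset, $seq ) = ( 0, 0 );
    while ( $offset < $total ) {
        my $len = $offset + $chunk_size > $total ? $total - $offset : $chunk_size;
        my ( $start, $n ) = ( $offset, $seq );
        unless ( $pm->start ) {    # ->start returns 0 in the child (or runs inline with new(0))
            # Each worker writes its own numbered file, e.g. /tmp/exported_records0
            open my $fh, '>:encoding(UTF-8)', "/tmp/exported_records$n" or die $!;
            print {$fh} "records $start .. " . ( $start + $len - 1 ) . "\n";
            close $fh;
            $pm->finish;           # exit for child only
        }
        $offset += $chunk_size;
        $seq++;
    }
    $pm->wait_all_children;        # parent resumes (e.g. zebraidx) only after all exports finish

With $forks = 3 this writes /tmp/exported_records0 through /tmp/exported_records2 from three concurrent children; with Parallel::ForkManager->new(0) everything runs sequentially in the parent, which is how the patch keeps the no-forks case working.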
@@ -17,19 +17,24 @@
 use Modern::Perl;
 
-use Koha::Script;
-use C4::Context;
-use Getopt::Long qw( GetOptions );
-use Fcntl qw( LOCK_EX LOCK_NB LOCK_UN );
-use File::Temp qw( tempdir );
-use File::Path qw( mkpath rmtree );
-use C4::Biblio qw( GetXmlBiblio );
-use C4::AuthoritiesMarc qw( GetAuthority GetAuthorityXML );
-use C4::Items qw( Item2Marc );
-use Koha::RecordProcessor;
-use Koha::Caches;
+use Getopt::Long qw( GetOptions );
+use Fcntl qw( LOCK_EX LOCK_NB LOCK_UN );
+use File::Temp qw( tempdir );
+use File::Path qw( mkpath rmtree );
+use Parallel::ForkManager;
+use POSIX qw/ceil/;
+use XML::LibXML;
+
+use C4::Context;
+use C4::AuthoritiesMarc qw( GetAuthority GetAuthorityXML );
+use C4::Biblio qw( GetXmlBiblio );
+use C4::Items qw( Item2Marc );
+use Koha::Authorities;
+use Koha::Biblios;
+use Koha::Caches;
+use Koha::RecordProcessor;
+use Koha::Script;
 
 use constant LOCK_FILENAME => 'rebuild..LCK';
 
 # script that checks zebradir structure & create directories & mandatory files if needed
@@ -65,6 +70,8 @@ my $run_user = ( getpwuid($<) )[0];
 my $wait_for_lock = 0;
 my $use_flock;
 my $table = 'biblioitems';
+my $forks = 0;
+my $chunk_size = 100000;
 my $is_memcached = Koha::Caches->get_instance->memcached_cache;
 
 my $verbose_logging = 0;
@@ -93,6 +100,7 @@ my $result = GetOptions(
     'run-as-root' => \$run_as_root,
     'wait-for-lock' => \$wait_for_lock,
     't|table:s' => \$table,
+    'forks:i' => \$forks,
 );
 
 if ( not $result or $want_help ) {
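If the ':i' in the new option spec looks unfamiliar: that is standard Getopt::Long syntax for an optional integer value, not something this patch defines, so a bare -forks is accepted and stores 0 (no forking). A tiny sketch of just that behaviour:

    use Getopt::Long qw( GetOptions );

    # ':i' = optional integer: "-forks 3" stores 3; a bare "-forks" stores 0.
    GetOptions( 'forks:i' => \my $forks );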
@@ -409,9 +417,7 @@ sub index_records {
         mark_zebraqueue_batch_done($entries);
 
     } else {
-        my $sth = select_all_records($record_type);
-        $num_records_exported =
-            export_marc_records_from_sth( $record_type, $sth, "$directory/$record_type", $nosanitize );
+        $num_records_exported = export_marc_records( $record_type, "$directory/$record_type", $nosanitize );
         unless ($do_not_clear_zebraqueue) {
             mark_all_zebraqueue_done($record_type);
         }
@@ -505,7 +511,9 @@ sub select_all_authorities {
     $strsth .= qq{ WHERE $where } if ($where);
     $strsth .= qq{ LIMIT $length } if ( $length && !$offset );
     $strsth .= qq{ LIMIT $offset,$length } if ( $length && $offset );
-    my $sth = $dbh->prepare($strsth);
+
+    # If we are forking, we use a new db handle to prevent a potential deadlock
+    my $sth = $forks ? C4::Context::_new_dbh->prepare($strsth) : $dbh->prepare($strsth);
     $sth->execute();
     return $sth;
 }
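The one-line comment in this hunk (and the matching one below) is terse, so a note on the underlying hazard: after fork(), parent and children would otherwise share the single inherited MySQL connection, and interleaved statements on one socket can garble the protocol or deadlock. C4::Context::_new_dbh is Koha's internal helper for obtaining a fresh handle; in generic DBI terms the safe shape looks roughly like this (DSN and credentials are placeholders):

    use DBI;

    # Give each forked worker its own connection instead of the inherited one.
    sub fresh_dbh {
        return DBI->connect(
            'dbi:mysql:database=koha;host=localhost',    # placeholder DSN
            'koha_user', 'secret',                       # placeholder credentials
            {
                RaiseError          => 1,
                # Keep a child's exit from tearing down the parent's handle:
                AutoInactiveDestroy => 1,
            },
        );
    }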
@@ -517,16 +525,48 @@ sub select_all_biblios {
     $strsth .= qq{ WHERE $where } if ($where);
     $strsth .= qq{ LIMIT $length } if ( $length && !$offset );
     $strsth .= qq{ LIMIT $offset,$length } if ($offset);
-    my $sth = $dbh->prepare($strsth);
+
+    # If we are forking, we use a new db handle to prevent a potential deadlock
+    my $sth = $forks ? C4::Context::_new_dbh->prepare($strsth) : $dbh->prepare($strsth);
     $sth->execute();
     return $sth;
 }
 
+sub export_marc_records {
+    my ( $record_type, $directory, $nosanitize ) = @_;
+    my $pm              = Parallel::ForkManager->new($forks);
+    my @original_params = ( $offset, $length );
+    $offset ||= 0;
+    $length ||= ( $record_type eq 'biblio' ? Koha::Biblios->count : Koha::Authorities->count );
+    my $chunk_size = $forks ? ceil( $length / $forks ) : $length;
+    my ( $seq, $num_records_exported ) = ( undef, 0 );
+    while ( $chunk_size > 0 ) {
+
+        # If you use forks, ->start forks after getting process slot
+        unless ( $pm->start ) {
+
+            # Child code (or parent when forks parameter is absent)
+            $length = $chunk_size;
+            my $sth = select_all_records($record_type);
+            export_marc_records_from_sth( $record_type, $sth, "$directory", $nosanitize, $seq );
+            $pm->finish;    # exit for child only
+        }
+        $offset               += $chunk_size;
+        $num_records_exported += $chunk_size;
+        $seq++;
+        $chunk_size = $length - $num_records_exported if $num_records_exported + $chunk_size > $length;
+    }
+    $pm->wait_all_children;
+    ( $offset, $length ) = @original_params;    # restore for a potential second run (with -a -b)
+    return $num_records_exported;
+}
+
 sub export_marc_records_from_sth {
-    my ( $record_type, $sth, $directory, $nosanitize ) = @_;
+    my ( $record_type, $sth, $directory, $nosanitize, $seq ) = @_;
+    $seq ||= q{};
 
     my $num_exported = 0;
-    open my $fh, '>:encoding(UTF-8) ', "$directory/exported_records" or die $!;
+    open my $fh, '>:encoding(UTF-8)', "$directory/exported_records$seq" or die $!;
 
     print {$fh} $marcxml_open;
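To see how the loop in export_marc_records carves up the record set, here is a worked pass with hypothetical numbers matching the test plan's -forks 3:

    use POSIX qw( ceil );

    # Hypothetical run: 1000 biblios, -forks 3.
    my ( $length, $forks ) = ( 1000, 3 );
    my $chunk_size = ceil( $length / $forks );    # 334
    # Pass 1: offset 0,   334 records, $seq undef -> file "exported_records"
    # Pass 2: offset 334, 334 records, $seq 1     -> file "exported_records1"
    # Pass 3: offset 668, 332 records, $seq 2     -> file "exported_records2"
    #   (before pass 3, 668 exported + 334 > 1000, so the chunk shrinks to 332)
    # After pass 3 the same test shrinks $chunk_size to 0 and the while loop
    # exits: all 1000 records land in three files, which is why the test plan
    # expects 3 export files per record type when running with -forks 3.

Note that the first child receives an undefined $seq, which export_marc_records_from_sth turns into an empty string, so the first file keeps the historical name "exported_records" and the no-forks case still produces exactly the file the old code did.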