From 72d0f2336758bd4623cde890a236125ad25b0574 Mon Sep 17 00:00:00 2001
From: Marcel de Rooy
Date: Thu, 7 Nov 2024 15:24:40 +0000
Subject: [PATCH] Bug 38408: Add parallel exporting in rebuild_zebra.pl

The first part of the Zebra rebuild is the exporting. This part is made
faster. The second part with zebraidx is not changed.

A new command-line parameter -forks is added to the rebuild_zebra.pl script.
A subroutine export_marc_records is added between index_records and
export_marc_records_from_sth. The last routine has a new parameter, the
sequence number of the export file.

NOTE: This report does not touch koha-rebuild-zebra yet! This will be
done in a follow-up.

Test plan:
Note that the number of forks/records below can be adjusted according
to your server and database setup.
[1] Reindex a subset of 100 records without forks:
    su [YOUR_KOHA_USER]
    misc/migration_tools/rebuild_zebra.pl -a -b -r -d /tmp/rebuild01 -k --length 100
    Check if /tmp/rebuild01/biblio contains one export file for auth/bib.
    Verify that max. 100 auth and bib were indexed (check Auth search, Cataloguing)
[2] Reindex an additional subset of 100 recs with forks (remove -r, add -forks):
    su [YOUR_KOHA_USER]
    misc/migration_tools/rebuild_zebra.pl -a -b -d /tmp/rebuild02 -k --length 100 --offset 100 -forks 3
    Check if /tmp/rebuild02/biblio contains 3 export files for auth/bib.
    Verify that max. 200 auth and bib were indexed (check Auth search, Cataloguing)
[3] Run a full reindex with forks:
    su [YOUR_KOHA_USER]
    misc/migration_tools/rebuild_zebra.pl -a -b -d /tmp/rebuild03 -k -forks 3
    Check both searches again.
[4] Bonus: To get a feeling of improved speed, reindex a larger production
    db with and without using -forks. (Use something like above.)
    You may add -I to skip indexing in order to better compare both exports.

Signed-off-by: Marcel de Rooy
Reindexed a prod db in 96 mins instead of 150 mins (3 forks, 4 cores).
Main gain in biblio export; complete export took 35 mins, zebraidx 61 mins.
Signed-off-by: Paul Derscheid Signed-off-by: Martin Renvoize Signed-off-by: Katrin Fischer --- misc/migration_tools/rebuild_zebra.pl | 76 ++++++++++++++++++++------- 1 file changed, 58 insertions(+), 18 deletions(-) diff --git a/misc/migration_tools/rebuild_zebra.pl b/misc/migration_tools/rebuild_zebra.pl index b62f5a02a3..b3fa97a0f5 100755 --- a/misc/migration_tools/rebuild_zebra.pl +++ b/misc/migration_tools/rebuild_zebra.pl @@ -17,19 +17,24 @@ use Modern::Perl; -use Koha::Script; -use C4::Context; -use Getopt::Long qw( GetOptions ); -use Fcntl qw( LOCK_EX LOCK_NB LOCK_UN ); -use File::Temp qw( tempdir ); -use File::Path qw( mkpath rmtree ); -use C4::Biblio qw( GetXmlBiblio ); -use C4::AuthoritiesMarc qw( GetAuthority GetAuthorityXML ); -use C4::Items qw( Item2Marc ); -use Koha::RecordProcessor; -use Koha::Caches; +use Getopt::Long qw( GetOptions ); +use Fcntl qw( LOCK_EX LOCK_NB LOCK_UN ); +use File::Temp qw( tempdir ); +use File::Path qw( mkpath rmtree ); +use Parallel::ForkManager; +use POSIX qw/ceil/; use XML::LibXML; +use C4::Context; +use C4::AuthoritiesMarc qw( GetAuthority GetAuthorityXML ); +use C4::Biblio qw( GetXmlBiblio ); +use C4::Items qw( Item2Marc ); +use Koha::Authorities; +use Koha::Biblios; +use Koha::Caches; +use Koha::RecordProcessor; +use Koha::Script; + use constant LOCK_FILENAME => 'rebuild..LCK'; # script that checks zebradir structure & create directories & mandatory files if needed @@ -65,6 +70,8 @@ my $run_user = ( getpwuid($<) )[0]; my $wait_for_lock = 0; my $use_flock; my $table = 'biblioitems'; +my $forks = 0; +my $chunk_size = 100000; my $is_memcached = Koha::Caches->get_instance->memcached_cache; my $verbose_logging = 0; @@ -93,6 +100,7 @@ my $result = GetOptions( 'run-as-root' => \$run_as_root, 'wait-for-lock' => \$wait_for_lock, 't|table:s' => \$table, + 'forks:i' => \$forks, ); if ( not $result or $want_help ) { @@ -409,9 +417,7 @@ sub index_records { mark_zebraqueue_batch_done($entries); } else { - my $sth = 
select_all_records($record_type); - $num_records_exported = - export_marc_records_from_sth( $record_type, $sth, "$directory/$record_type", $nosanitize ); + $num_records_exported = export_marc_records( $record_type, "$directory/$record_type", $nosanitize ); unless ($do_not_clear_zebraqueue) { mark_all_zebraqueue_done($record_type); } @@ -505,7 +511,9 @@ sub select_all_authorities { $strsth .= qq{ WHERE $where } if ($where); $strsth .= qq{ LIMIT $length } if ( $length && !$offset ); $strsth .= qq{ LIMIT $offset,$length } if ( $length && $offset ); - my $sth = $dbh->prepare($strsth); + + # If we are forking, we use a new db handle to prevent a potential deadlock + my $sth = $forks ? C4::Context::_new_dbh->prepare($strsth) : $dbh->prepare($strsth); $sth->execute(); return $sth; } @@ -517,16 +525,48 @@ sub select_all_biblios { $strsth .= qq{ WHERE $where } if ($where); $strsth .= qq{ LIMIT $length } if ( $length && !$offset ); $strsth .= qq{ LIMIT $offset,$length } if ($offset); - my $sth = $dbh->prepare($strsth); + + # If we are forking, we use a new db handle to prevent a potential deadlock + my $sth = $forks ? C4::Context::_new_dbh->prepare($strsth) : $dbh->prepare($strsth); $sth->execute(); return $sth; } +sub export_marc_records { + my ( $record_type, $directory, $nosanitize ) = @_; + my $pm = Parallel::ForkManager->new($forks); + my @original_params = ( $offset, $length ); + $offset ||= 0; + $length ||= ( $record_type eq 'biblio' ? Koha::Biblios->count : Koha::Authorities->count ); + my $chunk_size = $forks ? 
ceil( $length / $forks ) : $length; + my ( $seq, $num_records_exported ) = ( undef, 0 ); + while ( $chunk_size > 0 ) { + + # If you use forks, ->start forks after getting process slot + unless ( $pm->start ) { + + # Child code (or parent when forks parameter is absent) + $length = $chunk_size; + my $sth = select_all_records($record_type); + export_marc_records_from_sth( $record_type, $sth, "$directory", $nosanitize, $seq ); + $pm->finish; # exit for child only + } + $offset += $chunk_size; + $num_records_exported += $chunk_size; + $seq++; + $chunk_size = $length - $num_records_exported if $num_records_exported + $chunk_size > $length; + } + $pm->wait_all_children; + ( $offset, $length ) = @original_params; # restore for a potential second run (with -a -b) + return $num_records_exported; +} + sub export_marc_records_from_sth { - my ( $record_type, $sth, $directory, $nosanitize ) = @_; + my ( $record_type, $sth, $directory, $nosanitize, $seq ) = @_; + $seq ||= q{}; my $num_exported = 0; - open my $fh, '>:encoding(UTF-8) ', "$directory/exported_records" or die $!; + open my $fh, '>:encoding(UTF-8)', "$directory/exported_records$seq" or die $!; print {$fh} $marcxml_open;