From 19f442c18fd600f098b90fbde18479338d382d9a Mon Sep 17 00:00:00 2001
From: Nick Clemens
Date: Fri, 22 Dec 2023 20:11:34 +0000
Subject: [PATCH] Bug 35086: Add chunk_size option to elasticsearch
 configuration

When performing batch operations we can send a large number of records
for reindexing at once. Currently this can create requests that are too
large for Elasticsearch to process. We need to break these requests
into chunks.

This patch adds a chunk_size configuration to the elasticsearch stanza
in koha-conf.xml. If blank, we default to 5000.

To test:
0 - Have Koha using Elasticsearch
1 - Create and download a report of all barcodes:
    SELECT barcode FROM items
2 - Batch modify these items
3 - Note a single ES indexing job is created
4 - Create and download a report of all authority ids:
    SELECT auth_header.authid FROM auth_header
5 - Set up a MARC modification template, and batch modify all the authorities
6 - Again note a single ES background job is created
7 - Apply patch
8 - Repeat the modifications above - you still get a single job
9 - Edit koha-conf.xml and add <chunk_size>250</chunk_size> to the
    elasticsearch stanza
10 - Repeat modifications - you now get several background ES jobs
11 - prove -v t/db_dependent/Koha/SearchEngine/Elasticsearch/Indexer.t

Signed-off-by: David Nind
Signed-off-by: Jonathan Druart
Signed-off-by: Katrin Fischer
(cherry picked from commit 9951e230e43d08dc6a2fb635877d0db5c1e245ff)
Signed-off-by: Fridolin Somers
(cherry picked from commit 636920558caa114afef1c50256a5031f4be7b0da)
Signed-off-by: Lucas Gass
---
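Note (not part of the commit): with this patch applied, the elasticsearch
stanza in koha-conf.xml can carry the new setting as sketched below. The
server/index_name values are placeholders, and 250 is simply the value used
in step 9 of the test plan:

    <elasticsearch>
        <server>localhost:9200</server>         <!-- placeholder -->
        <index_name>koha_instance</index_name>  <!-- placeholder -->
        <cxn_pool>Static</cxn_pool>
        <chunk_size>250</chunk_size>            <!-- new: records per indexing request, 5000 if unset -->
    </elasticsearch>
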
 Koha/SearchEngine/Elasticsearch/Indexer.pm    | 25 ++++++++++++++-----
 etc/koha-conf.xml                             |  2 ++
 .../Koha/SearchEngine/Elasticsearch/Indexer.t | 20 ++++++++++++++-
 3 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/Koha/SearchEngine/Elasticsearch/Indexer.pm b/Koha/SearchEngine/Elasticsearch/Indexer.pm
index 6101a3a372..0fd22a6e0b 100644
--- a/Koha/SearchEngine/Elasticsearch/Indexer.pm
+++ b/Koha/SearchEngine/Elasticsearch/Indexer.pm
@@ -19,9 +19,10 @@ package Koha::SearchEngine::Elasticsearch::Indexer;
 
 use Carp qw( carp croak );
 use Modern::Perl;
-use Try::Tiny qw( catch try );
-use List::Util qw( any );
-use base qw(Koha::SearchEngine::Elasticsearch);
+use Try::Tiny       qw( catch try );
+use List::Util      qw( any );
+use List::MoreUtils qw( natatime );
+use base            qw(Koha::SearchEngine::Elasticsearch);
 
 use Koha::Exceptions;
 use Koha::Exceptions::Elasticsearch;
@@ -314,6 +315,9 @@ at the moment.
 The other variables are used for parity with Zebra indexing calls. Currently
 the calls are passed through to Zebra as well.
 
+Will obey the chunk_size defined in koha-conf for amount of records to send during a single reindex, or default
+to 5000.
+
 =cut
 
 sub index_records {
@@ -321,10 +325,19 @@ sub index_records {
     $record_numbers = [$record_numbers] if ref $record_numbers ne 'ARRAY' && defined $record_numbers;
     $records        = [$records]        if ref $records ne 'ARRAY' && defined $records;
     if ( $op eq 'specialUpdate' ) {
-        if ($records){
-            $self->update_index( $record_numbers, $records );
+        my $config    = $self->get_elasticsearch_params;
+        my $at_a_time = $config->{chunk_size} // 5000;
+        my ( $record_chunks, $record_id_chunks );
+        $record_chunks    = natatime $at_a_time, @$records if ($records);
+        $record_id_chunks = natatime $at_a_time, @$record_numbers if ($record_numbers);
+        if ($records) {
+            while ( (my @records = $record_chunks->()) && (my @record_ids = $record_id_chunks->()) ) {
+                $self->update_index( \@record_ids, \@records );
+            }
         } else {
-            $self->update_index_background( $record_numbers, $server );
+            while ( my @record_ids = $record_id_chunks->() ) {
+                $self->update_index_background( \@record_ids, $server );
+            }
         }
     }
     elsif ( $op eq 'recordDelete' ) {
diff --git a/etc/koha-conf.xml b/etc/koha-conf.xml
index 845a9f40ac..4925dd192c 100644
--- a/etc/koha-conf.xml
+++ b/etc/koha-conf.xml
@@ -171,6 +171,8 @@
         <cxn_pool>Static</cxn_pool>
+        <!-- Maximum number of records sent to Elasticsearch in a single reindex request; defaults to 5000 when unset -->
+        <!--<chunk_size>5000</chunk_size>-->
 </elasticsearch>
diff --git a/t/db_dependent/Koha/SearchEngine/Elasticsearch/Indexer.t b/t/db_dependent/Koha/SearchEngine/Elasticsearch/Indexer.t
index fa60948ec8..69767bae8e 100755
--- a/t/db_dependent/Koha/SearchEngine/Elasticsearch/Indexer.t
+++ b/t/db_dependent/Koha/SearchEngine/Elasticsearch/Indexer.t
@@ -88,7 +88,7 @@ subtest 'create_index() tests' => sub {
 };
 
 subtest 'index_records() tests' => sub {
-    plan tests => 2;
+    plan tests => 4;
     my $mock_index = Test::MockModule->new("Koha::SearchEngine::Elasticsearch::Indexer");
     $mock_index->mock( update_index => sub {
         my ($self, $record_ids, $records) = @_;
@@ -122,6 +122,24 @@ subtest 'index_records() tests' => sub {
         "Update background " . $biblio->biblionumber,
         "When passing id only to index_records the marc record is fetched and passed through to update_index";
 
+    my $chunks = 0;
+    $mock_index->mock(
+        update_index => sub {
+            my ( $self, $record_ids, $records ) = @_;
+            $chunks++;
+        }
+    );
+
+    t::lib::Mocks::mock_config( 'elasticsearch', { server => 'false', index_name => 'pseudo' } );
+    my @big_array = 1 .. 10000;
+    $indexer->index_records( \@big_array, 'specialUpdate', 'biblioserver', \@big_array );
+    is( $chunks, 2, "We split 10000 records into two chunks when chunk size not set" );
+
+    $chunks = 0;
+    t::lib::Mocks::mock_config( 'elasticsearch', { server => 'false', index_name => 'pseudo', chunk_size => 10 } );
+    $indexer->index_records( \@big_array, 'specialUpdate', 'biblioserver', \@big_array );
+    is( $chunks, 1000, "We split 10000 records into 1000 chunks when chunk size is 10" );
+
 };
 
 subtest 'update_index' => sub {
-- 
2.39.5
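
For reference, a minimal standalone sketch (not part of the patch) of the
List::MoreUtils::natatime pattern the new index_records() code relies on;
the record ids and chunk size below are made up:

    #!/usr/bin/perl
    use Modern::Perl;
    use List::MoreUtils qw( natatime );

    # natatime returns an iterator that hands back up to N elements per
    # call, which is how index_records() splits one big list into chunks.
    my @record_ids = ( 1 .. 23 );    # stand-ins for biblionumbers
    my $chunk_size = 10;             # read from chunk_size in koha-conf.xml, default 5000

    my $iter = natatime $chunk_size, @record_ids;
    while ( my @chunk = $iter->() ) {
        # In the indexer, each @chunk becomes its own update_index_background() job
        say scalar(@chunk) . ' ids: ' . "@chunk";
    }
    # Prints three chunks (10, 10 and 3 ids), matching the test's arithmetic:
    # 10000 records at chunk_size 10 yield 1000 jobs, at the default 5000 just 2.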