Bug 35086: Add chunk_size option to elasticsearch configuration

When performing batch operations we can send a large number of records for reindexing at once.
Currently this can create requests that are too large for Elasticsearch to process. We need
to break these requests into chunks.

This patch adds a chunk_size configuration option to the elasticsearch stanza in koha-conf.xml.

If blank, we default to 5000.
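For example, a stanza with chunking enabled might look like this (the server and index_name values below are placeholders for a local setup; only chunk_size is the new element):

    <elasticsearch>
      <server>localhost:9200</server>
      <index_name>koha_kohadev</index_name>
      <!-- send at most 250 records to Elasticsearch per indexing request -->
      <chunk_size>250</chunk_size>
    </elasticsearch>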

To test:
0 - Have Koha using Elasticsearch
1 - Create and download a report of all barcodes:
    SELECT barcode FROM items
2 - Batch modify these items
3 - Note a single ES indexing job is created
4 - Create and download a report of all authority ids:
    SELECT auth_header.authid FROM auth_header
5 - Set up a MARC modification template, and batch modify all the authorities
6 - Again note a single ES background job is created
7 - Apply patch
8 - Repeat the modifications above - you still get a single job
9 - Edit koha-conf.xml and add <chunk_size>250</chunk_size> to elasticsearch stanza
10 - Repeat modifications - you now get several background ES jobs
11 - prove -v t/db_dependent/Koha/SearchEngine/Elasticsearch/Indexer.t

Signed-off-by: David Nind <david@davidnind.com>

Signed-off-by: Jonathan Druart <jonathan.druart@bugs.koha-community.org>
Signed-off-by: Katrin Fischer <katrin.fischer@bsz-bw.de>
(cherry picked from commit 9951e230e4)
Signed-off-by: Fridolin Somers <fridolin.somers@biblibre.com>
(cherry picked from commit 636920558c)
Signed-off-by: Lucas Gass <lucas@bywatersolutions.com>
Author: Nick Clemens
Date: 2023-12-22 20:11:34 +00:00
Committed-by: Lucas Gass
Parent: 486140a758
Commit: 19f442c18f
3 changed files with 40 additions and 7 deletions

Koha/SearchEngine/Elasticsearch/Indexer.pm

@@ -19,9 +19,10 @@ package Koha::SearchEngine::Elasticsearch::Indexer;
 use Carp qw( carp croak );
 use Modern::Perl;
 
 use Try::Tiny qw( catch try );
 use List::Util qw( any );
+use List::MoreUtils qw( natatime );
 use base qw(Koha::SearchEngine::Elasticsearch);
 
 use Koha::Exceptions;
 use Koha::Exceptions::Elasticsearch;
@@ -314,6 +315,9 @@ at the moment.
 The other variables are used for parity with Zebra indexing calls. Currently the calls are passed through
 to Zebra as well.
 
+Will obey the chunk_size defined in koha-conf for amount of records to send during a single reindex, or default
+to 5000.
+
 =cut
 
 sub index_records {
@@ -321,10 +325,19 @@ sub index_records {
     $record_numbers = [$record_numbers] if ref $record_numbers ne 'ARRAY' && defined $record_numbers;
     $records = [$records] if ref $records ne 'ARRAY' && defined $records;
     if ( $op eq 'specialUpdate' ) {
-        if ($records){
-            $self->update_index( $record_numbers, $records );
+        my $config    = $self->get_elasticsearch_params;
+        my $at_a_time = $config->{chunk_size} // 5000;
+        my ( $record_chunks, $record_id_chunks );
+        $record_chunks    = natatime $at_a_time, @$records        if ($records);
+        $record_id_chunks = natatime $at_a_time, @$record_numbers if ($record_numbers);
+        if ($records) {
+            while ( ( my @records = $record_chunks->() ) && ( my @record_ids = $record_id_chunks->() ) ) {
+                $self->update_index( \@record_ids, \@records );
+            }
         } else {
-            $self->update_index_background( $record_numbers, $server );
+            while ( my @record_ids = $record_id_chunks->() ) {
+                $self->update_index_background( \@record_ids, $server );
+            }
         }
     }
     elsif ( $op eq 'recordDelete' ) {
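For reviewers unfamiliar with natatime ("n at a time") from List::MoreUtils: it returns an iterator that yields up to N elements per call, which is what drives the chunking above. A minimal standalone sketch (the data is made up for illustration):

    use Modern::Perl;
    use List::MoreUtils qw( natatime );

    my @record_ids = ( 1 .. 12 );        # pretend these are biblionumbers
    my $iter = natatime 5, @record_ids;  # yield at most 5 ids per call

    while ( my @chunk = $iter->() ) {
        # chunks come out as (1..5), (6..10), (11, 12)
        say 'would index ' . scalar(@chunk) . " records: @chunk";
    }

Note the && in the while condition of index_records: it advances the record and id iterators in lockstep, so each update_index call receives matching slices of both arrays.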

etc/koha-conf.xml

@@ -171,6 +171,8 @@
         <cxn_pool>Static</cxn_pool>
         <!-- See https://metacpan.org/pod/Search::Elasticsearch#trace_to -->
         <!-- <trace_to>Stderr</trace_to> -->
+        <!-- You can specify the maximum chunk size for records when batch processing in Koha -->
+        <!-- <chunk_size>500</chunk_size> -->
 </elasticsearch>
 <!-- Uncomment the following line if you want to override the Elasticsearch default index settings -->
 <!-- <elasticsearch_index_config>__KOHA_CONF_DIR__/searchengine/elasticsearch/index_config.yaml</elasticsearch_index_config> -->

t/db_dependent/Koha/SearchEngine/Elasticsearch/Indexer.t

@@ -88,7 +88,7 @@ subtest 'create_index() tests' => sub {
 };
 
 subtest 'index_records() tests' => sub {
-    plan tests => 2;
+    plan tests => 4;
 
     my $mock_index = Test::MockModule->new("Koha::SearchEngine::Elasticsearch::Indexer");
     $mock_index->mock( update_index => sub {
         my ($self, $record_ids, $records) = @_;
@@ -122,6 +122,24 @@ subtest 'index_records() tests' => sub {
         "Update background " . $biblio->biblionumber,
         "When passing id only to index_records the marc record is fetched and passed through to update_index";
 
+    my $chunks = 0;
+    $mock_index->mock(
+        update_index => sub {
+            my ( $self, $record_ids, $records ) = @_;
+            $chunks++;
+        }
+    );
+    t::lib::Mocks::mock_config( 'elasticsearch', { server => 'false', index_name => 'pseudo' } );
+    my @big_array = 1 .. 10000;
+    $indexer->index_records( \@big_array, 'specialUpdate', 'biblioserver', \@big_array );
+    is( $chunks, 2, "We split 10000 records into two chunks when chunk size not set" );
+
+    $chunks = 0;
+    t::lib::Mocks::mock_config( 'elasticsearch', { server => 'false', index_name => 'pseudo', chunk_size => 10 } );
+    $indexer->index_records( \@big_array, 'specialUpdate', 'biblioserver', \@big_array );
+    is( $chunks, 1000, "We split 10000 records into 1000 chunks when chunk size is 10" );
+
 };
 
 subtest 'update_index' => sub {