Bug 35086: Add chunk_size option to elasticsearch configuration

When performing batch operations we can send a large number of records for reindexing at once.
Currently this can create requests that are too large for Elasticsearch to process. We need
to break these requests into chunks.

This patch adds a chunk_size configuration option to the elasticsearch stanza in koha-conf.xml.

If blank, we default to 5000.
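For example, a stanza with chunking enabled might look like this (the server and index_name values below are placeholders for a local setup; only chunk_size is the new element):

    <elasticsearch>
      <server>localhost:9200</server>
      <index_name>koha_kohadev</index_name>
      <!-- send at most 250 records to Elasticsearch per indexing request -->
      <chunk_size>250</chunk_size>
    </elasticsearch>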

To test:
0 - Have Koha using Elasticsearch
1 - Create and download a report of all barcodes:
    SELECT barcode FROM items
2 - Batch modify these items
3 - Note a single ES indexing job is created
4 - Create and download a report of all authority ids:
    SELECT auth_header.authid FROM auth_header
5 - Set up a MARC modification template, and batch modify all the authorities
6 - Again note a single ES background job is created
7 - Apply patch
8 - Repeat the modifications above - you still get a single job
9 - Edit koha-conf.xml and add <chunk_size>250</chunk_size> to elasticsearch stanza
10 - Repeat modifications - you now get several background ES jobs
11 - prove -v t/db_dependent/Koha/SearchEngine/Elasticsearch/Indexer.t

Signed-off-by: David Nind <david@davidnind.com>

Signed-off-by: Jonathan Druart <jonathan.druart@bugs.koha-community.org>
Signed-off-by: Katrin Fischer <katrin.fischer@bsz-bw.de>
(cherry picked from commit 9951e230e4)
Signed-off-by: Fridolin Somers <fridolin.somers@biblibre.com>
(cherry picked from commit 636920558c)
Signed-off-by: Lucas Gass <lucas@bywatersolutions.com>
Author: Nick Clemens
Date: 2023-12-22 20:11:34 +00:00
Committed-by: Lucas Gass
Parent: 486140a758
Commit: 19f442c18f
3 changed files with 40 additions and 7 deletions

Koha/SearchEngine/Elasticsearch/Indexer.pm

@@ -19,9 +19,10 @@ package Koha::SearchEngine::Elasticsearch::Indexer;
 use Carp qw( carp croak );
 use Modern::Perl;
 
 use Try::Tiny qw( catch try );
 use List::Util qw( any );
+use List::MoreUtils qw( natatime );
 use base qw(Koha::SearchEngine::Elasticsearch);
 
 use Koha::Exceptions;
 use Koha::Exceptions::Elasticsearch;
@@ -314,6 +315,9 @@ at the moment.
 The other variables are used for parity with Zebra indexing calls. Currently the calls are passed through
 to Zebra as well.
 
+Will obey the chunk_size defined in koha-conf for amount of records to send during a single reindex, or default
+to 5000.
+
 =cut
 
 sub index_records {
@@ -321,10 +325,19 @@ sub index_records {
     $record_numbers = [$record_numbers] if ref $record_numbers ne 'ARRAY' && defined $record_numbers;
     $records = [$records] if ref $records ne 'ARRAY' && defined $records;
     if ( $op eq 'specialUpdate' ) {
-        if ($records){
-            $self->update_index( $record_numbers, $records );
+        my $config    = $self->get_elasticsearch_params;
+        my $at_a_time = $config->{chunk_size} // 5000;
+        my ( $record_chunks, $record_id_chunks );
+        $record_chunks    = natatime $at_a_time, @$records        if ($records);
+        $record_id_chunks = natatime $at_a_time, @$record_numbers if ($record_numbers);
+        if ($records) {
+            while ( ( my @records = $record_chunks->() ) && ( my @record_ids = $record_id_chunks->() ) ) {
+                $self->update_index( \@record_ids, \@records );
+            }
         } else {
-            $self->update_index_background( $record_numbers, $server );
+            while ( my @record_ids = $record_id_chunks->() ) {
+                $self->update_index_background( \@record_ids, $server );
+            }
         }
     }
     elsif ( $op eq 'recordDelete' ) {
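For reviewers unfamiliar with natatime ("n at a time") from List::MoreUtils: it returns an iterator that yields up to N elements per call, which is what drives the chunking above. A minimal standalone sketch (the data is made up for illustration):

    use Modern::Perl;
    use List::MoreUtils qw( natatime );

    my @record_ids = ( 1 .. 12 );        # pretend these are biblionumbers
    my $iter = natatime 5, @record_ids;  # yield at most 5 ids per call

    while ( my @chunk = $iter->() ) {
        # chunks come out as (1..5), (6..10), (11, 12)
        say 'would index ' . scalar(@chunk) . " records: @chunk";
    }

Note the && in the while condition of index_records: it advances the record and id iterators in lockstep, so each update_index call receives matching slices of both arrays.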

etc/koha-conf.xml

@@ -171,6 +171,8 @@
         <cxn_pool>Static</cxn_pool>
         <!-- See https://metacpan.org/pod/Search::Elasticsearch#trace_to -->
         <!-- <trace_to>Stderr</trace_to> -->
+        <!-- You can specify the maximum chunk size for records when batch processing in Koha -->
+        <!-- <chunk_size>500</chunk_size> -->
 </elasticsearch>
 <!-- Uncomment the following line if you want to override the Elasticsearch default index settings -->
 <!-- <elasticsearch_index_config>__KOHA_CONF_DIR__/searchengine/elasticsearch/index_config.yaml</elasticsearch_index_config> -->

t/db_dependent/Koha/SearchEngine/Elasticsearch/Indexer.t

@@ -88,7 +88,7 @@ subtest 'create_index() tests' => sub {
 };
 
 subtest 'index_records() tests' => sub {
-    plan tests => 2;
+    plan tests => 4;
 
     my $mock_index = Test::MockModule->new("Koha::SearchEngine::Elasticsearch::Indexer");
     $mock_index->mock( update_index => sub {
         my ($self, $record_ids, $records) = @_;
@@ -122,6 +122,24 @@ subtest 'index_records() tests' => sub {
         "Update background " . $biblio->biblionumber,
         "When passing id only to index_records the marc record is fetched and passed through to update_index";
 
+    my $chunks = 0;
+    $mock_index->mock(
+        update_index => sub {
+            my ( $self, $record_ids, $records ) = @_;
+            $chunks++;
+        }
+    );
+    t::lib::Mocks::mock_config( 'elasticsearch', { server => 'false', index_name => 'pseudo' } );
+    my @big_array = 1 .. 10000;
+    $indexer->index_records( \@big_array, 'specialUpdate', 'biblioserver', \@big_array );
+    is( $chunks, 2, "We split 10000 records into two chunks when chunk size not set" );
+
+    $chunks = 0;
+    t::lib::Mocks::mock_config( 'elasticsearch', { server => 'false', index_name => 'pseudo', chunk_size => 10 } );
+    $indexer->index_records( \@big_array, 'specialUpdate', 'biblioserver', \@big_array );
+    is( $chunks, 1000, "We split 10000 records into 1000 chunks when chunk size is 10" );
+
 };
 
 subtest 'update_index' => sub {