Bug 35086: Add chunk_size option to elasticsearch configuration
When performing batch operations we can send a large number of records for reindexing at once. Currently this can create requests that are too large for Elasticsearch to process. We need to break these requests into chunks.

This patch adds a chunk_size configuration to the elasticsearch stanza in koha-conf.xml. If blank, we default to 5000.

To test:
0 - Have Koha using Elasticsearch
1 - Create and download a report of all barcodes: SELECT barcode FROM items
2 - Batch modify these items
3 - Note a single ES indexing job is created
4 - Create and download a report of all authority ids: SELECT auth_header.authid FROM auth_header
5 - Set up a MARC modification template, and batch modify all the authorities
6 - Again note a single ES background job is created
7 - Apply patch
8 - Repeat the modifications above - you still get a single job
9 - Edit koha-conf.xml and add <chunk_size>250</chunk_size> to the elasticsearch stanza
10 - Repeat modifications - you now get several background ES jobs
11 - prove -v t/db_dependent/Koha/SearchEngine/Elasticsearch/Indexer.t

Signed-off-by: David Nind <david@davidnind.com>
Signed-off-by: Jonathan Druart <jonathan.druart@bugs.koha-community.org>
Signed-off-by: Katrin Fischer <katrin.fischer@bsz-bw.de>
(cherry picked from commit 9951e230e4)
Signed-off-by: Fridolin Somers <fridolin.somers@biblibre.com>
(cherry picked from commit 636920558c)
Signed-off-by: Lucas Gass <lucas@bywatersolutions.com>
parent 486140a758
commit 19f442c18f
3 changed files with 40 additions and 7 deletions
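A minimal sketch of the idea behind the patch, before the diff (illustrative only, not Koha code): split one oversized reindex request into several smaller batches, with the batch size read from configuration and defaulting to 5000. The @record_ids list and enqueue_job() below are hypothetical stand-ins for the real record lists and background jobs.

use Modern::Perl;
use List::MoreUtils qw( natatime );

# Hypothetical stand-in for submitting one background indexing job.
sub enqueue_job {
    my @ids = @_;
    say 'queued indexing job with ' . scalar(@ids) . ' records';
}

my $chunk_size = 250;                    # as set in the test plan above
my $at_a_time  = $chunk_size // 5000;    # default used when not configured

my @record_ids = ( 1 .. 1000 );          # hypothetical batch of record ids
my $iterator   = natatime $at_a_time, @record_ids;
while ( my @chunk = $iterator->() ) {
    enqueue_job(@chunk);                 # four jobs of 250 instead of one of 1000
}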
Koha/SearchEngine/Elasticsearch/Indexer.pm

@@ -19,9 +19,10 @@ package Koha::SearchEngine::Elasticsearch::Indexer;
 
 use Carp qw( carp croak );
 use Modern::Perl;
-use Try::Tiny qw( catch try );
-use List::Util qw( any );
-use base qw(Koha::SearchEngine::Elasticsearch);
+use Try::Tiny qw( catch try );
+use List::Util qw( any );
+use List::MoreUtils qw( natatime );
+use base qw(Koha::SearchEngine::Elasticsearch);
 
 use Koha::Exceptions;
 use Koha::Exceptions::Elasticsearch;
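The functional change in this hunk is the new List::MoreUtils import. natatime ("n at a time") returns an iterator closure that yields the next n elements of the list on each call, and an empty list once exhausted; a quick illustration:

use Modern::Perl;
use List::MoreUtils qw( natatime );

my $it = natatime 3, 'a' .. 'h';

# Prints "a b c", "d e f", then the two-element remainder "g h";
# the next call returns an empty list, which ends the loop.
while ( my @chunk = $it->() ) {
    say "@chunk";
}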
@@ -314,6 +315,9 @@ at the moment.
 The other variables are used for parity with Zebra indexing calls. Currently the calls are passed through
 to Zebra as well.
 
+Will obey the chunk_size defined in koha-conf for amount of records to send during a single reindex, or default
+to 5000.
+
 =cut
 
 sub index_records {
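For context, a call to the documented method might look as follows; this is a hedged sketch assuming a running Koha environment, with the arguments matching the ( $record_numbers, $op, $server, $records ) order used in the tests further down, and the index name 'biblios' assumed.

# Sketch only: requires a working Koha with Elasticsearch configured.
use Modern::Perl;
use Koha::SearchEngine::Elasticsearch::Indexer;

my $indexer = Koha::SearchEngine::Elasticsearch::Indexer->new( { index => 'biblios' } );

# With this patch, a large list is queued as several background jobs
# of at most chunk_size ids each, rather than one oversized job.
my @biblionumbers = ( 1 .. 12_000 );
$indexer->index_records( \@biblionumbers, 'specialUpdate', 'biblioserver' );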
@@ -321,10 +325,19 @@ sub index_records {
     $record_numbers = [$record_numbers] if ref $record_numbers ne 'ARRAY' && defined $record_numbers;
     $records = [$records] if ref $records ne 'ARRAY' && defined $records;
     if ( $op eq 'specialUpdate' ) {
-        if ($records){
-            $self->update_index( $record_numbers, $records );
+        my $config = $self->get_elasticsearch_params;
+        my $at_a_time = $config->{chunk_size} // 5000;
+        my ( $record_chunks, $record_id_chunks );
+        $record_chunks = natatime $at_a_time, @$records if ($records);
+        $record_id_chunks = natatime $at_a_time, @$record_numbers if ($record_numbers);
+        if ($records) {
+            while ( (my @records = $record_chunks->()) && (my @record_ids = $record_id_chunks->()) ) {
+                $self->update_index( \@record_ids, \@records );
+            }
         } else {
-            $self->update_index_background( $record_numbers, $server );
+            while ( my @record_ids = $record_id_chunks->() ) {
+                $self->update_index_background( \@record_ids, $server );
+            }
         }
     }
     elsif ( $op eq 'recordDelete' ) {
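One detail worth noting in the hunk above: when both ids and records are passed, the two natatime iterators are built with the same chunk size over equal-length lists, so each update_index() call receives matching slices. A standalone sketch of that lockstep iteration, using hypothetical parallel arrays:

use Modern::Perl;
use List::MoreUtils qw( natatime );

# Hypothetical parallel lists standing in for record ids and records.
my @ids     = ( 1 .. 7 );
my @records = map {"record-$_"} @ids;

my $id_chunks     = natatime 3, @ids;
my $record_chunks = natatime 3, @records;

# Both iterators advance together: (1,2,3), (4,5,6), then the (7) remainder.
while ( ( my @recs = $record_chunks->() ) && ( my @chunk_ids = $id_chunks->() ) ) {
    say join( ',', @chunk_ids ) . ' => ' . join( ',', @recs );
}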
koha-conf.xml

@@ -171,6 +171,8 @@
         <cxn_pool>Static</cxn_pool>
         <!-- See https://metacpan.org/pod/Search::Elasticsearch#trace_to -->
         <!-- <trace_to>Stderr</trace_to> -->
+        <!-- You can specify the maximum chunk size for records when batch processing in Koha -->
+        <!-- <chunk_size>500</chunk_size> -->
     </elasticsearch>
     <!-- Uncomment the following line if you want to override the Elasticsearch default index settings -->
     <!-- <elasticsearch_index_config>__KOHA_CONF_DIR__/searchengine/elasticsearch/index_config.yaml</elasticsearch_index_config> -->
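On the consuming side, get_elasticsearch_params() returns this stanza as a hashref and the defined-or operator supplies the fallback, so an absent chunk_size silently means 5000. A tiny illustration of that fallback, with hypothetical config hashrefs standing in for the parsed stanza:

use Modern::Perl;

# Hypothetical hashrefs standing in for get_elasticsearch_params().
for my $config ( { chunk_size => 250 }, {} ) {
    # '//' keeps an explicit value and falls back when chunk_size
    # is missing or undef.
    my $at_a_time = $config->{chunk_size} // 5000;    # 250, then 5000
    say "chunk size: $at_a_time";
}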
t/db_dependent/Koha/SearchEngine/Elasticsearch/Indexer.t

@@ -88,7 +88,7 @@ subtest 'create_index() tests' => sub {
 };
 
 subtest 'index_records() tests' => sub {
-    plan tests => 2;
+    plan tests => 4;
     my $mock_index = Test::MockModule->new("Koha::SearchEngine::Elasticsearch::Indexer");
     $mock_index->mock( update_index => sub {
         my ($self, $record_ids, $records) = @_;
@@ -122,6 +122,24 @@ subtest 'index_records() tests' => sub {
         "Update background " . $biblio->biblionumber,
         "When passing id only to index_records the marc record is fetched and passed through to update_index";
 
+    my $chunks = 0;
+    $mock_index->mock(
+        update_index => sub {
+            my ( $self, $record_ids, $records ) = @_;
+            $chunks++;
+        }
+    );
+
+    t::lib::Mocks::mock_config( 'elasticsearch', { server => 'false', index_name => 'pseudo' } );
+    my @big_array = 1 .. 10000;
+    $indexer->index_records( \@big_array, 'specialUpdate', 'biblioserver', \@big_array );
+    is( $chunks, 2, "We split 10000 records into two chunks when chunk size not set" );
+
+    $chunks = 0;
+    t::lib::Mocks::mock_config( 'elasticsearch', { server => 'false', index_name => 'pseudo', chunk_size => 10 } );
+    $indexer->index_records( \@big_array, 'specialUpdate', 'biblioserver', \@big_array );
+    is( $chunks, 1000, "We split 10000 records into 1000 chunks when chunk size is 10" );
+
 };
 
 subtest 'update_index' => sub {
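The expected counts in these tests follow directly from the chunk arithmetic: ceil(10000/5000) = 2 and ceil(10000/10) = 1000. A standalone check of that arithmetic with natatime, independent of the Koha test harness:

use Modern::Perl;
use Test::More tests => 2;
use List::MoreUtils qw( natatime );

# Count the chunks natatime yields for a given chunk size.
sub chunk_count {
    my ( $size, @list ) = @_;
    my $it    = natatime $size, @list;
    my $count = 0;
    while ( my @chunk = $it->() ) { $count++ }
    return $count;
}

my @big_array = 1 .. 10_000;
is( chunk_count( 5000, @big_array ), 2,    '10000 records, default 5000 => 2 chunks' );
is( chunk_count( 10,   @big_array ), 1000, '10000 records, chunk_size 10 => 1000 chunks' );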