From 7d741a69919c17f8c8c6a4c965d93c95c2355793 Mon Sep 17 00:00:00 2001 From: Ere Maijala Date: Fri, 1 Feb 2019 13:18:11 +0200 Subject: [PATCH] Bug 22246: Fix indexing of large fields with Elasticsearch Deduplicate multivalued fields and make sure sort fields are not excessively long. Also updates default mappings so that sort fields are not created for item fields where it doesn't make sense. Test plan: 1. Reset ES mappings in administration 2. Check that sort is '0' for local-classification in biblio mappings. 3. Change sort back to '1' for local-classification for the next steps. 4. Create a record with 20 items, each with a 100 character long call number 5. Check that when indexed, the record in ES does not have duplicates in any of the item fields and local-classification__sort is truncated to 255 characters. Signed-off-by: Nick Clemens Signed-off-by: Josef Moravec Signed-off-by: Nick Clemens (cherry picked from commit bd68e2f75b2bd0cae7935715f4aa03342809cb9f) Signed-off-by: Martin Renvoize --- Koha/SearchEngine/Elasticsearch.pm | 21 +++++--- .../searchengine/elasticsearch/mappings.yaml | 54 +++++++++---------- t/Koha/SearchEngine/Elasticsearch.t | 36 +++++++++++-- 3 files changed, 73 insertions(+), 38 deletions(-) diff --git a/Koha/SearchEngine/Elasticsearch.pm b/Koha/SearchEngine/Elasticsearch.pm index f7aa3826bb..5cd0ad968a 100644 --- a/Koha/SearchEngine/Elasticsearch.pm +++ b/Koha/SearchEngine/Elasticsearch.pm @@ -375,12 +375,7 @@ sub _process_mappings { $options->{property} => $_data } } - # For sort fields, index only a single field with concatenated values - if ($sort && @{$record_document->{$target}}) { - @{$record_document->{$target}}[0] .= " $_data"; - } else { - push @{$record_document->{$target}}, $_data; - } + push @{$record_document->{$target}}, $_data; } } @@ -512,6 +507,20 @@ sub marc_records_to_documents { } } + # Remove duplicate values and collapse sort fields + foreach my $field (keys %{$record_document}) { + if (ref($record_document->{$field}) eq 'ARRAY') { + @{$record_document->{$field}} = do { + my %seen; + grep { !$seen{ref($_) eq 'HASH' && defined $_->{input} ? $_->{input} : $_}++ } @{$record_document->{$field}}; + }; + if ($field =~ /__sort$/) { + # Make sure to keep the sort field length sensible. 255 was chosen as a nice round value. + $record_document->{$field} = [substr(join(' ', @{$record_document->{$field}}), 0, 255)]; + } + } + } + # TODO: Perhaps should check if $records_document non empty, but really should never be the case $record->encoding('UTF-8'); my @warnings; diff --git a/admin/searchengine/elasticsearch/mappings.yaml b/admin/searchengine/elasticsearch/mappings.yaml index 86b1b07cf3..ec0db59357 100644 --- a/admin/searchengine/elasticsearch/mappings.yaml +++ b/admin/searchengine/elasticsearch/mappings.yaml @@ -1967,17 +1967,17 @@ biblios: - facet: '1' marc_field: 952b marc_type: marc21 - sort: ~ + sort: 0 suggestible: '' - facet: '1' marc_field: 952b marc_type: normarc - sort: ~ + sort: 0 suggestible: '' - facet: '' marc_field: 995c marc_type: unimarc - sort: ~ + sort: 0 suggestible: '' type: string homebranch: @@ -1986,17 +1986,17 @@ biblios: - facet: '1' marc_field: 952a marc_type: marc21 - sort: ~ + sort: 0 suggestible: '' - facet: '1' marc_field: 952a marc_type: normarc - sort: ~ + sort: 0 suggestible: '' - facet: '1' marc_field: 995b marc_type: unimarc - sort: ~ + sort: 0 suggestible: '' type: string host-item: @@ -2217,17 +2217,17 @@ biblios: - facet: '' marc_field: '9529' marc_type: marc21 - sort: ~ + sort: 0 suggestible: '' - facet: '' marc_field: '9529' marc_type: normarc - sort: ~ + sort: 0 suggestible: '' - facet: '' marc_field: '9959' marc_type: unimarc - sort: ~ + sort: 0 suggestible: '' type: number itemtype: @@ -2634,22 +2634,22 @@ biblios: - facet: '' marc_field: 952o marc_type: marc21 - sort: ~ + sort: 0 suggestible: '1' - facet: '' marc_field: 952o marc_type: normarc - sort: ~ + sort: 0 suggestible: '1' - facet: '' marc_field: '686' marc_type: unimarc - sort: ~ + sort: 0 suggestible: '' - facet: '' marc_field: 995k marc_type: unimarc - sort: ~ + sort: 0 suggestible: '1' type: '' local-number: @@ -2658,17 +2658,17 @@ biblios: - facet: '' marc_field: 999c marc_type: marc21 - sort: ~ + sort: 0 suggestible: '' - facet: '' marc_field: 999c marc_type: normarc - sort: ~ + sort: 0 suggestible: '' - facet: '' marc_field: '001' marc_type: unimarc - sort: ~ + sort: 0 suggestible: '' type: string location: @@ -2677,17 +2677,17 @@ biblios: - facet: '1' marc_field: 952c marc_type: marc21 - sort: ~ + sort: 0 suggestible: '' - facet: '1' marc_field: 952c marc_type: normarc - sort: ~ + sort: 0 suggestible: '' - facet: '1' marc_field: 995e marc_type: unimarc - sort: ~ + sort: 0 suggestible: '' type: '' lost: @@ -2829,7 +2829,7 @@ biblios: - facet: '' marc_field: '700' marc_type: marc21 - sort: ~ + sort: 0 suggestible: '' - facet: '' marc_field: '710' @@ -2956,7 +2956,7 @@ biblios: - facet: '' marc_field: '060' marc_type: marc21 - sort: ~ + sort: 0 suggestible: '' type: '' not-onloan-count: @@ -2998,17 +2998,17 @@ biblios: - facet: 0 marc_field: '9527' marc_type: marc21 - sort: ~ + sort: 0 suggestible: 0 - facet: 0 marc_field: '9527' marc_type: normarc - sort: ~ + sort: 0 suggestible: 0 - facet: 0 marc_field: 995o marc_type: unimarc - sort: ~ + sort: 0 suggestible: 0 type: number number-db: @@ -3072,17 +3072,17 @@ biblios: - facet: '' marc_field: 952q marc_type: marc21 - sort: ~ + sort: 0 suggestible: '' - facet: '' marc_field: 952q marc_type: normarc - sort: ~ + sort: 0 suggestible: '' - facet: '' marc_field: 995n marc_type: unimarc - sort: ~ + sort: 0 suggestible: '' type: boolean other-control-number: diff --git a/t/Koha/SearchEngine/Elasticsearch.t b/t/Koha/SearchEngine/Elasticsearch.t index a4fc6b561b..1fe3ad14e1 100644 --- a/t/Koha/SearchEngine/Elasticsearch.t +++ b/t/Koha/SearchEngine/Elasticsearch.t @@ -117,7 +117,7 @@ subtest 'get_elasticsearch_mappings() tests' => sub { subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents () tests' => sub { - plan tests => 47; + plan tests => 49; t::lib::Mocks::mock_preference('marcflavour', 'MARC21'); @@ -212,6 +212,15 @@ subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents () tests' marc_type => 'marc21', marc_field => '9520', }, + { + name => 'local_classification', + type => 'string', + facet => 0, + suggestible => 0, + sort => 1, + marc_type => 'marc21', + marc_field => '952o', + }, { name => 'type_of_record', type => 'string', @@ -251,6 +260,10 @@ subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents () tests' my $see = Koha::SearchEngine::Elasticsearch::Search->new({ index => $Koha::SearchEngine::Elasticsearch::BIBLIOS_INDEX }); + my $callno = 'ABC123'; + my $callno2 = 'ABC456'; + my $long_callno = '1234567890' x 30; + my $marc_record_1 = MARC::Record->new(); $marc_record_1->leader(' cam 22 a 4500'); $marc_record_1->append_fields( @@ -262,8 +275,9 @@ subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents () tests' MARC::Field->new('245', '', '', a => 'Title:', b => 'first record'), MARC::Field->new('999', '', '', c => '1234567'), # ' ' for testing trimming of white space in boolean value callback: - MARC::Field->new('952', '', '', 0 => ' ', g => '123.30'), - MARC::Field->new('952', '', '', 0 => 0, g => '127.20'), + MARC::Field->new('952', '', '', 0 => ' ', g => '123.30', o => $callno), + MARC::Field->new('952', '', '', 0 => 0, g => '127.20', o => $callno2), + MARC::Field->new('952', '', '', 0 => 1, g => '0.00', o => $long_callno), ); my $marc_record_2 = MARC::Record->new(); $marc_record_2->leader(' cam 22 a 4500'); @@ -272,7 +286,7 @@ subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents () tests' # MARC::Field->new('210', '', '', a => 'Title 2'), # MARC::Field->new('245', '', '', a => 'Title: second record'), MARC::Field->new('999', '', '', c => '1234568'), - MARC::Field->new('952', '', '', 0 => 1, g => 'string where should be numeric'), + MARC::Field->new('952', '', '', 0 => 1, g => 'string where should be numeric', o => $long_callno), ); my $records = [$marc_record_1, $marc_record_2]; @@ -336,7 +350,7 @@ subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents () tests' is(scalar @{$docs->[0][1]->{items_withdrawn_status}}, 2, 'First document items_withdrawn_status field should have two values'); is_deeply( $docs->[0][1]->{items_withdrawn_status}, - ['false', 'false'], + ['false', 'true'], 'First document items_withdrawn_status field should be set correctly' ); @@ -372,6 +386,12 @@ subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents () tests' is(scalar @{$docs->[0][1]->{isbn}}, 4, 'First document isbn field should contain four values'); is_deeply($docs->[0][1]->{isbn}, ['978-1-56619-909-4', '9781566199094', '1-56619-909-3', '1566199093'], 'First document isbn field should be set correctly'); + is_deeply( + $docs->[0][1]->{'local_classification'}, + [$callno, $callno2, $long_callno], + 'First document local_classification field should be set correctly' + ); + # Second record: is(scalar @{$docs->[1][1]->{author}}, 1, 'Second document author field should contain one value'); @@ -390,6 +410,12 @@ subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents () tests' 'Second document sum_item_price field should be set correctly' ); + is_deeply( + $docs->[1][1]->{local_classification__sort}, + [substr($long_callno, 0, 255)], + 'Second document local_classification__sort field should be set correctly' + ); + # Mappings marc_type: ok(!(defined $docs->[0][1]->{unimarc_title}), "No mapping when marc_type doesn't match marc flavour"); -- 2.39.5