浏览代码

Bug 22246: Fix indexing of large fields with Elasticsearch

Deduplicate multivalued fields and make sure sort fields are not excessively long. Also updates default mappings so that sort fields are not created for item fields where it doesn't make sense.

Test plan:
1. Reset ES mappings in administration
2. Check that sort is '0' for local-classification in biblio mappings.
3. Change sort back to '1' for local-classification for the next steps.
4. Create a record with 20 items, each with a 100 character long call number
5. Check that when indexed, the record in ES does not have duplicates in any of the item fields and local-classification__sort is truncated to 255 characters.

Signed-off-by: Nick Clemens <nick@bywatersolutions.com>

Signed-off-by: Josef Moravec <josef.moravec@gmail.com>

Signed-off-by: Nick Clemens <nick@bywatersolutions.com>
19.05.x
Ere Maijala 3 年前
提交者 Nick Clemens
父节点
当前提交
bd68e2f75b
  1. 21
      Koha/SearchEngine/Elasticsearch.pm
  2. 54
      admin/searchengine/elasticsearch/mappings.yaml
  3. 36
      t/Koha/SearchEngine/Elasticsearch.t

21
Koha/SearchEngine/Elasticsearch.pm

@ -375,12 +375,7 @@ sub _process_mappings {
$options->{property} => $_data
}
}
# For sort fields, index only a single field with concatenated values
if ($sort && @{$record_document->{$target}}) {
@{$record_document->{$target}}[0] .= " $_data";
} else {
push @{$record_document->{$target}}, $_data;
}
push @{$record_document->{$target}}, $_data;
}
}
@ -512,6 +507,20 @@ sub marc_records_to_documents {
}
}
# Remove duplicate values and collapse sort fields
foreach my $field (keys %{$record_document}) {
if (ref($record_document->{$field}) eq 'ARRAY') {
@{$record_document->{$field}} = do {
my %seen;
grep { !$seen{ref($_) eq 'HASH' && defined $_->{input} ? $_->{input} : $_}++ } @{$record_document->{$field}};
};
if ($field =~ /__sort$/) {
# Make sure to keep the sort field length sensible. 255 was chosen as a nice round value.
$record_document->{$field} = [substr(join(' ', @{$record_document->{$field}}), 0, 255)];
}
}
}
# TODO: Perhaps should check if $records_document non empty, but really should never be the case
$record->encoding('UTF-8');
my @warnings;

54
admin/searchengine/elasticsearch/mappings.yaml

@ -1967,17 +1967,17 @@ biblios:
- facet: '1'
marc_field: 952b
marc_type: marc21
sort: ~
sort: 0
suggestible: ''
- facet: '1'
marc_field: 952b
marc_type: normarc
sort: ~
sort: 0
suggestible: ''
- facet: ''
marc_field: 995c
marc_type: unimarc
sort: ~
sort: 0
suggestible: ''
type: string
homebranch:
@ -1986,17 +1986,17 @@ biblios:
- facet: '1'
marc_field: 952a
marc_type: marc21
sort: ~
sort: 0
suggestible: ''
- facet: '1'
marc_field: 952a
marc_type: normarc
sort: ~
sort: 0
suggestible: ''
- facet: '1'
marc_field: 995b
marc_type: unimarc
sort: ~
sort: 0
suggestible: ''
type: string
host-item:
@ -2217,17 +2217,17 @@ biblios:
- facet: ''
marc_field: '9529'
marc_type: marc21
sort: ~
sort: 0
suggestible: ''
- facet: ''
marc_field: '9529'
marc_type: normarc
sort: ~
sort: 0
suggestible: ''
- facet: ''
marc_field: '9959'
marc_type: unimarc
sort: ~
sort: 0
suggestible: ''
type: number
itemtype:
@ -2634,22 +2634,22 @@ biblios:
- facet: ''
marc_field: 952o
marc_type: marc21
sort: ~
sort: 0
suggestible: '1'
- facet: ''
marc_field: 952o
marc_type: normarc
sort: ~
sort: 0
suggestible: '1'
- facet: ''
marc_field: '686'
marc_type: unimarc
sort: ~
sort: 0
suggestible: ''
- facet: ''
marc_field: 995k
marc_type: unimarc
sort: ~
sort: 0
suggestible: '1'
type: ''
local-number:
@ -2658,17 +2658,17 @@ biblios:
- facet: ''
marc_field: 999c
marc_type: marc21
sort: ~
sort: 0
suggestible: ''
- facet: ''
marc_field: 999c
marc_type: normarc
sort: ~
sort: 0
suggestible: ''
- facet: ''
marc_field: '001'
marc_type: unimarc
sort: ~
sort: 0
suggestible: ''
type: string
location:
@ -2677,17 +2677,17 @@ biblios:
- facet: '1'
marc_field: 952c
marc_type: marc21
sort: ~
sort: 0
suggestible: ''
- facet: '1'
marc_field: 952c
marc_type: normarc
sort: ~
sort: 0
suggestible: ''
- facet: '1'
marc_field: 995e
marc_type: unimarc
sort: ~
sort: 0
suggestible: ''
type: ''
lost:
@ -2829,7 +2829,7 @@ biblios:
- facet: ''
marc_field: '700'
marc_type: marc21
sort: ~
sort: 0
suggestible: ''
- facet: ''
marc_field: '710'
@ -2956,7 +2956,7 @@ biblios:
- facet: ''
marc_field: '060'
marc_type: marc21
sort: ~
sort: 0
suggestible: ''
type: ''
not-onloan-count:
@ -2998,17 +2998,17 @@ biblios:
- facet: 0
marc_field: '9527'
marc_type: marc21
sort: ~
sort: 0
suggestible: 0
- facet: 0
marc_field: '9527'
marc_type: normarc
sort: ~
sort: 0
suggestible: 0
- facet: 0
marc_field: 995o
marc_type: unimarc
sort: ~
sort: 0
suggestible: 0
type: number
number-db:
@ -3072,17 +3072,17 @@ biblios:
- facet: ''
marc_field: 952q
marc_type: marc21
sort: ~
sort: 0
suggestible: ''
- facet: ''
marc_field: 952q
marc_type: normarc
sort: ~
sort: 0
suggestible: ''
- facet: ''
marc_field: 995n
marc_type: unimarc
sort: ~
sort: 0
suggestible: ''
type: boolean
other-control-number:

36
t/Koha/SearchEngine/Elasticsearch.t

@ -117,7 +117,7 @@ subtest 'get_elasticsearch_mappings() tests' => sub {
subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents () tests' => sub {
plan tests => 47;
plan tests => 49;
t::lib::Mocks::mock_preference('marcflavour', 'MARC21');
@ -212,6 +212,15 @@ subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents () tests'
marc_type => 'marc21',
marc_field => '9520',
},
{
name => 'local_classification',
type => 'string',
facet => 0,
suggestible => 0,
sort => 1,
marc_type => 'marc21',
marc_field => '952o',
},
{
name => 'type_of_record',
type => 'string',
@ -251,6 +260,10 @@ subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents () tests'
my $see = Koha::SearchEngine::Elasticsearch::Search->new({ index => $Koha::SearchEngine::Elasticsearch::BIBLIOS_INDEX });
my $callno = 'ABC123';
my $callno2 = 'ABC456';
my $long_callno = '1234567890' x 30;
my $marc_record_1 = MARC::Record->new();
$marc_record_1->leader(' cam 22 a 4500');
$marc_record_1->append_fields(
@ -262,8 +275,9 @@ subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents () tests'
MARC::Field->new('245', '', '', a => 'Title:', b => 'first record'),
MARC::Field->new('999', '', '', c => '1234567'),
# ' ' for testing trimming of white space in boolean value callback:
MARC::Field->new('952', '', '', 0 => ' ', g => '123.30'),
MARC::Field->new('952', '', '', 0 => 0, g => '127.20'),
MARC::Field->new('952', '', '', 0 => ' ', g => '123.30', o => $callno),
MARC::Field->new('952', '', '', 0 => 0, g => '127.20', o => $callno2),
MARC::Field->new('952', '', '', 0 => 1, g => '0.00', o => $long_callno),
);
my $marc_record_2 = MARC::Record->new();
$marc_record_2->leader(' cam 22 a 4500');
@ -272,7 +286,7 @@ subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents () tests'
# MARC::Field->new('210', '', '', a => 'Title 2'),
# MARC::Field->new('245', '', '', a => 'Title: second record'),
MARC::Field->new('999', '', '', c => '1234568'),
MARC::Field->new('952', '', '', 0 => 1, g => 'string where should be numeric'),
MARC::Field->new('952', '', '', 0 => 1, g => 'string where should be numeric', o => $long_callno),
);
my $records = [$marc_record_1, $marc_record_2];
@ -336,7 +350,7 @@ subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents () tests'
is(scalar @{$docs->[0][1]->{items_withdrawn_status}}, 2, 'First document items_withdrawn_status field should have two values');
is_deeply(
$docs->[0][1]->{items_withdrawn_status},
['false', 'false'],
['false', 'true'],
'First document items_withdrawn_status field should be set correctly'
);
@ -372,6 +386,12 @@ subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents () tests'
is(scalar @{$docs->[0][1]->{isbn}}, 4, 'First document isbn field should contain four values');
is_deeply($docs->[0][1]->{isbn}, ['978-1-56619-909-4', '9781566199094', '1-56619-909-3', '1566199093'], 'First document isbn field should be set correctly');
is_deeply(
$docs->[0][1]->{'local_classification'},
[$callno, $callno2, $long_callno],
'First document local_classification field should be set correctly'
);
# Second record:
is(scalar @{$docs->[1][1]->{author}}, 1, 'Second document author field should contain one value');
@ -390,6 +410,12 @@ subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents () tests'
'Second document sum_item_price field should be set correctly'
);
is_deeply(
$docs->[1][1]->{local_classification__sort},
[substr($long_callno, 0, 255)],
'Second document local_classification__sort field should be set correctly'
);
# Mappings marc_type:
ok(!(defined $docs->[0][1]->{unimarc_title}), "No mapping when marc_type doesn't match marc flavour");

正在加载...
取消
保存