From d1c8d7eccee6e75473bf7c780617d683f64e5229 Mon Sep 17 00:00:00 2001 From: David Gustafsson Date: Wed, 4 Mar 2020 17:07:11 +0100 Subject: [PATCH] Bug 24807: Add "year" type to improve sorting behaviour Add a "year" search field type. Fields with this type will only retain values that looks like years, so invalid values such as whitespace or word characters will not be indexed. This for instance improves the behaviour when sorting by "date-of-publication". If all values are indexed, records with junk data instead of valid years will appear first among the search results, drowning out more relevant hits. If assigning this field the "year" type these records will instead always appear last, regarless of sort order. To test: 1) Have at least two biblios, one with a valid year in 008 (pos 7-10) and another with an invalid one ("uuuu" for example) 2) Perform a wildcard search (*) and sort results by publication date. 3) The record with invalid year of pulication in 008 should appear first 4) Apply patch and run database updates 5) Reindex ElasticSearch 6) Perform the same search as in 2) 7) The record with the invalid year should now appear last Signed-off-by: Nick Clemens Signed-off-by: Katrin Fischer Signed-off-by: Jonathan Druart Signed-off-by: Jonathan Druart --- Koha/SearchEngine/Elasticsearch.pm | 18 +++++++++++++-- Koha/SearchEngine/Elasticsearch/Indexer.pm | 3 +++ .../elasticsearch/field_config.yaml | 2 ++ .../searchengine/elasticsearch/mappings.tt | 5 +++++ .../Koha/SearchEngine/Elasticsearch.t | 22 ++++++++++++++++++- 5 files changed, 47 insertions(+), 3 deletions(-) diff --git a/Koha/SearchEngine/Elasticsearch.pm b/Koha/SearchEngine/Elasticsearch.pm index 3183e1d910..1a8d3aa72b 100644 --- a/Koha/SearchEngine/Elasticsearch.pm +++ b/Koha/SearchEngine/Elasticsearch.pm @@ -39,7 +39,7 @@ use Search::Elasticsearch; use Try::Tiny; use YAML::Syck; -use List::Util qw( sum0 reduce ); +use List::Util qw( sum0 reduce all ); use MARC::File::XML; use MIME::Base64; use Encode qw(encode); @@ -209,6 +209,8 @@ sub get_elasticsearch_mappings { $es_type = 'integer'; } elsif ($type eq 'isbn' || $type eq 'stdno') { $es_type = 'stdno'; + } elsif ($type eq 'year') { + $es_type = 'year'; } if ($search) { @@ -470,7 +472,6 @@ sub _process_mappings { # with differing options for (possibly) mutating data # so need a different copy for each my $_data = $data; - $record_document->{$target} //= []; if (defined $options->{substr}) { my ($start, $length) = @{$options->{substr}}; $_data = length($data) > $start ? substr $data, $start, $length : ''; @@ -478,6 +479,10 @@ sub _process_mappings { if (defined $options->{value_callbacks}) { $_data = reduce { $b->($a) } ($_data, @{$options->{value_callbacks}}); } + if (defined $options->{filter_callbacks}) { + # Skip mapping unless all filter callbacks return true + next unless all { $_->($_data) } @{$options->{filter_callbacks}}; + } if (defined $options->{property}) { $_data = { $options->{property} => $_data @@ -490,6 +495,8 @@ sub _process_mappings { $_data = substr $_data, $nonfiling_chars; } } + + $record_document->{$target} //= []; push @{$record_document->{$target}}, $_data; } } @@ -885,6 +892,13 @@ sub _field_mappings { return $value ? 'true' : 'false'; }; } + elsif ($target_type eq 'year') { + $default_options->{filter_callbacks} //= []; + push @{$default_options->{filter_callbacks}}, sub { + my ($value) = @_; + return $value =~ /^\d+$/; + }; + } if ($search) { my $mapping = [$target_name, $default_options]; diff --git a/Koha/SearchEngine/Elasticsearch/Indexer.pm b/Koha/SearchEngine/Elasticsearch/Indexer.pm index e2c1c91fbd..f9270c0ae0 100644 --- a/Koha/SearchEngine/Elasticsearch/Indexer.pm +++ b/Koha/SearchEngine/Elasticsearch/Indexer.pm @@ -117,6 +117,9 @@ sub update_index { type => 'data', # is just hard coded in Indexer.pm? body => \@body ); + if ($response->{errors}) { + carp "One or more ElasticSearch errors occured when indexing documents"; + } } return $response; } diff --git a/admin/searchengine/elasticsearch/field_config.yaml b/admin/searchengine/elasticsearch/field_config.yaml index c98b26dcfe..d78bdf06dd 100644 --- a/admin/searchengine/elasticsearch/field_config.yaml +++ b/admin/searchengine/elasticsearch/field_config.yaml @@ -25,6 +25,8 @@ search: type: integer null_value: 0 ignore_malformed: true + year: + type: short stdno: type: text analyzer: analyzer_stdno diff --git a/koha-tmpl/intranet-tmpl/prog/en/modules/admin/searchengine/elasticsearch/mappings.tt b/koha-tmpl/intranet-tmpl/prog/en/modules/admin/searchengine/elasticsearch/mappings.tt index 6452f2a98b..af4fc597e3 100644 --- a/koha-tmpl/intranet-tmpl/prog/en/modules/admin/searchengine/elasticsearch/mappings.tt +++ b/koha-tmpl/intranet-tmpl/prog/en/modules/admin/searchengine/elasticsearch/mappings.tt @@ -196,6 +196,11 @@ a.add, a.delete { [% ELSE %] [% END %] + [% IF search_field.type == "year" %] + + [% ELSE %] + + [% END %] [% IF search_field.type == "number" %] [% ELSE %] diff --git a/t/db_dependent/Koha/SearchEngine/Elasticsearch.t b/t/db_dependent/Koha/SearchEngine/Elasticsearch.t index 12c3599885..2692afbd4a 100755 --- a/t/db_dependent/Koha/SearchEngine/Elasticsearch.t +++ b/t/db_dependent/Koha/SearchEngine/Elasticsearch.t @@ -132,7 +132,7 @@ subtest 'get_elasticsearch_mappings() tests' => sub { subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents () tests' => sub { - plan tests => 53; + plan tests => 56; t::lib::Mocks::mock_preference('marcflavour', 'MARC21'); t::lib::Mocks::mock_preference('ElasticsearchMARCFormat', 'ISO2709'); @@ -298,6 +298,17 @@ subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents () tests' marc_type => 'marc21', marc_field => '952l', }, + { + name => 'date-of-publication', + type => 'year', + facet => 0, + suggestible => 0, + searchable => 1, + sort => 1, + marc_type => 'marc21', + marc_field => '008_/7-10', + }, + ); my $se = Test::MockModule->new('Koha::SearchEngine::Elasticsearch'); @@ -329,6 +340,7 @@ subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents () tests' $marc_record_1->append_fields( MARC::Field->new('001', '123'), MARC::Field->new('007', 'ku'), + MARC::Field->new('008', '901111s1962 xxk|||| |00| ||eng c'), MARC::Field->new('020', '', '', a => '1-56619-909-3'), MARC::Field->new('100', '', '', a => 'Author 1'), MARC::Field->new('110', '', '', a => 'Corp Author'), @@ -344,6 +356,7 @@ subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents () tests' my $marc_record_2 = MARC::Record->new(); $marc_record_2->leader(' cam 22 a 4500'); $marc_record_2->append_fields( + MARC::Field->new('008', '901111s19uu xxk|||| |00| ||eng c'), MARC::Field->new('100', '', '', a => 'Author 2'), # MARC::Field->new('210', '', '', a => 'Title 2'), # MARC::Field->new('245', '', '', a => 'Title: second record'), @@ -470,6 +483,10 @@ subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents () tests' 'First document uniform_title__sort field should contain the title with the first four initial characters removed' ); + # Tests for 'year' type and 'filter_callbacks' + is(scalar @{$docs->[0]->{'date-of-publication'}}, 1, 'First document date-of-publication field should contain one value'); + is_deeply($docs->[0]->{'date-of-publication'}, ['1962'], 'First document date-of-publication field should be set correctly'); + # Second record: is(scalar @{$docs->[1]->{author}}, 1, 'Second document author field should contain one value'); @@ -494,6 +511,9 @@ subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents () tests' 'Second document local_classification__sort field should be set correctly' ); + # Tests for 'year' type and 'filter_callbacks' + ok(!(defined $docs->[1]->{'date-of-publication'}), "Second document invalid date-of-publication value should have been removed"); + # Mappings marc_type: ok(!(defined $docs->[0]->{unimarc_title}), "No mapping when marc_type doesn't match marc flavour"); -- 2.39.5