From cbd56aa5fc7664c2dfb482398312f74751f1881f Mon Sep 17 00:00:00 2001 From: David Gustafsson Date: Wed, 4 Mar 2020 17:07:11 +0100 Subject: [PATCH] Bug 24807: [20.05.x] Add "year" type to improve sorting behaviour Add a "year" search field type. Fields with this type will only retain values that looks like years, so invalid values such as whitespace or word characters will not be indexed. This for instance improves the behaviour when sorting by "date-of-publication". If all values are indexed, records with junk data instead of valid years will appear first among the search results, drowning out more relevant hits. If assigning this field the "year" type these records will instead always appear last, regarless of sort order. To test: 1) Have at least two biblios, one with a valid year in 008 (pos 7-10) and another with an invalid one ("uuuu" for example) 2) Perform a wildcard search (*) and sort results by publication date. 3) The record with invalid year of pulication in 008 should appear first 4) Apply patch and run database updates 5) Reindex ElasticSearch 6) Perform the same search as in 2) 7) The record with the invalid year should now appear last Signed-off-by: Nick Clemens Signed-off-by: Katrin Fischer Bug 24807: Add database update script Signed-off-by: Nick Clemens Signed-off-by: Katrin Fischer Bug 24807: Update tests Signed-off-by: Nick Clemens Signed-off-by: Katrin Fischer Bug 24807: Add suppport for uncertain fields and ranges To test: 1 - Have some records with uncertain dates in the 008 19uu, 195u, etc. 2 - Index them in Elasticsearch 3 - Do a search that will return them 4 - Sort results by publication/copyright date 5 - Note odd results 6 - Apply patch 7 - Reindex 8 - Sorting should be improved Signed-off-by: Nick Clemens Signed-off-by: Katrin Fischer Bug 24807: Refactor using tokenize_callbacks Signed-off-by: Nick Clemens Signed-off-by: Katrin Fischer Bug 24807: Simplify with new and imporved value_callbacks Signed-off-by: Nick Clemens Signed-off-by: Katrin Fischer Bug 24807: (follow-up) Fix spelling Signed-off-by: Nick Clemens Signed-off-by: Katrin Fischer Bug 24807: (follow-up) Add support for spaces as unknown characters Signed-off-by: Katrin Fischer Bug 24807: (QA follow-up) Remove uneccessary tests These tests fail now, the code expects a real response from ES in Indexer.pm but these tests mock 'bulk' and so don't have the necessary fields. We are testing the same code above and can just add the _id == biblionumber test Signed-off-by: Lucas Gass --- Koha/SearchEngine/Elasticsearch.pm | 51 ++++++++++--- Koha/SearchEngine/Elasticsearch/Indexer.pm | 3 + .../elasticsearch/field_config.yaml | 2 + .../bug_24807-add-year-search-field-type.perl | 7 ++ .../searchengine/elasticsearch/mappings.tt | 5 ++ .../Koha/SearchEngine/Elasticsearch.t | 73 ++++++++++++++++++- .../Koha/SearchEngine/Elasticsearch/Indexer.t | 33 +-------- 7 files changed, 130 insertions(+), 44 deletions(-) create mode 100644 installer/data/mysql/atomicupdate/bug_24807-add-year-search-field-type.perl diff --git a/Koha/SearchEngine/Elasticsearch.pm b/Koha/SearchEngine/Elasticsearch.pm index 3183e1d910..1df589c07c 100644 --- a/Koha/SearchEngine/Elasticsearch.pm +++ b/Koha/SearchEngine/Elasticsearch.pm @@ -39,7 +39,7 @@ use Search::Elasticsearch; use Try::Tiny; use YAML::Syck; -use List::Util qw( sum0 reduce ); +use List::Util qw( sum0 reduce all ); use MARC::File::XML; use MIME::Base64; use Encode qw(encode); @@ -209,6 +209,8 @@ sub get_elasticsearch_mappings { $es_type = 'integer'; } elsif ($type eq 'isbn' || $type eq 'stdno') { $es_type = 'stdno'; + } elsif ($type eq 'year') { + $es_type = 'year'; } if ($search) { @@ -469,28 +471,44 @@ sub _process_mappings { # Copy (scalar) data since can have multiple targets # with differing options for (possibly) mutating data # so need a different copy for each - my $_data = $data; - $record_document->{$target} //= []; + my $data_copy = $data; if (defined $options->{substr}) { my ($start, $length) = @{$options->{substr}}; - $_data = length($data) > $start ? substr $data, $start, $length : ''; + $data_copy = length($data) > $start ? substr $data_copy, $start, $length : ''; } + + # Add data to values array for callbacks processing + my $values = [$data_copy]; + + # Value callbacks takes subfield data (or values from previous + # callbacks) as argument, and returns a possibly different list of values. + # Note that the returned list may also be empty. if (defined $options->{value_callbacks}) { - $_data = reduce { $b->($a) } ($_data, @{$options->{value_callbacks}}); + foreach my $callback (@{$options->{value_callbacks}}) { + # Pass each value to current callback which returns a list + # (scalar is fine too) resulting either in a list or + # a list of lists that will be flattened by perl. + # The next callback will receive the possibly expanded list of values. + $values = [ map { $callback->($_) } @{$values} ]; + } } + + # Skip mapping if all values has been removed + next unless @{$values}; + if (defined $options->{property}) { - $_data = { - $options->{property} => $_data - } + $values = [ map { { $options->{property} => $_ } } @{$values} ]; } if (defined $options->{nonfiling_characters_indicator}) { my $nonfiling_chars = $meta->{field}->indicator($options->{nonfiling_characters_indicator}); $nonfiling_chars = looks_like_number($nonfiling_chars) ? int($nonfiling_chars) : 0; - if ($nonfiling_chars) { - $_data = substr $_data, $nonfiling_chars; - } + # Nonfiling chars does not make sense for multiple values + # Only apply on first element + $values->[0] = substr $values->[0], $nonfiling_chars; } - push @{$record_document->{$target}}, $_data; + + $record_document->{$target} //= []; + push @{$record_document->{$target}}, @{$values}; } } @@ -885,6 +903,15 @@ sub _field_mappings { return $value ? 'true' : 'false'; }; } + elsif ($target_type eq 'year') { + $default_options->{value_callbacks} //= []; + # Only accept years containing digits and "u" + push @{$default_options->{value_callbacks}}, sub { + my ($value) = @_; + # Replace "u" with "0" for sorting + return map { s/[u\s]/0/gr } ( $value =~ /[0-9u\s]{4}/g ); + }; + } if ($search) { my $mapping = [$target_name, $default_options]; diff --git a/Koha/SearchEngine/Elasticsearch/Indexer.pm b/Koha/SearchEngine/Elasticsearch/Indexer.pm index 4e67a77e87..39e129f7df 100644 --- a/Koha/SearchEngine/Elasticsearch/Indexer.pm +++ b/Koha/SearchEngine/Elasticsearch/Indexer.pm @@ -119,6 +119,9 @@ sub update_index { type => 'data', # is just hard coded in Indexer.pm? body => \@body ); + if ($response->{errors}) { + carp "One or more ElasticSearch errors occurred when indexing documents"; + } } return $response; } diff --git a/admin/searchengine/elasticsearch/field_config.yaml b/admin/searchengine/elasticsearch/field_config.yaml index c98b26dcfe..d78bdf06dd 100644 --- a/admin/searchengine/elasticsearch/field_config.yaml +++ b/admin/searchengine/elasticsearch/field_config.yaml @@ -25,6 +25,8 @@ search: type: integer null_value: 0 ignore_malformed: true + year: + type: short stdno: type: text analyzer: analyzer_stdno diff --git a/installer/data/mysql/atomicupdate/bug_24807-add-year-search-field-type.perl b/installer/data/mysql/atomicupdate/bug_24807-add-year-search-field-type.perl new file mode 100644 index 0000000000..480072c880 --- /dev/null +++ b/installer/data/mysql/atomicupdate/bug_24807-add-year-search-field-type.perl @@ -0,0 +1,7 @@ +$DBversion = 'XXX'; # will be replaced by the RM +if( CheckVersion( $DBversion ) ) { + $dbh->do( "ALTER TABLE `search_field` MODIFY COLUMN `type` enum('','string','date','number','boolean','sum','isbn','stdno','year') NOT NULL" ); + $dbh->do( "UPDATE `search_field` SET type = 'year' WHERE name = 'date-of-publication'" ); + SetVersion( $DBversion ); + print "Upgrade to $DBversion done (Bug 14957 - Add 'year' type to improve sorting behaviour)\n"; +} diff --git a/koha-tmpl/intranet-tmpl/prog/en/modules/admin/searchengine/elasticsearch/mappings.tt b/koha-tmpl/intranet-tmpl/prog/en/modules/admin/searchengine/elasticsearch/mappings.tt index bf6d73d2fe..1c95a9330b 100644 --- a/koha-tmpl/intranet-tmpl/prog/en/modules/admin/searchengine/elasticsearch/mappings.tt +++ b/koha-tmpl/intranet-tmpl/prog/en/modules/admin/searchengine/elasticsearch/mappings.tt @@ -196,6 +196,11 @@ a.add, a.delete { [% ELSE %] [% END %] + [% IF search_field.type == "year" %] + + [% ELSE %] + + [% END %] [% IF search_field.type == "number" %] [% ELSE %] diff --git a/t/db_dependent/Koha/SearchEngine/Elasticsearch.t b/t/db_dependent/Koha/SearchEngine/Elasticsearch.t index 12c3599885..96c8eda321 100644 --- a/t/db_dependent/Koha/SearchEngine/Elasticsearch.t +++ b/t/db_dependent/Koha/SearchEngine/Elasticsearch.t @@ -132,7 +132,7 @@ subtest 'get_elasticsearch_mappings() tests' => sub { subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents () tests' => sub { - plan tests => 53; + plan tests => 59; t::lib::Mocks::mock_preference('marcflavour', 'MARC21'); t::lib::Mocks::mock_preference('ElasticsearchMARCFormat', 'ISO2709'); @@ -297,7 +297,28 @@ subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents () tests' sort => 1, marc_type => 'marc21', marc_field => '952l', + }, + { + name => 'copydate', + type => 'year', + facet => 0, + suggestible => 0, + searchable => 1, + sort => 1, + marc_type => 'marc21', + marc_field => '260c', + }, + { + name => 'date-of-publication', + type => 'year', + facet => 0, + suggestible => 0, + searchable => 1, + sort => 1, + marc_type => 'marc21', + marc_field => '008_/7-10', }, + ); my $se = Test::MockModule->new('Koha::SearchEngine::Elasticsearch'); @@ -329,12 +350,14 @@ subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents () tests' $marc_record_1->append_fields( MARC::Field->new('001', '123'), MARC::Field->new('007', 'ku'), + MARC::Field->new('008', '901111s1962 xxk|||| |00| ||eng c'), MARC::Field->new('020', '', '', a => '1-56619-909-3'), MARC::Field->new('100', '', '', a => 'Author 1'), MARC::Field->new('110', '', '', a => 'Corp Author'), MARC::Field->new('210', '', '', a => 'Title 1'), MARC::Field->new('240', '', '4', a => 'The uniform title with nonfiling indicator'), MARC::Field->new('245', '', '', a => 'Title:', b => 'first record'), + MARC::Field->new('260', '', '', a => 'New York :', b => 'Ace ,', c => 'c1962'), MARC::Field->new('999', '', '', c => '1234567'), # ' ' for testing trimming of white space in boolean value callback: MARC::Field->new('952', '', '', 0 => ' ', g => '123.30', o => $callno, l => 3), @@ -344,20 +367,34 @@ subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents () tests' my $marc_record_2 = MARC::Record->new(); $marc_record_2->leader(' cam 22 a 4500'); $marc_record_2->append_fields( + MARC::Field->new('008', '901111s19uu xxk|||| |00| ||eng c'), MARC::Field->new('100', '', '', a => 'Author 2'), # MARC::Field->new('210', '', '', a => 'Title 2'), # MARC::Field->new('245', '', '', a => 'Title: second record'), + MARC::Field->new('260', '', '', a => 'New York :', b => 'Ace ,', c => '1963-2003'), MARC::Field->new('999', '', '', c => '1234568'), MARC::Field->new('952', '', '', 0 => 1, g => 'string where should be numeric', o => $long_callno), ); - my $records = [ $marc_record_1, $marc_record_2 ]; + + my $marc_record_3 = MARC::Record->new(); + $marc_record_3->leader(' cam 22 a 4500'); + $marc_record_3->append_fields( + MARC::Field->new('008', '901111s19uu xxk|||| |00| ||eng c'), + MARC::Field->new('100', '', '', a => 'Author 2'), + # MARC::Field->new('210', '', '', a => 'Title 3'), + # MARC::Field->new('245', '', '', a => 'Title: third record'), + MARC::Field->new('260', '', '', a => 'New York :', b => 'Ace ,', c => ' 89 '), + MARC::Field->new('999', '', '', c => '1234568'), + MARC::Field->new('952', '', '', 0 => 1, g => 'string where should be numeric', o => $long_callno), + ); + my $records = [$marc_record_1, $marc_record_2, $marc_record_3]; $see->get_elasticsearch_mappings(); #sort_fields will call this and use the actual db values unless we call it first my $docs = $see->marc_records_to_documents($records); # First record: - is(scalar @{$docs}, 2, 'Two records converted to documents'); + is(scalar @{$docs}, 3, 'Two records converted to documents'); is_deeply($docs->[0]->{control_number}, ['123'], 'First record control number should be set correctly'); @@ -470,6 +507,16 @@ subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents () tests' 'First document uniform_title__sort field should contain the title with the first four initial characters removed' ); + # Tests for 'year' type + is(scalar @{$docs->[0]->{'date-of-publication'}}, 1, 'First document date-of-publication field should contain one value'); + is_deeply($docs->[0]->{'date-of-publication'}, ['1962'], 'First document date-of-publication field should be set correctly'); + + is_deeply( + $docs->[0]->{'copydate'}, + ['1962'], + 'First document copydate field should be set correctly' + ); + # Second record: is(scalar @{$docs->[1]->{author}}, 1, 'Second document author field should contain one value'); @@ -494,6 +541,26 @@ subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents () tests' 'Second document local_classification__sort field should be set correctly' ); + # Tests for 'year' type + is_deeply( + $docs->[1]->{'copydate'}, + ['1963', '2003'], + 'Second document copydate field should be set correctly' + ); + is_deeply( + $docs->[1]->{'date-of-publication'}, + ['1900'], + 'Second document date-of-publication field should be set correctly' + ); + + # Third record: + + is_deeply( + $docs->[2]->{'copydate'}, + ['0890'], + 'Third document copydate field should be set correctly' + ); + # Mappings marc_type: ok(!(defined $docs->[0]->{unimarc_title}), "No mapping when marc_type doesn't match marc flavour"); diff --git a/t/db_dependent/Koha/SearchEngine/Elasticsearch/Indexer.t b/t/db_dependent/Koha/SearchEngine/Elasticsearch/Indexer.t index 7a26b70950..6fc4123954 100644 --- a/t/db_dependent/Koha/SearchEngine/Elasticsearch/Indexer.t +++ b/t/db_dependent/Koha/SearchEngine/Elasticsearch/Indexer.t @@ -19,7 +19,7 @@ use Modern::Perl; -use Test::More tests => 3; +use Test::More tests => 2; use Test::MockModule; use t::lib::Mocks; @@ -35,11 +35,11 @@ SKIP: { eval { Koha::SearchEngine::Elasticsearch->get_elasticsearch_params; }; - skip 'Elasticsearch configuration not available', 2 + skip 'Elasticsearch configuration not available', 1 if $@; subtest 'create_index() tests' => sub { - plan tests => 5; + plan tests => 6; my $se = Test::MockModule->new( 'Koha::SearchEngine::Elasticsearch' ); $se->mock( '_read_configuration', sub { my ($self, $sub ) = @_; @@ -72,6 +72,7 @@ subtest 'create_index() tests' => sub { my $response = $indexer->update_index([1], $records); is( $response->{errors}, 0, "no error on update_index" ); is( scalar(@{$response->{items}}), 1, "1 item indexed" ); + is( $response->{items}[0]->{index}->{_id},"1", "We should get a string matching the bibnumber passed in"); is( $indexer->drop_index(), @@ -79,30 +80,4 @@ subtest 'create_index() tests' => sub { 'Dropping the index' ); }; - -subtest 'update_index() tests' => sub { - plan tests => 2; - my $kse = Test::MockModule->new( 'Koha::SearchEngine::Elasticsearch' ); - $kse->mock( 'marc_records_to_documents', sub { - my ($self, $params ) = @_; - return [1]; - }); - - my $indexer; - ok( - $indexer = Koha::SearchEngine::Elasticsearch::Indexer->new({ 'index' => 'biblios' }), - 'Creating a new indexer object' - ); - - my $searcher = $indexer->get_elasticsearch(); - my $se = Test::MockModule->new( ref $searcher ); - $se->mock( 'bulk', sub { - my ($self, %params ) = @_; - return $params{body}; - }); - - my $bibnumber_array = $indexer->update_index([13],["faked"]); - is( $bibnumber_array->[0]->{index}->{_id},"13", "We should get a string matching the bibnumber"); -}; - } -- 2.39.5