From f5347f49587ee5406d4a3051117ada4687bf7a79 Mon Sep 17 00:00:00 2001 From: Nick Clemens Date: Fri, 1 Sep 2023 17:30:14 +0000 Subject: [PATCH] Bug 27153: Add filter option to Elasticsearch indexing This patch uses the filter option from the mappings to add a value_callback to record processing for indexing. Fields defined with 'punctuation' filter will have all punctuation stripped when converted to documents. Tests are updated. Signed-off-by: Danielle M Elder Signed-off-by: Martin Renvoize Signed-off-by: Tomas Cohen Arazi --- Koha/SearchEngine/Elasticsearch.pm | 46 ++++++++++++---- .../Koha/SearchEngine/Elasticsearch.t | 53 ++++++++++++++----- 2 files changed, 75 insertions(+), 24 deletions(-) diff --git a/Koha/SearchEngine/Elasticsearch.pm b/Koha/SearchEngine/Elasticsearch.pm index 94193f279f..3f26cbc027 100644 --- a/Koha/SearchEngine/Elasticsearch.pm +++ b/Koha/SearchEngine/Elasticsearch.pm @@ -194,7 +194,7 @@ sub get_elasticsearch_mappings { my $marcflavour = lc C4::Context->preference('marcflavour'); $self->_foreach_mapping( sub { - my ( $name, $type, $facet, $suggestible, $sort, $search, $marc_type ) = @_; + my ( $name, $type, $facet, $suggestible, $sort, $search, $filter, $marc_type ) = @_; return if $marc_type ne $marcflavour; # TODO if this gets any sort of complexity to it, it should # be broken out into its own function. 
@@ -291,7 +291,8 @@ sub raw_elasticsearch_mappings { marc_type => $marc_map->marc_type, marc_field => $marc_map->marc_field, sort => $marc_to_field->sort, - suggestible => $marc_to_field->suggestible || '' + suggestible => $marc_to_field->suggestible || '', + filter => $marc_to_field->filter || '' }); } @@ -381,7 +382,8 @@ sub reset_elasticsearch_mappings { facet => $mapping->{facet} || 0, suggestible => $mapping->{suggestible} || 0, sort => $mapping->{sort} // 1, - search => $mapping->{search} // 1 + search => $mapping->{search} // 1, + filter => $mapping->{filter} // '' }); } } @@ -887,9 +889,9 @@ sub _array_to_marc { return $record; } -=head2 _field_mappings($facet, $suggestible, $sort, $search, $target_name, $target_type, $range) +=head2 _field_mappings($facet, $suggestible, $sort, $search, $filter, $target_name, $target_type, $range) - my @mappings = _field_mappings($facet, $suggestible, $sort, $search, $target_name, $target_type, $range) + my @mappings = _field_mappings( $facet, $suggestible, $sort, $search, $filter, $target_name, $target_type, $range ) Get mappings, an internal data structure later used by L<_process_mappings($mappings, $data, $record_document, $meta)> to process MARC target @@ -947,7 +949,7 @@ be extracted. 
=cut sub _field_mappings { - my ($_self, $facet, $suggestible, $sort, $search, $target_name, $target_type, $range) = @_; + my ( $_self, $facet, $suggestible, $sort, $search, $filter, $target_name, $target_type, $range ) = @_; my %mapping_defaults = (); my @mappings; @@ -984,6 +986,18 @@ sub _field_mappings { }; } + if ( defined $filter && $filter eq 'punctuation' ) { + $default_options->{value_callbacks} //= []; + push @{ $default_options->{value_callbacks} }, sub { + my ($value) = @_; + + # Trim punctuation marks from field + $value =~ + s/[\x00-\x1F,\x21-\x2F,\x3A-\x40,\x5B-\x60,\x7B-\x89,\x8B,\x8D,\x8F,\x90-\x99,\x9B,\x9D,\xA0-\xBF,\xD7,\xF7]//g; + return $value; + }; + } + if ($search) { my $mapping = [$target_name, $default_options]; push @mappings, $mapping; @@ -1046,7 +1060,7 @@ sub _get_marc_mapping_rules { }; $self->_foreach_mapping(sub { - my ($name, $type, $facet, $suggestible, $sort, $search, $marc_type, $marc_field) = @_; + my ($name, $type, $facet, $suggestible, $sort, $search, $filter, $marc_type, $marc_field) = @_; return if $marc_type ne $marcflavour; if ($type eq 'sum') { @@ -1110,7 +1124,7 @@ sub _get_marc_mapping_rules { } my $range = defined $3 ? $3 : undef; - my @mappings = $self->_field_mappings($facet, $suggestible, $sort, $search, $name, $type, $range); + my @mappings = $self->_field_mappings($facet, $suggestible, $sort, $search, $filter, $name, $type, $range); if ($field_tag < 10) { $rules->{control_fields}->{$field_tag} //= []; push @{$rules->{control_fields}->{$field_tag}}, @{clone(\@mappings)}; @@ -1129,7 +1143,7 @@ sub _get_marc_mapping_rules { } elsif ($marc_field =~ $leader_regexp) { my $range = defined $1 ? 
$1 : undef; - my @mappings = $self->_field_mappings($facet, $suggestible, $sort, $search, $name, $type, $range); + my @mappings = $self->_field_mappings($facet, $suggestible, $sort, $search, $filter, $name, $type, $range); push @{$rules->{leader}}, @{clone(\@mappings)}; } else { @@ -1182,7 +1196,7 @@ sub _get_marc_mapping_rules { $self->_foreach_mapping( sub { - my ( $name, $type, $facet, $suggestible, $sort, $marc_type, + my ( $name, $type, $facet, $suggestible, $sort, $search, $filter, $marc_type, $marc_field ) = @_; return unless $marc_type eq 'marc21'; @@ -1218,6 +1232,15 @@ should be sorted on. False if a) but not b). Undef if not a). This allows, for example, author to be sorted on but not everything marked with "author" to be included in that sort. +=item C<$search> + +True if this value should be searchable. + +=item C<$filter> + +Contains a string that represents a filter defined in the indexing code. Currently supports +the option 'punctuation' + =item C<$marc_type> A string that indicates the MARC type that this mapping is for, e.g. 
'marc21', @@ -1245,6 +1268,7 @@ sub _foreach_mapping { 'search_marc_to_fields.suggestible', 'search_marc_to_fields.sort', 'search_marc_to_fields.search', + 'search_marc_to_fields.filter', 'search_marc_map.marc_type', 'search_marc_map.marc_field', ], @@ -1253,6 +1277,7 @@ sub _foreach_mapping { 'suggestible', 'sort', 'search', + 'filter', 'marc_type', 'marc_field', ], @@ -1269,6 +1294,7 @@ sub _foreach_mapping { $search_field->get_column('suggestible'), $search_field->get_column('sort'), $search_field->get_column('search'), + $search_field->get_column('filter'), $search_field->get_column('marc_type'), $search_field->get_column('marc_field'), ); diff --git a/t/db_dependent/Koha/SearchEngine/Elasticsearch.t b/t/db_dependent/Koha/SearchEngine/Elasticsearch.t index c4b8f59f72..dbbfeccbb4 100755 --- a/t/db_dependent/Koha/SearchEngine/Elasticsearch.t +++ b/t/db_dependent/Koha/SearchEngine/Elasticsearch.t @@ -145,14 +145,15 @@ subtest 'get_elasticsearch_mappings() tests' => sub { marc_field => '001', }, { - name => 'isbn', - type => 'string', - facet => 0, + name => 'isbn', + type => 'string', + facet => 0, suggestible => 0, - searchable => 1, - sort => 1, - marc_type => 'marc21', - marc_field => '020a', + searchable => 1, + filter => 'punctuation', + sort => 1, + marc_type => 'marc21', + marc_field => '020a', }, ); my $search_engine_module = Test::MockModule->new('Koha::SearchEngine::Elasticsearch'); @@ -167,6 +168,7 @@ subtest 'get_elasticsearch_mappings() tests' => sub { $map->{suggestible}, $map->{sort}, $map->{searchable}, + $map->{filter}, $map->{marc_type}, $map->{marc_field} ); @@ -184,7 +186,7 @@ subtest 'get_elasticsearch_mappings() tests' => sub { subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents () tests' => sub { - plan tests => 65; + plan tests => 66; t::lib::Mocks::mock_preference('marcflavour', 'MARC21'); t::lib::Mocks::mock_preference('ElasticsearchMARCFormat', 'ISO2709'); @@ -240,6 +242,17 @@ subtest 
'Koha::SearchEngine::Elasticsearch::marc_records_to_documents () tests' marc_type => 'marc21', marc_field => '245(ab)ab', }, + { + name => 'title-no-punctuation', + type => 'string', + facet => 0, + suggestible => 1, + searchable => 1, + sort => undef, + filter => 'punctuation', + marc_type => 'marc21', + marc_field => '245(ab)ab', + }, { name => 'unimarc_title', type => 'string', @@ -394,6 +407,7 @@ subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents () tests' $map->{suggestible}, $map->{sort}, $map->{searchable}, + $map->{filter}, $map->{marc_type}, $map->{marc_field} ); @@ -454,11 +468,11 @@ subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents () tests' my $marc_record_4 = MARC::Record->new(); $marc_record_4->leader(' cam 22 a 4500'); $marc_record_4->append_fields( - MARC::Field->new('008', '901111s19uu xxk|||| |00| ||eng c'), - MARC::Field->new('100', '', '', a => 'Author 2'), - MARC::Field->new('245', '', '4', a => 'The Title :', b => 'fourth record'), - MARC::Field->new('260', '', '', a => 'New York :', b => 'Ace ,', c => ' 89 '), - MARC::Field->new('999', '', '', c => '1234568'), + MARC::Field->new( '008', '901111s19uu xxk|||| |00| ||eng c' ), + MARC::Field->new( '100', '', '', a => 'Author 2' ), + MARC::Field->new( '245', '', '4', a => 'The Title\'s the thing :', b => 'fourth record' ), + MARC::Field->new( '260', '', '', a => 'New York :', b => 'Ace ,', c => ' 89 ' ), + MARC::Field->new( '999', '', '', c => '1234568' ), ); my $records = [$marc_record_1, $marc_record_2, $marc_record_3, $marc_record_4]; @@ -487,7 +501,15 @@ subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents () tests' is_deeply($docs->[0]->{title__sort}, ['Title: first record Title: first record'], 'First document title__sort field should be set correctly'); is(scalar @{$docs->[3]->{title__sort}}, 1, 'First document title__sort field should have a single'); - is_deeply($docs->[3]->{title__sort}, ['Title : fourth record The Title : fourth 
record'], 'Fourth document title__sort field should be set correctly'); + is_deeply( + $docs->[3]->{title__sort}, ['Title\'s the thing : fourth record The Title\'s the thing : fourth record'], + 'Fourth document title__sort field should be set correctly' + ); + is_deeply( + $docs->[3]->{'title-no-punctuation'}, + [ 'The Titles the thing ', 'fourth record', 'The Titles the thing fourth record' ], + 'Fourth document title-no-punctuation field should be set correctly' + ); is($docs->[0]->{issues}, 6, 'Issues field should be sum of the issues for each item'); is($docs->[0]->{issues__sort}, 6, 'Issues sort field should also be a sum of the issues'); @@ -764,6 +786,7 @@ subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents_array () t $map->{suggestible}, $map->{sort}, $map->{searchable}, + $map->{filter}, $map->{marc_type}, $map->{marc_field} ); @@ -859,6 +882,7 @@ subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents () authori $map->{suggestible}, $map->{sort}, $map->{searchable}, + $map->{filter}, $map->{marc_type}, $map->{marc_field} ); @@ -959,6 +983,7 @@ subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents with Inclu $map->{suggestible}, $map->{sort}, $map->{searchable}, + $map->{filter}, $map->{marc_type}, $map->{marc_field} ); -- 2.39.5