From 342b51629d29db5e9309dc599c67b413e19ebbb8 Mon Sep 17 00:00:00 2001 From: David Gustafsson Date: Thu, 16 Apr 2020 17:26:41 +0200 Subject: [PATCH] Bug 24807: Refactor using tokenize_callbacks Signed-off-by: Nick Clemens Signed-off-by: Katrin Fischer Signed-off-by: Jonathan Druart Signed-off-by: Jonathan Druart --- Koha/SearchEngine/Elasticsearch.pm | 73 ++++++++++++------- .../Koha/SearchEngine/Elasticsearch.t | 4 +- 2 files changed, 50 insertions(+), 27 deletions(-) diff --git a/Koha/SearchEngine/Elasticsearch.pm b/Koha/SearchEngine/Elasticsearch.pm index aabf719e73..c4dce5501a 100644 --- a/Koha/SearchEngine/Elasticsearch.pm +++ b/Koha/SearchEngine/Elasticsearch.pm @@ -471,37 +471,56 @@ sub _process_mappings { # Copy (scalar) data since can have multiple targets # with differing options for (possibly) mutating data # so need a different copy for each - my $_data = $data; + my $data_copy = $data; if (defined $options->{substr}) { my ($start, $length) = @{$options->{substr}}; - $_data = length($data) > $start ? substr $data, $start, $length : ''; + $data_copy = length($data) > $start ? substr $data_copy, $start, $length : ''; + } + + # Add data to tokens array for callbacks processing + my $tokens = [$data_copy]; + + # Tokenize callbacks takes as token (possibly tokenized subfield data) + # as argument, and returns a possibly different list of tokens. + # Note that this list also might be empty. + if (defined $options->{tokenize_callbacks}) { + foreach my $callback (@{$options->{tokenize_callbacks}}) { + # Pass each token to current callback which returns a list + # (scalar is fine too) resulting either in a list or + # a list of lists that will be flattened by perl. + # The next callback will recieve the possibly expanded list of tokens. + $tokens = [ map { $callback->($_) } @{$tokens} ]; + } } if (defined $options->{value_callbacks}) { - $_data = reduce { $b->($a) } ($_data, @{$options->{value_callbacks}}); + $tokens = [ map { reduce { $b->($a) } ($_, @{$options->{value_callbacks}}) } @{$tokens} ]; } if (defined $options->{filter_callbacks}) { - # Skip mapping unless all filter callbacks return true - next unless all { $_data = $_->($_data) } @{$options->{filter_callbacks}}; + my @tokens_filtered; + foreach my $_data (@{$tokens}) { + if ( all { $_->($_data) } @{$options->{filter_callbacks}} ) { + push @tokens_filtered, $_data; + } + } + # Overwrite $tokens with filtered values + $tokens = \@tokens_filtered; } + # Skip mapping if all values has been removed + next unless @{$tokens}; + if (defined $options->{property}) { - $_data = { - $options->{property} => $_data - } + $tokens = [ map { { $options->{property} => $_ } } @{$tokens} ]; } if (defined $options->{nonfiling_characters_indicator}) { my $nonfiling_chars = $meta->{field}->indicator($options->{nonfiling_characters_indicator}); $nonfiling_chars = looks_like_number($nonfiling_chars) ? int($nonfiling_chars) : 0; - if ($nonfiling_chars) { - $_data = substr $_data, $nonfiling_chars; - } + # Nonfiling chars does not make sense for multiple tokens + # Only apply on first element + $tokens->[0] = substr $tokens->[0], $nonfiling_chars; } $record_document->{$target} //= []; - if( ref $_data eq 'ARRAY' ){ - push @{$record_document->{$target}}, @{$_data}; - } else { - push @{$record_document->{$target}}, $_data; - } + push @{$record_document->{$target}}, @{$tokens}; } } @@ -897,16 +916,20 @@ sub _field_mappings { }; } elsif ($target_type eq 'year') { - $default_options->{filter_callbacks} //= []; - push @{$default_options->{filter_callbacks}}, sub { + $default_options->{tokenize_callbacks} //= []; + # Only accept years containing digits and "u" + push @{$default_options->{tokenize_callbacks}}, sub { my ($value) = @_; - my @years = (); - my @field_years = ( $value =~ /[0-9u]{4}/g ); - foreach my $year (@field_years){ - $year =~ s/[u]/0/g; - push @years, $year; - } - return \@years; + my @years = ( $value =~ /[0-9u]{4}/g ); + return @years; + }; + + $default_options->{value_callbacks} //= []; + # Replace "u" with "0" for sorting + push @{$default_options->{value_callbacks}}, sub { + my ($value) = @_; + $value =~ s/[u]/0/g; + return $value; }; } diff --git a/t/db_dependent/Koha/SearchEngine/Elasticsearch.t b/t/db_dependent/Koha/SearchEngine/Elasticsearch.t index c97d84688c..2030bb7e5a 100755 --- a/t/db_dependent/Koha/SearchEngine/Elasticsearch.t +++ b/t/db_dependent/Koha/SearchEngine/Elasticsearch.t @@ -371,6 +371,7 @@ subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents () tests' MARC::Field->new('100', '', '', a => 'Author 2'), # MARC::Field->new('210', '', '', a => 'Title 2'), # MARC::Field->new('245', '', '', a => 'Title: second record'), + MARC::Field->new('260', '', '', a => 'New York :', b => 'Ace ,', c => '1963-2003'), MARC::Field->new('999', '', '', c => '1234568'), MARC::Field->new('952', '', '', 0 => 1, g => 'string where should be numeric', o => $long_callno), ); @@ -531,7 +532,7 @@ subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents () tests' # Tests for 'year' type is_deeply( $docs->[1]->{'copydate'}, - ['1963','2003'], + ['1963', '2003'], 'Second document copydate field should be set correctly' ); is_deeply( @@ -673,7 +674,6 @@ subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents_array () t MARC::Field->new('100', '', '', a => 'Author 2'), # MARC::Field->new('210', '', '', a => 'Title 2'), # MARC::Field->new('245', '', '', a => 'Title: second record'), - MARC::Field->new('260', '', '', a => 'New York :', b => 'Ace ,', c => '1963-2003'), MARC::Field->new('999', '', '', c => '1234568'), MARC::Field->new('952', '', '', 0 => 1, g => 'string where should be numeric'), ); -- 2.39.5