From 0d9a42b492ca21cc14fab69d0facb80aeb8dcff4 Mon Sep 17 00:00:00 2001 From: Ere Maijala Date: Wed, 5 Dec 2018 16:27:43 +0200 Subject: [PATCH] Bug 21084: Fix automatic truncation in authority search - Makes token splitting work like in biblio search - Only adds right hand truncation since that's what Zebra also does - Only adds truncation if the token is not a phrase and ends in a word character - Adds tests to existing and new QueryBuilder functions Test plan: 1. Create an authority record for "Duck, Donald" 2. Try contains type authority searches with the following terms and make sure they find the record: Duck, Donald donald duck don duck "Duck, Donald" 3. Make sure the following search works but does not find anything: "Duck, Don" Signed-off-by: Nick Clemens Signed-off-by: Katrin Fischer Signed-off-by: Nick Clemens --- .../Elasticsearch/QueryBuilder.pm | 50 ++++++++--- .../SearchEngine/Elasticsearch/QueryBuilder.t | 86 +++++++++++++++++++ .../SearchEngine/Elasticsearch/QueryBuilder.t | 28 +++--- 3 files changed, 134 insertions(+), 30 deletions(-) create mode 100644 t/Koha/SearchEngine/Elasticsearch/QueryBuilder.t diff --git a/Koha/SearchEngine/Elasticsearch/QueryBuilder.pm b/Koha/SearchEngine/Elasticsearch/QueryBuilder.pm index 543728e841..6d79807e2c 100644 --- a/Koha/SearchEngine/Elasticsearch/QueryBuilder.pm +++ b/Koha/SearchEngine/Elasticsearch/QueryBuilder.pm @@ -209,11 +209,14 @@ sub build_query_compat { # each search thing is a handy unit. unshift @$operators, undef; # The first one can't have an op my @search_params; + my $truncate = C4::Context->preference("QueryAutoTruncate") || 0; my $ea = each_array( @$operands, @$operators, @index_params ); while ( my ( $oand, $otor, $index ) = $ea->() ) { next if ( !defined($oand) || $oand eq '' ); + $oand = $self->_clean_search_term($oand); + $oand = $self->_truncate_terms($oand) if ($truncate); push @search_params, { - operand => $self->_clean_search_term($oand), # the search terms + operand => $oand, # the search terms operator => defined($otor) ? uc $otor : undef, # AND and so on $index ? %$index : (), }; @@ -314,11 +317,14 @@ sub build_authorities_query { } else { # regular wordlist stuff -# push @query_parts, { match => {$wh => { query => $val, operator => 'and' }} }; - my @values = split(' ',$val); - foreach my $v (@values) { - push @query_parts, { wildcard => { "$wh.phrase" => "*" . lc $v . "*" } }; + my @tokens = $self->_split_query( $val ); + foreach my $token ( @tokens ) { + $token = $self->_truncate_terms( + $self->_clean_search_term( $token ) + ); } + my $query = $self->_join_queries( @tokens ); + push @query_parts, { query_string => { default_field => $wh, query => $query } }; } } @@ -714,14 +720,11 @@ to ensure those parts are correct. sub _clean_search_term { my ( $self, $term ) = @_; - my $auto_truncation = C4::Context->preference("QueryAutoTruncate") || 0; - # Some hardcoded searches (like with authorities) produce things like # 'an=123', when it ought to be 'an:123' for our purposes. $term =~ s/=/:/g; $term = $self->_convert_index_strings_freeform($term); $term =~ s/[{}]/"/g; - $term = $self->_truncate_terms($term) if ($auto_truncation); return $term; } @@ -805,21 +808,40 @@ operands and double quoted strings. sub _truncate_terms { my ( $self, $query ) = @_; - # '"donald duck" title:"the mouse" and peter" get split into - # ['', '"donald duck"', '', ' ', '', 'title:"the mouse"', '', ' ', 'and', ' ', 'pete'] - my @tokens = split /((?:[\w\-.]+:)?"[^"]+"|\s+)/, $query; + my @tokens = $self->_split_query( $query ); # Filter out empty tokens my @words = grep { $_ !~ /^\s*$/ } @tokens; - # Append '*' to words if needed, ie. if it's not surrounded by quotes, not - # terminated by '*' and not a keyword + # Append '*' to words if needed, ie. if it ends in a word character and is not a keyword my @terms = map { my $w = $_; - (/"$/ or /\*$/ or grep {lc($w) eq $_} qw/and or not/) ? $_ : "$_*"; + (/\W$/ or grep {lc($w) eq $_} qw/and or not/) ? $_ : "$_*"; } @words; return join ' ', @terms; } +=head2 _split_query + + my @token = $self->_split_query($query_str); + +Given a string query this function splits it to tokens taking into account +any field prefixes and quoted strings. + +=cut + +sub _split_query { + my ( $self, $query ) = @_; + + # '"donald duck" title:"the mouse" and peter" get split into + # ['', '"donald duck"', '', ' ', '', 'title:"the mouse"', '', ' ', 'and', ' ', 'pete'] + my @tokens = split /((?:[\w\-.]+:)?"[^"]+"|\s+)/, $query; + + # Filter out empty values + @tokens = grep( /\S/, @tokens ); + + return @tokens; +} + 1; diff --git a/t/Koha/SearchEngine/Elasticsearch/QueryBuilder.t b/t/Koha/SearchEngine/Elasticsearch/QueryBuilder.t new file mode 100644 index 0000000000..1c7f73e3f5 --- /dev/null +++ b/t/Koha/SearchEngine/Elasticsearch/QueryBuilder.t @@ -0,0 +1,86 @@ +#!/usr/bin/perl +# +# This file is part of Koha. +# +# Koha is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# Koha is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Koha; if not, see . + +use Modern::Perl; + +use Test::More tests => 2; + +use Koha::SearchEngine::Elasticsearch::QueryBuilder; + +subtest '_truncate_terms() tests' => sub { + plan tests => 7; + + my $qb; + ok( + $qb = Koha::SearchEngine::Elasticsearch::QueryBuilder->new({ 'index' => $Koha::SearchEngine::Elasticsearch::BIBLIOS_INDEX }), + 'Creating a new QueryBuilder object' + ); + + my $res = $qb->_truncate_terms('donald'); + is_deeply($res, 'donald*', 'single search term returned correctly'); + + $res = $qb->_truncate_terms('donald duck'); + is_deeply($res, 'donald* duck*', 'two search terms returned correctly'); + + $res = $qb->_truncate_terms(' donald duck '); + is_deeply($res, 'donald* duck*', 'two search terms surrounded by spaces returned correctly'); + + $res = $qb->_truncate_terms('"donald duck"'); + is_deeply($res, '"donald duck"', 'quoted search term returned correctly'); + + $res = $qb->_truncate_terms('"donald, duck"'); + is_deeply($res, '"donald, duck"', 'quoted search term with comma returned correctly'); + + $res = $qb->_truncate_terms(' "donald duck" '); + is_deeply($res, '"donald duck"', 'quoted search terms surrounded by spaces correctly'); +}; + +subtest '_split_query() tests' => sub { + plan tests => 7; + + my $qb; + ok( + $qb = Koha::SearchEngine::Elasticsearch::QueryBuilder->new({ 'index' => $Koha::SearchEngine::Elasticsearch::BIBLIOS_INDEX }), + 'Creating a new QueryBuilder object' + ); + + my @res = $qb->_split_query('donald'); + my @exp = 'donald'; + is_deeply(\@res, \@exp, 'single search term returned correctly'); + + @res = $qb->_split_query('donald duck'); + @exp = ('donald', 'duck'); + is_deeply(\@res, \@exp, 'two search terms returned correctly'); + + @res = $qb->_split_query(' donald duck '); + @exp = ('donald', 'duck'); + is_deeply(\@res, \@exp, 'two search terms surrounded by spaces returned correctly'); + + @res = $qb->_split_query('"donald duck"'); + @exp = ( '"donald duck"' ); + is_deeply(\@res, \@exp, 'quoted search term returned correctly'); + + @res = $qb->_split_query('"donald, duck"'); + @exp = ( '"donald, duck"' ); + is_deeply(\@res, \@exp, 'quoted search term with comma returned correctly'); + + @res = $qb->_split_query(' "donald duck" '); + @exp = ( '"donald duck"' ); + is_deeply(\@res, \@exp, 'quoted search terms surrounded by spaces correctly'); +}; + +1; \ No newline at end of file diff --git a/t/db_dependent/Koha/SearchEngine/Elasticsearch/QueryBuilder.t b/t/db_dependent/Koha/SearchEngine/Elasticsearch/QueryBuilder.t index b6d78ad577..1a109c6a94 100644 --- a/t/db_dependent/Koha/SearchEngine/Elasticsearch/QueryBuilder.t +++ b/t/db_dependent/Koha/SearchEngine/Elasticsearch/QueryBuilder.t @@ -82,7 +82,7 @@ $se->mock( 'get_elasticsearch_mappings', sub { }); subtest 'build_authorities_query_compat() tests' => sub { - plan tests => 44; + plan tests => 36; my $qb; @@ -96,11 +96,11 @@ subtest 'build_authorities_query_compat() tests' => sub { foreach my $koha_name ( keys %{ $koha_to_index_name } ) { my $query = $qb->build_authorities_query_compat( [ $koha_name ], undef, undef, ['contains'], [$search_term], 'AUTH_TYPE', 'asc' ); if ( $koha_name eq 'all' || $koha_name eq 'any' ) { - is( $query->{query}->{bool}->{must}[0]->{wildcard}->{"_all.phrase"}, - "*a*"); + is( $query->{query}->{bool}->{must}[0]->{query_string}->{query}, + "a*"); } else { - is( $query->{query}->{bool}->{must}[0]->{wildcard}->{$koha_to_index_name->{$koha_name}.".phrase"}, - "*a*"); + is( $query->{query}->{bool}->{must}[0]->{query_string}->{query}, + "a*"); } } @@ -108,15 +108,11 @@ subtest 'build_authorities_query_compat() tests' => sub { foreach my $koha_name ( keys %{ $koha_to_index_name } ) { my $query = $qb->build_authorities_query_compat( [ $koha_name ], undef, undef, ['contains'], [$search_term], 'AUTH_TYPE', 'asc' ); if ( $koha_name eq 'all' || $koha_name eq 'any' ) { - is( $query->{query}->{bool}->{must}[0]->{wildcard}->{"_all.phrase"}, - "*donald*"); - is( $query->{query}->{bool}->{must}[1]->{wildcard}->{"_all.phrase"}, - "*duck*"); + is( $query->{query}->{bool}->{must}[0]->{query_string}->{query}, + "(Donald*) AND (Duck*)"); } else { - is( $query->{query}->{bool}->{must}[0]->{wildcard}->{$koha_to_index_name->{$koha_name}.".phrase"}, - "*donald*"); - is( $query->{query}->{bool}->{must}[1]->{wildcard}->{$koha_to_index_name->{$koha_name}.".phrase"}, - "*duck*"); + is( $query->{query}->{bool}->{must}[0]->{query_string}->{query}, + "(Donald*) AND (Duck*)"); } } @@ -360,14 +356,14 @@ subtest 'build query from form subtests' => sub { my $query = $qb->build_authorities_query_compat( \@marclist, undef, undef, \@operator , \@values, 'AUTH_TYPE', 'asc' ); - is($query->{query}->{bool}->{must}[0]->{wildcard}->{'Heading.phrase'}, "*hamilton*","Expected search is populated"); + is($query->{query}->{bool}->{must}[0]->{query_string}->{query}, "Hamilton*","Expected search is populated"); is( scalar @{ $query->{query}->{bool}->{must} }, 1,"Only defined search is populated"); @values[2] = 'Jefferson'; $query = $qb->build_authorities_query_compat( \@marclist, undef, undef, \@operator , \@values, 'AUTH_TYPE', 'asc' ); - is($query->{query}->{bool}->{must}[0]->{wildcard}->{'Heading.phrase'}, "*hamilton*","First index searched as expected"); - is($query->{query}->{bool}->{must}[1]->{wildcard}->{'Match.phrase'}, "*jefferson*","Second index searched when populated"); + is($query->{query}->{bool}->{must}[0]->{query_string}->{query}, "Hamilton*","First index searched as expected"); + is($query->{query}->{bool}->{must}[1]->{query_string}->{query}, "Jefferson*","Second index searched when populated"); is( scalar @{ $query->{query}->{bool}->{must} }, 2,"Only defined searches are populated"); -- 2.39.5