From 7b8ea20bce5e1e2329536677dba049a768f19a83 Mon Sep 17 00:00:00 2001 From: Nick Clemens Date: Thu, 9 Nov 2017 16:23:57 +0000 Subject: [PATCH] Bug 19604: Elasticsearch Fixes for build_authorities_query for auth searching To test: 1 - Do some authority searches in Zebra 2 - Switch to ES and repeat, results will vary and some may fail 3 - Apply patch and dependencies 4 - Reindex ES 5 - Repeat searches, they should succeed and results should be similar to Zebra 6 - Slight differences are okay, but results should (mostly) meet expectations A few notes: We add a 'normalizer' to ensure we get a single token from the heading indexes, this makes 'starts with' work as expected We switch to 'AND' for fields searched from cataloging editor - this matches Zebra results We force the '__sort' fields for sorting - if sorting looks wrong try reducing the heading field to a single subfield - this will need to be addressed on a future bug (multiple subfields create an array, ES sorts those randomly) Signed-off-by: Nicolas Legrand Signed-off-by: Katrin Fischer Signed-off-by: Nick Clemens (cherry picked from commit f983c1e3f7404c60143d595ba5824f5324829723) Signed-off-by: Martin Renvoize --- Koha/SearchEngine/Elasticsearch.pm | 10 ++++ .../Elasticsearch/QueryBuilder.pm | 55 +++++++++---------- 2 files changed, 35 insertions(+), 30 deletions(-) diff --git a/Koha/SearchEngine/Elasticsearch.pm b/Koha/SearchEngine/Elasticsearch.pm index fe2e0ea17c..90c05e8ed5 100644 --- a/Koha/SearchEngine/Elasticsearch.pm +++ b/Koha/SearchEngine/Elasticsearch.pm @@ -142,6 +142,12 @@ sub get_elasticsearch_settings { my $settings = { index => { analysis => { + normalizer => { + my_normalizer => { + type => "custom", + char_filter => ['icu_normalizer'], + } + }, analyzer => { analyser_phrase => { tokenizer => 'icu_tokenizer', @@ -273,6 +279,10 @@ sub _elasticsearch_mapping_for_default { }, raw => { type => "keyword", + }, + lc_raw => { + type => "keyword", + normalizer => "my_normalizer", } }, }; diff --git 
a/Koha/SearchEngine/Elasticsearch/QueryBuilder.pm b/Koha/SearchEngine/Elasticsearch/QueryBuilder.pm index ebc2786619..5c24d3cefd 100644 --- a/Koha/SearchEngine/Elasticsearch/QueryBuilder.pm +++ b/Koha/SearchEngine/Elasticsearch/QueryBuilder.pm @@ -290,40 +290,46 @@ sub build_authorities_query { # Start by making the query parts my @query_parts; - my @filter_parts; + foreach my $s ( @{ $search->{searches} } ) { my ( $wh, $op, $val ) = @{$s}{qw(where operator value)}; $wh = '_all' if $wh eq ''; if ( $op eq 'is' || $op eq '=' ) { - # look for something that matches completely + # look for something that matches a term completely # note, '=' is about numerical vals. May need special handling. - # _allphrase is a special field that only groups the exact - # matches. Also, we lowercase our search because the ES + # Also, we lowercase our search because the ES # index lowercases its values, and term searches don't get the # search analyzer applied to them. - push @filter_parts, { term => { "$wh.phrase" => lc $val } }; + push @query_parts, { term => {"$wh.phrase" => lc $val} }; } elsif ( $op eq 'exact' ) { - # left and right truncation, otherwise an exact phrase - push @query_parts, { match_phrase => { $wh => $val } }; + push @query_parts, { match_phrase => {"$wh.phrase" => lc $val} }; } elsif ( $op eq 'start' ) { - - # startswith search - push @query_parts, { wildcard => { "$wh.phrase" => lc "$val*" } }; + # startswith search, uses lowercase untokenized version of heading + push @query_parts, { prefix => {"$wh.lc_raw" => lc $val} }; } else { # regular wordlist stuff - push @query_parts, { match => { $wh => $val } }; +# push @query_parts, { match => {$wh => { query => $val, operator => 'and' }} }; + my @values = split(' ',$val); + foreach $val (@values) { + push @query_parts, { wildcard => { "$wh.phrase" => "*" . lc $val . 
"*" } }; + } } } - # Merge the query and filter parts appropriately - # 'should' behaves like 'or', if we want 'and', use 'must' - my $query_part = { bool => { should => \@query_parts } }; - my $filter_part = { bool => { should => \@filter_parts } }; + # Merge the query parts appropriately + # 'should' behaves like 'or' + # 'must' behaves like 'and' + # Zebra results seem to match must so using that here + my $query = { query=> + { bool => + { must => \@query_parts } + } + }; # We need to add '.phrase' to all the sort headings otherwise it'll sort # based on the tokenised form. @@ -336,20 +342,9 @@ sub build_authorities_query { $search->{sort} = \%s; } - # extract the sort stuff - my %sort; - %sort = ( sort => [ $search->{sort} ] ) if exists $search->{sort}; - my $query; - if (@filter_parts) { - $query = - { query => - { filtered => { filter => $filter_part, query => $query_part } } - }; - } - else { - $query = { query => $query_part }; - } - $query = { %$query, %sort }; + # add the sort stuff + $query->{sort} = [ $search->{sort} ] if exists $search->{sort}; + return $query; } @@ -446,7 +441,7 @@ sub build_authorities_query_compat { my %sort; my $sort_field = - ( $orderby =~ /^Heading/ ) ? 'Heading' + ( $orderby =~ /^Heading/ ) ? 'Heading__sort' : ( $orderby =~ /^Auth/ ) ? 'Local-Number' : undef; if ($sort_field) { -- 2.39.5