From f983c1e3f7404c60143d595ba5824f5324829723 Mon Sep 17 00:00:00 2001 From: Nick Clemens Date: Thu, 9 Nov 2017 16:23:57 +0000 Subject: [PATCH] Bug 19604: Elasticsearch Fixes for build_authorities_query for auth searching To test: 1 - Do some authority searches in Zebra 2 - Switch to ES and repeat, results will vary and some may fail 3 - Apply patch and dependencies 4 - Reindex ES 5 - Repeat searches, they should succeed and results should be similar to Zebra 6 - Slight differences are okay, but results should (mostly) meet expectations A few notes: We add a 'normalizer' to ensure we get a single token from the heading indexes, this makes 'starts with' work as expected We switch to 'AND' for fields searched from cataloging editor - this matches Zebra results We force the '__sort' fields for sorting - if sorting looks wrong try reducing the heading field to a single subfield - this will need to be addressed on a future bug (multiple subfields create an array, ES sorts those randomly) Signed-off-by: Nicolas Legrand Signed-off-by: Katrin Fischer Signed-off-by: Nick Clemens --- .../Elasticsearch/QueryBuilder.pm | 55 +++++++++---------- .../elasticsearch/field_config.yaml | 3 + .../elasticsearch/index_config.yaml | 3 + 3 files changed, 31 insertions(+), 30 deletions(-) diff --git a/Koha/SearchEngine/Elasticsearch/QueryBuilder.pm b/Koha/SearchEngine/Elasticsearch/QueryBuilder.pm index 8a0ead1deb..5d7136ca6d 100644 --- a/Koha/SearchEngine/Elasticsearch/QueryBuilder.pm +++ b/Koha/SearchEngine/Elasticsearch/QueryBuilder.pm @@ -290,40 +290,46 @@ sub build_authorities_query { # Start by making the query parts my @query_parts; - my @filter_parts; + foreach my $s ( @{ $search->{searches} } ) { my ( $wh, $op, $val ) = @{$s}{qw(where operator value)}; $wh = '_all' if $wh eq ''; if ( $op eq 'is' || $op eq '=' ) { - # look for something that matches completely + # look for something that matches a term completely # note, '=' is about numerical vals. May need special handling. 
- # _allphrase is a special field that only groups the exact - # matches. Also, we lowercase our search because the ES + # Also, we lowercase our search because the ES # index lowercases its values, and term searches don't get the # search analyzer applied to them. - push @filter_parts, { term => { "$wh.phrase" => lc $val } }; + push @query_parts, { term => {"$wh.phrase" => lc $val} }; } elsif ( $op eq 'exact' ) { - # left and right truncation, otherwise an exact phrase - push @query_parts, { match_phrase => { $wh => $val } }; + push @query_parts, { match_phrase => {"$wh.phrase" => lc $val} }; } elsif ( $op eq 'start' ) { - - # startswith search - push @query_parts, { wildcard => { "$wh.phrase" => lc "$val*" } }; + # startswith search, uses lowercase untokenized version of heading + push @query_parts, { prefix => {"$wh.lc_raw" => lc $val} }; } else { # regular wordlist stuff - push @query_parts, { match => { $wh => $val } }; +# push @query_parts, { match => {$wh => { query => $val, operator => 'and' }} }; + my @values = split(' ',$val); + foreach $val (@values) { + push @query_parts, { wildcard => { "$wh.phrase" => "*" . lc $val . "*" } }; + } } } - # Merge the query and filter parts appropriately - # 'should' behaves like 'or', if we want 'and', use 'must' - my $query_part = { bool => { should => \@query_parts } }; - my $filter_part = { bool => { should => \@filter_parts } }; + # Merge the query parts appropriately + # 'should' behaves like 'or' + # 'must' behaves like 'and' + # Zebra results seem to match must so using that here + my $query = { query=> + { bool => + { must => \@query_parts } + } + }; # We need to add '.phrase' to all the sort headings otherwise it'll sort # based on the tokenised form. 
@@ -336,20 +342,9 @@ sub build_authorities_query { $search->{sort} = \%s; } - # extract the sort stuff - my %sort; - %sort = ( sort => [ $search->{sort} ] ) if exists $search->{sort}; - my $query; - if (@filter_parts) { - $query = - { query => - { filtered => { filter => $filter_part, query => $query_part } } - }; - } - else { - $query = { query => $query_part }; - } - $query = { %$query, %sort }; + # add the sort stuff + $query->{sort} = [ $search->{sort} ] if exists $search->{sort}; + return $query; } @@ -446,7 +441,7 @@ sub build_authorities_query_compat { my %sort; my $sort_field = - ( $orderby =~ /^Heading/ ) ? 'Heading' + ( $orderby =~ /^Heading/ ) ? 'Heading__sort' : ( $orderby =~ /^Auth/ ) ? 'Local-Number' : undef; if ($sort_field) { diff --git a/admin/searchengine/elasticsearch/field_config.yaml b/admin/searchengine/elasticsearch/field_config.yaml index 535a43c00b..dba47f5451 100644 --- a/admin/searchengine/elasticsearch/field_config.yaml +++ b/admin/searchengine/elasticsearch/field_config.yaml @@ -39,6 +39,9 @@ search: search_analyzer: analyser_phrase raw: type: keyword + lc_raw: + type: keyword + normalizer: my_normalizer copy_to: _all # Facets facet: diff --git a/admin/searchengine/elasticsearch/index_config.yaml b/admin/searchengine/elasticsearch/index_config.yaml index bdfcdf5852..dce27bc4fa 100644 --- a/admin/searchengine/elasticsearch/index_config.yaml +++ b/admin/searchengine/elasticsearch/index_config.yaml @@ -25,6 +25,9 @@ index: type: custom filter: - icu_folding + my_normalizer: + type: custom + char_filter: icu_normalizer char_filter: # The punctuation filter is used to remove any punctuation chars in fields that don't use icu_tokenizer. punctuation: -- 2.39.5