From e34f95a1f5eb0fce238ab442553bec5233263a9c Mon Sep 17 00:00:00 2001 From: Nick Clemens Date: Thu, 2 Nov 2017 16:52:53 +0000 Subject: [PATCH] Bug 17661: Ending punctuation causes duplicate facets The current code for facets doesn't strip ending punctuation from facets This causes duplicate facets for terms that should be combined Sometimes series can have different punctuation depending on the field they are in Author initials punctuation should be preserved To test: 1 - Do search and pull up some records 2 - Edit some of the records to have authors like: Date, C.J. Date, C.j. Date, C.J . 3 - Edit the records to have some series statements like: 830 $aDate, C.J. ;$v5 830 $aDate, C.J. ; $v5 830 $aDate, C.J.; $v5 4 - Add some 490s to the record with first indicator 1 and series like: You wouldn't want to-- You wouldn't want to You wouldn't want to.. 5 - Search again and note you have 3 facets each for author and series 6 - Apply patch 7 - Repeat 8 - Now you get 2 facets for author: the period is not removed when it immediately follows an upper-case letter, but is removed otherwise 9 - Now you should have a single series facet 10 - Switch search engine to ES (index before applying patch) 11 - Note facets are separate again 12 - Reset mappings and reindex perl misc/search_tools/rebuild_elasticsearch -v -r 13 - Repeat search, facets combined as above Signed-off-by: Sarah Cornell Signed-off-by: Katrin Fischer Signed-off-by: Jonathan Druart --- C4/Search.pm | 4 +++- admin/searchengine/elasticsearch/field_config.yaml | 1 + admin/searchengine/elasticsearch/index_config.yaml | 6 ++++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/C4/Search.pm b/C4/Search.pm index 56b7991a04..a9a5c28725 100644 --- a/C4/Search.pm +++ b/C4/Search.pm @@ -680,6 +680,7 @@ sub _get_facets_data_from_record { next if $field->indicator(1) eq 'z'; my $data = $field->as_string( $subfield_letters, $facet->{ sep } ); + $data =~ s/\s*(?textContent; + $facet_value =~ s/\s*(?{ $facet_value } = $term->getAttribute( 
'occur' ); + $facets->{ $facet_value } = ( defined $facets->{$facet_value} ) ? $facets->{ $facet_value } + $term->getAttribute( 'occur' ) : $term->getAttribute( 'occur' ); } return $facets; diff --git a/admin/searchengine/elasticsearch/field_config.yaml b/admin/searchengine/elasticsearch/field_config.yaml index 82793ff421..c98b26dcfe 100644 --- a/admin/searchengine/elasticsearch/field_config.yaml +++ b/admin/searchengine/elasticsearch/field_config.yaml @@ -55,6 +55,7 @@ search: facet: default: type: keyword + normalizer: facet_normalizer # Suggestible suggestible: default: diff --git a/admin/searchengine/elasticsearch/index_config.yaml b/admin/searchengine/elasticsearch/index_config.yaml index 7a8d9052b4..dfae04dd97 100644 --- a/admin/searchengine/elasticsearch/index_config.yaml +++ b/admin/searchengine/elasticsearch/index_config.yaml @@ -28,6 +28,8 @@ index: nfkc_cf_normalizer: type: custom char_filter: icu_normalizer + facet_normalizer: + char_filter: facet char_filter: # The punctuation filter is used to remove any punctuation chars in fields that don't use icu_tokenizer. punctuation: @@ -35,4 +37,8 @@ index: # The pattern contains all ASCII punctuation characters. pattern: '([\x00-\x1F,\x21-\x2F,\x3A-\x40,\x5B-\x60,\x7B-\x89,\x8B,\x8D,\x8F,\x90-\x99,\x9B,\x9D,\xA0-\xBF,\xD7,\xF7])' replacement: '' + facet: + type: pattern_replace + pattern: '\s*(?