From 133df5e912259cdcd938366838dffdb161b4c280 Mon Sep 17 00:00:00 2001 From: Ere Maijala Date: Tue, 23 Jan 2018 15:21:31 +0200 Subject: [PATCH] Bug 20073: Move Elasticsearch configs to yaml files and improve the default settings. Improvements: 1) Index settings moved from code to etc/searchengine/elasticsearch/index_config.yaml. An alternative can be specified in koha-conf.xml. 2) Field settings moved from code to etc/searchengine/elasticsearch/field_config.yaml. An alternative can be specified in koha-conf.xml. 3) mappings.yaml has been moved from admin/searchengine/elasticsearch to etc/searchengine/elasticsearch. An alternative can be specified in koha-conf.xml. 4) Default settings have been improved to remove punctuation from phrases used for sorting etc. 5) State variables are used for storing configuration to avoid parsing it multiple times. 6) A possibility to reset the fields too has been added to the reset operation of mappings administration. 7) mappings.yaml has been moved from admin/searchengine/elasticsearch to etc/searchengine/elasticsearch. 8) An stdno field type has been added for standard identifiers. To test: 1) Run tests in t/Koha/SearchEngine/Elasticsearch.t 2) Clear tables search_fields and search_marc_map 3) Go to admin/searchengine/elasticsearch/mappings.pl?op=reset&i_know_what_i_am_doing=1 4) Verify that admin/searchengine/elasticsearch/mappings.pl displays the mappings properly, including ISBN and other standard number fields. 5) Index some records using the -d parameter with misc/search_tools/rebuild_elastic_search.pl to recreate the index 6) Verify that you can find the records 7) Put non_existent to koha-conf.xml 8) Verify that admin/searchengine/elasticsearch/mappings.pl?op=reset&i_know_what_i_am_doing=1 fails because it can't find non_existent. 9) Copy etc/searchengine/elasticsearch/mappings.yaml to a new location and make elasticsearch_index_mappings setting in koha-conf.xml point to it. 10) Make a change in the new mappings.yaml. 11) Clear table search_fields (mappings reset doesn't do it yet, see bug 20248) 12) Go to admin/searchengine/elasticsearch/mappings.pl?op=reset&i_know_what_i_am_doing=1 13) Verify that the changes you made are now visible in the mappings UI Signed-off-by: Kyle M Hall Bug 20073: Move Elasticsearch yaml files back to admin directory Signed-off-by: Kyle M Hall Signed-off-by: Nick Clemens --- Koha/SearchEngine/Elasticsearch.pm | 201 ++++++++---------- .../elasticsearch/field_config.yaml | 61 ++++++ .../elasticsearch/index_config.yaml | 34 +++ .../searchengine/elasticsearch/mappings.yaml | 7 +- debian/templates/koha-conf-site.xml.in | 11 + etc/koha-conf.xml | 11 + .../data/mysql/atomicupdate/bug_20073.perl | 8 + installer/data/mysql/kohastructure.sql | 2 +- .../searchengine/elasticsearch/mappings.tt | 10 + t/Koha/SearchEngine/Elasticsearch.t | 26 ++- 10 files changed, 250 insertions(+), 121 deletions(-) create mode 100644 admin/searchengine/elasticsearch/field_config.yaml create mode 100644 admin/searchengine/elasticsearch/index_config.yaml create mode 100644 installer/data/mysql/atomicupdate/bug_20073.perl diff --git a/Koha/SearchEngine/Elasticsearch.pm b/Koha/SearchEngine/Elasticsearch.pm index fe2e0ea17c..04ec0d948e 100644 --- a/Koha/SearchEngine/Elasticsearch.pm +++ b/Koha/SearchEngine/Elasticsearch.pm @@ -34,8 +34,6 @@ use Search::Elasticsearch; use Try::Tiny; use YAML::Syck; -use Data::Dumper; # TODO remove - __PACKAGE__->mk_ro_accessors(qw( index )); __PACKAGE__->mk_accessors(qw( sort_fields )); @@ -137,24 +135,14 @@ A hashref containing the settings is returned. sub get_elasticsearch_settings { my ($self) = @_; - # Ultimately this should come from a file or something, and not be - # hardcoded. - my $settings = { - index => { - analysis => { - analyzer => { - analyser_phrase => { - tokenizer => 'icu_tokenizer', - filter => ['icu_folding'], - }, - analyser_standard => { - tokenizer => 'icu_tokenizer', - filter => ['icu_folding'], - }, - }, - } - } - }; + # Use state to speed up repeated calls + state $settings = undef; + if (!defined $settings) { + my $config_file = C4::Context->config('elasticsearch_index_config'); + $config_file ||= C4::Context->config('intranetdir') . '/admin/searchengine/elasticsearch/index_config.yaml'; + $settings = LoadFile( $config_file ); + } + return $settings; } @@ -170,116 +158,97 @@ created. sub get_elasticsearch_mappings { my ($self) = @_; - # TODO cache in the object? - my $mappings = { - data => { - _all => {type => "string", analyzer => "analyser_standard"}, - properties => { - record => { - store => "true", - include_in_all => JSON::false, - type => "text", - }, - } - } - }; - my %sort_fields; - my $marcflavour = lc C4::Context->preference('marcflavour'); - $self->_foreach_mapping( - sub { - my ( $name, $type, $facet, $suggestible, $sort, $marc_type ) = @_; - return if $marc_type ne $marcflavour; - # TODO if this gets any sort of complexity to it, it should - # be broken out into its own function. - - # TODO be aware of date formats, but this requires pre-parsing - # as ES will simply reject anything with an invalid date. - my $es_type = - $type eq 'boolean' - ? 'boolean' - : 'text'; - - if ($es_type eq 'boolean') { - $mappings->{data}{properties}{$name} = _elasticsearch_mapping_for_boolean( $name, $es_type, $facet, $suggestible, $sort, $marc_type ); - return; #Boolean cannot have facets nor sorting nor suggestions - } else { - $mappings->{data}{properties}{$name} = _elasticsearch_mapping_for_default( $name, $es_type, $facet, $suggestible, $sort, $marc_type ); - } + # Use state to speed up repeated calls + state %all_mappings; + state %sort_fields; + + if (!defined $all_mappings{$self->index}) { + $sort_fields{$self->index} = {}; + my $mappings = { + data => _get_elasticsearch_mapping('general', '') + }; + my $marcflavour = lc C4::Context->preference('marcflavour'); + $self->_foreach_mapping( + sub { + my ( $name, $type, $facet, $suggestible, $sort, $marc_type ) = @_; + return if $marc_type ne $marcflavour; + # TODO if this gets any sort of complexity to it, it should + # be broken out into its own function. + + # TODO be aware of date formats, but this requires pre-parsing + # as ES will simply reject anything with an invalid date. + my $es_type = 'text'; + if ($type eq 'boolean') { + $es_type = 'boolean'; + } elsif ($type eq 'number' || $type eq 'sum') { + $es_type = 'integer'; + } elsif ($type eq 'isbn' || $type eq 'stdno') { + $es_type = 'stdno'; + } - if ($facet) { - $mappings->{data}{properties}{ $name . '__facet' } = { - type => "keyword", - }; - } - if ($suggestible) { - $mappings->{data}{properties}{ $name . '__suggestion' } = { - type => 'completion', - analyzer => 'simple', - search_analyzer => 'simple', - }; - } - # Sort is a bit special as it can be true, false, undef. - # We care about "true" or "undef", - # "undef" means to do the default thing, which is make it sortable. - if ($sort || !defined $sort) { - $mappings->{data}{properties}{ $name . '__sort' } = { - search_analyzer => "analyser_phrase", - analyzer => "analyser_phrase", - type => "text", - include_in_all => JSON::false, - fields => { - phrase => { - type => "keyword", - }, - }, - }; - $sort_fields{$name} = 1; + $mappings->{data}{properties}{$name} = _get_elasticsearch_mapping('search', $es_type); + + if ($facet) { + $mappings->{data}{properties}{ $name . '__facet' } = _get_elasticsearch_mapping('facet', $es_type); + } + if ($suggestible) { + $mappings->{data}{properties}{ $name . '__suggestion' } = _get_elasticsearch_mapping('suggestible', $es_type); + } + # Sort is a bit special as it can be true, false, undef. + # We care about "true" or "undef", + # "undef" means to do the default thing, which is make it sortable. + if (!defined $sort || $sort) { + $mappings->{data}{properties}{ $name . '__sort' } = _get_elasticsearch_mapping('sort', $es_type); + $sort_fields{$self->index}{$name} = 1; + } } - } - ); - $self->sort_fields(\%sort_fields); - return $mappings; + ); + $all_mappings{$self->index} = $mappings; + } + $self->sort_fields(\%{$sort_fields{$self->index}}); + + return $all_mappings{$self->index}; } -=head2 _elasticsearch_mapping_for_* +=head2 _get_elasticsearch_mapping -Get the ES mappings for the given data type or a special mapping case +Get the ES mappings for the given purpose and data type -Receives the same parameters from the $self->_foreach_mapping() dispatcher +$mapping = _get_elasticsearch_mapping('search', 'text'); =cut -sub _elasticsearch_mapping_for_boolean { - my ( $name, $type, $facet, $suggestible, $sort, $marc_type ) = @_; +sub _get_elasticsearch_mapping { - return { - type => $type, - null_value => 0, - }; -} + my ( $purpose, $type ) = @_; -sub _elasticsearch_mapping_for_default { - my ( $name, $type, $facet, $suggestible, $sort, $marc_type ) = @_; - - return { - search_analyzer => "analyser_standard", - analyzer => "analyser_standard", - type => $type, - fields => { - phrase => { - search_analyzer => "analyser_phrase", - analyzer => "analyser_phrase", - type => "text", - }, - raw => { - type => "keyword", - } - }, - }; + # Use state to speed up repeated calls + state $settings = undef; + if (!defined $settings) { + my $config_file = C4::Context->config('elasticsearch_field_config'); + $config_file ||= C4::Context->config('intranetdir') . '/admin/searchengine/elasticsearch/field_config.yaml'; + $settings = LoadFile( $config_file ); + } + + if (!defined $settings->{$purpose}) { + die "Field purpose $purpose not defined in field config"; + } + if ($type eq '') { + return $settings->{$purpose}; + } + if (defined $settings->{$purpose}{$type}) { + return $settings->{$purpose}{$type}; + } + if (defined $settings->{$purpose}{'default'}) { + return $settings->{$purpose}{'default'}; + } + return undef; } sub reset_elasticsearch_mappings { - my $mappings_yaml = C4::Context->config('intranetdir') . '/admin/searchengine/elasticsearch/mappings.yaml'; + my ( $reset_fields ) = @_; + my $mappings_yaml = C4::Context->config('elasticsearch_index_mappings'); + $mappings_yaml ||= C4::Context->config('intranetdir') . '/admin/searchengine/elasticsearch/mappings.yaml'; my $indexes = LoadFile( $mappings_yaml ); while ( my ( $index_name, $fields ) = each %$indexes ) { diff --git a/admin/searchengine/elasticsearch/field_config.yaml b/admin/searchengine/elasticsearch/field_config.yaml new file mode 100644 index 0000000000..535a43c00b --- /dev/null +++ b/admin/searchengine/elasticsearch/field_config.yaml @@ -0,0 +1,61 @@ +--- +# General field configuration +general: + _all: + type: string + analyzer: analyser_standard + properties: + record: + store: true + type: text +# Search fields +search: + boolean: + type: boolean + null_value: false + integer: + type: integer + null_value: 0 + stdno: + type: text + analyzer: analyser_stdno + search_analyzer: analyser_stdno + fields: + phrase: + type: text + analyzer: analyser_stdno + search_analyzer: analyser_stdno + raw: + type: keyword + copy_to: _all + default: + type: text + analyzer: analyser_standard + search_analyzer: analyser_standard + fields: + phrase: + type: text + analyzer: analyser_phrase + search_analyzer: analyser_phrase + raw: + type: keyword + copy_to: _all +# Facets +facet: + default: + type: keyword +# Suggestible +suggestible: + default: + type: completion + analyzer: simple + search_analyzer: simple +# Sort +sort: + default: + type: text + analyzer: analyser_phrase + search_analyzer: analyser_phrase + fields: + phrase: + type: keyword diff --git a/admin/searchengine/elasticsearch/index_config.yaml b/admin/searchengine/elasticsearch/index_config.yaml new file mode 100644 index 0000000000..bdfcdf5852 --- /dev/null +++ b/admin/searchengine/elasticsearch/index_config.yaml @@ -0,0 +1,34 @@ +--- +# Index configuration that defines how different analyzers work. +index: + analysis: + analyzer: + # Phrase analyzer is used for phrases (phrase match, sorting) + analyser_phrase: + tokenizer: keyword + filter: + - icu_folding + char_filter: + - punctuation + analyser_standard: + tokenizer: icu_tokenizer + filter: + - icu_folding + analyser_stdno: + tokenizer: whitespace + filter: + - icu_folding + char_filter: + - punctuation + normalizer: + normalizer_keyword: + type: custom + filter: + - icu_folding + char_filter: + # The punctuation filter is used to remove any punctuation chars in fields that don't use icu_tokenizer. + punctuation: + type: pattern_replace + # The pattern contains all ASCII punctuation characters. + pattern: '([\x00-\x1F,\x21-\x2F,\x3A-\x40,\x5B-\x60,\x7B-\x89,\x8B,\x8D,\x8F,\x90-\x99,\x9B,\x9D,\xA0-\xBF,\xD7,\xF7])' + replacement: '' diff --git a/admin/searchengine/elasticsearch/mappings.yaml b/admin/searchengine/elasticsearch/mappings.yaml index acfc24ce41..18e63fa5e8 100644 --- a/admin/searchengine/elasticsearch/mappings.yaml +++ b/admin/searchengine/elasticsearch/mappings.yaml @@ -1,4 +1,5 @@ --- +# Basic mappings from MARC fields to Elasticsearch fields. authorities: Corporate-name-see-also-from: label: Corporate-name-see-also-from @@ -2162,7 +2163,7 @@ biblios: marc_type: unimarc sort: ~ suggestible: '' - type: '' + type: 'stdno' isbn: label: isbn mappings: @@ -2181,7 +2182,7 @@ biblios: marc_type: unimarc sort: ~ suggestible: '' - type: '' + type: 'isbn' issn: label: issn mappings: @@ -2200,7 +2201,7 @@ biblios: marc_type: unimarc sort: ~ suggestible: '' - type: '' + type: 'stdno' issues: label: issues mappings: diff --git a/debian/templates/koha-conf-site.xml.in b/debian/templates/koha-conf-site.xml.in index 9b11c714b4..d2b47fcb30 100644 --- a/debian/templates/koha-conf-site.xml.in +++ b/debian/templates/koha-conf-site.xml.in @@ -328,10 +328,21 @@ __END_SRU_PUBLICSERVER__ 50 2 + localhost:9200 koha___KOHASITE__ + + + + + + localhost:9200 koha___DB_NAME__ + + + + + +