Nick Clemens
224ac84aec
Rather than limiting initials to [A-Z], we should test for a broad range of uppercase letters. The ES/Zebra changes are slightly different because of Perl vs. Java regex conventions. Perl may support either, but I found 'Uppercase' to be a bit more explicit. More info here: https://perldoc.perl.org/perlunicode.html To test: Same plan as before, but use Ж. as the ending initial. Confirm the period is preserved and other punctuation is removed. Signed-off-by: Katrin Fischer <katrin.fischer.83@web.de> Signed-off-by: Jonathan Druart <jonathan.druart@bugs.koha-community.org>
44 lines
1.3 KiB
YAML
44 lines
1.3 KiB
YAML
---
# Index configuration that defines how different analyzers work.
#
# NOTE(review): the nesting below was reconstructed from a flattened copy of
# this file. It follows the layout the keys imply (analyzer, normalizer and
# char_filter as siblings under index.analysis; facet as a char_filter because
# facet_normalizer references it) -- confirm against the upstream file.
index:
  analysis:
    analyzer:
      # Phrase analyzer is used for phrases (exact phrase match)
      analyzer_phrase:
        tokenizer: keyword
        filter:
          - icu_folding
        char_filter:
          - punctuation
      analyzer_standard:
        tokenizer: icu_tokenizer
        filter:
          - icu_folding
      analyzer_stdno:
        tokenizer: whitespace
        filter:
          - icu_folding
        char_filter:
          - punctuation
    normalizer:
      icu_folding_normalizer:
        type: custom
        filter:
          - icu_folding
      nfkc_cf_normalizer:
        type: custom
        char_filter: icu_normalizer
      facet_normalizer:
        char_filter: facet
    char_filter:
      # The punctuation filter is used to remove any punctuation chars in
      # fields that don't use icu_tokenizer.
      punctuation:
        type: pattern_replace
        # The pattern contains all ASCII punctuation characters
        # (\x21-\x2F, \x3A-\x40, \x5B-\x60, \x7B onwards). It also matches
        # control characters (\x00-\x1F), further code points up to \xBF,
        # and \xD7 / \xF7. The commas inside the character class are literal
        # commas, not separators.
        pattern: '([\x00-\x1F,\x21-\x2F,\x3A-\x40,\x5B-\x60,\x7B-\x89,\x8B,\x8D,\x8F,\x90-\x99,\x9B,\x9D,\xA0-\xBF,\xD7,\xF7])'
        replacement: ''
      # The facet filter strips trailing whitespace and trailing '.', '-', ','
      # and ';' characters. The negative lookbehind on \p{Lu} keeps a
      # punctuation character that directly follows an uppercase letter, so a
      # final initial such as "Ж." keeps its period.
      facet:
        type: pattern_replace
        pattern: '\s*(?<!\p{Lu})[.\-,;]*\s*$'
        replacement: ''
index.mapping.total_fields.limit: 10000