From 345eaeb7256bd35ecb8d2ca0a7d9fc1105882209 Mon Sep 17 00:00:00 2001 From: Paul POULAIN Date: Fri, 12 Oct 2007 17:35:23 -0500 Subject: [PATCH] Bugfix for empty words MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The \b assertion considers only ASCII characters as letters, so diacritics are treated as non-word characters, and a word like leçon is split in 2: "le" is an empty word (stopword) and is removed, and the search is done on çon (which is not French [1], so has no result). [1] con (without the cedilla) is a French word, but I won't tell you what it means... anyway, there are probably no "con" in most catalogues ;-) Signed-off-by: Chris Cormack Signed-off-by: Joshua Ferraro --- C4/Search.pm | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/C4/Search.pm b/C4/Search.pm index 02a4438612..391f6f3c62 100644 --- a/C4/Search.pm +++ b/C4/Search.pm @@ -564,8 +564,13 @@ sub buildQuery { for ( my $i = 0 ; $i <= @operands ; $i++ ) { my $operand = $operands[$i]; # remove stopwords from operand : parse all stopwords & remove them (case insensitive) + # we use IsAlpha unicode definition, to deal correctly with diacritics. + # otherwise, a french word like "leçon" is splitted in "le" "çon", le is an empty word, we get "çon" + # and don't find anything... foreach (keys %{C4::Context->stopwords}) { - $operand=~ s/\b$_\b//i; + $operand=~ s/\P{IsAlpha}$_\P{IsAlpha}/ /i; + $operand=~ s/^$_\P{IsAlpha}/ /i; + $operand=~ s/\P{IsAlpha}$_$/ /i; } my $index = $indexes[$i]; my $stemmed_operand; -- 2.39.5