From de280b0f692ad9f2ab24d9a1f7e280a59b32263f Mon Sep 17 00:00:00 2001 From: "H.H.Chen" Date: Tue, 12 Nov 2024 01:53:31 +0800 Subject: [PATCH] Respect ignoreCase flag in CommonGramsFilterFactory (#781) * Respect ignoreCase flag in CommonGramsFilterFactory * Add LUCENENET-specific backport comment * Use GetType instead of typeof for resource loader --------- Co-authored-by: Paul Irwin --- .../CommonGrams/CommonGramsFilterFactory.cs | 7 +++--- .../TestCommonGramsFilterFactory.cs | 22 +++++++++++++++++-- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/src/Lucene.Net.Analysis.Common/Analysis/CommonGrams/CommonGramsFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/CommonGrams/CommonGramsFilterFactory.cs index 560b3a8c20..670151703d 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/CommonGrams/CommonGramsFilterFactory.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/CommonGrams/CommonGramsFilterFactory.cs @@ -36,7 +36,7 @@ namespace Lucene.Net.Analysis.CommonGrams /// public class CommonGramsFilterFactory : TokenFilterFactory, IResourceLoaderAware { - // TODO: shared base class for Stop/Keep/CommonGrams? + // TODO: shared base class for Stop/Keep/CommonGrams? private CharArraySet commonWords; private readonly string commonWordFiles; private readonly string format; @@ -71,7 +71,8 @@ public virtual void Inform(IResourceLoader loader) } else { - commonWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET; + // LUCENENET-specific: backported ignoreCase fix from Lucene 8.10.0 (lucene#188, LUCENE-10008) + commonWords = new CharArraySet(m_luceneMatchVersion, StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase); } } @@ -85,4 +86,4 @@ public override TokenStream Create(TokenStream input) return commonGrams; } } -} \ No newline at end of file +} diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Commongrams/TestCommonGramsFilterFactory.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Commongrams/TestCommonGramsFilterFactory.cs index 9655b6bd0a..063fc1b22b 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Commongrams/TestCommonGramsFilterFactory.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Commongrams/TestCommonGramsFilterFactory.cs @@ -28,7 +28,7 @@ namespace Lucene.Net.Analysis.CommonGrams /// Tests pretty much copied from StopFilterFactoryTest We use the test files /// used by the StopFilterFactoryTest TODO: consider creating separate test files /// so this won't break if stop filter test files change - /// + /// /// public class TestCommonGramsFilterFactory : BaseTokenStreamFactoryTestCase { @@ -79,6 +79,24 @@ public virtual void TestDefaults() AssertTokenStreamContents(stream, new string[] { "testing", "testing_the", "the", "the_factory", "factory" }); } + // LUCENENET-specific: backported ignoreCase fix from Lucene 8.10.0 (lucene#188, LUCENE-10008) + [Test] + public void TestIgnoreCase() + { + IResourceLoader loader = new ClasspathResourceLoader(GetType()); + CommonGramsFilterFactory factory = + (CommonGramsFilterFactory) + TokenFilterFactory("CommonGrams", TEST_VERSION_CURRENT, loader, "ignoreCase", "true"); + CharArraySet words = factory.CommonWords; + assertTrue("words is null and it shouldn't be", words != null); + assertTrue(words.contains("the")); + assertTrue(words.contains("The")); + Tokenizer tokenizer = new MockTokenizer(new StringReader("testing The factory"),MockTokenizer.WHITESPACE, false); + TokenStream stream = factory.Create(tokenizer); + AssertTokenStreamContents( + stream, new string[] {"testing", "testing_The", "The", "The_factory", "factory"}); + } + /// /// Test that bogus arguments result in exception [Test] @@ -95,4 +113,4 @@ public virtual void TestBogusArguments() } } } -} \ No newline at end of file +}