Skip to content

Commit

Permalink
SWEEP: Use BOM-less UTF-8 encoding for writes, #1027
Browse files Browse the repository at this point in the history
  • Loading branch information
paulirwin committed Dec 24, 2024
1 parent 85c0141 commit 2b3bf9e
Show file tree
Hide file tree
Showing 21 changed files with 100 additions and 77 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
using Lucene.Net.Benchmarks.ByTask.Utils;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Support;
using System;
using System.IO;
using System.Text;
Expand All @@ -26,9 +27,9 @@ namespace Lucene.Net.Benchmarks.ByTask.Tasks
*/

/// <summary>
/// A <see cref="WriteLineDocTask"/> which for Wikipedia input, will write category pages
/// A <see cref="WriteLineDocTask"/> which for Wikipedia input, will write category pages
/// to another file, while remaining pages will be written to the original file.
/// The categories file is derived from the original file, by adding a prefix "categories-".
/// The categories file is derived from the original file, by adding a prefix "categories-".
/// </summary>
public class WriteEnwikiLineDocTask : WriteLineDocTask
{
Expand All @@ -38,7 +39,7 @@ public WriteEnwikiLineDocTask(PerfRunData runData)
: base(runData)
{
Stream @out = StreamUtils.GetOutputStream(CategoriesLineFile(new FileInfo(m_fname)));
categoryLineFileOut = new StreamWriter(@out, Encoding.UTF8);
categoryLineFileOut = new StreamWriter(@out, StandardCharsets.UTF_8);
WriteHeader(categoryLineFileOut);
}

Expand Down
9 changes: 5 additions & 4 deletions src/Lucene.Net.Benchmark/ByTask/Tasks/WriteLineDocTask.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
using Lucene.Net.Benchmarks.ByTask.Utils;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Support;
using Lucene.Net.Support.Threading;
using Lucene.Net.Util;
using System;
Expand Down Expand Up @@ -49,8 +50,8 @@ namespace Lucene.Net.Benchmarks.ByTask.Tasks
/// <item><term>line.file.out</term><description>the name of the file to write the output to. That parameter is mandatory. <b>NOTE:</b> the file is re-created.</description></item>
/// <item><term>line.fields</term><description>which fields should be written in each line. (optional, default: <see cref="DEFAULT_FIELDS"/>).</description></item>
/// <item><term>sufficient.fields</term><description>
/// list of field names, separated by comma, which,
/// if all of them are missing, the document will be skipped. For example, to require
/// list of field names, separated by comma, which,
/// if all of them are missing, the document will be skipped. For example, to require
/// that at least one of f1,f2 is not empty, specify: "f1,f2" in this field. To specify
/// that no field is required, i.e. that even empty docs should be emitted, specify <b>","</b>
/// (optional, default: <see cref="DEFAULT_SUFFICIENT_FIELDS"/>).
Expand Down Expand Up @@ -112,10 +113,10 @@ public WriteLineDocTask(PerfRunData runData, bool performWriteHeader)
throw new ArgumentException("line.file.out must be set");
}
Stream @out = StreamUtils.GetOutputStream(new FileInfo(m_fname));
m_lineFileOut = new StreamWriter(@out, Encoding.UTF8);
m_lineFileOut = new StreamWriter(@out, StandardCharsets.UTF_8);
docMaker = runData.DocMaker;

// init fields
// init fields
string f2r = config.Get("line.fields", null);
if (f2r is null)
{
Expand Down
2 changes: 1 addition & 1 deletion src/Lucene.Net.Benchmark/Quality/Trec/QueryDriver.cs
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ public static void Main(string[] args)

FileInfo topicsFile = new FileInfo(args[0]);
FileInfo qrelsFile = new FileInfo(args[1]);
SubmissionReport submitLog = new SubmissionReport(new StreamWriter(new FileStream(args[2], FileMode.Create, FileAccess.Write), Encoding.UTF8 /* huh, no nio.Charset ctor? */), "lucene");
SubmissionReport submitLog = new SubmissionReport(new StreamWriter(new FileStream(args[2], FileMode.Create, FileAccess.Write), IOUtils.CHARSET_UTF_8 /* huh, no nio.Charset ctor? */), "lucene");
using Store.FSDirectory dir = Store.FSDirectory.Open(new DirectoryInfo(args[3]));
using IndexReader reader = DirectoryReader.Open(dir);
string fieldSpec = args.Length == 5 ? args[4] : "T"; // default to Title-only if not specified.
Expand Down
5 changes: 3 additions & 2 deletions src/Lucene.Net.Benchmark/Utils/ExtractReuters.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using System;
using Lucene.Net.Support;
using System;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
Expand Down Expand Up @@ -118,7 +119,7 @@ protected virtual void ExtractFile(FileInfo sgmFile)
string outFile = System.IO.Path.Combine(outputDir.FullName, sgmFile.Name + "-"
+ (docNumber++) + ".txt");
// System.out.println("Writing " + outFile);
StreamWriter writer = new StreamWriter(new FileStream(outFile, FileMode.Create, FileAccess.Write), Encoding.UTF8);
StreamWriter writer = new StreamWriter(new FileStream(outFile, FileMode.Create, FileAccess.Write), StandardCharsets.UTF_8);
writer.Write(@out);
writer.Dispose();
outBuffer.Length = 0;
Expand Down
3 changes: 2 additions & 1 deletion src/Lucene.Net.Benchmark/Utils/ExtractWikipedia.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
using Lucene.Net.Benchmarks.ByTask.Feeds;
using Lucene.Net.Benchmarks.ByTask.Utils;
using Lucene.Net.Documents;
using Lucene.Net.Support;
using System;
using System.Collections.Generic;
using System.Globalization;
Expand Down Expand Up @@ -88,7 +89,7 @@ public virtual void Create(string id, string title, string time, string body)

try
{
using TextWriter writer = new StreamWriter(new FileStream(f.FullName, FileMode.Create, FileAccess.Write), Encoding.UTF8);
using TextWriter writer = new StreamWriter(new FileStream(f.FullName, FileMode.Create, FileAccess.Write), StandardCharsets.UTF_8);
writer.Write(contents.ToString());
}
catch (Exception ioe) when (ioe.IsIOException())
Expand Down
22 changes: 11 additions & 11 deletions src/Lucene.Net.TestFramework/Util/Fst/FSTTester.cs
Original file line number Diff line number Diff line change
Expand Up @@ -318,16 +318,16 @@ internal virtual FST<T> DoTest(int prune1, int prune2, bool allowRandomSuffixSha

bool willRewrite = random.NextBoolean();

Builder<T> builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4,
prune1, prune2,
prune1 == 0 && prune2 == 0,
allowRandomSuffixSharing ? random.NextBoolean() : true,
allowRandomSuffixSharing ? TestUtil.NextInt32(random, 1, 10) : int.MaxValue,
outputs,
null,
willRewrite,
PackedInt32s.DEFAULT,
true,
Builder<T> builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4,
prune1, prune2,
prune1 == 0 && prune2 == 0,
allowRandomSuffixSharing ? random.NextBoolean() : true,
allowRandomSuffixSharing ? TestUtil.NextInt32(random, 1, 10) : int.MaxValue,
outputs,
null,
willRewrite,
PackedInt32s.DEFAULT,
true,
15);
if (LuceneTestCase.Verbose)
{
Expand Down Expand Up @@ -386,7 +386,7 @@ internal virtual FST<T> DoTest(int prune1, int prune2, bool allowRandomSuffixSha

if (LuceneTestCase.Verbose && pairs.Count <= 20 && fst != null)
{
using (TextWriter w = new StreamWriter(new FileStream("out.dot", FileMode.OpenOrCreate), Encoding.UTF8))
using (TextWriter w = new StreamWriter(new FileStream("out.dot", FileMode.OpenOrCreate), StandardCharsets.UTF_8))
{
Util.ToDot(fst, w, false, false);
}
Expand Down
16 changes: 8 additions & 8 deletions src/Lucene.Net.TestFramework/Util/TestUtil.cs
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ private static ISet<FileSystemInfo> Rm(ISet<FileSystemInfo> unremoved, params Fi

/// <summary>
/// Convenience method unzipping <paramref name="zipFileStream"/> into <paramref name="destDir"/>, cleaning up
/// <paramref name="destDir"/> first.
/// <paramref name="destDir"/> first.
/// </summary>
public static void Unzip(Stream zipFileStream, DirectoryInfo destDir)
{
Expand Down Expand Up @@ -167,7 +167,7 @@ public static CheckIndex.Status CheckIndex(Directory dir, bool crossCheckTermVec
ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
CheckIndex checker = new CheckIndex(dir);
checker.CrossCheckTermVectors = crossCheckTermVectors;
checker.InfoStream = new StreamWriter(bos, Encoding.UTF8);
checker.InfoStream = new StreamWriter(bos, IOUtils.CHARSET_UTF_8);
CheckIndex.Status indexStatus = checker.DoCheckIndex(null);
if (indexStatus is null || indexStatus.Clean == false)
{
Expand All @@ -180,7 +180,7 @@ public static CheckIndex.Status CheckIndex(Directory dir, bool crossCheckTermVec
{
if (LuceneTestCase.UseInfoStream)
{
checker.FlushInfoStream();
checker.FlushInfoStream();
Console.WriteLine(bos.ToString());
}
return indexStatus;
Expand All @@ -202,7 +202,7 @@ public static void CheckReader(IndexReader reader)
public static void CheckReader(AtomicReader reader, bool crossCheckTermVectors)
{
ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
StreamWriter infoStream = new StreamWriter(bos, Encoding.UTF8);
StreamWriter infoStream = new StreamWriter(bos, IOUtils.CHARSET_UTF_8);

reader.CheckIntegrity();
CheckIndex.Status.FieldNormStatus fieldNormStatus = Index.CheckIndex.TestFieldNorms(reader, infoStream);
Expand Down Expand Up @@ -591,8 +591,8 @@ public static string GetDocValuesFormat(Codec codec, string field)
public static bool FieldSupportsHugeBinaryDocValues(string field)
{
string dvFormat = GetDocValuesFormat(field);
if (dvFormat.Equals("Lucene40", StringComparison.Ordinal)
|| dvFormat.Equals("Lucene42", StringComparison.Ordinal)
if (dvFormat.Equals("Lucene40", StringComparison.Ordinal)
|| dvFormat.Equals("Lucene42", StringComparison.Ordinal)
|| dvFormat.Equals("Memory", StringComparison.Ordinal))
{
return false;
Expand Down Expand Up @@ -868,7 +868,7 @@ public static ICharSequence BytesToCharSequence(BytesRef @ref, Random random)
/// Returns a valid (compiling) <see cref="Regex"/> instance with random stuff inside. Be careful
/// when applying random patterns to longer strings as certain types of patterns
/// may explode into exponential times in backtracking implementations (such as Java's).
/// </summary>
/// </summary>
public static Regex RandomRegex(Random random) // LUCENENET specific - renamed from RandomPattern()
{
return RandomizedTesting.Generators.RandomExtensions.NextRegex(random); // LUCENENET: Moved general random data generation to RandomizedTesting.Generators
Expand Down Expand Up @@ -1059,4 +1059,4 @@ public static string RandomSubString(Random random, int wordLength, bool simple)
'\u3000'
};
}
}
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
// Lucene version compatibility level 4.10.4
using J2N;
using Lucene.Net.Support;
using Lucene.Net.Util;
using NUnit.Framework;
using System.Collections.Generic;
Expand Down Expand Up @@ -35,25 +36,25 @@ public void Test()
FileInfo affix = new FileInfo(System.IO.Path.Combine(tempDir.FullName, "64kaffixes.aff"));
FileInfo dict = new FileInfo(System.IO.Path.Combine(tempDir.FullName, "64kaffixes.dic"));

using var affixWriter = new StreamWriter(
new FileStream(affix.FullName, FileMode.OpenOrCreate), Encoding.UTF8);

// 65k affixes with flag 1, then an affix with flag 2
affixWriter.Write("SET UTF-8\nFLAG num\nSFX 1 Y 65536\n");
for (int i = 0; i < 65536; i++)
using (var affixWriter = new StreamWriter(
new FileStream(affix.FullName, FileMode.OpenOrCreate), StandardCharsets.UTF_8))
{
affixWriter.Write("SFX 1 0 " + i.ToHexString() + " .\n");
}
affixWriter.Write("SFX 2 Y 1\nSFX 2 0 s\n");
affixWriter.Dispose();

using var dictWriter = new StreamWriter(
new FileStream(dict.FullName, FileMode.OpenOrCreate), Encoding.UTF8);
// 65k affixes with flag 1, then an affix with flag 2
affixWriter.Write("SET UTF-8\nFLAG num\nSFX 1 Y 65536\n");
for (int i = 0; i < 65536; i++)
{
affixWriter.Write("SFX 1 0 " + i.ToHexString() + " .\n");
}

affixWriter.Write("SFX 2 Y 1\nSFX 2 0 s\n");
} // affixWriter.Dispose();

// drink signed with affix 2 (takes -s)
dictWriter.Write("1\ndrink/2\n");
dictWriter.Dispose();
using (var dictWriter = new StreamWriter(
new FileStream(dict.FullName, FileMode.OpenOrCreate), StandardCharsets.UTF_8))
{
// drink signed with affix 2 (takes -s)
dictWriter.Write("1\ndrink/2\n");
} // dictWriter.Dispose();

using Stream affStream = new FileStream(affix.FullName, FileMode.OpenOrCreate, FileAccess.ReadWrite);
using Stream dictStream = new FileStream(dict.FullName, FileMode.OpenOrCreate, FileAccess.ReadWrite);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
// Lucene version compatibility level 4.8.1
using J2N;
using Lucene.Net.Support;
using Lucene.Net.Util;
using NUnit.Framework;
using System;
Expand Down Expand Up @@ -78,7 +79,7 @@ public virtual void TestBaseDir()
DirectoryInfo @base = CreateTempDir("fsResourceLoaderBase");
try
{
TextWriter os = new StreamWriter(new FileStream(System.IO.Path.Combine(@base.FullName, "template.txt"), FileMode.Create, FileAccess.Write), Encoding.UTF8);
TextWriter os = new StreamWriter(new FileStream(System.IO.Path.Combine(@base.FullName, "template.txt"), FileMode.Create, FileAccess.Write), StandardCharsets.UTF_8);
try
{
os.Write("foobar\n");
Expand Down Expand Up @@ -120,4 +121,4 @@ public virtual void TestDelegation()
assertEquals("foobar", WordlistLoader.GetLines(rl.OpenResource("template.txt"), Encoding.UTF8).First());
}
}
}
}
3 changes: 2 additions & 1 deletion src/Lucene.Net.Tests.Benchmark/ByTask/Feeds/DocMakerTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
using Lucene.Net.Index;
using Lucene.Net.Search;
using Lucene.Net.Support;
using Lucene.Net.Util;
using NUnit.Framework;
using System.Collections.Generic;
using System.IO;
Expand Down Expand Up @@ -170,7 +171,7 @@ public void TestDocMakerLeak()
// DocMaker did not close its ContentSource if resetInputs was called twice,
// leading to a file handle leak.
FileInfo f = new FileInfo(Path.Combine(getWorkDir().FullName, "docMakerLeak.txt"));
TextWriter ps = new StreamWriter(new FileStream(f.FullName, FileMode.Create, FileAccess.Write), Encoding.UTF8);
TextWriter ps = new StreamWriter(new FileStream(f.FullName, FileMode.Create, FileAccess.Write), IOUtils.CHARSET_UTF_8);
ps.WriteLine("one title\t" + (J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond) + "\tsome content"); // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results
ps.Dispose();

Expand Down
13 changes: 7 additions & 6 deletions src/Lucene.Net.Tests.Benchmark/ByTask/Feeds/LineDocSourceTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
using Lucene.Net.Benchmarks.ByTask.Utils;
using Lucene.Net.Index;
using Lucene.Net.Search;
using Lucene.Net.Support;
using Lucene.Net.Util;
using NUnit.Framework;
using System;
Expand Down Expand Up @@ -41,7 +42,7 @@ private void createBZ2LineFile(FileInfo file, bool addHeader)
{
Stream @out = new FileStream(file.FullName, FileMode.Create, FileAccess.Write);
@out = new BZip2OutputStream(@out); // csFactory.createCompressorOutputStream("bzip2", @out);
TextWriter writer = new StreamWriter(@out, Encoding.UTF8);
TextWriter writer = new StreamWriter(@out, StandardCharsets.UTF_8);
writeDocsToFile(writer, addHeader, null);
writer.Dispose();
}
Expand All @@ -59,7 +60,7 @@ private void writeDocsToFile(TextWriter writer, bool addHeader, IDictionary<stri
writer.Write(DocMaker.BODY_FIELD);
if (otherFields != null)
{
// additional field names in the header
// additional field names in the header
foreach (Object fn in otherFields.Keys)
{
writer.Write(WriteLineDocTask.SEP);
Expand All @@ -72,7 +73,7 @@ private void writeDocsToFile(TextWriter writer, bool addHeader, IDictionary<stri
doc.append("title").append(WriteLineDocTask.SEP).append("date").append(WriteLineDocTask.SEP).append(DocMaker.BODY_FIELD);
if (otherFields != null)
{
// additional field values in the doc line
// additional field values in the doc line
foreach (Object fv in otherFields.Values)
{
doc.append(WriteLineDocTask.SEP).append(fv.toString());
Expand All @@ -85,15 +86,15 @@ private void writeDocsToFile(TextWriter writer, bool addHeader, IDictionary<stri
private void createRegularLineFile(FileInfo file, bool addHeader)
{
Stream @out = new FileStream(file.FullName, FileMode.Create, FileAccess.Write);
TextWriter writer = new StreamWriter(@out, Encoding.UTF8);
TextWriter writer = new StreamWriter(@out, StandardCharsets.UTF_8);
writeDocsToFile(writer, addHeader, null);
writer.Dispose();
}

private void createRegularLineFileWithMoreFields(FileInfo file, params String[] extraFields)
{
Stream @out = new FileStream(file.FullName, FileMode.Create, FileAccess.Write);
TextWriter writer = new StreamWriter(@out, Encoding.UTF8);
TextWriter writer = new StreamWriter(@out, StandardCharsets.UTF_8);
Dictionary<string, string> p = new Dictionary<string, string>();
foreach (String f in extraFields)
{
Expand Down Expand Up @@ -231,7 +232,7 @@ public void TestInvalidFormat()
for (int i = 0; i < testCases.Length; i++)
{
FileInfo file = new FileInfo(Path.Combine(getWorkDir().FullName, "one-line"));
TextWriter writer = new StreamWriter(new FileStream(file.FullName, FileMode.Create, FileAccess.Write), Encoding.UTF8);
TextWriter writer = new StreamWriter(new FileStream(file.FullName, FileMode.Create, FileAccess.Write), StandardCharsets.UTF_8);
writer.Write(testCases[i]);
writer.WriteLine();
writer.Dispose();
Expand Down
Loading

0 comments on commit 2b3bf9e

Please sign in to comment.