Skip to content

Commit

Permalink
BREAKING: Use BOM-less UTF-8 encoding for writes, #1027 (#1075)
Browse files (browse the repository at this point in the history)
* SWEEP: Use BOM-less UTF-8 encoding for writes, #1027

* Remove OfflineSorter.DEFAULT_ENCODING field and replace with IOUtils.CHARSET_UTF_8

* Rename IOUtils.CHARSET_UTF_8 to ENCODING_UTF_8_NO_BOM
  • Loading branch information
paulirwin authored Jan 6, 2025
1 parent 8f5f421 commit 12b8941
Show file tree
Hide file tree
Showing 28 changed files with 126 additions and 109 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
using Lucene.Net.Benchmarks.ByTask.Utils;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Support;
using System;
using System.IO;
using System.Text;
Expand All @@ -26,9 +27,9 @@ namespace Lucene.Net.Benchmarks.ByTask.Tasks
*/

/// <summary>
/// A <see cref="WriteLineDocTask"/> which for Wikipedia input, will write category pages
/// A <see cref="WriteLineDocTask"/> which for Wikipedia input, will write category pages
/// to another file, while remaining pages will be written to the original file.
/// The categories file is derived from the original file, by adding a prefix "categories-".
/// The categories file is derived from the original file, by adding a prefix "categories-".
/// </summary>
public class WriteEnwikiLineDocTask : WriteLineDocTask
{
Expand All @@ -38,7 +39,7 @@ public WriteEnwikiLineDocTask(PerfRunData runData)
: base(runData)
{
Stream @out = StreamUtils.GetOutputStream(CategoriesLineFile(new FileInfo(m_fname)));
categoryLineFileOut = new StreamWriter(@out, Encoding.UTF8);
categoryLineFileOut = new StreamWriter(@out, StandardCharsets.UTF_8);
WriteHeader(categoryLineFileOut);
}

Expand Down
9 changes: 5 additions & 4 deletions src/Lucene.Net.Benchmark/ByTask/Tasks/WriteLineDocTask.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
using Lucene.Net.Benchmarks.ByTask.Utils;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Support;
using Lucene.Net.Support.Threading;
using Lucene.Net.Util;
using System;
Expand Down Expand Up @@ -49,8 +50,8 @@ namespace Lucene.Net.Benchmarks.ByTask.Tasks
/// <item><term>line.file.out</term><description>the name of the file to write the output to. That parameter is mandatory. <b>NOTE:</b> the file is re-created.</description></item>
/// <item><term>line.fields</term><description>which fields should be written in each line. (optional, default: <see cref="DEFAULT_FIELDS"/>).</description></item>
/// <item><term>sufficient.fields</term><description>
/// list of field names, separated by comma, which,
/// if all of them are missing, the document will be skipped. For example, to require
/// list of field names, separated by comma, which,
/// if all of them are missing, the document will be skipped. For example, to require
/// that at least one of f1,f2 is not empty, specify: "f1,f2" in this field. To specify
/// that no field is required, i.e. that even empty docs should be emitted, specify <b>","</b>
/// (optional, default: <see cref="DEFAULT_SUFFICIENT_FIELDS"/>).
Expand Down Expand Up @@ -112,10 +113,10 @@ public WriteLineDocTask(PerfRunData runData, bool performWriteHeader)
throw new ArgumentException("line.file.out must be set");
}
Stream @out = StreamUtils.GetOutputStream(new FileInfo(m_fname));
m_lineFileOut = new StreamWriter(@out, Encoding.UTF8);
m_lineFileOut = new StreamWriter(@out, StandardCharsets.UTF_8);
docMaker = runData.DocMaker;

// init fields
// init fields
string f2r = config.Get("line.fields", null);
if (f2r is null)
{
Expand Down
2 changes: 1 addition & 1 deletion src/Lucene.Net.Benchmark/Quality/Trec/QueryDriver.cs
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ public static void Main(string[] args)

FileInfo topicsFile = new FileInfo(args[0]);
FileInfo qrelsFile = new FileInfo(args[1]);
SubmissionReport submitLog = new SubmissionReport(new StreamWriter(new FileStream(args[2], FileMode.Create, FileAccess.Write), Encoding.UTF8 /* huh, no nio.Charset ctor? */), "lucene");
SubmissionReport submitLog = new SubmissionReport(new StreamWriter(new FileStream(args[2], FileMode.Create, FileAccess.Write), IOUtils.ENCODING_UTF_8_NO_BOM /* huh, no nio.Charset ctor? */), "lucene");
using Store.FSDirectory dir = Store.FSDirectory.Open(new DirectoryInfo(args[3]));
using IndexReader reader = DirectoryReader.Open(dir);
string fieldSpec = args.Length == 5 ? args[4] : "T"; // default to Title-only if not specified.
Expand Down
5 changes: 3 additions & 2 deletions src/Lucene.Net.Benchmark/Utils/ExtractReuters.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using System;
using Lucene.Net.Support;
using System;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
Expand Down Expand Up @@ -118,7 +119,7 @@ protected virtual void ExtractFile(FileInfo sgmFile)
string outFile = System.IO.Path.Combine(outputDir.FullName, sgmFile.Name + "-"
+ (docNumber++) + ".txt");
// System.out.println("Writing " + outFile);
StreamWriter writer = new StreamWriter(new FileStream(outFile, FileMode.Create, FileAccess.Write), Encoding.UTF8);
StreamWriter writer = new StreamWriter(new FileStream(outFile, FileMode.Create, FileAccess.Write), StandardCharsets.UTF_8);
writer.Write(@out);
writer.Dispose();
outBuffer.Length = 0;
Expand Down
3 changes: 2 additions & 1 deletion src/Lucene.Net.Benchmark/Utils/ExtractWikipedia.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
using Lucene.Net.Benchmarks.ByTask.Feeds;
using Lucene.Net.Benchmarks.ByTask.Utils;
using Lucene.Net.Documents;
using Lucene.Net.Support;
using System;
using System.Collections.Generic;
using System.Globalization;
Expand Down Expand Up @@ -88,7 +89,7 @@ public virtual void Create(string id, string title, string time, string body)

try
{
using TextWriter writer = new StreamWriter(new FileStream(f.FullName, FileMode.Create, FileAccess.Write), Encoding.UTF8);
using TextWriter writer = new StreamWriter(new FileStream(f.FullName, FileMode.Create, FileAccess.Write), StandardCharsets.UTF_8);
writer.Write(contents.ToString());
}
catch (Exception ioe) when (ioe.IsIOException())
Expand Down
22 changes: 11 additions & 11 deletions src/Lucene.Net.TestFramework/Util/Fst/FSTTester.cs
Original file line number Diff line number Diff line change
Expand Up @@ -318,16 +318,16 @@ internal virtual FST<T> DoTest(int prune1, int prune2, bool allowRandomSuffixSha

bool willRewrite = random.NextBoolean();

Builder<T> builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4,
prune1, prune2,
prune1 == 0 && prune2 == 0,
allowRandomSuffixSharing ? random.NextBoolean() : true,
allowRandomSuffixSharing ? TestUtil.NextInt32(random, 1, 10) : int.MaxValue,
outputs,
null,
willRewrite,
PackedInt32s.DEFAULT,
true,
Builder<T> builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4,
prune1, prune2,
prune1 == 0 && prune2 == 0,
allowRandomSuffixSharing ? random.NextBoolean() : true,
allowRandomSuffixSharing ? TestUtil.NextInt32(random, 1, 10) : int.MaxValue,
outputs,
null,
willRewrite,
PackedInt32s.DEFAULT,
true,
15);
if (LuceneTestCase.Verbose)
{
Expand Down Expand Up @@ -386,7 +386,7 @@ internal virtual FST<T> DoTest(int prune1, int prune2, bool allowRandomSuffixSha

if (LuceneTestCase.Verbose && pairs.Count <= 20 && fst != null)
{
using (TextWriter w = new StreamWriter(new FileStream("out.dot", FileMode.OpenOrCreate), Encoding.UTF8))
using (TextWriter w = new StreamWriter(new FileStream("out.dot", FileMode.OpenOrCreate), StandardCharsets.UTF_8))
{
Util.ToDot(fst, w, false, false);
}
Expand Down
4 changes: 2 additions & 2 deletions src/Lucene.Net.TestFramework/Util/TestUtil.cs
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ public static CheckIndex.Status CheckIndex(Directory dir, bool crossCheckTermVec
ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
CheckIndex checker = new CheckIndex(dir);
checker.CrossCheckTermVectors = crossCheckTermVectors;
checker.InfoStream = new StreamWriter(bos, Encoding.UTF8);
checker.InfoStream = new StreamWriter(bos, IOUtils.ENCODING_UTF_8_NO_BOM);
CheckIndex.Status indexStatus = checker.DoCheckIndex(null);
if (indexStatus is null || indexStatus.Clean == false)
{
Expand Down Expand Up @@ -203,7 +203,7 @@ public static void CheckReader(AtomicReader reader, bool crossCheckTermVectors)
{
// LUCENENET: dispose the StreamWriter and ByteArrayOutputStream when done
using ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
using StreamWriter infoStream = new StreamWriter(bos, Encoding.UTF8, leaveOpen: true, bufferSize: 1024);
using StreamWriter infoStream = new StreamWriter(bos, IOUtils.ENCODING_UTF_8_NO_BOM, leaveOpen: true, bufferSize: 1024);

reader.CheckIntegrity();
CheckIndex.Status.FieldNormStatus fieldNormStatus = Index.CheckIndex.TestFieldNorms(reader, infoStream);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
// Lucene version compatibility level 4.10.4
using J2N;
using Lucene.Net.Support;
using Lucene.Net.Util;
using NUnit.Framework;
using System.Collections.Generic;
Expand Down Expand Up @@ -35,25 +36,25 @@ public void Test()
FileInfo affix = new FileInfo(System.IO.Path.Combine(tempDir.FullName, "64kaffixes.aff"));
FileInfo dict = new FileInfo(System.IO.Path.Combine(tempDir.FullName, "64kaffixes.dic"));

using var affixWriter = new StreamWriter(
new FileStream(affix.FullName, FileMode.OpenOrCreate), Encoding.UTF8);

// 65k affixes with flag 1, then an affix with flag 2
affixWriter.Write("SET UTF-8\nFLAG num\nSFX 1 Y 65536\n");
for (int i = 0; i < 65536; i++)
using (var affixWriter = new StreamWriter(
new FileStream(affix.FullName, FileMode.OpenOrCreate), StandardCharsets.UTF_8))
{
affixWriter.Write("SFX 1 0 " + i.ToHexString() + " .\n");
}
affixWriter.Write("SFX 2 Y 1\nSFX 2 0 s\n");
affixWriter.Dispose();

using var dictWriter = new StreamWriter(
new FileStream(dict.FullName, FileMode.OpenOrCreate), Encoding.UTF8);
// 65k affixes with flag 1, then an affix with flag 2
affixWriter.Write("SET UTF-8\nFLAG num\nSFX 1 Y 65536\n");
for (int i = 0; i < 65536; i++)
{
affixWriter.Write("SFX 1 0 " + i.ToHexString() + " .\n");
}

affixWriter.Write("SFX 2 Y 1\nSFX 2 0 s\n");
} // affixWriter.Dispose();

// drink signed with affix 2 (takes -s)
dictWriter.Write("1\ndrink/2\n");
dictWriter.Dispose();
using (var dictWriter = new StreamWriter(
new FileStream(dict.FullName, FileMode.OpenOrCreate), StandardCharsets.UTF_8))
{
// drink signed with affix 2 (takes -s)
dictWriter.Write("1\ndrink/2\n");
} // dictWriter.Dispose();

using Stream affStream = new FileStream(affix.FullName, FileMode.OpenOrCreate, FileAccess.ReadWrite);
using Stream dictStream = new FileStream(dict.FullName, FileMode.OpenOrCreate, FileAccess.ReadWrite);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
// Lucene version compatibility level 4.8.1
using J2N;
using Lucene.Net.Support;
using Lucene.Net.Util;
using NUnit.Framework;
using System;
Expand Down Expand Up @@ -78,7 +79,7 @@ public virtual void TestBaseDir()
DirectoryInfo @base = CreateTempDir("fsResourceLoaderBase");
try
{
TextWriter os = new StreamWriter(new FileStream(System.IO.Path.Combine(@base.FullName, "template.txt"), FileMode.Create, FileAccess.Write), Encoding.UTF8);
TextWriter os = new StreamWriter(new FileStream(System.IO.Path.Combine(@base.FullName, "template.txt"), FileMode.Create, FileAccess.Write), StandardCharsets.UTF_8);
try
{
os.Write("foobar\n");
Expand Down Expand Up @@ -120,4 +121,4 @@ public virtual void TestDelegation()
assertEquals("foobar", WordlistLoader.GetLines(rl.OpenResource("template.txt"), Encoding.UTF8).First());
}
}
}
}
3 changes: 2 additions & 1 deletion src/Lucene.Net.Tests.Benchmark/ByTask/Feeds/DocMakerTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
using Lucene.Net.Index;
using Lucene.Net.Search;
using Lucene.Net.Support;
using Lucene.Net.Util;
using NUnit.Framework;
using System.Collections.Generic;
using System.IO;
Expand Down Expand Up @@ -170,7 +171,7 @@ public void TestDocMakerLeak()
// DocMaker did not close its ContentSource if resetInputs was called twice,
// leading to a file handle leak.
FileInfo f = new FileInfo(Path.Combine(getWorkDir().FullName, "docMakerLeak.txt"));
TextWriter ps = new StreamWriter(new FileStream(f.FullName, FileMode.Create, FileAccess.Write), Encoding.UTF8);
TextWriter ps = new StreamWriter(new FileStream(f.FullName, FileMode.Create, FileAccess.Write), IOUtils.ENCODING_UTF_8_NO_BOM);
ps.WriteLine("one title\t" + (J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond) + "\tsome content"); // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results
ps.Dispose();

Expand Down
13 changes: 7 additions & 6 deletions src/Lucene.Net.Tests.Benchmark/ByTask/Feeds/LineDocSourceTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
using Lucene.Net.Benchmarks.ByTask.Utils;
using Lucene.Net.Index;
using Lucene.Net.Search;
using Lucene.Net.Support;
using Lucene.Net.Util;
using NUnit.Framework;
using System;
Expand Down Expand Up @@ -41,7 +42,7 @@ private void createBZ2LineFile(FileInfo file, bool addHeader)
{
Stream @out = new FileStream(file.FullName, FileMode.Create, FileAccess.Write);
@out = new BZip2OutputStream(@out); // csFactory.createCompressorOutputStream("bzip2", @out);
TextWriter writer = new StreamWriter(@out, Encoding.UTF8);
TextWriter writer = new StreamWriter(@out, StandardCharsets.UTF_8);
writeDocsToFile(writer, addHeader, null);
writer.Dispose();
}
Expand All @@ -59,7 +60,7 @@ private void writeDocsToFile(TextWriter writer, bool addHeader, IDictionary<stri
writer.Write(DocMaker.BODY_FIELD);
if (otherFields != null)
{
// additional field names in the header
// additional field names in the header
foreach (Object fn in otherFields.Keys)
{
writer.Write(WriteLineDocTask.SEP);
Expand All @@ -72,7 +73,7 @@ private void writeDocsToFile(TextWriter writer, bool addHeader, IDictionary<stri
doc.append("title").append(WriteLineDocTask.SEP).append("date").append(WriteLineDocTask.SEP).append(DocMaker.BODY_FIELD);
if (otherFields != null)
{
// additional field values in the doc line
// additional field values in the doc line
foreach (Object fv in otherFields.Values)
{
doc.append(WriteLineDocTask.SEP).append(fv.toString());
Expand All @@ -85,15 +86,15 @@ private void writeDocsToFile(TextWriter writer, bool addHeader, IDictionary<stri
/// <summary>
/// Creates (or overwrites) <paramref name="file"/> as an uncompressed file and fills it by
/// delegating to <c>writeDocsToFile</c> with no extra fields (third argument <c>null</c>).
/// </summary>
/// <param name="file">Destination file; opened with <see cref="FileMode.Create"/>.</param>
/// <param name="addHeader">Passed through unchanged to <c>writeDocsToFile</c>.</param>
private void createRegularLineFile(FileInfo file, bool addHeader)
{
Stream @out = new FileStream(file.FullName, FileMode.Create, FileAccess.Write);
// NOTE(review): StandardCharsets.UTF_8 is presumably UTF-8 without a byte order mark,
// whereas Encoding.UTF8 emits one — confirm against Lucene.Net.Support.
TextWriter writer = new StreamWriter(@out, Encoding.UTF8);
TextWriter writer = new StreamWriter(@out, StandardCharsets.UTF_8);
writeDocsToFile(writer, addHeader, null);
// Disposing the writer flushes it; no separate Dispose is issued for @out.
writer.Dispose();
}

private void createRegularLineFileWithMoreFields(FileInfo file, params String[] extraFields)
{
Stream @out = new FileStream(file.FullName, FileMode.Create, FileAccess.Write);
TextWriter writer = new StreamWriter(@out, Encoding.UTF8);
TextWriter writer = new StreamWriter(@out, StandardCharsets.UTF_8);
Dictionary<string, string> p = new Dictionary<string, string>();
foreach (String f in extraFields)
{
Expand Down Expand Up @@ -231,7 +232,7 @@ public void TestInvalidFormat()
for (int i = 0; i < testCases.Length; i++)
{
FileInfo file = new FileInfo(Path.Combine(getWorkDir().FullName, "one-line"));
TextWriter writer = new StreamWriter(new FileStream(file.FullName, FileMode.Create, FileAccess.Write), Encoding.UTF8);
TextWriter writer = new StreamWriter(new FileStream(file.FullName, FileMode.Create, FileAccess.Write), StandardCharsets.UTF_8);
writer.Write(testCases[i]);
writer.WriteLine();
writer.Dispose();
Expand Down
19 changes: 10 additions & 9 deletions src/Lucene.Net.Tests.Benchmark/ByTask/Utils/StreamUtilsTest.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using ICSharpCode.SharpZipLib.BZip2;
using Lucene.Net.Support;
using Lucene.Net.Util;
using NUnit.Framework;
using System;
Expand Down Expand Up @@ -27,7 +28,7 @@ namespace Lucene.Net.Benchmarks.ByTask.Utils

public class StreamUtilsTest : BenchmarkTestCase
{
private static readonly String TEXT = "Some-Text...";
private static readonly string TEXT = "Some-Text...";
private DirectoryInfo testDir;

[Test]
Expand Down Expand Up @@ -82,31 +83,31 @@ public void TestGetOutputStreamPlain()
assertReadText(autoOutFile("TEXT"));
}

/// <summary>
/// Creates (or overwrites) "testfile.&lt;ext&gt;" under <c>testDir</c>, writes <c>TEXT</c>
/// followed by a line terminator straight to a <see cref="FileStream"/> (no compression),
/// and returns the file.
/// </summary>
/// <param name="ext">Text appended after "testfile." to form the file name.</param>
/// <returns>The <see cref="FileInfo"/> of the file that was written.</returns>
private FileInfo rawTextFile(String ext)
private FileInfo rawTextFile(string ext)
{
FileInfo f = new FileInfo(Path.Combine(testDir.FullName, "testfile." + ext));
// NOTE(review): StandardCharsets.UTF_8 is presumably UTF-8 without a byte order mark,
// whereas Encoding.UTF8 emits one — confirm against Lucene.Net.Support.
using (TextWriter w = new StreamWriter(new FileStream(f.FullName, FileMode.Create, FileAccess.Write), Encoding.UTF8))
using (TextWriter w = new StreamWriter(new FileStream(f.FullName, FileMode.Create, FileAccess.Write), StandardCharsets.UTF_8))
w.WriteLine(TEXT);
return f;
}

/// <summary>
/// Creates (or overwrites) "testfile.&lt;ext&gt;" under <c>testDir</c> and writes <c>TEXT</c>
/// to it through a <see cref="GZipStream"/> in compress mode, then returns the file.
/// </summary>
/// <param name="ext">Text appended after "testfile." to form the file name.</param>
/// <returns>The <see cref="FileInfo"/> of the file that was written.</returns>
private FileInfo rawGzipFile(String ext)
private FileInfo rawGzipFile(string ext)
{
FileInfo f = new FileInfo(Path.Combine(testDir.FullName, "testfile." + ext));
// writeText disposes the writer it creates; the using statement additionally disposes
// the GZipStream itself when the statement completes.
using (Stream os = new GZipStream(new FileStream(f.FullName, FileMode.Create, FileAccess.Write), CompressionMode.Compress)) //new CompressorStreamFactory().createCompressorOutputStream(CompressorStreamFactory.GZIP, new FileOutputStream(f));
writeText(os);
return f;
}

/// <summary>
/// Creates (or overwrites) "testfile.&lt;ext&gt;" under <c>testDir</c> and writes <c>TEXT</c>
/// to it through a <c>BZip2OutputStream</c>, then returns the file.
/// </summary>
/// <param name="ext">Text appended after "testfile." to form the file name.</param>
/// <returns>The <see cref="FileInfo"/> of the file that was written.</returns>
private FileInfo rawBzip2File(String ext)
private FileInfo rawBzip2File(string ext)
{
FileInfo f = new FileInfo(Path.Combine(testDir.FullName, "testfile." + ext));
// No using/Dispose for os here: writeText disposes the StreamWriter it wraps around os.
// NOTE(review): that presumably closes os too (StreamWriter default, leaveOpen not set) — confirm.
Stream os = new BZip2OutputStream(new FileStream(f.FullName, FileMode.Create, FileAccess.Write)); // new CompressorStreamFactory().createCompressorOutputStream(CompressorStreamFactory.BZIP2, new FileOutputStream(f));
writeText(os);
return f;
}

private FileInfo autoOutFile(String ext)
private FileInfo autoOutFile(string ext)
{
FileInfo f = new FileInfo(Path.Combine(testDir.FullName, "testfile." + ext));
Stream os = StreamUtils.GetOutputStream(f);
Expand All @@ -116,16 +117,16 @@ private FileInfo autoOutFile(String ext)

/// <summary>
/// Wraps <paramref name="os"/> in a <see cref="StreamWriter"/>, writes <c>TEXT</c> followed
/// by a line terminator, and disposes the writer.
/// </summary>
/// <param name="os">Destination stream; the caller should not expect to write to it afterwards.</param>
private void writeText(Stream os)
{
// NOTE(review): StandardCharsets.UTF_8 is presumably UTF-8 without a byte order mark,
// whereas Encoding.UTF8 emits one — confirm against Lucene.Net.Support.
TextWriter w = new StreamWriter(os, Encoding.UTF8);
TextWriter w = new StreamWriter(os, StandardCharsets.UTF_8);
w.WriteLine(TEXT);
// NOTE(review): the writer is not disposed if WriteLine throws (no using/try-finally).
w.Dispose();
}

/// <summary>
/// Opens <paramref name="f"/> via <c>StreamUtils.GetInputStream</c>, reads its first line as
/// UTF-8 text and asserts that it equals <c>TEXT</c>.
/// </summary>
/// <param name="f">File to read; its name is included in the assertion failure message.</param>
private void assertReadText(FileInfo f)
{
Stream ir = StreamUtils.GetInputStream(f);
TextReader r = new StreamReader(ir, Encoding.UTF8);
TextReader r = new StreamReader(ir, StandardCharsets.UTF_8);
String line = r.ReadLine();
string line = r.ReadLine();
assertEquals("Wrong text found in " + f.Name, TEXT, line);
// NOTE(review): if the assertion above throws, this Dispose is skipped and the reader
// (and its stream) stay open — there is no using/try-finally around it.
r.Dispose();
}
Expand Down
Loading

0 comments on commit 12b8941

Please sign in to comment.