From 186408742918dceb4252bc3688c346a0a09980f8 Mon Sep 17 00:00:00 2001 From: Evgeny Morozov Date: Tue, 7 Jan 2025 17:09:36 +0100 Subject: [PATCH] Always provide an meaningful exception message when CSV parsing fails When parsing failed the exception message was previous the generic "Exception of type 'Sylvan.Data.Csv.CsvFormatException' was thrown", which gave no clue as to what was wrong. Provide a good exception message in each case where this exception is thrown, including the position where parsing failed, a snippet of the input (which can be searched for to locate the problematic input easily) and what was wrong with it. Update unit tests to check for the new exception messages. --- .../CsvDataReaderTests.cs | 49 +++++++++++++------ .../CsvDataWriterTests.cs | 3 +- source/Sylvan.Data.Csv/CsvDataReader+Async.cs | 4 +- source/Sylvan.Data.Csv/CsvDataReader+Sync.cs | 4 +- source/Sylvan.Data.Csv/CsvDataReader.cs | 22 +++++++-- source/Sylvan.Data.Csv/Exceptions.cs | 11 +++-- 6 files changed, 62 insertions(+), 31 deletions(-) diff --git a/source/Sylvan.Data.Csv.Tests/CsvDataReaderTests.cs b/source/Sylvan.Data.Csv.Tests/CsvDataReaderTests.cs index ee39f611..b9fb0891 100644 --- a/source/Sylvan.Data.Csv.Tests/CsvDataReaderTests.cs +++ b/source/Sylvan.Data.Csv.Tests/CsvDataReaderTests.cs @@ -199,7 +199,8 @@ public void Quoted2() [Fact] public void MissingHeaders() { - Assert.Throws(() => CsvDataReader.Create(new StringReader(""))); + var ex = Assert.Throws(() => CsvDataReader.Create(new StringReader(""))); + Assert.StartsWith("The CSV file does not have headers", ex.Message); } [Fact] @@ -491,7 +492,8 @@ public void BufferTooSmall() using var tr = File.OpenText("Data/Binary.csv"); var csv = CsvDataReader.Create(tr, opts); csv.Read(); - Assert.Throws(() => csv.Read()); + var ex = Assert.Throws(() => csv.Read()); + Assert.StartsWith("Row 2 was too large.", ex.Message); } [Fact] @@ -871,6 +873,8 @@ public void BadQuoteFirstRow() var ex = Assert.Throws(() => csv.Read()); Assert.Equal(1, ex.RowNumber); + Assert.StartsWith("Found unexpected character 'C' in field 1 on row 1: C\n" + Environment.NewLine + + "A delimiter (,), newline or EOF was expected", ex.Message); } [Fact] @@ -879,6 +883,8 @@ public void BadQuoteHeader() using var tr = new StringReader("Name,\"Va\"lue\nA,\"B\"C\n"); var ex = Assert.Throws(() => CsvDataReader.Create(tr, new CsvDataReaderOptions { Schema = CsvSchema.Nullable })); Assert.Equal(0, ex.RowNumber); + Assert.StartsWith("Found unexpected character 'l' in field 1 on row 0: lue\nA,\"B\"C\n" + Environment.NewLine + + "A delimiter (,), newline or EOF was expected after a closing quote", ex.Message); } [Fact] @@ -888,7 +894,9 @@ public void BadQuote() var csv = CsvDataReader.Create(tr, new CsvDataReaderOptions { Schema = CsvSchema.Nullable }); Assert.True(csv.Read()); Assert.Equal("A\"\"B", csv.GetString(0)); - Assert.Throws(() => csv.Read()); + var ex = Assert.Throws(() => csv.Read()); + Assert.StartsWith("Found unexpected character 'm' in field 1 on row 2: more,\n" + Environment.NewLine + + "A delimiter (,), newline or EOF was expected", ex.Message); } [Fact] @@ -1094,7 +1102,9 @@ public void QuoteHandling() using var reader = new StringReader("Name\r\n\b\r\n\"quoted\"field,"); var csv = CsvDataReader.Create(reader); Assert.True(csv.Read()); - Assert.Throws(() => csv.Read()); + var ex = Assert.Throws(() => csv.Read()); + Assert.StartsWith("Found unexpected character 'f' in field 0 on row 2: field," + Environment.NewLine + + "A delimiter (,), newline or EOF was expected after a closing quote", ex.Message); } [Fact] @@ -1321,7 +1331,8 @@ public void TooLongHeaderThrows() sw.Write(i); } var data = sw.ToString(); - Assert.Throws(() => CsvDataReader.Create(new StringReader(data), new CsvDataReaderOptions { BufferSize = 0x1000 })); + var ex = Assert.Throws(() => CsvDataReader.Create(new StringReader(data), new CsvDataReaderOptions { BufferSize = 0x1000 })); + Assert.StartsWith("Row 0 was too large", ex.Message); } @@ -1343,7 +1354,8 @@ public void TooLongRecordThrows() var data = sw.ToString(); var csv = CsvDataReader.Create(new StringReader(data), new CsvDataReaderOptions { BufferSize = 0x1000 }); csv.Read(); - Assert.Throws(() => csv.Read()); + var ex = Assert.Throws(() => csv.Read()); + Assert.StartsWith("Row 2 was too large", ex.Message); } [Fact] @@ -1440,7 +1452,8 @@ public void Issue_79_NoHeaders() [Fact] public void EmptyHeader() { - Assert.Throws(() => CsvDataReader.Create(new StringReader(""))); + var ex = Assert.Throws(() => CsvDataReader.Create(new StringReader(""))); + Assert.StartsWith("The CSV file does not have headers", ex.Message); } [Fact] @@ -1454,7 +1467,8 @@ public void EmptyNoHeader() [Fact] public async Task EmptyHeaderAsync() { - await Assert.ThrowsAsync(async () => await CsvDataReader.CreateAsync(new StringReader(""))); + var ex = await Assert.ThrowsAsync(async () => await CsvDataReader.CreateAsync(new StringReader(""))); + Assert.StartsWith("The CSV file does not have headers", ex.Message); } [Fact] @@ -1892,7 +1906,9 @@ public void EscapeEOF() HasHeaders = false, Escape = '\\', }); - Assert.Throws(() => csv.Read()); + var ex = Assert.Throws(() => csv.Read()); + Assert.Equal("Found unexpected character '\\' in field 0 on row 0: \\" + Environment.NewLine + + "Escape character \\ was encountered at the end of the text.", ex.Message); } [Fact] @@ -1968,13 +1984,13 @@ public void MacOSLineEndDetect() [InlineData("a\"a\"a", true, "a\"a\"a")] [InlineData("a\"\"\"a", true, "a\"\"\"a")] - [InlineData("\"a\"\"\"a\"", false, null)] - [InlineData("\"a\"a", false, null)] - [InlineData("\"a\"a\"a\"", false, null)] - [InlineData("\"\"a", false, null)] - [InlineData("\"\"a\"", false, null)] - [InlineData("\"\"\"", false, null)] - [InlineData("\"\"\"\"\"", false, null)] + [InlineData("\"a\"\"\"a\"", false, "Found unexpected character 'a' in field 0 on row 1: a\"")] + [InlineData("\"a\"a", false, "Found unexpected character 'a' in field 0 on row 1: a")] + [InlineData("\"a\"a\"a\"", false, "Found unexpected character 'a' in field 0 on row 1: a\"")] + [InlineData("\"\"a", false, "Found unexpected character 'a' in field 0 on row 1: a")] + [InlineData("\"\"a\"", false, "Found unexpected character 'a' in field 0 on row 1: a\"")] + [InlineData("\"\"\"", false, "A quoted field at the end of the input ends without a closing quote.")] + [InlineData("\"\"\"\"\"", false, "A quoted field at the end of the input ends without a closing quote.")] public void Quotes(string data, bool valid, string expected) { @@ -1991,6 +2007,7 @@ public void Quotes(string data, bool valid, string expected) { var ex = Assert.Throws(() => csv.Read()); Assert.Equal(1, ex.RowNumber); + Assert.StartsWith(expected, ex.Message); } } diff --git a/source/Sylvan.Data.Csv.Tests/CsvDataWriterTests.cs b/source/Sylvan.Data.Csv.Tests/CsvDataWriterTests.cs index 680a8e6d..226c6868 100644 --- a/source/Sylvan.Data.Csv.Tests/CsvDataWriterTests.cs +++ b/source/Sylvan.Data.Csv.Tests/CsvDataWriterTests.cs @@ -165,7 +165,8 @@ static void BinaryBig(BinaryEncoding encoding, int bufferSize, int? maxBufferSiz } else { - Assert.Throws(writeFunc); + var ex = Assert.Throws(writeFunc); + Assert.StartsWith("Row 1 was too large", ex.Message); return; } diff --git a/source/Sylvan.Data.Csv/CsvDataReader+Async.cs b/source/Sylvan.Data.Csv/CsvDataReader+Async.cs index 03b3235b..2c490912 100644 --- a/source/Sylvan.Data.Csv/CsvDataReader+Async.cs +++ b/source/Sylvan.Data.Csv/CsvDataReader+Async.cs @@ -221,7 +221,7 @@ async Task NextRecordAsync(CancellationToken cancel = default) { if (!GrowBuffer()) { - throw new CsvRecordTooLargeException(this.RowNumber, 0, null, null); + throw new CsvRecordTooLargeException(this.RowNumber, 0); } } await FillBufferAsync(cancel).ConfigureAwait(false); @@ -263,7 +263,7 @@ async Task NextRecordAsync(CancellationToken cancel = default) { // if we consumed the entire buffer reading this record, then this is an exceptional situation // we expect a record to be able to fit entirely within the buffer. - throw new CsvRecordTooLargeException(this.RowNumber, fieldIdx, null, null); + throw new CsvRecordTooLargeException(this.RowNumber, fieldIdx); } } await FillBufferAsync(cancel).ConfigureAwait(false); diff --git a/source/Sylvan.Data.Csv/CsvDataReader+Sync.cs b/source/Sylvan.Data.Csv/CsvDataReader+Sync.cs index b3e0eba8..fbb34bf7 100644 --- a/source/Sylvan.Data.Csv/CsvDataReader+Sync.cs +++ b/source/Sylvan.Data.Csv/CsvDataReader+Sync.cs @@ -86,7 +86,7 @@ bool NextRecord() { if (!GrowBuffer()) { - throw new CsvRecordTooLargeException(this.RowNumber, 0, null, null); + throw new CsvRecordTooLargeException(this.RowNumber, 0); } } FillBuffer(); @@ -131,7 +131,7 @@ bool NextRecord() { // if we consumed the entire buffer reading this record, then this is an exceptional situation // we expect a record to be able to fit entirely within the buffer. - throw new CsvRecordTooLargeException(this.RowNumber, fieldIdx, null, null); + throw new CsvRecordTooLargeException(this.RowNumber, fieldIdx); } } diff --git a/source/Sylvan.Data.Csv/CsvDataReader.cs b/source/Sylvan.Data.Csv/CsvDataReader.cs index aa06bac4..2c8311fc 100644 --- a/source/Sylvan.Data.Csv/CsvDataReader.cs +++ b/source/Sylvan.Data.Csv/CsvDataReader.cs @@ -666,6 +666,14 @@ ReadResult ReadField(int fieldIdx) var idx = this.idx; var buffer = this.buffer; + void SetUnexpectedCharacterException(string reason) + { + string preview = new string(buffer.Skip(idx - 1).Take(Math.Min(this.bufferEnd - idx + 1, 50)).ToArray()); + this.pendingException = new CsvFormatException(rowNumber, fieldIdx, + $"Found unexpected character '{c}' in field {fieldIdx} on row {rowNumber}: {preview}" + + Environment.NewLine + reason); + } + // this will remain -1 if it is unquoted. // Otherwise we use it to determine if the quotes were "clean". var closeQuoteIdx = -1; @@ -710,7 +718,7 @@ ReadResult ReadField(int fieldIdx) if (atEndOfText) { // there was nothing to escape - pendingException = new CsvFormatException(rowNumber, fieldIdx); + SetUnexpectedCharacterException($"Escape character {this.escape} was encountered at the end of the text."); return ReadResult.False; } return ReadResult.Incomplete; @@ -826,7 +834,7 @@ ReadResult ReadField(int fieldIdx) } else { - this.pendingException = new CsvFormatException(rowNumber, fieldIdx); + SetUnexpectedCharacterException($"An unescaped quote character ({quote}) was found inside a quoted field."); return ReadResult.False; } } @@ -864,7 +872,9 @@ ReadResult ReadField(int fieldIdx) { // if the field is quoted, we shouldn't be here. // the only valid characters would be a delimiter, a new line, or EOF. - this.pendingException = new CsvFormatException(rowNumber, fieldIdx); + SetUnexpectedCharacterException($"A delimiter ({this.delimiter}), newline or EOF" + + $" was expected after a closing quote. If the preceding quote is part of the field it" + + $" should be escaped with {this.escape}."); return ReadResult.False; } } @@ -907,7 +917,8 @@ ReadResult ReadField(int fieldIdx) if (style != CsvStyle.Lax) { var rowNumber = this.rowNumber == 0 && this.state == State.Initialized ? 1 : this.rowNumber; - this.pendingException = new CsvFormatException(rowNumber, fieldIdx); + this.pendingException = new CsvFormatException(rowNumber, fieldIdx, + "A quoted field at the end of the input ends without a closing quote."); return ReadResult.False; } } @@ -1716,7 +1727,8 @@ CharSpan PrepareField(int offset, int len, int escapeCount) { // we should never get here. Invalid fields should always be // handled in ReadField and end up in PrepareInvalidField - throw new CsvFormatException(rowNumber, -1); + throw new CsvFormatException(rowNumber, -1, + $"An quote escape character ({escape}) was the last character of a quoted field."); } } else diff --git a/source/Sylvan.Data.Csv/Exceptions.cs b/source/Sylvan.Data.Csv/Exceptions.cs index 19bdf131..fd5a682b 100644 --- a/source/Sylvan.Data.Csv/Exceptions.cs +++ b/source/Sylvan.Data.Csv/Exceptions.cs @@ -7,8 +7,8 @@ namespace Sylvan.Data.Csv; /// public class CsvFormatException : FormatException { - internal CsvFormatException(int row, int ordinal, string? msg = null, Exception? inner = null) - : base(msg, inner) + internal CsvFormatException(int row, int ordinal, string message, Exception? inner = null) + : base(message, inner) { this.RowNumber = row; this.FieldOrdinal = ordinal; @@ -43,8 +43,8 @@ internal CsvConfigurationException() { } /// public sealed class CsvRecordTooLargeException : CsvFormatException { - internal CsvRecordTooLargeException(int row, int ordinal, string? msg = null, Exception? inner = null) - : base(row, ordinal, msg, inner) { } + internal CsvRecordTooLargeException(int row, int ordinal) + : base(row, ordinal, $"Row {row} was too large. Try increasing the MaxBufferSize setting.") { } } /// @@ -52,7 +52,8 @@ internal CsvRecordTooLargeException(int row, int ordinal, string? msg = null, Ex /// public sealed class CsvMissingHeadersException : CsvFormatException { - internal CsvMissingHeadersException() : base(0, 0, null, null) { } + internal CsvMissingHeadersException() + : base(0, 0, "The CSV file does not have headers, but the HasHeaders option was set to true.") { } } ///