From 474263abda5a8737031de04065a5be7e185f8f20 Mon Sep 17 00:00:00 2001 From: Mark Pflug Date: Thu, 16 Nov 2023 11:39:32 -0800 Subject: [PATCH] Fix for escaping edge cases. (#225) Exception now thrown from "Read", instead of "Create" when first data row has format error. --- .../CsvDataReaderTests.cs | 58 ++++++++++++++++--- source/Sylvan.Data.Csv/CsvDataReader+Async.cs | 15 +++++ source/Sylvan.Data.Csv/CsvDataReader+Sync.cs | 8 +++ source/Sylvan.Data.Csv/CsvDataReader.cs | 37 ++++++++++-- 4 files changed, 103 insertions(+), 15 deletions(-) diff --git a/source/Sylvan.Data.Csv.Tests/CsvDataReaderTests.cs b/source/Sylvan.Data.Csv.Tests/CsvDataReaderTests.cs index 57919e77..c0be13ec 100644 --- a/source/Sylvan.Data.Csv.Tests/CsvDataReaderTests.cs +++ b/source/Sylvan.Data.Csv.Tests/CsvDataReaderTests.cs @@ -864,7 +864,11 @@ public void RowFieldCount3Test() public void BadQuoteFirstRow() { using var tr = new StringReader("Name,Value\nA,\"B\"C\n"); - var ex = Assert.Throws(() => CsvDataReader.Create(tr, new CsvDataReaderOptions { Schema = CsvSchema.Nullable })); + + var csv = CsvDataReader.Create(tr, new CsvDataReaderOptions { Schema = CsvSchema.Nullable }); + + var ex = Assert.Throws(() => csv.Read()); + Assert.Equal(1, ex.RowNumber); } @@ -1782,7 +1786,7 @@ public void DynamicReader() public void DynamicSchema() { var data = new StringReader("a,b,c\n1,,3\n2022-11-12,12.4,1e5\n"); - + var reader = CsvDataReader.Create(data, new CsvDataReaderOptions { Schema = CsvSchema.Dynamic }); Assert.Equal(typeof(object), reader.GetFieldType(0)); @@ -1880,15 +1884,14 @@ public void FinalCharInCellIsEscaped() public void EscapeEOF() { using var reader = new StringReader("\\"); - Assert.Throws(() => + + using var csv = CsvDataReader.Create(reader, new CsvDataReaderOptions { - using var csvReader = CsvDataReader.Create(reader, new CsvDataReaderOptions - { - CsvStyle = CsvStyle.Escaped, - HasHeaders = false, - Escape = '\\', - }); + CsvStyle = CsvStyle.Escaped, + HasHeaders = false, + Escape = '\\', }); + Assert.Throws(() => csv.Read()); } [Fact] @@ -1907,6 +1910,43 @@ public void FinalCharInCellIsEscapeError() Assert.Equal("\\\n", value0); } + [Theory] + // These test cases were copied from the Sep parser library. Thanks, Nietras. + [InlineData("a", true, "a")] + [InlineData("\"\"", true, "")] + [InlineData("\"\"\"\"", true, "\"")] + [InlineData("\"\"\"\"\"\"", true, "\"\"")] + [InlineData("\"a\"", true, "a")] + [InlineData("\"a\"\"a\"", true, "a\"a")] + [InlineData("\"a\"\"a\"\"a\"", true, "a\"a\"a")] + [InlineData("a\"a\"a", true, "a\"a\"a")] + [InlineData("a\"\"\"a", true, "a\"\"\"a")] + + [InlineData("\"a\"\"\"a\"", false, null)] + [InlineData("\"a\"a", false, null)] + [InlineData("\"a\"a\"a\"", false, null)] + [InlineData("\"\"a", false, null)] + [InlineData("\"\"a\"", false, null)] + [InlineData("\"\"\"", false, null)] + [InlineData("\"\"\"\"\"", false, null)] + + public void Quotes(string data, bool valid, string expected) + { + var r = new StringReader("a,b,c\n" + data); + var csv = CsvDataReader.Create(r); + + if (valid) { + csv.Read(); + var value = csv.GetString(0); + Assert.Equal(expected, value); + } + else + { + var ex = Assert.Throws(() => csv.Read()); + Assert.Equal(1, ex.RowNumber); + } + } + #if NET6_0_OR_GREATER diff --git a/source/Sylvan.Data.Csv/CsvDataReader+Async.cs b/source/Sylvan.Data.Csv/CsvDataReader+Async.cs index a3ae8824..3b17c732 100644 --- a/source/Sylvan.Data.Csv/CsvDataReader+Async.cs +++ b/source/Sylvan.Data.Csv/CsvDataReader+Async.cs @@ -158,9 +158,15 @@ async Task InitializeReaderAsync(CancellationToken cancel = default) { if (carryRow || state == State.Open || await NextRecordAsync(cancel).ConfigureAwait(false)) { + if (pendingException != null) + { + // if the header row is malformed, the exception is throw from the call to Create + throw pendingException; + } carryRow = false; Initialize(); this.state = State.Initialized; + this.rowNumber++; var hasNext = await NextRecordAsync(cancel).ConfigureAwait(false); if (resultSetMode == ResultSetMode.SingleResult) { @@ -243,6 +249,10 @@ async Task NextRecordAsync(CancellationToken cancel = default) } if (result == ReadResult.False) { + if(this.state == State.Open && pendingException != null) + { + throw pendingException; + } return true; } @@ -293,6 +303,7 @@ async Task FillBufferAsync(CancellationToken cancel = default) public override async Task ReadAsync(CancellationToken cancellationToken) { cancellationToken.ThrowIfCancellationRequested(); + this.rowNumber++; if (this.state == State.Open) { @@ -309,6 +320,10 @@ public override async Task ReadAsync(CancellationToken cancellationToken) } else if (this.state == State.Initialized) { + if (pendingException != null) + { + throw pendingException; + } // after initizialization, the first record would already be in the buffer // if hasRows is true. if (hasRows) diff --git a/source/Sylvan.Data.Csv/CsvDataReader+Sync.cs b/source/Sylvan.Data.Csv/CsvDataReader+Sync.cs index 47090389..c2a29c49 100644 --- a/source/Sylvan.Data.Csv/CsvDataReader+Sync.cs +++ b/source/Sylvan.Data.Csv/CsvDataReader+Sync.cs @@ -117,6 +117,10 @@ bool NextRecord() } if (result == ReadResult.False) { + if (this.state == State.Open && pendingException != null) + { + throw pendingException; + } return true; } @@ -181,6 +185,10 @@ public override bool Read() } else if (this.state == State.Initialized) { + if (pendingException != null) + { + throw pendingException; + } // after initizialization, the first record would already be in the buffer // if hasRows is true. if (hasRows) diff --git a/source/Sylvan.Data.Csv/CsvDataReader.cs b/source/Sylvan.Data.Csv/CsvDataReader.cs index b911d62f..0eb8886a 100644 --- a/source/Sylvan.Data.Csv/CsvDataReader.cs +++ b/source/Sylvan.Data.Csv/CsvDataReader.cs @@ -111,6 +111,9 @@ enum ReadResult FieldInfo[] fieldInfos; CsvColumn[] columns; + // An exception that was created with initializing, and should be thrown on the next call to Read/ReadAsync + Exception? pendingException; + // in multi-result set mode carryRow indicates that a row is already parsed // and needs to be carried into the next result set. bool carryRow; @@ -204,7 +207,7 @@ private CsvDataReader(TextReader reader, char[]? buffer, CsvDataReaderOptions op this.newLineMode = NewLineMode.Unknown; } - static ColumnStringFactory BuildStringFactory(ColumnStringFactory? csf, StringFactory?sf) + static ColumnStringFactory BuildStringFactory(ColumnStringFactory? csf, StringFactory? sf) { if (csf != null) { @@ -354,7 +357,7 @@ public void Initialize() } #if INTRINSICS - + Vector256 delimiterMaskVector256; Vector256 lineEndMaskVector256; Vector256 quoteMaskVector256; @@ -680,7 +683,7 @@ ReadResult ReadField(int fieldIdx) if (IsEndOfLine(c)) { idx--;// "unconsume" the newline character, so that ConsumeLineEnd can process it. - // if the escape precede an EOL, we might have to consume 2 chars + // if the escape precede an EOL, we might have to consume 2 chars var r = ConsumeLineEnd(buffer, ref idx); if (r == ReadResult.Incomplete) { @@ -695,7 +698,8 @@ ReadResult ReadField(int fieldIdx) if (atEndOfText) { // there was nothing to escape - throw new InvalidDataException(); + pendingException = new CsvFormatException(rowNumber, fieldIdx); + return ReadResult.False; } return ReadResult.Incomplete; } @@ -807,6 +811,14 @@ ReadResult ReadField(int fieldIdx) complete = true; break; } + else + // this handles the case where we had a quoted field + if (c == quote && closeQuoteIdx >= 0) + { + this.pendingException = new CsvFormatException(rowNumber, fieldIdx); + return ReadResult.False; + } + else if (IsEndOfLine(c)) { idx--; @@ -825,6 +837,16 @@ ReadResult ReadField(int fieldIdx) last = true; break; } + } + else + { + if (closeQuoteIdx >= 0) + { + // if the field is quoted, we shouldn't be here. + // the only valid characters would be a delimiter, a new line, or EOF. + this.pendingException = new CsvFormatException(rowNumber, fieldIdx); + return ReadResult.False; + } } } @@ -862,7 +884,8 @@ ReadResult ReadField(int fieldIdx) else { var rowNumber = this.rowNumber == 0 && this.state == State.Initialized ? 1 : this.rowNumber; - throw new CsvFormatException(rowNumber, fieldIdx); + this.pendingException = new CsvFormatException(rowNumber, fieldIdx); + return ReadResult.False; } } } @@ -1627,7 +1650,9 @@ CharSpan PrepareField(int offset, int len, int escapeCount) } else { - throw new InvalidDataException(); + // we should never get here. Bad fields should always be + // handled in "read" + throw new CsvFormatException(rowNumber, -1); } } else