From ba5b2f02fc06199e4fc2806f8cbe125694307aaa Mon Sep 17 00:00:00 2001 From: MarkPflug Date: Mon, 29 Jan 2024 09:19:08 -0800 Subject: [PATCH 1/6] All lax unit tests passing --- .../CsvDataReaderTests.cs | 75 ++++- .../Sylvan.Data.Csv.Tests.csproj | 2 +- source/Sylvan.Data.Csv/CsvDataReader.cs | 281 ++++++++++++------ source/Sylvan.Data.Csv/CsvStyle.cs | 5 + 4 files changed, 263 insertions(+), 100 deletions(-) diff --git a/source/Sylvan.Data.Csv.Tests/CsvDataReaderTests.cs b/source/Sylvan.Data.Csv.Tests/CsvDataReaderTests.cs index c0be13ec..752d4283 100644 --- a/source/Sylvan.Data.Csv.Tests/CsvDataReaderTests.cs +++ b/source/Sylvan.Data.Csv.Tests/CsvDataReaderTests.cs @@ -5,6 +5,7 @@ using System.Globalization; using System.IO; using System.Linq; +using System.Numerics; using System.Text; using System.Threading.Tasks; using Xunit; @@ -949,7 +950,7 @@ public void Binary2() [InlineData("N,V\na\\\nb,c\n", "a\nb", "c")] [InlineData("N,V\na\\\r\nb\n", "a\r\nb", "")] [InlineData("N,V\na\\\r\nb", "a\r\nb", "")] - public void ImpliedQuote(string input, string a, string b) + public void EscapedStyle(string input, string a, string b) { using var reader = new StringReader(input); var options = @@ -1884,14 +1885,14 @@ public void FinalCharInCellIsEscaped() public void EscapeEOF() { using var reader = new StringReader("\\"); - + using var csv = CsvDataReader.Create(reader, new CsvDataReaderOptions { CsvStyle = CsvStyle.Escaped, HasHeaders = false, Escape = '\\', }); - Assert.Throws(() => csv.Read()); + Assert.Throws(() => csv.Read()); } [Fact] @@ -1921,7 +1922,7 @@ public void FinalCharInCellIsEscapeError() [InlineData("\"a\"\"a\"\"a\"", true, "a\"a\"a")] [InlineData("a\"a\"a", true, "a\"a\"a")] [InlineData("a\"\"\"a", true, "a\"\"\"a")] - + [InlineData("\"a\"\"\"a\"", false, null)] [InlineData("\"a\"a", false, null)] [InlineData("\"a\"a\"a\"", false, null)] @@ -1935,11 +1936,12 @@ public void Quotes(string data, bool valid, string expected) var r = new StringReader("a,b,c\n" + 
data); var csv = CsvDataReader.Create(r); - if (valid) { + if (valid) + { csv.Read(); var value = csv.GetString(0); Assert.Equal(expected, value); - } + } else { var ex = Assert.Throws(() => csv.Read()); @@ -1947,6 +1949,67 @@ public void Quotes(string data, bool valid, string expected) } } + [Theory] + // these are valid, and parse the same as the non-lax test + //[InlineData("a", "a")] + //[InlineData("\"\"", "")] + //[InlineData("\"\"\"\"", "\"")] + //[InlineData("\"\"\"\"\"\"", "\"\"")] + //[InlineData("\"a\"", "a")] + //[InlineData("\"a\"\"a\"", "a\"a")] + //[InlineData("\"a\"\"a\"\"a\"", "a\"a\"a")] + //[InlineData("a\"a\"a", "a\"a\"a")] + //[InlineData("a\"\"\"a", "a\"\"\"a")] + // these are invalid, but will still produce a string in lax mode. + [InlineData("\"a\"\"\"a\"", "a\"a\"")] + [InlineData("\"a\"a", "aa")] + [InlineData("\"a\"a\"a\"", "aa\"a\"")] + [InlineData("\"\"a", "a")] + [InlineData("\"\"a\"", "a\"")] + //[InlineData("\"\"\"", "\"")] + [InlineData("\"\"\"\"\"", "\"\"")] + + public void LaxQuotes(string data, string expected) + { + var r = new StringReader("a,b,c\n1,2,3\n" + data); + var opts = new CsvDataReaderOptions { CsvStyle = CsvStyle.Lax }; + var csv = CsvDataReader.Create(r, opts); + csv.Read(); // skip the 1,2,3 + csv.Read(); + var value = csv.GetString(0); + Assert.Equal(expected, value); + } + + [Theory] + // these are valid, and parse the same as the non-lax test + [InlineData("a", "a")] + [InlineData("\"\"", "")] + [InlineData("\"\"\"\"", "\"")] + [InlineData("\"\"\"\"\"\"", "\"\"")] + [InlineData("\"a\"", "a")] + [InlineData("\"a\"\"a\"", "a\"a")] + [InlineData("\"a\"\"a\"\"a\"", "a\"a\"a")] + [InlineData("a\"a\"a", "a\"a\"a")] + [InlineData("a\"\"\"a", "a\"\"\"a")] + // these are invalid, but will still produce a string in lax mode. 
+ [InlineData("\"a\"\"\"a\"", "a\"a\"")] + [InlineData("\"a\"a", "aa")] + [InlineData("\"a\"a\"a\"", "aa\"a\"")] + [InlineData("\"\"a", "a")] + [InlineData("\"\"a\"", "a\"")] + [InlineData("\"\"\"", "\"\n4,5,6\n")] + [InlineData("\"\"\"\"\"", "\"\"\n4,5,6\n")] + + public void LaxQuotes2(string data, string expected) + { + var r = new StringReader("a,b,c\n1,2,3\n" + data + "\n4,5,6\n"); + var opts = new CsvDataReaderOptions { CsvStyle = CsvStyle.Lax }; + var csv = CsvDataReader.Create(r, opts); + csv.Read(); // skip the 1,2,3 + csv.Read(); + var value = csv.GetString(0); + Assert.Equal(expected, value); + } #if NET6_0_OR_GREATER diff --git a/source/Sylvan.Data.Csv.Tests/Sylvan.Data.Csv.Tests.csproj b/source/Sylvan.Data.Csv.Tests/Sylvan.Data.Csv.Tests.csproj index 4eb084a4..4ab443c9 100644 --- a/source/Sylvan.Data.Csv.Tests/Sylvan.Data.Csv.Tests.csproj +++ b/source/Sylvan.Data.Csv.Tests/Sylvan.Data.Csv.Tests.csproj @@ -13,7 +13,7 @@ - + diff --git a/source/Sylvan.Data.Csv/CsvDataReader.cs b/source/Sylvan.Data.Csv/CsvDataReader.cs index 0eb8886a..c11f84be 100644 --- a/source/Sylvan.Data.Csv/CsvDataReader.cs +++ b/source/Sylvan.Data.Csv/CsvDataReader.cs @@ -54,6 +54,7 @@ enum QuoteState Unquoted = 0, Quoted = 1, ImplicitQuotes = 3, + InvalidQuotes = 4, } struct FieldInfo @@ -669,6 +670,14 @@ ReadResult ReadField(int fieldIdx) int fieldEnd = 0; bool last = false; bool complete = false; + + if (fieldIdx >= fieldInfos.Length) + { + // this resize is constrained by the fact that the record has to fit in one row + Array.Resize(ref fieldInfos, fieldInfos.Length * 2); + } + ref var fi = ref fieldInfos[fieldIdx]; + if (style == CsvStyle.Escaped) { // consume quoted field. @@ -716,85 +725,78 @@ ReadResult ReadField(int fieldIdx) { if (idx < bufferEnd) { - c = buffer[idx++]; + c = buffer[idx]; - if (c <= minSafe) + if (c == quote) { - if (c == quote) - { - closeQuoteIdx = idx; + idx++; // consume the quote we just read + closeQuoteIdx = idx; - // consume quoted field. 
- while (idx < bufferEnd) + // consume quoted field. + while (idx < bufferEnd) + { + c = buffer[idx++]; + if (c == escape) { - c = buffer[idx++]; - if (c == escape) + if (idx < bufferEnd) { - if (idx < bufferEnd) + c = buffer[idx++]; // the escaped char + if (c == escape || c == quote) + { + escapeCount++; + continue; + } + else + if (escape == quote) + { + idx--; + closeQuoteIdx = idx; + fieldEnd = closeQuoteIdx; + // the quote (escape) we just saw was a the closing quote + break; + } + } + else + { + if (atEndOfText) { - c = buffer[idx++]; // the escaped char - if (c == escape || c == quote) - { - escapeCount++; - continue; - } - else if (escape == quote) { - idx--; + complete = true; + last = true; closeQuoteIdx = idx; fieldEnd = closeQuoteIdx; // the quote (escape) we just saw was a the closing quote - break; } + break; } - else - { - if (atEndOfText) - { - if (escape == quote) - { - complete = true; - last = true; - closeQuoteIdx = idx; - fieldEnd = closeQuoteIdx; - // the quote (escape) we just saw was a the closing quote - } - break; - } - return ReadResult.Incomplete; - } + return ReadResult.Incomplete; } + } - if (c == quote) + if (c == quote) + { + // immediately after the quote should be a delimiter, eol, or eof, but... + // we can simply treat the remainder of the record like a normal unquoted field + // we are currently positioned on the quote, the next while loop will consume it + closeQuoteIdx = idx; + fieldEnd = closeQuoteIdx; + break; + } + if (IsEndOfLine(c)) + { + idx--; + var r = ConsumeLineEnd(buffer, ref idx); + if (r == ReadResult.Incomplete) { - // immediately after the quote should be a delimiter, eol, or eof, but... 
- // we can simply treat the remainder of the record like a normal unquoted field - // we are currently positioned on the quote, the next while loop will consume it - closeQuoteIdx = idx; - fieldEnd = closeQuoteIdx; - break; + return ReadResult.Incomplete; } - if (IsEndOfLine(c)) + else { - idx--; - var r = ConsumeLineEnd(buffer, ref idx); - if (r == ReadResult.Incomplete) - { - return ReadResult.Incomplete; - } - else - { - // continue on. We are inside a quoted string, so the newline is part of the value. - } + // continue on. We are inside a quoted string, so the newline is part of the value. } - } // we exit this loop when we reach the closing quote. - } - else - { - // "unread" the last character and let the next loop handle it. - idx--; - } + } + } // we exit this loop when we reach the closing quote. } } } @@ -815,8 +817,15 @@ ReadResult ReadField(int fieldIdx) // this handles the case where we had a quoted field if (c == quote && closeQuoteIdx >= 0) { - this.pendingException = new CsvFormatException(rowNumber, fieldIdx); - return ReadResult.False; + if (style == CsvStyle.Lax) + { + fi.quoteState = QuoteState.InvalidQuotes; + } + else + { + this.pendingException = new CsvFormatException(rowNumber, fieldIdx); + return ReadResult.False; + } } else if (IsEndOfLine(c)) @@ -837,30 +846,38 @@ ReadResult ReadField(int fieldIdx) last = true; break; } - } + } else { if (closeQuoteIdx >= 0) { - // if the field is quoted, we shouldn't be here. - // the only valid characters would be a delimiter, a new line, or EOF. - this.pendingException = new CsvFormatException(rowNumber, fieldIdx); - return ReadResult.False; + if (style == CsvStyle.Lax) + { + // in lax mode, we'll continue reading the remainder of the field + // after the closing quote + fi.quoteState = QuoteState.InvalidQuotes; + } + else + { + // if the field is quoted, we shouldn't be here. + // the only valid characters would be a delimiter, a new line, or EOF. 
+ this.pendingException = new CsvFormatException(rowNumber, fieldIdx); + return ReadResult.False; + } } } } if (complete || atEndOfText) { - if (fieldIdx >= fieldInfos.Length) + + if (atEndOfText && !complete) { - // this resize is constrained by the fact that the record has to fit in one row - Array.Resize(ref fieldInfos, fieldInfos.Length * 2); + fieldEnd = idx; } curFieldCount++; - ref var fi = ref fieldInfos[fieldIdx]; if (style == CsvStyle.Escaped) { @@ -883,9 +900,13 @@ ReadResult ReadField(int fieldIdx) } else { - var rowNumber = this.rowNumber == 0 && this.state == State.Initialized ? 1 : this.rowNumber; - this.pendingException = new CsvFormatException(rowNumber, fieldIdx); - return ReadResult.False; + fi.quoteState = QuoteState.InvalidQuotes; + if (style != CsvStyle.Lax) + { + var rowNumber = this.rowNumber == 0 && this.state == State.Initialized ? 1 : this.rowNumber; + this.pendingException = new CsvFormatException(rowNumber, fieldIdx); + return ReadResult.False; + } } } } @@ -1521,6 +1542,12 @@ internal readonly struct CharSpan public CharSpan(char[] buffer, int offset, int length) { +#if DEBUG + if (offset < 0 || length < 0) + { + throw new Exception(); + } +#endif Debug.Assert(offset >= 0); Debug.Assert(length >= 0); this.buffer = buffer; @@ -1585,6 +1612,9 @@ internal CharSpan GetField(int ordinal) [MethodImpl(MethodImplOptions.AggressiveInlining)] CharSpan GetFieldUnsafe(int ordinal) { + // "Unsafe" meaning this should only be called + // in contexts where ordinal is already validated to be in-range + ref var fi = ref this.fieldInfos[ordinal]; var startIdx = recordStart + (ordinal == 0 ? 
0 : this.fieldInfos[ordinal - 1].endIdx + 1); var endIdx = recordStart + fi.endIdx; @@ -1593,20 +1623,21 @@ CharSpan GetFieldUnsafe(int ordinal) var buffer = this.buffer; if (fi.quoteState != QuoteState.Unquoted) { - // if there are no escapes, we can just "trim" the quotes off - if (fi.quoteState != QuoteState.ImplicitQuotes) - { - offset += 1; - len -= 2; - } - - if (fi.quoteState == QuoteState.Quoted && fi.escapeCount == 0) - { - // happy path, nothing else to do - } - else + switch (fi.quoteState) { - return PrepareField(offset, len, fi.escapeCount); + case QuoteState.InvalidQuotes: + return PrepareInvalidField(offset, len); + case QuoteState.Quoted: + // trim the quotes + offset += 1; + len -= 2; + if (fi.escapeCount > 0) + { + goto case QuoteState.ImplicitQuotes; + } + break; + case QuoteState.ImplicitQuotes: // escaped + return PrepareField(offset, len, fi.escapeCount); } } return new CharSpan(buffer, offset, len); @@ -1619,11 +1650,10 @@ CharSpan PrepareField(int offset, int len, int escapeCount) var eLen = len - escapeCount; // if there is room in the buffer before the current record // we'll use that as scratch space to unescape the value - var temp = buffer; - if (recordStart < eLen) + if (scratchStr.Length < len) { // otherwise we'll allocate a buffer - temp = new char[eLen]; + scratchStr = new char[len]; } int i = 0; @@ -1651,7 +1681,7 @@ CharSpan PrepareField(int offset, int len, int escapeCount) else { // we should never get here. Bad fields should always be - // handled in "read" + // handled in "read" and end up in PrepareInvalidField throw new CsvFormatException(rowNumber, -1); } } @@ -1664,9 +1694,66 @@ CharSpan PrepareField(int offset, int len, int escapeCount) continue; } } - temp[d++] = c; + scratchStr[d++] = c; + } + return new CharSpan(scratchStr, 0, eLen); + } + + char[] scratchStr = Array.Empty(); + + CharSpan PrepareInvalidField(int offset, int len) + { + bool inQuote = false; + + // increase the scratch space if needed. 
+ if (scratchStr.Length < len) + { + scratchStr = new char[len]; + } + + int i = 0; + if (buffer[offset + i] == quote) + { + i++; + inQuote = true; + } + + + + int d = 0; + while (i < len) + { + var c = buffer[offset + i++]; + if (inQuote) + { + if (c == escape) + { + if (i < len) + { + c = buffer[offset + i++]; + if (c != quote && c != escape) + { + if (quote == escape) + { + // the escape we just saw was actually the closing quote + // the remainder of the field will be added verbatim + inQuote = false; + } + } + } + } + else + if (c == quote) + { + // we've found the broken closing quote + // skip it. + inQuote = false; + continue; + } + } + scratchStr[d++] = c; } - return new CharSpan(temp, 0, eLen); + return new CharSpan(scratchStr, 0, d); } /// @@ -1953,6 +2040,14 @@ public ReadOnlySpan GetRawRecordSpan() return this.buffer.AsSpan().Slice(this.recordStart, len); } + /// + /// Gets a span containing the current record data, including the line ending. + /// + public ReadOnlySpan GetRawFieldSpan(int ordinal) + { + throw new NotImplementedException(); + } + #endif /// diff --git a/source/Sylvan.Data.Csv/CsvStyle.cs b/source/Sylvan.Data.Csv/CsvStyle.cs index 55ba3bbd..c365f4c9 100644 --- a/source/Sylvan.Data.Csv/CsvStyle.cs +++ b/source/Sylvan.Data.Csv/CsvStyle.cs @@ -22,4 +22,9 @@ public enum CsvStyle /// Interprets fields as if they are implicitly quoted. Delimiters and new lines within fields are preceded by an escape character. /// Escaped = 2, + + /// + /// Parses CSV using lax quote handling where incorrectly quoted fields don't produce an error. 
+ /// + Lax = 3, } From 6c12354ba11ef49fc42388c8fe7b34d49f57db8a Mon Sep 17 00:00:00 2001 From: MarkPflug Date: Wed, 31 Jan 2024 13:21:31 -0800 Subject: [PATCH 2/6] comment --- source/Sylvan.Data.Csv/CsvDataReader.cs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/source/Sylvan.Data.Csv/CsvDataReader.cs b/source/Sylvan.Data.Csv/CsvDataReader.cs index c11f84be..ec6aa186 100644 --- a/source/Sylvan.Data.Csv/CsvDataReader.cs +++ b/source/Sylvan.Data.Csv/CsvDataReader.cs @@ -1680,8 +1680,8 @@ CharSpan PrepareField(int offset, int len, int escapeCount) } else { - // we should never get here. Bad fields should always be - // handled in "read" and end up in PrepareInvalidField + // we should never get here. Invalid fields should always be + // handled in ReadField and end up in PrepareInvalidField throw new CsvFormatException(rowNumber, -1); } } @@ -1701,6 +1701,8 @@ CharSpan PrepareField(int offset, int len, int escapeCount) char[] scratchStr = Array.Empty(); + // this should only be called in Lax mode, otherwise an exception + // would have been thrown in ReadField. 
CharSpan PrepareInvalidField(int offset, int len) { bool inQuote = false; @@ -1718,8 +1720,6 @@ CharSpan PrepareInvalidField(int offset, int len) inQuote = true; } - - int d = 0; while (i < len) { From 60ea4a4cc67603fbd1a138bc77902c34341b99d2 Mon Sep 17 00:00:00 2001 From: MarkPflug Date: Wed, 31 Jan 2024 13:24:03 -0800 Subject: [PATCH 3/6] rename tests --- .../CsvDataReaderTests.cs | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/source/Sylvan.Data.Csv.Tests/CsvDataReaderTests.cs b/source/Sylvan.Data.Csv.Tests/CsvDataReaderTests.cs index 752d4283..2869c160 100644 --- a/source/Sylvan.Data.Csv.Tests/CsvDataReaderTests.cs +++ b/source/Sylvan.Data.Csv.Tests/CsvDataReaderTests.cs @@ -1951,15 +1951,15 @@ public void Quotes(string data, bool valid, string expected) [Theory] // these are valid, and parse the same as the non-lax test - //[InlineData("a", "a")] - //[InlineData("\"\"", "")] - //[InlineData("\"\"\"\"", "\"")] - //[InlineData("\"\"\"\"\"\"", "\"\"")] - //[InlineData("\"a\"", "a")] - //[InlineData("\"a\"\"a\"", "a\"a")] - //[InlineData("\"a\"\"a\"\"a\"", "a\"a\"a")] - //[InlineData("a\"a\"a", "a\"a\"a")] - //[InlineData("a\"\"\"a", "a\"\"\"a")] + [InlineData("a", "a")] + [InlineData("\"\"", "")] + [InlineData("\"\"\"\"", "\"")] + [InlineData("\"\"\"\"\"\"", "\"\"")] + [InlineData("\"a\"", "a")] + [InlineData("\"a\"\"a\"", "a\"a")] + [InlineData("\"a\"\"a\"\"a\"", "a\"a\"a")] + [InlineData("a\"a\"a", "a\"a\"a")] + [InlineData("a\"\"\"a", "a\"\"\"a")] // these are invalid, but will still produce a string in lax mode. [InlineData("\"a\"\"\"a\"", "a\"a\"")] [InlineData("\"a\"a", "aa")] @@ -1968,8 +1968,8 @@ public void Quotes(string data, bool valid, string expected) [InlineData("\"\"a\"", "a\"")] //[InlineData("\"\"\"", "\"")] [InlineData("\"\"\"\"\"", "\"\"")] - - public void LaxQuotes(string data, string expected) + // test when invalid fields exist at the end of a file. 
+ public void LaxQuotesEnd(string data, string expected) { var r = new StringReader("a,b,c\n1,2,3\n" + data); var opts = new CsvDataReaderOptions { CsvStyle = CsvStyle.Lax }; @@ -1999,8 +1999,8 @@ public void LaxQuotes(string data, string expected) [InlineData("\"\"a\"", "a\"")] [InlineData("\"\"\"", "\"\n4,5,6\n")] [InlineData("\"\"\"\"\"", "\"\"\n4,5,6\n")] - - public void LaxQuotes2(string data, string expected) + // test when invalid fields exist in the middle of a file. + public void LaxQuotesMid(string data, string expected) { var r = new StringReader("a,b,c\n1,2,3\n" + data + "\n4,5,6\n"); var opts = new CsvDataReaderOptions { CsvStyle = CsvStyle.Lax }; From ea6d366b47c6cc824441f03627632f47c71eef5c Mon Sep 17 00:00:00 2001 From: MarkPflug Date: Thu, 1 Feb 2024 10:36:38 -0800 Subject: [PATCH 4/6] Consider GetRawFieldSpan in a future release. --- docs/Csv/Sylvan.Data.Csv.Releases.md | 4 ++++ .../Sylvan.Data.Csv.Tests/CsvDataReaderTests.cs | 15 +++++++++++++++ source/Sylvan.Data.Csv/CsvDataReader.cs | 8 -------- .../Sylvan.Data.Csv/CsvDataWriter+FieldWriter.cs | 8 ++++---- source/Sylvan.Data.Csv/Sylvan.Data.Csv.csproj | 3 ++- 5 files changed, 25 insertions(+), 13 deletions(-) diff --git a/docs/Csv/Sylvan.Data.Csv.Releases.md b/docs/Csv/Sylvan.Data.Csv.Releases.md index 4523d5bb..3af43c05 100644 --- a/docs/Csv/Sylvan.Data.Csv.Releases.md +++ b/docs/Csv/Sylvan.Data.Csv.Releases.md @@ -1,5 +1,9 @@ # Sylvan.Data.Csv Release Notes +_1.3.6_ +- Adds `CsvStyle.Lax` which allows parsing CSV files with invalid fields. In this mode, the parser will not produce exceptions, but will + do a "best effort" to parse invalid fields. + _1.3.5_ - Fixes a bug where fields could be incorrectly read when the final character was escaped when reading with `CsvStyle.Escaped`. 
diff --git a/source/Sylvan.Data.Csv.Tests/CsvDataReaderTests.cs b/source/Sylvan.Data.Csv.Tests/CsvDataReaderTests.cs index 2869c160..5b43513f 100644 --- a/source/Sylvan.Data.Csv.Tests/CsvDataReaderTests.cs +++ b/source/Sylvan.Data.Csv.Tests/CsvDataReaderTests.cs @@ -2013,6 +2013,21 @@ public void LaxQuotesMid(string data, string expected) #if NET6_0_OR_GREATER + [Fact] + public void GetRawFieldSpan() + { + var data = "a,b,c\n1,\"1,b\",\"\"\"quote\"\"\"\n"; + + var reader = CsvDataReader.Create(new StringReader(data)); + reader.Read(); + + Assert.Equal("1", reader.GetRawFieldSpan(0)); + Assert.Equal("\"1,b\"", reader.GetRawFieldSpan(1)); + Assert.Equal("\"\"\"quote\"\"\"", reader.GetRawFieldSpan(2)); + + } + + [Fact] public void DateOnlyFormatsCulture() { diff --git a/source/Sylvan.Data.Csv/CsvDataReader.cs b/source/Sylvan.Data.Csv/CsvDataReader.cs index ec6aa186..e1b327f8 100644 --- a/source/Sylvan.Data.Csv/CsvDataReader.cs +++ b/source/Sylvan.Data.Csv/CsvDataReader.cs @@ -2040,14 +2040,6 @@ public ReadOnlySpan GetRawRecordSpan() return this.buffer.AsSpan().Slice(this.recordStart, len); } - /// - /// Gets a span containing the current record data, including the line ending. 
- /// - public ReadOnlySpan GetRawFieldSpan(int ordinal) - { - throw new NotImplementedException(); - } - #endif /// diff --git a/source/Sylvan.Data.Csv/CsvDataWriter+FieldWriter.cs b/source/Sylvan.Data.Csv/CsvDataWriter+FieldWriter.cs index e7cf626d..697dc4b2 100644 --- a/source/Sylvan.Data.Csv/CsvDataWriter+FieldWriter.cs +++ b/source/Sylvan.Data.Csv/CsvDataWriter+FieldWriter.cs @@ -193,12 +193,12 @@ public override int Write(WriterContext context, int ordinal, char[] buffer, int public override byte[] GetValue(DbDataReader reader, int ordinal) { - throw new InvalidOperationException(); + throw new NotSupportedException(); } public override int WriteValue(WriterContext context, byte[] value, char[] buffer, int offset) { - throw new NotImplementedException(); + throw new NotSupportedException(); } } @@ -260,12 +260,12 @@ static int ToHexCharArray(byte[] dataBuffer, int offset, int length, char[] outp public override byte[] GetValue(DbDataReader reader, int ordinal) { - throw new InvalidOperationException(); + throw new NotSupportedException(); } public override int WriteValue(WriterContext context, byte[] value, char[] buffer, int offset) { - throw new NotImplementedException(); + throw new NotSupportedException(); } } diff --git a/source/Sylvan.Data.Csv/Sylvan.Data.Csv.csproj b/source/Sylvan.Data.Csv/Sylvan.Data.Csv.csproj index f13f8cbd..45ad86f5 100644 --- a/source/Sylvan.Data.Csv/Sylvan.Data.Csv.csproj +++ b/source/Sylvan.Data.Csv/Sylvan.Data.Csv.csproj @@ -2,7 +2,8 @@ net6.0;netstandard2.1;netstandard2.0 - 1.3.5 + 1.3.6 + b0001 A .NET library for reading and writing delimited CSV data. 
csv;delimited;data;datareader;datawriter;simd enable From f50cb01f443d5c081e7d109176262034b104dac2 Mon Sep 17 00:00:00 2001 From: MarkPflug Date: Thu, 1 Feb 2024 14:51:31 -0800 Subject: [PATCH 5/6] remove test --- .../Sylvan.Data.Csv.Tests/CsvDataReaderTests.cs | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/source/Sylvan.Data.Csv.Tests/CsvDataReaderTests.cs b/source/Sylvan.Data.Csv.Tests/CsvDataReaderTests.cs index 5b43513f..2869c160 100644 --- a/source/Sylvan.Data.Csv.Tests/CsvDataReaderTests.cs +++ b/source/Sylvan.Data.Csv.Tests/CsvDataReaderTests.cs @@ -2013,21 +2013,6 @@ public void LaxQuotesMid(string data, string expected) #if NET6_0_OR_GREATER - [Fact] - public void GetRawFieldSpan() - { - var data = "a,b,c\n1,\"1,b\",\"\"\"quote\"\"\"\n"; - - var reader = CsvDataReader.Create(new StringReader(data)); - reader.Read(); - - Assert.Equal("1", reader.GetRawFieldSpan(0)); - Assert.Equal("\"1,b\"", reader.GetRawFieldSpan(1)); - Assert.Equal("\"\"\"quote\"\"\"", reader.GetRawFieldSpan(2)); - - } - - [Fact] public void DateOnlyFormatsCulture() { From 6f83bde1557574ea1a78003d57e04c10629bd656 Mon Sep 17 00:00:00 2001 From: MarkPflug Date: Mon, 5 Feb 2024 09:16:01 -0800 Subject: [PATCH 6/6] Add CsvDataReaderOptions.Style to the documentation. --- docs/Csv/Options.md | 14 ++++++++++++++ source/Sylvan.Data.Csv/CsvStyle.cs | 3 +++ 2 files changed, 17 insertions(+) diff --git a/docs/Csv/Options.md b/docs/Csv/Options.md index 00038be2..d08baf15 100644 --- a/docs/Csv/Options.md +++ b/docs/Csv/Options.md @@ -83,6 +83,20 @@ __Culture__ The `CultureInfo` used when parsing primitive values. Defaults to `InvariantCulture`. +__Style__ + +Specifies the parsing mode to be used when reading a CSV file. + +*Standard*: This mode uses slightly modified RFC4180 parsing, that allows non-comma delimiters to be used. +Valid RFC 4180 files should parse as expected in this mode. + +*Escaped*: This mode uses escaping instead of quoting fields. 
+Any field delimiter, record delimiter (newline) or escape character in a field value will be escaped by a preceding escape character. + +*Lax*: This mode uses a more lenient parsing mode that will parse malformed fields and avoid throwing an exception. +This mode starts by parsing using the `Standard` style, and upon finding a closing quote will parse the remainder of the field +as if it were unquoted. + __OwnsReader__ Indicates if the `CsvDataReader` owns the TextReader and should dispose it when complete. Defaults to true. diff --git a/source/Sylvan.Data.Csv/CsvStyle.cs b/source/Sylvan.Data.Csv/CsvStyle.cs index c365f4c9..9f17e9d3 100644 --- a/source/Sylvan.Data.Csv/CsvStyle.cs +++ b/source/Sylvan.Data.Csv/CsvStyle.cs @@ -9,6 +9,7 @@ public enum CsvStyle { /// /// Parses using the standard RFC4180 mode. + /// Malformed fields will produce a during calls to . /// Standard = 1, @@ -25,6 +26,8 @@ public enum CsvStyle /// /// Parses CSV using lax quote handling where incorrectly quoted fields don't produce an error. + /// In this mode a field will be parsed using the mode, and when an (unescaped) closing quote is found, the remainder + /// of the field will be parsed as if it were unquoted. /// Lax = 3, }