Skip to content

Commit

Permalink
Add lax mode to csv reader. (#240)
Browse files Browse the repository at this point in the history
* All lax unit tests passing

* comment

* rename tests

* Consider GetRawFieldSpan in a future release.

* remove test

* Add CsvDataReaderOptions.Style to the documentation.
  • Loading branch information
MarkPflug authored Feb 5, 2024
1 parent f294b1c commit 1e90999
Show file tree
Hide file tree
Showing 7 changed files with 282 additions and 105 deletions.
14 changes: 14 additions & 0 deletions docs/Csv/Options.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,20 @@ __Culture__
The `CultureInfo` used when parsing primitive values. Defaults to
`InvariantCulture`.

__Style__

Specifies the parsing mode to be used when reading a CSV file.

*Standard*: This mode uses a slightly modified form of RFC 4180 parsing that allows non-comma delimiters to be used.
Valid RFC 4180 files should parse as expected in this mode.

*Escaped*: This mode uses escaping instead of quoting fields.
Any field delimiter, record delimiter (newline) or escape character in a field value will be escaped by a preceding escape character.

*Lax*: This mode parses leniently: malformed fields are parsed on a best-effort basis instead of causing an exception to be thrown.
This mode starts by parsing using the `Standard` style, and upon finding a closing quote will parse the remainder of the field
as if it were unquoted.

__OwnsReader__

Indicates if the `CsvDataReader` owns the TextReader and should dispose it when complete. Defaults to true.
Expand Down
4 changes: 4 additions & 0 deletions docs/Csv/Sylvan.Data.Csv.Releases.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# Sylvan.Data.Csv Release Notes

_1.3.6_
- Adds `CsvStyle.Lax` which allows parsing CSV files with invalid fields. In this mode, the parser will not produce exceptions, but will
do a "best effort" to parse invalid fields.

_1.3.5_
- Fixes a bug where fields could be incorrectly read when the final character was escaped when reading with `CsvStyle.Escaped`.

Expand Down
75 changes: 69 additions & 6 deletions source/Sylvan.Data.Csv.Tests/CsvDataReaderTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
using System.Globalization;
using System.IO;
using System.Linq;
using System.Numerics;
using System.Text;
using System.Threading.Tasks;
using Xunit;
Expand Down Expand Up @@ -949,7 +950,7 @@ public void Binary2()
[InlineData("N,V\na\\\nb,c\n", "a\nb", "c")]
[InlineData("N,V\na\\\r\nb\n", "a\r\nb", "")]
[InlineData("N,V\na\\\r\nb", "a\r\nb", "")]
public void ImpliedQuote(string input, string a, string b)
public void EscapedStyle(string input, string a, string b)
{
using var reader = new StringReader(input);
var options =
Expand Down Expand Up @@ -1884,14 +1885,14 @@ public void FinalCharInCellIsEscaped()
// Checks that input consisting solely of the escape character ("\") causes
// Read() to throw CsvFormatException when reading with CsvStyle.Escaped.
public void EscapeEOF()
{
using var reader = new StringReader("\\");

using var csv = CsvDataReader.Create(reader, new CsvDataReaderOptions
{
CsvStyle = CsvStyle.Escaped,
HasHeaders = false,
Escape = '\\',
});
Assert.Throws<CsvFormatException>(() => csv.Read());
// NOTE(review): this assertion is identical to the one above; in this diff view it is
// likely the removed/added pair of a whitespace-only change rather than an intentional
// second call. Confirm against the actual file before relying on it.
Assert.Throws<CsvFormatException>(() => csv.Read());
}

[Fact]
Expand Down Expand Up @@ -1921,7 +1922,7 @@ public void FinalCharInCellIsEscapeError()
[InlineData("\"a\"\"a\"\"a\"", true, "a\"a\"a")]
[InlineData("a\"a\"a", true, "a\"a\"a")]
[InlineData("a\"\"\"a", true, "a\"\"\"a")]

[InlineData("\"a\"\"\"a\"", false, null)]
[InlineData("\"a\"a", false, null)]
[InlineData("\"a\"a\"a\"", false, null)]
Expand All @@ -1935,18 +1936,80 @@ public void Quotes(string data, bool valid, string expected)
var r = new StringReader("a,b,c\n" + data);
var csv = CsvDataReader.Create(r);

if (valid) {
if (valid)
{
csv.Read();
var value = csv.GetString(0);
Assert.Equal(expected, value);
}
}
else
{
var ex = Assert.Throws<CsvFormatException>(() => csv.Read());
Assert.Equal(1, ex.RowNumber);
}
}

// Lax-style quote handling when the field under test is the final content of
// the input, i.e. there is no record delimiter after it.
// Each case supplies the raw text of the last record and the string expected
// from its first field.
[Theory]
// these are valid, and parse the same as the non-lax test
[InlineData("a", "a")]
[InlineData("\"\"", "")]
[InlineData("\"\"\"\"", "\"")]
[InlineData("\"\"\"\"\"\"", "\"\"")]
[InlineData("\"a\"", "a")]
[InlineData("\"a\"\"a\"", "a\"a")]
[InlineData("\"a\"\"a\"\"a\"", "a\"a\"a")]
[InlineData("a\"a\"a", "a\"a\"a")]
[InlineData("a\"\"\"a", "a\"\"\"a")]
// these are invalid, but will still produce a string in lax mode.
[InlineData("\"a\"\"\"a\"", "a\"a\"")]
[InlineData("\"a\"a", "aa")]
[InlineData("\"a\"a\"a\"", "aa\"a\"")]
[InlineData("\"\"a", "a")]
[InlineData("\"\"a\"", "a\"")]
// NOTE(review): the next case (quoted field left open at end of input) is disabled;
// the reason is not recorded here. Confirm the intended lax-mode result before enabling.
//[InlineData("\"\"\"", "\"")]
[InlineData("\"\"\"\"\"", "\"\"")]
public void LaxQuotesEnd(string data, string expected)
{
    // Dispose both readers deterministically, matching the other tests in this file.
    using var r = new StringReader("a,b,c\n1,2,3\n" + data);
    var opts = new CsvDataReaderOptions { CsvStyle = CsvStyle.Lax };
    using var csv = CsvDataReader.Create(r, opts);

    // Assert each Read() so a missing row fails here rather than inside GetString.
    Assert.True(csv.Read()); // skip the 1,2,3
    Assert.True(csv.Read()); // advance to the record under test

    var value = csv.GetString(0);
    Assert.Equal(expected, value);
}

// Lax-style quote handling when the field under test is followed by further
// records ("4,5,6") rather than the end of the input.
// Each case supplies the raw text of the middle record and the string expected
// from its first field.
[Theory]
// these are valid, and parse the same as the non-lax test
[InlineData("a", "a")]
[InlineData("\"\"", "")]
[InlineData("\"\"\"\"", "\"")]
[InlineData("\"\"\"\"\"\"", "\"\"")]
[InlineData("\"a\"", "a")]
[InlineData("\"a\"\"a\"", "a\"a")]
[InlineData("\"a\"\"a\"\"a\"", "a\"a\"a")]
[InlineData("a\"a\"a", "a\"a\"a")]
[InlineData("a\"\"\"a", "a\"\"\"a")]
// these are invalid, but will still produce a string in lax mode.
[InlineData("\"a\"\"\"a\"", "a\"a\"")]
[InlineData("\"a\"a", "aa")]
[InlineData("\"a\"a\"a\"", "aa\"a\"")]
[InlineData("\"\"a", "a")]
[InlineData("\"\"a\"", "a\"")]
// In the next two cases the quoted field is never closed, so the expected value
// runs on through the remainder of the input, including the following record.
[InlineData("\"\"\"", "\"\n4,5,6\n")]
[InlineData("\"\"\"\"\"", "\"\"\n4,5,6\n")]
public void LaxQuotesMid(string data, string expected)
{
    // Dispose both readers deterministically, matching the other tests in this file.
    using var r = new StringReader("a,b,c\n1,2,3\n" + data + "\n4,5,6\n");
    var opts = new CsvDataReaderOptions { CsvStyle = CsvStyle.Lax };
    using var csv = CsvDataReader.Create(r, opts);

    // Assert each Read() so a missing row fails here rather than inside GetString.
    Assert.True(csv.Read()); // skip the 1,2,3
    Assert.True(csv.Read()); // advance to the record under test

    var value = csv.GetString(0);
    Assert.Equal(expected, value);
}

#if NET6_0_OR_GREATER

Expand Down
Loading

0 comments on commit 1e90999

Please sign in to comment.