Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add lax mode to csv reader. #240

Merged
merged 7 commits into from
Feb 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions docs/Csv/Options.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,20 @@ __Culture__
The `CultureInfo` used when parsing primitive values. Defaults to
`InvariantCulture`.

__Style__

Specifies the parsing mode to be used when reading a CSV file.

*Standard*: This mode uses slightly modified RFC 4180 parsing that allows non-comma delimiters to be used.
Valid RFC 4180 files should parse as expected in this mode.

*Escaped*: This mode uses escaping instead of quoting fields.
Any field delimiter, record delimiter (newline) or escape character in a field value will be escaped by a preceding escape character.

*Lax*: This mode parses more leniently: it will parse malformed fields instead of throwing an exception.
This mode starts by parsing using the `Standard` style, and upon finding a closing quote will parse the remainder of the field
as if it were unquoted.

__OwnsReader__

Indicates if the `CsvDataReader` owns the TextReader and should dispose it when complete. Defaults to true.
Expand Down
4 changes: 4 additions & 0 deletions docs/Csv/Sylvan.Data.Csv.Releases.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# Sylvan.Data.Csv Release Notes

_1.3.6_
- Adds `CsvStyle.Lax` which allows parsing CSV files with invalid fields. In this mode, the parser will not produce exceptions, but will
do a "best effort" to parse invalid fields.

_1.3.5_
- Fixes a bug where fields could be incorrectly read when the final character was escaped when reading with `CsvStyle.Escaped`.

Expand Down
75 changes: 69 additions & 6 deletions source/Sylvan.Data.Csv.Tests/CsvDataReaderTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
using System.Globalization;
using System.IO;
using System.Linq;
using System.Numerics;
using System.Text;
using System.Threading.Tasks;
using Xunit;
Expand Down Expand Up @@ -949,7 +950,7 @@ public void Binary2()
[InlineData("N,V\na\\\nb,c\n", "a\nb", "c")]
[InlineData("N,V\na\\\r\nb\n", "a\r\nb", "")]
[InlineData("N,V\na\\\r\nb", "a\r\nb", "")]
public void ImpliedQuote(string input, string a, string b)
public void EscapedStyle(string input, string a, string b)
{
using var reader = new StringReader(input);
var options =
Expand Down Expand Up @@ -1884,14 +1885,14 @@ public void FinalCharInCellIsEscaped()
public void EscapeEOF()
{
using var reader = new StringReader("\\");

using var csv = CsvDataReader.Create(reader, new CsvDataReaderOptions
{
CsvStyle = CsvStyle.Escaped,
HasHeaders = false,
Escape = '\\',
});
Assert.Throws<CsvFormatException>(() => csv.Read());
Assert.Throws<CsvFormatException>(() => csv.Read());
}

[Fact]
Expand Down Expand Up @@ -1921,7 +1922,7 @@ public void FinalCharInCellIsEscapeError()
[InlineData("\"a\"\"a\"\"a\"", true, "a\"a\"a")]
[InlineData("a\"a\"a", true, "a\"a\"a")]
[InlineData("a\"\"\"a", true, "a\"\"\"a")]

[InlineData("\"a\"\"\"a\"", false, null)]
[InlineData("\"a\"a", false, null)]
[InlineData("\"a\"a\"a\"", false, null)]
Expand All @@ -1935,18 +1936,80 @@ public void Quotes(string data, bool valid, string expected)
var r = new StringReader("a,b,c\n" + data);
var csv = CsvDataReader.Create(r);

if (valid) {
if (valid)
{
csv.Read();
var value = csv.GetString(0);
Assert.Equal(expected, value);
}
}
else
{
var ex = Assert.Throws<CsvFormatException>(() => csv.Read());
Assert.Equal(1, ex.RowNumber);
}
}

// Tests CsvStyle.Lax when the (possibly invalid) field is the last thing in the input:
// the data under test is appended after a header row and one well-formed record,
// with no trailing newline. Each case checks the string read from the first column.
[Theory]
// these are valid, and parse the same as the non-lax test
[InlineData("a", "a")]
[InlineData("\"\"", "")]
[InlineData("\"\"\"\"", "\"")]
[InlineData("\"\"\"\"\"\"", "\"\"")]
[InlineData("\"a\"", "a")]
[InlineData("\"a\"\"a\"", "a\"a")]
[InlineData("\"a\"\"a\"\"a\"", "a\"a\"a")]
[InlineData("a\"a\"a", "a\"a\"a")]
[InlineData("a\"\"\"a", "a\"\"\"a")]
// these are invalid, but will still produce a string in lax mode.
[InlineData("\"a\"\"\"a\"", "a\"a\"")]
[InlineData("\"a\"a", "aa")]
[InlineData("\"a\"a\"a\"", "aa\"a\"")]
[InlineData("\"\"a", "a")]
[InlineData("\"\"a\"", "a\"")]
// NOTE(review): the case below is disabled in the original source without explanation.
// Presumably an unterminated quoted field holding a single escaped quote at end-of-input
// does not yet yield the expected value; confirm before re-enabling.
//[InlineData("\"\"\"", "\"")]
[InlineData("\"\"\"\"\"", "\"\"")]
public void LaxQuotesEnd(string data, string expected)
{
    // Dispose both readers deterministically, matching the other tests in this file.
    using var reader = new StringReader("a,b,c\n1,2,3\n" + data);
    var opts = new CsvDataReaderOptions { CsvStyle = CsvStyle.Lax };
    using var csv = CsvDataReader.Create(reader, opts);

    csv.Read(); // skip the 1,2,3
    csv.Read(); // advance to the record under test
    var value = csv.GetString(0);

    Assert.Equal(expected, value);
}

// Tests CsvStyle.Lax when the (possibly invalid) field appears in the middle of the input:
// the data under test is placed between a well-formed record ("1,2,3") and a trailing
// record ("4,5,6"). Each case checks the string read from the first column.
[Theory]
// these are valid, and parse the same as the non-lax test
[InlineData("a", "a")]
[InlineData("\"\"", "")]
[InlineData("\"\"\"\"", "\"")]
[InlineData("\"\"\"\"\"\"", "\"\"")]
[InlineData("\"a\"", "a")]
[InlineData("\"a\"\"a\"", "a\"a")]
[InlineData("\"a\"\"a\"\"a\"", "a\"a\"a")]
[InlineData("a\"a\"a", "a\"a\"a")]
[InlineData("a\"\"\"a", "a\"\"\"a")]
// these are invalid, but will still produce a string in lax mode.
[InlineData("\"a\"\"\"a\"", "a\"a\"")]
[InlineData("\"a\"a", "aa")]
[InlineData("\"a\"a\"a\"", "aa\"a\"")]
[InlineData("\"\"a", "a")]
[InlineData("\"\"a\"", "a\"")]
// In the next two cases the quoted field is never closed, so the expected value
// includes the newline and the trailing "4,5,6" record text.
[InlineData("\"\"\"", "\"\n4,5,6\n")]
[InlineData("\"\"\"\"\"", "\"\"\n4,5,6\n")]
public void LaxQuotesMid(string data, string expected)
{
    // Dispose both readers deterministically, matching the other tests in this file.
    using var reader = new StringReader("a,b,c\n1,2,3\n" + data + "\n4,5,6\n");
    var opts = new CsvDataReaderOptions { CsvStyle = CsvStyle.Lax };
    using var csv = CsvDataReader.Create(reader, opts);

    csv.Read(); // skip the 1,2,3
    csv.Read(); // advance to the record under test
    var value = csv.GetString(0);

    Assert.Equal(expected, value);
}

#if NET6_0_OR_GREATER

Expand Down
Loading
Loading