Skip to content

Commit

Permalink
Merge branch 'main' into binder_fix
Browse files Browse the repository at this point in the history
  • Loading branch information
MarkPflug committed Feb 5, 2024
2 parents 017dfd8 + dac42ee commit b5760f9
Show file tree
Hide file tree
Showing 16 changed files with 379 additions and 288 deletions.
4 changes: 2 additions & 2 deletions Directory.Build.props
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@
<Authors>$(Owner)</Authors>
<PackageIconUrl>https://markpflug.github.io/Sylvan.png</PackageIconUrl>
<PackageIcon>Sylvan.png</PackageIcon>
<Copyright>© 2023 $(Owner)</Copyright>
<Copyright>© 2024 $(Owner)</Copyright>
<PackageLicenseFile>license.txt</PackageLicenseFile>
<LangVersion>11.0</LangVersion>
<LangVersion>12.0</LangVersion>
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
<WarningsNotAsErrors>CS1030;CA1835;$(WarningsNotAsErrors)</WarningsNotAsErrors>

Expand Down
42 changes: 39 additions & 3 deletions docs/Csv/Options.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,34 @@ These defaults to `null`, which attempt to parse the values as the default "true

If either `TrueString` or `FalseString` are non-null, then that value is the singular, case-insensitive string that will be interpreted as the associated boolean value. If only one of the two is assigned it causes all other values to be interpreted as the negation. If both are assigned any value that is not one or the other will result in a `FormatException` being thrown.

__DateFormat__

__DateTimeFormat__
The format string used to parse `DateTime` values. This defaults to null, which will result in values being parsed using the provided `CultureInfo`.

Some CSV data sources use a compact date format like `"yyyyMMdd"` which cannot be parsed by default date parsing behavior, in which case this option allows parsing such values.

__DateTimeOffsetFormat__
The format string used when parsing `DateTimeOffset` values.
This defaults to null, which will result in values being parsed using the provided `CultureInfo`.

__TimeSpanFormat__
The format string used when parsing `TimeSpan` values. This defaults to null, which will result in values being parsed using the provided `CultureInfo`.

__TimeOnlyFormat__
The format string used when parsing `TimeOnly` values. This option is only available when using .NET 6 or greater.
This defaults to null, which will result in values being parsed using the provided `CultureInfo`.

__DateOnlyFormat__
The format string used when parsing `DateOnly` values. This option is only available when using .NET 6 or greater.
This defaults to null, which will result in values being parsed using the provided `CultureInfo`.

__DateFormat__

**Obsolete**, Use DateTimeFormat instead.

__TimeFormat__

**Obsolete**, Use TimeOnlyFormat instead.

__BinaryEncoding__

The encoding format used to interpret binary data, either Base64 or Hexadecimal. Hexadecimal values can optionally be prefixed with "0x".
Expand Down Expand Up @@ -61,6 +83,20 @@ __Culture__
The `CultureInfo` used when parsing primitive values. Defaults to
`InvariantCulture`.

__Style__

Specifies the parsing mode to be used when reading a CSV file.

*Standard*: This mode uses slightly modified RFC4180 parsing, that allows non-comma delimiters to be used.
Valid RFC 4180 files should parse as expected in this mode.

*Escaped*: This mode uses escaping instead of quoting fields.
Any field delimiter, record delimiter (newline) or escape character in a field value will be escaped by a preceding escape character.

*Lax*: This mode uses a more lenient parsing mode that will parse malformed fields and avoid throwing an exception.
This mode starts by parsing using the `Standard` style, and upon finding a closing quote will parse the remainder of the field
as if it were unquoted.

__OwnsReader__

Indicates if the `CsvDataReader` owns the TextReader and should dispose it when complete. Defaults to true.
Expand Down Expand Up @@ -122,4 +158,4 @@ static string Pool(char[] buf, int offset, int length)
// anything else just construct normally (or call a nested factory)
return new string(buf, offset, length);
}
```
```
4 changes: 4 additions & 0 deletions docs/Csv/Sylvan.Data.Csv.Releases.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# Sylvan.Data.Csv Release Notes

_1.3.6_
- Adds `CsvStyle.Lax` which allows parsing CSV files with invalid fields. In this mode, the parser will not produce exceptions, but will
do a "best effort" to parse invalid fields.

_1.3.5_
- Fixes a bug where fields could be incorrectly read when the final character was escaped when reading with `CsvStyle.Escaped`.

Expand Down
2 changes: 1 addition & 1 deletion license.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
MIT License

Copyright (c) 2022 Mark Pflug
Copyright (c) 2024 Mark Pflug

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down
75 changes: 69 additions & 6 deletions source/Sylvan.Data.Csv.Tests/CsvDataReaderTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
using System.Globalization;
using System.IO;
using System.Linq;
using System.Numerics;
using System.Text;
using System.Threading.Tasks;
using Xunit;
Expand Down Expand Up @@ -949,7 +950,7 @@ public void Binary2()
[InlineData("N,V\na\\\nb,c\n", "a\nb", "c")]
[InlineData("N,V\na\\\r\nb\n", "a\r\nb", "")]
[InlineData("N,V\na\\\r\nb", "a\r\nb", "")]
public void ImpliedQuote(string input, string a, string b)
public void EscapedStyle(string input, string a, string b)
{
using var reader = new StringReader(input);
var options =
Expand Down Expand Up @@ -1884,14 +1885,14 @@ public void FinalCharInCellIsEscaped()
public void EscapeEOF()
{
using var reader = new StringReader("\\");

using var csv = CsvDataReader.Create(reader, new CsvDataReaderOptions
{
CsvStyle = CsvStyle.Escaped,
HasHeaders = false,
Escape = '\\',
});
Assert.Throws<CsvFormatException>(() => csv.Read());
Assert.Throws<CsvFormatException>(() => csv.Read());
}

[Fact]
Expand Down Expand Up @@ -1921,7 +1922,7 @@ public void FinalCharInCellIsEscapeError()
[InlineData("\"a\"\"a\"\"a\"", true, "a\"a\"a")]
[InlineData("a\"a\"a", true, "a\"a\"a")]
[InlineData("a\"\"\"a", true, "a\"\"\"a")]

[InlineData("\"a\"\"\"a\"", false, null)]
[InlineData("\"a\"a", false, null)]
[InlineData("\"a\"a\"a\"", false, null)]
Expand All @@ -1935,18 +1936,80 @@ public void Quotes(string data, bool valid, string expected)
var r = new StringReader("a,b,c\n" + data);
var csv = CsvDataReader.Create(r);

if (valid) {
if (valid)
{
csv.Read();
var value = csv.GetString(0);
Assert.Equal(expected, value);
}
}
else
{
var ex = Assert.Throws<CsvFormatException>(() => csv.Read());
Assert.Equal(1, ex.RowNumber);
}
}

/// <summary>
/// Verifies that, with <c>CsvStyle.Lax</c>, the first field of the final record is read
/// as the expected string when that record is the last thing in the input
/// (no trailing newline), for both well-formed and malformed quoted fields.
/// </summary>
/// <param name="data">The raw text of the final record appended after the header and first row.</param>
/// <param name="expected">The string expected from the first field of that record.</param>
[Theory]
// these are valid, and parse the same as the non-lax test
[InlineData("a", "a")]
[InlineData("\"\"", "")]
[InlineData("\"\"\"\"", "\"")]
[InlineData("\"\"\"\"\"\"", "\"\"")]
[InlineData("\"a\"", "a")]
[InlineData("\"a\"\"a\"", "a\"a")]
[InlineData("\"a\"\"a\"\"a\"", "a\"a\"a")]
[InlineData("a\"a\"a", "a\"a\"a")]
[InlineData("a\"\"\"a", "a\"\"\"a")]
// these are invalid, but will still produce a string in lax mode.
[InlineData("\"a\"\"\"a\"", "a\"a\"")]
[InlineData("\"a\"a", "aa")]
[InlineData("\"a\"a\"a\"", "aa\"a\"")]
[InlineData("\"\"a", "a")]
[InlineData("\"\"a\"", "a\"")]
// NOTE(review): the following case is disabled in the original; the reason is not
// recorded here. Confirm whether it should be re-enabled or removed.
//[InlineData("\"\"\"", "\"")]
[InlineData("\"\"\"\"\"", "\"\"")]
public void LaxQuotesEnd(string data, string expected)
{
    // Dispose the reader and the CSV reader deterministically, matching the
    // `using var` pattern used by the other tests in this file.
    using var reader = new StringReader("a,b,c\n1,2,3\n" + data);
    var options = new CsvDataReaderOptions { CsvStyle = CsvStyle.Lax };
    using var csv = CsvDataReader.Create(reader, options);

    csv.Read(); // skip the 1,2,3
    csv.Read(); // advance to the record under test

    var value = csv.GetString(0);
    Assert.Equal(expected, value);
}

/// <summary>
/// Verifies that, with <c>CsvStyle.Lax</c>, the first field of a record is read as the
/// expected string when that record is followed by further data ("4,5,6"), for both
/// well-formed and malformed quoted fields.
/// </summary>
/// <param name="data">The raw text of the record placed between the first row and the trailing "4,5,6" row.</param>
/// <param name="expected">The string expected from the first field of that record.</param>
[Theory]
// these are valid, and parse the same as the non-lax test
[InlineData("a", "a")]
[InlineData("\"\"", "")]
[InlineData("\"\"\"\"", "\"")]
[InlineData("\"\"\"\"\"\"", "\"\"")]
[InlineData("\"a\"", "a")]
[InlineData("\"a\"\"a\"", "a\"a")]
[InlineData("\"a\"\"a\"\"a\"", "a\"a\"a")]
[InlineData("a\"a\"a", "a\"a\"a")]
[InlineData("a\"\"\"a", "a\"\"\"a")]
// these are invalid, but will still produce a string in lax mode.
[InlineData("\"a\"\"\"a\"", "a\"a\"")]
[InlineData("\"a\"a", "aa")]
[InlineData("\"a\"a\"a\"", "aa\"a\"")]
[InlineData("\"\"a", "a")]
[InlineData("\"\"a\"", "a\"")]
// in the next two cases the expected value includes the text of the trailing "4,5,6" record.
[InlineData("\"\"\"", "\"\n4,5,6\n")]
[InlineData("\"\"\"\"\"", "\"\"\n4,5,6\n")]
public void LaxQuotesMid(string data, string expected)
{
    // Dispose the reader and the CSV reader deterministically, matching the
    // `using var` pattern used by the other tests in this file.
    using var reader = new StringReader("a,b,c\n1,2,3\n" + data + "\n4,5,6\n");
    var options = new CsvDataReaderOptions { CsvStyle = CsvStyle.Lax };
    using var csv = CsvDataReader.Create(reader, options);

    csv.Read(); // skip the 1,2,3
    csv.Read(); // advance to the record under test

    var value = csv.GetString(0);
    Assert.Equal(expected, value);
}

#if NET6_0_OR_GREATER

Expand Down
2 changes: 1 addition & 1 deletion source/Sylvan.Data.Csv.Tests/Sylvan.Data.Csv.Tests.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
<ItemGroup>
<PackageReference Include="Microsoft.Bcl.AsyncInterfaces" Version="5.0.0" />
<PackageReference Include="System.Buffers" Version="4.4.0" />
<PackageReference Include="System.Data.SqlClient" Version="4.8.5" />
<PackageReference Include="System.Data.SqlClient" Version="4.8.6" />
<PackageReference Include="System.IO.Compression" Version="4.3.0" />
<PackageReference Include="System.Net.Http" Version="4.3.4" />
<PackageReference Include="System.Runtime.CompilerServices.Unsafe" Version="5.0.0" />
Expand Down
Loading

0 comments on commit b5760f9

Please sign in to comment.