Skip to content

Commit

Permalink
Fix for escaping edge cases. (#225)
Browse files Browse the repository at this point in the history
Exception now thrown from "Read", instead of "Create" when first data row has format error.
  • Loading branch information
MarkPflug authored Nov 16, 2023
1 parent 38e2de5 commit 474263a
Show file tree
Hide file tree
Showing 4 changed files with 103 additions and 15 deletions.
58 changes: 49 additions & 9 deletions source/Sylvan.Data.Csv.Tests/CsvDataReaderTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -864,7 +864,11 @@ public void RowFieldCount3Test()
public void BadQuoteFirstRow()
{
using var tr = new StringReader("Name,Value\nA,\"B\"C\n");
var ex = Assert.Throws<CsvFormatException>(() => CsvDataReader.Create(tr, new CsvDataReaderOptions { Schema = CsvSchema.Nullable }));

var csv = CsvDataReader.Create(tr, new CsvDataReaderOptions { Schema = CsvSchema.Nullable });

var ex = Assert.Throws<CsvFormatException>(() => csv.Read());

Assert.Equal(1, ex.RowNumber);
}

Expand Down Expand Up @@ -1782,7 +1786,7 @@ public void DynamicReader()
public void DynamicSchema()
{
var data = new StringReader("a,b,c\n1,,3\n2022-11-12,12.4,1e5\n");

var reader = CsvDataReader.Create(data, new CsvDataReaderOptions { Schema = CsvSchema.Dynamic });

Assert.Equal(typeof(object), reader.GetFieldType(0));
Expand Down Expand Up @@ -1880,15 +1884,14 @@ public void FinalCharInCellIsEscaped()
public void EscapeEOF()
{
using var reader = new StringReader("\\");
Assert.Throws<InvalidDataException>(() =>

using var csv = CsvDataReader.Create(reader, new CsvDataReaderOptions
{
using var csvReader = CsvDataReader.Create(reader, new CsvDataReaderOptions
{
CsvStyle = CsvStyle.Escaped,
HasHeaders = false,
Escape = '\\',
});
CsvStyle = CsvStyle.Escaped,
HasHeaders = false,
Escape = '\\',
});
Assert.Throws<CsvFormatException>(() => csv.Read());
}

[Fact]
Expand All @@ -1907,6 +1910,43 @@ public void FinalCharInCellIsEscapeError()
Assert.Equal("\\\n", value0);
}

[Theory]
// These test cases were copied from the Sep parser library. Thanks, Nietras.
[InlineData("a", true, "a")]
[InlineData("\"\"", true, "")]
[InlineData("\"\"\"\"", true, "\"")]
[InlineData("\"\"\"\"\"\"", true, "\"\"")]
[InlineData("\"a\"", true, "a")]
[InlineData("\"a\"\"a\"", true, "a\"a")]
[InlineData("\"a\"\"a\"\"a\"", true, "a\"a\"a")]
[InlineData("a\"a\"a", true, "a\"a\"a")]
[InlineData("a\"\"\"a", true, "a\"\"\"a")]

[InlineData("\"a\"\"\"a\"", false, null)]
[InlineData("\"a\"a", false, null)]
[InlineData("\"a\"a\"a\"", false, null)]
[InlineData("\"\"a", false, null)]
[InlineData("\"\"a\"", false, null)]
[InlineData("\"\"\"", false, null)]
[InlineData("\"\"\"\"\"", false, null)]

public void Quotes(string data, bool valid, string expected)
{
var r = new StringReader("a,b,c\n" + data);
var csv = CsvDataReader.Create(r);

if (valid) {
csv.Read();
var value = csv.GetString(0);
Assert.Equal(expected, value);
}
else
{
var ex = Assert.Throws<CsvFormatException>(() => csv.Read());
Assert.Equal(1, ex.RowNumber);
}
}


#if NET6_0_OR_GREATER

Expand Down
15 changes: 15 additions & 0 deletions source/Sylvan.Data.Csv/CsvDataReader+Async.cs
Original file line number Diff line number Diff line change
Expand Up @@ -158,9 +158,15 @@ async Task<bool> InitializeReaderAsync(CancellationToken cancel = default)
{
if (carryRow || state == State.Open || await NextRecordAsync(cancel).ConfigureAwait(false))
{
if (pendingException != null)
{
// if the header row is malformed, the exception is throw from the call to Create
throw pendingException;
}
carryRow = false;
Initialize();
this.state = State.Initialized;
this.rowNumber++;
var hasNext = await NextRecordAsync(cancel).ConfigureAwait(false);
if (resultSetMode == ResultSetMode.SingleResult)
{
Expand Down Expand Up @@ -243,6 +249,10 @@ async Task<bool> NextRecordAsync(CancellationToken cancel = default)
}
if (result == ReadResult.False)
{
if(this.state == State.Open && pendingException != null)
{
throw pendingException;
}
return true;
}

Expand Down Expand Up @@ -293,6 +303,7 @@ async Task<int> FillBufferAsync(CancellationToken cancel = default)
public override async Task<bool> ReadAsync(CancellationToken cancellationToken)
{
cancellationToken.ThrowIfCancellationRequested();

this.rowNumber++;
if (this.state == State.Open)
{
Expand All @@ -309,6 +320,10 @@ public override async Task<bool> ReadAsync(CancellationToken cancellationToken)
}
else if (this.state == State.Initialized)
{
if (pendingException != null)
{
throw pendingException;
}
// after initizialization, the first record would already be in the buffer
// if hasRows is true.
if (hasRows)
Expand Down
8 changes: 8 additions & 0 deletions source/Sylvan.Data.Csv/CsvDataReader+Sync.cs
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,10 @@ bool NextRecord()
}
if (result == ReadResult.False)
{
if (this.state == State.Open && pendingException != null)
{
throw pendingException;
}
return true;
}

Expand Down Expand Up @@ -181,6 +185,10 @@ public override bool Read()
}
else if (this.state == State.Initialized)
{
if (pendingException != null)
{
throw pendingException;
}
// after initizialization, the first record would already be in the buffer
// if hasRows is true.
if (hasRows)
Expand Down
37 changes: 31 additions & 6 deletions source/Sylvan.Data.Csv/CsvDataReader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,9 @@ enum ReadResult
FieldInfo[] fieldInfos;
CsvColumn[] columns;

// An exception that was created with initializing, and should be thrown on the next call to Read/ReadAsync
Exception? pendingException;

// in multi-result set mode carryRow indicates that a row is already parsed
// and needs to be carried into the next result set.
bool carryRow;
Expand Down Expand Up @@ -204,7 +207,7 @@ private CsvDataReader(TextReader reader, char[]? buffer, CsvDataReaderOptions op
this.newLineMode = NewLineMode.Unknown;
}

static ColumnStringFactory BuildStringFactory(ColumnStringFactory? csf, StringFactory?sf)
static ColumnStringFactory BuildStringFactory(ColumnStringFactory? csf, StringFactory? sf)
{
if (csf != null)
{
Expand Down Expand Up @@ -354,7 +357,7 @@ public void Initialize()
}

#if INTRINSICS

Vector256<byte> delimiterMaskVector256;
Vector256<byte> lineEndMaskVector256;
Vector256<byte> quoteMaskVector256;
Expand Down Expand Up @@ -680,7 +683,7 @@ ReadResult ReadField(int fieldIdx)
if (IsEndOfLine(c))
{
idx--;// "unconsume" the newline character, so that ConsumeLineEnd can process it.
// if the escape precede an EOL, we might have to consume 2 chars
// if the escape precede an EOL, we might have to consume 2 chars
var r = ConsumeLineEnd(buffer, ref idx);
if (r == ReadResult.Incomplete)
{
Expand All @@ -695,7 +698,8 @@ ReadResult ReadField(int fieldIdx)
if (atEndOfText)
{
// there was nothing to escape
throw new InvalidDataException();
pendingException = new CsvFormatException(rowNumber, fieldIdx);
return ReadResult.False;
}
return ReadResult.Incomplete;
}
Expand Down Expand Up @@ -807,6 +811,14 @@ ReadResult ReadField(int fieldIdx)
complete = true;
break;
}
else
// this handles the case where we had a quoted field
if (c == quote && closeQuoteIdx >= 0)
{
this.pendingException = new CsvFormatException(rowNumber, fieldIdx);
return ReadResult.False;
}
else
if (IsEndOfLine(c))
{
idx--;
Expand All @@ -825,6 +837,16 @@ ReadResult ReadField(int fieldIdx)
last = true;
break;
}
}
else
{
if (closeQuoteIdx >= 0)
{
// if the field is quoted, we shouldn't be here.
// the only valid characters would be a delimiter, a new line, or EOF.
this.pendingException = new CsvFormatException(rowNumber, fieldIdx);
return ReadResult.False;
}
}
}

Expand Down Expand Up @@ -862,7 +884,8 @@ ReadResult ReadField(int fieldIdx)
else
{
var rowNumber = this.rowNumber == 0 && this.state == State.Initialized ? 1 : this.rowNumber;
throw new CsvFormatException(rowNumber, fieldIdx);
this.pendingException = new CsvFormatException(rowNumber, fieldIdx);
return ReadResult.False;
}
}
}
Expand Down Expand Up @@ -1627,7 +1650,9 @@ CharSpan PrepareField(int offset, int len, int escapeCount)
}
else
{
throw new InvalidDataException();
// we should never get here. Bad fields should always be
// handled in "read"
throw new CsvFormatException(rowNumber, -1);
}
}
else
Expand Down

0 comments on commit 474263a

Please sign in to comment.