Skip to content

Commit

Permalink
Conform ampersand-error reporting to HTML spec
Browse files Browse the repository at this point in the history
  • Loading branch information
sideshowbarker committed Aug 21, 2020
1 parent f30815d commit 5ae26d8
Showing 1 changed file with 36 additions and 25 deletions.
61 changes: 36 additions & 25 deletions src/nu/validator/htmlparser/impl/Tokenizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -3233,6 +3233,7 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
case '<':
case '&':
case '\u0000':
case ';':
emitOrAppendCharRefBuf(returnState);
if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
cstart = pos;
Expand Down Expand Up @@ -3261,17 +3262,12 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
firstCharKey = c - 'A';
} else {
// No match
/*
* If no match can be made, then this is a parse
* error.
*/
errNoNamedCharacterMatch();
emitOrAppendCharRefBuf(returnState);
if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
cstart = pos;
}
reconsume = true;
state = transition(state, returnState, reconsume, pos);
state = transition(state, Tokenizer.AMBIGUOUS_AMPERSAND, reconsume, pos);
continue stateloop;
}
// Didn't fail yet
Expand Down Expand Up @@ -3332,17 +3328,12 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
}
}
if (hilo == 0) {
/*
* If no match can be made, then this is a parse
* error.
*/
errNoNamedCharacterMatch();
emitOrAppendCharRefBuf(returnState);
if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
cstart = pos;
}
reconsume = true;
state = transition(state, returnState, reconsume, pos);
state = transition(state, Tokenizer.AMBIGUOUS_AMPERSAND, reconsume, pos);
continue stateloop;
}
// Didn't fail yet
Expand Down Expand Up @@ -3425,16 +3416,12 @@ private void ensureBufferSpace(int inputLength) throws SAXException {

if (candidate == -1) {
// reconsume deals with CR, LF or nul
/*
* If no match can be made, then this is a parse error.
*/
errNoNamedCharacterMatch();
emitOrAppendCharRefBuf(returnState);
if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
cstart = pos;
}
reconsume = true;
state = transition(state, returnState, reconsume, pos);
state = transition(state, Tokenizer.AMBIGUOUS_AMPERSAND, reconsume, pos);
continue stateloop;
} else {
// c can't be CR, LF or nul if we got here
Expand Down Expand Up @@ -3472,10 +3459,9 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
* after the U+0026 AMPERSAND (&) must be
* unconsumed, and nothing is returned.
*/
errNoNamedCharacterMatch();
appendCharRefBufToStrBuf();
reconsume = true;
state = transition(state, returnState, reconsume, pos);
state = transition(state, Tokenizer.AMBIGUOUS_AMPERSAND, reconsume, pos);
continue stateloop;
}
}
Expand Down Expand Up @@ -3538,6 +3524,37 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
* I'm ∉ I tell you.
*/
}
// XXX reorder point
case AMBIGUOUS_AMPERSAND:
/*
* Unlike the definition in the spec, we don't consume the
* next input character right away when entering this state;
* that's because our current implementation differs from
* the spec in that we've already consumed the relevant
* character *before* entering this state.
* Also, our implementation of this state has no looping.
* So we never stay in this state; instead, we always
* transition out from it back to returnState.
*/
state = returnState;
if (c == ';') {
errNoNamedCharacterMatch();
continue stateloop;
} else if ((c >= '0' && c <= '9')
|| (c >= 'A' && c <= 'Z')
|| (c >= 'a' && c <= 'z')) {
appendCharRefBuf(c);
emitOrAppendCharRefBuf(returnState);
if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
cstart = pos + 1;
}
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
continue stateloop;
}
continue stateloop;
case CONSUME_NCR:
if (++pos == endPos) {
break stateloop;
Expand Down Expand Up @@ -6632,7 +6649,6 @@ public void eof() throws SAXException {
state = returnState;
continue;
case CHARACTER_REFERENCE_HILO_LOOKUP:
errNoNamedCharacterMatch();
emitOrAppendCharRefBuf(returnState);
state = returnState;
continue;
Expand Down Expand Up @@ -6686,10 +6702,6 @@ public void eof() throws SAXException {
}

if (candidate == -1) {
/*
* If no match can be made, then this is a parse error.
*/
errNoNamedCharacterMatch();
emitOrAppendCharRefBuf(returnState);
state = returnState;
continue eofloop;
Expand Down Expand Up @@ -6727,7 +6739,6 @@ public void eof() throws SAXException {
* after the U+0026 AMPERSAND (&) must be
* unconsumed, and nothing is returned.
*/
errNoNamedCharacterMatch();
appendCharRefBufToStrBuf();
state = returnState;
continue eofloop;
Expand Down

0 comments on commit 5ae26d8

Please sign in to comment.