diff --git a/src/nu/validator/htmlparser/impl/Tokenizer.java b/src/nu/validator/htmlparser/impl/Tokenizer.java
index 7096e704..95a35b11 100755
--- a/src/nu/validator/htmlparser/impl/Tokenizer.java
+++ b/src/nu/validator/htmlparser/impl/Tokenizer.java
@@ -3233,6 +3233,7 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
case '<':
case '&':
case '\u0000':
+ case ';':
emitOrAppendCharRefBuf(returnState);
if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
cstart = pos;
@@ -3261,17 +3262,12 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
firstCharKey = c - 'A';
} else {
// No match
- /*
- * If no match can be made, then this is a parse
- * error.
- */
- errNoNamedCharacterMatch();
emitOrAppendCharRefBuf(returnState);
if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
cstart = pos;
}
reconsume = true;
- state = transition(state, returnState, reconsume, pos);
+ state = transition(state, Tokenizer.AMBIGUOUS_AMPERSAND, reconsume, pos);
continue stateloop;
}
// Didn't fail yet
@@ -3332,17 +3328,12 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
}
}
if (hilo == 0) {
- /*
- * If no match can be made, then this is a parse
- * error.
- */
- errNoNamedCharacterMatch();
emitOrAppendCharRefBuf(returnState);
if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
cstart = pos;
}
reconsume = true;
- state = transition(state, returnState, reconsume, pos);
+ state = transition(state, Tokenizer.AMBIGUOUS_AMPERSAND, reconsume, pos);
continue stateloop;
}
// Didn't fail yet
@@ -3425,16 +3416,12 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
if (candidate == -1) {
// reconsume deals with CR, LF or nul
- /*
- * If no match can be made, then this is a parse error.
- */
- errNoNamedCharacterMatch();
emitOrAppendCharRefBuf(returnState);
if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
cstart = pos;
}
reconsume = true;
- state = transition(state, returnState, reconsume, pos);
+ state = transition(state, Tokenizer.AMBIGUOUS_AMPERSAND, reconsume, pos);
continue stateloop;
} else {
// c can't be CR, LF or nul if we got here
@@ -3472,10 +3459,9 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
* after the U+0026 AMPERSAND (&) must be
* unconsumed, and nothing is returned.
*/
- errNoNamedCharacterMatch();
appendCharRefBufToStrBuf();
reconsume = true;
- state = transition(state, returnState, reconsume, pos);
+ state = transition(state, Tokenizer.AMBIGUOUS_AMPERSAND, reconsume, pos);
continue stateloop;
}
}
@@ -3538,6 +3524,37 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
* I'm ∉ I tell you.
*/
}
+ // XXX reorder point
+ case AMBIGUOUS_AMPERSAND:
+ /*
+ * Unlike the definition in the spec, we don't consume the
+ * next input character right away when entering this state;
+ * that's because our current implementation differs from
+ * the spec in that we've already consumed the relevant
+ * character *before* entering this state.
+ * Also, our implementation of this state has no looping,
+ * so we never stay in this state; instead, we always
+ * transition out of it back to returnState.
+ */
+ state = returnState;
+ if (c == ';') {
+ errNoNamedCharacterMatch();
+ continue stateloop;
+ } else if ((c >= '0' && c <= '9')
+ || (c >= 'A' && c <= 'Z')
+ || (c >= 'a' && c <= 'z')) {
+ appendCharRefBuf(c);
+ emitOrAppendCharRefBuf(returnState);
+ if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
+ cstart = pos + 1;
+ }
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ continue stateloop;
+ }
+ continue stateloop;
case CONSUME_NCR:
if (++pos == endPos) {
break stateloop;
@@ -6632,7 +6649,6 @@ public void eof() throws SAXException {
state = returnState;
continue;
case CHARACTER_REFERENCE_HILO_LOOKUP:
- errNoNamedCharacterMatch();
emitOrAppendCharRefBuf(returnState);
state = returnState;
continue;
@@ -6686,10 +6702,6 @@ public void eof() throws SAXException {
}
if (candidate == -1) {
- /*
- * If no match can be made, then this is a parse error.
- */
- errNoNamedCharacterMatch();
emitOrAppendCharRefBuf(returnState);
state = returnState;
continue eofloop;
@@ -6727,7 +6739,6 @@ public void eof() throws SAXException {
* after the U+0026 AMPERSAND (&) must be
* unconsumed, and nothing is returned.
*/
- errNoNamedCharacterMatch();
appendCharRefBufToStrBuf();
state = returnState;
continue eofloop;