diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
new file mode 100644
index 00000000..3ef64062
--- /dev/null
+++ b/.github/workflows/build.yml
@@ -0,0 +1,29 @@
+name: Build
+
+on: [push, pull_request]
+
+jobs:
+  build:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        # See http://static.azul.com/zulu/bin/ for available JDK versions.
+        java: ['8', '11', '14', '15-ea', '16-ea']  # quoted: all versions are strings
+        os: [ubuntu-latest, macos-latest, windows-latest]
+    name: Java ${{ matrix.java }} (${{ matrix.os }})  # unique per matrix cell
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          submodules: true  # the html5lib-tests data is a git submodule
+      - name: Set up java
+        uses: actions/setup-java@v1
+        with:
+          java-version: ${{ matrix.java }}
+      - name: Cache Maven packages
+        uses: actions/cache@v2
+        with:
+          path: ~/.m2
+          key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }}
+          restore-keys: ${{ runner.os }}-m2
+      - name: Build with Maven
+        run: mvn -B verify
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 00000000..b469982b
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "test-src/test/resources/html5lib-tests"]
+	path = test-src/test/resources/html5lib-tests
+	url = https://github.com/html5lib/html5lib-tests.git
diff --git a/pom.xml b/pom.xml
index 41f46725..a03072f0 100644
--- a/pom.xml
+++ b/pom.xml
@@ -80,22 +80,13 @@
org.apache.maven.plugins
maven-compiler-plugin
-
- 1.5
+
+ 1.8
maven-antrun-plugin
1.7
-
-
- com.sun
- tools
- 1.5.0
- system
- ${java.home}/../lib/tools.jar
-
-
intitialize-sources
@@ -124,8 +115,12 @@
-
-
+
+
@@ -141,7 +136,11 @@
org.apache.maven.plugins
maven-surefire-plugin
- true
+ Html5libTest
+ true
+
+ ${project.build.testSourceDirectory}/test/resources
+
@@ -236,5 +235,26 @@
/usr/share/java
/usr/share/javadoc
UTF-8
+ false
+
+
+ Java 8
+
+ 1.8
+
+
+ true
+
+
+
+ com.sun
+ tools
+ 1.5.0
+ system
+ ${java.home}/../lib/tools.jar
+
+
+
+
diff --git a/src/nu/validator/htmlparser/io/HtmlInputStreamReader.java b/src/nu/validator/htmlparser/io/HtmlInputStreamReader.java
index 3de1af2a..4facce4a 100755
--- a/src/nu/validator/htmlparser/io/HtmlInputStreamReader.java
+++ b/src/nu/validator/htmlparser/io/HtmlInputStreamReader.java
@@ -1,6 +1,6 @@
/*
* Copyright (c) 2007 Henri Sivonen
- * Copyright (c) 2013 Mozilla Foundation
+ * Copyright (c) 2013-2020 Mozilla Foundation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -61,7 +61,7 @@
public final class HtmlInputStreamReader extends Reader implements
ByteReadable, Locator, Locator2 {
- private static final int SNIFFING_LIMIT = 1024;
+ private int sniffingLimit = 1024;
private final InputStream inputStream;
@@ -87,11 +87,9 @@ public final class HtmlInputStreamReader extends Reader implements
private boolean charsetBoundaryPassed = false;
- private final byte[] byteArray = new byte[4096]; // Length must be >=
+ private byte[] byteArray = new byte[4096]; // Length must be >= sniffingLimit
- // SNIFFING_LIMIT
-
- private final ByteBuffer byteBuffer = ByteBuffer.wrap(byteArray);
+ private ByteBuffer byteBuffer = ByteBuffer.wrap(byteArray);
private boolean needToNotifyTokenizer = false;
@@ -112,18 +110,27 @@ public final class HtmlInputStreamReader extends Reader implements
/**
* @param inputStream
* @param errorHandler
- * @param locator
+ * @param tokenizer
+ * @param driver
+ * @param heuristics
+ * @param sniffingLimit
* @throws IOException
* @throws SAXException
*/
public HtmlInputStreamReader(InputStream inputStream,
ErrorHandler errorHandler, Tokenizer tokenizer, Driver driver,
- Heuristics heuristics) throws SAXException, IOException {
+ Heuristics heuristics, int sniffingLimit)
+ throws SAXException, IOException {
this.inputStream = inputStream;
this.errorHandler = errorHandler;
this.tokenizer = tokenizer;
this.driver = driver;
this.sniffing = true;
+ if (sniffingLimit != -1) {
+ this.sniffingLimit = sniffingLimit;
+ this.byteArray = new byte[sniffingLimit];
+ this.byteBuffer = ByteBuffer.wrap(byteArray);
+ }
Encoding encoding = (new BomSniffer(this)).sniff();
if (encoding == null) {
position = 0;
@@ -178,6 +185,12 @@ public HtmlInputStreamReader(InputStream inputStream,
initDecoder();
}
+ public HtmlInputStreamReader(InputStream inputStream,
+ ErrorHandler errorHandler, Tokenizer tokenizer, Driver driver,
+ Heuristics heuristics) throws SAXException, IOException {
+ this(inputStream, errorHandler, tokenizer, driver, heuristics, -1); // -1 leaves the default sniffingLimit and buffer untouched
+ }
+
/**
*
*/
@@ -237,7 +250,7 @@ public HtmlInputStreamReader(InputStream inputStream,
if (charsetBoundaryPassed) {
readLen = byteArray.length - oldLimit;
} else {
- readLen = SNIFFING_LIMIT - oldLimit;
+ readLen = sniffingLimit - oldLimit;
}
int num = inputStream.read(byteArray, oldLimit, readLen);
if (num == -1) {
@@ -261,7 +274,7 @@ public HtmlInputStreamReader(InputStream inputStream,
} else if (cr == CoderResult.UNDERFLOW) {
int remaining = byteBuffer.remaining();
if (!charsetBoundaryPassed) {
- if (bytesRead + remaining >= SNIFFING_LIMIT) {
+ if (bytesRead + remaining >= sniffingLimit) {
needToNotifyTokenizer = true;
charsetBoundaryPassed = true;
}
@@ -389,12 +402,12 @@ public int readByte() throws IOException {
throw new IllegalStateException(
"readByte() called when not in the sniffing state.");
}
- if (position == SNIFFING_LIMIT) {
+ if (position == sniffingLimit) {
return -1;
} else if (position < limit) {
return byteArray[position++] & 0xFF;
} else {
- int num = inputStream.read(byteArray, limit, SNIFFING_LIMIT - limit);
+ int num = inputStream.read(byteArray, limit, sniffingLimit - limit);
if (num == -1) {
return -1;
} else {
diff --git a/test-src/nu/validator/htmlparser/test/EncodingTester.java b/test-src/nu/validator/htmlparser/test/EncodingTester.java
index 95cd3018..01d164c8 100644
--- a/test-src/nu/validator/htmlparser/test/EncodingTester.java
+++ b/test-src/nu/validator/htmlparser/test/EncodingTester.java
@@ -36,6 +36,10 @@
public class EncodingTester {
+ static int exitStatus = 0;
+
+ protected static int SNIFFING_LIMIT = 16384;
+
private final InputStream aggregateStream;
private final StringBuilder builder = new StringBuilder();
@@ -47,7 +51,7 @@ public EncodingTester(InputStream aggregateStream) {
this.aggregateStream = aggregateStream;
}
- private void runTests() throws IOException, SAXException {
+ void runTests() throws IOException, SAXException {
while (runTest()) {
// spin
}
@@ -59,10 +63,11 @@ private boolean runTest() throws IOException, SAXException {
}
UntilHashInputStream stream = new UntilHashInputStream(aggregateStream);
HtmlInputStreamReader reader = new HtmlInputStreamReader(stream, null,
- null, null, Heuristics.NONE);
+ null, null, Heuristics.NONE, SNIFFING_LIMIT);
Charset charset = reader.getCharset();
stream.close();
if (skipLabel()) {
+ exitStatus = 1;
System.err.println("Premature end of test data.");
return false;
}
@@ -73,6 +78,7 @@ private boolean runTest() throws IOException, SAXException {
case '\n':
break loop;
case -1:
+ exitStatus = 1;
System.err.println("Premature end of test data.");
return false;
default:
@@ -82,9 +88,9 @@ private boolean runTest() throws IOException, SAXException {
String sniffed = charset.name();
String expected = Encoding.forName(builder.toString()).newDecoder().charset().name();
if (expected.equalsIgnoreCase(sniffed)) {
- System.err.println("Success.");
// System.err.println(stream);
} else {
+ exitStatus = 1;
System.err.println("Failure. Expected: " + expected + " got "
+ sniffed + ".");
System.err.println(stream);
@@ -118,6 +124,7 @@ public static void main(String[] args) throws IOException, SAXException {
args[i]));
tester.runTests();
}
+ System.exit(exitStatus);
}
}
diff --git a/test-src/nu/validator/htmlparser/test/Html5libTest.java b/test-src/nu/validator/htmlparser/test/Html5libTest.java
new file mode 100644
index 00000000..724062e2
--- /dev/null
+++ b/test-src/nu/validator/htmlparser/test/Html5libTest.java
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) 2020 Mozilla Foundation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+package nu.validator.htmlparser.test;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URISyntaxException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.FileVisitResult;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.SimpleFileVisitor;
+import java.nio.file.attribute.BasicFileAttributes;
+import java.util.Objects;
+import java.util.function.Consumer;
+
+/**
+ * Runs the html5lib-tests suites found under the {@code /html5lib-tests}
+ * classpath resource through the encoding, tokenizer and tree testers.
+ *
+ * <p>Each {@code test*} method walks one sub-directory, feeds every file with
+ * the expected extension to the matching tester, and then fails if that
+ * tester recorded a non-zero {@code exitStatus}.
+ */
+public class Html5libTest {
+
+    /** Directory that the {@code /html5lib-tests} resource resolves to. */
+    private final Path testDir;
+
+    /**
+     * Locates the {@code /html5lib-tests} resource directory.
+     *
+     * @throws URISyntaxException if the resource URL is not a valid URI
+     * @throws NullPointerException if the resource cannot be found
+     */
+    public Html5libTest() throws URISyntaxException {
+        // getResource returns null when the directory is absent; attach a
+        // readable message instead of failing with a bare NullPointerException.
+        this.testDir = Paths.get(Objects.requireNonNull(
+                Html5libTest.class.getResource("/html5lib-tests"),
+                "/html5lib-tests resource directory not found").toURI());
+    }
+
+    /** Runs every {@code .dat} file under the {@code encoding} directory. */
+    public void testEncoding() throws Exception {
+        Files.walkFileTree(testDir.resolve("encoding"),
+                new TestVisitor(true, ".dat", file -> {
+                    try (InputStream in = Files.newInputStream(file)) {
+                        new EncodingTester(in).runTests();
+                    }
+                }));
+        failIfNonZero(EncodingTester.exitStatus, "Encoding test failed");
+    }
+
+    /** Runs every {@code .test} file under the {@code tokenizer} directory. */
+    public void testTokenizer() throws Exception {
+        Files.walkFileTree(testDir.resolve("tokenizer"),
+                new TestVisitor(true, ".test", file ->
+                        new TokenizerTester(getDoubleEscapedInput(file))
+                                .runTests()));
+        failIfNonZero(TokenizerTester.exitStatus, "Tokenizer test failed");
+    }
+
+    /** Runs every {@code .dat} file under {@code tree-construction}. */
+    public void testTree() throws Exception {
+        Files.walkFileTree(testDir.resolve("tree-construction"),
+                new TestVisitor(true, ".dat", file -> {
+                    try (InputStream in = Files.newInputStream(file)) {
+                        new TreeTester(in).runTests();
+                    }
+                }));
+        failIfNonZero(TreeTester.exitStatus, "Tree test failed");
+    }
+
+    /**
+     * Throws an {@link AssertionError} when {@code exitStatus} is non-zero;
+     * unlike the {@code assert} keyword this does not depend on {@code -ea}.
+     */
+    private static void failIfNonZero(int exitStatus, String message) {
+        if (exitStatus != 0) {
+            throw new AssertionError(message);
+        }
+    }
+
+    /**
+     * Reads {@code file} as UTF-8, collapses every doubled backslash that
+     * precedes a {@code u} into a single backslash, and returns the result
+     * as an in-memory UTF-8 stream.
+     */
+    private ByteArrayInputStream getDoubleEscapedInput(Path file)
+            throws IOException {
+        byte[] fileBytes = Files.readAllBytes(file);
+        String fileContent = new String(fileBytes, StandardCharsets.UTF_8);
+        String unescapedContent = fileContent.replace("\\\\u", "\\u");
+        byte[] newBytes = unescapedContent.getBytes(StandardCharsets.UTF_8);
+        return new ByteArrayInputStream(newBytes);
+    }
+
+    /**
+     * A {@link Consumer} of paths whose body may throw; any {@link Throwable}
+     * it raises is rethrown wrapped in an {@link AssertionError}.
+     */
+    private interface TestConsumer extends Consumer<Path> {
+
+        @Override
+        default void accept(Path t) {
+            try {
+                acceptTest(t);
+            } catch (Throwable e) {
+                throw new AssertionError(e);
+            }
+        }
+
+        void acceptTest(Path t) throws Throwable;
+
+    }
+
+    /**
+     * File visitor that passes every file whose name ends with the required
+     * extension to a runner, optionally skipping {@code scripted} directories.
+     */
+    private static class TestVisitor extends SimpleFileVisitor<Path> {
+
+        private final boolean skipScripted;
+
+        private final String requiredTestExtension;
+
+        private final TestConsumer runner;
+
+        private TestVisitor(boolean skipScripted, String requiredTestExtension,
+                TestConsumer runner) {
+            this.skipScripted = skipScripted;
+            this.requiredTestExtension = requiredTestExtension;
+            this.runner = runner;
+        }
+
+        @Override
+        public FileVisitResult preVisitDirectory(Path dir,
+                BasicFileAttributes attrs) throws IOException {
+            if (skipScripted
+                    && dir.getFileName().equals(Paths.get("scripted"))) {
+                return FileVisitResult.SKIP_SUBTREE;
+            }
+
+            return FileVisitResult.CONTINUE;
+        }
+
+        @Override
+        public FileVisitResult visitFile(Path file, BasicFileAttributes attrs)
+                throws IOException {
+            if (file.getFileName().toString().endsWith(requiredTestExtension)) {
+                runner.accept(file);
+            }
+            return FileVisitResult.CONTINUE;
+        }
+    }
+
+}
diff --git a/test-src/nu/validator/htmlparser/test/TokenizerTester.java b/test-src/nu/validator/htmlparser/test/TokenizerTester.java
index 52f96d2e..2db0395b 100644
--- a/test-src/nu/validator/htmlparser/test/TokenizerTester.java
+++ b/test-src/nu/validator/htmlparser/test/TokenizerTester.java
@@ -51,9 +51,11 @@
public class TokenizerTester {
+ static int exitStatus = 0;
+
private static JSONString PLAINTEXT = new JSONString("PLAINTEXT state");
- private static JSONString PCDATA = new JSONString("DATA state");
+ private static JSONString PCDATA = new JSONString("Data state");
private static JSONString RCDATA = new JSONString("RCDATA state");
@@ -95,7 +97,7 @@ private static boolean jsonDeepEquals(JSONValue one, JSONValue other) {
private final Writer writer;
- private TokenizerTester(InputStream stream) throws TokenStreamException,
+ public TokenizerTester(InputStream stream) throws TokenStreamException,
RecognitionException, UnsupportedEncodingException {
tokenHandler = new JSONArrayTokenHandler();
driver = new Driver(new ErrorReportingTokenizer(tokenHandler));
@@ -119,7 +121,7 @@ private TokenizerTester(InputStream stream) throws TokenStreamException,
}
}
- private void runTests() throws SAXException, IOException {
+ void runTests() throws SAXException, IOException {
for (JSONValue val : tests.getValue()) {
runTest((JSONObject) val);
}
@@ -179,9 +181,8 @@ private void runTestInner(String inputString, JSONArray expectedTokens,
try {
driver.tokenize(is);
JSONArray actualTokens = tokenHandler.getArray();
- if (jsonDeepEquals(actualTokens, expectedTokens)) {
- writer.write("Success\n");
- } else {
+ if (!jsonDeepEquals(actualTokens, expectedTokens)) {
+ exitStatus = 1;
writer.write("Failure\n");
writer.write(description);
writer.write("\nInput:\n");
@@ -193,6 +194,7 @@ private void runTestInner(String inputString, JSONArray expectedTokens,
writer.write("\n");
}
} catch (Throwable t) {
+ exitStatus = 1;
writer.write("Failure\n");
writer.write(description);
writer.write("\nInput:\n");
@@ -216,6 +218,7 @@ public static void main(String[] args) throws TokenStreamException,
args[i]));
tester.runTests();
}
+ System.exit(exitStatus);
}
}
diff --git a/test-src/nu/validator/htmlparser/test/TreeTester.java b/test-src/nu/validator/htmlparser/test/TreeTester.java
index af5de942..f3efa268 100644
--- a/test-src/nu/validator/htmlparser/test/TreeTester.java
+++ b/test-src/nu/validator/htmlparser/test/TreeTester.java
@@ -43,6 +43,8 @@ public class TreeTester {
private boolean streaming = false;
+ static int exitStatus = 0;
+
/**
* @param aggregateStream
*/
@@ -50,7 +52,7 @@ public TreeTester(InputStream aggregateStream) {
this.aggregateStream = new BufferedInputStream(aggregateStream);
}
- private void runTests() throws Throwable {
+ void runTests() throws Throwable {
if (aggregateStream.read() != '#') {
System.err.println("No hash at start!");
return;
@@ -221,9 +223,9 @@ private boolean runTest() throws Throwable {
* && expectedErrors.size() ==
* actualErrors.size()
*/) {
- System.err.println("Success.");
// System.err.println(stream);
} else {
+ exitStatus = 1;
System.err.print("Failure.\nData:\n" + stream + "\nExpected:\n"
+ expected + "Got: \n" + actual);
System.err.println("Expected errors:");
@@ -236,6 +238,7 @@ private boolean runTest() throws Throwable {
}
}
} catch (Throwable t) {
+ exitStatus = 1;
System.err.println("Failure.\nData:\n" + stream);
throw t;
}
@@ -266,6 +269,7 @@ public static void main(String[] args) throws Throwable {
TreeTester tester = new TreeTester(new FileInputStream(args[i]));
tester.runTests();
}
+ System.exit(exitStatus);
}
}
diff --git a/test-src/test/resources/html5lib-tests b/test-src/test/resources/html5lib-tests
new file mode 160000
index 00000000..6ddcf58b
--- /dev/null
+++ b/test-src/test/resources/html5lib-tests
@@ -0,0 +1 @@
+Subproject commit 6ddcf58bea5a01e616911050c173622f84297211