From 05b9024ba7c80b7561e9d2dfeedd714c9394c28d Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Fri, 4 Sep 2020 13:48:35 +0900 Subject: [PATCH 01/15] Include com.sun.tools as dependency only if Java 8 This change causes com.sun.tools to be included as a dependency only if the JDK version is 1.8. Otherwise, without this change, the build fails when run under any Java version later than Java 8. --- pom.xml | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/pom.xml b/pom.xml index 41f46725..d8443fb6 100644 --- a/pom.xml +++ b/pom.xml @@ -87,15 +87,6 @@ maven-antrun-plugin 1.7 - - - com.sun - tools - 1.5.0 - system - ${java.home}/../lib/tools.jar - - intitialize-sources @@ -237,4 +228,21 @@ /usr/share/javadoc UTF-8 + + + Java 8 + + 1.8 + + + + com.sun + tools + 1.5.0 + system + ${java.home}/../lib/tools.jar + + + + From a27c7b62dd4479b6d282c621fcc99bfc6ee36015 Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Fri, 4 Sep 2020 13:50:54 +0900 Subject: [PATCH 02/15] Set fork=true for Maven AntRun javac/java if Java8 This change causes the Maven AntRun plugin to invoke the javac and java commands with fork=true when run in a Java 8 environment. Otherwise, without this change, the build fails when run under Java 8. 
--- pom.xml | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index d8443fb6..0ce02f71 100644 --- a/pom.xml +++ b/pom.xml @@ -115,8 +115,12 @@ - - + + @@ -227,6 +231,7 @@ /usr/share/java /usr/share/javadoc UTF-8 + false @@ -234,6 +239,9 @@ 1.8 + + true + com.sun From 623cea1584f699a0493c6e2114f104b97cdf3837 Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Wed, 5 Aug 2020 22:08:37 +0900 Subject: [PATCH 03/15] Fix TokenizerTester casing of PCDATA state name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change updates the TokenizerTester code to expect its input test data to have the string "Data state" to identify PCDATA tests — rather than the string "DATA state". The test data in the html5lib-tests suite uses "Data state", so without this change, running TokenizerTester against html5lib-tests causes TokenizerTester to fail with a “Broken test data” harness failure. --- test-src/nu/validator/htmlparser/test/TokenizerTester.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test-src/nu/validator/htmlparser/test/TokenizerTester.java b/test-src/nu/validator/htmlparser/test/TokenizerTester.java index 52f96d2e..cfe77c41 100644 --- a/test-src/nu/validator/htmlparser/test/TokenizerTester.java +++ b/test-src/nu/validator/htmlparser/test/TokenizerTester.java @@ -53,7 +53,7 @@ public class TokenizerTester { private static JSONString PLAINTEXT = new JSONString("PLAINTEXT state"); - private static JSONString PCDATA = new JSONString("DATA state"); + private static JSONString PCDATA = new JSONString("Data state"); private static JSONString RCDATA = new JSONString("RCDATA state"); From d419264a99a3279a79e357e8b922ba209281c912 Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Mon, 17 Aug 2020 16:11:03 +0900 Subject: [PATCH 04/15] Exit 1 for test harness if any tests fail MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit This change makes TokenizerTester, TreeTester, and EncodingTester exit with status code 1 if any test cases fail. Otherwise, without this change, we won’t catch the test failures when running tests under CI. --- test-src/nu/validator/htmlparser/test/EncodingTester.java | 6 ++++++ test-src/nu/validator/htmlparser/test/TokenizerTester.java | 5 +++++ test-src/nu/validator/htmlparser/test/TreeTester.java | 5 +++++ 3 files changed, 16 insertions(+) diff --git a/test-src/nu/validator/htmlparser/test/EncodingTester.java b/test-src/nu/validator/htmlparser/test/EncodingTester.java index 95cd3018..5ae87d5c 100644 --- a/test-src/nu/validator/htmlparser/test/EncodingTester.java +++ b/test-src/nu/validator/htmlparser/test/EncodingTester.java @@ -36,6 +36,8 @@ public class EncodingTester { + private static int exitStatus = 0; + private final InputStream aggregateStream; private final StringBuilder builder = new StringBuilder(); @@ -63,6 +65,7 @@ private boolean runTest() throws IOException, SAXException { Charset charset = reader.getCharset(); stream.close(); if (skipLabel()) { + exitStatus = 1; System.err.println("Premature end of test data."); return false; } @@ -73,6 +76,7 @@ private boolean runTest() throws IOException, SAXException { case '\n': break loop; case -1: + exitStatus = 1; System.err.println("Premature end of test data."); return false; default: @@ -85,6 +89,7 @@ private boolean runTest() throws IOException, SAXException { System.err.println("Success."); // System.err.println(stream); } else { + exitStatus = 1; System.err.println("Failure. 
Expected: " + expected + " got " + sniffed + "."); System.err.println(stream); @@ -118,6 +123,7 @@ public static void main(String[] args) throws IOException, SAXException { args[i])); tester.runTests(); } + System.exit(exitStatus); } } diff --git a/test-src/nu/validator/htmlparser/test/TokenizerTester.java b/test-src/nu/validator/htmlparser/test/TokenizerTester.java index cfe77c41..275ece18 100644 --- a/test-src/nu/validator/htmlparser/test/TokenizerTester.java +++ b/test-src/nu/validator/htmlparser/test/TokenizerTester.java @@ -51,6 +51,8 @@ public class TokenizerTester { + private static int exitStatus = 0; + private static JSONString PLAINTEXT = new JSONString("PLAINTEXT state"); private static JSONString PCDATA = new JSONString("Data state"); @@ -182,6 +184,7 @@ private void runTestInner(String inputString, JSONArray expectedTokens, if (jsonDeepEquals(actualTokens, expectedTokens)) { writer.write("Success\n"); } else { + exitStatus = 1; writer.write("Failure\n"); writer.write(description); writer.write("\nInput:\n"); @@ -193,6 +196,7 @@ private void runTestInner(String inputString, JSONArray expectedTokens, writer.write("\n"); } } catch (Throwable t) { + exitStatus = 1; writer.write("Failure\n"); writer.write(description); writer.write("\nInput:\n"); @@ -216,6 +220,7 @@ public static void main(String[] args) throws TokenStreamException, args[i])); tester.runTests(); } + System.exit(exitStatus); } } diff --git a/test-src/nu/validator/htmlparser/test/TreeTester.java b/test-src/nu/validator/htmlparser/test/TreeTester.java index af5de942..d90864c6 100644 --- a/test-src/nu/validator/htmlparser/test/TreeTester.java +++ b/test-src/nu/validator/htmlparser/test/TreeTester.java @@ -43,6 +43,8 @@ public class TreeTester { private boolean streaming = false; + private static int exitStatus = 0; + /** * @param aggregateStream */ @@ -224,6 +226,7 @@ private boolean runTest() throws Throwable { System.err.println("Success."); // System.err.println(stream); } else { + 
exitStatus = 1; System.err.print("Failure.\nData:\n" + stream + "\nExpected:\n" + expected + "Got: \n" + actual); System.err.println("Expected errors:"); @@ -236,6 +239,7 @@ private boolean runTest() throws Throwable { } } } catch (Throwable t) { + exitStatus = 1; System.err.println("Failure.\nData:\n" + stream); throw t; } @@ -266,6 +270,7 @@ public static void main(String[] args) throws Throwable { TreeTester tester = new TreeTester(new FileInputStream(args[i])); tester.runTests(); } + System.exit(exitStatus); } } From dd7ecfa3cca3980037667d9f74a29a683c7a278b Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Mon, 17 Aug 2020 16:43:53 +0900 Subject: [PATCH 05/15] =?UTF-8?q?Drop=20=E2=80=9CSuccess=E2=80=9D=20messag?= =?UTF-8?q?e=20for=20test=20passes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change makes TokenizerTester, TreeTester, and EncodingTester stop emitting the word “Success” to standard error every time a test passes. Otherwise, without this change, in test output, we end up with so many “Success” lines that the actual test failures are effectively obscured (you have to scroll back through the output/log to hunt for failures). 
--- test-src/nu/validator/htmlparser/test/EncodingTester.java | 1 - test-src/nu/validator/htmlparser/test/TokenizerTester.java | 4 +--- test-src/nu/validator/htmlparser/test/TreeTester.java | 1 - 3 files changed, 1 insertion(+), 5 deletions(-) diff --git a/test-src/nu/validator/htmlparser/test/EncodingTester.java b/test-src/nu/validator/htmlparser/test/EncodingTester.java index 5ae87d5c..da588eb2 100644 --- a/test-src/nu/validator/htmlparser/test/EncodingTester.java +++ b/test-src/nu/validator/htmlparser/test/EncodingTester.java @@ -86,7 +86,6 @@ private boolean runTest() throws IOException, SAXException { String sniffed = charset.name(); String expected = Encoding.forName(builder.toString()).newDecoder().charset().name(); if (expected.equalsIgnoreCase(sniffed)) { - System.err.println("Success."); // System.err.println(stream); } else { exitStatus = 1; diff --git a/test-src/nu/validator/htmlparser/test/TokenizerTester.java b/test-src/nu/validator/htmlparser/test/TokenizerTester.java index 275ece18..b4013234 100644 --- a/test-src/nu/validator/htmlparser/test/TokenizerTester.java +++ b/test-src/nu/validator/htmlparser/test/TokenizerTester.java @@ -181,9 +181,7 @@ private void runTestInner(String inputString, JSONArray expectedTokens, try { driver.tokenize(is); JSONArray actualTokens = tokenHandler.getArray(); - if (jsonDeepEquals(actualTokens, expectedTokens)) { - writer.write("Success\n"); - } else { + if (!jsonDeepEquals(actualTokens, expectedTokens)) { exitStatus = 1; writer.write("Failure\n"); writer.write(description); diff --git a/test-src/nu/validator/htmlparser/test/TreeTester.java b/test-src/nu/validator/htmlparser/test/TreeTester.java index d90864c6..1327e9ac 100644 --- a/test-src/nu/validator/htmlparser/test/TreeTester.java +++ b/test-src/nu/validator/htmlparser/test/TreeTester.java @@ -223,7 +223,6 @@ private boolean runTest() throws Throwable { * && expectedErrors.size() == * actualErrors.size() */) { - System.err.println("Success."); // 
System.err.println(stream); } else { exitStatus = 1; From ebbd0c50ca2f59975833cf6b9129e2ceea06f2b8 Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Mon, 17 Aug 2020 23:46:42 +0900 Subject: [PATCH 06/15] Add GitHub Actions build.yml to run Maven in CI --- .github/workflows/build.yml | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 .github/workflows/build.yml diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 00000000..3ef64062 --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,29 @@ +name: Build + +on: [push, pull_request] + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + matrix: + # See http://static.azul.com/zulu/bin/ for available JDK versions. + java: [8, 11, 14, 15-ea, 16-ea] + os: [ubuntu-latest, macos-latest, windows-latest] + name: Java ${{ matrix.java }} + steps: + - uses: actions/checkout@v2 + with: + submodules: true + - name: Set up java + uses: actions/setup-java@v1 + with: + java-version: ${{ matrix.java }} + - name: Cache Maven packages + uses: actions/cache@v2 + with: + path: ~/.m2 + key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }} + restore-keys: ${{ runner.os }}-m2 + - name: Build with Maven + run: mvn -B verify From 10f012aeeae1f39cf36616b5cedb4b357661379f Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Fri, 21 Aug 2020 13:35:48 +0900 Subject: [PATCH 07/15] Make HtmlInputStreamReader sniffing limit settable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change makes the sniffing limit in HtmlInputStreamReader settable. Without this change, the HtmlInputStreamReader sniffing limit is hardcoded to 1024 — and in the context of testing, that has the effect of limiting HtmlInputStreamReader to only being useful for testing expected output of the meta prescan. 
So this change makes it possible for HtmlInputStreamReader to also be used for testing the results for the state where the expected character encoding is not limited to what can be determined by checking the first 1024 bytes of the input stream. --- .../htmlparser/io/HtmlInputStreamReader.java | 37 +++++++++++++------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/src/nu/validator/htmlparser/io/HtmlInputStreamReader.java b/src/nu/validator/htmlparser/io/HtmlInputStreamReader.java index 3de1af2a..4facce4a 100755 --- a/src/nu/validator/htmlparser/io/HtmlInputStreamReader.java +++ b/src/nu/validator/htmlparser/io/HtmlInputStreamReader.java @@ -1,6 +1,6 @@ /* * Copyright (c) 2007 Henri Sivonen - * Copyright (c) 2013 Mozilla Foundation + * Copyright (c) 2013-2020 Mozilla Foundation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -61,7 +61,7 @@ public final class HtmlInputStreamReader extends Reader implements ByteReadable, Locator, Locator2 { - private static final int SNIFFING_LIMIT = 1024; + private int sniffingLimit = 1024; private final InputStream inputStream; @@ -87,11 +87,9 @@ public final class HtmlInputStreamReader extends Reader implements private boolean charsetBoundaryPassed = false; - private final byte[] byteArray = new byte[4096]; // Length must be >= + private byte[] byteArray = new byte[4096]; // Length must be >= sniffingLimit - // SNIFFING_LIMIT - - private final ByteBuffer byteBuffer = ByteBuffer.wrap(byteArray); + private ByteBuffer byteBuffer = ByteBuffer.wrap(byteArray); private boolean needToNotifyTokenizer = false; @@ -112,18 +110,27 @@ public final class HtmlInputStreamReader extends Reader implements /** * @param inputStream * @param errorHandler - * @param locator + * @param tokenizer + * @param driver + * @param heuristics + * @param sniffingLimit * @throws IOException * @throws SAXException */ public 
HtmlInputStreamReader(InputStream inputStream, ErrorHandler errorHandler, Tokenizer tokenizer, Driver driver, - Heuristics heuristics) throws SAXException, IOException { + Heuristics heuristics, int sniffingLimit) + throws SAXException, IOException { this.inputStream = inputStream; this.errorHandler = errorHandler; this.tokenizer = tokenizer; this.driver = driver; this.sniffing = true; + if (sniffingLimit != -1) { + this.sniffingLimit = sniffingLimit; + this.byteArray = new byte[sniffingLimit]; + this.byteBuffer = ByteBuffer.wrap(byteArray); + } Encoding encoding = (new BomSniffer(this)).sniff(); if (encoding == null) { position = 0; @@ -178,6 +185,12 @@ public HtmlInputStreamReader(InputStream inputStream, initDecoder(); } + public HtmlInputStreamReader(InputStream inputStream, + ErrorHandler errorHandler, Tokenizer tokenizer, Driver driver, + Heuristics heuristics) throws SAXException, IOException { + this(inputStream, errorHandler, tokenizer, driver, heuristics, -1); + } + /** * */ @@ -237,7 +250,7 @@ public HtmlInputStreamReader(InputStream inputStream, if (charsetBoundaryPassed) { readLen = byteArray.length - oldLimit; } else { - readLen = SNIFFING_LIMIT - oldLimit; + readLen = sniffingLimit - oldLimit; } int num = inputStream.read(byteArray, oldLimit, readLen); if (num == -1) { @@ -261,7 +274,7 @@ public HtmlInputStreamReader(InputStream inputStream, } else if (cr == CoderResult.UNDERFLOW) { int remaining = byteBuffer.remaining(); if (!charsetBoundaryPassed) { - if (bytesRead + remaining >= SNIFFING_LIMIT) { + if (bytesRead + remaining >= sniffingLimit) { needToNotifyTokenizer = true; charsetBoundaryPassed = true; } @@ -389,12 +402,12 @@ public int readByte() throws IOException { throw new IllegalStateException( "readByte() called when not in the sniffing state."); } - if (position == SNIFFING_LIMIT) { + if (position == sniffingLimit) { return -1; } else if (position < limit) { return byteArray[position++] & 0xFF; } else { - int num = 
inputStream.read(byteArray, limit, SNIFFING_LIMIT - limit); + int num = inputStream.read(byteArray, limit, sniffingLimit - limit); if (num == -1) { return -1; } else { From e8e4a25fe1c533c7c1f43b2b43926d90672ebe2d Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Fri, 21 Aug 2020 13:49:57 +0900 Subject: [PATCH 08/15] Make EncodingTester usable in testing parsed state This change updates EncodingTester to make it test the result for cases when the expected character encoding is not limited to what can be determined by checking only the first 1024 bytes of the input stream. Otherwise, without this change, EncodingTester is limited to only being useful for testing the output of the meta prescan. --- test-src/nu/validator/htmlparser/test/EncodingTester.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test-src/nu/validator/htmlparser/test/EncodingTester.java b/test-src/nu/validator/htmlparser/test/EncodingTester.java index da588eb2..d2a3c9d5 100644 --- a/test-src/nu/validator/htmlparser/test/EncodingTester.java +++ b/test-src/nu/validator/htmlparser/test/EncodingTester.java @@ -38,6 +38,8 @@ public class EncodingTester { private static int exitStatus = 0; + protected static int SNIFFING_LIMIT = 16384; + private final InputStream aggregateStream; private final StringBuilder builder = new StringBuilder(); @@ -61,7 +63,7 @@ private boolean runTest() throws IOException, SAXException { } UntilHashInputStream stream = new UntilHashInputStream(aggregateStream); HtmlInputStreamReader reader = new HtmlInputStreamReader(stream, null, - null, null, Heuristics.NONE); + null, null, Heuristics.NONE, SNIFFING_LIMIT); Charset charset = reader.getCharset(); stream.close(); if (skipLabel()) { From 196bd48a97057a56df7c4faf56c49e1551e7edeb Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Wed, 26 Aug 2020 16:52:05 +0900 Subject: [PATCH 09/15] Make TokenizerTester() constructor public MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit This change makes the TokenizerTester(InputStream stream) constructor public — as the corresponding constructors for TreeTester and EncodingTester already are. --- test-src/nu/validator/htmlparser/test/TokenizerTester.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test-src/nu/validator/htmlparser/test/TokenizerTester.java b/test-src/nu/validator/htmlparser/test/TokenizerTester.java index b4013234..ead1dba9 100644 --- a/test-src/nu/validator/htmlparser/test/TokenizerTester.java +++ b/test-src/nu/validator/htmlparser/test/TokenizerTester.java @@ -97,7 +97,7 @@ private static boolean jsonDeepEquals(JSONValue one, JSONValue other) { private final Writer writer; - private TokenizerTester(InputStream stream) throws TokenStreamException, + public TokenizerTester(InputStream stream) throws TokenStreamException, RecognitionException, UnsupportedEncodingException { tokenHandler = new JSONArrayTokenHandler(); driver = new Driver(new ErrorReportingTokenizer(tokenHandler)); From d4edb40a2810d2e762ad862bae0ff6e76592747f Mon Sep 17 00:00:00 2001 From: Anthony Vanelverdinghe Date: Sat, 22 Aug 2020 13:07:01 +0200 Subject: [PATCH 10/15] Add Html5libTest --- .../htmlparser/test/EncodingTester.java | 4 +- .../htmlparser/test/Html5libTest.java | 86 +++++++++++++++++++ .../htmlparser/test/TokenizerTester.java | 4 +- .../validator/htmlparser/test/TreeTester.java | 4 +- 4 files changed, 92 insertions(+), 6 deletions(-) create mode 100644 test-src/nu/validator/htmlparser/test/Html5libTest.java diff --git a/test-src/nu/validator/htmlparser/test/EncodingTester.java b/test-src/nu/validator/htmlparser/test/EncodingTester.java index d2a3c9d5..01d164c8 100644 --- a/test-src/nu/validator/htmlparser/test/EncodingTester.java +++ b/test-src/nu/validator/htmlparser/test/EncodingTester.java @@ -36,7 +36,7 @@ public class EncodingTester { - private static int exitStatus = 0; + static int exitStatus = 0; protected static int SNIFFING_LIMIT = 16384; 
@@ -51,7 +51,7 @@ public EncodingTester(InputStream aggregateStream) { this.aggregateStream = aggregateStream; } - private void runTests() throws IOException, SAXException { + void runTests() throws IOException, SAXException { while (runTest()) { // spin } diff --git a/test-src/nu/validator/htmlparser/test/Html5libTest.java b/test-src/nu/validator/htmlparser/test/Html5libTest.java new file mode 100644 index 00000000..2b336b7a --- /dev/null +++ b/test-src/nu/validator/htmlparser/test/Html5libTest.java @@ -0,0 +1,86 @@ +package nu.validator.htmlparser.test; + +import java.io.IOException; +import java.net.URISyntaxException; +import java.nio.file.FileVisitResult; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.SimpleFileVisitor; +import java.nio.file.attribute.BasicFileAttributes; +import java.util.function.Consumer; + +public class Html5libTest { + + private final Path testDir; + + public Html5libTest() throws URISyntaxException { + this.testDir = Path.of(Html5libTest.class.getResource("/html5lib-tests").toURI()); + } + + public void testEncoding() throws Exception { + Files.walkFileTree(testDir.resolve("encoding"), new TestVisitor(true, false, file -> new EncodingTester(Files.newInputStream(file)).runTests())); + if(EncodingTester.exitStatus != 0) { + assert false : "Encoding test failed"; + } + } + + public void testTokenizer() throws Exception { + Files.walkFileTree(testDir.resolve("tokenizer"), new TestVisitor(true, true, file -> new TokenizerTester(Files.newInputStream(file)).runTests())); + if(TokenizerTester.exitStatus != 0) { + assert false : "Tokenizer test failed"; + } + } + + public void testTree() throws Exception { + Files.walkFileTree(testDir.resolve("tree-construction"), new TestVisitor(true, false, file -> new TreeTester(Files.newInputStream(file)).runTests())); + if(TreeTester.exitStatus != 0) { + assert false : "Tree test failed"; + } + } + + private interface TestConsumer extends Consumer { + + @Override + default 
void accept(Path t) { + try { + acceptTest(t); + } catch(Throwable e) { + throw new AssertionError(e); + } + } + + void acceptTest(Path t) throws Throwable; + + } + + private static class TestVisitor extends SimpleFileVisitor { + + private final boolean skipScripted; + private final boolean requireTestExtension; + private final TestConsumer runner; + + private TestVisitor(boolean skipScripted, boolean requireTestExtension, TestConsumer runner) { + this.skipScripted = skipScripted; + this.requireTestExtension = requireTestExtension; + this.runner = runner; + } + + @Override + public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) throws IOException { + if (skipScripted && dir.getFileName().equals(Path.of("scripted"))) { + return FileVisitResult.SKIP_SUBTREE; + } + + return FileVisitResult.CONTINUE; + } + + @Override + public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { + if (!requireTestExtension || file.getFileName().toString().endsWith(".test")) { + runner.accept(file); + } + return FileVisitResult.CONTINUE; + } + } + +} diff --git a/test-src/nu/validator/htmlparser/test/TokenizerTester.java b/test-src/nu/validator/htmlparser/test/TokenizerTester.java index ead1dba9..2db0395b 100644 --- a/test-src/nu/validator/htmlparser/test/TokenizerTester.java +++ b/test-src/nu/validator/htmlparser/test/TokenizerTester.java @@ -51,7 +51,7 @@ public class TokenizerTester { - private static int exitStatus = 0; + static int exitStatus = 0; private static JSONString PLAINTEXT = new JSONString("PLAINTEXT state"); @@ -121,7 +121,7 @@ public TokenizerTester(InputStream stream) throws TokenStreamException, } } - private void runTests() throws SAXException, IOException { + void runTests() throws SAXException, IOException { for (JSONValue val : tests.getValue()) { runTest((JSONObject) val); } diff --git a/test-src/nu/validator/htmlparser/test/TreeTester.java b/test-src/nu/validator/htmlparser/test/TreeTester.java index 
1327e9ac..f3efa268 100644 --- a/test-src/nu/validator/htmlparser/test/TreeTester.java +++ b/test-src/nu/validator/htmlparser/test/TreeTester.java @@ -43,7 +43,7 @@ public class TreeTester { private boolean streaming = false; - private static int exitStatus = 0; + static int exitStatus = 0; /** * @param aggregateStream @@ -52,7 +52,7 @@ public TreeTester(InputStream aggregateStream) { this.aggregateStream = new BufferedInputStream(aggregateStream); } - private void runTests() throws Throwable { + void runTests() throws Throwable { if (aggregateStream.read() != '#') { System.err.println("No hash at start!"); return; From ef95b9243dd91b0d42b07c2fcd1137e7d05d4de4 Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Sun, 23 Aug 2020 22:10:49 +0900 Subject: [PATCH 11/15] Add html5lib-tests as a submodule --- .gitmodules | 3 +++ pom.xml | 10 +++++++--- test-src/test/resources/html5lib-tests | 1 + 3 files changed, 11 insertions(+), 3 deletions(-) create mode 100644 .gitmodules create mode 160000 test-src/test/resources/html5lib-tests diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..b469982b --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "test-src/test/resources/html5lib-tests"] + path = test-src/test/resources/html5lib-tests + url = git@github.com:html5lib/html5lib-tests.git diff --git a/pom.xml b/pom.xml index 0ce02f71..a03072f0 100644 --- a/pom.xml +++ b/pom.xml @@ -80,8 +80,8 @@ org.apache.maven.plugins maven-compiler-plugin - 1.5 - 1.5 + 1.8 + 1.8 @@ -136,7 +136,11 @@ org.apache.maven.plugins maven-surefire-plugin - true + Html5libTest + true + + ${project.build.testSourceDirectory}/test/resources + diff --git a/test-src/test/resources/html5lib-tests b/test-src/test/resources/html5lib-tests new file mode 160000 index 00000000..6ddcf58b --- /dev/null +++ b/test-src/test/resources/html5lib-tests @@ -0,0 +1 @@ +Subproject commit 6ddcf58bea5a01e616911050c173622f84297211 From fcd18b19e5e38297f0a02b29ae96d538e0c57f56 Mon Sep 17 
00:00:00 2001 From: "Michael[tm] Smith" Date: Tue, 25 Aug 2020 16:48:16 +0900 Subject: [PATCH 12/15] Html5libTest: Add copyright; match project style --- .../htmlparser/test/Html5libTest.java | 59 +++++++++++++++---- 1 file changed, 47 insertions(+), 12 deletions(-) diff --git a/test-src/nu/validator/htmlparser/test/Html5libTest.java b/test-src/nu/validator/htmlparser/test/Html5libTest.java index 2b336b7a..9c18e749 100644 --- a/test-src/nu/validator/htmlparser/test/Html5libTest.java +++ b/test-src/nu/validator/htmlparser/test/Html5libTest.java @@ -1,3 +1,25 @@ +/* + * Copyright (c) 2020 Mozilla Foundation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + package nu.validator.htmlparser.test; import java.io.IOException; @@ -14,26 +36,33 @@ public class Html5libTest { private final Path testDir; public Html5libTest() throws URISyntaxException { - this.testDir = Path.of(Html5libTest.class.getResource("/html5lib-tests").toURI()); + this.testDir = Path.of( + Html5libTest.class.getResource("/html5lib-tests").toURI()); } public void testEncoding() throws Exception { - Files.walkFileTree(testDir.resolve("encoding"), new TestVisitor(true, false, file -> new EncodingTester(Files.newInputStream(file)).runTests())); - if(EncodingTester.exitStatus != 0) { + Files.walkFileTree(testDir.resolve("encoding"), // + new TestVisitor(true, false, file -> // + new EncodingTester(Files.newInputStream(file)).runTests())); + if (EncodingTester.exitStatus != 0) { assert false : "Encoding test failed"; } } public void testTokenizer() throws Exception { - Files.walkFileTree(testDir.resolve("tokenizer"), new TestVisitor(true, true, file -> new TokenizerTester(Files.newInputStream(file)).runTests())); - if(TokenizerTester.exitStatus != 0) { + Files.walkFileTree(testDir.resolve("tokenizer"), + new TestVisitor(true, true, file -> // + new TokenizerTester(Files.newInputStream(file)).runTests())); + if (TokenizerTester.exitStatus != 0) { assert false : "Tokenizer test failed"; } } public void testTree() throws Exception { - Files.walkFileTree(testDir.resolve("tree-construction"), new TestVisitor(true, false, file -> new TreeTester(Files.newInputStream(file)).runTests())); - if(TreeTester.exitStatus != 0) { + Files.walkFileTree(testDir.resolve("tree-construction"), + new TestVisitor(true, false, file -> // + new TreeTester(Files.newInputStream(file)).runTests())); + if (TreeTester.exitStatus != 0) { assert false : "Tree test failed"; } } @@ -44,7 +73,7 @@ private interface TestConsumer extends Consumer { default void accept(Path t) { try { acceptTest(t); - } catch(Throwable e) { + } catch (Throwable e) { throw new AssertionError(e); } } @@ 
-56,17 +85,21 @@ default void accept(Path t) { private static class TestVisitor extends SimpleFileVisitor { private final boolean skipScripted; + private final boolean requireTestExtension; + private final TestConsumer runner; - private TestVisitor(boolean skipScripted, boolean requireTestExtension, TestConsumer runner) { + private TestVisitor(boolean skipScripted, boolean requireTestExtension, + TestConsumer runner) { this.skipScripted = skipScripted; this.requireTestExtension = requireTestExtension; this.runner = runner; } @Override - public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) throws IOException { + public FileVisitResult preVisitDirectory(Path dir, + BasicFileAttributes attrs) throws IOException { if (skipScripted && dir.getFileName().equals(Path.of("scripted"))) { return FileVisitResult.SKIP_SUBTREE; } @@ -75,8 +108,10 @@ public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) th } @Override - public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { - if (!requireTestExtension || file.getFileName().toString().endsWith(".test")) { + public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) + throws IOException { + if (!requireTestExtension + || file.getFileName().toString().endsWith(".test")) { runner.accept(file); } return FileVisitResult.CONTINUE; From fd2c2601079118649e6fa73acd207e72825e697a Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Sun, 30 Aug 2020 20:21:20 +0900 Subject: [PATCH 13/15] Make Html5libTest only check .dat and .test files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change refines how Html5libTest handles filenames; it adds a mechanism to allow a required/expected file extension to be specified for each test type, and uses the mechanism to specify that ".test" is the required/expected extension for tokenizer tests, and that ".dat" is the required/expected extension for 
tree-construction test files and for encoding test files. Without this change, Html5libTest only deals correctly with the ".test" extension for tokenizer test files — but not with the ".dat" extension for tree-construction test files and encoding test files. --- .../validator/htmlparser/test/Html5libTest.java | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/test-src/nu/validator/htmlparser/test/Html5libTest.java b/test-src/nu/validator/htmlparser/test/Html5libTest.java index 9c18e749..3958f347 100644 --- a/test-src/nu/validator/htmlparser/test/Html5libTest.java +++ b/test-src/nu/validator/htmlparser/test/Html5libTest.java @@ -42,7 +42,7 @@ public Html5libTest() throws URISyntaxException { public void testEncoding() throws Exception { Files.walkFileTree(testDir.resolve("encoding"), // - new TestVisitor(true, false, file -> // + new TestVisitor(true, ".dat", file -> // new EncodingTester(Files.newInputStream(file)).runTests())); if (EncodingTester.exitStatus != 0) { assert false : "Encoding test failed"; @@ -51,7 +51,7 @@ public void testEncoding() throws Exception { public void testTokenizer() throws Exception { Files.walkFileTree(testDir.resolve("tokenizer"), - new TestVisitor(true, true, file -> // + new TestVisitor(true, ".test", file -> // new TokenizerTester(Files.newInputStream(file)).runTests())); if (TokenizerTester.exitStatus != 0) { assert false : "Tokenizer test failed"; @@ -60,7 +60,7 @@ public void testTokenizer() throws Exception { public void testTree() throws Exception { Files.walkFileTree(testDir.resolve("tree-construction"), - new TestVisitor(true, false, file -> // + new TestVisitor(true, ".dat", file -> // new TreeTester(Files.newInputStream(file)).runTests())); if (TreeTester.exitStatus != 0) { assert false : "Tree test failed"; @@ -86,14 +86,14 @@ private static class TestVisitor extends SimpleFileVisitor { private final boolean skipScripted; - private final boolean requireTestExtension; + private final String 
requiredTestExtension; private final TestConsumer runner; - private TestVisitor(boolean skipScripted, boolean requireTestExtension, + private TestVisitor(boolean skipScripted, String requiredTestExtension, TestConsumer runner) { this.skipScripted = skipScripted; - this.requireTestExtension = requireTestExtension; + this.requiredTestExtension = requiredTestExtension; this.runner = runner; } @@ -110,8 +110,7 @@ public FileVisitResult preVisitDirectory(Path dir, @Override public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { - if (!requireTestExtension - || file.getFileName().toString().endsWith(".test")) { + if (file.getFileName().toString().endsWith(requiredTestExtension)) { runner.accept(file); } return FileVisitResult.CONTINUE; From 8a48cb048d73a84540f7a146addc06c6078679d0 Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Sun, 30 Aug 2020 22:29:24 +0900 Subject: [PATCH 14/15] Make Html5libTest handle double-escaped tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change makes Html5libTest correctly handle tests in the html5lib-tests suite which have cases with so-called “double-escaped” “input” and “output” values — for example, values that contain the literals “\\u0000” and “\\uFFFD” rather than “\u0000” and “\uFFFD”. 
--- .../nu/validator/htmlparser/test/Html5libTest.java | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/test-src/nu/validator/htmlparser/test/Html5libTest.java b/test-src/nu/validator/htmlparser/test/Html5libTest.java index 3958f347..67770481 100644 --- a/test-src/nu/validator/htmlparser/test/Html5libTest.java +++ b/test-src/nu/validator/htmlparser/test/Html5libTest.java @@ -22,8 +22,10 @@ package nu.validator.htmlparser.test; +import java.io.ByteArrayInputStream; import java.io.IOException; import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; import java.nio.file.FileVisitResult; import java.nio.file.Files; import java.nio.file.Path; @@ -52,7 +54,7 @@ public void testEncoding() throws Exception { public void testTokenizer() throws Exception { Files.walkFileTree(testDir.resolve("tokenizer"), new TestVisitor(true, ".test", file -> // - new TokenizerTester(Files.newInputStream(file)).runTests())); + new TokenizerTester(getDoubleEscapedInput(file)).runTests())); if (TokenizerTester.exitStatus != 0) { assert false : "Tokenizer test failed"; } @@ -67,6 +69,15 @@ public void testTree() throws Exception { } } + private ByteArrayInputStream getDoubleEscapedInput(Path file) + throws IOException { + byte[] fileBytes = Files.readAllBytes(file); + String fileContent = new String(fileBytes, StandardCharsets.UTF_8); + String unescapedContent = fileContent.replace("\\\\u", "\\u"); + byte[] newBytes = unescapedContent.getBytes(StandardCharsets.UTF_8); + return new ByteArrayInputStream(newBytes); + } + private interface TestConsumer extends Consumer { @Override From 390d9a91e8b5008516d25e648a514dafc32bb833 Mon Sep 17 00:00:00 2001 From: "Michael[tm] Smith" Date: Tue, 1 Sep 2020 12:34:18 +0900 Subject: [PATCH 15/15] Use Paths.get rather than Path.of in Html5libTest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change replaces Path.of() calls in Html5libTest with Paths.get(). 
Per https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/nio/file/Path.html#of(java.net.URI) Path.of() was introduced in Java 11. So Java 8 has no Path.of(); see also https://docs.oracle.com/javase/8/docs/api/java/nio/file/Path.html We need to continue to support Java 8 for the time being. It seems Paths.get() will eventually end up being formally deprecated; by the time it finally is, we may also be able to quit supporting Java 8 — and so we can just switch to Path.of() then. --- test-src/nu/validator/htmlparser/test/Html5libTest.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test-src/nu/validator/htmlparser/test/Html5libTest.java b/test-src/nu/validator/htmlparser/test/Html5libTest.java index 67770481..724062e2 100644 --- a/test-src/nu/validator/htmlparser/test/Html5libTest.java +++ b/test-src/nu/validator/htmlparser/test/Html5libTest.java @@ -29,6 +29,7 @@ import java.nio.file.FileVisitResult; import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.Paths; import java.nio.file.SimpleFileVisitor; import java.nio.file.attribute.BasicFileAttributes; import java.util.function.Consumer; @@ -38,7 +39,7 @@ public class Html5libTest { private final Path testDir; public Html5libTest() throws URISyntaxException { - this.testDir = Path.of( + this.testDir = Paths.get( Html5libTest.class.getResource("/html5lib-tests").toURI()); } @@ -111,7 +112,8 @@ private TestVisitor(boolean skipScripted, String requiredTestExtension, @Override public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) throws IOException { - if (skipScripted && dir.getFileName().equals(Path.of("scripted"))) { + if (skipScripted + && dir.getFileName().equals(Paths.get("scripted"))) { return FileVisitResult.SKIP_SUBTREE; }