diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 00000000..3ef64062 --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,29 @@ +name: Build + +on: [push, pull_request] + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + matrix: + # See http://static.azul.com/zulu/bin/ for available JDK versions. + java: [8, 11, 14, 15-ea, 16-ea] + os: [ubuntu-latest, macos-latest, windows-latest] + name: Java ${{ matrix.java }} + steps: + - uses: actions/checkout@v2 + with: + submodules: true + - name: Set up java + uses: actions/setup-java@v1 + with: + java-version: ${{ matrix.java }} + - name: Cache Maven packages + uses: actions/cache@v2 + with: + path: ~/.m2 + key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }} + restore-keys: ${{ runner.os }}-m2 + - name: Build with Maven + run: mvn -B verify diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..b469982b --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "test-src/test/resources/html5lib-tests"] + path = test-src/test/resources/html5lib-tests + url = git@github.com:html5lib/html5lib-tests.git diff --git a/pom.xml b/pom.xml index 41f46725..a03072f0 100644 --- a/pom.xml +++ b/pom.xml @@ -80,22 +80,13 @@ org.apache.maven.plugins maven-compiler-plugin - 1.5 - 1.5 + 1.8 + 1.8 maven-antrun-plugin 1.7 - - - com.sun - tools - 1.5.0 - system - ${java.home}/../lib/tools.jar - - intitialize-sources @@ -124,8 +115,12 @@ - - + + @@ -141,7 +136,11 @@ org.apache.maven.plugins maven-surefire-plugin - true + Html5libTest + true + + ${project.build.testSourceDirectory}/test/resources + @@ -236,5 +235,26 @@ /usr/share/java /usr/share/javadoc UTF-8 + false + + + Java 8 + + 1.8 + + + true + + + + com.sun + tools + 1.5.0 + system + ${java.home}/../lib/tools.jar + + + + diff --git a/src/nu/validator/htmlparser/io/HtmlInputStreamReader.java b/src/nu/validator/htmlparser/io/HtmlInputStreamReader.java index 3de1af2a..4facce4a 100755 --- a/src/nu/validator/htmlparser/io/HtmlInputStreamReader.java +++ b/src/nu/validator/htmlparser/io/HtmlInputStreamReader.java @@ -1,6 +1,6 @@ /* * Copyright (c) 2007 Henri Sivonen - * Copyright (c) 2013 Mozilla Foundation + * Copyright (c) 2013-2020 Mozilla Foundation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -61,7 +61,7 @@ public final class HtmlInputStreamReader extends Reader implements ByteReadable, Locator, Locator2 { - private static final int SNIFFING_LIMIT = 1024; + private int sniffingLimit = 1024; private final InputStream inputStream; @@ -87,11 +87,9 @@ public final class HtmlInputStreamReader extends Reader implements private boolean charsetBoundaryPassed = false; - private final byte[] byteArray = new byte[4096]; // Length must be >= + private byte[] byteArray = new byte[4096]; // Length must be >= sniffingLimit - // SNIFFING_LIMIT - - private final ByteBuffer byteBuffer = ByteBuffer.wrap(byteArray); + private ByteBuffer byteBuffer = ByteBuffer.wrap(byteArray); private boolean needToNotifyTokenizer = false; @@ -112,18 +110,27 @@ public final class HtmlInputStreamReader extends Reader implements /** * @param inputStream * @param errorHandler - * @param locator + * @param tokenizer + * @param driver + * @param heuristics + * @param sniffingLimit * @throws IOException * @throws SAXException */ public HtmlInputStreamReader(InputStream inputStream, ErrorHandler errorHandler, Tokenizer tokenizer, Driver driver, - Heuristics heuristics) throws SAXException, IOException { + Heuristics heuristics, int sniffingLimit) + throws SAXException, IOException { this.inputStream = inputStream; this.errorHandler = errorHandler; this.tokenizer = tokenizer; this.driver = driver; this.sniffing = true; + if (sniffingLimit != -1) { + this.sniffingLimit = sniffingLimit; + this.byteArray = new byte[sniffingLimit]; + this.byteBuffer = ByteBuffer.wrap(byteArray); + } Encoding encoding = (new BomSniffer(this)).sniff(); if (encoding == null) { position = 0; @@ -178,6 +185,12 @@ public HtmlInputStreamReader(InputStream inputStream, initDecoder(); } + public HtmlInputStreamReader(InputStream inputStream, + ErrorHandler errorHandler, Tokenizer tokenizer, Driver driver, + Heuristics heuristics) throws SAXException, IOException { + this(inputStream, errorHandler, tokenizer, driver, heuristics, -1); + } + /** * */ @@ -237,7 +250,7 @@ public HtmlInputStreamReader(InputStream inputStream, if (charsetBoundaryPassed) { readLen = byteArray.length - oldLimit; } else { - readLen = SNIFFING_LIMIT - oldLimit; + readLen = sniffingLimit - oldLimit; } int num = inputStream.read(byteArray, oldLimit, readLen); if (num == -1) { @@ -261,7 +274,7 @@ public HtmlInputStreamReader(InputStream inputStream, } else if (cr == CoderResult.UNDERFLOW) { int remaining = byteBuffer.remaining(); if (!charsetBoundaryPassed) { - if (bytesRead + remaining >= SNIFFING_LIMIT) { + if (bytesRead + remaining >= sniffingLimit) { needToNotifyTokenizer = true; charsetBoundaryPassed = true; } @@ -389,12 +402,12 @@ public int readByte() throws IOException { throw new IllegalStateException( "readByte() called when not in the sniffing state."); } - if (position == SNIFFING_LIMIT) { + if (position == sniffingLimit) { return -1; } else if (position < limit) { return byteArray[position++] & 0xFF; } else { - int num = inputStream.read(byteArray, limit, SNIFFING_LIMIT - limit); + int num = inputStream.read(byteArray, limit, sniffingLimit - limit); if (num == -1) { return -1; } else { diff --git a/test-src/nu/validator/htmlparser/test/EncodingTester.java b/test-src/nu/validator/htmlparser/test/EncodingTester.java index 95cd3018..01d164c8 100644 --- a/test-src/nu/validator/htmlparser/test/EncodingTester.java +++ b/test-src/nu/validator/htmlparser/test/EncodingTester.java @@ -36,6 +36,10 @@ public class EncodingTester { + static int exitStatus = 0; + + protected static int SNIFFING_LIMIT = 16384; + private final InputStream aggregateStream; private final StringBuilder builder = new StringBuilder(); @@ -47,7 +51,7 @@ public EncodingTester(InputStream aggregateStream) { this.aggregateStream = aggregateStream; } - private void runTests() throws IOException, SAXException { + void runTests() throws IOException, SAXException { while (runTest()) { // spin } @@ -59,10 +63,11 @@ private boolean runTest() throws IOException, SAXException { } UntilHashInputStream stream = new UntilHashInputStream(aggregateStream); HtmlInputStreamReader reader = new HtmlInputStreamReader(stream, null, - null, null, Heuristics.NONE); + null, null, Heuristics.NONE, SNIFFING_LIMIT); Charset charset = reader.getCharset(); stream.close(); if (skipLabel()) { + exitStatus = 1; System.err.println("Premature end of test data."); return false; } @@ -73,6 +78,7 @@ private boolean runTest() throws IOException, SAXException { case '\n': break loop; case -1: + exitStatus = 1; System.err.println("Premature end of test data."); return false; default: @@ -82,9 +88,9 @@ private boolean runTest() throws IOException, SAXException { String sniffed = charset.name(); String expected = Encoding.forName(builder.toString()).newDecoder().charset().name(); if (expected.equalsIgnoreCase(sniffed)) { - System.err.println("Success."); // System.err.println(stream); } else { + exitStatus = 1; System.err.println("Failure. Expected: " + expected + " got " + sniffed + "."); System.err.println(stream); @@ -118,6 +124,7 @@ public static void main(String[] args) throws IOException, SAXException { args[i])); tester.runTests(); } + System.exit(exitStatus); } } diff --git a/test-src/nu/validator/htmlparser/test/Html5libTest.java b/test-src/nu/validator/htmlparser/test/Html5libTest.java new file mode 100644 index 00000000..724062e2 --- /dev/null +++ b/test-src/nu/validator/htmlparser/test/Html5libTest.java @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2020 Mozilla Foundation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +package nu.validator.htmlparser.test; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; +import java.nio.file.FileVisitResult; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.SimpleFileVisitor; +import java.nio.file.attribute.BasicFileAttributes; +import java.util.function.Consumer; + +public class Html5libTest { + + private final Path testDir; + + public Html5libTest() throws URISyntaxException { + this.testDir = Paths.get( + Html5libTest.class.getResource("/html5lib-tests").toURI()); + } + + public void testEncoding() throws Exception { + Files.walkFileTree(testDir.resolve("encoding"), // + new TestVisitor(true, ".dat", file -> // + new EncodingTester(Files.newInputStream(file)).runTests())); + if (EncodingTester.exitStatus != 0) { + assert false : "Encoding test failed"; + } + } + + public void testTokenizer() throws Exception { + Files.walkFileTree(testDir.resolve("tokenizer"), + new TestVisitor(true, ".test", file -> // + new TokenizerTester(getDoubleEscapedInput(file)).runTests())); + if (TokenizerTester.exitStatus != 0) { + assert false : "Tokenizer test failed"; + } + } + + public void testTree() throws Exception { + Files.walkFileTree(testDir.resolve("tree-construction"), + new TestVisitor(true, ".dat", file -> // + new TreeTester(Files.newInputStream(file)).runTests())); + if (TreeTester.exitStatus != 0) { + assert false : "Tree test failed"; + } + } + + private ByteArrayInputStream getDoubleEscapedInput(Path file) + throws IOException { + byte[] fileBytes = Files.readAllBytes(file); + String fileContent = new String(fileBytes, StandardCharsets.UTF_8); + String unescapedContent = fileContent.replace("\\\\u", "\\u"); + byte[] newBytes = unescapedContent.getBytes(StandardCharsets.UTF_8); + return new ByteArrayInputStream(newBytes); + } + + private interface TestConsumer extends Consumer { + + @Override + default void accept(Path t) { + try { + acceptTest(t); + } catch (Throwable e) { + throw new AssertionError(e); + } + } + + void acceptTest(Path t) throws Throwable; + + } + + private static class TestVisitor extends SimpleFileVisitor { + + private final boolean skipScripted; + + private final String requiredTestExtension; + + private final TestConsumer runner; + + private TestVisitor(boolean skipScripted, String requiredTestExtension, + TestConsumer runner) { + this.skipScripted = skipScripted; + this.requiredTestExtension = requiredTestExtension; + this.runner = runner; + } + + @Override + public FileVisitResult preVisitDirectory(Path dir, + BasicFileAttributes attrs) throws IOException { + if (skipScripted + && dir.getFileName().equals(Paths.get("scripted"))) { + return FileVisitResult.SKIP_SUBTREE; + } + + return FileVisitResult.CONTINUE; + } + + @Override + public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) + throws IOException { + if (file.getFileName().toString().endsWith(requiredTestExtension)) { + runner.accept(file); + } + return FileVisitResult.CONTINUE; + } + } + +} diff --git a/test-src/nu/validator/htmlparser/test/TokenizerTester.java b/test-src/nu/validator/htmlparser/test/TokenizerTester.java index 52f96d2e..2db0395b 100644 --- a/test-src/nu/validator/htmlparser/test/TokenizerTester.java +++ b/test-src/nu/validator/htmlparser/test/TokenizerTester.java @@ -51,9 +51,11 @@ public class TokenizerTester { + static int exitStatus = 0; + private static JSONString PLAINTEXT = new JSONString("PLAINTEXT state"); - private static JSONString PCDATA = new JSONString("DATA state"); + private static JSONString PCDATA = new JSONString("Data state"); private static JSONString RCDATA = new JSONString("RCDATA state"); @@ -95,7 +97,7 @@ private static boolean jsonDeepEquals(JSONValue one, JSONValue other) { private final Writer writer; - private TokenizerTester(InputStream stream) throws TokenStreamException, + public TokenizerTester(InputStream stream) throws TokenStreamException, RecognitionException, UnsupportedEncodingException { tokenHandler = new JSONArrayTokenHandler(); driver = new Driver(new ErrorReportingTokenizer(tokenHandler)); @@ -119,7 +121,7 @@ private TokenizerTester(InputStream stream) throws TokenStreamException, } } - private void runTests() throws SAXException, IOException { + void runTests() throws SAXException, IOException { for (JSONValue val : tests.getValue()) { runTest((JSONObject) val); } @@ -179,9 +181,8 @@ private void runTestInner(String inputString, JSONArray expectedTokens, try { driver.tokenize(is); JSONArray actualTokens = tokenHandler.getArray(); - if (jsonDeepEquals(actualTokens, expectedTokens)) { - writer.write("Success\n"); - } else { + if (!jsonDeepEquals(actualTokens, expectedTokens)) { + exitStatus = 1; writer.write("Failure\n"); writer.write(description); writer.write("\nInput:\n"); @@ -193,6 +194,7 @@ private void runTestInner(String inputString, JSONArray expectedTokens, writer.write("\n"); } } catch (Throwable t) { + exitStatus = 1; writer.write("Failure\n"); writer.write(description); writer.write("\nInput:\n"); @@ -216,6 +218,7 @@ public static void main(String[] args) throws TokenStreamException, args[i])); tester.runTests(); } + System.exit(exitStatus); } } diff --git a/test-src/nu/validator/htmlparser/test/TreeTester.java b/test-src/nu/validator/htmlparser/test/TreeTester.java index af5de942..f3efa268 100644 --- a/test-src/nu/validator/htmlparser/test/TreeTester.java +++ b/test-src/nu/validator/htmlparser/test/TreeTester.java @@ -43,6 +43,8 @@ public class TreeTester { private boolean streaming = false; + static int exitStatus = 0; + /** * @param aggregateStream */ @@ -50,7 +52,7 @@ public TreeTester(InputStream aggregateStream) { this.aggregateStream = new BufferedInputStream(aggregateStream); } - private void runTests() throws Throwable { + void runTests() throws Throwable { if (aggregateStream.read() != '#') { System.err.println("No hash at start!"); return; @@ -221,9 +223,9 @@ private boolean runTest() throws Throwable { * && expectedErrors.size() == * actualErrors.size() */) { - System.err.println("Success."); // System.err.println(stream); } else { + exitStatus = 1; System.err.print("Failure.\nData:\n" + stream + "\nExpected:\n" + expected + "Got: \n" + actual); System.err.println("Expected errors:"); @@ -236,6 +238,7 @@ private boolean runTest() throws Throwable { } } } catch (Throwable t) { + exitStatus = 1; System.err.println("Failure.\nData:\n" + stream); throw t; } @@ -266,6 +269,7 @@ public static void main(String[] args) throws Throwable { TreeTester tester = new TreeTester(new FileInputStream(args[i])); tester.runTests(); } + System.exit(exitStatus); } } diff --git a/test-src/test/resources/html5lib-tests b/test-src/test/resources/html5lib-tests new file mode 160000 index 00000000..6ddcf58b --- /dev/null +++ b/test-src/test/resources/html5lib-tests @@ -0,0 +1 @@ +Subproject commit 6ddcf58bea5a01e616911050c173622f84297211