From 05b9024ba7c80b7561e9d2dfeedd714c9394c28d Mon Sep 17 00:00:00 2001
From: "Michael[tm] Smith" <mike@w3.org>
Date: Fri, 4 Sep 2020 13:48:35 +0900
Subject: [PATCH 01/15] Include com.sun.tools as dependency only if Java 8

This change causes com.sun.tools to be included as a dependency
only if the JDK version is 1.8.

Without this change, the build fails when run under any Java version
later than Java 8, because JDK 9 and later no longer ship the
${java.home}/../lib/tools.jar file that the dependency points to.
---
 pom.xml | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)
diff --git a/pom.xml b/pom.xml
index 41f46725..d8443fb6 100644
--- a/pom.xml
+++ b/pom.xml
@@ -87,15 +87,6 @@
       <plugin>
         <artifactId>maven-antrun-plugin</artifactId>
         <version>1.7</version>
-        <dependencies>
-          <dependency>
-            <groupId>com.sun</groupId>
-            <artifactId>tools</artifactId>
-            <version>1.5.0</version>
-            <scope>system</scope>
-            <systemPath>${java.home}/../lib/tools.jar</systemPath>
-          </dependency>
-        </dependencies>
         <executions>
           <execution>
             <id>intitialize-sources</id>
@@ -237,4 +228,21 @@
     <rpm.javadoc.dir>/usr/share/javadoc</rpm.javadoc.dir>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
   </properties>
+  <profiles>
+    <profile>
+      <id>Java 8</id>
+      <activation>
+        <jdk>1.8</jdk>
+      </activation>
+      <dependencies>
+        <dependency>
+          <groupId>com.sun</groupId>
+          <artifactId>tools</artifactId>
+          <version>1.5.0</version>
+          <scope>system</scope>
+          <systemPath>${java.home}/../lib/tools.jar</systemPath>
+        </dependency>
+      </dependencies>
+    </profile>
+  </profiles>
 </project>

From a27c7b62dd4479b6d282c621fcc99bfc6ee36015 Mon Sep 17 00:00:00 2001
From: "Michael[tm] Smith" <mike@w3.org>
Date: Fri, 4 Sep 2020 13:50:54 +0900
Subject: [PATCH 02/15] Set fork=true for Maven AntRun javac/java if Java 8

This change causes the Maven AntRun plugin to invoke the javac and java
tasks with fork=true when run in a Java 8 environment. Under any other
Java version, the new "fork" property keeps its default value of false.

Without this change, the build fails when run under Java 8.
---
 pom.xml | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/pom.xml b/pom.xml
index d8443fb6..0ce02f71 100644
--- a/pom.xml
+++ b/pom.xml
@@ -115,8 +115,12 @@
                 <property name="translator.sources" value="${basedir}/translator-src"/>
                 <property name="translator.classes" value="${project.build.directory}/translator-classes"/>
                 <mkdir dir="${translator.classes}"/>
-                <javac srcdir="${translator.sources}" includes="nu/validator/htmlparser/generator/ApplyHotSpotWorkaround.java" destdir="${translator.classes}" includeantruntime="false"/>
-                <java classname="nu.validator.htmlparser.generator.ApplyHotSpotWorkaround">
+                <javac srcdir="${translator.sources}" destdir="${translator.classes}"
+                       includes="nu/validator/htmlparser/generator/ApplyHotSpotWorkaround.java"
+                       includeantruntime="false"
+                       fork="${fork}"/>
+                <java classname="nu.validator.htmlparser.generator.ApplyHotSpotWorkaround"
+                      fork="${fork}">
                   <classpath>
                     <pathelement location="${translator.classes}"/>
                   </classpath>
@@ -227,6 +231,7 @@
     <rpm.java.dir>/usr/share/java</rpm.java.dir>
     <rpm.javadoc.dir>/usr/share/javadoc</rpm.javadoc.dir>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    <fork>false</fork>
   </properties>
   <profiles>
     <profile>
@@ -234,6 +239,9 @@
       <activation>
         <jdk>1.8</jdk>
       </activation>
+      <properties>
+        <fork>true</fork>
+      </properties>
       <dependencies>
         <dependency>
           <groupId>com.sun</groupId>

From 623cea1584f699a0493c6e2114f104b97cdf3837 Mon Sep 17 00:00:00 2001
From: "Michael[tm] Smith" <mike@w3.org>
Date: Wed, 5 Aug 2020 22:08:37 +0900
Subject: [PATCH 03/15] Fix TokenizerTester casing of PCDATA state name
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This change updates the TokenizerTester code to expect its input test
data to have the string "Data state" to identify PCDATA tests — rather
than the string "DATA state".

The test data in the html5lib-tests suite uses "Data state", so without
this change, running TokenizerTester against html5lib-tests causes
TokenizerTester to fail with a “Broken test data” harness failure.
---
 test-src/nu/validator/htmlparser/test/TokenizerTester.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test-src/nu/validator/htmlparser/test/TokenizerTester.java b/test-src/nu/validator/htmlparser/test/TokenizerTester.java
index 52f96d2e..cfe77c41 100644
--- a/test-src/nu/validator/htmlparser/test/TokenizerTester.java
+++ b/test-src/nu/validator/htmlparser/test/TokenizerTester.java
@@ -53,7 +53,7 @@ public class TokenizerTester {
 
     private static JSONString PLAINTEXT = new JSONString("PLAINTEXT state");
 
-    private static JSONString PCDATA = new JSONString("DATA state");
+    private static JSONString PCDATA = new JSONString("Data state");
 
     private static JSONString RCDATA = new JSONString("RCDATA state");
 

From d419264a99a3279a79e357e8b922ba209281c912 Mon Sep 17 00:00:00 2001
From: "Michael[tm] Smith" <mike@w3.org>
Date: Mon, 17 Aug 2020 16:11:03 +0900
Subject: [PATCH 04/15] Exit 1 for test harness if any tests fail
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This change makes TokenizerTester, TreeTester, and EncodingTester exit
with status code 1 if any test cases fail.

Otherwise, without this change, we won’t catch the test failures when
running tests under CI.
---
 test-src/nu/validator/htmlparser/test/EncodingTester.java  | 6 ++++++
 test-src/nu/validator/htmlparser/test/TokenizerTester.java | 5 +++++
 test-src/nu/validator/htmlparser/test/TreeTester.java      | 5 +++++
 3 files changed, 16 insertions(+)

diff --git a/test-src/nu/validator/htmlparser/test/EncodingTester.java b/test-src/nu/validator/htmlparser/test/EncodingTester.java
index 95cd3018..5ae87d5c 100644
--- a/test-src/nu/validator/htmlparser/test/EncodingTester.java
+++ b/test-src/nu/validator/htmlparser/test/EncodingTester.java
@@ -36,6 +36,8 @@
 
 public class EncodingTester {
 
+    private static int exitStatus = 0;
+
     private final InputStream aggregateStream;
 
     private final StringBuilder builder = new StringBuilder();
@@ -63,6 +65,7 @@ private boolean runTest() throws IOException, SAXException {
         Charset charset = reader.getCharset();
         stream.close();
         if (skipLabel()) {
+            exitStatus = 1;
             System.err.println("Premature end of test data.");
             return false;
         }
@@ -73,6 +76,7 @@ private boolean runTest() throws IOException, SAXException {
                 case '\n':
                     break loop;
                 case -1:
+                    exitStatus = 1;
                     System.err.println("Premature end of test data.");
                     return false;
                 default:
@@ -85,6 +89,7 @@ private boolean runTest() throws IOException, SAXException {
             System.err.println("Success.");
             // System.err.println(stream);
         } else {
+            exitStatus = 1;
             System.err.println("Failure. Expected: " + expected + " got "
                     + sniffed + ".");
             System.err.println(stream);
@@ -118,6 +123,7 @@ public static void main(String[] args) throws IOException, SAXException {
                     args[i]));
             tester.runTests();
         }
+        System.exit(exitStatus);
     }
 
 }
diff --git a/test-src/nu/validator/htmlparser/test/TokenizerTester.java b/test-src/nu/validator/htmlparser/test/TokenizerTester.java
index cfe77c41..275ece18 100644
--- a/test-src/nu/validator/htmlparser/test/TokenizerTester.java
+++ b/test-src/nu/validator/htmlparser/test/TokenizerTester.java
@@ -51,6 +51,8 @@
 
 public class TokenizerTester {
 
+    private static int exitStatus = 0;
+
     private static JSONString PLAINTEXT = new JSONString("PLAINTEXT state");
 
     private static JSONString PCDATA = new JSONString("Data state");
@@ -182,6 +184,7 @@ private void runTestInner(String inputString, JSONArray expectedTokens,
             if (jsonDeepEquals(actualTokens, expectedTokens)) {
                 writer.write("Success\n");
             } else {
+                exitStatus = 1;
                 writer.write("Failure\n");
                 writer.write(description);
                 writer.write("\nInput:\n");
@@ -193,6 +196,7 @@ private void runTestInner(String inputString, JSONArray expectedTokens,
                 writer.write("\n");
             }
         } catch (Throwable t) {
+            exitStatus = 1;
             writer.write("Failure\n");
             writer.write(description);
             writer.write("\nInput:\n");
@@ -216,6 +220,7 @@ public static void main(String[] args) throws TokenStreamException,
                     args[i]));
             tester.runTests();
         }
+        System.exit(exitStatus);
     }
 
 }
diff --git a/test-src/nu/validator/htmlparser/test/TreeTester.java b/test-src/nu/validator/htmlparser/test/TreeTester.java
index af5de942..d90864c6 100644
--- a/test-src/nu/validator/htmlparser/test/TreeTester.java
+++ b/test-src/nu/validator/htmlparser/test/TreeTester.java
@@ -43,6 +43,8 @@ public class TreeTester {
 
     private boolean streaming = false;
 
+    private static int exitStatus = 0;
+
     /**
      * @param aggregateStream
      */
@@ -224,6 +226,7 @@ private boolean runTest() throws Throwable {
                 System.err.println("Success.");
                 // System.err.println(stream);
             } else {
+                exitStatus = 1;
                 System.err.print("Failure.\nData:\n" + stream + "\nExpected:\n"
                         + expected + "Got: \n" + actual);
                 System.err.println("Expected errors:");
@@ -236,6 +239,7 @@ private boolean runTest() throws Throwable {
                 }
             }
         } catch (Throwable t) {
+            exitStatus = 1;
             System.err.println("Failure.\nData:\n" + stream);
             throw t;
         }
@@ -266,6 +270,7 @@ public static void main(String[] args) throws Throwable {
             TreeTester tester = new TreeTester(new FileInputStream(args[i]));
             tester.runTests();
         }
+        System.exit(exitStatus);
     }
 
 }

From dd7ecfa3cca3980037667d9f74a29a683c7a278b Mon Sep 17 00:00:00 2001
From: "Michael[tm] Smith" <mike@w3.org>
Date: Mon, 17 Aug 2020 16:43:53 +0900
Subject: [PATCH 05/15] =?UTF-8?q?Drop=20=E2=80=9CSuccess=E2=80=9D=20messag?=
 =?UTF-8?q?e=20for=20test=20passes?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This change makes TokenizerTester, TreeTester, and EncodingTester stop
emitting the word “Success” to standard error every time a test passes.

Otherwise, without this change, in test output, we end up with so many
“Success” lines that the actual test failures are effectively obscured
(you have to scroll back through the output/log to hunt for failures).
---
 test-src/nu/validator/htmlparser/test/EncodingTester.java  | 1 -
 test-src/nu/validator/htmlparser/test/TokenizerTester.java | 4 +---
 test-src/nu/validator/htmlparser/test/TreeTester.java      | 1 -
 3 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/test-src/nu/validator/htmlparser/test/EncodingTester.java b/test-src/nu/validator/htmlparser/test/EncodingTester.java
index 5ae87d5c..da588eb2 100644
--- a/test-src/nu/validator/htmlparser/test/EncodingTester.java
+++ b/test-src/nu/validator/htmlparser/test/EncodingTester.java
@@ -86,7 +86,6 @@ private boolean runTest() throws IOException, SAXException {
         String sniffed = charset.name();
         String expected = Encoding.forName(builder.toString()).newDecoder().charset().name();
         if (expected.equalsIgnoreCase(sniffed)) {
-            System.err.println("Success.");
             // System.err.println(stream);
         } else {
             exitStatus = 1;
diff --git a/test-src/nu/validator/htmlparser/test/TokenizerTester.java b/test-src/nu/validator/htmlparser/test/TokenizerTester.java
index 275ece18..b4013234 100644
--- a/test-src/nu/validator/htmlparser/test/TokenizerTester.java
+++ b/test-src/nu/validator/htmlparser/test/TokenizerTester.java
@@ -181,9 +181,7 @@ private void runTestInner(String inputString, JSONArray expectedTokens,
         try {
             driver.tokenize(is);
             JSONArray actualTokens = tokenHandler.getArray();
-            if (jsonDeepEquals(actualTokens, expectedTokens)) {
-                writer.write("Success\n");
-            } else {
+            if (!jsonDeepEquals(actualTokens, expectedTokens)) {
                 exitStatus = 1;
                 writer.write("Failure\n");
                 writer.write(description);
diff --git a/test-src/nu/validator/htmlparser/test/TreeTester.java b/test-src/nu/validator/htmlparser/test/TreeTester.java
index d90864c6..1327e9ac 100644
--- a/test-src/nu/validator/htmlparser/test/TreeTester.java
+++ b/test-src/nu/validator/htmlparser/test/TreeTester.java
@@ -223,7 +223,6 @@ private boolean runTest() throws Throwable {
              * && expectedErrors.size() ==
              * actualErrors.size()
              */) {
-                System.err.println("Success.");
                 // System.err.println(stream);
             } else {
                 exitStatus = 1;

From ebbd0c50ca2f59975833cf6b9129e2ceea06f2b8 Mon Sep 17 00:00:00 2001
From: "Michael[tm] Smith" <mike@w3.org>
Date: Mon, 17 Aug 2020 23:46:42 +0900
Subject: [PATCH 06/15] Add GitHub Actions build.yml to run Maven in CI

---
 .github/workflows/build.yml | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 .github/workflows/build.yml

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
new file mode 100644
index 00000000..3ef64062
--- /dev/null
+++ b/.github/workflows/build.yml
@@ -0,0 +1,29 @@
+name: Build
+
+on: [push, pull_request]
+
+jobs:
+  build:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        # See http://static.azul.com/zulu/bin/ for available JDK versions.
+        java: [8, 11, 14, 15-ea, 16-ea]
+        os: [ubuntu-latest, macos-latest, windows-latest]
+    name: Java ${{ matrix.java }}
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          submodules: true
+      - name: Set up java
+        uses: actions/setup-java@v1
+        with:
+          java-version: ${{ matrix.java }}
+      - name: Cache Maven packages
+        uses: actions/cache@v2
+        with:
+          path: ~/.m2
+          key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }}
+          restore-keys: ${{ runner.os }}-m2
+      - name: Build with Maven
+        run: mvn -B verify

From 10f012aeeae1f39cf36616b5cedb4b357661379f Mon Sep 17 00:00:00 2001
From: "Michael[tm] Smith" <mike@w3.org>
Date: Fri, 21 Aug 2020 13:35:48 +0900
Subject: [PATCH 07/15] Make HtmlInputStreamReader sniffing limit settable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This change makes the sniffing limit in HtmlInputStreamReader settable.

Without this change, the HtmlInputStreamReader sniffing limit is
hardcoded to 1024 — and in the context of testing, that has the effect
of limiting HtmlInputStreamReader to only being useful for testing
expected output of the meta prescan.

With this change, HtmlInputStreamReader can also be used to test cases
where the expected character encoding cannot be determined from just
the first 1024 bytes of the input stream. Callers pass the limit as a
new constructor argument; the existing five-argument constructor
passes -1, which keeps the default limit of 1024.
---
 .../htmlparser/io/HtmlInputStreamReader.java  | 37 +++++++++++++------
 1 file changed, 25 insertions(+), 12 deletions(-)

diff --git a/src/nu/validator/htmlparser/io/HtmlInputStreamReader.java b/src/nu/validator/htmlparser/io/HtmlInputStreamReader.java
index 3de1af2a..4facce4a 100755
--- a/src/nu/validator/htmlparser/io/HtmlInputStreamReader.java
+++ b/src/nu/validator/htmlparser/io/HtmlInputStreamReader.java
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2007 Henri Sivonen
- * Copyright (c) 2013 Mozilla Foundation
+ * Copyright (c) 2013-2020 Mozilla Foundation
  *
  * Permission is hereby granted, free of charge, to any person obtaining a 
  * copy of this software and associated documentation files (the "Software"), 
@@ -61,7 +61,7 @@
 public final class HtmlInputStreamReader extends Reader implements
         ByteReadable, Locator, Locator2 {
 
-    private static final int SNIFFING_LIMIT = 1024;
+    private int sniffingLimit = 1024;
 
     private final InputStream inputStream;
 
@@ -87,11 +87,9 @@ public final class HtmlInputStreamReader extends Reader implements
 
     private boolean charsetBoundaryPassed = false;
 
-    private final byte[] byteArray = new byte[4096]; // Length must be >=
+    private byte[] byteArray = new byte[4096]; // Length must be >= sniffingLimit
 
-    // SNIFFING_LIMIT
-
-    private final ByteBuffer byteBuffer = ByteBuffer.wrap(byteArray);
+    private ByteBuffer byteBuffer = ByteBuffer.wrap(byteArray);
 
     private boolean needToNotifyTokenizer = false;
 
@@ -112,18 +110,27 @@ public final class HtmlInputStreamReader extends Reader implements
     /**
      * @param inputStream
      * @param errorHandler
-     * @param locator
+     * @param tokenizer
+     * @param driver
+     * @param heuristics
+     * @param sniffingLimit
      * @throws IOException
      * @throws SAXException
      */
     public HtmlInputStreamReader(InputStream inputStream,
             ErrorHandler errorHandler, Tokenizer tokenizer, Driver driver,
-            Heuristics heuristics) throws SAXException, IOException {
+            Heuristics heuristics, int sniffingLimit)
+            throws SAXException, IOException {
         this.inputStream = inputStream;
         this.errorHandler = errorHandler;
         this.tokenizer = tokenizer;
         this.driver = driver;
         this.sniffing = true;
+        if (sniffingLimit != -1) {
+            this.sniffingLimit = sniffingLimit;
+            this.byteArray = new byte[sniffingLimit];
+            this.byteBuffer = ByteBuffer.wrap(byteArray);
+        }
         Encoding encoding = (new BomSniffer(this)).sniff();
         if (encoding == null) {
             position = 0;
@@ -178,6 +185,12 @@ public HtmlInputStreamReader(InputStream inputStream,
         initDecoder();
     }
 
+    public HtmlInputStreamReader(InputStream inputStream,
+            ErrorHandler errorHandler, Tokenizer tokenizer, Driver driver,
+            Heuristics heuristics) throws SAXException, IOException {
+        this(inputStream, errorHandler, tokenizer, driver, heuristics, -1);
+    }
+
     /**
      * 
      */
@@ -237,7 +250,7 @@ public HtmlInputStreamReader(InputStream inputStream,
                 if (charsetBoundaryPassed) {
                     readLen = byteArray.length - oldLimit;
                 } else {
-                    readLen = SNIFFING_LIMIT - oldLimit;
+                    readLen = sniffingLimit - oldLimit;
                 }
                 int num = inputStream.read(byteArray, oldLimit, readLen);
                 if (num == -1) {
@@ -261,7 +274,7 @@ public HtmlInputStreamReader(InputStream inputStream,
                 } else if (cr == CoderResult.UNDERFLOW) {
                     int remaining = byteBuffer.remaining();
                     if (!charsetBoundaryPassed) {
-                        if (bytesRead + remaining >= SNIFFING_LIMIT) {
+                        if (bytesRead + remaining >= sniffingLimit) {
                             needToNotifyTokenizer = true;
                             charsetBoundaryPassed = true;
                         }
@@ -389,12 +402,12 @@ public int readByte() throws IOException {
             throw new IllegalStateException(
                     "readByte() called when not in the sniffing state.");
         }
-        if (position == SNIFFING_LIMIT) {
+        if (position == sniffingLimit) {
             return -1;
         } else if (position < limit) {
             return byteArray[position++] & 0xFF;
         } else {
-            int num = inputStream.read(byteArray, limit, SNIFFING_LIMIT - limit);
+            int num = inputStream.read(byteArray, limit, sniffingLimit - limit);
             if (num == -1) {
                 return -1;
             } else {

From e8e4a25fe1c533c7c1f43b2b43926d90672ebe2d Mon Sep 17 00:00:00 2001
From: "Michael[tm] Smith" <mike@w3.org>
Date: Fri, 21 Aug 2020 13:49:57 +0900
Subject: [PATCH 08/15] Make EncodingTester usable in testing parsed state

This change updates EncodingTester to pass a sniffing limit of 16384
bytes to HtmlInputStreamReader, so that it can test cases where the
expected character encoding cannot be determined by checking only the
first 1024 bytes of the input stream.

Otherwise, without this change, EncodingTester is limited to only being
useful for testing the output of the meta prescan.
---
 test-src/nu/validator/htmlparser/test/EncodingTester.java | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/test-src/nu/validator/htmlparser/test/EncodingTester.java b/test-src/nu/validator/htmlparser/test/EncodingTester.java
index da588eb2..d2a3c9d5 100644
--- a/test-src/nu/validator/htmlparser/test/EncodingTester.java
+++ b/test-src/nu/validator/htmlparser/test/EncodingTester.java
@@ -38,6 +38,8 @@ public class EncodingTester {
 
     private static int exitStatus = 0;
 
+    protected static int SNIFFING_LIMIT = 16384;
+
     private final InputStream aggregateStream;
 
     private final StringBuilder builder = new StringBuilder();
@@ -61,7 +63,7 @@ private boolean runTest() throws IOException, SAXException {
         }
         UntilHashInputStream stream = new UntilHashInputStream(aggregateStream);
         HtmlInputStreamReader reader = new HtmlInputStreamReader(stream, null,
-                null, null, Heuristics.NONE);
+                null, null, Heuristics.NONE, SNIFFING_LIMIT);
         Charset charset = reader.getCharset();
         stream.close();
         if (skipLabel()) {

From 196bd48a97057a56df7c4faf56c49e1551e7edeb Mon Sep 17 00:00:00 2001
From: "Michael[tm] Smith" <mike@w3.org>
Date: Wed, 26 Aug 2020 16:52:05 +0900
Subject: [PATCH 09/15] Make TokenizerTester() constructor public
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This change makes the TokenizerTester(InputStream stream) constructor
public — as the corresponding constructors for TreeTester and
EncodingTester already are.
---
 test-src/nu/validator/htmlparser/test/TokenizerTester.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test-src/nu/validator/htmlparser/test/TokenizerTester.java b/test-src/nu/validator/htmlparser/test/TokenizerTester.java
index b4013234..ead1dba9 100644
--- a/test-src/nu/validator/htmlparser/test/TokenizerTester.java
+++ b/test-src/nu/validator/htmlparser/test/TokenizerTester.java
@@ -97,7 +97,7 @@ private static boolean jsonDeepEquals(JSONValue one, JSONValue other) {
 
     private final Writer writer;
 
-    private TokenizerTester(InputStream stream) throws TokenStreamException,
+    public TokenizerTester(InputStream stream) throws TokenStreamException,
             RecognitionException, UnsupportedEncodingException {
         tokenHandler = new JSONArrayTokenHandler();
         driver = new Driver(new ErrorReportingTokenizer(tokenHandler));

From d4edb40a2810d2e762ad862bae0ff6e76592747f Mon Sep 17 00:00:00 2001
From: Anthony Vanelverdinghe <dev@anthonyv.be>
Date: Sat, 22 Aug 2020 13:07:01 +0200
Subject: [PATCH 10/15] Add Html5libTest

---
 .../htmlparser/test/EncodingTester.java       |  4 +-
 .../htmlparser/test/Html5libTest.java         | 86 +++++++++++++++++++
 .../htmlparser/test/TokenizerTester.java      |  4 +-
 .../validator/htmlparser/test/TreeTester.java |  4 +-
 4 files changed, 92 insertions(+), 6 deletions(-)
 create mode 100644 test-src/nu/validator/htmlparser/test/Html5libTest.java

diff --git a/test-src/nu/validator/htmlparser/test/EncodingTester.java b/test-src/nu/validator/htmlparser/test/EncodingTester.java
index d2a3c9d5..01d164c8 100644
--- a/test-src/nu/validator/htmlparser/test/EncodingTester.java
+++ b/test-src/nu/validator/htmlparser/test/EncodingTester.java
@@ -36,7 +36,7 @@
 
 public class EncodingTester {
 
-    private static int exitStatus = 0;
+    static int exitStatus = 0;
 
     protected static int SNIFFING_LIMIT = 16384;
 
@@ -51,7 +51,7 @@ public EncodingTester(InputStream aggregateStream) {
         this.aggregateStream = aggregateStream;
     }
 
-    private void runTests() throws IOException, SAXException {
+    void runTests() throws IOException, SAXException {
         while (runTest()) {
             // spin
         }
diff --git a/test-src/nu/validator/htmlparser/test/Html5libTest.java b/test-src/nu/validator/htmlparser/test/Html5libTest.java
new file mode 100644
index 00000000..2b336b7a
--- /dev/null
+++ b/test-src/nu/validator/htmlparser/test/Html5libTest.java
@@ -0,0 +1,86 @@
+package nu.validator.htmlparser.test;
+
+import java.io.IOException;
+import java.net.URISyntaxException;
+import java.nio.file.FileVisitResult;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.SimpleFileVisitor;
+import java.nio.file.attribute.BasicFileAttributes;
+import java.util.function.Consumer;
+
+public class Html5libTest {
+
+    private final Path testDir;
+
+    public Html5libTest() throws URISyntaxException {
+        this.testDir = Path.of(Html5libTest.class.getResource("/html5lib-tests").toURI());
+    }
+
+    public void testEncoding() throws Exception {
+        Files.walkFileTree(testDir.resolve("encoding"), new TestVisitor(true, false, file -> new EncodingTester(Files.newInputStream(file)).runTests()));
+        if(EncodingTester.exitStatus != 0) {
+            assert false : "Encoding test failed";
+        }
+    }
+
+    public void testTokenizer() throws Exception {
+        Files.walkFileTree(testDir.resolve("tokenizer"), new TestVisitor(true, true, file -> new TokenizerTester(Files.newInputStream(file)).runTests()));
+        if(TokenizerTester.exitStatus != 0) {
+            assert false : "Tokenizer test failed";
+        }
+    }
+
+    public void testTree() throws Exception {
+        Files.walkFileTree(testDir.resolve("tree-construction"), new TestVisitor(true, false, file -> new TreeTester(Files.newInputStream(file)).runTests()));
+        if(TreeTester.exitStatus != 0) {
+            assert false : "Tree test failed";
+        }
+    }
+
+    private interface TestConsumer extends Consumer<Path> {
+
+        @Override
+        default void accept(Path t) {
+            try {
+                acceptTest(t);
+            } catch(Throwable e) {
+                throw new AssertionError(e);
+            }
+        }
+
+        void acceptTest(Path t) throws Throwable;
+
+    }
+
+    private static class TestVisitor extends SimpleFileVisitor<Path> {
+
+        private final boolean skipScripted;
+        private final boolean requireTestExtension;
+        private final TestConsumer runner;
+
+        private TestVisitor(boolean skipScripted, boolean requireTestExtension, TestConsumer runner) {
+            this.skipScripted = skipScripted;
+            this.requireTestExtension = requireTestExtension;
+            this.runner = runner;
+        }
+
+        @Override
+        public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) throws IOException {
+            if (skipScripted && dir.getFileName().equals(Path.of("scripted"))) {
+                return FileVisitResult.SKIP_SUBTREE;
+            }
+
+            return FileVisitResult.CONTINUE;
+        }
+
+        @Override
+        public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
+            if (!requireTestExtension || file.getFileName().toString().endsWith(".test")) {
+                runner.accept(file);
+            }
+            return FileVisitResult.CONTINUE;
+        }
+    }
+
+}
diff --git a/test-src/nu/validator/htmlparser/test/TokenizerTester.java b/test-src/nu/validator/htmlparser/test/TokenizerTester.java
index ead1dba9..2db0395b 100644
--- a/test-src/nu/validator/htmlparser/test/TokenizerTester.java
+++ b/test-src/nu/validator/htmlparser/test/TokenizerTester.java
@@ -51,7 +51,7 @@
 
 public class TokenizerTester {
 
-    private static int exitStatus = 0;
+    static int exitStatus = 0;
 
     private static JSONString PLAINTEXT = new JSONString("PLAINTEXT state");
 
@@ -121,7 +121,7 @@ public TokenizerTester(InputStream stream) throws TokenStreamException,
         }
     }
 
-    private void runTests() throws SAXException, IOException {
+    void runTests() throws SAXException, IOException {
         for (JSONValue val : tests.getValue()) {
             runTest((JSONObject) val);
         }
diff --git a/test-src/nu/validator/htmlparser/test/TreeTester.java b/test-src/nu/validator/htmlparser/test/TreeTester.java
index 1327e9ac..f3efa268 100644
--- a/test-src/nu/validator/htmlparser/test/TreeTester.java
+++ b/test-src/nu/validator/htmlparser/test/TreeTester.java
@@ -43,7 +43,7 @@ public class TreeTester {
 
     private boolean streaming = false;
 
-    private static int exitStatus = 0;
+    static int exitStatus = 0;
 
     /**
      * @param aggregateStream
@@ -52,7 +52,7 @@ public TreeTester(InputStream aggregateStream) {
         this.aggregateStream = new BufferedInputStream(aggregateStream);
     }
 
-    private void runTests() throws Throwable {
+    void runTests() throws Throwable {
         if (aggregateStream.read() != '#') {
             System.err.println("No hash at start!");
             return;

From ef95b9243dd91b0d42b07c2fcd1137e7d05d4de4 Mon Sep 17 00:00:00 2001
From: "Michael[tm] Smith" <mike@w3.org>
Date: Sun, 23 Aug 2020 22:10:49 +0900
Subject: [PATCH 11/15] Add html5lib-tests as a submodule

---
 .gitmodules                            |  3 +++
 pom.xml                                | 10 +++++++---
 test-src/test/resources/html5lib-tests |  1 +
 3 files changed, 11 insertions(+), 3 deletions(-)
 create mode 100644 .gitmodules
 create mode 160000 test-src/test/resources/html5lib-tests

diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 00000000..b469982b
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "test-src/test/resources/html5lib-tests"]
+	path = test-src/test/resources/html5lib-tests
+	url = git@github.com:html5lib/html5lib-tests.git
diff --git a/pom.xml b/pom.xml
index 0ce02f71..a03072f0 100644
--- a/pom.xml
+++ b/pom.xml
@@ -80,8 +80,8 @@
         <groupId>org.apache.maven.plugins</groupId>
         <artifactId>maven-compiler-plugin</artifactId>
         <configuration>
-          <source>1.5</source>
-          <target>1.5</target>
+          <source>1.8</source>
+          <target>1.8</target>
         </configuration>
       </plugin>
       <plugin>
@@ -136,7 +136,11 @@
         <groupId>org.apache.maven.plugins</groupId>
         <artifactId>maven-surefire-plugin</artifactId>
         <configuration>
-          <skip>true</skip>
+          <test>Html5libTest</test>
+          <testFailureIgnore>true</testFailureIgnore> <!-- FIXME: Remove this testFailureIgnore after we have all tests passing -->
+          <additionalClasspathElements>
+            <additionalClasspathElement>${project.build.testSourceDirectory}/test/resources</additionalClasspathElement>
+          </additionalClasspathElements>
         </configuration>
       </plugin>
       <plugin>
diff --git a/test-src/test/resources/html5lib-tests b/test-src/test/resources/html5lib-tests
new file mode 160000
index 00000000..6ddcf58b
--- /dev/null
+++ b/test-src/test/resources/html5lib-tests
@@ -0,0 +1 @@
+Subproject commit 6ddcf58bea5a01e616911050c173622f84297211

From fcd18b19e5e38297f0a02b29ae96d538e0c57f56 Mon Sep 17 00:00:00 2001
From: "Michael[tm] Smith" <mike@w3.org>
Date: Tue, 25 Aug 2020 16:48:16 +0900
Subject: [PATCH 12/15] Html5libTest: Add copyright; match project style

---
 .../htmlparser/test/Html5libTest.java         | 59 +++++++++++++++----
 1 file changed, 47 insertions(+), 12 deletions(-)

diff --git a/test-src/nu/validator/htmlparser/test/Html5libTest.java b/test-src/nu/validator/htmlparser/test/Html5libTest.java
index 2b336b7a..9c18e749 100644
--- a/test-src/nu/validator/htmlparser/test/Html5libTest.java
+++ b/test-src/nu/validator/htmlparser/test/Html5libTest.java
@@ -1,3 +1,25 @@
+/*
+ * Copyright (c) 2020 Mozilla Foundation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
 package nu.validator.htmlparser.test;
 
 import java.io.IOException;
@@ -14,26 +36,33 @@ public class Html5libTest {
     private final Path testDir;
 
     public Html5libTest() throws URISyntaxException {
-        this.testDir = Path.of(Html5libTest.class.getResource("/html5lib-tests").toURI());
+        this.testDir = Path.of(
+                Html5libTest.class.getResource("/html5lib-tests").toURI());
     }
 
     public void testEncoding() throws Exception {
-        Files.walkFileTree(testDir.resolve("encoding"), new TestVisitor(true, false, file -> new EncodingTester(Files.newInputStream(file)).runTests()));
-        if(EncodingTester.exitStatus != 0) {
+        Files.walkFileTree(testDir.resolve("encoding"), //
+                new TestVisitor(true, false, file -> //
+                new EncodingTester(Files.newInputStream(file)).runTests()));
+        if (EncodingTester.exitStatus != 0) {
             assert false : "Encoding test failed";
         }
     }
 
     public void testTokenizer() throws Exception {
-        Files.walkFileTree(testDir.resolve("tokenizer"), new TestVisitor(true, true, file -> new TokenizerTester(Files.newInputStream(file)).runTests()));
-        if(TokenizerTester.exitStatus != 0) {
+        Files.walkFileTree(testDir.resolve("tokenizer"),
+                new TestVisitor(true, true, file -> //
+                new TokenizerTester(Files.newInputStream(file)).runTests()));
+        if (TokenizerTester.exitStatus != 0) {
             assert false : "Tokenizer test failed";
         }
     }
 
     public void testTree() throws Exception {
-        Files.walkFileTree(testDir.resolve("tree-construction"), new TestVisitor(true, false, file -> new TreeTester(Files.newInputStream(file)).runTests()));
-        if(TreeTester.exitStatus != 0) {
+        Files.walkFileTree(testDir.resolve("tree-construction"),
+                new TestVisitor(true, false, file -> //
+                new TreeTester(Files.newInputStream(file)).runTests()));
+        if (TreeTester.exitStatus != 0) {
             assert false : "Tree test failed";
         }
     }
@@ -44,7 +73,7 @@ private interface TestConsumer extends Consumer<Path> {
         default void accept(Path t) {
             try {
                 acceptTest(t);
-            } catch(Throwable e) {
+            } catch (Throwable e) {
                 throw new AssertionError(e);
             }
         }
@@ -56,17 +85,21 @@ default void accept(Path t) {
     private static class TestVisitor extends SimpleFileVisitor<Path> {
 
         private final boolean skipScripted;
+
         private final boolean requireTestExtension;
+
         private final TestConsumer runner;
 
-        private TestVisitor(boolean skipScripted, boolean requireTestExtension, TestConsumer runner) {
+        private TestVisitor(boolean skipScripted, boolean requireTestExtension,
+                TestConsumer runner) {
             this.skipScripted = skipScripted;
             this.requireTestExtension = requireTestExtension;
             this.runner = runner;
         }
 
         @Override
-        public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) throws IOException {
+        public FileVisitResult preVisitDirectory(Path dir,
+                BasicFileAttributes attrs) throws IOException {
             if (skipScripted && dir.getFileName().equals(Path.of("scripted"))) {
                 return FileVisitResult.SKIP_SUBTREE;
             }
@@ -75,8 +108,10 @@ public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) th
         }
 
         @Override
-        public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
-            if (!requireTestExtension || file.getFileName().toString().endsWith(".test")) {
+        public FileVisitResult visitFile(Path file, BasicFileAttributes attrs)
+                throws IOException {
+            if (!requireTestExtension
+                    || file.getFileName().toString().endsWith(".test")) {
                 runner.accept(file);
             }
             return FileVisitResult.CONTINUE;

From fd2c2601079118649e6fa73acd207e72825e697a Mon Sep 17 00:00:00 2001
From: "Michael[tm] Smith" <mike@w3.org>
Date: Sun, 30 Aug 2020 20:21:20 +0900
Subject: [PATCH 13/15] Make Html5libTest only check .dat and .test files
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This change refines how Html5libTest handles filenames; it adds a
mechanism to allow a required/expected file extension to be specified
for each test type, and uses the mechanism to specify that ".test" is
the required/expected extension for tokenizer tests, and that ".dat" is
the required/expected extension for tree-construction test files and for
encoding test files.

Without this change, Html5libTest only deals correctly with the ".test"
extension for tokenizer test files — but not with the ".dat" extension
for tree-construction test files and encoding test files.
---
 .../validator/htmlparser/test/Html5libTest.java   | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/test-src/nu/validator/htmlparser/test/Html5libTest.java b/test-src/nu/validator/htmlparser/test/Html5libTest.java
index 9c18e749..3958f347 100644
--- a/test-src/nu/validator/htmlparser/test/Html5libTest.java
+++ b/test-src/nu/validator/htmlparser/test/Html5libTest.java
@@ -42,7 +42,7 @@ public Html5libTest() throws URISyntaxException {
 
     public void testEncoding() throws Exception {
         Files.walkFileTree(testDir.resolve("encoding"), //
-                new TestVisitor(true, false, file -> //
+                new TestVisitor(true, ".dat", file -> //
                 new EncodingTester(Files.newInputStream(file)).runTests()));
         if (EncodingTester.exitStatus != 0) {
             assert false : "Encoding test failed";
@@ -51,7 +51,7 @@ public void testEncoding() throws Exception {
 
     public void testTokenizer() throws Exception {
         Files.walkFileTree(testDir.resolve("tokenizer"),
-                new TestVisitor(true, true, file -> //
+                new TestVisitor(true, ".test", file -> //
                 new TokenizerTester(Files.newInputStream(file)).runTests()));
         if (TokenizerTester.exitStatus != 0) {
             assert false : "Tokenizer test failed";
@@ -60,7 +60,7 @@ public void testTokenizer() throws Exception {
 
     public void testTree() throws Exception {
         Files.walkFileTree(testDir.resolve("tree-construction"),
-                new TestVisitor(true, false, file -> //
+                new TestVisitor(true, ".dat", file -> //
                 new TreeTester(Files.newInputStream(file)).runTests()));
         if (TreeTester.exitStatus != 0) {
             assert false : "Tree test failed";
@@ -86,14 +86,14 @@ private static class TestVisitor extends SimpleFileVisitor<Path> {
 
         private final boolean skipScripted;
 
-        private final boolean requireTestExtension;
+        private final String requiredTestExtension;
 
         private final TestConsumer runner;
 
-        private TestVisitor(boolean skipScripted, boolean requireTestExtension,
+        private TestVisitor(boolean skipScripted, String requiredTestExtension,
                 TestConsumer runner) {
             this.skipScripted = skipScripted;
-            this.requireTestExtension = requireTestExtension;
+            this.requiredTestExtension = requiredTestExtension;
             this.runner = runner;
         }
 
@@ -110,8 +110,7 @@ public FileVisitResult preVisitDirectory(Path dir,
         @Override
         public FileVisitResult visitFile(Path file, BasicFileAttributes attrs)
                 throws IOException {
-            if (!requireTestExtension
-                    || file.getFileName().toString().endsWith(".test")) {
+            if (file.getFileName().toString().endsWith(requiredTestExtension)) {
                 runner.accept(file);
             }
             return FileVisitResult.CONTINUE;

From 8a48cb048d73a84540f7a146addc06c6078679d0 Mon Sep 17 00:00:00 2001
From: "Michael[tm] Smith" <mike@w3.org>
Date: Sun, 30 Aug 2020 22:29:24 +0900
Subject: [PATCH 14/15] Make Html5libTest handle double-escaped tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This change makes Html5libTest correctly handle tests in the
html5lib-tests suite which have cases with so-called “double-escaped”
“input” and “output” values — for example, values that contain the
literals “\\u0000” and “\\uFFFD” rather than “\u0000” and “\uFFFD”.
---
 .../nu/validator/htmlparser/test/Html5libTest.java  | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/test-src/nu/validator/htmlparser/test/Html5libTest.java b/test-src/nu/validator/htmlparser/test/Html5libTest.java
index 3958f347..67770481 100644
--- a/test-src/nu/validator/htmlparser/test/Html5libTest.java
+++ b/test-src/nu/validator/htmlparser/test/Html5libTest.java
@@ -22,8 +22,10 @@
 
 package nu.validator.htmlparser.test;
 
+import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.net.URISyntaxException;
+import java.nio.charset.StandardCharsets;
 import java.nio.file.FileVisitResult;
 import java.nio.file.Files;
 import java.nio.file.Path;
@@ -52,7 +54,7 @@ public void testEncoding() throws Exception {
     public void testTokenizer() throws Exception {
         Files.walkFileTree(testDir.resolve("tokenizer"),
                 new TestVisitor(true, ".test", file -> //
-                new TokenizerTester(Files.newInputStream(file)).runTests()));
+                new TokenizerTester(getDoubleEscapedInput(file)).runTests()));
         if (TokenizerTester.exitStatus != 0) {
             assert false : "Tokenizer test failed";
         }
@@ -67,6 +69,15 @@ public void testTree() throws Exception {
         }
     }
 
+    private ByteArrayInputStream getDoubleEscapedInput(Path file)
+            throws IOException {
+        byte[] fileBytes = Files.readAllBytes(file);
+        String fileContent = new String(fileBytes, StandardCharsets.UTF_8);
+        String unescapedContent = fileContent.replace("\\\\u", "\\u");
+        byte[] newBytes = unescapedContent.getBytes(StandardCharsets.UTF_8);
+        return new ByteArrayInputStream(newBytes);
+    }
+
     private interface TestConsumer extends Consumer<Path> {
 
         @Override

From 390d9a91e8b5008516d25e648a514dafc32bb833 Mon Sep 17 00:00:00 2001
From: "Michael[tm] Smith" <mike@w3.org>
Date: Tue, 1 Sep 2020 12:34:18 +0900
Subject: [PATCH 15/15] Use Paths.get rather than Path.of in Html5libTest
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This change replaces Path.of() calls in Html5libTest with Paths.get().

Per https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/nio/file/Path.html#of(java.net.URI)
Path.of() was introduced in Java 11. So Java 8 has no Path.of(); see
also https://docs.oracle.com/javase/8/docs/api/java/nio/file/Path.html

We need to continue to support Java 8 for the time being. It seems
Paths.get() will eventually end up being formally deprecated; by the
time it finally is, we may also be able to quit supporting Java 8 — and
so we can just switch to Path.of() then.
---
 test-src/nu/validator/htmlparser/test/Html5libTest.java | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/test-src/nu/validator/htmlparser/test/Html5libTest.java b/test-src/nu/validator/htmlparser/test/Html5libTest.java
index 67770481..724062e2 100644
--- a/test-src/nu/validator/htmlparser/test/Html5libTest.java
+++ b/test-src/nu/validator/htmlparser/test/Html5libTest.java
@@ -29,6 +29,7 @@
 import java.nio.file.FileVisitResult;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.nio.file.Paths;
 import java.nio.file.SimpleFileVisitor;
 import java.nio.file.attribute.BasicFileAttributes;
 import java.util.function.Consumer;
@@ -38,7 +39,7 @@ public class Html5libTest {
     private final Path testDir;
 
     public Html5libTest() throws URISyntaxException {
-        this.testDir = Path.of(
+        this.testDir = Paths.get(
                 Html5libTest.class.getResource("/html5lib-tests").toURI());
     }
 
@@ -111,7 +112,8 @@ private TestVisitor(boolean skipScripted, String requiredTestExtension,
         @Override
         public FileVisitResult preVisitDirectory(Path dir,
                 BasicFileAttributes attrs) throws IOException {
-            if (skipScripted && dir.getFileName().equals(Path.of("scripted"))) {
+            if (skipScripted
+                    && dir.getFileName().equals(Paths.get("scripted"))) {
                 return FileVisitResult.SKIP_SUBTREE;
             }