From 7f833016154bc2cd22c73c3a39204c94c9290571 Mon Sep 17 00:00:00 2001 From: Hsiang-Fu Yu Date: Mon, 26 Apr 2021 20:42:01 +0000 Subject: [PATCH] Initial commit --- .github/ISSUE_TEMPLATE/bug_report.md | 44 + .github/ISSUE_TEMPLATE/feature_request.md | 17 + .github/PULL_REQUEST_TEMPLATE.md | 6 + .github/build_pypi_wheel.sh | 35 + .github/workflows/build_pypi.yml | 54 + .github/workflows/codeql.yml | 50 + .github/workflows/pytest.yml | 31 + .github/workflows/style_type_check.yml | 32 + .style_type.cfg | 7 + CODE_OF_CONDUCT.md | 4 + CONTRIBUTING.md | 89 + LICENSE | 201 + MANIFEST.in | 8 + NOTICE | 1 + README.md | 147 + THIRD-PARTY-LICENSES.txt | 123 + pecos/__init__.py | 100 + pecos/apps/__init__.py | 10 + pecos/apps/text2text/README.md | 88 + pecos/apps/text2text/__init__.py | 11 + pecos/apps/text2text/evaluate.py | 130 + pecos/apps/text2text/model.py | 488 + pecos/apps/text2text/predict.py | 194 + pecos/apps/text2text/train.py | 308 + pecos/core/__init__.py | 21 + pecos/core/base.py | 1300 + pecos/core/libpecos.cpp | 276 + pecos/core/third_party/nlohmann_json/json.hpp | 22875 ++++++++++++++++ .../robin_hood_hashing/robin_hood.h | 2665 ++ pecos/core/utils/clustering.hpp | 371 + pecos/core/utils/file_util.hpp | 129 + pecos/core/utils/matrix.hpp | 1038 + pecos/core/utils/parallel.hpp | 75 + pecos/core/utils/random.hpp | 47 + pecos/core/utils/scipy_loader.hpp | 436 + pecos/core/utils/tfidf.hpp | 1465 + pecos/core/xmc/inference.hpp | 1971 ++ pecos/core/xmc/linear_solver.hpp | 585 + pecos/utils/__init__.py | 10 + pecos/utils/cli.py | 59 + pecos/utils/cluster_util.py | 350 + pecos/utils/featurization/__init__.py | 10 + pecos/utils/featurization/text/README.md | 86 + pecos/utils/featurization/text/__init__.py | 10 + pecos/utils/featurization/text/preprocess.py | 377 + .../text/sentencepiece/__init__.py | 11 + .../featurization/text/sentencepiece/model.py | 355 + .../text/sentencepiece/predict.py | 71 + .../featurization/text/sentencepiece/train.py | 113 + pecos/utils/featurization/text/vectorizers.py | 814 + pecos/utils/logging_util.py | 37 + pecos/utils/parallel_util.py | 32 + pecos/utils/smat_util.py | 823 + pecos/utils/torch_util.py | 67 + pecos/xmc/__init__.py | 18 + pecos/xmc/base.py | 1685 ++ pecos/xmc/xlinear/README.md | 100 + pecos/xmc/xlinear/__init__.py | 11 + pecos/xmc/xlinear/evaluate.py | 63 + pecos/xmc/xlinear/model.py | 388 + pecos/xmc/xlinear/predict.py | 159 + pecos/xmc/xlinear/train.py | 328 + pecos/xmc/xtransformer/README.md | 59 + pecos/xmc/xtransformer/__init__.py | 12 + pecos/xmc/xtransformer/matcher.py | 1439 + pecos/xmc/xtransformer/model.py | 529 + pecos/xmc/xtransformer/module.py | 114 + pecos/xmc/xtransformer/network.py | 394 + pecos/xmc/xtransformer/predict.py | 181 + pecos/xmc/xtransformer/train.py | 567 + setup.cfg | 46 + setup.py | 174 + test/pecos/apps/text2text/test_text2text.py | 194 + test/pecos/test_pecos.py | 15 + .../text/sentencepiece/test_sentencepiece.py | 92 + .../featurization/text/test_preprocessor.py | 159 + .../featurization/text/test_vectorizer.py | 281 + test/pecos/utils/test_cluster_util.py | 74 + test/pecos/utils/test_smat_utils.py | 163 + test/pecos/utils/test_torch_util.py | 31 + test/pecos/utils/test_utils.py | 17 + test/pecos/xmc/test_xmc.py | 245 + test/pecos/xmc/xlinear/test_xlinear.py | 745 + test/pecos/xmc/xtransformer/test_model.py | 17 + .../xmc/xtransformer/test_xtransformer.py | 121 + test/tst-data/apps/text2text/query_text.txt | 20 + .../apps/text2text/query_to_keep_text.txt | 5 + .../sklearn_tfidf_true_pred_items.txt | 2 + 
.../sklearn_tfidf_true_pred_items_keep.txt | 2 + test/tst-data/apps/text2text/test.txt | 2 + test/tst-data/apps/text2text/train.txt | 8 + .../apps/text2text/true_pred_items.txt | 2 + test/tst-data/apps/text2text/truth_items.txt | 2 + .../utils/data/dummy_transformer_config.json | 36 + .../utils/data/dummy_transformer_model.bin | Bin 0 -> 15759 bytes .../utils/data/hashing_vectorizer_config.json | 7 + test/tst-data/utils/data/label_vocab.txt | 2 + .../utils/data/sentencepiece_train.src | 20 + .../utils/data/sentencepiece_train.tgt | 20 + .../data/sentencepiece_word_tokenized.src | 20 + .../data/sklearn_tfidf_vectorizer_config.json | 7 + .../utils/data/tfidf_vectorizer_config.json | 9 + .../tst-data/utils/data/train.hashing.tgt.npz | Bin 0 -> 1007 bytes .../utils/data/train.sklearn_tfidf.tgt.npz | Bin 0 -> 998 bytes test/tst-data/utils/data/train.src | 2 + test/tst-data/utils/data/train.tfidf.tgt.npz | Bin 0 -> 1241 bytes .../utils/data/train.transformer.tgt.npy | Bin 0 -> 192 bytes test/tst-data/utils/data/train_text.src | 2 + .../data/transformer_vectorizer_config.json | 10 + test/tst-data/utils/data/vocab.txt | 15 + test/tst-data/xmc/xlinear/L.pifa.npz | Bin 0 -> 1002 bytes test/tst-data/xmc/xlinear/P:nr_splits=2.npz | Bin 0 -> 956 bytes test/tst-data/xmc/xlinear/P:nr_splits=4.npz | Bin 0 -> 936 bytes test/tst-data/xmc/xlinear/X.npz | Bin 0 -> 1001 bytes test/tst-data/xmc/xlinear/Xt.npz | Bin 0 -> 1001 bytes test/tst-data/xmc/xlinear/Y.npz | Bin 0 -> 866 bytes test/tst-data/xmc/xlinear/Yt.npz | Bin 0 -> 866 bytes test/tst-data/xmc/xlinear/Yt_pred.npz | Bin 0 -> 1067 bytes .../xmc/xlinear/Yt_pred_with_tfn+man.npz | Bin 0 -> 948 bytes test/tst-data/xmc/xtransformer/clusters.npz | Bin 0 -> 1309 bytes .../xmc/xtransformer/dense_train_feat.npy | Bin 0 -> 248 bytes .../xmc/xtransformer/saved_model/C.npz | Bin 0 -> 1309 bytes .../saved_model/encoder/config.json | 37 + .../saved_model/encoder/pytorch_model.bin | Bin 0 -> 16473 bytes .../saved_model/text_encoder/config.json | 37 + .../text_encoder/pytorch_model.bin | Bin 0 -> 16473 bytes .../saved_model/text_tokenizer/vocab.txt | 15 + .../saved_model/tokenizer/vocab.txt | 15 + test/tst-data/xmc/xtransformer/train.txt | 3 + test/tst-data/xmc/xtransformer/train_feat.npz | Bin 0 -> 911 bytes .../tst-data/xmc/xtransformer/train_label.npz | Bin 0 -> 1337 bytes 131 files changed, 47378 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md create mode 100644 .github/ISSUE_TEMPLATE/feature_request.md create mode 100644 .github/PULL_REQUEST_TEMPLATE.md create mode 100755 .github/build_pypi_wheel.sh create mode 100644 .github/workflows/build_pypi.yml create mode 100644 .github/workflows/codeql.yml create mode 100644 .github/workflows/pytest.yml create mode 100644 .github/workflows/style_type_check.yml create mode 100644 .style_type.cfg create mode 100644 CODE_OF_CONDUCT.md create mode 100644 CONTRIBUTING.md create mode 100644 LICENSE create mode 100644 MANIFEST.in create mode 100644 NOTICE create mode 100644 README.md create mode 100644 THIRD-PARTY-LICENSES.txt create mode 100644 pecos/__init__.py create mode 100644 pecos/apps/__init__.py create mode 100644 pecos/apps/text2text/README.md create mode 100644 pecos/apps/text2text/__init__.py create mode 100644 pecos/apps/text2text/evaluate.py create mode 100644 pecos/apps/text2text/model.py create mode 100644 pecos/apps/text2text/predict.py create mode 100644 pecos/apps/text2text/train.py create mode 100644 pecos/core/__init__.py create mode 100644 pecos/core/base.py create mode 100644 
pecos/core/libpecos.cpp create mode 100644 pecos/core/third_party/nlohmann_json/json.hpp create mode 100644 pecos/core/third_party/robin_hood_hashing/robin_hood.h create mode 100644 pecos/core/utils/clustering.hpp create mode 100644 pecos/core/utils/file_util.hpp create mode 100644 pecos/core/utils/matrix.hpp create mode 100644 pecos/core/utils/parallel.hpp create mode 100644 pecos/core/utils/random.hpp create mode 100644 pecos/core/utils/scipy_loader.hpp create mode 100644 pecos/core/utils/tfidf.hpp create mode 100644 pecos/core/xmc/inference.hpp create mode 100644 pecos/core/xmc/linear_solver.hpp create mode 100644 pecos/utils/__init__.py create mode 100644 pecos/utils/cli.py create mode 100644 pecos/utils/cluster_util.py create mode 100644 pecos/utils/featurization/__init__.py create mode 100644 pecos/utils/featurization/text/README.md create mode 100644 pecos/utils/featurization/text/__init__.py create mode 100644 pecos/utils/featurization/text/preprocess.py create mode 100644 pecos/utils/featurization/text/sentencepiece/__init__.py create mode 100644 pecos/utils/featurization/text/sentencepiece/model.py create mode 100644 pecos/utils/featurization/text/sentencepiece/predict.py create mode 100644 pecos/utils/featurization/text/sentencepiece/train.py create mode 100644 pecos/utils/featurization/text/vectorizers.py create mode 100644 pecos/utils/logging_util.py create mode 100644 pecos/utils/parallel_util.py create mode 100644 pecos/utils/smat_util.py create mode 100644 pecos/utils/torch_util.py create mode 100644 pecos/xmc/__init__.py create mode 100644 pecos/xmc/base.py create mode 100644 pecos/xmc/xlinear/README.md create mode 100644 pecos/xmc/xlinear/__init__.py create mode 100644 pecos/xmc/xlinear/evaluate.py create mode 100644 pecos/xmc/xlinear/model.py create mode 100644 pecos/xmc/xlinear/predict.py create mode 100644 pecos/xmc/xlinear/train.py create mode 100644 pecos/xmc/xtransformer/README.md create mode 100644 pecos/xmc/xtransformer/__init__.py create mode 100644 pecos/xmc/xtransformer/matcher.py create mode 100644 pecos/xmc/xtransformer/model.py create mode 100644 pecos/xmc/xtransformer/module.py create mode 100644 pecos/xmc/xtransformer/network.py create mode 100644 pecos/xmc/xtransformer/predict.py create mode 100644 pecos/xmc/xtransformer/train.py create mode 100644 setup.cfg create mode 100644 setup.py create mode 100644 test/pecos/apps/text2text/test_text2text.py create mode 100644 test/pecos/test_pecos.py create mode 100644 test/pecos/utils/featurization/text/sentencepiece/test_sentencepiece.py create mode 100644 test/pecos/utils/featurization/text/test_preprocessor.py create mode 100644 test/pecos/utils/featurization/text/test_vectorizer.py create mode 100644 test/pecos/utils/test_cluster_util.py create mode 100644 test/pecos/utils/test_smat_utils.py create mode 100644 test/pecos/utils/test_torch_util.py create mode 100644 test/pecos/utils/test_utils.py create mode 100644 test/pecos/xmc/test_xmc.py create mode 100644 test/pecos/xmc/xlinear/test_xlinear.py create mode 100644 test/pecos/xmc/xtransformer/test_model.py create mode 100644 test/pecos/xmc/xtransformer/test_xtransformer.py create mode 100644 test/tst-data/apps/text2text/query_text.txt create mode 100644 test/tst-data/apps/text2text/query_to_keep_text.txt create mode 100644 test/tst-data/apps/text2text/sklearn_tfidf_true_pred_items.txt create mode 100644 test/tst-data/apps/text2text/sklearn_tfidf_true_pred_items_keep.txt create mode 100644 test/tst-data/apps/text2text/test.txt create mode 100644 
test/tst-data/apps/text2text/train.txt create mode 100644 test/tst-data/apps/text2text/true_pred_items.txt create mode 100644 test/tst-data/apps/text2text/truth_items.txt create mode 100644 test/tst-data/utils/data/dummy_transformer_config.json create mode 100644 test/tst-data/utils/data/dummy_transformer_model.bin create mode 100644 test/tst-data/utils/data/hashing_vectorizer_config.json create mode 100644 test/tst-data/utils/data/label_vocab.txt create mode 100644 test/tst-data/utils/data/sentencepiece_train.src create mode 100644 test/tst-data/utils/data/sentencepiece_train.tgt create mode 100644 test/tst-data/utils/data/sentencepiece_word_tokenized.src create mode 100644 test/tst-data/utils/data/sklearn_tfidf_vectorizer_config.json create mode 100644 test/tst-data/utils/data/tfidf_vectorizer_config.json create mode 100644 test/tst-data/utils/data/train.hashing.tgt.npz create mode 100644 test/tst-data/utils/data/train.sklearn_tfidf.tgt.npz create mode 100644 test/tst-data/utils/data/train.src create mode 100644 test/tst-data/utils/data/train.tfidf.tgt.npz create mode 100644 test/tst-data/utils/data/train.transformer.tgt.npy create mode 100644 test/tst-data/utils/data/train_text.src create mode 100644 test/tst-data/utils/data/transformer_vectorizer_config.json create mode 100644 test/tst-data/utils/data/vocab.txt create mode 100644 test/tst-data/xmc/xlinear/L.pifa.npz create mode 100644 test/tst-data/xmc/xlinear/P:nr_splits=2.npz create mode 100644 test/tst-data/xmc/xlinear/P:nr_splits=4.npz create mode 100644 test/tst-data/xmc/xlinear/X.npz create mode 100644 test/tst-data/xmc/xlinear/Xt.npz create mode 100644 test/tst-data/xmc/xlinear/Y.npz create mode 100644 test/tst-data/xmc/xlinear/Yt.npz create mode 100644 test/tst-data/xmc/xlinear/Yt_pred.npz create mode 100644 test/tst-data/xmc/xlinear/Yt_pred_with_tfn+man.npz create mode 100644 test/tst-data/xmc/xtransformer/clusters.npz create mode 100644 test/tst-data/xmc/xtransformer/dense_train_feat.npy create mode 100644 test/tst-data/xmc/xtransformer/saved_model/C.npz create mode 100644 test/tst-data/xmc/xtransformer/saved_model/encoder/config.json create mode 100644 test/tst-data/xmc/xtransformer/saved_model/encoder/pytorch_model.bin create mode 100644 test/tst-data/xmc/xtransformer/saved_model/text_encoder/config.json create mode 100644 test/tst-data/xmc/xtransformer/saved_model/text_encoder/pytorch_model.bin create mode 100644 test/tst-data/xmc/xtransformer/saved_model/text_tokenizer/vocab.txt create mode 100644 test/tst-data/xmc/xtransformer/saved_model/tokenizer/vocab.txt create mode 100644 test/tst-data/xmc/xtransformer/train.txt create mode 100644 test/tst-data/xmc/xtransformer/train_feat.npz create mode 100644 test/tst-data/xmc/xtransformer/train_label.npz diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 00000000..9c500921 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,44 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: 'bug' +assignees: '' + +--- +## Description +(A clear and concise description of what the bug is.) + +## How to Reproduce? +(If you developed your own code, please provide a short script that reproduces the error. For existing examples, please provide a link.) + +### Steps to reproduce +(Please provide a minimal code snippet that reproduces the error. For existing examples, please provide a link.) + +```python +put code here +``` + +(Paste the commands you ran that produced the error.)
+ +1. +2. + +## What have you tried to solve it? + +1. +2. + +## Error message or code output +(Paste the complete error message, including stack trace, or the undesired output that the above snippet produces.) + +``` +put error or undesired output here +``` + +## Environment +- Operating system: +- Python version: +- PECOS version: + +(Add as much information about your environment as possible, e.g., dependency versions.) diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 00000000..366743f0 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,17 @@ +--- +name: Feature request +about: Suggest an enhancement, feature request, or idea for this project +title: '' +labels: 'enhancement' +assignees: '' + +--- + +## Description +(A clear and concise description of what the feature is.) +- If the proposal is about a new model, provide a description of what the model is. +- If the proposal is about an API, provide mock examples if possible. + +## References +- list references and related literature +- list known implementations diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 00000000..632000da --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,6 @@ +*Issue #, if available:* + +*Description of changes:* + + +By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license. \ No newline at end of file diff --git a/.github/build_pypi_wheel.sh b/.github/build_pypi_wheel.sh new file mode 100755 index 00000000..4fb0fb4b --- /dev/null +++ b/.github/build_pypi_wheel.sh @@ -0,0 +1,35 @@ +#!/bin/bash +set -e + +# Get pip +echo "Build wheel using Python version $PIP_VER..." +PIP=$(ls /opt/python/cp${PIP_VER//./}-cp*/bin/pip) +if [ -z "$PIP" ]; then + echo "No pip found for version $PIP_VER, exit" + exit 1 +fi +echo "pip: $($PIP --version)" + + +# Install dependencies +echo "Install dependencies..." +$PIP install setuptools wheel twine auditwheel +yum install -y openblas-devel + + +# Build wheel +PECOS_SOURCE=$DOCKER_MNT/ +WHEEL_OUTPUT_FOLDER=$DOCKER_MNT/$WHEEL_DIR + +$PIP wheel $PECOS_SOURCE --no-deps -w $WHEEL_OUTPUT_FOLDER +WHEEL_NAME=$(ls $WHEEL_OUTPUT_FOLDER) + +echo "Temporary wheel: $(ls $WHEEL_OUTPUT_FOLDER)" +auditwheel show $WHEEL_OUTPUT_FOLDER/$WHEEL_NAME + +echo "Auditing wheel for platform $PLAT..." +auditwheel repair $WHEEL_OUTPUT_FOLDER/$WHEEL_NAME -w $WHEEL_OUTPUT_FOLDER +rm $WHEEL_OUTPUT_FOLDER/$WHEEL_NAME + +echo "Audited wheel: $(ls $WHEEL_OUTPUT_FOLDER)" +auditwheel show $WHEEL_OUTPUT_FOLDER/$(ls $WHEEL_OUTPUT_FOLDER) \ No newline at end of file diff --git a/.github/workflows/build_pypi.yml b/.github/workflows/build_pypi.yml new file mode 100644 index 00000000..fb584e67 --- /dev/null +++ b/.github/workflows/build_pypi.yml @@ -0,0 +1,54 @@ +name: Pypi Build and Release + +on: [push, pull_request] + +jobs: + build: + name: Build PyPI Wheel + strategy: + max-parallel: 4 + fail-fast: false + matrix: + python-version: [3.6, 3.7, 3.8, 3.9] + build_platform: ["manylinux2014_x86_64"] + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v1 + + - name: Build Wheel + env: + DOCKER_IMG: quay.io/pypa/${{ matrix.build_platform }} + DOCKER_MNT: pypi_build + WHEEL_DIR: wheel # wheel location + run: | + echo "Build wheel for PECOS using Python version ${{ matrix.python-version }}..." 
+ docker run --rm -v $(pwd):/$DOCKER_MNT \ + --env PLAT=${{ matrix.build_platform }} --env PIP_VER=${{ matrix.python-version }} \ + --env DOCKER_MNT=$DOCKER_MNT --env WHEEL_DIR=$WHEEL_DIR \ + $DOCKER_IMG /bin/bash /$DOCKER_MNT/.github/build_pypi_wheel.sh + + - name: Check Version Tag + id: check-ver-tag + run: | + if [[ "${{ github.event_name }}" == "push" ]] && \ + [[ "${{ github.event.ref }}" =~ ^refs/tags/v[0-9]+\.[0-9]+\.[0-9]+$ ]]; then + echo ::set-output name=match::true + fi + + - name: Upload to PyPI + if: steps.check-ver-tag.outputs.match == 'true' # Only upload for tags v*.*.* + env: + TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} + TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + WHEEL_DIR: wheel # same as Build-Wheel/WHEEL_DIR + run: | + WHEEL=$(ls $(pwd)/$WHEEL_DIR) + if [ -z "$WHEEL" ]; then + echo "Wheel build not successful, exit" + exit 1 + fi + echo "Will upload $WHEEL to PyPI..." + pip install twine --upgrade + python3 -m twine upload $(pwd)/$WHEEL_DIR/$WHEEL --verbose \ No newline at end of file diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 00000000..2e273978 --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,50 @@ +name: "Code scanning - action" + +on: + push: + pull_request: + schedule: + - cron: '0 19 * * 0' + +jobs: + CodeQL-Build: + name: CodeQL + # CodeQL runs on ubuntu-latest + runs-on: ubuntu-latest + + strategy: + fail-fast: false + matrix: + language: [ 'python','cpp' ] + # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ] + # Learn more: + # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed + + steps: + - name: Checkout repository + uses: actions/checkout@v2 + + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@v1 + with: + languages: ${{ matrix.language }} + + # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). + # If this step fails, then you should remove it and run the build manually (see below) + - name: Autobuild + uses: github/codeql-action/autobuild@v1 + + # ℹ️ Command-line programs to run using the OS shell. + # 📚 https://git.io/JvXDl + + # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines + # and modify them (or add more) to build your code if your project + # uses a compiled language + + #- run: | + # make bootstrap + # make release + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v1 diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml new file mode 100644 index 00000000..0e7bdefd --- /dev/null +++ b/.github/workflows/pytest.yml @@ -0,0 +1,31 @@ +name: Python Unit Tests + +on: [push, pull_request] + +jobs: + build: + name: Python Unit Tests + strategy: + max-parallel: 4 + fail-fast: false + matrix: + python-version: [3.6, 3.7, 3.8, 3.9] + platform: [ubuntu-16.04, ubuntu-18.04, ubuntu-20.04] + + runs-on: ${{ matrix.platform }} + + steps: + - uses: actions/checkout@v1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pytest pytest-coverage + pip install -v --editable . 
+ - name: Test with pytest + run: | + pytest diff --git a/.github/workflows/style_type_check.yml b/.github/workflows/style_type_check.yml new file mode 100644 index 00000000..f8ad498c --- /dev/null +++ b/.github/workflows/style_type_check.yml @@ -0,0 +1,32 @@ +name: Style and type checks + +on: [push, pull_request] + +jobs: + build: + name: Style and Type Checks + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v1 + - name: Set up Python 3.7 + uses: actions/setup-python@v1 + with: + python-version: 3.7 + - name: Install dependencies + run: | + python -m pip install pip + python -m pip install --upgrade pip + - name: Style checks + run: | + pip install black + python -m black -v --check --config .style_type.cfg --line-length=100 --diff ./pecos + python -m black -v --check --config .style_type.cfg --line-length=100 --diff ./test + - name: Type checks + run: | + pip install mypy + python -m mypy -v --config-file .style_type.cfg -p pecos + python -m mypy -v --config-file .style_type.cfg `find ./test/ -type f -name "*.py"` + if: ${{ success() || failure() }} + # - name: Check license headers + # run: python .devtools/license check src test \ No newline at end of file diff --git a/.style_type.cfg b/.style_type.cfg new file mode 100644 index 00000000..5fbe0b45 --- /dev/null +++ b/.style_type.cfg @@ -0,0 +1,7 @@ +[isort] +line_length = 100 +multi_line_output = 3 +include_trailing_comma = true + +[mypy] +ignore_missing_imports = true \ No newline at end of file diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 00000000..5b627cfa --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,4 @@ +## Code of Conduct +This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). +For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact +opensource-codeofconduct@amazon.com with any additional questions or comments. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..e823b511 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,89 @@ +# Contributing Guidelines + +Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional +documentation, we greatly value feedback and contributions from our community. + +Please read through this document before submitting any issues or pull requests to ensure we have all the necessary +information to effectively respond to your bug report or contribution. + + +## Reporting Bugs/Feature Requests + +We welcome you to use the GitHub issue tracker to report bugs or suggest features. + +When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already +reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: + +* A reproducible test case or series of steps +* The version of our code being used +* Any modifications you've made relevant to the bug +* Anything unusual about your environment or deployment + + +## Contributing via Pull Requests +Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: + +1. You are working against the latest source on the `mainline` branch. +2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. +3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 
If there is an existing issue you are interested in working on, you should comment on it so others don't start working on the same issue. + +To send us a pull request, please: + +1. Create a personal fork of the project on the GitHub website. Clone the fork to your local machine. +2. Add the original repository as a remote called `upstream`. Your remote repo on GitHub is called `origin`. + ``` + git remote add upstream https://github.com/amzn/pecos.git + git remote -v # Display all remotes to double-check + ``` +3. If you created your fork a while ago, be sure to pull upstream changes into your local repository. + ``` + git checkout mainline + git pull -r upstream + ``` +4. Create a new branch from `mainline` to work on. +5. Implement your code on the new branch: + * Follow the code style of the project. + * Write or adapt tests as needed. + * Add or change the documentation as needed. +6. **Ensure local style/type checks and tests pass.** +7. Commit using clear messages. **Squash your commits into a single commit** (see the appendix at the end of this guide). +8. Push your branch to **your fork** `origin` on GitHub. + ``` + git push --set-upstream origin + ``` +9. On the GitHub website's pull request panel, open a pull request into the **original repository** `upstream` from **your fork**. *(NOTE: this is the default option if one does not make changes when creating the PR)* + * Carefully fill out the PR template + * Click "Draft PR" in the drop-down menu to double-check the changes yourself first + * When ready, click on “Ready for review” +10. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. +11. If you need to make changes to the pull request, simply modify your branch by **amending** the commit, and then *force-push* to your `origin` branch. The pull request will automatically update its timeline: + ``` + # Do modification + git commit -a --amend + git push -f + ``` +12. Once the pull request is approved and merged, pull the changes from upstream to your local repo and delete your extra branch(es). + +GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and +[creating a pull request](https://help.github.com/articles/creating-a-pull-request/). + + +## Finding contributions to work on +Looking at the existing issues is a great way to find something to contribute to. Since our projects use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'Call for Contribution' or 'good first issue' issues is a great place to start. + + +## Code of Conduct +This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). +For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact +opensource-codeofconduct@amazon.com with any additional questions or comments. + + +## Security issue notifications +If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue. + + +## Licensing + +See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. + +We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes. 
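+
+## Appendix: Squashing commits
+
+A minimal sketch of the squash-and-push flow from steps 7-8 above (assuming your feature branch was created from `mainline`; adjust branch names to your setup):
+```
+# Interactively rebase the commits made since mainline; in the editor, keep the
+# first commit as "pick" and mark the rest as "squash" (or "fixup")
+git rebase -i mainline
+# Push the rewritten branch to your fork; -f is required if it was pushed before
+git push -f origin
+```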
diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..261eeb9e --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 00000000..0a6974bc --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,8 @@ +include LICENSE +include NOTICE +include CODE_OF_CONDUCT.md +include CONTRIBUTING.md +include THIRD-PARTY-LICENSES.txt + +recursive-include pecos/core *.hpp *.h *.cpp *.c +recursive-include test * \ No newline at end of file diff --git a/NOTICE b/NOTICE new file mode 100644 index 00000000..546b777f --- /dev/null +++ b/NOTICE @@ -0,0 +1 @@ +Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. diff --git a/README.md b/README.md new file mode 100644 index 00000000..a00c1f3f --- /dev/null +++ b/README.md @@ -0,0 +1,147 @@ +# PECOS - Predictions for Enormous and Correlated Output Spaces + +[![PyPi Latest Release](https://img.shields.io/pypi/v/libpecos)](https://img.shields.io/pypi/v/libpecos) +[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](./LICENSE) + +PECOS is a versatile and modular machine learning (ML) framework for fast learning and inference on problems with large output spaces, such as extreme multi-label ranking (XMR) and large-scale retrieval. +PECOS' design is intentionally agnostic to the specific nature of the inputs and outputs, as it is envisioned to be a general-purpose framework for multiple distinct applications. + +Given an input, PECOS identifies a small set (10-100) of relevant outputs from amongst an extremely large (~100MM) candidate set and ranks these outputs in terms of relevance. + + +### Features + +#### Extreme Multi-label Ranking and Classification +* X-Linear ([`pecos.xmc.xlinear`](pecos/xmc/xlinear/README.md)): recursive linear models that learn to traverse a hierarchical label tree from the root down to a few leaf-node clusters for each input, returning the top-k relevant labels within those clusters as predictions. See more details in the [PECOS paper (Yu et al., 2020)](https://arxiv.org/pdf/2010.05878.pdf). + + fast real-time inference in C++ + + can handle a 100MM output space + +* X-Transformer ([`pecos.xmc.xtransformer`](pecos/xmc/xtransformer/README.md)): a Transformer matcher that learns to traverse a hierarchical label tree from the root down to a few leaf-node clusters for each input, returning the top-k relevant labels within those clusters via a linear ranker. See technical details in the [X-Transformer paper (Chang et al., 2020)](https://arxiv.org/pdf/1905.02331.pdf) and the latest SOTA results in the [PECOS paper (Yu et al., 2020)](https://arxiv.org/pdf/2010.05878.pdf). + + easy to extend with many pre-trained Transformer models from [huggingface transformers](https://github.com/huggingface/transformers). + + one of the state-of-the-art deep-learning-based XMC methods. + +* text2text application ([`pecos.apps.text2text`](pecos/apps/text2text/README.md)): an easy-to-use text classification pipeline (with an X-Linear backend) that supports n-gram TFIDF vectorization, classification, and ensemble predictions. 
+ + +## Requirements and Installation + +* Python (>=3.6) +* Pip (>=19.3) + +See other dependencies in [`setup.py`](https://github.com/amzn/pecos/blob/mainline/setup.py#L135). +You should install PECOS in a [virtual environment](https://docs.python.org/3/library/venv.html). +If you're unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). + +### Supported Platforms +* Ubuntu 16.04, 18.04, and 20.04 +* Amazon Linux 2 + +### Installation from Wheel + + +PECOS can be installed using pip as follows: +```bash +pip3 install libpecos +``` + +### Installation from Source + +#### Prerequisite builder tools +* For Ubuntu (16.04, 18.04, 20.04): +``` bash +apt-get update && apt-get install -y build-essential git python3 python3-distutils python3-venv +``` +* For Amazon Linux 2: +``` bash +yum -y install python3 python3-devel python3-distutils python3-venv && yum -y groupinstall 'Development Tools' +``` + +#### Install and develop locally +```bash +git clone https://github.com/amzn/pecos +cd pecos +pip3 install --editable ./ +``` + + +## Quick Tour +To get a glimpse of how PECOS works, here is a quick tour of using the PECOS API for the XMR problem. + +### Toy Example +The eXtreme Multi-label Ranking (XMR) problem is defined by two matrices: +* instance-to-feature matrix `X`, of shape `N by D` in [`SciPy CSR format`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html) +* instance-to-label matrix `Y`, of shape `N by L` in [`SciPy CSR format`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html) + +Some toy data matrices are available in the [`tst-data`](https://github.com/amzn/pecos/tree/mainline/test/tst-data/xmc/xlinear) folder. + +PECOS constructs a hierarchical label tree and learns linear models recursively (e.g., XR-Linear): +```python +>>> from pecos.xmc.xlinear.model import XLinearModel +>>> from pecos.xmc import Indexer, LabelEmbeddingFactory + +# Build hierarchical label tree and train an XR-Linear model +>>> label_feat = LabelEmbeddingFactory.create(Y, X) +>>> cluster_chain = Indexer.gen(label_feat) +>>> model = XLinearModel.train(X, Y, C=cluster_chain) +>>> model.save("./save-models") +``` + +After training the model, we can run prediction and evaluation: +```python +>>> from pecos.utils import smat_util +>>> Yt_pred = model.predict(Xt) +# print precision and recall at k=10 +>>> print(smat_util.Metrics.generate(Yt, Yt_pred)) +``` + +PECOS also offers an optimized C++ implementation for fast real-time inference: +```python +>>> model = XLinearModel.load("./save-models", is_predict_only=True) +>>> for i in range(Xt.shape[0]): +>>> Yt_pred = model.predict(Xt[i], threads=1) +```
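+
+Putting the tour together, below is a hedged end-to-end sketch on the bundled toy data (assuming the repository root as the working directory, and that `pecos.utils.smat_util.load_matrix` is used to load the `.npz` matrices):
+```python
+from pecos.utils import smat_util
+from pecos.xmc import Indexer, LabelEmbeddingFactory
+from pecos.xmc.xlinear.model import XLinearModel
+
+# Load the toy CSR matrices shipped under test/tst-data/xmc/xlinear
+X = smat_util.load_matrix("test/tst-data/xmc/xlinear/X.npz")  # instance-to-feature, N x D
+Y = smat_util.load_matrix("test/tst-data/xmc/xlinear/Y.npz")  # instance-to-label, N x L
+Xt = smat_util.load_matrix("test/tst-data/xmc/xlinear/Xt.npz")  # test features
+Yt = smat_util.load_matrix("test/tst-data/xmc/xlinear/Yt.npz")  # test labels
+
+# Build label embeddings, cluster them into a hierarchical label tree, and train
+label_feat = LabelEmbeddingFactory.create(Y, X)
+cluster_chain = Indexer.gen(label_feat)
+model = XLinearModel.train(X, Y, C=cluster_chain)
+
+# Predict on the test split and report ranking metrics at k=10
+Yt_pred = model.predict(Xt)
+print(smat_util.Metrics.generate(Yt, Yt_pred, 10))
+```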
+ + +## Citation + +If you find PECOS useful, please consider citing our papers. + +* H. Yu, K. Zhong, I. Dhillon, [PECOS: Prediction for Enormous and Correlated Output Spaces](https://arxiv.org/pdf/2010.05878.pdf), arXiv 2020. +```bibtex +@article{yu2020pecos, + title={PECOS: Prediction for Enormous and Correlated Output Spaces}, + author={Yu, Hsiang-Fu and Zhong, Kai and Dhillon, Inderjit S}, + journal={arXiv preprint arXiv:2010.05878}, + year={2020} +} +``` + +* W. Chang, H. Yu, K. Zhong, Y. Yang, I. Dhillon, [Taming pretrained transformers for extreme multi-label text classification](https://arxiv.org/pdf/1905.02331.pdf), KDD 2020. +```bibtex +@inproceedings{chang2020taming, + title={Taming pretrained transformers for extreme multi-label text classification}, + author={Chang, Wei-Cheng and Yu, Hsiang-Fu and Zhong, Kai and Yang, Yiming and Dhillon, Inderjit S}, + booktitle={Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery \& Data Mining}, + pages={3163--3171}, + year={2020} +} +``` + +## License + +Copyright (2021) Amazon.com, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + + diff --git a/THIRD-PARTY-LICENSES.txt b/THIRD-PARTY-LICENSES.txt new file mode 100644 index 00000000..b2970c39 --- /dev/null +++ b/THIRD-PARTY-LICENSES.txt @@ -0,0 +1,123 @@ +The Amazon PECOS Product includes the following third-party software/licensing: + +** exp-trmf-nips16; version 0.1.0 -- https://github.com/rofuyu/exp-trmf-nips16 +Copyright (c) 2015-2019 The TRMF Project. +All rights reserved. +** liblinear; version 2.43 -- https://github.com/cjlin1/liblinear +Copyright (c) 2007-2021 The LIBLINEAR Project. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither name of copyright holders nor the names of its contributors +may be used to endorse or promote products derived from this software +without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +------ + +** X-Transformer; version 0.1.0 -- +https://github.com/OctoberChang/X-Transformer +Copyright (c) 2020 The X-Transformer Project. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +------ + +** Robin Hood Hashing; version 3.9.0 -- +https://github.com/martinus/robin-hood-hashing +Copyright (c) 2018-2019 Martin Ankerl + +MIT License + +Copyright (c) 2018-2019 Martin Ankerl + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +------ + +** nlohmann-json; version 3.7.3 -- https://nlohmann.github.io/json/ +Copyright © 2013-2019 Niels Lohmann. The code is licensed under the MIT +License. + +MIT License + +Copyright (c) + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/pecos/__init__.py b/pecos/__init__.py new file mode 100644 index 00000000..69801d3f --- /dev/null +++ b/pecos/__init__.py @@ -0,0 +1,100 @@ +# Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance +# with the License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions +# and limitations under the License. +try: + from ._version import __version__ # noqa +except ImportError: + # For raw code without installing, use a dummy version + __version__ = "0.0.0" + +import dataclasses as dc +import copy + + +# Registry mapping class fullnames to classes; lets BaseParams.from_dict rebuild nested params from "__meta__" tags +_class_mapping_ = {} + + +class MetaClass(type): + @staticmethod + def class_fullname(cls): + return f"{cls.__module__}###{cls.__qualname__}" + + def __new__(cls, name, bases, attr): + cls = super().__new__(cls, name, bases, attr) + _class_mapping_[MetaClass.class_fullname(cls)] = cls + return cls + + +class BaseClass(metaclass=MetaClass): + @classmethod + def class_fullname(cls): + return MetaClass.class_fullname(cls) + + @classmethod + def append_meta(cls, d: dict = None): + meta = {"__meta__": {"class_fullname": cls.class_fullname()}} + if d is not None: + meta.update(d) + return meta + + +@dc.dataclass +class BaseParams(BaseClass): + @classmethod + def from_dict(cls, param=None, recursive=False): + def get_param(x, type_hint=None): + if isinstance(x, BaseParams): + return copy.deepcopy(x) + elif isinstance(x, dict): + meta = x.get("__meta__", None) + if meta: + cls = _class_mapping_[meta["class_fullname"]] + x = cls.from_dict(x) + elif type_hint and issubclass(type_hint, BaseParams): + x = type_hint.from_dict(x) + elif isinstance(x, (list, tuple)): + x = [get_param(y, type_hint=type_hint) for y in x] + else: + x = copy.deepcopy(x) + return x + + if param is None: + return cls() + elif isinstance(param, cls): + return copy.deepcopy(param) + elif isinstance(param, dict): + d = {} + for f in dc.fields(cls): + if f.name not in param: + if recursive and issubclass(f.type, BaseParams): + d[f.name] = f.type.from_dict(param, recursive=recursive) + continue + d[f.name] = get_param(param[f.name], type_hint=f.type) + return cls(**d) + raise ValueError(f"{param} is not a valid parameter dictionary for {cls.__name__}") + + def to_dict(self, with_meta=True): + d = {} + for f in dc.fields(self): + value = getattr(self, f.name) + if isinstance(value, BaseParams): + d[f.name] = value.to_dict(with_meta) + elif isinstance(value, (tuple, list)): + d[f.name] = [ + x.to_dict(with_meta) if isinstance(x, BaseParams) else x for x in value + ] + elif isinstance(value, dict): + d[f.name] = { + k: v.to_dict(with_meta) if isinstance(v, BaseParams) else v + for k, v in value.items() + } + else: + d[f.name] = value + return self.append_meta(d) if with_meta else d
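+
+# A hedged usage sketch of the params machinery above (illustration only;
+# `InnerParams`/`OuterParams` are hypothetical and not part of the library):
+#
+#     @dc.dataclass
+#     class InnerParams(BaseParams):
+#         threads: int = 8
+#
+#     @dc.dataclass
+#     class OuterParams(BaseParams):
+#         inner: InnerParams = dc.field(default_factory=InnerParams)
+#         bias: float = 1.0
+#
+#     d = OuterParams(inner=InnerParams(threads=4)).to_dict()
+#     # each nested dict carries a "__meta__" entry recording its class fullname
+#     p = OuterParams.from_dict(d)
+#     # the nested InnerParams is rebuilt via the _class_mapping_ registry
+#     assert isinstance(p.inner, InnerParams) and p.inner.threads == 4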
diff --git a/pecos/apps/__init__.py b/pecos/apps/__init__.py new file mode 100644 index 00000000..695dd79a --- /dev/null +++ b/pecos/apps/__init__.py @@ -0,0 +1,10 @@ +# Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance +# with the License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions +# and limitations under the License. diff --git a/pecos/apps/text2text/README.md b/pecos/apps/text2text/README.md new file mode 100644 index 00000000..a705a412 --- /dev/null +++ b/pecos/apps/text2text/README.md @@ -0,0 +1,88 @@ +# PECOS for text2text Applications + +Given an input text, pecos.apps.text2text can generate a subset of labels relevant to this input from a fixed set of output labels. +The input should be a text sequence, while the output labels can be either text-based or symbol-based (although the symbols are usually represented in text format). +In text classification, for example, the input text can be a paragraph and the relevant labels can be categories tagged to this paragraph. +Another example is retrieval, where the input text can be a natural question and the relevant labels can be paragraphs that contain the answer span for that question. + +## Getting started +### Usage +```bash + > python3 -m pecos.apps.text2text.train --help + > python3 -m pecos.apps.text2text.predict --help + > python3 -m pecos.apps.text2text.evaluate --help +``` + + +### Usage example: Multi-label Category Tagging for Web Documents +This toy example demonstrates how to run the training and prediction of PECOS text2text applications. +Note that we use utf-8 encoding for all text files. + +First, consider the following input text file `training-data.txt`: +``` +0,1,2<TAB>Alan Turing is widely considered to be the father of theoretical computer science and artificial intelligence. +0,2,3<TAB>Hinton was co-author of a highly cited paper published in 1986 that popularized the backpropagation algorithm for training multi-layer neural networks. +3,4,5<TAB>Hinton received the 2018 Turing Award, together with Yoshua Bengio and Yann LeCun, for their work on artificial intelligence and deep learning. +3,4,5<TAB>In 1989, Yann LeCun et al. applied the standard backpropagation algorithm on neural networks for hand digit recognition. +``` +Each line contains two fields separated by a tab character (rendered as `<TAB>` above); the former is the relevant output label ids and the latter is the input text: +``` +OUTPUT_ID1,OUTPUT_ID2,OUTPUT_ID3,...<TAB>INPUT_TEXT +``` + +The output ids are zero-based and correspond to the line numbers in the output label file. +In particular, the corresponding output label file `output-labels.txt` takes the format of: +``` +Artificial intelligence researchers +Computability theorists +British computer scientists +Machine learning researchers +Turing Award laureates +Deep Learning +``` +Each line in the file represents an output label; here, categories in Wikipedia. + +Optionally, we can provide a vectorization JSON config for finer control of the n-gram TF-IDF features. +We will use the example config `config.json` from the README.md of `pecos/utils/featurization/text/`. + +Now we train the text2text model, which includes feature vectorization and PECOS model learning: 
+``` +python3 -m pecos.apps.text2text.train \ + --input-text-path ./training-data.txt \ + --vectorizer-config-path ./config.json \ + --output-item-path ./output-labels.txt \ + --model-folder ./pecos-text2text-model +``` +The models are saved in the `./pecos-text2text-model` folder. + +For batch prediction, the user should provide an input text file `test-data.txt`, which has the same format as `training-data.txt`: +``` +python3 -m pecos.apps.text2text.predict \ + --input-text-path ./test-data.txt \ + --model-folder ./pecos-text2text-model \ + --predicted-output-item-path ./test-prediction.txt +``` +The predictions are saved in `./test-prediction.txt`. +Each line contains the generated output labels and scores as a JSON-format dictionary for the corresponding input from the input file, as in the sketch below.
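+A hypothetical prediction line (the labels and scores are made up; the `schema`/`data` keys follow `Text2Text.print_predictions` in pecos/apps/text2text/model.py):
+```
+{"schema": ["output_item", "score"], "data": [["Deep Learning", 0.63352], ["Machine learning researchers", 0.40171]]}
+```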
" + ) + + parser.add_argument( + "-p", + "--pred-path", + type=str, + required=True, + metavar="PATH", + help="path to the file of predicted output. Format follows the output from pecos.apps.text2text.predict", + ) + + parser.add_argument( + "-y", + "--truth-path", + type=str, + required=True, + metavar="PATH", + help="path to the file of with ground truth output. \ + Format 1 (Only has ground truth text items): each line is a list of tab-separated sorted text items. \ + Format 2 (Same as training input format): each line is OUTPUT_ID1,OUTPUT_ID2,OUTPUT_ID3,...\t INPUT_TEXT. \ + where OUTPUT_IDs are the zero-based output text item indices corresponding to the line numbers of TEXT_ITEM_PATH.", + ) + + parser.add_argument( + "-q", + "--text-item-path", + type=str, + default=None, + metavar="TEXT_ITEM_PATH", + help="Text item file name. Format: each line corresponds to a text item. If this path is given, we assume TRUTH_PATH uses Format 2. Otherwise, TRUTH_PATH uses Format 1", + ) + + parser.add_argument("-k", "--topk", type=int, default=10, metavar="INT", help="evaluate @k") + + return parser + + +def do_evaluation(args): + """Evaluate predicted labels for Text2Text model + + Args: + args (argparse.Namespace): Command line arguments parsed by `parser.parse_args()` + """ + + # Initialize an item-to-index dictionary, item_dict + if args.text_item_path is None: + item_dict = {} + else: + item_dict = { + line.strip("\n"): i + for i, line in enumerate(open(args.text_item_path, "r", encoding="utf-8")) + } + + # parse the ground truth items + col_id_t = [] + row_id_t = [] + val_t = [] + num_samples_t = 0 + with open(args.truth_path, "r", encoding="utf-8") as fg: + for i, line in enumerate(fg): + num_samples_t = num_samples_t + 1 + # If text_item_path is not given, use Format 1 for truth_path: each line is a list of tab-separated sorted text items + if args.text_item_path is None: + for item in line.strip().split("\t"): + if item not in item_dict: + item_dict[item] = len(item_dict) + row_id_t.append(i) + col_id_t.append(item_dict[item]) + val_t.append(1.0) + # If text_item_path is given, use Format 2 for truth_path: each line is OUTPUT_ID1,OUTPUT_ID2,OUTPUT_ID3,...\t INPUT_TEXT + else: + for idx in line.strip().split("\t")[0].split(","): + row_id_t.append(i) + col_id_t.append(int(idx)) + val_t.append(1.0) + + # parse the predicted items + col_id_p = [] + row_id_p = [] + val_p = [] + num_samples_p = 0 + with open(args.pred_path, "r", encoding="utf-8") as fp: + for i, line in enumerate(fp): + num_samples_p = num_samples_p + 1 + item_scores = json.loads(line.strip())["data"] + for j, x in enumerate(item_scores): + item = x[0] + if item not in item_dict: + item_dict[item] = len(item_dict) + row_id_p.append(i) + col_id_p.append(item_dict[item]) + val_p.append(1.0 / (j + 1)) + + assert num_samples_t == num_samples_p + Y_true = smat.csr_matrix((val_t, (row_id_t, col_id_t)), shape=(num_samples_t, len(item_dict))) + Y_pred = smat.csr_matrix((val_p, (row_id_p, col_id_p)), shape=(num_samples_p, len(item_dict))) + + metric = smat_util.Metrics.generate(Y_true, Y_pred, args.topk) + print("==== evaluation results ====") + print(metric) + + +if __name__ == "__main__": + parser = parse_arguments() + args = parser.parse_args() + do_evaluation(args) diff --git a/pecos/apps/text2text/model.py b/pecos/apps/text2text/model.py new file mode 100644 index 00000000..e0fc0911 --- /dev/null +++ b/pecos/apps/text2text/model.py @@ -0,0 +1,488 @@ +# Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
diff --git a/pecos/apps/text2text/model.py b/pecos/apps/text2text/model.py new file mode 100644 index 00000000..e0fc0911 --- /dev/null +++ b/pecos/apps/text2text/model.py @@ -0,0 +1,488 @@ +# Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance +# with the License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions +# and limitations under the License. +import gc +import hashlib +import itertools +import json +import logging +import pathlib +import tempfile +from os import makedirs, path + +import numpy as np +from pecos.utils import smat_util +from pecos.utils.cluster_util import ClusterChain +from pecos.utils.featurization.text.preprocess import Preprocessor +from pecos.xmc import Indexer, LabelEmbeddingFactory +from pecos.xmc.xlinear import XLinearModel + +LOGGER = logging.getLogger(__name__) + + +class CachedWorkspace(object): + """Generate a folder name for caching intermediate variables""" + + def __init__(self, ws=None): + """Initialization + + Args: + ws (str, optional): Workspace folder. + If not given, will use a temporary folder. + """ + if ws is None: + self.tmp_folder = tempfile.TemporaryDirectory() + ws = self.tmp_folder.name + self.ws = ws + + def get_path_for_name_and_kwargs(self, name, kwargs): + """Generate a hashed path in the workspace (self.ws) + + Args: + name (str): a basename (e.g., the intermediate variable's name) for this folder. + kwargs (dict): the args that generate the variable + + Returns: + A str: a hashed path for the given name and kwargs + """ + folder = path.join(self.ws, name) + makedirs(folder, exist_ok=True) + # mapping.json is kept for debugging purposes + mapping_path = path.join(folder, "mapping.json") + if path.exists(mapping_path): + with open(mapping_path, "r", encoding="utf-8") as f_map: + mapping = json.loads(f_map.read()) + else: + mapping = {} + + key = hashlib.sha224(json.dumps(kwargs, sort_keys=True).encode("utf-8")).hexdigest() + mapping[key] = kwargs + + with open(mapping_path, "w", encoding="utf-8") as fout: + fout.write(json.dumps(mapping, sort_keys=True, indent=2)) + return path.join(folder, key) + + +class Text2Text(object): + """Given an input text, generate a subset of items relevant to this input from a fixed set of output items. + + The input should be a text sequence, while the output items can be either text-based or symbol-based + (although the symbols are usually represented in text format). + """ + + def __init__(self, preprocessor=None, xlinear_models=None, output_items=None): + """Initialization + + Args: + preprocessor (Preprocessor): Text preprocessor. + xlinear_models (list): List of XLinear models. + output_items (list): List of output item names.
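+ Note: a Text2Text object is usually obtained via the classmethods below, e.g. Text2Text.train(...) or Text2Text.load(...), rather than by calling this constructor directly.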
+ """ + self.preprocessor = preprocessor + self.xlinear_models = xlinear_models + self.output_items = output_items + + def save(self, model_folder): + """Save the Text2Text model + + Args: + model_folder (str): folder name to save + """ + + LOGGER.info("Saving the model...") + self.preprocessor.save(path.join(model_folder, "preprocessor")) + xlinear_folder = path.join(model_folder, "xlinear_ensemble") + ensemble_config = {"nr_ensembles": len(self.xlinear_models), "kwargs": []} + for i, (m, kwargs) in enumerate(self.xlinear_models): + ensemble_config["kwargs"] += [kwargs] + folder = path.join(xlinear_folder, "{}".format(i)) + m.save(folder) + with open(path.join(xlinear_folder, "config.json"), "w", encoding="utf-8") as fout: + fout.write(json.dumps(ensemble_config, indent=True)) + with open(path.join(model_folder, "output_items.json"), "w", encoding="utf-8") as fp: + json.dump(self.output_items, fp) + + @classmethod + def load(cls, model_folder, is_predict_only=False, **kwargs): + """Load the Text2Text model + + Args: + model_folder (str): folder name to load + is_predict_only (bool): if the loaded model will be used for prediction only in the batch mode. + + Returns: + A Text2Text object + """ + + preprocessor = Preprocessor.load(path.join(model_folder, "preprocessor")) + xlinear_folder = path.join(model_folder, "xlinear_ensemble") + with open(path.join(xlinear_folder, "config.json"), "r", encoding="utf-8") as fin: + ensemble_config = json.loads(fin.read()) + xlinear_models = [] + for i, model_kwargs in enumerate(ensemble_config["kwargs"]): + folder = path.join(xlinear_folder, "{}".format(i)) + xlinear_models += [(XLinearModel.load(folder, is_predict_only, **kwargs), model_kwargs)] + output_items = None + folder_path = pathlib.Path(model_folder) + json_output_items_filepath = folder_path / "output_items.json" + with open(str(json_output_items_filepath), "r", encoding="utf-8") as fin: + output_items = json.load(fin) + if not output_items: + raise ValueError("Could not read output items saved in json format") + + return cls(preprocessor, xlinear_models, output_items) + + @classmethod + def train( + cls, + input_text_path, + output_text_path, + vectorizer_config=None, + dtype=np.float32, + label_embed_type=["pifa"], + indexer_algo=["hierarchicalkmeans"], + imbalanced_ratio=0.0, + imbalanced_depth=100, + spherical=True, + nr_splits=2, + max_leaf_size=[100], + seed=[0], + max_iter=[20], + solver_type=["L2R_L2LOSS_SVC_DUAL"], + Cp=[1.0], + Cn=[1.0], + bias=1.0, + threshold=[0.1], + negative_sampling_scheme="tfn", + pred_kwargs=None, + threads=-1, + workspace_folder=None, + ): + """Train a Text2Text model + + Args: + + input_text_path (str): Text input file name. + Format: in each line, OUTPUT_ID1,OUTPUT_ID2,OUTPUT_ID3,...\t INPUT_TEXT + where OUTPUT_IDs are the zero-based output item indices + corresponding to the line numbers of OUTPUT_ITEM_PATH. + We assume utf-8 encoding for text. + output_text_path (str): The file path for output text items. + Format: each line corresponds to a representation + of the output item. We assume utf-8 encoding for text. + vectorizer_config_json (str): Json_format string for vectorizer config (default None) + dtype (float32 | float64): data type (default float32) + label_embed_type (list of str): Label embedding types. (default pifa). + Multiple values will lead to different individual models for ensembling. + indexer_algo (list of str): Indexer algorithm (default ["hierarchicalkmeans"]). + imbalanced_ratio (float): Value between 0.0 and 0.5 (inclusive). 
Indicates how relaxed the balancedness + constraint of 2_means can be. Specifically, if an iteration of 2_means is clustering L labels, + the size of the output 2 clusters will be within approx imbalanced_ratio * 2 * L of each other. + (default 0.0) + imbalanced_depth (int): After hierarchical 2_means clustering has reached this depth, + it will continue clustering as if imbalanced_ratio is set to 0.0. (default 100) + spherical (bool): Do l2_normalize cluster centers while clustering (default True). + nr_splits (int): number of splits used to construct hierarchy (a power of 2 is recommended, default 2) + max_leaf_size (list of int): The max size of the leaf nodes of hierarchical 2_means clustering. + Multiple values (separated by comma) are supported and will lead to different + individual models for ensembling. (default [100]) + seed (list of int): Random seeds (default [0]). Multiple values will lead to different individual + models for ensembling. + max_iter (int): The max iteration for indexing (default 20) + solver_type (list of string): solver type for ranking (default ["L2R_L2LOSS_SVC_DUAL"]) + Cp (float): Coefficient for positive class in the loss function (default 1.0) + Cn (float): Coefficient for negative class in the loss function (default 1.0) + bias (float): bias for the ranking model (default=1.0) + threshold (float): Threshold to sparsify the model weights (default 0.1) + negative_sampling (str, choices=[tfn, man, tfn+man]): Negative Sampling Schemes (default tfn) + pred_kwargs (dict): kwargs for prediction used in matching-aware training + only_topk (int): the default number of top labels used in the prediction + beam_size (int): the default size of beam search used in the prediction + post_processor (str): the default post processor used in the prediction + workspace_folder: (str, default=None): A folder name for storing intermediate + variables during training + + Returns: + A Text2Text object + """ + + ws = CachedWorkspace(workspace_folder) + + # Train Preprocessor and obtain X, Y + XY_kwargs = dict( + input_text_path=input_text_path, + output_text_path=output_text_path, + vectorizer_config=vectorizer_config, + dtype=str(dtype), + ) + + # Prepare Preprocessor + preprocessor_path = ws.get_path_for_name_and_kwargs("preprocessor", XY_kwargs) + if path.exists(preprocessor_path): + LOGGER.info("Loading existing preprocessor...") + preprocessor = Preprocessor.load(preprocessor_path) + else: + LOGGER.info("Parsing text files...") + Y, corpus = Preprocessor.load_data_from_file(input_text_path, output_text_path) + LOGGER.info( + f"Training {vectorizer_config['type']} vectorizer on {len(corpus)} input texts..." + ) + preprocessor = Preprocessor.train(corpus, vectorizer_config, dtype=dtype) + preprocessor.save(preprocessor_path) + + # Prepare X, X could be dense or sparse + X_path = ws.get_path_for_name_and_kwargs("X", XY_kwargs) + + if path.exists(X_path): + X = XLinearModel.load_feature_matrix(X_path) + else: + if "corpus" not in locals(): + Y, corpus = Preprocessor.load_data_from_file(input_text_path, output_text_path) + LOGGER.info(f"Vectorizing {len(corpus)} texts...") + X = preprocessor.predict(corpus) + XLinearModel.save_feature_matrix(X_path, X) + LOGGER.info( + f"{vectorizer_config['type']} input X loaded: {X.shape[0]} samples with {X.shape[1]} features." 
+ ) + + # Prepare Y, Y is always sparse + Y_path = ws.get_path_for_name_and_kwargs("Y", XY_kwargs) + ".npz" + if path.exists(Y_path): + Y = smat_util.load_matrix(Y_path) + else: + if "Y" not in locals(): + Y, corpus = Preprocessor.load_data_from_file(input_text_path, output_text_path) + smat_util.save_matrix(Y_path, Y) + LOGGER.info(f"Output label Y loaded: {Y.shape[0]} samples with {Y.shape[1]} labels.") + + # Grid Parameters for XLinearModel + ranker_param_names = [ + "bias", + "Cp", + "Cn", + "solver_type", + "threshold", + "negative_sampling_scheme", + "pred_kwargs", + ] + + ranker_grid_params = {} + for name in ranker_param_names: + tmp = locals()[name] + ranker_grid_params[name] = tmp if isinstance(tmp, (list, tuple)) else [tmp] + + indexer_param_names = [ + "indexer_algo", + "imbalanced_ratio", + "imbalanced_depth", + "spherical", + "seed", + "max_iter", + "max_leaf_size", + "nr_splits", + "label_embed_type", + ] + + indexer_grid_params = {} + for name in indexer_param_names: + tmp = locals()[name] + indexer_grid_params[name] = tmp if isinstance(tmp, (list, tuple)) else [tmp] + + # Generate various label features + label_feat_set = {} + for embed_type in indexer_grid_params["label_embed_type"]: + label_embed_kwargs = dict( + input_text_path=input_text_path, + output_text_path=output_text_path, + dtype=str(dtype), + vectorizer_config=vectorizer_config, + embed_type=embed_type, + ) + label_embed_path = ws.get_path_for_name_and_kwargs("L", label_embed_kwargs) + if path.exists(label_embed_path): + LOGGER.info(f"Loading existing {embed_type} features for {Y.shape[1]} labels...") + label_feat_set[embed_type] = XLinearModel.load_feature_matrix(label_embed_path) + else: + LOGGER.info(f"Generating {embed_type} features for {Y.shape[1]} labels...") + # Create label features + label_feat_set[embed_type] = LabelEmbeddingFactory.create(Y, X, method=embed_type) + XLinearModel.save_feature_matrix(label_embed_path, label_feat_set[embed_type]) + + for indexer_values in itertools.product( + *[indexer_grid_params[k] for k in indexer_param_names] + ): + # Indexing + indexer_kwargs = dict(zip(indexer_param_names, indexer_values)) + indexer_kwargs_local = indexer_kwargs.copy() + C_path = ws.get_path_for_name_and_kwargs("C", indexer_kwargs_local) + if path.exists(C_path): + LOGGER.info(f"Loading existing clustering code with params {indexer_kwargs_local}") + C = ClusterChain.load(C_path) + else: + label_embed_type = indexer_kwargs.pop( + "label_embed_type", None + ) # as label_embed_type is not a valid argument for XLinearModel.train + LOGGER.info(f"Clustering with params {indexer_kwargs_local}...") + C = Indexer.gen( + label_feat_set[indexer_kwargs_local["label_embed_type"]], + indexer_kwargs.pop("indexer_algo"), + threads=threads, + **indexer_kwargs, + ) + LOGGER.info(f"Created {C[-1].shape[1]} clusters.") + C.save(C_path) + + # Ensemble Models + for ranker_values in itertools.product( + *[ranker_grid_params[k] for k in ranker_param_names] + ): + ranker_kwargs = dict(zip(ranker_param_names, ranker_values)) + ranker_kwargs_local = ranker_kwargs.copy() + # Model Training + ranker_kwargs_local.update(indexer_kwargs_local) + + model_path = ws.get_path_for_name_and_kwargs("model", ranker_kwargs_local) + if path.exists(model_path): + LOGGER.info(f"Model with params {ranker_kwargs_local} exists") + else: + LOGGER.info(f"Training model with params {ranker_kwargs_local}...") + m = XLinearModel.train( + X, + Y, + C, + threads=threads, + **ranker_kwargs, + ) + m.save(model_path) + del m + gc.collect() + + del C + 
gc.collect() + + del X, Y, label_feat_set + gc.collect() + + xlinear_models = [] + for indexer_values in itertools.product( + *[indexer_grid_params[k] for k in indexer_param_names] + ): + indexer_kwargs = dict(zip(indexer_param_names, indexer_values)) + indexer_kwargs_local = indexer_kwargs.copy() + for ranker_values in itertools.product( + *[ranker_grid_params[k] for k in ranker_param_names] + ): + ranker_kwargs = dict(zip(ranker_param_names, ranker_values)) + ranker_kwargs_local = ranker_kwargs.copy() + ranker_kwargs_local.update(indexer_kwargs_local) + model_path = ws.get_path_for_name_and_kwargs("model", ranker_kwargs_local) + xlinear_models += [(XLinearModel.load(model_path), ranker_kwargs_local)] + + # Load output items + with open(output_text_path, "r", encoding="utf-8") as f: + output_items = [q.strip() for q in f] + + return cls(preprocessor, xlinear_models, output_items) + + def predict( + self, corpus, topk=10, beam_size=None, post_processor=None, threshold=None, **kwargs + ): + """Predict labels for given inputs + + Args: + corpus (list of strings): input strings. + topk (int, optional): the number of top output items to return for each input. + Default 10. + beam_size (int, optional): override the beam size specified in the model. + Default None to disable overriding + post_processor (str, optional): override the post_processor specified in the model. + Default None to disable overriding + threshold (float, optional): Drop output items with scores less than this threshold among top-k items. + Default None to not threshold + kwargs: + threads (int, optional): the number of threads to use for predicting. + Default to -1 to use all. + Returns: + csr_matrix: predicted label matrix (num_samples x num_labels) + """ + + X = self.preprocessor.predict(corpus) + Y_pred = smat_util.CsrEnsembler.average( + *[ + m.predict( + X, only_topk=topk, beam_size=beam_size, post_processor=post_processor, **kwargs + ) + for m, _ in self.xlinear_models + ] + ) + + if threshold is not None: + Y_pred.data[Y_pred.data <= threshold] = 0 + Y_pred.eliminate_zeros() + + return smat_util.sorted_csr(Y_pred, topk) + + def set_output_constraint(self, output_items_to_keep): + """Prune the model trees so that only the given output items can be predicted + + Args: + output_items_to_keep (list of strings): A list of output items to be kept in the tree. The rest will be pruned. + """ + output_items = dict() + for i, item in enumerate(self.output_items): + output_items[item] = i + output_labels_to_keep = set() + for item in output_items_to_keep: + if item in output_items: + output_labels_to_keep.add(output_items[item]) + for xlm, _ in self.xlinear_models: + xlm.set_output_constraint(output_labels_to_keep) + + def get_output_item(self, output_id): + """Get output item given an output id + + Args: + output_id (int): output index + + Returns: + A string: the output item corresponding to the given index + """ + return self.output_items[output_id] + + def print_predictions(self, Y, fout, meta_info=None): + """Dump predicted items + + Args: + Y (csr_matrix): predicted label matrix + fout (a file object): The file (or stdout) to dump + meta_info (list of string, optional): meta_info to be incorporated into the output for each input.
+ Default None + + """ + if meta_info is not None: + assert Y.shape[0] == len(meta_info), "meta_info and Y should have the same sample size" + + for i in range(Y.shape[0]): + output_data = {} + output_data["schema"] = ["output_item", "score"] + item_score = [] + idx = slice(Y.indptr[i], Y.indptr[i + 1]) + for output_id, score in zip(Y.indices[idx], Y.data[idx]): + item_score.append( + [self.get_output_item(output_id), float(format(float(score), ".5g"))] + ) + output_data["data"] = item_score + if meta_info is not None: + output_data["meta_info"] = meta_info[i] + ss = json.dumps(output_data) + fout.write(ss + "\n") + fout.flush() diff --git a/pecos/apps/text2text/predict.py b/pecos/apps/text2text/predict.py new file mode 100644 index 00000000..dcb10d86 --- /dev/null +++ b/pecos/apps/text2text/predict.py @@ -0,0 +1,194 @@ +# Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance +# with the License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions +# and limitations under the License. +import argparse +import itertools +import sys + +from pecos.xmc import PostProcessor + +from .model import Text2Text + + +def parse_arguments(): + """Parse Text2Text model prediction arguments""" + + parser = argparse.ArgumentParser(description="Text2Text: online prediction or batch prediction") + + parser.add_argument( + "-m", + "--model-folder", + type=str, + required=True, + metavar="MODEL_FOLDER", + help="model folder name", + ) + + parser.add_argument( + "-i", + "--input-text-path", + type=str, + default="-", + metavar="INPUT_TEXT_PATH", + help='path to text input file name. (default "-" to denote stdin). We assume utf-8 encoding for text.', + ) + + parser.add_argument( + "-o", + "--predicted-output-item-path", + type=str, + default="-", + metavar="OUTPUT_PATH", + help='path to the predicted output item file name. (default "-" to denote stdout). We assume utf-8 encoding for text.', + ) + + parser.add_argument( + "--output-to-keep-path", + type=str, + default=None, + metavar="OUTPUT_TO_KEEP_PATH", + help="path to valid output texts file name. (default None to keep all output items). 
We assume utf-8 encoding for text.", + ) + + parser.add_argument( + "-k", + "--only-topk", + type=int, + default=20, + help="Output top-k items for each input (default 20)", + ) + + parser.add_argument( + "-b", + "--beam-size", + type=int, + default=None, + metavar="INT", + help="Override the beam size specified in the model (default None to disable overriding)", + ) + + parser.add_argument( + "-pp", + "--post-processor", + type=str, + choices=PostProcessor.valid_list(), + default=None, + metavar="STR", + help="Override the post processor specified in the model (default None to disable overriding)", + ) + + parser.add_argument( + "-B", + "--batch-size", + type=int, + default=2048, + help="Batch size for prediction (default 2048)", + ) + + parser.add_argument( + "-T", + "--threshold", + type=float, + default=None, + help="Drop output items with scores less than this threshold among top-k items (default None)", + ) + + parser.add_argument( + "--meta-info-path", + type=str, + default=None, + help="Path to the file of meta info for each line in input_text_path. (default None)", + ) + + return parser + + +def predict(args): + """Predict labels for given text inputs using Text2Text model + + Args: + args (argparse.Namespace): Command line arguments parsed by `parser.parse_args()` + """ + + t2t_model = Text2Text.load( + args.model_folder, is_predict_only=(args.output_to_keep_path is None) + ) + + if args.output_to_keep_path is not None: + outputs_to_keep = [] + with open(args.output_to_keep_path, "r", encoding="utf-8") as f: + for line in f: + outputs_to_keep += [line.strip()] + t2t_model.set_output_constraint(outputs_to_keep) + + if args.predicted_output_item_path != "-": + fout = open(args.predicted_output_item_path, "w", encoding="utf-8") + else: + fout = sys.stdout + + if args.input_text_path != "-": + if args.meta_info_path is not None: + fmeta = open(args.meta_info_path, "r", encoding="utf-8") + else: + fmeta = [] + + with open(args.input_text_path, "r", encoding="utf-8") as fin: + # Divide the test file into small batches to avoid out-of-memory issues. + corpus = [] + meta_info = [] + for input_line, meta_line in itertools.zip_longest(fin, fmeta, fillvalue=""): + corpus += [input_line.strip("\n").split("\t")[-1]] + meta_info += [meta_line.strip("\n")] + if len(corpus) == args.batch_size: + Y = t2t_model.predict( + corpus, + topk=args.only_topk, + beam_size=args.beam_size, + post_processor=args.post_processor, + threshold=args.threshold, + ) + if args.meta_info_path is None: + t2t_model.print_predictions(Y, fout) + else: + t2t_model.print_predictions(Y, fout, meta_info=meta_info) + corpus = [] + meta_info = [] + + if len(corpus) > 0: + Y = t2t_model.predict( + corpus, + topk=args.only_topk, + beam_size=args.beam_size, + post_processor=args.post_processor, + threshold=args.threshold, + ) + if args.meta_info_path is None: + t2t_model.print_predictions(Y, fout) + else: + t2t_model.print_predictions(Y, fout, meta_info=meta_info) + else: + fin = sys.stdin + for line in fin: + Y = t2t_model.predict( + [line.strip()], + topk=args.only_topk, + beam_size=args.beam_size, + post_processor=args.post_processor, + threshold=args.threshold, + ) + t2t_model.print_predictions(Y, fout) + + fin.close() + fout.close() + + +if __name__ == "__main__": + parser = parse_arguments() + args = parser.parse_args() + predict(args) diff --git a/pecos/apps/text2text/train.py b/pecos/apps/text2text/train.py new file mode 100644 index 00000000..0619533d --- /dev/null +++ b/pecos/apps/text2text/train.py @@ -0,0 +1,308 @@ +# Copyright 2021 Amazon.com, Inc.
or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance +# with the License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions +# and limitations under the License. +import argparse +import logging +import sys + +import numpy as np +from pecos.core import XLINEAR_SOLVERS +from pecos.utils import cli, logging_util +from pecos.utils.featurization.text.vectorizers import Vectorizer +from pecos.xmc import Indexer, PostProcessor + +from .model import Text2Text + + +def parse_arguments(args): + """Parse Text2Text model training arguments""" + + parser = argparse.ArgumentParser( + description="Text2Text: Read input text training files, output item files and train a model" + ) + + parser.add_argument( + "-i", + "--input-text-path", + type=str, + required=True, + metavar="INPUT_TEXT_PATH", + help="Text input file name. Format: in each line, OUTPUT_ID1,OUTPUT_ID2,OUTPUT_ID3,...\t INPUT_TEXT \ + where OUTPUT_IDs are the zero-based output item indices corresponding to the line numbers of OUTPUT_ITEM_PATH. We assume utf-8 encoding for text.", + ) + + parser.add_argument( + "-q", + "--output-item-path", + type=str, + required=True, + metavar="OUTPUT_ITEM_PATH", + help="Output item file name. Format: each line corresponds to a representation of the output item. We assume utf-8 encoding for text.", + ) + + parser.add_argument( + "-m", + "--model-folder", + type=str, + required=True, + metavar="MODEL_FOLDER", + help="Output model folder name", + ) + + parser.add_argument( + "--workspace-folder", + type=str, + default=None, + metavar="WORKSPACE_FOLDER", + help="A folder name for storing intermediate variables during training", + ) + + vectorizer_config_group_parser = parser.add_mutually_exclusive_group() + vectorizer_config_group_parser.add_argument( + "--vectorizer-config-path", + type=str, + default=None, + metavar="VECTORIZER_CONFIG_PATH", + help="Json file for vectorizer config (default tfidf vectorizer)", + ) + + vectorizer_config_group_parser.add_argument( + "--vectorizer-config-json", + type=str, + default='{"type":"tfidf", "kwargs":{}}', + metavar="VECTORIZER_CONFIG_JSON", + help='Json-format string for vectorizer config (default {"type":"tfidf", "kwargs":{}})', + ) + + parser.add_argument( + "--dtype", + type=lambda x: np.float32 if "32" in x else np.float64, + default=np.float32, + help="data type for the csr matrix. float32 | float64. (default float32)", + ) + + parser.add_argument( + "--max-leaf-size", + type=cli.comma_separated_type(int), + default=[100], + metavar="INT-LIST", + help="The max size of the leaf nodes of hierarchical 2-means clustering. Multiple values (separated by comma) are supported and will lead to different individual models for ensembling. (default [100])", + ) + + parser.add_argument( + "--nr-splits", + type=int, + default=2, + metavar="INT", + help="number of splits used to construct hierarchy (a power of 2 is recommended, default 2)", + ) + + parser.add_argument( + "--imbalanced-ratio", + type=float, + default=0.0, + metavar="FLOAT", + help="Value between 0.0 and 0.5 (inclusive). Indicates how relaxed the balancedness constraint of 2-means can be. 
Specifically, if an iteration of 2-means is clustering L labels, the size of the output 2 clusters will be within approx imbalanced_ratio * 2 * L of each other. (default 0.0)", + ) + + parser.add_argument( + "--imbalanced-depth", + type=int, + default=100, + metavar="INT", + help="After hierarchical 2-means clustering has reached this depth, it will continue clustering as if --imbalanced-ratio is set to 0.0. (default 100)", + ) + + parser.add_argument( + "--label-embed-type", + type=cli.comma_separated_type(str), + default="pifa", + metavar="STR-LIST", + help="Label embedding types. (default pifa). Multiple values (separated by comma) are supported and will lead to different individual models for ensembling.", + ) + + parser.add_argument( + "--indexer", + choices=Indexer.indexer_dict.keys(), + default="hierarchicalkmeans", + metavar="STR", + help=f"Indexer algorithm (default hierarchicalkmeans). Available choices are {', '.join(Indexer.indexer_dict.keys())}", + ) + + parser.add_argument( + "--no-spherical", + action="store_true", + default=False, + help="Do not l2-normalize cluster centers while clustering", + ) + + parser.add_argument( + "--seed", + type=cli.comma_separated_type(int), + default=[0], + metavar="INT-LIST", + help="Random seeds (default 0). Multiple values (separated by comma) are supported and will lead to different individual models for ensembling.", + ) + + parser.add_argument( + "--max-iter", + type=int, + default=20, + metavar="INT", + help="The max iteration for indexing (default 20)", + ) + + parser.add_argument( + "-n", + "--threads", + type=int, + default=-1, + metavar="INT", + help="Number of threads to use (default -1 to denote all the CPUs)", + ) + + # Linear matching/ranking parameters + parser.add_argument( + "-s", + "--solver-type", + type=str, + default="L2R_L2LOSS_SVC_DUAL", + metavar="STR", + help="{} (default L2R_L2LOSS_SVC_DUAL)".format(" | ".join(XLINEAR_SOLVERS.keys())), + ) + + parser.add_argument( + "--Cp", + type=float, + default=1.0, + metavar="VAL", + help="Coefficient for positive class in the loss function (default 1.0)", + ) + + parser.add_argument( + "--Cn", + type=float, + default=1.0, + metavar="VAL", + help="Coefficient for negative class in the loss function (default 1.0)", + ) + + parser.add_argument( + "--bias", type=float, default=1.0, metavar="VAL", help="bias term (default 1.0)" + ) + + parser.add_argument( + "-ns", + "--negative-sampling", + type=str, + choices=["tfn", "man", "tfn+man"], + default="tfn", + metavar="STR", + help="Negative Sampling Schemes", + ) + + parser.add_argument( + "-t", + "--threshold", + type=float, + default=0.1, + metavar="VAL", + help="Threshold to sparsify the model weights (default 0.1)", + ) + + # Prediction kwargs + parser.add_argument( + "-k", + "--only-topk", + type=int, + default=20, + metavar="INT", + help="the default number of top labels used in the prediction", + ) + + parser.add_argument( + "-b", + "--beam-size", + type=int, + default=10, + metavar="INT", + help="the default size of beam search used in the prediction", + ) + + parser.add_argument( + "-pp", + "--post-processor", + type=str, + choices=PostProcessor.valid_list(), + default="l3-hinge", + metavar="STR", + help="the default post processor used in the prediction", + ) + + parser.add_argument( + "--verbose-level", + type=int, + choices=logging_util.log_levels.keys(), + default=1, + metavar="INT", + help=f"the verbose level, {', '.join([str(k) + ' for ' + logging.getLevelName(v) for k, v in logging_util.log_levels.items()])}, default 1", 
+ ) + + parsed_args = parser.parse_args(args) + return parsed_args + + +def train(args): + """Train Text2Text model + + Args: + args (argparse.Namespace): Command line arguments parsed by `parser.parse_args()` + """ + + pred_kwargs = { + "beam_size": args.beam_size, + "only_topk": args.only_topk, + "post_processor": args.post_processor, + } + + vectorizer_config = Vectorizer.load_config_from_args(args) + + t2t_model = Text2Text.train( + args.input_text_path, + args.output_item_path, + label_embed_type=args.label_embed_type, + max_leaf_size=args.max_leaf_size, + nr_splits=args.nr_splits, + vectorizer_config=vectorizer_config, + dtype=args.dtype, + indexer_algo=[args.indexer], + imbalanced_ratio=args.imbalanced_ratio, + imbalanced_depth=args.imbalanced_depth, + spherical=not args.no_spherical, + seed=args.seed, + max_iter=args.max_iter, + threads=args.threads, + solver_type=args.solver_type, + Cp=args.Cp, + Cn=args.Cn, + bias=args.bias, + threshold=args.threshold, + negative_sampling_scheme=args.negative_sampling, + pred_kwargs=pred_kwargs, + workspace_folder=args.workspace_folder, + ) + + t2t_model.save(args.model_folder) + + +if __name__ == "__main__": + args = parse_arguments(sys.argv[1:]) + logging_util.setup_logging_config(level=args.verbose_level) + train(args) diff --git a/pecos/core/__init__.py b/pecos/core/__init__.py new file mode 100644 index 00000000..f47a8e4d --- /dev/null +++ b/pecos/core/__init__.py @@ -0,0 +1,21 @@ +# Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance +# with the License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions +# and limitations under the License. +from .base import ( # noqa + XLINEAR_SOLVERS, # noqa + XLINEAR_INFERENCE_MODEL_TYPES, # noqa + ScipyCscF32, # noqa + ScipyCsrF32, # noqa + ScipyDrmF32, # noqa + ScipyDcmF32, # noqa + ScipyCoordinateSparseAllocator, # noqa + ScipyCompressedSparseAllocator, # noqa + clib, # noqa +) # noqa diff --git a/pecos/core/base.py b/pecos/core/base.py new file mode 100644 index 00000000..f76e5db0 --- /dev/null +++ b/pecos/core/base.py @@ -0,0 +1,1300 @@ +# Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance +# with the License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions +# and limitations under the License. 
+import copy +import ctypes +import logging +import os +from ctypes import ( + CDLL, + CFUNCTYPE, + POINTER, + byref, + c_bool, + c_char_p, + c_double, + c_float, + c_int, + c_int32, + c_uint32, + c_uint64, + c_void_p, + cast, +) +from glob import glob +from subprocess import check_output + +import numpy as np +import pecos +import scipy.sparse as smat +from pecos.utils import smat_util + +LOGGER = logging.getLogger(__name__) + +XLINEAR_SOLVERS = {"L2R_L2LOSS_SVC_DUAL": 1, "L2R_L1LOSS_SVC_DUAL": 3, "L2R_LR_DUAL": 7} +XLINEAR_INFERENCE_MODEL_TYPES = {"CSC": 0, "HASH_CHUNKED": 1, "BINARY_SEARCH_CHUNKED": 2} +TFIDF_TOKENIZER_CODES = {"word": 10, "char": 20, "char_wb": 30} + + +class TfidfBaseVectorizerParam(ctypes.Structure): + """ + Python class for handling struct TfidfBaseVectorizerParam in tfidf.hpp + """ + + _fields_ = [ + ("min_ngram", c_int32), + ("max_ngram", c_int32), + ("max_length", c_int32), + ("max_feature", c_int32), + ("min_df_ratio", c_float), + ("max_df_ratio", c_float), + ("min_df_cnt", c_int32), + ("max_df_cnt", c_int32), + ("binary", c_bool), + ("use_idf", c_bool), + ("smooth_idf", c_bool), + ("sublinear_tf", c_bool), + ("keep_frequent_feature", c_bool), + ("norm_p", c_int32), + ("tok_type", c_int32), + ] + + DEFAULTS = { + "min_ngram": 1, + "max_ngram": 1, + "max_length": -1, + "max_feature": 0, + "min_df_ratio": 0.0, + "max_df_ratio": 1.0, + "min_df_cnt": 0, + "max_df_cnt": -1, + "binary": False, + "use_idf": True, + "smooth_idf": True, + "sublinear_tf": False, + "keep_frequent_feature": True, + "norm_p": 2, + "tok_type": TFIDF_TOKENIZER_CODES["word"], + } + + @classmethod + def get_default(cls, name): + return copy.deepcopy(cls.DEFAULTS[name]) + + def __init__(self, config_dict=None): + if config_dict is None: + config_dict = {} + + def extract_dict_key(config_dict, key, alias): + return config_dict.get(key, config_dict.get(alias, self.get_default(key))) + + config_dict["norm_p"] = extract_dict_key(config_dict, "norm_p", "norm") + # to support norm_p being "l1" or "l2" + if isinstance(config_dict["norm_p"], str): + config_dict["norm_p"] = int(config_dict["norm_p"][1:]) + if not (config_dict["norm_p"] == 1 or config_dict["norm_p"] == 2): + raise NotImplementedError("norm_p only supports 1 or 2") + + config_dict["tok_type"] = extract_dict_key(config_dict, "tok_type", "analyzer") + if isinstance(config_dict["tok_type"], str): + config_dict["tok_type"] = TFIDF_TOKENIZER_CODES[config_dict["tok_type"]] + + config_dict["max_length"] = extract_dict_key(config_dict, "max_length", "truncate_length") + + if "ngram_range" in config_dict: + config_dict["min_ngram"] = config_dict["ngram_range"][0] + config_dict["max_ngram"] = config_dict["ngram_range"][1] + + name2type = dict(TfidfBaseVectorizerParam._fields_) + for name in name2type: + setattr(self, name, name2type[name](config_dict.get(name, self.get_default(name)))) + + +class TfidfVectorizerParam(ctypes.Structure): + """ + Python class for handling struct TfidfVectorizerParam in tfidf.hpp + """ + + _fields_ = [ + ("base_param_ptr", POINTER(TfidfBaseVectorizerParam)), + ("num_base_vect", c_int32), + ("norm_p", c_int32), + ] + + def __init__(self, base_vect_param_list, norm_p): + + self.num_base_vect = len(base_vect_param_list) + self.c_base_params = (TfidfBaseVectorizerParam * self.num_base_vect)() + for i, base_vect_param in enumerate(base_vect_param_list): + self.c_base_params[i] = base_vect_param + + self.base_param_ptr = cast(self.c_base_params, POINTER(TfidfBaseVectorizerParam)) + self.num_base_vect =
c_int32(self.num_base_vect) + self.norm_p = c_int32(norm_p) + + +class ScipyCscF32(ctypes.Structure): + """ + PyMatrix for scipy.sparse.csc_matrix + """ + + _fields_ = [ + ("rows", c_uint32), + ("cols", c_uint32), + ("col_ptr", POINTER(c_uint64)), + ("row_idx", POINTER(c_uint32)), + ("val", POINTER(c_float)), + ] + + def __init__(self, A): + assert isinstance(A, smat.csc_matrix) + assert A.dtype == np.float32 + self.py_buf = { + "col_ptr": A.indptr.astype(np.uint64, copy=False), + "row_idx": A.indices.astype(np.uint32, copy=False), + "val": A.data.astype(np.float32, copy=False), + } + + self.rows = c_uint32(A.shape[0]) + self.cols = c_uint32(A.shape[1]) + name2type = dict(ScipyCscF32._fields_) + for name in self.py_buf: + setattr(self, name, self.py_buf[name].ctypes.data_as(name2type[name])) + self.buf = A + + @property + def dtype(self): + return self.buf.dtype + + @property + def shape(self): + return self.buf.shape + + @classmethod + def init_from(cls, A): + if A is None: + return None + elif isinstance(A, cls): + return A + else: + return cls(A) + + +class ScipyCsrF32(ctypes.Structure): + """ + PyMatrix for scipy.sparse.csr_matrix + """ + + _fields_ = [ + ("rows", c_uint32), + ("cols", c_uint32), + ("row_ptr", POINTER(c_uint64)), + ("col_idx", POINTER(c_uint32)), + ("val", POINTER(c_float)), + ] + + def __init__(self, A): + assert isinstance(A, smat.csr_matrix) + assert A.dtype == np.float32 + self.py_buf = { + "row_ptr": A.indptr.astype(np.uint64, copy=False), + "col_idx": A.indices.astype(np.uint32, copy=False), + "val": A.data.astype(np.float32, copy=False), + } + + self.rows = c_uint32(A.shape[0]) + self.cols = c_uint32(A.shape[1]) + name2type = dict(ScipyCsrF32._fields_) + for name in self.py_buf: + setattr(self, name, self.py_buf[name].ctypes.data_as(name2type[name])) + self.buf = A + + @classmethod + def init_from(cls, A): + if A is None: + return None + elif isinstance(A, cls): + return A + else: + return cls(A) + + @property + def dtype(self): + return self.buf.dtype + + @property + def shape(self): + return self.buf.shape + + def dot(self, other): + return self.buf.dot(other) + + +class ScipyDrmF32(ctypes.Structure): + """ + PyMatrix for row-major scipy.ndarray + """ + + _fields_ = [("rows", c_uint32), ("cols", c_uint32), ("val", POINTER(c_float))] + + def __init__(self, A): + assert isinstance(A, np.ndarray) + assert A.dtype == np.float32 + assert A.flags.c_contiguous is True + self.py_buf = {"val": A} + + self.rows = c_uint32(A.shape[0]) + self.cols = c_uint32(A.shape[1]) + name2type = dict(ScipyDrmF32._fields_) + for name in self.py_buf: + setattr(self, name, self.py_buf[name].ctypes.data_as(name2type[name])) + self.buf = A + + @classmethod + def init_from(cls, A): + if A is None: + return None + elif isinstance(A, cls): + return A + else: + return cls(A) + + @property + def dtype(self): + return self.buf.dtype + + @property + def shape(self): + return self.buf.shape + + def dot(self, other): + if isinstance(other, smat.spmatrix): + return other.T.dot(self.buf.T).T + else: + return self.buf.dot(other) + + +class ScipyDcmF32(ctypes.Structure): + """ + PyMatrix for col-major scipy.ndarray + """ + + _fields_ = [("rows", c_uint32), ("cols", c_uint32), ("val", POINTER(c_float))] + + def __init__(self, A): + assert isinstance(A, np.ndarray) + assert A.dtype == np.float32 + assert A.flags.f_contiguous is True + self.py_buf = {"val": A} + + self.rows = c_uint32(A.shape[0]) + self.cols = c_uint32(A.shape[1]) + name2type = dict(ScipyDcmF32._fields_) + for name in self.py_buf: + 
setattr(self, name, self.py_buf[name].ctypes.data_as(name2type[name])) + self.buf = A + + @classmethod + def init_from(cls, A): + if A is None: + return None + elif isinstance(A, cls): + return A + else: + return cls(A) + + @property + def dtype(self): + return self.buf.dtype + + @property + def shape(self): + return self.buf.shape + + def dot(self, other): + if isinstance(other, smat.spmatrix): + return other.T.dot(self.buf.T).T + else: + return self.buf.dot(other) + + +class ScipyCoordinateSparseAllocator(object): + """ + Scipy Coordinate Sparse Matrix Allocator for C++/C code + """ + + CFUNCTYPE = CFUNCTYPE(None, c_uint32, c_uint32, c_uint64, c_void_p, c_void_p, c_void_p) + + def __init__(self, rows=0, cols=0, dtype=np.float64): + self.rows = rows + self.cols = cols + self.row_idx = None + self.col_idx = None + self.data = None + self.dtype = dtype + assert dtype == np.float32 or dtype == np.float64 + + def __call__(self, rows, cols, nnz, row_ptr, col_ptr, val_ptr): + self.rows = rows + self.cols = cols + self.row_idx = np.zeros(nnz, dtype=np.uint64) + self.col_idx = np.zeros(nnz, dtype=np.uint64) + self.data = np.zeros(nnz, dtype=self.dtype) + cast(row_ptr, POINTER(c_uint64)).contents.value = self.row_idx.ctypes.data_as( + c_void_p + ).value + cast(col_ptr, POINTER(c_uint64)).contents.value = self.col_idx.ctypes.data_as( + c_void_p + ).value + cast(val_ptr, POINTER(c_uint64)).contents.value = self.data.ctypes.data_as(c_void_p).value + + def tocoo(self): + return smat.coo_matrix( + (self.data, (self.row_idx, self.col_idx)), shape=(self.rows, self.cols) + ) + + def tocsr(self): + return smat.csr_matrix( + (self.data, (self.row_idx, self.col_idx)), shape=(self.rows, self.cols) + ) + + def tocsc(self): + return smat.csc_matrix( + (self.data, (self.row_idx, self.col_idx)), shape=(self.rows, self.cols) + ) + + @property + def cfunc(self): + return self.CFUNCTYPE(self) + + +class ScipyCompressedSparseAllocator(object): + """ + Scipy Compressed Sparse Matrix Allocator for C++/C code, + which supports both smat.csr_matrix and smat.csc_matrix. + + Whether it is row or column major is controlled by self.is_col_major, + which is passed in by the first argument in the __call__(). + + Attributes: + CFUNCTYPE (ctypes.CFUNCTYPE): a function prototype creates functions that uses the standard C calling convention + """ + + CFUNCTYPE = CFUNCTYPE(None, c_bool, c_uint64, c_uint64, c_uint64, c_void_p, c_void_p, c_void_p) + + def __init__(self, rows=0, cols=0, dtype=np.float32): + self.cols = cols + self.rows = rows + self.indices = None + self.indptr = None + self.data = None + self.dtype = dtype + self.is_col_major = None + assert dtype == np.float32 + + def __call__(self, is_col_major, rows, cols, nnz, indices_ptr, indptr_ptr, data_ptr): + """ + Allocate memory for the members + + Parameters: + is_col_major (bool): specifying whether the to-be allocated matrix is row-majored or col-majored. + rows (int): the number of rows of the sparse matrix. + cols (int): the number of cols of the sparse matrix. + nnz (int): the number of non-zeros of the sparse matrix. + indptr_ptr (pointer): the pointer to the nnz array, of length (rows+1) or (cols+1). + indices_ptr (pointer): the pointer to the row/col indices array, of length nnz. + data_ptr (pointer): the pointer to the non-zero values array, of length nnz. 
+ + Returns: + None + """ + + self.cols = cols + self.rows = rows + self.is_col_major = is_col_major + if is_col_major: + self.indptr = np.zeros(cols + 1, dtype=np.uint64) + else: + self.indptr = np.zeros(rows + 1, dtype=np.uint64) + self.indices = np.zeros(nnz, dtype=np.uint32) + self.data = np.zeros(nnz, dtype=self.dtype) + + cast(indices_ptr, POINTER(c_uint64)).contents.value = self.indices.ctypes.data_as( + c_void_p + ).value + cast(indptr_ptr, POINTER(c_uint64)).contents.value = self.indptr.ctypes.data_as( + c_void_p + ).value + cast(data_ptr, POINTER(c_uint64)).contents.value = self.data.ctypes.data_as(c_void_p).value + + def get(self): + if self.is_col_major: + return smat_util.csc_matrix( + (self.data, self.indices, self.indptr), shape=(self.rows, self.cols) + ) + else: + return smat_util.csr_matrix( + (self.data, self.indices, self.indptr), shape=(self.rows, self.cols) + ) + + @property + def cfunc(self): + return self.CFUNCTYPE(self) + + +class corelib(object): + """ + The core functions for linear problems + """ + + @staticmethod + def fillprototype(f, restype, argtypes): + """ + Specify corelib function's return type and argument types. + + Args: + restype (single or list of ctypes): The return type. + argtypes (list of ctypes): The argument types. + """ + f.restype = restype + f.argtypes = argtypes + + @staticmethod + def load_dynamic_library(dirname, soname, forced_rebuild=False): + """ + Load compiled C library into Python. + If not found, will build upon loading. + + Args: + dirname (str): The directory of C library. + soname (str): The name of C library. + force_rebuild (bool, optional): Whether to force rebuild C library upon calling. + + Return: + c_lib (CDLL): Ctypes CDLL library. + """ + try: + if forced_rebuild: + check_output("make -C {} clean lib".format(dirname), shell=True) + path_to_so = glob(os.path.join(dirname, soname) + "*.so")[0] + _c_lib = CDLL(path_to_so) + except BaseException: + try: + check_output("make -C {} clean lib".format(dirname), shell=True) + path_to_so = glob(os.path.join(dirname, soname) + "*.so")[0] + _c_lib = CDLL(path_to_so) + except BaseException: + raise Exception("{soname} library cannot be found and built.".format(soname=soname)) + return _c_lib + + def __init__(self, dirname, soname, forced_rebuild=False): + self.clib_float32 = corelib.load_dynamic_library( + dirname, soname + "_float32", forced_rebuild=forced_rebuild + ) + self.link_xlinear_methods() + self.link_sparse_operations() + self.link_clustering() + self.link_tfidf_vectorizer() + + def link_xlinear_methods(self): + """ + Specify C-lib's Xlinear methods argument and return type. 
+ """ + arg_list = [ + POINTER(ScipyCsrF32), # CSR X + POINTER(ScipyCscF32), # CSC Y + POINTER(ScipyCscF32), # CSC C + POINTER(ScipyCscF32), # CSC M + POINTER(ScipyCscF32), # CSC R + ScipyCoordinateSparseAllocator.CFUNCTYPE, # py_coo_allocator + c_double, # threshold + c_uint32, # max_nonzeros_per_label + c_int, # solver_type + c_double, # Cp + c_double, # Cn + c_uint64, # max_iter + c_double, # eps + c_double, # bias + c_int, # threads + ] + corelib.fillprototype( + self.clib_float32.c_xlinear_single_layer_train_csr_f32, + None, + [POINTER(ScipyCsrF32)] + arg_list[1:], + ) + corelib.fillprototype( + self.clib_float32.c_xlinear_single_layer_train_drm_f32, + None, + [POINTER(ScipyDrmF32)] + arg_list[1:], + ) + + arg_list = [c_void_p] + corelib.fillprototype(self.clib_float32.c_xlinear_destruct_model, None, arg_list) + + # Interface for sparse prediction + arg_list = [ + c_void_p, + POINTER(ScipyCsrF32), + c_uint32, + c_char_p, + c_uint32, + c_int, + ScipyCompressedSparseAllocator.CFUNCTYPE, + ] + corelib.fillprototype(self.clib_float32.c_xlinear_predict_csr_f32, None, arg_list) + + # Interface for dense prediction + arg_list = [ + c_void_p, + POINTER(ScipyDrmF32), + c_uint32, + c_char_p, + c_uint32, + c_int, + ScipyCompressedSparseAllocator.CFUNCTYPE, + ] + corelib.fillprototype(self.clib_float32.c_xlinear_predict_drm_f32, None, arg_list) + + # c interface for loading just model tree directly (no tfidf) + res_list = c_void_p + arg_list = [c_char_p] + corelib.fillprototype(self.clib_float32.c_xlinear_load_model_from_disk, res_list, arg_list) + + res_list = c_void_p + arg_list = [c_char_p, c_int] + corelib.fillprototype( + self.clib_float32.c_xlinear_load_model_from_disk_ext, res_list, arg_list + ) + + # c interface for per-layer prediction + arg_list = [ + POINTER(ScipyCsrF32), + POINTER(ScipyCsrF32), + POINTER(ScipyCscF32), + POINTER(ScipyCscF32), + c_char_p, + c_uint32, + c_int, + c_float, + ScipyCompressedSparseAllocator.CFUNCTYPE, + ] + corelib.fillprototype( + self.clib_float32.c_xlinear_single_layer_predict_csr_f32, None, arg_list + ) + + arg_list = [ + POINTER(ScipyDrmF32), + POINTER(ScipyCsrF32), + POINTER(ScipyCscF32), + POINTER(ScipyCscF32), + c_char_p, + c_uint32, + c_int, + c_float, + ScipyCompressedSparseAllocator.CFUNCTYPE, + ] + corelib.fillprototype( + self.clib_float32.c_xlinear_single_layer_predict_drm_f32, None, arg_list + ) + + res_list = c_uint32 + arg_list = [c_void_p, c_char_p] + corelib.fillprototype(self.clib_float32.c_xlinear_get_int_attr, res_list, arg_list) + + def xlinear_load_predict_only( + self, + folder, + weight_matrix_type="BINARY_SEARCH_CHUNKED", + ): + """ + Load xlinear model in predict only mode. + + Args: + folder (str): The folder path for xlinear model. + weight_matrix_type (str, optional): The xlinear inference model types. + + Return: + cmodel (ptr): The pointer to xlinear model. + """ + weight_matrix_type_id = XLINEAR_INFERENCE_MODEL_TYPES[weight_matrix_type] + cmodel = self.clib_float32.c_xlinear_load_model_from_disk_ext( + c_char_p(folder.encode("utf-8")), c_int(int(weight_matrix_type_id)) + ) + return cmodel + + def xlinear_destruct_model(self, c_model): + """ + Destruct xlinear model. + + Args: + cmodel (ptr): The pointer to xlinear model. + """ + self.clib_float32.c_xlinear_destruct_model(c_model) + + def xlinear_predict( + self, + c_model, + X, + overriden_beam_size, + overriden_post_processor_str, + overriden_only_topk, + threads, + pred_alloc, + ): + """ + Performs a full prediction using the given model and queries. 
+ + Args: + c_model (c_pointer): A C pointer to the model to use for prediction. This pointer + is returned by the c_load_xlinear_model_from_disk and + c_load_xlinear_model_from_disk_ext functions in corelib.clib_float32. + X: The query matrix (admissible formats are smat.csr_matrix, + np.ndarray, ScipyCsrF32, or ScipyDrmF32). Note that if this is smat.csr_matrix, + the matrix must have sorted indices. You can call sort_indices() to ensure this. + overriden_beam_size (uint): Overrides the beam size to use for prediction. Use None for + model defaults. + overriden_post_processor_str (string): Overrides the post processor to use by name. Use + None for model defaults. + overriden_only_topk (uint): Overrides the number of results to return for each query. Use + None for model defaults. + threads (int): Sets the number of threads to use in computation. Use + -1 to use the maximum amount of available threads. + pred_alloc (ScipyCompressedSparseAllocator): The allocator to store the result in. + """ + clib = self.clib_float32 + + if isinstance(X, smat.csr_matrix): + if not X.has_sorted_indices: + raise ValueError("Query matrix does not have sorted indices!") + X = ScipyCsrF32.init_from(X) + elif isinstance(X, np.ndarray): + X = ScipyDrmF32.init_from(X) + + if isinstance(X, ScipyCsrF32): + c_predict = clib.c_xlinear_predict_csr_f32 + elif isinstance(X, ScipyDrmF32): + c_predict = clib.c_xlinear_predict_drm_f32 + else: + raise NotImplementedError("type(X) = {} not implemented".format(type(X))) + + c_predict( + c_model, + byref(X), + overriden_beam_size if overriden_beam_size else 0, + overriden_post_processor_str.encode("utf-8") if overriden_post_processor_str else None, + overriden_only_topk if overriden_only_topk else 0, + threads, + pred_alloc.cfunc, + ) + + def xlinear_single_layer_predict( + self, + X, + csr_codes, + W, + C, + post_processor_str, + only_topk, + num_threads, + bias, + pred_alloc, + ): + """ + Performs a single layer prediction in C++ using matrices owned by Python. + + Args: + X (csr_matrix): The query matrix. + Note that if this is smat.csr_matrix, the matrix must have sorted indices. + You can call sort_indices() to ensure this. + csr_codes (smat.csr_matrix or ScipyCsrF32): The prediction for the previous layer, None if this is the first layer. + W (smat.csc_matrix, ScipyCscF32): The weight matrix for this layer. + C (smat.csc_matrix, ScipyCscF32): The child/parent map for this layer. + post_processor_str (str): A string specifying which post processor to use. + only_topk (uint): How many results to return for each query. + num_threads (uint): How many threads to use in this computation. Set to -1 to use defaults. + bias (float): The bias of the model. + pred_alloc (ScipyCompressedSparseAllocator): The allocator to store the result in. 
+ """ + clib = self.clib_float32 + + post_processor_str = post_processor_str.encode("utf-8") + + W = ScipyCscF32.init_from(W) + + if isinstance(X, smat.csr_matrix): + if not X.has_sorted_indices: + raise ValueError("Query matrix does not have sorted indices!") + X = ScipyCsrF32.init_from(X) + elif isinstance(X, np.ndarray): + X = ScipyDrmF32.init_from(X) + + if isinstance(X, ScipyCsrF32): + c_single_layer_predict = clib.c_xlinear_single_layer_predict_csr_f32 + elif isinstance(X, ScipyDrmF32): + c_single_layer_predict = clib.c_xlinear_single_layer_predict_drm_f32 + else: + raise NotImplementedError("type(X) = {} not implemented".format(type(X))) + + # csr_codes and pC might be null + if csr_codes is not None: + csr_codes = ScipyCsrF32.init_from(csr_codes) + + if C is None: + C = smat.csc_matrix(np.ones((W.shape[1], 1), dtype=W.dtype)) + C = ScipyCscF32.init_from(C) + + c_single_layer_predict( + byref(X), + byref(csr_codes) if csr_codes is not None else None, + byref(W), + byref(C), + post_processor_str, + only_topk, + num_threads, + bias, + pred_alloc.cfunc, + ) + + def xlinear_single_layer_train( + self, + pX, + pY, + pC, + pM, + pR, + threshold=0.1, + max_nonzeros_per_label=None, + solver_type="L2R_L2LOSS_SVC_DUAL", + Cp=1.0, + Cn=1.0, + max_iter=1000, + eps=0.1, + bias=1.0, + threads=-1, + verbose=0, + **kwargs, + ): + """ + Performs a single layer training in C++ using matrices owned by Python. + + Args: + pX (ScipyCsrF32 or ScipyDrmF32): Instance feature matrix of shape (nr_inst, nr_feat). + pY (ScipyCscF32): Label matrix of shape (nr_inst, nr_labels). + pC (ScipyCscF32): Single matrix from clustering chain, representing a hierarchical clustering. + pM (ScipyCsrF32): Single matrix from matching chain. + pR (ScipyCscF32): Relevance matrix for cost-sensitive learning, of shape (nr_inst, nr_labels). + threshold (float, optional): sparsify the final model by eliminating all entrees with abs value less than threshold. + Default to 0.1. + max_nonzeros_per_label (int, optional): keep at most NONZEROS weight parameters per label in model. + Default None to set to (nr_feat + 1) + solver_type (string, optional): backend linear solver type. + Options: L2R_L2LOSS_SVC_DUAL(default), L2R_L1LOSS_SVC_DUAL. + Cp (float, optional): positive penalty parameter. Defaults to 1.0 + Cn (float, optional): negative penalty parameter. Defaults to 1.0 + max_iter (int, optional): maximum iterations. Defaults to 100 + eps (float, optional): epsilon. Defaults to 0.1 + bias (float, optional): if >0, append the bias value to each instance feature. Defaults to 1.0 + threads (int, optional): the number of threads to use for training. Defaults to -1 to use all + verbose (int, optional): verbose level. Defaults to 0 + + Return: + layer_train_res (smat.csc_matrix): The layer training result. 
+ """ + clib = self.clib_float32 + coo_alloc = ScipyCoordinateSparseAllocator(dtype=np.float32) + if isinstance(pX, ScipyCsrF32): + c_xlinear_single_layer_train = clib.c_xlinear_single_layer_train_csr_f32 + elif isinstance(pX, ScipyDrmF32): + c_xlinear_single_layer_train = clib.c_xlinear_single_layer_train_drm_f32 + else: + raise NotImplementedError("type(pX) = {} not implemented".format(type(pX))) + + c_xlinear_single_layer_train( + byref(pX), + byref(pY), + byref(pC) if pC is not None else None, + byref(pM) if pM is not None else None, + byref(pR) if pR is not None else None, + coo_alloc.cfunc, + threshold, + 0 if max_nonzeros_per_label is None else max_nonzeros_per_label, + XLINEAR_SOLVERS[solver_type], + Cp, + Cn, + max_iter, + eps, + bias, + threads, + ) + return coo_alloc.tocsc().astype(np.float32) + + def xlinear_get_int_attr(self, c_model, attr): + """ + Get int attribute from C xlinear model. + + Args: + c_model (ptr): The C xlinear model pointer. + attr (str): The attribute name to get. + + Return: + int_attr (int): The int attribute under given name. + """ + assert attr in { + "depth", + "nr_features", + "nr_labels", + "nr_codes", + }, f"attr {attr} not implemented" + return self.clib_float32.c_xlinear_get_int_attr(c_model, c_char_p(attr.encode("utf-8"))) + + def link_sparse_operations(self): + """ + Specify C-lib's sparse matrix operation methods argument and return type. + """ + arg_list = [ + POINTER(ScipyCscF32), # pX (should support both CSC and CSR) + POINTER(ScipyCscF32), # pY (should support both CSC and CSR) + ScipyCompressedSparseAllocator.CFUNCTYPE, # allocator for pZ + c_bool, # eliminate_zeros + c_bool, # sorted_indices + c_int, # threads + ] + corelib.fillprototype( + self.clib_float32.c_sparse_matmul_csc_f32, + None, + [POINTER(ScipyCscF32), POINTER(ScipyCscF32)] + arg_list[2:], + ) + corelib.fillprototype( + self.clib_float32.c_sparse_matmul_csr_f32, + None, + [POINTER(ScipyCsrF32), POINTER(ScipyCsrF32)] + arg_list[2:], + ) + + arg_list = [ + POINTER(ScipyCsrF32), # pX + POINTER(ScipyCscF32), # pW + c_uint64, # len + POINTER(c_uint32), # X_row_idx + POINTER(c_uint32), # W_col_idx + POINTER(c_float), # val + c_int, # threads + ] + corelib.fillprototype( + self.clib_float32.c_sparse_inner_products_csr_f32, + None, + [POINTER(ScipyCsrF32)] + arg_list[1:], + ) + corelib.fillprototype( + self.clib_float32.c_sparse_inner_products_drm_f32, + None, + [POINTER(ScipyDrmF32)] + arg_list[1:], + ) + + def sparse_matmul(self, X, Y, eliminate_zeros=False, sorted_indices=True, threads=-1): + """ + Sparse-Sparse matrix multiplication with multithreading (shared-memory). + + Args: + X (smat.csc_matrix, smat.csr_matrix, ScipyCscF32, ScipyCsrF32): The first sparse matrix. + Y (smat.csc_matrix, smat.csr_matrix, ScipyCscF32, ScipyCsrF32): The second sparse matrix. + eliminate_zeros (bool, optional): if true, then eliminate (potential) zeros created by maxnnz in output matrix Z. Default is false. + sorted_indices (bool, optional): if true, then sort the Z.indices for the output matrix Z. Default is true. + threads (int, optional): The number of threads. Default -1 to use all cores. 
+
+        Return:
+            matmul_res (smat.csc_matrix or smat.csr_matrix): The matrix multiplication result of X and Y.
+        """
+
+        if X.shape[1] != Y.shape[0]:
+            raise ValueError("X.shape[1]={} != Y.shape[0]={}".format(X.shape[1], Y.shape[0]))
+
+        clib = self.clib_float32
+        pred_alloc = ScipyCompressedSparseAllocator()
+
+        def is_col_major(X):
+            return isinstance(X, smat.csc_matrix) or isinstance(X, ScipyCscF32)
+
+        def is_row_major(X):
+            return isinstance(X, smat.csr_matrix) or isinstance(X, ScipyCsrF32)
+
+        if is_col_major(X) and is_col_major(Y):
+            pX = ScipyCscF32.init_from(X)
+            pY = ScipyCscF32.init_from(Y)
+            clib.c_sparse_matmul_csc_f32(
+                pX, pY, pred_alloc.cfunc, eliminate_zeros, sorted_indices, threads
+            )
+        elif is_row_major(X) and is_row_major(Y):
+            pX = ScipyCsrF32.init_from(X)
+            pY = ScipyCsrF32.init_from(Y)
+            clib.c_sparse_matmul_csr_f32(
+                pX, pY, pred_alloc.cfunc, eliminate_zeros, sorted_indices, threads
+            )
+        elif is_col_major(X) and is_row_major(Y):
+            if X.nnz > Y.nnz:
+                Y = Y.tocsc()
+                pX = ScipyCscF32.init_from(X)
+                pY = ScipyCscF32.init_from(Y)
+                clib.c_sparse_matmul_csc_f32(
+                    pX, pY, pred_alloc.cfunc, eliminate_zeros, sorted_indices, threads
+                )
+            else:
+                X = X.tocsr()
+                pX = ScipyCsrF32.init_from(X)
+                pY = ScipyCsrF32.init_from(Y)
+                clib.c_sparse_matmul_csr_f32(
+                    pX, pY, pred_alloc.cfunc, eliminate_zeros, sorted_indices, threads
+                )
+        elif is_row_major(X) and is_col_major(Y):
+            if X.nnz > Y.nnz:
+                Y = Y.tocsr()
+                pX = ScipyCsrF32.init_from(X)
+                pY = ScipyCsrF32.init_from(Y)
+                clib.c_sparse_matmul_csr_f32(
+                    pX, pY, pred_alloc.cfunc, eliminate_zeros, sorted_indices, threads
+                )
+            else:
+                X = X.tocsc()
+                pX = ScipyCscF32.init_from(X)
+                pY = ScipyCscF32.init_from(Y)
+                clib.c_sparse_matmul_csc_f32(
+                    pX, pY, pred_alloc.cfunc, eliminate_zeros, sorted_indices, threads
+                )
+        else:
+            raise ValueError(
+                "X and Y should be either csr_matrix/csc_matrix/ScipyCscF32/ScipyCsrF32 !"
+            )
+
+        return pred_alloc.get()
+
+    def sparse_inner_products(self, pX, pW, X_row_idx, W_col_idx, pred_values=None, threads=-1):
+        """
+        Sparse-Sparse matrix batch inner product with multithreading (shared-memory).
+        Computes inner products between the rows of `pX` selected by `X_row_idx` and the
+        columns of `pW` selected by `W_col_idx`.
+        Results will be written into `pred_values` if provided; otherwise, a new array is created for the results.
+
+        Args:
+            pX (ScipyCsrF32 or ScipyDrmF32): The feature matrix (sparse CSR or dense row-major).
+            pW (ScipyCscF32): The second matrix, in column-major form.
+            X_row_idx (ndarray): Row indexes for `pX`.
+            W_col_idx (ndarray): Column indexes for `pW`.
+            pred_values (ndarray, optional): The inner product result array.
+            threads (int, optional): The number of threads. Default -1 to use all cores.
+
+        Return:
+            pred_values (ndarray): The matrix batch inner product results.
+                If `pred_values` not given, return a new allocated ndarray, dtype same as `pW`.
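+
+        Example:
+            A minimal illustrative sketch (shapes and indexes are hypothetical)::
+
+                import numpy as np
+                import scipy.sparse as smat
+
+                X = smat.random(4, 8, density=0.5, format="csr", dtype=np.float32)
+                W = smat.random(8, 3, density=0.5, format="csc", dtype=np.float32)
+                pX = ScipyCsrF32.init_from(X.sorted_indices())
+                pW = ScipyCscF32.init_from(W)
+                rows = np.array([0, 1, 3], dtype=np.uint32)
+                cols = np.array([2, 0, 1], dtype=np.uint32)
+                # vals[i] corresponds to the inner product of X[rows[i], :] and W[:, cols[i]]
+                vals = clib.sparse_inner_products(pX, pW, rows, cols)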
+ """ + clib = self.clib_float32 + + nnz = len(X_row_idx) + assert nnz == len(W_col_idx) + + if not isinstance(pW, ScipyCscF32): + raise NotImplementedError("type(pW) = {} no implemented".format(type(pW))) + + if isinstance(pX, ScipyCsrF32): + c_sparse_inner_products = clib.c_sparse_inner_products_csr_f32 + elif isinstance(pX, ScipyDrmF32): + c_sparse_inner_products = clib.c_sparse_inner_products_drm_f32 + else: + raise NotImplementedError("type(pX) = {} no implemented".format(type(pX))) + + if pred_values is None or len(pred_values) != nnz or pred_values.dtype != np.float32: + pred_values = np.zeros(nnz, pW.dtype) + + c_sparse_inner_products( + byref(pX), + byref(pW), + nnz, + X_row_idx.ctypes.data_as(POINTER(c_uint32)), + W_col_idx.ctypes.data_as(POINTER(c_uint32)), + pred_values.ctypes.data_as(POINTER(c_float)), + threads, + ) + return pred_values + + def link_clustering(self): + """ + Specify C-lib's clustering method argument and return type. + """ + arg_list = [ + POINTER(ScipyCsrF32), + c_uint32, + c_uint32, + c_int, + c_uint32, + c_int, + POINTER(c_uint32), + ] + corelib.fillprototype( + self.clib_float32.c_run_clustering_csr_f32, None, [POINTER(ScipyCsrF32)] + arg_list[1:] + ) + corelib.fillprototype( + self.clib_float32.c_run_clustering_drm_f32, None, [POINTER(ScipyDrmF32)] + arg_list[1:] + ) + + def run_clustering(self, py_feat_mat, depth, algo, seed, codes=None, max_iter=10, threads=-1): + """ + Run clustering with given label embedding matrix and parameters in C++. + + Args: + py_feat_mat (ScipyCsrF32, ScipyDrmF32): label embedding matrix. (num_labels x num_features). + depth (int): Depth of K-means clustering N-nary tree. + algo (str): The algorithm for clustering, either `KMEANS` or `SKMEANS`. + seed (int): Randoms seed. + codes (ndarray, optional): Label clustering results. + max_iter (int, optional): Maximum number of iter for reordering each node based on score. + threads (int, optional): The number of threads. Default -1 to use all cores. + + Return: + codes (ndarray): The clustering result. + If `codes` not given, return a new allocated ndarray, dtype `np.uint32`. + """ + clib = self.clib_float32 + if isinstance(py_feat_mat, ScipyCsrF32): + run_clustering = clib.c_run_clustering_csr_f32 + elif isinstance(py_feat_mat, ScipyDrmF32): + run_clustering = clib.c_run_clustering_drm_f32 + else: + raise NotImplementedError( + "type(py_feat_mat) = {} no implemented".format(type(py_feat_mat)) + ) + + if codes is None or len(codes) != py_feat_mat.shape[0] or codes.dtype != np.uint32: + codes = np.zeros(py_feat_mat.rows, dtype=np.uint32) + run_clustering( + byref(py_feat_mat), + depth, + algo, + seed, + max_iter, + threads, + codes.ctypes.data_as(POINTER(c_uint32)), + ) + return codes + + def link_tfidf_vectorizer(self): + """ + Specify C-lib's Tfidf vectorizer method argument and return type. 
+ """ + res_list = c_void_p + arg_list = [c_char_p] + corelib.fillprototype(self.clib_float32.c_tfidf_load, res_list, arg_list) + + arg_list = [c_void_p, c_char_p] + corelib.fillprototype(self.clib_float32.c_tfidf_save, None, arg_list) + + arg_list = [c_void_p] + corelib.fillprototype(self.clib_float32.c_tfidf_destruct, None, arg_list) + + arg_list = [ + c_int, # threads + ScipyCompressedSparseAllocator.CFUNCTYPE, # pred_alloc for result + ] + + # model, fname, fname_len, buffer_size + corelib.fillprototype( + self.clib_float32.c_tfidf_predict_from_file, + None, + [c_void_p, c_void_p, c_uint64, c_uint64] + arg_list, + ) + + # model, corpus, doc_lens, nr_docs + corelib.fillprototype( + self.clib_float32.c_tfidf_predict, + None, + [c_void_p, c_void_p, POINTER(c_uint64), c_uint64] + arg_list, + ) + + res_list = c_void_p + + # file-list, fname_lens, nr_files, param, buffer_size, threads + corelib.fillprototype( + self.clib_float32.c_tfidf_train_from_file, + res_list, + [c_void_p, POINTER(c_uint64), c_uint64, POINTER(TfidfVectorizerParam), c_uint64, c_int], + ) + # corpus, doc_lens, nr_docs, params, threads + corelib.fillprototype( + self.clib_float32.c_tfidf_train, + res_list, + [c_void_p, POINTER(c_uint64), c_uint64, POINTER(TfidfVectorizerParam), c_int], + ) + + def tfidf_destruct(self, model): + """ + Destruct Tfdif model. + + Args: + model (ptr): Pointer to C Tfdif model. + """ + if type(model) == c_void_p: + self.clib_float32.c_tfidf_destruct(model) + + def tfidf_save(self, model, save_dir): + """ + Save trained tfidf vectorizer to disk. + + Args: + save_dir (str): Folder to save the model. + """ + self.clib_float32.c_tfidf_save(model, c_char_p(save_dir.encode("utf-8"))) + + def tfidf_load(self, load_dir): + """ + Load a CppTfidf vectorizer from disk. + + Args: + load_dir (str): Folder inside which the model is loaded. + + Returns: + pointer to C instance tfidf::Vectorizer + """ + return self.clib_float32.c_tfidf_load(c_char_p(load_dir.encode("utf-8"))) + + def tfidf_train(self, trn_corpus, config=None): + """ + Train on a corpus. + + Args: + trn_corpus (list of str or str): Training corpus in the form of a list of strings or path to corpus file/folder. + config (dict): Dict with keyword arguments to pass to C++ class tfidf::Vectorizer. None to use default in TfidfVectorizerParam. + For TfidfVectorizerParam, the config should contain + base_vect_configs (List(Dict)): list of config (list[TfidfBaseVectorizerParam]) to be used for TfidfBaseVectorizerParam. + norm_p (int): after ensembling feature sub matrices, do row-wise normalization with norm_p. + buffer_size (int): if train from file, number of bytes allocated for file I/O. Set to 0 to use default value. + threads (int): number of threads to use, set to negative to use all + For TfidfBaseVectorizerParam, the config should contain + ngram_range (tuple of int): (min_ngram, max_ngram) + truncate_length (int): sequence truncation length, set to negative to disable + max_feature (int): maximum number of features allowed, set to 0 to disable + min_df_ratio (float, [0, max_df_ratio)): min ratio for document frequency truncation + max_df_ratio (float, (min_df_ratio, 1]): max ratio for document frequency truncation + min_df_cnt (int, [0, max_df_cnt)): min count for document frequency truncation + max_df_cnt (float, (min_df_cnt, Inf)): max count for document frequency truncation. Default -1 to disable. 
+                    binary (bool): whether to binarize term frequency, default False
+                    use_idf (bool): whether to use inverse document frequency, default True
+                    smooth_idf (bool): whether to smooth IDF by adding 1 to all DF counts, default True
+                    sublinear_tf (bool): whether to use sublinear mapping (log) on term frequency, default False
+                    keep_frequent_feature (bool): if max_feature > 0, will only keep max_feature features by
+                        ignoring features with low document frequency (if True, default),
+                        ignoring features with high document frequency (if False)
+                    norm (str, 'l1' or 'l2'): feature vector will have unit l1 or l2 norm
+                    analyzer (str, 'word', 'char' or 'char_wb'): Whether to use word or character n-grams.
+                        Option 'char_wb' creates character n-grams only from text inside word boundaries;
+                        n-grams at the edges of words are padded with space.
+                    buffer_size (int): if train from file, number of bytes allocated for file I/O. Set to 0 to use default value.
+                    threads (int): number of threads to use, set to negative to use all
+
+        Returns:
+            pointer to C instance tfidf::Vectorizer
+        """
+
+        # Check whether "base_vect_configs" is in config.keys()
+        # If not, this config is for TfidfBaseVectorizerParam.
+        # Otherwise, this config is for TfidfVectorizerParam.
+        if "base_vect_configs" not in config:
+            base_vect_param_list = [TfidfBaseVectorizerParam(config)]
+            norm_p = base_vect_param_list[0].norm_p
+        else:
+            base_vect_param_list = [
+                TfidfBaseVectorizerParam(base_vect_config)
+                for base_vect_config in config["base_vect_configs"]
+            ]
+            norm_p = config["norm_p"]
+        params = TfidfVectorizerParam(base_vect_param_list, norm_p)
+
+        if isinstance(trn_corpus, str):
+            if os.path.isfile(trn_corpus):  # train from a single corpus file
+                corpus_files = [trn_corpus]
+            elif os.path.isdir(trn_corpus):  # train from a folder of corpus files
+                corpus_files = [
+                    os.path.join(trn_corpus, f)
+                    for f in sorted(os.listdir(trn_corpus))
+                    if os.path.isfile(os.path.join(trn_corpus, f))
+                ]
+            else:
+                raise Exception("Failed to load training corpus from {}".format(trn_corpus))
+            nr_files = len(corpus_files)
+            c_corpusf_arr = (c_char_p * nr_files)()
+            c_corpusf_arr[:] = [line.encode("utf-8") for line in corpus_files]
+            fname_lens = np.array([len(line) for line in c_corpusf_arr], dtype=np.uint64)
+
+            model = self.clib_float32.c_tfidf_train_from_file(
+                c_corpusf_arr,
+                fname_lens.ctypes.data_as(POINTER(c_uint64)),
+                nr_files,
+                params,
+                config["buffer_size"],
+                config["threads"],
+            )
+        else:
+            nr_doc = len(trn_corpus)
+            c_corpus_arr = (c_char_p * nr_doc)()
+            c_corpus_arr[:] = [line.encode("utf-8") for line in trn_corpus]
+            doc_lens = np.array([len(line) for line in c_corpus_arr], dtype=np.uint64)
+
+            model = self.clib_float32.c_tfidf_train(
+                c_corpus_arr,
+                doc_lens.ctypes.data_as(POINTER(c_uint64)),
+                nr_doc,
+                params,
+                config["threads"],
+            )
+
+        return model
+
+    def tfidf_predict(self, model, corpus, buffer_size=0, threads=-1):
+        """
+        Vectorize a corpus.
+
+        Args:
+            model (ctypes.c_void_p): pointer to tfidf::Vectorizer model
+            corpus (list of str or str): List of strings to vectorize, or path to a text file to vectorize.
+            buffer_size (int, default 0): number of bytes used for file I/O when predicting from file, set to 0 to use default value
+            threads (int, default -1): number of threads to use for predict, set to negative to use all
+
+        Returns:
+            scipy.sparse.csr.csr_matrix: Matrix of features.
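+
+        Example:
+            A minimal illustrative sketch (the config keys are a hypothetical
+            subset of those documented in ``tfidf_train`` above)::
+
+                corpus = ["this is a sentence", "this is another sentence"]
+                config = {"ngram_range": (1, 1), "threads": -1}
+                model = clib.tfidf_train(corpus, config=config)
+                feat_mat = clib.tfidf_predict(model, corpus)  # csr_matrix, one row per document
+                clib.tfidf_destruct(model)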
+ """ + pred_alloc = ScipyCompressedSparseAllocator() + if isinstance(corpus, str): + # train from file + assert os.path.isfile(corpus), "Cannot predict from {}!".format(corpus) + corpus_utf8 = corpus.encode("utf-8") + + self.clib_float32.c_tfidf_predict_from_file( + model, + c_char_p(corpus_utf8), + len(corpus_utf8), + buffer_size, + threads, + pred_alloc.cfunc, + ) + + else: + # in memory predict + nr_doc = len(corpus) + c_corpus_arr = (c_char_p * nr_doc)() + c_corpus_arr[:] = [line.encode("utf-8") for line in corpus] + doc_lens = np.array([len(line) for line in c_corpus_arr], dtype=np.uint64) + + self.clib_float32.c_tfidf_predict( + model, + c_corpus_arr, + doc_lens.ctypes.data_as(POINTER(c_uint64)), + nr_doc, + threads, + pred_alloc.cfunc, + ) + return pred_alloc.get() + + +clib = corelib(os.path.join(os.path.dirname(os.path.abspath(pecos.__file__)), "core"), "libpecos") diff --git a/pecos/core/libpecos.cpp b/pecos/core/libpecos.cpp new file mode 100644 index 00000000..35281c58 --- /dev/null +++ b/pecos/core/libpecos.cpp @@ -0,0 +1,276 @@ +/* + * Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance + * with the License. A copy of the License is located at + * + * http://aws.amazon.com/apache2.0/ + * + * or in the "license" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES + * OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + */ + +#include "utils/clustering.hpp" +#include "utils/matrix.hpp" +#include "utils/tfidf.hpp" +#include "xmc/inference.hpp" +#include "xmc/linear_solver.hpp" + +// ===== C Interface of Functions ====== +// C Interface of Types/Structures can be found in utils/matrix.hpp + +extern "C" { + // ==== C Interface of XMC Models ==== + void* c_xlinear_load_model_from_disk(const char* model_path) { + auto model = new pecos::HierarchicalMLModel(model_path); + return static_cast(model); + } + + void* c_xlinear_load_model_from_disk_ext(const char* model_path, + int weight_matrix_type) { + pecos::layer_type_t type = static_cast(weight_matrix_type); + auto model = new pecos::HierarchicalMLModel(model_path, type); + return static_cast(model); + } + + void c_xlinear_destruct_model(void* ptr) { + pecos::HierarchicalMLModel* mc = static_cast(ptr); + delete mc; + } + + // Obtain attribute values of the model. 
+    // Allowed attr: depth, nr_features, nr_labels, nr_codes
+    uint32_t c_xlinear_get_int_attr(void* ptr, const char* attr) {
+        pecos::HierarchicalMLModel* mc = static_cast<pecos::HierarchicalMLModel*>(ptr);
+        return mc->get_int_attr(attr);
+    }
+
+    #define C_XLINEAR_PREDICT(SUFFIX, PY_MAT, C_MAT) \
+    void c_xlinear_predict ## SUFFIX( \
+        void* ptr, \
+        const PY_MAT* input_x, \
+        const uint32_t overridden_beam_size, \
+        const char* overridden_post_processor_str, \
+        const uint32_t overridden_only_topk, \
+        const int threads, \
+        py_sparse_allocator_t pred_alloc) { \
+        pecos::HierarchicalMLModel* mc = static_cast<pecos::HierarchicalMLModel*>(ptr); \
+        C_MAT X(input_x); \
+        pecos::csr_t result; \
+        mc->predict(X, result, overridden_beam_size, overridden_post_processor_str, \
+                    overridden_only_topk, threads); \
+        result.create_pycsr(pred_alloc); \
+        result.free_underlying_memory(); \
+    }
+    C_XLINEAR_PREDICT(_csr_f32, ScipyCsrF32, pecos::csr_t)
+    C_XLINEAR_PREDICT(_drm_f32, ScipyDrmF32, pecos::drm_t)
+
+
+    #define C_XLINEAR_SINGLE_LAYER_PREDICT(SUFFIX, PY_MAT, C_MAT) \
+    void c_xlinear_single_layer_predict ## SUFFIX( \
+        const PY_MAT* input_x, \
+        const ScipyCsrF32* csr_codes, \
+        ScipyCscF32* W, \
+        ScipyCscF32* C, \
+        const char* post_processor_str, \
+        const uint32_t only_topk, \
+        const int num_threads, \
+        const float bias, \
+        py_sparse_allocator_t pred_alloc) { \
+        C_MAT X(input_x); \
+        pecos::csr_t last_layer_pred; \
+        bool is_first_layer; \
+        if (csr_codes) { \
+            last_layer_pred = pecos::csr_t(csr_codes).deep_copy(); \
+            is_first_layer = false; \
+        } else { \
+            last_layer_pred.fill_ones(X.rows, 1); \
+            is_first_layer = true; \
+        } \
+        pecos::csc_t C_; \
+        C_ = pecos::csc_t(C); \
+        pecos::csr_t cur_layer_pred; \
+        pecos::csc_t W_ = pecos::csc_t(W); \
+        pecos::MLModelMetadata metadata(bias, only_topk, post_processor_str); \
+        pecos::MLModel layer(W_, C_, 0, false, metadata); \
+        layer.predict(X, last_layer_pred, is_first_layer, only_topk, \
+                      post_processor_str, cur_layer_pred, num_threads); \
+        cur_layer_pred.create_pycsr(pred_alloc); \
+        cur_layer_pred.free_underlying_memory(); \
+    }
+    C_XLINEAR_SINGLE_LAYER_PREDICT(_csr_f32, ScipyCsrF32, pecos::csr_t)
+    C_XLINEAR_SINGLE_LAYER_PREDICT(_drm_f32, ScipyDrmF32, pecos::drm_t)
+
+
+    #define C_XLINEAR_SINGLE_LAYER_TRAIN(SUFFIX, PY_MAT, C_MAT) \
+    void c_xlinear_single_layer_train ## SUFFIX( \
+        const PY_MAT *pX, \
+        const ScipyCscF32 *pY, \
+        const ScipyCscF32 *pC, \
+        const ScipyCscF32 *pM, \
+        const ScipyCscF32 *pR, \
+        py_coo_allocator_t coo_alloc, \
+        double threshold, \
+        uint32_t max_nonzeros_per_label, \
+        int solver_type, \
+        double Cp, \
+        double Cn, \
+        size_t max_iter, \
+        double eps, \
+        double bias, \
+        int threads) { \
+        const C_MAT feat_mat(pX); \
+        const pecos::csc_t Y(pY); \
+        const pecos::csc_t& C = (pC == NULL) ? pecos::csc_t() : pecos::csc_t(pC); \
+        const pecos::csc_t& M = (pM == NULL) ? pecos::csc_t() : pecos::csc_t(pM); \
+        const pecos::csc_t& R = (pR == NULL) ? pecos::csc_t() : pecos::csc_t(pR); \
+        pecos::linear_solver::SVMParameter param(solver_type, Cp, Cn, max_iter, eps, bias); \
+        pecos::coo_t model; \
+        pecos::linear_solver::multilabel_train_with_codes( \
+            &feat_mat, \
+            &Y, \
+            (pC == NULL) ? NULL : &C, \
+            (pM == NULL) ? NULL : &M, \
+            (pR == NULL) ? NULL : &R, \
+            &model, \
+            threshold, \
+            max_nonzeros_per_label, \
+            &param, \
+            threads \
+        ); \
+        model.create_pycoo(coo_alloc); \
+    }
+    C_XLINEAR_SINGLE_LAYER_TRAIN(_csr_f32, ScipyCsrF32, pecos::csr_t)
+    C_XLINEAR_SINGLE_LAYER_TRAIN(_drm_f32, ScipyDrmF32, pecos::drm_t)
+
+    // ==== C Interface of Sparse Matrix/Vector Operations ====
+
+    #define C_SPARSE_MATMUL(SUFFIX, PY_MAT, C_MAT) \
+    void c_sparse_matmul ## SUFFIX( \
+        const PY_MAT* pX, \
+        const PY_MAT* pY, \
+        py_sparse_allocator_t pred_alloc, \
+        const bool eliminate_zeros, \
+        const bool sorted_indices, \
+        int threads) { \
+        C_MAT X(pX); \
+        C_MAT Y(pY); \
+        pecos::spmm_mat_t<C_MAT> Z(pred_alloc); \
+        smat_x_smat(X, Y, Z, eliminate_zeros, sorted_indices, threads); \
+    }
+    C_SPARSE_MATMUL(_csc_f32, ScipyCscF32, pecos::csc_t)
+    C_SPARSE_MATMUL(_csr_f32, ScipyCsrF32, pecos::csr_t)
+
+
+    #define C_SPARSE_INNER_PRODUCTS(SUFFIX, PY_MAT, C_MAT) \
+    void c_sparse_inner_products ## SUFFIX( \
+        const PY_MAT *pX, \
+        const ScipyCscF32 *pW, \
+        uint64_t len, \
+        uint32_t *X_row_idx, \
+        uint32_t *W_col_idx, \
+        float32_t *val, \
+        int threads) { \
+        C_MAT X(pX); \
+        pecos::csc_t W(pW); \
+        compute_sparse_entries_from_rowmajored_X_and_colmajored_M( \
+            X, W, len, X_row_idx, W_col_idx, val, threads \
+        ); \
+    }
+    C_SPARSE_INNER_PRODUCTS(_csr_f32, ScipyCsrF32, pecos::csr_t)
+    C_SPARSE_INNER_PRODUCTS(_drm_f32, ScipyDrmF32, pecos::drm_t)
+
+    // ==== C Interface of Clustering ====
+
+    #define C_RUN_CLUSTERING(SUFFIX, PY_MAT, C_MAT) \
+    void c_run_clustering ## SUFFIX( \
+        const PY_MAT* py_mat_ptr, \
+        uint32_t depth, \
+        uint32_t partition_algo, \
+        int seed, \
+        uint32_t max_iter, \
+        int threads, \
+        uint32_t* label_codes) { \
+        C_MAT feat_mat(py_mat_ptr); \
+        pecos::clustering::Tree tree(depth); \
+        tree.run_clustering(feat_mat, partition_algo, seed, label_codes, max_iter, threads); \
+    }
+    C_RUN_CLUSTERING(_csr_f32, ScipyCsrF32, pecos::csr_t)
+    C_RUN_CLUSTERING(_drm_f32, ScipyDrmF32, pecos::drm_t)
+
+    // ==== C Interface of TFIDF vectorizer ====
+
+    void* c_tfidf_train_from_file(
+        void* corpus_files_ptr,
+        const size_t* fname_lens,
+        size_t nr_files,
+        const pecos::tfidf::TfidfVectorizerParam* param_ptr,
+        size_t buffer_size,
+        int threads) {
+        const char** corpus_files = static_cast<const char**>(corpus_files_ptr);
+        pecos::tfidf::Vectorizer* vect = new pecos::tfidf::Vectorizer(param_ptr);
+        vect->train_from_file(corpus_files, fname_lens, nr_files, buffer_size, threads);
+        return static_cast<void*>(vect);
+    }
+
+    void* c_tfidf_train(
+        void* corpus_ptr,
+        const size_t* doc_lens,
+        size_t nr_doc,
+        const pecos::tfidf::TfidfVectorizerParam* param_ptr,
+        int threads) {
+        const char** corpus = static_cast<const char**>(corpus_ptr);
+        pecos::tfidf::Vectorizer* vect = new pecos::tfidf::Vectorizer(param_ptr);
+        vect->train(corpus, doc_lens, nr_doc, threads);
+        return static_cast<void*>(vect);
+    }
+
+    void* c_tfidf_load(const char* model_dir) {
+        pecos::tfidf::Vectorizer* vect = new pecos::tfidf::Vectorizer(model_dir);
+        return static_cast<void*>(vect);
+    }
+
+    void c_tfidf_save(void* ptr, const char* model_dir) {
+        pecos::tfidf::Vectorizer* vect = static_cast<pecos::tfidf::Vectorizer*>(ptr);
+        vect->save(model_dir);
+    }
+
+    void c_tfidf_destruct(void* ptr) {
+        pecos::tfidf::Vectorizer* vect = static_cast<pecos::tfidf::Vectorizer*>(ptr);
+        delete vect;
+    }
+
+    void c_tfidf_predict_from_file(
+        void* ptr,
+        void* corpus_fname_ptr,
+        size_t fname_len,
+        size_t buffer_size,
+        int threads,
+        py_sparse_allocator_t pred_alloc) {
+        pecos::tfidf::Vectorizer* vect = static_cast<pecos::tfidf::Vectorizer*>(ptr);
+        const char* corpus = static_cast<const char*>(corpus_fname_ptr);
+        pecos::spmm_mat_t<pecos::csr_t> feat_mat(pred_alloc);
+        vect->predict_from_file(corpus, fname_len, feat_mat, buffer_size, threads);
+    }
+
+    void c_tfidf_predict(
+        void* ptr,
+        void* corpus_ptr,
+        const size_t* doc_lens,
+        size_t nr_doc,
+        int threads,
+        py_sparse_allocator_t pred_alloc) {
+        pecos::tfidf::Vectorizer* vect = static_cast<pecos::tfidf::Vectorizer*>(ptr);
+        const char** corpus = static_cast<const char**>(corpus_ptr);
+        pecos::spmm_mat_t<pecos::csr_t> feat_mat(pred_alloc);
+        if(nr_doc > 1) {
+            vect->predict(corpus, doc_lens, nr_doc, feat_mat, threads);
+        } else if(nr_doc == 1) {
+            std::string_view cur_doc(corpus[0], doc_lens[0]);
+            vect->predict(cur_doc, feat_mat);
+        } else {
+            throw std::runtime_error("Invalid nr_doc " + std::to_string(nr_doc));
+        }
+    }
+}
diff --git a/pecos/core/third_party/nlohmann_json/json.hpp b/pecos/core/third_party/nlohmann_json/json.hpp
new file mode 100644
index 00000000..7fc88726
--- /dev/null
+++ b/pecos/core/third_party/nlohmann_json/json.hpp
@@ -0,0 +1,22875 @@
+/*
+    __ _____ _____ _____
+ __|  |   __|     |   | |  JSON for Modern C++
+|  |  |__   |  |  | | | |  version 3.7.3
+|_____|_____|_____|_|___|  https://github.com/nlohmann/json
+
+Licensed under the MIT License <http://opensource.org/licenses/MIT>.
+SPDX-License-Identifier: MIT
+Copyright (c) 2013-2019 Niels Lohmann <http://nlohmann.me>.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#ifndef INCLUDE_NLOHMANN_JSON_HPP_
+#define INCLUDE_NLOHMANN_JSON_HPP_
+
+#define NLOHMANN_JSON_VERSION_MAJOR 3
+#define NLOHMANN_JSON_VERSION_MINOR 7
+#define NLOHMANN_JSON_VERSION_PATCH 3
+
+#include <algorithm> // all_of, find, for_each
+#include <cassert> // assert
+#include <ciso646> // and, not, or
+#include <cstddef> // nullptr_t, ptrdiff_t, size_t
+#include <functional> // hash, less
+#include <initializer_list> // initializer_list
+#include <iosfwd> // istream, ostream
+#include <iterator> // random_access_iterator_tag
+#include <memory> // unique_ptr
+#include <numeric> // accumulate
+#include <string> // string, stoi, to_string
+#include <utility> // declval, forward, move, pair, swap
+#include <vector> // vector
+
+// #include <nlohmann/adl_serializer.hpp>
+
+
+#include <utility>
+
+// #include <nlohmann/detail/conversions/from_json.hpp>
+
+
+#include <algorithm> // transform
+#include <array> // array
+#include <ciso646> // and, not
+#include <forward_list> // forward_list
+#include <iterator> // inserter, front_inserter, end
+#include <map> // map
+#include <string> // string
+#include <tuple> // tuple, make_tuple
+#include <type_traits> // is_arithmetic, is_same, is_enum, underlying_type, is_convertible
+#include <unordered_map> // unordered_map
+#include <utility> // pair, declval
+#include <valarray> // valarray
+
+// #include <nlohmann/detail/exceptions.hpp>
+
+
+#include <exception> // exception
+#include <stdexcept> // runtime_error
+#include <string> // to_string
+
+// #include <nlohmann/detail/input/position_t.hpp>
+
+
+#include <cstddef> // size_t
+
+namespace nlohmann
+{
+namespace detail
+{
+/// struct to capture the start position of the current token
+struct position_t
+{
+    /// the total number of characters read
+    std::size_t chars_read_total = 0;
+    /// the number of characters read in the current line
+    std::size_t chars_read_current_line = 0;
+    /// the number of lines read
+    std::size_t lines_read = 0;
+
+    /// conversion to size_t to preserve SAX interface
+    constexpr operator size_t() const
+    {
+        return chars_read_total;
+    }
+};
+
+} // namespace detail
+} // namespace nlohmann
+
+// #include <nlohmann/detail/macro_scope.hpp>
+
+
+#include <utility> // pair
+// #include <nlohmann/thirdparty/hedley/hedley.hpp>
+/* Hedley - https://nemequ.github.io/hedley
+ * Created by Evan Nemerson <evan@nemerson.com>
+ *
+ * To the extent possible under law, the author(s) have dedicated all
+ * copyright and related and neighboring rights to this software to
+ * the public domain worldwide. This software is distributed without
+ * any warranty.
+ *
+ * For details, see <https://creativecommons.org/publicdomain/zero/1.0/>.
+ * SPDX-License-Identifier: CC0-1.0 + */ + +#if !defined(JSON_HEDLEY_VERSION) || (JSON_HEDLEY_VERSION < 11) +#if defined(JSON_HEDLEY_VERSION) + #undef JSON_HEDLEY_VERSION +#endif +#define JSON_HEDLEY_VERSION 11 + +#if defined(JSON_HEDLEY_STRINGIFY_EX) + #undef JSON_HEDLEY_STRINGIFY_EX +#endif +#define JSON_HEDLEY_STRINGIFY_EX(x) #x + +#if defined(JSON_HEDLEY_STRINGIFY) + #undef JSON_HEDLEY_STRINGIFY +#endif +#define JSON_HEDLEY_STRINGIFY(x) JSON_HEDLEY_STRINGIFY_EX(x) + +#if defined(JSON_HEDLEY_CONCAT_EX) + #undef JSON_HEDLEY_CONCAT_EX +#endif +#define JSON_HEDLEY_CONCAT_EX(a,b) a##b + +#if defined(JSON_HEDLEY_CONCAT) + #undef JSON_HEDLEY_CONCAT +#endif +#define JSON_HEDLEY_CONCAT(a,b) JSON_HEDLEY_CONCAT_EX(a,b) + +#if defined(JSON_HEDLEY_VERSION_ENCODE) + #undef JSON_HEDLEY_VERSION_ENCODE +#endif +#define JSON_HEDLEY_VERSION_ENCODE(major,minor,revision) (((major) * 1000000) + ((minor) * 1000) + (revision)) + +#if defined(JSON_HEDLEY_VERSION_DECODE_MAJOR) + #undef JSON_HEDLEY_VERSION_DECODE_MAJOR +#endif +#define JSON_HEDLEY_VERSION_DECODE_MAJOR(version) ((version) / 1000000) + +#if defined(JSON_HEDLEY_VERSION_DECODE_MINOR) + #undef JSON_HEDLEY_VERSION_DECODE_MINOR +#endif +#define JSON_HEDLEY_VERSION_DECODE_MINOR(version) (((version) % 1000000) / 1000) + +#if defined(JSON_HEDLEY_VERSION_DECODE_REVISION) + #undef JSON_HEDLEY_VERSION_DECODE_REVISION +#endif +#define JSON_HEDLEY_VERSION_DECODE_REVISION(version) ((version) % 1000) + +#if defined(JSON_HEDLEY_GNUC_VERSION) + #undef JSON_HEDLEY_GNUC_VERSION +#endif +#if defined(__GNUC__) && defined(__GNUC_PATCHLEVEL__) + #define JSON_HEDLEY_GNUC_VERSION JSON_HEDLEY_VERSION_ENCODE(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__) +#elif defined(__GNUC__) + #define JSON_HEDLEY_GNUC_VERSION JSON_HEDLEY_VERSION_ENCODE(__GNUC__, __GNUC_MINOR__, 0) +#endif + +#if defined(JSON_HEDLEY_GNUC_VERSION_CHECK) + #undef JSON_HEDLEY_GNUC_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_GNUC_VERSION) + #define JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_GNUC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_MSVC_VERSION) + #undef JSON_HEDLEY_MSVC_VERSION +#endif +#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 140000000) + #define JSON_HEDLEY_MSVC_VERSION JSON_HEDLEY_VERSION_ENCODE(_MSC_FULL_VER / 10000000, (_MSC_FULL_VER % 10000000) / 100000, (_MSC_FULL_VER % 100000) / 100) +#elif defined(_MSC_FULL_VER) + #define JSON_HEDLEY_MSVC_VERSION JSON_HEDLEY_VERSION_ENCODE(_MSC_FULL_VER / 1000000, (_MSC_FULL_VER % 1000000) / 10000, (_MSC_FULL_VER % 10000) / 10) +#elif defined(_MSC_VER) + #define JSON_HEDLEY_MSVC_VERSION JSON_HEDLEY_VERSION_ENCODE(_MSC_VER / 100, _MSC_VER % 100, 0) +#endif + +#if defined(JSON_HEDLEY_MSVC_VERSION_CHECK) + #undef JSON_HEDLEY_MSVC_VERSION_CHECK +#endif +#if !defined(_MSC_VER) + #define JSON_HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (0) +#elif defined(_MSC_VER) && (_MSC_VER >= 1400) + #define JSON_HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_FULL_VER >= ((major * 10000000) + (minor * 100000) + (patch))) +#elif defined(_MSC_VER) && (_MSC_VER >= 1200) + #define JSON_HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_FULL_VER >= ((major * 1000000) + (minor * 10000) + (patch))) +#else + #define JSON_HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_VER >= ((major * 100) + (minor))) +#endif + +#if defined(JSON_HEDLEY_INTEL_VERSION) + #undef JSON_HEDLEY_INTEL_VERSION +#endif +#if defined(__INTEL_COMPILER) && 
defined(__INTEL_COMPILER_UPDATE) + #define JSON_HEDLEY_INTEL_VERSION JSON_HEDLEY_VERSION_ENCODE(__INTEL_COMPILER / 100, __INTEL_COMPILER % 100, __INTEL_COMPILER_UPDATE) +#elif defined(__INTEL_COMPILER) + #define JSON_HEDLEY_INTEL_VERSION JSON_HEDLEY_VERSION_ENCODE(__INTEL_COMPILER / 100, __INTEL_COMPILER % 100, 0) +#endif + +#if defined(JSON_HEDLEY_INTEL_VERSION_CHECK) + #undef JSON_HEDLEY_INTEL_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_INTEL_VERSION) + #define JSON_HEDLEY_INTEL_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_INTEL_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_INTEL_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_PGI_VERSION) + #undef JSON_HEDLEY_PGI_VERSION +#endif +#if defined(__PGI) && defined(__PGIC__) && defined(__PGIC_MINOR__) && defined(__PGIC_PATCHLEVEL__) + #define JSON_HEDLEY_PGI_VERSION JSON_HEDLEY_VERSION_ENCODE(__PGIC__, __PGIC_MINOR__, __PGIC_PATCHLEVEL__) +#endif + +#if defined(JSON_HEDLEY_PGI_VERSION_CHECK) + #undef JSON_HEDLEY_PGI_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_PGI_VERSION) + #define JSON_HEDLEY_PGI_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_PGI_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_PGI_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_SUNPRO_VERSION) + #undef JSON_HEDLEY_SUNPRO_VERSION +#endif +#if defined(__SUNPRO_C) && (__SUNPRO_C > 0x1000) + #define JSON_HEDLEY_SUNPRO_VERSION JSON_HEDLEY_VERSION_ENCODE((((__SUNPRO_C >> 16) & 0xf) * 10) + ((__SUNPRO_C >> 12) & 0xf), (((__SUNPRO_C >> 8) & 0xf) * 10) + ((__SUNPRO_C >> 4) & 0xf), (__SUNPRO_C & 0xf) * 10) +#elif defined(__SUNPRO_C) + #define JSON_HEDLEY_SUNPRO_VERSION JSON_HEDLEY_VERSION_ENCODE((__SUNPRO_C >> 8) & 0xf, (__SUNPRO_C >> 4) & 0xf, (__SUNPRO_C) & 0xf) +#elif defined(__SUNPRO_CC) && (__SUNPRO_CC > 0x1000) + #define JSON_HEDLEY_SUNPRO_VERSION JSON_HEDLEY_VERSION_ENCODE((((__SUNPRO_CC >> 16) & 0xf) * 10) + ((__SUNPRO_CC >> 12) & 0xf), (((__SUNPRO_CC >> 8) & 0xf) * 10) + ((__SUNPRO_CC >> 4) & 0xf), (__SUNPRO_CC & 0xf) * 10) +#elif defined(__SUNPRO_CC) + #define JSON_HEDLEY_SUNPRO_VERSION JSON_HEDLEY_VERSION_ENCODE((__SUNPRO_CC >> 8) & 0xf, (__SUNPRO_CC >> 4) & 0xf, (__SUNPRO_CC) & 0xf) +#endif + +#if defined(JSON_HEDLEY_SUNPRO_VERSION_CHECK) + #undef JSON_HEDLEY_SUNPRO_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_SUNPRO_VERSION) + #define JSON_HEDLEY_SUNPRO_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_SUNPRO_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_SUNPRO_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_EMSCRIPTEN_VERSION) + #undef JSON_HEDLEY_EMSCRIPTEN_VERSION +#endif +#if defined(__EMSCRIPTEN__) + #define JSON_HEDLEY_EMSCRIPTEN_VERSION JSON_HEDLEY_VERSION_ENCODE(__EMSCRIPTEN_major__, __EMSCRIPTEN_minor__, __EMSCRIPTEN_tiny__) +#endif + +#if defined(JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK) + #undef JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_EMSCRIPTEN_VERSION) + #define JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_EMSCRIPTEN_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_ARM_VERSION) + #undef JSON_HEDLEY_ARM_VERSION +#endif +#if defined(__CC_ARM) && defined(__ARMCOMPILER_VERSION) + #define JSON_HEDLEY_ARM_VERSION JSON_HEDLEY_VERSION_ENCODE(__ARMCOMPILER_VERSION / 1000000, (__ARMCOMPILER_VERSION % 
1000000) / 10000, (__ARMCOMPILER_VERSION % 10000) / 100) +#elif defined(__CC_ARM) && defined(__ARMCC_VERSION) + #define JSON_HEDLEY_ARM_VERSION JSON_HEDLEY_VERSION_ENCODE(__ARMCC_VERSION / 1000000, (__ARMCC_VERSION % 1000000) / 10000, (__ARMCC_VERSION % 10000) / 100) +#endif + +#if defined(JSON_HEDLEY_ARM_VERSION_CHECK) + #undef JSON_HEDLEY_ARM_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_ARM_VERSION) + #define JSON_HEDLEY_ARM_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_ARM_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_ARM_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_IBM_VERSION) + #undef JSON_HEDLEY_IBM_VERSION +#endif +#if defined(__ibmxl__) + #define JSON_HEDLEY_IBM_VERSION JSON_HEDLEY_VERSION_ENCODE(__ibmxl_version__, __ibmxl_release__, __ibmxl_modification__) +#elif defined(__xlC__) && defined(__xlC_ver__) + #define JSON_HEDLEY_IBM_VERSION JSON_HEDLEY_VERSION_ENCODE(__xlC__ >> 8, __xlC__ & 0xff, (__xlC_ver__ >> 8) & 0xff) +#elif defined(__xlC__) + #define JSON_HEDLEY_IBM_VERSION JSON_HEDLEY_VERSION_ENCODE(__xlC__ >> 8, __xlC__ & 0xff, 0) +#endif + +#if defined(JSON_HEDLEY_IBM_VERSION_CHECK) + #undef JSON_HEDLEY_IBM_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_IBM_VERSION) + #define JSON_HEDLEY_IBM_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_IBM_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_IBM_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_TI_VERSION) + #undef JSON_HEDLEY_TI_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) + #define JSON_HEDLEY_TI_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(JSON_HEDLEY_TI_VERSION_CHECK) + #undef JSON_HEDLEY_TI_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_TI_VERSION) + #define JSON_HEDLEY_TI_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_TI_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_CRAY_VERSION) + #undef JSON_HEDLEY_CRAY_VERSION +#endif +#if defined(_CRAYC) + #if defined(_RELEASE_PATCHLEVEL) + #define JSON_HEDLEY_CRAY_VERSION JSON_HEDLEY_VERSION_ENCODE(_RELEASE_MAJOR, _RELEASE_MINOR, _RELEASE_PATCHLEVEL) + #else + #define JSON_HEDLEY_CRAY_VERSION JSON_HEDLEY_VERSION_ENCODE(_RELEASE_MAJOR, _RELEASE_MINOR, 0) + #endif +#endif + +#if defined(JSON_HEDLEY_CRAY_VERSION_CHECK) + #undef JSON_HEDLEY_CRAY_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_CRAY_VERSION) + #define JSON_HEDLEY_CRAY_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_CRAY_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_CRAY_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_IAR_VERSION) + #undef JSON_HEDLEY_IAR_VERSION +#endif +#if defined(__IAR_SYSTEMS_ICC__) + #if __VER__ > 1000 + #define JSON_HEDLEY_IAR_VERSION JSON_HEDLEY_VERSION_ENCODE((__VER__ / 1000000), ((__VER__ / 1000) % 1000), (__VER__ % 1000)) + #else + #define JSON_HEDLEY_IAR_VERSION JSON_HEDLEY_VERSION_ENCODE(VER / 100, __VER__ % 100, 0) + #endif +#endif + +#if defined(JSON_HEDLEY_IAR_VERSION_CHECK) + #undef JSON_HEDLEY_IAR_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_IAR_VERSION) + #define JSON_HEDLEY_IAR_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_IAR_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define 
JSON_HEDLEY_IAR_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_TINYC_VERSION) + #undef JSON_HEDLEY_TINYC_VERSION +#endif +#if defined(__TINYC__) + #define JSON_HEDLEY_TINYC_VERSION JSON_HEDLEY_VERSION_ENCODE(__TINYC__ / 1000, (__TINYC__ / 100) % 10, __TINYC__ % 100) +#endif + +#if defined(JSON_HEDLEY_TINYC_VERSION_CHECK) + #undef JSON_HEDLEY_TINYC_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_TINYC_VERSION) + #define JSON_HEDLEY_TINYC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TINYC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_TINYC_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_DMC_VERSION) + #undef JSON_HEDLEY_DMC_VERSION +#endif +#if defined(__DMC__) + #define JSON_HEDLEY_DMC_VERSION JSON_HEDLEY_VERSION_ENCODE(__DMC__ >> 8, (__DMC__ >> 4) & 0xf, __DMC__ & 0xf) +#endif + +#if defined(JSON_HEDLEY_DMC_VERSION_CHECK) + #undef JSON_HEDLEY_DMC_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_DMC_VERSION) + #define JSON_HEDLEY_DMC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_DMC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_DMC_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_COMPCERT_VERSION) + #undef JSON_HEDLEY_COMPCERT_VERSION +#endif +#if defined(__COMPCERT_VERSION__) + #define JSON_HEDLEY_COMPCERT_VERSION JSON_HEDLEY_VERSION_ENCODE(__COMPCERT_VERSION__ / 10000, (__COMPCERT_VERSION__ / 100) % 100, __COMPCERT_VERSION__ % 100) +#endif + +#if defined(JSON_HEDLEY_COMPCERT_VERSION_CHECK) + #undef JSON_HEDLEY_COMPCERT_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_COMPCERT_VERSION) + #define JSON_HEDLEY_COMPCERT_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_COMPCERT_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_COMPCERT_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_PELLES_VERSION) + #undef JSON_HEDLEY_PELLES_VERSION +#endif +#if defined(__POCC__) + #define JSON_HEDLEY_PELLES_VERSION JSON_HEDLEY_VERSION_ENCODE(__POCC__ / 100, __POCC__ % 100, 0) +#endif + +#if defined(JSON_HEDLEY_PELLES_VERSION_CHECK) + #undef JSON_HEDLEY_PELLES_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_PELLES_VERSION) + #define JSON_HEDLEY_PELLES_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_PELLES_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_PELLES_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_GCC_VERSION) + #undef JSON_HEDLEY_GCC_VERSION +#endif +#if \ + defined(JSON_HEDLEY_GNUC_VERSION) && \ + !defined(__clang__) && \ + !defined(JSON_HEDLEY_INTEL_VERSION) && \ + !defined(JSON_HEDLEY_PGI_VERSION) && \ + !defined(JSON_HEDLEY_ARM_VERSION) && \ + !defined(JSON_HEDLEY_TI_VERSION) && \ + !defined(__COMPCERT__) + #define JSON_HEDLEY_GCC_VERSION JSON_HEDLEY_GNUC_VERSION +#endif + +#if defined(JSON_HEDLEY_GCC_VERSION_CHECK) + #undef JSON_HEDLEY_GCC_VERSION_CHECK +#endif +#if defined(JSON_HEDLEY_GCC_VERSION) + #define JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_GCC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else + #define JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(JSON_HEDLEY_HAS_ATTRIBUTE) + #undef JSON_HEDLEY_HAS_ATTRIBUTE +#endif +#if defined(__has_attribute) + #define JSON_HEDLEY_HAS_ATTRIBUTE(attribute) __has_attribute(attribute) +#else + #define JSON_HEDLEY_HAS_ATTRIBUTE(attribute) (0) +#endif + +#if defined(JSON_HEDLEY_GNUC_HAS_ATTRIBUTE) + #undef 
JSON_HEDLEY_GNUC_HAS_ATTRIBUTE +#endif +#if defined(__has_attribute) + #define JSON_HEDLEY_GNUC_HAS_ATTRIBUTE(attribute,major,minor,patch) __has_attribute(attribute) +#else + #define JSON_HEDLEY_GNUC_HAS_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_GCC_HAS_ATTRIBUTE) + #undef JSON_HEDLEY_GCC_HAS_ATTRIBUTE +#endif +#if defined(__has_attribute) + #define JSON_HEDLEY_GCC_HAS_ATTRIBUTE(attribute,major,minor,patch) __has_attribute(attribute) +#else + #define JSON_HEDLEY_GCC_HAS_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_HAS_CPP_ATTRIBUTE) + #undef JSON_HEDLEY_HAS_CPP_ATTRIBUTE +#endif +#if \ + defined(__has_cpp_attribute) && \ + defined(__cplusplus) && \ + (!defined(JSON_HEDLEY_SUNPRO_VERSION) || JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,15,0)) + #define JSON_HEDLEY_HAS_CPP_ATTRIBUTE(attribute) __has_cpp_attribute(attribute) +#else + #define JSON_HEDLEY_HAS_CPP_ATTRIBUTE(attribute) (0) +#endif + +#if defined(JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS) + #undef JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS +#endif +#if !defined(__cplusplus) || !defined(__has_cpp_attribute) + #define JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) (0) +#elif \ + !defined(JSON_HEDLEY_PGI_VERSION) && \ + (!defined(JSON_HEDLEY_SUNPRO_VERSION) || JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,15,0)) && \ + (!defined(JSON_HEDLEY_MSVC_VERSION) || JSON_HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + #define JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) JSON_HEDLEY_HAS_CPP_ATTRIBUTE(ns::attribute) +#else + #define JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) (0) +#endif + +#if defined(JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE) + #undef JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE +#endif +#if defined(__has_cpp_attribute) && defined(__cplusplus) + #define JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) __has_cpp_attribute(attribute) +#else + #define JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE) + #undef JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE +#endif +#if defined(__has_cpp_attribute) && defined(__cplusplus) + #define JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) __has_cpp_attribute(attribute) +#else + #define JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_HAS_BUILTIN) + #undef JSON_HEDLEY_HAS_BUILTIN +#endif +#if defined(__has_builtin) + #define JSON_HEDLEY_HAS_BUILTIN(builtin) __has_builtin(builtin) +#else + #define JSON_HEDLEY_HAS_BUILTIN(builtin) (0) +#endif + +#if defined(JSON_HEDLEY_GNUC_HAS_BUILTIN) + #undef JSON_HEDLEY_GNUC_HAS_BUILTIN +#endif +#if defined(__has_builtin) + #define JSON_HEDLEY_GNUC_HAS_BUILTIN(builtin,major,minor,patch) __has_builtin(builtin) +#else + #define JSON_HEDLEY_GNUC_HAS_BUILTIN(builtin,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_GCC_HAS_BUILTIN) + #undef JSON_HEDLEY_GCC_HAS_BUILTIN +#endif +#if defined(__has_builtin) + #define JSON_HEDLEY_GCC_HAS_BUILTIN(builtin,major,minor,patch) __has_builtin(builtin) +#else + #define JSON_HEDLEY_GCC_HAS_BUILTIN(builtin,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_HAS_FEATURE) + #undef JSON_HEDLEY_HAS_FEATURE +#endif +#if defined(__has_feature) + #define JSON_HEDLEY_HAS_FEATURE(feature) 
__has_feature(feature) +#else + #define JSON_HEDLEY_HAS_FEATURE(feature) (0) +#endif + +#if defined(JSON_HEDLEY_GNUC_HAS_FEATURE) + #undef JSON_HEDLEY_GNUC_HAS_FEATURE +#endif +#if defined(__has_feature) + #define JSON_HEDLEY_GNUC_HAS_FEATURE(feature,major,minor,patch) __has_feature(feature) +#else + #define JSON_HEDLEY_GNUC_HAS_FEATURE(feature,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_GCC_HAS_FEATURE) + #undef JSON_HEDLEY_GCC_HAS_FEATURE +#endif +#if defined(__has_feature) + #define JSON_HEDLEY_GCC_HAS_FEATURE(feature,major,minor,patch) __has_feature(feature) +#else + #define JSON_HEDLEY_GCC_HAS_FEATURE(feature,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_HAS_EXTENSION) + #undef JSON_HEDLEY_HAS_EXTENSION +#endif +#if defined(__has_extension) + #define JSON_HEDLEY_HAS_EXTENSION(extension) __has_extension(extension) +#else + #define JSON_HEDLEY_HAS_EXTENSION(extension) (0) +#endif + +#if defined(JSON_HEDLEY_GNUC_HAS_EXTENSION) + #undef JSON_HEDLEY_GNUC_HAS_EXTENSION +#endif +#if defined(__has_extension) + #define JSON_HEDLEY_GNUC_HAS_EXTENSION(extension,major,minor,patch) __has_extension(extension) +#else + #define JSON_HEDLEY_GNUC_HAS_EXTENSION(extension,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_GCC_HAS_EXTENSION) + #undef JSON_HEDLEY_GCC_HAS_EXTENSION +#endif +#if defined(__has_extension) + #define JSON_HEDLEY_GCC_HAS_EXTENSION(extension,major,minor,patch) __has_extension(extension) +#else + #define JSON_HEDLEY_GCC_HAS_EXTENSION(extension,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE) + #undef JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE +#endif +#if defined(__has_declspec_attribute) + #define JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute) __has_declspec_attribute(attribute) +#else + #define JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute) (0) +#endif + +#if defined(JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE) + #undef JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE +#endif +#if defined(__has_declspec_attribute) + #define JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) __has_declspec_attribute(attribute) +#else + #define JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE) + #undef JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE +#endif +#if defined(__has_declspec_attribute) + #define JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) __has_declspec_attribute(attribute) +#else + #define JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_HAS_WARNING) + #undef JSON_HEDLEY_HAS_WARNING +#endif +#if defined(__has_warning) + #define JSON_HEDLEY_HAS_WARNING(warning) __has_warning(warning) +#else + #define JSON_HEDLEY_HAS_WARNING(warning) (0) +#endif + +#if defined(JSON_HEDLEY_GNUC_HAS_WARNING) + #undef JSON_HEDLEY_GNUC_HAS_WARNING +#endif +#if defined(__has_warning) + #define JSON_HEDLEY_GNUC_HAS_WARNING(warning,major,minor,patch) __has_warning(warning) +#else + #define JSON_HEDLEY_GNUC_HAS_WARNING(warning,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_GCC_HAS_WARNING) + #undef JSON_HEDLEY_GCC_HAS_WARNING +#endif +#if defined(__has_warning) + #define 
JSON_HEDLEY_GCC_HAS_WARNING(warning,major,minor,patch) __has_warning(warning) +#else + #define JSON_HEDLEY_GCC_HAS_WARNING(warning,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +/* JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_ is for + HEDLEY INTERNAL USE ONLY. API subject to change without notice. */ +#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_) + #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_ +#endif +#if defined(__cplusplus) && JSON_HEDLEY_HAS_WARNING("-Wc++98-compat") +# define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \ + xpr \ + JSON_HEDLEY_DIAGNOSTIC_POP +#else +# define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(x) x +#endif + +#if \ + (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) || \ + defined(__clang__) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,0,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) || \ + JSON_HEDLEY_PGI_VERSION_CHECK(18,4,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(6,0,0) || \ + JSON_HEDLEY_CRAY_VERSION_CHECK(5,0,0) || \ + JSON_HEDLEY_TINYC_VERSION_CHECK(0,9,17) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(8,0,0) || \ + (JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) && defined(__C99_PRAGMA_OPERATOR)) + #define JSON_HEDLEY_PRAGMA(value) _Pragma(#value) +#elif JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0) + #define JSON_HEDLEY_PRAGMA(value) __pragma(value) +#else + #define JSON_HEDLEY_PRAGMA(value) +#endif + +#if defined(JSON_HEDLEY_DIAGNOSTIC_PUSH) + #undef JSON_HEDLEY_DIAGNOSTIC_PUSH +#endif +#if defined(JSON_HEDLEY_DIAGNOSTIC_POP) + #undef JSON_HEDLEY_DIAGNOSTIC_POP +#endif +#if defined(__clang__) + #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("clang diagnostic push") + #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("clang diagnostic pop") +#elif JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("warning(push)") + #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("warning(pop)") +#elif JSON_HEDLEY_GCC_VERSION_CHECK(4,6,0) + #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("GCC diagnostic push") + #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("GCC diagnostic pop") +#elif JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_PUSH __pragma(warning(push)) + #define JSON_HEDLEY_DIAGNOSTIC_POP __pragma(warning(pop)) +#elif JSON_HEDLEY_ARM_VERSION_CHECK(5,6,0) + #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("push") + #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("pop") +#elif JSON_HEDLEY_TI_VERSION_CHECK(8,1,0) + #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("diag_push") + #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("diag_pop") +#elif JSON_HEDLEY_PELLES_VERSION_CHECK(2,90,0) + #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("warning(push)") + #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("warning(pop)") +#else + #define JSON_HEDLEY_DIAGNOSTIC_PUSH + #define JSON_HEDLEY_DIAGNOSTIC_POP +#endif + +#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED) + #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED +#endif +#if JSON_HEDLEY_HAS_WARNING("-Wdeprecated-declarations") + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") +#elif JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("warning(disable:1478 1786)") +#elif JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) + #define 
JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1215,1444") +#elif JSON_HEDLEY_GCC_VERSION_CHECK(4,3,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") +#elif JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED __pragma(warning(disable:4996)) +#elif JSON_HEDLEY_TI_VERSION_CHECK(8,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1291,1718") +#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) && !defined(__cplusplus) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("error_messages(off,E_DEPRECATED_ATT,E_DEPRECATED_ATT_MESS)") +#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) && defined(__cplusplus) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("error_messages(off,symdeprecated,symdeprecated2)") +#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress=Pe1444,Pe1215") +#elif JSON_HEDLEY_PELLES_VERSION_CHECK(2,90,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("warn(disable:2241)") +#else + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED +#endif + +#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS) + #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS +#endif +#if JSON_HEDLEY_HAS_WARNING("-Wunknown-pragmas") + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("clang diagnostic ignored \"-Wunknown-pragmas\"") +#elif JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("warning(disable:161)") +#elif JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 1675") +#elif JSON_HEDLEY_GCC_VERSION_CHECK(4,3,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("GCC diagnostic ignored \"-Wunknown-pragmas\"") +#elif JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS __pragma(warning(disable:4068)) +#elif JSON_HEDLEY_TI_VERSION_CHECK(8,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 163") +#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress=Pe161") +#else + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS +#endif + +#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES) + #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES +#endif +#if JSON_HEDLEY_HAS_WARNING("-Wunknown-attributes") + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("clang diagnostic ignored \"-Wunknown-attributes\"") +#elif JSON_HEDLEY_GCC_VERSION_CHECK(4,6,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") +#elif JSON_HEDLEY_INTEL_VERSION_CHECK(17,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("warning(disable:1292)") +#elif JSON_HEDLEY_MSVC_VERSION_CHECK(19,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES __pragma(warning(disable:5030)) +#elif JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1097") +#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,14,0) && defined(__cplusplus) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("error_messages(off,attrskipunsup)") +#elif JSON_HEDLEY_TI_VERSION_CHECK(8,0,0) + #define 
JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1173") +#else + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES +#endif + +#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL) + #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL +#endif +#if JSON_HEDLEY_HAS_WARNING("-Wcast-qual") + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL _Pragma("clang diagnostic ignored \"-Wcast-qual\"") +#elif JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL _Pragma("warning(disable:2203 2331)") +#elif JSON_HEDLEY_GCC_VERSION_CHECK(3,0,0) + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL _Pragma("GCC diagnostic ignored \"-Wcast-qual\"") +#else + #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL +#endif + +#if defined(JSON_HEDLEY_DEPRECATED) + #undef JSON_HEDLEY_DEPRECATED +#endif +#if defined(JSON_HEDLEY_DEPRECATED_FOR) + #undef JSON_HEDLEY_DEPRECATED_FOR +#endif +#if defined(__cplusplus) && (__cplusplus >= 201402L) + #define JSON_HEDLEY_DEPRECATED(since) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[deprecated("Since " #since)]]) + #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[deprecated("Since " #since "; use " #replacement)]]) +#elif \ + JSON_HEDLEY_HAS_EXTENSION(attribute_deprecated_with_message) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,5,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(5,6,0) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) || \ + JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(8,3,0) + #define JSON_HEDLEY_DEPRECATED(since) __attribute__((__deprecated__("Since " #since))) + #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) __attribute__((__deprecated__("Since " #since "; use " #replacement))) +#elif \ + JSON_HEDLEY_HAS_ATTRIBUTE(deprecated) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,1,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(8,0,0) || \ + (JSON_HEDLEY_TI_VERSION_CHECK(7,3,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) + #define JSON_HEDLEY_DEPRECATED(since) __attribute__((__deprecated__)) + #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) __attribute__((__deprecated__)) +#elif JSON_HEDLEY_MSVC_VERSION_CHECK(14,0,0) + #define JSON_HEDLEY_DEPRECATED(since) __declspec(deprecated("Since " # since)) + #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) __declspec(deprecated("Since " #since "; use " #replacement)) +#elif \ + JSON_HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \ + JSON_HEDLEY_PELLES_VERSION_CHECK(6,50,0) + #define JSON_HEDLEY_DEPRECATED(since) __declspec(deprecated) + #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) __declspec(deprecated) +#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) + #define JSON_HEDLEY_DEPRECATED(since) _Pragma("deprecated") + #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) _Pragma("deprecated") +#else + #define JSON_HEDLEY_DEPRECATED(since) + #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) +#endif + +#if defined(JSON_HEDLEY_UNAVAILABLE) + #undef JSON_HEDLEY_UNAVAILABLE +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(warning) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,3,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) + #define JSON_HEDLEY_UNAVAILABLE(available_since) __attribute__((__warning__("Not available until " #available_since))) +#else + #define JSON_HEDLEY_UNAVAILABLE(available_since) +#endif + +#if defined(JSON_HEDLEY_WARN_UNUSED_RESULT) + #undef JSON_HEDLEY_WARN_UNUSED_RESULT +#endif +#if 
defined(__cplusplus) && (__cplusplus >= 201703L) + #define JSON_HEDLEY_WARN_UNUSED_RESULT JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]]) +#elif \ + JSON_HEDLEY_HAS_ATTRIBUTE(warn_unused_result) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,4,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(8,0,0) || \ + (JSON_HEDLEY_TI_VERSION_CHECK(7,3,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + (JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,15,0) && defined(__cplusplus)) || \ + JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) + #define JSON_HEDLEY_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__)) +#elif defined(_Check_return_) /* SAL */ + #define JSON_HEDLEY_WARN_UNUSED_RESULT _Check_return_ +#else + #define JSON_HEDLEY_WARN_UNUSED_RESULT +#endif + +#if defined(JSON_HEDLEY_SENTINEL) + #undef JSON_HEDLEY_SENTINEL +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(sentinel) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,0,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(5,4,0) + #define JSON_HEDLEY_SENTINEL(position) __attribute__((__sentinel__(position))) +#else + #define JSON_HEDLEY_SENTINEL(position) +#endif + +#if defined(JSON_HEDLEY_NO_RETURN) + #undef JSON_HEDLEY_NO_RETURN +#endif +#if JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) + #define JSON_HEDLEY_NO_RETURN __noreturn +#elif JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) + #define JSON_HEDLEY_NO_RETURN __attribute__((__noreturn__)) +#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L + #define JSON_HEDLEY_NO_RETURN _Noreturn +#elif defined(__cplusplus) && (__cplusplus >= 201103L) + #define JSON_HEDLEY_NO_RETURN JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[noreturn]]) +#elif \ + JSON_HEDLEY_HAS_ATTRIBUTE(noreturn) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,2,0) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(18,0,0) || \ + (JSON_HEDLEY_TI_VERSION_CHECK(17,3,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) + #define JSON_HEDLEY_NO_RETURN __attribute__((__noreturn__)) +#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) + #define JSON_HEDLEY_NO_RETURN _Pragma("does_not_return") +#elif JSON_HEDLEY_MSVC_VERSION_CHECK(13,10,0) + #define JSON_HEDLEY_NO_RETURN __declspec(noreturn) +#elif JSON_HEDLEY_TI_VERSION_CHECK(6,0,0) && defined(__cplusplus) + #define JSON_HEDLEY_NO_RETURN _Pragma("FUNC_NEVER_RETURNS;") +#elif JSON_HEDLEY_COMPCERT_VERSION_CHECK(3,2,0) + #define JSON_HEDLEY_NO_RETURN __attribute((noreturn)) +#elif JSON_HEDLEY_PELLES_VERSION_CHECK(9,0,0) + #define JSON_HEDLEY_NO_RETURN __declspec(noreturn) +#else + #define JSON_HEDLEY_NO_RETURN +#endif + +#if defined(JSON_HEDLEY_NO_ESCAPE) + #undef JSON_HEDLEY_NO_ESCAPE +#endif +#if JSON_HEDLEY_HAS_ATTRIBUTE(noescape) + #define JSON_HEDLEY_NO_ESCAPE __attribute__((__noescape__)) +#else + #define JSON_HEDLEY_NO_ESCAPE +#endif + +#if defined(JSON_HEDLEY_UNREACHABLE) + #undef JSON_HEDLEY_UNREACHABLE +#endif +#if defined(JSON_HEDLEY_UNREACHABLE_RETURN) + #undef JSON_HEDLEY_UNREACHABLE_RETURN +#endif +#if \ + (JSON_HEDLEY_HAS_BUILTIN(__builtin_unreachable) && (!defined(JSON_HEDLEY_ARM_VERSION))) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,5,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(13,1,5) + #define JSON_HEDLEY_UNREACHABLE() __builtin_unreachable() +#elif JSON_HEDLEY_MSVC_VERSION_CHECK(13,10,0) + #define JSON_HEDLEY_UNREACHABLE() __assume(0) +#elif JSON_HEDLEY_TI_VERSION_CHECK(6,0,0) + #if 
defined(__cplusplus) + #define JSON_HEDLEY_UNREACHABLE() std::_nassert(0) + #else + #define JSON_HEDLEY_UNREACHABLE() _nassert(0) + #endif + #define JSON_HEDLEY_UNREACHABLE_RETURN(value) return value +#elif defined(EXIT_FAILURE) + #define JSON_HEDLEY_UNREACHABLE() abort() +#else + #define JSON_HEDLEY_UNREACHABLE() + #define JSON_HEDLEY_UNREACHABLE_RETURN(value) return value +#endif +#if !defined(JSON_HEDLEY_UNREACHABLE_RETURN) + #define JSON_HEDLEY_UNREACHABLE_RETURN(value) JSON_HEDLEY_UNREACHABLE() +#endif + +#if defined(JSON_HEDLEY_ASSUME) + #undef JSON_HEDLEY_ASSUME +#endif +#if \ + JSON_HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) + #define JSON_HEDLEY_ASSUME(expr) __assume(expr) +#elif JSON_HEDLEY_HAS_BUILTIN(__builtin_assume) + #define JSON_HEDLEY_ASSUME(expr) __builtin_assume(expr) +#elif JSON_HEDLEY_TI_VERSION_CHECK(6,0,0) + #if defined(__cplusplus) + #define JSON_HEDLEY_ASSUME(expr) std::_nassert(expr) + #else + #define JSON_HEDLEY_ASSUME(expr) _nassert(expr) + #endif +#elif \ + (JSON_HEDLEY_HAS_BUILTIN(__builtin_unreachable) && !defined(JSON_HEDLEY_ARM_VERSION)) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,5,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(13,1,5) + #define JSON_HEDLEY_ASSUME(expr) ((void) ((expr) ? 1 : (__builtin_unreachable(), 1))) +#else + #define JSON_HEDLEY_ASSUME(expr) ((void) (expr)) +#endif + +JSON_HEDLEY_DIAGNOSTIC_PUSH +#if JSON_HEDLEY_HAS_WARNING("-Wpedantic") + #pragma clang diagnostic ignored "-Wpedantic" +#endif +#if JSON_HEDLEY_HAS_WARNING("-Wc++98-compat-pedantic") && defined(__cplusplus) + #pragma clang diagnostic ignored "-Wc++98-compat-pedantic" +#endif +#if JSON_HEDLEY_GCC_HAS_WARNING("-Wvariadic-macros",4,0,0) + #if defined(__clang__) + #pragma clang diagnostic ignored "-Wvariadic-macros" + #elif defined(JSON_HEDLEY_GCC_VERSION) + #pragma GCC diagnostic ignored "-Wvariadic-macros" + #endif +#endif +#if defined(JSON_HEDLEY_NON_NULL) + #undef JSON_HEDLEY_NON_NULL +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(nonnull) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,3,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) + #define JSON_HEDLEY_NON_NULL(...) __attribute__((__nonnull__(__VA_ARGS__))) +#else + #define JSON_HEDLEY_NON_NULL(...) 
+#endif +JSON_HEDLEY_DIAGNOSTIC_POP + +#if defined(JSON_HEDLEY_PRINTF_FORMAT) + #undef JSON_HEDLEY_PRINTF_FORMAT +#endif +#if defined(__MINGW32__) && JSON_HEDLEY_GCC_HAS_ATTRIBUTE(format,4,4,0) && !defined(__USE_MINGW_ANSI_STDIO) + #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(ms_printf, string_idx, first_to_check))) +#elif defined(__MINGW32__) && JSON_HEDLEY_GCC_HAS_ATTRIBUTE(format,4,4,0) && defined(__USE_MINGW_ANSI_STDIO) + #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(gnu_printf, string_idx, first_to_check))) +#elif \ + JSON_HEDLEY_HAS_ATTRIBUTE(format) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,1,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(5,6,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(8,0,0) || \ + (JSON_HEDLEY_TI_VERSION_CHECK(7,3,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) + #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(__printf__, string_idx, first_to_check))) +#elif JSON_HEDLEY_PELLES_VERSION_CHECK(6,0,0) + #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __declspec(vaformat(printf,string_idx,first_to_check)) +#else + #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) +#endif + +#if defined(JSON_HEDLEY_CONSTEXPR) + #undef JSON_HEDLEY_CONSTEXPR +#endif +#if defined(__cplusplus) + #if __cplusplus >= 201103L + #define JSON_HEDLEY_CONSTEXPR JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(constexpr) + #endif +#endif +#if !defined(JSON_HEDLEY_CONSTEXPR) + #define JSON_HEDLEY_CONSTEXPR +#endif + +#if defined(JSON_HEDLEY_PREDICT) + #undef JSON_HEDLEY_PREDICT +#endif +#if defined(JSON_HEDLEY_LIKELY) + #undef JSON_HEDLEY_LIKELY +#endif +#if defined(JSON_HEDLEY_UNLIKELY) + #undef JSON_HEDLEY_UNLIKELY +#endif +#if defined(JSON_HEDLEY_UNPREDICTABLE) + #undef JSON_HEDLEY_UNPREDICTABLE +#endif +#if JSON_HEDLEY_HAS_BUILTIN(__builtin_unpredictable) + #define JSON_HEDLEY_UNPREDICTABLE(expr) __builtin_unpredictable(!!(expr)) +#endif +#if \ + JSON_HEDLEY_HAS_BUILTIN(__builtin_expect_with_probability) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(9,0,0) +# define JSON_HEDLEY_PREDICT(expr, value, probability) __builtin_expect_with_probability(expr, value, probability) +# define JSON_HEDLEY_PREDICT_TRUE(expr, probability) __builtin_expect_with_probability(!!(expr), 1, probability) +# define JSON_HEDLEY_PREDICT_FALSE(expr, probability) __builtin_expect_with_probability(!!(expr), 0, probability) +# define JSON_HEDLEY_LIKELY(expr) __builtin_expect(!!(expr), 1) +# define JSON_HEDLEY_UNLIKELY(expr) __builtin_expect(!!(expr), 0) +#if !defined(JSON_HEDLEY_BUILTIN_UNPREDICTABLE) + #define JSON_HEDLEY_BUILTIN_UNPREDICTABLE(expr) __builtin_expect_with_probability(!!(expr), 1, 0.5) +#endif +#elif \ + JSON_HEDLEY_HAS_BUILTIN(__builtin_expect) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,0,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + (JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,15,0) && defined(__cplusplus)) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(6,1,0) || \ + JSON_HEDLEY_TINYC_VERSION_CHECK(0,9,27) +# define JSON_HEDLEY_PREDICT(expr, expected, probability) \ + (((probability) >= 0.9) ? 
__builtin_expect(!!(expr), (expected)) : (((void) (expected)), !!(expr))) +# define JSON_HEDLEY_PREDICT_TRUE(expr, probability) \ + (__extension__ ({ \ + JSON_HEDLEY_CONSTEXPR double hedley_probability_ = (probability); \ + ((hedley_probability_ >= 0.9) ? __builtin_expect(!!(expr), 1) : ((hedley_probability_ <= 0.1) ? __builtin_expect(!!(expr), 0) : !!(expr))); \ + })) +# define JSON_HEDLEY_PREDICT_FALSE(expr, probability) \ + (__extension__ ({ \ + JSON_HEDLEY_CONSTEXPR double hedley_probability_ = (probability); \ + ((hedley_probability_ >= 0.9) ? __builtin_expect(!!(expr), 0) : ((hedley_probability_ <= 0.1) ? __builtin_expect(!!(expr), 1) : !!(expr))); \ + })) +# define JSON_HEDLEY_LIKELY(expr) __builtin_expect(!!(expr), 1) +# define JSON_HEDLEY_UNLIKELY(expr) __builtin_expect(!!(expr), 0) +#else +# define JSON_HEDLEY_PREDICT(expr, expected, probability) (((void) (expected)), !!(expr)) +# define JSON_HEDLEY_PREDICT_TRUE(expr, probability) (!!(expr)) +# define JSON_HEDLEY_PREDICT_FALSE(expr, probability) (!!(expr)) +# define JSON_HEDLEY_LIKELY(expr) (!!(expr)) +# define JSON_HEDLEY_UNLIKELY(expr) (!!(expr)) +#endif +#if !defined(JSON_HEDLEY_UNPREDICTABLE) + #define JSON_HEDLEY_UNPREDICTABLE(expr) JSON_HEDLEY_PREDICT(expr, 1, 0.5) +#endif + +#if defined(JSON_HEDLEY_MALLOC) + #undef JSON_HEDLEY_MALLOC +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(malloc) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,1,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(12,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(8,0,0) || \ + (JSON_HEDLEY_TI_VERSION_CHECK(7,3,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) + #define JSON_HEDLEY_MALLOC __attribute__((__malloc__)) +#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) + #define JSON_HEDLEY_MALLOC _Pragma("returns_new_memory") +#elif JSON_HEDLEY_MSVC_VERSION_CHECK(14, 0, 0) + #define JSON_HEDLEY_MALLOC __declspec(restrict) +#else + #define JSON_HEDLEY_MALLOC +#endif + +#if defined(JSON_HEDLEY_PURE) + #undef JSON_HEDLEY_PURE +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(pure) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(2,96,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(8,0,0) || \ + (JSON_HEDLEY_TI_VERSION_CHECK(7,3,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) + #define JSON_HEDLEY_PURE __attribute__((__pure__)) +#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) + #define JSON_HEDLEY_PURE _Pragma("does_not_write_global_data") +#elif JSON_HEDLEY_TI_VERSION_CHECK(6,0,0) && defined(__cplusplus) + #define JSON_HEDLEY_PURE _Pragma("FUNC_IS_PURE;") +#else + #define JSON_HEDLEY_PURE +#endif + +#if defined(JSON_HEDLEY_CONST) + #undef JSON_HEDLEY_CONST +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(const) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(2,5,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(8,0,0) || \ + (JSON_HEDLEY_TI_VERSION_CHECK(7,3,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) + #define JSON_HEDLEY_CONST __attribute__((__const__)) +#elif \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) + #define JSON_HEDLEY_CONST _Pragma("no_side_effect") 
+#else + #define JSON_HEDLEY_CONST JSON_HEDLEY_PURE +#endif + +#if defined(JSON_HEDLEY_RESTRICT) + #undef JSON_HEDLEY_RESTRICT +#endif +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && !defined(__cplusplus) + #define JSON_HEDLEY_RESTRICT restrict +#elif \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,1,0) || \ + JSON_HEDLEY_MSVC_VERSION_CHECK(14,0,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(8,0,0) || \ + (JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,14,0) && defined(__cplusplus)) || \ + JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) || \ + defined(__clang__) + #define JSON_HEDLEY_RESTRICT __restrict +#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,3,0) && !defined(__cplusplus) + #define JSON_HEDLEY_RESTRICT _Restrict +#else + #define JSON_HEDLEY_RESTRICT +#endif + +#if defined(JSON_HEDLEY_INLINE) + #undef JSON_HEDLEY_INLINE +#endif +#if \ + (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) || \ + (defined(__cplusplus) && (__cplusplus >= 199711L)) + #define JSON_HEDLEY_INLINE inline +#elif \ + defined(JSON_HEDLEY_GCC_VERSION) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(6,2,0) + #define JSON_HEDLEY_INLINE __inline__ +#elif \ + JSON_HEDLEY_MSVC_VERSION_CHECK(12,0,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(8,0,0) + #define JSON_HEDLEY_INLINE __inline +#else + #define JSON_HEDLEY_INLINE +#endif + +#if defined(JSON_HEDLEY_ALWAYS_INLINE) + #undef JSON_HEDLEY_ALWAYS_INLINE +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(always_inline) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,0,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(8,0,0) || \ + (JSON_HEDLEY_TI_VERSION_CHECK(7,3,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) + #define JSON_HEDLEY_ALWAYS_INLINE __attribute__((__always_inline__)) JSON_HEDLEY_INLINE +#elif JSON_HEDLEY_MSVC_VERSION_CHECK(12,0,0) + #define JSON_HEDLEY_ALWAYS_INLINE __forceinline +#elif JSON_HEDLEY_TI_VERSION_CHECK(7,0,0) && defined(__cplusplus) + #define JSON_HEDLEY_ALWAYS_INLINE _Pragma("FUNC_ALWAYS_INLINE;") +#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) + #define JSON_HEDLEY_ALWAYS_INLINE _Pragma("inline=forced") +#else + #define JSON_HEDLEY_ALWAYS_INLINE JSON_HEDLEY_INLINE +#endif + +#if defined(JSON_HEDLEY_NEVER_INLINE) + #undef JSON_HEDLEY_NEVER_INLINE +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(noinline) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,0,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(8,0,0) || \ + (JSON_HEDLEY_TI_VERSION_CHECK(7,3,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) + #define JSON_HEDLEY_NEVER_INLINE __attribute__((__noinline__)) +#elif JSON_HEDLEY_MSVC_VERSION_CHECK(13,10,0) + #define JSON_HEDLEY_NEVER_INLINE __declspec(noinline) +#elif JSON_HEDLEY_PGI_VERSION_CHECK(10,2,0) + #define JSON_HEDLEY_NEVER_INLINE _Pragma("noinline") +#elif JSON_HEDLEY_TI_VERSION_CHECK(6,0,0) && defined(__cplusplus) + #define JSON_HEDLEY_NEVER_INLINE _Pragma("FUNC_CANNOT_INLINE;") +#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) + #define JSON_HEDLEY_NEVER_INLINE _Pragma("inline=never") +#elif JSON_HEDLEY_COMPCERT_VERSION_CHECK(3,2,0) + #define 
JSON_HEDLEY_NEVER_INLINE __attribute((noinline)) +#elif JSON_HEDLEY_PELLES_VERSION_CHECK(9,0,0) + #define JSON_HEDLEY_NEVER_INLINE __declspec(noinline) +#else + #define JSON_HEDLEY_NEVER_INLINE +#endif + +#if defined(JSON_HEDLEY_PRIVATE) + #undef JSON_HEDLEY_PRIVATE +#endif +#if defined(JSON_HEDLEY_PUBLIC) + #undef JSON_HEDLEY_PUBLIC +#endif +#if defined(JSON_HEDLEY_IMPORT) + #undef JSON_HEDLEY_IMPORT +#endif +#if defined(_WIN32) || defined(__CYGWIN__) + #define JSON_HEDLEY_PRIVATE + #define JSON_HEDLEY_PUBLIC __declspec(dllexport) + #define JSON_HEDLEY_IMPORT __declspec(dllimport) +#else + #if \ + JSON_HEDLEY_HAS_ATTRIBUTE(visibility) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,3,0) || \ + JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(13,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(8,0,0) || \ + (JSON_HEDLEY_TI_VERSION_CHECK(7,3,0) && defined(__TI_EABI__) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) + #define JSON_HEDLEY_PRIVATE __attribute__((__visibility__("hidden"))) + #define JSON_HEDLEY_PUBLIC __attribute__((__visibility__("default"))) + #else + #define JSON_HEDLEY_PRIVATE + #define JSON_HEDLEY_PUBLIC + #endif + #define JSON_HEDLEY_IMPORT extern +#endif + +#if defined(JSON_HEDLEY_NO_THROW) + #undef JSON_HEDLEY_NO_THROW +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(nothrow) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,3,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) + #define JSON_HEDLEY_NO_THROW __attribute__((__nothrow__)) +#elif \ + JSON_HEDLEY_MSVC_VERSION_CHECK(13,1,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) + #define JSON_HEDLEY_NO_THROW __declspec(nothrow) +#else + #define JSON_HEDLEY_NO_THROW +#endif + +#if defined(JSON_HEDLEY_FALL_THROUGH) + #undef JSON_HEDLEY_FALL_THROUGH +#endif +#if JSON_HEDLEY_GNUC_HAS_ATTRIBUTE(fallthrough,7,0,0) && !defined(JSON_HEDLEY_PGI_VERSION) + #define JSON_HEDLEY_FALL_THROUGH __attribute__((__fallthrough__)) +#elif JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS(clang,fallthrough) + #define JSON_HEDLEY_FALL_THROUGH JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[clang::fallthrough]]) +#elif JSON_HEDLEY_HAS_CPP_ATTRIBUTE(fallthrough) + #define JSON_HEDLEY_FALL_THROUGH JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[fallthrough]]) +#elif defined(__fallthrough) /* SAL */ + #define JSON_HEDLEY_FALL_THROUGH __fallthrough +#else + #define JSON_HEDLEY_FALL_THROUGH +#endif + +#if defined(JSON_HEDLEY_RETURNS_NON_NULL) + #undef JSON_HEDLEY_RETURNS_NON_NULL +#endif +#if \ + JSON_HEDLEY_HAS_ATTRIBUTE(returns_nonnull) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,9,0) + #define JSON_HEDLEY_RETURNS_NON_NULL __attribute__((__returns_nonnull__)) +#elif defined(_Ret_notnull_) /* SAL */ + #define JSON_HEDLEY_RETURNS_NON_NULL _Ret_notnull_ +#else + #define JSON_HEDLEY_RETURNS_NON_NULL +#endif + +#if defined(JSON_HEDLEY_ARRAY_PARAM) + #undef JSON_HEDLEY_ARRAY_PARAM +#endif +#if \ + defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && \ + !defined(__STDC_NO_VLA__) && \ + !defined(__cplusplus) && \ + !defined(JSON_HEDLEY_PGI_VERSION) && \ + !defined(JSON_HEDLEY_TINYC_VERSION) + #define JSON_HEDLEY_ARRAY_PARAM(name) (name) +#else + #define JSON_HEDLEY_ARRAY_PARAM(name) +#endif + +#if defined(JSON_HEDLEY_IS_CONSTANT) + #undef JSON_HEDLEY_IS_CONSTANT +#endif +#if defined(JSON_HEDLEY_REQUIRE_CONSTEXPR) + #undef JSON_HEDLEY_REQUIRE_CONSTEXPR +#endif +/* JSON_HEDLEY_IS_CONSTEXPR_ is for + HEDLEY INTERNAL USE ONLY. API subject to change without notice. 
*/ +#if defined(JSON_HEDLEY_IS_CONSTEXPR_) + #undef JSON_HEDLEY_IS_CONSTEXPR_ +#endif +#if \ + JSON_HEDLEY_HAS_BUILTIN(__builtin_constant_p) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,4,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_TINYC_VERSION_CHECK(0,9,19) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(13,1,0) || \ + JSON_HEDLEY_TI_VERSION_CHECK(6,1,0) || \ + (JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) && !defined(__cplusplus)) || \ + JSON_HEDLEY_CRAY_VERSION_CHECK(8,1,0) + #define JSON_HEDLEY_IS_CONSTANT(expr) __builtin_constant_p(expr) +#endif +#if !defined(__cplusplus) +# if \ + JSON_HEDLEY_HAS_BUILTIN(__builtin_types_compatible_p) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(3,4,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(13,1,0) || \ + JSON_HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(5,4,0) || \ + JSON_HEDLEY_TINYC_VERSION_CHECK(0,9,24) +#if defined(__INTPTR_TYPE__) + #define JSON_HEDLEY_IS_CONSTEXPR_(expr) __builtin_types_compatible_p(__typeof__((1 ? (void*) ((__INTPTR_TYPE__) ((expr) * 0)) : (int*) 0)), int*) +#else + #include <stdint.h> + #define JSON_HEDLEY_IS_CONSTEXPR_(expr) __builtin_types_compatible_p(__typeof__((1 ? (void*) ((intptr_t) ((expr) * 0)) : (int*) 0)), int*) +#endif +# elif \ + (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && !defined(JSON_HEDLEY_SUNPRO_VERSION) && !defined(JSON_HEDLEY_PGI_VERSION)) || \ + JSON_HEDLEY_HAS_EXTENSION(c_generic_selections) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,9,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(17,0,0) || \ + JSON_HEDLEY_IBM_VERSION_CHECK(12,1,0) || \ + JSON_HEDLEY_ARM_VERSION_CHECK(5,3,0) +#if defined(__INTPTR_TYPE__) + #define JSON_HEDLEY_IS_CONSTEXPR_(expr) _Generic((1 ? (void*) ((__INTPTR_TYPE__) ((expr) * 0)) : (int*) 0), int*: 1, void*: 0) +#else + #include <stdint.h> + #define JSON_HEDLEY_IS_CONSTEXPR_(expr) _Generic((1 ? (void*) ((intptr_t) ((expr) * 0)) : (int*) 0), int*: 1, void*: 0) +#endif +# elif \ + defined(JSON_HEDLEY_GCC_VERSION) || \ + defined(JSON_HEDLEY_INTEL_VERSION) || \ + defined(JSON_HEDLEY_TINYC_VERSION) || \ + defined(JSON_HEDLEY_TI_VERSION) || \ + defined(__clang__) +# define JSON_HEDLEY_IS_CONSTEXPR_(expr) ( \ + sizeof(void) != \ + sizeof(*( \ + 1 ? \ + ((void*) ((expr) * 0L) ) : \ +((struct { char v[sizeof(void) * 2]; } *) 1) \ + ) \ + ) \ + ) +# endif +#endif +#if defined(JSON_HEDLEY_IS_CONSTEXPR_) + #if !defined(JSON_HEDLEY_IS_CONSTANT) + #define JSON_HEDLEY_IS_CONSTANT(expr) JSON_HEDLEY_IS_CONSTEXPR_(expr) + #endif + #define JSON_HEDLEY_REQUIRE_CONSTEXPR(expr) (JSON_HEDLEY_IS_CONSTEXPR_(expr) ?
(expr) : (-1)) +#else + #if !defined(JSON_HEDLEY_IS_CONSTANT) + #define JSON_HEDLEY_IS_CONSTANT(expr) (0) + #endif + #define JSON_HEDLEY_REQUIRE_CONSTEXPR(expr) (expr) +#endif + +#if defined(JSON_HEDLEY_BEGIN_C_DECLS) + #undef JSON_HEDLEY_BEGIN_C_DECLS +#endif +#if defined(JSON_HEDLEY_END_C_DECLS) + #undef JSON_HEDLEY_END_C_DECLS +#endif +#if defined(JSON_HEDLEY_C_DECL) + #undef JSON_HEDLEY_C_DECL +#endif +#if defined(__cplusplus) + #define JSON_HEDLEY_BEGIN_C_DECLS extern "C" { + #define JSON_HEDLEY_END_C_DECLS } + #define JSON_HEDLEY_C_DECL extern "C" +#else + #define JSON_HEDLEY_BEGIN_C_DECLS + #define JSON_HEDLEY_END_C_DECLS + #define JSON_HEDLEY_C_DECL +#endif + +#if defined(JSON_HEDLEY_STATIC_ASSERT) + #undef JSON_HEDLEY_STATIC_ASSERT +#endif +#if \ + !defined(__cplusplus) && ( \ + (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || \ + JSON_HEDLEY_HAS_FEATURE(c_static_assert) || \ + JSON_HEDLEY_GCC_VERSION_CHECK(6,0,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + defined(_Static_assert) \ + ) +# define JSON_HEDLEY_STATIC_ASSERT(expr, message) _Static_assert(expr, message) +#elif \ + (defined(__cplusplus) && (__cplusplus >= 201103L)) || \ + JSON_HEDLEY_MSVC_VERSION_CHECK(16,0,0) || \ + (defined(__cplusplus) && JSON_HEDLEY_TI_VERSION_CHECK(8,3,0)) +# define JSON_HEDLEY_STATIC_ASSERT(expr, message) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(static_assert(expr, message)) +#else +# define JSON_HEDLEY_STATIC_ASSERT(expr, message) +#endif + +#if defined(JSON_HEDLEY_CONST_CAST) + #undef JSON_HEDLEY_CONST_CAST +#endif +#if defined(__cplusplus) +# define JSON_HEDLEY_CONST_CAST(T, expr) (const_cast<T>(expr)) +#elif \ + JSON_HEDLEY_HAS_WARNING("-Wcast-qual") || \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,6,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) +# define JSON_HEDLEY_CONST_CAST(T, expr) (__extension__ ({ \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL \ + ((T) (expr)); \ + JSON_HEDLEY_DIAGNOSTIC_POP \ + })) +#else +# define JSON_HEDLEY_CONST_CAST(T, expr) ((T) (expr)) +#endif + +#if defined(JSON_HEDLEY_REINTERPRET_CAST) + #undef JSON_HEDLEY_REINTERPRET_CAST +#endif +#if defined(__cplusplus) + #define JSON_HEDLEY_REINTERPRET_CAST(T, expr) (reinterpret_cast<T>(expr)) +#else + #define JSON_HEDLEY_REINTERPRET_CAST(T, expr) (*((T*) &(expr))) +#endif + +#if defined(JSON_HEDLEY_STATIC_CAST) + #undef JSON_HEDLEY_STATIC_CAST +#endif +#if defined(__cplusplus) + #define JSON_HEDLEY_STATIC_CAST(T, expr) (static_cast<T>(expr)) +#else + #define JSON_HEDLEY_STATIC_CAST(T, expr) ((T) (expr)) +#endif + +#if defined(JSON_HEDLEY_CPP_CAST) + #undef JSON_HEDLEY_CPP_CAST +#endif +#if defined(__cplusplus) + #define JSON_HEDLEY_CPP_CAST(T, expr) static_cast<T>(expr) +#else + #define JSON_HEDLEY_CPP_CAST(T, expr) (expr) +#endif + +#if defined(JSON_HEDLEY_NULL) + #undef JSON_HEDLEY_NULL +#endif +#if defined(__cplusplus) + #if __cplusplus >= 201103L + #define JSON_HEDLEY_NULL JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(nullptr) + #elif defined(NULL) + #define JSON_HEDLEY_NULL NULL + #else + #define JSON_HEDLEY_NULL JSON_HEDLEY_STATIC_CAST(void*, 0) + #endif +#elif defined(NULL) + #define JSON_HEDLEY_NULL NULL +#else + #define JSON_HEDLEY_NULL ((void*) 0) +#endif + +#if defined(JSON_HEDLEY_MESSAGE) + #undef JSON_HEDLEY_MESSAGE +#endif +#if JSON_HEDLEY_HAS_WARNING("-Wunknown-pragmas") +# define JSON_HEDLEY_MESSAGE(msg) \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \ + JSON_HEDLEY_PRAGMA(message msg) \ + JSON_HEDLEY_DIAGNOSTIC_POP
+#elif \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,4,0) || \ + JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) +# define JSON_HEDLEY_MESSAGE(msg) JSON_HEDLEY_PRAGMA(message msg) +#elif JSON_HEDLEY_CRAY_VERSION_CHECK(5,0,0) +# define JSON_HEDLEY_MESSAGE(msg) JSON_HEDLEY_PRAGMA(_CRI message msg) +#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) +# define JSON_HEDLEY_MESSAGE(msg) JSON_HEDLEY_PRAGMA(message(msg)) +#elif JSON_HEDLEY_PELLES_VERSION_CHECK(2,0,0) +# define JSON_HEDLEY_MESSAGE(msg) JSON_HEDLEY_PRAGMA(message(msg)) +#else +# define JSON_HEDLEY_MESSAGE(msg) +#endif + +#if defined(JSON_HEDLEY_WARNING) + #undef JSON_HEDLEY_WARNING +#endif +#if JSON_HEDLEY_HAS_WARNING("-Wunknown-pragmas") +# define JSON_HEDLEY_WARNING(msg) \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \ + JSON_HEDLEY_PRAGMA(clang warning msg) \ + JSON_HEDLEY_DIAGNOSTIC_POP +#elif \ + JSON_HEDLEY_GCC_VERSION_CHECK(4,8,0) || \ + JSON_HEDLEY_PGI_VERSION_CHECK(18,4,0) +# define JSON_HEDLEY_WARNING(msg) JSON_HEDLEY_PRAGMA(GCC warning msg) +#elif JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0) +# define JSON_HEDLEY_WARNING(msg) JSON_HEDLEY_PRAGMA(message(msg)) +#else +# define JSON_HEDLEY_WARNING(msg) JSON_HEDLEY_MESSAGE(msg) +#endif + +#if defined(JSON_HEDLEY_REQUIRE) + #undef JSON_HEDLEY_REQUIRE +#endif +#if defined(JSON_HEDLEY_REQUIRE_MSG) + #undef JSON_HEDLEY_REQUIRE_MSG +#endif +#if JSON_HEDLEY_HAS_ATTRIBUTE(diagnose_if) +# if JSON_HEDLEY_HAS_WARNING("-Wgcc-compat") +# define JSON_HEDLEY_REQUIRE(expr) \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wgcc-compat\"") \ + __attribute__((diagnose_if(!(expr), #expr, "error"))) \ + JSON_HEDLEY_DIAGNOSTIC_POP +# define JSON_HEDLEY_REQUIRE_MSG(expr,msg) \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wgcc-compat\"") \ + __attribute__((diagnose_if(!(expr), msg, "error"))) \ + JSON_HEDLEY_DIAGNOSTIC_POP +# else +# define JSON_HEDLEY_REQUIRE(expr) __attribute__((diagnose_if(!(expr), #expr, "error"))) +# define JSON_HEDLEY_REQUIRE_MSG(expr,msg) __attribute__((diagnose_if(!(expr), msg, "error"))) +# endif +#else +# define JSON_HEDLEY_REQUIRE(expr) +# define JSON_HEDLEY_REQUIRE_MSG(expr,msg) +#endif + +#if defined(JSON_HEDLEY_FLAGS) + #undef JSON_HEDLEY_FLAGS +#endif +#if JSON_HEDLEY_HAS_ATTRIBUTE(flag_enum) + #define JSON_HEDLEY_FLAGS __attribute__((__flag_enum__)) +#endif + +#if defined(JSON_HEDLEY_FLAGS_CAST) + #undef JSON_HEDLEY_FLAGS_CAST +#endif +#if JSON_HEDLEY_INTEL_VERSION_CHECK(19,0,0) +# define JSON_HEDLEY_FLAGS_CAST(T, expr) (__extension__ ({ \ + JSON_HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("warning(disable:188)") \ + ((T) (expr)); \ + JSON_HEDLEY_DIAGNOSTIC_POP \ + })) +#else +# define JSON_HEDLEY_FLAGS_CAST(T, expr) JSON_HEDLEY_STATIC_CAST(T, expr) +#endif + +#if defined(JSON_HEDLEY_EMPTY_BASES) + #undef JSON_HEDLEY_EMPTY_BASES +#endif +#if JSON_HEDLEY_MSVC_VERSION_CHECK(19,0,23918) && !JSON_HEDLEY_MSVC_VERSION_CHECK(20,0,0) + #define JSON_HEDLEY_EMPTY_BASES __declspec(empty_bases) +#else + #define JSON_HEDLEY_EMPTY_BASES +#endif + +/* Remaining macros are deprecated. 
*/ + +#if defined(JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK) + #undef JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK +#endif +#if defined(__clang__) + #define JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK(major,minor,patch) (0) +#else + #define JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK(major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) +#endif + +#if defined(JSON_HEDLEY_CLANG_HAS_ATTRIBUTE) + #undef JSON_HEDLEY_CLANG_HAS_ATTRIBUTE +#endif +#define JSON_HEDLEY_CLANG_HAS_ATTRIBUTE(attribute) JSON_HEDLEY_HAS_ATTRIBUTE(attribute) + +#if defined(JSON_HEDLEY_CLANG_HAS_CPP_ATTRIBUTE) + #undef JSON_HEDLEY_CLANG_HAS_CPP_ATTRIBUTE +#endif +#define JSON_HEDLEY_CLANG_HAS_CPP_ATTRIBUTE(attribute) JSON_HEDLEY_HAS_CPP_ATTRIBUTE(attribute) + +#if defined(JSON_HEDLEY_CLANG_HAS_BUILTIN) + #undef JSON_HEDLEY_CLANG_HAS_BUILTIN +#endif +#define JSON_HEDLEY_CLANG_HAS_BUILTIN(builtin) JSON_HEDLEY_HAS_BUILTIN(builtin) + +#if defined(JSON_HEDLEY_CLANG_HAS_FEATURE) + #undef JSON_HEDLEY_CLANG_HAS_FEATURE +#endif +#define JSON_HEDLEY_CLANG_HAS_FEATURE(feature) JSON_HEDLEY_HAS_FEATURE(feature) + +#if defined(JSON_HEDLEY_CLANG_HAS_EXTENSION) + #undef JSON_HEDLEY_CLANG_HAS_EXTENSION +#endif +#define JSON_HEDLEY_CLANG_HAS_EXTENSION(extension) JSON_HEDLEY_HAS_EXTENSION(extension) + +#if defined(JSON_HEDLEY_CLANG_HAS_DECLSPEC_DECLSPEC_ATTRIBUTE) + #undef JSON_HEDLEY_CLANG_HAS_DECLSPEC_DECLSPEC_ATTRIBUTE +#endif +#define JSON_HEDLEY_CLANG_HAS_DECLSPEC_ATTRIBUTE(attribute) JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute) + +#if defined(JSON_HEDLEY_CLANG_HAS_WARNING) + #undef JSON_HEDLEY_CLANG_HAS_WARNING +#endif +#define JSON_HEDLEY_CLANG_HAS_WARNING(warning) JSON_HEDLEY_HAS_WARNING(warning) + +#endif /* !defined(JSON_HEDLEY_VERSION) || (JSON_HEDLEY_VERSION < X) */ + + +// This file contains all internal macro definitions +// You MUST include macro_unscope.hpp at the end of json.hpp to undef all of them + +// exclude unsupported compilers +#if !defined(JSON_SKIP_UNSUPPORTED_COMPILER_CHECK) + #if defined(__clang__) + #if (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__) < 30400 + #error "unsupported Clang version - see https://github.com/nlohmann/json#supported-compilers" + #endif + #elif defined(__GNUC__) && !(defined(__ICC) || defined(__INTEL_COMPILER)) + #if (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) < 40800 + #error "unsupported GCC version - see https://github.com/nlohmann/json#supported-compilers" + #endif + #endif +#endif + +// C++ language standard detection +#if (defined(__cplusplus) && __cplusplus >= 201703L) || (defined(_HAS_CXX17) && _HAS_CXX17 == 1) // fix for issue #464 + #define JSON_HAS_CPP_17 + #define JSON_HAS_CPP_14 +#elif (defined(__cplusplus) && __cplusplus >= 201402L) || (defined(_HAS_CXX14) && _HAS_CXX14 == 1) + #define JSON_HAS_CPP_14 +#endif + +// disable float-equal warnings on GCC/clang +#if defined(__clang__) || defined(__GNUC__) || defined(__GNUG__) + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wfloat-equal" +#endif + +// disable documentation warnings on clang +#if defined(__clang__) + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wdocumentation" +#endif + +// allow to disable exceptions +#if (defined(__cpp_exceptions) || defined(__EXCEPTIONS) || defined(_CPPUNWIND)) && !defined(JSON_NOEXCEPTION) + #define JSON_THROW(exception) throw exception + #define JSON_TRY try + #define JSON_CATCH(exception) catch(exception) + #define JSON_INTERNAL_CATCH(exception) catch(exception) +#else + #include <cstdlib> + #define JSON_THROW(exception)
std::abort() + #define JSON_TRY if(true) + #define JSON_CATCH(exception) if(false) + #define JSON_INTERNAL_CATCH(exception) if(false) +#endif + +// override exception macros +#if defined(JSON_THROW_USER) + #undef JSON_THROW + #define JSON_THROW JSON_THROW_USER +#endif +#if defined(JSON_TRY_USER) + #undef JSON_TRY + #define JSON_TRY JSON_TRY_USER +#endif +#if defined(JSON_CATCH_USER) + #undef JSON_CATCH + #define JSON_CATCH JSON_CATCH_USER + #undef JSON_INTERNAL_CATCH + #define JSON_INTERNAL_CATCH JSON_CATCH_USER +#endif +#if defined(JSON_INTERNAL_CATCH_USER) + #undef JSON_INTERNAL_CATCH + #define JSON_INTERNAL_CATCH JSON_INTERNAL_CATCH_USER +#endif + +/*! +@brief macro to briefly define a mapping between an enum and JSON +@def NLOHMANN_JSON_SERIALIZE_ENUM +@since version 3.4.0 +*/ +#define NLOHMANN_JSON_SERIALIZE_ENUM(ENUM_TYPE, ...) \ + template<typename BasicJsonType> \ + inline void to_json(BasicJsonType& j, const ENUM_TYPE& e) \ + { \ + static_assert(std::is_enum<ENUM_TYPE>::value, #ENUM_TYPE " must be an enum!"); \ + static const std::pair<ENUM_TYPE, BasicJsonType> m[] = __VA_ARGS__; \ + auto it = std::find_if(std::begin(m), std::end(m), \ + [e](const std::pair<ENUM_TYPE, BasicJsonType>& ej_pair) -> bool \ + { \ + return ej_pair.first == e; \ + }); \ + j = ((it != std::end(m)) ? it : std::begin(m))->second; \ + } \ + template<typename BasicJsonType> \ + inline void from_json(const BasicJsonType& j, ENUM_TYPE& e) \ + { \ + static_assert(std::is_enum<ENUM_TYPE>::value, #ENUM_TYPE " must be an enum!"); \ + static const std::pair<ENUM_TYPE, BasicJsonType> m[] = __VA_ARGS__; \ + auto it = std::find_if(std::begin(m), std::end(m), \ + [&j](const std::pair<ENUM_TYPE, BasicJsonType>& ej_pair) -> bool \ + { \ + return ej_pair.second == j; \ + }); \ + e = ((it != std::end(m)) ? it : std::begin(m))->first; \ + } + +// Ugly macros to avoid uglier copy-paste when specializing basic_json. They +// may be removed in the future once the class is split. + +#define NLOHMANN_BASIC_JSON_TPL_DECLARATION \ + template<template<typename, typename, typename...> class ObjectType, \ + template<typename, typename...> class ArrayType, \ + class StringType, class BooleanType, class NumberIntegerType, \ + class NumberUnsignedType, class NumberFloatType, \ + template<typename> class AllocatorType, \ + template<typename, typename = void> class JSONSerializer> + +#define NLOHMANN_BASIC_JSON_TPL \ + basic_json<ObjectType, ArrayType, StringType, BooleanType, \ + NumberIntegerType, NumberUnsignedType, NumberFloatType, \ + AllocatorType, JSONSerializer> + + +namespace nlohmann +{ +namespace detail +{ +//////////////// +// exceptions // +//////////////// + +/*! +@brief general exception of the @ref basic_json class + +This class is an extension of `std::exception` objects with a member @a id for +exception ids. It is used as the base class for all exceptions thrown by the +@ref basic_json class. This class can hence be used as "wildcard" to catch +exceptions. + +Subclasses: +- @ref parse_error for exceptions indicating a parse error +- @ref invalid_iterator for exceptions indicating errors with iterators +- @ref type_error for exceptions indicating executing a member function with + a wrong type +- @ref out_of_range for exceptions indicating access out of the defined range +- @ref other_error for exceptions indicating other library errors + +@internal +@note To have nothrow-copy-constructible exceptions, we internally use + `std::runtime_error` which can cope with arbitrary-length error messages. + Intermediate strings are built with static functions and then passed to + the actual constructor.
+@endinternal + +@liveexample{The following code shows how arbitrary library exceptions can be +caught.,exception} + +@since version 3.0.0 +*/ +class exception : public std::exception +{ + public: + /// returns the explanatory string + JSON_HEDLEY_RETURNS_NON_NULL + const char* what() const noexcept override + { + return m.what(); + } + + /// the id of the exception + const int id; + + protected: + JSON_HEDLEY_NON_NULL(3) + exception(int id_, const char* what_arg) : id(id_), m(what_arg) {} + + static std::string name(const std::string& ename, int id_) + { + return "[json.exception." + ename + "." + std::to_string(id_) + "] "; + } + + private: + /// an exception object as storage for error messages + std::runtime_error m; +}; + +/*! +@brief exception indicating a parse error + +This exception is thrown by the library when a parse error occurs. Parse errors +can occur during the deserialization of JSON text, CBOR, MessagePack, as well +as when using JSON Patch. + +Member @a byte holds the byte index of the last read character in the input +file. + +Exceptions have ids 1xx. + +name / id | example message | description +------------------------------ | --------------- | ------------------------- +json.exception.parse_error.101 | parse error at 2: unexpected end of input; expected string literal | This error indicates a syntax error while deserializing a JSON text. The error message describes that an unexpected token (character) was encountered, and the member @a byte indicates the error position. +json.exception.parse_error.102 | parse error at 14: missing or wrong low surrogate | JSON uses the `\uxxxx` format to describe Unicode characters. Code points above 0xFFFF are split into two `\uxxxx` entries ("surrogate pairs"). This error indicates that the surrogate pair is incomplete or contains an invalid code point. +json.exception.parse_error.103 | parse error: code points above 0x10FFFF are invalid | Unicode supports code points up to 0x10FFFF. Code points above 0x10FFFF are invalid. +json.exception.parse_error.104 | parse error: JSON patch must be an array of objects | [RFC 6902](https://tools.ietf.org/html/rfc6902) requires a JSON Patch document to be a JSON document that represents an array of objects. +json.exception.parse_error.105 | parse error: operation must have string member 'op' | An operation of a JSON Patch document must contain exactly one "op" member, whose value indicates the operation to perform. Its value must be one of "add", "remove", "replace", "move", "copy", or "test"; other values are errors. +json.exception.parse_error.106 | parse error: array index '01' must not begin with '0' | An array index in a JSON Pointer ([RFC 6901](https://tools.ietf.org/html/rfc6901)) may be `0` or any number without a leading `0`. +json.exception.parse_error.107 | parse error: JSON pointer must be empty or begin with '/' - was: 'foo' | A JSON Pointer must be a Unicode string containing a sequence of zero or more reference tokens, each prefixed by a `/` character. +json.exception.parse_error.108 | parse error: escape character '~' must be followed with '0' or '1' | In a JSON Pointer, only `~0` and `~1` are valid escape sequences. +json.exception.parse_error.109 | parse error: array index 'one' is not a number | A JSON Pointer array index must be a number. +json.exception.parse_error.110 | parse error at 1: cannot read 2 bytes from vector | When parsing CBOR or MessagePack, the byte vector ends before the complete value has been read.
+json.exception.parse_error.112 | parse error at 1: error reading CBOR; last byte: 0xF8 | Not all types of CBOR or MessagePack are supported. This exception occurs if an unsupported byte was read. +json.exception.parse_error.113 | parse error at 2: expected a CBOR string; last byte: 0x98 | While parsing a map key, a value that is not a string has been read. +json.exception.parse_error.114 | parse error: Unsupported BSON record type 0x0F | The parsing of the corresponding BSON record type is not implemented (yet). + +@note For an input with n bytes, 1 is the index of the first character and n+1 + is the index of the terminating null byte or the end of file. This also + holds true when reading a byte vector (CBOR or MessagePack). + +@liveexample{The following code shows how a `parse_error` exception can be +caught.,parse_error} + +@sa - @ref exception for the base class of the library exceptions +@sa - @ref invalid_iterator for exceptions indicating errors with iterators +@sa - @ref type_error for exceptions indicating executing a member function with + a wrong type +@sa - @ref out_of_range for exceptions indicating access out of the defined range +@sa - @ref other_error for exceptions indicating other library errors + +@since version 3.0.0 +*/ +class parse_error : public exception +{ + public: + /*! + @brief create a parse error exception + @param[in] id_ the id of the exception + @param[in] pos the position where the error occurred (or with + chars_read_total=0 if the position cannot be + determined) + @param[in] what_arg the explanatory string + @return parse_error object + */ + static parse_error create(int id_, const position_t& pos, const std::string& what_arg) + { + std::string w = exception::name("parse_error", id_) + "parse error" + + position_string(pos) + ": " + what_arg; + return parse_error(id_, pos.chars_read_total, w.c_str()); + } + + static parse_error create(int id_, std::size_t byte_, const std::string& what_arg) + { + std::string w = exception::name("parse_error", id_) + "parse error" + + (byte_ != 0 ? (" at byte " + std::to_string(byte_)) : "") + + ": " + what_arg; + return parse_error(id_, byte_, w.c_str()); + } + + /*! + @brief byte index of the parse error + + The byte index of the last read character in the input file. + + @note For an input with n bytes, 1 is the index of the first character and + n+1 is the index of the terminating null byte or the end of file. + This also holds true when reading a byte vector (CBOR or MessagePack). + */ + const std::size_t byte; + + private: + parse_error(int id_, std::size_t byte_, const char* what_arg) + : exception(id_, what_arg), byte(byte_) {} + + static std::string position_string(const position_t& pos) + { + return " at line " + std::to_string(pos.lines_read + 1) + + ", column " + std::to_string(pos.chars_read_current_line); + } +}; + +/*! +@brief exception indicating errors with iterators + +This exception is thrown if iterators passed to a library function do not match +the expected semantics. + +Exceptions have ids 2xx. + +name / id | example message | description +----------------------------------- | --------------- | ------------------------- +json.exception.invalid_iterator.201 | iterators are not compatible | The iterators passed to constructor @ref basic_json(InputIT first, InputIT last) are not compatible, meaning they do not belong to the same container. Therefore, the range (@a first, @a last) is invalid. 
+json.exception.invalid_iterator.202 | iterator does not fit current value | In an erase or insert function, the passed iterator @a pos does not belong to the JSON value for which the function was called. It hence does not define a valid position for the deletion/insertion. +json.exception.invalid_iterator.203 | iterators do not fit current value | Either iterator passed to function @ref erase(IteratorType first, IteratorType last) does not belong to the JSON value from which values shall be erased. It hence does not define a valid range to delete values from. +json.exception.invalid_iterator.204 | iterators out of range | When an iterator range for a primitive type (number, boolean, or string) is passed to a constructor or an erase function, this range has to be exactly (@ref begin(), @ref end()), because this is the only way the single stored value is expressed. All other ranges are invalid. +json.exception.invalid_iterator.205 | iterator out of range | When an iterator for a primitive type (number, boolean, or string) is passed to an erase function, the iterator has to be the @ref begin() iterator, because it is the only way to address the stored value. All other iterators are invalid. +json.exception.invalid_iterator.206 | cannot construct with iterators from null | The iterators passed to constructor @ref basic_json(InputIT first, InputIT last) belong to a JSON null value and hence do not define a valid range. +json.exception.invalid_iterator.207 | cannot use key() for non-object iterators | The key() member function can only be used on iterators belonging to a JSON object, because other types do not have a concept of a key. +json.exception.invalid_iterator.208 | cannot use operator[] for object iterators | The operator[] to specify a concrete offset cannot be used on iterators belonging to a JSON object, because JSON objects are unordered. +json.exception.invalid_iterator.209 | cannot use offsets with object iterators | The offset operators (+, -, +=, -=) cannot be used on iterators belonging to a JSON object, because JSON objects are unordered. +json.exception.invalid_iterator.210 | iterators do not fit | The iterator range passed to the insert function is not compatible, meaning the iterators do not belong to the same container. Therefore, the range (@a first, @a last) is invalid. +json.exception.invalid_iterator.211 | passed iterators may not belong to container | The iterator range passed to the insert function must not be a subrange of the container to insert to. +json.exception.invalid_iterator.212 | cannot compare iterators of different containers | When two iterators are compared, they must belong to the same container. +json.exception.invalid_iterator.213 | cannot compare order of object iterators | The order of object iterators cannot be compared, because JSON objects are unordered. +json.exception.invalid_iterator.214 | cannot get value | Cannot get value for iterator: Either the iterator belongs to a null value or it is an iterator to a primitive type (number, boolean, or string), but the iterator is different to @ref begin().
+ +@liveexample{The following code shows how an `invalid_iterator` exception can be +caught.,invalid_iterator} + +@sa - @ref exception for the base class of the library exceptions +@sa - @ref parse_error for exceptions indicating a parse error +@sa - @ref type_error for exceptions indicating executing a member function with + a wrong type +@sa - @ref out_of_range for exceptions indicating access out of the defined range +@sa - @ref other_error for exceptions indicating other library errors + +@since version 3.0.0 +*/ +class invalid_iterator : public exception +{ + public: + static invalid_iterator create(int id_, const std::string& what_arg) + { + std::string w = exception::name("invalid_iterator", id_) + what_arg; + return invalid_iterator(id_, w.c_str()); + } + + private: + JSON_HEDLEY_NON_NULL(3) + invalid_iterator(int id_, const char* what_arg) + : exception(id_, what_arg) {} +}; + +/*! +@brief exception indicating executing a member function with a wrong type + +This exception is thrown in case of a type error; that is, a library function is +executed on a JSON value whose type does not match the expected semantics. + +Exceptions have ids 3xx. + +name / id | example message | description +----------------------------- | --------------- | ------------------------- +json.exception.type_error.301 | cannot create object from initializer list | To create an object from an initializer list, the initializer list must consist only of a list of pairs whose first element is a string. When this constraint is violated, an array is created instead. +json.exception.type_error.302 | type must be object, but is array | During implicit or explicit value conversion, the JSON type must be compatible to the target type. For instance, a JSON string can only be converted into string types, but not into numbers or boolean types. +json.exception.type_error.303 | incompatible ReferenceType for get_ref, actual type is object | To retrieve a reference to a value stored in a @ref basic_json object with @ref get_ref, the type of the reference must match the value type. For instance, for a JSON array, the @a ReferenceType must be @ref array_t &. +json.exception.type_error.304 | cannot use at() with string | The @ref at() member functions can only be executed for certain JSON types. +json.exception.type_error.305 | cannot use operator[] with string | The @ref operator[] member functions can only be executed for certain JSON types. +json.exception.type_error.306 | cannot use value() with string | The @ref value() member functions can only be executed for certain JSON types. +json.exception.type_error.307 | cannot use erase() with string | The @ref erase() member functions can only be executed for certain JSON types. +json.exception.type_error.308 | cannot use push_back() with string | The @ref push_back() and @ref operator+= member functions can only be executed for certain JSON types. +json.exception.type_error.309 | cannot use insert() with | The @ref insert() member functions can only be executed for certain JSON types. +json.exception.type_error.310 | cannot use swap() with number | The @ref swap() member functions can only be executed for certain JSON types. +json.exception.type_error.311 | cannot use emplace_back() with string | The @ref emplace_back() member function can only be executed for certain JSON types. +json.exception.type_error.312 | cannot use update() with string | The @ref update() member functions can only be executed for certain JSON types. 
+json.exception.type_error.313 | invalid value to unflatten | The @ref unflatten function converts an object whose keys are JSON Pointers back into an arbitrary nested JSON value. The JSON Pointers must not overlap, because then the resulting value would not be well defined. +json.exception.type_error.314 | only objects can be unflattened | The @ref unflatten function only works for an object whose keys are JSON Pointers. +json.exception.type_error.315 | values in object must be primitive | The @ref unflatten function only works for an object whose keys are JSON Pointers and whose values are primitive. +json.exception.type_error.316 | invalid UTF-8 byte at index 10: 0x7E | The @ref dump function only works with UTF-8 encoded strings; that is, if you assign a `std::string` to a JSON value, make sure it is UTF-8 encoded. | +json.exception.type_error.317 | JSON value cannot be serialized to requested format | The dynamic type of the object cannot be represented in the requested serialization format (e.g. a raw `true` or `null` JSON object cannot be serialized to BSON) | + +@liveexample{The following code shows how a `type_error` exception can be +caught.,type_error} + +@sa - @ref exception for the base class of the library exceptions +@sa - @ref parse_error for exceptions indicating a parse error +@sa - @ref invalid_iterator for exceptions indicating errors with iterators +@sa - @ref out_of_range for exceptions indicating access out of the defined range +@sa - @ref other_error for exceptions indicating other library errors + +@since version 3.0.0 +*/ +class type_error : public exception +{ + public: + static type_error create(int id_, const std::string& what_arg) + { + std::string w = exception::name("type_error", id_) + what_arg; + return type_error(id_, w.c_str()); + } + + private: + JSON_HEDLEY_NON_NULL(3) + type_error(int id_, const char* what_arg) : exception(id_, what_arg) {} +}; + +/*! +@brief exception indicating access out of the defined range + +This exception is thrown in case a library function is called on an input +parameter that exceeds the expected range, for instance in case of array +indices or nonexisting object keys. + +Exceptions have ids 4xx. + +name / id | example message | description +------------------------------- | --------------- | ------------------------- +json.exception.out_of_range.401 | array index 3 is out of range | The provided array index @a i is larger than @a size-1. +json.exception.out_of_range.402 | array index '-' (3) is out of range | The special array index `-` in a JSON Pointer never describes a valid element of the array, but the index past the end. That is, it can only be used to add elements at this position, but not to read it. +json.exception.out_of_range.403 | key 'foo' not found | The provided key was not found in the JSON object. +json.exception.out_of_range.404 | unresolved reference token 'foo' | A reference token in a JSON Pointer could not be resolved. +json.exception.out_of_range.405 | JSON pointer has no parent | The JSON Patch operations 'remove' and 'add' cannot be applied to the root element of the JSON value. +json.exception.out_of_range.406 | number overflow parsing '10E1000' | A parsed number could not be stored without changing it to NaN or INF. +json.exception.out_of_range.407 | number overflow serializing '9223372036854775808' | UBJSON and BSON only support integer numbers up to 9223372036854775807.
| +json.exception.out_of_range.408 | excessive array size: 8658170730974374167 | The size (following `#`) of an UBJSON array or object exceeds the maximal capacity. | +json.exception.out_of_range.409 | BSON key cannot contain code point U+0000 (at byte 2) | Key identifiers to be serialized to BSON cannot contain code point U+0000, since the key is stored as a zero-terminated C string | + +@liveexample{The following code shows how an `out_of_range` exception can be +caught.,out_of_range} + +@sa - @ref exception for the base class of the library exceptions +@sa - @ref parse_error for exceptions indicating a parse error +@sa - @ref invalid_iterator for exceptions indicating errors with iterators +@sa - @ref type_error for exceptions indicating executing a member function with + a wrong type +@sa - @ref other_error for exceptions indicating other library errors + +@since version 3.0.0 +*/ +class out_of_range : public exception +{ + public: + static out_of_range create(int id_, const std::string& what_arg) + { + std::string w = exception::name("out_of_range", id_) + what_arg; + return out_of_range(id_, w.c_str()); + } + + private: + JSON_HEDLEY_NON_NULL(3) + out_of_range(int id_, const char* what_arg) : exception(id_, what_arg) {} +}; + +/*! +@brief exception indicating other library errors + +This exception is thrown in case of errors that cannot be classified with the +other exception types. + +Exceptions have ids 5xx. + +name / id | example message | description +------------------------------ | --------------- | ------------------------- +json.exception.other_error.501 | unsuccessful: {"op":"test","path":"/baz", "value":"bar"} | A JSON Patch operation 'test' failed. The unsuccessful operation is also printed. + +@sa - @ref exception for the base class of the library exceptions +@sa - @ref parse_error for exceptions indicating a parse error +@sa - @ref invalid_iterator for exceptions indicating errors with iterators +@sa - @ref type_error for exceptions indicating executing a member function with + a wrong type +@sa - @ref out_of_range for exceptions indicating access out of the defined range + +@liveexample{The following code shows how an `other_error` exception can be +caught.,other_error} + +@since version 3.0.0 +*/ +class other_error : public exception +{ + public: + static other_error create(int id_, const std::string& what_arg) + { + std::string w = exception::name("other_error", id_) + what_arg; + return other_error(id_, w.c_str()); + } + + private: + JSON_HEDLEY_NON_NULL(3) + other_error(int id_, const char* what_arg) : exception(id_, what_arg) {} +}; +} // namespace detail +} // namespace nlohmann + +// #include + +// #include + + +#include <ciso646> // not +#include <cstddef> // size_t +#include <type_traits> // conditional, enable_if, false_type, integral_constant, is_constructible, is_integral, is_same, remove_cv, remove_reference, true_type + +namespace nlohmann +{ +namespace detail +{ +// alias templates to reduce boilerplate +template<bool B, typename T = void> +using enable_if_t = typename std::enable_if<B, T>::type; + +template<typename T> +using uncvref_t = typename std::remove_cv<typename std::remove_reference<T>::type>::type; + +// implementation of C++14 index_sequence and affiliates +// source: https://stackoverflow.com/a/32223343 +template<std::size_t... Ints> +struct index_sequence +{ + using type = index_sequence; + using value_type = std::size_t; + static constexpr std::size_t size() noexcept + { + return sizeof...(Ints); + } +}; + +template<class Sequence1, class Sequence2> +struct merge_and_renumber; + +template<std::size_t... I1, std::size_t... I2> +struct merge_and_renumber<index_sequence<I1...>, index_sequence<I2...>> + : index_sequence < I1..., (sizeof...(I1) + I2)...
> {}; + +template<std::size_t N> +struct make_index_sequence + : merge_and_renumber < typename make_index_sequence < N / 2 >::type, + typename make_index_sequence < N - N / 2 >::type > {}; + +template<> struct make_index_sequence<0> : index_sequence<> {}; +template<> struct make_index_sequence<1> : index_sequence<0> {}; + +template<typename... Ts> +using index_sequence_for = make_index_sequence<sizeof...(Ts)>; + +// dispatch utility (taken from ranges-v3) +template<unsigned N> struct priority_tag : priority_tag < N - 1 > {}; +template<> struct priority_tag<0> {}; + +// taken from ranges-v3 +template<typename T> +struct static_const +{ + static constexpr T value{}; +}; + +template<typename T> +constexpr T static_const<T>::value; +} // namespace detail +} // namespace nlohmann + +// #include + + +#include <ciso646> // not +#include <limits> // numeric_limits +#include <type_traits> // false_type, is_constructible, is_integral, is_same, true_type +#include <utility> // declval + +// #include + + +#include <iterator> // random_access_iterator_tag + +// #include + + +namespace nlohmann +{ +namespace detail +{ +template<typename ...Ts> struct make_void +{ + using type = void; +}; +template<typename ...Ts> using void_t = typename make_void<Ts...>::type; +} // namespace detail +} // namespace nlohmann + +// #include + + +namespace nlohmann +{ +namespace detail +{ +template<typename It, typename = void> +struct iterator_types {}; + +template<typename It> +struct iterator_types < + It, + void_t<typename It::difference_type, typename It::value_type, typename It::pointer, + typename It::reference, typename It::iterator_category >> +{ + using difference_type = typename It::difference_type; + using value_type = typename It::value_type; + using pointer = typename It::pointer; + using reference = typename It::reference; + using iterator_category = typename It::iterator_category; +}; + +// This is required as some compilers implement std::iterator_traits in a way that +// doesn't work with SFINAE. See https://github.com/nlohmann/json/issues/1341. +template<typename T, typename = void> +struct iterator_traits +{ +}; + +template<typename T> +struct iterator_traits < T, enable_if_t < !std::is_pointer<T>::value >> + : iterator_types<T> +{ +}; + +template<typename T> +struct iterator_traits<T*, enable_if_t<std::is_object<T>::value>> +{ + using iterator_category = std::random_access_iterator_tag; + using value_type = T; + using difference_type = ptrdiff_t; + using pointer = T*; + using reference = T&; +}; +} // namespace detail +} // namespace nlohmann + +// #include + +// #include + +// #include + + +#include <type_traits> + +// #include + + +// http://en.cppreference.com/w/cpp/experimental/is_detected +namespace nlohmann +{ +namespace detail +{ +struct nonesuch +{ + nonesuch() = delete; + ~nonesuch() = delete; + nonesuch(nonesuch const&) = delete; + nonesuch(nonesuch const&&) = delete; + void operator=(nonesuch const&) = delete; + void operator=(nonesuch&&) = delete; +}; + +template<class Default, + class AlwaysVoid, + template<class...> class Op, + class... Args> +struct detector +{ + using value_t = std::false_type; + using type = Default; +}; + +template<class Default, template<class...> class Op, class... Args> +struct detector<Default, void_t<Op<Args...>>, Op, Args...> +{ + using value_t = std::true_type; + using type = Op<Args...>; +}; + +template