scripts/autogen

#!/usr/bin/env python3
# Copyright (c) 2024-2025 The mlkem-native project authors
# SPDX-License-Identifier: Apache-2.0

import subprocess
import tempfile
import platform
import argparse
import shutil
import pathlib
import sys
import os

# Modulus that every zeta/twiddle computation in this script reduces by.
modulus = 3329
# Root of unity whose powers (mod `modulus`) form the zeta/twiddle tables.
root_of_unity = 17
# 2^16 mod `modulus`; multiplying by it converts a value to Montgomery form.
montgomery_factor = pow(2, 16, modulus)

# This script regenerates the auto-generated source files in mlkem-native.
#
# It currently covers:
# - zeta values for the reference NTT and invNTT
# - lookup tables used for fast rejection sampling
# - source files for monolithic single-CU build
# - simplified assembly sources
# - header guards
# - #undef's for CU-local macros


def gen_header():
    """Yield, line by line, the banner placed at the top of generated files.

    The banner consists of the license comment followed by a warning
    comment stating that the file is auto-generated; each comment is
    followed by one empty line.
    """
    banner = (
        "/*",
        " * Copyright (c) 2024-2025 The mlkem-native project authors",
        " * SPDX-License-Identifier: Apache-2.0",
        " */",
        "",
        "/*",
        " * WARNING: This file is auto-generated from scripts/autogen",
        " *          Do not modify it directly.",
        " */",
        "",
    )
    yield from banner


def update_file(filename, content, dry_run=False):
    """Write `content` to `filename`, or check that the file is up to date.

    Content destined for `.c`, `.h` and `.i` files is first piped through
    `clang-format`.

    Args:
        filename: Path of the generated file.
        content: Full text the file is supposed to contain.
        dry_run: If False, (over)write `filename`. Otherwise only compare
            the existing file against `content`; on a mismatch the new
            content is written to `<filename>.new`, a diff is shown and the
            process exits.

    Exits the process with status 1 if formatting fails, or (in dry-run
    mode) if the file is missing or differs from `content`.
    """

    # Format contents of .[chi] files
    if filename.endswith((".c", ".h", ".i")):
        hint = "Are you running in a nix shell? See BUILDING.md."
        try:
            # No shell is needed to run a single executable; passing an
            # argument list together with shell=True is a misuse of the API.
            p = subprocess.run(
                ["clang-format"], capture_output=True, input=content, text=True
            )
        except FileNotFoundError:
            print(
                f"Failed to auto-format autogenerated code (clang-format not found). {hint}",
                file=sys.stderr,
            )
            sys.exit(1)
        if p.returncode != 0:
            print(p.stderr, file=sys.stderr)
            print(
                f"Failed to auto-format autogenerated code (clang-format return code {p.returncode}). {hint}",
                file=sys.stderr,
            )
            sys.exit(1)
        content = p.stdout

    if dry_run is False:
        with open(filename, "w") as f:
            f.write(content)
        return

    if not os.path.exists(filename):
        print(f"Autogenerated file {filename} does not exist", file=sys.stderr)
        sys.exit(1)
    with open(filename, "r") as f:
        current_content = f.read()
    if current_content == content:
        return

    filename_new = f"{filename}.new"
    print(
        f"Autogenerated file {filename} needs updating. Have you called scripts/autogen?",
        file=sys.stderr,
    )
    print(f"Writing new version to {filename_new}", file=sys.stderr)
    with open(filename_new, "w") as f:
        f.write(content)
    # Show the difference to the user; the outcome of `diff` itself is not
    # inspected since we fail unconditionally below.
    subprocess.run(["diff", filename, filename_new])
    sys.exit(1)


def bitreverse(i, n):
    """Return the integer formed by reversing the lowest `n` bits of `i`.

    Bit `k` of `i` (counting from the least significant bit) ends up at
    position `n - 1 - k` of the result; bits of `i` at position `n` and
    above are ignored.
    """
    return sum(((i >> pos) & 1) << (n - 1 - pos) for pos in range(n))


def signed_reduce(a):
    """Return the signed canonical representative of `a` modulo `modulus`.

    The value is first reduced into [0, modulus); results at or above
    modulus / 2 are then shifted down by `modulus`.
    """
    reduced = a % modulus
    return reduced - modulus if reduced >= modulus / 2 else reduced


def gen_c_zetas():
    """Yield the 128 zeta values used by the reference NTT and invNTT.

    Each value is a power of the root of unity, converted to Montgomery
    form and reduced to its signed representative. Values are produced in
    bit-reversed order of the exponent, matching the layout of the table
    in the C source.
    """
    # Powers of the chosen root of unity (17), in Montgomery form,
    # indexed by exponent.
    montgomery_powers = [
        signed_reduce(pow(root_of_unity, exponent, modulus) * montgomery_factor)
        for exponent in range(128)
    ]

    # Emit in bit-reversed order.
    for position in range(128):
        yield montgomery_powers[bitreverse(position, 7)]


def gen_c_zeta_file(dry_run=False):
    """Generate (or, in dry-run mode, check) `mlkem/zetas.c`.

    The file holds the zeta table produced by `gen_c_zetas()`, wrapped in
    the MLK_MULTILEVEL_BUILD_NO_SHARED guard.
    """
    lines = list(gen_header())
    lines += [
        '#include "common.h"',
        "#if !defined(MLK_MULTILEVEL_BUILD_NO_SHARED)",
        '#include "poly.h"',
        "",
        "/*",
        " * Table of zeta values used in the reference NTT and inverse NTT.",
        " * See autogen for details.",
        " */",
        "MLK_ALIGN const int16_t zetas[128] = {",
    ]
    # One table entry per line, each followed by a comma
    lines += [f"{zeta}," for zeta in gen_c_zetas()]
    lines += [
        "};",
        "",
        "#else /* MLK_MULTILEVEL_BUILD_NO_SHARED */",
        "",
        "MLK_EMPTY_CU(zetas)",
        "",
        "#endif /* MLK_MULTILEVEL_BUILD_NO_SHARED */",
        "",
    ]

    update_file("mlkem/zetas.c", "\n".join(lines), dry_run=dry_run)


def prepare_root_for_barrett(root):
    """Prepare a constant for Barrett multiplication.

    Returns a pair consisting of (a) the signed canonical form of `root`
    and (b) the twisted constant used in the high-mul part of the Barrett
    multiplication.
    """

    def nearest_even(value):
        nearest = round(value)
        if nearest % 2 != 0:
            # The rounded value is odd: step to the adjacent even integer
            # lying on the same side as `value`, so the result stays
            # at most 1 away from `value` in absolute value.
            nearest += 1 if nearest <= value else -1
        return nearest

    # Signed canonical reduction
    canonical = signed_reduce(root)
    twisted = nearest_even((canonical * 2**16) / modulus) // 2
    return canonical, twisted


def gen_aarch64_root_of_unity_for_block(layer, block, inv=False):
    """Return the (root, twisted root) pair for `block` of NTT `layer`.

    The exponent of the root of unity is the 7-bit bit-reversal of
    2**layer + block, negated when `inv` is True. The resulting power is
    prepared for Barrett multiplication.
    """
    # We are computing a negacyclic NTT; the twiddles needed here are
    # the second half of the twiddles for a cyclic NTT of twice the size.
    exponent = bitreverse(2**layer + block, 7)
    if inv is True:
        exponent = -exponent
    return prepare_root_for_barrett(pow(root_of_unity, exponent, modulus))


def gen_aarch64_fwd_ntt_zetas_layer01234():
    """Yield the AArch64 forward-NTT twiddle table entries for layers 0-4.

    Every twiddle contributes two values (root, twisted root). The
    layer 0-2 group and each of the eight layer 3/4 groups are followed
    by two zero entries of padding.
    """
    twiddle = gen_aarch64_root_of_unity_for_block
    padding = (0, 0)

    # Layers 0,1,2 are merged: all blocks of layer 0, then 1, then 2
    for layer in range(3):
        for block in range(2**layer):
            yield from twiddle(layer, block)
    yield from padding

    # Layers 3,4,5,6 are merged, but the roots for 3,4 are emitted
    # in separate arrays than those for 5,6
    for block in range(8):  # There are 8 blocks in Layer 3
        for layer, sub_block in ((3, block), (4, 2 * block), (4, 2 * block + 1)):
            yield from twiddle(layer, sub_block)
        yield from padding


def gen_aarch64_fwd_ntt_zetas_layer56():
    """Yield the AArch64 forward-NTT twiddle table entries for layers 5 and 6.

    For each of 8 blocks, three groups of four twiddles are emitted:
    layer 5, the even-numbered layer 6 blocks and the odd-numbered layer 6
    blocks. Within a group, all roots come first and all twisted roots
    second, with every value emitted twice in a row.
    """
    # Layers 3,4,5,6 are merged, but the roots for 3,4 are emitted
    # in separate arrays than those for 5,6.
    #
    # Each group is (layer, blocks per outer block, offsets); the ordering
    # of offsets is adjusted to suit the transposed internal presentation
    # of the data.
    groups = (
        (5, 4, (0, 1, 2, 3)),
        (6, 8, (0, 2, 4, 6)),
        (6, 8, (1, 3, 5, 7)),
    )

    for block in range(8):
        for layer, stride, offsets in groups:
            pairs = [
                gen_aarch64_root_of_unity_for_block(layer, stride * block + offset)
                for offset in offsets
            ]
            # component 0: root, component 1: twisted root
            for component in range(2):
                for pair in pairs:
                    yield pair[component]
                    yield pair[component]


def gen_aarch64_inv_ntt_zetas_layer01234():
    """Yield the AArch64 inverse-NTT twiddle table entries for layers 0-4.

    The eight layer 3/4 groups come first, followed by the layer 0-2
    group. Every twiddle contributes two values (root, twisted root), and
    every group is followed by two zero entries of padding.
    """

    def inverse_twiddle(layer, block):
        return gen_aarch64_root_of_unity_for_block(layer, block, inv=True)

    padding = (0, 0)

    # Layers 3,4,5,6 are merged, but the roots for 3,4 are emitted
    # in separate arrays than those for 5,6
    for block in range(8):  # There are 8 blocks in Layer 3
        for layer, sub_block in ((3, block), (4, 2 * block), (4, 2 * block + 1)):
            yield from inverse_twiddle(layer, sub_block)
        yield from padding

    # Layers 0,1,2 are merged: all blocks of layer 0, then 1, then 2
    for layer in range(3):
        for block in range(2**layer):
            yield from inverse_twiddle(layer, block)
    yield from padding


def gen_aarch64_inv_ntt_zetas_layer56():
    """Yield the AArch64 inverse-NTT twiddle table entries for layers 5 and 6.

    For each of 8 blocks, three groups of four inverse twiddles are
    emitted: layer 5, the even-numbered layer 6 blocks and the
    odd-numbered layer 6 blocks. Within a group, all roots come first and
    all twisted roots second, with every value emitted twice in a row.
    """
    # Layers 3,4,5,6 are merged, but the roots for 3,4 are emitted
    # in separate arrays than those for 5,6.
    #
    # Each group is (layer, blocks per outer block, offsets); the ordering
    # of offsets is adjusted to suit the transposed internal presentation
    # of the data.
    groups = (
        (5, 4, (0, 1, 2, 3)),
        (6, 8, (0, 2, 4, 6)),
        (6, 8, (1, 3, 5, 7)),
    )

    for block in range(8):
        for layer, stride, offsets in groups:
            pairs = [
                gen_aarch64_root_of_unity_for_block(
                    layer, stride * block + offset, inv=True
                )
                for offset in offsets
            ]
            # component 0: root, component 1: twisted root
            for component in range(2):
                for pair in pairs:
                    yield pair[component]
                    yield pair[component]


def gen_aarch64_mulcache_twiddles():
    """Yield the signed canonical mulcache twiddles for AArch64.

    For each of the 64 indices, the twiddle and then its negation are
    emitted, each in signed canonical form.
    """
    for idx in range(64):
        zeta = pow(root_of_unity, bitreverse(64 + idx, 7), modulus)
        for signed_zeta in (zeta, -zeta):
            canonical, _ = prepare_root_for_barrett(signed_zeta)
            yield canonical


def gen_aarch64_mulcache_twiddles_twisted():
    """Yield the twisted (Barrett high-mul) mulcache twiddles for AArch64.

    For each of the 64 indices, the twisted constant of the twiddle and
    then that of its negation are emitted.
    """
    for idx in range(64):
        zeta = pow(root_of_unity, bitreverse(64 + idx, 7), modulus)
        for signed_zeta in (zeta, -zeta):
            _, twisted = prepare_root_for_barrett(signed_zeta)
            yield twisted


def gen_aarch64_fwd_ntt_zeta_file(dry_run=False):
    """Generate (or, in dry-run mode, check) the AArch64 zeta table sources.

    The same set of tables is emitted into `aarch64_zetas.c` of both the
    `opt` and the `clean` AArch64 backend under `dev/`; the two files
    differ only in the backend macro guarding their contents.
    """
    # (C array name, generator of its entries), in file order
    tables = (
        ("aarch64_ntt_zetas_layer01234", gen_aarch64_fwd_ntt_zetas_layer01234),
        ("aarch64_ntt_zetas_layer56", gen_aarch64_fwd_ntt_zetas_layer56),
        ("aarch64_invntt_zetas_layer01234", gen_aarch64_inv_ntt_zetas_layer01234),
        ("aarch64_invntt_zetas_layer56", gen_aarch64_inv_ntt_zetas_layer56),
        ("aarch64_zetas_mulcache_native", gen_aarch64_mulcache_twiddles),
        (
            "aarch64_zetas_mulcache_twisted_native",
            gen_aarch64_mulcache_twiddles_twisted,
        ),
    )

    def gen(suffix):
        guard = f"defined(MLK_ARITH_BACKEND_AARCH64_{suffix.upper()})"

        yield from gen_header()
        yield '#include "../../../common.h"'
        yield ""
        yield f"#if {guard} && \\"
        yield "     !defined(MLK_MULTILEVEL_BUILD_NO_SHARED)"
        yield ""
        yield "#include <stdint.h>"
        yield '#include "arith_native_aarch64.h"'
        yield ""
        yield "/*"
        yield " * Table of zeta values used in the AArch64 forward NTT"
        yield " * See autogen for details."
        yield " */"
        for array_name, entries in tables:
            yield f"MLK_ALIGN const int16_t {array_name}[] = {{"
            for value in entries():
                yield f"{value},"
            yield "};"
            yield ""
        yield f"#else /* {guard}"
        yield "          && !defined(MLK_MULTILEVEL_BUILD_NO_SHARED) */"
        yield ""
        yield "MLK_EMPTY_CU(aarch64_zetas)"
        yield ""
        yield ""
        yield f"#endif /* {guard}"
        yield "          && !defined(MLK_MULTILEVEL_BUILD_NO_SHARED) */"
        yield ""

    for suffix in ("opt", "clean"):
        update_file(
            f"dev/aarch64_{suffix}/src/aarch64_zetas.c",
            "\n".join(gen(suffix)),
            dry_run=dry_run,
        )


def gen_aarch64_rej_uniform_table_rows():
    """Yield the 256 rows of the AArch64 rejection-sampling lookup table.

    Each row is a string of 16 comma-separated numbers followed by a C
    comment naming the row index.
    """
    # The index into the lookup table is an 8-bit bitmap, i.e. a number 0..255.
    # Conceptually, the table entry at index i is a vector of 8 16-bit values, of
    # which only the first popcount(i) are set; those are the indices of the set-bits
    # in i. Concretely, each 16-bit index is stored as two consecutive 8-bit indices.
    for bitmap in range(256):
        entries = []
        for bit in range(8):
            if (bitmap >> bit) & 1:
                # Two consecutive byte indices per set bit
                entries += [2 * bit, 2 * bit + 1]
        # Pad by -1
        entries += [-1] * (16 - len(entries))
        yield ",".join(str(entry) for entry in entries) + f" /* {bitmap} */"


def gen_aarch64_rej_uniform_table(dry_run=False):
    """Generate (or, in dry-run mode, check) the AArch64 rejection-sampling table.

    The table is emitted into `rej_uniform_table.c` of both the `opt` and
    the `clean` AArch64 backend under `dev/`; the two files differ only in
    the backend macro guarding their contents.
    """

    def render(suffix):
        backend = f"MLK_ARITH_BACKEND_AARCH64_{suffix.upper()}"
        lines = list(gen_header())
        lines += [
            '#include "../../../common.h"',
            "",
            f"#if defined({backend}) && \\",
            "     !defined(MLK_MULTILEVEL_BUILD_NO_SHARED)",
            "",
            "#include <stdint.h>",
            '#include "arith_native_aarch64.h"',
            "",
            "/*",
            " * Lookup table used by rejection sampling of the public matrix.",
            " * See autogen for details.",
            " */",
            "MLK_ALIGN const uint8_t rej_uniform_table[] = {",
        ]
        # One table row per line, each followed by a comma
        lines += [f"{row}," for row in gen_aarch64_rej_uniform_table_rows()]
        lines += [
            "};",
            "",
            f"#else /* defined({backend})",
            "          && !defined(MLK_MULTILEVEL_BUILD_NO_SHARED) */",
            "",
            "MLK_EMPTY_CU(aarch64_rej_uniform_table)",
            "",
            f"#endif /* defined({backend})",
            "          && !defined(MLK_MULTILEVEL_BUILD_NO_SHARED) */",
            "",
        ]
        return "\n".join(lines)

    for suffix in ("opt", "clean"):
        update_file(
            f"dev/aarch64_{suffix}/src/rej_uniform_table.c",
            render(suffix),
            dry_run=dry_run,
        )


def gen_avx2_rej_uniform_table_rows():
    """Yield the 256 rows of the AVX2 rejection-sampling lookup table.

    Each row is a brace-enclosed string of 8 comma-separated numbers.
    """
    # The index into the lookup table is an 8-bit bitmap, i.e. a number 0..255.
    # Conceptually, the table entry at index i is a vector of 8 16-bit values, of
    # which only the first popcount(i) are set; those are the indices of the set-bits
    # in i. Each stored entry is twice the index of the corresponding set bit.
    for bitmap in range(256):
        offsets = [2 * bit for bit in range(8) if bitmap & (1 << bit)]
        # Pad by -1
        offsets.extend([-1] * (8 - len(offsets)))
        yield "{" + ",".join(str(offset) for offset in offsets) + "}"


def gen_avx2_rej_uniform_table(dry_run=False):
    """Generate (or, in dry-run mode, check) the x86_64 rejection-sampling table.

    The table rows come from `gen_avx2_rej_uniform_table_rows()` and are
    written to `mlkem/native/x86_64/src/rej_uniform_table.c`.
    """
    lines = list(gen_header())
    lines += [
        '#include "../../../common.h"',
        "",
        "#if defined(MLK_ARITH_BACKEND_X86_64_DEFAULT) && \\",
        "    !defined(MLK_MULTILEVEL_BUILD_NO_SHARED)",
        "",
        "#include <stdint.h>",
        '#include "arith_native_x86_64.h"',
        "",
        "/*",
        " * Lookup table used by rejection sampling of the public matrix.",
        " * See autogen for details.",
        " */",
        "MLK_ALIGN const uint8_t rej_uniform_table[256][8] = {",
    ]
    # One table row per line, each followed by a comma
    lines += [f"{row}," for row in gen_avx2_rej_uniform_table_rows()]
    lines += [
        "};",
        "",
        "#else /* defined(MLK_ARITH_BACKEND_X86_64_DEFAULT) && \\",
        "         !defined(MLK_MULTILEVEL_BUILD_NO_SHARED) */",
        "",
        "MLK_EMPTY_CU(avx2_rej_uniform_table)",
        "",
        "#endif /* defined(MLK_ARITH_BACKEND_X86_64_DEFAULT) && \\",
        "          !defined(MLK_MULTILEVEL_BUILD_NO_SHARED) */",
        "",
    ]

    update_file(
        "mlkem/native/x86_64/src/rej_uniform_table.c",
        "\n".join(lines),
        dry_run=dry_run,
    )


def signed_reduce_u16(x):
    """Return the representative of `x` modulo 2**16 in [-2**15, 2**15)."""
    half = 2**15
    # Shift up by half the range, reduce, and shift back down
    return (x + half) % 2**16 - half


def prepare_root_for_montmul(root):
    """Prepare a constant for Montgomery multiplication.

    Returns a pair consisting of (a) the signed canonical representative
    of the Montgomery form of `root` and (b) the twisted constant used in
    the low-mul part of the Montgomery multiplication.
    """
    # Montgomery form, as canonical signed representative
    mont_root = signed_reduce(root * montgomery_factor)
    # The twist is the product with the inverse of the modulus mod 2^16,
    # taken as a signed 16-bit value
    modulus_inverse = pow(modulus, -1, 2**16)
    return mont_root, signed_reduce_u16(mont_root * modulus_inverse)


def gen_avx2_root_of_unity_for_block(layer, block, inv=False):
    """Return the (root, twisted root) pair for `block` of NTT `layer`.

    The exponent of the root of unity is the 7-bit bit-reversal of
    2**layer + block, negated when `inv` is True. The resulting power is
    prepared for Montgomery multiplication.
    """
    # We are computing a negacyclic NTT; the twiddles needed here are
    # the second half of the twiddles for a cyclic NTT of twice the size.
    exponent = bitreverse(2**layer + block, 7)
    if inv is True:
        exponent = -exponent
    return prepare_root_for_montmul(pow(root_of_unity, exponent, modulus))


def gen_avx2_fwd_ntt_zetas():
    """Yield the entries of the twiddle table used by the AVX2 forward NTT.

    The table starts with the layer 0 twiddle and zero padding, followed
    by the layer 1-6 twiddles for the first and then for the second half.
    """

    def emit(layer, block_base, block_offsets, repeat):
        """Yield the twisted twiddles, then the plain twiddles, of each
        (layer, block_base + i) pair for i in block_offsets. Every value
        is repeated `repeat` times in a row."""
        pairs = [
            gen_avx2_root_of_unity_for_block(layer, block_base + offset)
            for offset in block_offsets
        ]
        # Index 1 is the twisted twiddle, index 0 the twiddle itself
        for component in (1, 0):
            for pair in pairs:
                yield from [pair[component]] * repeat

    # Layer 0 twiddle
    yield from emit(0, 0, range(1), 4)
    # Padding so that the subsequent twiddles are 16-byte aligned
    yield from [0] * 8

    # Layer 1-6 twiddles as (layer, block offsets, repeat count). Layer 6
    # is split into even and odd block offsets.
    schedule = (
        (1, range(1), 16),
        (2, range(2), 8),
        (3, range(4), 4),
        (4, range(8), 2),
        (5, range(16), 1),
        (6, range(0, 32, 2), 1),
        (6, range(1, 32, 2), 1),
    )

    # Separated by whether they belong to the upper or lower half; each
    # half covers 2**(layer - 1) blocks of a layer.
    for half in range(2):
        for layer, offsets, repeat in schedule:
            yield from emit(layer, half * 2 ** (layer - 1), offsets, repeat)


def gen_avx2_fwd_ntt_zeta_file(dry_run=False):
    """Generate (or, in dry-run mode, check) the AVX2 zeta table include file.

    The file `mlkem/native/x86_64/src/x86_64_zetas.i` consists of the
    standard header, a short comment and the comma-terminated values from
    `gen_avx2_fwd_ntt_zetas()`, one per line.
    """
    lines = list(gen_header())
    lines += [
        "/*",
        " * Table of zeta values used in the AVX2 NTTs",
        " * See autogen for details.",
        " */",
        "",
    ]
    lines += [f"{zeta}," for zeta in gen_avx2_fwd_ntt_zetas()]
    lines.append("")

    update_file(
        "mlkem/native/x86_64/src/x86_64_zetas.i", "\n".join(lines), dry_run=dry_run
    )


def get_c_source_files(main_only=False):
    """Return the paths of all `.c` files below `mlkem/` and, unless
    `main_only` is True, also those below `dev/`."""
    sources = get_files("mlkem/**/*.c")
    if main_only is not True:
        sources = sources + get_files("dev/**/*.c")
    return sources


def get_asm_source_files(main_only=False):
    """Return the paths of all `.S` files below `mlkem/` and, unless
    `main_only` is True, also those below `dev/`."""
    sources = get_files("mlkem/**/*.S")
    if main_only is not True:
        sources = sources + get_files("dev/**/*.S")
    return sources


def get_header_files(main_only=False):
    """Return the paths of all `.h` files below `mlkem/` and, unless
    `main_only` is True, also those below `dev/`."""
    headers = get_files("mlkem/**/*.h")
    if main_only is not True:
        headers = headers + get_files("dev/**/*.h")
    return headers


def get_files(pattern):
    """Return, as a list of strings, the paths matching the glob `pattern`
    relative to the current working directory."""
    return [str(match) for match in pathlib.Path().glob(pattern)]


def get_defines_from_file(c):
    """Yield a `(c, name)` pair for every `#define` line in the file `c`.

    A line counts if, after stripping leading whitespace, it starts with
    "#define ". The name is the text following that prefix, cut at the
    first space or opening parenthesis, with single quotes removed.
    """
    directive = "#define "
    with open(c, "r") as f:
        text = f.read()

    for line in text.split("\n"):
        stripped = line.lstrip()
        if not stripped.startswith(directive):
            continue
        name = stripped.removeprefix(directive)
        name = name.split(" ")[0]
        name = name.split("(")[0]
        yield (c, name.replace("'", ""))


def get_defines():
    """Yield `(header, name)` for every `#define` found in the header
    files returned by `get_header_files(main_only=True)`."""
    for header in get_header_files(main_only=True):
        for entry in get_defines_from_file(header):
            yield entry


def get_checked_defines():
    """Yield all `(file, name)` pairs from `get_defines()`, rejecting
    macro names that start with an underscore.

    Raises:
        Exception: if a macro name starts with an underscore and the
            (name, file) combination is not on the allow list.
    """
    # (macro name, suffix of the file permitted to define it)
    allow_list = [("__contract__", "cbmc.h"), ("__loop__", "cbmc.h")]

    def is_allowed(d, c):
        return any(c.endswith(suffix) and name == d for name, suffix in allow_list)

    for c, d in get_defines():
        if d.startswith("_") and not is_allowed(d, c):
            raise Exception(
                f"{d} from {c}: starts with an underscore, which is not allowed for mlkem-native macros. "
                f"If this is an mlkem-native specific macro, please pick a different name. "
                f"If this is an external macro, it likely needs removing from `gen_monolithic_undef_all_core()` in `scripts/autogen` -- check this!"
            )
        yield (c, d)


def gen_monolithic_undef_all_core(filt=None, desc=""):
    """Yield `#undef` lines for the checked macro definitions.

    Args:
        filt: Predicate on the defining file's path; entries for which it
            returns False are skipped. Defaults to accepting every file.
        desc: Text inserted into the comment that introduces the section.

    The definitions are de-duplicated and sorted, and each run of macros
    from the same file is preceded by a comment naming that file.
    """
    accept = filt if filt is not None else (lambda c: True)

    yield from ("/*", f" * Undefine macros from {desc}", " */")

    current_file = None
    for filename, d in sorted(set(get_checked_defines())):
        if accept(filename) is False:
            continue
        if filename != current_file:
            # First macro of a new file: announce the file
            yield f"/* {filename} */"
            current_file = filename
        yield f"#undef {d}"


def gen_monolithic_source_file(dry_run=False):
    """Generate (or, in dry-run mode, check) the monolithic build source file.

    The file `examples/monolithic_build/mlkem_native_monobuild.c`
    `#include`s the C sources found under `mlkem/`, grouped by
    configuration macro, and afterwards `#undef`s the macros defined by
    the headers under `mlkem/`.
    """

    # Classification of files by substrings of their path

    def native(c):
        return "native/" in c

    def fips202(c):
        return "fips202" in c

    def aarch64(c):
        return "aarch64" in c

    def x86_64(c):
        return "x86_64" in c

    def native_fips202(c):
        return native(c) and fips202(c)

    def native_arith(c):
        return native(c) and not fips202(c)

    # List of level-specific source files
    # All other files only need including and building once
    # in multilevel build.
    k_specific_sources = (
        # sys.h is not k-specific, but has some macro-overlap with
        # mlkem_native.h. Since the macros from mlkem_native.h are
        # undef'ed after each level-include in a multi-level build
        # we thus have to re-include sys.h as well.
        "sys.h",
        "mlkem_native.h",
        "params.h",
        "config.h",
        "common.h",
        "indcpa.c",
        "indcpa.h",
        "kem.c",
        "kem.h",
        "poly_k.c",
        "poly_k.h",
    )

    def k_specific(c):
        return c.endswith(k_specific_sources)

    def k_generic(c):
        return not k_specific(c)

    def includes(selected, sources):
        """Yield an #include line for every source accepted by `selected`."""
        for c in sources:
            if selected(c):
                yield f'#include "{c}"'

    def per_arch_includes(selected, sources):
        """Yield the AArch64 and x86_64 subsets of the selected sources,
        each wrapped in the corresponding MLK_SYS_* guard."""
        yield "#if defined(MLK_SYS_AARCH64)"
        yield from includes(lambda c: selected(c) and aarch64(c), sources)
        yield "#endif /* MLK_SYS_AARCH64 */"
        yield "#if defined(MLK_SYS_X86_64)"
        yield from includes(lambda c: selected(c) and x86_64(c), sources)
        yield "#endif /* MLK_SYS_X86_64 */"

    def gen():
        c_sources = get_c_source_files(main_only=True)

        yield from gen_header()
        yield from (
            "/*",
            " * Monolithic compilation unit bundling all compilation units within mlkem-native",
            " */",
            "",
            "/* If parts of the mlkem-native source tree are not used,",
            " * consider reducing this header via `unifdef`.",
            " *",
            " * Example:",
            " * ```bash",
            " * unifdef -UMLK_MONOBUILD_WITH_NATIVE_ARITH mlkem_native_monobuild.c",
            " * ```",
            " */",
            "",
            '#include "mlkem/sys.h"',
            "",
        )

        # Portable sources, then portable FIPS-202 sources
        yield from includes(lambda c: not native(c) and not fips202(c), c_sources)
        yield ""
        yield "#if !defined(MLK_MONOBUILD_CUSTOM_FIPS202)"
        yield from includes(lambda c: not native(c) and fips202(c), c_sources)
        yield "#endif /* !MLK_MONOBUILD_CUSTOM_FIPS202 */"
        yield ""

        # Native backends
        yield "#if defined(MLK_MONOBUILD_WITH_NATIVE_ARITH)"
        yield from per_arch_includes(native_arith, c_sources)
        yield "#endif /* MLK_MONOBUILD_WITH_NATIVE_ARITH */"
        yield ""
        yield "#if defined(MLK_MONOBUILD_WITH_NATIVE_FIPS202)"
        yield from per_arch_includes(native_fips202, c_sources)
        yield "#endif /* MLK_MONOBUILD_WITH_NATIVE_FIPS202 */"
        yield ""

        # Macro clean-up
        yield from gen_monolithic_undef_all_core(
            filt=k_specific, desc="MLKEM_K-specific files"
        )
        yield ""
        yield "#if !defined(MLK_MONOBUILD_KEEP_SHARED_HEADERS)"
        yield from gen_monolithic_undef_all_core(
            filt=lambda c: not native(c)
            and k_generic(c)
            and not fips202(c)
            and "cbmc.h" not in c,
            desc="MLKEM_K-generic files",
        )
        # Handle cbmc.h manually -- most #define's therein are only defined when CBMC is set
        # and need not be #undef'ed. In fact, #undef'ing them is risky since their names may
        # well already be occupied.
        yield from (
            "/* mlkem/cbmc.h */",
            "#undef MLK_CBMC_H",
            "#undef __contract__",
            "#undef __loop__",
            "",
        )
        yield "#if !defined(MLK_MONOBUILD_CUSTOM_FIPS202)"
        yield from gen_monolithic_undef_all_core(
            filt=lambda c: not native(c) and k_generic(c) and fips202(c),
            desc="FIPS-202 files",
        )
        yield "#endif /* !MLK_MONOBUILD_CUSTOM_FIPS202 */"
        yield ""
        yield "#if defined(MLK_MONOBUILD_WITH_NATIVE_FIPS202)"
        yield from gen_monolithic_undef_all_core(
            filt=native_fips202, desc="native code"
        )
        yield "#endif /* MLK_MONOBUILD_WITH_NATIVE_FIPS202 */"
        yield "#if defined(MLK_MONOBUILD_WITH_NATIVE_ARITH)"
        yield from gen_monolithic_undef_all_core(filt=native_arith, desc="native code")
        yield "#endif /* MLK_MONOBUILD_WITH_NATIVE_ARITH */"
        yield "#endif /* MLK_MONOBUILD_KEEP_SHARED_HEADERS */"
        yield ""

    update_file(
        "examples/monolithic_build/mlkem_native_monobuild.c",
        "\n".join(gen()),
        dry_run=dry_run,
    )


def check_asm_register_aliases_for_file(filename):
    """Check that `filename` has no mismatching or dangling register aliases.

    Every alias introduced by `<alias> .req <register>` must be released
    by a later `.unreq <alias>` before it is defined again, and before the
    end of the file.

    Args:
        filename: Path of the assembly file to check.

    Raises:
        Exception: on a duplicate `.req`, a `.unreq` without a prior
            `.req`, a `.unreq` without an operand, or aliases still
            defined at the end of the file. Reported line numbers are
            1-based.
    """
    with open(filename, "r") as f:
        lines = f.read().split("\n")

    # Alias name -> line number of the .req that introduced it
    aliases = {}
    for lineno, line in enumerate(lines, start=1):
        # Split on any whitespace so that tab-separated directives are
        # recognized as well.
        tokens = line.split()

        if len(tokens) >= 3 and tokens[1] == ".req":
            alias = tokens[0]
            if alias in aliases:
                raise Exception(
                    f"Invalid assembly file {filename}: Duplicate .req directive for {alias} at line {lineno}"
                )
            aliases[alias] = lineno
        elif tokens and tokens[0] == ".unreq":
            if len(tokens) < 2:
                raise Exception(
                    f"Invalid assembly file {filename}: .unreq without operand at line {lineno}"
                )
            alias = tokens[1]
            if alias not in aliases:
                raise Exception(
                    f"Invalid assembly file {filename}: .unreq without prior .req for {alias} at line {lineno}"
                )
            del aliases[alias]

    if aliases:
        # Suggest the missing .unreq directives, ordered by the line
        # number of the corresponding .req
        fixup_suggestion = [
            "/****************** REGISTER DEALLOCATIONS *******************/"
        ]
        for alias, _ in sorted(aliases.items(), key=lambda entry: entry[1]):
            fixup_suggestion.append(f"    .unreq {alias}")
        fixup_suggestion.append("")
        fixup_suggestion = "\n".join(fixup_suggestion)

        raise Exception(
            f"Invalid assembly file {filename}: Dangling .req directives {aliases}.\n\nTry adding this?\n\n{fixup_suggestion}"
        )


def check_asm_register_aliases():
    """Run the register-alias check on every assembly source file
    returned by `get_asm_source_files()`."""
    asm_files = get_asm_source_files()
    for path in asm_files:
        check_asm_register_aliases_for_file(path)


def update_via_simpasm(
    infile_full,
    outdir,
    outfile=None,
    cflags=None,
    preserve_header=True,
    dry_run=False,
    force_cross=False,
):
    """Run scripts/simpasm on an assembly file and store the result.

    The output of simpasm is written to a temporary file first and then
    handed to `update_file` for the target `outdir/outfile`.

    Args:
        infile_full: Path of the input assembly file. It must contain either
            "aarch64" or "x86_64", from which the source architecture is
            derived.
        outdir: Directory of the output file.
        outfile: Name of the output file; defaults to the input's basename.
        cflags: Optional compiler flags forwarded to simpasm via `--cflags`.
        preserve_header: If True, pass `-p` to simpasm.
        dry_run: Forwarded to `update_file`.
        force_cross: If the source architecture differs from the host and no
            cross compiler is found, raise instead of skipping the file.

    Raises:
        Exception: If the architecture cannot be detected, if a required
            cross toolchain is missing and `force_cross` is set, or if
            simpasm fails.
    """
    infile = os.path.split(infile_full)[1]
    outfile_full = os.path.join(outdir, infile if outfile is None else outfile)

    # Derive the architecture of the source file from its path
    source_arch = next(
        (arch for arch in ("aarch64", "x86_64") if arch in infile_full), None
    )
    if source_arch is None:
        raise Exception(f"Could not detect architecture of source file {infile_full}.")

    # Every host that does not report as AArch64 is treated as x86_64
    host_is_arm = platform.machine().lower() in ["arm64", "aarch64"]
    native_arch = "aarch64" if host_is_arm else "x86_64"

    # A cross toolchain is only needed when host and source differ
    cross_prefix = None
    if native_arch != source_arch:
        cross_prefix = f"{source_arch}-unknown-linux-gnu-"
        if shutil.which(cross_prefix + "gcc") is None:
            if force_cross is not False:
                raise Exception(f"Could not find cross toolchain {cross_prefix}")
            print(
                f"WARNING: Skipping simplification of file {infile} -- "
                f"no cross-compilation toolchain for {cross_prefix} found."
            )
            return

    with tempfile.NamedTemporaryFile(suffix=".S") as tmp:
        cmd = [
            "./scripts/simpasm",
            "--objdump=llvm-objdump",
            "-i",
            infile_full,
            "-o",
            tmp.name,
        ]
        if cross_prefix is not None:
            # Only compiler and nm are swapped; disassembly stays with
            # llvm-objdump
            cmd.extend(["--cc", cross_prefix + "gcc", "--nm", cross_prefix + "nm"])
        if cflags is not None:
            cmd.append(f'--cflags="{cflags}"')
        if preserve_header is True:
            cmd.append("-p")

        try:
            subprocess.run(
                cmd,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.PIPE,
                check=True,
                text=True,
            )
        except subprocess.CalledProcessError as e:
            print(f"Command failed: {' '.join(cmd)}")
            print(f"Exit code: {e.returncode}")
            print(f"stderr: {e.stderr}")
            raise Exception("Failed to run simpasm") from e

        # simpasm wrote to the file by name; read it back via our handle
        tmp.seek(0)
        new_contents = tmp.read().decode()

    update_file(outfile_full, new_contents, dry_run=dry_run)


def gen_hol_light_asm(dry_run=False):
    """Regenerate the assembly files used by the HOL Light proofs.

    Each listed source in dev/aarch64_opt/src is passed through
    `update_via_simpasm` (without preserving the header) and stored under
    proofs/hol_light/arm/mlkem with its new name.
    """
    # (input file in dev/aarch64_opt/src, output file in the proof directory)
    targets = (
        ("ntt_opt.S", "mlkem_ntt.S"),
        ("intt_opt.S", "mlkem_intt.S"),
    )
    for infile, outfile in targets:
        update_via_simpasm(
            f"dev/aarch64_opt/src/{infile}",
            "proofs/hol_light/arm/mlkem",
            outfile=outfile,
            cflags="-Imlkem/native/aarch64/src -DMLK_ARITH_BACKEND_AARCH64_OPT",
            preserve_header=False,
            dry_run=dry_run,
        )


def update_via_copy(infile_full, outfile_full, dry_run=False, transform=None):
    """Update `outfile_full` with the (optionally transformed) content of `infile_full`.

    Args:
        infile_full: Path of the file to read.
        outfile_full: Path handed to `update_file` as the target.
        dry_run: Forwarded to `update_file`.
        transform: Optional callable mapping the content read from the input
            to the content to be stored.
    """
    with open(infile_full, "r") as src:
        text = src.read()

    update_file(
        outfile_full,
        text if transform is None else transform(text),
        dry_run=dry_run,
    )


def update_via_remove(filename, dry_run=False):
    """Remove the stale autogenerated file `filename`.

    In dry-run mode nothing is removed. Instead, a message is printed to
    stderr and the script terminates with exit status 1.

    Args:
        filename: Path of the file to remove.
        dry_run: If True, only report that the file needs removing.
    """
    if dry_run is True:
        print(
            f"Autogenerated file {filename} needs removing. Have you called scripts/autogen?",
            file=sys.stderr,
        )
        # sys.exit() rather than the exit() builtin: the latter is injected
        # by the `site` module for interactive use and is not guaranteed to
        # exist (e.g. under `python -S`).
        sys.exit(1)

    # Remove the file
    os.remove(filename)


def synchronize_backend(
    in_dir, out_dir, dry_run=False, delete=False, no_simplify=False, **kwargs
):
    """Synchronize the files of `in_dir` into `out_dir`.

    With `delete` unset, every file of `in_dir` is brought over: `.S` files
    via `update_via_simpasm` (unless `no_simplify` is set), everything else
    via `update_via_copy`, where `.h` files additionally get their header
    guard adjusted for the target path.

    With `delete` set, nothing is brought over. Instead, every file of
    `out_dir` that has no counterpart of the same name in `in_dir` is passed
    to `update_via_remove`.

    Args:
        in_dir: Source directory.
        out_dir: Target directory.
        dry_run: Forwarded to the update functions.
        delete: Selects between the two modes described above.
        no_simplify: If set, `.S` files are copied instead of simplified.
        **kwargs: Forwarded to `update_via_simpasm`.
    """
    source_names = []
    for src in get_files(os.path.join(in_dir, "*")):
        source_names.append(os.path.basename(src))
        if delete is True:
            # Deletion mode only needs the names of the source files
            continue

        if src.endswith(".S") and no_simplify is False:
            update_via_simpasm(src, out_dir, dry_run=dry_run, **kwargs)
            continue

        # Plain copy
        dst = os.path.join(out_dir, os.path.split(src)[1])
        transform = None
        if src.endswith(".h"):
            # The header guards will also be checked later, but if we
            # don't do it here, the dry-run would fail because of a
            # mismatching intermediate file

            def fix_guard(text, target=dst):
                return adjust_header_guard_for_filename(text, target)

            transform = fix_guard
        update_via_copy(src, dst, dry_run=dry_run, transform=transform)

    if delete is False:
        return

    # Remove the files of the target directory without a source counterpart
    for dst in get_files(os.path.join(out_dir, "*")):
        if os.path.basename(dst) not in source_names:
            update_via_remove(dst, dry_run=dry_run)


def synchronize_backends(
    *, dry_run=False, force_cross=False, clean=False, delete=False, no_simplify=False
):
    """Synchronize all native backends from dev/ into mlkem/.

    This covers the AArch64 arithmetic backend (the "opt" or "clean" variant
    from dev/aarch64_{opt,clean}), the AArch64 FIPS202 backend and the x86_64
    arithmetic backend.

    Args:
        dry_run: Forwarded to all update functions, including the one for the
            AArch64 metadata file.
        force_cross: Forwarded to `synchronize_backend`.
        clean: Use dev/aarch64_clean instead of dev/aarch64_opt.
        delete: Forwarded to `synchronize_backend`. If set, the AArch64
            metadata file is not touched.
        no_simplify: Forwarded to `synchronize_backend`.
    """
    ty = "opt" if clean is False else "clean"

    if delete is False:
        # We may switch the AArch64 arithmetic backend, so adjust the metadata file
        update_via_copy(
            f"dev/aarch64_{ty}/meta.h",
            "mlkem/native/aarch64/meta.h",
            # Must be forwarded like everywhere else; otherwise a dry run
            # would overwrite the metadata file instead of checking it.
            dry_run=dry_run,
            transform=lambda c: adjust_header_guard_for_filename(
                c, "mlkem/native/aarch64/meta.h"
            ),
        )

    # Options shared by all backends
    common = dict(
        dry_run=dry_run,
        delete=delete,
        force_cross=force_cross,
        no_simplify=no_simplify,
    )

    synchronize_backend(
        f"dev/aarch64_{ty}/src",
        "mlkem/native/aarch64/src",
        cflags="-Imlkem/native/aarch64/src",
        **common,
    )
    synchronize_backend(
        "dev/fips202/aarch64/src",
        "mlkem/fips202/native/aarch64/src",
        cflags="-Imlkem/fips202/native/aarch64/src -march=armv8.4-a+sha3",
        **common,
    )
    synchronize_backend(
        "dev/x86_64/src",
        "mlkem/native/x86_64/src",
        # Turn off control-flow protection (CET) explicitly. Newer versions of
        # clang turn it on by default and insert endbr64 instructions at every
        # global symbol.
        # We insert endbr64 instruction manually via the MLK_ASM_FN_SYMBOL
        # macro.
        # This leads to duplicate endbr64 instructions causing a failure when
        # comparing the object code before and after simplification.
        cflags="-Imlkem/native/x86_64/src/ -mavx2 -fcf-protection=none",
        **common,
    )


def adjust_header_guard_for_filename(content, header_file):
    """Return `content` with a standardized include guard for `header_file`.

    The guard name is `MLK_` followed by the upper-cased path of the header
    (without a leading `mlkem/`), with '/' and '.' replaced by '_'. The header
    `mlkem/mlkem_native.h` is special-cased to `MLK_H`.

    An existing guard -- an `#ifndef` or `#if !defined` line directly after
    the copyright notice, optionally followed by a `#define` line, together
    with a trailing `#endif` -- is replaced; otherwise a guard is added.

    Args:
        content: Content of the header file as a single string.
        header_file: Path of the header, used to derive the guard name.

    Returns:
        The adjusted content as a single string.

    Raises:
        AssertionError: If `content` does not begin with the copyright notice.
    """
    content = content.split("\n")
    exceptions = {"mlkem/mlkem_native.h": "MLK_H"}

    guard_name = exceptions.get(header_file)
    if guard_name is None:
        # Use full filename as the header guard, with '/' and '.' replaced by '_'
        guard_name = "MLK_" + (
            header_file.removeprefix("mlkem/")
            .replace("/", "_")
            .replace(".", "_")
            .upper()
        )

    cr = [
        "/*",
        " * Copyright (c) 2024-2025 The mlkem-native project authors",
        " * SPDX-License-Identifier: Apache-2.0",
        " */",
    ]
    guard = [f"#ifndef {guard_name}", f"#define {guard_name}"]
    footer = [f"#endif /* {guard_name} */", ""]

    # The header file must begin with the copyright notice. Raise explicitly
    # rather than via `assert False`, so that the check carries a message and
    # is not stripped when running with -O.
    if content[: len(cr)] != cr:
        raise AssertionError(
            f"Header file {header_file} does not begin with the expected copyright notice"
        )

    # Skip blank lines after the copyright notice
    i = len(cr)
    while i < len(content) and content[i].strip() == "":
        i += 1

    # Check if header file has some guard -- if so, drop it
    has_guard = i < len(content) and content[i].strip().startswith(
        ("#if !defined", "#ifndef")
    )
    if has_guard:
        del content[i]
        if i < len(content) and content[i].strip().startswith("#define"):
            del content[i]

    # Add standardized guard
    content = content[:i] + guard + content[i:]

    # Drop the #endif of the old guard, regardless of how many (if any)
    # blank lines follow it
    if has_guard:
        last = len(content) - 1
        while last >= 0 and content[last].strip() == "":
            last -= 1
        if last >= 0 and content[last].strip().startswith("#endif"):
            del content[last:]

    # Add standardized footer
    content = content + footer

    return "\n".join(content)


def gen_header_guard(header_file, dry_run=False):
    """Standardize the header guard of `header_file` in place.

    The file is read, passed through `adjust_header_guard_for_filename`, and
    the result is handed to `update_file` for the same path.
    """
    with open(header_file, "r") as src:
        original = src.read()
    update_file(
        header_file,
        adjust_header_guard_for_filename(original, header_file),
        dry_run=dry_run,
    )


def gen_header_guards(dry_run=False):
    """Standardize the header guard of every file from get_header_files()."""
    headers = get_header_files()
    for header in headers:
        gen_header_guard(header, dry_run=dry_run)


def gen_source_undefs(source_file, dry_run=False):
    """Regenerate the trailing list of #undef's of `source_file`.

    Every macro #define'd in `source_file` gets an `#undef` at the end of the
    file, except for macros which are also defined in a header; those are
    listed in `Keep:` comments instead. A previously generated list at the
    end of the file is replaced. If there is nothing to undefine, no list is
    emitted.

    Args:
        source_file: Path of the source file to update.
        dry_run: Forwarded to `update_file`.
    """
    info_line = "/* Some macros are kept because they are also defined in a header. */"
    footer = [
        "",
        "/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.",
        " * Don't modify by hand -- this is auto-generated by scripts/autogen. */",
    ]

    # Get list of #define's clauses in this source file (ignore filename)
    local_defines = [c[1] for c in get_defines_from_file(source_file)]
    # Get define clauses from header files, as dict
    header_defs = {d: c for (c, d) in get_defines()}

    undefs = [f"#undef {d}" for d in local_defines if d not in header_defs]
    ignored = [(d, header_defs[d]) for d in local_defines if d in header_defs]
    if ignored:
        undefs.append(info_line)
        undefs.extend(f"/* Keep: {d} ({c.split('/')[-1]}) */" for d, c in ignored)

    def is_generated_or_blank(line):
        stripped = line.strip()
        return (
            stripped == ""
            or stripped == info_line
            or stripped.startswith(("#undef", "/* Keep:"))
        )

    # Remove list of #undef's at the end of the source file, and add
    # the up-to-date one.
    with open(source_file, "r") as f:
        content = f.read().split("\n")
    # Guard against running off the front of the list for files that
    # consist of blank or generated lines only.
    while content and is_generated_or_blank(content[-1]):
        del content[-1]

    # Remove existing footer, if present
    if content[-len(footer) :] == footer:
        content = content[: -len(footer)]

    if undefs:
        content = content + footer + undefs + [""]
    else:
        content = content + [""]

    new_content = "\n".join(content)
    update_file(source_file, new_content, dry_run=dry_run)


def gen_undefs(dry_run=False):
    """Regenerate the #undef list of every file from get_c_source_files()."""
    sources = get_c_source_files()
    for source in sources:
        gen_source_undefs(source, dry_run=dry_run)


def _main():
    """Parse the command line and run all checks and generators in order."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    for flag in ("--dry-run", "--aarch64-clean", "--no-simplify", "--force-cross"):
        parser.add_argument(flag, default=False, action="store_true")

    args = parser.parse_args()
    dry_run = args.dry_run

    check_asm_register_aliases()

    # Tables of constants
    for generate in (
        gen_c_zeta_file,
        gen_aarch64_fwd_ntt_zeta_file,
        gen_aarch64_rej_uniform_table,
        gen_avx2_fwd_ntt_zeta_file,
        gen_avx2_rej_uniform_table,
    ):
        generate(dry_run)

    # The HOL Light assembly is only regenerated on AArch64 hosts
    if platform.machine().lower() in ["arm64", "aarch64"]:
        gen_hol_light_asm(dry_run)

    # Options shared by both backend synchronization passes
    sync_options = dict(
        dry_run=dry_run,
        clean=args.aarch64_clean,
        no_simplify=args.no_simplify,
        force_cross=args.force_cross,
    )

    # First pass: bring backend files over
    synchronize_backends(**sync_options)
    gen_header_guards(dry_run)
    gen_monolithic_source_file(dry_run)
    gen_undefs(dry_run)

    # Second pass: remove backend files without a source counterpart
    synchronize_backends(delete=True, **sync_options)


# Run the generators only when this file is executed as a script
if __name__ == "__main__":
    _main()