diff --git a/.github/ci.sh b/.github/ci.sh index 8298ee1..8192bd6 100755 --- a/.github/ci.sh +++ b/.github/ci.sh @@ -8,7 +8,7 @@ python -m pip install --no-use-pep517 --no-deps --disable-pip-version-check -e . pytest -v tests # Check documentation build only in one job, also do releases -if [ "${PYTHON_VERSION}" = "3.6" ]; then +if [ "${PYTHON_VERSION}" = "3.7" ]; then pushd docs make html popd diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d051325..ea4f8f2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -34,14 +34,11 @@ jobs: with: path: ./.hypothesis key: hypothesisDB ${{ matrix.PYTHON_VERSION }} - - if: matrix.PYTHON_VERSION == '3.6' - shell: bash -x -l {0} - run: pip install dataclasses - name: Run the unittests shell: bash -x -l {0} run: ./.github/ci.sh ${{ matrix.PYTHON_VERSION }} - name: Publish a Python distribution to PyPI - if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags') && matrix.PYTHON_VERSION == '3.6' + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags') && matrix.PYTHON_VERSION == '3.7' uses: pypa/gh-action-pypi-publish@v1.4.2 with: user: __token__ diff --git a/environment.yml b/environment.yml index cbc5848..277f60e 100644 --- a/environment.yml +++ b/environment.yml @@ -4,7 +4,7 @@ channels: - nodefaults dependencies: # runtime deps - - python>=3.6 + - python>=3.7 - llvmlite>=0.36 - numpy # testing diff --git a/lleaves/compiler/codegen/codegen.py b/lleaves/compiler/codegen/codegen.py index 4d07487..2c55970 100644 --- a/lleaves/compiler/codegen/codegen.py +++ b/lleaves/compiler/codegen/codegen.py @@ -9,6 +9,7 @@ FLOAT = ir.FloatType() INT_CAT = ir.IntType(bits=32) INT = ir.IntType(bits=32) +LONG = ir.IntType(bits=64) ZERO_V = ir.Constant(BOOL, 0) FLOAT_POINTER = ir.PointerType(FLOAT) DOUBLE_PTR = ir.PointerType(DOUBLE) @@ -18,6 +19,10 @@ def iconst(value): return ir.Constant(INT, value) +def lconst(value): + return ir.Constant(LONG, value) + + def fconst(value): return ir.Constant(FLOAT, value) @@ -168,7 +173,9 @@ def _populate_instruction_block( # -- SETUP BLOCK builder = ir.IRBuilder(setup_block) - loop_iter = builder.alloca(INT, 1, "loop-idx") + start_index = builder.zext(start_index, LONG) + end_index = builder.zext(end_index, LONG) + loop_iter = builder.alloca(LONG, 1, "loop-idx") builder.store(start_index, loop_iter) condition_block = root_func.append_basic_block("loop-condition") builder.branch(condition_block) @@ -187,9 +194,9 @@ def _populate_instruction_block( args = [] loop_iter_reg = builder.load(loop_iter) - n_args = ir.Constant(INT, forest.n_args) + n_args = ir.Constant(LONG, forest.n_args) iter_mul_nargs = builder.mul(loop_iter_reg, n_args) - idx = (builder.add(iter_mul_nargs, iconst(i)) for i in range(forest.n_args)) + idx = (builder.add(iter_mul_nargs, lconst(i)) for i in range(forest.n_args)) raw_ptrs = [builder.gep(root_func.args[0], (c,)) for c in idx] # cast the categorical inputs to integer for feature, ptr in zip(forest.features, raw_ptrs): @@ -203,9 +210,9 @@ def _populate_instruction_block( for func in tree_funcs: tree_res = builder.call(func.llvm_function, args) results[func.class_id] = builder.fadd(tree_res, results[func.class_id]) - res_idx = builder.mul(iconst(forest.n_classes), loop_iter_reg) + res_idx = builder.mul(lconst(forest.n_classes), loop_iter_reg) results_ptr = [ - builder.gep(out_arr, (builder.add(res_idx, iconst(class_idx)),)) + builder.gep(out_arr, (builder.add(res_idx, lconst(class_idx)),)) for class_idx in range(forest.n_classes) ] @@ -224,8 +231,7 @@ def _populate_instruction_block( for result, result_ptr in zip(results, results_ptr): builder.store(result, result_ptr) - tmpp1 = builder.add(loop_iter_reg, iconst(1)) - builder.store(tmpp1, loop_iter) + builder.store(builder.add(loop_iter_reg, lconst(1)), loop_iter) builder.branch(condition_block) # -- END CORE LOOP BLOCK diff --git a/lleaves/data_processing.py b/lleaves/data_processing.py index f17cff1..30fb6e7 100644 --- a/lleaves/data_processing.py +++ b/lleaves/data_processing.py @@ -4,6 +4,7 @@ from typing import List, Optional import numpy as np +import pandas as pd try: from pandas import DataFrame as pd_DataFrame @@ -15,7 +16,7 @@ class pd_DataFrame: pass -def _dataframe_to_ndarray(data, pd_traintime_categories: List[List]): +def _dataframe_to_ndarray(data: pd.DataFrame, pd_traintime_categories: List[List]): """ Converts the given dataframe into a 2D numpy array and converts categorical columns to float. @@ -94,7 +95,7 @@ def data_to_ndarray(data, pd_traintime_categories: Optional[List[List]] = None): return data -def ndarray_to_ptr(data): +def ndarray_to_ptr(data: np.ndarray): """ Takes a 2D numpy array, converts to float64 if necessary and returns a pointer diff --git a/lleaves/lleaves.py b/lleaves/lleaves.py index ae4a7e3..5a2956c 100644 --- a/lleaves/lleaves.py +++ b/lleaves/lleaves.py @@ -1,7 +1,7 @@ import concurrent.futures import math import os -from ctypes import CFUNCTYPE, POINTER, c_double, c_int +from ctypes import CFUNCTYPE, POINTER, c_double, c_int32 from pathlib import Path import llvmlite.binding @@ -20,8 +20,8 @@ None, # return void POINTER(c_double), # pointer to data array POINTER(c_double), # pointer to results array - c_int, # start index - c_int, # end index + c_int32, # start index + c_int32, # end index ) @@ -89,12 +89,10 @@ def compile( """ Generate the LLVM IR for this model and compile it to ASM. - For most users tweaking the compilation flags (fcodemodel, fblocksize) will be unnecessary as the default - configuration is already very fast. + For most users tweaking the compilation flags (fcodemodel, fblocksize, finline) will be unnecessary + as the default configuration is already very fast. Modifying the flags is useful only if you're trying to squeeze out the last few percent of performance. - The compile() method is generally not thread-safe. - :param cache: Path to a cache file. If this path doesn't exist, binary will be dumped at path after compilation. If path exists, binary will be loaded and compilation skipped. No effort is made to check staleness / consistency. @@ -160,6 +158,12 @@ def predict(self, data, n_jobs=os.cpu_count()): raise ValueError( f"Data must be of dimension (N, {self.num_feature()}), is {data.shape}." ) + # protect against `ctypes.c_int32` silently overflowing and causing SIGSEGV + if n_predictions >= 2 ** 31 - 1: + raise ValueError( + "Prediction is not supported for datasets with >=2^31-1 rows. " + "Split the dataset into smaller chunks first." + ) # setup input data and predictions array ptr_data = ndarray_to_ptr(data) diff --git a/setup.py b/setup.py index e380123..f49caae 100644 --- a/setup.py +++ b/setup.py @@ -24,6 +24,6 @@ description="LLVM-based compiler for LightGBM models", long_description=long_description, long_description_content_type="text/markdown", - python_requires=">=3.6", - install_requires=["llvmlite>=0.36", "numpy", "dataclasses; python_version < '3.7'"], + python_requires=">=3.7", + install_requires=["llvmlite>=0.36", "numpy"], ) diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..97e9f36 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,23 @@ +import pytest +from lightgbm import Booster + +from lleaves import Model + + +@pytest.fixture(scope="session") +def NYC_llvm(): + llvm_model = Model(model_file="tests/models/NYC_taxi/model.txt") + llvm_model.compile() + return llvm_model + + +@pytest.fixture(scope="session") +def NYC_lgbm(): + return Booster(model_file="tests/models/NYC_taxi/model.txt") + + +@pytest.fixture(scope="session") +def mtpl2_llvm(): + llvm_model = Model(model_file="tests/models/mtpl2/model.txt") + llvm_model.compile() + return llvm_model diff --git a/tests/test_dataprocessing.py b/tests/test_dataprocessing.py index 4c31ced..b8d891b 100644 --- a/tests/test_dataprocessing.py +++ b/tests/test_dataprocessing.py @@ -3,7 +3,9 @@ import numpy as np import pandas as pd import pytest +from lightgbm import Booster +from lleaves import Model from lleaves.data_processing import ( data_to_ndarray, extract_model_global_features, @@ -87,3 +89,20 @@ def test_no_data_modification(): pred = pd.DataFrame(data).astype("category") ndarray_to_ptr(data_to_ndarray(pred, data)) pd.testing.assert_frame_equal(pred, orig) + + +def test_sliced_arrays(): + # predictions should be correct when passed a sliced array + llvm_model = Model(model_file="tests/models/single_tree/model.txt") + llvm_model.compile() + lgbm_model = Booster(model_file="tests/models/single_tree/model.txt") + + n_feature = lgbm_model.num_feature() + data = np.array(list(range(-5 * n_feature, 5 * n_feature)), dtype=np.float64) + data = data.reshape((5, 2 * n_feature)) + sliced = data[:, ::2] + assert not sliced.flags.c_contiguous + np.testing.assert_almost_equal( + llvm_model.predict(sliced, n_jobs=4), lgbm_model.predict(sliced), decimal=13 + ) + return diff --git a/tests/test_parallel.py b/tests/test_parallel.py index f30648e..6c68fb9 100644 --- a/tests/test_parallel.py +++ b/tests/test_parallel.py @@ -1,30 +1,37 @@ from ctypes import POINTER, c_double import numpy as np -from lightgbm import Booster -from lleaves import Model +def test_parallel_edgecases(NYC_llvm, NYC_lgbm): + # single row, multiple threads + data = np.array(1 * [NYC_lgbm.num_feature() * [1.0]], dtype=np.float64) + np.testing.assert_almost_equal( + NYC_llvm.predict(data, n_jobs=4), NYC_lgbm.predict(data), decimal=14 + ) + + # last thread has only one prediction (batchsize is ceil(19/7)=3) + data = np.array(19 * [NYC_lgbm.num_feature() * [1.0]], dtype=np.float64) + np.testing.assert_almost_equal( + NYC_llvm.predict(data, n_jobs=7), NYC_lgbm.predict(data), decimal=14 + ) -def test_parallel_iteration(): - llvm_model = Model(model_file="tests/models/NYC_taxi/model.txt") - lgbm_model = Booster(model_file="tests/models/NYC_taxi/model.txt") - llvm_model.compile() - data = np.array(4 * [5 * [1.0]], dtype=np.float64) +def test_parallel_iteration(NYC_llvm, NYC_lgbm): + data = np.array(4 * [NYC_lgbm.num_feature() * [1.0]], dtype=np.float64) data_flat = np.array(data.reshape(data.size), dtype=np.float64) np.testing.assert_almost_equal( - llvm_model.predict(data, n_jobs=4), lgbm_model.predict(data), decimal=14 + NYC_llvm.predict(data, n_jobs=4), NYC_lgbm.predict(data), decimal=14 ) ptr_data = data_flat.ctypes.data_as(POINTER(c_double)) preds = np.zeros(4, dtype=np.float64) ptr_preds = preds.ctypes.data_as(POINTER(c_double)) - llvm_model._c_entry_func(ptr_data, ptr_preds, 2, 4) + NYC_llvm._c_entry_func(ptr_data, ptr_preds, 2, 4) preds_l = list(preds) assert preds_l[0] == 0.0 and preds_l[1] == 0.0 assert preds_l[2] != 0.0 and preds_l[3] != 0.0 - llvm_model._c_entry_func(ptr_data, ptr_preds, 0, 2) + NYC_llvm._c_entry_func(ptr_data, ptr_preds, 0, 2) preds_l = list(preds) assert preds_l[0] != 0.0 and preds_l[1] != 0.0