Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ability to export as sklearn RF #587

Merged
merged 12 commits into from
Oct 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ repos:
"--linelength=100", "--recursive",
"--filter=-build/c++11,-build/include,-build/namespaces_literals,-runtime/references,-build/include_order,+build/include_what_you_use",
"--root=include"]
additional_dependencies: [cpplint]
additional_dependencies: [cpplint==1.6.1]
types_or: [c++]
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.11.2
Expand Down
15 changes: 15 additions & 0 deletions include/treelite/c_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -832,6 +832,21 @@ TREELITE_DLL int TreeliteSetTreeField(
TreeliteModelHandle model, uint64_t tree_id, char const* name, TreelitePyBufferFrame frame);
/*! \} */

/*!
* \defgroup model_query C API: Model query functions
* Query various properties of tree models
* \{
*/
/*!
 * \brief Query the depth of each tree.
 * \param model Treelite Model object
 * \param out Output parameter: receives a pointer to an array holding the depth of each
 *            tree (one uint32_t entry per tree)
 * \param out_len Output parameter: receives the length of the array, i.e. the number of trees
 * \return 0 for success; -1 for failure
 * \note NOTE(review): ownership and lifetime of the array returned via \p out are not
 *       documented here. Presumably the buffer is owned by the library and callers should
 *       copy it before making further API calls -- confirm against the implementation.
 */
TREELITE_DLL int TreeliteGetTreeDepth(TreeliteModelHandle model, uint32_t** out, size_t* out_len);
/*! \} */

/*!
* \brief Display last error; can be called by multiple threads
* Note. Each thread will get the last error that occurred in its own context.
Expand Down
5 changes: 4 additions & 1 deletion include/treelite/tree.h
Original file line number Diff line number Diff line change
Expand Up @@ -467,7 +467,6 @@ class Model {
inline TypeInfo GetLeafOutputType() const {
return std::visit([](auto&& inner) { return inner.GetLeafOutputType(); }, variant_);
}

inline std::size_t GetNumTree() const {
return std::visit([](auto&& inner) { return inner.GetNumTree(); }, variant_);
}
Expand Down Expand Up @@ -514,6 +513,10 @@ class Model {
/*! \brief Set a field in a tree */
void SetTreeField(std::uint64_t tree_id, std::string const& name, PyBufferFrame frame);

/* Model query functions */
/*! \brief Query the depth of each tree */
std::vector<std::uint32_t> GetTreeDepth() const;

/*!
* \brief Number of features used for the model.
* It is assumed that all feature indices are between 0 and [num_feature]-1.
Expand Down
4 changes: 2 additions & 2 deletions ops/conda_env/dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,11 @@ dependencies:
- llvm-openmp
- cython
- lightgbm
- cpplint
- cpplint=1.6.0
- pylint
- awscli
- python-build
- pip
- pip:
- cibuildwheel
- xgboost==2.1.0
- xgboost>=2.1.0
2 changes: 1 addition & 1 deletion python/treelite/gtil/gtil.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from scipy.sparse import csr_matrix

from ..core import _LIB, _check_call
from ..frontend import Model
from ..model import Model
from ..util import c_str, typestr_to_ctypes_type, typestr_to_numpy_type


Expand Down
21 changes: 18 additions & 3 deletions python/treelite/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,21 @@ def dump_as_json(self, *, pretty_print: bool = True) -> str:
)
return py_str(json_str.value)

def get_tree_depth(self) -> np.ndarray:
    """
    Query the depth of each tree.

    Returns
    -------
    depth : :py:class:`numpy.ndarray`
        1D array of dtype ``uint32`` with one entry per tree. The array owns
        its data: it is a copy of the buffer handed back by the native
        library, so it stays valid regardless of later library calls.
    """
    depth_array_ptr = ctypes.POINTER(ctypes.c_uint32)()
    depth_array_len = ctypes.c_size_t()
    _check_call(
        _LIB.TreeliteGetTreeDepth(
            self.handle,
            ctypes.byref(depth_array_ptr),
            ctypes.byref(depth_array_len),
        )
    )
    n_trees = depth_array_len.value
    if n_trees == 0:
        # np.ctypeslib.as_array() dereferences the pointer, which fails on a
        # NULL pointer; an empty result needs no native memory at all.
        return np.empty((0,), dtype=np.uint32)
    # as_array() only creates a view over memory owned by the native library.
    # Copy it so that the returned array cannot dangle if the library later
    # reuses or frees that buffer.
    return np.ctypeslib.as_array(depth_array_ptr, shape=(n_trees,)).copy()

def get_header_accessor(self) -> HeaderAccessor:
"""
Obtain accessor for fields in the header.
Expand Down Expand Up @@ -483,7 +498,7 @@ def _numpy2pybuffer(array: np.ndarray) -> _TreelitePyBufferFrame:
ctypes.pythonapi.PyObject_GetBuffer(
ctypes.py_object(view),
ctypes.byref(buffer),
ctypes.c_int(0), # PyBUF_SIMPLE
ctypes.c_int(28), # PyBUF_RECORDS_RO
)
!= 0
):
Expand Down Expand Up @@ -540,7 +555,7 @@ def get_field(self, name: str) -> Union[np.ndarray, str]:
return array.tobytes().decode("utf-8")
return array

def set_field(self, name: str, value: Union[np.ndarray, str]):
def set_field(self, name: str, value: Union[np.ndarray, str]) -> None:
"""
Set a field

Expand Down Expand Up @@ -608,7 +623,7 @@ def get_field(self, name: str) -> np.ndarray:
)
return _pybuffer2numpy(obj)

def set_field(self, name: str, value: np.ndarray):
def set_field(self, name: str, value: np.ndarray) -> None:
"""
Set a field

Expand Down
5 changes: 3 additions & 2 deletions python/treelite/sklearn/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Model loader ingest scikit-learn models into Treelite"""
"""Model loader to ingest scikit-learn models into Treelite"""

from .exporter import export_model
from .importer import import_model


Expand All @@ -14,4 +15,4 @@ def import_model_with_model_builder(sklearn_model):
)


__all__ = ["import_model", "import_model_with_model_builder"]
__all__ = ["import_model", "export_model", "import_model_with_model_builder"]
248 changes: 248 additions & 0 deletions python/treelite/sklearn/exporter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,248 @@
"""Converter to export Treelite models as scikit-learn models (EXPERIMENTAL)"""

from enum import IntEnum
from typing import Any

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

from ..core import TreeliteError
from ..model import Model


def _ensure_scalar_int(x: Any) -> int:
if isinstance(x, np.ndarray):
assert x.shape == (1,)
return int(x[0])
try:
return int(x)
except ValueError as e:
raise ValueError(f"Cannot interpret x as a scalar integer, {x.type=}") from e

Check warning on line 21 in python/treelite/sklearn/exporter.py

View check run for this annotation

Codecov / codecov/patch

python/treelite/sklearn/exporter.py#L18-L21

Added lines #L18 - L21 were not covered by tests


def _ensure_numpy(x: Any) -> np.ndarray:
if isinstance(x, np.ndarray):
return x
raise ValueError(f"x is not a valid NumPy array. {x.type=}")

Check warning on line 27 in python/treelite/sklearn/exporter.py

View check run for this annotation

Codecov / codecov/patch

python/treelite/sklearn/exporter.py#L27

Added line #L27 was not covered by tests


# Structured dtype for the per-node record array that _export_tree() fills in and
# hands to the scikit-learn Tree object via __setstate__() under the "nodes" key.
# Each record is 64 bytes: seven 8-byte fields followed by a one-byte flag, with
# the remaining bytes left as padding.
# NOTE(review): presumably this must match the node layout that the installed
# scikit-learn version expects -- confirm when bumping scikit-learn.
_node_dtype = np.dtype(
    {
        "names": [
            "left_child",
            "right_child",
            "feature",
            "threshold",
            "impurity",
            "n_node_samples",
            "weighted_n_node_samples",
            "missing_go_to_left",
        ],
        "formats": ["<i8", "<i8", "<i8", "<f8", "<f8", "<i8", "<f8", "u1"],
        "offsets": [0, 8, 16, 24, 32, 40, 48, 56],
        "itemsize": 64,
    }
)


class _TaskType(IntEnum):
    """
    Integer codes that export_model() compares against the ``task_type`` header
    field of a Treelite model.

    NOTE(review): presumably these values mirror the task type enum of the
    Treelite core library -- confirm that the numeric values stay in sync.
    """

    # pylint: disable=invalid-name
    kBinaryClf = 0  # exported as a classifier
    kRegressor = 1
    kMultiClf = 2  # exported as a classifier
    kLearningToRank = 3
    kIsolationForest = 4


def _export_tree(
    model, *, tree_id, n_features, n_classes, n_targets, tree_depths, subestimator_class
):
    """
    Convert a single Treelite tree into a scikit-learn decision tree estimator.

    Parameters
    ----------
    model :
        Treelite model that owns the tree.
    tree_id :
        Index of the tree to export.
    n_features :
        Number of features, forwarded to the scikit-learn ``Tree`` constructor.
    n_classes :
        NumPy array with the number of classes of each target.
    n_targets :
        Number of targets (outputs).
    tree_depths :
        Sequence of tree depths, indexed by ``tree_id``.
    subestimator_class :
        Estimator class to instantiate for the tree
        (``DecisionTreeClassifier`` or ``DecisionTreeRegressor``).

    Returns
    -------
    subestimator :
        Instance of ``subestimator_class`` whose state was populated directly
        via ``__setstate__``.

    Raises
    ------
    NotImplementedError
        If the tree contains a categorical split.
    TreeliteError
        If scikit-learn cannot be imported.
    """
    # pylint: disable=too-many-locals
    try:
        from sklearn import __version__ as sklearn_version
        from sklearn.tree._tree import Tree as SKLearnTree
    except ImportError as e:
        raise TreeliteError("This function requires scikit-learn package") from e

    accessor = model.get_tree_accessor(tree_id)

    def first_element(field_name):
        # Fields that hold one value are read back as one-element arrays
        return accessor.get_field(field_name).tolist()[0]

    if first_element("has_categorical_split"):
        raise NotImplementedError(
            "Trees with categorical splits cannot yet be exported as scikit-learn"
        )

    sk_tree = SKLearnTree(n_features, n_classes, n_targets)

    node_count = first_element("num_nodes")
    node_table = np.empty(node_count, dtype=_node_dtype)

    # Columns copied verbatim from Treelite fields
    for column, source_field in (
        ("left_child", "cleft"),
        ("right_child", "cright"),
        ("feature", "split_index"),
        ("threshold", "threshold"),
        ("missing_go_to_left", "default_left"),
    ):
        node_table[column] = accessor.get_field(source_field)
    # Columns with no Treelite counterpart are filled with placeholder values
    node_table["impurity"] = np.nan
    node_table["n_node_samples"] = -1
    node_table["weighted_n_node_samples"] = np.nan

    if n_targets == 1 and n_classes[0] == 1:
        # Every leaf yields one scalar
        values = (
            accessor.get_field("leaf_value").astype("float64").reshape((-1, 1, 1))
        )
    else:
        # Every leaf yields a vector, stored flat with per-node [begin, end)
        # offsets; scatter it into a (node, target, class) layout
        values = np.zeros((node_count, n_targets, n_classes[0]), dtype="float64")
        flat_vectors = accessor.get_field("leaf_vector").astype("float64")
        begin = accessor.get_field("leaf_vector_begin")
        end = accessor.get_field("leaf_vector_end")
        for node_id in range(node_count):
            lo, hi = begin[node_id], end[node_id]
            if lo == hi:
                # Empty range: this node does not output a vector
                continue
            values[node_id, :, :] = flat_vectors[lo:hi].reshape(
                (n_targets, n_classes[0])
            )

    sk_tree.__setstate__(
        {
            "max_depth": tree_depths[tree_id],
            "node_count": node_count,
            "nodes": node_table,
            "values": values,
        }
    )

    fitted_attrs = {
        "tree_": sk_tree,
        "n_outputs_": n_targets,
        "_sklearn_version": sklearn_version,
    }
    if subestimator_class is DecisionTreeClassifier:
        # Scalar for a single target, list with one entry per target otherwise
        fitted_attrs["n_classes_"] = (
            n_classes[0] if n_targets == 1 else n_classes.tolist()
        )

    estimator = subestimator_class()
    estimator.__setstate__(fitted_attrs)
    return estimator


def export_model(model: Model):
    """
    Export a model as a scikit-learn RandomForest.

    Note
    ----
    Currently only random forests can be exported as scikit-learn model objects.
    Support for gradient boosted trees and other kinds of tree models will be
    added in the future.

    Parameters
    ----------
    model : :py:class:`Model`
        Treelite model to export

    Returns
    -------
    sklearn_model : object of type \
                    :py:class:`~sklearn.ensemble.RandomForestRegressor` / \
                    :py:class:`~sklearn.ensemble.RandomForestClassifier`
        Scikit-learn model

    Raises
    ------
    NotImplementedError
        If the model cannot be represented as a scikit-learn random forest, or
        if one of its trees contains a categorical split.
    TreeliteError
        If scikit-learn cannot be imported.
    """
    # pylint: disable=too-many-locals
    try:
        from sklearn import __version__ as sklearn_version
        from sklearn.ensemble import RandomForestRegressor
    except ImportError as e:
        raise TreeliteError("This function requires scikit-learn package") from e

    header_accessor = model.get_header_accessor()
    average_tree_output = (
        _ensure_scalar_int(header_accessor.get_field("average_tree_output")) == 1
    )
    task_type = _ensure_scalar_int(header_accessor.get_field("task_type"))
    n_features = _ensure_scalar_int(header_accessor.get_field("num_feature"))
    n_trees = _ensure_scalar_int(header_accessor.get_field("num_tree"))
    n_targets = _ensure_scalar_int(header_accessor.get_field("num_target"))
    n_classes = _ensure_numpy(header_accessor.get_field("num_class"))
    leaf_vector_shape = _ensure_numpy(header_accessor.get_field("leaf_vector_shape"))
    target_id = _ensure_numpy(header_accessor.get_field("target_id"))
    class_id = _ensure_numpy(header_accessor.get_field("class_id"))
    tree_depths = model.get_tree_depth()

    # Heuristics to ensure that the model can be represented as scikit-learn random forest
    # 1. average_tree_output must be True
    # 2. n_classes[i] must be identical for all targets
    # 3. Each leaf must yield an output of shape (n_targets, n_classes)
    # 4. target_id[i] must be either 0 or -1
    # 5. class_id[i] must be either 0 or -1
    def raise_not_rf_error(reason):
        # Adjacent string literals are concatenated, so the separator after
        # {reason} must be spelled out explicitly.
        raise NotImplementedError(
            "This Treelite model cannot be represented as scikit-learn random forest. "
            f"Condition unmet: {reason}. "
            "Other kinds of tree models in scikit-learn are not yet supported."
        )

    if not average_tree_output:
        raise_not_rf_error(
            "Outputs of tree outputs must be averaged to produce the final output"
        )
    if not np.all(n_classes == n_classes[0]):
        raise_not_rf_error("n_classes must be identical for all targets")
    if not np.array_equal(leaf_vector_shape, [n_targets, n_classes.max()]):
        raise_not_rf_error(
            "Each tree must produce output of dimensions (n_targets, n_classes)"
        )
    if not np.all((target_id == 0) | (target_id == -1)):
        raise_not_rf_error("target_id field must be either 0 or -1")
    if not np.all((class_id == 0) | (class_id == -1)):
        raise_not_rf_error("class_id field must be either 0 or -1")

    # Classification tasks map to the classifier classes; everything else is
    # exported as a regressor.
    if task_type in [_TaskType.kBinaryClf, _TaskType.kMultiClf]:
        estimator_class = RandomForestClassifier
        subestimator_class = DecisionTreeClassifier
    else:
        estimator_class = RandomForestRegressor
        subestimator_class = DecisionTreeRegressor

    estimators = [
        _export_tree(
            model,
            tree_id=tree_id,
            n_features=n_features,
            n_classes=n_classes,
            n_targets=n_targets,
            tree_depths=tree_depths,
            subestimator_class=subestimator_class,
        )
        for tree_id in range(n_trees)
    ]

    # Populate the fitted attributes directly instead of calling fit()
    clf = estimator_class()
    state = {
        "estimators_": estimators,
        "n_outputs_": n_targets,
        "n_features_in_": n_features,
        "_sklearn_version": sklearn_version,
    }
    if estimator_class is RandomForestClassifier:
        if n_targets == 1:
            # Single target: scalar class count and one array of class labels
            state.update(
                {
                    "n_classes_": n_classes[0],
                    "classes_": np.arange(n_classes[0]),
                }
            )
        else:
            # Multiple targets: one class count and one label array per target
            state.update(
                {
                    "n_classes_": n_classes.tolist(),
                    "classes_": [np.arange(n_classes[i]) for i in range(n_targets)],
                }
            )
    clf.__setstate__(state)

    return clf
Loading
Loading