diff --git a/.github/workflows/CI_conda_forge.yml b/.github/workflows/CI_conda_forge.yml
index 52407c58..b2e8a261 100644
--- a/.github/workflows/CI_conda_forge.yml
+++ b/.github/workflows/CI_conda_forge.yml
@@ -16,26 +16,12 @@ jobs:
timeout-minutes: 60
defaults:
run:
- shell: bash -l {0}
+ shell: bash -el {0}
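+        # -e exits on the first error; -l gives a login shell so conda activation works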
strategy:
fail-fast: false
matrix:
- python-version: ['3.10', '3.11', '3.12']
- os: ['ubuntu-latest']
- use-mamba: [true, false]
- include:
- - python-version: '3.10'
- os: 'windows-latest'
- use-mamba: true
- - python-version: '3.12'
- os: 'windows-latest'
- use-mamba: true
- - python-version: '3.10'
- os: 'macos-latest'
- use-mamba: true
- - python-version: '3.12'
- os: 'macos-latest'
- use-mamba: true
+ python-version: ['3.10', '3']
+ os: ['ubuntu-latest', 'windows-latest', 'macos-latest']
steps:
- name: "Set up Conda"
@@ -46,13 +32,14 @@ jobs:
auto-activate-base: true
python-version: ${{ matrix.python-version }}
activate-environment: pysr-test
- - name: "Install pysr with mamba"
- run: conda activate pysr-test && mamba install pysr
- if: ${{ matrix.use-mamba }}
- - name: "Install pysr with conda"
- run: conda activate pysr-test && conda install pysr
- if: ${{ !matrix.use-mamba }}
+ - name: "Install pysr"
+ run: |
+ conda install -y pysr
+ python -c "import pysr"
+ echo "Finished."
- name: "Run tests"
run: |
+ echo "Running tests"
pip install pytest nbval
python -m pysr test main,startup
+ echo "Finished."
diff --git a/.github/workflows/update_backend_version.py b/.github/workflows/update_backend_version.py
index 696da9f5..479080e5 100644
--- a/.github/workflows/update_backend_version.py
+++ b/.github/workflows/update_backend_version.py
@@ -20,7 +20,7 @@
major, minor, patch, *dev = pyproject_data["project"]["version"].split(".")
pyproject_data["project"]["version"] = f"{major}.{minor}.{int(patch)+1}"
-juliapkg_data["packages"]["SymbolicRegression"]["version"] = f"={new_backend_version}"
+juliapkg_data["packages"]["SymbolicRegression"]["version"] = f"~{new_backend_version}"
with open(pyproject_toml, "w") as toml_file:
toml_file.write(tomlkit.dumps(pyproject_data))
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 7e5f4fe1..fa10d77f 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -9,25 +9,25 @@ repos:
- id: check-added-large-files
# General formatting
- repo: https://github.com/psf/black
- rev: 24.10.0
+ rev: 25.1.0
hooks:
- id: black
- id: black-jupyter
exclude: pysr/test/test_nb.ipynb
# Stripping notebooks
- repo: https://github.com/kynan/nbstripout
- rev: 0.8.0
+ rev: 0.8.1
hooks:
- id: nbstripout
exclude: pysr/test/test_nb.ipynb
# Unused imports
- repo: https://github.com/hadialqattan/pycln
- rev: "v2.4.0"
+ rev: "v2.5.0"
hooks:
- id: pycln
# Sorted imports
- repo: https://github.com/PyCQA/isort
- rev: "5.13.2"
+ rev: "6.0.0"
hooks:
- id: isort
additional_dependencies: [toml]
diff --git a/README.md b/README.md
index 299f019a..9e3773ac 100644
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@ If you've finished a project with PySR, please submit a PR to showcase your work
- [Why PySR?](#why-pysr)
- [Installation](#installation)
- [Quickstart](#quickstart)
-- [→ Documentation](https://ai.damtp.cam.ac.uk/PySR)
+- [→ Documentation](https://ai.damtp.cam.ac.uk/pysr)
- [Contributors](#contributors-)
diff --git a/docs/examples.md b/docs/examples.md
index 6d841b0e..700f73bb 100644
--- a/docs/examples.md
+++ b/docs/examples.md
@@ -546,8 +546,9 @@ y = np.sin(X[:, 0] + X[:, 1]) + X[:, 2]**2
# Define template: we want sin(f(x1, x2)) + g(x3)
template = TemplateExpressionSpec(
- function_symbols=["f", "g"],
- combine="((; f, g), (x1, x2, x3)) -> sin(f(x1, x2)) + g(x3)",
+ expressions=["f", "g"],
+ variable_names=["x1", "x2", "x3"],
+ combine="sin(f(x1, x2)) + g(x3)",
)
model = PySRRegressor(
@@ -559,15 +560,23 @@ model = PySRRegressor(
model.fit(X, y)
```
-You can also use no argument-functions for learning constants, like:
+You can also use parameters in your template expressions, which will be optimized during the search:
```python
template = TemplateExpressionSpec(
- function_symbols=["a", "f"],
- combine="((; a, f), (x, y)) -> a() * sin(f(x, y))",
+ expressions=["f", "g"],
+ variable_names=["x1", "x2", "x3"],
+ parameters={"p1": 2, "p2": 1}, # p1 has length 2, p2 has length 1
+ combine="p1[1] * sin(f(x1, x2)) + p1[2] * g(x3) + p2[1]",
)
```
+This will learn an equation of the form:
+
+$$ y = \alpha_1 \sin(f(x_1, x_2)) + \alpha_2 g(x_3) + \beta $$
+
+where $\alpha_1, \alpha_2$ are stored in `p1` and $\beta$ is stored in `p2`.
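+
+A minimal end-to-end sketch (reusing `X`, `y`, and the `template` defined just above; the
+operator choices here are illustrative):
+
+```python
+model = PySRRegressor(
+    binary_operators=["+", "*"],
+    unary_operators=["sin"],
+    expression_spec=template,
+)
+model.fit(X, y)
+```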
+
### Parametric Expressions
When your data has categories with shared equation structure but different parameters,
@@ -609,6 +618,20 @@ model.fit(X, y, category=category)
See [Expression Specifications](/api/#expression-specifications) for more details.
+You can also use `TemplateExpressionSpec` to learn per-category parameters,
+passing the category as a column of `X`:
+
+```python
+spec = TemplateExpressionSpec(
+ expressions=["f", "g"],
+    variable_names=["x1", "x2", "class"],
+    parameters={"p1": 3, "p2": 3},  # length must match the number of categories (3 assumed here)
+    combine="p1[class] * sin(f(x1, x2)) + p2[class]",
+)
+```
+
+The category column (here `class`) will be converted to integers automatically, and is
+used to index the parameter vectors.
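+
+A minimal sketch of assembling such an `X` (the 1-based labels and the count of
+3 categories are illustrative assumptions):
+
+```python
+import numpy as np
+
+X_continuous = np.random.uniform(-1, 1, (100, 2))
+category = np.random.randint(0, 3, 100)  # 3 categories: 0, 1, 2
+X = np.hstack([X_continuous, (category + 1)[:, None]])  # category as a 1-based column
+```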
+
+
## 12. Using TensorBoard for Logging
You can use TensorBoard to visualize the search progress, as well as
diff --git a/pyproject.toml b/pyproject.toml
index eb3ce615..2a6c0899 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "pysr"
-version = "1.3.1"
+version = "1.5.0"
authors = [
{name = "Miles Cranmer", email = "miles.cranmer@gmail.com"},
]
@@ -22,7 +22,7 @@ dependencies = [
"pandas>=0.21.0,<3.0.0",
"numpy>=1.13.0,<3.0.0",
"scikit_learn>=1.0.0,<2.0.0",
- "juliacall==0.9.23",
+ "juliacall==0.9.24",
"click>=7.0.0,<9.0.0",
"setuptools>=50.0.0",
]
diff --git a/pysr/__init__.py b/pysr/__init__.py
index e26174ab..aabbb669 100644
--- a/pysr/__init__.py
+++ b/pysr/__init__.py
@@ -1,5 +1,12 @@
+import logging
import os
+pysr_logger = logging.getLogger("pysr")
+pysr_logger.setLevel(logging.INFO)
+handler = logging.StreamHandler()
+handler.setLevel(logging.INFO)
+pysr_logger.addHandler(handler)
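+# Downstream code can adjust the verbosity, e.g.:
+#   logging.getLogger("pysr").setLevel(logging.WARNING)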
+
if os.environ.get("PYSR_USE_BEARTYPE", "0") == "1":
from beartype.claw import beartype_this_package
diff --git a/pysr/expression_specs.py b/pysr/expression_specs.py
index f9a6eee7..c35ef031 100644
--- a/pysr/expression_specs.py
+++ b/pysr/expression_specs.py
@@ -1,6 +1,7 @@
import copy
from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, Any, NewType, TypeAlias
+from textwrap import dedent
+from typing import TYPE_CHECKING, Any, NewType, TypeAlias, overload
import numpy as np
import pandas as pd
@@ -26,11 +27,9 @@ class AbstractExpressionSpec(ABC):
All expression types must implement:
- 1. julia_expression_type(): The actual expression type, returned as a Julia object.
- This will get stored as `expression_type` in `SymbolicRegression.Options`.
- 2. julia_expression_options(): Method to create the expression options, returned as a Julia object.
- These will get stored as `expression_options` in `SymbolicRegression.Options`.
- 3. create_exports(), which will be used to create the exports of the equations, such as
+ 1. julia_expression_spec(): The actual expression specification, returned as a Julia object.
+ This will get passed as `expression_spec` in `SymbolicRegression.Options`.
+ 2. create_exports(), which will be used to create the exports of the equations, such as
the executable format, the SymPy format, etc.
It may also optionally implement:
@@ -39,13 +38,8 @@ class AbstractExpressionSpec(ABC):
"""
@abstractmethod
- def julia_expression_type(self) -> AnyValue:
- """The expression type"""
- pass # pragma: no cover
-
- @abstractmethod
- def julia_expression_options(self) -> AnyValue:
- """The expression options"""
+ def julia_expression_spec(self) -> AnyValue:
+ """The expression specification"""
pass # pragma: no cover
@abstractmethod
@@ -82,11 +76,8 @@ def supports_latex(self) -> bool:
class ExpressionSpec(AbstractExpressionSpec):
"""The default expression specification, with no special behavior."""
- def julia_expression_type(self):
- return SymbolicRegression.Expression
-
- def julia_expression_options(self):
- return jl.NamedTuple()
+ def julia_expression_spec(self):
+ return SymbolicRegression.ExpressionSpec()
def create_exports(
self,
@@ -127,31 +118,39 @@ class TemplateExpressionSpec(AbstractExpressionSpec):
This class allows you to specify how multiple sub-expressions should be combined
in a structured way, with constraints on which variables each sub-expression can use.
- Pass this to PySRRegressor with the `expression_spec` argument when you are using
- the `TemplateExpression` expression type.
+ Pass this to PySRRegressor with the `expression_spec` argument.
Parameters
----------
- function_symbols : list[str]
- List of symbols representing the inner expressions (e.g., ["f", "g"]).
- These will be used as keys in the template structure.
combine : str
Julia function string that defines how the sub-expressions are combined.
- Takes a NamedTuple of expressions and a tuple of data vectors.
- For example: "((; f, g), (x1, x2, x3)) -> f(x1, x2) + g(x3)^2"
- would constrain f to use x1,x2 and g to use x3.
- num_features : dict[str, int]
- Dictionary mapping function symbols to the number of features each can use.
- For example: {"f": 2, "g": 1} means f takes 2 inputs and g takes 1.
- If not provided, will be inferred from the combine function.
+ For example: "sin(f(x1, x2)) + g(x3)^2" would constrain f to use x1,x2 and g to use x3.
+ expressions : list[str]
+ List of symbols representing the inner expressions (e.g., ["f", "g"]).
+ These will be used as keys in the template structure.
+ variable_names : list[str]
+ List of variable names that will be used in the combine function.
+ parameters : dict[str, int], optional
+ Dictionary mapping parameter names to their lengths. For example, {"p1": 2, "p2": 1}
+ means p1 is a vector of length 2 and p2 is a vector of length 1. These parameters
+ will be optimized during the search.
Examples
--------
```python
# Create template that combines f(x1, x2) and g(x3):
expression_spec = TemplateExpressionSpec(
- function_symbols=["f", "g"],
- combine="((; f, g), (x1, x2, x3)) -> sin(f(x1, x2)) + g(x3)^2",
+ expressions=["f", "g"],
+ variable_names=["x1", "x2", "x3"],
+ combine="sin(f(x1, x2)) + g(x3)^2",
+ )
+
+ # With parameters:
+ expression_spec = TemplateExpressionSpec(
+ expressions=["f", "g"],
+ variable_names=["x1", "x2", "x3"],
+ parameters={"p1": 2, "p2": 1},
+ combine="p1[1] * sin(f(x1, x2)) + p1[2] * g(x3) + p2[1]",
)
# Use in PySRRegressor:
@@ -159,9 +158,47 @@ class TemplateExpressionSpec(AbstractExpressionSpec):
expression_spec=expression_spec
)
```
+
+ Notes
+ -----
+ You can also use differential operators in the template with `D(f, 1)(x)` to take
+ the derivative of f with respect to its first argument, evaluated at x.
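+
+    For example (a sketch, assuming a single sub-expression `f` of two variables):
+
+    ```python
+    TemplateExpressionSpec(
+        expressions=["f"],
+        variable_names=["x1", "x2"],
+        combine="D(f, 1)(x1, x2) + f(x1, x2)",
+    )
+    ```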
"""
+ _spec_cache: dict[tuple[str, ...], AnyValue] = {}
+
+ @overload
+ def __init__(
+ self,
+ function_symbols: list[str],
+ combine: str,
+ num_features: dict[str, int] | None = None,
+ ) -> None: ...
+
+ @overload
+ def __init__(
+ self,
+ combine: str,
+ *,
+ expressions: list[str],
+ variable_names: list[str],
+ parameters: dict[str, int] | None = None,
+ ) -> None: ...
+
def __init__(
+ self,
+ *args,
+ **kwargs,
+ ):
+ """Handle both formats with combine as explicit parameter"""
+ self._old_format = len(args) >= 2 or "function_symbols" in kwargs
+
+ if self._old_format:
+ self._load_old_format(*args, **kwargs)
+ else:
+ self._load_new_format(*args, **kwargs)
+
+ def _load_old_format(
self,
function_symbols: list[str],
combine: str,
@@ -170,9 +207,69 @@ def __init__(
self.function_symbols = function_symbols
self.combine = combine
self.num_features = num_features
+ # TODO: warn about old format after some versions
- def julia_expression_type(self):
- return SymbolicRegression.TemplateExpression
+ def _load_new_format(
+ self,
+ combine: str,
+ *,
+ expressions: list[str],
+ variable_names: list[str],
+ parameters: dict[str, int] | None = None,
+ ):
+ self.combine = combine
+ self.expressions = expressions
+ self.variable_names = variable_names
+ self.parameters = parameters
+
+ def _get_cache_key(self):
+ if self._old_format:
+ return (
+ "old",
+ str(self.function_symbols),
+ self.combine,
+ str(self.num_features),
+ )
+ else:
+ return (
+ "new",
+ self.combine,
+ str(self.expressions),
+ str(self.variable_names),
+ str(self.parameters),
+ )
+
+ def julia_expression_spec(self):
+ key = self._get_cache_key()
+ if key in self._spec_cache:
+ return self._spec_cache[key]
+
+ if self._old_format:
+ result = SymbolicRegression.TemplateExpressionSpec(
+ structure=self.julia_expression_options().structure
+ )
+ else:
+ result = self._call_template_macro()
+
+ self._spec_cache[key] = result
+ return result
+
+ def _call_template_macro(self):
+ return jl.seval(self._template_macro_str())
+
+ def _template_macro_str(self):
+ template_inputs = [f"expressions=({', '.join(self.expressions) + ','})"]
+ if self.parameters:
+ template_inputs.append(
+ f"parameters=({', '.join([f'{p}={self.parameters[p]}' for p in self.parameters]) + ','})"
+ )
+ return dedent(
+ f"""
+ @template_spec({', '.join(template_inputs) + ','}) do {', '.join(self.variable_names)}
+ {self.combine}
+ end
+ """
+ )
def julia_expression_options(self):
f_combine = jl.seval(self.combine)
@@ -243,11 +340,10 @@ class ParametricExpressionSpec(AbstractExpressionSpec):
def __init__(self, max_parameters: int):
self.max_parameters = max_parameters
- def julia_expression_type(self):
- return SymbolicRegression.ParametricExpression
-
- def julia_expression_options(self):
- return jl.seval("NamedTuple{(:max_parameters,)}")((self.max_parameters,))
+ def julia_expression_spec(self):
+ return SymbolicRegression.ParametricExpressionSpec(
+ max_parameters=self.max_parameters
+ )
@property
def evaluates_in_julia(self):
diff --git a/pysr/feature_selection.py b/pysr/feature_selection.py
index 8c8358fd..13fca487 100644
--- a/pysr/feature_selection.py
+++ b/pysr/feature_selection.py
@@ -1,5 +1,6 @@
"""Functions for doing feature selection during preprocessing."""
+import logging
from typing import cast
import numpy as np
@@ -8,6 +9,8 @@
from .utils import ArrayLike
+pysr_logger = logging.getLogger(__name__)
+
def run_feature_selection(
X: ndarray,
@@ -44,7 +47,7 @@ def _handle_feature_selection(
):
if select_k_features is not None:
selection = run_feature_selection(X, y, select_k_features)
- print(f"Using features {[variable_names[i] for i in selection]}")
+ pysr_logger.info(f"Using features {[variable_names[i] for i in selection]}")
X = X[:, selection]
else:
selection = None
diff --git a/pysr/juliapkg.json b/pysr/juliapkg.json
index f6d709c2..4f190bce 100644
--- a/pysr/juliapkg.json
+++ b/pysr/juliapkg.json
@@ -3,7 +3,7 @@
"packages": {
"SymbolicRegression": {
"uuid": "8254be44-1295-4e6a-a16d-46603ac705cb",
- "version": "=1.5.1"
+ "version": "~1.8.0"
},
"Serialization": {
"uuid": "9e88b42a-f829-5b0c-bbe9-9e923198166b",
diff --git a/pysr/param_groupings.yml b/pysr/param_groupings.yml
index 769e6647..d3b56aa1 100644
--- a/pysr/param_groupings.yml
+++ b/pysr/param_groupings.yml
@@ -13,6 +13,7 @@
- The Objective:
- elementwise_loss
- loss_function
+ - loss_function_expression
- model_selection
- dimensional_constraint_penalty
- dimensionless_constants_only
diff --git a/pysr/sr.py b/pysr/sr.py
index 8da6cfee..aeadde68 100644
--- a/pysr/sr.py
+++ b/pysr/sr.py
@@ -1,6 +1,7 @@
"""Define the PySRRegressor scikit-learn interface."""
import copy
+import logging
import os
import pickle as pkl
import re
@@ -67,6 +68,8 @@
ALREADY_RAN = False
+pysr_logger = logging.getLogger(__name__)
+
def _process_constraints(
binary_operators: list[str],
@@ -375,9 +378,12 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
You may pass a function with the same arguments as this (note
that the name of the function doesn't matter). Here,
both `prediction` and `dataset.y` are 1D arrays of length `dataset.n`.
- If using `batching`, then you should add an
- `idx` argument to the function, which is `nothing`
- for non-batched, and a 1D array of indices for batched.
+ Default is `None`.
+ loss_function_expression : str
+ Similar to `loss_function`, but takes as input the full
+ expression object as the first argument, rather than
+ the innermost `AbstractExpressionNode`. This is useful
+ for specifying custom loss functions on `TemplateExpressionSpec`.
Default is `None`.
complexity_of_operators : dict[str, int | float]
If you would like to use a complexity other than 1 for an
@@ -806,6 +812,7 @@ def __init__(
nested_constraints: dict[str, dict[str, int]] | None = None,
elementwise_loss: str | None = None,
loss_function: str | None = None,
+ loss_function_expression: str | None = None,
complexity_of_operators: dict[str, int | float] | None = None,
complexity_of_constants: int | float | None = None,
complexity_of_variables: int | float | list[int | float] | None = None,
@@ -910,6 +917,7 @@ def __init__(
# - Loss parameters
self.elementwise_loss = elementwise_loss
self.loss_function = loss_function
+ self.loss_function_expression = loss_function_expression
self.complexity_of_operators = complexity_of_operators
self.complexity_of_constants = complexity_of_constants
self.complexity_of_variables = complexity_of_variables
@@ -1100,7 +1108,7 @@ def from_file(
pkl_filename = Path(run_directory) / "checkpoint.pkl"
if pkl_filename.exists():
- print(f"Attempting to load model from {pkl_filename}...")
+ pysr_logger.info(f"Attempting to load model from {pkl_filename}...")
assert binary_operators is None
assert unary_operators is None
assert n_features_in is None
@@ -1114,9 +1122,15 @@ def from_file(
if "equations_" not in model.__dict__ or model.equations_ is None:
model.refresh()
+ if model.expression_spec is not None:
+ warnings.warn(
+ "Loading model from checkpoint file with a non-default expression spec "
+ "is not fully supported as it relies on dynamic objects. This may result in unexpected behavior.",
+ )
+
return model
else:
- print(
+ pysr_logger.info(
f"Checkpoint file {pkl_filename} does not exist. "
"Attempting to recreate model from scratch..."
)
@@ -1219,12 +1233,16 @@ def __getstate__(self) -> dict[str, Any]:
)
state_keys_containing_lambdas = ["extra_sympy_mappings", "extra_torch_mappings"]
for state_key in state_keys_containing_lambdas:
- if state[state_key] is not None and show_pickle_warning:
- warnings.warn(
- f"`{state_key}` cannot be pickled and will be removed from the "
- "serialized instance. When loading the model, please redefine "
- f"`{state_key}` at runtime."
- )
+ warn_msg = (
+ f"`{state_key}` cannot be pickled and will be removed from the "
+ "serialized instance. When loading the model, please redefine "
+ f"`{state_key}` at runtime."
+ )
+ if state[state_key] is not None:
+ if show_pickle_warning:
+ warnings.warn(warn_msg)
+ else:
+ pysr_logger.debug(warn_msg)
state_keys_to_clear = state_keys_containing_lambdas
state_keys_to_clear.append("logger_")
pickled_state = {
@@ -1267,7 +1285,7 @@ def _checkpoint(self):
try:
pkl.dump(self, f)
except Exception as e:
- print(f"Error checkpointing model: {e}")
+ pysr_logger.debug(f"Error checkpointing model: {e}")
self.show_pickle_warnings_ = True
def get_pkl_filename(self) -> Path:
@@ -1428,11 +1446,6 @@ def _validate_and_modify_params(self) -> _DynamicallySetParams:
elif self.maxsize < 7:
raise ValueError("PySR requires a maxsize of at least 7")
- if self.elementwise_loss is not None and self.loss_function is not None:
- raise ValueError(
- "You cannot set both `elementwise_loss` and `loss_function`."
- )
-
# NotImplementedError - Values that could be supported at a later time
if self.optimizer_algorithm not in VALID_OPTIMIZER_ALGORITHMS:
raise NotImplementedError(
@@ -1744,7 +1757,7 @@ def _pre_transform_training_data(
self.selection_mask_ = selection_mask
self.feature_names_in_ = _check_feature_names_in(self, variable_names)
self.display_feature_names_in_ = self.feature_names_in_
- print(f"Using features {self.feature_names_in_}")
+ pysr_logger.info(f"Using features {self.feature_names_in_}")
# Denoising transformation
if self.denoise:
@@ -1816,7 +1829,7 @@ def _run(
# Start julia backend processes
if not ALREADY_RAN and runtime_params.update_verbosity != 0:
- print("Compiling Julia backend...")
+ pysr_logger.info("Compiling Julia backend...")
parallelism, numprocs = _map_parallelism_params(
self.parallelism, self.procs, getattr(self, "multithreading", None)
@@ -1892,6 +1905,11 @@ def _run(
custom_full_objective = jl.seval(
str(self.loss_function) if self.loss_function is not None else "nothing"
)
+ custom_loss_expression = jl.seval(
+ str(self.loss_function_expression)
+ if self.loss_function_expression is not None
+ else "nothing"
+ )
early_stop_condition = jl.seval(
str(self.early_stop_condition)
@@ -1964,11 +1982,11 @@ def _run(
complexity_of_constants=self.complexity_of_constants,
complexity_of_variables=complexity_of_variables,
complexity_mapping=complexity_mapping,
- expression_type=self.expression_spec_.julia_expression_type(),
- expression_options=self.expression_spec_.julia_expression_options(),
+ expression_spec=self.expression_spec_.julia_expression_spec(),
nested_constraints=nested_constraints,
elementwise_loss=custom_loss,
loss_function=custom_full_objective,
+ loss_function_expression=custom_loss_expression,
maxsize=int(self.maxsize),
output_directory=_escape_filename(self.output_directory_),
npopulations=int(self.populations),
diff --git a/pysr/test/test_main.py b/pysr/test/test_main.py
index 2f5f440a..cb400846 100644
--- a/pysr/test/test_main.py
+++ b/pysr/test/test_main.py
@@ -1,3 +1,4 @@
+import functools
import importlib
import os
import pickle as pkl
@@ -7,11 +8,18 @@
import unittest
import warnings
from pathlib import Path
+from textwrap import dedent
import numpy as np
import pandas as pd
import sympy # type: ignore
-from sklearn.utils.estimator_checks import check_estimator
+
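+# Newer scikit-learn releases expose `estimator_checks_generator`; on older
+# releases we emulate it via `check_estimator(..., generate_only=True)`.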
+try:
+ from sklearn.utils.estimator_checks import estimator_checks_generator
+except ImportError:
+ from sklearn.utils.estimator_checks import check_estimator
+
+ estimator_checks_generator = functools.partial(check_estimator, generate_only=True)
from pysr import (
ParametricExpressionSpec,
@@ -45,6 +53,9 @@
"SYMBOLIC_REGRESSION_IS_TESTING", "true"
)
+# Import from juliacall at end:
+from juliacall import JuliaError # type: ignore
+
class TestPipeline(unittest.TestCase):
def setUp(self):
@@ -95,9 +106,16 @@ def test_linear_relation_weighted_bumper(self):
)
def test_multiprocessing_turbo_custom_objective(self):
+ for loss_key in ["loss_function", "loss_function_expression"]:
+ with self.subTest(loss_key=loss_key):
+ self._multiprocessing_turbo_custom_objective(loss_key)
+
+ def _multiprocessing_turbo_custom_objective(self, loss_key):
rstate = np.random.RandomState(0)
y = self.X[:, 0]
y += rstate.randn(*y.shape) * 1e-4
+
+ node_type = "Expression" if loss_key == "loss_function_expression" else "Node"
model = PySRRegressor(
**self.default_test_kwargs,
# Turbo needs to work with unsafe operators:
@@ -106,14 +124,16 @@ def test_multiprocessing_turbo_custom_objective(self):
parallelism="multiprocessing",
turbo=True,
early_stop_condition="stop_if(loss, complexity) = loss < 1e-10 && complexity == 1",
- loss_function="""
- function my_objective(tree::Node{T}, dataset::Dataset{T}, options::Options) where T
+ **{
+ loss_key: f"""
+ function my_objective(tree::{node_type}{{T}}, dataset::Dataset{{T}}, options::Options) where T
prediction, flag = eval_tree_array(tree, dataset.X, options)
!flag && return T(Inf)
abs3(x) = abs(x) ^ 3
return sum(abs3, prediction .- dataset.y) / length(prediction)
end
- """,
+ """
+ },
)
model.fit(self.X, y)
print(model.equations_)
@@ -578,6 +598,50 @@ def test_template_expressions_and_custom_complexity(self):
with self.assertRaises(ValueError):
model.latex_table()
+ def test_template_expression_with_parameters(self):
+ # Create random data
+ X_continuous = self.rstate.uniform(-1, 1, (100, 2))
+ category = self.rstate.randint(0, 3, 100) # 3 classes
+ X = np.hstack([X_continuous, category[:, None] + 1])
+
+ # Ground truth: p[class] * x1^2 + x2 where p = [0.5, 1.0, 2.0]
+ true_p = [0.5, 1.0, 2.0]
+ y = np.array(
+ [true_p[c] * x1**2 + x2 for x1, x2, c in zip(X[:, 0], X[:, 1], category)]
+ )
+
+ # Create model with template that includes parameters
+ model = PySRRegressor(
+ **self.default_test_kwargs,
+ expression_spec=TemplateExpressionSpec(
+ "p[class] * x1^2 + f(x2)",
+ expressions=["f"],
+ parameters={"p": 3},
+ variable_names=["x1", "x2", "class"],
+ ),
+ binary_operators=["+", "-", "*", "/"],
+ unary_operators=[],
+ maxsize=10,
+ early_stop_condition="stop_if(loss, complexity) = loss < 1e-10 && complexity <= 3",
+ )
+
+ model.fit(X, y)
+
+ # Test on new data
+ X_continuous_test = self.rstate.uniform(-1, 1, (25, 2))
+ category_test = self.rstate.randint(0, 3, 25)
+ X_test = np.hstack([X_continuous_test, category_test[:, None] + 1])
+ y_test = np.array(
+ [
+ true_p[c] * x1**2 + x2
+ for x1, x2, c in zip(X_test[:, 0], X_test[:, 1], category_test)
+ ]
+ )
+ y_pred = model.predict(X_test)
+
+ test_mse = np.mean((y_test - y_pred) ** 2)
+ self.assertLess(test_mse, 1e-5)
+
def test_parametric_expression(self):
# Create data with two classes
n_points = 100
@@ -873,9 +937,8 @@ def test_scikit_learn_compatibility(self):
temp_equation_file=True,
) # Return early.
- check_generator = check_estimator(model, generate_only=True)
exception_messages = []
- for _, check in check_generator:
+ for _, check in estimator_checks_generator(model):
if check.func.__name__ in {
# We can use complex data, so avoid this check
"check_complex_data",
@@ -1046,9 +1109,9 @@ def test_bad_kwargs(self):
bad_kwargs = [
dict(
kwargs=dict(
- elementwise_loss="g(x, y) = 0.0", loss_function="f(*args) = 0.0"
+ elementwise_loss="g(x, y) = 0.0", loss_function="f(args...) = 0.0"
),
- error=ValueError,
+ error=JuliaError,
),
dict(
kwargs=dict(maxsize=3),
@@ -1089,7 +1152,8 @@ def test_bad_kwargs(self):
def test_suggest_keywords(self):
# Easy
self.assertEqual(
- _suggest_keywords(PySRRegressor, "loss_function"), ["loss_function"]
+ _suggest_keywords(PySRRegressor, "loss_function"),
+ ["loss_function", "loss_function_expression"],
)
# More complex, and with error
@@ -1466,6 +1530,121 @@ def test_unit_propagation(self):
# TODO: Determine desired behavior if second .fit() call does not have units
+class TestTemplateExpressionSpec(unittest.TestCase):
+ def _check_macro_str(self, spec, expected_str):
+ self.assertEqual(
+ spec._template_macro_str().strip(), dedent(expected_str).strip()
+ )
+
+ def test_single_expression_no_params_single_variable(self):
+ spec = TemplateExpressionSpec(
+ combine="f(x)", expressions=["f"], variable_names=["x"]
+ )
+ self._check_macro_str(
+ spec,
+ """\
+ @template_spec(expressions=(f,),) do x
+ f(x)
+ end
+ """,
+ )
+
+ def test_multiple_expressions_no_params_multiple_variables(self):
+ spec = TemplateExpressionSpec(
+ combine="f(x, y) + g(z)",
+ expressions=["f", "g"],
+ variable_names=["x", "y", "z"],
+ )
+ self._check_macro_str(
+ spec,
+ """
+ @template_spec(expressions=(f, g,),) do x, y, z
+ f(x, y) + g(z)
+ end
+ """,
+ )
+
+ def test_single_expression_single_param_single_variable(self):
+ spec = TemplateExpressionSpec(
+ combine="p[1] * f(x)",
+ expressions=["f"],
+ variable_names=["x"],
+ parameters={"p": 1},
+ )
+ self._check_macro_str(
+ spec,
+ """
+ @template_spec(expressions=(f,), parameters=(p=1,),) do x
+ p[1] * f(x)
+ end
+ """,
+ )
+
+ def test_multiple_expressions_multiple_params_multiple_variables(self):
+ spec = TemplateExpressionSpec(
+ combine="p1[1]*f(x,y) + p2[1]*g(z)",
+ expressions=["f", "g"],
+ variable_names=["x", "y", "z"],
+ parameters={"p1": 2, "p2": 3},
+ )
+ self._check_macro_str(
+ spec,
+ """
+ @template_spec(expressions=(f, g,), parameters=(p1=2, p2=3,),) do x, y, z
+ p1[1]*f(x,y) + p2[1]*g(z)
+ end
+ """,
+ )
+
+ def test_complex_variable_names(self):
+ spec = TemplateExpressionSpec(
+ combine="f(var1) * g(var2)",
+ expressions=["f", "g"],
+ variable_names=["var1", "var2"],
+ )
+ self._check_macro_str(
+ spec,
+ """
+ @template_spec(expressions=(f, g,),) do var1, var2
+ f(var1) * g(var2)
+ end
+ """,
+ )
+
+ def test_mixed_parameter_types(self):
+ spec = TemplateExpressionSpec(
+ combine="alpha*f(x) + beta*g(y)",
+ expressions=["f", "g"],
+ variable_names=["x", "y"],
+ parameters={"alpha": 1, "beta": 2},
+ )
+ self._check_macro_str(
+ spec,
+ """
+ @template_spec(expressions=(f, g,), parameters=(alpha=1, beta=2,),) do x, y
+ alpha*f(x) + beta*g(y)
+ end
+ """,
+ )
+
+ def test_empty_parameters_case(self):
+ spec = TemplateExpressionSpec(
+ combine="f(x)", expressions=["f"], variable_names=["x"], parameters={}
+ )
+ self.assertNotIn("parameters", spec._template_macro_str())
+
+ def test_maximum_parameters_expressions(self):
+ spec = TemplateExpressionSpec(
+ combine=" + ".join([f"f{i}(x)" for i in range(5)]),
+ expressions=[f"f{i}" for i in range(5)],
+ variable_names=["x"],
+ parameters={f"p{i}": i + 1 for i in range(5)},
+ )
+ macro_str = spec._template_macro_str()
+ self.assertIn("expressions=(f0, f1, f2, f3, f4,),", macro_str)
+ self.assertIn("parameters=(p0=1, p1=2, p2=3, p3=4, p4=5,),", macro_str)
+
+
def runtests(just_tests=False):
"""Run all tests in test.py."""
test_cases = [