diff --git a/.github/workflows/CI_conda_forge.yml b/.github/workflows/CI_conda_forge.yml
index 52407c58..b2e8a261 100644
--- a/.github/workflows/CI_conda_forge.yml
+++ b/.github/workflows/CI_conda_forge.yml
@@ -16,26 +16,12 @@ jobs:
     timeout-minutes: 60
     defaults:
       run:
-        shell: bash -l {0}
+        shell: bash -el {0}
     strategy:
       fail-fast: false
       matrix:
-        python-version: ['3.10', '3.11', '3.12']
-        os: ['ubuntu-latest']
-        use-mamba: [true, false]
-        include:
-          - python-version: '3.10'
-            os: 'windows-latest'
-            use-mamba: true
-          - python-version: '3.12'
-            os: 'windows-latest'
-            use-mamba: true
-          - python-version: '3.10'
-            os: 'macos-latest'
-            use-mamba: true
-          - python-version: '3.12'
-            os: 'macos-latest'
-            use-mamba: true
+        python-version: ['3.10', '3']
+        os: ['ubuntu-latest', 'windows-latest', 'macos-latest']

     steps:
       - name: "Set up Conda"
@@ -46,13 +32,14 @@ jobs:
           auto-activate-base: true
           python-version: ${{ matrix.python-version }}
          activate-environment: pysr-test
-      - name: "Install pysr with mamba"
-        run: conda activate pysr-test && mamba install pysr
-        if: ${{ matrix.use-mamba }}
-      - name: "Install pysr with conda"
-        run: conda activate pysr-test && conda install pysr
-        if: ${{ !matrix.use-mamba }}
+      - name: "Install pysr"
+        run: |
+          conda install -y pysr
+          python -c "import pysr"
+          echo "Finished."
       - name: "Run tests"
         run: |
+          echo "Running tests"
           pip install pytest nbval
           python -m pysr test main,startup
+          echo "Finished."
diff --git a/.github/workflows/update_backend_version.py b/.github/workflows/update_backend_version.py
index 696da9f5..479080e5 100644
--- a/.github/workflows/update_backend_version.py
+++ b/.github/workflows/update_backend_version.py
@@ -20,7 +20,7 @@ major, minor, patch, *dev = pyproject_data["project"]["version"].split(".")

 pyproject_data["project"]["version"] = f"{major}.{minor}.{int(patch)+1}"

-juliapkg_data["packages"]["SymbolicRegression"]["version"] = f"={new_backend_version}"
+juliapkg_data["packages"]["SymbolicRegression"]["version"] = f"~{new_backend_version}"

 with open(pyproject_toml, "w") as toml_file:
     toml_file.write(tomlkit.dumps(pyproject_data))
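For context on the specifier change above: in the Pkg-style compat notation used by juliapkg, `=1.5.1` pins exactly that release, while `~1.8.0` also admits later patch releases (anything `>=1.8.0, <1.9.0`). A minimal sketch of the bump logic, with `new_backend_version` and the TOML contents as hypothetical stand-ins for what the workflow script actually reads from disk:

```python
import tomlkit

# Hypothetical stand-ins for the script's real inputs:
new_backend_version = "1.8.0"
pyproject_data = tomlkit.parse('[project]\nversion = "1.4.9"\n')

# Bump the PySR patch version, as the script does:
major, minor, patch, *dev = pyproject_data["project"]["version"].split(".")
pyproject_data["project"]["version"] = f"{major}.{minor}.{int(patch)+1}"
assert pyproject_data["project"]["version"] == "1.4.10"

# "=X.Y.Z" pins exactly X.Y.Z; "~X.Y.Z" allows >=X.Y.Z, <X.(Y+1).0:
backend_spec = f"~{new_backend_version}"
assert backend_spec == "~1.8.0"
```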
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 7e5f4fe1..fa10d77f 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -9,25 +9,25 @@ repos:
       - id: check-added-large-files
   # General formatting
   - repo: https://github.com/psf/black
-    rev: 24.10.0
+    rev: 25.1.0
     hooks:
       - id: black
       - id: black-jupyter
         exclude: pysr/test/test_nb.ipynb
   # Stripping notebooks
   - repo: https://github.com/kynan/nbstripout
-    rev: 0.8.0
+    rev: 0.8.1
     hooks:
       - id: nbstripout
         exclude: pysr/test/test_nb.ipynb
   # Unused imports
   - repo: https://github.com/hadialqattan/pycln
-    rev: "v2.4.0"
+    rev: "v2.5.0"
     hooks:
       - id: pycln
   # Sorted imports
   - repo: https://github.com/PyCQA/isort
-    rev: "5.13.2"
+    rev: "6.0.0"
     hooks:
       - id: isort
         additional_dependencies: [toml]
diff --git a/README.md b/README.md
index 299f019a..9e3773ac 100644
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@ If you've finished a project with PySR, please submit a PR to showcase your work
 - [Why PySR?](#why-pysr)
 - [Installation](#installation)
 - [Quickstart](#quickstart)
-- [→ Documentation](https://ai.damtp.cam.ac.uk/PySR)
+- [→ Documentation](https://ai.damtp.cam.ac.uk/pysr)
 - [Contributors](#contributors-)
diff --git a/docs/examples.md b/docs/examples.md
index 6d841b0e..700f73bb 100644
--- a/docs/examples.md
+++ b/docs/examples.md
@@ -546,8 +546,9 @@ y = np.sin(X[:, 0] + X[:, 1]) + X[:, 2]**2

 # Define template: we want sin(f(x1, x2)) + g(x3)
 template = TemplateExpressionSpec(
-    function_symbols=["f", "g"],
-    combine="((; f, g), (x1, x2, x3)) -> sin(f(x1, x2)) + g(x3)",
+    expressions=["f", "g"],
+    variable_names=["x1", "x2", "x3"],
+    combine="sin(f(x1, x2)) + g(x3)",
 )

 model = PySRRegressor(
@@ -559,15 +560,23 @@ model = PySRRegressor(
 model.fit(X, y)
 ```

-You can also use no argument-functions for learning constants, like:
+You can also use parameters in your template expressions, which will be optimized during the search:

 ```python
 template = TemplateExpressionSpec(
-    function_symbols=["a", "f"],
-    combine="((; a, f), (x, y)) -> a() * sin(f(x, y))",
+    expressions=["f", "g"],
+    variable_names=["x1", "x2", "x3"],
+    parameters={"p1": 2, "p2": 1},  # p1 has length 2, p2 has length 1
+    combine="p1[1] * sin(f(x1, x2)) + p1[2] * g(x3) + p2[1]",
 )
 ```

+This will learn an equation of the form:
+
+$$ y = \alpha_1 \sin(f(x_1, x_2)) + \alpha_2 g(x_3) + \beta $$
+
+where $\alpha_1, \alpha_2$ are stored in `p1` and $\beta$ is stored in `p2`, all optimized during the search.
+
 ### Parametric Expressions

 When your data has categories with shared equation structure but different parameters,
@@ -609,6 +618,21 @@ model.fit(X, y, category=category)

 See [Expression Specifications](/api/#expression-specifications) for more details.

+You can also use `TemplateExpressionSpec` in the same way, passing
+the category as a column of `X`:
+
+```python
+spec = TemplateExpressionSpec(
+    expressions=["f", "g"],
+    variable_names=["x1", "x2", "class"],
+    parameters={"p1": 3, "p2": 3},  # one entry per category (3 here)
+    combine="p1[class] * sin(f(x1, x2)) + p2[class]",
+)
+```
+
+The category column will automatically be converted to integers.
+
 ## 12. Using TensorBoard for Logging

 You can use TensorBoard to visualize the search progress, as well as
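For a fully worked version of the categorical template shown above, here is a self-contained sketch that mirrors `test_template_expression_with_parameters` from the test suite later in this diff (the data, the three categories, and the ground-truth parameters are illustrative):

```python
import numpy as np
from pysr import PySRRegressor, TemplateExpressionSpec

rng = np.random.RandomState(0)
X_continuous = rng.uniform(-1, 1, (100, 2))
category = rng.randint(0, 3, 100)  # 3 categories, passed as a column of X
X = np.hstack([X_continuous, category[:, None] + 1])

# Ground truth: p[class] * x1^2 + x2, with a different p for each category
true_p = [0.5, 1.0, 2.0]
y = np.array(
    [true_p[c] * x1**2 + x2 for x1, x2, c in zip(X[:, 0], X[:, 1], category)]
)

spec = TemplateExpressionSpec(
    "p[class] * x1^2 + f(x2)",
    expressions=["f"],
    parameters={"p": 3},  # one learned parameter per category
    variable_names=["x1", "x2", "class"],
)
model = PySRRegressor(
    expression_spec=spec,
    binary_operators=["+", "-", "*", "/"],
)
model.fit(X, y)
```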
diff --git a/pyproject.toml b/pyproject.toml
index eb3ce615..2a6c0899 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "pysr"
-version = "1.3.1"
+version = "1.5.0"
 authors = [
     {name = "Miles Cranmer", email = "miles.cranmer@gmail.com"},
 ]
@@ -22,7 +22,7 @@ dependencies = [
     "pandas>=0.21.0,<3.0.0",
     "numpy>=1.13.0,<3.0.0",
     "scikit_learn>=1.0.0,<2.0.0",
-    "juliacall==0.9.23",
+    "juliacall==0.9.24",
     "click>=7.0.0,<9.0.0",
     "setuptools>=50.0.0",
 ]
diff --git a/pysr/__init__.py b/pysr/__init__.py
index e26174ab..aabbb669 100644
--- a/pysr/__init__.py
+++ b/pysr/__init__.py
@@ -1,5 +1,12 @@
+import logging
 import os

+pysr_logger = logging.getLogger("pysr")
+pysr_logger.setLevel(logging.INFO)
+handler = logging.StreamHandler()
+handler.setLevel(logging.INFO)
+pysr_logger.addHandler(handler)
+
 if os.environ.get("PYSR_USE_BEARTYPE", "0") == "1":
     from beartype.claw import beartype_this_package
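Because PySR now routes its console messages through the standard `logging` module, with an INFO-level handler attached at import time as shown above, downstream code can tune its verbosity with ordinary logging calls. A small sketch, not part of this diff:

```python
import logging

import pysr  # importing attaches an INFO-level StreamHandler to the "pysr" logger

# Silence PySR's informational messages (e.g., "Using features ..."):
logging.getLogger("pysr").setLevel(logging.WARNING)

# To surface DEBUG-level messages (e.g., the checkpointing errors logged at
# DEBUG in sr.py below), both the logger and its handler need lowering,
# since each filters records independently:
pysr_logger = logging.getLogger("pysr")
pysr_logger.setLevel(logging.DEBUG)
for handler in pysr_logger.handlers:
    handler.setLevel(logging.DEBUG)
```

Module loggers such as `pysr.sr` and `pysr.feature_selection` (created with `logging.getLogger(__name__)` in the files below) are children of the `pysr` logger, so they inherit this configuration.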
diff --git a/pysr/expression_specs.py b/pysr/expression_specs.py
index f9a6eee7..c35ef031 100644
--- a/pysr/expression_specs.py
+++ b/pysr/expression_specs.py
@@ -1,6 +1,7 @@
 import copy
 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, Any, NewType, TypeAlias
+from textwrap import dedent
+from typing import TYPE_CHECKING, Any, NewType, TypeAlias, overload

 import numpy as np
 import pandas as pd
@@ -26,11 +27,9 @@ class AbstractExpressionSpec(ABC):

     All expression types must implement:

-    1. julia_expression_type(): The actual expression type, returned as a Julia object.
-        This will get stored as `expression_type` in `SymbolicRegression.Options`.
-    2. julia_expression_options(): Method to create the expression options, returned as a Julia object.
-        These will get stored as `expression_options` in `SymbolicRegression.Options`.
-    3. create_exports(), which will be used to create the exports of the equations, such as
+    1. julia_expression_spec(): The actual expression specification, returned as a Julia object.
+        This will get passed as `expression_spec` in `SymbolicRegression.Options`.
+    2. create_exports(), which will be used to create the exports of the equations, such as
         the executable format, the SymPy format, etc.

     It may also optionally implement:
@@ -39,13 +38,8 @@ class AbstractExpressionSpec(ABC):
     """

     @abstractmethod
-    def julia_expression_type(self) -> AnyValue:
-        """The expression type"""
-        pass  # pragma: no cover
-
-    @abstractmethod
-    def julia_expression_options(self) -> AnyValue:
-        """The expression options"""
+    def julia_expression_spec(self) -> AnyValue:
+        """The expression specification"""
         pass  # pragma: no cover

     @abstractmethod
@@ -82,11 +76,8 @@ def supports_latex(self) -> bool:
 class ExpressionSpec(AbstractExpressionSpec):
     """The default expression specification, with no special behavior."""

-    def julia_expression_type(self):
-        return SymbolicRegression.Expression
-
-    def julia_expression_options(self):
-        return jl.NamedTuple()
+    def julia_expression_spec(self):
+        return SymbolicRegression.ExpressionSpec()

     def create_exports(
         self,
@@ -127,31 +118,39 @@ class TemplateExpressionSpec(AbstractExpressionSpec):
     This class allows you to specify how multiple sub-expressions should be combined
     in a structured way, with constraints on which variables each sub-expression can use.

-    Pass this to PySRRegressor with the `expression_spec` argument when you are using
-    the `TemplateExpression` expression type.
+    Pass this to PySRRegressor with the `expression_spec` argument.

     Parameters
     ----------
-    function_symbols : list[str]
-        List of symbols representing the inner expressions (e.g., ["f", "g"]).
-        These will be used as keys in the template structure.
     combine : str
         Julia function string that defines how the sub-expressions are combined.
-        Takes a NamedTuple of expressions and a tuple of data vectors.
-        For example: "((; f, g), (x1, x2, x3)) -> f(x1, x2) + g(x3)^2"
-        would constrain f to use x1,x2 and g to use x3.
-    num_features : dict[str, int]
-        Dictionary mapping function symbols to the number of features each can use.
-        For example: {"f": 2, "g": 1} means f takes 2 inputs and g takes 1.
-        If not provided, will be inferred from the combine function.
+        For example: "sin(f(x1, x2)) + g(x3)^2" would constrain f to use x1,x2 and g to use x3.
+    expressions : list[str]
+        List of symbols representing the inner expressions (e.g., ["f", "g"]).
+        These will be used as keys in the template structure.
+    variable_names : list[str]
+        List of variable names that will be used in the combine function.
+    parameters : dict[str, int], optional
+        Dictionary mapping parameter names to their lengths. For example, {"p1": 2, "p2": 1}
+        means p1 is a vector of length 2 and p2 is a vector of length 1. These parameters
+        will be optimized during the search.

     Examples
     --------
     ```python
     # Create template that combines f(x1, x2) and g(x3):
     expression_spec = TemplateExpressionSpec(
-        function_symbols=["f", "g"],
-        combine="((; f, g), (x1, x2, x3)) -> sin(f(x1, x2)) + g(x3)^2",
+        expressions=["f", "g"],
+        variable_names=["x1", "x2", "x3"],
+        combine="sin(f(x1, x2)) + g(x3)^2",
+    )
+
+    # With parameters:
+    expression_spec = TemplateExpressionSpec(
+        expressions=["f", "g"],
+        variable_names=["x1", "x2", "x3"],
+        parameters={"p1": 2, "p2": 1},
+        combine="p1[1] * sin(f(x1, x2)) + p1[2] * g(x3) + p2[1]",
     )

     # Use in PySRRegressor:
@@ -159,9 +158,47 @@ class TemplateExpressionSpec(AbstractExpressionSpec):
         expression_spec=expression_spec
     )
     ```
+
+    Notes
+    -----
+    You can also use differential operators in the template with `D(f, 1)(x)` to take
+    the derivative of f with respect to its first argument, evaluated at x.
     """

+    _spec_cache: dict[tuple[str, ...], AnyValue] = {}
+
+    @overload
+    def __init__(
+        self,
+        function_symbols: list[str],
+        combine: str,
+        num_features: dict[str, int] | None = None,
+    ) -> None: ...
+
+    @overload
+    def __init__(
+        self,
+        combine: str,
+        *,
+        expressions: list[str],
+        variable_names: list[str],
+        parameters: dict[str, int] | None = None,
+    ) -> None: ...
+
     def __init__(
+        self,
+        *args,
+        **kwargs,
+    ):
+        """Handle both formats with combine as explicit parameter"""
+        self._old_format = len(args) >= 2 or "function_symbols" in kwargs
+
+        if self._old_format:
+            self._load_old_format(*args, **kwargs)
+        else:
+            self._load_new_format(*args, **kwargs)
+
+    def _load_old_format(
         self,
         function_symbols: list[str],
         combine: str,
@@ -170,9 +207,69 @@ def __init__(
         self.function_symbols = function_symbols
         self.combine = combine
         self.num_features = num_features
+        # TODO: warn about old format after some versions

-    def julia_expression_type(self):
-        return SymbolicRegression.TemplateExpression
+    def _load_new_format(
+        self,
+        combine: str,
+        *,
+        expressions: list[str],
+        variable_names: list[str],
+        parameters: dict[str, int] | None = None,
+    ):
+        self.combine = combine
+        self.expressions = expressions
+        self.variable_names = variable_names
+        self.parameters = parameters
+
+    def _get_cache_key(self):
+        if self._old_format:
+            return (
+                "old",
+                str(self.function_symbols),
+                self.combine,
+                str(self.num_features),
+            )
+        else:
+            return (
+                "new",
+                self.combine,
+                str(self.expressions),
+                str(self.variable_names),
+                str(self.parameters),
+            )
+
+    def julia_expression_spec(self):
+        key = self._get_cache_key()
+        if key in self._spec_cache:
+            return self._spec_cache[key]
+
+        if self._old_format:
+            result = SymbolicRegression.TemplateExpressionSpec(
+                structure=self.julia_expression_options().structure
+            )
+        else:
+            result = self._call_template_macro()
+
+        self._spec_cache[key] = result
+        return result
+
+    def _call_template_macro(self):
+        return jl.seval(self._template_macro_str())
+
+    def _template_macro_str(self):
+        template_inputs = [f"expressions=({', '.join(self.expressions) + ','})"]
+        if self.parameters:
+            template_inputs.append(
+                f"parameters=({', '.join([f'{p}={self.parameters[p]}' for p in self.parameters]) + ','})"
+            )
+        return dedent(
+            f"""
+            @template_spec({', '.join(template_inputs) + ','}) do {', '.join(self.variable_names)}
+                {self.combine}
+            end
+            """
+        )

     def julia_expression_options(self):
         f_combine = jl.seval(self.combine)
@@ -243,11 +340,10 @@ class ParametricExpressionSpec(AbstractExpressionSpec):
     def __init__(self, max_parameters: int):
         self.max_parameters = max_parameters

-    def julia_expression_type(self):
-        return SymbolicRegression.ParametricExpression
-
-    def julia_expression_options(self):
-        return jl.seval("NamedTuple{(:max_parameters,)}")((self.max_parameters,))
+    def julia_expression_spec(self):
+        return SymbolicRegression.ParametricExpressionSpec(
+            max_parameters=self.max_parameters
+        )

     @property
     def evaluates_in_julia(self):
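As a quick way to see what a new-format spec expands to on the Julia side, the generated `@template_spec` program can be printed directly. This mirrors the unit tests added at the end of this diff; note that `_template_macro_str` is a private helper, so treat this as a debugging aid rather than public API:

```python
from pysr import TemplateExpressionSpec

spec = TemplateExpressionSpec(
    combine="p[1] * f(x)",
    expressions=["f"],
    variable_names=["x"],
    parameters={"p": 1},
)
print(spec._template_macro_str())
# Prints (up to leading/trailing whitespace):
# @template_spec(expressions=(f,), parameters=(p=1,),) do x
#     p[1] * f(x)
# end
```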
diff --git a/pysr/feature_selection.py b/pysr/feature_selection.py
index 8c8358fd..13fca487 100644
--- a/pysr/feature_selection.py
+++ b/pysr/feature_selection.py
@@ -1,5 +1,6 @@
 """Functions for doing feature selection during preprocessing."""

+import logging
 from typing import cast

 import numpy as np
@@ -8,6 +9,8 @@
 from .utils import ArrayLike

+pysr_logger = logging.getLogger(__name__)
+

 def run_feature_selection(
     X: ndarray,
@@ -44,7 +47,7 @@ def _handle_feature_selection(
 ):
     if select_k_features is not None:
         selection = run_feature_selection(X, y, select_k_features)
-        print(f"Using features {[variable_names[i] for i in selection]}")
+        pysr_logger.info(f"Using features {[variable_names[i] for i in selection]}")
         X = X[:, selection]
     else:
         selection = None
diff --git a/pysr/juliapkg.json b/pysr/juliapkg.json
index f6d709c2..4f190bce 100644
--- a/pysr/juliapkg.json
+++ b/pysr/juliapkg.json
@@ -3,7 +3,7 @@
     "packages": {
         "SymbolicRegression": {
             "uuid": "8254be44-1295-4e6a-a16d-46603ac705cb",
-            "version": "=1.5.1"
+            "version": "~1.8.0"
         },
         "Serialization": {
             "uuid": "9e88b42a-f829-5b0c-bbe9-9e923198166b",
diff --git a/pysr/param_groupings.yml b/pysr/param_groupings.yml
index 769e6647..d3b56aa1 100644
--- a/pysr/param_groupings.yml
+++ b/pysr/param_groupings.yml
@@ -13,6 +13,7 @@
   - The Objective:
     - elementwise_loss
     - loss_function
+    - loss_function_expression
     - model_selection
     - dimensional_constraint_penalty
     - dimensionless_constants_only
diff --git a/pysr/sr.py b/pysr/sr.py
index 8da6cfee..aeadde68 100644
--- a/pysr/sr.py
+++ b/pysr/sr.py
@@ -1,6 +1,7 @@
 """Define the PySRRegressor scikit-learn interface."""

 import copy
+import logging
 import os
 import pickle as pkl
 import re
@@ -67,6 +68,8 @@

 ALREADY_RAN = False

+pysr_logger = logging.getLogger(__name__)
+

 def _process_constraints(
     binary_operators: list[str],
@@ -375,9 +378,12 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         You may pass a function with the same arguments as this (note
         that the name of the function doesn't matter). Here,
         both `prediction` and `dataset.y` are 1D arrays of length `dataset.n`.
-        If using `batching`, then you should add an
-        `idx` argument to the function, which is `nothing`
-        for non-batched, and a 1D array of indices for batched.
+        Default is `None`.
+    loss_function_expression : str
+        Similar to `loss_function`, but takes as input the full
+        expression object as the first argument, rather than
+        the innermost `AbstractExpressionNode`. This is useful
+        for specifying custom loss functions on `TemplateExpressionSpec`.
         Default is `None`.
     complexity_of_operators : dict[str, int | float]
         If you would like to use a complexity other than 1 for an
@@ -806,6 +812,7 @@ def __init__(
         nested_constraints: dict[str, dict[str, int]] | None = None,
         elementwise_loss: str | None = None,
         loss_function: str | None = None,
+        loss_function_expression: str | None = None,
         complexity_of_operators: dict[str, int | float] | None = None,
         complexity_of_constants: int | float | None = None,
         complexity_of_variables: int | float | list[int | float] | None = None,
@@ -910,6 +917,7 @@ def __init__(
         # - Loss parameters
         self.elementwise_loss = elementwise_loss
         self.loss_function = loss_function
+        self.loss_function_expression = loss_function_expression
         self.complexity_of_operators = complexity_of_operators
         self.complexity_of_constants = complexity_of_constants
         self.complexity_of_variables = complexity_of_variables
@@ -1100,7 +1108,7 @@ def from_file(

         pkl_filename = Path(run_directory) / "checkpoint.pkl"
         if pkl_filename.exists():
-            print(f"Attempting to load model from {pkl_filename}...")
+            pysr_logger.info(f"Attempting to load model from {pkl_filename}...")
             assert binary_operators is None
             assert unary_operators is None
             assert n_features_in is None
@@ -1114,9 +1122,15 @@ def from_file(
             if "equations_" not in model.__dict__ or model.equations_ is None:
                 model.refresh()

+            if model.expression_spec is not None:
+                warnings.warn(
+                    "Loading model from checkpoint file with a non-default expression spec "
+                    "is not fully supported as it relies on dynamic objects. This may result in unexpected behavior.",
+                )
+
             return model
         else:
-            print(
+            pysr_logger.info(
                 f"Checkpoint file {pkl_filename} does not exist. "
                 "Attempting to recreate model from scratch..."
             )
@@ -1219,12 +1233,16 @@ def __getstate__(self) -> dict[str, Any]:
         )
         state_keys_containing_lambdas = ["extra_sympy_mappings", "extra_torch_mappings"]
         for state_key in state_keys_containing_lambdas:
-            if state[state_key] is not None and show_pickle_warning:
-                warnings.warn(
-                    f"`{state_key}` cannot be pickled and will be removed from the "
-                    "serialized instance. When loading the model, please redefine "
-                    f"`{state_key}` at runtime."
-                )
+            warn_msg = (
+                f"`{state_key}` cannot be pickled and will be removed from the "
+                "serialized instance. When loading the model, please redefine "
+                f"`{state_key}` at runtime."
+            )
+            if state[state_key] is not None:
+                if show_pickle_warning:
+                    warnings.warn(warn_msg)
+                else:
+                    pysr_logger.debug(warn_msg)
         state_keys_to_clear = state_keys_containing_lambdas
         state_keys_to_clear.append("logger_")
         pickled_state = {
@@ -1267,7 +1285,7 @@ def _checkpoint(self):
             try:
                 pkl.dump(self, f)
             except Exception as e:
-                print(f"Error checkpointing model: {e}")
+                pysr_logger.debug(f"Error checkpointing model: {e}")
         self.show_pickle_warnings_ = True

     def get_pkl_filename(self) -> Path:
@@ -1428,11 +1446,6 @@ def _validate_and_modify_params(self) -> _DynamicallySetParams:
         elif self.maxsize < 7:
             raise ValueError("PySR requires a maxsize of at least 7")

-        if self.elementwise_loss is not None and self.loss_function is not None:
-            raise ValueError(
-                "You cannot set both `elementwise_loss` and `loss_function`."
-            )
-
         # NotImplementedError - Values that could be supported at a later time
         if self.optimizer_algorithm not in VALID_OPTIMIZER_ALGORITHMS:
             raise NotImplementedError(
@@ -1744,7 +1757,7 @@ def _pre_transform_training_data(
             self.selection_mask_ = selection_mask
             self.feature_names_in_ = _check_feature_names_in(self, variable_names)
             self.display_feature_names_in_ = self.feature_names_in_
-            print(f"Using features {self.feature_names_in_}")
+            pysr_logger.info(f"Using features {self.feature_names_in_}")

         # Denoising transformation
         if self.denoise:
@@ -1816,7 +1829,7 @@ def _run(

         # Start julia backend processes
         if not ALREADY_RAN and runtime_params.update_verbosity != 0:
-            print("Compiling Julia backend...")
+            pysr_logger.info("Compiling Julia backend...")

         parallelism, numprocs = _map_parallelism_params(
             self.parallelism, self.procs, getattr(self, "multithreading", None)
@@ -1892,6 +1905,11 @@ def _run(
         custom_full_objective = jl.seval(
             str(self.loss_function) if self.loss_function is not None else "nothing"
         )
+        custom_loss_expression = jl.seval(
+            str(self.loss_function_expression)
+            if self.loss_function_expression is not None
+            else "nothing"
+        )

         early_stop_condition = jl.seval(
             str(self.early_stop_condition)
@@ -1964,11 +1982,11 @@ def _run(
             complexity_of_constants=self.complexity_of_constants,
             complexity_of_variables=complexity_of_variables,
             complexity_mapping=complexity_mapping,
-            expression_type=self.expression_spec_.julia_expression_type(),
-            expression_options=self.expression_spec_.julia_expression_options(),
+            expression_spec=self.expression_spec_.julia_expression_spec(),
             nested_constraints=nested_constraints,
             elementwise_loss=custom_loss,
             loss_function=custom_full_objective,
+            loss_function_expression=custom_loss_expression,
             maxsize=int(self.maxsize),
             output_directory=_escape_filename(self.output_directory_),
             npopulations=int(self.populations),
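As a usage sketch for the new `loss_function_expression` option documented above: the Julia function receives the full expression object (an `Expression` under the default spec) rather than the innermost node. The function body below mirrors the objective exercised in the test suite; the mean-squared loss is illustrative:

```python
from pysr import PySRRegressor

model = PySRRegressor(
    binary_operators=["+", "*"],
    loss_function_expression="""
    function my_objective(tree::Expression{T}, dataset::Dataset{T}, options::Options) where T
        # Evaluate the whole expression object on the dataset:
        prediction, flag = eval_tree_array(tree, dataset.X, options)
        !flag && return T(Inf)
        return sum(abs2, prediction .- dataset.y) / length(prediction)
    end
    """,
)
```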
diff --git a/pysr/test/test_main.py b/pysr/test/test_main.py
index 2f5f440a..cb400846 100644
--- a/pysr/test/test_main.py
+++ b/pysr/test/test_main.py
@@ -1,3 +1,4 @@
+import functools
 import importlib
 import os
 import pickle as pkl
@@ -7,11 +8,18 @@
 import unittest
 import warnings
 from pathlib import Path
+from textwrap import dedent

 import numpy as np
 import pandas as pd
 import sympy  # type: ignore
-from sklearn.utils.estimator_checks import check_estimator
+
+try:
+    from sklearn.utils.estimator_checks import estimator_checks_generator
+except ImportError:
+    from sklearn.utils.estimator_checks import check_estimator
+
+    estimator_checks_generator = functools.partial(check_estimator, generate_only=True)

 from pysr import (
     ParametricExpressionSpec,
@@ -45,6 +53,9 @@
     "SYMBOLIC_REGRESSION_IS_TESTING", "true"
 )

+# Import from juliacall at end:
+from juliacall import JuliaError  # type: ignore
+

 class TestPipeline(unittest.TestCase):
     def setUp(self):
@@ -95,23 +106,32 @@ def test_linear_relation_weighted_bumper(self):
         )

     def test_multiprocessing_turbo_custom_objective(self):
+        for loss_key in ["loss_function", "loss_function_expression"]:
+            with self.subTest(loss_key=loss_key):
+                self._multiprocessing_turbo_custom_objective(loss_key)
+
+    def _multiprocessing_turbo_custom_objective(self, loss_key):
         rstate = np.random.RandomState(0)
         y = self.X[:, 0]
         y += rstate.randn(*y.shape) * 1e-4
+
+        node_type = "Expression" if loss_key == "loss_function_expression" else "Node"
         model = PySRRegressor(
             **self.default_test_kwargs,
             # Turbo needs to work with unsafe operators:
             parallelism="multiprocessing",
             turbo=True,
             early_stop_condition="stop_if(loss, complexity) = loss < 1e-10 && complexity == 1",
-            loss_function="""
-            function my_objective(tree::Node{T}, dataset::Dataset{T}, options::Options) where T
-                prediction, flag = eval_tree_array(tree, dataset.X, options)
-                !flag && return T(Inf)
-                abs3(x) = abs(x) ^ 3
-                return sum(abs3, prediction .- dataset.y) / length(prediction)
-            end
-            """,
+            **{
+                loss_key: f"""
+                function my_objective(tree::{node_type}{{T}}, dataset::Dataset{{T}}, options::Options) where T
+                    prediction, flag = eval_tree_array(tree, dataset.X, options)
+                    !flag && return T(Inf)
+                    abs3(x) = abs(x) ^ 3
+                    return sum(abs3, prediction .- dataset.y) / length(prediction)
+                end
+                """
+            },
         )
         model.fit(self.X, y)
         print(model.equations_)
@@ -578,6 +598,50 @@ def test_template_expressions_and_custom_complexity(self):
         with self.assertRaises(ValueError):
             model.latex_table()

+    def test_template_expression_with_parameters(self):
+        # Create random data
+        X_continuous = self.rstate.uniform(-1, 1, (100, 2))
+        category = self.rstate.randint(0, 3, 100)  # 3 classes
+        X = np.hstack([X_continuous, category[:, None] + 1])
+
+        # Ground truth: p[class] * x1^2 + x2 where p = [0.5, 1.0, 2.0]
+        true_p = [0.5, 1.0, 2.0]
+        y = np.array(
+            [true_p[c] * x1**2 + x2 for x1, x2, c in zip(X[:, 0], X[:, 1], category)]
+        )
+
+        # Create model with template that includes parameters
+        model = PySRRegressor(
+            **self.default_test_kwargs,
+            expression_spec=TemplateExpressionSpec(
+                "p[class] * x1^2 + f(x2)",
+                expressions=["f"],
+                parameters={"p": 3},
+                variable_names=["x1", "x2", "class"],
+            ),
+            binary_operators=["+", "-", "*", "/"],
+            unary_operators=[],
+            maxsize=10,
+            early_stop_condition="stop_if(loss, complexity) = loss < 1e-10 && complexity <= 3",
+        )
+
+        model.fit(X, y)
+
+        # Test on new data
+        X_continuous_test = self.rstate.uniform(-1, 1, (25, 2))
+        category_test = self.rstate.randint(0, 3, 25)
+        X_test = np.hstack([X_continuous_test, category_test[:, None] + 1])
+        y_test = np.array(
+            [
+                true_p[c] * x1**2 + x2
+                for x1, x2, c in zip(X_test[:, 0], X_test[:, 1], category_test)
+            ]
+        )
+        y_pred = model.predict(X_test)
+
+        test_mse = np.mean((y_test - y_pred) ** 2)
+        self.assertLess(test_mse, 1e-5)
+
     def test_parametric_expression(self):
         # Create data with two classes
         n_points = 100
@@ -873,9 +937,8 @@ def test_scikit_learn_compatibility(self):
             temp_equation_file=True,
         )  # Return early.
-        check_generator = check_estimator(model, generate_only=True)
         exception_messages = []
-        for _, check in check_generator:
+        for _, check in estimator_checks_generator(model):
             if check.func.__name__ in {
                 # We can use complex data, so avoid this check
                 "check_complex_data",
@@ -1046,9 +1109,9 @@ def test_bad_kwargs(self):
         bad_kwargs = [
             dict(
                 kwargs=dict(
-                    elementwise_loss="g(x, y) = 0.0", loss_function="f(*args) = 0.0"
+                    elementwise_loss="g(x, y) = 0.0", loss_function="f(args...) = 0.0"
                 ),
-                error=ValueError,
+                error=JuliaError,
             ),
             dict(
                 kwargs=dict(maxsize=3),
@@ -1089,7 +1152,8 @@ def test_bad_kwargs(self):
     def test_suggest_keywords(self):
         # Easy
         self.assertEqual(
-            _suggest_keywords(PySRRegressor, "loss_function"), ["loss_function"]
+            _suggest_keywords(PySRRegressor, "loss_function"),
+            ["loss_function", "loss_function_expression"],
         )

         # More complex, and with error
@@ -1466,6 +1530,121 @@ def test_unit_propagation(self):

     # TODO: Determine desired behavior if second .fit() call does not have units


+class TestTemplateExpressionSpec(unittest.TestCase):
+    def _check_macro_str(self, spec, expected_str):
+        self.assertEqual(
+            spec._template_macro_str().strip(), dedent(expected_str).strip()
+        )
+
+    def test_single_expression_no_params_single_variable(self):
+        spec = TemplateExpressionSpec(
+            combine="f(x)", expressions=["f"], variable_names=["x"]
+        )
+        self._check_macro_str(
+            spec,
+            """\
+            @template_spec(expressions=(f,),) do x
+                f(x)
+            end
+            """,
+        )
+
+    def test_multiple_expressions_no_params_multiple_variables(self):
+        spec = TemplateExpressionSpec(
+            combine="f(x, y) + g(z)",
+            expressions=["f", "g"],
+            variable_names=["x", "y", "z"],
+        )
+        self._check_macro_str(
+            spec,
+            """
+            @template_spec(expressions=(f, g,),) do x, y, z
+                f(x, y) + g(z)
+            end
+            """,
+        )
+
+    def test_single_expression_single_param_single_variable(self):
+        spec = TemplateExpressionSpec(
+            combine="p[1] * f(x)",
+            expressions=["f"],
+            variable_names=["x"],
+            parameters={"p": 1},
+        )
+        self._check_macro_str(
+            spec,
+            """
+            @template_spec(expressions=(f,), parameters=(p=1,),) do x
+                p[1] * f(x)
+            end
+            """,
+        )
+
+    def test_multiple_expressions_multiple_params_multiple_variables(self):
+        spec = TemplateExpressionSpec(
+            combine="p1[1]*f(x,y) + p2[1]*g(z)",
+            expressions=["f", "g"],
+            variable_names=["x", "y", "z"],
+            parameters={"p1": 2, "p2": 3},
+        )
+        self._check_macro_str(
+            spec,
+            """
+            @template_spec(expressions=(f, g,), parameters=(p1=2, p2=3,),) do x, y, z
+                p1[1]*f(x,y) + p2[1]*g(z)
+            end
+            """,
+        )
+
+    def test_complex_variable_names(self):
+        spec = TemplateExpressionSpec(
+            combine="f(var1) * g(var2)",
+            expressions=["f", "g"],
+            variable_names=["var1", "var2"],
+        )
+        self._check_macro_str(
+            spec,
+            """
+            @template_spec(expressions=(f, g,),) do var1, var2
+                f(var1) * g(var2)
+            end
+            """,
+        )
+
+    def test_mixed_parameter_types(self):
+        spec = TemplateExpressionSpec(
+            combine="alpha*f(x) + beta*g(y)",
+            expressions=["f", "g"],
+            variable_names=["x", "y"],
+            parameters={"alpha": 1, "beta": 2},
+        )
+        self._check_macro_str(
+            spec,
+            """
+            @template_spec(expressions=(f, g,), parameters=(alpha=1, beta=2,),) do x, y
+                alpha*f(x) + beta*g(y)
+            end
+            """,
+        )
+
+    def test_empty_parameters_case(self):
+        spec = TemplateExpressionSpec(
+            combine="f(x)", expressions=["f"], variable_names=["x"], parameters={}
+        )
+        self.assertNotIn("parameters", spec._template_macro_str())
+
+    def test_maximum_parameters_expressions(self):
+        spec = TemplateExpressionSpec(
+            combine=" + ".join([f"f{i}(x)" for i in range(5)]),
+            expressions=[f"f{i}" for i in range(5)],
+            variable_names=["x"],
+            parameters={f"p{i}": i + 1 for i in range(5)},
+        )
+        macro_str = spec._template_macro_str()
+        self.assertIn("expressions=(f0, f1, f2, f3, f4,),", macro_str)
+        self.assertIn("parameters=(p0=1, p1=2, p2=3, p3=4, p4=5,),", macro_str)
+
+
 def runtests(just_tests=False):
     """Run all tests in test.py."""
     test_cases = [