From 853656fbf59a973364afe99d42ab9690ecc79508 Mon Sep 17 00:00:00 2001
From: Gavin Macaulay <gavin@macaulay.co.nz>
Date: Wed, 21 Aug 2024 15:06:38 +1200
Subject: [PATCH] Polish API and prepare for a release (#21)

* Add a new page about other modelling software

* add more modelling software

* add more software

* Revert "add more software"

This reverts commit 0b9bf6c83da4ee7e85eec15a21ae039dfd2517b4.

* Resources dir for docs, better coordinate system figure

* Simplify API for BenchmarkData and ReferenceModels

* Update to work with API changes

* API polishing

Move model_type out of the calculate_ts() arguments and into the
DataFrame/DataArray of model parameters
Make the reference model API more Pythonic
Update example code to match these changes

* Up the version number
---
 pyproject.toml                  |  2 +-
 src/echosms/__init__.py         |  4 ++--
 src/echosms/referencemodels.py  |  5 ++---
 src/echosms/scattermodelbase.py | 21 +++++++++------------
 src/echosms/utils.py            | 19 +++++++++++--------
 src/example_code.py             | 23 ++++++++++++++---------
 6 files changed, 39 insertions(+), 35 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index c69d642..1d95d0a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,7 +7,7 @@ packages = ["src/echosms"]
 
 [project]
 name = 'echosms'
-version = '0.0.3'
+version = '0.1.0'
 license = {file = "LICENSE"}
 keywords = ["acoustic", "backscatter", "model"]
 authors = [
diff --git a/src/echosms/__init__.py b/src/echosms/__init__.py
index 214fbcf..c7f2e9f 100644
--- a/src/echosms/__init__.py
+++ b/src/echosms/__init__.py
@@ -1,5 +1,5 @@
 """Setup the public API for echoSMs."""
-from .utils import k, eta, h1, df_from_dict, da_from_dict
+from .utils import k, eta, h1, as_dataframe, as_dataarray
 from .scattermodelbase import ScatterModelBase
 from .benchmarkdata import BenchmarkData
 from .referencemodels import ReferenceModels
@@ -8,4 +8,4 @@
 from .dcmmodel import DCMModel
 
 __all__ = ['ScatterModelBase', 'BenchmarkData', 'ReferenceModels', 'MSSModel', 'PSMSModel',
-           'DCMModel', 'k', 'eta', 'h1', 'da_from_dict', 'df_from_dict']
+           'DCMModel', 'k', 'eta', 'h1', 'as_dataframe', 'as_dataarray']
diff --git a/src/echosms/referencemodels.py b/src/echosms/referencemodels.py
index 446cfb2..1b4dbc6 100644
--- a/src/echosms/referencemodels.py
+++ b/src/echosms/referencemodels.py
@@ -70,8 +70,8 @@ def specification(self, name):
     def parameters(self, name):
         """Model parameters for a particular model.
 
-        Model parameters are a subset of the model specification where the non-numerical
-        items have been removed.
+        Model parameters are a subset of the model specification where the metadata items have
+        been removed.
 
         Parameters
         ----------
@@ -94,6 +94,5 @@ def parameters(self, name):
         del p['name']
         del p['shape']
         del p['description']
-        del p['model_type']
         del p['source']
         return p
diff --git a/src/echosms/scattermodelbase.py b/src/echosms/scattermodelbase.py
index c43d925..89ef0a0 100644
--- a/src/echosms/scattermodelbase.py
+++ b/src/echosms/scattermodelbase.py
@@ -2,7 +2,7 @@
 
 import abc
 import numpy as np
-from .utils import df_from_dict
+from .utils import as_dataframe
 import pandas as pd
 import xarray as xr
 
@@ -42,7 +42,7 @@ def __init__(self):
         # An indication of the maximum ka value that this model provides accurate results for
         self.max_ka = np.nan  # [1]
 
-    def calculate_ts(self, data, model_type, multiprocess=False):
+    def calculate_ts(self, data, multiprocess=False):
         """Calculate the TS for many parameter sets.
 
         Parameters
@@ -54,10 +54,6 @@ def calculate_ts(self, data, model_type, multiprocess=False):
             parameters in calculate_ts_single(). The TS will be calculated for all combinations of
             the coordinate variables. If dictionary, it will be converted to a DataFrame first.
 
-        model_type : string
-            The type of model boundary to apply. Valid values are given in the model_types class
-            variable.
-
         multiprocess : boolean
             Split the ts calculation across CPU cores.
 
@@ -68,7 +64,7 @@ def calculate_ts(self, data, model_type, multiprocess=False):
 
         """
         if isinstance(data, dict):
-            data = df_from_dict(data)
+            data = as_dataframe(data)
         elif isinstance(data, pd.DataFrame):
             pass
         elif isinstance(data, xr.DataArray):
@@ -80,19 +76,20 @@ def calculate_ts(self, data, model_type, multiprocess=False):
 
         if multiprocess:
             # Using mapply:
-            # ts = mapply(data, self.__ts_helper, args=(model_type,), axis=1)
+            # ts = mapply(data, self.__ts_helper, axis=1)
             # Using swifter
-            # ts = df.swifter.apply(self.__ts_helper, args=(model_type,), axis=1)
-            ts = data.apply(self.__ts_helper, args=(model_type,), axis=1)
+            # ts = df.swifter.apply(self.__ts_helper, axis=1)
+            ts = data.apply(self.__ts_helper, axis=1)
         else:  # this uses just one CPU
-            ts = data.apply(self.__ts_helper, args=(model_type,), axis=1)
+            # model_type is no longer passed separately; it is one of the columns in data
+            ts = data.apply(self.__ts_helper, axis=1)
 
         return ts.to_numpy()  # TODO - return data type that matches the input data type
 
     def __ts_helper(self, *args):
         """Convert function arguments and call calculate_ts_single()."""
         p = args[0].to_dict()  # so we can use it for keyword arguments
-        return self.calculate_ts_single(**p, model_type=args[1])
+        return self.calculate_ts_single(**p)
 
     @abc.abstractmethod
     def calculate_ts_single(self):
diff --git a/src/echosms/utils.py b/src/echosms/utils.py
index fc6a26d..31c6b93 100644
--- a/src/echosms/utils.py
+++ b/src/echosms/utils.py
@@ -3,9 +3,10 @@
 import pandas as pd
 import xarray as xr
 from scipy.special import spherical_jn, spherical_yn
+from collections.abc import Iterable
 
 
-def df_from_dict(params: dict) -> pd.DataFrame:
+def as_dataframe(params: dict) -> pd.DataFrame:
     """Convert model parameters from dict form to a Pandas DataFrame.
 
     Parameters
@@ -20,14 +21,15 @@ def df_from_dict(params: dict) -> pd.DataFrame:
         input dict.
 
     """
-    # Use meshgrid to do the Cartesian product, then reshape into a 2D array, then create a
-    # Pandas DataFrame() from that
-    return pd.DataFrame(np.array(
-        np.meshgrid(*tuple(params.values()))).T.reshape(-1, len(params)),
-        columns=params.keys())
+    # Use meshgrid to do the Cartesian product then create a Pandas DataFrame from that, having
+    # flattened the multidimensional arrays and using a dict to provide column names.
+    # This preserves the differing dtypes in each column (unlike other ways of
+    # constructing the DataFrame).
+    return pd.DataFrame({k: t.flatten()
+                         for k, t in zip(params.keys(), np.meshgrid(*tuple(params.values())))})
 
 
-def da_from_dict(params: dict) -> xr.DataArray:
+def as_dataarray(params: dict) -> xr.DataArray:
     """Convert model parameters from dict form to a Xarray DataArray.
 
     Parameters
@@ -44,8 +46,9 @@ def da_from_dict(params: dict) -> xr.DataArray:
     """
     # Convert scalars to iterables so xarray is happier later on
     for k, v in params.items():
-        if not hasattr(v, '__iter__'):
+        if not isinstance(v, Iterable) or isinstance(v, str):
             params[k] = [v]
+
     # Lengths of each parameter array
     sz = [len(v) for k, v in params.items()]
     # Create the DataArray
diff --git a/src/example_code.py b/src/example_code.py
index 5490b27..c853cfd 100644
--- a/src/example_code.py
+++ b/src/example_code.py
@@ -7,7 +7,7 @@
 from echosms import MSSModel, PSMSModel, DCMModel
 from echosms import BenchmarkData
 from echosms import ReferenceModels
-from echosms import df_from_dict, da_from_dict
+from echosms import as_dataframe, as_dataarray
 
 # Load the reference model defintiions
 rm = ReferenceModels()
@@ -60,14 +60,14 @@
     for name in names:
         # Get the model parameters used in Jech et al. (2015) for a particular model.
         s = rm.specification(name[0])
-        m = rm.parameters(name[0])  # the subset of s with string items removed
+        m = rm.parameters(name[0])
 
         # Add frequencies and angle to the model parameters
         m['f'] = bm.freq_dataset['Frequency_kHz']*1e3  # [Hz]
         m['theta'] = 90.0
 
         # and run these
-        ts = mod.calculate_ts(m, model_type=s['model_type'])
+        ts = mod.calculate_ts(m)
 
         jech_index = np.mean(np.abs(ts - bmf[name[1]]))
 
@@ -108,14 +108,14 @@
 for name in names:
     # Get the model parameters used in Jech et al. (2015) for a particular model.
     s = rm.specification(name[0])
-    m = rm.parameters(name[0])  # the subset of s with string items removed
+    m = rm.parameters(name[0])
 
     # Add frequencies and angle to the model parameters
     m['f'] = 38000  # [Hz]
     m['theta'] = bmt['Angle_deg']
 
     # and run these
-    ts = mod.calculate_ts(m, model_type=s['model_type'])
+    ts = mod.calculate_ts(m)
 
     jech_index = np.mean(np.abs(ts - bmt[name[1]]))
 
@@ -146,13 +146,13 @@
 m['target_rho'] = np.arange(1020, 1030, 1)  # [kg/m^3]
 m['theta'] = [0, 90.0, 180.0]
 # can convert this to a dataframe
-models_df = df_from_dict(m)
+models_df = as_dataframe(m)
 # could also make a DataFrame of parameters that are not just the combination of all input
 # parameters. This offers a way to specify a more tailored set of model parameters.
 
 print(f'Running {len(models_df)} models')
 # and run
-ts = mss.calculate_ts(models_df, model_type='fluid filled', multiprocess=True)
+ts = mss.calculate_ts(models_df, multiprocess=True)
 
 # And can then add the ts to the params dataframe for ease of selecting and plotting the results
 models_df['ts'] = ts
@@ -174,14 +174,19 @@
           'f': np.linspace(12, 100, num=400) * 1000,
           'theta': np.arange(0, 180, 1),
           'a': 0.07,
+          'model_type': 'fluid filled',
           'target_c': 1450,
           'target_rho': 1250}
 
 # Instead of converting those to a dataframe, an xarray can be used.
-params_xa = da_from_dict(params)
+params_xa = as_dataarray(params)
 
 # how many models runs would that be?
 print(f'Running {np.prod(params_xa.shape)} models!')
 
 # and is called the same way as for the dataframe
-ts = mss.calculate_ts(params_xa, model_type='fluid filled', multiprocess=True)
+if False:  # because it takes a long time to run (as multiprocess is not enabled internally)
+    ts = mss.calculate_ts(params_xa, multiprocess=True)
+
+# and it can be inserted into params_xa
+# TODO once the data is returned in an appropriate form