diff --git a/docs/conf.py b/docs/conf.py index 8d8eeafe..642e9179 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -45,7 +45,7 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'classic' +html_theme = 'sphinx_rtd_theme' # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, diff --git a/docs/exetera.core.rst b/docs/exetera.core.rst index 4ee9e9a1..75a3870f 100644 --- a/docs/exetera.core.rst +++ b/docs/exetera.core.rst @@ -29,7 +29,7 @@ exetera.core.dataset module :show-inheritance: exetera.core.dataframe module ---------------------------- +----------------------------- .. automodule:: exetera.core.dataframe :members: diff --git a/docs/exetera.processing.rst b/docs/exetera.processing.rst index 1b656a6c..da1b330a 100644 --- a/docs/exetera.processing.rst +++ b/docs/exetera.processing.rst @@ -11,6 +11,14 @@ exetera.processing.numpy\_buffer module :members: :undoc-members: :show-inheritance: + +exetera.processing.date\_time\_helpers module +--------------------------------------------- + +.. automodule:: exetera.processing.date_time_helpers + :members: + :undoc-members: + :show-inheritance: Module contents --------------- diff --git a/docs/index.rst b/docs/index.rst index 36f5a838..22769326 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -6,6 +6,11 @@ Welcome to ExeTera's documentation! =================================== +ExeTera is a tool for creating reproducible analysis pipelines for large tabular datasets. + +This documentation is for the API of the `ExeTera Package `_. +The `wiki `_ contains in-depth documentation on the concepts behind this software, usage examples, and developer resources such as the roadmap for future releases. + .. 
toctree:: :maxdepth: 2 :caption: Contents: diff --git a/exetera/bin/exetera_perf_tests.py b/exetera/bin/exetera_perf_tests.py index fa352c78..0fd6cd79 100644 --- a/exetera/bin/exetera_perf_tests.py +++ b/exetera/bin/exetera_perf_tests.py @@ -350,17 +350,18 @@ def read_fields_from_hdf5(file_name, field_count): data = field.data[:] -# generate_dataset(1 << 25, 16) - -new_prep(1<<28, 4) -# new_to_hdf5(16) -# new_hs_test(4) -new_pd_test(1) -# pd_test_1(1 << 24, 64) -#hs_test_1(1 << 21, 16) -# raw_np_test_1(1 << 29, 4) -# minimal_test_1(1 << 29, 16) -# iterator_test_1(1 << 24) - -# read_id_from_csv('/home/ben/covid/patients_export_geocodes_20200830040058.csv', 1) -# read_fields_from_hdf5('/home/ben/covid/ds_20200830_full.hdf5', 16) +if __name__ == "__main__": + # generate_dataset(1 << 25, 16) + + new_prep(1<<28, 4) + # new_to_hdf5(16) + # new_hs_test(4) + new_pd_test(1) + # pd_test_1(1 << 24, 64) + #hs_test_1(1 << 21, 16) + # raw_np_test_1(1 << 29, 4) + # minimal_test_1(1 << 29, 16) + # iterator_test_1(1 << 24) + + # read_id_from_csv('/home/ben/covid/patients_export_geocodes_20200830040058.csv', 1) + # read_fields_from_hdf5('/home/ben/covid/ds_20200830_full.hdf5', 16) diff --git a/exetera/bin/journal_test.py b/exetera/bin/journal_test.py index 4427d052..ca680a5d 100644 --- a/exetera/bin/journal_test.py +++ b/exetera/bin/journal_test.py @@ -3,11 +3,12 @@ from exetera.core.session import Session from exetera.core.journal import journal_test_harness -schema_fn = '/home/ben/covid/covid_schema.json' -old_fn = '/home/ben/covid/ds_20200801_base.hdf5' -new_fn = '/home/ben/covid/ds_20200901_base.hdf5' -dest_fn = '/home/ben/covid/ds_journal.hdf5' - -with open(schema_fn) as f: - schema = load_schema(f) -journal_test_harness(Session(), schema, old_fn, new_fn, dest_fn) +if __name__ == "__main__": + schema_fn = '/home/ben/covid/covid_schema.json' + old_fn = '/home/ben/covid/ds_20200801_base.hdf5' + new_fn = '/home/ben/covid/ds_20200901_base.hdf5' + dest_fn = 
'/home/ben/covid/ds_journal.hdf5' + + with open(schema_fn) as f: + schema = load_schema(f) + journal_test_harness(Session(), schema, old_fn, new_fn, dest_fn) diff --git a/exetera/core/dataframe.py b/exetera/core/dataframe.py index 19d044d9..72c58133 100644 --- a/exetera/core/dataframe.py +++ b/exetera/core/dataframe.py @@ -36,6 +36,14 @@ class HDF5DataFrame(DataFrame): For a detailed explanation of DataFrame along with examples of its use, please refer to the wiki documentation at https://github.com/KCL-BMEIS/ExeTera/wiki/DataFrame-API + + :param name: name of the dataframe. + :param dataset: a dataset object, where this dataframe belongs to. + :param h5group: the h5group object to store the fields. If the h5group is not empty, acquire data from h5group + object directly. The h5group structure is h5group<-h5group-dataset structure, the later group has a + 'fieldtype' attribute and only one dataset named 'values'. So that the structure is mapped to + Dataframe<-Field-Field.data automatically. + :param dataframe: optional - replicate data from another dictionary of (name:str, field: Field). """ def __init__(self, dataset: Dataset, @@ -44,14 +52,6 @@ def __init__(self, """ Create a Dataframe object, that contains a dictionary of fields. User should always create dataframe by dataset.create_dataframe, otherwise the dataframe is not stored in the dataset. - - :param name: name of the dataframe. - :param dataset: a dataset object, where this dataframe belongs to. - :param h5group: the h5group object to store the fields. If the h5group is not empty, acquire data from h5group - object directly. The h5group structure is h5group<-h5group-dataset structure, the later group has a - 'fieldtype' attribute and only one dataset named 'values'. So that the structure is mapped to - Dataframe<-Field-Field.data automatically. - :param dataframe: optional - replicate data from another dictionary of (name:str, field: Field). 
""" self.name = name @@ -207,7 +207,7 @@ def __contains__(self, name): :param name: the name of the field to check :return: A boolean value indicating whether this DataFrame contains a Field with the - name in question + name in question """ if not isinstance(name, str): raise TypeError("The name must be a str object.") @@ -309,21 +309,22 @@ def rename(self, a single field to be renamed or you can provide a dictionary with a set of fields to be renamed. - ``` - # rename a single field - df.rename('a', 'b') - - # rename multiple fields - df.rename({'a': 'b', 'b': 'c', 'c': 'a'}) - ``` + Example:: + + # rename a single field + df.rename('a', 'b') + + # rename multiple fields + df.rename({'a': 'b', 'b': 'c', 'c': 'a'}) Field renaming can fail if the resulting set of renamed fields would have name clashes. If this is the case, none of the rename operations go ahead and the dataframe remains unmodified. + :param field: Either a string or a dictionary of name pairs, each of which is the existing - field name and the destination field name + field name and the destination field name :param field_to: Optional parameter containing a string, if `field` is a string. If 'field' - is a dictionary, parameter should not be set. - Field references remain valid after this operation and reflect their renaming. + is a dictionary, parameter should not be set. + Field references remain valid after this operation and reflect their renaming. :return: None """ @@ -498,26 +499,26 @@ def merge(left: DataFrame, Fields are written to the destination dataframe. If the field names clash, they will get appended with the strings specified in 'left_suffix' and 'right_suffix' respectively. - :params left: The left dataframe - :params right: The right dataframe - :left_on: The field corresponding to the left key used to perform the join. This is either the - the name of the field, or a field object. 
If it is a field object, it can be from another - dataframe but it must be the same length as the fields being joined. This can also be a tuple - of such values when performing joins on compound keys - :right_on: The field corresponding to the right key used to perform the join. This is either - the name of the field, or a field object. If it is a field object, it can be from another - dataframe but it must be the same length as the fields being joined. This can also be a tuple - of such values when performing joins on compound keys - :left_fields: Optional parameter listing which fields are to be joined from the left table. If - this is not set, all fields from the left table are joined - :right_fields: Optional parameter listing which fields are to be joined from the right table. - If this is not set, all fields from the right table are joined - :left_suffix: A string to be appended to fields from the left table if they clash with fields - from the right table. - :right_suffix: A string to be appended to fields from the right table if they clash with fields - from the left table. - :how: Optional parameter specifying the merge mode. It must be one of ('left', 'right', - 'inner', 'outer' or 'cross). If not set, the 'left' join is performed. + :param left: The left dataframe + :param right: The right dataframe + :param left_on: The field corresponding to the left key used to perform the join. This is either + the name of the field, or a field object. If it is a field object, it can be from another + dataframe but it must be the same length as the fields being joined. 
This can also be a tuple + of such values when performing joins on compound keys + :param left_fields: Optional parameter listing which fields are to be joined from the left table. If + this is not set, all fields from the left table are joined + :param right_fields: Optional parameter listing which fields are to be joined from the right table. + If this is not set, all fields from the right table are joined + :param left_suffix: A string to be appended to fields from the left table if they clash with fields from the + right table. + :param right_suffix: A string to be appended to fields from the right table if they clash with fields from the + left table. + :param how: Optional parameter specifying the merge mode. It must be one of ('left', 'right', + 'inner', 'outer' or 'cross'). If not set, the 'left' join is performed. """ diff --git a/exetera/core/dataset.py b/exetera/core/dataset.py index 008013aa..3f61f2ed 100644 --- a/exetera/core/dataset.py +++ b/exetera/core/dataset.py @@ -28,6 +28,13 @@ class HDF5Dataset(Dataset): For a detailed explanation of Dataset along with examples of its use, please refer to the wiki documentation at https://github.com/KCL-BMEIS/ExeTera/wiki/Dataset-API + + :param session: The session instance to include this dataset to. + :param dataset_path: The path of HDF5 file. + :param mode: the mode in which the dataset should be opened. This is one of "r", "r+" or "w". + :param name: the name that is associated with this dataset. This can be used to retrieve the dataset when + calling :py:meth:`~session.Session.get_dataset`. + :return: A HDF5Dataset instance. """ def __init__(self, session, dataset_path, mode, name): @@ -35,13 +42,6 @@ def __init__(self, session, dataset_path, mode, name): """ Create a Dataset instance that contains dataframes. The dataframes are represented in a dict() with the name(str) as a key. The construction should always be called by Session.open_dataset() otherwise the instance is not included in Session.datasets. 
If the HDF5 datafile contains group, the content in loaded into dataframes. - - :param session: The session instance to include this dataset to. - :param dataset_path: The path of HDF5 file. - :param mode: the mode in which the dataset should be opened. This is one of "r", "r+" or "w". - :param name: the name that is associated with this dataset. This can be used to retrieve the dataset when - calling :py:meth:`~session.Session.get_dataset`. - :return: A HDF5Dataset instance. """ self.name = name self._session = session @@ -202,6 +202,7 @@ def __setitem__(self, name: str, dataframe: DataFrame): def __delitem__(self, name: str): """ Delete a dataframe by del dataset[name]. + :param name: The name of dataframe to delete. :return: Boolean if the dataframe is deleted. """ @@ -215,6 +216,7 @@ def __delitem__(self, name: str): def delete_dataframe(self, dataframe: DataFrame): """ Remove dataframe from this dataset by the dataframe object. + :param dataframe: The dataframe instance to delete. :return: Boolean if the dataframe is deleted. """ diff --git a/exetera/core/fields.py b/exetera/core/fields.py index ce63b177..1f3c68fa 100644 --- a/exetera/core/fields.py +++ b/exetera/core/fields.py @@ -511,13 +511,13 @@ def apply_filter(self, filter_to_apply, target=None, in_place=False): is called unless 'in_place is set to true'. The user can specify a 'target' field that the filtered data is written to. - :param: filter_to_apply: a Field or numpy array that contains the boolean filter data - :param: target: if set, this is the field that is written do. This field must be writable. - If 'target' is set, 'in_place' must be False. - :param: in_place: if True, perform the operation destructively on this field. This field - must be writable. If 'in_place' is True, 'target' must be None + :param filter_to_apply: a Field or numpy array that contains the boolean filter data + :param target: if set, this is the field that is written to. This field must be writable. 
+ If 'target' is set, 'in_place' must be False. + :param in_place: if True, perform the operation destructively on this field. This field + must be writable. If 'in_place' is True, 'target' must be None :return: The filtered field. This is a new field instance unless 'target' is set, in which - case it is the target field, or unless 'in_place' is True, in which case it is this field. + case it is the target field, or unless 'in_place' is True, in which case it is this field. """ return FieldDataOps.apply_filter_to_indexed_field(self, filter_to_apply, target, in_place) @@ -527,13 +527,13 @@ def apply_index(self, index_to_apply, target=None, in_place=False): is called unless 'in_place is set to true'. The user can specify a 'target' field that the reindexed data is written to. - :param: index_to_apply: a Field or numpy array that contains the indices - :param: target: if set, this is the field that is written do. This field must be writable. - If 'target' is set, 'in_place' must be False. - :param: in_place: if True, perform the operation destructively on this field. This field - must be writable. If 'in_place' is True, 'target' must be None + :param index_to_apply: a Field or numpy array that contains the indices + :param target: if set, this is the field that is written to. This field must be writable. + If 'target' is set, 'in_place' must be False. + :param in_place: if True, perform the operation destructively on this field. This field + must be writable. If 'in_place' is True, 'target' must be None :return: The reindexed field. This is a new field instance unless 'target' is set, in which - case it is the target field, or unless 'in_place' is True, in which case it is this field. + case it is the target field, or unless 'in_place' is True, in which case it is this field. 
""" return FieldDataOps.apply_index_to_indexed_field(self, index_to_apply, target, in_place) @@ -588,13 +588,13 @@ def apply_filter(self, filter_to_apply, target=None, in_place=False): is called unless 'in_place is set to true'. The user can specify a 'target' field that the filtered data is written to. - :param: filter_to_apply: a Field or numpy array that contains the boolean filter data - :param: target: if set, this is the field that is written do. This field must be writable. - If 'target' is set, 'in_place' must be False. - :param: in_place: if True, perform the operation destructively on this field. This field - must be writable. If 'in_place' is True, 'target' must be None + :param filter_to_apply: a Field or numpy array that contains the boolean filter data + :param target: if set, this is the field that is written to. This field must be writable. + If 'target' is set, 'in_place' must be False. + :param in_place: if True, perform the operation destructively on this field. This field + must be writable. If 'in_place' is True, 'target' must be None :return: The filtered field. This is a new field instance unless 'target' is set, in which - case it is the target field, or unless 'in_place' is True, in which case it is this field. + case it is the target field, or unless 'in_place' is True, in which case it is this field. """ return FieldDataOps.apply_filter_to_field(self, filter_to_apply, target, in_place) @@ -604,13 +604,13 @@ def apply_index(self, index_to_apply, target=None, in_place=False): is called unless 'in_place is set to true'. The user can specify a 'target' field that the reindexed data is written to. - :param: index_to_apply: a Field or numpy array that contains the indices - :param: target: if set, this is the field that is written do. This field must be writable. - If 'target' is set, 'in_place' must be False. - :param: in_place: if True, perform the operation destructively on this field. This field - must be writable. 
If 'in_place' is True, 'target' must be None + :param index_to_apply: a Field or numpy array that contains the indices + :param target: if set, this is the field that is written to. This field must be writable. + If 'target' is set, 'in_place' must be False. + :param in_place: if True, perform the operation destructively on this field. This field + must be writable. If 'in_place' is True, 'target' must be None :return: The reindexed field. This is a new field instance unless 'target' is set, in which - case it is the target field, or unless 'in_place' is True, in which case it is this field. + case it is the target field, or unless 'in_place' is True, in which case it is this field. """ return FieldDataOps.apply_index_to_field(self, index_to_apply, target, in_place) @@ -663,13 +663,13 @@ def apply_filter(self, filter_to_apply, target=None, in_place=False): is called unless 'in_place is set to true'. The user can specify a 'target' field that the filtered data is written to. - :param: filter_to_apply: a Field or numpy array that contains the boolean filter data - :param: target: if set, this is the field that is written do. This field must be writable. - If 'target' is set, 'in_place' must be False. - :param: in_place: if True, perform the operation destructively on this field. This field - must be writable. If 'in_place' is True, 'target' must be None + :param filter_to_apply: a Field or numpy array that contains the boolean filter data + :param target: if set, this is the field that is written to. This field must be writable. + If 'target' is set, 'in_place' must be False. + :param in_place: if True, perform the operation destructively on this field. This field + must be writable. If 'in_place' is True, 'target' must be None :return: The filtered field. This is a new field instance unless 'target' is set, in which - case it is the target field, or unless 'in_place' is True, in which case it is this field. 
+ case it is the target field, or unless 'in_place' is True, in which case it is this field. """ return FieldDataOps.apply_filter_to_field(self, filter_to_apply, target, in_place) @@ -679,13 +679,13 @@ def apply_index(self, index_to_apply, target=None, in_place=False): is called unless 'in_place is set to true'. The user can specify a 'target' field that the reindexed data is written to. - :param: index_to_apply: a Field or numpy array that contains the indices - :param: target: if set, this is the field that is written do. This field must be writable. - If 'target' is set, 'in_place' must be False. - :param: in_place: if True, perform the operation destructively on this field. This field - must be writable. If 'in_place' is True, 'target' must be None + :param index_to_apply: a Field or numpy array that contains the indices + :param target: if set, this is the field that is written to. This field must be writable. + If 'target' is set, 'in_place' must be False. + :param in_place: if True, perform the operation destructively on this field. This field + must be writable. If 'in_place' is True, 'target' must be None :return: The reindexed field. This is a new field instance unless 'target' is set, in which - case it is the target field, or unless 'in_place' is True, in which case it is this field. + case it is the target field, or unless 'in_place' is True, in which case it is this field. """ return FieldDataOps.apply_index_to_field(self, index_to_apply, target, in_place) @@ -833,13 +833,13 @@ def apply_filter(self, filter_to_apply, target=None, in_place=False): is called unless 'in_place is set to true'. The user can specify a 'target' field that the filtered data is written to. - :param: filter_to_apply: a Field or numpy array that contains the boolean filter data - :param: target: if set, this is the field that is written do. This field must be writable. - If 'target' is set, 'in_place' must be False. 
- :param: in_place: if True, perform the operation destructively on this field. This field - must be writable. If 'in_place' is True, 'target' must be None + :param filter_to_apply: a Field or numpy array that contains the boolean filter data + :param target: if set, this is the field that is written to. This field must be writable. If 'target' is set, + 'in_place' must be False. + :param in_place: if True, perform the operation destructively on this field. This field + must be writable. If 'in_place' is True, 'target' must be None :return: The filtered field. This is a new field instance unless 'target' is set, in which - case it is the target field, or unless 'in_place' is True, in which case it is this field. + case it is the target field, or unless 'in_place' is True, in which case it is this field. """ return FieldDataOps.apply_filter_to_field(self, filter_to_apply, target, in_place) @@ -849,13 +849,13 @@ def apply_index(self, index_to_apply, target=None, in_place=False): is called unless 'in_place is set to true'. The user can specify a 'target' field that the reindexed data is written to. - :param: index_to_apply: a Field or numpy array that contains the indices - :param: target: if set, this is the field that is written do. This field must be writable. - If 'target' is set, 'in_place' must be False. - :param: in_place: if True, perform the operation destructively on this field. This field - must be writable. If 'in_place' is True, 'target' must be None + :param index_to_apply: a Field or numpy array that contains the indices + :param target: if set, this is the field that is written to. This field must be writable. + If 'target' is set, 'in_place' must be False. + :param in_place: if True, perform the operation destructively on this field. This field + must be writable. If 'in_place' is True, 'target' must be None :return: The reindexed field. 
This is a new field instance unless 'target' is set, in which - case it is the target field, or unless 'in_place' is True, in which case it is this field. + case it is the target field, or unless 'in_place' is True, in which case it is this field. """ return FieldDataOps.apply_index_to_field(self, index_to_apply, target, in_place) @@ -924,13 +924,13 @@ def apply_filter(self, filter_to_apply, target=None, in_place=False): is called unless 'in_place is set to true'. The user can specify a 'target' field that the filtered data is written to. - :param: filter_to_apply: a Field or numpy array that contains the boolean filter data - :param: target: if set, this is the field that is written do. This field must be writable. - If 'target' is set, 'in_place' must be False. - :param: in_place: if True, perform the operation destructively on this field. This field - must be writable. If 'in_place' is True, 'target' must be None + :param filter_to_apply: a Field or numpy array that contains the boolean filter data + :param target: if set, this is the field that is written to. This field must be writable. + If 'target' is set, 'in_place' must be False. + :param in_place: if True, perform the operation destructively on this field. This field + must be writable. If 'in_place' is True, 'target' must be None :return: The filtered field. This is a new field instance unless 'target' is set, in which - case it is the target field, or unless 'in_place' is True, in which case it is this field. + case it is the target field, or unless 'in_place' is True, in which case it is this field. """ return FieldDataOps.apply_filter_to_field(self, filter_to_apply, target, in_place) @@ -940,13 +940,13 @@ def apply_index(self, index_to_apply, target=None, in_place=False): is called unless 'in_place is set to true'. The user can specify a 'target' field that the reindexed data is written to. 
- :param: index_to_apply: a Field or numpy array that contains the indices - :param: target: if set, this is the field that is written do. This field must be writable. - If 'target' is set, 'in_place' must be False. - :param: in_place: if True, perform the operation destructively on this field. This field - must be writable. If 'in_place' is True, 'target' must be None + :param index_to_apply: a Field or numpy array that contains the indices + :param target: if set, this is the field that is written to. This field must be writable. + If 'target' is set, 'in_place' must be False. + :param in_place: if True, perform the operation destructively on this field. This field + must be writable. If 'in_place' is True, 'target' must be None :return: The reindexed field. This is a new field instance unless 'target' is set, in which - case it is the target field, or unless 'in_place' is True, in which case it is this field. + case it is the target field, or unless 'in_place' is True, in which case it is this field. """ return FieldDataOps.apply_index_to_field(self, index_to_apply, target, in_place) @@ -1172,13 +1172,13 @@ def apply_filter(self, filter_to_apply, target=None, in_place=False): is called unless 'in_place is set to true'. The user can specify a 'target' field that the filtered data is written to. - :param: filter_to_apply: a Field or numpy array that contains the boolean filter data - :param: target: if set, this is the field that is written do. This field must be writable. - If 'target' is set, 'in_place' must be False. - :param: in_place: if True, perform the operation destructively on this field. This field - must be writable. If 'in_place' is True, 'target' must be None + :param filter_to_apply: a Field or numpy array that contains the boolean filter data + :param target: if set, this is the field that is written to. This field must be writable. + If 'target' is set, 'in_place' must be False. 
+ :param in_place: if True, perform the operation destructively on this field. This field + must be writable. If 'in_place' is True, 'target' must be None :return: The filtered field. This is a new field instance unless 'target' is set, in which - case it is the target field, or unless 'in_place' is True, in which case it is this field. + case it is the target field, or unless 'in_place' is True, in which case it is this field. """ self._ensure_valid() return FieldDataOps.apply_filter_to_indexed_field(self, filter_to_apply, target, in_place) @@ -1189,13 +1189,13 @@ def apply_index(self, index_to_apply, target=None, in_place=False): is called unless 'in_place is set to true'. The user can specify a 'target' field that the reindexed data is written to. - :param: index_to_apply: a Field or numpy array that contains the indices - :param: target: if set, this is the field that is written do. This field must be writable. - If 'target' is set, 'in_place' must be False. - :param: in_place: if True, perform the operation destructively on this field. This field - must be writable. If 'in_place' is True, 'target' must be None + :param index_to_apply: a Field or numpy array that contains the indices + :param target: if set, this is the field that is written to. This field must be writable. + If 'target' is set, 'in_place' must be False. + :param in_place: if True, perform the operation destructively on this field. This field + must be writable. If 'in_place' is True, 'target' must be None :return: The reindexed field. This is a new field instance unless 'target' is set, in which - case it is the target field, or unless 'in_place' is True, in which case it is this field. + case it is the target field, or unless 'in_place' is True, in which case it is this field. 
""" self._ensure_valid() return FieldDataOps.apply_index_to_indexed_field(self, index_to_apply, target, in_place) @@ -1266,13 +1266,13 @@ def apply_filter(self, filter_to_apply, target=None, in_place=False): is called unless 'in_place is set to true'. The user can specify a 'target' field that the filtered data is written to. - :param: filter_to_apply: a Field or numpy array that contains the boolean filter data - :param: target: if set, this is the field that is written do. This field must be writable. - If 'target' is set, 'in_place' must be False. - :param: in_place: if True, perform the operation destructively on this field. This field - must be writable. If 'in_place' is True, 'target' must be None + :param filter_to_apply: a Field or numpy array that contains the boolean filter data + :param target: if set, this is the field that is written to. This field must be writable. + If 'target' is set, 'in_place' must be False. + :param in_place: if True, perform the operation destructively on this field. This field + must be writable. If 'in_place' is True, 'target' must be None :return: The filtered field. This is a new field instance unless 'target' is set, in which - case it is the target field, or unless 'in_place' is True, in which case it is this field. + case it is the target field, or unless 'in_place' is True, in which case it is this field. """ self._ensure_valid() return FieldDataOps.apply_filter_to_field(self, filter_to_apply, target, in_place) @@ -1283,13 +1283,13 @@ def apply_index(self, index_to_apply, target=None, in_place=False): is called unless 'in_place is set to true'. The user can specify a 'target' field that the reindexed data is written to. - :param: index_to_apply: a Field or numpy array that contains the indices - :param: target: if set, this is the field that is written do. This field must be writable. - If 'target' is set, 'in_place' must be False. - :param: in_place: if True, perform the operation destructively on this field. 
This field - must be writable. If 'in_place' is True, 'target' must be None + :param index_to_apply: a Field or numpy array that contains the indices + :param target: if set, this is the field that is written to. This field must be writable. + If 'target' is set, 'in_place' must be False. + :param in_place: if True, perform the operation destructively on this field. This field + must be writable. If 'in_place' is True, 'target' must be None :return: The reindexed field. This is a new field instance unless 'target' is set, in which - case it is the target field, or unless 'in_place' is True, in which case it is this field. + case it is the target field, or unless 'in_place' is True, in which case it is this field. """ self._ensure_valid() return FieldDataOps.apply_index_to_field(self, index_to_apply, target, in_place) @@ -1355,13 +1355,13 @@ def apply_filter(self, filter_to_apply, target=None, in_place=False): is called unless 'in_place is set to true'. The user can specify a 'target' field that the filtered data is written to. - :param: filter_to_apply: a Field or numpy array that contains the boolean filter data - :param: target: if set, this is the field that is written do. This field must be writable. - If 'target' is set, 'in_place' must be False. - :param: in_place: if True, perform the operation destructively on this field. This field - must be writable. If 'in_place' is True, 'target' must be None + :param filter_to_apply: a Field or numpy array that contains the boolean filter data + :param target: if set, this is the field that is written to. This field must be writable. + If 'target' is set, 'in_place' must be False. + :param in_place: if True, perform the operation destructively on this field. This field + must be writable. If 'in_place' is True, 'target' must be None :return: The filtered field. This is a new field instance unless 'target' is set, in which - case it is the target field, or unless 'in_place' is True, in which case it is this field. 
+ case it is the target field, or unless 'in_place' is True, in which case it is this field. """ self._ensure_valid() return FieldDataOps.apply_filter_to_field(self, filter_to_apply, target, in_place) @@ -1372,13 +1372,13 @@ def apply_index(self, index_to_apply, target=None, in_place=False): is called unless 'in_place is set to true'. The user can specify a 'target' field that the reindexed data is written to. - :param: index_to_apply: a Field or numpy array that contains the indices - :param: target: if set, this is the field that is written do. This field must be writable. - If 'target' is set, 'in_place' must be False. - :param: in_place: if True, perform the operation destructively on this field. This field - must be writable. If 'in_place' is True, 'target' must be None + :param index_to_apply: a Field or numpy array that contains the indices + :param target: if set, this is the field that is written to. This field must be writable. + If 'target' is set, 'in_place' must be False. + :param in_place: if True, perform the operation destructively on this field. This field + must be writable. If 'in_place' is True, 'target' must be None :return: The reindexed field. This is a new field instance unless 'target' is set, in which - case it is the target field, or unless 'in_place' is True, in which case it is this field. + case it is the target field, or unless 'in_place' is True, in which case it is this field. """ self._ensure_valid() return FieldDataOps.apply_index_to_field(self, index_to_apply, target, in_place) @@ -1573,13 +1573,13 @@ def apply_filter(self, filter_to_apply, target=None, in_place=False): is called unless 'in_place is set to true'. The user can specify a 'target' field that the filtered data is written to. - :param: filter_to_apply: a Field or numpy array that contains the boolean filter data - :param: target: if set, this is the field that is written do. This field must be writable. - If 'target' is set, 'in_place' must be False. 
- :param: in_place: if True, perform the operation destructively on this field. This field - must be writable. If 'in_place' is True, 'target' must be None + :param filter_to_apply: a Field or numpy array that contains the boolean filter data + :param target: if set, this is the field that is written to. This field must be writable. + If 'target' is set, 'in_place' must be False. + :param in_place: if True, perform the operation destructively on this field. This field + must be writable. If 'in_place' is True, 'target' must be None :return: The filtered field. This is a new field instance unless 'target' is set, in which - case it is the target field, or unless 'in_place' is True, in which case it is this field. + case it is the target field, or unless 'in_place' is True, in which case it is this field. """ self._ensure_valid() return FieldDataOps.apply_filter_to_field(self, filter_to_apply, target, in_place) @@ -1590,13 +1590,13 @@ def apply_index(self, index_to_apply, target=None, in_place=False): is called unless 'in_place is set to true'. The user can specify a 'target' field that the reindexed data is written to. - :param: index_to_apply: a Field or numpy array that contains the indices - :param: target: if set, this is the field that is written do. This field must be writable. - If 'target' is set, 'in_place' must be False. - :param: in_place: if True, perform the operation destructively on this field. This field - must be writable. If 'in_place' is True, 'target' must be None + :param index_to_apply: a Field or numpy array that contains the indices + :param target: if set, this is the field that is written to. This field must be writable. + If 'target' is set, 'in_place' must be False. + :param in_place: if True, perform the operation destructively on this field. This field + must be writable. If 'in_place' is True, 'target' must be None :return: The reindexed field. 
This is a new field instance unless 'target' is set, in which - case it is the target field, or unless 'in_place' is True, in which case it is this field. + case it is the target field, or unless 'in_place' is True, in which case it is this field. """ self._ensure_valid() return FieldDataOps.apply_index_to_field(self, index_to_apply, target, in_place) @@ -1686,13 +1686,13 @@ def apply_filter(self, filter_to_apply, target=None, in_place=False): is called unless 'in_place is set to true'. The user can specify a 'target' field that the filtered data is written to. - :param: filter_to_apply: a Field or numpy array that contains the boolean filter data - :param: target: if set, this is the field that is written do. This field must be writable. - If 'target' is set, 'in_place' must be False. - :param: in_place: if True, perform the operation destructively on this field. This field - must be writable. If 'in_place' is True, 'target' must be None + :param filter_to_apply: a Field or numpy array that contains the boolean filter data + :param target: if set, this is the field that is written to. This field must be writable. + If 'target' is set, 'in_place' must be False. + :param in_place: if True, perform the operation destructively on this field. This field + must be writable. If 'in_place' is True, 'target' must be None :return: The filtered field. This is a new field instance unless 'target' is set, in which - case it is the target field, or unless 'in_place' is True, in which case it is this field. + case it is the target field, or unless 'in_place' is True, in which case it is this field. """ self._ensure_valid() return FieldDataOps.apply_filter_to_field(self, filter_to_apply, target, in_place) @@ -1703,13 +1703,13 @@ def apply_index(self, index_to_apply, target=None, in_place=False): is called unless 'in_place is set to true'. The user can specify a 'target' field that the reindexed data is written to. 
- :param: index_to_apply: a Field or numpy array that contains the indices - :param: target: if set, this is the field that is written do. This field must be writable. - If 'target' is set, 'in_place' must be False. - :param: in_place: if True, perform the operation destructively on this field. This field - must be writable. If 'in_place' is True, 'target' must be None + :param index_to_apply: a Field or numpy array that contains the indices + :param target: if set, this is the field that is written to. This field must be writable. + If 'target' is set, 'in_place' must be False. + :param in_place: if True, perform the operation destructively on this field. This field + must be writable. If 'in_place' is True, 'target' must be None :return: The reindexed field. This is a new field instance unless 'target' is set, in which - case it is the target field, or unless 'in_place' is True, in which case it is this field. + case it is the target field, or unless 'in_place' is True, in which case it is this field. """ self._ensure_valid() return FieldDataOps.apply_index_to_field(self, index_to_apply, target, in_place) diff --git a/exetera/core/readerwriter.py b/exetera/core/readerwriter.py index 5ff962e3..f8032127 100644 --- a/exetera/core/readerwriter.py +++ b/exetera/core/readerwriter.py @@ -238,9 +238,10 @@ def chunk_factory(self, length): return [None] * length def write_part(self, values): - """Writes a list of strings in indexed string form to a field - Args: - values: a list of utf8 strings + """ + Writes a list of strings in indexed string form to a field. + + :param values: a list of utf8 strings """ if not self.ever_written: self.indices[0] = self.accumulated @@ -432,8 +433,8 @@ def write_part(self, values): Given a list of strings, parse the strings and write the parsed values. Values that cannot be parsed are written out as zero for the values, and zero for the flags to indicate that that entry is not valid. 
- Args: - values: a list of strings to be parsed + + :param values: a list of strings to be parsed """ elements = np.zeros(len(values), dtype=self.data_writer.nformat) validity = np.zeros(len(values), dtype='bool') diff --git a/exetera/core/session.py b/exetera/core/session.py index 5270bbf8..c873cb31 100644 --- a/exetera/core/session.py +++ b/exetera/core/session.py @@ -37,6 +37,12 @@ class Session(AbstractSession): provides operations that can be performed on Fields. For a more detailed explanation of Session and examples of its usage, please refer to https://github.com/KCL-BMEIS/ExeTera/wiki/Session-API + + :param chunksize: Change the default chunksize that fields created with this dataset use. + Note this is a hint parameter and future versions of Session may choose to ignore it if it + is no longer required. In general, it should only be changed for testing. + :param timestamp: Set the official timestamp for the Session's creation rather than taking + the current date/time. """ def __init__(self, @@ -44,12 +50,6 @@ def __init__(self, timestamp: str = str(datetime.now(timezone.utc))): """ Create a new Session object. - :param chunksize: Change the default chunksize that fields created with this dataset use. - Note this is a hint parameter and future versions of Session may choose to ignore it if it - is no longer required. In general, it should only be changed for testing. - :param timestamp: Set the official timestamp for the Session's creation rather than taking - the current date/time. - :return: A newly created Session object """ if not isinstance(timestamp, str): error_str = "'timestamp' must be a string but is of type {}" @@ -71,11 +71,12 @@ def open_dataset(self, mode: str, name: str): """ - Open a dataset with the given access mode + Open a dataset with the given access mode. + :param dataset_path: the path to the dataset :param mode: the mode in which the dataset should be opened. This is one of "r", "r+" or "w". 
:param name: the name that is associated with this dataset. This can be used to retrieve the dataset when - calling :py:meth:`~session.Session.get_dataset`. + calling :py:meth:`~session.Session.get_dataset`. :return: The top-level dataset object """ h5py_modes = {"r": "r", "r+": "r+", "w": "w"} @@ -89,6 +90,7 @@ def close_dataset(self, name: str): """ Close the dataset with the given name. If there is no dataset with that name, do nothing. + :param name: The name of the dataset to be closed :return: None """ @@ -100,12 +102,14 @@ def list_datasets(self): """ List the open datasets for this Session object. This is returned as a tuple of strings rather than the datasets themselves. The individual datasets can be fetched using - :py:meth:`~session.Session.get_dataset`. - ``` - names = s.list_datasets() - datasets = [s.open_dataset(n) for n in names] - :return: A tuple containing the names of the currently open datasets for this Session - object + :py:meth:`~exetera.session.Session.get_dataset`. + + Example:: + + names = s.list_datasets() + datasets = [s.get_dataset(n) for n in names] + + :return: A tuple containing the names of the currently open datasets for this Session object """ return tuple(n for n in self.datasets.keys()) @@ -114,41 +118,43 @@ def get_dataset(self, """ Get the dataset with the given name. If there is no dataset with that name, raise a KeyError indicating that the dataset with that name is not present. - :param name: the name of the dataset to be fetched. This is the name that was given to it - when it was opened through :py:meth:`~session.Session.open_dataset`. - :return: The dataset with that name. + + :param name: Name of the dataset to be fetched. This is the name that was given to it + when it was opened through :py:meth:`~session.Session.open_dataset`. + :return: Dataset with that name. """ return self.datasets[name] def close(self): """ - Close all open datasets + Close all open datasets. 
+ :return: None """ for v in self.datasets.values(): v.close() self.datasets = dict() - def get_shared_index(self, - keys: Tuple[np.array]): + def get_shared_index(self, keys: Tuple[np.ndarray]): """ Create a shared index based on a tuple of numpy arrays containing keys. This function generates the sorted union of a tuple of key fields and then maps the individual arrays to their corresponding indices in the sorted union. - Example: + :param keys: a tuple of groups, fields or ndarrays whose contents represent keys + + Example:: + key_1 = ['a', 'b', 'e', 'g', 'i'] key_2 = ['b', 'b', 'c', 'c, 'e', 'g', 'j'] key_3 = ['a', 'c' 'd', 'e', 'g', 'h', 'h', 'i'] - + sorted_union = ['a', 'b', 'c', 'd', 'e', 'g', 'h', 'i', 'j'] - + key_1_index = [0, 1, 4, 5, 7] key_2_index = [1, 1, 2, 2, 4, 5, 8] key_3_index = [0, 2, 3, 4, 5, 6, 6, 7] - - :param keys: a tuple of groups, fields or ndarrays whose contents represent keys """ if not isinstance(keys, tuple): raise ValueError("'keys' must be a tuple") @@ -193,8 +199,7 @@ def sort_on(self, :param dest_group: the group into which sorted fields are written :param keys: fields to sort on :param timestamp: optional - timestamp to write on the sorted fields - :param write_mode: optional - write mode to use if the destination fields already - exist + :param write_mode: optional - write mode to use if the destination fields already exist :return: None """ @@ -233,7 +238,8 @@ def print_if_verbose(*args): def dataset_sort_index(self, sort_indices, index=None): """ Generate a sorted index based on a set of fields upon which to sort and an optional - index to apply to the sort_indices + index to apply to the sort_indices. 
+ :param sort_indices: a tuple or list of indices that determine the sorted order :param index: optional - the index by which the initial field should be permuted :return: the resulting index that can be used to permute unsorted fields @@ -345,7 +351,7 @@ def distinct(self, field=None, fields=None, filter=None): results = [uniques[f'{i}'] for i in range(len(fields))] return results - def get_spans(self, field: Union[Field, np.array] = None, + def get_spans(self, field: Union[Field, np.ndarray] = None, dest: Field = None, **kwargs): """ Calculate a set of spans that indicate contiguous equal values. @@ -358,13 +364,14 @@ def get_spans(self, field: Union[Field, np.array] = None, than one field specified, the fields are effectively zipped and the check for spans is carried out on each corresponding tuple in the zipped field. - Example: + Example:: + field: [1, 2, 2, 1, 1, 1, 3, 4, 4, 4, 2, 2, 2, 2, 2] result: [0, 1, 3, 6, 7, 10, 15] :param field: A Field or numpy array to be evaluated for spans :param dest: A destination Field to store the result - :param **kwargs: See below. For parameters set in both argument and kwargs, use kwargs + :param \*\*kwargs: See below. For parameters set in both argument and kwargs, use kwargs :Keyword Arguments: * field -- Similar to field parameter, in case user specify field as keyword @@ -406,15 +413,16 @@ def get_spans(self, field: Union[Field, np.array] = None, return result def _apply_spans_no_src(self, - predicate: Callable[[np.array, np.array], None], - spans: np.array, - dest: Field = None) -> np.array: + predicate: Callable[[np.ndarray, np.ndarray], None], + spans: np.ndarray, + dest: Field = None) -> np.ndarray: """ An implementation method for span applications that are carried out on the spans themselves rather than a target field. 
- :params predicate: a predicate function that carries out the operation on the spans and produces the result - :params spans: the numpy array of spans to be applied - :params dest: if set, the field to which the results are written + + :param predicate: a predicate function that carries out the operation on the spans and produces the result + :param spans: the numpy array of spans to be applied + :param dest: if set, the field to which the results are written :returns: A numpy array containing the resulting values """ assert (dest is None or isinstance(dest, Field)) @@ -432,16 +440,17 @@ def _apply_spans_no_src(self, def _apply_spans_src(self, predicate: Callable[[np.ndarray, np.ndarray, np.ndarray], None], - spans: np.array, - target: np.array, - dest: Field = None) -> np.array: + spans: np.ndarray, + target: np.ndarray, + dest: Field = None) -> np.ndarray: """ An implementation method for span applications that are carried out on a target field. - :params predicate: a predicate function that carries out the operation on the spans and a target field, and - produces the result - :params spans: the numpy array of spans to be applied - :params target: the field to which the spans and predicate are applied - :params dest: if set, the field to which the results are written + + :param predicate: a predicate function that carries out the operation on the spans and a target field, and + produces the result + :param spans: the numpy array of spans to be applied + :param target: the field to which the spans and predicate are applied + :param dest: if set, the field to which the results are written :returns: A numpy array containing the resulting values """ assert (dest is None or isinstance(dest, Field)) @@ -463,112 +472,121 @@ def _apply_spans_src(self, return results def apply_spans_index_of_min(self, - spans: np.array, - target: np.array, + spans: np.ndarray, + target: np.ndarray, dest: Field = None): """ - Finds the index of the minimum value within each span on a 
target field - :params spans: the numpy array of spans to be applied - :params target: the field to which the spans are applied - :params dest: if set, the field to which the results are written + Finds the index of the minimum value within each span on a target field. + + :param spans: the numpy array of spans to be applied + :param target: the field to which the spans are applied + :param dest: if set, the field to which the results are written :returns: A numpy array containing the resulting values """ return self._apply_spans_src(ops.apply_spans_index_of_min, spans, target, dest) def apply_spans_index_of_max(self, - spans: np.array, - target: np.array, + spans: np.ndarray, + target: np.ndarray, dest: Field = None): """ - Finds the index of the maximum value within each span on a target field - :params spans: the numpy array of spans to be applied - :params target: the field to which the spans are applied - :params dest: if set, the field to which the results are written + Finds the index of the maximum value within each span on a target field. + + :param spans: the numpy array of spans to be applied + :param target: the field to which the spans are applied + :param dest: if set, the field to which the results are written :returns: A numpy array containing the resulting values """ return self._apply_spans_src(ops.apply_spans_index_of_max, spans, target, dest) def apply_spans_index_of_first(self, - spans: np.array, + spans: np.ndarray, dest: Field = None): """ - Finds the index of the first entry within each span - :params spans: the numpy array of spans to be applied - :params dest: if set, the field to which the results are written + Finds the index of the first entry within each span. 
+ + :param spans: the numpy array of spans to be applied + :param dest: if set, the field to which the results are written :returns: A numpy array containing the resulting values """ return self._apply_spans_no_src(ops.apply_spans_index_of_first, spans, dest) def apply_spans_index_of_last(self, - spans: np.array, + spans: np.ndarray, dest: Field = None): """ - Finds the index of the last entry within each span - :params spans: the numpy array of spans to be applied - :params dest: if set, the field to which the results are written + Finds the index of the last entry within each span. + + :param spans: the numpy array of spans to be applied + :param dest: if set, the field to which the results are written :returns: A numpy array containing the resulting values """ return self._apply_spans_no_src(ops.apply_spans_index_of_last, spans, dest) def apply_spans_count(self, - spans: np.array, + spans: np.ndarray, dest: Field = None): """ - Finds the number of entries within each span - :params spans: the numpy array of spans to be applied - :params dest: if set, the field to which the results are written + Finds the number of entries within each span. + + :param spans: the numpy array of spans to be applied + :param dest: if set, the field to which the results are written :returns: A numpy array containing the resulting values """ return self._apply_spans_no_src(ops.apply_spans_count, spans, dest) def apply_spans_min(self, - spans: np.array, - target: np.array, + spans: np.ndarray, + target: np.ndarray, dest: Field = None): """ - Finds the minimum value within span on a target field - :params spans: the numpy array of spans to be applied - :params target: the field to which the spans are applied - :params dest: if set, the field to which the results are written + Finds the minimum value within span on a target field. 
+ + :param spans: the numpy array of spans to be applied + :param target: the field to which the spans are applied + :param dest: if set, the field to which the results are written :returns: A numpy array containing the resulting values """ return self._apply_spans_src(ops.apply_spans_min, spans, target, dest) def apply_spans_max(self, - spans: np.array, - target: np.array, + spans: np.ndarray, + target: np.ndarray, dest: Field = None): """ - Finds the maximum value within each span on a target field - :params spans: the numpy array of spans to be applied - :params target: the field to which the spans are applied - :params dest: if set, the field to which the results are written + Finds the maximum value within each span on a target field. + + :param spans: the numpy array of spans to be applied + :param target: the field to which the spans are applied + :param dest: if set, the field to which the results are written :returns: A numpy array containing the resulting values """ return self._apply_spans_src(ops.apply_spans_max, spans, target, dest) def apply_spans_first(self, - spans: np.array, - target: np.array, + spans: np.ndarray, + target: np.ndarray, dest: Field = None): """ - Finds the first entry within each span on a target field - :params spans: the numpy array of spans to be applied - :params target: the field to which the spans are applied - :params dest: if set, the field to which the results are written + Finds the first entry within each span on a target field. 
+ + :param spans: the numpy array of spans to be applied + :param target: the field to which the spans are applied + :param dest: if set, the field to which the results are written :returns: A numpy array containing the resulting values """ return self._apply_spans_src(ops.apply_spans_first, spans, target, dest) def apply_spans_last(self, - spans: np.array, - target: np.array, + spans: np.ndarray, + target: np.ndarray, dest: Field = None): """ - Finds the last entry within each span on a target field - :params spans: the numpy array of spans to be applied - :params target: the field to which the spans are applied - :params dest: if set, the field to which the results are written + Finds the last entry within each span on a target field. + + :param spans: the numpy array of spans to be applied + :param target: the field to which the spans are applied + :param dest: if set, the field to which the results are written :returns: A numpy array containing the resulting values """ return self._apply_spans_src(ops.apply_spans_last, spans, target, dest) @@ -630,11 +648,12 @@ def _aggregate_impl(self, predicate, index, target=None, dest=None): defines an operation to be carried out, an 'index' array or field that determines the groupings over which the predicate applies, and a 'target' array or field that the operation is carried out upon, should a target be needed. If a 'dest' Field is supplied, the results will be written to it. - :params predicate: a predicate function that carries out the operation on the spans and produces the result - :params index: A numpy array or field representing the sub-ranges that can be aggregated - :params target: A numpy array upon which the operation is required. This only needs to be set for certain - operations. 
- :params dest: If set, the Field to which the results are written + + :param predicate: a predicate function that carries out the operation on the spans and produces the result + :param index: A numpy array or field representing the sub-ranges that can be aggregated + :param target: A numpy array upon which the operation is required. This only needs to be set for certain + operations. + :param dest: If set, the Field to which the results are written :returns: A numpy array containing the resulting values """ index_ = val.raw_array_from_parameter(self, "index", index) @@ -656,13 +675,15 @@ def _aggregate_impl(self, predicate, index, target=None, dest=None): def aggregate_count(self, index, dest=None): """ Finds the number of entries within each sub-group of index. - Example: - ``` - Index: a a a b b x a c c d d d - Result: 3 2 1 1 2 3 - ``` - :params index: A numpy array or Field containing the index that defines the ranges over which count is applied. - :params dest: If set, a Field to which the resulting counts are written + + Example:: + + + Index: a a a b b x a c c d d d + Result: 3 2 1 1 2 3 + + :param index: A numpy array or Field containing the index that defines the ranges over which count is applied. + :param dest: If set, a Field to which the resulting counts are written :returns: A numpy array containing the resulting values """ return self._aggregate_impl(self.apply_spans_count, index, None, dest) @@ -670,15 +691,16 @@ def aggregate_count(self, index, dest=None): def aggregate_first(self, index, target=None, dest=None): """ Finds the first entries within each sub-group of index. + Example: - ``` - Index: a a a b b x a c c d d d - Target: 1 2 3 4 5 6 7 8 9 0 1 2 - Result: 1 4 6 7 8 0 - ``` - :params index: A numpy array or Field containing the index that defines the ranges over which count is applied. 
- :params target: A numpy array to which the index and predicate are applied - :params dest: If set, a Field to which the resulting counts are written + + Index: a a a b b x a c c d d d + Target: 1 2 3 4 5 6 7 8 9 0 1 2 + Result: 1 4 6 7 8 0 + + :param index: A numpy array or Field containing the index that defines the ranges over which count is applied. + :param target: A numpy array to which the index and predicate are applied + :param dest: If set, a Field to which the resulting counts are written :returns: A numpy array containing the resulting values """ return self.aggregate_custom(self.apply_spans_first, index, target, dest) @@ -686,15 +708,16 @@ def aggregate_first(self, index, target=None, dest=None): def aggregate_last(self, index, target=None, dest=None): """ Finds the first entries within each sub-group of index. - Example: - ``` - Index: a a a b b x a c c d d d - Target: 1 2 3 4 5 6 7 8 9 0 1 2 - Result: 3 5 6 7 9 2 - ``` - :params index: A numpy array or Field containing the index that defines the ranges over which count is applied. - :params target: A numpy array to which the index and predicate are applied - :params dest: If set, a Field to which the resulting counts are written + + Example:: + + Index: a a a b b x a c c d d d + Target: 1 2 3 4 5 6 7 8 9 0 1 2 + Result: 3 5 6 7 9 2 + + :param index: A numpy array or Field containing the index that defines the ranges over which count is applied. + :param target: A numpy array to which the index and predicate are applied + :param dest: If set, a Field to which the resulting counts are written :returns: A numpy array containing the resulting values """ return self.aggregate_custom(self.apply_spans_last, index, target, dest) @@ -702,15 +725,16 @@ def aggregate_last(self, index, target=None, dest=None): def aggregate_min(self, index, target=None, dest=None): """ Finds the minimum value within each sub-group of index. 
- Example: - ``` - Index: a a a b b x a c c d d d - Target: 1 2 3 5 4 6 7 8 9 2 1 0 - Result: 1 4 6 7 8 0 - ``` - :params index: A numpy array or Field containing the index that defines the ranges over which min is applied. - :params target: A numpy array to which the index and predicate are applied - :params dest: If set, a Field to which the resulting counts are written + + Example:: + + Index: a a a b b x a c c d d d + Target: 1 2 3 5 4 6 7 8 9 2 1 0 + Result: 1 4 6 7 8 0 + + :param index: A numpy array or Field containing the index that defines the ranges over which min is applied. + :param target: A numpy array to which the index and predicate are applied + :param dest: If set, a Field to which the resulting counts are written :returns: A numpy array containing the resulting values """ return self.aggregate_custom(self.apply_spans_min, index, target, dest) @@ -718,15 +742,16 @@ def aggregate_min(self, index, target=None, dest=None): def aggregate_max(self, index, target=None, dest=None): """ Finds the maximum value within each sub-group of index. + Example: - ``` - Index: a a a b b x a c c d d d - Target: 1 2 3 5 4 6 7 8 9 2 1 0 - Result: 3 5 6 7 9 2 - ``` - :params index: A numpy array or Field containing the index that defines the ranges over which max is applied. - :params target: A numpy array to which the index and predicate are applied - :params dest: If set, a Field to which the resulting counts are written + + Index: a a a b b x a c c d d d + Target: 1 2 3 5 4 6 7 8 9 2 1 0 + Result: 3 5 6 7 9 2 + + :param index: A numpy array or Field containing the index that defines the ranges over which max is applied. 
+ :param target: A numpy array to which the index and predicate are applied + :param dest: If set, a Field to which the resulting counts are written :returns: A numpy array containing the resulting values """ return self.aggregate_custom(self.apply_spans_max, index, target, dest) @@ -744,7 +769,7 @@ def join(self, destination_pkey, fkey_indices, values_to_join, writer=None, fkey_index_spans=None): """ - This method is due for removal and should not be moved. + This method is due for removal and should not be used. Please use the merge or ordered_merge functions instead. """ @@ -793,7 +818,7 @@ def predicate_and_join(self, predicate, destination_pkey, fkey_indices, reader=None, writer=None, fkey_index_spans=None): """ - This method is due for removal and should not be moved. + This method is due for removal and should not be used. Please use the merge or ordered_merge functions instead. """ if reader is not None: @@ -839,19 +864,22 @@ def get(self, field: Union[Field, h5py.Group]): """ Get a Field from a h5py Group. - Example: - ``` - # this code for context - with Session() as s: - - # open a dataset about wildlife - src = s.open_dataset("/my/wildlife/dataset.hdf5", "r", "src") - - # fetch the group containing bird data - birds = src['birds'] - - # get the bird decibel field - bird_decibels = s.get(birds['decibels']) + + Example:: + + # this code for context + with Session() as s: + + # open a dataset about wildlife + src = s.open_dataset("/my/wildlife/dataset.hdf5", "r", "src") + + # fetch the group containing bird data + birds = src['birds'] + + # get the bird decibel field + bird_decibels = s.get(birds['decibels']) + + :param field: The Field or Group object to retrieve. """ if isinstance(field, Field): return field @@ -876,18 +904,18 @@ def get(self, def create_like(self, field, dest_group, dest_name, timestamp=None, chunksize=None): """ Create a field of the same type as an existing field, in the location and with the name provided. 
- Example: - ``` - with Session as s: - ... - a = s.get(table_1['a']) - b = s.create_like(a, table_2, 'a_times_2') - b.data.write(a.data[:] * 2) - ``` - - :params field: The Field whose type is to be copied - :params dest_group: The group in which the new field should be created - :params dest_name: The name of the new field + + Example:: + + with Session as s: + ... + a = s.get(table_1['a']) + b = s.create_like(a, table_2, 'a_times_2') + b.data.write(a.data[:] * 2) + + :param field: The Field whose type is to be copied + :param dest_group: The group in which the new field should be created + :param dest_name: The name of the new field """ if isinstance(field, h5py.Group): if 'fieldtype' not in field.attrs.keys(): @@ -903,12 +931,12 @@ def create_indexed_string(self, group, name, timestamp=None, chunksize=None): """ Create an indexed string field in the given DataFrame with the given name. - :params group: The group in which the new field should be created - :params name: The name of the new field - :params timestamp: If set, the timestamp that should be given to the new field. If not set - datetime.now() is used. - :params chunksize: If set, the chunksize that should be used to create the new field. In general, this should - not be set unless you are writing unit tests. + :param group: The group in which the new field should be created + :param name: The name of the new field + :param timestamp: If set, the timestamp that should be given to the new field. If not set + datetime.now() is used. + :param chunksize: If set, the chunksize that should be used to create the new field. In general, this should + not be set unless you are writing unit tests. 
""" if not isinstance(group, (df.DataFrame, h5py.Group)): if isinstance(group, ds.Dataset): @@ -926,15 +954,15 @@ def create_indexed_string(self, group, name, timestamp=None, chunksize=None): def create_fixed_string(self, group, name, length, timestamp=None, chunksize=None): """ - Create an fixed string field in the given DataFrame with the given name, with the given max string length per entry. + Create a fixed string field in the given DataFrame, given name, and given max string length per entry. - :params group: The group in which the new field should be created - :params name: The name of the new field - :params length: The maximum length in bytes that each entry can have. - :params timestamp: If set, the timestamp that should be given to the new field. If not set - datetime.now() is used. - :params chunksize: If set, the chunksize that should be used to create the new field. In general, this should - not be set unless you are writing unit tests. + :param group: The group in which the new field should be created + :param name: The name of the new field + :param length: The maximum length in bytes that each entry can have. + :param timestamp: If set, the timestamp that should be given to the new field. If not set + datetime.now() is used. + :param chunksize: If set, the chunksize that should be used to create the new field. In general, this should + not be set unless you are writing unit tests. """ if not isinstance(group, (df.DataFrame, h5py.Group)): if isinstance(group, ds.Dataset): @@ -951,19 +979,19 @@ def create_fixed_string(self, group, name, length, timestamp=None, chunksize=Non def create_categorical(self, group, name, nformat, key, timestamp=None, chunksize=None): """ - Create a categorical field in the given DataFrame with the given name. 
This function also takes a numerical format - for the numeric representation of the categories, and a key that maps numeric values to their string + Create a categorical field in the given DataFrame with the given name. This function also takes a numerical + format for the numeric representation of the categories, and a key that maps numeric values to their string descriptions. - :params group: The group in which the new field should be created - :params name: The name of the new field - :params nformat: A numerical type in the set (int8, uint8, int16, uint18, int32, uint32, int64). It is - recommended to use 'int8'. - :params key: A dictionary that maps numerical values to their string representations - :params timestamp: If set, the timestamp that should be given to the new field. If not set - datetime.now() is used. - :params chunksize: If set, the chunksize that should be used to create the new field. In general, this should - not be set unless you are writing unit tests. + :param group: The group in which the new field should be created + :param name: The name of the new field + :param nformat: A numerical type in the set (int8, uint8, int16, uint16, int32, uint32, int64). It is + recommended to use 'int8'. + :param key: A dictionary that maps numerical values to their string representations + :param timestamp: If set, the timestamp that should be given to the new field. If not set + datetime.now() is used. + :param chunksize: If set, the chunksize that should be used to create the new field. In general, this should + not be set unless you are writing unit tests. """ if not isinstance(group, (df.DataFrame, h5py.Group)): if isinstance(group, ds.Dataset): @@ -983,15 +1011,15 @@ def create_numeric(self, group, name, nformat, timestamp=None, chunksize=None): """ Create a numeric field in the given DataFrame with the given name.
- :params group: The group in which the new field should be created - :params name: The name of the new field - :params nformat: A numerical type in the set (int8, uint8, int16, uint18, int32, uint32, int64, uint64, - float32, float64). It is recommended to avoid uint64 as certain operations in numpy cause conversions to - floating point values. - :params timestamp: If set, the timestamp that should be given to the new field. If not set - datetime.now() is used. - :params chunksize: If set, the chunksize that should be used to create the new field. In general, this should - not be set unless you are writing unit tests. + :param group: The group in which the new field should be created + :param name: The name of the new field + :param nformat: A numerical type in the set (int8, uint8, int16, uint16, int32, uint32, int64, uint64, + float32, float64). It is recommended to avoid uint64 as certain operations in numpy cause conversions to + floating point values. + :param timestamp: If set, the timestamp that should be given to the new field. If not set + datetime.now() is used. + :param chunksize: If set, the chunksize that should be used to create the new field. In general, this should + not be set unless you are writing unit tests. """ if not isinstance(group, (df.DataFrame, h5py.Group)): if isinstance(group, ds.Dataset): @@ -1050,9 +1078,9 @@ def chunks(self, ie. chunks(1048576, 500000) -> (0, 500000), (500000, 1000000), (1000000, 1048576) - :params length: The range to be split into chunks - :params chunksize: Optional parameter detailing the size of each chunk. If not set, the - chunksize that the Session was initialized with is used. + :param length: The range to be split into chunks + :param chunksize: Optional parameter detailing the size of each chunk. If not set, the + chunksize that the Session was initialized with is used.
""" if chunksize is None: chunksize = self.chunksize @@ -1166,13 +1194,13 @@ def merge_left(self, left_on, right_on, Please use DataFrame.merge instead. - Perform a database-style left join on right_fields, outputting the result to right_writers, - if set. + Perform a database-style left join on right_fields, outputting the result to right_writers, if set. + :param left_on: The key to perform the join on on the left hand side :param right_on: The key to perform the join on on the right hand side :param right_fields: The fields to be mapped from right to left :param right_writers: Optional parameter providing the fields to which the mapped data should - be written. If this is not set, the mapped data is returned as numpy arrays and lists instead. + be written. If this is not set, the mapped data is returned as numpy arrays and lists instead. """ l_key_raw = val.raw_array_from_parameter(self, 'left_on', left_on) l_index = np.arange(len(l_key_raw), dtype=np.int64) @@ -1226,13 +1254,13 @@ def merge_right(self, left_on, right_on, Please use DataFrame.merge instead. - Perform a database-style right join on left_fields, outputting the result to left_writers, - if set. + Perform a database-style right join on left_fields, outputting the result to left_writers, if set. + :param left_on: The key to perform the join on on the left hand side :param right_on: The key to perform the join on on the right hand side :param left_fields: The fields to be mapped from right to left :param left_writers: Optional parameter providing the fields to which the mapped data should - be written. If this is not set, the mapped data is returned as numpy arrays and lists instead. + be written. If this is not set, the mapped data is returned as numpy arrays and lists instead. """ l_key_raw = val.raw_array_from_parameter(self, 'left_on', left_on) @@ -1287,16 +1315,16 @@ def merge_inner(self, left_on, right_on, Please use DataFrame.merge instead. 
- Perform a database-style inner join on left_fields, outputting the result to left_writers, - if set. + Perform a database-style inner join on left_fields, outputting the result to left_writers, if set. + :param left_on: The key to perform the join on on the left hand side :param right_on: The key to perform the join on on the right hand side :param left_fields: The fields to be mapped from left to inner :param left_writers: Optional parameter providing the fields to which the mapped data should - be written. If this is not set, the mapped data is returned as numpy arrays and lists instead. + be written. If this is not set, the mapped data is returned as numpy arrays and lists instead. :param right_fields: The fields to be mapped from right to inner :param right_writers: Optional parameter providing the fields to which the mapped data should - be written. If this is not set, the mapped data is returned as numpy arrays and lists instead. + be written. If this is not set, the mapped data is returned as numpy arrays and lists instead. """ l_key_raw = val.raw_array_from_parameter(self, 'left_on', left_on) l_index = np.arange(len(l_key_raw), dtype=np.int64) @@ -1419,18 +1447,16 @@ def ordered_merge_left(self, left_on, right_on, right_field_sources=tuple(), lef than numpy arrays and provide a tuple of groups/fields to left_field_sinks, so that the session and compute the merge and apply the mapping in a streaming fashion. + :param left_on: the group/field/numba array that contains the left key values :param right_on: the group/field/numba array that contains the right key values :param left_to_right_map: a group/field/numba array that the map is written to. 
If - it is a numba array, it must be the size of the resulting merge - :param left_field_sources: a tuple of group/fields/numba arrays that contain the - fields to be joined + it is a numba array, it must be the size of the resulting merge + :param left_field_sources: a tuple of group/fields/numba arrays that contain the fields to be joined :param left_field_sinks: optional - a tuple of group/fields/numba arrays that - the mapped fields should be written to - :param left_unique: a hint to indicate whether the 'left_on' field contains unique - values - :param right_unique: a hint to indicate whether the 'right_on' field contains - unique values + the mapped fields should be written to + :param left_unique: a hint to indicate whether the 'left_on' field contains unique values + :param right_unique: a hint to indicate whether the 'right_on' field contains unique values :return: If left_field_sinks is not set, a tuple of the output fields is returned """ if left_field_sinks is not None: @@ -1491,22 +1517,21 @@ def ordered_merge_right(self, left_on, right_on, Generate the results of a right join and apply it to the fields described in the tuple 'right_field_sources'. If 'right_field_sinks' is set, the mapped values are written to the fields / arrays set there. + Note: in order to achieve best scalability, you should use groups / fields rather than numpy arrays and provide a tuple of groups/fields to right_field_sinks, so that the session and compute the merge and apply the mapping in a streaming fashion. + :param left_on: the group/field/numba array that contains the left key values :param right_on: the group/field/numba array that contains the right key values :param right_to_left_map: a group/field/numba array that the map is written to. 
If - it is a numba array, it must be the size of the resulting merge - :param right_field_sources: a tuple of group/fields/numba arrays that contain the - fields to be joined + it is a numba array, it must be the size of the resulting merge + :param right_field_sources: a tuple of group/fields/numba arrays that contain the fields to be joined :param right_field_sinks: optional - a tuple of group/fields/numba arrays that - the mapped fields should be written to - :param left_unique: a hint to indicate whether the 'left_on' field contains unique - values - :param right_unique: a hint to indicate whether the 'right_on' field contains - unique values + the mapped fields should be written to + :param left_unique: a hint to indicate whether the 'left_on' field contains unique values + :param right_unique: a hint to indicate whether the 'right_on' field contains unique values :return: If right_field_sinks is not set, a tuple of the output fields is returned """ return self.ordered_merge_left(right_on, left_on, left_field_sources, right_field_sinks, @@ -1520,22 +1545,21 @@ def ordered_merge_inner(self, left_on, right_on, Generate the results of an inner join and apply it to the fields described in the tuple 'right_field_sources'. If 'right_field_sinks' is set, the mapped values are written to the fields / arrays set there. + Note: in order to achieve best scalability, you should use groups / fields rather than numpy arrays and provide a tuple of groups/fields to right_field_sinks, so that the session and compute the merge and apply the mapping in a streaming fashion. + :param left_on: the group/field/numba array that contains the left key values :param right_on: the group/field/numba array that contains the right key values :param right_to_left_map: a group/field/numba array that the map is written to. 
If - it is a numba array, it must be the size of the resulting merge - :param right_field_sources: a tuple of group/fields/numba arrays that contain the - fields to be joined + it is a numba array, it must be the size of the resulting merge + :param right_field_sources: a tuple of group/fields/numba arrays that contain the fields to be joined :param right_field_sinks: optional - a tuple of group/fields/numba arrays that - the mapped fields should be written to - :param left_unique: a hint to indicate whether the 'left_on' field contains unique - values - :param right_unique: a hint to indicate whether the 'right_on' field contains - unique values + the mapped fields should be written to + :param left_unique: a hint to indicate whether the 'left_on' field contains unique values + :param right_unique: a hint to indicate whether the 'right_on' field contains unique values :return: If right_field_sinks is not set, a tuple of the output fields is returned """ if left_field_sinks is not None: diff --git a/exetera/processing/__init__.py b/exetera/processing/__init__.py index 45c6a2ff..7d510074 100644 --- a/exetera/processing/__init__.py +++ b/exetera/processing/__init__.py @@ -1,2 +1,3 @@ -from . import numpy_buffer \ No newline at end of file +from . import numpy_buffer +from . import date_time_helpers diff --git a/exetera/processing/date_time_helpers.py b/exetera/processing/date_time_helpers.py index 0775565b..b5cc78f8 100644 --- a/exetera/processing/date_time_helpers.py +++ b/exetera/processing/date_time_helpers.py @@ -18,12 +18,12 @@ def get_periods(start_date: datetime, Delta controls whether the sequence of periods is generated from an start point or an end point. When delta is positive, the sequence is generated forwards in time. When delta is negative, the sequence is generate backwards in time. 
+ :param start_date: a ``datetime.datetime`` object for the starting period :param end_date: a ``datetime.datetime`` object for tne ending period, exclusive - :param period: a string representing the unit in which the delta is calculated - ('day', 'days', 'week', 'weeks') + :param period: a string representing the unit in which the delta is calculated ('day', 'days', 'week', 'weeks') :param delta: an integer representing the delta. - :return: + :return: a list of dates """ period_map = { @@ -69,19 +69,23 @@ def get_days(date_field: ArrayLike, end_date: Optional[np.float64] = None ) -> Tuple[ArrayLike, Optional[ArrayLike]]: """ - get_days converts a field of timestamps into a field of relative elapsed days. + This converts a field of timestamps into a field of relative elapsed days. The precise behaviour depends on the optional parameters but essentially, the lowest valid day is taken as day 0, and all other timestamps are converted to whole numbers of days elapsed since this timestamp: + * If ``start_date`` is set, the start_date is used as the zero-date * If ``start_date`` is not set: + * If ``date_filter`` is not set, the lowest timestamp is used as the zero-date * If ``date_filter`` is set, the lowest unfiltered timestamp is used as the zero-date As well as returning the elapsed days, this method can also return a filter for which elapsed dates are valid. This is determined as follows: + * If ``date_filter``, ``start_date`` and ``end_date`` are None, None is returned * otherwise: + * If ``date_filter`` is not provided, the filter represents all dates that are out of range with respect to the start_date and end_date parameters * If ``date_filter`` is provided, the filter is all dates out of range with respect to @@ -118,18 +122,14 @@ def generate_period_offset_map(periods: Sequence[datetime] Given a list of ordered datetimes relating to period boundaries, generate a numpy array of days that map each day to a period. - Example: + Example:: - .. 
code-block:: python + [datetime(2020,1,5), datetime(2020,1,12), datetime(2020,1,19), datetime(2020,1,26)] - [datetime(2020,1,5), datetime(2020,1,12), datatime(2020,1,19), datetime(2020,1,26)] - - generates the following output - - .. code-block:: python - - [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2] + generates the following output:: + + [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2] In the above example, each period spans a week, and periods cover a total of 3 weeks. As a result, the output is 21 entries long, one for each day covered by the period, and @@ -151,28 +151,23 @@ def get_period_offsets(periods_by_day: ArrayLike, periods, perform the mapping to generate a numpy array indicating which period a day is in for each element. - Example: - - .. code-block:: python - - periods_by_day: [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2] - days: [3, 18, 4, 7, 10, 0, 0, 2, 19, 20, 16, 17, 19, 4, 5, 9, 8, 15] - - generates the following output: + Example:: - .. code-block:: python - [0, 2, 0, 1, 1, 0, 0, 0, 2, 2, 2, 2, 2, 0, 0, 1, 1, 2] + periods_by_day: [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2] + days: [3, 18, 4, 7, 10, 0, 0, 2, 19, 20, 16, 17, 19, 4, 5, 9, 8, 15] - This function should generally be used in concert with generate_period_offset_map, as follows: + generates the following output:: - ..
code-block:: python + [0, 2, 0, 1, 1, 0, 0, 0, 2, 2, 2, 2, 2, 0, 0, 1, 1, 2] - start_date = # a start date - end_date = # an end date - periods = get_periods(start_date, end_date, 'week', 1) + This function should generally be used in concert with generate_period_offset_map, as follows:: - days = get_days(session.get(src['my_table']['my_timestamps']).data[:]) - result = get_period_offsets(generate_period_offset_map(periods), days) + start_date = # a start date + end_date = # an end date + periods = get_periods(start_date, end_date, 'week', 1) + + days = get_days(session.get(src['my_table']['my_timestamps']).data[:]) + result = get_period_offsets(generate_period_offset_map(periods), days) """ if not isinstance(periods_by_day, np.ndarray) and\