From 49411f2b80359153d2654775cb952015926d7f0a Mon Sep 17 00:00:00 2001
From: deng113jie
Date: Thu, 21 Apr 2022 20:11:05 +0100
Subject: [PATCH] more unittests for code coverage (#279)

* init commit for more unittests for code coverage
* Test tweaks
* add numeric field data ops, timestamp
* done tests for memfields, gonna add dataset and dataframe
* more tests for dataframes and datasets
* minor typo fix
* update on test memory field apply_spans ops
* minor update remove duplicate
* minor update remove duplicates
* add tests for general functions including create_like, get_spans, is_sorted, unique
* update test session: create_like, list_datasets
* more tests capture exceptions
* more tests for fields
* more tests for session
* more tests
* more tests for utils
* confirm updates with Ben
* remove njit deco as field is not available in numba
* updates on Eric's comment

Co-authored-by: eric
---
 exetera/core/dataframe.py  |  21 +-
 exetera/core/fields.py     |  32 +-
 exetera/core/operations.py | 284 +++++++++---------
 exetera/core/session.py    |  11 +-
 tests/test_dataframe.py    | 110 +++++++
 tests/test_dataset.py      |  45 +++
 tests/test_fields.py       | 593 +++++++++++++++++++++++++++++++++++--
 tests/test_operations.py   |  36 ++-
 tests/test_session.py      | 205 +++++++++++--
 tests/test_utils.py        |  37 +++
 tests/utils.py             |  52 ++--
 11 files changed, 1185 insertions(+), 241 deletions(-)

diff --git a/exetera/core/dataframe.py b/exetera/core/dataframe.py
index fbf23a3e..a99af88a 100644
--- a/exetera/core/dataframe.py
+++ b/exetera/core/dataframe.py
@@ -92,7 +92,7 @@ def add(self,
 
         :param field: field to add to this dataframe, copy the underlying dataset
         """
-        dname = field.name[field.name.index('/', 1)+1:]
+        dname = field.name if '/' not in field.name else field.name[field.name.index('/', 1)+1:]
         nfield = field.create_like(self, dname)
         if field.indexed:
             nfield.indices.write(field.indices[:])
@@ -330,10 +330,7 @@ def delete_field(self, field):
         if field.dataframe != self:
             raise ValueError("This field is owned by a different dataframe")
         name = field.name
-        if name is None:
-            raise ValueError("This dataframe does not contain the field to delete.")
-        else:
-            self.__delitem__(name)
+        self.__delitem__(name)
 
     def keys(self):
         """
@@ -390,7 +387,7 @@ def rename(self,
 
         """
         if not isinstance(field, (str, dict)):
-            raise ValueError("'field' must be of type str or dict but is {}").format(type(field))
+            raise ValueError("'field' must be of type str or dict but is {}".format(type(field)))
 
         dict_ = None
         if isinstance(field, dict):
@@ -518,8 +515,6 @@ def apply_index(self, index_to_apply, ddf=None):
         :returns: a dataframe contains all the fields re-indexed, self if ddf is not set
         """
         if ddf is not None:
-            val.validate_all_field_length_in_df(ddf)
-
             if not isinstance(ddf, DataFrame):
                 raise TypeError("The destination object must be an instance of DataFrame.")
             for name, field in self._columns.items():
@@ -1589,7 +1584,7 @@ def _ordered_merge(left: DataFrame,
                 ops.generate_ordered_map_to_left_left_unique_streamed(
                     a_on[0], b_on[0], a_result, b_result, invalid, rdtype=npdtype)
         else:
-            if right_keys_unique:
+            if b_unique:
                 b_result = dest.create_numeric('_b_map', strdtype)
                 ops.generate_ordered_map_to_left_right_unique_streamed(
                     a_on[0], b_on[0], b_result, invalid, rdtype=npdtype)
@@ -1600,12 +1595,14 @@ def _ordered_merge(left: DataFrame,
                     a_on[0], b_on[0], a_result, b_result, invalid, rdtype=npdtype)
 
     if how == 'right':
-        dest.rename('_a_map', '_right_map')
+        if "_a_map" in dest:
+            dest.rename('_a_map', '_right_map')
         dest.rename('_b_map', '_left_map')
     else:
-
dest.rename('_a_map', '_left_map') + if "_a_map" in dest: + dest.rename('_a_map', '_left_map') dest.rename('_b_map', '_right_map') - else: + else: # how = inner left_result = dest.create_numeric('_left_map', strdtype) right_result = dest.create_numeric('_right_map', strdtype) if left_keys_unique: diff --git a/exetera/core/fields.py b/exetera/core/fields.py index 2df625b8..109fabf7 100644 --- a/exetera/core/fields.py +++ b/exetera/core/fields.py @@ -479,9 +479,13 @@ def __len__(self): @property def dtype(self): """ - Get datatype of field. + Get datatype of field. Please note constructing a numpy array from IndexedString data can be very memory expensive. """ - return self._dtype + if len(self._indices) > 0: + max_len = np.max(self._indices[1:] - self._indices[:-1]) + else: + max_len = 0 + return np.dtype('S'+str(max_len)) def __getitem__(self, item): """ @@ -579,10 +583,14 @@ def __len__(self): @property def dtype(self): """ - Returns datatype of field + Returns datatype of field. Please note constructing a numpy array from IndexedString data can be very memory expensive. :return: dtype """ - return self._dtype + if len(self._indices) > 0: + max_len = np.max(self._indices[1:] - self._indices[:-1]) + else: + max_len = 0 + return np.dtype('S' + str(max_len)) def __getitem__(self, item): """ @@ -1528,6 +1536,13 @@ def remap(self, key_map, new_key): :param key_map: The mapping rule of convert the old key into the new key. :param new_key: The new key. :return: A CategoricalMemField with the new key. + + Example:: + + cat_field = df.create_categorical('cat', 'int32', {"a": 1, "b": 2}) + cat_field.data.write([1,2,1,2]) + newfield = cat_field.remap([(1, 4), (2, 5)], {"a": 4, "b": 5}) + print(newfield.data[:]) # [4,5,4,5] """ # make sure all key values are included in the key_map for k in self._keys.values(): @@ -1917,7 +1932,7 @@ def __le__(self, value): def __eq__(self, value): return FieldDataOps.equal(self._session, self, value) - def __eq__(self, value): + def __ne__(self, value): return FieldDataOps.not_equal(self._session, self, value) def __gt__(self, value): @@ -2943,6 +2958,13 @@ def remap(self, key_map, new_key): :param key_map: The mapping rule of convert the old key into the new key. :param new_key: The new key. :return: A CategoricalMemField with the new key. 
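+
+        Note: ``key_map`` must cover every key value currently in use; a
+        partial mapping raises ValueError.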
+
+        Example::
+
+            cat_field = df.create_categorical('cat', 'int32', {"a": 1, "b": 2})
+            cat_field.data.write([1,2,1,2])
+            newfield = cat_field.remap([(1, 4), (2, 5)], {"a": 4, "b": 5})
+            print(newfield.data[:])  # [4,5,4,5]
         """
         self._ensure_valid()
         # make sure all key values are included in the key_map
diff --git a/exetera/core/operations.py b/exetera/core/operations.py
index d64486f5..aa338a6a 100644
--- a/exetera/core/operations.py
+++ b/exetera/core/operations.py
@@ -21,61 +21,65 @@ MAX_DATETIME = datetime(year=3000, month=1, day=1)
 
-def dtype_to_str(dtype):
-    if isinstance(dtype, str):
-        return dtype
-
-    if dtype == bool:
-        return 'bool'
-    elif dtype == np.int8:
-        return 'int8'
-    elif dtype == np.int16:
-        return 'int16'
-    elif dtype == np.int32:
-        return 'int32'
-    elif dtype == np.int64:
-        return 'int64'
-    elif dtype == np.uint8:
-        return 'uint8'
-    elif dtype == np.uint16:
-        return 'uint16'
-    elif dtype == np.uint32:
-        return 'uint32'
-    elif dtype == np.uint64:
-        return 'uint64'
-    elif dtype == np.float32:
-        return 'float32'
-    elif dtype == np.float64:
-        return 'float64'
-
-    raise ValueError("Unsupported dtype '{}'".format(dtype))
+# def dtype_to_str(dtype):
+#     if isinstance(dtype, str):
+#         return dtype
+#
+#     if dtype == bool:
+#         return 'bool'
+#     elif dtype == np.int8:
+#         return 'int8'
+#     elif dtype == np.int16:
+#         return 'int16'
+#     elif dtype == np.int32:
+#         return 'int32'
+#     elif dtype == np.int64:
+#         return 'int64'
+#     elif dtype == np.uint8:
+#         return 'uint8'
+#     elif dtype == np.uint16:
+#         return 'uint16'
+#     elif dtype == np.uint32:
+#         return 'uint32'
+#     elif dtype == np.uint64:
+#         return 'uint64'
+#     elif dtype == np.float32:
+#         return 'float32'
+#     elif dtype == np.float64:
+#         return 'float64'
+#
+#     raise ValueError("Unsupported dtype '{}'".format(dtype))
 
 
 def str_to_dtype(str_dtype):
-    if str_dtype == 'bool':
-        return bool
-    elif str_dtype == 'int8':
-        return np.int8
-    elif str_dtype == 'int16':
-        return np.int16
-    elif str_dtype == 'int32':
-        return np.int32
-    elif str_dtype == 'int64':
-        return np.int64
-    elif str_dtype == 'uint8':
-        return np.uint8
-    elif str_dtype == 'uint16':
-        return np.uint16
-    elif str_dtype == 'uint32':
-        return np.uint32
-    elif str_dtype == 'uint64':
-        return np.uint64
-    elif str_dtype == 'float32':
-        return np.float32
-    elif str_dtype == 'float64':
-        return np.float64
-
-    raise ValueError("Unsupported dtype '{}'".format(str_dtype))
+    if str_dtype in ['bool', 'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64', 'float32', 'float64']:
+        return np.dtype(str_dtype)
+    else:
+        raise ValueError("Unsupported dtype '{}'".format(str_dtype))
+    # if str_dtype == 'bool':
+    #     return bool
+    # elif str_dtype == 'int8':
+    #     return np.int8
+    # elif str_dtype == 'int16':
+    #     return np.int16
+    # elif str_dtype == 'int32':
+    #     return np.int32
+    # elif str_dtype == 'int64':
+    #     return np.int64
+    # elif str_dtype == 'uint8':
+    #     return np.uint8
+    # elif str_dtype == 'uint16':
+    #     return np.uint16
+    # elif str_dtype == 'uint32':
+    #     return np.uint32
+    # elif str_dtype == 'uint64':
+    #     return np.uint64
+    # elif str_dtype == 'float32':
+    #     return np.float32
+    # elif str_dtype == 'float64':
+    #     return np.float64
+    #
+    # raise ValueError("Unsupported dtype '{}'".format(str_dtype))
 
 
 @exetera_njit
@@ -203,17 +207,19 @@ def get_valid_value_extents(chunk, start, end, invalid=-1):
     return first, last
 
 
-def get_map_datatype_str_based_on_lengths(left_len, right_len):
-    if left_len < (2 << 30) and right_len < (2 << 30):
-        index_dtype = 'int32'
-    else:
-        index_dtype = 'int64'
-    return
index_dtype +# def get_map_datatype_str_based_on_lengths(left_len, right_len): +# if left_len < (2 << 30) and right_len < (2 << 30): +# index_dtype = 'int32' +# else: +# index_dtype = 'int64' +# return index_dtype def get_map_datatype_based_on_lengths(left_len, right_len): - dtype_str = get_map_datatype_str_based_on_lengths(left_len, right_len) - return np.int32 if dtype_str == 'int32' else np.int64 + if left_len < utils.INT64_INDEX_LENGTH and right_len < utils.INT64_INDEX_LENGTH: + return np.int32 + else: + return np.int64 @exetera_njit @@ -573,7 +579,6 @@ def chunked_copy(src_field, dest_field, chunksize=1 << 20): element_chunked_copy(src_field.data, dest_field.data, chunksize) -@exetera_njit def data_iterator(data_field, chunksize=1 << 20): cur = np.int64(0) chunks_ = chunks(len(data_field.data), chunksize) @@ -1127,83 +1132,83 @@ def _apply_spans_concat_2(spans, src_index, src_values, dest_index, dest_values, return s + 1, d_index_i, d_index_v -@exetera_njit -def apply_spans_concat(spans, src_index, src_values, dest_index, dest_values, - max_index_i, max_value_i, s_start): - separator = np.frombuffer(b',', dtype=np.uint8)[0] - delimiter = np.frombuffer(b'"', dtype=np.uint8)[0] - if s_start == 0: - index_i = np.uint32(1) - index_v = np.int64(0) - dest_index[0] = spans[0] - else: - index_i = np.uint32(0) - index_v = np.int64(0) - - s_end = len(spans)-1 - for s in range(s_start, s_end): - cur = spans[s] - next = spans[s+1] - cur_src_i = src_index[cur] - next_src_i = src_index[next] - - dest_index[index_i] = next_src_i - index_i += 1 - - if next_src_i - cur_src_i > 1: - if next - cur == 1: - # only one entry to be copied, so commas not required - next_index_v = next_src_i - cur_src_i + np.int64(index_v) - dest_values[index_v:next_index_v] = src_values[cur_src_i:next_src_i] - index_v = next_index_v - else: - # check to see how many non-zero-length entries there are; >1 means we must - # separate them by commas - non_empties = 0 - for e in range(cur, next): - if src_index[e] < src_index[e+1]: - non_empties += 1 - if non_empties == 1: - # only one non-empty entry to be copied, so commas not required - next_index_v = next_src_i - cur_src_i + np.int64(index_v) - dest_values[index_v:next_index_v] = src_values[cur_src_i:next_src_i] - index_v = next_index_v - else: - # the outer conditional already determines that we have a non-empty entry - # so there must be multiple non-empty entries and commas are required - for e in range(cur, next): - src_start = src_index[e] - src_end = src_index[e+1] - comma = False - quotes = False - for i_c in range(src_start, src_end): - if src_values[i_c] == separator: - comma = True - elif src_values[i_c] == delimiter: - quotes = True - - d_index = np.int64(0) - if comma or quotes: - dest_values[d_index] = delimiter - d_index += 1 - for i_c in range(src_start, src_end): - if src_values[i_c] == delimiter: - dest_values[d_index] = src_values[i_c] - d_index += 1 - dest_values[d_index] = src_values[i_c] - d_index += 1 - dest_values[d_index] = delimiter - d_index += 1 - else: - s_len = np.int64(src_end - src_start) - dest_values[index_v:index_v + s_len] = src_values[src_start:src_end] - d_index += s_len - index_v += np.int64(d_index) - - # if either the index or values are past the threshold, write them - if index_i >= max_index_i or index_v >= max_value_i: - break - return s+1, index_i, index_v +# @exetera_njit +# def apply_spans_concat(spans, src_index, src_values, dest_index, dest_values, +# max_index_i, max_value_i, s_start): +# separator = np.frombuffer(b',', 
dtype=np.uint8)[0] +# delimiter = np.frombuffer(b'"', dtype=np.uint8)[0] +# if s_start == 0: +# index_i = np.uint32(1) +# index_v = np.int64(0) +# dest_index[0] = spans[0] +# else: +# index_i = np.uint32(0) +# index_v = np.int64(0) +# +# s_end = len(spans)-1 +# for s in range(s_start, s_end): +# cur = spans[s] +# next = spans[s+1] +# cur_src_i = src_index[cur] +# next_src_i = src_index[next] +# +# dest_index[index_i] = next_src_i +# index_i += 1 +# +# if next_src_i - cur_src_i > 1: +# if next - cur == 1: +# # only one entry to be copied, so commas not required +# next_index_v = next_src_i - cur_src_i + np.int64(index_v) +# dest_values[index_v:next_index_v] = src_values[cur_src_i:next_src_i] +# index_v = next_index_v +# else: +# # check to see how many non-zero-length entries there are; >1 means we must +# # separate them by commas +# non_empties = 0 +# for e in range(cur, next): +# if src_index[e] < src_index[e+1]: +# non_empties += 1 +# if non_empties == 1: +# # only one non-empty entry to be copied, so commas not required +# next_index_v = next_src_i - cur_src_i + np.int64(index_v) +# dest_values[index_v:next_index_v] = src_values[cur_src_i:next_src_i] +# index_v = next_index_v +# else: +# # the outer conditional already determines that we have a non-empty entry +# # so there must be multiple non-empty entries and commas are required +# for e in range(cur, next): +# src_start = src_index[e] +# src_end = src_index[e+1] +# comma = False +# quotes = False +# for i_c in range(src_start, src_end): +# if src_values[i_c] == separator: +# comma = True +# elif src_values[i_c] == delimiter: +# quotes = True +# +# d_index = np.int64(0) +# if comma or quotes: +# dest_values[d_index] = delimiter +# d_index += 1 +# for i_c in range(src_start, src_end): +# if src_values[i_c] == delimiter: +# dest_values[d_index] = src_values[i_c] +# d_index += 1 +# dest_values[d_index] = src_values[i_c] +# d_index += 1 +# dest_values[d_index] = delimiter +# d_index += 1 +# else: +# s_len = np.int64(src_end - src_start) +# dest_values[index_v:index_v + s_len] = src_values[src_start:src_end] +# d_index += s_len +# index_v += np.int64(d_index) +# +# # if either the index or values are past the threshold, write them +# if index_i >= max_index_i or index_v >= max_value_i: +# break +# return s+1, index_i, index_v # ordered map to left functionality: streaming @@ -2184,7 +2189,8 @@ def ordered_left_map_result_size(left, right): return result_size if i < len(left): - result_size += left - i + result_size = result_size + len(left) - i + return result_size @exetera_njit diff --git a/exetera/core/session.py b/exetera/core/session.py index fbf35ce5..abf98651 100644 --- a/exetera/core/session.py +++ b/exetera/core/session.py @@ -148,14 +148,14 @@ def get_shared_index(self, keys: Tuple[np.ndarray]): Example:: key_1 = ['a', 'b', 'e', 'g', 'i'] - key_2 = ['b', 'b', 'c', 'c, 'e', 'g', 'j'] + key_2 = ['b', 'b', 'c', 'c', 'e', 'g', 'j'] key_3 = ['a', 'c' 'd', 'e', 'g', 'h', 'h', 'i'] sorted_union = ['a', 'b', 'c', 'd', 'e', 'g', 'h', 'i', 'j'] key_1_index = [0, 1, 4, 5, 7] key_2_index = [1, 1, 2, 2, 4, 5, 8] - key_3_index = [0, 2, 3, 4, 5, 6, 6, 7] + key_3_index = [0, 3, 4, 5, 6, 6, 7] """ if not isinstance(keys, tuple): raise ValueError("'keys' must be a tuple") @@ -334,11 +334,14 @@ def apply_index(self, index_to_apply, src, dest=None): return result def distinct(self, field=None, fields=None, filter=None): + """ + todo: confirm deprecated. 
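+
+        Example (illustrative; with a single ``field`` this simply defers to
+        ``np.unique``)::
+
+            s.distinct(field=np.array([3, 1, 2, 1]))  # -> array([1, 2, 3])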
+ """ if field is None and fields is None: - return ValueError("One of 'field' and 'fields' must be set") + raise ValueError("One of 'field' and 'fields' must be set") if field is not None and fields is not None: - return ValueError("Only one of 'field' and 'fields' may be set") + raise ValueError("Only one of 'field' and 'fields' may be set") if field is not None: return np.unique(field) diff --git a/tests/test_dataframe.py b/tests/test_dataframe.py index fb9306ce..2e785a95 100644 --- a/tests/test_dataframe.py +++ b/tests/test_dataframe.py @@ -18,10 +18,13 @@ def test_dataframe_init(self): dst = s.open_dataset(bio, 'w', 'dst') # init df = dst.create_dataframe('dst') + self.assertEqual(len(df), 0) self.assertTrue(isinstance(df, dataframe.DataFrame)) numf = df.create_numeric('numf', 'uint32') df2 = dst.create_dataframe('dst2', dataframe=df) self.assertTrue(isinstance(df2, dataframe.DataFrame)) + self.assertEqual(len(df2), 1) + self.assertListEqual([numf], list(df.values())) # add & set & contains self.assertTrue('numf' in df) @@ -31,11 +34,33 @@ def test_dataframe_init(self): self.assertFalse(df.contains_field(cat)) df['cat'] = cat self.assertTrue('cat' in df) + self.assertEqual(len(df2), 2) + self.assertEqual(len(df), 2) + + with self.assertRaises(TypeError): + df[1] = cat + with self.assertRaises(TypeError): + df['cat2'] = 'foo' + + num2 = s.create_numeric(df2, 'num2', 'int32') + df.add(num2) # add is hard-copy + self.assertTrue('num2' in df) + + with self.assertRaises(TypeError): + 1 in df + with self.assertRaises(TypeError): + df.contains_field(1) + self.assertTrue(df.contains_field(df['num2'])) # list & get self.assertEqual(id(numf), id(df.get_field('numf'))) self.assertEqual(id(numf), id(df['numf'])) + with self.assertRaises(TypeError): + df[1] + with self.assertRaises(ValueError): + df['foo'] + # list & iter dfit = iter(df) self.assertEqual('numf', next(dfit)) @@ -46,8 +71,12 @@ def test_dataframe_init(self): self.assertFalse('numf' in df) with self.assertRaises(ValueError, msg="This field is owned by a different dataframe"): df.delete_field(cat) + with self.assertRaises(ValueError): + del df['numf'] self.assertFalse(df.contains_field(cat)) + + def test_dataframe_create_numeric(self): bio = BytesIO() with session.Session() as s: @@ -199,9 +228,12 @@ def test_dataframe_static_methods(self): df = dst.create_dataframe('dst') numf = s.create_numeric(df, 'numf', 'int32') numf.data.write([5, 4, 3, 2, 1]) + idxs = s.create_indexed_string(df, 'idxs') + idxs.data.write(['aaa', 'b', 'ccc', 'dddd']) df2 = dst.create_dataframe('df2') dataframe.copy(numf, df2,'numf') + dataframe.copy(idxs, df2, 'idxs') self.assertListEqual([5, 4, 3, 2, 1], df2['numf'].data[:].tolist()) df.drop('numf') self.assertTrue('numf' not in df) @@ -209,6 +241,8 @@ def test_dataframe_static_methods(self): self.assertTrue('numf' not in df2) self.assertListEqual([5, 4, 3, 2, 1], df['numf'].data[:].tolist()) + + def test_dataframe_ops(self): bio = BytesIO() with session.Session() as s: @@ -225,6 +259,8 @@ def test_dataframe_ops(self): df.apply_index(index, ddf) self.assertEqual([1, 2, 3, 4, 5], ddf['numf'].data[:].tolist()) self.assertEqual([b'a', b'b', b'c', b'd', b'e'], ddf['fst'].data[:].tolist()) + with self.assertRaises(TypeError): + df.apply_index(index, 'foo') filter_to_apply = np.array([True, True, False, False, True]) ddf = dst.create_dataframe('dst3') @@ -233,6 +269,9 @@ def test_dataframe_ops(self): self.assertEqual([b'e', b'd', b'a'], ddf['fst'].data[:].tolist()) + + + class TestDataFrameRename(unittest.TestCase): 
def test_rename_1(self): @@ -250,6 +289,11 @@ def test_rename_1(self): self.assertFalse('fa' in df) self.assertTrue('fb' in df) self.assertTrue('fc' in df) + with self.assertRaises(ValueError): + df.rename(123,456) + with self.assertRaises(ValueError): + df.rename({'fc': 'fb'}, 'fb') + df.rename('fc', None) def test_rename_should_not_clash(self): a = np.array([1, 2, 3, 4, 5, 6, 7, 8], dtype='int32') @@ -360,6 +404,9 @@ def test_apply_filter(self): df.apply_filter(filt) self.assertListEqual(expected, df['numf'].data[:].tolist()) + with self.assertRaises(TypeError): + df.apply_filter(filt, 123) + def test_apply_filter_with_numeric_filter(self): @@ -425,6 +472,69 @@ def tests_merge_left(self): np.logical_not(ddf['valid_r'].data[:]) self.assertTrue(np.all(valid_if_equal)) + with self.assertRaises(ValueError): + dataframe.merge(123, rdf, ddf, 'l_id', 'r_id') + dataframe.merge(ldf, 123, ddf, 'l_id', 'r_id') + dataframe.merge(ldf, rdf, ddf, 'l_id', 'r_id', how='foo') + + + + def tests_merge_sorted(self): + l_id = np.asarray([0, 1, 2, 3, 4, 5, 6, 7], dtype='int32') + r_vals = ['bb1', 'ccc1', '', 'dddd1', 'ggggggg1', 'ffffff1', 'bb2', '', 'ccc2'] + + bio = BytesIO() + with session.Session() as s: + dst = s.open_dataset(bio, 'w', 'dst') + ldf = dst.create_dataframe('ldf') + rdf = dst.create_dataframe('rdf') + ldf.create_numeric('l_id', 'int32').data.write(l_id) + + r_sorted = dst.create_dataframe('r_sorted') + r_sorted.create_numeric('r_id', 'int32').data.write(l_id) + r_sorted.create_indexed_string('r_vals').data.write(r_vals[0:8]) + ddf = dst.create_dataframe('ddf2') + dataframe.merge(ldf, r_sorted, ddf, 'l_id', 'r_id', how='left', hint_left_keys_ordered=True, + hint_right_keys_ordered=True) + expected = ['bb1', 'ccc1', '', 'dddd1', 'ggggggg1', 'ffffff1', 'bb2', ''] + self.assertEqual(expected, ddf['r_vals'].data[:]) + valid_if_equal = (ddf['l_id'].data[:] == ddf['r_id'].data[:]) + self.assertTrue(np.all(valid_if_equal)) + ddf = dst.create_dataframe('ddf3') + # dataframe.merge(ldf, r_sorted, ddf, 'l_id', 'r_id', how='left', hint_left_keys_ordered=True, + # hint_right_keys_ordered=True, hint_left_keys_unique=True, hint_right_keys_unique=True) + # self.assertEqual(expected, ddf['r_vals'].data[:]) + # valid_if_equal = (ddf['l_id'].data[:] == ddf['r_id'].data[:]) + # self.assertTrue(np.all(valid_if_equal)) + + + ddf = dst.create_dataframe('ddf4') + dataframe.merge(ldf, r_sorted, ddf, 'l_id', 'r_id', how='right', hint_left_keys_ordered=True, + hint_right_keys_ordered=True) + self.assertEqual(expected, ddf['r_vals'].data[:]) + valid_if_equal = (ddf['l_id'].data[:] == ddf['r_id'].data[:]) + self.assertTrue(np.all(valid_if_equal)) + + ddf = dst.create_dataframe('ddf5') + # dataframe.merge(ldf, r_sorted, ddf, 'l_id', 'r_id', how='right', hint_left_keys_ordered=True, + # hint_right_keys_ordered=True, hint_left_keys_unique=True, hint_right_keys_unique=True) + # self.assertEqual(expected, ddf['r_vals'].data[:]) + # valid_if_equal = (ddf['l_id'].data[:] == ddf['r_id'].data[:]) + # self.assertTrue(np.all(valid_if_equal)) + + ddf = dst.create_dataframe('ddf6') + dataframe.merge(ldf, r_sorted, ddf, 'l_id', 'r_id', how='inner', hint_left_keys_ordered=True, + hint_right_keys_ordered=True) + self.assertEqual(expected, ddf['r_vals'].data[:]) + valid_if_equal = (ddf['l_id'].data[:] == ddf['r_id'].data[:]) + self.assertTrue(np.all(valid_if_equal)) + + ddf = dst.create_dataframe('ddf7') + # dataframe.merge(ldf, r_sorted, ddf, 'l_id', 'r_id', how='inner', hint_left_keys_ordered=True, + # hint_right_keys_ordered=True, 
hint_left_keys_unique=True, hint_right_keys_unique=True) + # self.assertEqual(expected, ddf['r_vals'].data[:]) + # valid_if_equal = (ddf['l_id'].data[:] == ddf['r_id'].data[:]) + # self.assertTrue(np.all(valid_if_equal)) def tests_merge_right(self): diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 7340b7e9..738e4e44 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -22,6 +22,12 @@ def test_dataset_init(self): num.data.write([1, 2, 3, 4]) self.assertEqual([1, 2, 3, 4], num.data[:].tolist()) + df2 = dst.require_dataframe('df') + self.assertEqual(id(df), id(df2)) + + df3 = dst.require_dataframe('df3') + self.assertTrue(isinstance(df3, DataFrame)) + cat = s.create_categorical(df, 'cat', 'int8', {'a': 1, 'b': 2}) cat.data.write([1 , 1, 2, 2]) self.assertEqual([1, 1, 2, 2], s.get(df['cat']).data[:].tolist()) @@ -63,17 +69,27 @@ def test_dataset_init_with_data(self): del dst['df2'] self.assertTrue(len(dst.keys()) == 1) self.assertTrue(len(dst._file.keys()) == 1) + with self.assertRaises(ValueError): + del dst['df2'] # set dataframe (this is a copy between datasets dst['df3'] = df2 self.assertTrue(isinstance(dst['df3'], DataFrame)) self.assertEqual([b'a', b'b', b'c', b'd'], dst['df3']['fs'].data[:].tolist()) + with self.assertRaises(TypeError): + dst[123] = df2 + dst['df4'] = 'foo' + # set dataframe within the same dataset (rename) dst['df4'] = dst['df3'] self.assertTrue(isinstance(dst['df4'], DataFrame)) self.assertEqual([b'a', b'b', b'c', b'd'], dst['df4']['fs'].data[:].tolist()) + df2.name = None + with self.assertRaises(ValueError): + dst.delete_dataframe(df2) + def test_dataset_static_func(self): bio = BytesIO() bio2 = BytesIO() @@ -82,15 +98,44 @@ def test_dataset_static_func(self): df = dst.create_dataframe('df') num1 = df.create_numeric('num', 'uint32') num1.data.write([1, 2, 3, 4]) + idxs = df.create_indexed_string('idxs') + idxs.data.write(['a', 'bb', 'ccc', 'dddd']) ds2 = s.open_dataset(bio2, 'r+', 'ds2') copy(df, ds2, 'df2') self.assertTrue(isinstance(ds2['df2'], DataFrame)) self.assertTrue(isinstance(ds2['df2']['num'], fields.Field)) + with self.assertRaises(ValueError): + copy(df, ds2, 'df2') + ds2.drop('df2') self.assertTrue(len(ds2) == 0) + df2 = ds2.create_dataframe('df2') + self.assertTrue(ds2.contains_dataframe(df2)) + self.assertFalse(dst.create_dataframe('foo')) + + with self.assertRaises(ValueError): + ds2.create_dataframe('df3', 123) + + with self.assertRaises(TypeError): + ds2.contains_dataframe('foo') + + with self.assertRaises(TypeError): + ds2[123] + + with self.assertRaises(ValueError): + ds2['boo'] + + dst.delete_dataframe(dst['foo']) + ds2.delete_dataframe(df2) + self.assertFalse(ds2.contains_dataframe(df2)) + + self.assertListEqual(['df'], list(dst.keys())) + self.assertListEqual([df], list(dst.values())) + self.assertDictEqual({'df': df}, {k: v for k, v in dst.items()}) + move(df, ds2, 'df2') self.assertTrue(len(dst) == 0) self.assertTrue(len(ds2) == 1) diff --git a/tests/test_fields.py b/tests/test_fields.py index 0866a5dc..d4081fc1 100644 --- a/tests/test_fields.py +++ b/tests/test_fields.py @@ -1,14 +1,16 @@ from pickle import FALSE import unittest +import operator import numpy as np +import numpy.testing from io import BytesIO import h5py from datetime import datetime from parameterized import parameterized -from .utils import SessionTestCase, shuffle_randstate, allow_slow_tests, DEFAULT_FIELD_DATA +from .utils import SessionTestCase, shuffle_randstate, allow_slow_tests, RAND_STATE,DEFAULT_FIELD_DATA, HARD_INTS, HARD_FLOATS, 
utc_timestamp, NUMERIC_DATA, TIMESTAMP_DATA, FIXED_STRING_DATA, INDEX_STRING_DATA from exetera.core import session from exetera.core import fields @@ -34,6 +36,10 @@ def test_fields(self, creator, name, kwargs, data): data = np.asarray(data, dtype=kwargs["nformat"]) self.assertFieldEqual(data, f) + wtb = f.writeable() + with self.subTest("writable:"): + self.assertFieldEqual(data, wtb) + class TestFieldExistence(SessionTestCase): @@ -42,7 +48,220 @@ def test_field_truthness(self, creator, name, kwargs, data): """Test every field object is considered True.""" f = self.setup_field(self.df, creator, name, (), kwargs, data) self.assertTrue(bool(f)) + self.assertTrue(f.valid) + + +class TestFieldDataOps(SessionTestCase): + """ + Test data operations for each different field. + 1, compare the result of operations on field against operations on raw numpy data. + """ + + def setUp(self): + super(TestFieldDataOps, self).setUp() + + + @parameterized.expand([(operator.lt,),(operator.gt,),(operator.le,),(operator.ge,),(operator.ne,),(operator.eq,)]) + def test_CategoricalMemField_binary_op(self,op): + """ + Categorical mem field ops against numpy, categorical memory field, categorical field + """ + categorical_memfield = fields.CategoricalMemField(self.s, 'int32', {"a": 1, "b": 2, "c": 3}) + memfield_data = RAND_STATE.randint(1, 4, 20) + categorical_memfield.data.write(memfield_data) + + for i in range(0,5): + indata=np.full(memfield_data.shape,i) + result=op(memfield_data,indata) + + with self.subTest(f"Testing value numpy {i}"): + output=op(categorical_memfield,indata) + + np.testing.assert_array_equal(result,output) + + self.assertIsInstance(output, fields.NumericMemField) + self.assertEqual(output.data.dtype,"bool") + + with self.subTest(f"Testing value Mem field {i}"): + test_field = fields.CategoricalMemField(self.s, 'int32', {"a": 1, "b": 2, "c": 3}) + test_field.data.write(result) + output=op(categorical_memfield,test_field) + + np.testing.assert_array_equal(result,output) + self.assertIsInstance(output, fields.NumericMemField) + self.assertEqual(output.data.dtype,"bool") + + with self.subTest(f"Testing value Field {i}"): + test_field = self.df.create_categorical(f'name{i}','int32',{"a": 1, "b": 2, "c": 3}) + test_field.data.write(result) + output=op(categorical_memfield,test_field) + + np.testing.assert_array_equal(result,output) + self.assertIsInstance(output, fields.NumericMemField) + self.assertEqual(output.data.dtype,"bool") + + @parameterized.expand( + [(operator.lt,), (operator.gt,), (operator.le,), (operator.ge,), (operator.ne,), (operator.eq,)]) + def test_CategoricalField_binary_op(self, op): + """ + Categorical field ops against numpy, categorical memory field, categorical field + """ + field = self.df.create_categorical('catf', 'int32', {"a": 1, "b": 2, "c": 3}) + + data = np.array(RAND_STATE.randint(1, 4, 20)) + field.data.write(data) + + for i in range(0, 5): + indata = np.full(data.shape, i) + result = op(data, indata) + + with self.subTest(f"Testing value numpy {i}"): + output = op(field, indata) + + np.testing.assert_array_equal(result, output) + + self.assertIsInstance(output, fields.NumericMemField) + self.assertEqual(output.data.dtype, "bool") + + with self.subTest(f"Testing value Mem field {i}"): + test_field = fields.CategoricalMemField(self.s, 'int32', {"a": 1, "b": 2, "c": 3}) + test_field.data.write(indata) + output = op(field, test_field) + + np.testing.assert_array_equal(result, output) + self.assertIsInstance(output, fields.NumericMemField) + 
self.assertEqual(output.data.dtype, "bool") + + with self.subTest(f"Testing value Field {i}"): + test_field = self.df.create_categorical(f'name{i}', 'int32', {"a": 1, "b": 2, "c": 3}) + test_field.data.write(indata) + output = op(field, test_field) + + np.testing.assert_array_equal(result, output) + self.assertIsInstance(output, fields.NumericMemField) + self.assertEqual(output.data.dtype, "bool") + + @parameterized.expand([(operator.eq,),(operator.ge,),(operator.gt,),(operator.le,),(operator.lt,),(operator.ne,),]) + def test_NumericField_binary_ops(self, op): + raw_data = shuffle_randstate(list(range(-10, 10)) + HARD_INTS) + numeric_field = self.df.create_numeric('num', 'int64') + numeric_field.data.write(raw_data) + target = shuffle_randstate(list(range(-10, 10)) + HARD_INTS) # against numpy + result = op(raw_data, target) + output = op(numeric_field, target) + numpy.testing.assert_array_equal(result, output) + + field2 = self.df.create_numeric('num2', 'int64') + field2.data.write(target) + output = op(numeric_field, field2) # against numeric field + numpy.testing.assert_array_equal(result, output) + + memfield = fields.NumericMemField(self.s, 'int64') + memfield.data.write(np.array(target)) + output = op(numeric_field, field2) # against memory numeric field + numpy.testing.assert_array_equal(result, output) + + @parameterized.expand([(operator.add,), (operator.sub,), (operator.mul,), (operator.truediv,), (operator.floordiv,), + (operator.mod,), (operator.lt,), (operator.le,), (operator.eq,), (operator.ne,), + (operator.ge,), (operator.gt,), (divmod,)]) + def test_TimestampField_binary_ops(self, op): + raw_data = np.array(TIMESTAMP_DATA) + target = np.array(TIMESTAMP_DATA) + ts_field = self.df.create_timestamp('ts_field') + ts_field.data.write(raw_data) + result = op(raw_data, target) # timestampe field vs list + output = op(ts_field, target) + if op == divmod: + numpy.testing.assert_array_equal(result[0], output[0].data[:]) + numpy.testing.assert_array_equal(result[1], output[1].data[:]) + else: + numpy.testing.assert_array_equal(result, output) + + ts_field2 = self.df.create_timestamp('ts_field2') + ts_field2.data.write(target) + output = op(ts_field, ts_field2) # timestamp field vs timestamp field + if op == divmod: + numpy.testing.assert_array_equal(result[0], output[0].data[:]) + numpy.testing.assert_array_equal(result[1], output[1].data[:]) + else: + numpy.testing.assert_array_equal(result, output) + + ts_field3 = fields.TimestampMemField(self.s) + ts_field3.data.write(target) + output = op(ts_field, ts_field3) # timestamp field vs timestamp mem field + if op == divmod: + numpy.testing.assert_array_equal(result[0], output[0].data[:]) + numpy.testing.assert_array_equal(result[1], output[1].data[:]) + else: + numpy.testing.assert_array_equal(result, output) + + @parameterized.expand([(operator.add,), (operator.sub,), (operator.mul,), (operator.truediv,), (operator.floordiv,), + (operator.mod,), (divmod,)]) + def test_TimestampField_binary_reverse(self, op): + raw_data = TIMESTAMP_DATA + target = TIMESTAMP_DATA + + ts_field = self.df.create_timestamp('ts_field') + ts_field.data.write(raw_data) + output = op(target, ts_field) # list + field is not implemented, hence will call field.__radd__ + result = op(raw_data, np.array(target)) + if op == divmod: + numpy.testing.assert_array_equal(result[0], output[0].data[:]) + numpy.testing.assert_array_equal(result[1], output[1].data[:]) + else: + numpy.testing.assert_array_equal(result, output) + + + @parameterized.expand([(operator.add,), 
(operator.sub,), (operator.mul,), (operator.truediv,), (operator.floordiv,), + (operator.mod,), (operator.lt,), (operator.le,), (operator.eq,), (operator.ne,), + (operator.ge,), (operator.gt,), (divmod,)]) + def test_TimestampMemField_binary_ops(self, op): + raw_data = np.array(TIMESTAMP_DATA) + target = np.array(TIMESTAMP_DATA) + ts_field = fields.TimestampMemField(self.s) + ts_field.data.write(raw_data) + result = op(raw_data, target) # timestampe field vs list + output = op(ts_field, target) + if op == divmod: + numpy.testing.assert_array_equal(result[0], output[0].data[:]) + numpy.testing.assert_array_equal(result[1], output[1].data[:]) + else: + numpy.testing.assert_array_equal(result, output) + + ts_field2 = self.df.create_timestamp('ts_field2') + ts_field2.data.write(target) + output = op(ts_field, ts_field2) # timestamp field vs timestamp field + if op == divmod: + numpy.testing.assert_array_equal(result[0], output[0].data[:]) + numpy.testing.assert_array_equal(result[1], output[1].data[:]) + else: + numpy.testing.assert_array_equal(result, output) + + ts_field3 = fields.TimestampMemField(self.s) + ts_field3.data.write(target) + output = op(ts_field, ts_field3) # timestamp field vs timestamp mem field + if op == divmod: + numpy.testing.assert_array_equal(result[0], output[0].data[:]) + numpy.testing.assert_array_equal(result[1], output[1].data[:]) + else: + numpy.testing.assert_array_equal(result, output) + + @parameterized.expand([(operator.add,), (operator.sub,), (operator.mul,), (operator.truediv,), (operator.floordiv,), + (operator.mod,), (divmod,)]) + def test_TimestampMemField_binary_reverse(self, op): + raw_data = np.array(TIMESTAMP_DATA) + target = TIMESTAMP_DATA + + ts_field = fields.TimestampMemField(self.s) + ts_field.data.write(raw_data) + output = op(target, ts_field) # list + field is not implemented, hence will call field.__radd__ + result = op(raw_data, np.array(target)) + if op == divmod: + numpy.testing.assert_array_equal(result[0], output[0].data[:]) + numpy.testing.assert_array_equal(result[1], output[1].data[:]) + else: + numpy.testing.assert_array_equal(result, output) class TestFieldGetSpans(unittest.TestCase): @@ -70,6 +289,10 @@ def test_get_spans(self): cat.data.write([1, 1, 2, 2, 1, 1, 1, 2, 2, 2, 1, 2, 1, 2]) self.assertListEqual([0,2,4,7,10,11,12,13,14],list(cat.get_spans())) + timestamp = s.create_timestamp(ds, 'ts') + timestamp.data.write(TIMESTAMP_DATA) + self.assertListEqual([i for i in range(11)], list(timestamp.get_spans())) + class TestIsSorted(unittest.TestCase): @@ -80,6 +303,10 @@ def test_indexed_string_is_sorted(self): df = ds.create_dataframe('foo') f = df.create_indexed_string('f') + f.data.write('a') + self.assertTrue(f.is_sorted()) + + f.data.clear() vals = ['the', 'quick', '', 'brown', 'fox', 'jumps', '', 'over', 'the', 'lazy', '', 'dog'] f.data.write(vals) self.assertFalse(f.is_sorted()) @@ -96,6 +323,10 @@ def test_fixed_string_is_sorted(self): df = ds.create_dataframe('foo') f = df.create_fixed_string('f', 5) + f.data.write('a') + self.assertTrue(f.is_sorted()) + + f.data.clear() vals = ['a', 'ba', 'bb', 'bac', 'de', 'ddddd', 'deff', 'aaaa', 'ccd'] f.data.write([v.encode() for v in vals]) self.assertFalse(f.is_sorted()) @@ -112,6 +343,10 @@ def test_numeric_is_sorted(self): df = ds.create_dataframe('foo') f = df.create_numeric('f', 'int32') + f.data.write([1]) + self.assertTrue(f.is_sorted()) + + f.data.clear() vals = [74, 1897, 298, 0, -100098, 380982340, 8, 6587, 28421, 293878] f.data.write(vals) self.assertFalse(f.is_sorted()) @@ 
-128,6 +363,10 @@ def test_categorical_is_sorted(self): df = ds.create_dataframe('foo') f = df.create_categorical('f', 'int8', {'a': 0, 'c': 1, 'd': 2, 'b': 3}) + f.data.write([1]) + self.assertTrue(f.is_sorted()) + + f.data.clear() vals = [0, 1, 3, 2, 3, 2, 2, 0, 0, 1, 2] f.data.write(vals) self.assertFalse(f.is_sorted()) @@ -147,6 +386,10 @@ def test_timestamp_is_sorted(self): f = df.create_timestamp('f') d = D(2020, 5, 10) + f.data.write([d.timestamp()]) + self.assertTrue(f.is_sorted()) + + f.data.clear() vals = [d + T(seconds=50000), d - T(days=280), d + T(weeks=2), d + T(weeks=250), d - T(weeks=378), d + T(hours=2897), d - T(days=23), d + T(minutes=39873)] vals = [v.timestamp() for v in vals] @@ -158,6 +401,128 @@ def test_timestamp_is_sorted(self): f2.data.write(svals) self.assertTrue(f2.is_sorted()) +class TestMemFieldsGeneralMethods(SessionTestCase): + """ + Methods tested here: created_like, get_spans, is_sorted, unique, writeable + """ + def test_numeric_mem_field(self): + raw_data = np.array(shuffle_randstate(NUMERIC_DATA)) + numeric_mem = fields.NumericMemField('num', 'int64') + numeric_mem.data.write(raw_data) + self.assertFalse(numeric_mem.is_sorted()) + + newfield = numeric_mem.create_like(group=None, name=None) + self.assertTrue(isinstance(newfield, fields.NumericMemField)) + newfield.data.write(np.array([1])) + self.assertTrue(newfield.is_sorted()) + newfield.data.clear() + newfield.data.write(raw_data) + with self.assertRaises(ValueError): + newfield.data.write([1,2,3,4,5]) + self.assertListEqual(numeric_mem.get_spans().tolist(), newfield.get_spans().tolist()) + newfield.data.clear() + newfield.data.write(np.array(sorted(raw_data))) + self.assertTrue(newfield.is_sorted()) + self.assertListEqual(numeric_mem.unique().tolist(), newfield.unique().tolist()) + self.assertEqual(id(numeric_mem), id(numeric_mem.writeable())) + + def test_categorical_mem_field(self): + categorical_memfield = fields.CategoricalMemField(self.s, 'int32', {"a": 1, "b": 2, "c": 3}) + categorical_memfield.data.write(np.array([1])) + self.assertTrue(categorical_memfield.is_sorted()) + categorical_memfield.data.clear() + memfield_data = RAND_STATE.randint(1, 4, 20) + categorical_memfield.data.write(memfield_data) + with self.assertRaises(ValueError): + categorical_memfield.data.write([]) + self.assertFalse(categorical_memfield.is_sorted()) + + newfield = categorical_memfield.create_like(group=None, name=None) + self.assertTrue(isinstance(newfield, fields.CategoricalMemField)) + + newfield.data.write(memfield_data) + self.assertListEqual(categorical_memfield.get_spans().tolist(), newfield.get_spans().tolist()) + newfield.data.clear() + newfield.data.write(np.array(sorted(memfield_data))) + self.assertTrue(newfield.is_sorted()) + + self.assertListEqual(categorical_memfield.unique().tolist(), newfield.unique().tolist()) + self.assertEqual(id(categorical_memfield), id(categorical_memfield.writeable())) + + with self.subTest("Test Categorical remap"): + newfield = categorical_memfield.remap([(1, 4), (2, 5), (3, 6)], {"a": 4, "b": 5, "c": 6}) + self.assertEqual(0, np.sum(np.isin(newfield.data[:], [1,2,3]))) + + with self.assertRaises(ValueError): + categorical_memfield.remap([(1, 4), (2, 5)], {"a": 4, "b": 5, "c": 6}) + + def test_fixed_string_mem_field(self): + memfield = fields.FixedStringMemField(self.s, 3) + memfield.data.write(np.array(['a'])) + self.assertTrue(memfield.is_sorted()) + memfield.data.clear() + memfield_data = np.array(FIXED_STRING_DATA) + memfield.data.write(memfield_data) + with 
self.assertRaises(ValueError): + memfield.data.write([]) + self.assertFalse(memfield.is_sorted()) + + newfield = memfield.create_like(group=None, name=None) + self.assertTrue(isinstance(newfield, fields.FixedStringMemField)) + + newfield.data.write(memfield_data) + self.assertListEqual(memfield.get_spans().tolist(), newfield.get_spans().tolist()) + newfield.data.clear() + newfield.data.write(np.array(sorted(memfield_data))) + self.assertTrue(newfield.is_sorted()) + + self.assertListEqual(memfield.unique().tolist(), newfield.unique().tolist()) + self.assertEqual(id(memfield), id(memfield.writeable())) + + def test_timestamp_mem_field(self): + memfield = fields.TimestampMemField(self.s) + memfield.data.write(np.array([TIMESTAMP_DATA[0]])) + self.assertTrue(memfield.is_sorted()) + memfield.data.clear() + memfield_data = np.array(TIMESTAMP_DATA) + memfield.data.write(memfield_data) + with self.assertRaises(ValueError): + memfield.data.write([]) + self.assertFalse(memfield.is_sorted()) + + newfield = memfield.create_like(group=None, name=None) + self.assertTrue(isinstance(newfield, fields.TimestampMemField)) + + newfield.data.write(memfield_data) + self.assertListEqual(memfield.get_spans().tolist(), newfield.get_spans().tolist()) + newfield.data.clear() + newfield.data.write(np.array(sorted(memfield_data))) + self.assertTrue(newfield.is_sorted()) + + self.assertListEqual(memfield.unique().tolist(), newfield.unique().tolist()) + self.assertEqual(id(memfield), id(memfield.writeable())) + + def test_indexed_string_mem_field(self): + memfield = fields.IndexedStringMemField(self.s) + memfield.data.write(np.array([INDEX_STRING_DATA[0]])) + self.assertTrue(memfield.is_sorted()) + memfield.data.clear() + memfield_data = np.array(INDEX_STRING_DATA) + memfield.data.write(memfield_data) + self.assertFalse(memfield.is_sorted()) + + newfield = memfield.create_like(group=None, name=None) + self.assertTrue(isinstance(newfield, fields.IndexedStringMemField)) + + newfield.data.write(memfield_data) + self.assertListEqual(memfield.get_spans(), newfield.get_spans()) + newfield.data.clear() + newfield.data.write(np.array(sorted(memfield_data))) + self.assertTrue(newfield.is_sorted()) + + self.assertListEqual(memfield.unique().tolist(), newfield.unique().tolist()) + self.assertEqual(id(memfield), id(memfield.writeable())) + class TestIndexedStringFields(unittest.TestCase): @@ -167,9 +532,13 @@ def test_create_indexed_string(self): ds = s.open_dataset(bio, 'w', 'src') df = ds.create_dataframe('src') f = df.create_indexed_string('f') - d = f.data[:] - #print(d) - + f.data.write(INDEX_STRING_DATA) + result = np.array([bytes(i, 'utf-8') for i in INDEX_STRING_DATA]) + self.assertEqual(result.dtype, f.data.dtype) + self.assertListEqual(INDEX_STRING_DATA, f.data[:]) + wtb = f.writeable() + self.assertListEqual(INDEX_STRING_DATA, wtb.data[:]) + self.assertEqual(result.dtype, wtb.data.dtype) def test_filter_indexed_string(self): bio = BytesIO() @@ -257,6 +626,54 @@ def test_clear(self, creator, name, kwargs, data): f.data.clear() self.assertFieldEqual([], f) + def test_indexed_array(self): + f = self.df.create_indexed_string('idxs') + f.data.write(INDEX_STRING_DATA) + self.assertEqual(len(INDEX_STRING_DATA), len(f.data)) + self.assertListEqual(INDEX_STRING_DATA, f.data[:]) + with self.assertRaises(ValueError): + f.data[len(INDEX_STRING_DATA)] + + + def test_readonly_array(self): + f = self.df.create_numeric('num', 'int32') + data = np.array(DEFAULT_FIELD_DATA[1][3], dtype='int32') + f.data.write(data) + f = 
fields.NumericField(self.s, self.df._h5group['num'], self.df) + self.assertEqual(len(data), len(f.data)) + self.assertEqual('int32', f.data.dtype) + self.assertListEqual(data.tolist(), f.data[:].tolist()) + with self.assertRaises(PermissionError): + f.data[:] = data + f.data.clear() + f.data.write(data) + f.data.write_part(data) + f.data.complete() + + def test_readonly_indexed_array(self): + f = self.df.create_indexed_string('idx') + data = ["a", "bb", "eeeee", "ccc", "dddd","", " ",]*2 + f.data.write(data) + f = fields.IndexedStringField(self.s, self.df._h5group['idx'], self.df) + output = np.array([bytes(i, 'utf-8') for i in data]) + self.assertEqual(output.dtype, f.data.dtype) + self.assertEqual(len(data), len(f.data)) + #self.assertEqual(f.data.dtype) + self.assertListEqual(data, f.data[:]) + with self.assertRaises(PermissionError): + f.data[:] = data + with self.assertRaises(PermissionError): + f.data.clear() + with self.assertRaises(PermissionError): + f.data.write(data) + with self.assertRaises(PermissionError): + f.data.write_part(data) + with self.assertRaises(PermissionError): + f.data.complete() + self.assertEqual(data[0], f.data[0]) + with self.assertRaises(AttributeError): + f.data[len(data)] + class TestMemoryFieldCreateLike(unittest.TestCase): @@ -1035,6 +1452,128 @@ def test_timestamp_apply_index(self): mb = b.apply_index(indices) self.assertListEqual(expected, mb.data[:].tolist()) +class TestFieldMemApplySpansCount(SessionTestCase): + def setUp(self): + super(TestFieldMemApplySpansCount, self).setUp() + + @parameterized.expand([(fields.IndexedStringMemField.apply_spans_first, ['a', 'ccc', 'dddd', 'gg']), + (fields.IndexedStringMemField.apply_spans_last, ['bb', 'ccc', 'fff', 'h']), + (fields.IndexedStringMemField.apply_spans_min, ['a', 'ccc', 'dddd', 'gg']), + (fields.IndexedStringMemField.apply_spans_max, ['bb', 'ccc', 'fff', 'h'])]) + def test_indexed_string_mem_field(self, ops, expected): # target is type field + src_data = ['a', 'bb', 'ccc', 'dddd', 'eeee', 'fff', 'gg', 'h'] + f = fields.IndexedStringMemField(self.s) + f.data.write(src_data) + spans = np.array([0, 2, 3, 6, 8], dtype=np.int32) + + output = ops(f, spans, None, False) # output is a mem field + + self.assertListEqual(output.data[:], expected) + dest = fields.IndexedStringMemField(self.s) + output = ops(f, spans, dest, False) + self.assertListEqual(dest.data[:], expected) + output = ops(f, spans, None, True) + self.assertListEqual(f.data[:], expected) + + @parameterized.expand([(fields.FixedStringMemField.apply_spans_first, [b'a1', b'b1', b'c1', b'd1']), + (fields.FixedStringMemField.apply_spans_last, [b'a2', b'b1', b'c3', b'd2']), + (fields.FixedStringMemField.apply_spans_min, [b'a1', b'b1', b'c1', b'd1']), + (fields.FixedStringMemField.apply_spans_max, [b'a2', b'b1', b'c3', b'd2'])]) + def test_fixed_string_mem_field(self,ops, expected): # target is type field + src_data = np.array([b'a1', b'a2', b'b1', b'c1', b'c2', b'c3', b'd1', b'd2']) + f = fields.FixedStringMemField(self.s, 2) + f.data.write(src_data) + spans = np.array([0, 2, 3, 6, 8], dtype=np.int32) + + output = ops(f, spans, None, False) # output is a mem field + + self.assertListEqual(output.data[:].tolist(), expected) + dest = fields.FixedStringMemField(self.s, 2) + output = ops(f, spans, dest, False) + self.assertListEqual(dest.data[:].tolist(), expected) + output = ops(f, spans, None, True) + self.assertListEqual(f.data[:].tolist(), expected) + + @parameterized.expand([(fields.CategoricalMemField.apply_spans_first, [0, 2, 0, 0]), + 
(fields.CategoricalMemField.apply_spans_last, [1, 2, 2, 1]), + (fields.CategoricalMemField.apply_spans_min, [0, 2, 0, 0]), + (fields.CategoricalMemField.apply_spans_max, [1, 2, 2, 1])]) + def test_categorical_mem_field(self, ops, expected): # target is type field + spans = np.array([0, 2, 3, 6, 8], dtype=np.int32) + src_data = np.array([0, 1, 2, 0, 1, 2, 0, 1]) + keys = {b'a': 0, b'b': 1, b'c': 2} + + f = fields.CategoricalMemField(self.s, 'int32', keys) + f.data.write(src_data) + + #no dest + output = ops(f, spans, None, False) # output is a mem field + self.assertListEqual(output.data[:].tolist(), expected) + + #dest + dest = fields.CategoricalMemField(self.s, 'int32', keys) + output = ops(f, spans, dest, False) # output is a mem field + self.assertListEqual(dest.data[:].tolist(), expected) + + #inplace + output = ops(f, spans, None, True) # output is a mem field + self.assertListEqual(f.data[:].tolist(), expected) + + @parameterized.expand([(fields.NumericMemField.apply_spans_first, [1, 11, 21, 31]), + (fields.NumericMemField.apply_spans_last, [2, 11, 23, 32]), + (fields.NumericMemField.apply_spans_min, [1, 11, 21, 31]), + (fields.NumericMemField.apply_spans_max, [2, 11, 23, 32])]) + def test_numeric_mem_field(self, ops, expected): # target is type field + spans = np.array([0, 2, 3, 6, 8], dtype=np.int32) + src_data = np.array([1, 2, 11, 21, 22, 23, 31, 32]) + + f = fields.NumericMemField(self.s, 'int32') + f.data.write(src_data) + + # no dest + output = ops(f, spans, None, False) # output is a mem field + self.assertListEqual(output.data[:].tolist(), expected) + + # dest + dest = fields.NumericMemField(self.s, 'int32') + output = ops(f, spans, dest, False) # output is a mem field + self.assertListEqual(dest.data[:].tolist(), expected) + + # inplace + output = ops(f, spans, None, True) # output is a mem field + self.assertListEqual(f.data[:].tolist(), expected) + + @parameterized.expand([(fields.TimestampMemField.apply_spans_first, [0, 2, 3, 6]), + (fields.TimestampMemField.apply_spans_last, [1, 2, 5, 7]), + (fields.TimestampMemField.apply_spans_min, [0, 2, 3, 7]), + (fields.TimestampMemField.apply_spans_max, [1, 2, 5, 6])]) + def test_timestamp_mem_field(self, ops, expected): # target is type field + from datetime import datetime as D + from datetime import timezone + src_data = [D(2020, 1, 1, tzinfo=timezone.utc), D(2021, 5, 1, tzinfo=timezone.utc), + D(2950, 8, 17, tzinfo=timezone.utc), D(1840, 10, 11, tzinfo=timezone.utc), + D(2021, 1, 1, tzinfo=timezone.utc), D(2022, 5, 18, tzinfo=timezone.utc), + D(2951, 8, 17, tzinfo=timezone.utc), D(1841, 10, 11, tzinfo=timezone.utc)] + src_data = np.asarray([d.timestamp() for d in src_data], dtype=np.float64) + spans = np.array([0, 2, 3, 6, 8], dtype=np.int32) + + f = fields.TimestampMemField(self.s) + f.data.write(src_data) + + # no dest + output = ops(f, spans, None, False) # output is a mem field + self.assertListEqual(output.data[:].tolist(), src_data[expected].tolist()) + + # dest + dest = fields.TimestampMemField(self.s) + output = ops(f, spans, dest, False) # output is a mem field + self.assertListEqual(dest.data[:].tolist(), src_data[expected].tolist()) + + # inplace + output = ops(f, spans, None, True) # output is a mem field + self.assertListEqual(f.data[:].tolist(), src_data[expected].tolist()) + + class TestFieldApplySpansCount(unittest.TestCase): @@ -1372,20 +1911,16 @@ def test_numeric_field_astype(self): src = s.open_dataset(bio, 'w', 'src') df = src.create_dataframe('df') num = df.create_numeric('num', 'int16') - 
num.data.write([1, 2, 3, 4, 5])
-
-        num = num.astype('int32')
-        self.assertEqual(num.data[:].dtype.type, np.int32)
-        num = num.astype('int64')
-        self.assertEqual(num.data[:].dtype.type, np.int64)
-        num = num.astype('float32')
-        self.assertEqual(num.data[:].dtype.type, np.float32)
-        num = num.astype('float64')
-        self.assertEqual(num.data[:].dtype.type, np.float64)
-        with self.assertRaises(Exception) as context:
-            num.astype('int32', casting='safe')
-        self.assertTrue(isinstance(context.exception,TypeError))
+            num.data.write(NUMERIC_DATA)
+            for t in ['int32', 'int64', 'float32', 'float64']:
+                with self.subTest('Convert to '+t):
+                    num = num.astype(t)
+                    self.assertEqual(num.data[:].dtype.name, t)
+            with self.assertRaises(TypeError):
+                num.astype('int32', casting='safe')
+            with self.assertRaises(ValueError):
+                num.astype('str')
 
 
 class TestFieldUnique(unittest.TestCase):
 
@@ -1635,7 +2170,6 @@ def test_module_field_isin(self, dtype, data, isin_data, expected):
         self.assertIsInstance(expected, list)
         np.testing.assert_array_equal(expected, result)
 
-
     @parameterized.expand(INDEX_STR_ISIN_TESTS)
     def test_indexed_string_isin(self, data, isin_data, expected):
         """
             with self.assertRaises(TypeError) as context:
                 f.isin(isin_data)
-            self.assertEqual(str(context.exception), "only list-like or dict-like objects are allowed to be passed to field.isin(), you passed a 'NoneType'")
+            self.assertEqual(str(context.exception),
+                             "only list-like or dict-like objects are allowed to be passed to field.isin(), you passed a 'NoneType'")
 
         else:
@@ -1658,8 +2193,24 @@ def test_indexed_string_isin(self, data, isin_data, expected):
             np.testing.assert_array_equal(expected, result)
 
             with self.subTest("Test with duplicate data"):
-                isin_data = shuffle_randstate(isin_data * 2) # duplicate the search items and shuffle using a fixed seed
+                isin_data = shuffle_randstate(
+                    isin_data * 2)  # duplicate the search items and shuffle using a fixed seed
 
                 # reuse expected data from previous subtest
                 result = f.isin(isin_data)
                 self.assertIsInstance(result, np.ndarray)
-                np.testing.assert_array_equal(expected, result)
\ No newline at end of file
+                np.testing.assert_array_equal(expected, result)
+
+
+class TestFieldModuleFunctions(SessionTestCase):
+
+    @parameterized.expand(DEFAULT_FIELD_DATA)
+    def test_argsort(self, creator, name, kwargs, data):
+        """
+        Tests fields.argsort against numpy.argsort for every field type; supported numeric formats are compared directly, and unsupported formats are expected to raise ValueError.
+ """ + f = self.setup_field(self.df, creator, name, (), kwargs, data) + if 'nformat' in kwargs and kwargs['nformat'] in ['int32', 'int64', 'uint32']: + self.assertListEqual(np.argsort(f.data[:]).tolist(), fields.argsort(f, dtype=kwargs['nformat']).data[:].tolist()) + else: + with self.assertRaises(ValueError): + fields.argsort(f) diff --git a/tests/test_operations.py b/tests/test_operations.py index fd61fe62..b65a22cf 100644 --- a/tests/test_operations.py +++ b/tests/test_operations.py @@ -1,14 +1,15 @@ import unittest +from io import BytesIO import numpy as np -from io import BytesIO +import h5py +from parameterized import parameterized from exetera.core import session from exetera.core import fields from exetera.core import operations as ops from exetera.core import utils - -from .utils import slow_test +from .utils import slow_test, SessionTestCase, DEFAULT_FIELD_DATA class TestOpsUtils(unittest.TestCase): @@ -776,6 +777,14 @@ def test_ordered_inner_map_both_unique_streaming_right_final_2(self): self.assertListEqual(l_result.data[:].tolist(), l_expected) self.assertListEqual(r_result.data[:].tolist(), r_expected) + # left map + # =========================== + def test_ordered_left_map_result_size(self): + a_ids = np.asarray([1, 1, 2, 2, 3, 5, 5, 5, 6, 8], dtype=np.int64) + b_ids = np.asarray([1, 1, 2, 3, 5, 5, 6, 7, 8, 8, 8], dtype=np.int64) + result_size = ops.ordered_left_map_result_size(a_ids, b_ids) + self.assertEqual(4, result_size) + # old inner / outer map functionality # =========================== @@ -1218,7 +1227,7 @@ def test_get_spans_two_field(self): spans1=ops.get_spans_for_field(np.array([1, 2, 2, 3, 3, 3, 4, 4, 5, 6, 7, 8, 9, 10])) spans2=ops.get_spans_for_field(np.array([1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5])) - spans3= ops._get_spans_for_2_fields_by_spans(spans1,spans2) + spans3= ops._get_spans_for_2_fields_by_spans(spans1, spans2) self.assertTrue(list(spans), list(spans3)) @slow_test @@ -1365,3 +1374,22 @@ def test_transform_to_values(self): byte_data_1 = [x.tobytes() for x in data_1] expected_byte_data_1 = [b'Yes', b'No', b'No', b'Yes'] self.assertEqual(byte_data_1, expected_byte_data_1) + + +class TestDataIterator(SessionTestCase): + + @parameterized.expand(DEFAULT_FIELD_DATA) + def test_data_iterator(self, creator, name, kwargs, data): + f = self.setup_field(self.df, creator, name, (), kwargs, data) + output = [i for i in ops.data_iterator(f)] + result = f.data[:] if isinstance(f, fields.IndexedStringField) else f.data[:].tolist() + self.assertListEqual(output, result) + +class TestStr_to_dtype(SessionTestCase): + + def test_str_to_dtype(self): + for i in ['bool', 'int8', 'int16', 'int32', 'uint8', 'uint16', 'uint32', 'uint64', 'float32', 'float64']: + with self.subTest(i): + self.assertEqual(np.dtype(i), ops.str_to_dtype(i)) + with self.assertRaises(ValueError): + ops.str_to_dtype('str') diff --git a/tests/test_session.py b/tests/test_session.py index 7b157b35..0001b33d 100644 --- a/tests/test_session.py +++ b/tests/test_session.py @@ -1,15 +1,15 @@ import unittest - -import numpy as np from io import BytesIO import h5py +import numpy as np +from parameterized import parameterized from exetera.core import session from exetera.core import fields from exetera.core import dataframe from exetera.core import utils from exetera.io import field_importers as fi - +from .utils import DEFAULT_FIELD_DATA, SessionTestCase, NUMERIC_DATA class TestCreateThenLoadBetweenSessionsOld(unittest.TestCase): @@ -17,10 +17,16 @@ def test_create_then_load_indexed_string(self): bio = 
         contents = ['a', 'bb', 'ccc', 'dddd', 'eeeee']
         with session.Session() as s:
-            with h5py.File(bio, 'w') as src:
-                df = src.create_group('df')
-                f = s.create_indexed_string(df, 'foo')
-                f.data.write(contents)
+            src = s.open_dataset(bio, 'w', 'src')
+            df = src.create_group('df')
+            f = s.create_indexed_string(df, 'foo')
+            f.data.write(contents)
+
+            with self.assertRaises(ValueError):
+                s.create_indexed_string(src, 'bar')
+
+            with self.assertRaises(ValueError):
+                s.create_indexed_string("abc", 'bar')
 
         with session.Session() as s:
             with h5py.File(bio, 'r') as src:
@@ -31,10 +37,17 @@ def test_create_then_load_fixed_string(self):
         bio = BytesIO()
         contents = [s.encode() for s in ['a', 'bb', 'ccc', 'dddd', 'eeeee']]
         with session.Session() as s:
-            with h5py.File(bio, 'w') as src:
-                df = src.create_group('df')
-                f = s.create_fixed_string(df, 'foo', 5)
-                f.data.write(contents)
+            src = s.open_dataset(bio, 'w', 'src')
+            df = src.create_group('df')
+            f = s.create_fixed_string(df, 'foo', 5)
+            f.data.write(contents)
+
+            with self.assertRaises(ValueError):
+                s.create_fixed_string(src, 'bar', 3)
+
+            with self.assertRaises(ValueError):
+                s.create_fixed_string("abc", 'bar', 3)
+
 
         with session.Session() as s:
             with h5py.File(bio, 'r') as src:
@@ -45,10 +58,18 @@ def test_create_then_load_categorical(self):
         bio = BytesIO()
         contents = [1, 2, 1, 2]
         with session.Session() as s:
-            with h5py.File(bio, 'w') as src:
-                df = src.create_group('df')
-                f = s.create_categorical(df, 'foo', 'int8', {b'a': 1, b'b': 2})
-                f.data.write(np.array(contents))
+            src = s.open_dataset(bio, 'w', 'src')
+
+            df = src.create_group('df')
+            f = s.create_categorical(df, 'foo', 'int8', {b'a': 1, b'b': 2})
+            f.data.write(np.array(contents))
+
+            with self.assertRaises(ValueError):
+                s.create_categorical(src, 'bar', 'int32', {})
+
+            with self.assertRaises(ValueError):
+                s.create_categorical("abc", 'bar', 'int32', {})
+
 
         with session.Session() as s:
             with h5py.File(bio, 'r') as src:
@@ -60,10 +81,17 @@ def test_create_then_load_numeric(self):
         bio = BytesIO()
         contents = [1, 2, 1, 2]
         with session.Session() as s:
-            with h5py.File(bio, 'w') as src:
-                df = src.create_group('df')
-                f = s.create_numeric(df, 'foo', 'int8')
-                f.data.write(np.array(contents))
+            src = s.open_dataset(bio, 'w', 'src')
+            df = src.create_group('df')
+            f = s.create_numeric(df, 'foo', 'int8')
+            f.data.write(np.array(contents))
+
+            with self.assertRaises(ValueError):
+                s.create_numeric(src, 'bar', 'int32')
+
+            with self.assertRaises(ValueError):
+                s.create_numeric("abc", 'bar', 'int32')
+
 
         with session.Session() as s:
             with h5py.File(bio, 'r') as src:
@@ -78,10 +106,18 @@ def test_create_then_load_timestamp(self):
         contents = [c.timestamp() for c in contents]
 
         with session.Session() as s:
-            with h5py.File(bio, 'w') as src:
-                df = src.create_group('df')
-                f = s.create_timestamp(df, 'foo')
-                f.data.write(np.array(contents))
+            src = s.open_dataset(bio, 'w', 'src')
+            df = src.create_group('df')
+            f = s.create_timestamp(df, 'foo')
+            f.data.write(np.array(contents))
+
+            with self.assertRaises(ValueError):
+                s.create_timestamp(src, 'bar')
+
+            with self.assertRaises(ValueError):
+                s.create_timestamp("abc", 'bar')
+
+
 
         with session.Session() as s:
             with h5py.File(bio, 'r') as src:
@@ -100,6 +136,7 @@ def test_create_then_load_categorical(self):
 
         with session.Session() as s:
             src = s.open_dataset(bio, 'r', 'src')
+            self.assertTupleEqual(s.list_datasets(), ('src',))
             f = s.get(src['df']['foo'])
             self.assertDictEqual({1: b'a', 2: b'b'}, f.keys)
 
@@ -112,13 +149,50 @@ def test_create_new_then_load(self):
         with session.Session() as s:
             src = s.open_dataset(bio1, 'r', 'src')
+            self.assertTupleEqual(s.list_datasets(), ('src',))
             df = src['df']
             f = df['foo']
             self.assertIsNotNone(f)
             self.assertEqual('foo', f.name)
-            f2 = s.get(df['foo'])
+            with self.assertRaises(ValueError):
+                s.open_dataset(bio1, 'r', 'src')
+
+            with self.assertRaises(ValueError):
+                session.Session(timestamp=123)
+
+    def test_set_time(self):
+        from datetime import datetime
+        with session.Session() as s:
+            with self.assertRaises(ValueError):
+                s.set_timestamp(123)
+            ts = datetime.now().timestamp()
+            s.set_timestamp(str(ts))
+            self.assertEqual(str(ts), s.timestamp)
+
+    def test_session_get(self):
+        bio = BytesIO()
+        with session.Session() as s:
+            src = s.open_dataset(bio, 'w', 'src')
+            df = src.create_dataframe('df')
+            with self.assertRaises(AttributeError):
+                s.get('abc')
+            num = df.create_numeric('num', 'int32')
+            num.data.write([1, 2, 3, 4])
+            num = s.get(df._h5group['num'])
+            self.assertListEqual([1, 2, 3, 4], num.data[:].tolist())
+
+            with self.assertRaises(ValueError):
+                s.create_like("abc", df, 'num2')
+
+            s.create_like(df._h5group['num'], df, 'num2')
+            self.assertTrue(isinstance(df['num2'], fields.NumericField))
+
+
+
+
 
 
 class TestSessionMerge(unittest.TestCase):
@@ -660,16 +734,25 @@ def test_sort_on(self):
         self.assertListEqual([10, 30, 50, 40, 20], val_f.data[:].tolist())
         self.assertListEqual(['a', 'bbb', 'ccccc', 'dddd', 'ee'], val2_f.data[:])
 
+        df2 = dst.create_dataframe('ds2')
+        s.sort_on(src, df2, ("idx",), verbose=False)
+        self.assertListEqual([b'a', b'b', b'c', b'd', b'e'], df2['idx'].data[:].tolist())
+        self.assertListEqual([10, 30, 50, 40, 20], df2['val'].data[:].tolist())
+        self.assertListEqual(['a', 'bbb', 'ccccc', 'dddd', 'ee'], df2['val2'].data[:])
+
 
-class TestSessionFilter(unittest.TestCase):
+class TestSessionFilter(SessionTestCase):
 
     def test_apply_filter(self):
-        s = session.Session(10)
         vx = np.asarray([1, 2, 3, 4, 5, 6, 7, 8])
         filt = np.asarray([True, True, False, False, True, False, True, False])
-        result = s.apply_filter(filt, vx)
+        result = self.s.apply_filter(filt, vx)
+        self.assertListEqual([1, 2, 5, 7], result.tolist())
+
+        self.df.create_numeric('num', 'int32').data.write(vx)
+        result = self.s.apply_filter(filt, self.df['num'])
         self.assertListEqual([1, 2, 5, 7], result.tolist())
@@ -799,6 +882,10 @@ def test_apply_spans_min(self):
         s.apply_spans_min(spans, s.get(ds['vals']), dest=s.create_numeric(ds, 'result2', 'int64'))
         self.assertListEqual([0, 2, 4, 1], s.get(ds['result2']).data[:].tolist())
 
+        vals = np.asarray([0, 8, 2, 6, 4, 5, 3, 7, 1], dtype=np.int64)  # wrong length
+        with self.assertRaises(ValueError):
+            s.apply_spans_min(spans, vals)
+
     def test_apply_spans_max(self):
         idx = np.asarray([0, 1, 1, 2, 2, 2, 3, 3, 3, 3], dtype=np.int32)
         vals = np.asarray([0, 8, 2, 6, 4, 5, 3, 7, 1, 9], dtype=np.int64)
@@ -900,6 +987,12 @@ def test_apply_spans_concat(self):
         self.assertListEqual([0, 1, 4, 9, 16], s.get(ds['result']).indices[:].tolist())
         self.assertListEqual(['a', 'b,a', 'b,a,b', 'a,b,a,b'], s.get(ds['result']).data[:])
 
+        with self.assertRaises(ValueError):
+            s.apply_spans_concat(spans, s.create_numeric(ds, 'foo3', 'int32'), dest=s.create_indexed_string(ds, 'foo'))
+        with self.assertRaises(ValueError):
+            s.apply_spans_concat(spans, s.get(ds['vals']), dest=s.create_numeric(ds, 'foo2', 'int32'))
+
+
     def test_apply_spans_concat_2(self):
         idx = np.asarray([0, 0, 1, 2, 2, 3, 4, 4, 4, 4], dtype=np.int32)
         vals = ['a', 'b,c', 'd', 'e,f', 'g', 'h,i', 'j', 'k,l', 'm', 'n,o']
@@ -1009,6 +1102,10 @@ def test_aggregate_first(self):
         s.aggregate_first(idx, s.get(ds['vals']), dest=s.create_numeric(ds, 'result2', 'int64'))
         self.assertListEqual([0, 8, 6, 3], s.get(ds['result2']).data[:].tolist())
 
+        with self.assertRaises(ValueError):
+            s.aggregate_first(idx, None, None)
+
+
     def test_aggregate_last(self):
         idx = np.asarray([0, 1, 1, 2, 2, 2, 3, 3, 3, 3], dtype=np.int32)
         vals = np.asarray([0, 8, 2, 6, 4, 5, 3, 7, 1, 9], dtype=np.int64)
@@ -1239,3 +1336,58 @@ def test_date_importer(self):
         expected_date_list = ['2020-05-10', '2020-05-12', '2020-05-12', '2020-05-15']
         self.assertListEqual(hf['foo'].data[:].tolist(),
                              [datetime.strptime(x, "%Y-%m-%d").replace(tzinfo=timezone.utc).timestamp() for x in expected_date_list])
+
+
+class TestSessionCreate_like(SessionTestCase):
+
+    @parameterized.expand(DEFAULT_FIELD_DATA)
+    def test_create_like(self, creator, name, kwargs, data):
+        """
+        Tests create_like for every field type, after checking that the field's contents match what was written.
+        """
+        f = self.setup_field(self.df, creator, name, (), kwargs, data)
+
+        # Convert numeric fields and use NumPy's conversion as an oracle to test overflown values in field. If a value
+        # overflows when stored in a field then the field's contents will obviously vary compared to `data`, so change
+        # data to match by using NumPy to handle overflow for us.
+        if "nformat" in kwargs:
+            data = np.asarray(data, dtype=kwargs["nformat"])
+
+        self.assertFieldEqual(data, f)
+
+        nf = self.s.create_like(f, self.df, name + '_')
+        self.assertTrue(isinstance(nf, type(f)))
+
+
+class TestSessionGetSharedIndex(SessionTestCase):
+    def test_get_shared_index(self):
+        key_1 = np.array(['a', 'b', 'e', 'g', 'i'])
+        key_2 = np.array(['b', 'b', 'c', 'c', 'e', 'g', 'j'])
+        key_3 = np.array(['a', 'cd', 'e', 'g', 'h', 'h', 'i'])
+        with self.assertRaises(ValueError):
+            self.s.get_shared_index([key_1, key_2, key_3])
+        result = self.s.get_shared_index((key_1, key_2, key_3))
+        self.assertEqual(3, len(result))
+        self.assertListEqual([0, 1, 4, 5, 7], result[0].tolist())
+        self.assertListEqual([1, 1, 2, 2, 4, 5, 8], result[1].tolist())
+        self.assertListEqual([0, 3, 4, 5, 6, 6, 7], result[2].tolist())
+
+
+class TestSessionDistinct(SessionTestCase):
+    def test_session_distinct(self):
+        num1 = self.df.create_numeric('num1', 'int32')
+        num1.data.write(np.array(NUMERIC_DATA))
+        num2 = self.df.create_numeric('num2', 'int32')
+        num2.data.write(np.array(NUMERIC_DATA))
+        with self.assertRaises(ValueError):
+            self.s.distinct(field=None, fields=None)
+        with self.assertRaises(ValueError):
+            self.s.distinct(num1, (num1, num2))
+        output = self.s.distinct(field=num1.data[:]).tolist()
+        result = np.unique(num1.data[:]).tolist()
+        self.assertListEqual(result, output)
+        output = self.s.distinct(fields=(num1.data[:], num2.data[:]))
+        self.assertEqual(2, len(output))
+        self.assertListEqual(output[0].tolist(), result)
+        self.assertListEqual(output[1].tolist(), result)
+
diff --git a/tests/test_utils.py b/tests/test_utils.py
index e54ba83d..b951b8e2 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -11,7 +11,12 @@
 
 import unittest
 
+import numpy as np
+
 from exetera.core.utils import find_longest_sequence_of, get_min_max
+from exetera.core import utils
+from exetera.core.utils import validate_file_exists
+from .utils import HARD_INTS, HARD_FLOATS
 
 
 class TestUtils(unittest.TestCase):
@@ -59,3 +64,35 @@ def test_get_min_max_for_permitted_types(self):
             self.assertEqual(min_value, expected_min_max_values[value_type][0])
             self.assertEqual(max_value, expected_min_max_values[value_type][1])
+
+    def test_validate_file_exists(self):
+        import os
+        with self.assertRaises(FileExistsError):
+            validate_file_exists('./tempfile')
+        os.mkdir('./tempfile')
+        with self.assertRaises(FileNotFoundError):
+            validate_file_exists('./tempfile')
+        os.rmdir('./tempfile')
+
+    def test_count_flag(self):
+        flag = np.array([i % 2 == 0 for i in range(100)])
+        output = utils.count_flag_empty(flag)
+        self.assertEqual(np.sum(flag == False), output)
+        output = utils.count_flag_set(flag, True)
+        self.assertEqual(np.sum(flag == True), output)
+        output = utils.count_flag_not_set(flag, True)
+        self.assertEqual(np.sum(flag != True), output)
+
+    def test_string_to_datetime(self):
+        from datetime import datetime
+        ts_s = '2021-11-22 11:22:33.000-0500'
+        ts = utils.string_to_datetime(ts_s)
+        self.assertEqual(datetime.strptime(ts_s, '%Y-%m-%d %H:%M:%S.%f%z'), ts)
+        with self.assertRaises(ValueError):
+            utils.string_to_datetime("foo-boo")
+
+    def test_build_histogram(self):
+        data = np.random.randint(0, 50, size=1000)
+        output = utils.build_histogram(data)
+        a, b = np.unique(data, return_counts=True)
+        result = [(a[i], b[i]) for i in range(len(a))]
+        self.assertListEqual(sorted(result), sorted(output))
diff --git a/tests/utils.py b/tests/utils.py
index 18e39cec..7c6864b1 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -72,46 +72,40 @@ def shuffle_randstate(arr: ArrayLike, seed=DEFAULT_SEED) -> ArrayLike:
 
 # default field initialization values for every field type, format is:
 # (creator method, field name, args for method, kwargs for method, data)
+NUMERIC_DATA = list(range(-10, 10)) + HARD_INTS
+TIMESTAMP_DATA = [
+    utc_timestamp(2020, 1, 1),
+    utc_timestamp(2021, 5, 18),
+    utc_timestamp(2950, 8, 17),
+    utc_timestamp(1840, 10, 11),
+    utc_timestamp(2110, 11, 1),
+    utc_timestamp(2002, 3, 3),
+    utc_timestamp(1963, 6, 7),
+    utc_timestamp(2018, 2, 28),
+    utc_timestamp(2400, 9, 1),
+    utc_timestamp(1, 1, 1),
+]
+FIXED_STRING_DATA = [b"aaa", b"bbb", b"eee", b"ccc", b"ddd", b"  "] * 2
+INDEX_STRING_DATA = ["a", "bb", "eeeee", "ccc", "dddd", "", " "] * 2
 
 DEFAULT_FIELD_DATA = [
-    ("create_numeric", "f_i8", {"nformat": "int8"}, shuffle_randstate(list(range(-10, 10)) + HARD_INTS)),
-    (
-        "create_numeric",
-        "f_i32",
-        {"nformat": "int32"},
-        shuffle_randstate(list(range(-10, 10)) + HARD_INTS),
-    ),
-    (
-        "create_numeric",
-        "f_i64",
-        {"nformat": "int64"},
-        shuffle_randstate(list(range(-10, 10)) + HARD_INTS),
-    ),
-    ("create_numeric", "f_f32", {"nformat": "float32"}, shuffle_randstate(list(range(-10, 10)) + HARD_FLOATS)),
-    ("create_numeric", "f_f64", {"nformat": "float64"}, shuffle_randstate(list(range(-10, 10)) + HARD_FLOATS)),
+    ("create_numeric", "f_i8", {"nformat": "int8"}, shuffle_randstate(NUMERIC_DATA)),
+    ("create_numeric", "f_i32", {"nformat": "int32"}, shuffle_randstate(NUMERIC_DATA)),
+    ("create_numeric", "f_i64", {"nformat": "int64"}, shuffle_randstate(NUMERIC_DATA)),
+    ("create_numeric", "f_f32", {"nformat": "float32"}, shuffle_randstate(NUMERIC_DATA)),
+    ("create_numeric", "f_f64", {"nformat": "float64"}, shuffle_randstate(NUMERIC_DATA)),
     (
         "create_categorical",
         "f_cat123",
         {"nformat": "int8", "key": {"a": 1, "b": 2, "c": 3}},
         RAND_STATE.randint(1, 4, 20).tolist(),
     ),
-    ("create_indexed_string", "f_istr", {}, ["a", "bb", "eeeee", "ccc", "dddd","", " ",]*2),
-    ("create_fixed_string", "f_fstr", {"length": 3}, [b"aaa", b"bbb", b"eee", b"ccc", b"ddd", b"  "]*2),
+    ("create_indexed_string", "f_istr", {}, INDEX_STRING_DATA),
+    ("create_fixed_string", "f_fstr", {"length": 3}, FIXED_STRING_DATA),
     (
         "create_timestamp",
         "f_ts",
         {},
-        [
-            utc_timestamp(2020, 1, 1),
-            utc_timestamp(2021, 5, 18),
-            utc_timestamp(2950, 8, 17),
-            utc_timestamp(1840, 10, 11),
-            utc_timestamp(2110, 11, 1),
-            utc_timestamp(2002, 3, 3),
-            utc_timestamp(1963, 6, 7),
-            utc_timestamp(2018, 2, 28),
-            utc_timestamp(2400, 9, 1),
-            utc_timestamp(1, 1, 1),
-        ],
+        TIMESTAMP_DATA,
     ),
 ]
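
Each DEFAULT_FIELD_DATA tuple above is (creator method, field name, kwargs, data); the "args" element named in the comment is supplied separately as an empty tuple by the tests' setup_field helper, as seen in the test_fields, test_operations and test_session changes in this patch. The self-contained sketch below shows how one such tuple is exercised against the public ExeTera API already used in these tests; the function name exercise_field and the dataset/dataframe names are illustrative assumptions, not code from this change.

    # Hypothetical sketch: dispatch one DEFAULT_FIELD_DATA tuple to the named
    # DataFrame creator method, write the data, then read it back.
    from io import BytesIO

    from exetera.core import session


    def exercise_field(creator, name, kwargs, data):
        with session.Session() as s:
            ds = s.open_dataset(BytesIO(), 'w', 'ds')   # in-memory dataset, as in the tests
            df = ds.create_dataframe('df')
            f = getattr(df, creator)(name, **kwargs)    # e.g. df.create_numeric('f_i8', nformat='int8')
            f.data.write(data)
            return f.data[:]


    # Mirrors the f_i8 entry above (without the shuffle); prints the stored values.
    print(exercise_field('create_numeric', 'f_example', {'nformat': 'int8'}, list(range(-10, 10))))

Packing the creator name, its keyword arguments, and representative data into one tuple is what lets the parameterized tests in this patch treat every field type uniformly through a single @parameterized.expand(DEFAULT_FIELD_DATA) decorator.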