Skip to content

Commit

Permalink
isin unittest with parameterized (#285)
Browse files Browse the repository at this point in the history
* isin unittest with parameterized

* raise TypeError when test_element is None
  • Loading branch information
Liyuan-Chen-1024 authored Apr 13, 2022
1 parent 9258c77 commit 24cc402
Show file tree
Hide file tree
Showing 2 changed files with 127 additions and 68 deletions.
12 changes: 8 additions & 4 deletions exetera/core/operations.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from datetime import datetime
from typing import Optional, Union
from typing import Optional, Type, Union

import numpy as np
import math
Expand Down Expand Up @@ -3068,8 +3068,12 @@ def indexed_string_unique(indices, values, unique_result, unique_index, unique_i


def isin_for_indexed_string_field(test_elements, indices, values):
if isinstance(test_elements, set):
test_elements = list(test_elements)
if test_elements is None:
raise TypeError("only list-like or dict-like objects are allowed to be passed to field.isin(), you passed a 'NoneType'")

test_elements = [x for x in test_elements if x is not None]
if len(test_elements) == 0:
return np.asarray([False] * (len(indices) - 1), dtype='bool')

# sort first
test_elements = np.sort(test_elements)
Expand All @@ -3080,7 +3084,7 @@ def isin_for_indexed_string_field(test_elements, indices, values):

@exetera_njit
def isin_indexed_string_speedup(test_elements, indices, values):
result = [False] * (len(indices) - 1)
result = np.asarray([False] * (len(indices) - 1), dtype='bool')
len_test_eles = len(test_elements)
for i in range(len(indices)-1):
v = values[indices[i] : indices[i+1]]
Expand Down
183 changes: 119 additions & 64 deletions tests/test_fields.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from exetera.core import persistence as per
from exetera.io import field_importers as fi
from exetera.core import utils
import itertools

NUMERIC_ONLY = [d for d in DEFAULT_FIELD_DATA if d[0] == "create_numeric"]

Expand Down Expand Up @@ -1530,82 +1531,136 @@ def test_unique_timestamp_field_with_input_out_of_order(self):

self.assertEqual(df['ts'].unique().tolist(), [ts1, ts2])

class TestFieldIsIn(unittest.TestCase):
def test_module_field_isin(self):
bio = BytesIO()
with session.Session() as s:
src = s.open_dataset(bio, 'w', 'src')
df = src.create_dataframe('df')
f = df.create_numeric('f', 'int16')
f.data.write([1, 2, 3, 4, 5])

r1 = fields.isin(f, [1,2,3]) # test_element is list
self.assertIsInstance(r1, fields.NumericMemField)
self.assertEqual(r1.data[:].tolist(), [True, True, True, False, False])

r2 = fields.isin(f, 3) # single test_element
self.assertIsInstance(r2, fields.NumericMemField)
self.assertEqual(r2.data[:].tolist(), [False, False, True, False, False])
REALLY_LARGE_LIST = list(range(1_000_000))
NUMERIC_ISIN_TESTS = [
("int16", [1, 2, 3, 4, 5], [], None),
("int16", [1, 2, 3, 4, 5], [6, 7], None),
("int16", [1, 2, 3, 4, 5], [1, 2, 3], None),
("int16", [1, 2, 3, 4, 5], [1, 2, 3, 6, 7], None),
("int16", [1, 2, 3, 4, 5], [4, 1, 3], None),
("int16", [3, 1, 5, 4, 2], [4, 1, 3], None),
("int16", [1, 2, 3, 4, 5], 3, None),
("int16", [3, 1, 5, 4, 2], 4, None),
# really large inputs can take a long time to run through the oracle, so their expected results are given explicitly
("int32", REALLY_LARGE_LIST, REALLY_LARGE_LIST, [True] * len(REALLY_LARGE_LIST)),
("int32", REALLY_LARGE_LIST, shuffle_randstate(REALLY_LARGE_LIST), [True] * len(REALLY_LARGE_LIST)),
]

# test data for indexed string fields; the conditions these cases define are already covered to a degree by DEFAULT_FIELD_DATA
INDEX_STR_DATA = ["a", "", "apple", "app", "APPLE", "APP", "aaaa", "app/", "apple12", "ip"]
INDEX_STR_ISIN_TESTS = [
(INDEX_STR_DATA,[],None),
(INDEX_STR_DATA,None,None),
(INDEX_STR_DATA,[None],None),
(INDEX_STR_DATA, ["None"], None),
(INDEX_STR_DATA, ["a", "APPLE"], None),
(INDEX_STR_DATA, ["app", "APP"], None),
(INDEX_STR_DATA, ["app/", "app//"], None),
(INDEX_STR_DATA, ["apple12", "APPLE12", "apple13"], None),
(INDEX_STR_DATA, ["ip", "ipd", "id"], None),
(INDEX_STR_DATA, [""], None),
(INDEX_STR_DATA, INDEX_STR_DATA, [True] * len(INDEX_STR_DATA)),
]


def isin_oracle(data, isin_values, expected=None):
    """
    Generate the list of boolean membership values that `f.isin(isin_values)` should return when `f` contains `data`.

    :param data: sequence of values held by the field under test
    :param isin_values: the values to look for, either a collection (list, tuple, set or frozenset) or a single
        value that is compared against every element of `data`
    :param expected: optional pre-defined result; when not None it is returned unchanged, so that selected test
        cases can avoid a long-running call to this function
    :return: `expected` if it was given, otherwise a list holding one bool per element of `data`
    """
    if expected is not None:
        return expected

    # shortcut: when the search values are the data itself, skip the element-by-element scan below
    if isinstance(isin_values, type(data)) and isin_values == data:
        return [True] * len(data)

    # sets are collections of search values too, they must not fall through to the scalar comparison
    if isinstance(isin_values, (tuple, list, set, frozenset)):
        return [d in isin_values for d in data]

    # anything else is treated as a single search value
    return [d == isin_values for d in data]


class TestFieldIsIn(SessionTestCase):
@parameterized.expand(DEFAULT_FIELD_DATA)
def test_isin_default_fields(self, creator, name, kwargs, data):
"""
Tests `isin` for the default fields by checking with an empty input list and lists containing every value
and every pair of values from the field data.
"""
f = self.setup_field(self.df, creator, name, (), kwargs, data)
if "nformat" in kwargs:
data = np.asarray(data, dtype=kwargs["nformat"])

def test_isin_on_numeric_field(self):
bio = BytesIO()
with session.Session() as s:
src = s.open_dataset(bio, 'w', 'src')
df = src.create_dataframe('df')
df.create_numeric('f', 'int16').data.write([1, 2, 3, 4, 5])
with self.subTest("Test empty isin parameter"):
expected = [False] * len(data)
result = f.isin([])
np.testing.assert_array_equal(expected, result)

# test_element is list
self.assertEqual(df['f'].isin([1,2,3]).tolist(), [True, True, True, False, False])
# single test_element
self.assertEqual(df['f'].isin(3).tolist(), [False, False, True, False, False])
self.assertEqual(df['f'].isin(8).tolist(), [False, False, False, False, False])
with self.subTest("Test 1 isin values"):
for idx in range(len(data)):
isin_data=[data[idx]]
expected = isin_oracle(data, isin_data)
result = f.isin(isin_data)
np.testing.assert_array_equal(expected, result)

with self.subTest("Test 2 isin values"):
for idx1,idx2 in itertools.product(range(len(data)),repeat=2):
isin_data=[data[idx1],data[idx2]]
expected = isin_oracle(data, isin_data)
result = f.isin(isin_data)
np.testing.assert_array_equal(expected, result)

def test_isin_on_indexed_string_field_with_testelements_all_unique(self):
bio = BytesIO()
with session.Session() as s:
src = s.open_dataset(bio, 'w', 'src')
df = src.create_dataframe('df')
df.create_indexed_string('foo').data.write(['a', '', 'apple','app', 'APPLE', 'APP', 'app/', 'apple12', 'ip'])

self.assertEqual(df['foo'].isin(['APPLE', '']), [False, True, False, False, True, False, False, False, False])
self.assertEqual(df['foo'].isin(['app','APP']), [False, False, False, True, False, True, False, False, False])
self.assertEqual(df['foo'].isin(['app/','app//']), [False, False, False, False, False, False, True, False, False])
self.assertEqual(df['foo'].isin(['apple12','APPLE12', 'apple13']), [False, False, False, False, False, False, False, True, False])
self.assertEqual(df['foo'].isin(['ip','ipd']), [False, False, False, False, False, False, False, False, True])
@parameterized.expand(NUMERIC_ISIN_TESTS)
def test_module_field_isin(self, dtype, data, isin_data, expected):
"""
Test `isin` for the numeric fields using `fields.isin` function and the object's method.
"""
f = self.setup_field(self.df, "create_numeric", "f", (dtype,), {}, data)

with self.subTest("Test module function"):
result = fields.isin(f, isin_data)
expected = isin_oracle(data, isin_data, expected)

def test_isin_on_indexed_string_field_with_duplicate_in_testelements(self):
bio = BytesIO()
with session.Session() as s:
src = s.open_dataset(bio, 'w', 'src')
df = src.create_dataframe('df')
df.create_indexed_string('foo').data.write(['a', '', 'apple','app', 'APPLE', 'APP', 'app/', 'apple12', 'ip'])
self.assertIsInstance(result, fields.NumericMemField)
self.assertFieldEqual(expected, result)

self.assertEqual(df['foo'].isin(['APPLE', '', '', 'APPLE']), [False, True, False, False, True, False, False, False, False])
self.assertEqual(df['foo'].isin(['app','APP', 'app', 'APP']), [False, False, False, True, False, True, False, False, False])
self.assertEqual(df['foo'].isin(['app/','app//', 'app//']), [False, False, False, False, False, False, True, False, False])
self.assertEqual(df['foo'].isin(['APPLE12', 'apple12', 'apple12', 'APPLE12', 'apple13']), [False, False, False, False, False, False, False, True, False])
self.assertEqual(df['foo'].isin(['ip','ipd', 'id']), [False, False, False, False, False, False, False, False, True])
with self.subTest("Test field method"):
result = f.isin(isin_data)
expected = isin_oracle(data, isin_data, expected)

self.assertIsInstance(result, np.ndarray)
self.assertIsInstance(expected, list)
np.testing.assert_array_equal(expected, result)

def test_isin_on_fixed_string_field(self):
bio = BytesIO()
with session.Session() as s:
src = s.open_dataset(bio, 'w', 'src')
df = src.create_dataframe('df')
df.create_fixed_string('foo', 2).data.write([b'aa', b'bb', b'cc'])

self.assertEqual(df['foo'].isin([b'aa', b'zz']).tolist(), [True, False, False])
@parameterized.expand(INDEX_STR_ISIN_TESTS)
def test_indexed_string_isin(self, data, isin_data, expected):
"""
Test `isin` for the indexed string fields using the object's method.
"""
f = self.setup_field(self.df, "create_indexed_string", "f", (), {}, data)

def test_isin_on_timestamp_field(self):
bio = BytesIO()
with session.Session() as s:
src = s.open_dataset(bio, 'w', 'src')
df = src.create_dataframe('df')
if isin_data is None:
with self.assertRaises(TypeError) as context:
f.isin(isin_data)

ts1 = datetime(2021, 12, 1).timestamp()
ts2 = datetime(2022, 1, 1).timestamp()
ts3 = datetime(2022, 2, 1).timestamp()
df.create_timestamp('ts').data.write([ts2, ts3, ts1])
self.assertEqual(df['ts'].isin({ts1, ts2}).tolist(), [True, False, True])
self.assertEqual(str(context.exception), "only list-like or dict-like objects are allowed to be passed to field.isin(), you passed a 'NoneType'")

else:

with self.subTest("Test with given data"):
expected = isin_oracle(data, isin_data, expected)
result = f.isin(isin_data)
self.assertIsInstance(result, np.ndarray)
np.testing.assert_array_equal(expected, result)

with self.subTest("Test with duplicate data"):
isin_data = shuffle_randstate(isin_data * 2) # duplicate the search items and shuffle using a fixed seed
# reuse expected data from previous subtest
result = f.isin(isin_data)
self.assertIsInstance(result, np.ndarray)
np.testing.assert_array_equal(expected, result)

0 comments on commit 24cc402

Please sign in to comment.