diff --git a/setup.py b/setup.py index f83d744..fe1e79d 100644 --- a/setup.py +++ b/setup.py @@ -31,7 +31,7 @@ def parse_requirements(filename): name=PACKAGE_NAME, packages=proj_packages, package_dir={PACKAGE_NAME: SOURCE_DIRECTORY}, - version="0.0.30", + version="0.0.31", author="Fabian Groeger", author_email="fabian.groeger@unibas.ch", description="A holistic self-supervised data cleaning strategy to detect irrelevant samples, near duplicates and label errors.", diff --git a/src/cleaner/selfclean_cleaner.py b/src/cleaner/selfclean_cleaner.py index 828b707..071e7e7 100644 --- a/src/cleaner/selfclean_cleaner.py +++ b/src/cleaner/selfclean_cleaner.py @@ -55,7 +55,8 @@ def __init__( log_level: str = "INFO", **kwargs, ): - set_log_level(min_log_level=log_level) + self.log_level = log_level + set_log_level(min_log_level=self.log_level) fix_random_seeds(seed=random_seed) self.memmap = memmap @@ -119,12 +120,17 @@ def fit( # create the distance matrix in chunks n_chunks = math.ceil(self.N / self.chunk_size) - for i in tqdm( - range(n_chunks), - desc="Creating distance matrix", - total=n_chunks, - position=0, - leave=True, + iterator = range(n_chunks) + for i in ( + tqdm( + iterator, + desc="Creating distance matrix", + total=n_chunks, + position=0, + leave=True, + ) + if self.log_level == "DEBUG" + else iterator ): chunk_slice = slice(i * self.chunk_size, (i + 1) * self.chunk_size, 1) X_emb = emb_space[chunk_slice] @@ -164,7 +170,19 @@ def fit( ) triu_indices = np.triu_indices(self.N, k=1) # create the upper triangular matrix of the distance matrix - for start_idx in range(0, len(triu_indices[0]), self.chunk_size): + n_chunks = math.ceil(len(triu_indices[0]) / self.chunk_size) + iterator = range(0, len(triu_indices[0]), self.chunk_size) + for start_idx in ( + tqdm( + iterator, + desc="Creating upper triangular distance matrix", + total=n_chunks, + position=0, + leave=True, + ) + if self.log_level == "DEBUG" + else iterator + ): end_idx = min(start_idx + self.chunk_size, len(triu_indices[0])) self.p_distances[start_idx:end_idx] = self.distance_matrix[ triu_indices[0][start_idx:end_idx], triu_indices[1][start_idx:end_idx]