From 7036f91ddcd4ceffda3afdfbfebbbc24d81f8a6b Mon Sep 17 00:00:00 2001
From: gram
Date: Wed, 27 Sep 2023 08:34:39 +0200
Subject: [PATCH] small fixes

---
 tests/test_compression/test_entropy_ncd.py   | 2 +-
 textdistance/algorithms/base.py              | 2 +-
 textdistance/algorithms/compression_based.py | 5 ++---
 textdistance/algorithms/token_based.py       | 2 +-
 4 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/tests/test_compression/test_entropy_ncd.py b/tests/test_compression/test_entropy_ncd.py
index 9cba4eb..6e81764 100644
--- a/tests/test_compression/test_entropy_ncd.py
+++ b/tests/test_compression/test_entropy_ncd.py
@@ -31,7 +31,7 @@ def test_simmetry_compressor(text):
 
 @hypothesis.given(text=hypothesis.strategies.text(min_size=1))
 def test_idempotency_compressor(text):
     # I've modified idempotency to some kind of distributivity for constant.
-    # Now it indicates that compressor really compress.
+    # Now it indicates that compressor actually does compression.
     assert ALG._get_size(text * 2) < ALG._get_size(text) * 2
 
diff --git a/textdistance/algorithms/base.py b/textdistance/algorithms/base.py
index e82902a..44d6361 100644
--- a/textdistance/algorithms/base.py
+++ b/textdistance/algorithms/base.py
@@ -155,7 +155,7 @@ def _sum_counters(self, *sequences: Counter[T]) -> Counter[T]:
             result += s
         return result
 
-    def _count_counters(self, counter: Counter) -> float:
+    def _count_counters(self, counter: Counter) -> int:
         """Return all elements count from Counter
         """
         if getattr(self, 'as_set', False):
diff --git a/textdistance/algorithms/compression_based.py b/textdistance/algorithms/compression_based.py
index bc8aecb..7c10fc1 100644
--- a/textdistance/algorithms/compression_based.py
+++ b/textdistance/algorithms/compression_based.py
@@ -106,8 +106,7 @@ def _make_probs(self, *sequences) -> dict[str, tuple[Fraction, Fraction]]:
 
         prob_pairs = {}
         cumulative_count = 0
-        counts = sorted(counts.items(), key=lambda x: (x[1], x[0]), reverse=True)
-        for char, current_count in counts:
+        for char, current_count in counts.most_common():
             prob_pairs[char] = (
                 Fraction(cumulative_count, total_letters),
                 Fraction(current_count, total_letters),
@@ -214,7 +213,7 @@ def _get_size(self, data: Sequence) -> float:
 
 class EntropyNCD(_NCDBase):
     """Entropy based NCD
 
-    Get Entropy of input secueance as a size of compressed data.
+    Get Entropy of input sequence as a size of compressed data.
     https://en.wikipedia.org/wiki/Entropy_(information_theory)
     https://en.wikipedia.org/wiki/Entropy_encoding
diff --git a/textdistance/algorithms/token_based.py b/textdistance/algorithms/token_based.py
index fb0466a..74804b9 100644
--- a/textdistance/algorithms/token_based.py
+++ b/textdistance/algorithms/token_based.py
@@ -122,7 +122,7 @@ def __call__(self, *sequences: Sequence) -> float:
         sequences = [self._count_counters(s) for s in sequences]  # ints
         ks = list(islice(self.ks, len(sequences)))
 
-        if len(sequences) == 2 or self.bias is None:
+        if len(sequences) != 2 or self.bias is None:
             result = intersection
             for k, s in zip(ks, sequences):
                 result += k * (s - intersection)