From 510cd02202653e813f0231c79b82440e6f4cb04a Mon Sep 17 00:00:00 2001 From: Kurt McKee Date: Wed, 31 Jul 2024 09:14:37 -0500 Subject: [PATCH 1/9] Heavily optimize `Sqids.__to_id()` Converting the alphabet to a list is very costly at scale. Getting the length of the alphabet repeatedly is a little costly. Comparing `result == 0` vs `not result` is measurably costly. These have all been eliminated. Python's timeit module suggests a performance improvement of ~300%. --- sqids/sqids.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sqids/sqids.py b/sqids/sqids.py index ad173cf..88dc463 100644 --- a/sqids/sqids.py +++ b/sqids/sqids.py @@ -136,13 +136,13 @@ def __shuffle(self, alphabet: str) -> str: def __to_id(self, num: int, alphabet: str) -> str: id_chars: List[str] = [] - chars = list(alphabet) result = num + alphabet_length = len(alphabet) while True: - id_chars.insert(0, chars[result % len(chars)]) - result = result // len(chars) - if result == 0: + id_chars.insert(0, alphabet[result % alphabet_length]) + result = result // alphabet_length + if not result: break return "".join(id_chars) From ef28bef54cf22c4696364fb5764db046ed3fd9c1 Mon Sep 17 00:00:00 2001 From: Kurt McKee Date: Thu, 1 Aug 2024 07:24:56 -0500 Subject: [PATCH 2/9] Fail fast when checking numbers to encode Previous behavior required checking the entire list even if the first number is invalid. 
--- sqids/sqids.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sqids/sqids.py b/sqids/sqids.py index 88dc463..f2bce86 100644 --- a/sqids/sqids.py +++ b/sqids/sqids.py @@ -44,8 +44,7 @@ def encode(self, numbers: List[int]) -> str: if not numbers: return "" - in_range_numbers = [n for n in numbers if 0 <= n <= sys.maxsize] - if len(in_range_numbers) != len(numbers): + if not all(0 <= number <= sys.maxsize for number in numbers): raise ValueError(f"Encoding supports numbers between 0 and {sys.maxsize}") return self.__encode_numbers(numbers, 0) From 1955cbca454f4d6ab7825c929e2f2209dbb994db Mon Sep 17 00:00:00 2001 From: Kurt McKee Date: Thu, 1 Aug 2024 09:55:40 -0500 Subject: [PATCH 3/9] Use `any()` to eliminate a block indent --- sqids/sqids.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sqids/sqids.py b/sqids/sqids.py index f2bce86..286f999 100644 --- a/sqids/sqids.py +++ b/sqids/sqids.py @@ -10,9 +10,8 @@ def __init__( min_length: int = DEFAULT_MIN_LENGTH, blocklist: List[str] = DEFAULT_BLOCKLIST, ): - for char in alphabet: - if ord(char) > 127: - raise ValueError("Alphabet cannot contain multibyte characters") + if any(ord(char) > 127 for char in alphabet): + raise ValueError("Alphabet cannot contain multibyte characters") if len(alphabet) < 3: raise ValueError("Alphabet length must be at least 3") From 39698a801bdc5a8d2abb0ddacabf859addc35441 Mon Sep 17 00:00:00 2001 From: Kurt McKee Date: Thu, 1 Aug 2024 09:59:29 -0500 Subject: [PATCH 4/9] Eliminate blocklist looping when checking if an ID is blocked By filtering the blocklist once during instantiation, a significant amount of computation can be eliminated when the same instance is reused over and over. This additionally updates the hypothesis testing; generated IDs are now confirmed to be blockable. 
--- sqids/sqids.py | 56 +++++++++++++++++++++++++++------------- tests/test_round_trip.py | 19 +++++++++++--- 2 files changed, 53 insertions(+), 22 deletions(-) diff --git a/sqids/sqids.py b/sqids/sqids.py index 286f999..1e3b8bc 100644 --- a/sqids/sqids.py +++ b/sqids/sqids.py @@ -2,6 +2,8 @@ import sys from .constants import DEFAULT_ALPHABET, DEFAULT_BLOCKLIST, DEFAULT_MIN_LENGTH +DIGITS = set("0123456789") + class Sqids: def __init__( @@ -28,16 +30,33 @@ def __init__( f"Minimum length has to be between 0 and {MIN_LENGTH_LIMIT}" ) - filtered_blocklist: Set[str] = set() - alphabet_lower = alphabet.lower() - for word_lower in (w.lower() for w in blocklist if len(w) >= 3): - intersection = [c for c in word_lower if c in alphabet_lower] - if len(intersection) == len(word_lower): - filtered_blocklist.add(word_lower) + exact_match: Set[str] = set() + match_at_ends: Set[str] = set() + match_anywhere: Set[str] = set() + alphabet_lower = set(alphabet.lower()) + for word in blocklist: + if len(word) < 3: + continue + elif len(word) == 3: + exact_match.add(word.lower()) + continue + + word_lower = word.lower() + word_lower_set = set(word_lower) + if word_lower_set & alphabet_lower != word_lower_set: + continue + + if word_lower_set & DIGITS: + match_at_ends.add(word_lower) + else: + match_anywhere.add(word_lower) self.__alphabet = self.__shuffle(alphabet) self.__min_length = min_length - self.__blocklist = filtered_blocklist + self.__blocklist_exact_match = exact_match + # When matching at the ends, `.startswith()` and `.endswith()` need a tuple. 
+ self.__blocklist_match_at_ends = tuple(match_at_ends) + self.__blocklist_match_anywhere = match_anywhere def encode(self, numbers: List[int]) -> str: if not numbers: @@ -84,7 +103,7 @@ def __encode_numbers(self, numbers: List[int], increment: int = 0) -> str: alphabet = self.__shuffle(alphabet) id_ += alphabet[: min(self.__min_length - len(id_), len(alphabet))] - if self.__is_blocked_id(id_): + if len(id_) >= 3 and self.__is_blocked_id(id_): id_ = self.__encode_numbers(numbers, increment + 1) return id_ @@ -152,16 +171,17 @@ def __to_number(self, id_: str, alphabet: str) -> int: def __is_blocked_id(self, id_: str) -> bool: id_ = id_.lower() - for word in self.__blocklist: - if len(word) > len(id_): - continue - if len(id_) <= 3 or len(word) <= 3: - if id_ == word: - return True - elif any(c.isdigit() for c in word): - if id_.startswith(word) or id_.endswith(word): - return True - elif word in id_: + if len(id_) == 3: + return id_ in self.__blocklist_exact_match + + if ( + id_.startswith(self.__blocklist_match_at_ends) + or id_.endswith(self.__blocklist_match_at_ends) + ): + return True + + for word in self.__blocklist_match_anywhere: + if word in id_: return True return False diff --git a/tests/test_round_trip.py b/tests/test_round_trip.py index 90732a2..3886f5f 100644 --- a/tests/test_round_trip.py +++ b/tests/test_round_trip.py @@ -6,10 +6,13 @@ import hypothesis.strategies as st -lists_of_integers = st.lists(elements=st.integers(min_value=0, max_value=sys.maxsize)) +lists_of_integers = st.lists( + elements=st.integers(min_value=0, max_value=sys.maxsize), + min_size=1, +) min_lengths = st.integers(min_value=0, max_value=255) alphabets = st.text( - alphabet=st.characters(min_codepoint=0, max_codepoint=0x7f), + alphabet=st.characters(min_codepoint=0, max_codepoint=0x7F), min_size=3, ) @@ -23,5 +26,13 @@ def test_round_trip_encoding(numbers, min_length, alphabet): # Reject non-unique alphabets without failing the test. 
assume(len(set(alphabet)) == len(alphabet)) - sqid = sqids.Sqids(min_length=min_length, alphabet=alphabet, blocklist=[]) - assert sqid.decode(sqid.encode(numbers)) == numbers + sqid_1 = sqids.Sqids(min_length=min_length, alphabet=alphabet, blocklist=[]) + id_1 = sqid_1.encode(numbers) + assert sqid_1.decode(id_1) == numbers + + # If the ID is long enough, use it as a blocklist word and ensure it is blocked. + if len(id_1) >= 3: + sqid_2 = sqids.Sqids(min_length=min_length, alphabet=alphabet, blocklist=[id_1]) + id_2 = sqid_2.encode(numbers) + assert id_1 != id_2 + assert sqid_2.decode(id_2) == numbers From ac069bd4e89f5fa46d9022b3efdae3eefdc15791 Mon Sep 17 00:00:00 2001 From: Kurt McKee Date: Sat, 3 Aug 2024 07:25:57 -0500 Subject: [PATCH 5/9] Add a changelog entry --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 672ab63..c6584ce 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,9 @@ **Unreleased** - Support Python 3.12 and 3.13. +- Speed up encoding by ~85% by optimizing blocklist checks. + This improvement requires more calculation when the `Sqids` class is instantiated, + so users are encouraged to instantiate `Sqids` once and always reuse the instance. 
**v0.4.1** - Compatibility with Python 3.6 (not officially supported) From ec137019a9d41571cbe2a3ef2583c79c9e1f0a36 Mon Sep 17 00:00:00 2001 From: Kurt McKee Date: Sat, 3 Aug 2024 07:32:34 -0500 Subject: [PATCH 6/9] Add a basic performance measurement script --- assets/performance.py | 61 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 assets/performance.py diff --git a/assets/performance.py b/assets/performance.py new file mode 100644 index 0000000..9c22178 --- /dev/null +++ b/assets/performance.py @@ -0,0 +1,61 @@ +import sqids +import timeit + +number = 100_000 + +print(f"Iterations: {number:,d}") + +print( + "{0:<20s} {1:7.3f}".format( + "Instantiate:", + timeit.timeit( + stmt="sqids.Sqids()", + globals={"sqids": sqids}, + number=number, + ) + ) +) + +print( + "{0:<20s} {1:7.3f}".format( + "Encode [0]:", # [0] -> 'bM' + timeit.timeit( + stmt="squid.encode([0])", + globals={"squid": sqids.Sqids()}, + number=number, + ) + ) +) + +print( + "{0:<20s} {1:7.3f}".format( + "Encode [0, 1, 2]:", # [0, 1, 2] -> 'rSCtlB' + timeit.timeit( + stmt="squid.encode([0, 1, 2])", + globals={"squid": sqids.Sqids()}, + number=number, + ) + ) +) + +print( + "{0:<20s} {1:7.3f}".format( + "Decode 'bM':", # 'bM' -> [0] + timeit.timeit( + stmt="squid.decode('bM')", + globals={"squid": sqids.Sqids()}, + number=number, + ) + ) +) + +print( + "{0:<20s} {1:7.3f}".format( + "Decode 'rSCtlB':", # 'rSCtlB' -> [0, 1, 2] + timeit.timeit( + stmt="squid.decode('rSCtlB')", + globals={"squid": sqids.Sqids()}, + number=number, + ) + ), +) From 442f3167a9c5ac64266447a3aaf123b58a8f0431 Mon Sep 17 00:00:00 2001 From: Kurt McKee Date: Sat, 3 Aug 2024 07:37:00 -0500 Subject: [PATCH 7/9] Add a test to ensure short blocklist words are ignored --- tests/test_blocklist.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/test_blocklist.py b/tests/test_blocklist.py index e2f9a67..a385070 100644 --- a/tests/test_blocklist.py +++ 
b/tests/test_blocklist.py @@ -82,3 +82,12 @@ def test_max_encoding_attempts(): with pytest.raises(Exception): sqids.encode([0]) + + +def test_small_words_are_ignored(): + """Blocklist words shorter than 3 characters must be ignored.""" + + id_ = Sqids().encode([0]) + assert id_ == "bM" + id_ = Sqids(blocklist=[id_]).encode([0]) + assert id_ == "bM" From 5452169f03484a2fd57ccb143ea0c9727fe6e044 Mon Sep 17 00:00:00 2001 From: Kurt McKee Date: Sat, 3 Aug 2024 07:52:08 -0500 Subject: [PATCH 8/9] Don't track coverage in the generated roundtrip test conditional --- tests/test_round_trip.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_round_trip.py b/tests/test_round_trip.py index 3886f5f..f38ea18 100644 --- a/tests/test_round_trip.py +++ b/tests/test_round_trip.py @@ -31,7 +31,7 @@ def test_round_trip_encoding(numbers, min_length, alphabet): assert sqid_1.decode(id_1) == numbers # If the ID is long enough, use it as a blocklist word and ensure it is blocked. - if len(id_1) >= 3: + if len(id_1) >= 3: # pragma: nocover sqid_2 = sqids.Sqids(min_length=min_length, alphabet=alphabet, blocklist=[id_1]) id_2 = sqid_2.encode(numbers) assert id_1 != id_2 From 7c4ef18d2614c8f7433d10882ba68930573c98ac Mon Sep 17 00:00:00 2001 From: Kurt McKee Date: Mon, 5 Aug 2024 18:09:32 -0500 Subject: [PATCH 9/9] Make instantiation fast when using the default alphabet and blocklist --- assets/filter_blocklist.py | 99 +++++++ sqids/constants.py | 580 ++++++++++++++++++++++++++++++++++++- sqids/sqids.py | 64 ++-- tests/test_blocklist.py | 26 ++ 4 files changed, 742 insertions(+), 27 deletions(-) create mode 100644 assets/filter_blocklist.py diff --git a/assets/filter_blocklist.py b/assets/filter_blocklist.py new file mode 100644 index 0000000..764e39a --- /dev/null +++ b/assets/filter_blocklist.py @@ -0,0 +1,99 @@ +import pathlib +import sys +from typing import Set, Tuple + + +repo_root = pathlib.Path(__file__).parent.parent +this_file = 
pathlib.Path(__file__).relative_to(repo_root) +constants_path = repo_root / "sqids/constants.py" +import sqids.constants # noqa + + +DIGITS = set("0123456789") + + +def filter_blocklist() -> Tuple[Set[str], Set[str], Set[str]]: + """Pre-filter the blocklist and update the constants file.""" + + exact_match = set() + match_at_ends = set() + match_anywhere = set() + + for word in sqids.constants.DEFAULT_BLOCKLIST: + if len(word) == 3: + exact_match.add(word) + elif set(word) & DIGITS: + match_at_ends.add(word) + else: + match_anywhere.add(word) + + return exact_match, match_at_ends, match_anywhere + + +def generate_new_constants_file( + exact_match: Set[str], + match_at_ends: Set[str], + match_anywhere: Set[str], +) -> str: + """Generate the text of a new constants file.""" + + lines = [ + f'DEFAULT_ALPHABET = "{sqids.constants.DEFAULT_ALPHABET}"', + f"DEFAULT_MIN_LENGTH = {sqids.constants.DEFAULT_MIN_LENGTH}", + "", + "# =======", + "# NOTE", + "# =======", + "#", + f"# When updating the blocklist, run {this_file} to pre-filter constants.", + "# This is critical for performance.", + "#", + "", + "DEFAULT_BLOCKLIST = [", + ] + # Output a sorted blocklist. + for word in sorted(sqids.constants.DEFAULT_BLOCKLIST): + lines.append(f' "{word}",') + lines.append("]") + + # Output exact-match blocklist words. + lines.append("") + lines.append("_exact_match = {") + for word in sorted(exact_match): + lines.append(f' "{word}",') + lines.append("}") + + # Output match-at-ends blocklist words. + lines.append("") + lines.append("_match_at_ends = (") + for word in sorted(match_at_ends): + lines.append(f' "{word}",') + lines.append(")") + + # Output match-anywhere blocklist words. + lines.append("") + lines.append("_match_anywhere = {") + for word in sorted(match_anywhere): + lines.append(f' "{word}",') + lines.append("}") + + return "\n".join(lines).rstrip() + "\n" # Include a trailing newline. 
+ + +def main() -> int: + text = constants_path.read_text() + + exact_match, match_at_ends, match_anywhere = filter_blocklist() + new_text = generate_new_constants_file(exact_match, match_at_ends, match_anywhere) + + if text == new_text: + print("No changes necessary") + return 0 + + print(f"Updating {constants_path.relative_to(repo_root)}") + constants_path.write_text(new_text, newline="\n", encoding="utf-8") + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/sqids/constants.py b/sqids/constants.py index 601ffb2..07ce122 100644 --- a/sqids/constants.py +++ b/sqids/constants.py @@ -1,4 +1,14 @@ DEFAULT_ALPHABET = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" +DEFAULT_MIN_LENGTH = 0 + +# ======= +# NOTE +# ======= +# +# When updating the blocklist, run assets/filter_blocklist.py to pre-filter constants. +# This is critical for performance. +# + DEFAULT_BLOCKLIST = [ "0rgasm", "1d10t", @@ -561,4 +571,572 @@ "zocco1a", "zoccola", ] -DEFAULT_MIN_LENGTH = 0 + +_exact_match = { + "ass", + "cum", + "fag", + "g00", + "g0o", + "go0", + "goo", +} + +_match_at_ends = ( + "0rgasm", + "1d10t", + "1d1ot", + "1di0t", + "1diot", + "1eccacu10", + "1eccacu1o", + "1eccacul0", + "1eccaculo", + "1mbec11e", + "1mbec1le", + "1mbeci1e", + "1mbecile", + "a11upat0", + "a11upato", + "a1lupat0", + "a1lupato", + "ah01e", + "ah0le", + "aho1e", + "al1upat0", + "al1upato", + "allupat0", + "ana1", + "ana1e", + "arrapat0", + "b00b", + "b00be", + "b01ata", + "b0ceta", + "b0iata", + "b0ob", + "b0obe", + "b0sta", + "b1tch", + "b1te", + "b1tte", + "ba1atkar", + "bastard0", + "batt0na", + "bo0b", + "bo0be", + "bo1ata", + "bran1age", + "bran1er", + "bran1ette", + "bran1eur", + "bran1euse", + "c0ck", + "c0g110ne", + "c0g11one", + "c0g1i0ne", + "c0g1ione", + "c0gl10ne", + "c0gl1one", + "c0gli0ne", + "c0glione", + "c0na", + "c0nnard", + "c0nnasse", + "c0nne", + "c0u111es", + "c0u11les", + "c0u1l1es", + "c0u1lles", + "c0ui11es", + "c0ui1les", + "c0uil1es", + 
"c0uilles", + "c11t", + "c11t0", + "c11to", + "c1it", + "c1it0", + "c1ito", + "cabr0n", + "cabra0", + "cara1h0", + "cara1ho", + "caracu10", + "caracu1o", + "caracul0", + "caralh0", + "cazz0", + "cazz1mma", + "ch00t1a", + "ch00t1ya", + "ch00tia", + "ch00tiya", + "ch0d", + "ch0ot1a", + "ch0ot1ya", + "ch0otia", + "ch0otiya", + "ch1asse", + "ch1avata", + "ch1er", + "ch1ng0", + "ch1ngadaz0s", + "ch1ngadazos", + "ch1ngader1ta", + "ch1ngaderita", + "ch1ngar", + "ch1ngo", + "ch1ngues", + "ch1nk", + "ching0", + "chingadaz0s", + "chingader1ta", + "cho0t1a", + "cho0t1ya", + "cho0tia", + "cho0tiya", + "choot1a", + "choot1ya", + "cl1t", + "cl1t0", + "cl1to", + "clit0", + "cog110ne", + "cog11one", + "cog1i0ne", + "cog1ione", + "cogl10ne", + "cogl1one", + "cogli0ne", + "cou111es", + "cou11les", + "cou1l1es", + "cou1lles", + "coui11es", + "coui1les", + "couil1es", + "cu10", + "cu1att0ne", + "cu1attone", + "cu1er0", + "cu1ero", + "cu1o", + "cul0", + "culatt0ne", + "culer0", + "d11d0", + "d11do", + "d1ck", + "d1ld0", + "d1ldo", + "de1ch", + "di1d0", + "di1do", + "dild0", + "encu1e", + "enf01re", + "enf0ire", + "enfo1re", + "estup1d0", + "estup1do", + "estupid0", + "etr0n", + "f0da", + "f0der", + "f0ttere", + "f0tters1", + "f0ttersi", + "f0tze", + "f0utre", + "f1ca", + "f1cker", + "f1ga", + "fotters1", + "fr0c10", + "fr0c1o", + "fr0ci0", + "fr0cio", + "fr0sc10", + "fr0sc1o", + "fr0sci0", + "fr0scio", + "froc10", + "froc1o", + "froci0", + "frosc10", + "frosc1o", + "frosci0", + "g0u1ne", + "g0uine", + "gou1ne", + "gr0gnasse", + "haram1", + "hund1n", + "id10t", + "id1ot", + "idi0t", + "imbec11e", + "imbec1le", + "imbeci1e", + "j1zz", + "k1ke", + "kam1ne", + "leccacu10", + "leccacu1o", + "leccacul0", + "m1erda", + "m1gn0tta", + "m1gnotta", + "m1nch1a", + "m1nchia", + "m1st", + "mam0n", + "mamahuev0", + "masturbat10n", + "masturbat1on", + "masturbati0n", + "merd0s0", + "merd0so", + "merdos0", + "mign0tta", + "minch1a", + "musch1", + "n1gger", + "negr0", + "nerch1a", + "p00p", + "p011a", + 
"p01la", + "p0l1a", + "p0lla", + "p0mp1n0", + "p0mp1no", + "p0mpin0", + "p0mpino", + "p0op", + "p0rca", + "p0rn", + "p0rra", + "p0uff1asse", + "p0uffiasse", + "p1p1", + "p1pi", + "p1r1a", + "p1rla", + "p1sc10", + "p1sc1o", + "p1sci0", + "p1scio", + "p1sser", + "pa11e", + "pa1le", + "pal1e", + "pane1e1r0", + "pane1e1ro", + "pane1eir0", + "pane1eiro", + "panele1r0", + "panele1ro", + "paneleir0", + "pec0r1na", + "pec0rina", + "pecor1na", + "pen1s", + "pendej0", + "pip1", + "pir1a", + "pisc10", + "pisc1o", + "pisci0", + "po0p", + "po11a", + "po1la", + "pol1a", + "pomp1n0", + "pomp1no", + "pompin0", + "pouff1asse", + "pr1ck", + "put1za", + "puta1n", + "r0mp1ba11e", + "r0mp1ba1le", + "r0mp1bal1e", + "r0mp1balle", + "r0mpiba11e", + "r0mpiba1le", + "r0mpibal1e", + "r0mpiballe", + "rand1", + "recch10ne", + "recch1one", + "recchi0ne", + "romp1ba11e", + "romp1ba1le", + "romp1bal1e", + "romp1balle", + "rompiba11e", + "rompiba1le", + "rompibal1e", + "ruff1an0", + "ruff1ano", + "ruffian0", + "s1ut", + "sa10pe", + "sa1aud", + "sa1ope", + "sal0pe", + "sb0rr0ne", + "sb0rra", + "sb0rrone", + "sbatters1", + "sborr0ne", + "sc0pare", + "sc0pata", + "sch1ampe", + "sche1se", + "sche1sse", + "schwachs1nn1g", + "schwachs1nnig", + "schwachsinn1g", + "sh1t", + "sp0mp1nare", + "sp0mpinare", + "spomp1nare", + "str0nz0", + "str0nza", + "str0nzo", + "stronz0", + "stup1d", + "succh1am1", + "succh1ami", + "succhiam1", + "t0pa", + "test1c1e", + "test1cle", + "testic1e", + "tr01a", + "tr0ia", + "tr0mbare", + "tr1ng1er", + "tr1ngler", + "tring1er", + "tro1a", + "vaffancu10", + "vaffancu1o", + "vaffancul0", + "vag1na", + "w1chsen", + "x0ch0ta", + "x0chota", + "xoch0ta", + "z0cc01a", + "z0cc0la", + "z0cco1a", + "z0ccola", + "z1z1", + "z1zi", + "ziz1", + "zocc01a", + "zocc0la", + "zocco1a", +) + +_match_anywhere = { + "aand", + "ahole", + "allupato", + "anal", + "anale", + "anus", + "arrapato", + "arsch", + "arse", + "balatkar", + "bastardo", + "battona", + "bitch", + "bite", + "bitte", + "boceta", + 
"boiata", + "boob", + "boobe", + "bosta", + "branlage", + "branler", + "branlette", + "branleur", + "branleuse", + "cabrao", + "cabron", + "caca", + "cacca", + "cacete", + "cagante", + "cagar", + "cagare", + "cagna", + "caraculo", + "caralho", + "cazzata", + "cazzimma", + "cazzo", + "chatte", + "chiasse", + "chiavata", + "chier", + "chingadazos", + "chingaderita", + "chingar", + "chingo", + "chingues", + "chink", + "chod", + "chootia", + "chootiya", + "clit", + "clito", + "cock", + "coglione", + "cona", + "connard", + "connasse", + "conne", + "couilles", + "cracker", + "crap", + "culattone", + "culero", + "culo", + "cunt", + "damn", + "deich", + "depp", + "dick", + "dildo", + "dyke", + "encule", + "enema", + "enfoire", + "estupido", + "etron", + "fica", + "ficker", + "figa", + "foda", + "foder", + "fottere", + "fottersi", + "fotze", + "foutre", + "frocio", + "froscio", + "fuck", + "gandu", + "gouine", + "grognasse", + "harami", + "haramzade", + "hundin", + "idiot", + "imbecile", + "jerk", + "jizz", + "kamine", + "kike", + "leccaculo", + "mamahuevo", + "mamon", + "masturbate", + "masturbation", + "merda", + "merde", + "merdoso", + "mierda", + "mignotta", + "minchia", + "mist", + "muschi", + "neger", + "negre", + "negro", + "nerchia", + "nigger", + "orgasm", + "palle", + "paneleiro", + "patakha", + "pecorina", + "pendejo", + "penis", + "pipi", + "pirla", + "piscio", + "pisser", + "polla", + "pompino", + "poop", + "porca", + "porn", + "porra", + "pouffiasse", + "prick", + "pussy", + "puta", + "putain", + "pute", + "putiza", + "puttana", + "queca", + "randi", + "rape", + "recchione", + "retard", + "rompiballe", + "ruffiano", + "sacanagem", + "salaud", + "salope", + "saugnapf", + "sbattere", + "sbattersi", + "sborra", + "sborrone", + "scheise", + "scheisse", + "schlampe", + "schwachsinnig", + "schwanz", + "scopare", + "scopata", + "sexy", + "shit", + "slut", + "spompinare", + "stronza", + "stronzo", + "stupid", + "succhiami", + "sucker", + "tapette", + "testicle", + 
"tette", + "topa", + "tringler", + "troia", + "trombare", + "turd", + "twat", + "vaffanculo", + "vagina", + "verdammt", + "verga", + "wank", + "wichsen", + "xana", + "xochota", + "zizi", + "zoccola", +} diff --git a/sqids/sqids.py b/sqids/sqids.py index 1e3b8bc..89d51e9 100644 --- a/sqids/sqids.py +++ b/sqids/sqids.py @@ -1,6 +1,13 @@ -from typing import List, Set +from typing import List, Set, Tuple import sys -from .constants import DEFAULT_ALPHABET, DEFAULT_BLOCKLIST, DEFAULT_MIN_LENGTH +from .constants import ( + DEFAULT_ALPHABET, + DEFAULT_BLOCKLIST, + DEFAULT_MIN_LENGTH, + _exact_match, + _match_at_ends, + _match_anywhere, +) DIGITS = set("0123456789") @@ -30,33 +37,38 @@ def __init__( f"Minimum length has to be between 0 and {MIN_LENGTH_LIMIT}" ) - exact_match: Set[str] = set() - match_at_ends: Set[str] = set() - match_anywhere: Set[str] = set() - alphabet_lower = set(alphabet.lower()) - for word in blocklist: - if len(word) < 3: - continue - elif len(word) == 3: - exact_match.add(word.lower()) - continue - - word_lower = word.lower() - word_lower_set = set(word_lower) - if word_lower_set & alphabet_lower != word_lower_set: - continue - - if word_lower_set & DIGITS: - match_at_ends.add(word_lower) - else: - match_anywhere.add(word_lower) + # When the blocklist and alphabet are defaults, use pre-filtered blocklists. + if blocklist is DEFAULT_BLOCKLIST and alphabet is DEFAULT_ALPHABET: + self.__blocklist_exact_match: Set[str] = _exact_match + self.__blocklist_match_at_ends: Tuple[str, ...] 
= _match_at_ends + self.__blocklist_match_anywhere: Set[str] = _match_anywhere + else: + alphabet_lower = set(alphabet.lower()) + exact_match: Set[str] = set() + match_at_ends: Set[str] = set() + match_anywhere: Set[str] = set() + for word in blocklist: + if len(word) < 3: + continue + word_lower = word.lower() + word_lower_set = set(word_lower) + if word_lower_set & alphabet_lower != word_lower_set: + continue + + if len(word) == 3: + exact_match.add(word.lower()) + elif word_lower_set & DIGITS: + match_at_ends.add(word_lower) + else: + match_anywhere.add(word_lower) + + self.__blocklist_exact_match = exact_match + # When matching at the ends, `.startswith()` and `.endswith()` need a tuple. + self.__blocklist_match_at_ends = tuple(match_at_ends) + self.__blocklist_match_anywhere = match_anywhere self.__alphabet = self.__shuffle(alphabet) self.__min_length = min_length - self.__blocklist_exact_match = exact_match - # When matching at the ends, `.startswith()` and `.endswith()` need a tuple. 
- self.__blocklist_match_at_ends = tuple(match_at_ends) - self.__blocklist_match_anywhere = match_anywhere def encode(self, numbers: List[int]) -> str: if not numbers: diff --git a/tests/test_blocklist.py b/tests/test_blocklist.py index a385070..594de59 100644 --- a/tests/test_blocklist.py +++ b/tests/test_blocklist.py @@ -1,3 +1,6 @@ +import pathlib +import sys + import pytest from sqids import Sqids @@ -70,6 +73,16 @@ def test_blocklist_filtering_in_constructor(): assert numbers == [1, 2, 3] +@pytest.mark.parametrize("word", ("ab!", "abc!", "xyz")) +def test_alphabet_is_not_superset_of_blocklist_word_characters(word): + """Verify that a non-subset blocklist word is ignored.""" + + sqids = Sqids(alphabet="abc", blocklist=[word]) + assert sqids._Sqids__blocklist_exact_match == set() + assert sqids._Sqids__blocklist_match_at_ends == tuple() + assert sqids._Sqids__blocklist_match_anywhere == set() + + def test_max_encoding_attempts(): alphabet = "abc" min_length = 3 @@ -91,3 +104,16 @@ def test_small_words_are_ignored(): assert id_ == "bM" id_ = Sqids(blocklist=[id_]).encode([0]) assert id_ == "bM" + + +def test_constants_file_is_pristine(): + """Verify the constants file is pristine.""" + + repo_root = pathlib.Path(__file__).parent.parent + sys.path.append(str(repo_root / "assets")) + import filter_blocklist + + sets = filter_blocklist.filter_blocklist() + new_text = filter_blocklist.generate_new_constants_file(*sets) + error_message = "You must run assets/filter_blocklist.py!" + assert filter_blocklist.constants_path.read_text() == new_text, error_message