From ccacd483c7b3e9304f24dfb4d30877700c54b0a7 Mon Sep 17 00:00:00 2001 From: sudoskys Date: Sat, 11 Jan 2025 19:58:06 +0800 Subject: [PATCH 1/5] =?UTF-8?q?=E2=9C=A8=20refactor(app):=20remove=20depre?= =?UTF-8?q?cated=20detect=5Flangs=20function?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The `detect_langs` function was removed to streamline the codebase. This change eliminates redundant functionality and encourages the use of the updated `detect_language` method, enhancing maintainability. --- src/fast_langdetect/__init__.py | 2 +- src/fast_langdetect/ft_detect/__init__.py | 12 ------------ 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/src/fast_langdetect/__init__.py b/src/fast_langdetect/__init__.py index 1e22cbd..3c3d5d8 100644 --- a/src/fast_langdetect/__init__.py +++ b/src/fast_langdetect/__init__.py @@ -1,4 +1,4 @@ # -*- coding: utf-8 -*- -from .ft_detect import detect, detect_language, detect_langs, detect_multilingual # noqa: F401 \ No newline at end of file +from .ft_detect import detect, detect_language, detect_multilingual # noqa: F401 \ No newline at end of file diff --git a/src/fast_langdetect/ft_detect/__init__.py b/src/fast_langdetect/ft_detect/__init__.py index 85e81a5..e8c0e76 100644 --- a/src/fast_langdetect/ft_detect/__init__.py +++ b/src/fast_langdetect/ft_detect/__init__.py @@ -1,6 +1,5 @@ # -*- coding: utf-8 -*- # @Time : 2024/1/17 下午4:00 -import logging from .infer import detect from .infer import detect_multilingual # noqa: F401 @@ -24,14 +23,3 @@ def detect_language(sentence, *, low_memory: bool = True): if lang_code == "JA" and not is_japanese(sentence): lang_code = "ZH" return lang_code - - -def detect_langs(sentence, *, low_memory: bool = True): - """ - Detect language - :param sentence: str sentence - :param low_memory: bool (default: True) whether to use low memory mode - :return: ZH, EN, JA, KO, FR, DE, ES, ....
(two uppercase letters) - """ - logging.warning("detect_langs is deprecated, use detect_language instead") - return detect_language(sentence, low_memory=low_memory) From 2628dfe9d962e826885c6303f44f0329d78b8afe Mon Sep 17 00:00:00 2001 From: sudoskys Date: Sat, 11 Jan 2025 19:58:25 +0800 Subject: [PATCH 2/5] =?UTF-8?q?=E2=9C=A8=20feat(app):=20enhance=20FastText?= =?UTF-8?q?=20model=20download=20&=20load=20settings?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Adjusted `retry_max` to 2, `sleep_max` to 5s, `timeout` to 7s - Changed logging from error to warning for load failures - Defaulted `low_memory` to False for better performance These changes improve resilience and performance in model handling. --- src/fast_langdetect/ft_detect/infer.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/fast_langdetect/ft_detect/infer.py b/src/fast_langdetect/ft_detect/infer.py index 6442e7f..edae183 100644 --- a/src/fast_langdetect/ft_detect/infer.py +++ b/src/fast_langdetect/ft_detect/infer.py @@ -62,8 +62,9 @@ def download_model( folder=str(save_path.parent), filename=save_path.name, proxy=proxy, - retry_max=3, - timeout=30, + retry_max=2, + sleep_max=5, + timeout=7, ) except Exception as e: logger.error(f"fast-langdetect:Failed to download FastText model from {download_url}: {e}") @@ -94,7 +95,7 @@ def load_fasttext_model( # Load FastText model return fasttext.load_model(str(model_path)) except Exception as e: - logger.error(f"fast-langdetect:Failed to load FastText model from {model_path}: {e}") + logger.warning(f"fast-langdetect:Failed to load FastText model from {model_path}: {e}") raise DetectError(f"Failed to load FastText model: {e}") @@ -131,7 +132,7 @@ def load_model( _model_cache.cache_model(cache_key, model) return model except Exception as e: - logger.error(f"fast-langdetect:Failed to load model ({'low' if low_memory else 'high'} memory): {e}") + 
logger.warning(f"fast-langdetect:Failed to load model ({'low' if low_memory else 'high'} memory): {e}") if use_strict_mode: raise DetectError("Failed to load FastText model.") from e elif not low_memory: @@ -176,7 +177,7 @@ def detect( def detect_multilingual( text: str, *, - low_memory: bool = True, + low_memory: bool = False, model_download_proxy: Optional[str] = None, k: int = 5, threshold: float = 0.0, From 1859e86df46f328f9c946c986cac376cb11316d7 Mon Sep 17 00:00:00 2001 From: sudoskys Date: Sat, 11 Jan 2025 20:21:01 +0800 Subject: [PATCH 3/5] =?UTF-8?q?=E2=9C=A8=20feat(app):=20add=20MD5=20verifi?= =?UTF-8?q?cation=20for=20FastText=20model=20integrity?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce MD5 hash verification for the FastText model download. This ensures the integrity of the model file, reducing prediction errors due to corrupted downloads. --- src/fast_langdetect/ft_detect/infer.py | 52 +++++++++++++++++++++++--- 1 file changed, 47 insertions(+), 5 deletions(-) diff --git a/src/fast_langdetect/ft_detect/infer.py b/src/fast_langdetect/ft_detect/infer.py index edae183..eb771f3 100644 --- a/src/fast_langdetect/ft_detect/infer.py +++ b/src/fast_langdetect/ft_detect/infer.py @@ -17,6 +17,7 @@ FASTTEXT_LARGE_MODEL_URL = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin" FASTTEXT_LARGE_MODEL_NAME = "lid.176.bin" +VERIFY_FASTTEXT_LARGE_MODEL = "01810bc59c6a3d2b79c79e6336612f65" class DetectError(Exception): @@ -38,6 +39,36 @@ def cache_model(self, key: str, model) -> None: _model_cache = ModelManager() +import hashlib + + +def calculate_md5(file_path, chunk_size=8192): + """ + Calculate the MD5 hash of a file. 
+ + :param file_path: Path to the file + :param chunk_size: Size of each chunk to read from the file + :return: MD5 hash of the file + """ + md5 = hashlib.md5() + with open(file_path, 'rb') as f: + for chunk in iter(lambda: f.read(chunk_size), b''): + md5.update(chunk) + return md5.hexdigest() + + +def verify_md5(file_path, expected_md5, chunk_size=8192): + """ + Verify the MD5 hash of a file against an expected hash. + + :param file_path: Path to the file + :param expected_md5: Expected MD5 hash + :param chunk_size: Size of each chunk to read from the file + :return: True if the file's MD5 hash matches the expected hash, False otherwise + """ + md5 = calculate_md5(file_path, chunk_size) + return md5 == expected_md5 + def download_model( download_url: str, @@ -84,12 +115,23 @@ def load_fasttext_model( :return: FastText model :raises DetectError: If model loading fails """ - if not model_path.exists() and download_url: - # Attempt to download the model - download_model(download_url, model_path, proxy) - + if all([ + VERIFY_FASTTEXT_LARGE_MODEL, + model_path.exists(), + model_path.name == FASTTEXT_LARGE_MODEL_NAME, + ]): + if not verify_md5(model_path, VERIFY_FASTTEXT_LARGE_MODEL): + logger.warning( + f"fast-langdetect: MD5 hash verification failed for {model_path}, " + f"please check the integrity of the downloaded file from {FASTTEXT_LARGE_MODEL_URL}. " + "\n This may seriously reduce the prediction accuracy. 
" + "If you want to ignore this, please set `fast_langdetect.ft_detect.infer.VERIFY_FASTTEXT_LARGE_MODEL = None` " + ) if not model_path.exists(): - raise DetectError(f"FastText model file not found at {model_path}") + if download_url: + download_model(download_url, model_path, proxy) + if not model_path.exists(): + raise DetectError(f"FastText model file not found at {model_path}") try: # Load FastText model From 8577da5028afb88f157ddac42a35c58205b4650a Mon Sep 17 00:00:00 2001 From: sudoskys Date: Sat, 11 Jan 2025 20:30:28 +0800 Subject: [PATCH 4/5] =?UTF-8?q?=F0=9F=93=9D=20docs(api):=20enhance=20docst?= =?UTF-8?q?ring=20with=20usage=20notes=20and=20parameter=20details?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Added guidelines for handling line breaks in input text. - Clarified parameter descriptions for better user understanding. - Highlighted the importance of large models for accuracy. --- src/fast_langdetect/ft_detect/infer.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/src/fast_langdetect/ft_detect/infer.py b/src/fast_langdetect/ft_detect/infer.py index eb771f3..e68cee3 100644 --- a/src/fast_langdetect/ft_detect/infer.py +++ b/src/fast_langdetect/ft_detect/infer.py @@ -192,12 +192,15 @@ def detect( ) -> Dict[str, Union[str, float]]: """ Detect the language of a text using FastText. - This function assumes to be given a single line of text. We split words on whitespace (space, newline, tab, vertical tab) and the control characters carriage return, formfeed and the null character. - If the model is not supervised, this function will throw a ValueError. + + - You MUST manually remove line breaks(`n`) from the text to be processed in advance, otherwise a ValueError is raised. + + - In scenarios **where accuracy is important**, you should not rely on the detection results of small models, use `low_memory=False` to download larger models! 
+ :param text: The text for language detection - :param low_memory: Whether to use a memory-efficient model + :param low_memory: Whether to use the compressed version of the model (https://fasttext.cc/docs/en/language-identification.html) :param model_download_proxy: Download proxy for the model if needed - :param use_strict_mode: If it was enabled, strictly loads large model or raises error if it fails + :param use_strict_mode: When this parameter is enabled, the fallback after loading failure will be disabled. :return: A dictionary with detected language and confidence score :raises LanguageDetectionError: If detection fails """ @@ -227,6 +230,18 @@ def detect_multilingual( ) -> List[Dict[str, Any]]: """ Detect the top-k probable languages for a given text. + + - You MUST manually remove line breaks(`n`) from the text to be processed in advance, otherwise a ValueError is raised. + + - In scenarios **where accuracy is important**, you should not rely on the detection results of small models, use `low_memory=False` to download larger models! + + :param text: The text for language detection + :param low_memory: Whether to use the compressed version of the model (https://fasttext.cc/docs/en/language-identification.html) + :param model_download_proxy: Download proxy for the model if needed + :param k: Number of top languages to return + :param threshold: Minimum confidence score to consider + :param use_strict_mode: When this parameter is enabled, the fallback after loading failure will be disabled. + :return: A list of dictionaries with detected languages and confidence scores """ model = load_model( low_memory=low_memory, From f08eeaaa029151003c662ef8c5b42023510268b7 Mon Sep 17 00:00:00 2001 From: sudoskys Date: Sat, 11 Jan 2025 20:33:06 +0800 Subject: [PATCH 5/5] chore(release): bump version to 0.2.4 in pyproject.toml MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🔄 Updated the project version to 0.2.4 for new changes.
--- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 025b726..e4dc19e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "fast-langdetect" -version = "0.2.3" +version = "0.2.4" description = "Quickly detect text language and segment language" authors = [ { name = "sudoskys", email = "coldlando@hotmail.com" },