From ccacd483c7b3e9304f24dfb4d30877700c54b0a7 Mon Sep 17 00:00:00 2001
From: sudoskys <coldlando@hotmail.com>
Date: Sat, 11 Jan 2025 19:58:06 +0800
Subject: [PATCH 1/5] =?UTF-8?q?=E2=9C=A8=20refactor(app):=20remove=20depre?=
 =?UTF-8?q?cated=20detect=5Flangs=20function?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove the deprecated `detect_langs` wrapper, which only logged a
deprecation warning and delegated to `detect_language`. Its export from
the package and the now-unused `logging` import are removed as well.
Callers should use `detect_language` directly.
---
 src/fast_langdetect/__init__.py           |  2 +-
 src/fast_langdetect/ft_detect/__init__.py | 12 ------------
 2 files changed, 1 insertion(+), 13 deletions(-)

diff --git a/src/fast_langdetect/__init__.py b/src/fast_langdetect/__init__.py
index 1e22cbd..3c3d5d8 100644
--- a/src/fast_langdetect/__init__.py
+++ b/src/fast_langdetect/__init__.py
@@ -1,4 +1,4 @@
 # -*- coding: utf-8 -*-
 
 
-from .ft_detect import detect, detect_language, detect_langs, detect_multilingual  # noqa: F401
\ No newline at end of file
+from .ft_detect import detect, detect_language, detect_multilingual  # noqa: F401
\ No newline at end of file
diff --git a/src/fast_langdetect/ft_detect/__init__.py b/src/fast_langdetect/ft_detect/__init__.py
index 85e81a5..e8c0e76 100644
--- a/src/fast_langdetect/ft_detect/__init__.py
+++ b/src/fast_langdetect/ft_detect/__init__.py
@@ -1,6 +1,5 @@
 # -*- coding: utf-8 -*-
 # @Time    : 2024/1/17 下午4:00
-import logging
 
 from .infer import detect
 from .infer import detect_multilingual  # noqa: F401
@@ -24,14 +23,3 @@ def detect_language(sentence, *, low_memory: bool = True):
     if lang_code == "JA" and not is_japanese(sentence):
         lang_code = "ZH"
     return lang_code
-
-
-def detect_langs(sentence, *, low_memory: bool = True):
-    """
-    Detect language
-    :param sentence: str sentence
-    :param low_memory: bool (default: True) whether to use low memory mode
-    :return: ZH, EN, JA, KO, FR, DE, ES, .... (two uppercase letters)
-    """
-    logging.warning("detect_langs is deprecated, use detect_language instead")
-    return detect_language(sentence, low_memory=low_memory)

From 2628dfe9d962e826885c6303f44f0329d78b8afe Mon Sep 17 00:00:00 2001
From: sudoskys <coldlando@hotmail.com>
Date: Sat, 11 Jan 2025 19:58:25 +0800
Subject: [PATCH 2/5] =?UTF-8?q?=E2=9C=A8=20feat(app):=20enhance=20FastText?=
 =?UTF-8?q?=20model=20download=20&=20load=20settings?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

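- Lowered `retry_max` from 3 to 2 and `timeout` from 30s to 7s, and set
  `sleep_max` to 5s for the model download
- Changed logging from error to warning for model load failures
- Defaulted `low_memory` to False in `detect_multilingual` for better accuracy

These changes make a failing download give up sooner and make
multilingual detection prefer the larger, more accurate model.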
---
 src/fast_langdetect/ft_detect/infer.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/fast_langdetect/ft_detect/infer.py b/src/fast_langdetect/ft_detect/infer.py
index 6442e7f..edae183 100644
--- a/src/fast_langdetect/ft_detect/infer.py
+++ b/src/fast_langdetect/ft_detect/infer.py
@@ -62,8 +62,9 @@ def download_model(
             folder=str(save_path.parent),
             filename=save_path.name,
             proxy=proxy,
-            retry_max=3,
-            timeout=30,
+            retry_max=2,
+            sleep_max=5,
+            timeout=7,
         )
     except Exception as e:
         logger.error(f"fast-langdetect:Failed to download FastText model from {download_url}: {e}")
@@ -94,7 +95,7 @@ def load_fasttext_model(
         # Load FastText model
         return fasttext.load_model(str(model_path))
     except Exception as e:
-        logger.error(f"fast-langdetect:Failed to load FastText model from {model_path}: {e}")
+        logger.warning(f"fast-langdetect:Failed to load FastText model from {model_path}: {e}")
         raise DetectError(f"Failed to load FastText model: {e}")
 
 
@@ -131,7 +132,7 @@ def load_model(
         _model_cache.cache_model(cache_key, model)
         return model
     except Exception as e:
-        logger.error(f"fast-langdetect:Failed to load model ({'low' if low_memory else 'high'} memory): {e}")
+        logger.warning(f"fast-langdetect:Failed to load model ({'low' if low_memory else 'high'} memory): {e}")
         if use_strict_mode:
             raise DetectError("Failed to load FastText model.") from e
         elif not low_memory:
@@ -176,7 +177,7 @@ def detect(
 def detect_multilingual(
         text: str,
         *,
-        low_memory: bool = True,
+        low_memory: bool = False,
         model_download_proxy: Optional[str] = None,
         k: int = 5,
         threshold: float = 0.0,

From 1859e86df46f328f9c946c986cac376cb11316d7 Mon Sep 17 00:00:00 2001
From: sudoskys <coldlando@hotmail.com>
Date: Sat, 11 Jan 2025 20:21:01 +0800
Subject: [PATCH 3/5] =?UTF-8?q?=E2=9C=A8=20feat(app):=20add=20MD5=20verifi?=
 =?UTF-8?q?cation=20for=20FastText=20model=20integrity?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

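Introduce an MD5 hash check for the large FastText model (`lid.176.bin`).
When the model file already exists, its hash is compared against the
expected value before loading; on mismatch a warning is logged and
loading still proceeds. This helps detect corrupted downloads, which can
reduce prediction accuracy. The check can be disabled by setting
`VERIFY_FASTTEXT_LARGE_MODEL = None`.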
---
 src/fast_langdetect/ft_detect/infer.py | 52 +++++++++++++++++++++++---
 1 file changed, 47 insertions(+), 5 deletions(-)

diff --git a/src/fast_langdetect/ft_detect/infer.py b/src/fast_langdetect/ft_detect/infer.py
index edae183..eb771f3 100644
--- a/src/fast_langdetect/ft_detect/infer.py
+++ b/src/fast_langdetect/ft_detect/infer.py
@@ -17,6 +17,7 @@
 
 FASTTEXT_LARGE_MODEL_URL = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin"
 FASTTEXT_LARGE_MODEL_NAME = "lid.176.bin"
+VERIFY_FASTTEXT_LARGE_MODEL = "01810bc59c6a3d2b79c79e6336612f65"
 
 
 class DetectError(Exception):
@@ -38,6 +39,36 @@ def cache_model(self, key: str, model) -> None:
 
 _model_cache = ModelManager()
 
+import hashlib
+
+
+def calculate_md5(file_path, chunk_size=8192):
+    """
+    Calculate the MD5 hash of a file.
+
+    :param file_path: Path to the file
+    :param chunk_size: Size of each chunk to read from the file
+    :return: MD5 hash of the file
+    """
+    md5 = hashlib.md5()
+    with open(file_path, 'rb') as f:
+        for chunk in iter(lambda: f.read(chunk_size), b''):
+            md5.update(chunk)
+    return md5.hexdigest()
+
+
+def verify_md5(file_path, expected_md5, chunk_size=8192):
+    """
+    Verify the MD5 hash of a file against an expected hash.
+
+    :param file_path: Path to the file
+    :param expected_md5: Expected MD5 hash
+    :param chunk_size: Size of each chunk to read from the file
+    :return: True if the file's MD5 hash matches the expected hash, False otherwise
+    """
+    md5 = calculate_md5(file_path, chunk_size)
+    return md5 == expected_md5
+
 
 def download_model(
         download_url: str,
@@ -84,12 +115,23 @@ def load_fasttext_model(
     :return: FastText model
     :raises DetectError: If model loading fails
     """
-    if not model_path.exists() and download_url:
-        # Attempt to download the model
-        download_model(download_url, model_path, proxy)
-
+    if all([
+        VERIFY_FASTTEXT_LARGE_MODEL,
+        model_path.exists(),
+        model_path.name == FASTTEXT_LARGE_MODEL_NAME,
+    ]):
+        if not verify_md5(model_path, VERIFY_FASTTEXT_LARGE_MODEL):
+            logger.warning(
+                f"fast-langdetect: MD5 hash verification failed for {model_path}, "
+                f"please check the integrity of the downloaded file from {FASTTEXT_LARGE_MODEL_URL}. "
+                "\n    This may seriously reduce the prediction accuracy. "
+                "If you want to ignore this, please set `fast_langdetect.ft_detect.infer.VERIFY_FASTTEXT_LARGE_MODEL = None` "
+            )
     if not model_path.exists():
-        raise DetectError(f"FastText model file not found at {model_path}")
+        if download_url:
+            download_model(download_url, model_path, proxy)
+        if not model_path.exists():
+            raise DetectError(f"FastText model file not found at {model_path}")
 
     try:
         # Load FastText model

From 8577da5028afb88f157ddac42a35c58205b4650a Mon Sep 17 00:00:00 2001
From: sudoskys <coldlando@hotmail.com>
Date: Sat, 11 Jan 2025 20:30:28 +0800
Subject: [PATCH 4/5] =?UTF-8?q?=F0=9F=93=9D=20docs(api):=20enhance=20docst?=
 =?UTF-8?q?ring=20with=20usage=20notes=20and=20parameter=20details?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Added guidelines for handling line breaks in input text.
- Clarified parameter descriptions for better user understanding.
- Highlighted the importance of large models for accuracy.
---
 src/fast_langdetect/ft_detect/infer.py | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/src/fast_langdetect/ft_detect/infer.py b/src/fast_langdetect/ft_detect/infer.py
index eb771f3..e68cee3 100644
--- a/src/fast_langdetect/ft_detect/infer.py
+++ b/src/fast_langdetect/ft_detect/infer.py
@@ -192,12 +192,15 @@ def detect(
 ) -> Dict[str, Union[str, float]]:
     """
     Detect the language of a text using FastText.
-    This function assumes to be given a single line of text. We split words on whitespace (space, newline, tab, vertical tab) and the control characters carriage return, formfeed and the null character.
-    If the model is not supervised, this function will throw a ValueError.
+
+    - You MUST manually remove line breaks (newline characters) from the text to be processed in advance, otherwise a ValueError is raised.
+
+    - In scenarios **where accuracy is important**, you should not rely on the detection results of the small model; use `low_memory=False` to download the larger model.
+
     :param text: The text for language detection
-    :param low_memory: Whether to use a memory-efficient model
+    :param low_memory: Whether to use the compressed version of the model (https://fasttext.cc/docs/en/language-identification.html)
     :param model_download_proxy: Download proxy for the model if needed
-    :param use_strict_mode: If it was enabled, strictly loads large model or raises error if it fails
+    :param use_strict_mode: When this parameter is enabled, the fallback after loading failure will be disabled.
     :return: A dictionary with detected language and confidence score
     :raises LanguageDetectionError: If detection fails
     """
@@ -227,6 +230,18 @@ def detect_multilingual(
 ) -> List[Dict[str, Any]]:
     """
     Detect the top-k probable languages for a given text.
+
+    - You MUST manually remove line breaks (newline characters) from the text to be processed in advance, otherwise a ValueError is raised.
+
+    - In scenarios **where accuracy is important**, you should not rely on the detection results of the small model; use `low_memory=False` to download the larger model.
+
+    :param text: The text for language detection
+    :param low_memory: Whether to use the compressed version of the model (https://fasttext.cc/docs/en/language-identification.html)
+    :param model_download_proxy: Download proxy for the model if needed
+    :param k: Number of top languages to return
+    :param threshold: Minimum confidence score to consider
+    :param use_strict_mode: When this parameter is enabled, the fallback after loading failure will be disabled.
+    :return: A list of dictionaries with detected languages and confidence scores
     """
     model = load_model(
         low_memory=low_memory,

From f08eeaaa029151003c662ef8c5b42023510268b7 Mon Sep 17 00:00:00 2001
From: sudoskys <coldlando@hotmail.com>
Date: Sat, 11 Jan 2025 20:33:06 +0800
Subject: [PATCH 5/5] chore(release): bump version to 0.2.4 in pyproject.toml
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

🔄 Updated the project version to 0.2.4 for new changes.
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 025b726..e4dc19e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "fast-langdetect"
-version = "0.2.3"
+version = "0.2.4"
 description = "Quickly detect text language and segment language"
 authors = [
     { name = "sudoskys", email = "coldlando@hotmail.com" },