Skip to content

Commit

Permalink
Merge pull request #9 from LlmKira/dev-3
Browse files Browse the repository at this point in the history
✨ feat(app): add MD5 verification for FastText model integrity
  • Loading branch information
sudoskys authored Jan 11, 2025
2 parents 2de37bb + f08eeaa commit 364d4f5
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 28 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "fast-langdetect"
version = "0.2.3"
version = "0.2.4"
description = "Quickly detect text language and segment language"
authors = [
{ name = "sudoskys", email = "coldlando@hotmail.com" },
Expand Down
2 changes: 1 addition & 1 deletion src/fast_langdetect/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# -*- coding: utf-8 -*-


from .ft_detect import detect, detect_language, detect_langs, detect_multilingual # noqa: F401
from .ft_detect import detect, detect_language, detect_multilingual # noqa: F401
12 changes: 0 additions & 12 deletions src/fast_langdetect/ft_detect/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# -*- coding: utf-8 -*-
# @Time : 2024/1/17 下午4:00
import logging

from .infer import detect
from .infer import detect_multilingual # noqa: F401
Expand All @@ -24,14 +23,3 @@ def detect_language(sentence, *, low_memory: bool = True):
if lang_code == "JA" and not is_japanese(sentence):
lang_code = "ZH"
return lang_code


def detect_langs(sentence, *, low_memory: bool = True):
    """
    Detect language (deprecated alias for ``detect_language``).

    :param sentence: str sentence
    :param low_memory: bool (default: True) whether to use low memory mode
    :return: ZH, EN, JA, KO, FR, DE, ES, .... (two uppercase letters)
    """
    import warnings

    # Surface the deprecation through Python's standard warnings machinery
    # (filterable, visible to test runners, points at the caller via
    # stacklevel=2) in addition to the original log message, which is kept
    # so existing log-based monitoring keeps working.
    warnings.warn(
        "detect_langs is deprecated, use detect_language instead",
        DeprecationWarning,
        stacklevel=2,
    )
    logging.warning("detect_langs is deprecated, use detect_language instead")
    return detect_language(sentence, low_memory=low_memory)
86 changes: 72 additions & 14 deletions src/fast_langdetect/ft_detect/infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

FASTTEXT_LARGE_MODEL_URL = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin"
FASTTEXT_LARGE_MODEL_NAME = "lid.176.bin"
VERIFY_FASTTEXT_LARGE_MODEL = "01810bc59c6a3d2b79c79e6336612f65"


class DetectError(Exception):
Expand All @@ -38,6 +39,36 @@ def cache_model(self, key: str, model) -> None:

_model_cache = ModelManager()

import hashlib


def calculate_md5(file_path, chunk_size=8192):
    """
    Compute the hex-encoded MD5 digest of a file, reading it in chunks.

    Reading chunk-by-chunk keeps memory usage flat regardless of file size.

    :param file_path: Path to the file
    :param chunk_size: Number of bytes to read per iteration (default: 8192)
    :return: Hexadecimal MD5 digest string of the file's contents
    """
    digest = hashlib.md5()
    with open(file_path, 'rb') as stream:
        while True:
            block = stream.read(chunk_size)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()


def verify_md5(file_path, expected_md5, chunk_size=8192):
    """
    Check whether a file's MD5 digest matches an expected value.

    Thin convenience wrapper around :func:`calculate_md5`.

    :param file_path: Path to the file
    :param expected_md5: Expected MD5 hash (hex string)
    :param chunk_size: Size of each chunk to read from the file
    :return: True if the file's MD5 hash matches the expected hash, False otherwise
    """
    return calculate_md5(file_path, chunk_size) == expected_md5


def download_model(
download_url: str,
Expand All @@ -62,8 +93,9 @@ def download_model(
folder=str(save_path.parent),
filename=save_path.name,
proxy=proxy,
retry_max=3,
timeout=30,
retry_max=2,
sleep_max=5,
timeout=7,
)
except Exception as e:
logger.error(f"fast-langdetect:Failed to download FastText model from {download_url}: {e}")
Expand All @@ -83,18 +115,29 @@ def load_fasttext_model(
:return: FastText model
:raises DetectError: If model loading fails
"""
if not model_path.exists() and download_url:
# Attempt to download the model
download_model(download_url, model_path, proxy)

if all([
VERIFY_FASTTEXT_LARGE_MODEL,
model_path.exists(),
model_path.name == FASTTEXT_LARGE_MODEL_NAME,
]):
if not verify_md5(model_path, VERIFY_FASTTEXT_LARGE_MODEL):
logger.warning(
f"fast-langdetect: MD5 hash verification failed for {model_path}, "
f"please check the integrity of the downloaded file from {FASTTEXT_LARGE_MODEL_URL}. "
"\n This may seriously reduce the prediction accuracy. "
"If you want to ignore this, please set `fast_langdetect.ft_detect.infer.VERIFY_FASTTEXT_LARGE_MODEL = None` "
)
if not model_path.exists():
raise DetectError(f"FastText model file not found at {model_path}")
if download_url:
download_model(download_url, model_path, proxy)
if not model_path.exists():
raise DetectError(f"FastText model file not found at {model_path}")

try:
# Load FastText model
return fasttext.load_model(str(model_path))
except Exception as e:
logger.error(f"fast-langdetect:Failed to load FastText model from {model_path}: {e}")
logger.warning(f"fast-langdetect:Failed to load FastText model from {model_path}: {e}")
raise DetectError(f"Failed to load FastText model: {e}")


Expand Down Expand Up @@ -131,7 +174,7 @@ def load_model(
_model_cache.cache_model(cache_key, model)
return model
except Exception as e:
logger.error(f"fast-langdetect:Failed to load model ({'low' if low_memory else 'high'} memory): {e}")
logger.warning(f"fast-langdetect:Failed to load model ({'low' if low_memory else 'high'} memory): {e}")
if use_strict_mode:
raise DetectError("Failed to load FastText model.") from e
elif not low_memory:
Expand All @@ -149,12 +192,15 @@ def detect(
) -> Dict[str, Union[str, float]]:
"""
Detect the language of a text using FastText.
This function assumes to be given a single line of text. We split words on whitespace (space, newline, tab, vertical tab) and the control characters carriage return, formfeed and the null character.
If the model is not supervised, this function will throw a ValueError.
- You MUST manually remove line breaks(`n`) from the text to be processed in advance, otherwise a ValueError is raised.
- In scenarios **where accuracy is important**, you should not rely on the detection results of small models, use `low_memory=False` to download larger models!
:param text: The text for language detection
:param low_memory: Whether to use a memory-efficient model
:param low_memory: Whether to use the compressed version of the model (https://fasttext.cc/docs/en/language-identification.html)
:param model_download_proxy: Download proxy for the model if needed
:param use_strict_mode: If it was enabled, strictly loads large model or raises error if it fails
:param use_strict_mode: When this parameter is enabled, the fallback after loading failure will be disabled.
:return: A dictionary with detected language and confidence score
:raises LanguageDetectionError: If detection fails
"""
Expand All @@ -176,14 +222,26 @@ def detect(
def detect_multilingual(
text: str,
*,
low_memory: bool = True,
low_memory: bool = False,
model_download_proxy: Optional[str] = None,
k: int = 5,
threshold: float = 0.0,
use_strict_mode: bool = False,
) -> List[Dict[str, Any]]:
"""
Detect the top-k probable languages for a given text.
- You MUST manually remove line breaks(`n`) from the text to be processed in advance, otherwise a ValueError is raised.
- In scenarios **where accuracy is important**, you should not rely on the detection results of small models, use `low_memory=False` to download larger models!
:param text: The text for language detection
:param low_memory: Whether to use the compressed version of the model (https://fasttext.cc/docs/en/language-identification.html)
:param model_download_proxy: Download proxy for the model if needed
:param k: Number of top languages to return
:param threshold: Minimum confidence score to consider
:param use_strict_mode: When this parameter is enabled, the fallback after loading failure will be disabled.
:return: A list of dictionaries with detected languages and confidence scores
"""
model = load_model(
low_memory=low_memory,
Expand Down

0 comments on commit 364d4f5

Please sign in to comment.