🎨 chore(ft_detect): refactor detect_langs to detect_language
Refactor the `detect_langs` function into `detect_language` for clarity, keeping
`detect_langs` as a deprecated alias that logs a warning. Remove redundancy and
improve code readability.
sudoskys committed Jul 6, 2024
1 parent 00aa55e commit 17b159a
Showing 7 changed files with 130 additions and 53 deletions.
84 changes: 53 additions & 31 deletions README.md
@@ -2,67 +2,89 @@

 [![PyPI version](https://badge.fury.io/py/fast-langdetect.svg)](https://badge.fury.io/py/fast-langdetect)
 [![Downloads](https://pepy.tech/badge/fast-langdetect)](https://pepy.tech/project/fast-langdetect)
-[![Downloads](https://pepy.tech/badge/fast-langdetect/month)](https://pepy.tech/project/fast-langdetect/month)
+[![Downloads](https://pepy.tech/badge/fast-langdetect/month)](https://pepy.tech/project/fast-langdetect/)

-Python 3.9-3.12 support only. 🐍
+## Overview

-80x faster and 95% accurate language identification with Fasttext 🏎️
+**fast-langdetect** provides ultra-fast and highly accurate language detection based on FastText, a library developed by
+Facebook. This package is 80x faster than traditional methods and offers 95% accuracy.

-This library is a wrapper for the language detection model trained on fasttext by Facebook. For more information, please
-visit: https://fasttext.cc/docs/en/language-identification.html 📘
+It supports Python versions 3.9 to 3.12.

-This repository is patched
-from [zafercavdar/fasttext-langdetect](https://github.com/zafercavdar/fasttext-langdetect#benchmark), adding
-multi-language segmentation and better packaging
-support. 🌐
+This project builds upon [zafercavdar/fasttext-langdetect](https://github.com/zafercavdar/fasttext-langdetect#benchmark)
+with enhancements in packaging.

-Facilitates more accurate TTS implementation. 🗣️
+For more information on the underlying FastText model, refer to the official
+documentation: [FastText Language Identification](https://fasttext.cc/docs/en/language-identification.html).

-**Need 200M+ memory to use low_memory mode** 💾
+> [!NOTE]
+> This library requires over 200MB of memory to use in low memory mode.

 ## Installation 💻

+To install fast-langdetect, you can use either `pip` or `pdm`:
+
+### Using pip
+
 ```bash
 pip install fast-langdetect
 ```

-## Usage 🖥️
+### Using pdm
+
+```bash
+pdm add fast-langdetect
+```

-**For more accurate language detection, please use `detect(text,low_memory=False)` to load the big model.**
+## Usage 🖥️

-**Model will be downloaded in `/tmp/fasttext-langdetect` directory when you first use it.**
+For optimal performance and accuracy in language detection, use `detect(text, low_memory=False)` to load the larger
+model.

-```python
-from fast_langdetect import detect_langs
+> The model will be downloaded to the `/tmp/fasttext-langdetect` directory upon first use.

-print(detect_langs("Hello, world!"))
-# EN
+### Native API (Recommended)

-print(detect_langs("Привет, мир!"))
-# RU
+```python
+from fast_langdetect import detect, detect_multilingual
+
+# Single language detection
+print(detect("Hello, world!"))
+# Output: {'lang': 'en', 'score': 0.1520957201719284}

-print(detect_langs("你好,世界!"))
-# ZH
+print(detect("Привет, мир!")["lang"])
+# Output: ru
+
+# Multi-language detection
+print(detect_multilingual("Hello, world!你好世界!Привет, мир!"))
+# Output: [
+#   {'lang': 'ru', 'score': 0.39008623361587524},
+#   {'lang': 'zh', 'score': 0.18235979974269867},
+# ]
 ```

-## Advanced usage 🚀
+### Convenient `detect_language` Function

 ```python
-from fast_langdetect import detect, detect_multilingual
+from fast_langdetect import detect_language

-print(detect("Hello, world!"))
-# {'lang': 'en', 'score': 0.1520957201719284}
+# Single language detection
+print(detect_language("Hello, world!"))
+# Output: EN

-print(detect_multilingual("Hello, world!你好世界!Привет, мир!"))
-# [{'lang': 'ru', 'score': 0.39008623361587524}, {'lang': 'zh', 'score': 0.18235979974269867}, {'lang': 'ja', 'score': 0.08473210036754608}, {'lang': 'sr', 'score': 0.057975586503744125}, {'lang': 'en', 'score': 0.05422825738787651}]
+print(detect_language("Привет, мир!"))
+# Output: RU
+
+print(detect_language("你好,世界!"))
+# Output: ZH
 ```

-### Splitting text by language 🌐
+### Splitting Text by Language 🌐

-check out the [split-lang](https://github.com/DoodleBears/split-lang).
+For text splitting based on language, please refer to the [split-lang](https://github.com/DoodleBears/split-lang)
+repository.

 ## Accuracy 🎯

-References to the [benchmark](https://github.com/zafercavdar/fasttext-langdetect#benchmark)
+For detailed benchmark results, refer
+to [zafercavdar/fasttext-langdetect#benchmark](https://github.com/zafercavdar/fasttext-langdetect#benchmark).
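The README examples above return three different shapes: `detect` yields a `{'lang', 'score'}` dict, `detect_multilingual` a score-ranked list of such dicts, and `detect_language` a bare uppercase code. A minimal sketch of consuming those shapes, with hard-coded sample results standing in for real model output (the scores are illustrative, not from an actual run):

```python
# Hard-coded sample results mirroring the fast-langdetect output shapes;
# values are illustrative stand-ins, not real model output.
single = {"lang": "en", "score": 0.152}
multi = [
    {"lang": "ru", "score": 0.390},
    {"lang": "zh", "score": 0.182},
]

# A detect_language-style uppercase code from a detect-style dict
print(single["lang"].upper())
# EN

# Pick the top-scoring candidate from a detect_multilingual-style list
top = max(multi, key=lambda r: r["score"])
print(f"{top['lang']} ({top['score']:.2f})")
# ru (0.39)
```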
23 changes: 15 additions & 8 deletions feature_test/__init__.py
@@ -1,14 +1,21 @@
 # -*- coding: utf-8 -*-
 # @Time    : 2024/1/18 11:41 AM
 # @Author  : sudoskys
 # @File    : __init__.py.py
 # @Software: PyCharm
-from fast_langdetect import detect, detect_multilingual, detect_langs
-from fast_langdetect import parse_sentence
-
-print(parse_sentence("你好世界"))
-print(parse_sentence("你好世界!Hello, world!Привет, мир!"))
-print(detect_multilingual("Hello, world!你好世界!Привет, мир!"))
-
+from fast_langdetect import detect, detect_multilingual, detect_language
+
+# Test Traditional Chinese, Simplified Chinese, Japanese, English, Korean, French, German, Spanish
+
+print(detect_multilingual("Hello, world!你好世界!Привет, мир!"))
+# [{'lang': 'ja', 'score': 0.32009604573249817}, {'lang': 'uk', 'score': 0.27781224250793457}, {'lang': 'zh', 'score': 0.17542070150375366}, {'lang': 'sr', 'score': 0.08751443773508072}, {'lang': 'bg', 'score': 0.05222449079155922}]
 print(detect("hello world"))
-print(detect_langs("Привет, мир!"))
+
+print(detect_language("Привет, мир!"))
+print(detect_language("你好世界"))
+print(detect_language("こんにちは世界"))
+print(detect_language("안녕하세요 세계"))
+print(detect_language("Bonjour le monde"))
+print(detect_language("Hallo Welt"))
+print(detect_language("Hola mundo"))
+print(detect_language("這些機構主辦的課程,多以基本電腦使用為主,例如文書處理、中文輸入、互聯網應用等"))
31 changes: 27 additions & 4 deletions pdm.lock

Some generated files are not rendered by default.

3 changes: 1 addition & 2 deletions pyproject.toml
@@ -7,9 +7,8 @@ authors = [
]
 dependencies = [
     "fasttext-wheel>=0.9.2",
-    "requests>=2.31.0",
+    "robust-downloader>=0.0.2",
     "numpy>=1.26.4,<2.0.0",
-    "langdetect>=1.0.9",
 ]
requires-python = ">=3.9,<3.13"
readme = "README.md"
2 changes: 1 addition & 1 deletion src/fast_langdetect/__init__.py
@@ -1,3 +1,3 @@
# -*- coding: utf-8 -*-

-from .ft_detect import detect, detect_langs, detect_multilingual # noqa: F401
+from .ft_detect import detect, detect_language, detect_langs, detect_multilingual # noqa: F401
17 changes: 14 additions & 3 deletions src/fast_langdetect/ft_detect/__init__.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # @Time    : 2024/1/17 4:00 PM
 # @Author  : sudoskys
 # @File    : __init__.py
+import logging

 from .infer import detect
 from .infer import detect_multilingual  # noqa: F401

@@ -13,7 +13,7 @@ def is_japanese(string):
     return False


-def detect_langs(sentence, *, low_memory: bool = True):
+def detect_language(sentence, *, low_memory: bool = True):
     """
     Detect language
     :param sentence: str sentence
@@ -24,3 +24,14 @@ def detect_langs(sentence, *, low_memory: bool = True):
     if lang_code == "JA" and not is_japanese(sentence):
         lang_code = "ZH"
     return lang_code
+
+
+def detect_langs(sentence, *, low_memory: bool = True):
+    """
+    Detect language
+    :param sentence: str sentence
+    :param low_memory: bool (default: True) whether to use low memory mode
+    :return: ZH, EN, JA, KO, FR, DE, ES, .... (two uppercase letters)
+    """
+    logging.warning("detect_langs is deprecated, use detect_language instead")
+    return detect_language(sentence, low_memory=low_memory)
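The `is_japanese` helper is collapsed in the hunk above, but the `JA`→`ZH` fallback implies a check for Japanese-specific characters. A plausible stand-in (an assumption for illustration, not necessarily the library's actual implementation) tests for kana, since kanji alone is indistinguishable from Chinese:

```python
def is_japanese(string: str) -> bool:
    # Kana (hiragana U+3040-U+309F, katakana U+30A0-U+30FF) occurs only in
    # Japanese text; CJK ideographs alone could equally well be Chinese.
    return any("\u3040" <= ch <= "\u30ff" for ch in string)

print(is_japanese("こんにちは世界"))  # True  (contains hiragana)
print(is_japanese("你好世界"))        # False (ideographs only)
```

This explains why the Traditional Chinese sentence in the new tests is corrected from `JA` to `ZH`: it contains no kana.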
23 changes: 19 additions & 4 deletions tests/test_detect.py
@@ -11,8 +11,23 @@ def test_muti_detect():
     assert result[0].get("lang") == "en", "ft_detect error"


+def test_detect():
+    from fast_langdetect import detect
+    assert detect("hello world")["lang"] == "en", "ft_detect error"
+    assert detect("你好世界")["lang"] == "zh", "ft_detect error"
+    assert detect("こんにちは世界")["lang"] == "ja", "ft_detect error"
+    assert detect("안녕하세요 세계")["lang"] == "ko", "ft_detect error"
+    assert detect("Bonjour le monde")["lang"] == "fr", "ft_detect error"
+
+
 def test_detect_totally():
-    from fast_langdetect import detect_langs
-    assert detect_langs("hello world") == "EN", "ft_detect error"
-    assert detect_langs("你好世界") == "ZH", "ft_detect error"
-    assert detect_langs("こんにちは世界") == "JA", "ft_detect error"
+    from fast_langdetect import detect_language
+    assert detect_language("hello world") == "EN", "ft_detect error"
+    assert detect_language("你好世界") == "ZH", "ft_detect error"
+    assert detect_language("こんにちは世界") == "JA", "ft_detect error"
+    assert detect_language("안녕하세요 세계") == "KO", "ft_detect error"
+    assert detect_language("Bonjour le monde") == "FR", "ft_detect error"
+    assert detect_language("Hallo Welt") == "DE", "ft_detect error"
+    assert detect_language(
+        "這些機構主辦的課程,多以基本電腦使用為主,例如文書處理、中文輸入、互聯網應用等"
+    ) == "ZH", "ft_detect error"
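The new tests cover the renamed function but not the deprecation path itself. Since the commit emits the warning via `logging.warning` rather than the `warnings` module, one way to assert it is to capture root-logger output; the sketch below uses local stand-ins mirroring the commit's pattern, not the real library:

```python
import io
import logging

# Local stand-ins mirroring the pattern in this commit (not the real library).
def detect_language(sentence, *, low_memory=True):
    return "EN"

def detect_langs(sentence, *, low_memory=True):
    logging.warning("detect_langs is deprecated, use detect_language instead")
    return detect_language(sentence, low_memory=low_memory)

# Attach a handler that captures root-logger output, then exercise the alias.
stream = io.StringIO()
handler = logging.StreamHandler(stream)
logging.getLogger().addHandler(handler)
try:
    assert detect_langs("hello world") == "EN"
finally:
    logging.getLogger().removeHandler(handler)
assert "deprecated" in stream.getvalue()
print("deprecation warning captured")
```

Using `warnings.warn(..., DeprecationWarning)` instead of `logging.warning` would let callers filter the message with the standard warnings machinery, but the assertion style above matches what the commit actually does.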
