From af3df0b6030fa70de1bd9b376f26a28e5b2e37a6 Mon Sep 17 00:00:00 2001 From: dem Date: Sat, 26 Sep 2020 15:00:20 +0200 Subject: [PATCH] Add getLanguages() and loaded models restrictions for static calls - Add manual - Enforce tests --- README.md | 44 +++++++++++++- src/LanguageDetector/LanguageDetector.php | 57 +++++++++++++------ tests/LanguageDetector/LanguageSubsetTest.php | 27 ++++++++- 3 files changed, 108 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 3a9b2b1..a6c4bb4 100644 --- a/README.md +++ b/README.md @@ -17,12 +17,15 @@ Table of contents - [API Methods](#api-methods) - [evaluate()](#evaluate) - [getLanguage()](#getlanguage) + - [getLanguages()](#getLanguages) - [getScores()](#getscores) - [getSupportedLanguages()](#getsupportedlanguages) - [getText()](#gettext) + - [options](#options) - [For one-liners only](#for-one-liners-only) + Features -------- @@ -32,7 +35,7 @@ Features - Learning steps are already done, library is ready to use - Small code, small footprint - N-grams algorithm -- Supports PHP 5.4, 5.5, 5.6, 7.0, 7.1, 7.2, 7.3, 7.4 and HHVM +- Supports PHP 5.4, 5.5, 5.6, 7.0, 7.1, 7.2, 7.3, 7.4, 8.0 and HHVM Install @@ -160,6 +163,19 @@ $detector->getLanguage(); // Returns 'en' ``` ________________________________________________________________________ +#### getLanguages() + +__Type__ *array* + +A list of loaded models that will be evaluated. + +__Example__ + +```php +$detector->getLanguages(); // Returns something like ['de', 'en', 'fr'] +``` +________________________________________________________________________ + #### getScores() __Type__ *array* @@ -222,6 +238,26 @@ $detector->getText(); ``` ________________________________________________________________________ +#### Options + +__Type__ *\LanguageDetector\LanguageDetector* + +For even better performance, loaded models can be specified explicitly. + +__Example__ + +```php + +$text = 'My tailor is rich and Alison is in the kitchen with Bob.'; + +$detector = new LanguageDetector(null, ['en', 'fr', 'de']); + +$language = $detector->evaluate($text); + +echo $language; // Prints something like 'en' +``` +________________________________________________________________________ + #### For one-liners only __Type__ *\LanguageDetector\LanguageDetector* @@ -260,5 +296,11 @@ print_r($detector->getSupportedLanguages()); // The last evaluated string echo $detector->getText(); +// Limit loaded languages for even better performance +echo LanguageDetector\LanguageDetector::detect( + 'My tailor is rich and Alison is in the kitchen with Bob.', + ['en', 'de', 'fr', 'es'] +); // en + ``` ________________________________________________________________________ diff --git a/src/LanguageDetector/LanguageDetector.php b/src/LanguageDetector/LanguageDetector.php index eaff409..9ced84f 100644 --- a/src/LanguageDetector/LanguageDetector.php +++ b/src/LanguageDetector/LanguageDetector.php @@ -16,7 +16,7 @@ /** * LanguageDetector is the entry point for the detecting process. - */ + */ class LanguageDetector { /** @@ -41,17 +41,19 @@ class LanguageDetector /** * Configure all subset languages - * + * * @param string $dir A directory where subsets are. * @param array $languages Language codes to load models for. By default, all languages are loaded. */ - public function __construct($dir = null, $languages = null) + public function __construct($dir = null, array $languages = []) { $datadir = null === $dir ? __DIR__ . '/subsets' : rtrim($dir, '/'); foreach (glob($datadir . '/*') as $file) { - if (! $languages || in_array(basename($file), $languages)) { + if (!count($languages) + || in_array(basename($file), $languages) + ) { $this->languages[basename($file)] = new Language($file); } } @@ -59,7 +61,7 @@ public function __construct($dir = null, $languages = null) /** * Evaluates that a string matches a language - * + * * @param string $text * @return \LanguageDetector\LanguageDetector * @throws \InvalidArgumentException if $text is not a string @@ -87,15 +89,25 @@ public function evaluate($text): self /** * Static call for oneliners - * + * * @param string $text + * @param array $languages Language codes to load models for. By + * default, all languages are loaded. * @return \LanguageDetector\LanguageDetector * @api */ - public static function detect($text): self + public static function detect($text, array $languages = []): self { - if (is_null(self::$detector)) { - self::$detector = new self(); + // All specified models have been loaded + $diff = count($languages) + ? array_diff( + self::$detector->getLanguages(), + $languages + ) + : []; + + if (is_null(self::$detector) || count($diff)) { + self::$detector = new self(null, $languages); } return self::$detector->evaluate($text); @@ -127,9 +139,20 @@ public function getLanguage($code = null): Language return $this->languages[$code]; } + /** + * Get loaded languages + * + * @return []string An array of ISO codes + * @api + */ + public function getLanguages(): array + { + return array_keys($this->languages); + } + /** * Get all scored subsets - * + * * @return array An array of ISO codes => scores * @throws \Exception if nothing has been evaluated * @api @@ -145,7 +168,7 @@ public function getScores(): array /** * Get all supported languages - * + * * @return array An array of ISO codes * @api */ @@ -156,7 +179,7 @@ public function getSupportedLanguages(): array /** * Get evaluated text - * + * * @return string * @api */ @@ -167,7 +190,7 @@ public function getText(): string /** * Get best result when detector is used as a string - * + * * @return string */ public function __toString(): string @@ -177,14 +200,14 @@ public function __toString(): string /** * Evaluate probabilities for one language - * + * * @param array $chunks * @return \Closure An evaluator */ private function calculate(array $chunks): callable { return function($language, $code) use ($chunks) { - $this->scores[$code] = + $this->scores[$code] = array_sum( array_intersect_key( $language->getFreq(), @@ -197,7 +220,7 @@ private function calculate(array $chunks): callable /** * Chunk a text - * + * * @return array */ private function chunk(): array @@ -205,7 +228,7 @@ private function chunk(): array $chunks = []; $len = mb_strlen($this->text); - // Chunk sizes + // Chunk sizes for ($i = 0; $i < 3; $i++) { for ($j = 0; $j < $len; $j++) { if ($len > $j + $i) { diff --git a/tests/LanguageDetector/LanguageSubsetTest.php b/tests/LanguageDetector/LanguageSubsetTest.php index 22cd8cc..3a6ca33 100644 --- a/tests/LanguageDetector/LanguageSubsetTest.php +++ b/tests/LanguageDetector/LanguageSubsetTest.php @@ -3,6 +3,7 @@ namespace LanguageDetectorTest; use LanguageDetector\Language; +use LanguageDetector\LanguageDetector; use PHPUnit\Framework\TestCase; use InvalidArgumentException; @@ -74,7 +75,7 @@ public function getLanguageSubsetScenarios() /** * Tests that subset are loaded - * + * * @dataProvider getLanguageSubsetScenarios */ public function testSubsetContents($code, $expected = null) @@ -99,7 +100,7 @@ public function testSubsetContents($code, $expected = null) /** * Tests that getCode return s a valid code - * + * * @dataProvider getLanguageSubsetScenarios */ public function testSubsetGetCode($code, $expected = null) @@ -126,4 +127,26 @@ public function testSubsetGetCode($code, $expected = null) (string)$language ); } + + /** + * Tests that a limited number of subsets has been loaded + */ + public function testLimitLoadedSubsets() + { + $subsets = ['da', 'en', 'no', 'sv']; + + $language = new LanguageDetector(null, $subsets); + + $this->assertEquals( + $subsets, + $language->getLanguages() + ); + + $language = LanguageDetector::detect('ok', $subsets); + + $this->assertEquals( + $subsets, + $language->getLanguages() + ); + } }