-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Uploading new code from https://github.com/dhotson/classifier-php
Updated with a PSR-4 namespaced class and a composer.json so the library can be autoloaded and published on Packagist.
- Loading branch information
Stéphane Bauland
committed
Jul 27, 2016
1 parent
febe16b
commit 6d2e3cb
Showing
4 changed files
with
636 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
{ | ||
"name": "rookmoot/classifier", | ||
"type": "library", | ||
"description": "A PHP Bayesian classifier algorithm library.", | ||
"keywords": ["Bayes", "Bayesian", "classifier", "algorithm"], | ||
"homepage": "http://www.diatelys.fr", | ||
"license": "BSD", | ||
"authors": [ | ||
{ | ||
"name": "Stéphane Bauland", | ||
"email": "stephane.bauland@diatelys.fr" | ||
} | ||
], | ||
"autoload": { | ||
"psr-4": {"Classify\\": "src/"} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,211 @@ | ||
<?php | ||
|
||
namespace Classify; | ||
|
||
/**
 * Naive Bayes text classifier.
 *
 * PHP version of the Ruby Bayes classifier library.
 * @see http://github.com/xaviershay/classifier
 */
class Classify
{
    /** @var array Map of category name => (stemmed word => training count). */
    private $_categories;

    /** @var int Running total of word occurrences seen by train(), across all categories. */
    private $_totalWords;

    /**
     * Creates a classifier with one or more categories, each passed as a
     * separate argument. E.g.,
     *     $b = new Classify('Interesting', 'Uninteresting', 'Spam');
     */
    public function __construct()
    {
        $this->_totalWords = 0;
        $this->_categories = array();

        foreach (func_get_args() as $category) {
            $this->_categories[$category] = array();
        }
    }

    /**
     * Adds the words of $text to the training data of $category.
     * For example:
     *     $b = new Classify('this', 'that', 'the_other');
     *     $b->train('this', 'This text');
     *     $b->train('that', 'That text');
     *     $b->train('the_other', 'The other text');
     *
     * @param string $category Category to train.
     * @param string $text     Sample text belonging to that category.
     */
    public function train($category, $text)
    {
        foreach ($this->_wordArray($text) as $word => $count) {
            if (!isset($this->_categories[$category][$word])) {
                // This assignment also creates the category entry when
                // $category was not passed to the constructor.
                $this->_categories[$category][$word] = 0;
            }

            $this->_categories[$category][$word] += $count;
            $this->_totalWords += $count;
        }
    }

    /**
     * Returns the score of the provided $text in each category. E.g.,
     *     $b->classifications("I hate bad words and you")
     *     => array("Uninteresting" => -12.6997928013932, "Interesting" => -18.4206807439524)
     * The largest of these scores (the one closest to 0) is the one picked
     * out by classify().
     *
     * A category without any training data scores -INF as soon as the text
     * contains at least one counted word, so it can never win.
     *
     * @param string $text Text to score.
     * @return array Map of category name => float score.
     */
    public function classifications($text)
    {
        // The tokenised text is the same for every category: build it once.
        $words = array_keys($this->_wordArray($text));
        $score = array();

        foreach ($this->_categories as $category => $categoryWords) {
            $total = array_sum($categoryWords);

            if ($total <= 0) {
                // Untrained category: $s / $total below would divide by zero
                // (a DivisionByZeroError on PHP 8).
                $score[$category] = empty($words) ? 0.0 : -INF;
                continue;
            }

            $score[$category] = 0.0;

            // Each distinct word contributes once; its frequency in $text is
            // not used as a weight.
            foreach ($words as $word) {
                // Words never seen in this category get a pseudo-count of 0.1
                // so that log() stays finite.
                $s = isset($categoryWords[$word]) ? $categoryWords[$word] : 0.1;
                $score[$category] += log($s / $total);
            }
        }

        return $score;
    }

    /**
     * Returns the classification of the provided $text, which is one of the
     * known categories. E.g.,
     *     $b->classify("I hate bad words and you")
     *     => 'Uninteresting'
     *
     * @param string $text Text to classify.
     * @return mixed The best scoring category, or null when there are no categories.
     */
    public function classify($text)
    {
        $scores = $this->classifications($text);
        arsort($scores);

        // array_shift() takes its argument by reference, so it needs a real
        // variable rather than the result of array_keys() directly.
        $ranked = array_keys($scores);

        return array_shift($ranked);
    }

    // ----

    /**
     * Returns an array of strings => ints. Each word in the text is stemmed
     * and indexes its frequency in the text.
     *
     * Two token lists are merged: the whitespace-separated words with all
     * punctuation removed, and the runs of punctuation left once every word
     * character has been blanked out.
     */
    private function _wordArray($text)
    {
        // Empty tokens are always rejected by _wordArrayForWords(), so they
        // are dropped here already.
        $plainWords = preg_split(
            '/\s+/',
            preg_replace('/[^\w\s]/', '', $text),
            -1,
            PREG_SPLIT_NO_EMPTY
        );
        $symbols = preg_split(
            '/\s+/',
            preg_replace('/[\w]/', ' ', $text),
            -1,
            PREG_SPLIT_NO_EMPTY
        );

        return $this->_wordArrayForWords(array_merge($plainWords, $symbols));
    }

    /**
     * Counts the given tokens by stem.
     *
     * A token is kept when it contains a non-word character, or when it is
     * longer than two characters and is not a skip word.
     *
     * @param array $words List of tokens.
     * @return array Map of stemmed token => number of occurrences.
     */
    private function _wordArrayForWords($words)
    {
        $d = array();

        foreach ($words as $word) {
            if (preg_match('/[\w]+/', $word)) {
                $word = strtolower($word);
            }

            $keep = preg_match('/[^\w]/', $word)
                || (!in_array($word, self::$CORPUS_SKIP_WORDS, true)
                    && strlen($word) > 2);

            if (!$keep) {
                continue;
            }

            // Only kept tokens are stemmed; rejected ones never reach the stemmer.
            $key = Stemmer::Stem($word);

            if (!isset($d[$key])) {
                $d[$key] = 0;
            }

            $d[$key] += 1;
        }

        return $d;
    }

    /** @var array Common words that are ignored when counting. */
    private static $CORPUS_SKIP_WORDS = array(
        "a",
        "again",
        "all",
        "along",
        "are",
        "also",
        "an",
        "and",
        "as",
        "at",
        "but",
        "by",
        "came",
        "can",
        "cant",
        "couldnt",
        "did",
        "didn",
        "didnt",
        "do",
        "doesnt",
        "dont",
        "ever",
        "first",
        "from",
        "have",
        "her",
        "here",
        "him",
        "how",
        "i",
        "if",
        "in",
        "into",
        "is",
        "isnt",
        "it",
        "itll",
        "just",
        "last",
        "least",
        "like",
        "most",
        "my",
        "new",
        "no",
        "not",
        "now",
        "of",
        "on",
        "or",
        "should",
        "sinc",
        "so",
        "some",
        "th",
        "than",
        "this",
        "that",
        "the",
        "their",
        "then",
        "those",
        "to",
        "told",
        "too",
        "true",
        "try",
        "until",
        "url",
        "us",
        "were",
        "when",
        "whether",
        "while",
        "with",
        "within",
        "yes",
        "you",
        "youll",
    );
}
|
Oops, something went wrong.