Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Updated with PSR4 class and composer for autoloading packagist.
  • Loading branch information
Stéphane Bauland committed Jul 27, 2016
1 parent febe16b commit 6d2e3cb
Show file tree
Hide file tree
Showing 4 changed files with 636 additions and 1 deletion.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@ A PHP Bayesian classifier algorithm library.

### Installation
```php
composer require roomoot/classifier
composer require rookmoot/classifier
```
17 changes: 17 additions & 0 deletions composer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"name": "rookmoot/classifier",
"type": "library",
"description": "A PHP Bayesian classifier algorithm library.",
"keywords": ["Bayes", "Bayesian", "classifier", "algorithm"],
"homepage": "http://www.diatelys.fr",
"license": "BSD",
"authors": [
{
"name": "Stéphane Bauland",
"email": "stephane.bauland@diatelys.fr"
}
],
"autoload": {
"psr-4": {"Classify\\": "src/"}
}
}
211 changes: 211 additions & 0 deletions src/Classify.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
<?php

namespace Classify;

/**
* PHP version of Ruby Bayes classifier library
* @see http://github.com/xaviershay/classifier
*/
class Classify
{
private $_categories;
private $_totalWords;

/**
* The class can be created with one or more categories, each of which will be
* initialized and given a training method. E.g.,
* $b = new Bayes('Interesting', 'Uninteresting', 'Spam')
*/
public function __construct()
{
$categories = func_get_args();

$this->_totalWords = 0;

$this->_categories = array();
foreach ($categories as $category)
{
$this->_categories[$category] = array();
}
}

/**
* Provides a general training method for all categories specified in Bayes#new
* For example:
* $b = new Bayes('this', 'that', 'the_other')
* $b->train('this', 'This text')
* $b->train('that', 'That text')
* $b->train('the_other', 'The other text')
*/
public function train($category, $text)
{
foreach ($this->_wordArray($text) as $word => $count)
{
if (!isset($this->_categories[$category][$word]))
$this->_categories[$category][$word] = 0;

$this->_categories[$category][$word] += $count;

$this->_totalWords += $count;
}
}

/**
* Returns the scores in each category the provided +text+. E.g.,
* $b->classifications("I hate bad words and you")
* => {"Uninteresting" => -12.6997928013932, "Interesting" => -18.4206807439524}
* The largest of these scores (the one closest to 0) is the one picked out by classify()
*/
public function classifications($text)
{
$score = array();

foreach ($this->_categories as $category => $categoryWords)
{
$score[$category] = 0.0;
$total = array_sum(array_values($categoryWords));

foreach ($this->_wordArray($text) as $word => $count)
{
$s = isset($categoryWords[$word]) ? $categoryWords[$word] : 0.1;
$score[$category] += log($s / $total);
}
}

return $score;
}

/**
* Returns the classification of the provided +text+, which is one of the
* categories given in the initializer. E.g.,
* $b->classify("I hate bad words and you")
* => 'Uninteresting'
*/
public function classify($text)
{
$a = $this->classifications($text);
arsort($a);
return array_shift(array_keys($a));
}

// ----

/**
* Return an array of strings => ints. Each word in the string is stemmed,
* and indexes to its frequency in the document.
*/
private function _wordArray($word)
{
return $this->_wordArrayForWords(
array_merge(
preg_split('/\s+/', preg_replace('/[^\w\s]/','', $word)),
preg_split('/\s+/', preg_replace('/[\w]/',' ', $word))));
}

private function _wordArrayForWords($words)
{
$d = array();

foreach ($words as $word)
{
if (preg_match('/[\w]+/',$word)) $word = strtolower($word);
$key = Stemmer::Stem($word);

if (preg_match('/[^\w]/',$word)
|| !in_array($word, self::$CORPUS_SKIP_WORDS)
&& strlen($word) > 2)
{
if (!isset($d[$key]))
$d[$key] = 0;

$d[$key] += 1;
}
}

return $d;
}

private static $CORPUS_SKIP_WORDS = array(
"a",
"again",
"all",
"along",
"are",
"also",
"an",
"and",
"as",
"at",
"but",
"by",
"came",
"can",
"cant",
"couldnt",
"did",
"didn",
"didnt",
"do",
"doesnt",
"dont",
"ever",
"first",
"from",
"have",
"her",
"here",
"him",
"how",
"i",
"if",
"in",
"into",
"is",
"isnt",
"it",
"itll",
"just",
"last",
"least",
"like",
"most",
"my",
"new",
"no",
"not",
"now",
"of",
"on",
"or",
"should",
"sinc",
"so",
"some",
"th",
"than",
"this",
"that",
"the",
"their",
"then",
"those",
"to",
"told",
"too",
"true",
"try",
"until",
"url",
"us",
"were",
"when",
"whether",
"while",
"with",
"within",
"yes",
"you",
"youll",
);
}

Loading

0 comments on commit 6d2e3cb

Please sign in to comment.