bgruszka / phpantispam
Bayesian spam recognition library
v0.2.1
2016-08-30 13:10 UTC
Requires (Dev)
- phpunit/phpunit: 4.6.*
This package is not auto-updated.
Last update: 2024-04-13 15:23:05 UTC
README
PHPAntiSpam is a library that recognizes whether documents / messages / texts are spam or not. The library uses statistical analysis.
Explanation in 4 steps:
- Create tokenizer
- Create corpus (with lexemes) from historical messages
- Choose method to use in classification
- Classify message
Implemented methods:
- Paul Graham method
- Brian Burton method
- Robinson Geometric Mean Test method
- Fisher-Robinson's Inverse Chi-Square Test method
Installation
composer require bgruszka/phpantispam "^0.2"
Examples
<?php

/*
 * PHPAntiSpam usage example.
 *
 * Builds a two-message training corpus, then classifies the text
 * 'This is spam' with each of the four available methods in turn and
 * prints the outcome of each one.
 */

// First add the autoloader and all necessary classes
require_once 'vendor/autoload.php';

use PHPAntiSpam\Corpus\ArrayCorpus;
use PHPAntiSpam\Classifier;
use PHPAntiSpam\Tokenizer\WhitespaceTokenizer;

// Let's declare our example training set
$messages = [
    ['category' => 'spam', 'content' => 'this is spam'],
    ['category' => 'nospam', 'content' => 'this is'],
];

// As tokenizer we can use the simplest one - WhitespaceTokenizer
// (but of course you can also use RegexpTokenizer or create a new one)
$tokenizer = new WhitespaceTokenizer();

// Let's define our corpus - a collection of text documents
$corpus = new ArrayCorpus($messages, $tokenizer);

// For classifying text we can use different methods

// ------------------------------------------------------------------------------------
// Graham method
$classifier = new Classifier($corpus);
$classifier->setMethod(new \PHPAntiSpam\Method\GrahamMethod($corpus));

// With this method the result of isSpam() is used as a single printable value
$spamProbability = $classifier->isSpam('This is spam');

echo 'With Graham method:' . PHP_EOL;
echo sprintf('Spam probability: %s', $spamProbability) . PHP_EOL;
// Anything below 0.9 is reported as "not spam"
echo sprintf('Is spam: %s', $spamProbability < 0.9 ? 'NO' : 'YES') . PHP_EOL . PHP_EOL;

// ------------------------------------------------------------------------------------
// Burton method
$classifier = new Classifier($corpus);
$classifier->setMethod(new \PHPAntiSpam\Method\BurtonMethod($corpus));

$spamProbability = $classifier->isSpam('This is spam');

echo 'With Burton method:' . PHP_EOL;
echo sprintf('Spam probability: %s', $spamProbability) . PHP_EOL;
// Same 0.9 cut-off as in the Graham example
echo sprintf('Is spam: %s', $spamProbability < 0.9 ? 'NO' : 'YES') . PHP_EOL . PHP_EOL;

// ------------------------------------------------------------------------------------
// Robinson Geometric Mean Test Method
$classifier = new Classifier($corpus);
$classifier->setMethod(new \PHPAntiSpam\Method\RobinsonGeometricMeanTestMethod($corpus));

// With this method the result is read as an array with the keys
// 'spamminess', 'hamminess' and 'combined'
$spamProbability = $classifier->isSpam('This is spam');

echo 'With Robinson Geometric Mean Test method:' . PHP_EOL;
echo sprintf(
    'Spam probability: [spamminess: %s; hamminess: %s; combined: %s]',
    $spamProbability['spamminess'],
    $spamProbability['hamminess'],
    $spamProbability['combined']
) . PHP_EOL;
// A combined value of 0.55 or less is reported as "not spam"
echo sprintf('Is spam: %s', $spamProbability['combined'] <= 0.55 ? 'NO' : 'YES') . PHP_EOL . PHP_EOL;

// ------------------------------------------------------------------------------------
// Fisher-Robinson Inverse Chi Square Method
$classifier = new Classifier($corpus);
$classifier->setMethod(new \PHPAntiSpam\Method\FisherRobinsonInverseChiSquareMethod($corpus));

// The result is read with the same three keys as in the Robinson example
$spamProbability = $classifier->isSpam('This is spam');

echo 'With Fisher-Robinson Inverse Chi Square method:' . PHP_EOL;
echo sprintf(
    'Spam probability: [spamminess: %s; hamminess: %s; combined: %s]',
    $spamProbability['spamminess'],
    $spamProbability['hamminess'],
    $spamProbability['combined']
) . PHP_EOL;
// A combined value of 0.55 or less is reported as "not spam"
echo sprintf('Is spam: %s', $spamProbability['combined'] <= 0.55 ? 'NO' : 'YES') . PHP_EOL;