bgruszka/phpantispam

Bayesian spam recognition library

v0.2.1 2016-08-30 13:10 UTC

README

Build Status Code Climate

PHPAntiSpam is a library that recognizes whether documents / messages / texts are spam or not. The library uses statistical analysis.

Explanation in 4 steps:

  • Create tokenizer
  • Create corpus (with lexemes) from historical messages
  • Choose method to use in classification
  • Classify message

Implemented methods:

  • Paul Graham method
  • Brian Burton method
  • Robinson Geometric Mean Test method
  • Fisher-Robinson's Inverse Chi-Square Test method

Installation

composer require bgruszka/phpantispam "^0.2"

Examples

<?php

// First add autoloader and all necessary classes
require_once 'vendor/autoload.php';

use PHPAntiSpam\Corpus\ArrayCorpus;
use PHPAntiSpam\Classifier;
use PHPAntiSpam\Tokenizer\WhitespaceTokenizer;

// The simplest tokenizer, WhitespaceTokenizer, is enough for this example
// (RegexpTokenizer or a tokenizer of your own can be used instead).
$tokenizer = new WhitespaceTokenizer();

// Example training set: every entry pairs a 'category' label with the message 'content'.
$trainingSet = [
    ['category' => 'spam', 'content' => 'this is spam'],
    ['category' => 'nospam', 'content' => 'this is'],
];

// The corpus is the collection of text documents, built from the training set and the tokenizer.
$corpus = new ArrayCorpus($trainingSet, $tokenizer);

// For classifying text we can use different methods

// ------------------------------------------------------------------------------------
// Graham method
// This example treats the value returned by isSpam() as a single score:
// the answer is NO while the score is below 0.9 and YES otherwise.

$classifier = new Classifier($corpus);
$grahamMethod = new \PHPAntiSpam\Method\GrahamMethod($corpus);
$classifier->setMethod($grahamMethod);

$spamProbability = $classifier->isSpam('This is spam');

if ($spamProbability < 0.9) {
    $verdict = 'NO';
} else {
    $verdict = 'YES';
}

echo 'With Graham method:', PHP_EOL;
echo sprintf('Spam probability: %s', $spamProbability), PHP_EOL;
echo sprintf('Is spam: %s', $verdict), PHP_EOL, PHP_EOL;

// ------------------------------------------------------------------------------------
// Burton method
// As in this example's other single-score case, the value returned by isSpam()
// yields NO while it is below 0.9 and YES otherwise.

$classifier = new Classifier($corpus);
$burtonMethod = new \PHPAntiSpam\Method\BurtonMethod($corpus);
$classifier->setMethod($burtonMethod);

$spamProbability = $classifier->isSpam('This is spam');

if ($spamProbability < 0.9) {
    $verdict = 'NO';
} else {
    $verdict = 'YES';
}

echo 'With Burton method:', PHP_EOL;
echo sprintf('Spam probability: %s', $spamProbability), PHP_EOL;
echo sprintf('Is spam: %s', $verdict), PHP_EOL, PHP_EOL;

// ------------------------------------------------------------------------------------
// Robinson Geometric Mean Test Method
// With this method the example reads three entries from the result of isSpam():
// 'spamminess', 'hamminess' and 'combined'. The answer is NO while 'combined'
// is at most 0.55 and YES otherwise.

$classifier = new Classifier($corpus);
$robinsonMethod = new \PHPAntiSpam\Method\RobinsonGeometricMeanTestMethod($corpus);
$classifier->setMethod($robinsonMethod);

$spamProbability = $classifier->isSpam('This is spam');

echo 'With Robinson Geometric Mean Test method:', PHP_EOL;

// Pull the individual scores out once so the report line below stays short.
$spamminess = $spamProbability['spamminess'];
$hamminess = $spamProbability['hamminess'];
$combined = $spamProbability['combined'];

echo sprintf(
    'Spam probability: [spamminess: %s; hamminess: %s; combined: %s]',
    $spamminess,
    $hamminess,
    $combined
), PHP_EOL;

if ($combined <= 0.55) {
    $verdict = 'NO';
} else {
    $verdict = 'YES';
}

echo sprintf('Is spam: %s', $verdict), PHP_EOL, PHP_EOL;

// ------------------------------------------------------------------------------------
// Fisher-Robinson Inverse Chi Square Method
// With this method the example reads three entries from the result of isSpam():
// 'spamminess', 'hamminess' and 'combined'. The answer is NO while 'combined'
// is at most 0.55 and YES otherwise.

$classifier = new Classifier($corpus);
$fisherRobinsonMethod = new \PHPAntiSpam\Method\FisherRobinsonInverseChiSquareMethod($corpus);
$classifier->setMethod($fisherRobinsonMethod);

$spamProbability = $classifier->isSpam('This is spam');

echo 'With Fisher-Robinson Inverse Chi Square method:', PHP_EOL;

// Pull the individual scores out once so the report line below stays short.
$spamminess = $spamProbability['spamminess'];
$hamminess = $spamProbability['hamminess'];
$combined = $spamProbability['combined'];

echo sprintf(
    'Spam probability: [spamminess: %s; hamminess: %s; combined: %s]',
    $spamminess,
    $hamminess,
    $combined
), PHP_EOL;

if ($combined <= 0.55) {
    $verdict = 'NO';
} else {
    $verdict = 'YES';
}

echo sprintf('Is spam: %s', $verdict), PHP_EOL;