There are many emails downloaded from the Internet. Each email is labeled as either spam or ham. This article will demonstrate the theory behind a Naive Bayes spam filter.
Email Data
There are training emails, each labeled spam or ham. We also have a dictionary of $J$ words. For each word $j$, the feature $Y_j = 1$ if word $j$ appears in the email; otherwise, $Y_j = 0$.
Naive Bayes Model
- Set $\Pr(L=\mathrm{spam}) = s$ and $\Pr(L=\mathrm{ham}) = 1 - s$.
- For $j = 1, \dots, J$:
  - if $L=\mathrm{ham}$, set $\Pr(Y_j = 1) = p_j$ and $\Pr(Y_j = 0) = 1 - p_j$;
  - if $L=\mathrm{spam}$, set $\Pr(Y_j = 1) = q_j$ and $\Pr(Y_j = 0) = 1 - q_j$.
We shall assume that our training data
We have known that given
Parameter Learning
We need to estimate the parameters
To learn parameters, we find
Because of
then
Given
then
if L = ham
else if L = spam
use log function in
In the above equation, we can degrade the first one by:
So, we assume
Then, we simplify the second factor:
using
For simplification, we assume that:
Then, we have :
Find Optimal Value
there is a function
, then its first derivative is
To get the optimal of
then
so, we can apply the above theory to get parameter
Prediction
Given a sample with
where
So, we can determine whether an email is a spam or a ham by:
if
However, in numerical calculation, we should use
if
Laplace Smoothing
Have we finished? There is a corner case, where there is a word that does not appear in any training samples. To handle this problem, we use Laplace Smoothing Coefficient
Then the parameters are:
File Arrangement
There are two .py files and one data
directory in current work-space.
In the data directory, there are three subdirectories: ham, spam, and testing.
In ham
, the text files are organized like this:
In spam
, the text files are organized like this:
In testing
, the text files are organized like this:
Each email is like that:
.
All tokens in each email are separated by spaces, including punctuation marks.
code
naivebayes.py
import sys
import os.path
import collections
import math
import util
import numpy as np
USAGE = "%s <test data folder> <spam folder> <ham folder>"
def get_counts(file_list):
    """Count, for each word, the number of files in file_list containing it.

    Inputs
    ------
    file_list : a list of filenames, suitable for use with open() or
        util.get_words_in_file()

    Output
    ------
    A Counter (dict-like) whose keys are words and whose values are the
    number of files the word occurred in.
    """
    counts = collections.Counter()
    for name in file_list:
        # De-duplicate within a file so each word counts at most once per file.
        counts.update(set(util.get_words_in_file(name)))
    return counts
def get_log_probabilities(file_list):
    """Compute smoothed log-frequencies for each word occurring in file_list.

    Input
    -----
    file_list : a list of filenames, suitable for use with open() or
        util.get_words_in_file().

    Output
    ------
    A dict-like mapping from each word to the log of the Laplace-smoothed
    estimate of the fraction of files the word occurred in:
    log((count + 1) / (num_files + 2)).
    """
    log_freqs = get_counts(file_list)
    denominator = len(file_list) + 2  # Laplace smoothing: +2 in denominator
    for word in log_freqs:
        log_freqs[word] = math.log((log_freqs[word] + 1) / denominator)
    return log_freqs
def learn_distributions(file_lists_by_category):
    """Learn word log-likelihoods and class log-priors from labeled files.

    Input
    -----
    file_lists_by_category : a two-element list; the first element is a list
        of spam files, the second a list of ham (non-spam) files.

    Output
    ------
    (log_probabilities_by_category, log_prior_by_category)

    log_probabilities_by_category : a list whose first element is a dict of
        smoothed estimates for log P(y=w_j | c=spam) and whose second element
        is the same for c=ham.
    log_prior_by_category : [est. for log P(c=spam), est. for log P(c=ham)].
    """
    spam_files, ham_files = file_lists_by_category
    n_spam, n_ham = len(spam_files), len(ham_files)

    spam_log_probs = get_log_probabilities(spam_files)
    ham_log_probs = get_log_probabilities(ham_files)

    # Make both dicts cover the full vocabulary; a word unseen in one
    # category gets the smoothed zero-count estimate 1 / (N + 2).
    vocabulary = set(spam_log_probs) | set(ham_log_probs)
    unseen_spam = math.log(1.0 / (n_spam + 2))
    unseen_ham = math.log(1.0 / (n_ham + 2))
    for word in vocabulary:
        spam_log_probs.setdefault(word, unseen_spam)
        ham_log_probs.setdefault(word, unseen_ham)

    # Priors are the empirical class frequencies in the training data.
    n_total = n_spam + n_ham
    log_priors = [math.log(n_spam / n_total), math.log(n_ham / n_total)]
    return ([spam_log_probs, ham_log_probs], log_priors)
def classify_email(email_filename, log_probabilities_by_category, log_prior_by_category):
    """Use Naive Bayes classification to classify the email in the given file.

    Inputs
    ------
    email_filename : name of the file containing the email to be classified
    log_probabilities_by_category : see output of learn_distributions
    log_prior_by_category : see output of learn_distributions

    Output
    ------
    'spam' or 'ham'.
    """
    words = set(util.get_words_in_file(email_filename))
    spam_log_probs, ham_log_probs = log_probabilities_by_category
    # Bug fix: use the learned log-priors that are passed in, instead of a
    # hard-coded prior (log(s) = -9.0) that silently ignored the parameter.
    # Also removed the leftover per-email debug print.
    spam_log_sum, ham_log_sum = log_prior_by_category

    # Naive Bayes: sum log P(Y_j = y_j | class) over the whole vocabulary,
    # using log(1 - p) for words absent from this email.
    for word, log_p in spam_log_probs.items():
        if word in words:
            spam_log_sum += log_p
        else:
            spam_log_sum += math.log(1 - math.exp(log_p))
    for word, log_p in ham_log_probs.items():
        if word in words:
            ham_log_sum += log_p
        else:
            ham_log_sum += math.log(1 - math.exp(log_p))

    # Ties go to 'spam', matching the original decision rule.
    return 'spam' if spam_log_sum >= ham_log_sum else 'ham'
def classify_emails(spam_files, ham_files, test_files):
    """Compute the label of each email in test_files.

    Return value: a list such as ['spam', 'ham', 'spam', 'ham', ...].
    """
    distributions, priors = learn_distributions([spam_files, ham_files])
    return [classify_email(test_file, distributions, priors)
            for test_file in test_files]
def main():
    """Train on the spam/ham folders, classify every file in the testing
    folder, and report per-class accuracy.

    usage: $ python naivebayes.py data/testing/ data/spam/ data/ham/
    """
    ### Read arguments
    if len(sys.argv) != 4:
        print(USAGE % sys.argv[0])
        # Bug fix: previously execution fell through after printing usage
        # and crashed with IndexError on sys.argv[1].
        sys.exit(1)
    testing_folder = sys.argv[1]
    (spam_folder, ham_folder) = sys.argv[2:4]

    ### Learn the distributions
    file_lists = []
    for folder in (spam_folder, ham_folder):
        file_lists.append(util.get_files_in_folder(folder))
    (log_probabilities_by_category, log_priors_by_category) = \
        learn_distributions(file_lists)

    # Here, columns and rows are indexed by 0 = 'spam' and 1 = 'ham';
    # rows correspond to true label, columns correspond to guessed label.
    performance_measures = np.zeros([2, 2])

    ### Classify and measure performance
    for filename in util.get_files_in_folder(testing_folder):
        ## Classify
        label = classify_email(filename,
                               log_probabilities_by_category,
                               log_priors_by_category)
        ## Measure performance
        # Use the filename to determine the true label (bool indexes as 0/1).
        base = os.path.basename(filename)
        true_index = ('ham' in base)
        guessed_index = (label == 'ham')
        performance_measures[true_index, guessed_index] += 1

        # Uncomment this line to see which files your classifier
        # gets right/wrong:
        # print("%s : %s" % (label, filename))

    template = "You correctly classified %d out of %d spam emails, and %d out of %d ham emails."
    # Correct counts are on the diagonal.
    correct = np.diag(performance_measures)
    # Totals are obtained by summing across guessed labels.
    totals = np.sum(performance_measures, 1)
    print(template % (correct[0],
                      totals[0],
                      correct[1],
                      totals[1]))


if __name__ == '__main__':
    main()
util.py
import os
def get_words_in_file(filename):
    """Return a list of all whitespace-separated words in the file at filename."""
    # split() with no arguments breaks the whole file contents on any
    # whitespace; undecodable bytes are ignored rather than raising.
    with open(filename, 'r', encoding='utf-8', errors='ignore') as handle:
        contents = handle.read()
    return contents.split()
def get_files_in_folder(folder):
    """Return a list of files in folder (including the path to the file)."""
    # os.path.join handles '/' vs '\\' separators appropriately.
    return [os.path.join(folder, entry) for entry in os.listdir(folder)]