There are many emails downloaded from the Internet. The emails are labeled as spam or ham.
This article demonstrates the theory behind a Naive Bayes spam filter.
Email Data
There are $N$ emails, each labeled as
spam or ham.
We also have a dictionary of $J$ words. For a given email, define the indicator $Y_j = 1$
if word $j$ of the dictionary is in the email.
Otherwise, $Y_j = 0$.
Naive Bayes Model
- Set
$\Pr(L=\text{spam}) = s$ and $\Pr(L=\text{ham}) = 1-s$. - For each word
$j = 1, \ldots, J$:
if $L=\text{ham}$, set $\Pr(Y_j=1)=p_j$, $\Pr(Y_j=0)=1-p_j$;
if $L=\text{spam}$, set $\Pr(Y_j=1)=q_j$, $\Pr(Y_j=0)=1-q_j$.
We shall assume that our training data are independent and identically distributed.
We also assume that, given the label $L$, the indicators $Y_1, \ldots, Y_J$ are conditionally independent — this is the "naive" assumption.
Parameter Learning
We need to estimate the parameters $s$, $p_j$, and $q_j$ from the training data.
To learn the parameters, we find the values that maximize the likelihood of the training set
$\Pr(\text{data}) = \prod_{i=1}^{N} \Pr\bigl(L^{(i)}, Y_1^{(i)}, \ldots, Y_J^{(i)}\bigr).$
Because of the conditional-independence assumption, each term factorizes as
$\Pr(L, Y_1, \ldots, Y_J) = \Pr(L) \prod_{j=1}^{J} \Pr(Y_j \mid L),$
where
$\Pr(Y_j \mid L) = p_j^{Y_j}(1-p_j)^{1-Y_j}$ if $L = \text{ham}$,
and $\Pr(Y_j \mid L) = q_j^{Y_j}(1-q_j)^{1-Y_j}$ if $L = \text{spam}$.
A product of many small probabilities underflows, so we use the log function in the maximization:
$\log \Pr(\text{data}) = \sum_{i=1}^{N} \log \Pr\bigl(L^{(i)}\bigr) + \sum_{i=1}^{N} \sum_{j=1}^{J} \log \Pr\bigl(Y_j^{(i)} \mid L^{(i)}\bigr).$
In the above equation, we can simplify the first term: writing $N_{\text{spam}}$ for the number of spam emails,
it equals $N_{\text{spam}} \log s + (N - N_{\text{spam}}) \log(1-s)$.
Then, we simplify the second term:
using the factorized form above, the terms involving $q_j$ sum to $n_j \log q_j + (N_{\text{spam}} - n_j) \log(1-q_j)$, where $n_j$ is the number of spam emails containing word $j$ (and similarly for $p_j$ over the $N_{\text{ham}}$ ham emails with counts $m_j$).
Each parameter therefore appears in its own independent term of the log-likelihood, and we can optimize them separately.
Find Optimal Value
Suppose there is a function of the form $f(\theta) = a \log \theta + b \log(1-\theta)$
; then its first derivative is $f'(\theta) = \dfrac{a}{\theta} - \dfrac{b}{1-\theta}$.
To get the optimum of $f$, set $f'(\theta) = 0$;
then $\theta = \dfrac{a}{a+b}$.
So, we can apply the above result to get the parameter estimates: $\hat{s} = N_{\text{spam}}/N$, $\hat{q}_j = n_j / N_{\text{spam}}$, and $\hat{p}_j = m_j / N_{\text{ham}}$.
Prediction
Given a sample with indicators $Y_1, \ldots, Y_J$, Bayes' rule gives
$\Pr(L=\text{spam} \mid Y_1, \ldots, Y_J) = \dfrac{\Pr(L=\text{spam}) \prod_j \Pr(Y_j \mid L=\text{spam})}{\Pr(Y_1, \ldots, Y_J)},$
where the denominator is the same for both labels, so only the numerators need comparing.
So, we can determine whether an email is a spam or a ham by:
predict spam if $\; s \prod_j q_j^{Y_j}(1-q_j)^{1-Y_j} \;\ge\; (1-s) \prod_j p_j^{Y_j}(1-p_j)^{1-Y_j}$.
However, in numerical calculation, we should use logarithms to avoid underflow: predict spam
if $\; \log s + \sum_j \bigl[Y_j \log q_j + (1-Y_j)\log(1-q_j)\bigr] \;\ge\; \log(1-s) + \sum_j \bigl[Y_j \log p_j + (1-Y_j)\log(1-p_j)\bigr]$.
Laplace Smoothing
Have we finished? There is a corner case: a word that does not appear in any training sample of one class gets estimated probability 0, which drives the corresponding log term to $-\infty$. To handle this problem, we use a Laplace smoothing coefficient of 1.
Then the parameters are:
$\hat{q}_j = \dfrac{n_j + 1}{N_{\text{spam}} + 2}, \qquad \hat{p}_j = \dfrac{m_j + 1}{N_{\text{ham}} + 2}.$
File Arrangement
There are two .py files and one data directory in current work-space.
In the data directory, there are 3 sub-directories: ham, spam, and
testing.
In ham, the text files are organized like this:
In spam, the text files are organized like this:
In testing, the text files are organized like this:
Each email is a plain-text file, like this:
.
All words in an email are separated by a space — even punctuation.
code
naivebayes.py
import sys
import os.path
import collections
import math
import util
import numpy as np
# Command-line usage template; filled in with sys.argv[0] by main().
USAGE = "%s <test data folder> <spam folder> <ham folder>"
def get_counts(file_list):
    """Count, for each word, how many files in file_list contain it.

    Inputs
    ------
    file_list : a list of filenames, suitable for use with open() or
        util.get_words_in_file()

    Output
    ------
    A collections.Counter whose keys are words, and whose values are the
    number of files the key occurred in.
    """
    counter = collections.Counter()
    for filename in file_list:
        # De-duplicate with set() so each file adds at most 1 per word.
        counter.update(set(util.get_words_in_file(filename)))
    return counter
def get_log_probabilities(file_list):
    """Compute smoothed log-frequencies for each word occurring in file_list.

    Input
    -----
    file_list : a list of filenames, suitable for use with open() or
        util.get_words_in_file().

    Output
    ------
    A dict-like Counter whose keys are words, and whose values are the log
    of the Laplace-smoothed estimate (count + 1) / (len(file_list) + 2) of
    the fraction of files the key occurred in.
    """
    word_counts = get_counts(file_list)
    denominator = len(file_list) + 2  # Laplace smoothing: +2 in denominator
    for word, count in word_counts.items():
        word_counts[word] = math.log((count + 1) / denominator)
    return word_counts
def learn_distributions(file_lists_by_category):
    """Estimate the Naive Bayes word distributions and class priors.

    Input
    -----
    A two-element list. The first element is a list of spam files,
    and the second element is a list of ham (non-spam) files.

    Output
    ------
    (log_probabilities_by_category, log_prior)

    log_probabilities_by_category : A list whose first element is a smoothed
        estimate for log P(y=w_j|c=spam) (as a dict, just as in
        get_log_probabilities above), and whose second element is the same
        for c=ham.
    log_prior_by_category : A list of estimates for the log-probabilities
        for each class: [est. for log P(c=spam), est. for log P(c=ham)]
    """
    spam_files, ham_files = file_lists_by_category
    log_probs = [get_log_probabilities(spam_files),
                 get_log_probabilities(ham_files)]
    class_sizes = [len(spam_files), len(ham_files)]

    # Words seen in either class must be present in both tables; fill the
    # gaps with the Laplace-smoothed zero-count estimate log(1 / (n + 2)).
    vocabulary = log_probs[0].keys() | log_probs[1].keys()
    for table, n_files in zip(log_probs, class_sizes):
        missing_log_prob = math.log(1.0 / (n_files + 2))  # smooth
        for word in vocabulary:
            if word not in table:
                table[word] = missing_log_prob

    n_total = sum(class_sizes)
    log_prior = [math.log(n * 1.0 / n_total) for n in class_sizes]
    return (log_probs, log_prior)
def classify_email(email_filename, log_probabilities_by_category, log_prior_by_category):
    """Use Naive Bayes classification to classify the email in the given file.

    Inputs
    ------
    email_filename : name of the file containing the email to be classified
    log_probabilities_by_category : [spam_table, ham_table], dicts mapping
        word -> log P(word present | category); see learn_distributions
    log_prior_by_category : See output of learn_distributions.
        NOTE(review): currently unused — a hard-coded prior log(s) = -9.0
        is applied instead, which biases the classifier strongly toward
        'ham'.  Confirm this tuning is intentional.

    Output
    ------
    'spam' or 'ham'.
    """
    # Fixes vs. previous version: removed the debug print() that fired on
    # every classified email, and deleted dead commented-out code.
    words = set(util.get_words_in_file(email_filename))
    spam_table, ham_table = log_probabilities_by_category

    # Hard-coded prior: log s = -9.0; log(1 - s) derived from it.
    prob_log_spam = -9.0
    prob_log_ham = math.log(1 - math.exp(prob_log_spam))
    spam_log_sum = prob_log_spam
    ham_log_sum = prob_log_ham

    # Add the log-likelihood of every dictionary word being present or
    # absent in this email, under each class.
    for word, log_p in spam_table.items():
        if word in words:
            spam_log_sum += log_p
        else:
            spam_log_sum += math.log(1 - math.exp(log_p))
    for word, log_p in ham_table.items():
        if word in words:
            ham_log_sum += log_p
        else:
            ham_log_sum += math.log(1 - math.exp(log_p))

    # Ties go to 'spam'.
    if spam_log_sum >= ham_log_sum:
        return 'spam'
    else:
        return 'ham'
def classify_emails(spam_files, ham_files, test_files):
    '''Compute the label of each email in test_files.

    Trains the classifier from the labeled spam/ham file lists, then labels
    every test file.

    return value: List, such as ['spam', 'ham', 'spam', 'ham', ....]
    '''
    log_probabilities_by_category, log_prior = \
        learn_distributions([spam_files, ham_files])
    return [classify_email(test_file, log_probabilities_by_category, log_prior)
            for test_file in test_files]
def main():
    '''Command-line entry point.

    usage: $python naivebayes.py data/testing/ data/spam/ data/ham/
    '''
    ### Read arguments
    if len(sys.argv) != 4:
        print(USAGE%sys.argv[0])
        # Bug fix: previously execution fell through after printing the
        # usage message and crashed with IndexError on sys.argv below.
        sys.exit(1)
    testing_folder = sys.argv[1]
    (spam_folder, ham_folder) = sys.argv[2:4]

    ### Learn the distributions from the labeled spam and ham folders
    file_lists = []
    for folder in (spam_folder, ham_folder):
        file_lists.append(util.get_files_in_folder(folder))
    (log_probabilities_by_category, log_priors_by_category) = \
            learn_distributions(file_lists)

    # Confusion matrix. Columns and rows are indexed by 0 = 'spam' and
    # 1 = 'ham'; rows correspond to true label, columns to guessed label.
    performance_measures = np.zeros([2, 2])

    ### Classify and measure performance
    for filename in util.get_files_in_folder(testing_folder):
        ## Classify
        label = classify_email(filename,
                               log_probabilities_by_category,
                               log_priors_by_category)
        ## Measure performance
        # Use the filename to determine the true label
        base = os.path.basename(filename)
        true_index = ('ham' in base)
        guessed_index = (label == 'ham')
        performance_measures[true_index, guessed_index] += 1

        # Uncomment this line to see which files your classifier
        # gets right/wrong:
        # print("%s : %s" %(label, filename))

    template = "You correctly classified %d out of %d spam emails, and %d out of %d ham emails."
    # Correct counts are on the diagonal
    correct = np.diag(performance_measures)
    # totals are obtained by summing across guessed labels
    totals = np.sum(performance_measures, 1)
    print(template % (correct[0],
                      totals[0],
                      correct[1],
                      totals[1]))
if __name__ == '__main__':
    main()

util.py
import os
def get_words_in_file(filename):
    """Return a list of all words in the file at filename.

    The file is read as UTF-8 text (undecodable bytes are ignored) and
    split on runs of whitespace.
    """
    with open(filename, 'r', encoding='utf-8', errors='ignore') as handle:
        contents = handle.read()
    # split() with no arguments breaks a string on any whitespace run.
    return contents.split()
def get_files_in_folder(folder):
    """Return a list of files in folder (including the path to the file)."""
    full_filenames = []
    for entry in os.listdir(folder):
        # os.path.join deals with / and \ separators portably.
        full_filenames.append(os.path.join(folder, entry))
    return full_filenames

京公网安备 11010502036488号