spam_classifier.py
import re
import math


def getwords(doc):
    # Split the document on non-alphanumeric characters
    # (\W+ rather than \W*, so zero-width matches don't split every character)
    splitter = re.compile(r"\W+")
    words = [s.lower() for s in splitter.split(doc)
             if len(s) > 2 and len(s) < 20]
    # Each distinct word becomes a feature
    return dict((w, 1) for w in words)
def sampletrain(cl):
    cl.train('Nobody owns the water.', 'good')
    cl.train('the quick rabbit jumps fences', 'good')
    cl.train('buy pharmaceuticals now', 'bad')
    cl.train('make quick money at online casino', 'bad')
    cl.train('the quick brown fox jumps.', 'good')
class classifier:
    def __init__(self, getfeatures, filename=None):
        # Counts of feature/category combinations
        self.fc = {}
        # Counts of documents in each category
        self.cc = {}
        self.getfeatures = getfeatures

    # Increase the count of a feature/category pair
    def incf(self, f, cat):
        self.fc.setdefault(f, {})
        self.fc[f].setdefault(cat, 0)
        self.fc[f][cat] += 1

    # Increase the count of a category
    def incc(self, cat):
        self.cc.setdefault(cat, 0)
        self.cc[cat] += 1

    # The number of times a feature has appeared in a category
    def fcount(self, f, cat):
        if f in self.fc and cat in self.fc[f]:
            return float(self.fc[f][cat])
        return 0.0

    # The number of items in a category
    def catcount(self, cat):
        if cat in self.cc:
            return float(self.cc[cat])
        return 0

    # The total number of items
    def totalcount(self):
        return sum(self.cc.values())

    # List of all categories
    def categories(self):
        return self.cc.keys()

    def train(self, item, cat):
        features = self.getfeatures(item)
        # Increment the count for every feature with this category
        for f in features:
            self.incf(f, cat)
        # Increment the count for this category
        self.incc(cat)
    # Pr(feature | category): how often this feature appears in
    # documents of the given category
    def fprob(self, f, cat):
        if self.catcount(cat) == 0:
            return 0
        return self.fcount(f, cat) / self.catcount(cat)

    # Start from an assumed probability (ap) and move toward the
    # observed probability as more evidence for the feature accumulates
    def weightedprob(self, f, cat, prf, weight=1.0, ap=0.5):
        # Calculate the current probability
        basicprob = prf(f, cat)
        # Count the number of times this feature has appeared in
        # all categories
        totals = sum([self.fcount(f, c) for c in self.categories()])
        # Calculate the weighted average
        bp = ((weight * ap) + (totals * basicprob)) / (weight + totals)
        return bp
class naivebayes(classifier):
    def __init__(self, getfeatures):
        classifier.__init__(self, getfeatures)
        self.thresholds = {}

    # Pr(document | category): multiply the weighted probabilities of
    # all features together (the naive independence assumption)
    def docprob(self, item, cat):
        features = self.getfeatures(item)
        p = 1
        for f in features:
            p *= self.weightedprob(f, cat, self.fprob)
        return p

    # Pr(category | document) up to a constant factor:
    # Pr(document | category) * Pr(category)
    def prob(self, item, cat):
        catprob = self.catcount(cat) / self.totalcount()
        docprob = self.docprob(item, cat)
        return docprob * catprob

    def setthreshold(self, cat, t):
        self.thresholds[cat] = t

    def getthreshold(self, cat):
        if cat not in self.thresholds:
            return 1.0
        return self.thresholds[cat]
    def classify(self, item, default=None):
        probs = {}
        # Find the category with the highest probability
        best = None
        maxprob = 0.0
        for cat in self.categories():
            probs[cat] = self.prob(item, cat)
            if probs[cat] > maxprob:
                maxprob = probs[cat]
                best = cat
        # If nothing scored above zero, fall back to the default
        if best is None:
            return default
        # Make sure the probability exceeds threshold * next best
        for cat in probs:
            if cat == best:
                continue
            if probs[cat] * self.getthreshold(best) > probs[best]:
                return default
        return best
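

# Minimal usage sketch (not part of the original module): trains the
# naive Bayes classifier on the small sample set above and classifies a
# couple of new documents. The example inputs, the 'unknown' default,
# and the threshold value are illustrative assumptions, not fixtures
# shipped with this file.
if __name__ == '__main__':
    cl = naivebayes(getwords)
    sampletrain(cl)
    # A document made of 'good' training words should come back as 'good'
    print(cl.classify('quick rabbit', default='unknown'))
    # Raising the threshold for 'bad' makes the classifier more reluctant
    # to label a document as spam unless the evidence is strong
    cl.setthreshold('bad', 3.0)
    print(cl.classify('quick money', default='unknown'))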