Probability is a powerful tool that lets us answer interesting questions about data, and it serves as the foundation of a commonly used machine learning technique for classification. We'll also be building a Naïve Bayes classifier from scratch, so you'll get hands-on experience coding a machine learning classifier.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
from sklearn.model_selection import train_test_split
The data we will use for this hands-on exercise was put together by Tiago A. Almeida and José María Gómez Hidalgo, and it can be downloaded from the UCI Machine Learning Repository. The data collection process is described in more detail here.
data = pd.read_csv('data/sms.csv.gz', compression='gzip', sep='\t', header=None, names=['Label', 'SMS'])
data
| | Label | SMS |
|---|---|---|
| 0 | ham | Go until jurong point, crazy.. Available only ... |
| 1 | ham | Ok lar... Joking wif u oni... |
| 2 | spam | Free entry in 2 a wkly comp to win FA Cup fina... |
| 3 | ham | U dun say so early hor... U c already then say... |
| 4 | ham | Nah I don't think he goes to usf, he lives aro... |
| ... | ... | ... |
| 5567 | spam | This is the 2nd time we have tried 2 contact u... |
| 5568 | ham | Will ü b going to esplanade fr home? |
| 5569 | ham | Pity, * was in mood for that. So...any other s... |
| 5570 | ham | The guy did some bitching but I acted like i'd... |
| 5571 | ham | Rofl. Its true to its name |
5572 rows × 2 columns
Compute the fraction of the dataset that is ham and the fraction that is spam.
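One straightforward way to do this is with value_counts on the Label column of the DataFrame loaded above:

# Fraction of messages in each class (ham vs. spam)
data['Label'].value_counts(normalize=True)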
Now, let's split our data into training and testing sets. You can use the train_test_split function in scikit-learn to perform this split. A common split is 80% of the data in the training set and 20% in the test set.
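For example, something along these lines (the variable names and random_state value are just one choice):

# Split the DataFrame into an 80% training set and a 20% test set
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)
train_set = train_set.reset_index(drop=True)
test_set = test_set.reset_index(drop=True)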
As a little sanity check, let's verify that the percentages of spam and non-spam are roughly equivalent in the training set and the testing set.
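Assuming the train_set and test_set names from the sketch above, the class proportions can be compared directly:

# Both splits should have roughly the same ham/spam proportions as the full dataset
print(train_set['Label'].value_counts(normalize=True))
print(test_set['Label'].value_counts(normalize=True))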
A first step to constructing the classifier is to collect the unique set of words that occur in the training data, otherwise known as the vocabulary. Construct a list that contains all unique words. You will need this regardless of whether you build the classifier from scratch or whether you use sklearn.
You may need to do this for both the training and test sets, depending on how you write your code, so you might consider making it a function.
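Here is a minimal sketch of one possible approach. The tokenization (lowercasing and keeping runs of letters, digits, and apostrophes) is an assumption, so your vocabulary size may differ from the 8,753 columns shown later.

import re

def build_vocabulary(messages):
    # Collect the unique, lowercased words across a collection of messages
    vocab = set()
    for msg in messages:
        vocab.update(re.findall(r"[a-z0-9']+", msg.lower()))
    return sorted(vocab)

vocabulary = build_vocabulary(train_set['SMS'])
len(vocabulary)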
Naïve Bayes (e.g., Multinomial Naïve Bayes) expects a matrix of word counts, with one row per message and one column per vocabulary word. Write a function to perform this transformation, and call it on your training and test sets to get a word count matrix for each.
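A dense DataFrame, as sketched below, mirrors the output shown in the next cell; for a large vocabulary a sparse representation (for example scikit-learn's CountVectorizer) would be more memory-efficient. The function name and the tokenization are assumptions carried over from the sketch above.

def make_word_counts(messages, vocabulary):
    # One row per message, one column per vocabulary word, holding raw counts
    word_index = {word: i for i, word in enumerate(vocabulary)}
    counts = [[0] * len(vocabulary) for _ in messages]
    for row, msg in enumerate(messages):
        for word in re.findall(r"[a-z0-9']+", msg.lower()):
            col = word_index.get(word)
            if col is not None:  # words outside the training vocabulary are ignored
                counts[row][col] += 1
    return pd.DataFrame(counts, columns=vocabulary)

X_train_wc = make_word_counts(train_set['SMS'].tolist(), vocabulary)
X_test_wc = make_word_counts(test_set['SMS'].tolist(), vocabulary)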
X_train_wc.head(5)
| | curious | exorcism | savings | hv9d | 50gbp | wc1n3xx | abroad | option | m8 | their | ... | watch | vu | rice | crashed | science | pulling | comment | except | chances | providing |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 8753 columns
Now that you have word count matrices and labels for your training and test sets, you can call sklearn's multinomial Naïve Bayes classifier to train and test your model.
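A minimal sketch, assuming the X_train_wc/X_test_wc matrices and the train_set/test_set frames from the earlier cells:

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

nb = MultinomialNB()
nb.fit(X_train_wc, train_set['Label'])

# Accuracy on the held-out test set
accuracy_score(test_set['Label'], nb.predict(X_test_wc))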
Although scikit-learn has a built-in Naïve Bayes classifier, we can also implement such a classifier from scratch. In this exercise, you will use the labeled SMS messages to classify each message as "spam" or legitimate ("ham").
We now compute how many times each word occurs in each SMS message.
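If you already built the word count matrix above, it can be reused here. One possible way to organize the training counts so they can later be split by class (all names are illustrative):

# Attach the labels to the training word counts, then separate by class
train_wc = X_train_wc.copy()
train_wc['Label'] = train_set['Label'].values

spam_wc = train_wc[train_wc['Label'] == 'spam'].drop(columns='Label')
ham_wc = train_wc[train_wc['Label'] == 'ham'].drop(columns='Label')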
Now define a function classify that takes a text message as input and outputs a label, 'spam' or 'ham', given the message.
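A minimal from-scratch sketch, assuming the spam_wc/ham_wc frames and the vocabulary from the previous cells, and using Laplace (add-one) smoothing; the smoothing and tokenization choices here are assumptions, so your probabilities will not match the outputs below exactly.

import re

# Class priors estimated from the training labels
p_spam = (train_set['Label'] == 'spam').mean()
p_ham = (train_set['Label'] == 'ham').mean()

# Per-class word totals and Laplace-smoothed conditional probabilities P(word | class)
alpha = 1
n_vocab = len(vocabulary)
spam_totals = spam_wc.sum()
ham_totals = ham_wc.sum()
p_word_spam = (spam_totals + alpha) / (spam_totals.sum() + alpha * n_vocab)
p_word_ham = (ham_totals + alpha) / (ham_totals.sum() + alpha * n_vocab)

def classify(message):
    # Multiply each prior by P(word | class) for every known word in the message
    p_spam_msg = p_spam
    p_ham_msg = p_ham
    for word in re.findall(r"[a-z0-9']+", message.lower()):
        if word in p_word_spam:  # words outside the vocabulary are skipped
            p_spam_msg *= p_word_spam[word]
            p_ham_msg *= p_word_ham[word]
    print('P(Spam|message):', p_spam_msg)
    print('P(Ham|message):', p_ham_msg)
    print('Label:', 'Spam' if p_spam_msg > p_ham_msg else 'Ham')

Because the two quantities are left unnormalized (each is a prior times a product of word probabilities), they come out as very small numbers, as in the outputs below.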
classify('Sounds good, Alex, then see u there')
P(Spam|message): 8.011681173647132e-29
P(Ham|message): 5.170079874432155e-28
Label: Ham
classify('YOU WIN THE PRIZE MONEY JACKPOT! CALL 14')
P(Spam|message): 1.0341656349099176e-32
P(Ham|message): 6.673654155714669e-32
Label: Ham
The classifier handles the obvious non-spam as we would expect, although the obvious spam example above is still labeled ham. Let's properly evaluate model performance on the test data now; we just need to update our function to actually return its label first, rather than print it.
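One way to make that change, keeping the illustrative names from the sketch above:

def classify_test(message):
    # Same computation as classify, but returning the label instead of printing it
    p_spam_msg = p_spam
    p_ham_msg = p_ham
    for word in re.findall(r"[a-z0-9']+", message.lower()):
        if word in p_word_spam:
            p_spam_msg *= p_word_spam[word]
            p_ham_msg *= p_word_ham[word]
    return 'spam' if p_spam_msg > p_ham_msg else 'ham'

# Fraction of test messages whose predicted label matches the true label
(test_set['SMS'].apply(classify_test) == test_set['Label']).mean()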