import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Read the tab-separated SMS corpus into a DataFrame and preview its first rows.
df = pd.read_csv(
    filepath_or_buffer='spam.tsv',
    sep='\t',
)
df.head()
label | message | length | punct | |
---|---|---|---|---|
0 | ham | Go until jurong point, crazy.. Available only ... | 111 | 9 |
1 | ham | Ok lar... Joking wif u oni... | 29 | 6 |
2 | spam | Free entry in 2 a wkly comp to win FA Cup fina... | 155 | 6 |
3 | ham | U dun say so early hor... U c already then say... | 49 | 6 |
4 | ham | Nah I don't think he goes to usf, he lives aro... | 61 | 2 |
# Show the untruncated text of the first message (the table preview elides long strings).
df['message'].iloc[0]
'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'
# Preview the last rows of the dataset.
df.tail()
label | message | length | punct | |
---|---|---|---|---|
5567 | spam | This is the 2nd time we have tried 2 contact u... | 160 | 8 |
5568 | ham | Will ü b going to esplanade fr home? | 36 | 1 |
5569 | ham | Pity, * was in mood for that. So...any other s... | 57 | 7 |
5570 | ham | The guy did some bitching but I acted like i'd... | 125 | 1 |
5571 | ham | Rofl. Its true to its name | 26 | 1 |
# Bar chart of message counts per label, followed by each label's share of all rows.
plt.figure(figsize=(16, 4))
sns.countplot(x='label', data=df)
plt.show()
print(df['label'].value_counts().div(len(df)))
ham 0.865937 spam 0.134063 Name: label, dtype: float64
# Count missing values in each column.
df.isna().sum()
label 0 message 0 length 0 punct 0 dtype: int64
# Balance the two classes by undersampling ham down to the number of spam rows.
ham = df[df['label'] == 'ham']
spam = df[df['label'] == 'spam']
# Seed the draw so the balanced subset, and every score computed from it,
# is the same on each run of the notebook.
ham = ham.sample(n=len(spam), random_state=42)
len(ham), len(spam)
(747, 747)
# Stack the sampled ham rows on top of the spam rows with a fresh 0..n-1 index.
# DataFrame.append is deprecated and removed in pandas 2.0; pd.concat is the
# supported replacement and gives the same result.
df = pd.concat([ham, spam], ignore_index=True)
C:\Users\yandiher\AppData\Local\Temp\ipykernel_10108\2145385498.py:1: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. df = ham.append(spam, ignore_index=True)
# Summarise the balanced frame: row count, column dtypes and non-null counts.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1494 entries, 0 to 1493 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 label 1494 non-null object 1 message 1494 non-null object 2 length 1494 non-null int64 3 punct 1494 non-null int64 dtypes: int64(2), object(2) memory usage: 46.8+ KB
from sklearn.model_selection import train_test_split, GridSearchCV
# Raw message text is the feature; the ham/spam label is the target.
x, y = df['message'].values, df['label'].values

# Stratified 80/20 split with a fixed seed.
xTrain, xTest, yTrain, yTest = train_test_split(
    x,
    y,
    train_size=0.8,
    stratify=y,
    random_state=42,
)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
# Candidate classifiers; each one is evaluated behind the same TF-IDF front end.
# The random forest is seeded so its score is repeatable between runs.
models = [
    RandomForestClassifier(random_state=42),
    LogisticRegression(),
    KNeighborsClassifier(),
    SVC(),
]
# Derive the display names from the estimators themselves so the two lists
# cannot drift out of sync when a model is added, removed or reordered.
names = [type(model).__name__ for model in models]

# Fit every candidate on the training split and record its test-set accuracy.
scores = []
for model in models:
    clf = Pipeline([('tfidf', TfidfVectorizer()), ('classifier', model)])
    clf.fit(xTrain, yTrain)
    score = clf.score(xTest, yTest)
    scores.append(score)

# Rank the candidates from best to worst accuracy.
comparison = pd.DataFrame(
    data={'names': names, 'scores': scores},
    columns=['names', 'scores'],
).sort_values(by=['scores'], ascending=False)
comparison
names | scores | |
---|---|---|
3 | SVC | 0.966555 |
0 | RandomForestClassifier | 0.956522 |
1 | LogisticRegression | 0.953177 |
2 | KNeighborsClassifier | 0.929766 |
# Rebuild the TF-IDF + SVC pipeline, train it, and predict the held-out messages.
clf = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('classifier', SVC()),
    ]
)
prediction = clf.fit(xTrain, yTrain).predict(xTest)
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
# Per-class precision, recall and F1 on the held-out split.
report = classification_report(y_true=yTest, y_pred=prediction)
print(report)
precision recall f1-score support ham 0.95 0.99 0.97 150 spam 0.99 0.95 0.97 149 accuracy 0.97 299 macro avg 0.97 0.97 0.97 299 weighted avg 0.97 0.97 0.97 299
# Draw the confusion matrix for the held-out predictions and title the figure.
ConfusionMatrixDisplay.from_predictions(y_true=yTest, y_pred=prediction)
plt.title('Support Vector Classifier')
plt.show()
# Standalone TF-IDF vectorizer, kept outside a Pipeline so it can be reused on new text.
# NOTE(review): it is fitted on every message in x (train and test rows alike), so
# the vocabulary and IDF weights have seen all the data that is cross-validated later.
vectorizer = TfidfVectorizer()
vectorizer.fit(x)
TfidfVectorizer()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
TfidfVectorizer()
# Replace the raw messages with their TF-IDF feature matrix.
x = vectorizer.transform(x)
# Encode the target explicitly as integers: spam -> 1, ham -> 0.
# get_dummies(drop_first=True) only produced this mapping implicitly (via the
# alphabetical order of the label columns) and yields booleans on pandas >= 2.0;
# spelling it out keeps the later `== 1` check meaningful on any version.
y = (np.asarray(y) == 'spam').astype(int)
# Hyper-parameter grid for the SVC, split by kernel.
# `gamma` has no effect on the linear kernel, so a single flat grid would refit
# the same linear model once per gamma value. Separate sub-grids search the same
# distinct configurations without the duplicate fits.
params = [
    {
        'kernel': ['linear'],
        'C': [0.001, 0.01, 0.1, 1],
    },
    {
        'kernel': ['rbf', 'poly'],
        'C': [0.001, 0.01, 0.1, 1],
        'gamma': [0.1, 1, 10, 100],
    },
]
# 5-fold cross-validated search, scored by accuracy.
grid = GridSearchCV(estimator=SVC(), param_grid=params, cv=5, scoring='accuracy')
grid.fit(x, y)
GridSearchCV(cv=5, estimator=SVC(), param_grid={'C': [0.001, 0.01, 0.1, 1], 'gamma': [0.1, 1, 10, 100], 'kernel': ['rbf', 'poly', 'linear']}, scoring='accuracy')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
GridSearchCV(cv=5, estimator=SVC(), param_grid={'C': [0.001, 0.01, 0.1, 1], 'gamma': [0.1, 1, 10, 100], 'kernel': ['rbf', 'poly', 'linear']}, scoring='accuracy')
SVC()
SVC()
# Report the winning hyper-parameters and their mean cross-validated accuracy.
print(grid.best_params_)
print(grid.best_score_)
{'C': 1, 'gamma': 0.1, 'kernel': 'linear'} 0.9544903593634262
# GridSearchCV refits the best configuration on the full x, y by default
# (refit=True), so reuse that fitted estimator instead of training an
# identical SVC a second time.
clf = grid.best_estimator_
clf
SVC(C=1, gamma=0.1, kernel='linear')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
SVC(C=1, gamma=0.1, kernel='linear')
# A single hand-written message, wrapped in a list because the vectorizer expects an iterable of texts.
test = ['free tickets if you click this button. get your seat now!!!']
test
['free tickets if you click this button. get your seat now!!!']
# Convert the sample message into the TF-IDF feature space the model was trained on.
test = vectorizer.transform(test)
# predict() returns an array with one entry per input row. Index the single
# prediction instead of relying on the truth value of an array comparison,
# which raises as soon as more than one message is passed.
if clf.predict(test)[0] == 1:
    print('this message is spam')
else:
    print('this message is ham')
this message is spam