import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import seaborn as sns
# Load the tab-separated review file. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside a review are read as ordinary text, not field delimiters.
df = pd.read_csv('Restaurant-Reviews.tsv', sep='\t', quoting=3)
# Show the first rows as a quick check of the loaded columns.
df.head()
Review | Liked | |
---|---|---|
0 | Wow... Loved this place. | 1 |
1 | Crust is not good. | 0 |
2 | Not tasty and the texture was just nasty. | 0 |
3 | Stopped by during the late May bank holiday of... | 1 |
4 | The selection on the menu was great and so wer... | 1 |
df.tail()
Review | Liked | |
---|---|---|
995 | I think food should have flavor and texture an... | 0 |
996 | Appetite instantly gone. | 0 |
997 | Overall I was not impressed and would not go b... | 0 |
998 | The whole experience was underwhelming, and I ... | 0 |
999 | Then, as if I hadn't wasted enough of my life ... | 0 |
df.isna().sum()
Review 0 Liked 0 dtype: int64
# Report the total number of rows, then the number of rows labelled Liked == 1.
total_reviews = df.shape[0]
liked_reviews = df.loc[df['Liked'] == 1].shape[0]
print(total_reviews)
print(liked_reviews)
1000 500
import re
import nltk
nltk.download('stopwords')
[nltk_data] Downloading package stopwords to [nltk_data] C:\Users\yandiher\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date!
True
from nltk.corpus import stopwords
df['Review'][0]
'Wow... Loved this place.'
# Tokenise the first review: replace every non-letter with a space,
# lowercase the result and split it on whitespace.
review = re.sub(pattern=r'[^a-zA-Z]',
                repl=' ',
                string=df['Review'][0]).lower().split()

# Build the stop-word set once. stopwords.words() returns a fresh list on every
# call, so calling it per word repeats the load and does a linear scan each time.
english_stopwords = set(stopwords.words('english'))

# Tokens that survive stop-word removal, printed above the unfiltered tokens
# so the effect of the filter is visible.
preview = [word for word in review if word not in english_stopwords]
print(preview)
print(review)
['wow', 'loved', 'place'] ['wow', 'loved', 'this', 'place']
from nltk.stem import PorterStemmer

# Drop English stop words from the tokenised review. The stop-word set is
# built once instead of re-creating the list for every token.
english_stopwords = set(stopwords.words(fileids='english'))
review = [word for word in review if word not in english_stopwords]

# Reduce each remaining token to its Porter stem (e.g. 'loved' -> 'love').
pStemmer = PorterStemmer()
review = [pStemmer.stem(word=word) for word in review]

# Re-join the stems into one space-separated string; the bare name on the
# last line makes the notebook display the result.
review = ' '.join(review)
review
'wow love place'
# Build the cleaned corpus: one space-joined string of stemmed, non-stop-word
# tokens per review, in the same order as the rows of df.

# Loop invariants hoisted out of the loop: the compiled pattern and the
# stop-word set (set membership is O(1); the original rebuilt the full
# stop-word list for every single word).
non_letters = re.compile(r'[^a-zA-Z]')
english_stopwords = set(stopwords.words(fileids='english'))

corpus = []
# Iterate the column directly (positional order) rather than indexing by
# label with range(len(df)), which only works for a default 0..n-1 index.
for raw_review in df['Review']:
    words = non_letters.sub(' ', raw_review).lower().split()
    review = ' '.join(
        pStemmer.stem(word) for word in words if word not in english_stopwords
    )
    corpus.append(review)
# Spot-check: print the cleaned form of the review at position 6, then the
# original text of the same row for comparison.
print(corpus[6])
print(df['Review'].iloc[6])
honeslti tast fresh Honeslty it didn't taste THAT fresh.)
from sklearn.feature_extraction.text import CountVectorizer
# Create a token-count vectorizer with default settings and learn its
# vocabulary from the cleaned corpus.
vectorizer = CountVectorizer()
vectorizer.fit(corpus)
CountVectorizer()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
CountVectorizer()
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

# Features: dense token-count matrix of the cleaned corpus. Labels: the Liked column.
x = vectorizer.transform(corpus).toarray()
y = df['Liked'].values

# 80/20 train/test split, stratified on the label so both parts keep the
# same class balance; seeded for repeatability.
xTrain, xTest, yTrain, yTest = train_test_split(
    x, y, train_size=0.8, random_state=42, stratify=y
)

# Seed the classifier as well: with the split seeded but the network's weight
# initialisation left random, the reported metrics changed on every run.
model = MLPClassifier(random_state=42)
model.fit(xTrain, yTrain)
prediction = model.predict(xTest)

# Report overall accuracy followed by per-class precision / recall / F1.
print('the accuracy of the model is ', accuracy_score(y_true=yTest, y_pred=prediction))
print(classification_report(y_true=yTest, y_pred=prediction))
the accuracy of the model is 0.77 precision recall f1-score support 0 0.79 0.73 0.76 100 1 0.75 0.81 0.78 100 accuracy 0.77 200 macro avg 0.77 0.77 0.77 200 weighted avg 0.77 0.77 0.77 200
# Draw the confusion matrix of the true test labels against the model's
# predictions, then render the figure.
ConfusionMatrixDisplay.from_predictions(y_true=yTest, y_pred=prediction)
plt.show()
def pred(text):
    """Print the predicted sentiment for one or more raw reviews.

    Args:
        text: A single review string, or an iterable of review strings.

    Returns:
        None. One message is printed per review: the "love" message when the
        model predicts label 1, otherwise the "do not like" message.

    Relies on the module-level ``vectorizer``, ``model``, ``pStemmer``,
    ``stopwords`` and ``re`` set up earlier in this file.
    """
    # CountVectorizer.transform expects an iterable of documents and rejects
    # a bare string, so wrap a single review in a list.
    if isinstance(text, str):
        text = [text]

    # Apply the same cleaning that was used to build the training corpus
    # (letters only, lowercase, stop words removed, Porter-stemmed). Without
    # it, unstemmed words would not match the fitted vocabulary.
    english_stopwords = set(stopwords.words('english'))
    cleaned = []
    for raw_review in text:
        words = re.sub(r'[^a-zA-Z]', ' ', raw_review).lower().split()
        cleaned.append(' '.join(
            pStemmer.stem(word) for word in words if word not in english_stopwords
        ))

    features = vectorizer.transform(cleaned).toarray()
    labels = model.predict(features)

    # Branch on each predicted label. The original compared the unrelated
    # global list `test` to 1, which is always False, so the negative message
    # was printed regardless of the prediction.
    for label in labels:
        if label == 1:
            print('the customer(s) love our restaurant.')
        else:
            print('the customer(s) do not like our restaurant.')
# Try the predictor on a new hand-written review, passed as a one-element
# list of documents.
test = ['how is it possible that there was a cockroach inside my meal?']
pred(test)
the customer(s) do not like our restaurant.