Daniel Fewell
For this project I selected the YouTube Spam Collection dataset from the UCI Machine Learning Repository. I needed a dataset suited to classification, and having already seen how spam filtering is done, I searched the term 'spam' on the UCI website and found this dataset, which was collected in 2017 using the YouTube Data API v3. The data has 1,956 observations, and its features include a comment ID, the author, the date posted, the comment text (CONTENT), and a spam/ham label (CLASS).
I will attempt to classify comments as either spam or ham, using the dataset above for supervised learning with Naive Bayes. This algorithm suits the task because it uses previously labeled spam and ham comments to estimate the probability that a new comment is spam or ham. One drawback is that it treats each predictor of a given class as if it were independent of the values of the other predictors; hence, it is "naive."
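To make that independence assumption concrete, here is a minimal sketch (not part of the project code, and using made-up word probabilities) of how a Naive Bayes classifier scores a comment: the class prior is multiplied by the per-word likelihoods as if each word appeared independently of the others.
# minimal sketch of the Naive Bayes scoring rule with made-up probabilities;
# the actual model used below is sklearn's MultinomialNB
def naive_bayes_score(words, prior, word_probs):
    score = prior
    for w in words:
        score *= word_probs.get(w, 1e-6)  # tiny fallback for unseen words
    return score
# toy likelihoods: "subscribe" shows up far more often in spam comments than in ham
spam_score = naive_bayes_score(['please', 'subscribe'], 0.5, {'please': 0.05, 'subscribe': 0.20})
ham_score = naive_bayes_score(['please', 'subscribe'], 0.5, {'please': 0.04, 'subscribe': 0.01})
print('spam' if spam_score > ham_score else 'ham')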
import glob
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from textblob import TextBlob
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
# 4 - READING IN DATA
# The spam/ham data was collected from the comment sections of five different videos and stored in five different CSVs.
# The code in this cell merges these CSVs together.
# gets all csv filenames in directory
all_files = glob.glob("*.csv")
# list to store dataframes
li = []
# gets each csv and makes a dataframe out of it
for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    # stores each dataframe in li
    li.append(df)
# all csvs combined
full_df = pd.concat(li, axis=0, ignore_index=True)
# selecting just the data I will need for X and y
data = full_df[['CLASS','CONTENT']].copy()
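# Sanity check I am adding here: in the UCI files CLASS is 1 for spam and 0 for ham,
# so value_counts() shows how balanced the two classes are before any modeling.
print(data.shape)
print(data['CLASS'].value_counts())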
# function to normalize comments using tokens
def split_into_tokens(message):
    return TextBlob(message).words
# function to normalize comments using lemmas (FOR PARAMETER TINKERING)
def split_into_lemmas(comment):
    comment = comment.lower()
    words = TextBlob(comment).words
    # for each word, take its "base form" = lemma
    return [word.lemma for word in words]
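# Quick illustration (my addition) of the difference between the two normalizers on a
# made-up comment; it assumes the TextBlob/NLTK corpora have already been downloaded.
sample = "Check out my videos and subscribe!!!"
print(split_into_tokens(sample))   # keeps case and plural forms like 'videos'
print(split_into_lemmas(sample))   # lowercases and reduces 'videos' to its lemma 'video'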
# converting data documents to a sparse matrix of token counts
bow_transformer = CountVectorizer(analyzer=split_into_tokens).fit(data['CONTENT'])
# computing the vector matrix for the bag of words
comments_bow = bow_transformer.transform(data['CONTENT'])
# learning the idf vector (global term weights)
tfidf_transformer = TfidfTransformer().fit(comments_bow)
# transform count matrix to a tf or tf-idf representation
comments_tfidf = tfidf_transformer.transform(comments_bow)
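# Illustrative peek (my addition) at the learned global term weights: rarer words get a
# larger idf_ value. The example words are only guesses at tokens likely to be in the vocabulary.
vocab = bow_transformer.vocabulary_
for word in ['subscribe', 'song']:
    if word in vocab:
        print(word, tfidf_transformer.idf_[vocab[word]])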
# fitting tfidf data representation and class predictions to Naive Bayes model
%time spam_detector = MultinomialNB().fit(comments_tfidf, data['CLASS'])
CPU times: user 8.57 ms, sys: 2.26 ms, total: 10.8 ms Wall time: 7.17 ms
# array of all predictions
all_predictions = spam_detector.predict(comments_tfidf)
# ACCURACY FOR MODEL WITHOUT CROSS-VALIDATION AND CONFUSION MATRIX
print ('accuracy', accuracy_score(data['CLASS'], all_predictions))
print ('confusion matrix\n', confusion_matrix(data['CLASS'], all_predictions))
print ('(row=Actual, col=Predicted)')
plt.matshow(confusion_matrix(data['CLASS'], all_predictions), cmap=plt.cm.binary, interpolation='nearest')
plt.title('confusion matrix')
plt.colorbar()
plt.ylabel('Actual')
plt.xlabel('Predicted')
accuracy 0.9744376278118609
confusion matrix
 [[922  29]
 [ 21 984]]
(row=Actual, col=Predicted)
An accuracy of 0.97 is suspiciously high: the model is being scored on the same data it was trained on, so it is almost certainly overfitting, which means it is time for some cross-validation! The confusion matrix compares the model's predictions with the actual labels: it correctly classified 1,906 comments and misclassified 50.
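To see what the cross-validation in the next section will actually do with the data: for classifiers, passing an integer cv to cross_val_score uses stratified folds, so every fold keeps roughly the same spam/ham ratio. The short sketch below is only an illustration of that splitting and is not part of the modeling pipeline.
# illustration only: StratifiedKFold preserves the spam/ham proportions in each fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for fold, (train_idx, test_idx) in enumerate(skf.split(data['CONTENT'], data['CLASS'])):
    print(f"fold {fold}: {len(train_idx)} training comments, {len(test_idx)} held out")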
# 7 - TINKERING WITH VARIABLES
# ADDING CROSS VALIDATION (5) AND USING TOKENS IN BAG OF WORDS
X_train, X_test, y_train, y_test = train_test_split(data['CONTENT'], data['CLASS'], test_size=0.2)
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=split_into_tokens)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])
scores = cross_val_score(pipeline,  # steps to convert raw messages into models
                         X_train,  # training data
                         y_train,  # training classes
                         cv=5,  # split data randomly into 5 parts: 4 for training, 1 for scoring
                         scoring='accuracy',  # scoring metric
                         n_jobs=-1,  # -1 = use all cores = faster
                         )
print(scores)
print(scores.mean(), scores.std())
[0.89456869 0.88498403 0.88817891 0.89776358 0.8974359 ]
0.8925862210207258 0.005127812524312259
# USING CROSS VALIDATION (5) AND LEMMAS IN BAG OF WORDS
X_train, X_test, y_train, y_test = train_test_split(data['CONTENT'], data['CLASS'], test_size=0.2)
pipeline1 = Pipeline([
    ('bow', CountVectorizer(analyzer=split_into_lemmas)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])
scores = cross_val_score(pipeline1,  # steps to convert raw messages into models
                         X_train,  # training data
                         y_train,  # training classes
                         cv=5,  # split data randomly into 5 parts: 4 for training, 1 for scoring
                         scoring='accuracy',  # scoring metric
                         n_jobs=-1,  # -1 = use all cores = faster
                         )
print(scores)
print(scores.mean(), scores.std())
[0.89171975 0.89808917 0.9044586  0.92356688 0.87820513 0.88461538 0.91666667 0.87179487 0.87820513 0.8974359 ]
0.8944757471827536 0.016152145365167972
# USING CROSS VALIDATION (10) AND LEMMAS IN BAG OF WORDS
X_train, X_test, y_train, y_test = train_test_split(data['CONTENT'], data['CLASS'], test_size=0.2)
pipeline1 = Pipeline([
    ('bow', CountVectorizer(analyzer=split_into_lemmas)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])
scores = cross_val_score(pipeline1,  # steps to convert raw messages into models
                         X_train,  # training data
                         y_train,  # training classes
                         cv=10,  # split data randomly into 10 parts: 9 for training, 1 for scoring
                         scoring='accuracy',  # scoring metric
                         n_jobs=-1,  # -1 = use all cores = faster
                         )
print(scores)
print(scores.mean(), scores.std())
[0.88571429 0.92380952 0.84761905 0.86666667 0.88461538 0.90384615 0.88461538 0.92307692 0.875 0.92307692 0.89423077 0.86538462 0.92307692 0.89423077 0.90384615]
0.8932539682539683 0.02299797207109598
# USING CROSS VALIDATION (10), USING LEMMAS IN BAG OF WORDS, DROPPING THE TF-IDF WEIGHTING,
# AND DECREASING THE TEST SET SIZE IN TRAIN_TEST_SPLIT
X_train, X_test, y_train, y_test = train_test_split(data['CONTENT'], data['CLASS'], test_size=0.1)
pipeline1 = Pipeline([
    ('bow', CountVectorizer(analyzer=split_into_lemmas)),  # strings to token integer counts
    ('classifier', MultinomialNB()),  # train on raw token counts w/ Naive Bayes classifier
])
scores = cross_val_score(pipeline1,  # steps to convert raw messages into models
                         X_train,  # training data
                         y_train,  # training classes
                         cv=10,  # split data randomly into 10 parts: 9 for training, 1 for scoring
                         scoring='accuracy',  # scoring metric
                         n_jobs=-1,  # -1 = use all cores = faster
                         )
print(scores)
print(scores.mean(), scores.std())
[0.88636364 0.92045455 0.89772727 0.93181818 0.875 0.90340909 0.96590909 0.88068182 0.88068182 0.94318182]
0.9085227272727273 0.029132826689305155
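The train/test splits above are created but never scored against the held-out portion, so as a final check (my addition; the exact numbers will vary with the random split) the last pipeline can be fit on X_train and evaluated on X_test:
# fit the final (lemma-based, unweighted) pipeline on the training split and
# score it on the held-out test split; results depend on the random split above
pipeline1.fit(X_train, y_train)
test_predictions = pipeline1.predict(X_test)
print(classification_report(y_test, test_predictions))
print(confusion_matrix(y_test, test_predictions))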
The final model has an accuracy of about 91%. Accuracy dropped to about 89% when cross-validation was first introduced, and tuning brought it back up to about 91% from there.
Improving the accuracy of the model was challenging. Although many attributes of the model could be altered, tinkering with them had only a small overall effect, and that limitation mostly stems from the characteristics of the Naive Bayes algorithm itself.
Naive Bayes is biased toward whichever class has the most data in the training set, so it is important to have balanced quantities of data for each class. Otherwise, Naive Bayes could produce a higher rate of false negatives, which can have negative impacts: an email linking a user to malware could make its way into the inbox instead of the junk or spam folder, or people could have their comments marked as spam when they are asking genuine questions, or perhaps using language specific to their community.
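One scikit-learn option that speaks directly to this is MultinomialNB's fit_prior parameter: with fit_prior=False the classifier uses a uniform class prior instead of learning it from possibly imbalanced training counts. Below is only a sketch of how that would slot into the pipeline used above, not something I evaluated here.
# sketch only: a uniform class prior keeps an imbalanced training set from
# tilting predictions toward the majority class
balanced_pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=split_into_lemmas)),
    ('classifier', MultinomialNB(fit_prior=False)),
])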
Future Research Question: In what ways can Naive Bayes be used to flag misinformation, specifically on news sites? This question is important because news stories are increasingly written with political spin, and a bias-detection algorithm would help ordinary consumers of news and information be aware of the nature of the content they consume.