My Brain Cells

The easiest (and best) learning materials for anyone curious about machine learning, artificial intelligence, deep learning, programming, and other fun life hacks.

IMDB Sentiment Analysis using deep learning

Sentiment Analysis using deep learning

Download the dataset from here.


    ## Relevant imports


    import pandas as pd

    import numpy as np

    import os

    import matplotlib.pyplot as plt

    import re


    from collections import defaultdict


    # Tokenizer imports

    from nltk.tokenize import sent_tokenize

    from nltk.tokenize import word_tokenize

    from nltk.tokenize import WordPunctTokenizer

    from nltk.tokenize import regexp_tokenize


    # NLTK corpus and stemming/lemmatizer imports

    from nltk import pos_tag

    from nltk.corpus import stopwords

    from nltk.corpus import wordnet

    from nltk.stem import WordNetLemmatizer


    # Scikit-learn packages

    from sklearn.preprocessing import LabelEncoder

    from sklearn.feature_extraction.text import TfidfVectorizer

    from sklearn import model_selection, naive_bayes, svm, linear_model

    from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


    # Gensim imports

    import gensim


    data = pd.read_csv(“{PATH}/IMDB Dataset.csv”)




    (50000, 2)




                                                  review sentiment

    0  One of the other reviewers has mentioned that …  positive

    1  A wonderful little production. <br /><br />The…  positive

    2  I thought this was a wonderful way to spend ti…  positive

    3  Basically there’s a family where a little boy …  negative

    4  Petter Mattei’s “Love in the Time of Money” is…  positive




    array([‘positive’, ‘negative’], dtype=object)




    positive    25000

    negative    25000

    Name: sentiment, dtype: int64




    review       object

    sentiment    object

    dtype: object




    data.drop_duplicates(keep = “first”, inplace = True)



    (49582, 2)




    review       0

    sentiment    0

    dtype: int64


    # Convert reviews to lowercase = x: str(x).lower())


    data.reset_index(inplace = True)


    data = data.drop(“index”, axis = 1)




                                                      review sentiment

    0      one of the other reviewers has mentioned that …  positive

    1      a wonderful little production. <br /><br />the…  positive

    2      i thought this was a wonderful way to spend ti…  positive

    3      basically there’s a family where a little boy …  negative

    4      petter mattei’s “love in the time of money” is…  positive

    …                                                  …       …

    49577  i thought this movie did a down right good job…  positive

    49578  bad plot, bad dialogue, bad acting, idiotic di…  negative

    49579  i am a catholic taught in parochial elementary…  negative

    49580  i’m going to have to disagree with the previou…  negative

    49581  no one expects the star trek movies to be high…  negative


    [49582 rows x 2 columns]


    def strip_html(raw_text):

      find_html = re.compile(‘<.*?>’)

      clean_text = re.sub(find_html, ”, raw_text)

      return clean_text = x: strip_html(x))




                                                      review sentiment

    0      one of the other reviewers has mentioned that …  positive

    1      a wonderful little production. the filming tec…  positive

    2      i thought this was a wonderful way to spend ti…  positive

    3      basically there’s a family where a little boy …  negative

    4      petter mattei’s “love in the time of money” is…  positive

    …                                                  …       …

    49577  i thought this movie did a down right good job…  positive

    49578  bad plot, bad dialogue, bad acting, idiotic di…  negative

    49579  i am a catholic taught in parochial elementary…  negative

    49580  i’m going to have to disagree with the previou…  negative

    49581  no one expects the star trek movies to be high…  negative


    [49582 rows x 2 columns]


    # Running WhiteSpace tokenizer 

    wpTokenizer = WordPunctTokenizer()

    data[“review_tokenized”] = [wpTokenizer.tokenize(text) for text in data[“review”]]




                                                      review sentiment 

    0      one of the other reviewers has mentioned that …  positive   

    1      a wonderful little production. the filming tec…  positive   

    2      i thought this was a wonderful way to spend ti…  positive   

    3      basically there’s a family where a little boy …  negative   

    4      petter mattei’s “love in the time of money” is…  positive   

    …                                                  …       …   

    49577  i thought this movie did a down right good job…  positive   

    49578  bad plot, bad dialogue, bad acting, idiotic di…  negative   

    49579  i am a catholic taught in parochial elementary…  negative   

    49580  i’m going to have to disagree with the previou…  negative   

    49581  no one expects the star trek movies to be high…  negative   



    0      [one, of, the, other, reviewers, has, mentione…  

    1      [a, wonderful, little, production, ., the, fil…  

    2      [i, thought, this, was, a, wonderful, way, to,…  

    3      [basically, there, ‘, s, a, family, where, a, …  

    4      [petter, mattei, ‘, s, “, love, in, the, time,…  

    …                                                  …  

    49577  [i, thought, this, movie, did, a, down, right,…  

    49578  [bad, plot, ,, bad, dialogue, ,, bad, acting, …  

    49579  [i, am, a, catholic, taught, in, parochial, el…  

    49580  [i, ‘, m, going, to, have, to, disagree, with,…  

    49581  [no, one, expects, the, star, trek, movies, to…  


    [49582 rows x 3 columns]


    # Stopwords removal & WordNet lemmatization 


    # Define POS tags 

    tag_map = defaultdict(lambda : wordnet.NOUN)

    tag_map[‘J’] = wordnet.ADJ

    tag_map[‘V’] = wordnet.VERB

    tag_map[‘R’] = wordnet.ADV


    for index, text in enumerate(data.review_tokenized):

        if index % 100 == 0:


    #     print(“-” * 50)

        word_list = []

        wordnet_lemmatizer = WordNetLemmatizer()

        for word, tag in pos_tag(text):

            if word not in stopwords.words(“english”) and word.isalpha():

                word_processed = wordnet_lemmatizer.lemmatize(word, tag_map[tag[0]])


        data.loc[index, “review_tokenized_cleaned”] = str(word_list)

























                                                      review sentiment 

    0      one of the other reviewers has mentioned that …  positive   

    1      a wonderful little production. the filming tec…  positive   

    2      i thought this was a wonderful way to spend ti…  positive   

    3      basically there’s a family where a little boy …  negative   

    4      petter mattei’s “love in the time of money” is…  positive   

    …                                                  …       …   

    49577  i thought this movie did a down right good job…  positive   

    49578  bad plot, bad dialogue, bad acting, idiotic di…  negative   

    49579  i am a catholic taught in parochial elementary…  negative   

    49580  i’m going to have to disagree with the previou…  negative   

    49581  no one expects the star trek movies to be high…  negative   



    0      [one, of, the, other, reviewers, has, mentione…   

    1      [a, wonderful, little, production, ., the, fil…   

    2      [i, thought, this, was, a, wonderful, way, to,…   

    3      [basically, there, ‘, s, a, family, where, a, …   

    4      [petter, mattei, ‘, s, “, love, in, the, time,…   

    …                                                  …   

    49577  [i, thought, this, movie, did, a, down, right,…   

    49578  [bad, plot, ,, bad, dialogue, ,, bad, acting, …   

    49579  [i, am, a, catholic, taught, in, parochial, el…   

    49580  [i, ‘, m, going, to, have, to, disagree, with,…   

    49581  [no, one, expects, the, star, trek, movies, to…   



    0      [‘one’, ‘reviewer’, ‘mention’, ‘watch’, ‘oz’, …  

    1      [‘wonderful’, ‘little’, ‘production’, ‘filming…  

    2      [‘think’, ‘wonderful’, ‘way’, ‘spend’, ‘time’,…  

    3      [‘basically’, ‘family’, ‘little’, ‘boy’, ‘jake…  

    4      [‘petter’, ‘mattei’, ‘love’, ‘time’, ‘money’, …  

    …                                                  …  

    49577  [‘think’, ‘movie’, ‘right’, ‘good’, ‘job’, ‘cr…  

    49578  [‘bad’, ‘plot’, ‘bad’, ‘dialogue’, ‘bad’, ‘act…  

    49579  [‘catholic’, ‘taught’, ‘parochial’, ‘elementar…  

    49580  [‘go’, ‘disagree’, ‘previous’, ‘comment’, ‘sid…  

    49581  [‘one’, ‘expect’, ‘star’, ‘trek’, ‘movie’, ‘hi…  


    [49582 rows x 4 columns]






    train_X, test_X, train_y, test_y = model_selection.train_test_split(data.review_tokenized_cleaned, data.sentiment, test_size = 0.3, random_state =1)














    negative    7461

    positive    7414

    Name: sentiment, dtype: int64




    positive    17470

    negative    17237

    Name: sentiment, dtype: int64


    label_enc = LabelEncoder()

    train_y = label_enc.fit_transform(train_y)

    test_y = label_enc.transform(test_y)


    print(np.unique(test_y, return_counts = True))

    print(np.unique(train_y, return_counts = True))


    (array([0, 1]), array([7461, 7414]))

    (array([0, 1]), array([17237, 17470]))


    tfidf_vect = TfidfVectorizer(max_features = 5000)




    train_X_tfidf = tfidf_vect.transform(train_X)

    test_X_tfidf = tfidf_vect.transform(test_X)


## Modelling: Gaussian Naive Bayes


    train_X_tfidf_dense = train_X_tfidf.todense()

    test_X_tfidf_dense = test_X_tfidf.todense()


    nb_model = naive_bayes.GaussianNB(), train_y)




    preds_nb = nb_model.predict(test_X_tfidf_dense)






    accuracy_score(preds_nb, test_y)




    confusion_matrix(test_y, preds_nb)


    array([[5938, 1523],

           [1632, 5782]])


    # Print per-class precision, recall, F1-score and support for the Naive
    # Bayes predictions.
    print(classification_report(test_y, preds_nb))


                  precision    recall  f1-score   support


               0       0.78      0.80      0.79      7461

               1       0.79      0.78      0.79      7414


        accuracy                           0.79     14875

       macro avg       0.79      0.79      0.79     14875

    weighted avg       0.79      0.79      0.79     14875


## Support Vector Machine Classifier


Training can take some time, so grab a coffee in the meantime 🙂


    svm = svm.SVC(C = 1.0, kernel = “linear”, degree = 3, gamma = “auto”), train_y)


    SVC(gamma=’auto’, kernel=’linear’)


    preds_svm = svm.predict(test_X_tfidf)





    accuracy_score(preds_svm, test_y)




    print(classification_report(test_y, preds_svm))


                  precision    recall  f1-score   support


               0       0.90      0.87      0.88      7461

               1       0.87      0.90      0.89      7414


        accuracy                           0.88     14875

       macro avg       0.88      0.88      0.88     14875

    weighted avg       0.88      0.88      0.88     14875


## Logistic Regression


    log_reg = linear_model.LogisticRegression(solver = “lbfgs”), train_y)




    preds_log_reg = log_reg.predict(test_X_tfidf)





    accuracy_score(preds_log_reg, test_y)




    print(classification_report(test_y, preds_log_reg))


                  precision    recall  f1-score   support


               0       0.90      0.87      0.88      7461

               1       0.87      0.90      0.89      7414


        accuracy                           0.89     14875

       macro avg       0.89      0.89      0.89     14875

    weighted avg       0.89      0.89      0.89     14875


Leave a Reply

Your email address will not be published. Required fields are marked *

Back to top