My Brain Cells

Easiest (and best) learning materials for anyone with a curiosity for machine learning and artificial intelligence, Deep learning, Programming, and other fun life hacks.

IMDB Sentiment Analysis using deep learning

Sentiment Analysis using deep learning


Download the dataset from here.

Code:

    ## Relevant imports

 

    import pandas as pd

    import numpy as np

    import os

    import matplotlib.pyplot as plt

    import re

 

    from collections import defaultdict

 

    # Tokenizer imports

    from nltk.tokenize import sent_tokenize

    from nltk.tokenize import word_tokenize

    from nltk.tokenize import WordPunctTokenizer

    from nltk.tokenize import regexp_tokenize

 

    # NLTK corpus and stemming/lemmatizer imports

    from nltk import pos_tag

    from nltk.corpus import stopwords

    from nltk.corpus import wordnet

    from nltk.stem import WordNetLemmatizer

 

    # Scikit-learn packages

    from sklearn.preprocessing import LabelEncoder

    from sklearn.feature_extraction.text import TfidfVectorizer

    from sklearn import model_selection, naive_bayes, svm, linear_model

    from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

 

    # Gensim imports

    import gensim

 

    data = pd.read_csv(“{PATH}/IMDB Dataset.csv”)

 

    data.shape

 

    (50000, 2)

 

    data.head()

 

                                                  review sentiment

    0  One of the other reviewers has mentioned that …  positive

    1  A wonderful little production. <br /><br />The…  positive

    2  I thought this was a wonderful way to spend ti…  positive

    3  Basically there’s a family where a little boy …  negative

    4  Petter Mattei’s “Love in the Time of Money” is…  positive

 

    data.sentiment.unique()

 

    array(['positive', 'negative'], dtype=object)

 

    data.sentiment.value_counts()

 

    positive    25000

    negative    25000

    Name: sentiment, dtype: int64

 

    data.dtypes

 

    review       object

    sentiment    object

    dtype: object

 

    data.review.duplicated().sum()

 

    418

 

    data.drop_duplicates(keep = “first”, inplace = True)

    data.shape

 

    (49582, 2)

 

    data.isna().sum()

 

    review       0

    sentiment    0

    dtype: int64

 

    # Convert reviews to lowercase

 

    data.review = data.review.apply(lambda x: str(x).lower())

 

    data.reset_index(inplace = True)

 

    data = data.drop(“index”, axis = 1)

 

    data

 

                                                      review sentiment

    0      one of the other reviewers has mentioned that …  positive

    1      a wonderful little production. <br /><br />the…  positive

    2      i thought this was a wonderful way to spend ti…  positive

    3      basically there’s a family where a little boy …  negative

    4      petter mattei’s “love in the time of money” is…  positive

    …                                                  …       …

    49577  i thought this movie did a down right good job…  positive

    49578  bad plot, bad dialogue, bad acting, idiotic di…  negative

    49579  i am a catholic taught in parochial elementary…  negative

    49580  i’m going to have to disagree with the previou…  negative

    49581  no one expects the star trek movies to be high…  negative

 

    [49582 rows x 2 columns]

 

    def strip_html(raw_text):

      find_html = re.compile(‘<.*?>’)

      clean_text = re.sub(find_html, ”, raw_text)

      return clean_text

 

    data.review = data.review.apply(lambda x: strip_html(x))

 

    data

 

                                                      review sentiment

    0      one of the other reviewers has mentioned that …  positive

    1      a wonderful little production. the filming tec…  positive

    2      i thought this was a wonderful way to spend ti…  positive

    3      basically there’s a family where a little boy …  negative

    4      petter mattei’s “love in the time of money” is…  positive

    …                                                  …       …

    49577  i thought this movie did a down right good job…  positive

    49578  bad plot, bad dialogue, bad acting, idiotic di…  negative

    49579  i am a catholic taught in parochial elementary…  negative

    49580  i’m going to have to disagree with the previou…  negative

    49581  no one expects the star trek movies to be high…  negative

 

    [49582 rows x 2 columns]

 

    # Running WhiteSpace tokenizer 

    wpTokenizer = WordPunctTokenizer()

    data[“review_tokenized”] = [wpTokenizer.tokenize(text) for text in data[“review”]]

 

    data

 

                                                      review sentiment 

    0      one of the other reviewers has mentioned that …  positive   

    1      a wonderful little production. the filming tec…  positive   

    2      i thought this was a wonderful way to spend ti…  positive   

    3      basically there’s a family where a little boy …  negative   

    4      petter mattei’s “love in the time of money” is…  positive   

    …                                                  …       …   

    49577  i thought this movie did a down right good job…  positive   

    49578  bad plot, bad dialogue, bad acting, idiotic di…  negative   

    49579  i am a catholic taught in parochial elementary…  negative   

    49580  i’m going to have to disagree with the previou…  negative   

    49581  no one expects the star trek movies to be high…  negative   

 

                                            review_tokenized  

    0      [one, of, the, other, reviewers, has, mentione…  

    1      [a, wonderful, little, production, ., the, fil…  

    2      [i, thought, this, was, a, wonderful, way, to,…  

    3      [basically, there, ‘, s, a, family, where, a, …  

    4      [petter, mattei, ‘, s, “, love, in, the, time,…  

    …                                                  …  

    49577  [i, thought, this, movie, did, a, down, right,…  

    49578  [bad, plot, ,, bad, dialogue, ,, bad, acting, …  

    49579  [i, am, a, catholic, taught, in, parochial, el…  

    49580  [i, ‘, m, going, to, have, to, disagree, with,…  

    49581  [no, one, expects, the, star, trek, movies, to…  

 

    [49582 rows x 3 columns]

 

    # Stopwords removal & WordNet lemmatization 

 

    # Define POS tags 

    tag_map = defaultdict(lambda : wordnet.NOUN)

    tag_map[‘J’] = wordnet.ADJ

    tag_map[‘V’] = wordnet.VERB

    tag_map[‘R’] = wordnet.ADV

 

    for index, text in enumerate(data.review_tokenized):

        if index % 100 == 0:

            print(index)

    #     print(“-” * 50)

        word_list = []

        wordnet_lemmatizer = WordNetLemmatizer()

        for word, tag in pos_tag(text):

            if word not in stopwords.words(“english”) and word.isalpha():

                word_processed = wordnet_lemmatizer.lemmatize(word, tag_map[tag[0]])

                word_list.append(word_processed)

        data.loc[index, “review_tokenized_cleaned”] = str(word_list)

 

    0

    100

    200

    300

    400

    500

    600

    700

    800

    900

    1000

…..

    48800

    48900

    49000

    49100

    49200

    49300

    49400

    49500

 

    data

 

                                                      review sentiment 

    0      one of the other reviewers has mentioned that …  positive   

    1      a wonderful little production. the filming tec…  positive   

    2      i thought this was a wonderful way to spend ti…  positive   

    3      basically there’s a family where a little boy …  negative   

    4      petter mattei’s “love in the time of money” is…  positive   

    …                                                  …       …   

    49577  i thought this movie did a down right good job…  positive   

    49578  bad plot, bad dialogue, bad acting, idiotic di…  negative   

    49579  i am a catholic taught in parochial elementary…  negative   

    49580  i’m going to have to disagree with the previou…  negative   

    49581  no one expects the star trek movies to be high…  negative   

 

                                            review_tokenized 

    0      [one, of, the, other, reviewers, has, mentione…   

    1      [a, wonderful, little, production, ., the, fil…   

    2      [i, thought, this, was, a, wonderful, way, to,…   

    3      [basically, there, ‘, s, a, family, where, a, …   

    4      [petter, mattei, ‘, s, “, love, in, the, time,…   

    …                                                  …   

    49577  [i, thought, this, movie, did, a, down, right,…   

    49578  [bad, plot, ,, bad, dialogue, ,, bad, acting, …   

    49579  [i, am, a, catholic, taught, in, parochial, el…   

    49580  [i, ‘, m, going, to, have, to, disagree, with,…   

    49581  [no, one, expects, the, star, trek, movies, to…   

 

                                    review_tokenized_cleaned  

    0      [‘one’, ‘reviewer’, ‘mention’, ‘watch’, ‘oz’, …  

    1      [‘wonderful’, ‘little’, ‘production’, ‘filming…  

    2      [‘think’, ‘wonderful’, ‘way’, ‘spend’, ‘time’,…  

    3      [‘basically’, ‘family’, ‘little’, ‘boy’, ‘jake…  

    4      [‘petter’, ‘mattei’, ‘love’, ‘time’, ‘money’, …  

    …                                                  …  

    49577  [‘think’, ‘movie’, ‘right’, ‘good’, ‘job’, ‘cr…  

    49578  [‘bad’, ‘plot’, ‘bad’, ‘dialogue’, ‘bad’, ‘act…  

    49579  [‘catholic’, ‘taught’, ‘parochial’, ‘elementar…  

    49580  [‘go’, ‘disagree’, ‘previous’, ‘comment’, ‘sid…  

    49581  [‘one’, ‘expect’, ‘star’, ‘trek’, ‘movie’, ‘hi…  

 

    [49582 rows x 4 columns]

 

    data.review_tokenized_cleaned.isna().sum()

 

    0

 

    train_X, test_X, train_y, test_y = model_selection.train_test_split(data.review_tokenized_cleaned, data.sentiment, test_size = 0.3, random_state =1)

 

    print(train_X.shape)

    print(test_X.shape)

    print(train_y.shape)

    print(test_y.shape)

 

    (34707,)

    (14875,)

    (34707,)

    (14875,)

 

    test_y.value_counts()

 

    negative    7461

    positive    7414

    Name: sentiment, dtype: int64

 

    train_y.value_counts()

 

    positive    17470

    negative    17237

    Name: sentiment, dtype: int64

 

    label_enc = LabelEncoder()

    train_y = label_enc.fit_transform(train_y)

    test_y = label_enc.transform(test_y)

 

    print(np.unique(test_y, return_counts = True))

    print(np.unique(train_y, return_counts = True))

 

    (array([0, 1]), array([7461, 7414]))

    (array([0, 1]), array([17237, 17470]))

 

    tfidf_vect = TfidfVectorizer(max_features = 5000)

    tfidf_vect.fit(data.review_tokenized_cleaned)

 

    TfidfVectorizer(max_features=5000)

 

    train_X_tfidf = tfidf_vect.transform(train_X)

    test_X_tfidf = tfidf_vect.transform(test_X)

 

## Modelling Gaussian Naive Bayes

 

    train_X_tfidf_dense = train_X_tfidf.todense()

    test_X_tfidf_dense = test_X_tfidf.todense()

 

    nb_model = naive_bayes.GaussianNB()

    nb_model.fit(train_X_tfidf_dense, train_y)

 

    GaussianNB()

 

    preds_nb = nb_model.predict(test_X_tfidf_dense)

 

    preds_nb.shape

 

    (14875,)

 

    accuracy_score(preds_nb, test_y)

 

    0.7878991596638656

 

    confusion_matrix(test_y, preds_nb)

 

    array([[5938, 1523],

           [1632, 5782]])

 

    print(classification_report(test_y, preds_nb))

 

                  precision    recall  f1-score   support

 

               0       0.78      0.80      0.79      7461

               1       0.79      0.78      0.79      7414

 

        accuracy                           0.79     14875

       macro avg       0.79      0.79      0.79     14875

    weighted avg       0.79      0.79      0.79     14875

 

## Support Vector Machine Classifier

 

Training can take some time, so grab a coffee in the meantime 🙂

 

    svm = svm.SVC(C = 1.0, kernel = “linear”, degree = 3, gamma = “auto”)

    svm.fit(train_X_tfidf, train_y)

 

    SVC(gamma=’auto’, kernel=’linear’)

 

    preds_svm = svm.predict(test_X_tfidf)

    print(preds_svm.shape)

 

    (14875,)

 

    accuracy_score(preds_svm, test_y)

 

    0.8836302521008403

 

    print(classification_report(test_y, preds_svm))

 

                  precision    recall  f1-score   support

 

               0       0.90      0.87      0.88      7461

               1       0.87      0.90      0.89      7414

 

        accuracy                           0.88     14875

       macro avg       0.88      0.88      0.88     14875

    weighted avg       0.88      0.88      0.88     14875

 

## Logistic Regression

 

    log_reg = linear_model.LogisticRegression(solver = “lbfgs”)

    log_reg.fit(train_X_tfidf, train_y)

 

    LogisticRegression()

 

    preds_log_reg = log_reg.predict(test_X_tfidf)

    preds_log_reg.shape

 

    (14875,)

 

    accuracy_score(preds_log_reg, test_y)

 

    0.8863193277310925

 

    print(classification_report(test_y, preds_log_reg))

 

                  precision    recall  f1-score   support

 

               0       0.90      0.87      0.88      7461

               1       0.87      0.90      0.89      7414

 

        accuracy                           0.89     14875

       macro avg       0.89      0.89      0.89     14875

    weighted avg       0.89      0.89      0.89     14875


Anthony

Leave a Reply

Your email address will not be published. Required fields are marked *

Back to top