In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid',font_scale=1.2)
sns.set_palette(sns.color_palette("rocket"))

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

from collections import defaultdict

# ignore the warnings
import warnings
warnings.filterwarnings('ignore')
In [2]:
# Define key variables used throughout training and evaluation
RANDOM_SEED = 50
BATCH_SIZE = 16  # a larger batch size cuts training time noticeably, but tended to lower accuracy here
# Set seed for reproducibility.
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
In [3]:
df_reviews = pd.read_csv('train.tsv', sep='\t')
df_reviews.head()
Out[3]:
PhraseId SentenceId Phrase Sentiment
0 1 1 A series of escapades demonstrating the adage ... 1
1 2 1 A series of escapades demonstrating the adage ... 2
2 3 1 A series 2
3 4 1 A 2
4 5 1 series 2
In [4]:
# drop the identifier columns, which carry no sentiment signal
df_reviews.drop(['SentenceId','PhraseId'],axis=1,inplace=True)
In [5]:
df_reviews
Out[5]:
Phrase Sentiment
0 A series of escapades demonstrating the adage ... 1
1 A series of escapades demonstrating the adage ... 2
2 A series 2
3 A 2
4 series 2
... ... ...
156055 Hearst 's 2
156056 forced avuncular chortles 1
156057 avuncular chortles 3
156058 avuncular 2
156059 chortles 2

156060 rows × 2 columns

In [6]:
# let's check the shape of the reviews dataframe
df_reviews.shape
Out[6]:
(156060, 2)
In [7]:
df_reviews.isnull().sum()
Out[7]:
Phrase       0
Sentiment    0
dtype: int64
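Before plotting, a quick numeric look at the label distribution is useful; a minimal sketch using the df_reviews frame loaded above (not part of the original run):

# class distribution as counts and as proportions
print(df_reviews['Sentiment'].value_counts().sort_index())
print(df_reviews['Sentiment'].value_counts(normalize=True).sort_index().round(3))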
In [8]:
# helper function to draw percentage above each bar
def draw_percentage(ax,total=float(len(df_reviews))):
    for p in ax.patches:
        percentage = '{:.1f}%'.format(100 * p.get_height()/total)
        x = p.get_x() + p.get_width() / 2.
        y = p.get_height()
        ax.annotate(percentage, (x, y),ha='center',va='bottom')
In [9]:
plt.figure(figsize = (10,6))
total = float(len(df_reviews))
ax = sns.countplot(x = 'Sentiment',data=df_reviews)
plt.title('Count Plot of Review Score', fontsize=20)
plt.xlabel('Sentiment score')
draw_percentage(ax)
plt.show()
[Figure: count plot of review sentiment scores, with the percentage of phrases shown above each bar]
In [10]:
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'

# Let's load a pre-trained BertTokenizer
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
In [11]:
sample_txt = "we love you"

tokens = tokenizer.tokenize(sample_txt)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

print(f' Sentence: {sample_txt}')
print(f'   Tokens: {tokens}')
print(f'Token IDs: {token_ids}')
 Sentence: we love you
   Tokens: ['we', 'love', 'you']
Token IDs: [1195, 1567, 1128]
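For rarer words the tokenizer falls back to WordPiece subwords; a small illustration using words from the dataset (the exact split depends on the bert-base-cased vocabulary):

# longer or rarer words are typically split into '##'-prefixed subword pieces
for word in ['love', 'escapades', 'chortles']:
    print(word, '->', tokenizer.tokenize(word))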
In [12]:
# encoding of a sentence
encoding = tokenizer.encode_plus(
    sample_txt,
    padding='max_length', # Pad sentence to max length
    truncation=True,  #Truncate sentence to max length
    max_length=32,
    add_special_tokens=True, # Add '[CLS]' and '[SEP]'
    return_token_type_ids=False,
    return_attention_mask=True, # Return attention mask
    return_tensors='pt',  # Return torch objects
    )

encoding.keys()
Out[12]:
dict_keys(['input_ids', 'attention_mask'])
In [13]:
print(len(encoding['input_ids'][0]))
print(encoding['input_ids'][0])
32
tensor([ 101, 1195, 1567, 1128,  102,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0])
In [14]:
# The attention mask has the same length
print(len(encoding['attention_mask'][0]))
print(encoding['attention_mask'])
32
tensor([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]])
In [15]:
# We can inverse the tokenization to have a look at the special tokens
tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])
Out[15]:
['[CLS]',
 'we',
 'love',
 'you',
 '[SEP]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]']
In [16]:
token_lens = []
for text in df_reviews.Phrase:
    tokens = tokenizer.encode(text, truncation=True,max_length=512)
    token_lens.append(len(tokens))
In [17]:
plt.figure(figsize = (8,6))
sns.histplot(token_lens,kde=True)
plt.xlim([0, 150])
plt.xlabel('Token count')
plt.show()
[Figure: histogram (with KDE) of token counts per phrase, x-axis limited to 0-150]
In [18]:
MAX_LEN = 100
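The histogram suggests most phrases are short; a percentile check on token_lens (computed above) is one way to confirm that MAX_LEN = 100 is generous. A sketch:

# how many tokens cover 95% / 99% / 100% of the phrases?
for q in [95, 99, 100]:
    print(f'{q}th percentile: {np.percentile(token_lens, q):.0f} tokens')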
In [19]:
# We have all building blocks required to create a torch dataset. Let's do it...
class ReviewDataset(Dataset):
    
    def __init__(self, Phrase, Sentiment, tokenizer, max_len):
        
        self.Phrase = Phrase
        self.Sentiment = Sentiment
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.Phrase)
    
    def __getitem__(self, item):
        # step 1: get the reviews and targets
        Phrase = str(self.Phrase[item])
        Sentiment = self.Sentiment[item]
        # step 2: use tokenizer to encode sentence (includes padding/truncation up to max length)
        encoding = self.tokenizer.encode_plus(
            Phrase,
            add_special_tokens=True, # Add '[CLS]' and '[SEP]'
            padding='max_length',  # Pad sentence to max length
            truncation=True,    # Truncate sentence to max length
            max_length=self.max_len,
            return_token_type_ids=False,
            return_attention_mask=True, # Return attention mask
            return_tensors='pt', # return torch objects/tensor
        )
        return {
            'Phrase_text': Phrase,
            'input_ids': encoding['input_ids'].flatten(),  # Tensor of token ids to be fed to a model
            'attention_mask': encoding['attention_mask'].flatten(),  #Tensor of indices specifying which tokens should be attended to by the model
            'Sentiment': torch.tensor(Sentiment, dtype=torch.long)
            }
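A quick sanity check of a single item, assuming the class above and the tokenizer already loaded (a sketch, not part of the original run):

# build a tiny dataset from the first few phrases and inspect one example
sample_ds = ReviewDataset(
    Phrase=df_reviews.Phrase.to_numpy()[:4],
    Sentiment=df_reviews.Sentiment.to_numpy()[:4],
    tokenizer=tokenizer,
    max_len=MAX_LEN
)
example = sample_ds[0]
print(example['input_ids'].shape, example['attention_mask'].shape, example['Sentiment'])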
In [20]:
# let's split the data into train / validation / test sets
df_train, df_test = train_test_split(df_reviews, test_size=0.1, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=RANDOM_SEED)

print('Train Data Size', df_train.shape)
print('Validation Data Size', df_val.shape)
print('Test Data Size', df_test.shape)
Train Data Size (140454, 2)
Validation Data Size (7803, 2)
Test Data Size (7803, 2)
In [21]:
def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = ReviewDataset(
        Phrase=df.Phrase.to_numpy(),
        Sentiment=df.Sentiment.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
      )
    return DataLoader(
        ds,
        batch_size=batch_size
      )
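The loaders above keep the original row order; shuffling the training split each epoch is common practice, so a variant like the following could be substituted (the shuffle and num_workers arguments are an assumption, not part of the original notebook):

# sketch: same helper, with optional shuffling for the training loader
def create_data_loader_v2(df, tokenizer, max_len, batch_size, shuffle=False):
    ds = ReviewDataset(
        Phrase=df.Phrase.to_numpy(),
        Sentiment=df.Sentiment.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )
    return DataLoader(ds, batch_size=batch_size, shuffle=shuffle, num_workers=2)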
In [22]:
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)
In [23]:
data = next(iter(train_data_loader))
data.keys()
Out[23]:
dict_keys(['Phrase_text', 'input_ids', 'attention_mask', 'Sentiment'])
In [24]:
bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME,return_dict=False)
In [25]:
last_hidden_state, pooled_output = bert_model(
  input_ids=encoding['input_ids'],
  attention_mask=encoding['attention_mask']
)
# last_hidden_state holds the final-layer hidden state for every token in the sequence.
# pooled_output is produced by the BertPooler: a linear layer + tanh applied to the [CLS] token's hidden state.
last_hidden_state.shape
Out[25]:
torch.Size([1, 32, 768])
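The pooled output is a single vector per input sequence; its shape can be checked the same way (hidden size is 768 for bert-base-cased). A small sketch:

# one 768-dimensional vector per sequence in the batch
print(pooled_output.shape)            # expected: torch.Size([1, 768])
print(bert_model.config.hidden_size)  # 768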
In [26]:
class SentimentClassifier(nn.Module):
    def __init__(self, n_classes):
        super(SentimentClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME,return_dict=False)
        # dropout layer for some regularization 
        self.drop = nn.Dropout(p=0.3)
        # A fully-connected layer for our output
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
    def forward(self, input_ids, attention_mask):
        """
        Feed input to BERT and the classifier to compute logits.
        @param    input_ids (torch.Tensor): an input tensor with shape (batch_size,
                      max_length)
        @param    attention_mask (torch.Tensor): a tensor that hold attention mask
                      information with shape (batch_size, max_length)
        @return   logits (torch.Tensor): an output tensor with shape (batch_size,
                      num_labels)
        """
        # Feed input to BERT
        last_hidden_state, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        output = self.drop(pooled_output)
        return self.out(output)
In [27]:
# Let's create an instance and move it to the GPU.
model = SentimentClassifier(5)  # 5 sentiment classes (scores 0-4)
model = model.to(device)
In [28]:
# We'll move the example batch of our training data to the GPU
input_ids = data['input_ids'].to(device)
attention_mask = data['attention_mask'].to(device)
print(input_ids.shape) # batch size x seq length
print(attention_mask.shape) # batch size x seq length
torch.Size([16, 100])
torch.Size([16, 100])
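As a shape check, the not-yet-fine-tuned classifier can be run on this example batch; softmax over the logits gives per-class probabilities. A sketch using torch.nn.functional, which is not imported above:

import torch.nn.functional as F

# forward pass on the example batch; logits have shape (batch_size, n_classes)
with torch.no_grad():
    logits = model(input_ids, attention_mask)
print(logits.shape)                 # expected: torch.Size([16, 5])
print(F.softmax(logits, dim=1)[0])  # class probabilities for the first phrase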
In [29]:
EPOCHS = 3
# AdamW optimizer (Adam with decoupled weight decay)
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_data_loader) * EPOCHS
#  We'll also use a linear scheduler with no warmup steps
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)
# cross-entropy loss function
loss_fn = nn.CrossEntropyLoss().to(device)
In [31]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples, progress_bar=None):
    """
    Train the model for one epoch.
    
    Args:
        model (torch.nn.Module): The model to train.
        data_loader (DataLoader): Data loader for training data.
        loss_fn (torch.nn.Module): Loss function.
        optimizer (torch.optim.Optimizer): Optimizer.
        device (torch.device): Device to perform training on.
        scheduler (torch.optim.lr_scheduler): Learning rate scheduler.
        n_examples (int): Number of examples in the training set.
        progress_bar (tqdm, optional): Progress bar for real-time monitoring.

    Returns:
        tuple: Accuracy and average loss for the epoch.
    """
    model = model.train()
    losses = []
    correct_predictions = 0

    for batch in data_loader:
        # Move data to device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        Sentiment = batch["Sentiment"].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)

        # Compute loss
        loss = loss_fn(outputs, Sentiment)
        losses.append(loss.item())

        # Backward pass and optimization
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        # Update correct predictions count
        correct_predictions += torch.sum(preds == Sentiment)

        # Update progress bar
        if progress_bar:
            progress_bar.update(1)

    return correct_predictions.double() / n_examples, np.mean(losses)
In [33]:
def eval_model(model, data_loader, loss_fn, device, n_examples, progress_bar=None):
    """
    Evaluate the model on a validation set.

    Args:
        model (torch.nn.Module): The model to evaluate.
        data_loader (DataLoader): Data loader for validation data.
        loss_fn (torch.nn.Module): Loss function.
        device (torch.device): Device to perform evaluation on.
        n_examples (int): Number of examples in the validation set.
        progress_bar (tqdm, optional): Progress bar for real-time monitoring.

    Returns:
        tuple: Accuracy and average loss for the validation set.
    """
    model = model.eval()  # Set model to evaluation mode
    losses = []
    correct_predictions = 0

    with torch.no_grad():  # Disable gradient computation for evaluation
        for batch in data_loader:
            # Move data to device
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            Sentiment = batch["Sentiment"].to(device)

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)

            # Compute loss
            loss = loss_fn(outputs, Sentiment)
            losses.append(loss.item())

            # Update correct predictions
            correct_predictions += torch.sum(preds == Sentiment)

            # Update progress bar
            if progress_bar:
                progress_bar.update(1)

    return correct_predictions.double() / n_examples, np.mean(losses)
In [35]:
from tqdm import tqdm

history = defaultdict(list)
best_accuracy = 0

# Start training loop
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 100)
    
    # Use tqdm for real-time monitoring of training
    train_epoch_desc = f'Training Epoch {epoch + 1}/{EPOCHS}'
    with tqdm(total=len(train_data_loader), desc=train_epoch_desc, leave=False) as pbar:
        train_acc, train_loss = train_epoch(model, train_data_loader, loss_fn, optimizer, device, scheduler, len(df_train), progress_bar=pbar)
    
    print(f'Train loss {train_loss:.4f} accuracy {train_acc:.4f}')
    
    # Validate the model
    val_epoch_desc = f'Validation Epoch {epoch + 1}/{EPOCHS}'
    with tqdm(total=len(val_data_loader), desc=val_epoch_desc, leave=False) as pbar:
        val_acc, val_loss = eval_model(model, val_data_loader, loss_fn, device, len(df_val), progress_bar=pbar)
    
    print(f'Val loss   {val_loss:.4f} accuracy {val_acc:.4f}')
    print()
    
    # Append metrics to history
    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)
    
    # Save the best model
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), 'best_model_state.bin')
        best_accuracy = val_acc

Epoch 1/3
----------------------------------------------------------------------------------------------------
Train loss 0.8075 accuracy 0.6642
Val loss   0.7417 accuracy 0.6956

Epoch 2/3
----------------------------------------------------------------------------------------------------
Train loss 0.6434 accuracy 0.7367
Val loss   0.7612 accuracy 0.6895

Epoch 3/3
----------------------------------------------------------------------------------------------------
Train loss 0.5542 accuracy 0.7779
Val loss   0.8015 accuracy 0.6913
In [36]:
history_cpu_train_acc = [i.cpu() for i in history['train_acc']]
history_cpu_val_acc = [i.cpu() for i in history['val_acc']]
In [37]:
plt.plot(history_cpu_train_acc, label='train accuracy')
plt.plot(history_cpu_val_acc, label='validation accuracy')
plt.title('Training history')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.ylim([0, 1])
plt.show()
[Figure: training and validation accuracy across epochs]
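The loss curves can be plotted the same way; train_loss and val_loss in history are plain floats, so no .cpu() call is needed. A sketch mirroring the accuracy plot above:

plt.plot(history['train_loss'], label='train loss')
plt.plot(history['val_loss'], label='validation loss')
plt.title('Training history')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend()
plt.show()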
In [39]:
model = SentimentClassifier(5)
model.load_state_dict(torch.load('best_model_state.bin', map_location=device))
model = model.to(device)
In [40]:
test_acc, _ = eval_model(
  model,
  test_data_loader,
  loss_fn,
  device,
  len(df_test)
)
test_acc.item()
Out[40]:
0.7048571062411892
In [47]:
def get_predictions(model, data_loader):
  # put model in evaluation mode
  model = model.eval()
  # Create empty lists to store outputs
  Phrase_texts = []
  predictions = []
  prediction_probs = []
  real_values = []

  with torch.no_grad():
    for batch in data_loader:
      # We'll move the example batch of our test data to the GPU
      texts = batch["Phrase_text"]
      input_ids = batch["input_ids"].to(device)
      attention_mask = batch["attention_mask"].to(device)
      Sentiment = batch["Sentiment"].to(device)
      # Perform a forward pass. This will return logits.
      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      # Get the predictions
      _, preds = torch.max(outputs, dim=1)
      Phrase_texts.extend(texts)
      predictions.extend(preds)
      prediction_probs.extend(torch.softmax(outputs, dim=1))  # softmax turns the raw logits into class probabilities
      real_values.extend(Sentiment)
  predictions = torch.stack(predictions).cpu()
  prediction_probs = torch.stack(prediction_probs).cpu()
  real_values = torch.stack(real_values).cpu()
  return Phrase_texts, predictions, prediction_probs, real_values
In [48]:
y_Phrase_texts, y_pred, y_pred_probs, y_test = get_predictions(
  model,
  test_data_loader
)
In [56]:
class_names = ['negative', 'somewhat negative', 'neutral', 'somewhat positive', 'positive']
In [58]:
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           0       0.57      0.39      0.47       365
           1       0.57      0.64      0.60      1310
           2       0.80      0.80      0.80      4027
           3       0.62      0.65      0.64      1628
           4       0.66      0.49      0.56       473

    accuracy                           0.70      7803
   macro avg       0.64      0.59      0.61      7803
weighted avg       0.71      0.70      0.70      7803
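Since class_names is defined above, passing it as target_names makes the report easier to read; a sketch (the rows then show label text instead of the integer codes 0-4):

print(classification_report(y_test, y_pred, target_names=class_names))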

In [59]:
def show_confusion_matrix(confusion_matrix):
  hmap = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="plasma")
  hmap.yaxis.set_ticklabels(hmap.yaxis.get_ticklabels(), rotation=0, ha='right')
  hmap.xaxis.set_ticklabels(hmap.xaxis.get_ticklabels(), rotation=30, ha='right')
  plt.ylabel('True sentiment')
  plt.xlabel('Predicted sentiment');
cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
show_confusion_matrix(df_cm)
[Figure: confusion matrix heatmap of true vs. predicted sentiment]
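Finally, the fine-tuned model can score a raw sentence end-to-end. A minimal sketch; the helper name and sample text are illustrative, not from the original notebook:

def predict_sentiment(text):
    # encode the raw text exactly as the training data was encoded
    enc = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        padding='max_length',
        truncation=True,
        max_length=MAX_LEN,
        return_token_type_ids=False,
        return_attention_mask=True,
        return_tensors='pt',
    )
    model.eval()
    with torch.no_grad():
        logits = model(enc['input_ids'].to(device), enc['attention_mask'].to(device))
    return class_names[logits.argmax(dim=1).item()]

print(predict_sentiment('A thoughtful, beautifully acted film'))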