In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid',font_scale=1.2)
sns.set_palette(sns.color_palette("rocket"))
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict
# ignore the warnings
import warnings
warnings.filterwarnings('ignore')
2024-12-09 17:57:27.533730: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered WARNING: All log messages before absl::InitializeLog() is called are written to STDERR E0000 00:00:1733785047.983556 7480 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered E0000 00:00:1733785048.174063 7480 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered 2024-12-09 17:57:29.193483: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
In [2]:
# Global knobs shared by the training and evaluation cells further down.
RANDOM_SEED = 50
# Author's note: a larger batch size cuts training time significantly but lowers accuracy.
BATCH_SIZE = 16

# Seed both RNG sources used by this notebook so runs are repeatable.
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
In [3]:
df_reviews = pd.read_csv('train.tsv', sep='\t')
df_reviews.head()
Out[3]:
PhraseId | SentenceId | Phrase | Sentiment | |
---|---|---|---|---|
0 | 1 | 1 | A series of escapades demonstrating the adage ... | 1 |
1 | 2 | 1 | A series of escapades demonstrating the adage ... | 2 |
2 | 3 | 1 | A series | 2 |
3 | 4 | 1 | A | 2 |
4 | 5 | 1 | series | 2 |
In [4]:
# Drop the id columns; only the phrase text and its sentiment label are used below.
# Re-binding (instead of inplace=True) with errors='ignore' keeps this cell safe to re-run:
# a second execution no longer raises KeyError for columns that are already gone.
df_reviews = df_reviews.drop(columns=['SentenceId', 'PhraseId'], errors='ignore')
In [5]:
df_reviews
Out[5]:
Phrase | Sentiment | |
---|---|---|
0 | A series of escapades demonstrating the adage ... | 1 |
1 | A series of escapades demonstrating the adage ... | 2 |
2 | A series | 2 |
3 | A | 2 |
4 | series | 2 |
... | ... | ... |
156055 | Hearst 's | 2 |
156056 | forced avuncular chortles | 1 |
156057 | avuncular chortles | 3 |
156058 | avuncular | 2 |
156059 | chortles | 2 |
156060 rows × 2 columns
In [6]:
# let's check the shape of the reviews dataframe
df_reviews.shape
Out[6]:
(156060, 2)
In [7]:
df_reviews.isnull().sum()
Out[7]:
Phrase 0 Sentiment 0 dtype: int64
In [8]:
# helper function to draw percentage above each bar
def draw_percentage(ax, total=None):
    """
    Annotate every bar of ``ax`` with its share of ``total`` as a percentage.

    Args:
        ax: Axes-like object exposing ``patches`` (the bars) and ``annotate``.
        total (float, optional): Denominator for the percentages. When omitted,
            the sum of all bar heights is used, so the helper no longer depends
            on the size of a global frame captured when the function was defined.

    Returns:
        None. The labels are drawn on ``ax`` in place.
    """
    if total is None:
        total = float(sum(p.get_height() for p in ax.patches))
    # Nothing sensible can be drawn for an empty plot or a zero denominator.
    if not total:
        return
    for p in ax.patches:
        percentage = '{:.1f}%'.format(100 * p.get_height() / total)
        # Centre the label horizontally on the bar and sit it on the bar's top edge.
        x = p.get_x() + p.get_width() / 2.
        y = p.get_height()
        ax.annotate(percentage, (x, y), ha='center', va='bottom')
In [9]:
# Distribution of the sentiment labels, with each bar annotated by its share of all rows.
plt.figure(figsize=(10, 6))
ax = sns.countplot(x='Sentiment', data=df_reviews)
plt.title('Count Plot of Review Score', fontsize=20)
plt.xlabel('Sentiment score')
# Pass the current row count explicitly so the percentages always reflect the frame being plotted.
draw_percentage(ax, total=float(len(df_reviews)))
plt.show()
In [10]:
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'
# Let's load a pre-trained BertTokenizer
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
In [11]:
sample_txt = "we love you"
tokens = tokenizer.tokenize(sample_txt)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(f' Sentence: {sample_txt}')
print(f' Tokens: {tokens}')
print(f'Token IDs: {token_ids}')
Sentence: we love you Tokens: ['we', 'love', 'you'] Token IDs: [1195, 1567, 1128]
In [12]:
# encoding of a sentence
encoding = tokenizer.encode_plus(
sample_txt,
padding='max_length', # Pad sentence to max length
truncation=True, #Truncate sentence to max length
max_length=32,
add_special_tokens=True, # Add '[CLS]' and '[SEP]'
return_token_type_ids=False,
return_attention_mask=True, # Return attention mask
return_tensors='pt', # Return torch objects
)
encoding.keys()
Out[12]:
dict_keys(['input_ids', 'attention_mask'])
In [13]:
print(len(encoding['input_ids'][0]))
print(encoding['input_ids'][0])
32 tensor([ 101, 1195, 1567, 1128, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
In [14]:
# The attention mask has the same length
print(len(encoding['attention_mask'][0]))
print(encoding['attention_mask'])
32 tensor([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
In [15]:
# We can inverse the tokenization to have a look at the special tokens
tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])
Out[15]:
['[CLS]', 'we', 'love', 'you', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
In [16]:
# Encoded length of every phrase (capped at 512 tokens); the histogram below plots these.
# A comprehension is used so the earlier `tokens` demo variable is not re-bound to a list of ids.
token_lens = [
    len(tokenizer.encode(text, truncation=True, max_length=512))
    for text in df_reviews.Phrase
]
In [17]:
plt.figure(figsize = (8,6))
sns.histplot(token_lens,kde=True)
plt.xlim([0, 150])
plt.xlabel('Token count')
plt.show()
In [18]:
# Sequence length (in tokens) that every phrase is padded/truncated to by the dataset below.
# NOTE(review): presumably chosen from the token-count histogram above — confirm.
MAX_LEN = 100
In [19]:
# Torch dataset that tokenizes one phrase at a time.
class dataset(Dataset):
    """
    Map-style dataset pairing each phrase with its sentiment label.

    Args:
        Phrase: indexable collection of phrases (indexed with the integer ``item``).
        Sentiment: indexable collection of labels aligned with ``Phrase``.
        tokenizer: object exposing ``encode_plus``.
        max_len (int): length every encoded phrase is padded/truncated to.
    """

    def __init__(self, Phrase, Sentiment, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.Phrase = Phrase
        self.Sentiment = Sentiment

    def __len__(self):
        return len(self.Phrase)

    def __getitem__(self, item):
        """Return the raw text, encoded ids, attention mask and label for row ``item``."""
        text = str(self.Phrase[item])
        label = self.Sentiment[item]

        encode_options = dict(
            add_special_tokens=True,      # wrap the phrase in '[CLS]' ... '[SEP]'
            padding='max_length',         # pad up to max_length
            truncation=True,              # cut anything beyond max_length
            max_length=self.max_len,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',          # torch tensors rather than lists
        )
        encoded = self.tokenizer.encode_plus(text, **encode_options)

        sample = {'Phrase_text': text}
        # The tokenizer output carries a leading batch axis; flatten to one dimension per sample.
        sample['input_ids'] = encoded['input_ids'].flatten()
        sample['attention_mask'] = encoded['attention_mask'].flatten()
        sample['Sentiment'] = torch.tensor(label, dtype=torch.long)
        return sample
In [20]:
# Let's split the data: hold out 10% of the rows, then halve that hold-out into
# validation and test sets. The intermediate frame gets its own name so `df_test`
# only ever refers to the final test set.
df_train, df_holdout = train_test_split(df_reviews, test_size=0.1, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_holdout, test_size=0.5, random_state=RANDOM_SEED)
# The three splits must partition the full frame.
assert len(df_train) + len(df_val) + len(df_test) == len(df_reviews)
print('Train Data Size', df_train.shape)
print('Validation Data Size', df_val.shape)
print('Test Data Size', df_test.shape)
Train Data Size (140454, 2) Validation Data Size (7803, 2) Test Data Size (7803, 2)
In [21]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False):
    """
    Wrap a reviews frame in a ``DataLoader`` that yields tokenized batches.

    Args:
        df: frame with ``Phrase`` and ``Sentiment`` columns.
        tokenizer: tokenizer forwarded to ``dataset``.
        max_len (int): padded/truncated sequence length forwarded to ``dataset``.
        batch_size (int): number of samples per batch.
        shuffle (bool, optional): reshuffle the samples on every pass over the
            loader. Defaults to ``False``, which keeps the previous behaviour;
            pass ``True`` for a training loader.

    Returns:
        DataLoader: loader over a ``dataset`` built from ``df``.
    """
    ds = dataset(
        # to_numpy() hands the dataset plain arrays, so rows are indexed by
        # position rather than by the frame's index labels.
        Phrase=df.Phrase.to_numpy(),
        Sentiment=df.Sentiment.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )
    return DataLoader(ds, batch_size=batch_size, shuffle=shuffle)
In [22]:
# One loader per split, all built with the same tokenizer, sequence length and batch size.
train_data_loader, val_data_loader, test_data_loader = (
    create_data_loader(split_frame, tokenizer, MAX_LEN, BATCH_SIZE)
    for split_frame in (df_train, df_val, df_test)
)
In [23]:
data = next(iter(train_data_loader))
data.keys()
Out[23]:
dict_keys(['Phrase_text', 'input_ids', 'attention_mask', 'Sentiment'])
In [24]:
bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME,return_dict=False)
In [25]:
last_hidden_state, pooled_output = bert_model(
input_ids=encoding['input_ids'],
attention_mask=encoding['attention_mask']
)
# The last_hidden_state is a sequence of hidden states of the last layer of the model.
# Obtaining the pooled_output is done by applying the BertPooler on last_hidden_state.
last_hidden_state.shape
Out[25]:
torch.Size([1, 32, 768])
In [26]:
class SentimentClassifier(nn.Module):
    """Pre-trained BERT encoder followed by dropout and a linear classification head."""

    def __init__(self, n_classes):
        """
        Args:
            n_classes (int): number of output classes (size of the logit vector).
        """
        super().__init__()
        # forward() unpacks the encoder output as a 2-tuple, hence return_dict=False.
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME, return_dict=False)
        # Dropout on the pooled representation for some regularization.
        self.drop = nn.Dropout(p=0.3)
        # Fully-connected layer mapping the encoder's hidden size to the class logits.
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """
        Run the encoder and the classification head to produce logits.

        @param input_ids (torch.Tensor): token ids, shape (batch_size, max_length)
        @param attention_mask (torch.Tensor): attention mask, shape (batch_size, max_length)
        @return logits (torch.Tensor): shape (batch_size, num_labels)
        """
        # Only the pooled output feeds the head; the per-token hidden states are unused.
        _, pooled = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.out(self.drop(pooled))
In [27]:
# Let's create an instance and move it to the GPU.
model = SentimentClassifier(5)#len(class_names)
model = model.to(device)
In [28]:
# We'll move the example batch of our training data to the GPU
input_ids = data['input_ids'].to(device)
attention_mask = data['attention_mask'].to(device)
print(input_ids.shape) # batch size x seq length
print(attention_mask.shape) # batch size x seq length
torch.Size([16, 100]) torch.Size([16, 100])
In [29]:
# Number of full passes over the training data.
EPOCHS = 3
# AdamW optimizer (decoupled weight decay) over all model parameters.
# NOTE(review): this is the AdamW imported from `transformers`; newer releases are
# reported to deprecate it in favour of torch.optim.AdamW — confirm against the installed version.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
# The scheduler is stepped once per batch in train_epoch, so the schedule spans
# (batches per epoch) * EPOCHS steps.
total_steps = len(train_data_loader) * EPOCHS
# We'll also use a linear scheduler with no warmup steps
scheduler = get_linear_schedule_with_warmup(
optimizer,
num_warmup_steps=0,
num_training_steps=total_steps
)
# Cross-entropy loss over the class logits returned by the model.
loss_fn = nn.CrossEntropyLoss().to(device)
In [31]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples, progress_bar=None):
    """
    Train the model for one epoch.

    Each batch must be a mapping with ``"input_ids"``, ``"attention_mask"`` and
    ``"Sentiment"`` tensors.

    Args:
        model (torch.nn.Module): The model to train; called as
            ``model(input_ids=..., attention_mask=...)`` and expected to return logits.
        data_loader (DataLoader): Iterable of training batches.
        loss_fn (torch.nn.Module): Loss function applied to ``(logits, labels)``.
        optimizer (torch.optim.Optimizer): Optimizer, stepped once per batch.
        device (torch.device): Device the batch tensors are moved to.
        scheduler (torch.optim.lr_scheduler, optional): Learning rate scheduler,
            stepped once per batch. ``None`` disables scheduling.
        n_examples (int): Number of examples in the training set (accuracy denominator).
        progress_bar (tqdm, optional): Progress bar advanced by one per batch.

    Returns:
        tuple: ``(accuracy, mean_loss)`` where ``accuracy`` is a 0-dim double tensor
        on ``device`` and ``mean_loss`` is the mean of the per-batch losses
        (NaN when the loader yields no batches).
    """
    model = model.train()
    losses = []
    # A tensor accumulator keeps the return type stable even when the loader is empty
    # (a plain int 0 has no .double()).
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    for batch in data_loader:
        # Move data to device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        Sentiment = batch["Sentiment"].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs, dim=1)

        # Compute loss
        loss = loss_fn(outputs, Sentiment)
        losses.append(loss.item())

        # Backward pass and optimization; gradients are clipped to a global norm of 1.0.
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        if scheduler is not None:
            scheduler.step()
        optimizer.zero_grad()

        # Update correct predictions count
        correct_predictions += torch.sum(preds == Sentiment)

        # Compare against None: tqdm bars define __bool__, which may be False or
        # even raise depending on how the bar was constructed.
        if progress_bar is not None:
            progress_bar.update(1)

    mean_loss = np.mean(losses) if losses else np.float64("nan")
    return correct_predictions.double() / n_examples, mean_loss
In [33]:
def eval_model(model, data_loader, loss_fn, device, n_examples, progress_bar=None):
    """
    Evaluate the model on a data set without updating its weights.

    Each batch must be a mapping with ``"input_ids"``, ``"attention_mask"`` and
    ``"Sentiment"`` tensors. The model is switched to evaluation mode and left there.

    Args:
        model (torch.nn.Module): The model to evaluate; called as
            ``model(input_ids=..., attention_mask=...)`` and expected to return logits.
        data_loader (DataLoader): Iterable of evaluation batches.
        loss_fn (torch.nn.Module): Loss function applied to ``(logits, labels)``.
        device (torch.device): Device the batch tensors are moved to.
        n_examples (int): Number of examples in the set (accuracy denominator).
        progress_bar (tqdm, optional): Progress bar advanced by one per batch.

    Returns:
        tuple: ``(accuracy, mean_loss)`` where ``accuracy`` is a 0-dim double tensor
        on ``device`` and ``mean_loss`` is the mean of the per-batch losses
        (NaN when the loader yields no batches).
    """
    model = model.eval()  # Set model to evaluation mode
    losses = []
    # A tensor accumulator keeps the return type stable even when the loader is empty
    # (a plain int 0 has no .double()).
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    with torch.no_grad():  # Disable gradient computation for evaluation
        for batch in data_loader:
            # Move data to device
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            Sentiment = batch["Sentiment"].to(device)

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs, dim=1)

            # Compute loss
            loss = loss_fn(outputs, Sentiment)
            losses.append(loss.item())

            # Update correct predictions
            correct_predictions += torch.sum(preds == Sentiment)

            # Compare against None: tqdm bars define __bool__, which may be False or
            # even raise depending on how the bar was constructed.
            if progress_bar is not None:
                progress_bar.update(1)

    mean_loss = np.mean(losses) if losses else np.float64("nan")
    return correct_predictions.double() / n_examples, mean_loss
In [35]:
from tqdm import tqdm

# Per-epoch metrics, keyed by 'train_acc', 'train_loss', 'val_acc' and 'val_loss'.
history = defaultdict(list)
best_accuracy = 0

# Start training loop
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 100)

    # Train for one epoch, with a tqdm bar advanced once per batch by train_epoch.
    train_epoch_desc = f'Training Epoch {epoch + 1}/{EPOCHS}'
    with tqdm(total=len(train_data_loader), desc=train_epoch_desc, leave=False) as pbar:
        train_acc, train_loss = train_epoch(model, train_data_loader, loss_fn, optimizer, device, scheduler, len(df_train), progress_bar=pbar)
    print(f'Train loss {train_loss:.4f} accuracy {train_acc:.4f}')

    # Validate the model
    val_epoch_desc = f'Validation Epoch {epoch + 1}/{EPOCHS}'
    with tqdm(total=len(val_data_loader), desc=val_epoch_desc, leave=False) as pbar:
        val_acc, val_loss = eval_model(model, val_data_loader, loss_fn, device, len(df_val), progress_bar=pbar)
    print(f'Val loss {val_loss:.4f} accuracy {val_acc:.4f}')
    print()

    # Append metrics to history
    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    # Checkpoint whenever validation accuracy improves, so the file on disk always
    # holds the best-scoring weights seen so far.
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), 'best_model_state.bin')
        best_accuracy = val_acc
Epoch 1/3 ----------------------------------------------------------------------------------------------------
Train loss 0.8075 accuracy 0.6642
Val loss 0.7417 accuracy 0.6956 Epoch 2/3 ----------------------------------------------------------------------------------------------------
Train loss 0.6434 accuracy 0.7367
Val loss 0.7612 accuracy 0.6895 Epoch 3/3 ----------------------------------------------------------------------------------------------------
Train loss 0.5542 accuracy 0.7779
Val loss 0.8015 accuracy 0.6913
In [36]:
# The stored accuracies are tensors; bring them to host memory before plotting.
history_cpu_train_acc, history_cpu_val_acc = (
    [acc.cpu() for acc in history[metric]]
    for metric in ('train_acc', 'val_acc')
)
In [37]:
plt.plot(history_cpu_train_acc, label='train accuracy')
plt.plot(history_cpu_val_acc, label='validation accuracy')
plt.title('Training history')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.ylim([0, 1])
plt.show()
In [39]:
# Rebuild the classifier and restore the weights saved at the best validation accuracy.
model = SentimentClassifier(5)
# map_location remaps the stored tensors onto the current device, so a checkpoint
# written on a GPU also loads on a CPU-only machine.
model.load_state_dict(torch.load('best_model_state.bin', map_location=device))
model = model.to(device)
In [40]:
test_acc, _ = eval_model(
model,
test_data_loader,
loss_fn,
device,
len(df_test)
)
test_acc.item()
Out[40]:
0.7048571062411892
In [47]:
def get_predictions(model, data_loader, device=None):
    """
    Run the model over ``data_loader`` and collect its predictions.

    Each batch must be a mapping with ``"Phrase_text"``, ``"input_ids"``,
    ``"attention_mask"`` and ``"Sentiment"`` entries. The model is switched to
    evaluation mode and left there.

    Args:
        model (torch.nn.Module): Classifier returning logits of shape
            (batch_size, n_classes).
        data_loader: Iterable of batches.
        device (torch.device, optional): Device the inputs are moved to. When
            omitted, the device of the model's first parameter is used (CPU for a
            model without parameters).

    Returns:
        tuple: ``(Phrase_texts, predictions, prediction_probs, real_values)`` —
        the raw texts (list), predicted class indices, per-class probabilities
        (softmax of the logits) and true labels, the last three as CPU tensors.
        All are empty when the loader yields no batches.
    """
    # put model in evaluation mode
    model = model.eval()
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    # Per-batch results; concatenated once at the end instead of stacking row by row.
    Phrase_texts = []
    predictions = []
    prediction_probs = []
    real_values = []

    with torch.no_grad():
        for batch in data_loader:
            texts = batch["Phrase_text"]
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            Sentiment = batch["Sentiment"]

            # Perform a forward pass. This will return logits.
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            # Get the predictions
            preds = torch.argmax(outputs, dim=1)

            Phrase_texts.extend(texts)
            predictions.append(preds.cpu())
            # Normalise the logits so the returned values really are probabilities.
            prediction_probs.append(torch.softmax(outputs, dim=1).cpu())
            real_values.append(Sentiment.cpu())

    if not predictions:
        empty_labels = torch.empty(0, dtype=torch.long)
        return Phrase_texts, empty_labels, torch.empty(0), empty_labels.clone()

    return (
        Phrase_texts,
        torch.cat(predictions),
        torch.cat(prediction_probs),
        torch.cat(real_values),
    )
In [48]:
y_Phrase_texts, y_pred, y_pred_probs, y_test = get_predictions(
model,
test_data_loader
)
In [56]:
# Human-readable names for the five sentiment labels, in label order (0-4).
# The comma after 'somewhat positive' matters: without it Python concatenates the two
# adjacent string literals into a single 'somewhat positivepositive' entry.
class_names = ['negative', 'somewhat negative', 'neutral', 'somewhat positive', 'positive']
In [58]:
print(classification_report(y_test, y_pred))
precision recall f1-score support 0 0.57 0.39 0.47 365 1 0.57 0.64 0.60 1310 2 0.80 0.80 0.80 4027 3 0.62 0.65 0.64 1628 4 0.66 0.49 0.56 473 accuracy 0.70 7803 macro avg 0.64 0.59 0.61 7803 weighted avg 0.71 0.70 0.70 7803
In [59]:
def show_confusion_matrix(confusion_matrix):
    """
    Draw ``confusion_matrix`` as an annotated heatmap on the current figure.

    Args:
        confusion_matrix: 2-D table of integer counts accepted by ``sns.heatmap``;
            rows are labelled as the true sentiment and columns as the predicted one.
    """
    heatmap = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="plasma")
    # Re-apply the existing tick labels: horizontal on the y-axis, tilted 30 degrees on the x-axis.
    for axis, angle in ((heatmap.yaxis, 0), (heatmap.xaxis, 30)):
        axis.set_ticklabels(axis.get_ticklabels(), rotation=angle, ha='right')
    plt.ylabel('True sentiment')
    plt.xlabel('Predicted sentiment')
# Confusion matrix of the test-set predictions; wrapped in a DataFrame and plotted
# with true labels on the y-axis and predicted labels on the x-axis.
cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm)
show_confusion_matrix(df_cm)