This is the final submission file for the Capstone project - Eye for Blind.
A CNN-RNN based attention model has been built on the Flickr8k dataset to predict captions for random images.
The model generates captions using greedy search, and the resulting captions are evaluated using the BLEU score.
#Import all the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import keras
import glob
import random as rn
rn.seed(42)
from keras import backend as K
import tensorflow as tf
tf.random.set_seed(42)
import warnings
warnings.filterwarnings('ignore')
from collections import Counter
import string
import json
from tqdm import tqdm
import time
Let's read the dataset
1.Import the dataset and read images & captions into two separate variables
2.Visualise both the images & text present in the dataset
3.Create word-to-index and index-to-word mappings.
4.Create a dataframe which summarizes the image IDs, paths & captions
5.Visualise the top 30 occurring words in the captions
6.Create a list which contains all the captions & path
#Importing the dataset and reading the images into a separate variable
image_folder ='../input/flickr8k/Images/'
all_imgs = glob.glob(image_folder + '*.jpg',recursive=True)
print("The total images present in the dataset: {}".format(len(all_imgs)))
The total images present in the dataset: 8091
#Importing the dataset and reading the text file into a separate variable
text_file_path = '../input/flickr8k/captions.txt'
image_captions_df = pd.read_csv(text_file_path)
image_captions_df.head(10)
 | image | caption |
---|---|---|
0 | 1000268201_693b08cb0e.jpg | A child in a pink dress is climbing up a set o... |
1 | 1000268201_693b08cb0e.jpg | A girl going into a wooden building . |
2 | 1000268201_693b08cb0e.jpg | A little girl climbing into a wooden playhouse . |
3 | 1000268201_693b08cb0e.jpg | A little girl climbing the stairs to her playh... |
4 | 1000268201_693b08cb0e.jpg | A little girl in a pink dress going into a woo... |
5 | 1001773457_577c3a7d70.jpg | A black dog and a spotted dog are fighting |
6 | 1001773457_577c3a7d70.jpg | A black dog and a tri-colored dog playing with... |
7 | 1001773457_577c3a7d70.jpg | A black dog and a white dog with brown spots a... |
8 | 1001773457_577c3a7d70.jpg | Two dogs of different breeds looking at each o... |
9 | 1001773457_577c3a7d70.jpg | Two dogs on pavement moving toward each other . |
#Visualising three random images & captions present in the dataset
for i in range(3) :
# image
random_index = rn.randint(0,len(all_imgs))
image_id = image_captions_df.loc[random_index,'image']
image = plt.imread(image_folder + image_id)
plt.title('Image ID : ' +image_id )
plt.imshow(image)
plt.axis('off')
plt.show()
print('Image Shape : ', image.shape,'\n')
# captions
condition = image_captions_df['image'] == image_id
print('Captions for Image ID # ', image_id , ' : ')
print(image_captions_df.loc[condition,'caption'].values, '\n\n')
Image Shape : (500, 375, 3) Captions for Image ID # 211981411_e88b8043c2.jpg : ['A hummer is driving through a mud puddle several feet deep .' 'An old dusty car is half way in the brown water .' 'An old jeep partially submerged in water' 'Dirty car in body of water' 'The dirty vehicle is caught in a flood .']
Image Shape : (375, 500, 3) Captions for Image ID # 1224851143_33bcdd299c.jpg : ['A boy eats with a spoon .' 'A little boy holds a spoon up to his mouth .' 'A little boy is eating his food off of a spoon while sitting on a patio .' 'A small child dressed in green is eating with a spoon .' 'A young child holds a spoon to its mouth while sitting in a chair .']
Image Shape : (351, 500, 3) Captions for Image ID # 1055753357_4fa3d8d693.jpg : ['Two constructions workers sit on a beam taking a break .' 'Two construction workers are sitting up on the side of a building .' 'Two construction workers sitting on an I-beam .' 'Two construction workers take a seat on a steel beam .' 'Two men take a break from construction .']
# Creating a dataframe which summarizes the image IDs, paths & captions
all_img_id= image_captions_df['image'].values #storing all the image ids
all_img_vector= (image_folder + '/'+ image_captions_df['image']).values #storing all the image paths
annotations= image_captions_df['caption'].values #storing all the captions
df = pd.DataFrame(list(zip(all_img_id, all_img_vector,annotations)),columns =['ID','Path', 'Captions'])
pd.set_option('max_colwidth', 100)
print('No of samples : ', df.shape[0])
df.head(10)
No of samples : 40455
 | ID | Path | Captions |
---|---|---|---|
0 | 1000268201_693b08cb0e.jpg | ../input/flickr8k/Images//1000268201_693b08cb0e.jpg | A child in a pink dress is climbing up a set of stairs in an entry way . |
1 | 1000268201_693b08cb0e.jpg | ../input/flickr8k/Images//1000268201_693b08cb0e.jpg | A girl going into a wooden building . |
2 | 1000268201_693b08cb0e.jpg | ../input/flickr8k/Images//1000268201_693b08cb0e.jpg | A little girl climbing into a wooden playhouse . |
3 | 1000268201_693b08cb0e.jpg | ../input/flickr8k/Images//1000268201_693b08cb0e.jpg | A little girl climbing the stairs to her playhouse . |
4 | 1000268201_693b08cb0e.jpg | ../input/flickr8k/Images//1000268201_693b08cb0e.jpg | A little girl in a pink dress going into a wooden cabin . |
5 | 1001773457_577c3a7d70.jpg | ../input/flickr8k/Images//1001773457_577c3a7d70.jpg | A black dog and a spotted dog are fighting |
6 | 1001773457_577c3a7d70.jpg | ../input/flickr8k/Images//1001773457_577c3a7d70.jpg | A black dog and a tri-colored dog playing with each other on the road . |
7 | 1001773457_577c3a7d70.jpg | ../input/flickr8k/Images//1001773457_577c3a7d70.jpg | A black dog and a white dog with brown spots are staring at each other in the street . |
8 | 1001773457_577c3a7d70.jpg | ../input/flickr8k/Images//1001773457_577c3a7d70.jpg | Two dogs of different breeds looking at each other on the road . |
9 | 1001773457_577c3a7d70.jpg | ../input/flickr8k/Images//1001773457_577c3a7d70.jpg | Two dogs on pavement moving toward each other . |
Each image ID has 5 captions associated with it; therefore, the total dataset has 40455 samples (8091 images x 5 captions).
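This can be verified directly from the dataframe; a minimal sanity-check sketch using the df built above:
# Sanity check: every image ID should appear exactly 5 times in the dataframe
captions_per_image = df.groupby('ID').size()
print('Caption counts per image :', captions_per_image.unique())   # expected: [5]
print('Number of unique images  :', captions_per_image.shape[0])   # expected: 8091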
# Reading a random image from dataframe
row = rn.randint(0,df.shape[0]-1) # rn.randint is inclusive of the upper bound, so subtract 1 to stay within the index range
image = plt.imread(df.loc[row,'Path'])
plt.imshow(image)
print(df.loc[row,'Captions'])
A soccer player in red uniform runs after a soccer ball .
#Creating the vocabulary & the counter for the captions
# Removing all punctuations from captions
df['Captions'] = df['Captions'].apply(lambda x : ''.join(l for l in x if l not in string.punctuation))
vocabulary= [y.lower() for x in df['Captions'].values for y in x.split()]
val_count=Counter(vocabulary)
val_count.most_common(30)
[('a', 62986), ('in', 18974), ('the', 18418), ('on', 10743), ('is', 9345), ('and', 8851), ('dog', 8136), ('with', 7765), ('man', 7265), ('of', 6713), ('two', 5638), ('white', 3940), ('black', 3832), ('boy', 3581), ('are', 3504), ('woman', 3402), ('girl', 3328), ('to', 3173), ('wearing', 3062), ('at', 2914), ('people', 2883), ('water', 2783), ('red', 2672), ('young', 2630), ('brown', 2563), ('an', 2432), ('his', 2357), ('blue', 2268), ('dogs', 2125), ('running', 2073)]
#Visualising the top 30 occurring words in the captions
top30 = val_count.most_common(30)
words = []
counts = []
for word_count in top30 :
words.append(word_count[0])
counts.append(word_count[1])
plt.figure(figsize=(20,10))
plt.title('Top 30 words in the vocabulary')
plt.xlabel('word')
plt.ylabel('count')
plot = sns.barplot(x=words, y=counts)
for p in plot.patches:
plot.annotate(format(int(p.get_height())),
(p.get_x() + p.get_width() / 2., p.get_height()),
ha = 'center', va = 'center',
xytext = (0, 9),
textcoords = 'offset points')
plt.show()
#Creating a list which contains all the captions and adding the <start> & <end> token to all those captions.
annotations= (df['Captions'].apply(lambda x : '<start> '+ x + ' <end>')).values
#Creating a list which contains all the path to the images
all_img_path= df['Path'].values
print("Total captions present in the dataset: "+ str(len(annotations)))
print("Total images present in the dataset: " + str(len(all_img_path)))
Total captions present in the dataset: 40455 Total images present in the dataset: 40455
1.Create the tokenized vectors by tokenizing the captions, for example by splitting them on spaces & other filters. This gives us a vocabulary of all the unique words in the data. Keep the vocabulary to the top 5,000 words to save memory.
2.Replace all other words with the unknown token "<unk>".
3.Create word-to-index and index-to-word mappings.
4.Pad all sequences to be the same length as the longest one.
# creating the tokenizer
tokenizer = keras.preprocessing.text.Tokenizer(
num_words = 5000,
filters = '!"#$%&()*+.,-/:;=?@[\]^_`{|}~ ',
lower = True,
split = " ",
char_level = False,
oov_token = '<unk>',)
# fitting the tokenizer on words in the dataset
tokenizer.fit_on_texts(annotations)
# Converting sentences to sequences of word token indexes
caption_sequences = tokenizer.texts_to_sequences(annotations)
caption_sequences[:10]
[[3, 2, 43, 5, 2, 91, 171, 8, 120, 54, 2, 396, 13, 393, 5, 29, 1, 694, 4], [3, 2, 20, 315, 65, 2, 195, 118, 4], [3, 2, 41, 20, 120, 65, 2, 195, 2432, 4], [3, 2, 41, 20, 120, 6, 393, 21, 61, 2432, 4], [3, 2, 41, 20, 5, 2, 91, 171, 315, 65, 2, 195, 2995, 4], [3, 2, 16, 10, 9, 2, 853, 10, 18, 344, 4], [3, 2, 16, 10, 9, 2, 1563, 10, 35, 11, 138, 83, 7, 6, 156, 4], [3, 2, 16, 10, 9, 2, 15, 10, 11, 28, 998, 18, 638, 23, 138, 83, 5, 6, 73, 4], [3, 14, 32, 13, 739, 2650, 89, 23, 138, 83, 7, 6, 156, 4], [3, 14, 32, 7, 721, 797, 320, 138, 83, 4]]
# Creating word-to-index and index-to-word mappings.
word2index = json.loads(tokenizer.get_config()['word_index'])
index2word = json.loads(tokenizer.get_config()['index_word'])
index2word = {int(key) : value for key,value in index2word.items()}
print('Index for the word ',"<start> is", word2index["<start>"])
print('Word for the index : 3 is ', index2word[3])
Index for the word <start> is 3 Word for the index : 3 is <start>
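The num_words limit together with the oov_token means that any word outside the fitted top-5,000 vocabulary is mapped to the <unk> index when new text is converted to sequences. A small hedged check with a made-up sentence (whether a given word keeps its own index depends on the Flickr8k vocabulary):
# Hypothetical example sentence: words kept in the top-5000 vocabulary keep their index,
# anything else is mapped to the <unk> token (index 1)
sample_seq = tokenizer.texts_to_sequences(['<start> a man riding a unicycle on the moon <end>'])[0]
print(sample_seq)
print([index2word[i] for i in sample_seq])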
# Creating a word count from the tokenizer to visualize the top 30 occurring words after text processing
word_counts = tokenizer.get_config()['word_counts']
word_counts_df = pd.DataFrame.from_dict(data = json.loads(word_counts), orient='index', columns=['count'])
top_30 = word_counts_df.sort_values(by='count', ascending=False)[:30]
plt.figure(figsize=(20,10))
plt.title('Top 30 words in the vocabulary')
plt.xlabel('word')
plt.ylabel('count')
plot = sns.barplot(x=top_30.index , y=top_30['count'])
for p in plot.patches:
plot.annotate(format(int(p.get_height())),
(p.get_x() + p.get_width() / 2., p.get_height()),
ha = 'center', va = 'center',
xytext = (0, 9),
textcoords = 'offset points')
plt.show()
# Padding each caption vector to the max_length of the captions and storing it in a variable
max_length = max([len(caption_sequence) for caption_sequence in caption_sequences])
print('Max Length of Caption :', max_length)
# padding all caption sequences.
cap_vector = tf.keras.preprocessing.sequence.pad_sequences(
caption_sequences, padding="post"
)
print("The shape of Caption vector is :" + str(cap_vector.shape))
Max Length of Caption : 38 The shape of Caption vector is :(40455, 38)
# Adding padding index to tokenizer
tokenizer.word_index['<pad>'] = 0
tokenizer.index_word[0] = '<pad>'
!mkdir Features # Creating a folder to store features
with tf.device('/GPU:0'):
# Inceptionv3 Model with ImageNet weights
def load_inception_model() :
image_model = tf.keras.applications.InceptionV3(include_top=False,weights='imagenet')
new_input = image_model.input # input of the image_model
hidden_layer = image_model.layers[-1].output # output of the image_model
return tf.keras.Model(inputs=new_input, outputs=hidden_layer) # final model using both input & output layer
image_features_extract_model = load_inception_model()
def extract_features_and_cache(path) :
# Reading Image
image = tf.io.read_file(bytes.decode(path.numpy()))
image = tf.image.decode_jpeg(image)
image = tf.image.convert_image_dtype(image, tf.float32)
# Resizing the Image for Inceptionv3
image = tf.image.resize(image, [299, 299])
# Preprocessing for Inceptionv3
preprocessed_for_inception = tf.keras.applications.inception_v3.preprocess_input(image) # normalization (-1,1)
preprocessed_for_inception = tf.expand_dims(preprocessed_for_inception, 0)
# Extracting Image Features
features = image_features_extract_model(preprocessed_for_inception)
extracted_features = tf.reshape(features, (-1, tf.shape(features)[-1]))
# Caching Image Features
image_name = (bytes.decode(path.numpy())).split('/')[-1]
feature_path = './Features/' + image_name.split('.')[0] + '_features_.npy'
np.save(feature_path, extracted_features)
return feature_path
# Creating a tf.data.Dataset with the unique images in the dataset
unique_images = list(set(all_img_path))
image_dataset = tf.data.Dataset.from_tensor_slices(unique_images)
# Running feature extraction and caching
for path in tqdm(image_dataset) :
extract_features_and_cache(path)
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5 87916544/87910968 [==============================] - 1s 0us/step
100%|██████████| 8091/8091 [11:09<00:00, 12.09it/s]
# Checking if features of all images are extracted
assert len(glob.glob('./Features/*')) == len(set(all_img_path))
# Function to load cached extracted features from disk
def load_features(path) :
image_name = (bytes.decode(path.numpy())).split('/')[-1]
feature_path = './Features/' + image_name.split('.')[0] + '_features_.npy'
features = np.load(feature_path)
features = tf.reshape(features, (-1, tf.shape(features)[-1]))
return features
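A quick check on one cached file (a sketch; assumes the feature extraction above completed): each saved feature map should contain 64 spatial locations (the 8x8 InceptionV3 grid), each with 2048 channels.
# Loading one cached feature map and verifying its shape
sample_features = load_features(tf.constant(unique_images[0]))
print('Cached feature shape :', sample_features.shape)   # expected: (64, 2048)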
BATCH_SIZE = 256
BUFFER_SIZE = 1024
dataset_size = all_img_path.shape[0]
train_size = int(0.8 * dataset_size)
train_img_paths, train_cap_vector = all_img_path[:train_size], cap_vector[:train_size]
test_img_paths, test_cap_vector = all_img_path[train_size:], cap_vector[train_size:]
train_dataset = tf.data.Dataset.from_tensor_slices((train_img_paths, train_cap_vector))
train_dataset = train_dataset.shuffle(BUFFER_SIZE, seed=42, reshuffle_each_iteration=True)
## Replacing image paths with the features extracted from InceptionV3
train_dataset = train_dataset.map(lambda path, caption : (tf.py_function(load_features, [path],tf.float32),caption), num_parallel_calls=tf.data.AUTOTUNE).batch(BATCH_SIZE, drop_remainder=True)
train_dataset = train_dataset.prefetch(buffer_size=tf.data.AUTOTUNE)
test_dataset = tf.data.Dataset.from_tensor_slices((test_img_paths, test_cap_vector))
## Replacing image paths with the features extracted from InceptionV3
test_dataset = test_dataset.map(lambda path, caption : (tf.py_function(load_features, [path],tf.float32),caption), num_parallel_calls=tf.data.AUTOTUNE).batch(BATCH_SIZE, drop_remainder=True)
test_dataset = test_dataset.prefetch(buffer_size=tf.data.AUTOTUNE)
train_dataset = train_dataset.cache('./train') # caching the mapped datasets on disk; the returned dataset must be reassigned for the cache to take effect
test_dataset = test_dataset.cache('./test')
# image, caption check
index = 6010
path = test_img_paths[index]
image = plt.imread(path)
plt.imshow(image)
print(test_cap_vector[index])
print([tokenizer.index_word[id] for id in test_cap_vector[index]])
[ 3 14 32 843 21 402 6 1 26 177 76 6 83 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] ['<start>', 'two', 'dogs', 'try', 'to', 'get', 'the', '<unk>', 'red', 'frisbee', 'from', 'the', 'other', '<end>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
# Features and Caption shapes check
sample_img_feature_batch, sample_cap_batch = next(iter(train_dataset))
print(sample_img_feature_batch.shape) #(batch_size, 64, 2048)
print(sample_cap_batch.shape) #(batch_size, max_len)
(256, 64, 2048) (256, 38)
Setting the model parameters
Building the Encoder, Attention model & Decoder
embedding_dim = 256
units = 512
vocab_size = 5001 #top 5,000 words + 1 for the <pad> token at index 0
train_num_steps = train_size // BATCH_SIZE
test_num_steps = (dataset_size - train_size) // BATCH_SIZE
print('number of training steps : ', train_num_steps)
print('number of test steps : ', test_num_steps)
number of training steps : 126 number of test steps : 31
with tf.device('/GPU:0') :
Model = tf.keras.Model
class Encoder(Model):
def __init__(self,embed_dim=embedding_dim):
super(Encoder, self).__init__()
self.dense = tf.keras.layers.Dense(embed_dim , activation='relu') #building Dense layer with relu activation
def call(self, x):
features = self.dense(x) # projecting the extracted image features to the embedding space; shape: (batch, 8*8, embed_dim)
return features
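The attention module below implements Bahdanau-style (additive) attention. In the notation of the code, with $f_i$ one of the 64 encoder feature vectors and $h_{t-1}$ the previous decoder hidden state, the score, weights and context vector computed by the layers W1, W2 and V are (a sketch of the equations the code realises):
$$e_{t,i} = V\,\tanh\big(W_1 f_i + W_2 h_{t-1}\big), \qquad \alpha_{t,i} = \frac{\exp(e_{t,i})}{\sum_{j=1}^{64} \exp(e_{t,j})}, \qquad c_t = \sum_{i=1}^{64} \alpha_{t,i}\, f_i$$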
with tf.device('/GPU:0') :
class Attention_model(Model):
def __init__(self, units):
super(Attention_model, self).__init__()
self.units=units
self.W1 = tf.keras.layers.Dense(self.units)#Dense layer
self.W2 = tf.keras.layers.Dense(self.units) #Dense layer
self.V = tf.keras.layers.Dense(1) #Final Dense layer with unit 1
def call(self, features, hidden):
hidden_with_time_axis = tf.expand_dims(hidden,1) # Expanding the hidden shape to shape: (batch_size, 1, hidden_size)
attention_hidden_layer = (tf.nn.tanh(self.W1(features) + self.W2(hidden_with_time_axis))) # attention_hidden_layer.shape = (BATCH_SIZE, 64, units)
score = self.V(attention_hidden_layer)
attention_weights = tf.nn.softmax(score, axis=1)# extracting attention weights with shape: (batch_size, 8*8, 1)
context_vector = attention_weights * features #Creating context vector with shape (batch_size, 8*8,embedding_dim)
context_vector = tf.reduce_sum(context_vector, axis=1) # reducing the shape to (batch_size, embedding_dim)
return context_vector, attention_weights
with tf.device('/GPU:0') :
class Decoder(Model):
def __init__(self, embed_dim, units, vocab_size):
super(Decoder, self).__init__()
self.units=units
self.attention = Attention_model(self.units) #initializing the Attention model with units
self.embed = tf.keras.layers.Embedding(vocab_size, embed_dim) #building an Embedding layer
self.gru = tf.keras.layers.GRU(self.units,return_sequences=True,return_state=True,recurrent_initializer='glorot_uniform') # RNN
self.d1 = tf.keras.layers.Dense(self.units) #Dense layer
self.d2 = tf.keras.layers.Dense(vocab_size) #Dense layer
def call(self,x,features, hidden):
context_vector, attention_weights = self.attention(features,hidden) #creating context vector & attention weights from attention model
x = self.embed(x) # embedding input to shape: (batch_size, 1, embedding_dim)
x = tf.concat([tf.expand_dims(context_vector,axis=1) ,x], axis=-1)# Concatenating input with the context vector from attention layer. Shape: (batch_size, 1, embedding_dim + embedding_dim)
output,state = self.gru(x) # Extracting the output & hidden state from the GRU layer. Output shape : (batch_size, 1, units) since one time step is decoded per call
x = self.d1(output)
x = tf.reshape(x, (-1, x.shape[2])) # shape : (batch_size, units)
x = self.d2(x) # shape : (batch_size, vocab_size)
return x, state, attention_weights
def init_state(self, batch_size):
return tf.zeros((batch_size, self.units))
# Encoder and Decoder Object instantiation
with tf.device('/GPU:0') :
encoder = Encoder(embedding_dim)
decoder=Decoder(embedding_dim, units, vocab_size)
# Encoder and decoder output shapes check
features=encoder(sample_img_feature_batch)
hidden = decoder.init_state(batch_size=sample_cap_batch.shape[0])
dec_input = tf.expand_dims([tokenizer.word_index['<start>']] * sample_cap_batch.shape[0], 1)
predictions, hidden_out, attention_weights= decoder(dec_input, features, hidden)
print('Feature shape from Encoder: {}'.format(features.shape)) #shape = (batch, 8*8, embed_dim)
assert features.shape == (BATCH_SIZE, 8*8, embedding_dim)
print('Predictions shape from Decoder: {}'.format(predictions.shape)) #shape = (batch,vocab_size)
assert predictions.shape == (BATCH_SIZE,vocab_size)
print('Attention weights shape from Decoder: {}'.format(attention_weights.shape)) #shape = (batch, 8*8, 1)
assert attention_weights.shape == (BATCH_SIZE, 64, 1)
Feature shape from Encoder: (256, 64, 256) Predictions shape from Decoder: (256, 5001) Attention weights shape from Decoder: (256, 64, 1)
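Since the attention weights come from a softmax over the 64 spatial locations, they should sum to approximately one for every image in the batch; a quick hedged check on the sample batch above:
# Attention weights are a softmax over the 64 locations, so each row should sum to ~1
weight_sums = tf.reduce_sum(attention_weights, axis=1)   # shape: (256, 1)
print('min sum :', tf.reduce_min(weight_sums).numpy(), ' max sum :', tf.reduce_max(weight_sums).numpy())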
# Optimiser and loss object
optimizer = tf.keras.optimizers.Adam() #defining the optimizer
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none') #defining loss object
# Custom loss function
def loss_function(real, pred):
mask = tf.math.logical_not(tf.math.equal(real, 0)) # masking padding sequences
loss_ = loss_object(real, pred)
mask = tf.cast(mask, dtype=loss_.dtype)
loss_ *= mask
return tf.reduce_mean(loss_) # mean over the batch; masked <pad> positions contribute zero
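A tiny worked example of the masking (a sketch with made-up logits): for uniform logits every token contributes about log(vocab_size) ≈ 8.5 to the unmasked loss, but the <pad> position is zeroed out before averaging, so the masked loss is roughly half of that.
# Two target tokens: one real word (index 2) and one <pad> (index 0)
toy_real = tf.constant([2, 0])
toy_pred = tf.zeros((2, vocab_size))             # uniform logits over the vocabulary
print(loss_object(toy_real, toy_pred).numpy())   # both per-token losses ~= log(5001) ~= 8.52
print(loss_function(toy_real, toy_pred).numpy()) # ~= 8.52 / 2 : the <pad> loss is masked out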
# Check point manager
checkpoint_path = "./checkpoints/train_dataset/"
ckpt = tf.train.Checkpoint(encoder=encoder,
decoder=decoder,
optimizer = optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)
start_epoch = 0
if ckpt_manager.latest_checkpoint:
start_epoch = int(ckpt_manager.latest_checkpoint.split('-')[-1]) # retrieve last epoch from saved checkpoints
ckpt.restore(ckpt_manager.latest_checkpoint)
# Custom Train Step
@tf.function
def train_step(img_tensor, target) :
loss = 0
hidden = decoder.init_state(batch_size=target.shape[0])
dec_input = tf.expand_dims([tokenizer.word_index['<start>']] * target.shape[0], 1)
with tf.GradientTape() as tape:
#training steps
# Encoder
features=encoder(img_tensor,training =True)
# Decoder
for i in range(1,target.shape[1]) :
predictions, hidden, _ = decoder(dec_input, features, hidden)
loss += loss_function(target[:,i], predictions)
dec_input = tf.expand_dims(target[:,i],1)
# backpropagation
avg_loss = (loss / int(target.shape[1]))
trainable_variables = encoder.trainable_weights + decoder.trainable_weights
gradients = tape.gradient(loss,trainable_variables)
optimizer.apply_gradients(zip(gradients,trainable_variables))
return loss, avg_loss
# Custom Test Step
@tf.function
def test_step(img_tensor, target):
loss = 0
#testing steps
hidden = decoder.init_state(batch_size=target.shape[0])
dec_input = tf.expand_dims([tokenizer.word_index['<start>']] * target.shape[0], 1)
features=encoder(img_tensor)
for i in range(1,target.shape[1]) :
predictions, hidden, _ = decoder(dec_input, features, hidden)
loss += loss_function(target[:,i], predictions)
predicted_ids = tf.argmax(predictions, axis=1, output_type=tf.int32) # greedy token for every sample in the batch (not just the first one)
dec_input = tf.expand_dims(predicted_ids, 1)
avg_loss = (loss / int(target.shape[1]))
return loss, avg_loss
with tf.device('/GPU:0') :
def test_loss_cal(test_dataset):
total_loss = 0
# Iterating over the test dataset and averaging the per-batch loss
for img_tensor, target in test_dataset :
batch_loss , t_loss = test_step(img_tensor, target)
total_loss += t_loss
avg_test_loss = total_loss / test_num_steps # average per-token loss over the number of test batches
return avg_test_loss
loss_plot = []
test_loss_plot = []
EPOCHS = 15
best_test_loss=100
with tf.device('/GPU:0'):
for epoch in range(start_epoch, EPOCHS):
print('EPOCH :',epoch+1,' of ', EPOCHS)
start = time.time()
total_loss = 0
for (batch, (img_tensor, target)) in enumerate(tqdm(train_dataset)):
batch_loss, t_loss = train_step(img_tensor, target)
total_loss += t_loss
# if batch % 100 == 0:
# average_batch_loss = total_batch_loss.numpy()/int(target.shape[1])
# print(f'Epoch {epoch+1} Batch {batch} Loss {average_batch_loss:.4f}')
avg_train_loss=total_loss / train_num_steps
loss_plot.append(avg_train_loss)
test_loss = test_loss_cal(test_dataset)
test_loss_plot.append(test_loss)
print ('For epoch: {}, the train loss is {:.3f}, & test loss is {:.3f}'.format(epoch+1,avg_train_loss,test_loss))
print ('Time taken for 1 epoch {:.3f} sec\n'.format(time.time() - start))
if test_loss < best_test_loss:
print('Test loss has been reduced from %.3f to %.3f' % (best_test_loss, test_loss))
best_test_loss = test_loss
ckpt_manager.save()
0%| | 0/126 [00:00<?, ?it/s]
EPOCH : 1 of 15
100%|██████████| 126/126 [01:59<00:00, 1.06it/s] 0%| | 0/126 [00:00<?, ?it/s]
For epoch: 1, the train loss is 0.991, & test loss is 1.919 Time taken for 1 epoch 150.363 sec Test loss has been reduced from 100.000 to 1.919 EPOCH : 2 of 15
100%|██████████| 126/126 [01:34<00:00, 1.33it/s] 0%| | 0/126 [00:00<?, ?it/s]
For epoch: 2, the train loss is 0.969, & test loss is 1.950 Time taken for 1 epoch 112.153 sec EPOCH : 3 of 15
100%|██████████| 126/126 [01:35<00:00, 1.32it/s] 0%| | 0/126 [00:00<?, ?it/s]
For epoch: 3, the train loss is 0.949, & test loss is 1.973 Time taken for 1 epoch 113.212 sec EPOCH : 4 of 15
100%|██████████| 126/126 [01:34<00:00, 1.34it/s] 0%| | 0/126 [00:00<?, ?it/s]
For epoch: 4, the train loss is 0.930, & test loss is 1.969 Time taken for 1 epoch 111.738 sec EPOCH : 5 of 15
100%|██████████| 126/126 [01:35<00:00, 1.33it/s] 0%| | 0/126 [00:00<?, ?it/s]
For epoch: 5, the train loss is 0.913, & test loss is 1.934 Time taken for 1 epoch 113.022 sec EPOCH : 6 of 15
100%|██████████| 126/126 [01:35<00:00, 1.32it/s] 0%| | 0/126 [00:00<?, ?it/s]
For epoch: 6, the train loss is 0.898, & test loss is 1.925 Time taken for 1 epoch 112.787 sec EPOCH : 7 of 15
100%|██████████| 126/126 [01:34<00:00, 1.33it/s] 0%| | 0/126 [00:00<?, ?it/s]
For epoch: 7, the train loss is 0.883, & test loss is 1.979 Time taken for 1 epoch 112.311 sec EPOCH : 8 of 15
100%|██████████| 126/126 [01:34<00:00, 1.33it/s] 0%| | 0/126 [00:00<?, ?it/s]
For epoch: 8, the train loss is 0.868, & test loss is 1.960 Time taken for 1 epoch 112.796 sec EPOCH : 9 of 15
100%|██████████| 126/126 [01:33<00:00, 1.34it/s] 0%| | 0/126 [00:00<?, ?it/s]
For epoch: 9, the train loss is 0.854, & test loss is 1.987 Time taken for 1 epoch 111.973 sec EPOCH : 10 of 15
100%|██████████| 126/126 [01:34<00:00, 1.33it/s] 0%| | 0/126 [00:00<?, ?it/s]
For epoch: 10, the train loss is 0.840, & test loss is 2.040 Time taken for 1 epoch 112.470 sec EPOCH : 11 of 15
100%|██████████| 126/126 [01:35<00:00, 1.33it/s] 0%| | 0/126 [00:00<?, ?it/s]
For epoch: 11, the train loss is 0.829, & test loss is 2.028 Time taken for 1 epoch 112.486 sec EPOCH : 12 of 15
100%|██████████| 126/126 [01:34<00:00, 1.33it/s] 0%| | 0/126 [00:00<?, ?it/s]
For epoch: 12, the train loss is 0.817, & test loss is 2.055 Time taken for 1 epoch 112.844 sec EPOCH : 13 of 15
100%|██████████| 126/126 [01:35<00:00, 1.33it/s] 0%| | 0/126 [00:00<?, ?it/s]
For epoch: 13, the train loss is 0.803, & test loss is 2.050 Time taken for 1 epoch 112.752 sec EPOCH : 14 of 15
100%|██████████| 126/126 [01:34<00:00, 1.33it/s] 0%| | 0/126 [00:00<?, ?it/s]
For epoch: 14, the train loss is 0.791, & test loss is 2.088 Time taken for 1 epoch 112.555 sec EPOCH : 15 of 15
100%|██████████| 126/126 [01:35<00:00, 1.33it/s]
For epoch: 15, the train loss is 0.779, & test loss is 2.091 Time taken for 1 epoch 112.626 sec
# Loss Plot
plt.plot(loss_plot, label='train loss')
plt.plot(test_loss_plot, label='test loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Loss Plot')
plt.legend()
plt.show()
1.Define your evaluation function using greedy search
2.Define your evaluation function using beam search (optional) - a hedged beam-search sketch is included after the greedy evaluate function below
3.Test it on sample data using the BLEU score
# Model evaluation using greedy search
def evaluate(path):
attention_plot = np.zeros((max_length, 64)) # 64 attention locations (the 8x8 InceptionV3 feature grid)
hidden = decoder.init_state(batch_size=1)
# Reading Image
image = tf.io.read_file(path)
image = tf.image.decode_jpeg(image)
image = tf.image.convert_image_dtype(image, tf.float32)
# Resizing the Image for Inceptionv3
image = tf.image.resize(image, [299, 299])
# Preprocessing for Inceptionv3
preprocessed_for_inception = tf.keras.applications.inception_v3.preprocess_input(image) # normalization (-1,1)
preprocessed_for_inception = tf.expand_dims(preprocessed_for_inception, 0)
# Extracting Image Features
features = image_features_extract_model(preprocessed_for_inception)
extracted_features = tf.reshape(features, (features.shape[0], -1, features.shape[3]))
features = encoder(extracted_features) # extract the features by passing the input to encoder
dec_input = tf.expand_dims([tokenizer.word_index['<start>']], 0)
result = []
for i in range(max_length):
predictions, hidden, attention_weights = decoder(dec_input, features, hidden) # get the output from decoder
attention_plot[i] = tf.reshape(attention_weights, (-1, )).numpy()
predicted_id = tf.argmax(predictions[0]).numpy() # greedy search: pick the most probable next word
result.append(tokenizer.index_word[predicted_id])
if tokenizer.index_word[predicted_id] == '<end>':
return result, attention_plot,predictions
dec_input = tf.expand_dims([predicted_id], 0)
attention_plot = attention_plot[:len(result), :]
return result, attention_plot,predictions
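For the optional item 2 above, a beam-search decoder can reuse the same encoder, decoder and tokenizer. The following is a minimal hedged sketch (not part of the trained pipeline): beam_width is a free parameter, log-probabilities are accumulated per partial caption, and the best completed caption is returned as a list of words.
def evaluate_beam_search(path, beam_width=3):
    # Image preprocessing identical to the greedy evaluate() above
    image = tf.io.read_file(path)
    image = tf.image.decode_jpeg(image)
    image = tf.image.convert_image_dtype(image, tf.float32)
    image = tf.image.resize(image, [299, 299])
    image = tf.keras.applications.inception_v3.preprocess_input(image)
    features = image_features_extract_model(tf.expand_dims(image, 0))
    features = tf.reshape(features, (features.shape[0], -1, features.shape[3]))
    features = encoder(features)
    start_id = tokenizer.word_index['<start>']
    end_id = tokenizer.word_index['<end>']
    # Each beam is (token ids so far, cumulative log-probability, decoder hidden state)
    beams = [([start_id], 0.0, decoder.init_state(batch_size=1))]
    completed = []
    for _ in range(max_length):
        candidates = []
        for tokens, log_prob, hidden in beams:
            dec_input = tf.expand_dims([tokens[-1]], 0)
            predictions, new_hidden, _ = decoder(dec_input, features, hidden)
            log_probs = tf.nn.log_softmax(predictions[0]).numpy()
            for idx in np.argsort(log_probs)[-beam_width:]:
                candidates.append((tokens + [int(idx)], log_prob + log_probs[idx], new_hidden))
        # Keep only the best beam_width partial captions, moving finished ones aside
        candidates.sort(key=lambda c: c[1], reverse=True)
        beams = []
        for tokens, log_prob, hidden in candidates[:beam_width]:
            if tokens[-1] == end_id:
                completed.append((tokens, log_prob))
            else:
                beams.append((tokens, log_prob, hidden))
        if not beams:
            break
    if not completed:
        completed = [(tokens, log_prob) for tokens, log_prob, _ in beams]
    best_tokens, _ = max(completed, key=lambda c: c[1])
    words = [tokenizer.index_word[i] for i in best_tokens[1:]]
    return words[:-1] if words and words[-1] == '<end>' else words
With beam_width=1 this reduces to greedy decoding, so evaluate_beam_search(test_image) should behave comparably to the evaluate() function above.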
# Plotting images with attention weights at each time step
from PIL import Image
def plot_attmap(caption, weights, image):
fig = plt.figure(figsize=(30, 30))
temp_img = np.array(Image.open(image))
len_cap = len(caption)
grid_size = int(np.ceil(np.sqrt(len_cap))) # a square grid with enough subplot cells for every decoded word
for cap in range(len_cap):
weights_img = np.reshape(weights[cap], (8,8))
weights_img = np.array(Image.fromarray(weights_img).resize((224, 224), Image.LANCZOS))
ax = fig.add_subplot(grid_size, grid_size, cap+1)
ax.set_title(caption[cap], fontsize=15)
img=ax.imshow(temp_img)
ax.imshow(weights_img, cmap='gist_heat', alpha=0.6,extent=img.get_extent())
ax.axis('off')
plt.subplots_adjust(hspace=0.2, wspace=0.2)
plt.show()
from nltk.translate.bleu_score import sentence_bleu
def filt_text(text):
filt=['<start>','<unk>','<end>']
temp= text.split()
temp = [word for word in temp if word.strip() not in filt]
text=' '.join(temp)
return text
# Greedy search evaluation on a random test image & its reference caption
rid = np.random.randint(0, len(test_img_paths))
test_image = test_img_paths[rid]
real_caption = test_cap_vector[rid]
real_caption = ' '.join([tokenizer.index_word[i] for i in test_cap_vector[rid] if i not in [0]])
result, attention_plot,pred_test = evaluate(test_image)
real_caption=filt_text(real_caption)
pred_caption=' '.join(result).rsplit(' ', 1)[0]
real_appn = []
real_appn.append(real_caption.split())
reference = real_appn
candidate = pred_caption.split()
score = sentence_bleu(reference, candidate, weights=(0,0,1,0))
print(f"BLEU score: {score*100}")
print('Real Caption:', real_caption)
print('Prediction Caption:', pred_caption)
plot_attmap(result, attention_plot, test_image)
Image.open(test_image)
BLEU score: 100.0 Real Caption: a man racing on a motorbike Prediction Caption: two motocross bikers one dirt biker making a skate ramp
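Since every image in Flickr8k has five reference captions, the BLEU score can also be computed against all of them, which is usually more informative than a single reference. A hedged sketch reusing the df frame and the prediction above (the BLEU-2 weights here are an arbitrary illustrative choice):
# Scoring the same prediction against all five reference captions of the test image
references = [filt_text(cap.lower()).split() for cap in df.loc[df['Path'] == test_image, 'Captions']]
candidate_tokens = [word.lower() for word in pred_caption.split()]
multi_ref_score = sentence_bleu(references, candidate_tokens, weights=(0.5, 0.5, 0, 0))
print(f"Multi-reference BLEU-2 score: {multi_ref_score * 100:.2f}")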