#Import all the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import keras
import glob
import random as rn 
rn.seed(42)
from keras import backend as K
import tensorflow as tf
tf.random.set_seed(42)

import warnings
warnings.filterwarnings('ignore')

from collections import Counter
import string
import json
from tqdm import tqdm
import time


#Importing the dataset and reading the image into a seperate variable

image_folder ='../input/flickr8k/Images/'  

all_imgs = glob.glob(image_folder + '*.jpg',recursive=True)
print("The total images present in the dataset: {}".format(len(all_imgs)))

The total images present in the dataset: 8091


#Importing the dataset and read the text file into a seperate variable

text_file_path = '../input/flickr8k/captions.txt'
image_captions_df = pd.read_csv(text_file_path)

image_captions_df.head(10)


#Visualising three random images & captions present in the dataset

for i in range(3) : 
    # image
    random_index = rn.randint(0,len(all_imgs))
    image_id = image_captions_df.loc[random_index,'image']
    image = plt.imread(image_folder + image_id)
    plt.title('Image ID : ' +image_id )
    plt.imshow(image)
    plt.axis('off')
    plt.show()
    print('Image Shape : ', image.shape,'\n')
    
    # captions
    condition = image_captions_df['image'] == image_id
    print('Captions for Image ID # ', image_id , ' : ')
    print(image_captions_df.loc[condition,'caption'].values, '\n\n')

Image Shape :  (500, 375, 3) 

Captions for Image ID #  211981411_e88b8043c2.jpg  : 
['A hummer is driving through a mud puddle several feet deep .'
 'An old dusty car is half way in the brown water .'
 'An old jeep partially submerged in water' 'Dirty car in body of water'
 'The dirty vehicle is caught in a flood .']

Image Shape :  (375, 500, 3) 

Captions for Image ID #  1224851143_33bcdd299c.jpg  : 
['A boy eats with a spoon .'
 'A little boy holds a spoon up to his mouth .'
 'A little boy is eating his food off of a spoon while sitting on a patio .'
 'A small child dressed in green is eating with a spoon .'
 'A young child holds a spoon to its mouth while sitting in a chair .']

Image Shape :  (351, 500, 3) 

Captions for Image ID #  1055753357_4fa3d8d693.jpg  : 
['Two constructions workers sit on a beam taking a break .'
 'Two construction workers are sitting up on the side of a building .'
 'Two construction workers sitting on an I-beam .'
 'Two construction workers take a seat on a steel beam .'
 'Two men take a break from construction .']


# Creating a dataframe which summarizes the image, path & captions as a dataframe

all_img_id= image_captions_df['image'].values #storing all the image ids
all_img_vector= (image_folder + '/'+ image_captions_df['image']).values  #storing all the image paths
annotations= image_captions_df['caption'].values #storing all the captions 

df = pd.DataFrame(list(zip(all_img_id, all_img_vector,annotations)),columns =['ID','Path', 'Captions']) 
pd.set_option('max_colwidth', 100)

print('No of samples : ', df.shape[0])
df.head(10)

No of samples :  40455


# Reading a random image from dataframe
row = rn.randint(0,df.shape[0])
image = plt.imread(df.loc[row,'Path'])
plt.imshow(image)

print(df.loc[row,'Captions'])

A soccer player in red uniform runs after a soccer ball .


#Creating the vocabulary & the counter for the captions

# Removing all punctuations from captions 
df['Captions'] = df['Captions'].apply(lambda x : ''.join(l for l in x if l not in string.punctuation))

vocabulary= [y.lower() for x in df['Captions'].values for y in x.split()] 

val_count=Counter(vocabulary)
val_count.most_common(30)

[('a', 62986),
 ('in', 18974),
 ('the', 18418),
 ('on', 10743),
 ('is', 9345),
 ('and', 8851),
 ('dog', 8136),
 ('with', 7765),
 ('man', 7265),
 ('of', 6713),
 ('two', 5638),
 ('white', 3940),
 ('black', 3832),
 ('boy', 3581),
 ('are', 3504),
 ('woman', 3402),
 ('girl', 3328),
 ('to', 3173),
 ('wearing', 3062),
 ('at', 2914),
 ('people', 2883),
 ('water', 2783),
 ('red', 2672),
 ('young', 2630),
 ('brown', 2563),
 ('an', 2432),
 ('his', 2357),
 ('blue', 2268),
 ('dogs', 2125),
 ('running', 2073)]


#Visualising the top 30 occuring words in the captions
top30 = val_count.most_common(30)

words = []
counts = []
for word_count in top30 : 
    words.append(word_count[0])
    counts.append(word_count[1])

plt.figure(figsize=(20,10))
plt.title('Top 30 words in the vocabulary')
plt.xlabel('word')
plt.ylabel('count')
plot = sns.barplot(x=words, y=counts)
for p in plot.patches:
    plot.annotate(format(int(p.get_height())), 
                   (p.get_x() + p.get_width() / 2., p.get_height()), 
                   ha = 'center', va = 'center', 
                   xytext = (0, 9), 
                   textcoords = 'offset points')
plt.show()


#Creating a list which contains all the captions and adding the <start> & <end> token to all those captions.
annotations= (df['Captions'].apply(lambda x : '<start> '+ x + ' <end>')).values

#Creating a list which contains all the path to the images
all_img_path= df['Path'].values

print("Total captions present in the dataset: "+ str(len(annotations)))
print("Total images present in the dataset: " + str(len(all_img_path)))

Total captions present in the dataset: 40455
Total images present in the dataset: 40455


# creating the tokenizer

tokenizer = keras.preprocessing.text.Tokenizer(
num_words = 5000, 
filters = '!"#$%&()*+.,-/:;=?@[\]^_`{|}~ ',
lower = True, 
split = " ", 
char_level = False, 
oov_token = '<unk>',)

# fitting the tokenizer on words in the dataset
tokenizer.fit_on_texts(annotations)

# Converting sentences to sequences of word token indexes
caption_sequences = tokenizer.texts_to_sequences(annotations)
caption_sequences[:10]

[[3, 2, 43, 5, 2, 91, 171, 8, 120, 54, 2, 396, 13, 393, 5, 29, 1, 694, 4],
 [3, 2, 20, 315, 65, 2, 195, 118, 4],
 [3, 2, 41, 20, 120, 65, 2, 195, 2432, 4],
 [3, 2, 41, 20, 120, 6, 393, 21, 61, 2432, 4],
 [3, 2, 41, 20, 5, 2, 91, 171, 315, 65, 2, 195, 2995, 4],
 [3, 2, 16, 10, 9, 2, 853, 10, 18, 344, 4],
 [3, 2, 16, 10, 9, 2, 1563, 10, 35, 11, 138, 83, 7, 6, 156, 4],
 [3, 2, 16, 10, 9, 2, 15, 10, 11, 28, 998, 18, 638, 23, 138, 83, 5, 6, 73, 4],
 [3, 14, 32, 13, 739, 2650, 89, 23, 138, 83, 7, 6, 156, 4],
 [3, 14, 32, 7, 721, 797, 320, 138, 83, 4]]


# Creating word-to-index and index-to-word mappings.
word2index = json.loads(tokenizer.get_config()['word_index'])
index2word = json.loads(tokenizer.get_config()['index_word'])

index2word = {int(key) : value for key,value in index2word.items()}

print('Index for the word ',"<start> is", word2index["<start>"])
print('Word for the index : 3 is ', index2word[3])

Index for the word  <start> is 3
Word for the index : 3 is  <start>


# Creating a word count of the tokenizer to visulize the Top 30 occuring words after text processing
 
word_counts = tokenizer.get_config()['word_counts']

word_counts_df = pd.DataFrame.from_dict(data = json.loads(word_counts), orient='index', columns=['count'])
top_30 = word_counts_df.sort_values(by='count', ascending=False)[:30]


plt.figure(figsize=(20,10))
plt.title('Top 30 words in the vocabulary')
plt.xlabel('word')
plt.ylabel('count')
plot = sns.barplot(x=top_30.index , y=top_30['count'])
for p in plot.patches:
    plot.annotate(format(int(p.get_height())), 
                   (p.get_x() + p.get_width() / 2., p.get_height()), 
                   ha = 'center', va = 'center', 
                   xytext = (0, 9), 
                   textcoords = 'offset points')
plt.show()


# Padding each vector to the max_length of the captions and storing it to a vairable

max_length = max([len(caption_sequence) for caption_sequence in caption_sequences])
print('Max Length of Caption :', max_length)

# padding all caption sequences.
cap_vector =  tf.keras.preprocessing.sequence.pad_sequences(
    caption_sequences, padding="post"
)

print("The shape of Caption vector is :" + str(cap_vector.shape))

Max Length of Caption : 38
The shape of Caption vector is :(40455, 38)


# Adding padding index to tokenizer
tokenizer.word_index['<pad>'] = 0
tokenizer.index_word[0] = '<pad>'


!mkdir Features # Creating a folder to store features 

with tf.device('/GPU:0'):
    
    # Inceptionv3 Model with ImageNet weights
    def load_inception_model() : 
        image_model = tf.keras.applications.InceptionV3(include_top=False,weights='imagenet')
        new_input = image_model.input #  input of the image_model
        hidden_layer =  image_model.layers[-1].output #  output of the image_model
        return tf.keras.Model(inputs=new_input, outputs=hidden_layer) # final model using both input & output layer
         

    image_features_extract_model = load_inception_model()

    def extract_features_and_cache(path) : 
        
        # Reading Image 
        image = tf.io.read_file(bytes.decode(path.numpy()))
        image = tf.image.decode_jpeg(image)
        image = tf.image.convert_image_dtype(image, tf.float32)

        # Resizing the Image for Inceptionv3
        image = tf.image.resize(image, [299, 299])

        # Preprocessing for Inceptionv3
        preprocessed_for_inception = tf.keras.applications.inception_v3.preprocess_input(image) # normalization (-1,1)
        preprocessed_for_inception = tf.expand_dims(preprocessed_for_inception, 0)

        # Extracting Image Features 
        
        features = image_features_extract_model(preprocessed_for_inception)
        extracted_features = tf.reshape(features, (-1, tf.shape(features)[-1]))

        # Caching Image Features 
        image_name = (bytes.decode(path.numpy())).split('/')[-1]
        feature_path =  './Features/' + image_name.split('.')[0] + '_features_.npy'
        np.save(feature_path, extracted_features)

        return feature_path
    
    
# Creating an tf.dataset with unique images in the dataset

unique_images = list(set(all_img_path)) 
image_dataset = tf.data.Dataset.from_tensor_slices(unique_images)

# Running feature extraction and caching
for path in tqdm(image_dataset) : 
        extract_features_and_cache(path)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5
87916544/87910968 [==============================] - 1s 0us/step

100%|██████████| 8091/8091 [11:09<00:00, 12.09it/s]


# Checking if features of all images are extracted
assert len(glob.glob('./Features/*')) == len(set(all_img_path))


# Function to load cached extracted features from disk
def load_features(path) : 
    image_name = (bytes.decode(path.numpy())).split('/')[-1]
    feature_path =  './Features/' + image_name.split('.')[0] + '_features_.npy'
    features = np.load(feature_path)
    features = tf.reshape(features, (-1, tf.shape(features)[-1]))
    return features


BATCH_SIZE = 256
BUFFER_SIZE = 1024
dataset_size = all_img_path.shape[0]
train_size = int(0.8 * dataset_size)


train_img_paths, train_cap_vector = all_img_path[:train_size], cap_vector[:train_size][:]
test_img_paths, test_cap_vector = all_img_path[train_size : ], cap_vector[train_size :][:]

train_dataset = tf.data.Dataset.from_tensor_slices((train_img_paths, train_cap_vector))
train_dataset = train_dataset.shuffle(BUFFER_SIZE, seed=42, reshuffle_each_iteration=True)

## Replacing images with features extracted from Inception net V3
train_dataset = train_dataset.map(lambda path, caption : (tf.py_function(load_features, [path],tf.float32),caption), num_parallel_calls=tf.data.AUTOTUNE).batch(BATCH_SIZE, drop_remainder=True)
train_dataset = train_dataset.prefetch(buffer_size=tf.data.AUTOTUNE)

test_dataset = tf.data.Dataset.from_tensor_slices((test_img_paths, test_cap_vector))

## Replacing images with features extracted from Inception net V3
test_dataset = test_dataset.map(lambda path, caption : (tf.py_function(load_features, [path],tf.float32),caption), num_parallel_calls=tf.data.AUTOTUNE).batch(BATCH_SIZE, drop_remainder=True)
test_dataset = test_dataset.prefetch(buffer_size=tf.data.AUTOTUNE)

train_dataset.cache('./train')
test_dataset.cache('./test')

<CacheDataset shapes: (<unknown>, (256, 38)), types: (tf.float32, tf.int32)>


# image, caption check 
index = 6010
path = test_img_paths[index]
image = plt.imread(path)
plt.imshow(image)

print(test_cap_vector[index])
print([tokenizer.index_word[id] for id in test_cap_vector[index]])

[  3  14  32 843  21 402   6   1  26 177  76   6  83   4   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0]
['<start>', 'two', 'dogs', 'try', 'to', 'get', 'the', '<unk>', 'red', 'frisbee', 'from', 'the', 'other', '<end>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']


# Features and Caption shapes check 

sample_img_feature_batch, sample_cap_batch = next(iter(train_dataset))
print(sample_img_feature_batch.shape) #(batch_size, 64, 2048)
print(sample_cap_batch.shape) #(batch_size, max_len)

(256, 64, 2048)
(256, 38)


embedding_dim = 256 
units = 512
vocab_size = 5001 #top 5,000 words +1 ( <unk>)
train_num_steps = train_size // BATCH_SIZE
test_num_steps = (dataset_size - train_size) // BATCH_SIZE

print('number of training steps : ', train_num_steps)
print('number of test steps : ', test_num_steps)

number of training steps :  126
number of test steps :  31


with tf.device('/GPU:0') : 
    Model = tf.keras.Model
    class Encoder(Model):
        def __init__(self,embed_dim=embedding_dim):
            super(Encoder, self).__init__()
            self.dense = tf.keras.layers.Dense(embed_dim , activation='relu') #building Dense layer with relu activation

        def call(self, x):
            features =  self.dense(x) # extracting the features from the image shape: shape : (batch, 8*8, embed_dim)
            return features


with tf.device('/GPU:0') : 
    class Attention_model(Model):
        def __init__(self, units):
            super(Attention_model, self).__init__()
            self.units=units
            self.W1 = tf.keras.layers.Dense(self.units)#Dense layer
            self.W2 = tf.keras.layers.Dense(self.units) #Dense layer
            self.V = tf.keras.layers.Dense(1) #Final Dense layer with unit 1


        def call(self, features, hidden):

            hidden_with_time_axis =  tf.expand_dims(hidden,1) # Expanding the hidden shape to shape: (batch_size, 1, hidden_size)
            attention_hidden_layer = (tf.nn.tanh(self.W1(features) + self.W2(hidden_with_time_axis))) # attention_hidden_layer.shape = (BATCH_SIZE, 64, units)
            score = self.V(attention_hidden_layer)
            attention_weights =  tf.nn.softmax(score, axis=1)# extracting attention weights with shape: (batch_size, 8*8, 1)
            context_vector =  attention_weights * features #Creating context vector with shape (batch_size, 8*8,embedding_dim)
            context_vector = tf.reduce_sum(context_vector, axis=1) # reducing the shape to (batch_size, embedding_dim)


            return context_vector, attention_weights


with tf.device('/GPU:0') : 
    class Decoder(Model):
        def __init__(self, embed_dim, units, vocab_size):
            super(Decoder, self).__init__()
            self.units=units
            self.attention = Attention_model(self.units) #iniitalizing Attention model with units
            self.embed = tf.keras.layers.Embedding(vocab_size, embed_dim) #building an Embedding layer
            self.gru = tf.keras.layers.GRU(self.units,return_sequences=True,return_state=True,recurrent_initializer='glorot_uniform') # RNN 
            self.d1 = tf.keras.layers.Dense(self.units) #Dense layer
            self.d2 = tf.keras.layers.Dense(vocab_size) #Dense layer


        def call(self,x,features, hidden):
            context_vector, attention_weights = self.attention(features,hidden) #creating context vector & attention weights from attention model
            x =  self.embed(x) # embedding input to shape: (batch_size, 1, embedding_dim)
            x =  tf.concat([tf.expand_dims(context_vector,axis=1) ,x], axis=-1)# Concatenating input with the context vector from attention layer. Shape: (batch_size, 1, embedding_dim + embedding_dim)

            output,state = self.gru(x) # Extracting the output & hidden state from GRU layer. Output shape : (batch_size, max_length, hidden_size)

            x = self.d1(output)

            x = tf.reshape(x, (-1, x.shape[2])) # shape : (batch_size * max_length, hidden_size)

            x = self.d2(x) # shape : (batch_size * max_length, vocab_size)

            return x, state, attention_weights

        def init_state(self, batch_size):
            return tf.zeros((batch_size, self.units))


# Encoder and Decoder Object instantiation
with tf.device('/GPU:0') : 
    encoder = Encoder(embedding_dim)
    decoder=Decoder(embedding_dim, units, vocab_size)


# Encoder and decoder output shapes check 

features=encoder(sample_img_feature_batch)

hidden = decoder.init_state(batch_size=sample_cap_batch.shape[0])
dec_input = tf.expand_dims([tokenizer.word_index['<start>']] * sample_cap_batch.shape[0], 1)
predictions, hidden_out, attention_weights= decoder(dec_input, features, hidden)

print('Feature shape from Encoder: {}'.format(features.shape)) #shape = (batch, 8*8, embed_dim)
assert features.shape == (BATCH_SIZE, 8*8, embedding_dim)

print('Predictions shape from Decoder: {}'.format(predictions.shape)) #shape = (batch,vocab_size)
assert predictions.shape == (BATCH_SIZE,vocab_size)

print('Attention weights shape from Decoder: {}'.format(attention_weights.shape)) #shape = (batch, 8*8, embed_dim)
assert attention_weights.shape == (BATCH_SIZE, 64, 1)

Feature shape from Encoder: (256, 64, 256)
Predictions shape from Decoder: (256, 5001)
Attention weights shape from Decoder: (256, 64, 1)


# Optimiser and loss object
 
optimizer = tf.keras.optimizers.Adam() #defining the optimizer
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none') #defining loss object


# Custom loss function 
def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0)) # masking padding sequences
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype) 
    loss_ *= mask

    return tf.reduce_mean(loss_) # average loss across batch


# Check point manager 

checkpoint_path = "./checkpoints/train_dataset/"
ckpt = tf.train.Checkpoint(encoder=encoder,
                           decoder=decoder,
                           optimizer = optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)


start_epoch = 0
if ckpt_manager.latest_checkpoint:
    start_epoch = int(ckpt_manager.latest_checkpoint.split('-')[-1]) # retrieve last epoch from saved checkpoints
    ckpt.restore(ckpt_manager.latest_checkpoint)


# Custom Train Step  
@tf.function
def train_step(img_tensor, target) : 

    loss = 0
    hidden = decoder.init_state(batch_size=target.shape[0])
    dec_input = tf.expand_dims([tokenizer.word_index['<start>']] * target.shape[0], 1)


    with tf.GradientTape() as tape:
        #training steps
        
        # Encoder
        features=encoder(img_tensor,training =True)
        
        # Decoder
        for i in range(1,target.shape[1]) : 
            predictions, hidden, _ = decoder(dec_input, features, hidden)
            loss += loss_function(target[:,i], predictions)
            dec_input = tf.expand_dims(target[:,i],1)
    
    # backpropagation
    avg_loss = (loss / int(target.shape[1]))
    trainable_variables = encoder.trainable_weights + decoder.trainable_weights
    
    gradients = tape.gradient(loss,trainable_variables)
    optimizer.apply_gradients(zip(gradients,trainable_variables))

    return loss, avg_loss


# Custom Test Step 

@tf.function
def test_step(img_tensor, target):
    loss = 0

    #testing steps
    
    hidden = decoder.init_state(batch_size=target.shape[0])
    dec_input = tf.expand_dims([tokenizer.word_index['<start>']] * target.shape[0], 1)

    features=encoder(img_tensor)

    for i in range(1,target.shape[1]) : 
        predictions, hidden, _ = decoder(dec_input, features, hidden)
        loss += loss_function(target[:,i], predictions)
        predicted_id = tf.argmax(predictions[0])
        dec_input = tf.expand_dims([predicted_id] * target.shape[0] , 1)



    avg_loss = (loss / int(target.shape[1]))

    return loss, avg_loss


with tf.device('/GPU:0') : 
    def test_loss_cal(test_dataset):
        total_loss = 0 
        #write your code to get the average loss result on your test data
        for img_tensor, target in test_dataset : 
            batch_loss , t_loss = test_step(img_tensor, target)
            total_loss += t_loss
        avg_test_loss = total_loss/int(target.shape[1])
        return avg_test_loss


loss_plot = []
test_loss_plot = []
EPOCHS = 15

best_test_loss=100

with tf.device('/GPU:0'):
    for epoch in range(start_epoch, EPOCHS):
        print('EPOCH :',epoch+1,' of ', EPOCHS)
        start = time.time()
        total_loss = 0

        for (batch, (img_tensor, target)) in enumerate(tqdm(train_dataset)):

            batch_loss, t_loss = train_step(img_tensor, target)
            total_loss += t_loss

    #             if batch % 100 == 0:
    #                 average_batch_loss = total_batch_loss.numpy()/int(target.shape[1])
    #                 print(f'Epoch {epoch+1} Batch {batch} Loss {average_batch_loss:.4f}')

        avg_train_loss=total_loss / train_num_steps

        loss_plot.append(avg_train_loss)    
        test_loss = test_loss_cal(test_dataset)
        test_loss_plot.append(test_loss)

        print ('For epoch: {}, the train loss is {:.3f}, & test loss is {:.3f}'.format(epoch+1,avg_train_loss,test_loss))
        print ('Time taken for 1 epoch {:.3f} sec\n'.format(time.time() - start))

        if test_loss < best_test_loss:
            print('Test loss has been reduced from %.3f to %.3f' % (best_test_loss, test_loss))
            best_test_loss = test_loss
            ckpt_manager.save()

  0%|          | 0/126 [00:00<?, ?it/s]

EPOCH : 1  of  15

100%|██████████| 126/126 [01:59<00:00,  1.06it/s]
  0%|          | 0/126 [00:00<?, ?it/s]

For epoch: 1, the train loss is 0.991, & test loss is 1.919
Time taken for 1 epoch 150.363 sec

Test loss has been reduced from 100.000 to 1.919
EPOCH : 2  of  15

100%|██████████| 126/126 [01:34<00:00,  1.33it/s]
  0%|          | 0/126 [00:00<?, ?it/s]

For epoch: 2, the train loss is 0.969, & test loss is 1.950
Time taken for 1 epoch 112.153 sec

EPOCH : 3  of  15

100%|██████████| 126/126 [01:35<00:00,  1.32it/s]
  0%|          | 0/126 [00:00<?, ?it/s]

For epoch: 3, the train loss is 0.949, & test loss is 1.973
Time taken for 1 epoch 113.212 sec

EPOCH : 4  of  15

100%|██████████| 126/126 [01:34<00:00,  1.34it/s]
  0%|          | 0/126 [00:00<?, ?it/s]

For epoch: 4, the train loss is 0.930, & test loss is 1.969
Time taken for 1 epoch 111.738 sec

EPOCH : 5  of  15


# Loss Plot

plt.plot(loss_plot)
plt.plot(test_loss_plot)
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Loss Plot')
plt.show()


# Model evaulation using greedy search
def evaluate(path):
    # attention_plot = np.zeros((max_length, attention_features_shape=64))
    attention_plot = np.zeros((max_length,64))
    hidden = decoder.init_state(batch_size=1)

     # Reading Image 
    image = tf.io.read_file(path)
    image = tf.image.decode_jpeg(image)
    image = tf.image.convert_image_dtype(image, tf.float32)

    # Resizing the Image for Inceptionv3
    image = tf.image.resize(image, [299, 299])

    # Preprocessing for Inceptionv3
    preprocessed_for_inception = tf.keras.applications.inception_v3.preprocess_input(image) # normalization (-1,1)
    preprocessed_for_inception = tf.expand_dims(preprocessed_for_inception, 0)

    # Extracting Image Features 

    features = image_features_extract_model(preprocessed_for_inception)
    extracted_features = tf.reshape(features, (features.shape[0], -1, features.shape[3]))
        

    features = encoder(extracted_features) # extract the features by passing the input to encoder

    dec_input = tf.expand_dims([tokenizer.word_index['<start>']], 0)
    result = []

    for i in range(max_length):
        predictions, hidden, attention_weights = decoder(dec_input, features, hidden) # get the output from decoder
        attention_plot[i] = tf.reshape(attention_weights, (-1, )).numpy()
        predicted_id = tf.random.categorical(predictions, 1)[0][0].numpy()
        
        result.append(tokenizer.index_word[predicted_id])
    

        if tokenizer.index_word[predicted_id] == '<end>':
            return result, attention_plot,predictions

        dec_input = tf.expand_dims([predicted_id], 0)

    attention_plot = attention_plot[:len(result), :]
    return result, attention_plot,predictions


# Plotting images with attention weights at each time step 
from PIL import Image
def plot_attmap(caption, weights, image):

    fig = plt.figure(figsize=(30, 30))
    temp_img = np.array(Image.open(image))
    
    len_cap = len(caption)
    for cap in range(len_cap):
        weights_img = np.reshape(weights[cap], (8,8))
        weights_img = np.array(Image.fromarray(weights_img).resize((224, 224), Image.LANCZOS))
        
        ax = fig.add_subplot(len_cap//2, len_cap//2, cap+1)
        ax.set_title(caption[cap], fontsize=15)
        
        img=ax.imshow(temp_img)
        
        ax.imshow(weights_img, cmap='gist_heat', alpha=0.6,extent=img.get_extent())
        ax.axis('off')
    plt.subplots_adjust(hspace=0.2, wspace=0.2)
    plt.show()


from nltk.translate.bleu_score import sentence_bleu


def filt_text(text):
    filt=['<start>','<unk>','<end>'] 
    temp= text.split()
    [temp.remove(j) for k in filt for j in temp if k==j.strip()]
    text=' '.join(temp)
    return text


# Greedy Search Evaluation on a test image , caption

test_img_paths, test_cap_vector
rid = np.random.randint(0, len(test_img_paths))
test_image = test_img_paths[rid]

real_caption = test_cap_vector[rid]


real_caption = ' '.join([tokenizer.index_word[i] for i in test_cap_vector[rid] if i not in [0]])
result, attention_plot,pred_test = evaluate(test_image)


real_caption=filt_text(real_caption)      


pred_caption=' '.join(result).rsplit(' ', 1)[0]

real_appn = []
real_appn.append(real_caption.split())
reference = real_appn
candidate = pred_caption.split()

score = sentence_bleu(reference, candidate, weights=(0,0,1,0))
print(f"BLEU score: {score*100}")
print('Real Caption:', real_caption)
print('Prediction Caption:', pred_caption)
plot_attmap(result, attention_plot, test_image)


Image.open(test_image)

BLEU score: 100.0
Real Caption: a man racing on a motorbike
Prediction Caption: two motocross bikers one dirt biker making a skate ramp

	image	caption
0	1000268201_693b08cb0e.jpg	A child in a pink dress is climbing up a set o...
1	1000268201_693b08cb0e.jpg	A girl going into a wooden building .
2	1000268201_693b08cb0e.jpg	A little girl climbing into a wooden playhouse .
3	1000268201_693b08cb0e.jpg	A little girl climbing the stairs to her playh...
4	1000268201_693b08cb0e.jpg	A little girl in a pink dress going into a woo...
5	1001773457_577c3a7d70.jpg	A black dog and a spotted dog are fighting
6	1001773457_577c3a7d70.jpg	A black dog and a tri-colored dog playing with...
7	1001773457_577c3a7d70.jpg	A black dog and a white dog with brown spots a...
8	1001773457_577c3a7d70.jpg	Two dogs of different breeds looking at each o...
9	1001773457_577c3a7d70.jpg	Two dogs on pavement moving toward each other .

	ID	Path	Captions
0	1000268201_693b08cb0e.jpg	../input/flickr8k/Images//1000268201_693b08cb0e.jpg	A child in a pink dress is climbing up a set of stairs in an entry way .
1	1000268201_693b08cb0e.jpg	../input/flickr8k/Images//1000268201_693b08cb0e.jpg	A girl going into a wooden building .
2	1000268201_693b08cb0e.jpg	../input/flickr8k/Images//1000268201_693b08cb0e.jpg	A little girl climbing into a wooden playhouse .
3	1000268201_693b08cb0e.jpg	../input/flickr8k/Images//1000268201_693b08cb0e.jpg	A little girl climbing the stairs to her playhouse .
4	1000268201_693b08cb0e.jpg	../input/flickr8k/Images//1000268201_693b08cb0e.jpg	A little girl in a pink dress going into a wooden cabin .
5	1001773457_577c3a7d70.jpg	../input/flickr8k/Images//1001773457_577c3a7d70.jpg	A black dog and a spotted dog are fighting
6	1001773457_577c3a7d70.jpg	../input/flickr8k/Images//1001773457_577c3a7d70.jpg	A black dog and a tri-colored dog playing with each other on the road .
7	1001773457_577c3a7d70.jpg	../input/flickr8k/Images//1001773457_577c3a7d70.jpg	A black dog and a white dog with brown spots are staring at each other in the street .
8	1001773457_577c3a7d70.jpg	../input/flickr8k/Images//1001773457_577c3a7d70.jpg	Two dogs of different breeds looking at each other on the road .
9	1001773457_577c3a7d70.jpg	../input/flickr8k/Images//1001773457_577c3a7d70.jpg	Two dogs on pavement moving toward each other .

EYE FOR BLIND¶

Data understanding¶

Pre-Processing the captions¶

Pre-processing the images & Extracting Features using a Pre-trained CNN¶

Create the train & test data¶

Model Building¶

Encoder¶

Attention model¶

Decoder¶

Model training & optimization¶

Model Evaluation¶

Greedy Search¶