Understanding Siamese Network with example and codes

Understanding Siamese Network with example and codes

One-Shot Learning with Siamese Network trained using Contrastive loss

Siamese Network architecture

  • It is a combination of 2 shallow (few hidden layers), identical CNNs. Their structure can be anything you wish.
  • The parameters between these CNNs are shared i.e. same weights and biases being used for both the CNNs. Only one set of weights is trained and used for both CNNs.
  • It uses a triplet or contrastive loss function. Never heard of them? We will discuss them soon.
  • The final expected output is binary (0 or 1), where 1 = similar images and 0 = dissimilar images.

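$$L = (1 - Y)\,\tfrac{1}{2}\,X^{2} + Y\,\tfrac{1}{2}\,\bigl(\max(0,\ m - X)\bigr)^{2}$$

Here X is the Euclidean distance between the two embeddings, m is the margin, and in this form of the formula Y = 0 marks a similar pair. The `contrastive_loss` code further below uses the opposite labelling (1 = similar), so the two terms swap roles there, and it omits the constant factor 0.5.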


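$$L = \max\bigl(0,\ d(A, P) - d(A, N) + \alpha\bigr)$$

where A is the anchor image, P is a positive (same-class) image, N is a negative (different-class) image, d is the distance between embeddings and α is the margin.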


Transfer Learning is the answer

So, to make the model understand the basic features of images in general, we will first train the CNN on any sufficiently large image dataset, and then use our scanty dataset to make the model specific to our problem statement. Hence, transfer learning can be viewed as a two-step process.

%matplotlib inline
from randimage import get_random_image, show_array
import random
import pandas as pd
from sklearn.datasets import load_digits
import cv2
import numpy as np
from PIL import Image
from keras.utils.np_utils import to_categorical
from keras import backend as K
from keras.layers import Input, Lambda, Dense, Dropout, Convolution2D, MaxPooling2D, Flatten,Activation
from keras.models import Sequential, Model
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras import optimizers
import matplotlib.pyplot as plt
from keras import callbacks
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, EarlyStopping, ReduceLROnPlateau, TensorBoard
from keras.models import Model,load_model, model_from_json
import tensorflow as tf
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D, BatchNormalization

#enable eager execution in tensorflow

# Load scikit-learn's digits dataset: flat feature rows plus integer class labels.
digits_ = load_digits()
target_ = digits_['target']
# One-hot encode the 10 digit classes for categorical cross-entropy training.
target_ = train_labels = to_categorical(target_, num_classes=10)
# Each flat row is viewed as an 8x8 image.
digits = digits_['data'].reshape(-1, 8, 8)

# Destination array: one 32x32 upscaled copy per digit image.
digits_resize = np.zeros((len(digits), 32, 32))

for idx, digit in enumerate(digits):
    # Bicubic upscaling from 8x8 to 32x32, then the (v + 1) / 2 rescaling.
    # NOTE(review): scikit-learn documents these pixels as 0..16, so this
    # rescaling does not map them to [0, 1] -- confirm the intended normalization.
    digits_resize[idx] = (cv2.resize(digit, dsize=(32, 32), interpolation=cv2.INTER_CUBIC) + 1) / 2

def build_base_network(input_shape=(32, 32, 1), num_classes=10):
    """Build the CNN that is first pre-trained as a digit classifier.

    The network is two convolution layers, a flatten step, two dense layers
    and a softmax classification head. The second-to-last layer (the
    256-unit dense layer) is what the Siamese stage later reuses as the
    image embedding, by cutting the model at ``model.layers[-2]``.

    Args:
        input_shape: Shape of one input image as (height, width, channels).
        num_classes: Number of output classes of the softmax head.

    Returns:
        An uncompiled ``Sequential`` model producing ``num_classes``
        class probabilities per image.
    """
    model = Sequential()
    model.add(Convolution2D(16, (8, 8), strides=(1, 1), activation="relu", input_shape=input_shape))
    model.add(Convolution2D(32, (4, 4), strides=(1, 1), activation="relu"))
    # Collapse the convolutional feature maps to one vector per image so the
    # dense layers yield a single embedding instead of one per spatial position.
    model.add(Flatten())
    model.add(Dense(512, activation="relu"))
    model.add(Dense(256, activation="relu"))
    # Classification head matching the one-hot labels used with
    # categorical cross-entropy; it is stripped off for the Siamese stage.
    model.add(Dense(num_classes, activation="softmax"))
    return model

# Step 1 of transfer learning: pre-train the CNN as a 10-class digit classifier.
model = build_base_network()

# RMSprop is the optimizer that was effectively in use here: the earlier Adam
# assignment to `rms` was overwritten before it was ever read, so it is dropped.
rms = RMSprop()

# Stop training once validation loss stops improving.
# NOTE(review): the patience value is a conservative choice for a 10-epoch
# run -- the original call was cut off after `monitor`; confirm the settings.
earlyStopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
callback_early_stop_reduceLROnPlateau = [earlyStopping]

# Same batch size as the Siamese training stage further below.
batch_size = 8

model.compile(loss='categorical_crossentropy', optimizer=rms, metrics=["accuracy"])

# Add the trailing channel axis expected by the convolution layers and hold
# out 20% of the samples for validation.
model.fit(
    digits_resize.reshape(-1, 32, 32, 1),
    target_,
    validation_split=.20,
    batch_size=batch_size,
    verbose=1,
    epochs=10,
    callbacks=callback_early_stop_reduceLROnPlateau,
)

CNN architecture | Results for MNIST classification

# Synthetic two-class dataset: `one` collects images with a rectangle drawn
# on them, `zero` collects plain random images.
# NOTE(review): which list receives the rectangle images was lost from the
# original snippet; this follows the article's statement that the rectangle
# marks the positive class -- confirm.
one = []
zero = []

img_size = (32, 32)

for _ in range(200):

    img = get_random_image(img_size)

    # Pick random rectangle corners: (a, b) in the first quarter of the image,
    # (c, d) in the second half. Integer division keeps the bounds integral,
    # which random.randrange requires.
    a, b = random.randrange(0, img_size[0] // 4), random.randrange(0, img_size[0] // 4)
    c, d = random.randrange(img_size[0] // 2, img_size[0]), random.randrange(img_size[0] // 2, img_size[0])

    draw_rectangle = random.choice([True, False])
    if draw_rectangle:
        # Paint the rectangle into all three colour channels.
        # NOTE(review): 25 * 255 exceeds the uint8 range used in the
        # conversion below, so the resulting grey level depends on the cast
        # behaviour -- confirm the intended fill value.
        img[a:c, b:d, 0:3] = 25

    # Convert the RGB image to a single-channel image scaled back to [0, 1].
    img = np.asarray(Image.fromarray((img * 255).astype(np.uint8)).convert('L')) / 255

    if draw_rectangle:
        one.append(img)
    else:
        zero.append(img)

  • We are converting RGB images to single channel images using convert(‘L’)
  • a,b,c,d can be taken as the coordinates for the rectangle in positive class images

Actual dataset with rectangles at random places | Random images

def euclidean_distance(vects):
    """Return the row-wise Euclidean distance between two embedding batches.

    This is the output of the Siamese network.

    Args:
        vects: A pair ``(x, y)`` of tensors to compare; the squared
            differences are summed over axis 1.

    Returns:
        A tensor with the summed axis kept as size 1, holding one distance
        per row.
    """
    x, y = vects
    sum_square = K.sum(K.square(x - y), axis=1, keepdims=True)
    # Clamp before the square root: the gradient of sqrt is unbounded at 0,
    # which produces NaNs when both inputs of a pair are the same image.
    return K.sqrt(K.maximum(sum_square, K.epsilon()))

def eucl_dist_output_shape(shapes):
    """Return the output shape of the distance layer.

    Args:
        shapes: A pair of input shapes; only the first entry (the leading
            dimension) of the first shape is used.

    Returns:
        A tuple ``(leading_dim, 1)``: one distance value per row.
    """
    first_shape, _ = shapes
    return (first_shape[0], 1)

def compute_accuracy(predictions, labels):
    """Return the classification accuracy at a fixed distance threshold of 0.5.

    A pair is predicted "similar" when its distance is below 0.5, and the
    prediction is correct when that agrees with the label (1 = similar,
    0 = dissimilar). The previous implementation averaged the labels of the
    predicted-similar pairs only, which is a precision rather than an
    accuracy, and gave NaN when no pair was predicted similar.

    Args:
        predictions: Array of predicted distances, any shape.
        labels: Array of 0/1 labels with the same number of elements.

    Returns:
        The fraction of pairs whose thresholded prediction matches the label.
    """
    predicted_similar = np.asarray(predictions).ravel() < 0.5
    actual_similar = np.asarray(labels).ravel() > 0.5
    return np.mean(predicted_similar == actual_similar)

def accuracy(y_true, y_pred):
    """Keras metric: fraction of pairs classified correctly at threshold 0.5.

    Args:
        y_true: Tensor of pair labels (1 = similar, 0 = dissimilar).
        y_pred: Tensor of predicted distances.

    Returns:
        A scalar tensor with the mean agreement between the labels and the
        thresholded predictions.
    """
    # A distance below 0.5 counts as a "similar" prediction; cast it to the
    # label dtype so the two can be compared element-wise.
    predicted_similar = K.cast(y_pred < 0.5, y_true.dtype)
    return K.mean(K.equal(y_true, predicted_similar))

def contrastive_loss(y_true, y_pred):
    """Contrastive loss on predicted distances, with a margin of 1.

    Similar pairs (label 1) are penalised by their squared distance;
    dissimilar pairs (label 0) are penalised only while their distance is
    still inside the margin.

    Args:
        y_true: Tensor of pair labels (1 = similar, 0 = dissimilar).
        y_pred: Tensor of predicted distances.

    Returns:
        A scalar tensor with the mean loss.
    """
    margin = 1
    # Keep the arithmetic in the prediction dtype even if labels arrive in a
    # different float or integer type.
    y_true = K.cast(y_true, y_pred.dtype)
    similar_term = K.square(y_pred)
    dissimilar_term = K.square(K.maximum(margin - y_pred, 0))
    return K.mean(y_true * similar_term + (1 - y_true) * dissimilar_term)

def _fill_pairs(pairs, labels, positives, negatives):
    """Fill ``pairs`` and ``labels`` in place with random image pairs.

    Each slot gets, with equal probability, either two images drawn from
    ``positives`` (label 1, similar) or one image from ``positives`` and one
    from ``negatives`` (label 0, dissimilar).

    Args:
        pairs: Array indexed as [sample, pair member, 0, row, column].
        labels: Array with one row per sample, receiving the 0/1 label.
        positives: Non-empty sequence of images of the positive class.
        negatives: Non-empty sequence of images of the other class.
    """
    for i in range(len(pairs)):
        if random.choice([True, False]):
            # Similar pair: both images come from the positive class.
            first, second = random.choices(positives, k=2)
            labels[i] = 1
        else:
            # Dissimilar pair: one image from each class.
            first = random.choice(positives)
            second = random.choice(negatives)
            labels[i] = 0
        pairs[i, 0, 0, :, :] = first
        pairs[i, 1, 0, :, :] = second


total_sample_size = 50
test_sample_size = 200
dim1, dim2 = 32, 32
count = 0

# Training pairs and their labels.
x_pair = np.zeros([total_sample_size, 2, 1, dim1, dim2])
y = np.zeros([total_sample_size, 1])

# Validation pairs and their labels.
x_pair_test = np.zeros([test_sample_size, 2, 1, dim1, dim2])
y_test = np.zeros([test_sample_size, 1])

_fill_pairs(x_pair, y, one, zero)
_fill_pairs(x_pair_test, y_test, one, zero)

  • Taking a validation set of 200 pairs, repeat the above steps. We have taken a comparatively bigger validation set so as to be sure of the results we get.

# Embedding network: reuse the trained CNN from its input up to its
# second-to-last layer, i.e. without the final output layer that was added
# for MNIST classification.
model2 = Model(
    inputs=model.input,
    outputs=model.layers[-2].output,
)

# Shape of one image: the trailing two dimensions of x_pair plus a channel axis.
input_dim = (*x_pair.shape[3:], 1)

# One input per member of the image pair.
img_a, img_b = Input(shape=input_dim), Input(shape=input_dim)

# Both inputs pass through the same model2 instance, so its weights are shared.
feat_vecs_a, feat_vecs_b = model2(img_a), model2(img_b)

# Siamese output: distance between the two embeddings, computed with the
# utility functions declared above.
distance = Lambda(
    euclidean_distance,
    output_shape=eucl_dist_output_shape,
)([feat_vecs_a, feat_vecs_b])

  • Using the feature embedding for two images (the pair we will feed), calculate the euclidean distance. This will be our output!

# Adam with a small learning rate for the Siamese stage (the variable keeps
# its historical name `rms`). `learning_rate` replaces the deprecated `lr`
# keyword; the dropped `epsilon=None` and `decay=0.0` arguments only restated
# the defaults and are rejected by newer optimizer implementations.
rms = optimizers.Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999, amsgrad=False)

# Stop training once validation loss stops improving.
# NOTE(review): the patience value is a conservative choice for a 10-epoch
# run -- the original call was cut off after `monitor`; confirm the settings.
earlyStopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
callback_early_stop_reduceLROnPlateau = [earlyStopping]

# Two-input model whose output is the distance between the embeddings,
# trained with the contrastive loss.
model = Model(inputs=[img_a, img_b], outputs=distance)
model.compile(loss=contrastive_loss, optimizer=rms, metrics=[accuracy])

Modified CNN for Transfer Learning

# Split the training pairs into their two image stacks, reshaped to
# (samples, 32, 32, 1).
img1, img2 = (x_pair[:, member].reshape(-1, 32, 32, 1) for member in (0, 1))

# Same split for the validation pairs.
img3, img4 = (x_pair_test[:, member].reshape(-1, 32, 32, 1) for member in (0, 1))

batch_size = 8
# Train on the training pairs and validate on the held-out pairs.
history = model.fit(
    [img1, img2],
    y,
    validation_data=([img3, img4], y_test),
    batch_size=batch_size,
    verbose=1,
    epochs=10,
    callbacks=callback_early_stop_reduceLROnPlateau,
)

with open('model_architecture.json', 'w') as f: