Source code for miprometheus.models.relational_net.conv_input_model
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Copyright (C) IBM Corporation 2018
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""conv_input_model.py: contains CNN model for the ``RelationalNetwork``."""
__author__ = "Vincent Marois"
import torch
import numpy as np
from torch.nn import Module
from miprometheus.utils.app_state import AppState
[docs]class ConvInputModel(Module):
"""
Simple 4 layers CNN for image encoding in the ``RelationalNetwork`` model.
"""
[docs] def __init__(self):
"""
Constructor.
Defines the 4 convolutional layers and batch normalization layers.
This implementation is inspired from the description in the section \
'Supplementary Material - CLEVR from pixels' in the reference paper \
(https://arxiv.org/pdf/1706.01427.pdf).
"""
# call base constructor
super(ConvInputModel, self).__init__()
# Note: formula for computing the output size is O = floor((W - K + 2P)/S + 1)
# W is the input height/length, K is the filter size, P is the padding, and S is the stride
# define layers
# input image size is indicated as 128 x 128 in the paper for this
# model
self.conv1 =torch.nn.Conv2d(
in_channels=3, out_channels=24, kernel_size=3, stride=2, padding=1)
# output shape should be [24 x 64 x 64]
self.batchNorm1 =torch.nn.BatchNorm2d(24)
self.conv2 =torch.nn.Conv2d(24, 24, 3, stride=2, padding=1)
# output shape should be [24 x 32 x 32]
self.batchNorm2 =torch.nn.BatchNorm2d(24)
self.conv3 =torch.nn.Conv2d(24, 24, 3, stride=2, padding=1)
# output shape should be [24 x 16 x 16]
self.batchNorm3 =torch.nn.BatchNorm2d(24)
self.conv4 =torch.nn.Conv2d(24, 24, 3, stride=2, padding=1)
# output shape should be [24 x 8 x 8]
self.batchNorm4 =torch.nn.BatchNorm2d(24)
[docs] def get_output_nb_filters(self):
"""
:return: The number of filters of the last conv layer.
"""
return self.conv4.out_channels
[docs] def get_output_shape(self, height, width):
"""
Getter method which computes the output height & width of the features maps.
:param height: Input image height.
:type height: int
:param width: Input image width.
:type width: int
:return: height, width of the produced feature maps.
"""
def get_output_dim(dim, kernel_size, stride, padding):
"""
Using the convolution formula to compute the output dim with the specified kernel_size, stride, padding.
Assuming dilatation=1.
"""
return np.floor(((dim + 2*padding - kernel_size)/stride) + 1)
height1 = get_output_dim(height, self.conv1.kernel_size[0], self.conv1.stride[0], self.conv1.padding[0])
width1 = get_output_dim(width, self.conv1.kernel_size[1], self.conv1.stride[1], self.conv1.padding[1])
height2 = get_output_dim(height1, self.conv2.kernel_size[0], self.conv2.stride[0], self.conv2.padding[0])
width2 = get_output_dim(width1, self.conv2.kernel_size[1], self.conv2.stride[1], self.conv2.padding[1])
height3 = get_output_dim(height2, self.conv3.kernel_size[0], self.conv3.stride[0], self.conv3.padding[0])
width3 = get_output_dim(width2, self.conv3.kernel_size[1], self.conv3.stride[1], self.conv3.padding[1])
height4 = get_output_dim(height3, self.conv4.kernel_size[0], self.conv4.stride[0], self.conv4.padding[0])
width4 = get_output_dim(width3, self.conv4.kernel_size[1], self.conv4.stride[1], self.conv4.padding[1])
return height4, width4
[docs] def forward(self, img):
"""
Forward pass of the CNN.
:param img: images to pass through the CNN layers. Should be of size [N, 3, 128, 128].
:type img: torch.tensor
:return: output of the CNN. Should be of size [N, 24, 8, 8].
"""
x = self.conv1(img)
x = self.batchNorm1(x)
x = torch.nn.functional.relu(x)
x = self.conv2(x)
x = self.batchNorm2(x)
x = torch.nn.functional.relu(x)
x = self.conv3(x)
x = self.batchNorm3(x)
x = torch.nn.functional.relu(x)
x = self.conv4(x)
x = self.batchNorm4(x)
x = torch.nn.functional.relu(x)
return x
if __name__ == '__main__':
"""
Unit Test for the ``ConvInputModel``.
"""
# "Image" - batch x channels x width x height
batch_size = 64
img_size = 128
input_np = np.random.binomial(1, 0.5, (batch_size, 3, img_size, img_size))
image = torch.from_numpy(input_np).type(AppState().dtype)
cnn = ConvInputModel()
feature_maps = cnn(image)
print('feature_maps:', feature_maps.shape)
print('Computed output height, width:', cnn.get_output_shape(img_size, img_size))