Source code for miprometheus.models.relational_net.conv_input_model

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (C) IBM Corporation 2018
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

""" contains CNN model for the ``RelationalNetwork``."""
__author__ = "Vincent Marois"

import torch
import numpy as np
from torch.nn import Module

from miprometheus.utils.app_state import AppState

class ConvInputModel(Module):
    """
    Simple 4 layers CNN for image encoding in the ``RelationalNetwork`` model.

    Every stage is ``Conv2d(kernel_size=3, stride=2, padding=1)`` followed by
    ``BatchNorm2d`` and a ReLU, with 24 output channels. Each stage therefore
    roughly halves the spatial resolution of its input.
    """

    def __init__(self):
        """
        Constructor.

        Defines the 4 convolutional layers and batch normalization layers.

        This implementation is inspired from the description in the section
        'Supplementary Material - CLEVR from pixels' in the reference paper.
        """
        # call base constructor
        super(ConvInputModel, self).__init__()

        # Note: formula for computing the output size is
        #   O = floor((W - K + 2P)/S + 1)
        # W is the input height/length, K is the filter size, P is the
        # padding, and S is the stride.

        # define layers
        # input image size is indicated as 128 x 128 in the paper for this
        # model
        self.conv1 = torch.nn.Conv2d(
            in_channels=3, out_channels=24, kernel_size=3, stride=2, padding=1)
        # output shape should be [24 x 64 x 64]
        self.batchNorm1 = torch.nn.BatchNorm2d(24)

        self.conv2 = torch.nn.Conv2d(24, 24, 3, stride=2, padding=1)
        # output shape should be [24 x 32 x 32]
        self.batchNorm2 = torch.nn.BatchNorm2d(24)

        self.conv3 = torch.nn.Conv2d(24, 24, 3, stride=2, padding=1)
        # output shape should be [24 x 16 x 16]
        self.batchNorm3 = torch.nn.BatchNorm2d(24)

        self.conv4 = torch.nn.Conv2d(24, 24, 3, stride=2, padding=1)
        # output shape should be [24 x 8 x 8]
        self.batchNorm4 = torch.nn.BatchNorm2d(24)

    def get_output_nb_filters(self):
        """
        :return: The number of filters of the last conv layer.
        """
        return self.conv4.out_channels

    def get_output_shape(self, height, width):
        """
        Getter method which computes the output height & width of the
        features maps.

        The sizes are derived from the kernel size, stride and padding stored
        on the four convolutional layers, applied in order.

        :param height: Input image height.
        :type height: int

        :param width: Input image width.
        :type width: int

        :return: height, width of the produced feature maps, as Python ints.
        """

        def get_output_dim(dim, kernel_size, stride, padding):
            """
            Using the convolution formula to compute the output dim with the
            specified kernel_size, stride, padding.

            Assuming dilatation=1.
            """
            # Floor division keeps the result integral; the former np.floor
            # call handed back a NumPy float for what is a count of pixels.
            return int((dim + 2 * padding - kernel_size) // stride + 1)

        # Propagate both spatial dims through the conv layers in forward
        # order (batch norm and ReLU do not change the spatial size).
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
            height = get_output_dim(
                height, conv.kernel_size[0], conv.stride[0], conv.padding[0])
            width = get_output_dim(
                width, conv.kernel_size[1], conv.stride[1], conv.padding[1])

        return height, width

    def forward(self, img):
        """
        Forward pass of the CNN.

        :param img: images to pass through the CNN layers. Should be of size
            [N, 3, 128, 128].
        :type img: torch.tensor

        :return: output of the CNN. Should be of size [N, 24, 8, 8].
        """
        x = self.conv1(img)
        x = self.batchNorm1(x)
        x = torch.nn.functional.relu(x)

        x = self.conv2(x)
        x = self.batchNorm2(x)
        x = torch.nn.functional.relu(x)

        x = self.conv3(x)
        x = self.batchNorm3(x)
        x = torch.nn.functional.relu(x)

        x = self.conv4(x)
        x = self.batchNorm4(x)
        x = torch.nn.functional.relu(x)

        return x
if __name__ == '__main__':
    # Unit Test for the ``ConvInputModel``: push one random batch through
    # the network and print the resulting feature-map shape next to the
    # shape predicted by ``get_output_shape``.

    n_samples = 64
    side = 128

    # "Image" - batch x channels x width x height, filled with random 0/1
    # values drawn from a Bernoulli(0.5) distribution.
    random_bits = np.random.binomial(1, 0.5, (n_samples, 3, side, side))

    # Cast the numpy array to the tensor type configured in AppState.
    images = torch.from_numpy(random_bits).type(AppState().dtype)

    model = ConvInputModel()
    encoded = model(images)

    print('feature_maps:', encoded.shape)
    print('Computed output height, width:',
          model.get_output_shape(side, side))