Source code for miprometheus.models.ntm.ntm_interface

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Copyright (C) IBM Corporation 2018
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""ntm_interface.py: pytorch module implementing NTM interface to external memory."""
__author__ = "Tomasz Kornuta"


import torch
import logging
import collections
import numpy as np
from torch.nn import Module
logger = logging.getLogger('NTM-Interface')

from miprometheus.utils.app_state import AppState


# Helper collection type.
_HeadStateTuple = collections.namedtuple(
    'HeadStateTuple', ('attention', 'similarity', 'gate', 'shift'))


class HeadStateTuple(_HeadStateTuple):
    """
    Tuple used by interface for storing current/past state information of a single head.
    """
    __slots__ = ()

# Helper collection type.
_InterfaceStateTuple = collections.namedtuple(
    'InterfaceStateTuple', ('read_heads', 'write_head'))

class InterfaceStateTuple(_InterfaceStateTuple):
    """
    Tuple used by interface for storing current/past state information.
    """
    __slots__ = ()

class NTMInterface(Module):
    """
    Class realizing interface between controller and memory.
    """

    def __init__(self, params):
        """
        Constructor.

        :param params: Dictionary of parameters.

        """
        # Call constructor of base class.
        super(NTMInterface, self).__init__()

        # Parse parameters.
        # Get hidden state size.
        self.ctrl_hidden_state_size = params['controller']['hidden_state_size']
        # Get memory parameters.
        self.num_memory_content_bits = params['memory']['num_content_bits']
        # Get interface parameters.
        self.interface_shift_size = params['interface']['shift_size']
        assert self.interface_shift_size % 2 != 0, 'Shift size must be an odd number'
        assert self.interface_shift_size > 0, 'Shift size must be > 0'
        self.interface_num_read_heads = params['interface']['num_read_heads']
        assert self.interface_num_read_heads >= 1, \
            "NTM requires at least 1 read head (currently %r)" % self.interface_num_read_heads
        # Check if content-based addressing (CBA) should be used or not.
        self.use_content_based_addressing = params['interface'].get(
            'use_content_based_addressing', True)

        # -------------- READ HEADS -----------------#
        # Number/size of parameters of a single read head:
        if self.use_content_based_addressing:
            # key [MEMORY_CONTENT_BITS] + beta [1] + gate [1] + gamma [1] +
            # + shift kernel size [SHIFT_SIZE]
            # All read params = NUM_HEADS * above (but it's not important here).
            num_read_params = (self.num_memory_content_bits +
                               1 + 1 + 1 + self.interface_shift_size)
            # Dictionary with read parameter sizes - used during slicing.
            self.read_param_locations = self.calculate_param_locations(
                {'query_vector': self.num_memory_content_bits, 'beta': 1,
                 'gate': 1, 'shift': self.interface_shift_size, 'gamma': 1},
                "Read")
            assert num_read_params == self.read_param_locations[-1], \
                "Last location must be equal to number of read params."
        else:
            # gamma [1] + shift kernel size [SHIFT_SIZE]
            # All read params = NUM_HEADS * above (but it's not important here).
            num_read_params = (1 + self.interface_shift_size)
            # Dictionary with read parameter sizes - used during slicing.
            self.read_param_locations = self.calculate_param_locations(
                {'shift': self.interface_shift_size, 'gamma': 1}, "Read")
            assert num_read_params == self.read_param_locations[-1], \
                "Last location must be equal to number of read params."

        # Forward linear layers that generate parameters of read heads.
        self.hidden2read_list = torch.nn.ModuleList()
        for _ in range(self.interface_num_read_heads):
            self.hidden2read_list.append(torch.nn.Linear(
                self.ctrl_hidden_state_size, num_read_params))

        # -------------- WRITE HEAD -----------------#
        # Number/size of write parameters:
        if self.use_content_based_addressing:
            # key [MEMORY_CONTENT_BITS] + beta [1] + gate [1] + gamma [1] +
            # + shift kernel size [SHIFT_SIZE] + erase vector [MEMORY_CONTENT_BITS] +
            # + add vector [MEMORY_CONTENT_BITS]
            num_write_params = 3 * self.num_memory_content_bits + \
                1 + 1 + 1 + self.interface_shift_size
            # Write parameter locations - used during slicing.
            self.write_param_locations = self.calculate_param_locations(
                {'query_vector': self.num_memory_content_bits, 'beta': 1,
                 'gate': 1, 'shift': self.interface_shift_size, 'gamma': 1,
                 'erase_vector': self.num_memory_content_bits,
                 'add_vector': self.num_memory_content_bits},
                "Write")
            assert num_write_params == self.write_param_locations[-1], \
                "Last location must be equal to number of write params."
        else:
            # gamma [1] + shift kernel size [SHIFT_SIZE] +
            # + erase vector [MEMORY_CONTENT_BITS] + add vector [MEMORY_CONTENT_BITS]
            num_write_params = 2 * self.num_memory_content_bits + \
                1 + self.interface_shift_size
            # Write parameter locations - used during slicing.
            self.write_param_locations = self.calculate_param_locations(
                {'shift': self.interface_shift_size, 'gamma': 1,
                 'erase_vector': self.num_memory_content_bits,
                 'add_vector': self.num_memory_content_bits},
                "Write")
            assert num_write_params == self.write_param_locations[-1], \
                "Last location must be equal to number of write params."

        # Forward linear layer that generates parameters of the write head.
        self.hidden2write_params = torch.nn.Linear(
            self.ctrl_hidden_state_size, num_write_params)
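
    # The constructor above expects a nested parameter dictionary. A minimal sketch of
    # its structure (the concrete values below are hypothetical, not defaults taken
    # from the original configuration files):
    #
    #   params = {'controller': {'hidden_state_size': 20},
    #             'memory': {'num_content_bits': 8},
    #             'interface': {'shift_size': 3, 'num_read_heads': 1,
    #                           'use_content_based_addressing': True}}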

    def init_state(self, batch_size, num_memory_addresses):
        """
        Returns 'zero' (initial) state tuple.

        :param batch_size: Size of the batch in given iteration/epoch.
        :param num_memory_addresses: Number of memory addresses.

        :returns: Initial state tuple - object of InterfaceStateTuple class.

        """
        dtype = AppState().dtype

        # Add read head states - one for each read head.
        read_state_tuples = []

        # Initial attention weights [BATCH_SIZE x MEMORY_ADDRESSES x 1]
        # Initialize attention: to address 0.
        zh_attention = torch.zeros(
            batch_size, num_memory_addresses, 1).type(dtype)
        zh_attention[:, 0, 0] = 1

        # Initialize gating to ones.
        init_gating = torch.ones(batch_size, 1, 1).type(dtype)

        # Initialize shift kernel - one-hot at position 1 (no shift for a kernel of size 3).
        init_shift = torch.zeros(
            batch_size, self.interface_shift_size, 1).type(dtype)
        init_shift[:, 1, 0] = 1

        for i in range(self.interface_num_read_heads):
            # Single read head tuple.
            read_ht = HeadStateTuple(
                zh_attention, zh_attention, init_gating, init_shift)
            read_state_tuples.append(read_ht)

        # Single write head tuple.
        write_state_tuple = HeadStateTuple(
            zh_attention, zh_attention, init_gating, init_shift)

        # Return tuple.
        interface_state = InterfaceStateTuple(
            read_state_tuples, write_state_tuple)
        return interface_state
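
    # A minimal sketch of what init_state() produces, assuming `interface` is an
    # already-constructed NTMInterface (batch size and address count are hypothetical):
    #
    #   >>> state = interface.init_state(batch_size=2, num_memory_addresses=4)
    #   >>> state.write_head.attention[0, :, 0]      # one-hot at address 0
    #   tensor([1., 0., 0., 0.])
    #   >>> len(state.read_heads)                    # one HeadStateTuple per read head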

    def forward(self, ctrl_hidden_state_BxH,
                prev_memory_BxAxC, prev_interface_state_tuple):
        """
        Controller forward function.

        :param ctrl_hidden_state_BxH: a Tensor with controller hidden state of size [BATCH_SIZE x HIDDEN_SIZE]
        :param prev_memory_BxAxC: Previous state of the memory [BATCH_SIZE x MEMORY_ADDRESSES x CONTENT_BITS]
        :param prev_interface_state_tuple: Tuple containing previous read and write attention vectors.

        :returns: List of read vectors [BATCH_SIZE x CONTENT_SIZE], updated memory and state tuple (object of InterfaceStateTuple class).

        """
        # Unpack previous cell state.
        (prev_read_state_tuples, prev_write_state_tuple) = prev_interface_state_tuple
        (prev_write_attention_BxAx1, _, _, _) = prev_write_state_tuple
        (prev_read_attentions_BxAx1_H, _, _, _) = zip(*prev_read_state_tuples)

        # !! Execute single step !!

        # Read attentions.
        read_attentions_BxAx1_H = []
        # List of read vectors - with two dimensions! [BATCH_SIZE x CONTENT_SIZE]
        read_vectors_BxC_H = []
        # List of read head tuples - for visualization.
        read_state_tuples = []

        # Read heads.
        for i in range(self.interface_num_read_heads):
            # Calculate parameters of a given read head.
            params_BxP = self.hidden2read_list[i](ctrl_hidden_state_BxH)

            if self.use_content_based_addressing:
                # Split the parameters.
                query_vector_BxC, beta_Bx1, gate_Bx1, shift_BxS, gamma_Bx1 = self.split_params(
                    params_BxP, self.read_param_locations)
                # Update the attention of a given read head.
                read_attention_BxAx1, read_state_tuple = self.update_attention(
                    query_vector_BxC, beta_Bx1, gate_Bx1, shift_BxS, gamma_Bx1,
                    prev_memory_BxAxC, prev_read_attentions_BxAx1_H[i])
            else:
                # Split the parameters.
                shift_BxS, gamma_Bx1 = self.split_params(
                    params_BxP, self.read_param_locations)
                # Update the attention of a given read head
                # (content-based addressing params are not used).
                read_attention_BxAx1, read_state_tuple = self.update_attention(
                    None, None, None, shift_BxS, gamma_Bx1,
                    prev_memory_BxAxC, prev_read_attentions_BxAx1_H[i])

            # Read vector from memory [BATCH_SIZE x CONTENT_BITS].
            read_vector_BxC = self.read_from_memory(
                read_attention_BxAx1, prev_memory_BxAxC)

            # Save read attentions and vectors in a list.
            read_attentions_BxAx1_H.append(read_attention_BxAx1)
            read_vectors_BxC_H.append(read_vector_BxC)
            # We always collect tuples, as we are using e.g. attentions from them.
            read_state_tuples.append(read_state_tuple)

        # Write head operation.
        # Calculate parameters of the write head.
        params_BxP = self.hidden2write_params(ctrl_hidden_state_BxH)

        if self.use_content_based_addressing:
            # Split the parameters.
            query_vector_BxC, beta_Bx1, gate_Bx1, shift_BxS, gamma_Bx1, erase_vector_BxC, add_vector_BxC = self.split_params(
                params_BxP, self.write_param_locations)
            # Update the attention of the write head.
            write_attention_BxAx1, write_state_tuple = self.update_attention(
                query_vector_BxC, beta_Bx1, gate_Bx1, shift_BxS, gamma_Bx1,
                prev_memory_BxAxC, prev_write_attention_BxAx1)
        else:
            # Split the parameters.
            shift_BxS, gamma_Bx1, erase_vector_BxC, add_vector_BxC = self.split_params(
                params_BxP, self.write_param_locations)
            # Update the attention of the write head
            # (content-based addressing params are not used).
            write_attention_BxAx1, write_state_tuple = self.update_attention(
                None, None, None, shift_BxS, gamma_Bx1,
                prev_memory_BxAxC, prev_write_attention_BxAx1)

        # Add 3rd dimension where required and apply non-linear transformations.
        # Note: the original TF implementation did not apply this non-linearity.
        erase_vector_Bx1xC = torch.nn.functional.sigmoid(
            erase_vector_BxC).unsqueeze(1)
        add_vector_Bx1xC = torch.nn.functional.sigmoid(
            add_vector_BxC).unsqueeze(1)

        # Update the memory.
        memory_BxAxC = self.update_memory(
            write_attention_BxAx1, erase_vector_Bx1xC, add_vector_Bx1xC,
            prev_memory_BxAxC)

        # Pack current cell state.
        interface_state_tuple = InterfaceStateTuple(
            read_state_tuples, write_state_tuple)

        # Return read vectors, new memory state and state tuple.
        return read_vectors_BxC_H, memory_BxAxC, interface_state_tuple
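
    # Shapes flowing through a single forward() step (B = batch size, H = controller
    # hidden size, A = memory addresses, C = content bits, P = head parameters) -
    # a summary of the code above, for reference:
    #
    #   ctrl_hidden_state_BxH [B x H] --hidden2read/write--> params_BxP [B x P]
    #   params_BxP --update_attention--> attention [B x A x 1]
    #   attention + memory [B x A x C] --read_from_memory--> read vector [B x C]
    #   attention + erase/add [B x 1 x C] --update_memory--> memory [B x A x C]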

    def calculate_param_locations(self, param_sizes_dict, head_name):
        """
        Calculates locations of parameters that will subsequently be used during parameter splitting.

        :param param_sizes_dict: Dictionary containing parameters along with their sizes (in bits/units).
        :param head_name: Name of head.

        :returns: "Locations" of parameters.

        """
        #logger.debug("{} param sizes dict:\n {}".format(head_name, param_sizes_dict))
        # Create the parameter lengths and store their cumulative sum.
        lengths = np.fromiter(param_sizes_dict.values(), dtype=int)
        # Store "parameter locations" for further usage.
        param_locations = np.cumsum(
            np.insert(lengths, 0, 0), dtype=int).tolist()
        #logger.debug("{} param locations:\n {}".format(head_name, param_locations))

        return param_locations

    def split_params(self, params, locations):
        """
        Splits the parameters into a list on the basis of their locations.
        """
        param_splits = [params[..., locations[i]:locations[i + 1]]
                        for i in range(len(locations) - 1)]
        #logger.debug("Split params:\n {}".format(param_splits))
        return param_splits
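
    # A small sketch of how the two helpers above cooperate, for a hypothetical head
    # with only a shift kernel of size 3 and a gamma parameter (assumes `interface`
    # is an already-constructed NTMInterface):
    #
    #   >>> locations = interface.calculate_param_locations(
    #   ...     {'shift': 3, 'gamma': 1}, "Read")       # -> [0, 3, 4]
    #   >>> params = torch.arange(4.).unsqueeze(0)      # [1 x 4]
    #   >>> shift, gamma = interface.split_params(params, locations)
    #   >>> shift, gamma                                # -> tensor([[0., 1., 2.]]), tensor([[3.]])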

    def update_attention(
            self, query_vector_BxC, beta_Bx1, gate_Bx1, shift_BxS, gamma_Bx1,
            prev_memory_BxAxC, prev_attention_BxAx1):
        """
        Updates the attention weights.

        :param query_vector_BxC: Query used for similarity calculation in content-based addressing [BATCH_SIZE x CONTENT_BITS]
        :param beta_Bx1: Strength parameter used in content-based addressing.
        :param gate_Bx1: Gating parameter interpolating between content-based and previous attention.
        :param shift_BxS: Shift kernel used in location-based addressing [BATCH_SIZE x SHIFT_SIZE]
        :param gamma_Bx1: Sharpening factor.
        :param prev_memory_BxAxC: tensor containing memory before update [BATCH_SIZE x MEMORY_ADDRESSES x CONTENT_BITS]
        :param prev_attention_BxAx1: previous attention vector [BATCH_SIZE x MEMORY_ADDRESSES x 1]

        :returns: attention vector of size [BATCH_SIZE x ADDRESS_SIZE x 1] and head state tuple.

        """
        # Add 3rd dimension where required and apply non-linear transformations.
        # Produce location-addressing params.
        shift_BxSx1 = torch.nn.functional.softmax(
            shift_BxS, dim=1).unsqueeze(2)
        # Gamma - oneplus.
        gamma_Bx1x1 = torch.nn.functional.softplus(gamma_Bx1).unsqueeze(2) + 1

        if self.use_content_based_addressing:
            # Add 3rd dimension where required and apply non-linear transformations.
            # Produce content-addressing params.
            query_vector_Bx1xC = torch.nn.functional.sigmoid(
                query_vector_BxC).unsqueeze(1)
            # Beta - oneplus.
            beta_Bx1x1 = torch.nn.functional.softplus(beta_Bx1).unsqueeze(2) + 1
            # Produce gating param.
            gate_Bx1x1 = torch.nn.functional.sigmoid(gate_Bx1).unsqueeze(2)

            # Content-based addressing.
            content_attention_BxAx1 = self.content_based_addressing(
                query_vector_Bx1xC, beta_Bx1x1, prev_memory_BxAxC)

            # Gating mechanism - choose between new attention from CBA and
            # attention from the previous iteration. [BATCH_SIZE x ADDRESSES x 1]
            attention_after_gating_BxAx1 = gate_Bx1x1 * content_attention_BxAx1 + \
                (torch.ones_like(gate_Bx1x1) - gate_Bx1x1) * prev_attention_BxAx1

            # Location-based addressing.
            location_attention_BxAx1 = self.location_based_addressing(
                attention_after_gating_BxAx1, shift_BxSx1, gamma_Bx1x1)
        else:
            # Location-based addressing ONLY!
            location_attention_BxAx1 = self.location_based_addressing(
                prev_attention_BxAx1, shift_BxSx1, gamma_Bx1x1)
            # Placeholders stored in the head tuple (used only for visualization).
            content_attention_BxAx1 = torch.zeros_like(location_attention_BxAx1)
            gate_Bx1x1 = torch.zeros_like(gamma_Bx1x1)

        head_tuple = HeadStateTuple(
            location_attention_BxAx1, content_attention_BxAx1,
            gate_Bx1x1, shift_BxSx1)

        return location_attention_BxAx1, head_tuple
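
    # The method above follows the NTM addressing pipeline (Graves et al., 2014),
    # with the raw head parameters passed through the sigmoid/oneplus non-linearities
    # shown in the code:
    #
    #   w_c = softmax(beta * cos(k, M))        # content-based addressing
    #   w_g = g * w_c + (1 - g) * w_{t-1}      # interpolation gate
    #   w~  = circ_conv(w_g, s)                # location-based shift
    #   w_t = w~^gamma / sum(w~^gamma)         # sharpening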

    def content_based_addressing(
            self, query_vector_Bx1xC, beta_Bx1x1, prev_memory_BxAxC):
        """
        Computes content-based addressing. Uses query vectors for calculation of similarity.

        :param query_vector_Bx1xC: NTM "key" [BATCH_SIZE x 1 x CONTENT_BITS]
        :param beta_Bx1x1: key strength [BATCH_SIZE x 1 x 1]
        :param prev_memory_BxAxC: tensor containing memory before update [BATCH_SIZE x MEMORY_ADDRESSES x CONTENT_BITS]

        :returns: attention of size [BATCH_SIZE x ADDRESS_SIZE x 1]

        """
        # Normalize query batch - along content.
        norm_query_vector_Bx1xC = torch.nn.functional.normalize(
            query_vector_Bx1xC, p=2, dim=2)

        # Normalize memory - along content.
        norm_memory_BxAxC = torch.nn.functional.normalize(
            prev_memory_BxAxC, p=2, dim=2)

        # Calculate cosine similarity [BATCH_SIZE x MEMORY_ADDRESSES x 1].
        similarity_BxAx1 = torch.matmul(
            norm_memory_BxAxC, torch.transpose(norm_query_vector_Bx1xC, 1, 2))

        # Strengthen the similarity with beta [BATCH_SIZE x MEMORY_ADDRESSES x 1].
        strengthtened_similarity_BxAx1 = torch.matmul(
            similarity_BxAx1, beta_Bx1x1)

        # Calculate attention based on similarity along the "slot dimension"
        # [BATCH_SIZE x MEMORY_ADDRESSES x 1].
        attention_BxAx1 = torch.nn.functional.softmax(
            strengthtened_similarity_BxAx1, dim=1)

        return attention_BxAx1
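
    # A minimal numeric sketch of the addressing above (hypothetical values; assumes
    # an already-constructed `interface`): a key matching one memory row gets the
    # largest weight, and a larger beta pushes the softmax closer to one-hot.
    #
    #   >>> M = torch.tensor([[[1., 0.], [0., 1.]]])     # [B=1 x A=2 x C=2]
    #   >>> k = torch.tensor([[[1., 0.]]])               # key matches address 0
    #   >>> beta = torch.tensor([[[2.]]])
    #   >>> interface.content_based_addressing(k, beta, M)[0, :, 0]
    #   tensor([0.8808, 0.1192])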

    def location_based_addressing(
            self, attention_BxAx1, shift_BxSx1, gamma_Bx1x1):
        """
        Computes location-based addressing, i.e. shifts the head and sharpens the attention.

        :param attention_BxAx1: Current attention [BATCH_SIZE x ADDRESS_SIZE x 1]
        :param shift_BxSx1: soft shift mask (convolutional kernel) [BATCH_SIZE x SHIFT_SIZE x 1]
        :param gamma_Bx1x1: sharpening factor [BATCH_SIZE x 1 x 1]

        :returns: attention vector of size [BATCH_SIZE x ADDRESS_SIZE x 1]

        """
        # 1. Perform circular convolution.
        shifted_attention_BxAx1 = self.circular_convolution(
            attention_BxAx1, shift_BxSx1)

        # 2. Perform sharpening.
        sharpened_attention_BxAx1 = self.sharpening(
            shifted_attention_BxAx1, gamma_Bx1x1)

        return sharpened_attention_BxAx1

    def circular_convolution(self, attention_BxAx1, shift_BxSx1):
        """
        Performs circular convolution, i.e. shifts the attention according to the given shift vector (convolution mask).

        :param attention_BxAx1: Current attention [BATCH_SIZE x ADDRESS_SIZE x 1]
        :param shift_BxSx1: soft shift mask (convolutional kernel) [BATCH_SIZE x SHIFT_SIZE x 1]

        :returns: attention vector of size [BATCH_SIZE x ADDRESS_SIZE x 1]

        """
        def circular_index(idx, num_addr):
            """
            Calculates the index, taking into consideration the number of addresses in memory.

            :param idx: index (single element)
            :param num_addr: number of addresses in memory

            """
            if idx < 0:
                return num_addr + idx
            elif idx >= num_addr:
                return idx - num_addr
            else:
                return idx

        # Get the long-tensor type matching the current device (CPU/GPU).
        dtype = AppState().LongTensor

        # Get number of memory addresses and batch size.
        batch_size = attention_BxAx1.size(0)
        num_addr = attention_BxAx1.size(1)
        shift_size = self.interface_shift_size

        # Create an extended list of indices indicating what elements of the
        # sequence will be where.
        ext_indices_tensor = torch.Tensor(
            [circular_index(shift, num_addr)
             for shift in range(-shift_size // 2 + 1,
                                num_addr + shift_size // 2)]).type(dtype)

        # Use indices for creation of an extended attention vector.
        ext_attention_BxEAx1 = torch.index_select(
            attention_BxAx1, dim=1, index=ext_indices_tensor)

        # Transpose inputs to convolution.
        ext_att_trans_Bx1xEA = torch.transpose(ext_attention_BxEAx1, 1, 2)
        shift_trans_Bx1xS = torch.transpose(shift_BxSx1, 1, 2)

        # Perform convolution for every batch-filter pair.
        tmp_attention_list = []
        for b in range(batch_size):
            tmp_attention_list.append(torch.nn.functional.conv1d(
                ext_att_trans_Bx1xEA.narrow(0, b, 1),
                shift_trans_Bx1xS.narrow(0, b, 1)))
        # Concatenate list into a single tensor.
        shifted_attention_BxAx1 = torch.transpose(
            torch.cat(tmp_attention_list, dim=0), 1, 2)

        return shifted_attention_BxAx1
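
    # Behavior sketch for a shift kernel of size 3 (follows from the extended-index
    # construction above; values hypothetical, assumes an already-constructed
    # `interface`): with attention one-hot at address j, a kernel of [1, 0, 0] moves
    # the weight to address j+1 (wrapping around), [0, 1, 0] leaves it in place, and
    # [0, 0, 1] moves it to address j-1.
    #
    #   >>> w = torch.tensor([[[0.], [1.], [0.], [0.]]])    # one-hot at address 1, A=4
    #   >>> s = torch.tensor([[[1.], [0.], [0.]]])          # shift forward by one
    #   >>> interface.circular_convolution(w, s)[0, :, 0]
    #   tensor([0., 0., 1., 0.])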

    def sharpening(self, attention_BxAx1, gamma_Bx1x1):
        """
        Performs attention sharpening.

        :param attention_BxAx1: Current attention [BATCH_SIZE x ADDRESS_SIZE x 1]
        :param gamma_Bx1x1: sharpening factor [BATCH_SIZE x 1 x 1]

        :returns: attention vector of size [BATCH_SIZE x ADDRESS_SIZE x 1]

        """
        # Power.
        pow_attention_BxAx1 = torch.pow(attention_BxAx1 + 1e-12, gamma_Bx1x1)

        # Normalize along addresses.
        norm_attention_BxAx1 = torch.nn.functional.normalize(
            pow_attention_BxAx1, p=1, dim=1)

        return norm_attention_BxAx1
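
    # Sharpening implements w_t(a) = w(a)^gamma / sum_a' w(a')^gamma (Graves et al., 2014).
    # E.g. for w = [0.6, 0.4] and gamma = 2: [0.36, 0.16] / 0.52 ≈ [0.692, 0.308],
    # i.e. the distribution becomes more peaked as gamma grows.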

    def read_from_memory(self, attention_BxAx1, memory_BxAxC):
        """
        Returns 2D tensor of size [BATCH_SIZE x CONTENT_BITS] storing the vector read from memory given the attention.

        :param attention_BxAx1: Current attention [BATCH_SIZE x ADDRESS_SIZE x 1]
        :param memory_BxAxC: tensor containing memory [BATCH_SIZE x MEMORY_ADDRESSES x CONTENT_BITS]

        :returns: vector read from the memory [BATCH_SIZE x CONTENT_BITS]

        """
        read_vector_Bx1xC = torch.matmul(
            torch.transpose(attention_BxAx1, 1, 2), memory_BxAxC)

        # Return 2D tensor.
        return read_vector_Bx1xC.squeeze(dim=1)
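
    # The read is an attention-weighted sum of memory rows, r = sum_a w(a) * M(a);
    # with a one-hot attention it returns exactly one row (hypothetical values,
    # assumes an already-constructed `interface`):
    #
    #   >>> M = torch.tensor([[[1., 2.], [3., 4.]]])     # [B=1 x A=2 x C=2]
    #   >>> w = torch.tensor([[[0.], [1.]]])             # one-hot at address 1
    #   >>> interface.read_from_memory(w, M)
    #   tensor([[3., 4.]])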

    def update_memory(self, write_attention_BxAx1, erase_vector_Bx1xC,
                      add_vector_Bx1xC, prev_memory_BxAxC):
        """
        Returns 3D tensor of size [BATCH_SIZE x MEMORY_ADDRESSES x CONTENT_BITS] storing the new content of the memory.

        :param write_attention_BxAx1: Current write attention [BATCH_SIZE x ADDRESS_SIZE x 1]
        :param erase_vector_Bx1xC: Erase vector [BATCH_SIZE x 1 x CONTENT_BITS]
        :param add_vector_Bx1xC: Add vector [BATCH_SIZE x 1 x CONTENT_BITS]
        :param prev_memory_BxAxC: tensor containing previous state of the memory [BATCH_SIZE x MEMORY_ADDRESSES x CONTENT_BITS]

        :returns: updated memory [BATCH_SIZE x MEMORY_ADDRESSES x CONTENT_BITS]

        """
        # 1. Calculate the preserved content.
        preserve_content_BxAxC = 1 - \
            torch.matmul(write_attention_BxAx1, erase_vector_Bx1xC)
        # 2. Calculate the added content.
        add_content_BxAxC = torch.matmul(
            write_attention_BxAx1, add_vector_Bx1xC)
        # 3. Update memory.
        memory_BxAxC = prev_memory_BxAxC * preserve_content_BxAxC + add_content_BxAxC

        return memory_BxAxC
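

if __name__ == '__main__':
    # A minimal smoke-test sketch, not part of the original module. The parameter
    # values below are hypothetical, and it assumes AppState() resolves to CPU
    # float tensors by default.
    params = {'controller': {'hidden_state_size': 20},
              'memory': {'num_content_bits': 8},
              'interface': {'shift_size': 3, 'num_read_heads': 2}}
    interface = NTMInterface(params)

    batch_size, num_addresses = 4, 10
    dtype = AppState().dtype
    state = interface.init_state(batch_size, num_addresses)
    memory = torch.zeros(batch_size, num_addresses, 8).type(dtype)
    hidden = torch.randn(batch_size, 20).type(dtype)

    # Single interface step: list of read vectors, updated memory and new state tuple.
    read_vectors, memory, state = interface(hidden, memory, state)
    print("Read vector 0 size:", read_vectors[0].size())    # [4 x 8]
    print("Memory size:", memory.size())                    # [4 x 10 x 8]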