# coding=utf-8
# Copyright 2021 The Deeplab2 Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
| """Implements dual path transformer layers proposed in MaX-DeepLab [1]. | |
| Dual-path transformer introduces a global memory path in addition to a CNN path, | |
| allowing bi-directional communication with any CNN layers. | |
| [1] MaX-DeepLab: End-to-End Panoptic Segmentation with Mask Transformers, | |
| CVPR 2021. | |
| Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen. | |
| """ | |
| import tensorflow as tf | |
| from deeplab2.model import utils | |
| from deeplab2.model.layers import activations | |
| from deeplab2.model.layers import convolutions | |


class AttentionOperation(tf.keras.layers.Layer):
  """Computes standard 1D multi-head attention with query, key, and value."""

  def __init__(self,
               name,
               activation,
               transformer_activation,
               bn_layer=tf.keras.layers.BatchNormalization):
    """Initializes an AttentionOperation layer.

    Args:
      name: A string, the name of this layer.
      activation: A string, type of activation function to apply.
      transformer_activation: A string, type of activation function for
        self-attention. Supports 'sigmoid' and 'softmax'.
      bn_layer: An optional tf.keras.layers.Layer that computes the
        normalization (default: tf.keras.layers.BatchNormalization).
    """
    super(AttentionOperation, self).__init__(name=name)
    # batch_norm_similarity has shape [batch, num_heads, num_query, num_key],
    # where num_query and num_key usually equal the height, width, or length,
    # i.e., spatial dimensions, so batch norm is applied to axis=1 only.
    self._batch_norm_similarity = bn_layer(axis=1, name='batch_norm_similarity')
    # batch_norm_retrieved_value is done on shape [batch, num_heads, length,
    # value_channels], which will be reshaped to the output shape [batch,
    # length, value_channels * num_heads], so we apply batch norm on the
    # effective channel dimension -- value_channels * num_heads.
    self._batch_norm_retrieved_value = bn_layer(
        axis=[1, 3], name='batch_norm_retrieved_value')
    self._activation_fn = activations.get_activation(activation)
    self._transformer_activation_fn = activations.get_activation(
        transformer_activation)

  def call(self, inputs, training=False):
    """Performs an AttentionOperation.

    Args:
      inputs: A tuple of (query, key, value), where query is a [batch,
        num_head, query_length, channels] tensor, key is a [batch, num_head,
        key_length, channels] tensor, and value is a [batch, key_length,
        num_head, value_channels] tensor.
      training: A boolean, whether the model is in training mode.

    Returns:
      output: A [batch, query_length, num_head * value_channels] tensor, the
        retrieved value.
    """
    # Decode query, key, and value from inputs.
    query, key, value = inputs
    # Compute attention similarity.
    similarity_logits = tf.einsum('bhld,bhmd->bhlm', query, key)
    similarity_logits = self._batch_norm_similarity(
        similarity_logits, training=training)
    # Apply a transformer attention activation function, e.g. softmax.
    attention_weights = self._transformer_activation_fn(similarity_logits)
    # Retrieve the value content.
    retrieved_value = tf.einsum(
        'bhlm,bmhd->bhld', attention_weights, value)
    retrieved_value = self._batch_norm_retrieved_value(
        retrieved_value, training=training)
    retrieved_value = self._activation_fn(retrieved_value)
    # Reshape the output.
    return utils.transpose_and_reshape_for_attention_operation(
        retrieved_value)
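
# A minimal shape walk-through of AttentionOperation (illustrative note, not
# part of the original file). Assume batch=2, num_head=8, query_length=128,
# key_length=4096, and 16 channels per head for both keys and values:
#   query [2, 8, 128, 16] and key [2, 8, 4096, 16]
#     -> similarity_logits [2, 8, 128, 4096];
#   attention_weights [2, 8, 128, 4096] and value [2, 4096, 8, 16]
#     -> retrieved_value [2, 8, 128, 16];
#   transpose and reshape -> output [2, 128, 128], i.e.
#     [batch, query_length, num_head * value_channels].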


class DualPathTransformerLayer(tf.keras.layers.Layer):
  """Applies a dual path transformer layer, as proposed in MaX-DeepLab [1].

  Dual-path transformer layer takes a pixel space input and a memory space
  input, and performs memory2pixel attention, pixel2memory attention, and
  memory2memory self-attention. Note that the pixel2pixel self-attention or
  convolution in the pixel space is implemented in axial_layers.py and
  axial_blocks.py. Thus, the pixel2pixel operation is not included in this
  DualPathTransformerLayer implementation. Please use this class together with
  a residual block with axial-attention, global-attention, or convolution in
  order to construct the full dual path transformer in the paper.

  [1] MaX-DeepLab: End-to-End Panoptic Segmentation with Mask Transformers,
      CVPR 2021.
        Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen.
  """

  def __init__(self,
               name='dual_path_transformer_layer',
               activation='relu',
               filters=128,
               num_heads=8,
               bottleneck_expansion=2,
               key_expansion=1,
               value_expansion=2,
               feed_forward_network_channels=2048,
               use_memory_self_attention=True,
               use_pixel2memory_feedback_attention=True,
               transformer_activation='softmax',
               bn_layer=tf.keras.layers.BatchNormalization,
               conv_kernel_weight_decay=0.0):
    """Initializes a DualPathTransformerLayer.

    This function implements a dual path transformer layer between a pixel
    space and a memory space, as described in the MaX-DeepLab paper. In this
    dual path transformer, the memory2pixel cross attention and the memory
    self-attention share a single activation, e.g. softmax.

    Reference:
      MaX-DeepLab: "End-to-End Panoptic Segmentation with Mask Transformers",
        CVPR 2021. https://arxiv.org/abs/2012.00759
          Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen.

    Args:
      name: A string, the name of this dual path transformer layer.
      activation: A string, type of activation function to apply.
      filters: An integer, the base number of channels for the layer.
      num_heads: An integer, the number of heads in multi-head attention.
      bottleneck_expansion: A float, the channel expansion ratio for the
        bottleneck.
      key_expansion: A float, the channel expansion ratio for keys.
      value_expansion: A float, the channel expansion ratio for values.
      feed_forward_network_channels: An integer, the number of channels for the
        feed_forward_network. Zero means no feed_forward_network will be
        applied.
      use_memory_self_attention: A boolean, whether to apply the memory space
        self-attention.
      use_pixel2memory_feedback_attention: A boolean, whether to apply the
        pixel2memory feedback attention.
      transformer_activation: A string, type of activation function for
        self-attention. Supports 'sigmoid' and 'softmax'.
      bn_layer: A tf.keras.layers.Layer that computes the normalization
        (default: tf.keras.layers.BatchNormalization).
      conv_kernel_weight_decay: A float, the weight decay for convolution
        kernels.

    Raises:
      ValueError: If filters * key_expansion is not divisible by num_heads.
      ValueError: If filters * value_expansion is not divisible by num_heads.
    """
    super(DualPathTransformerLayer, self).__init__(name=name)
    bottleneck_channels = int(round(filters * bottleneck_expansion))
    total_key_depth = int(round(filters * key_expansion))
    total_value_depth = int(round(filters * value_expansion))
    if total_key_depth % num_heads:
      raise ValueError('Total_key_depth should be divisible by num_heads.')
    if total_value_depth % num_heads:
      raise ValueError('Total_value_depth should be divisible by num_heads.')
    # Compute query key value with one convolution and a batch norm layer.
    # The initialization std is standard transformer initialization (without
    # batch norm), as used in SASA and ViT. In our case, we use batch norm by
    # default, so it does not require careful tuning. If one wants to remove
    # all batch norms in axial attention, this standard initialization should
    # still be good, but a more careful initialization is encouraged.
    initialization_std = bottleneck_channels ** -0.5
    self._memory_conv1_bn_act = convolutions.Conv1D(
        bottleneck_channels, 'memory_conv1_bn_act',
        use_bias=False,
        use_bn=True,
        bn_layer=bn_layer,
        activation=activation,
        conv_kernel_weight_decay=conv_kernel_weight_decay)
    self._pixel_conv1_bn_act = convolutions.Conv1D(
        bottleneck_channels, 'pixel_conv1_bn_act',
        use_bias=False,
        use_bn=True,
        bn_layer=bn_layer,
        activation=activation,
        conv_kernel_weight_decay=conv_kernel_weight_decay)
    # We always compute the query for memory space, since it gathers
    # information from the pixel space and thus cannot be removed. We compute
    # the key and value for memory space only when they are necessary (i.e.
    # either use_memory_self_attention or use_pixel2memory_feedback_attention).
    if use_memory_self_attention or use_pixel2memory_feedback_attention:
      self._memory_qkv_conv_bn = convolutions.Conv1D(
          total_key_depth * 2 + total_value_depth, 'memory_qkv_conv_bn',
          use_bias=False,
          use_bn=True,
          bn_layer=bn_layer,
          activation='none',
          conv_kernel_weight_decay=conv_kernel_weight_decay,
          kernel_initializer=tf.keras.initializers.TruncatedNormal(
              stddev=initialization_std))
    else:
      # Compute memory query only if memory key and value are not used.
      self._memory_query_conv_bn = convolutions.Conv1D(
          total_key_depth, 'memory_query_conv_bn',
          use_bias=False,
          use_bn=True,
          bn_layer=bn_layer,
          activation='none',
          conv_kernel_weight_decay=conv_kernel_weight_decay,
          kernel_initializer=tf.keras.initializers.TruncatedNormal(
              stddev=initialization_std))
    # For the pixel space, we always compute the key and value, since they
    # provide information for the memory space and thus cannot be removed. We
    # compute the query for pixel space only when it is necessary (i.e.
    # use_pixel2memory_feedback_attention is True).
    if use_pixel2memory_feedback_attention:
      self._pixel_qkv_conv_bn = convolutions.Conv1D(
          total_key_depth * 2 + total_value_depth, 'pixel_qkv_conv_bn',
          use_bias=False,
          use_bn=True,
          bn_layer=bn_layer,
          activation='none',
          conv_kernel_weight_decay=conv_kernel_weight_decay,
          kernel_initializer=tf.keras.initializers.TruncatedNormal(
              stddev=initialization_std))
    else:
      self._pixel_kv_conv_bn = convolutions.Conv1D(
          total_key_depth + total_value_depth, 'pixel_kv_conv_bn',
          use_bias=False,
          use_bn=True,
          bn_layer=bn_layer,
          activation='none',
          conv_kernel_weight_decay=conv_kernel_weight_decay,
          kernel_initializer=tf.keras.initializers.TruncatedNormal(
              stddev=initialization_std))
    self._memory_attention = AttentionOperation(
        'memory_attention', activation, transformer_activation,
        bn_layer=bn_layer)
    if use_pixel2memory_feedback_attention:
      self._pixel_attention = AttentionOperation(
          'pixel_attention', activation, transformer_activation,
          bn_layer=bn_layer)
    self._use_memory_self_attention = use_memory_self_attention
    self._use_pixel2memory_feedback_attention = (
        use_pixel2memory_feedback_attention)
    self._total_key_depth = total_key_depth
    self._total_value_depth = total_value_depth
    self._num_heads = num_heads
    self._bn_layer = bn_layer
    self._conv_kernel_weight_decay = conv_kernel_weight_decay
    self._activation = activation
    self._activation_fn = activations.get_activation(activation)
    self._feed_forward_network_channels = feed_forward_network_channels
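
  # Illustrative note (not part of the original file): with the default
  # arguments, filters=128, bottleneck_expansion=2, key_expansion=1,
  # value_expansion=2, and num_heads=8 yield bottleneck_channels=256,
  # total_key_depth=128, and total_value_depth=256, i.e. 16 key channels and
  # 32 value channels per attention head, which satisfies the divisibility
  # checks above.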

  def build(self, input_shape_list):
    pixel_shape, memory_shape = input_shape_list[:2]
    # Here we follow ResNet bottleneck blocks: we apply a batch norm with
    # gamma initialized at zero, followed by drop path and an activation
    # function. Initializing this gamma at zero ensures that at random
    # initialization of the model, the skip connections dominate all residual
    # blocks. In this way, all the skip connections construct an identity
    # mapping that passes the gradients (without any distortion from the
    # randomly initialized blocks) to all residual blocks. This helps training
    # at early epochs.
    # Reference: "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour".
    # https://arxiv.org/abs/1706.02677
    self._memory_conv3_bn = convolutions.Conv1D(
        memory_shape[-1], 'memory_conv3_bn',
        use_bias=False,
        use_bn=True,
        bn_layer=self._bn_layer,
        bn_gamma_initializer='zeros',
        activation='none',
        conv_kernel_weight_decay=self._conv_kernel_weight_decay)
    if self._feed_forward_network_channels > 0:
      self._memory_ffn_conv1_bn_act = convolutions.Conv1D(
          self._feed_forward_network_channels, 'memory_ffn_conv1_bn_act',
          use_bias=False,
          use_bn=True,
          bn_layer=self._bn_layer,
          activation=self._activation,
          conv_kernel_weight_decay=self._conv_kernel_weight_decay)
      # Again, we follow ResNet bottleneck blocks: we apply a batch norm with
      # gamma initialized at zero, followed by drop path and an activation
      # function.
      self._memory_ffn_conv2_bn = convolutions.Conv1D(
          memory_shape[-1], 'memory_ffn_conv2_bn',
          use_bias=False,
          use_bn=True,
          bn_layer=self._bn_layer,
          bn_gamma_initializer='zeros',
          activation='none',
          conv_kernel_weight_decay=self._conv_kernel_weight_decay)
    if self._use_pixel2memory_feedback_attention:
      self._pixel_conv3_bn = convolutions.Conv1D(
          pixel_shape[-1], 'pixel_conv3_bn',
          use_bias=False,
          use_bn=True,
          bn_layer=self._bn_layer,
          bn_gamma_initializer='zeros',
          activation='none',
          conv_kernel_weight_decay=self._conv_kernel_weight_decay)
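
  # Illustrative note (not part of the original file): the output projections
  # created in build() match the input channels (pixel_shape[-1] and
  # memory_shape[-1]), so the residual additions in call() below are
  # shape-compatible, and their zero-initialized batch norm gammas make each
  # residual branch start as an identity mapping.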

  def call(self, inputs):
    """Performs a forward pass.

    We have to define drop_path_masks outside the layer call and pass it into
    the layer call, because recompute_grad (gradient checkpointing) does not
    allow any randomness within the function call. In addition, recompute_grad
    only supports float tensors as inputs. For this reason, the training flag
    should also be passed as a float tensor. For the same reason, we cannot
    support passing drop_path_random_mask as None. Instead, we ask the users to
    pass only the first three tensors when drop path is not used.

    Args:
      inputs: A tuple of 3 or 6 tensors, containing
        pixel_space_input should be a [batch, num_pixel, pixel_space_channels]
          tensor.
        memory_space_input should be a [batch, num_memory,
          memory_space_channels] tensor.
        float_tensor_training should be a float tensor of 0.0 or 1.0, whether
          the model is in training mode.
        (optional) pixel_space_drop_path_mask is a drop path mask tensor of
          shape [batch, 1, 1] for the pixel space.
        (optional) memory_space_attention_drop_path_mask is a drop path mask
          tensor of shape [batch, 1, 1] for the memory space.
        (optional) memory_space_feed_forward_network_drop_path_mask is a drop
          path mask tensor of shape [batch, 1, 1] for the memory space feed
          forward network.

    Returns:
      pixel_space_output: A [batch, num_pixel, pixel_space_channels] tensor.
      activated_pixel_space_output: A [batch, num_pixel, pixel_space_channels]
        tensor, activated pixel_space_output.
      memory_space_output: A [batch, num_memory, memory_space_channels]
        tensor.

    Raises:
      ValueError: If the length of inputs is not 3 or 6.
    """
    if len(inputs) not in (3, 6):
      raise ValueError('The length of inputs should be either 3 or 6.')

    # Unpack the inputs.
    (pixel_space_input, memory_space_input, float_tensor_training,
     pixel_space_drop_path_mask, memory_space_attention_drop_path_mask,
     memory_space_feed_forward_network_drop_path_mask) = (
         utils.pad_sequence_with_none(inputs, target_length=6))

    # Recompute_grad takes only float tensors as inputs. It does not allow
    # bools or boolean tensors. For this reason, we cast training to a float
    # tensor outside this call, and now we cast it back to a boolean tensor.
    training = tf.cast(float_tensor_training, tf.bool)

    # Decode the inputs shapes.
    pixel_shape = pixel_space_input.get_shape().as_list()
    memory_shape = memory_space_input.get_shape().as_list()

    # Similar to the ResNet bottleneck design, we do an input down projection
    # in both the pixel space and the memory space.
    memory_space = self._memory_conv1_bn_act(memory_space_input,
                                             training=training)
    # Pixel space input is not activated.
    pixel_space = self._pixel_conv1_bn_act(
        self._activation_fn(pixel_space_input), training=training)

    if (self._use_memory_self_attention or
        self._use_pixel2memory_feedback_attention):
      memory_space_qkv = self._memory_qkv_conv_bn(memory_space,
                                                  training=training)
      # Split, reshape, and transpose the query, key, and value.
      memory_query, memory_key, memory_value = (
          tf.split(memory_space_qkv, [
              self._total_key_depth, self._total_key_depth,
              self._total_value_depth], axis=-1))
      memory_key = utils.reshape_and_transpose_for_attention_operation(
          memory_key, self._num_heads)
      memory_value = tf.reshape(memory_value, [
          -1, memory_shape[1], self._num_heads,
          self._total_value_depth // self._num_heads])
    else:
      # Compute memory query only if memory key and value are not used.
      memory_query = self._memory_query_conv_bn(memory_space,
                                                training=training)
    # Reshape and transpose the query.
    memory_query = utils.reshape_and_transpose_for_attention_operation(
        memory_query, self._num_heads)

    if self._use_pixel2memory_feedback_attention:
      pixel_space_qkv = self._pixel_qkv_conv_bn(pixel_space,
                                                training=training)
      # Split the query, key, and value.
      pixel_query, pixel_key, pixel_value = tf.split(
          pixel_space_qkv, [
              self._total_key_depth, self._total_key_depth,
              self._total_value_depth], axis=-1)
      pixel_query = utils.reshape_and_transpose_for_attention_operation(
          pixel_query, self._num_heads)
    else:
      pixel_space_kv = self._pixel_kv_conv_bn(pixel_space, training=training)
      # Split the key and the value.
      pixel_key, pixel_value = tf.split(pixel_space_kv, [
          self._total_key_depth, self._total_value_depth], axis=-1)
    # Reshape and transpose the key and the value.
    pixel_key = utils.reshape_and_transpose_for_attention_operation(
        pixel_key, self._num_heads)
    pixel_value = tf.reshape(pixel_value, [
        -1, pixel_shape[1], self._num_heads,
        self._total_value_depth // self._num_heads])

    # Compute memory space attention.
    if not self._use_memory_self_attention:
      # If memory self attention is not used, then only memory2pixel cross
      # attention is used for the memory space. In this case, the key and the
      # value are simply pixel_key and pixel_value.
      memory_attention_key = pixel_key
      memory_attention_value = pixel_value
    else:
      # If we also use memory self attention, the key and the value are the
      # concatenation of keys and values in both the pixel space and the
      # memory space.
      memory_attention_key = tf.concat([pixel_key, memory_key], axis=2)
      memory_attention_value = tf.concat([pixel_value, memory_value], axis=1)

    memory_space = self._memory_attention(
        (memory_query, memory_attention_key, memory_attention_value),
        training=training)
    memory_space = self._memory_conv3_bn(memory_space, training=training)
    if memory_space_attention_drop_path_mask is not None:
      memory_space = memory_space * memory_space_attention_drop_path_mask
    memory_space_output = self._activation_fn(
        memory_space_input + memory_space)

    # Apply an optional feed-forward network to the memory space.
    if self._feed_forward_network_channels > 0:
      memory_space = self._memory_ffn_conv1_bn_act(memory_space_output,
                                                   training=training)
      memory_space = self._memory_ffn_conv2_bn(memory_space,
                                               training=training)
      if memory_space_feed_forward_network_drop_path_mask is not None:
        memory_space = (memory_space *
                        memory_space_feed_forward_network_drop_path_mask)
      memory_space_output = self._activation_fn(
          memory_space_output + memory_space)

    # Compute pixel space attention and the output projection only when
    # pixel2memory_feedback_attention is used.
    if self._use_pixel2memory_feedback_attention:
      pixel_space = self._pixel_attention(
          (pixel_query, memory_key, memory_value), training=training)
      pixel_space = self._pixel_conv3_bn(pixel_space, training=training)
      if pixel_space_drop_path_mask is not None:
        pixel_space = pixel_space * pixel_space_drop_path_mask
      pixel_space_output = pixel_space_input + pixel_space
    else:
      # If pixel2memory_feedback_attention is not used, the pixel_space_input
      # is not changed.
      pixel_space_output = pixel_space_input
    activated_pixel_space_output = self._activation_fn(pixel_space_output)

    # Return the pixel space output and memory space output. Note that we
    # return pixel space output with and without the activation function,
    # because our decoder might use non-activated features.
    return (pixel_space_output,
            activated_pixel_space_output,
            memory_space_output)
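

if __name__ == '__main__':
  # Minimal usage sketch (not part of the original file): builds the layer
  # with its default configuration and runs it on random inputs. The tensor
  # sizes below are arbitrary; the training flag is passed as a float tensor,
  # as required by the recompute_grad-friendly interface described above.
  dual_path_layer = DualPathTransformerLayer()
  pixel_space_input = tf.random.uniform([2, 1024, 128])
  memory_space_input = tf.random.uniform([2, 128, 256])
  float_tensor_training = tf.constant(0.0)  # 0.0 means evaluation mode.
  pixel_out, activated_pixel_out, memory_out = dual_path_layer(
      (pixel_space_input, memory_space_input, float_tensor_training))
  # Expected shapes: [2, 1024, 128], [2, 1024, 128], and [2, 128, 256].
  print(pixel_out.shape, activated_pixel_out.shape, memory_out.shape)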