# Copyright (c) OpenMMLab. All rights reserved.
from typing import Dict, List, Optional, Tuple

import torch
import torch.nn as nn
from mmcv.cnn import ConvModule
from mmcv.ops import DeformConv2d
from mmengine.config import ConfigDict
from mmengine.model import BaseModule
from mmengine.structures import InstanceData
from torch import Tensor

from mmdet.registry import MODELS
from mmdet.utils import InstanceList, OptInstanceList, OptMultiConfig
from ..utils import filter_scores_and_topk, multi_apply
from .anchor_free_head import AnchorFreeHead

INF = 1e8


class FeatureAlign(BaseModule):
    """Feature Align Module.

    Feature Align Module is implemented based on DCN v1. It uses the anchor
    shape prediction rather than the feature map to predict the offsets of
    the deformable conv layer.

    Args:
        in_channels (int): Number of channels in the input feature map.
        out_channels (int): Number of channels in the output feature map.
        kernel_size (int): Size of the convolution kernel. Defaults to 3.
        deform_groups (int): Group number of DCN in the FeatureAlign
            module. Defaults to 4.
        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
            dict], optional): Initialization config dict.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int = 3,
        deform_groups: int = 4,
        init_cfg: OptMultiConfig = dict(
            type='Normal',
            layer='Conv2d',
            std=0.1,
            override=dict(type='Normal', name='conv_adaption', std=0.01))
    ) -> None:
        super().__init__(init_cfg=init_cfg)
        offset_channels = kernel_size * kernel_size * 2
        self.conv_offset = nn.Conv2d(
            4, deform_groups * offset_channels, 1, bias=False)
        self.conv_adaption = DeformConv2d(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            padding=(kernel_size - 1) // 2,
            deform_groups=deform_groups)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x: Tensor, shape: Tensor) -> Tensor:
        """Forward function of feature align module.

        Args:
            x (Tensor): Features from the upstream network.
            shape (Tensor): Exponential of bbox predictions.

        Returns:
            Tensor: The aligned features.
        """
        offset = self.conv_offset(shape)
        x = self.relu(self.conv_adaption(x, offset))
        return x
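
# A minimal shape sketch of ``FeatureAlign`` (not part of the original file;
# the channel sizes are assumptions for illustration). With kernel_size=3 and
# deform_groups=4, ``conv_offset`` maps the 4-channel exp(bbox_pred) to
# 4 * (3 * 3 * 2) = 72 offset channels consumed by ``DeformConv2d``:
#
#   align = FeatureAlign(in_channels=256, out_channels=256)
#   feat = torch.rand(2, 256, 32, 32)   # FPN feature
#   shape = torch.rand(2, 4, 32, 32)    # exp of bbox predictions
#   out = align(feat, shape)            # -> (2, 256, 32, 32)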


@MODELS.register_module()
class FoveaHead(AnchorFreeHead):
    """Detection Head of `FoveaBox: Beyond Anchor-based Object Detector
    <https://arxiv.org/abs/1904.03797>`_.

    Args:
        num_classes (int): Number of categories excluding the background
            category.
        in_channels (int): Number of channels in the input feature map.
        base_edge_list (list[int]): Base edge length for each FPN level,
            used to normalize the bbox targets.
        scale_ranges (list[tuple]): Range of object scales (in pixels)
            assigned to each FPN level.
        sigma (float): The shrink factor that controls the size of the
            positive (fovea) area inside each gt bbox.
        with_deform (bool): Whether to use deformable convolution in the
            cls branch.
        deform_groups (int): Deformable conv group size.
        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
            dict], optional): Initialization config dict.
    """

    def __init__(self,
                 num_classes: int,
                 in_channels: int,
                 base_edge_list: List[int] = (16, 32, 64, 128, 256),
                 scale_ranges: List[tuple] = ((8, 32), (16, 64), (32, 128),
                                              (64, 256), (128, 512)),
                 sigma: float = 0.4,
                 with_deform: bool = False,
                 deform_groups: int = 4,
                 init_cfg: OptMultiConfig = dict(
                     type='Normal',
                     layer='Conv2d',
                     std=0.01,
                     override=dict(
                         type='Normal',
                         name='conv_cls',
                         std=0.01,
                         bias_prob=0.01)),
                 **kwargs) -> None:
        self.base_edge_list = base_edge_list
        self.scale_ranges = scale_ranges
        self.sigma = sigma
        self.with_deform = with_deform
        self.deform_groups = deform_groups
        super().__init__(
            num_classes=num_classes,
            in_channels=in_channels,
            init_cfg=init_cfg,
            **kwargs)

    def _init_layers(self) -> None:
        """Initialize layers of the head."""
        # box branch
        super()._init_reg_convs()
        self.conv_reg = nn.Conv2d(self.feat_channels, 4, 3, padding=1)

        # cls branch
        if not self.with_deform:
            super()._init_cls_convs()
            self.conv_cls = nn.Conv2d(
                self.feat_channels, self.cls_out_channels, 3, padding=1)
        else:
            self.cls_convs = nn.ModuleList()
            self.cls_convs.append(
                ConvModule(
                    self.feat_channels, (self.feat_channels * 4),
                    3,
                    stride=1,
                    padding=1,
                    conv_cfg=self.conv_cfg,
                    norm_cfg=self.norm_cfg,
                    bias=self.norm_cfg is None))
            self.cls_convs.append(
                ConvModule((self.feat_channels * 4), (self.feat_channels * 4),
                           1,
                           stride=1,
                           padding=0,
                           conv_cfg=self.conv_cfg,
                           norm_cfg=self.norm_cfg,
                           bias=self.norm_cfg is None))
            self.feature_adaption = FeatureAlign(
                self.feat_channels,
                self.feat_channels,
                kernel_size=3,
                deform_groups=self.deform_groups)
            self.conv_cls = nn.Conv2d(
                int(self.feat_channels * 4),
                self.cls_out_channels,
                3,
                padding=1)
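
    # Layer layout sketch when ``with_deform=True`` (derived from the code
    # above; feat_channels=256 is an assumed example value). Applied in this
    # order in ``forward_single`` below:
    #   FeatureAlign  : 256 -> 256, offsets predicted from exp(bbox_pred)
    #   3x3 ConvModule: 256 -> 1024
    #   1x1 ConvModule: 1024 -> 1024
    #   3x3 conv_cls  : 1024 -> cls_out_channels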

    def forward_single(self, x: Tensor) -> Tuple[Tensor, Tensor]:
        """Forward features of a single scale level.

        Args:
            x (Tensor): FPN feature maps of the specified stride.

        Returns:
            tuple: scores for each class and bbox predictions of input
            feature maps.
        """
        cls_feat = x
        reg_feat = x
        for reg_layer in self.reg_convs:
            reg_feat = reg_layer(reg_feat)
        bbox_pred = self.conv_reg(reg_feat)
        if self.with_deform:
            cls_feat = self.feature_adaption(cls_feat, bbox_pred.exp())
        for cls_layer in self.cls_convs:
            cls_feat = cls_layer(cls_feat)
        cls_score = self.conv_cls(cls_feat)
        return cls_score, bbox_pred
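
    # Per-level output shape sketch (assumed numbers: num_classes=80, a
    # single-image 32x32 feature map, sigmoid classification so
    # cls_out_channels == num_classes):
    #   cls_score: (1, 80, 32, 32)
    #   bbox_pred: (1, 4, 32, 32) -- raw energies; exp() is applied at
    #                                decode/align time, not here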

    def loss_by_feat(
        self,
        cls_scores: List[Tensor],
        bbox_preds: List[Tensor],
        batch_gt_instances: InstanceList,
        batch_img_metas: List[dict],
        batch_gt_instances_ignore: OptInstanceList = None
    ) -> Dict[str, Tensor]:
        """Calculate the loss based on the features extracted by the detection
        head.

        Args:
            cls_scores (list[Tensor]): Box scores for each scale level,
                each is a 4D-tensor, the channel number is
                num_priors * num_classes.
            bbox_preds (list[Tensor]): Box energies / deltas for each scale
                level, each is a 4D-tensor, the channel number is
                num_priors * 4.
            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
                gt_instance. It usually includes ``bboxes`` and ``labels``
                attributes.
            batch_img_metas (list[dict]): Meta information of each image,
                e.g., image size, scaling factor, etc.
            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
                data that is ignored during training and testing.
                Defaults to None.

        Returns:
            dict[str, Tensor]: A dictionary of loss components.
        """
        assert len(cls_scores) == len(bbox_preds)
        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
        priors = self.prior_generator.grid_priors(
            featmap_sizes,
            dtype=bbox_preds[0].dtype,
            device=bbox_preds[0].device)
        num_imgs = cls_scores[0].size(0)
        flatten_cls_scores = [
            cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels)
            for cls_score in cls_scores
        ]
        flatten_bbox_preds = [
            bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4)
            for bbox_pred in bbox_preds
        ]
        flatten_cls_scores = torch.cat(flatten_cls_scores)
        flatten_bbox_preds = torch.cat(flatten_bbox_preds)
        flatten_labels, flatten_bbox_targets = self.get_targets(
            batch_gt_instances, featmap_sizes, priors)

        # FG cat_id: [0, num_classes - 1], BG cat_id: num_classes
        pos_inds = ((flatten_labels >= 0)
                    & (flatten_labels < self.num_classes)).nonzero().view(-1)
        num_pos = len(pos_inds)

        loss_cls = self.loss_cls(
            flatten_cls_scores, flatten_labels, avg_factor=num_pos + num_imgs)
        if num_pos > 0:
            pos_bbox_preds = flatten_bbox_preds[pos_inds]
            pos_bbox_targets = flatten_bbox_targets[pos_inds]
            pos_weights = pos_bbox_targets.new_ones(pos_bbox_targets.size())
            loss_bbox = self.loss_bbox(
                pos_bbox_preds,
                pos_bbox_targets,
                pos_weights,
                avg_factor=num_pos)
        else:
            loss_bbox = torch.tensor(
                0,
                dtype=flatten_bbox_preds.dtype,
                device=flatten_bbox_preds.device)
        return dict(loss_cls=loss_cls, loss_bbox=loss_bbox)
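
    # Flattening sketch for ``loss_by_feat`` (illustrative numbers): with
    # batch size B=2 and level sizes summing to N locations per image,
    # ``flatten_cls_scores`` is (B * N, cls_out_channels) and
    # ``flatten_bbox_preds`` is (B * N, 4). ``avg_factor=num_pos + num_imgs``
    # keeps the cls loss finite even when an image has no positives.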

    def get_targets(
            self, batch_gt_instances: InstanceList,
            featmap_sizes: List[tuple],
            priors_list: List[Tensor]) -> Tuple[Tensor, Tensor]:
        """Compute regression and classification targets for priors in
        multiple images.

        Args:
            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
                gt_instance. It usually includes ``bboxes`` and ``labels``
                attributes.
            featmap_sizes (list[tuple]): Size tuple of feature maps.
            priors_list (list[Tensor]): Priors list of each fpn level, each
                has shape (num_priors, 2).

        Returns:
            tuple: Flattened targets over all levels and images.

            - flatten_labels (Tensor): Labels of all priors.
            - flatten_bbox_targets (Tensor): BBox targets of all priors.
        """
        label_list, bbox_target_list = multi_apply(
            self._get_targets_single,
            batch_gt_instances,
            featmap_size_list=featmap_sizes,
            priors_list=priors_list)
        flatten_labels = [
            torch.cat([
                labels_level_img.flatten() for labels_level_img in labels_level
            ]) for labels_level in zip(*label_list)
        ]
        flatten_bbox_targets = [
            torch.cat([
                bbox_targets_level_img.reshape(-1, 4)
                for bbox_targets_level_img in bbox_targets_level
            ]) for bbox_targets_level in zip(*bbox_target_list)
        ]
        flatten_labels = torch.cat(flatten_labels)
        flatten_bbox_targets = torch.cat(flatten_bbox_targets)
        return flatten_labels, flatten_bbox_targets
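
    # ``multi_apply`` returns per-image lists of per-level tensors;
    # ``zip(*label_list)`` regroups them per level so each level's targets
    # from all images can be concatenated before the final flatten, e.g.
    # [[img0_lvl0, img0_lvl1], [img1_lvl0, img1_lvl1]] ->
    # [(img0_lvl0, img1_lvl0), (img0_lvl1, img1_lvl1)].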

    def _get_targets_single(self,
                            gt_instances: InstanceData,
                            featmap_size_list: List[tuple] = None,
                            priors_list: List[Tensor] = None) -> tuple:
        """Compute regression and classification targets for a single image.

        Args:
            gt_instances (:obj:`InstanceData`): Ground truth of instance
                annotations. It usually includes ``bboxes`` and ``labels``
                attributes.
            featmap_size_list (list[tuple]): Size tuple of feature maps.
            priors_list (list[Tensor]): Priors of each fpn level, each has
                shape (num_priors, 2).

        Returns:
            tuple:

            - label_list (list[Tensor]): Labels of all anchors in the image.
            - box_target_list (list[Tensor]): BBox targets of all anchors in
              the image.
        """
        gt_bboxes_raw = gt_instances.bboxes
        gt_labels_raw = gt_instances.labels
        gt_areas = torch.sqrt((gt_bboxes_raw[:, 2] - gt_bboxes_raw[:, 0]) *
                              (gt_bboxes_raw[:, 3] - gt_bboxes_raw[:, 1]))
        label_list = []
        bbox_target_list = []
        # for each pyramid, find the cls and box target
        for base_len, (lower_bound, upper_bound), stride, featmap_size, \
            priors in zip(self.base_edge_list, self.scale_ranges,
                          self.strides, featmap_size_list, priors_list):
            # FG cat_id: [0, num_classes - 1], BG cat_id: num_classes
            priors = priors.view(*featmap_size, 2)
            x, y = priors[..., 0], priors[..., 1]
            labels = gt_labels_raw.new_full(featmap_size, self.num_classes)
            bbox_targets = gt_bboxes_raw.new_ones(featmap_size[0],
                                                  featmap_size[1], 4)
            # scale assignment
            hit_indices = ((gt_areas >= lower_bound) &
                           (gt_areas <= upper_bound)).nonzero().flatten()
            if len(hit_indices) == 0:
                label_list.append(labels)
                bbox_target_list.append(torch.log(bbox_targets))
                continue
            _, hit_index_order = torch.sort(-gt_areas[hit_indices])
            hit_indices = hit_indices[hit_index_order]
            gt_bboxes = gt_bboxes_raw[hit_indices, :] / stride
            gt_labels = gt_labels_raw[hit_indices]
            half_w = 0.5 * (gt_bboxes[:, 2] - gt_bboxes[:, 0])
            half_h = 0.5 * (gt_bboxes[:, 3] - gt_bboxes[:, 1])
            # valid fovea area: left, right, top, down
            pos_left = torch.ceil(
                gt_bboxes[:, 0] + (1 - self.sigma) * half_w - 0.5).long(). \
                clamp(0, featmap_size[1] - 1)
            pos_right = torch.floor(
                gt_bboxes[:, 0] + (1 + self.sigma) * half_w - 0.5).long(). \
                clamp(0, featmap_size[1] - 1)
            pos_top = torch.ceil(
                gt_bboxes[:, 1] + (1 - self.sigma) * half_h - 0.5).long(). \
                clamp(0, featmap_size[0] - 1)
            pos_down = torch.floor(
                gt_bboxes[:, 1] + (1 + self.sigma) * half_h - 0.5).long(). \
                clamp(0, featmap_size[0] - 1)
            for px1, py1, px2, py2, label, (gt_x1, gt_y1, gt_x2, gt_y2) in \
                    zip(pos_left, pos_top, pos_right, pos_down, gt_labels,
                        gt_bboxes_raw[hit_indices, :]):
                labels[py1:py2 + 1, px1:px2 + 1] = label
                bbox_targets[py1:py2 + 1, px1:px2 + 1, 0] = \
                    (x[py1:py2 + 1, px1:px2 + 1] - gt_x1) / base_len
                bbox_targets[py1:py2 + 1, px1:px2 + 1, 1] = \
                    (y[py1:py2 + 1, px1:px2 + 1] - gt_y1) / base_len
                bbox_targets[py1:py2 + 1, px1:px2 + 1, 2] = \
                    (gt_x2 - x[py1:py2 + 1, px1:px2 + 1]) / base_len
                bbox_targets[py1:py2 + 1, px1:px2 + 1, 3] = \
                    (gt_y2 - y[py1:py2 + 1, px1:px2 + 1]) / base_len
            bbox_targets = bbox_targets.clamp(min=1. / 16, max=16.)
            label_list.append(labels)
            bbox_target_list.append(torch.log(bbox_targets))
        return label_list, bbox_target_list
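
    # Worked example of the fovea area (assumed numbers): for a gt bbox
    # (32, 32, 96, 96) on a stride-8 level with sigma=0.4, the projected box
    # is (4, 4, 12, 12) and half_w = half_h = 4, so
    #   pos_left = ceil(4 + 0.6 * 4 - 0.5) = 6
    #   pos_right = floor(4 + 1.4 * 4 - 0.5) = 9
    # i.e. only the central 4x4 cells (rows/cols 6..9) of the 8x8 cells the
    # box covers become positives.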

    # Same as base_dense_head/_predict_by_feat_single except self._bbox_decode
    def _predict_by_feat_single(self,
                                cls_score_list: List[Tensor],
                                bbox_pred_list: List[Tensor],
                                score_factor_list: List[Tensor],
                                mlvl_priors: List[Tensor],
                                img_meta: dict,
                                cfg: Optional[ConfigDict] = None,
                                rescale: bool = False,
                                with_nms: bool = True) -> InstanceData:
        """Transform a single image's features extracted from the head into
        bbox results.

        Args:
            cls_score_list (list[Tensor]): Box scores from all scale
                levels of a single image, each item has shape
                (num_priors * num_classes, H, W).
            bbox_pred_list (list[Tensor]): Box energies / deltas from
                all scale levels of a single image, each item has shape
                (num_priors * 4, H, W).
            score_factor_list (list[Tensor]): Score factor from all scale
                levels of a single image, each item has shape
                (num_priors * 1, H, W).
            mlvl_priors (list[Tensor]): Each element in the list is
                the priors of a single level in feature pyramid, has shape
                (num_priors, 2).
            img_meta (dict): Image meta info.
            cfg (ConfigDict, optional): Test / postprocessing
                configuration. If None, test_cfg would be used.
                Defaults to None.
            rescale (bool): If True, return boxes in original image space.
                Defaults to False.
            with_nms (bool): If True, do nms before returning boxes.
                Defaults to True.

        Returns:
            :obj:`InstanceData`: Detection results of each image
            after the post process. Each item usually contains the
            following keys.

            - scores (Tensor): Classification scores, has a shape
              (num_instances, ).
            - labels (Tensor): Labels of bboxes, has a shape
              (num_instances, ).
            - bboxes (Tensor): Has a shape (num_instances, 4),
              the last dimension 4 arranged as (x1, y1, x2, y2).
        """
        cfg = self.test_cfg if cfg is None else cfg
        assert len(cls_score_list) == len(bbox_pred_list)
        img_shape = img_meta['img_shape']
        nms_pre = cfg.get('nms_pre', -1)

        mlvl_bboxes = []
        mlvl_scores = []
        mlvl_labels = []
        for level_idx, (cls_score, bbox_pred, stride, base_len, priors) in \
                enumerate(zip(cls_score_list, bbox_pred_list, self.strides,
                              self.base_edge_list, mlvl_priors)):
            assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
            bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4)
            scores = cls_score.permute(1, 2, 0).reshape(
                -1, self.cls_out_channels).sigmoid()

            # After https://github.com/open-mmlab/mmdetection/pull/6268/,
            # this operation keeps fewer bboxes under the same `nms_pre`.
            # There is no difference in performance for most models. If you
            # find a slight drop in performance, you can set a larger
            # `nms_pre` than before.
            results = filter_scores_and_topk(
                scores, cfg.score_thr, nms_pre,
                dict(bbox_pred=bbox_pred, priors=priors))
            scores, labels, _, filtered_results = results

            bbox_pred = filtered_results['bbox_pred']
            priors = filtered_results['priors']

            bboxes = self._bbox_decode(priors, bbox_pred, base_len, img_shape)

            mlvl_bboxes.append(bboxes)
            mlvl_scores.append(scores)
            mlvl_labels.append(labels)

        results = InstanceData()
        results.bboxes = torch.cat(mlvl_bboxes)
        results.scores = torch.cat(mlvl_scores)
        results.labels = torch.cat(mlvl_labels)

        return self._bbox_post_process(
            results=results,
            cfg=cfg,
            rescale=rescale,
            with_nms=with_nms,
            img_meta=img_meta)
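
    # Per-level filtering sketch: ``filter_scores_and_topk`` thresholds the
    # sigmoid scores by ``cfg.score_thr``, keeps at most ``nms_pre`` of the
    # surviving (score, label) pairs, and returns the matching rows of the
    # tensors passed in the dict, so ``bbox_pred`` and ``priors`` stay
    # aligned with the kept scores before decoding.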

    def _bbox_decode(self, priors: Tensor, bbox_pred: Tensor, base_len: int,
                     max_shape: tuple) -> Tensor:
        """Decode bbox predictions of a single image.

        Args:
            priors (Tensor): Center priors of an image, has shape
                (num_instances, 2).
            bbox_pred (Tensor): Box energies / deltas for all instances,
                has shape (num_instances, 4).
            base_len (int): The base edge length of the current level.
            max_shape (tuple): The (h, w) shape used to clamp the decoded
                bboxes.

        Returns:
            Tensor: Decoded bboxes in (tl_x, tl_y, br_x, br_y) format. Has
            shape (num_instances, 4).
        """
        bbox_pred = bbox_pred.exp()

        y = priors[:, 1]
        x = priors[:, 0]
        x1 = (x - base_len * bbox_pred[:, 0]). \
            clamp(min=0, max=max_shape[1] - 1)
        y1 = (y - base_len * bbox_pred[:, 1]). \
            clamp(min=0, max=max_shape[0] - 1)
        x2 = (x + base_len * bbox_pred[:, 2]). \
            clamp(min=0, max=max_shape[1] - 1)
        y2 = (y + base_len * bbox_pred[:, 3]). \
            clamp(min=0, max=max_shape[0] - 1)
        decoded_bboxes = torch.stack([x1, y1, x2, y2], -1)
        return decoded_bboxes
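
# Decode sketch with assumed numbers: a prior at (100, 100) on a level with
# base_len=64 and exp(bbox_pred) = (0.5, 0.5, 1.0, 1.0) decodes to
#   x1 = 100 - 64 * 0.5 = 68,  y1 = 68
#   x2 = 100 + 64 * 1.0 = 164, y2 = 164
# before clamping to the image shape.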