Source code for mmaction.models.backbones.c2d
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple, Union
import torch
import torch.nn as nn
from mmcv.cnn import ConvModule
from mmaction.models.backbones.resnet import ResNet
from mmaction.registry import MODELS
@MODELS.register_module()
class C2D(ResNet):
    """C2D backbone.

    Compared to ResNet-50, a temporal-pool is added after the first
    bottleneck. Detailed structure is kept same as "video-nonlocal-net" repo.
    Please refer to https://github.com/facebookresearch/video-nonlocal-net/blob
    /main/scripts/run_c2d_baseline_400k.sh.
    Please note that there are some improvements compared to "Non-local Neural
    Networks" paper (https://arxiv.org/abs/1711.07971).
    Differences are noted at https://github.com/facebookresearch/video-nonlocal
    -net#modifications-for-improving-speed.

    The convolutional stem and the residual stages are applied frame by frame
    on a ``(N x T, C, H, W)`` layout; the tensor is only reshaped to
    ``(N, C, T, H, W)`` around the two 3D max-pooling layers and for the
    returned features.
    """

    def _make_stem_layer(self) -> None:
        """Construct the stem layers.

        The stem consists of a conv+norm+act module, a spatial-only 3D max
        pooling layer and a temporal-only 3D max pooling layer (the latter is
        applied in ``forward`` after the first residual stage).
        """
        self.conv1 = ConvModule(
            self.in_channels,
            64,
            kernel_size=7,
            stride=2,
            padding=3,
            bias=False,
            conv_cfg=self.conv_cfg,
            norm_cfg=self.norm_cfg,
            act_cfg=self.act_cfg)
        # Pools over H and W only (temporal kernel/stride of 1).
        self.maxpool3d_1 = nn.MaxPool3d(
            kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 0, 0))
        # Pools over T only (spatial kernel/stride of 1).
        self.maxpool3d_2 = nn.MaxPool3d(
            kernel_size=(2, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0))

    def forward(self, x: torch.Tensor) \
            -> Union[torch.Tensor, Tuple[torch.Tensor, ...]]:
        """Defines the computation performed at every call.

        Args:
            x (torch.Tensor): The input data, laid out as
                ``(N, C, T, H, W)``.

        Returns:
            Union[torch.Tensor, Tuple[torch.Tensor, ...]]: The feature(s) of
            the input samples extracted by the backbone, each laid out as
            ``(N, C, T, H, W)``. A single tensor is returned when exactly one
            stage is selected by ``self.out_indices``, otherwise a tuple.
        """
        batches = x.shape[0]

        def _convert_to_2d(x: torch.Tensor) -> torch.Tensor:
            """(N, C, T, H, W) -> (N x T, C, H, W)"""
            x = x.permute((0, 2, 1, 3, 4))
            x = x.reshape(-1, x.shape[2], x.shape[3], x.shape[4])
            return x

        def _convert_to_3d(x: torch.Tensor) -> torch.Tensor:
            """(N x T, C, H, W) -> (N, C, T, H, W)"""
            x = x.reshape(batches, -1, x.shape[1], x.shape[2], x.shape[3])
            x = x.permute((0, 2, 1, 3, 4))
            return x

        # Stem: per-frame conv, then spatial pooling on the 5-D layout.
        x = _convert_to_2d(x)
        x = self.conv1(x)
        x = _convert_to_3d(x)
        x = self.maxpool3d_1(x)
        x = _convert_to_2d(x)

        outs = []
        for i, layer_name in enumerate(self.res_layers):
            res_layer = getattr(self, layer_name)
            x = res_layer(x)
            if i == 0:
                # Temporal pooling right after the first residual stage.
                x = _convert_to_3d(x)
                x = self.maxpool3d_2(x)
                x = _convert_to_2d(x)
            if i in self.out_indices:
                # Keep ``x`` in the 2D layout so the next residual stage
                # still receives (N x T, C, H, W); only the collected
                # output is converted back to (N, C, T, H, W).
                outs.append(_convert_to_3d(x))

        if len(outs) == 1:
            return outs[0]
        return tuple(outs)