Shortcuts

Source code for mmaction.models.backbones.c2d

# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple, Union

import torch
import torch.nn as nn
from mmcv.cnn import ConvModule

from mmaction.models.backbones.resnet import ResNet
from mmaction.registry import MODELS


@MODELS.register_module()
class C2D(ResNet):
    """C2D backbone.

    Compared to ResNet-50, a temporal-pool is added after the first
    residual layer (``res_layers[0]``). Detailed structure is kept same as
    the "video-nonlocal-net" repo. Please refer to
    https://github.com/facebookresearch/video-nonlocal-net/blob/main/scripts/run_c2d_baseline_400k.sh.

    Please note that there are some improvements compared to the
    "Non-local Neural Networks" paper (https://arxiv.org/abs/1711.07971).
    Differences are noted at
    https://github.com/facebookresearch/video-nonlocal-net#modifications-for-improving-speed.
    """

    def _make_stem_layer(self) -> None:
        """Construct the stem: a conv+norm+act module and two pooling layers.

        Creates three attributes used by :meth:`forward`:

        * ``conv1``: a 7x7, stride-2 ``ConvModule`` producing 64 channels.
        * ``maxpool3d_1``: pools only over the last two (spatial) axes
          (kernel 3, stride 2, no padding); the temporal axis is untouched.
        * ``maxpool3d_2``: pools only over the temporal axis
          (kernel 2, stride 2); applied after the first residual layer.
        """
        self.conv1 = ConvModule(
            self.in_channels,
            64,
            kernel_size=7,
            stride=2,
            padding=3,
            bias=False,
            conv_cfg=self.conv_cfg,
            norm_cfg=self.norm_cfg,
            act_cfg=self.act_cfg)
        self.maxpool3d_1 = nn.MaxPool3d(
            kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 0, 0))
        self.maxpool3d_2 = nn.MaxPool3d(
            kernel_size=(2, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0))

    def forward(self, x: torch.Tensor) \
            -> Union[torch.Tensor, Tuple[torch.Tensor, ...]]:
        """Defines the computation performed at every call.

        The input is folded to ``(N x T, C, H, W)`` for ``conv1`` and the
        residual layers, and unfolded to ``(N, C, T, H, W)`` for the 3D
        pooling layers and for every returned feature map.

        Args:
            x (torch.Tensor): The input data, laid out as
                ``(N, C, T, H, W)``.

        Returns:
            Union[torch.Tensor, Tuple[torch.Tensor, ...]]: The features of
            the input samples extracted by the backbone, each laid out as
            ``(N, C, T, H, W)``. A single tensor is returned when exactly
            one stage is selected by ``self.out_indices``; otherwise a
            tuple with one tensor per selected stage.
        """
        # The batch size is needed to undo the (N x T) folding later on.
        batches = x.shape[0]

        def _convert_to_2d(x: torch.Tensor) -> torch.Tensor:
            """(N, C, T, H, W) -> (N x T, C, H, W)"""
            x = x.permute((0, 2, 1, 3, 4))
            x = x.reshape(-1, x.shape[2], x.shape[3], x.shape[4])
            return x

        def _convert_to_3d(x: torch.Tensor) -> torch.Tensor:
            """(N x T, C, H, W) -> (N, C, T, H, W)"""
            x = x.reshape(batches, -1, x.shape[1], x.shape[2], x.shape[3])
            x = x.permute((0, 2, 1, 3, 4))
            return x

        # Stem: per-frame convolution, then spatial-only max pooling.
        x = _convert_to_2d(x)
        x = self.conv1(x)
        x = _convert_to_3d(x)
        x = self.maxpool3d_1(x)
        x = _convert_to_2d(x)

        outs = []
        for i, layer_name in enumerate(self.res_layers):
            res_layer = getattr(self, layer_name)
            x = res_layer(x)
            if i == 0:
                # Temporal pooling right after the first residual layer.
                x = _convert_to_3d(x)
                x = self.maxpool3d_2(x)
                x = _convert_to_2d(x)
            if i in self.out_indices:
                # Keep ``x`` in its folded 2D layout: rebinding it to the 5D
                # tensor here would feed a 5D input to the next residual
                # layer whenever a non-final stage is requested.
                outs.append(_convert_to_3d(x))

        if len(outs) == 1:
            return outs[0]
        return tuple(outs)