
Exploring the HeadHunter Architecture

The most important function, which creates the backbone, is shown below.

# Imports this snippet relies on (MobileNetV1 and BackBoneWithFPN come from
# the repo's own head_detection/models modules):
from collections import OrderedDict

import torch
import torchvision.models as models

def create_backbone(cfg, use_deform=False,
                    context=None, default_filter=False):
    """Creates backbone """
    in_channels = cfg['in_channel']    # 256
    if cfg['name'] == 'Resnet50':  # This branch is taken: load ResNet-50 with pretrained=True.
        feat_ext = models.resnet50(pretrained=cfg['pretrain'])    # i.e. models.resnet50(True)
        if len(cfg['return_layers']) == 3:
            in_channels_list = [
                in_channels * 2,
                in_channels * 4,
                in_channels * 8,
            ]
        elif len(cfg['return_layers']) == 4:  # This branch is taken
            in_channels_list = [
                    in_channels,
                    in_channels * 2,
                    in_channels * 4,
                    in_channels * 8,
            ]
        else:
            raise ValueError("Not yet ready for 5FPN")
    elif cfg['name'] == 'Resnet152':
        feat_ext = models.resnet152(pretrained=cfg['pretrain'])
        in_channels_list = [
            in_channels,
            in_channels * 2,
            in_channels * 4,
            in_channels * 8,
        ]
    elif cfg['name'] == 'mobilenet0.25':
        feat_ext = MobileNetV1()
        in_channels_list = [
            in_channels * 2,
            in_channels * 4,
            in_channels * 8,
        ]
        if cfg['pretrain']:
            checkpoint = torch.load("./Weights/mobilenetV1X0.25_pretrain.tar",
                                    map_location=torch.device('cpu'))
            new_state_dict = OrderedDict()
            for k, v in checkpoint['state_dict'].items():
                name = k[7:]  # remove module.
                new_state_dict[name] = v
            # load params
            feat_ext.load_state_dict(new_state_dict)
    else:
        raise ValueError("Unsupported backbone")

    print(feat_ext)

    out_channels = cfg['out_channel']
    backbone_with_fpn = BackBoneWithFPN(feat_ext, cfg['return_layers'],
                                        in_channels_list,
                                        out_channels,
                                        context_module=context,
                                        use_deform=use_deform,
                                        default_filter=default_filter)
    return backbone_with_fpn
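
For reference, a call to this function might look like the following. This is only a sketch: the cfg field values and the return_layers format are inferred from the inline comments above, not taken from the repo's actual config file.

cfg = {
    'name': 'Resnet50',
    'pretrain': True,
    'in_channel': 256,
    'out_channel': 256,
    # four return layers -> the 4-level FPN branch above (format assumed)
    'return_layers': {'layer1': '0', 'layer2': '1', 'layer3': '2', 'layer4': '3'},
}
backbone = create_backbone(cfg)   # prints the ResNet and returns a BackBoneWithFPN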

This code uses ResNet-50, so let's check only

feat_ext = models.resnet50(pretrained=cfg['pretrain']).

To see this model's structure,

just print(feat_ext).

The output is as follows.

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (1): Bottleneck(
      (conv1): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
    (2): Bottleneck(
      (conv1): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
  )
  (layer2): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)
        (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (1): Bottleneck(
      (conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
    (2): Bottleneck(
      (conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
    (3): Bottleneck(
      (conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
  )
  (layer3): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(512, 1024, kernel_size=(1, 1), stride=(2, 2), bias=False)
        (1): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (1): Bottleneck(
      (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
    (2): Bottleneck(
      (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
    (3): Bottleneck(
      (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
    (4): Bottleneck(
      (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
    (5): Bottleneck(
      (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
  )
  (layer4): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(1024, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(1024, 2048, kernel_size=(1, 1), stride=(2, 2), bias=False)
        (1): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (1): Bottleneck(
      (conv1): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
    (2): Bottleneck(
      (conv1): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(1, 1))
  (fc): Linear(in_features=2048, out_features=1000, bias=True)
)

The first thing in the printout is the Bottleneck structure.

The Bottleneck structure works as follows.

The original residual block has two 3x3 convolution layers per block, i.e. (3x3)x2 = 18 weight multiplications per output position (per channel pair).

But restructured into the bottleneck form, the number of layers grows, yet the computational cost drops.

 

With a 1x1 convolution, the classical meaning of convolution arguably disappears: in image-processing terms, a 1x1-sized filter is meaningless (beyond being a very simple projection). Still, there are cases where deep learning uses 1x1 convolutions.
 
Namely, to control the number of filters. Saying "1x1 convolution" alone is not meaningful; it only becomes meaningful together with the number of filters (used interchangeably with the number of dimensions or channels).
 
In fact, if the number of input channels and output channels were exactly the same, the convolution would carry little meaning. But if it changes the number of dimensions, the story changes. It can make the dimensionality larger, and also smaller. If you produce fewer dimensions (channels, filters) than you take in, the reduced-dimensional representation cuts the amount of computation dramatically. Once reduced, the downstream computation and parameter counts shrink sharply, so with the same compute and time budget you can design and train a deeper network. That said, simply/blindly shrinking the dimensionality does not work: it must be an appropriate size, leaving enough dimensions for the following layers to learn from. The structure that exploits this well is called the bottleneck.
 
The bottleneck is so named because, true to its name, it takes a >< shape: it narrows with a 1x1 convolution, performs the 3x3 convolution (the operation originally intended), and then deepens the channel dimension again with another 1x1 convolution. What it actually does is reduce dimensionality with a 1x1 convolution before the expensive 3x3 or 5x5 convolution, and only then perform that convolution. It was used in the Inception model to reduce the parameter count while deepening the layers, and it showed an enormous effect in ResNet in particular.

That is the gist. Also, more layers means more activation functions than before, hence more non-linearity, which lets the network transform the input in more varied ways.

Now for the effect of 1x1 convolution, assuming padding keeps the input and output sizes equal: the resulting shape is the same, but the amount of computation differs dramatically.
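
A quick parameter count makes the saving concrete. This is a standalone sketch (weights only, bias=False as in the printout above), comparing a plain two-3x3-conv residual block against a 1x1 -> 3x3 -> 1x1 bottleneck at 256 channels:

import torch.nn as nn

def n_params(m):
    return sum(p.numel() for p in m.parameters())

basic = nn.Sequential(                  # plain residual block: two 3x3 convs
    nn.Conv2d(256, 256, 3, padding=1, bias=False),
    nn.Conv2d(256, 256, 3, padding=1, bias=False),
)
bottleneck = nn.Sequential(             # 1x1 reduce -> 3x3 -> 1x1 expand
    nn.Conv2d(256, 64, 1, bias=False),
    nn.Conv2d(64, 64, 3, padding=1, bias=False),
    nn.Conv2d(64, 256, 1, bias=False),
)
print(n_params(basic))       # 1,179,648
print(n_params(bottleneck))  # 69,632 -- ~17x fewer weights, same output shape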

Feeding in a (3, 1920, 1080) input, each layer produces the shapes below.

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
================================================================
            Conv2d-1         [-1, 64, 960, 540]           9,408
       BatchNorm2d-2         [-1, 64, 960, 540]             128
              ReLU-3         [-1, 64, 960, 540]               0
         MaxPool2d-4         [-1, 64, 480, 270]               0
            Conv2d-5         [-1, 64, 480, 270]           4,096
       BatchNorm2d-6         [-1, 64, 480, 270]             128
              ReLU-7         [-1, 64, 480, 270]               0
            Conv2d-8         [-1, 64, 480, 270]          36,864
       BatchNorm2d-9         [-1, 64, 480, 270]             128
             ReLU-10         [-1, 64, 480, 270]               0
           Conv2d-11        [-1, 256, 480, 270]          16,384
      BatchNorm2d-12        [-1, 256, 480, 270]             512
           Conv2d-13        [-1, 256, 480, 270]          16,384
      BatchNorm2d-14        [-1, 256, 480, 270]             512
             ReLU-15        [-1, 256, 480, 270]               0
       Bottleneck-16        [-1, 256, 480, 270]               0
           Conv2d-17         [-1, 64, 480, 270]          16,384
      BatchNorm2d-18         [-1, 64, 480, 270]             128
             ReLU-19         [-1, 64, 480, 270]               0
           Conv2d-20         [-1, 64, 480, 270]          36,864
      BatchNorm2d-21         [-1, 64, 480, 270]             128
             ReLU-22         [-1, 64, 480, 270]               0
           Conv2d-23        [-1, 256, 480, 270]          16,384
      BatchNorm2d-24        [-1, 256, 480, 270]             512
             ReLU-25        [-1, 256, 480, 270]               0
       Bottleneck-26        [-1, 256, 480, 270]               0
           Conv2d-27         [-1, 64, 480, 270]          16,384
      BatchNorm2d-28         [-1, 64, 480, 270]             128
             ReLU-29         [-1, 64, 480, 270]               0
           Conv2d-30         [-1, 64, 480, 270]          36,864
      BatchNorm2d-31         [-1, 64, 480, 270]             128
             ReLU-32         [-1, 64, 480, 270]               0
           Conv2d-33        [-1, 256, 480, 270]          16,384
      BatchNorm2d-34        [-1, 256, 480, 270]             512
             ReLU-35        [-1, 256, 480, 270]               0
       Bottleneck-36        [-1, 256, 480, 270]               0
           Conv2d-37        [-1, 128, 480, 270]          32,768
      BatchNorm2d-38        [-1, 128, 480, 270]             256
             ReLU-39        [-1, 128, 480, 270]               0
           Conv2d-40        [-1, 128, 240, 135]         147,456
      BatchNorm2d-41        [-1, 128, 240, 135]             256
             ReLU-42        [-1, 128, 240, 135]               0
           Conv2d-43        [-1, 512, 240, 135]          65,536
      BatchNorm2d-44        [-1, 512, 240, 135]           1,024
           Conv2d-45        [-1, 512, 240, 135]         131,072
      BatchNorm2d-46        [-1, 512, 240, 135]           1,024
             ReLU-47        [-1, 512, 240, 135]               0
       Bottleneck-48        [-1, 512, 240, 135]               0
           Conv2d-49        [-1, 128, 240, 135]          65,536
      BatchNorm2d-50        [-1, 128, 240, 135]             256
             ReLU-51        [-1, 128, 240, 135]               0
           Conv2d-52        [-1, 128, 240, 135]         147,456
      BatchNorm2d-53        [-1, 128, 240, 135]             256
             ReLU-54        [-1, 128, 240, 135]               0
           Conv2d-55        [-1, 512, 240, 135]          65,536
      BatchNorm2d-56        [-1, 512, 240, 135]           1,024
             ReLU-57        [-1, 512, 240, 135]               0
       Bottleneck-58        [-1, 512, 240, 135]               0
           Conv2d-59        [-1, 128, 240, 135]          65,536
      BatchNorm2d-60        [-1, 128, 240, 135]             256
             ReLU-61        [-1, 128, 240, 135]               0
           Conv2d-62        [-1, 128, 240, 135]         147,456
      BatchNorm2d-63        [-1, 128, 240, 135]             256
             ReLU-64        [-1, 128, 240, 135]               0
           Conv2d-65        [-1, 512, 240, 135]          65,536
      BatchNorm2d-66        [-1, 512, 240, 135]           1,024
             ReLU-67        [-1, 512, 240, 135]               0
       Bottleneck-68        [-1, 512, 240, 135]               0
           Conv2d-69        [-1, 128, 240, 135]          65,536
      BatchNorm2d-70        [-1, 128, 240, 135]             256
             ReLU-71        [-1, 128, 240, 135]               0
           Conv2d-72        [-1, 128, 240, 135]         147,456
      BatchNorm2d-73        [-1, 128, 240, 135]             256
             ReLU-74        [-1, 128, 240, 135]               0
           Conv2d-75        [-1, 512, 240, 135]          65,536
      BatchNorm2d-76        [-1, 512, 240, 135]           1,024
             ReLU-77        [-1, 512, 240, 135]               0
       Bottleneck-78        [-1, 512, 240, 135]               0
           Conv2d-79        [-1, 256, 240, 135]         131,072
      BatchNorm2d-80        [-1, 256, 240, 135]             512
             ReLU-81        [-1, 256, 240, 135]               0
           Conv2d-82         [-1, 256, 120, 68]         589,824
      BatchNorm2d-83         [-1, 256, 120, 68]             512
             ReLU-84         [-1, 256, 120, 68]               0
           Conv2d-85        [-1, 1024, 120, 68]         262,144
      BatchNorm2d-86        [-1, 1024, 120, 68]           2,048
           Conv2d-87        [-1, 1024, 120, 68]         524,288
      BatchNorm2d-88        [-1, 1024, 120, 68]           2,048
             ReLU-89        [-1, 1024, 120, 68]               0
       Bottleneck-90        [-1, 1024, 120, 68]               0
           Conv2d-91         [-1, 256, 120, 68]         262,144
      BatchNorm2d-92         [-1, 256, 120, 68]             512
             ReLU-93         [-1, 256, 120, 68]               0
           Conv2d-94         [-1, 256, 120, 68]         589,824
      BatchNorm2d-95         [-1, 256, 120, 68]             512
             ReLU-96         [-1, 256, 120, 68]               0
           Conv2d-97        [-1, 1024, 120, 68]         262,144
      BatchNorm2d-98        [-1, 1024, 120, 68]           2,048
             ReLU-99        [-1, 1024, 120, 68]               0
      Bottleneck-100        [-1, 1024, 120, 68]               0
          Conv2d-101         [-1, 256, 120, 68]         262,144
     BatchNorm2d-102         [-1, 256, 120, 68]             512
            ReLU-103         [-1, 256, 120, 68]               0
          Conv2d-104         [-1, 256, 120, 68]         589,824
     BatchNorm2d-105         [-1, 256, 120, 68]             512
            ReLU-106         [-1, 256, 120, 68]               0
          Conv2d-107        [-1, 1024, 120, 68]         262,144
     BatchNorm2d-108        [-1, 1024, 120, 68]           2,048
            ReLU-109        [-1, 1024, 120, 68]               0
      Bottleneck-110        [-1, 1024, 120, 68]               0
          Conv2d-111         [-1, 256, 120, 68]         262,144
     BatchNorm2d-112         [-1, 256, 120, 68]             512
            ReLU-113         [-1, 256, 120, 68]               0
          Conv2d-114         [-1, 256, 120, 68]         589,824
     BatchNorm2d-115         [-1, 256, 120, 68]             512
            ReLU-116         [-1, 256, 120, 68]               0
          Conv2d-117        [-1, 1024, 120, 68]         262,144
     BatchNorm2d-118        [-1, 1024, 120, 68]           2,048
            ReLU-119        [-1, 1024, 120, 68]               0
      Bottleneck-120        [-1, 1024, 120, 68]               0
          Conv2d-121         [-1, 256, 120, 68]         262,144
     BatchNorm2d-122         [-1, 256, 120, 68]             512
            ReLU-123         [-1, 256, 120, 68]               0
          Conv2d-124         [-1, 256, 120, 68]         589,824
     BatchNorm2d-125         [-1, 256, 120, 68]             512
            ReLU-126         [-1, 256, 120, 68]               0
          Conv2d-127        [-1, 1024, 120, 68]         262,144
     BatchNorm2d-128        [-1, 1024, 120, 68]           2,048
            ReLU-129        [-1, 1024, 120, 68]               0
      Bottleneck-130        [-1, 1024, 120, 68]               0
          Conv2d-131         [-1, 256, 120, 68]         262,144
     BatchNorm2d-132         [-1, 256, 120, 68]             512
            ReLU-133         [-1, 256, 120, 68]               0
          Conv2d-134         [-1, 256, 120, 68]         589,824
     BatchNorm2d-135         [-1, 256, 120, 68]             512
            ReLU-136         [-1, 256, 120, 68]               0
          Conv2d-137        [-1, 1024, 120, 68]         262,144
     BatchNorm2d-138        [-1, 1024, 120, 68]           2,048
            ReLU-139        [-1, 1024, 120, 68]               0
      Bottleneck-140        [-1, 1024, 120, 68]               0
          Conv2d-141         [-1, 512, 120, 68]         524,288
     BatchNorm2d-142         [-1, 512, 120, 68]           1,024
            ReLU-143         [-1, 512, 120, 68]               0
          Conv2d-144          [-1, 512, 60, 34]       2,359,296
     BatchNorm2d-145          [-1, 512, 60, 34]           1,024
            ReLU-146          [-1, 512, 60, 34]               0
          Conv2d-147         [-1, 2048, 60, 34]       1,048,576
     BatchNorm2d-148         [-1, 2048, 60, 34]           4,096
          Conv2d-149         [-1, 2048, 60, 34]       2,097,152
     BatchNorm2d-150         [-1, 2048, 60, 34]           4,096
            ReLU-151         [-1, 2048, 60, 34]               0
      Bottleneck-152         [-1, 2048, 60, 34]               0
          Conv2d-153          [-1, 512, 60, 34]       1,048,576
     BatchNorm2d-154          [-1, 512, 60, 34]           1,024
            ReLU-155          [-1, 512, 60, 34]               0
          Conv2d-156          [-1, 512, 60, 34]       2,359,296
     BatchNorm2d-157          [-1, 512, 60, 34]           1,024
            ReLU-158          [-1, 512, 60, 34]               0
          Conv2d-159         [-1, 2048, 60, 34]       1,048,576
     BatchNorm2d-160         [-1, 2048, 60, 34]           4,096
            ReLU-161         [-1, 2048, 60, 34]               0
      Bottleneck-162         [-1, 2048, 60, 34]               0
          Conv2d-163          [-1, 512, 60, 34]       1,048,576
     BatchNorm2d-164          [-1, 512, 60, 34]           1,024
            ReLU-165          [-1, 512, 60, 34]               0
          Conv2d-166          [-1, 512, 60, 34]       2,359,296
     BatchNorm2d-167          [-1, 512, 60, 34]           1,024
            ReLU-168          [-1, 512, 60, 34]               0
          Conv2d-169         [-1, 2048, 60, 34]       1,048,576
     BatchNorm2d-170         [-1, 2048, 60, 34]           4,096
            ReLU-171         [-1, 2048, 60, 34]               0
      Bottleneck-172         [-1, 2048, 60, 34]               0
AdaptiveAvgPool2d-173           [-1, 2048, 1, 1]               0
          Linear-174                 [-1, 1000]       2,049,000
================================================================
Total params: 25,557,032
Trainable params: 25,557,032
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 23.73
Forward/backward pass size (MB): 11862.45
Params size (MB): 97.49
Estimated Total Size (MB): 11983.67
----------------------------------------------------------------
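
The table above was presumably generated with torchsummary (an assumption; the post does not show the call), roughly like this:

import torchvision.models as models
from torchsummary import summary

model = models.resnet50(pretrained=True).cuda()
summary(model, (3, 1920, 1080))   # input_size is (C, H, W)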

Here, the signatures of Conv2d and BatchNorm2d are:

torch.nn.Conv2d(
    in_channels, 
    out_channels, 
    kernel_size, 
    stride=1, 
    padding=0, 
    dilation=1, 
    groups=1, 
    bias=True, 
    padding_mode='zeros'
)

And for BatchNorm2d:

m = nn.BatchNorm2d(num_features)   # num_features = number of channels

"""
num_features (int) – 인풋사이즈인(N, C, H, W)의 예상 C(채널) 수
eps(θ) – 수치 안정성을 위해 분모에 추가되는 값. 기본값: 1e-5
momentum(float) – running_messages 및 running_var 계산에 사용되는 값. 누적 이동 평균(즉, 단순 평균)에 대해 없음으로 설정할 수 있습니다. 기본값: 0.1
affine (boole) – True로 설정하면 이 모듈에 학습 가능한 affine 매개 변수가 있는 부울 값입니다. 기본값: T
track_running_stats (boole) – 이 모듈은 True로 설정하면 런닝 평균과 분산을 추적하고, False로 설정하면 해당 통계를 추적하지 않으며, running_mean 및 running_var 통계 버퍼를 없음으로 초기화하는 부울 값입니다. 이러한 버퍼가 없음이면 이 모듈은 항상 배치 통계를 사용합니다. 훈련 모드와 평가 모드 모두에서. 기본값: 진실의
"""

Looking at def customRCNN, we find the following code:

    # default = True
    if median_anchors:
        anchor_sizes = cfg['anchor_sizes']    # 'anchor_sizes': ((12,), (32,), (64,), (112,), (196,), (256,), (384,), (512,))
        aspect_ratios = cfg['aspect_ratios']  # 'aspect_ratios': ((0.5, 1.0, 1.5),) * 8

        """
        AnchorGenerator - https://tutorials.pytorch.kr/intermediate/torchvision_tutorial.html
        Per that tutorial, this generates anchors at every spatial location, with the
        8 different sizes [12, 32, 64, ...] and the 3 aspect ratios (0.5, 1.0, 1.5),
        i.e. 8x3 anchor shapes.
        On aspect_ratios and anchor_sizes, see https://herbwood.tistory.com/10
        """
        rpn_anchor_generator = AnchorGenerator(anchor_sizes,
                                               aspect_ratios)
        kwargs['rpn_anchor_generator'] = rpn_anchor_generator

The anchor_sizes and aspect_ratios fed to this AnchorGenerator are, in effect, hyperparameters.
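
Here is a minimal sketch of that anchor setup, with the sizes and ratios copied from the cfg comments above (depending on the torchvision version, AnchorGenerator lives in torchvision.models.detection.rpn or .anchor_utils):

from torchvision.models.detection.rpn import AnchorGenerator

anchor_sizes = ((12,), (32,), (64,), (112,), (196,), (256,), (384,), (512,))
aspect_ratios = ((0.5, 1.0, 1.5),) * len(anchor_sizes)  # one tuple per pyramid level
rpn_anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)
# 8 sizes x 3 ratios = 24 anchor shapes overall; 3 anchors per spatial
# location on each of the 8 feature-map levels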

 

This is explained well in this review of the Faster R-CNN paper (Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks): herbwood.tistory.com

 


> /workspace/HeadHunter/head_detection/models/head_detect.py(130)customRCNN()->FasterRCNN(
 ...e)
    )
  )
)
-> return model
(Pdb) next
--Call--
> /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py(444)cuda()
-> def cuda(self: T, device: Optional[Union[int, device]] = None) -> T:
(Pdb) next
> /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py(458)cuda()
-> return self._apply(lambda t: t.cuda(device))

When return model in head_detect.py executes, `next` does not go straight back to the calling function; somewhat unexpectedly,

it steps into the cuda() function of torch/nn/modules/module.py.

There it runs def cuda -> return self._apply(...) and then returns to the caller.

Noting this down just in case.
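
Presumably the call site looks something like this (hypothetical, not verified against test.py; it would explain why pdb steps from return model straight into Module.cuda()):

model = customRCNN(cfg, **kwargs).cuda()   # hypothetical; args elided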


Next, since I could not yet tell exactly at which stage the image enters and gets processed (my knowledge is still limited), I checked by stepping through with pdb:

import pdb

pdb.set_trace()
outputs = model(image)

(Pdb) next   (move to the next line)
(Pdb) step   (step into the next function call)

As a result:

> /workspace/HeadHunter/test.py(355)test()
-> outputs = model(images)   # outputs: the result of feeding images into customRCNN, i.e. a whole batch at once.

--Call--
> /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py(710)_call_impl()
-> def _call_impl(self, *input, **kwargs):

> /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py(711)_call_impl()
-> for hook in itertools.chain(

> /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py(712)_call_impl()
-> _global_forward_pre_hooks.values(),

> /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py(713)_call_impl()
-> self._forward_pre_hooks.values()):

> /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py(711)_call_impl()
-> for hook in itertools.chain(

> /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py(719)_call_impl()
-> if torch._C._get_tracing_state():

> /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py(722)_call_impl()
-> result = self.forward(*input, **kwargs)

--Call--
> /usr/local/lib/python3.8/dist-packages/torchvision/models/detection/generalized_rcnn.py(44)forward()
-> def forward(self, images, targets=None):

> /usr/local/lib/python3.8/dist-packages/torchvision/models/detection/generalized_rcnn.py(58)forward()
-> if self.training and targets is None:

> /usr/local/lib/python3.8/dist-packages/torchvision/models/detection/generalized_rcnn.py(60)forward()
-> if self.training:

> /usr/local/lib/python3.8/dist-packages/torchvision/models/detection/generalized_rcnn.py(73)forward()
-> original_image_sizes = torch.jit.annotate(List[Tuple[int, int]], [])

--Call--
> /usr/lib/python3.8/typing.py(802)__getitem__()
-> def __getitem__(self, params):

> /usr/lib/python3.8/typing.py(803)__getitem__()
-> if self._name != 'Callable' or not self._special:

> /usr/lib/python3.8/typing.py(804)__getitem__()
-> return self.__getitem_inner__(params)

--Call--
> /usr/lib/python3.8/typing.py(255)inner()
-> @functools.wraps(func)

> /usr/lib/python3.8/typing.py(257)inner()
-> try:

> /usr/lib/python3.8/typing.py(258)inner()
-> return cached(*args, **kwds)

--Call--
> /usr/lib/python3.8/typing.py(720)__hash__()
-> def __hash__(self):

> /usr/lib/python3.8/typing.py(721)__hash__()
-> if self.__origin__ is Union:

> /usr/lib/python3.8/typing.py(723)__hash__()
-> return hash((self.__origin__, self.__args__))

--Return--
> /usr/lib/python3.8/typing.py(723)__hash__()->-3768858926959439677
-> return hash((self.__origin__, self.__args__))

--Return--
> /usr/lib/python3.8/typing.py(258)inner()->typing.Tuple[int, int]
-> return cached(*args, **kwds)

--Return--
> /usr/lib/python3.8/typing.py(804)__getitem__()->typing.Tuple[int, int]
-> return self.__getitem_inner__(params)

--Call--
> /usr/lib/python3.8/typing.py(255)inner()
-> @functools.wraps(func)

I obtained the results above.

That is, execution does not enter the declared model directly; it first enters the function _call_impl() in module.py.

It then enters torchvision/models/detection/generalized_rcnn.py, which is precisely

the module imported at line 15 of fast_rcnn.py.

 

Also, at the very end of FasterRCNN's __init__ in fast_rcnn.py there is

transform = GeneralizedRCNNTransform(image_mean, image_std, min_size, max_size)
super(FasterRCNN, self).__init__(backbone, rpn, roi_heads, transform)

and immediately below it sits

class GeneralizedRCNNTransform(nn.Module):

    def __init__(self, image_mean, image_std, min_size, max_size):
    ....

from which we can tell that the actual image processing happens in that class.

That is, GeneralizedRCNN inherits from nn.Module, and FasterRCNN inherits from GeneralizedRCNN.

So when forward runs on a FasterRCNN, it is GeneralizedRCNN's forward that runs.
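
A toy sketch of that dispatch (not the repo's code): calling the child module routes through nn.Module.__call__ / _call_impl to the parent's forward, because the child defines no forward of its own.

import torch.nn as nn

class Generalized(nn.Module):
    def forward(self, x):
        print("Generalized.forward runs")
        return x

class Faster(Generalized):
    pass   # no forward of its own

Faster()(0)   # prints "Generalized.forward runs"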

 

Examining the execution order: when test.py runs outputs = model(images),

it passes through module.py, generalized_rcnn.py, typing.py, check_generic, and so on,

before finally reaching head_detection/models/fast_rcnn.py,

where the forward() of class GeneralizedRCNNTransform runs. Then, curiously,

it next arrives at

the forward() of class BackBoneWithFPN in models/net.py.

An unusual structure indeed.

model = create_model(combined_cfg)
outputs = model(images)

def create_model():                    # test.py
    model = customRCNN(...)
    return model

def customRCNN():                      # models/head_detect.py
    backbone_with_fpn = create_backbone(...)   # = BackBoneWithFPN(...), models/net.py

    """
    class BackBoneWithFPN(nn.Module):
        def __init__(): ...
        def forward(): ...
    """

    model = FasterRCNN(backbone_with_fpn)
    return model


class FasterRCNN(GeneralizedRCNN):     # models/fast_rcnn.py
    transform = GeneralizedRCNNTransform()
    super(FasterRCNN, self).__init__(backbone, rpn, roi_heads, transform)

class GeneralizedRCNN(nn.Module):      # models/fast_rcnn.py
    def __init__():
        super(GeneralizedRCNN, self).__init__()
    def forward(images):               # the images end up going in here

The structure is as above.

Keep in mind that the first line runs __init__ and the second line runs forward.

Also:

> /workspace/HeadHunter/head_detection/models/fast_rcnn.py(180)forward()
-> return image_list, targets
(Pdb) step
--Return--
> /workspace/HeadHunter/head_detection/models/fast_rcnn.py(180)forward()->(<torchvision....x7fdf2857b8b0>, None)
-> return image_list, targets
(Pdb) step
...
--Call--
> /workspace/HeadHunter/head_detection/models/net.py(264)forward()
-> def forward(self, inputs):
(Pdb) step
> /workspace/HeadHunter/head_detection/models/net.py(265)forward()
-> feat_maps = self.body(inputs)

The log is as above.

Presumably, the forward of the class installed as the backbone gets executed automatically.
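
This can be checked with torchvision's stock Faster R-CNN standing in for customRCNN: a forward hook confirms the backbone module runs inside model(images). (API as of the older torchvision versions this repo uses; newer releases take weights= instead of pretrained=.)

import torch
from torchvision.models.detection import fasterrcnn_resnet50_fpn

model = fasterrcnn_resnet50_fpn(pretrained=False).eval()
model.backbone.register_forward_hook(
    lambda mod, inp, out: print("backbone ran, FPN levels:", list(out.keys())))
with torch.no_grad():
    model([torch.randn(3, 300, 400)])
# e.g. backbone ran, FPN levels: ['0', '1', '2', '3', 'pool']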

And this backbone forward appears to be where the real deep-learning computation happens:

    def forward(self, inputs):
        feat_maps = self.body(inputs)    # intermediate ResNet feature maps
        fpn_out = self.fpn(feat_maps)    # FPN: every level mapped to out_channels
        fpn_out = list(fpn_out.values()) if isinstance(fpn_out, dict) else fpn_out
        if self.context_list is not None:
            # optional context modules, one per pyramid level
            context_out = [cont_i(fpn_f) for fpn_f, cont_i in zip(fpn_out, self.context_list)]
        else:
            context_out = fpn_out
        # back to dict
        feat_extracted = OrderedDict([(k, v) for k, v in zip(list(feat_maps.keys()), context_out)])
        return feat_extracted

Putting pdb.set_trace() at its first line, feat_maps = self.body(inputs), shows things like batchnorm.py and ReLU executing.

From fpn_out onward, functions from feature_pyramid_network.py execute.
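
self.body is presumably an IntermediateLayerGetter-style wrapper over the ResNet, which would explain the batchnorm/ReLU calls: it runs the ResNet up through the requested layers and returns them as an OrderedDict. A sketch with torchvision's own helper (return_layers format assumed):

import torch
from torchvision import models
from torchvision.models._utils import IntermediateLayerGetter

resnet = models.resnet50(pretrained=False)
body = IntermediateLayerGetter(
    resnet,
    return_layers={'layer1': '0', 'layer2': '1', 'layer3': '2', 'layer4': '3'})
feat_maps = body(torch.randn(1, 3, 224, 224))
for k, v in feat_maps.items():
    print(k, tuple(v.shape))   # channels 256/512/1024/2048 at strides 4/8/16/32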

 

After return feat_extracted comes

def postprocess of fast_rcnn.py:

> /usr/local/lib/python3.8/dist-packages/torchvision/models/detection/generalized_rcnn.py(100)forward()
-> detections = self.transform.postprocess(detections, images.image_sizes, original_image_sizes)

--Call--
> /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py(758)__getattr__()
-> def __getattr__(self, name: str) -> Union[Tensor, 'Module']:

> /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py(759)__getattr__()
-> if '_parameters' in self.__dict__:

> /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py(760)__getattr__()
-> _parameters = self.__dict__['_parameters']

> /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py(761)__getattr__()
-> if name in _parameters:

> /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py(763)__getattr__()
-> if '_buffers' in self.__dict__:

> /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py(764)__getattr__()
-> _buffers = self.__dict__['_buffers']

> /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py(765)__getattr__()
-> if name in _buffers:

> /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py(767)__getattr__()
-> if '_modules' in self.__dict__:

> /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py(768)__getattr__()
-> modules = self.__dict__['_modules']

> /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py(769)__getattr__()
-> if name in modules:

> /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py(770)__getattr__()
-> return modules[name]

--Return--
> /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py(770)__getattr__()->GeneralizedRCNNTransform()
-> return modules[name]

--Call--
> /workspace/HeadHunter/head_detection/models/fast_rcnn.py(235)postprocess()
-> def postprocess(self, result, image_shapes, original_image_sizes):

As the line /usr/local/lib/python3.8/dist-packages/torchvision/models/detection/generalized_rcnn.py(100)forward() shows, this postprocess appears to be invoked from generalized_rcnn.py's forward().

> /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py(744)_call_impl()->([{'boxes': tensor([[1340...vice='cuda:0'), 'labels': tensor([1], device='cuda:0'), 'scores': tensor([0.869...vice='cuda:0')}], {})
-> return result


Only once this fast_rcnn.py postprocess finishes does outputs = model(images) complete and execution move on to the next line.
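
In essence, GeneralizedRCNNTransform.postprocess rescales the predicted boxes from the internally resized image back to the original image size. A simplified sketch (torchvision's version also handles masks and keypoints):

import torch

def resize_boxes(boxes, from_size, to_size):
    # boxes: Tensor[N, 4] as (x1, y1, x2, y2); sizes are (h, w)
    ratio_h = to_size[0] / from_size[0]
    ratio_w = to_size[1] / from_size[1]
    scale = boxes.new_tensor([ratio_w, ratio_h, ratio_w, ratio_h])
    return boxes * scale

boxes = torch.tensor([[100., 50., 300., 200.]])
print(resize_boxes(boxes, (800, 800), (1080, 1920)))   # back to original scale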
