Merge pull request #165 from stanstarks/master
add multi-label nms for FCOS post process
stanstarks committed Oct 12, 2019
2 parents ebdd88b + 2cd10da commit 0748f1d
Showing 8 changed files with 200 additions and 28 deletions.
136 changes: 136 additions & 0 deletions fcos_core/csrc/cuda/ml_nms.cu
@@ -0,0 +1,136 @@
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>

#include <THC/THC.h>
#include <THC/THCDeviceUtils.cuh>

#include <vector>
#include <iostream>

int const threadsPerBlock = sizeof(unsigned long long) * 8;

__device__ inline float devIoU(float const * const a, float const * const b) {
  // Boxes carrying different labels (element 5) never suppress each other.
  if (a[5] != b[5]) {
    return 0.0;
  }
  float left = max(a[0], b[0]), right = min(a[2], b[2]);
  float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
  float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
  float interS = width * height;
  float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
  float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
  return interS / (Sa + Sb - interS);
}

__global__ void ml_nms_kernel(const int n_boxes, const float nms_overlap_thresh,
                              const float *dev_boxes, unsigned long long *dev_mask) {
  const int row_start = blockIdx.y;
  const int col_start = blockIdx.x;

  // if (row_start > col_start) return;

  const int row_size =
        min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
  const int col_size =
        min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);

  // Stage this column block's boxes (x1, y1, x2, y2, score, label) in shared memory.
  __shared__ float block_boxes[threadsPerBlock * 6];
  if (threadIdx.x < col_size) {
    block_boxes[threadIdx.x * 6 + 0] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 0];
    block_boxes[threadIdx.x * 6 + 1] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 1];
    block_boxes[threadIdx.x * 6 + 2] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 2];
    block_boxes[threadIdx.x * 6 + 3] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 3];
    block_boxes[threadIdx.x * 6 + 4] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 4];
    block_boxes[threadIdx.x * 6 + 5] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 5];
  }
  __syncthreads();

  if (threadIdx.x < row_size) {
    const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
    const float *cur_box = dev_boxes + cur_box_idx * 6;
    int i = 0;
    unsigned long long t = 0;
    int start = 0;
    if (row_start == col_start) {
      start = threadIdx.x + 1;
    }
    // One 64-bit mask per (box, column block): bit i marks that the i-th box of
    // this column block overlaps cur_box above the threshold (same label only).
    for (i = start; i < col_size; i++) {
      if (devIoU(cur_box, block_boxes + i * 6) > nms_overlap_thresh) {
        t |= 1ULL << i;
      }
    }
    const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock);
    dev_mask[cur_box_idx * col_blocks + col_start] = t;
  }
}

// boxes is a N x 6 tensor
at::Tensor ml_nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
  using scalar_t = float;
  AT_ASSERTM(boxes.type().is_cuda(), "boxes must be a CUDA tensor");
  auto scores = boxes.select(1, 4);
  auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
  auto boxes_sorted = boxes.index_select(0, order_t);

  int boxes_num = boxes.size(0);

  const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock);

  scalar_t* boxes_dev = boxes_sorted.data<scalar_t>();

  THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState

  unsigned long long* mask_dev = NULL;
  //THCudaCheck(THCudaMalloc(state, (void**) &mask_dev,
  //                      boxes_num * col_blocks * sizeof(unsigned long long)));

  mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long));

  dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock),
              THCCeilDiv(boxes_num, threadsPerBlock));
  dim3 threads(threadsPerBlock);
  ml_nms_kernel<<<blocks, threads>>>(boxes_num,
                                     nms_overlap_thresh,
                                     boxes_dev,
                                     mask_dev);

  std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
  THCudaCheck(cudaMemcpy(&mask_host[0],
                         mask_dev,
                         sizeof(unsigned long long) * boxes_num * col_blocks,
                         cudaMemcpyDeviceToHost));

  std::vector<unsigned long long> remv(col_blocks);
  memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);

  at::Tensor keep = at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU));
  int64_t* keep_out = keep.data<int64_t>();

  int num_to_keep = 0;
  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / threadsPerBlock;
    int inblock = i % threadsPerBlock;

    if (!(remv[nblock] & (1ULL << inblock))) {
      keep_out[num_to_keep++] = i;
      unsigned long long *p = &mask_host[0] + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv[j] |= p[j];
      }
    }
  }

  THCudaFree(state, mask_dev);
  // TODO improve this part
  return std::get<0>(order_t.index({
                       keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to(
                         order_t.device(), keep.scalar_type())
                     }).sort(0, false));
}
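Editorial note, not part of the commit: the sketch below is a minimal, unoptimized PyTorch reference of the greedy multi-label NMS that the kernel above parallelizes. Boxes are visited in descending score order, and a box only suppresses lower-scored boxes that share its label and exceed the IoU threshold (the a[5] != b[5] early-out in devIoU). The name ml_nms_py is hypothetical; the +1 pixel convention is copied from the kernel.

import torch

def ml_nms_py(boxes, scores, labels, iou_thresh):
    """Greedy multi-label NMS reference (CPU, O(N^2)).

    boxes: (N, 4) xyxy tensor; scores: (N,); labels: (N,).
    Returns kept indices in descending-score order (the CUDA path
    additionally re-sorts the kept indices before returning them).
    """
    order = torch.argsort(scores, descending=True)
    suppressed = torch.zeros(order.numel(), dtype=torch.bool)
    keep = []
    for ii in range(order.numel()):
        if suppressed[ii]:
            continue
        i = order[ii].item()
        keep.append(i)
        for jj in range(ii + 1, order.numel()):
            if suppressed[jj]:
                continue
            j = order[jj].item()
            if labels[j] != labels[i]:
                continue  # different labels never suppress each other
            # IoU with the +1 pixel convention used by devIoU
            lt = torch.max(boxes[i, :2], boxes[j, :2])
            rb = torch.min(boxes[i, 2:], boxes[j, 2:])
            wh = (rb - lt + 1).clamp(min=0)
            inter = wh[0] * wh[1]
            area_i = (boxes[i, 2] - boxes[i, 0] + 1) * (boxes[i, 3] - boxes[i, 1] + 1)
            area_j = (boxes[j, 2] - boxes[j, 0] + 1) * (boxes[j, 3] - boxes[j, 1] + 1)
            if inter / (area_i + area_j - inter) > iou_thresh:
                suppressed[jj] = True
    return torch.tensor(keep, dtype=torch.long)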
1 change: 1 addition & 0 deletions fcos_core/csrc/cuda/vision.h
@@ -56,6 +56,7 @@ at::Tensor ROIPool_backward_cuda(const at::Tensor& grad,
                                 const int width);

at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh);
at::Tensor ml_nms_cuda(const at::Tensor boxes, float nms_overlap_thresh);

int deform_conv_forward_cuda(at::Tensor input, at::Tensor weight,
                             at::Tensor offset, at::Tensor output,
27 changes: 27 additions & 0 deletions fcos_core/csrc/ml_nms.h
@@ -0,0 +1,27 @@
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#pragma once
#include "cpu/vision.h"

#ifdef WITH_CUDA
#include "cuda/vision.h"
#endif


at::Tensor ml_nms(const at::Tensor& dets,
                  const at::Tensor& scores,
                  const at::Tensor& labels,
                  const float threshold) {

  if (dets.type().is_cuda()) {
#ifdef WITH_CUDA
    // TODO raise error if not compiled with CUDA
    if (dets.numel() == 0)
      return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU));
    auto b = at::cat({dets, scores.unsqueeze(1), labels.unsqueeze(1)}, 1);
    return ml_nms_cuda(b, threshold);
#else
    AT_ERROR("Not compiled with GPU support");
#endif
  }
  AT_ERROR("CPU version not implemented");
}
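Editorial note: before dispatching to CUDA, the wrapper's only real work is packing [x1, y1, x2, y2, score, label] into a single N x 6 tensor so each box carries its label into the kernel. A hedged Python equivalent of that packing step (pack_boxes is a hypothetical name, not part of the codebase):

import torch

def pack_boxes(dets, scores, labels):
    # Columns: x1, y1, x2, y2, score, label -- the layout ml_nms_cuda expects.
    return torch.cat([dets, scores.unsqueeze(1), labels.unsqueeze(1)], dim=1)

b = pack_boxes(torch.rand(5, 4) * 100, torch.rand(5), torch.randint(1, 4, (5,)).float())
assert b.shape == (5, 6)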
2 changes: 2 additions & 0 deletions fcos_core/csrc/vision.cpp
@@ -1,5 +1,6 @@
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#include "nms.h"
#include "ml_nms.h"
#include "ROIAlign.h"
#include "ROIPool.h"
#include "SigmoidFocalLoss.h"
@@ -8,6 +9,7 @@

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("nms", &nms, "non-maximum suppression");
m.def("ml_nms", &ml_nms, "multi-label non-maximum suppression");
m.def("roi_align_forward", &ROIAlign_forward, "ROIAlign_forward");
m.def("roi_align_backward", &ROIAlign_backward, "ROIAlign_backward");
m.def("roi_pool_forward", &ROIPool_forward, "ROIPool_forward");
3 changes: 2 additions & 1 deletion fcos_core/layers/__init__.py
@@ -7,7 +7,7 @@
from .misc import ConvTranspose2d
from .misc import BatchNorm2d
from .misc import interpolate
from .nms import nms
from .nms import nms, ml_nms
from .roi_align import ROIAlign
from .roi_align import roi_align
from .roi_pool import ROIPool
@@ -26,6 +26,7 @@

__all__ = [
"nms",
"ml_nms",
"roi_align",
"ROIAlign",
"roi_pool",
Expand Down
1 change: 1 addition & 0 deletions fcos_core/layers/nms.py
@@ -3,5 +3,6 @@
from fcos_core import _C

nms = _C.nms
ml_nms = _C.ml_nms
# nms.__doc__ = """
# This function performs Non-maximum suppresion"""
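Editorial note: a hedged usage sketch of the two bindings side by side, with toy tensors; it assumes the extension was built with CUDA, since ml_nms has no CPU path. Two heavily overlapping boxes with different labels are both kept by ml_nms, while plain nms keeps only the higher-scoring one.

import torch
from fcos_core.layers import nms, ml_nms

boxes = torch.tensor([[10., 10., 60., 60.],
                      [12., 12., 62., 62.]], device="cuda")
scores = torch.tensor([0.9, 0.8], device="cuda")
labels = torch.tensor([1., 2.], device="cuda")

keep_plain = nms(boxes, scores, 0.5)             # expected: only the first box
keep_multi = ml_nms(boxes, scores, labels, 0.5)  # expected: both boxes kept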
30 changes: 3 additions & 27 deletions fcos_core/modeling/rpn/fcos/inference.py
@@ -7,7 +7,7 @@
from fcos_core.modeling.utils import cat
from fcos_core.structures.bounding_box import BoxList
from fcos_core.structures.boxlist_ops import cat_boxlist
from fcos_core.structures.boxlist_ops import boxlist_nms
from fcos_core.structures.boxlist_ops import boxlist_ml_nms
from fcos_core.structures.boxlist_ops import remove_small_boxes


@@ -146,32 +146,8 @@ def select_over_all_levels(self, boxlists):
        num_images = len(boxlists)
        results = []
        for i in range(num_images):
            scores = boxlists[i].get_field("scores")
            labels = boxlists[i].get_field("labels")
            boxes = boxlists[i].bbox
            boxlist = boxlists[i]
            result = []
            # skip the background
            for j in range(1, self.num_classes):
                inds = (labels == j).nonzero().view(-1)

                scores_j = scores[inds]
                boxes_j = boxes[inds, :].view(-1, 4)
                boxlist_for_class = BoxList(boxes_j, boxlist.size, mode="xyxy")
                boxlist_for_class.add_field("scores", scores_j)
                boxlist_for_class = boxlist_nms(
                    boxlist_for_class, self.nms_thresh,
                    score_field="scores"
                )
                num_labels = len(boxlist_for_class)
                boxlist_for_class.add_field(
                    "labels", torch.full((num_labels,), j,
                                         dtype=torch.int64,
                                         device=scores.device)
                )
                result.append(boxlist_for_class)

            result = cat_boxlist(result)
            # multiclass nms
            result = boxlist_ml_nms(boxlists[i], self.nms_thresh)
            number_of_detections = len(result)

            # Limit to max_per_image detections **over all classes**
28 changes: 28 additions & 0 deletions fcos_core/structures/boxlist_ops.py
@@ -4,6 +4,7 @@
from .bounding_box import BoxList

from fcos_core.layers import nms as _box_nms
from fcos_core.layers import ml_nms as _box_ml_nms


def boxlist_nms(boxlist, nms_thresh, max_proposals=-1, score_field="scores"):
@@ -31,6 +32,33 @@ def boxlist_nms(boxlist, nms_thresh, max_proposals=-1, score_field="scores"):
    return boxlist.convert(mode)


def boxlist_ml_nms(boxlist, nms_thresh, max_proposals=-1,
                   score_field="scores", label_field="labels"):
    """
    Performs non-maximum suppression on a boxlist, with scores specified
    in a boxlist field via score_field.
    Arguments:
        boxlist(BoxList)
        nms_thresh (float)
        max_proposals (int): if > 0, then only the top max_proposals are kept
            after non-maximum suppression
        score_field (str)
    """
    if nms_thresh <= 0:
        return boxlist
    mode = boxlist.mode
    boxlist = boxlist.convert("xyxy")
    boxes = boxlist.bbox
    scores = boxlist.get_field(score_field)
    labels = boxlist.get_field(label_field)
    keep = _box_ml_nms(boxes, scores, labels.float(), nms_thresh)
    if max_proposals > 0:
        keep = keep[: max_proposals]
    boxlist = boxlist[keep]
    return boxlist.convert(mode)


def remove_small_boxes(boxlist, min_size):
"""
Only keep boxes with both sides >= min_size
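Editorial note: a hedged sketch of the higher-level helper on a BoxList, with toy values; again assumes a CUDA build. The first two boxes overlap heavily but carry different labels, and the third overlaps neither, so all three survive.

import torch
from fcos_core.structures.bounding_box import BoxList
from fcos_core.structures.boxlist_ops import boxlist_ml_nms

boxes = torch.tensor([[10., 10., 60., 60.],
                      [12., 12., 62., 62.],
                      [200., 200., 260., 260.]], device="cuda")
boxlist = BoxList(boxes, (800, 800), mode="xyxy")
boxlist.add_field("scores", torch.tensor([0.9, 0.8, 0.7], device="cuda"))
boxlist.add_field("labels", torch.tensor([1, 2, 1], device="cuda"))

kept = boxlist_ml_nms(boxlist, nms_thresh=0.6)
assert len(kept) == 3  # cross-label overlap is not suppressed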
