diff --git a/FastDeploy.cmake.in b/FastDeploy.cmake.in old mode 100755 new mode 100644 index 6f71b2365c..3ad85fb79d --- a/FastDeploy.cmake.in +++ b/FastDeploy.cmake.in @@ -248,6 +248,7 @@ if(WITH_XPU) list(APPEND FASTDEPLOY_LIBS -lpthread -lrt -ldl) endif() + # log lib for Android if(ANDROID) find_library(log-lib log) diff --git a/fastdeploy/vision/visualize/segmentation.cc b/fastdeploy/vision/visualize/segmentation.cc index 37d0f572ef..5fa8c78912 100644 --- a/fastdeploy/vision/visualize/segmentation.cc +++ b/fastdeploy/vision/visualize/segmentation.cc @@ -15,169 +15,13 @@ #ifdef ENABLE_VISION_VISUALIZE #include "fastdeploy/vision/visualize/visualize.h" +#include "fastdeploy/vision/visualize/segmentation_arm.h" #include "opencv2/highgui.hpp" #include "opencv2/imgproc/imgproc.hpp" -#ifdef __ARM_NEON -#include -#endif namespace fastdeploy { namespace vision { -#ifdef __ARM_NEON -static constexpr int VIS_SEG_OMP_NUM_THREADS = 2; - -static inline void QuantizeBlendingWeight8( - float weight, uint8_t* old_multi_factor, uint8_t* new_multi_factor) { - // Quantize the weight to boost blending performance. - // if 0.0 < w <= 1/8, w ~ 1/8=1/(2^3) shift right 3 mul 1, 7 - // if 1/8 < w <= 2/8, w ~ 2/8=1/(2^3) shift right 3 mul 2, 6 - // if 2/8 < w <= 3/8, w ~ 3/8=1/(2^3) shift right 3 mul 3, 5 - // if 3/8 < w <= 4/8, w ~ 4/8=1/(2^3) shift right 3 mul 4, 4 - // Shift factor is always 3, but the mul factor is different. - // Moving 7 bits to the right tends to result in a zero value, - // So, We choose to shift 3 bits to get an approximation. 
- uint8_t weight_quantize = static_cast(weight * 8.0f); - *new_multi_factor = weight_quantize; - *old_multi_factor = (8 - weight_quantize); -} - -static cv::Mat FastVisSegmentationNEON( - const cv::Mat& im, const SegmentationResult& result, - float weight, bool quantize_weight = true) { - int64_t height = result.shape[0]; - int64_t width = result.shape[1]; - auto vis_img = cv::Mat(height, width, CV_8UC3); - - int32_t size = static_cast(height * width); - uint8_t *vis_ptr = static_cast(vis_img.data); - const uint8_t *label_ptr = static_cast(result.label_map.data()); - const uint8_t *im_ptr = static_cast(im.data); - - if (!quantize_weight) { - uint8x16_t zerox16 = vdupq_n_u8(0); - #pragma omp parallel for proc_bind(close) \ - num_threads(VIS_SEG_OMP_NUM_THREADS) schedule(static) - for (int i = 0; i < size - 15; i += 16) { - uint8x16x3_t bgrx16x3 = vld3q_u8(im_ptr + i * 3); // 48 bytes - uint8x16_t labelx16 = vld1q_u8(label_ptr + i); // 16 bytes - uint8x16_t ibx16 = bgrx16x3.val[0]; - uint8x16_t igx16 = bgrx16x3.val[1]; - uint8x16_t irx16 = bgrx16x3.val[2]; - // e.g 0b00000001 << 7 -> 0b10000000 128; - uint8x16_t mbx16 = vshlq_n_u8(labelx16, 7); - uint8x16_t mgx16 = vshlq_n_u8(labelx16, 4); - uint8x16_t mrx16 = vshlq_n_u8(labelx16, 3); - uint8x16x3_t vbgrx16x3; - // Keep the pixels of input im if mask = 0 - uint8x16_t cezx16 = vceqq_u8(labelx16, zerox16); - vbgrx16x3.val[0] = vorrq_u8(vandq_u8(cezx16, ibx16), mbx16); - vbgrx16x3.val[1] = vorrq_u8(vandq_u8(cezx16, igx16), mgx16); - vbgrx16x3.val[2] = vorrq_u8(vandq_u8(cezx16, irx16), mrx16); - vst3q_u8(vis_ptr + i * 3, vbgrx16x3); - } - for (int i = size - 15; i < size; i++) { - uint8_t label = label_ptr[i]; - vis_ptr[i * 3 + 0] = (label << 7); - vis_ptr[i * 3 + 1] = (label << 4); - vis_ptr[i * 3 + 2] = (label << 3); - } - // Blend the colors use OpenCV - cv::addWeighted(im, 1.0 - weight, vis_img, weight, 0, vis_img); - return vis_img; - } - - // Quantize the weight to boost blending performance. 
- // After that, we can directly use shift instructions - // to blend the colors from input im and mask. Please - // check QuantizeBlendingWeight8 for more details. - uint8_t old_multi_factor, new_multi_factor; - QuantizeBlendingWeight8(weight, &old_multi_factor, - &new_multi_factor); - if (new_multi_factor == 0) { - return im; // Only keep origin image. - } - - if (new_multi_factor == 8) { - // Only keep mask, no need to blending with origin image. - #pragma omp parallel for proc_bind(close) \ - num_threads(VIS_SEG_OMP_NUM_THREADS) schedule(static) - for (int i = 0; i < size - 15; i += 16) { - uint8x16_t labelx16 = vld1q_u8(label_ptr + i); // 16 bytes - // e.g 0b00000001 << 7 -> 0b10000000 128; - uint8x16_t mbx16 = vshlq_n_u8(labelx16, 7); - uint8x16_t mgx16 = vshlq_n_u8(labelx16, 4); - uint8x16_t mrx16 = vshlq_n_u8(labelx16, 3); - uint8x16x3_t vbgr16x3; - vbgr16x3.val[0] = mbx16; - vbgr16x3.val[1] = mgx16; - vbgr16x3.val[2] = mrx16; - vst3q_u8(vis_ptr + i * 3, vbgr16x3); - } - for (int i = size - 15; i < size; i++) { - uint8_t label = label_ptr[i]; - vis_ptr[i * 3 + 0] = (label << 7); - vis_ptr[i * 3 + 1] = (label << 4); - vis_ptr[i * 3 + 2] = (label << 3); - } - return vis_img; - } - - uint8x16_t zerox16 = vdupq_n_u8(0); - uint8x16_t old_fx16 = vdupq_n_u8(old_multi_factor); - uint8x16_t new_fx16 = vdupq_n_u8(new_multi_factor); - // Blend the two colors together with quantize 'weight'. 
- #pragma omp parallel for proc_bind(close) \ - num_threads(VIS_SEG_OMP_NUM_THREADS) schedule(static) - for (int i = 0; i < size - 15; i += 16) { - uint8x16x3_t bgrx16x3 = vld3q_u8(im_ptr + i * 3); // 48 bytes - uint8x16_t labelx16 = vld1q_u8(label_ptr + i); // 16 bytes - uint8x16_t ibx16 = bgrx16x3.val[0]; - uint8x16_t igx16 = bgrx16x3.val[1]; - uint8x16_t irx16 = bgrx16x3.val[2]; - // e.g 0b00000001 << 7 -> 0b10000000 128; - uint8x16_t mbx16 = vshlq_n_u8(labelx16, 7); - uint8x16_t mgx16 = vshlq_n_u8(labelx16, 4); - uint8x16_t mrx16 = vshlq_n_u8(labelx16, 3); - // Moving 7 bits to the right tends to result in zero, - // So, We choose to shift 3 bits to get an approximation - uint8x16_t ibx16_mshr = vmulq_u8(vshrq_n_u8(ibx16, 3), old_fx16); - uint8x16_t igx16_mshr = vmulq_u8(vshrq_n_u8(igx16, 3), old_fx16); - uint8x16_t irx16_mshr = vmulq_u8(vshrq_n_u8(irx16, 3), old_fx16); - uint8x16_t mbx16_mshr = vmulq_u8(vshrq_n_u8(mbx16, 3), new_fx16); - uint8x16_t mgx16_mshr = vmulq_u8(vshrq_n_u8(mgx16, 3), new_fx16); - uint8x16_t mrx16_mshr = vmulq_u8(vshrq_n_u8(mrx16, 3), new_fx16); - uint8x16_t qbx16 = vqaddq_u8(ibx16_mshr, mbx16_mshr); - uint8x16_t qgx16 = vqaddq_u8(igx16_mshr, mgx16_mshr); - uint8x16_t qrx16 = vqaddq_u8(irx16_mshr, mrx16_mshr); - // Keep the pixels of input im if label = 0 (means mask = 0) - uint8x16_t cezx16 = vceqq_u8(labelx16, zerox16); - uint8x16_t abx16 = vandq_u8(cezx16, ibx16); - uint8x16_t agx16 = vandq_u8(cezx16, igx16); - uint8x16_t arx16 = vandq_u8(cezx16, irx16); - uint8x16x3_t vbgr16x3; - // Reset qx values to 0 if label is 0, then, keep mask values - // if label is not 0 - uint8x16_t ncezx16 = vmvnq_u8(cezx16); - vbgr16x3.val[0] = vorrq_u8(abx16, vandq_u8(ncezx16, qbx16)); - vbgr16x3.val[1] = vorrq_u8(agx16, vandq_u8(ncezx16, qgx16)); - vbgr16x3.val[2] = vorrq_u8(arx16, vandq_u8(ncezx16, qrx16)); - // Store the blended pixels to vis img - vst3q_u8(vis_ptr + i * 3, vbgr16x3); - } - for (int i = size - 15; i < size; i++) { - uint8_t label = 
label_ptr[i]; - vis_ptr[i * 3 + 0] = (im_ptr[i * 3 + 0] >> 3) * old_multi_factor - + ((label << 7) >> 3) * new_multi_factor; - vis_ptr[i * 3 + 1] = (im_ptr[i * 3 + 1] >> 3) * old_multi_factor - + ((label << 4) >> 3) * new_multi_factor; - vis_ptr[i * 3 + 2] = (im_ptr[i * 3 + 2] >> 3) * old_multi_factor - + ((label << 3) >> 3) * new_multi_factor; - } - return vis_img; -} -#endif - static cv::Mat VisSegmentationCommonCpu( const cv::Mat& im, const SegmentationResult& result, float weight) { @@ -210,7 +54,7 @@ cv::Mat VisSegmentation(const cv::Mat& im, const SegmentationResult& result, float weight) { // TODO: Support SSE/AVX on x86_64 platforms #ifdef __ARM_NEON - return FastVisSegmentationNEON(im, result, weight, true); + return VisSegmentationNEON(im, result, weight, true); #else return VisSegmentationCommonCpu(im, result, weight); #endif @@ -223,7 +67,7 @@ cv::Mat Visualize::VisSegmentation(const cv::Mat& im, "function instead." << std::endl; #ifdef __ARM_NEON - return FastVisSegmentationNEON(im, result, 0.5f, true); + return VisSegmentationNEON(im, result, 0.5f, true); #else return VisSegmentationCommonCpu(im, result, 0.5f); #endif diff --git a/fastdeploy/vision/visualize/segmentation_arm.cc b/fastdeploy/vision/visualize/segmentation_arm.cc new file mode 100644 index 0000000000..154883f77d --- /dev/null +++ b/fastdeploy/vision/visualize/segmentation_arm.cc @@ -0,0 +1,181 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifdef ENABLE_VISION_VISUALIZE + +#include "fastdeploy/vision/visualize/segmentation_arm.h" +#ifdef __ARM_NEON +#include +#endif + +namespace fastdeploy { +namespace vision { + +static constexpr int _OMP_THREADS = 2; + +static inline void QuantizeBlendingWeight8( + float weight, uint8_t* old_multi_factor, uint8_t* new_multi_factor) { + // Quantize the weight to boost blending performance. + // if 0.0 < w <= 1/8, w ~ 1/8=1/(2^3) shift right 3 mul 1, 7 + // if 1/8 < w <= 2/8, w ~ 2/8=1/(2^3) shift right 3 mul 2, 6 + // if 2/8 < w <= 3/8, w ~ 3/8=1/(2^3) shift right 3 mul 3, 5 + // if 3/8 < w <= 4/8, w ~ 4/8=1/(2^3) shift right 3 mul 4, 4 + // Shift factor is always 3, but the mul factor is different. + // Moving 7 bits to the right tends to result in a zero value, + // So, We choose to shift 3 bits to get an approximation. + uint8_t weight_quantize = static_cast(weight * 8.0f); + *new_multi_factor = weight_quantize; + *old_multi_factor = (8 - weight_quantize); +} + +cv::Mat VisSegmentationNEON( + const cv::Mat& im, const SegmentationResult& result, + float weight, bool quantize_weight) { +#ifndef __ARM_NEON + FDASSERT(false, "FastDeploy was not compiled with Arm NEON support!") +#else + int64_t height = result.shape[0]; + int64_t width = result.shape[1]; + auto vis_img = cv::Mat(height, width, CV_8UC3); + + int32_t size = static_cast(height * width); + uint8_t *vis_ptr = static_cast(vis_img.data); + const uint8_t *label_ptr = static_cast(result.label_map.data()); + const uint8_t *im_ptr = static_cast(im.data); + + if (!quantize_weight) { + uint8x16_t zerox16 = vdupq_n_u8(0); + #pragma omp parallel for proc_bind(close) num_threads(_OMP_THREADS) + for (int i = 0; i < size - 15; i += 16) { + uint8x16x3_t bgrx16x3 = vld3q_u8(im_ptr + i * 3); // 48 bytes + uint8x16_t labelx16 = vld1q_u8(label_ptr + i); // 16 bytes + uint8x16_t ibx16 = bgrx16x3.val[0]; + uint8x16_t 
igx16 = bgrx16x3.val[1]; + uint8x16_t irx16 = bgrx16x3.val[2]; + // e.g 0b00000001 << 7 -> 0b10000000 128; + uint8x16_t mbx16 = vshlq_n_u8(labelx16, 7); + uint8x16_t mgx16 = vshlq_n_u8(labelx16, 4); + uint8x16_t mrx16 = vshlq_n_u8(labelx16, 3); + uint8x16x3_t vbgrx16x3; + // Keep the pixels of input im if mask = 0 + uint8x16_t cezx16 = vceqq_u8(labelx16, zerox16); + vbgrx16x3.val[0] = vorrq_u8(vandq_u8(cezx16, ibx16), mbx16); + vbgrx16x3.val[1] = vorrq_u8(vandq_u8(cezx16, igx16), mgx16); + vbgrx16x3.val[2] = vorrq_u8(vandq_u8(cezx16, irx16), mrx16); + vst3q_u8(vis_ptr + i * 3, vbgrx16x3); + } + for (int i = size - 15; i < size; i++) { + uint8_t label = label_ptr[i]; + vis_ptr[i * 3 + 0] = (label << 7); + vis_ptr[i * 3 + 1] = (label << 4); + vis_ptr[i * 3 + 2] = (label << 3); + } + // Blend the colors use OpenCV + cv::addWeighted(im, 1.0 - weight, vis_img, weight, 0, vis_img); + return vis_img; + } + + // Quantize the weight to boost blending performance. + // After that, we can directly use shift instructions + // to blend the colors from input im and mask. Please + // check QuantizeBlendingWeight8 for more details. + uint8_t old_multi_factor, new_multi_factor; + QuantizeBlendingWeight8(weight, &old_multi_factor, + &new_multi_factor); + if (new_multi_factor == 0) { + return im; // Only keep origin image. + } + + if (new_multi_factor == 8) { + // Only keep mask, no need to blending with origin image. 
+ #pragma omp parallel for proc_bind(close) num_threads(_OMP_THREADS) + for (int i = 0; i < size - 15; i += 16) { + uint8x16_t labelx16 = vld1q_u8(label_ptr + i); // 16 bytes + // e.g 0b00000001 << 7 -> 0b10000000 128; + uint8x16_t mbx16 = vshlq_n_u8(labelx16, 7); + uint8x16_t mgx16 = vshlq_n_u8(labelx16, 4); + uint8x16_t mrx16 = vshlq_n_u8(labelx16, 3); + uint8x16x3_t vbgr16x3; + vbgr16x3.val[0] = mbx16; + vbgr16x3.val[1] = mgx16; + vbgr16x3.val[2] = mrx16; + vst3q_u8(vis_ptr + i * 3, vbgr16x3); + } + for (int i = size - 15; i < size; i++) { + uint8_t label = label_ptr[i]; + vis_ptr[i * 3 + 0] = (label << 7); + vis_ptr[i * 3 + 1] = (label << 4); + vis_ptr[i * 3 + 2] = (label << 3); + } + return vis_img; + } + + uint8x16_t zerox16 = vdupq_n_u8(0); + uint8x16_t old_fx16 = vdupq_n_u8(old_multi_factor); + uint8x16_t new_fx16 = vdupq_n_u8(new_multi_factor); + // Blend the two colors together with quantize 'weight'. + #pragma omp parallel for proc_bind(close) num_threads(_OMP_THREADS) + for (int i = 0; i < size - 15; i += 16) { + uint8x16x3_t bgrx16x3 = vld3q_u8(im_ptr + i * 3); // 48 bytes + uint8x16_t labelx16 = vld1q_u8(label_ptr + i); // 16 bytes + uint8x16_t ibx16 = bgrx16x3.val[0]; + uint8x16_t igx16 = bgrx16x3.val[1]; + uint8x16_t irx16 = bgrx16x3.val[2]; + // e.g 0b00000001 << 7 -> 0b10000000 128; + uint8x16_t mbx16 = vshlq_n_u8(labelx16, 7); + uint8x16_t mgx16 = vshlq_n_u8(labelx16, 4); + uint8x16_t mrx16 = vshlq_n_u8(labelx16, 3); + // Moving 7 bits to the right tends to result in zero, + // So, We choose to shift 3 bits to get an approximation + uint8x16_t ibx16_mshr = vmulq_u8(vshrq_n_u8(ibx16, 3), old_fx16); + uint8x16_t igx16_mshr = vmulq_u8(vshrq_n_u8(igx16, 3), old_fx16); + uint8x16_t irx16_mshr = vmulq_u8(vshrq_n_u8(irx16, 3), old_fx16); + uint8x16_t mbx16_mshr = vmulq_u8(vshrq_n_u8(mbx16, 3), new_fx16); + uint8x16_t mgx16_mshr = vmulq_u8(vshrq_n_u8(mgx16, 3), new_fx16); + uint8x16_t mrx16_mshr = vmulq_u8(vshrq_n_u8(mrx16, 3), new_fx16); + uint8x16_t 
qbx16 = vqaddq_u8(ibx16_mshr, mbx16_mshr); + uint8x16_t qgx16 = vqaddq_u8(igx16_mshr, mgx16_mshr); + uint8x16_t qrx16 = vqaddq_u8(irx16_mshr, mrx16_mshr); + // Keep the pixels of input im if label = 0 (means mask = 0) + uint8x16_t cezx16 = vceqq_u8(labelx16, zerox16); + uint8x16_t abx16 = vandq_u8(cezx16, ibx16); + uint8x16_t agx16 = vandq_u8(cezx16, igx16); + uint8x16_t arx16 = vandq_u8(cezx16, irx16); + uint8x16x3_t vbgr16x3; + // Reset qx values to 0 if label is 0, then, keep mask values + // if label is not 0 + uint8x16_t ncezx16 = vmvnq_u8(cezx16); + vbgr16x3.val[0] = vorrq_u8(abx16, vandq_u8(ncezx16, qbx16)); + vbgr16x3.val[1] = vorrq_u8(agx16, vandq_u8(ncezx16, qgx16)); + vbgr16x3.val[2] = vorrq_u8(arx16, vandq_u8(ncezx16, qrx16)); + // Store the blended pixels to vis img + vst3q_u8(vis_ptr + i * 3, vbgr16x3); + } + for (int i = size - 15; i < size; i++) { + uint8_t label = label_ptr[i]; + vis_ptr[i * 3 + 0] = (im_ptr[i * 3 + 0] >> 3) * old_multi_factor + + ((label << 7) >> 3) * new_multi_factor; + vis_ptr[i * 3 + 1] = (im_ptr[i * 3 + 1] >> 3) * old_multi_factor + + ((label << 4) >> 3) * new_multi_factor; + vis_ptr[i * 3 + 2] = (im_ptr[i * 3 + 2] >> 3) * old_multi_factor + + ((label << 3) >> 3) * new_multi_factor; + } + return vis_img; +#endif +} + +} // namespace vision +} // namespace fastdeploy + +#endif \ No newline at end of file diff --git a/fastdeploy/vision/visualize/segmentation_arm.h b/fastdeploy/vision/visualize/segmentation_arm.h new file mode 100644 index 0000000000..15c91eb54b --- /dev/null +++ b/fastdeploy/vision/visualize/segmentation_arm.h @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifdef ENABLE_VISION_VISUALIZE +#pragma once + +#include "fastdeploy/vision/common/result.h" +#include "opencv2/imgproc/imgproc.hpp" + +namespace fastdeploy { +namespace vision { + +cv::Mat VisSegmentationNEON(const cv::Mat& im, const SegmentationResult& result, + float weight, bool quantize_weight = true); + +} // namespace vision +} // namespace fastdeploy + +#endif + diff --git a/fastdeploy/vision/visualize/swap_background.cc b/fastdeploy/vision/visualize/swap_background.cc index fba8b1bbbc..c7669332b4 100644 --- a/fastdeploy/vision/visualize/swap_background.cc +++ b/fastdeploy/vision/visualize/swap_background.cc @@ -15,15 +15,17 @@ #ifdef ENABLE_VISION_VISUALIZE #include "fastdeploy/vision/visualize/visualize.h" +#include "fastdeploy/vision/visualize/swap_background_arm.h" #include "opencv2/highgui.hpp" #include "opencv2/imgproc/imgproc.hpp" +#include "fastdeploy/utils/utils.h" namespace fastdeploy { namespace vision { -cv::Mat SwapBackground(const cv::Mat& im, const cv::Mat& background, - const MattingResult& result, - bool remove_small_connected_area) { +static cv::Mat SwapBackgroundCommonCpu( + const cv::Mat& im, const cv::Mat& background, + const MattingResult& result, bool remove_small_connected_area) { FDASSERT((!im.empty()), "Image can't be empty!"); FDASSERT((im.channels() == 3), "Only support 3 channels image mat!"); FDASSERT((!background.empty()), "Background image can't be empty!"); @@ -60,6 +62,7 @@ cv::Mat SwapBackground(const cv::Mat& im, const cv::Mat& background, uchar* background_data = static_cast(background_copy.data); 
uchar* im_data = static_cast(im.data); float* alpha_data = reinterpret_cast(alpha.data); + for (size_t i = 0; i < height; ++i) { for (size_t j = 0; j < width; ++j) { float alpha_val = alpha_data[i * width + j]; @@ -74,8 +77,9 @@ cv::Mat SwapBackground(const cv::Mat& im, const cv::Mat& background, return vis_img; } -cv::Mat SwapBackground(const cv::Mat& im, const cv::Mat& background, - const SegmentationResult& result, int background_label) { +static cv::Mat SwapBackgroundCommonCpu( + const cv::Mat& im, const cv::Mat& background, + const SegmentationResult& result, int background_label) { FDASSERT((!im.empty()), "Image can't be empty!"); FDASSERT((im.channels() == 3), "Only support 3 channels image mat!"); FDASSERT((!background.empty()), "Background image can't be empty!"); @@ -100,6 +104,7 @@ cv::Mat SwapBackground(const cv::Mat& im, const cv::Mat& background, uchar* background_data = static_cast(background_copy.data); uchar* im_data = static_cast(im.data); float keep_value = 0.f; + for (size_t i = 0; i < height; ++i) { for (size_t j = 0; j < width; ++j) { int category_id = result.label_map[i * width + j]; @@ -116,107 +121,59 @@ cv::Mat SwapBackground(const cv::Mat& im, const cv::Mat& background, } } } + return vis_img; } +// Public interfaces for SwapBackground. 
+cv::Mat SwapBackground(const cv::Mat& im, const cv::Mat& background, + const MattingResult& result, + bool remove_small_connected_area) { + // TODO: Support SSE/AVX on x86_64 platforms +#ifdef __ARM_NEON + return SwapBackgroundNEON(im, background, result, + remove_small_connected_area); +#else + return SwapBackgroundCommonCpu(im, background, result, + remove_small_connected_area); +#endif +} + +cv::Mat SwapBackground(const cv::Mat& im, const cv::Mat& background, + const SegmentationResult& result, int background_label) { + // TODO: Support SSE/AVX on x86_64 platforms +#ifdef __ARM_NEON + // return SwapBackgroundNEON(im, background, result, background_label); + return SwapBackgroundNEON(im, background, result, background_label); +#else + return SwapBackgroundCommonCpu(im, background, result, background_label); +#endif +} + +// DEPRECATED cv::Mat Visualize::SwapBackgroundMatting(const cv::Mat& im, const cv::Mat& background, const MattingResult& result, bool remove_small_connected_area) { - FDASSERT((!im.empty()), "Image can't be empty!"); - FDASSERT((im.channels() == 3), "Only support 3 channels image mat!"); - FDASSERT((!background.empty()), "Background image can't be empty!"); - FDASSERT((background.channels() == 3), - "Only support 3 channels background image mat!"); - auto vis_img = im.clone(); - auto background_copy = background.clone(); - int out_h = static_cast(result.shape[0]); - int out_w = static_cast(result.shape[1]); - int height = im.rows; - int width = im.cols; - int bg_height = background.rows; - int bg_width = background.cols; - std::vector alpha_copy; - alpha_copy.assign(result.alpha.begin(), result.alpha.end()); - float* alpha_ptr = static_cast(alpha_copy.data()); - cv::Mat alpha(out_h, out_w, CV_32FC1, alpha_ptr); - if (remove_small_connected_area) { - alpha = Visualize::RemoveSmallConnectedArea(alpha, 0.05f); - } - if ((vis_img).type() != CV_8UC3) { - (vis_img).convertTo((vis_img), CV_8UC3); - } - if ((background_copy).type() != CV_8UC3) { - 
(background_copy).convertTo((background_copy), CV_8UC3); - } - if ((bg_height != height) || (bg_width != width)) { - cv::resize(background, background_copy, cv::Size(width, height)); - } - if ((out_h != height) || (out_w != width)) { - cv::resize(alpha, alpha, cv::Size(width, height)); - } - uchar* vis_data = static_cast(vis_img.data); - uchar* background_data = static_cast(background_copy.data); - uchar* im_data = static_cast(im.data); - float* alpha_data = reinterpret_cast(alpha.data); - for (size_t i = 0; i < height; ++i) { - for (size_t j = 0; j < width; ++j) { - float alpha_val = alpha_data[i * width + j]; - for (size_t c = 0; c < 3; ++c) { - vis_data[i * width * 3 + j * 3 + c] = cv::saturate_cast( - static_cast(im_data[i * width * 3 + j * 3 + c]) * alpha_val + - (1.f - alpha_val) * background_data[i * width * 3 + j * 3 + c]); - } - } - } - - return vis_img; +// TODO: Support SSE/AVX on x86_64 platforms +#ifdef __ARM_NEON + return SwapBackgroundNEON(im, background, result, + remove_small_connected_area); +#else + return SwapBackgroundCommonCpu(im, background, result, + remove_small_connected_area); +#endif } cv::Mat Visualize::SwapBackgroundSegmentation( const cv::Mat& im, const cv::Mat& background, int background_label, const SegmentationResult& result) { - FDASSERT((!im.empty()), "Image can't be empty!"); - FDASSERT((im.channels() == 3), "Only support 3 channels image mat!"); - FDASSERT((!background.empty()), "Background image can't be empty!"); - FDASSERT((background.channels() == 3), - "Only support 3 channels background image mat!"); - auto vis_img = im.clone(); - auto background_copy = background.clone(); - int height = im.rows; - int width = im.cols; - int bg_height = background.rows; - int bg_width = background.cols; - if ((vis_img).type() != CV_8UC3) { - (vis_img).convertTo((vis_img), CV_8UC3); - } - if ((background_copy).type() != CV_8UC3) { - (background_copy).convertTo((background_copy), CV_8UC3); - } - if ((bg_height != height) || (bg_width != 
width)) { - cv::resize(background, background_copy, cv::Size(width, height)); - } - uchar* vis_data = static_cast(vis_img.data); - uchar* background_data = static_cast(background_copy.data); - uchar* im_data = static_cast(im.data); - float keep_value = 0.f; - for (size_t i = 0; i < height; ++i) { - for (size_t j = 0; j < width; ++j) { - int category_id = result.label_map[i * width + j]; - if (background_label != category_id) { - keep_value = 1.0f; - } else { - keep_value = 0.f; - } - for (size_t c = 0; c < 3; ++c) { - vis_data[i * width * 3 + j * 3 + c] = cv::saturate_cast( - static_cast(im_data[i * width * 3 + j * 3 + c]) * - keep_value + - (1.f - keep_value) * background_data[i * width * 3 + j * 3 + c]); - } - } - } - return vis_img; + // TODO: Support SSE/AVX on x86_64 platforms +#ifdef __ARM_NEON + return SwapBackgroundNEON(im, background, result, background_label); +#else + return SwapBackgroundCommonCpu(im, background, result, background_label); +#endif } } // namespace vision diff --git a/fastdeploy/vision/visualize/swap_background_arm.cc b/fastdeploy/vision/visualize/swap_background_arm.cc new file mode 100644 index 0000000000..3abbffd95b --- /dev/null +++ b/fastdeploy/vision/visualize/swap_background_arm.cc @@ -0,0 +1,233 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifdef ENABLE_VISION_VISUALIZE +#include "fastdeploy/vision/visualize/visualize.h" +#include "fastdeploy/vision/visualize/swap_background_arm.h" +#ifdef __ARM_NEON +#include +#endif +#include "fastdeploy/utils/utils.h" + +namespace fastdeploy { +namespace vision { + +static constexpr int _OMP_THREADS = 2; + +cv::Mat SwapBackgroundNEON(const cv::Mat& im, + const cv::Mat& background, + const MattingResult& result, + bool remove_small_connected_area) { +#ifndef __ARM_NEON + FDASSERT(false, "FastDeploy was not compiled with Arm NEON support!"); +#else + FDASSERT((!im.empty()), "Image can't be empty!"); + FDASSERT((im.channels() == 3), "Only support 3 channels image mat!"); + FDASSERT((!background.empty()), "Background image can't be empty!"); + FDASSERT((background.channels() == 3), + "Only support 3 channels background image mat!"); + int out_h = static_cast(result.shape[0]); + int out_w = static_cast(result.shape[1]); + int height = im.rows; + int width = im.cols; + int bg_height = background.rows; + int bg_width = background.cols; + + // WARN: may change the original alpha + float* alpha_ptr = const_cast(result.alpha.data()); + + cv::Mat alpha(out_h, out_w, CV_32FC1, alpha_ptr); + if (remove_small_connected_area) { + alpha = Visualize::RemoveSmallConnectedArea(alpha, 0.05f); + } + auto vis_img = cv::Mat(height, width, CV_8UC3); + + cv::Mat background_ref; + if ((bg_height != height) || (bg_width != width)) { + cv::resize(background, background_ref, cv::Size(width, height)); + } else { + background_ref = background; // ref only + } + if ((background_ref).type() != CV_8UC3) { + (background_ref).convertTo((background_ref), CV_8UC3); + } + + if ((out_h != height) || (out_w != width)) { + cv::resize(alpha, alpha, cv::Size(width, height)); + } + + uint8_t* vis_data = static_cast(vis_img.data); + const uint8_t* background_data = static_cast(background_ref.data); + const uint8_t* im_data = static_cast(im.data); + const float* alpha_data = reinterpret_cast(alpha.data); + 
+ const int32_t size = static_cast(height * width); + #pragma omp parallel for proc_bind(close) num_threads(_OMP_THREADS) + for(int i = 0; i < size - 7; i += 8) { + uint8x8x3_t ibgrx8x3 = vld3_u8(im_data + i * 3); // 24 bytes + // u8 -> u16 -> u32 -> f32 + uint16x8_t ibx8 = vmovl_u8(ibgrx8x3.val[0]); + uint16x8_t igx8 = vmovl_u8(ibgrx8x3.val[1]); + uint16x8_t irx8 = vmovl_u8(ibgrx8x3.val[2]); + uint8x8x3_t bbgrx8x3 = vld3_u8(background_data + i * 3); // 24 bytes + uint16x8_t bbx8 = vmovl_u8(bbgrx8x3.val[0]); + uint16x8_t bgx8 = vmovl_u8(bbgrx8x3.val[1]); + uint16x8_t brx8 = vmovl_u8(bbgrx8x3.val[2]); + + uint32x4_t hibx4 = vmovl_u16(vget_high_u16(ibx8)); + uint32x4_t higx4 = vmovl_u16(vget_high_u16(igx8)); + uint32x4_t hirx4 = vmovl_u16(vget_high_u16(irx8)); + uint32x4_t libx4 = vmovl_u16(vget_low_u16(ibx8)); + uint32x4_t ligx4 = vmovl_u16(vget_low_u16(igx8)); + uint32x4_t lirx4 = vmovl_u16(vget_low_u16(irx8)); + + uint32x4_t hbbx4 = vmovl_u16(vget_high_u16(bbx8)); + uint32x4_t hbgx4 = vmovl_u16(vget_high_u16(bgx8)); + uint32x4_t hbrx4 = vmovl_u16(vget_high_u16(brx8)); + uint32x4_t lbbx4 = vmovl_u16(vget_low_u16(bbx8)); + uint32x4_t lbgx4 = vmovl_u16(vget_low_u16(bgx8)); + uint32x4_t lbrx4 = vmovl_u16(vget_low_u16(brx8)); + + float32x4_t fhibx4 = vcvtq_f32_u32(hibx4); + float32x4_t fhigx4 = vcvtq_f32_u32(higx4); + float32x4_t fhirx4 = vcvtq_f32_u32(hirx4); + float32x4_t flibx4 = vcvtq_f32_u32(libx4); + float32x4_t fligx4 = vcvtq_f32_u32(ligx4); + float32x4_t flirx4 = vcvtq_f32_u32(lirx4); + + float32x4_t fhbbx4 = vcvtq_f32_u32(hbbx4); + float32x4_t fhbgx4 = vcvtq_f32_u32(hbgx4); + float32x4_t fhbrx4 = vcvtq_f32_u32(hbrx4); + float32x4_t flbbx4 = vcvtq_f32_u32(lbbx4); + float32x4_t flbgx4 = vcvtq_f32_u32(lbgx4); + float32x4_t flbrx4 = vcvtq_f32_u32(lbrx4); + + // alpha load from little end + float32x4_t lalpx4 = vld1q_f32(alpha_data + i); // low bits + float32x4_t halpx4 = vld1q_f32(alpha_data + i + 4); // high bits + float32x4_t rlalpx4 = 
vsubq_f32(vdupq_n_f32(1.0f), lalpx4); + float32x4_t rhalpx4 = vsubq_f32(vdupq_n_f32(1.0f), halpx4); + + // blending + float32x4_t fhvbx4 = vaddq_f32(vmulq_f32(fhibx4, halpx4), vmulq_f32(fhbbx4, rhalpx4)); + float32x4_t fhvgx4 = vaddq_f32(vmulq_f32(fhigx4, halpx4), vmulq_f32(fhbgx4, rhalpx4)); + float32x4_t fhvrx4 = vaddq_f32(vmulq_f32(fhirx4, halpx4), vmulq_f32(fhbrx4, rhalpx4)); + float32x4_t flvbx4 = vaddq_f32(vmulq_f32(flibx4, lalpx4), vmulq_f32(flbbx4, rlalpx4)); + float32x4_t flvgx4 = vaddq_f32(vmulq_f32(fligx4, lalpx4), vmulq_f32(flbgx4, rlalpx4)); + float32x4_t flvrx4 = vaddq_f32(vmulq_f32(flirx4, lalpx4), vmulq_f32(flbrx4, rlalpx4)); + + // f32 -> u32 -> u16 -> u8 + uint8x8x3_t vbgrx8x3; + // combine low 64 bits and high 64 bits into one 128 neon register + vbgrx8x3.val[0] = vmovn_u16(vcombine_u16(vmovn_u32(vcvtq_u32_f32(flvbx4)), + vmovn_u32(vcvtq_u32_f32(fhvbx4)))); + vbgrx8x3.val[1] = vmovn_u16(vcombine_u16(vmovn_u32(vcvtq_u32_f32(flvgx4)), + vmovn_u32(vcvtq_u32_f32(fhvgx4)))); + vbgrx8x3.val[2] = vmovn_u16(vcombine_u16(vmovn_u32(vcvtq_u32_f32(flvrx4)), + vmovn_u32(vcvtq_u32_f32(fhvrx4)))); + vst3_u8(vis_data + i * 3, vbgrx8x3); + } + + for (int i = size - 7; i < size; i++) { + float alp = alpha_data[i]; + for (int c = 0; c < 3; ++c) { + vis_data[i * 3 + c] = cv::saturate_cast( + static_cast(im_data[i * 3 + c]) * alp + (1.0f - alp) + * static_cast(background_data[i * 3 + c])); + } + } + + return vis_img; +#endif +} + +cv::Mat SwapBackgroundNEON(const cv::Mat& im, + const cv::Mat& background, + const SegmentationResult& result, + int background_label) { +#ifndef __ARM_NEON + FDASSERT(false, "FastDeploy was not compiled with Arm NEON support!") +#else + FDASSERT((!im.empty()), "Image can't be empty!"); + FDASSERT((im.channels() == 3), "Only support 3 channels image mat!"); + FDASSERT((!background.empty()), "Background image can't be empty!"); + FDASSERT((background.channels() == 3), + "Only support 3 channels background image mat!"); + int out_h = 
static_cast(result.shape[0]); + int out_w = static_cast(result.shape[1]); + int height = im.rows; + int width = im.cols; + int bg_height = background.rows; + int bg_width = background.cols; + auto vis_img = cv::Mat(height, width, CV_8UC3); + + cv::Mat background_ref; + if ((bg_height != height) || (bg_width != width)) { + cv::resize(background, background_ref, cv::Size(width, height)); + } else { + background_ref = background; // ref only + } + if ((background_ref).type() != CV_8UC3) { + (background_ref).convertTo((background_ref), CV_8UC3); + } + + uint8_t* vis_data = static_cast(vis_img.data); + const uint8_t* background_data = static_cast(background_ref.data); + const uint8_t* im_data = static_cast(im.data); + const uint8_t *label_data = static_cast(result.label_map.data()); + + const uint8_t background_label_ = static_cast(background_label); + const int32_t size = static_cast(height * width); + + uint8x16_t backgroundx16 = vdupq_n_u8(background_label_); + #pragma omp parallel for proc_bind(close) num_threads(_OMP_THREADS) + for (int i = 0; i < size - 15; i += 16) { + uint8x16x3_t ibgr16x3 = vld3q_u8(im_data + i * 3); // 48 bytes + uint8x16x3_t bbgr16x3 = vld3q_u8(background_data + i * 3); + uint8x16_t labelx16 = vld1q_u8(label_data + i); // 16 bytes + // Set mask bit = 1 if label != background_label + uint8x16_t nkeepx16 = vceqq_u8(labelx16, backgroundx16); + uint8x16_t keepx16 = vmvnq_u8(nkeepx16); // keep_value = 1 + uint8x16x3_t vbgr16x3; + vbgr16x3.val[0] = vorrq_u8(vandq_u8(ibgr16x3.val[0], keepx16), + vandq_u8(bbgr16x3.val[0], nkeepx16)); + vbgr16x3.val[1] = vorrq_u8(vandq_u8(ibgr16x3.val[1], keepx16), + vandq_u8(bbgr16x3.val[1], nkeepx16)); + vbgr16x3.val[2] = vorrq_u8(vandq_u8(ibgr16x3.val[2], keepx16), + vandq_u8(bbgr16x3.val[2], nkeepx16)); + // Store the blended pixels to vis img + vst3q_u8(vis_data + i * 3, vbgr16x3); + } + + for (int i = size - 15; i < size; i++) { + uint8_t label = label_data[i]; + if (label != background_label_) { + vis_data[i * 
3 + 0] = im_data[i * 3 + 0]; + vis_data[i * 3 + 1] = im_data[i * 3 + 1]; + vis_data[i * 3 + 2] = im_data[i * 3 + 2]; + } else { + vis_data[i * 3 + 0] = background_data[i * 3 + 0]; + vis_data[i * 3 + 1] = background_data[i * 3 + 1]; + vis_data[i * 3 + 2] = background_data[i * 3 + 2]; + } + } + + return vis_img; +#endif +} + +} // namespace vision +} // namespace fastdeploy + +#endif \ No newline at end of file diff --git a/fastdeploy/vision/visualize/swap_background_arm.h b/fastdeploy/vision/visualize/swap_background_arm.h new file mode 100644 index 0000000000..eb401e656a --- /dev/null +++ b/fastdeploy/vision/visualize/swap_background_arm.h @@ -0,0 +1,38 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifdef ENABLE_VISION_VISUALIZE +#pragma once + +#include "fastdeploy/vision/common/result.h" +#include "opencv2/imgproc/imgproc.hpp" + +namespace fastdeploy { +namespace vision { + +cv::Mat SwapBackgroundNEON(const cv::Mat& im, + const cv::Mat& background, + const MattingResult& result, + bool remove_small_connected_area = false); + +cv::Mat SwapBackgroundNEON(const cv::Mat& im, + const cv::Mat& background, + const SegmentationResult& result, + int background_label); + +} // namespace vision +} // namespace fastdeploy + +#endif +