diff --git a/FastDeploy.cmake.in b/FastDeploy.cmake.in old mode 100755 new mode 100644 index 6f71b2365c..3ad85fb79d --- a/FastDeploy.cmake.in +++ b/FastDeploy.cmake.in @@ -248,6 +248,7 @@ if(WITH_XPU) list(APPEND FASTDEPLOY_LIBS -lpthread -lrt -ldl) endif() + # log lib for Android if(ANDROID) find_library(log-lib log) diff --git a/fastdeploy/vision/visualize/segmentation.cc b/fastdeploy/vision/visualize/segmentation.cc index 37d0f572ef..5fa8c78912 100644 --- a/fastdeploy/vision/visualize/segmentation.cc +++ b/fastdeploy/vision/visualize/segmentation.cc @@ -15,169 +15,13 @@ #ifdef ENABLE_VISION_VISUALIZE #include "fastdeploy/vision/visualize/visualize.h" +#include "fastdeploy/vision/visualize/segmentation_arm.h" #include "opencv2/highgui.hpp" #include "opencv2/imgproc/imgproc.hpp" -#ifdef __ARM_NEON -#include -#endif namespace fastdeploy { namespace vision { -#ifdef __ARM_NEON -static constexpr int VIS_SEG_OMP_NUM_THREADS = 2; - -static inline void QuantizeBlendingWeight8( - float weight, uint8_t* old_multi_factor, uint8_t* new_multi_factor) { - // Quantize the weight to boost blending performance. - // if 0.0 < w <= 1/8, w ~ 1/8=1/(2^3) shift right 3 mul 1, 7 - // if 1/8 < w <= 2/8, w ~ 2/8=1/(2^3) shift right 3 mul 2, 6 - // if 2/8 < w <= 3/8, w ~ 3/8=1/(2^3) shift right 3 mul 3, 5 - // if 3/8 < w <= 4/8, w ~ 4/8=1/(2^3) shift right 3 mul 4, 4 - // Shift factor is always 3, but the mul factor is different. - // Moving 7 bits to the right tends to result in a zero value, - // So, We choose to shift 3 bits to get an approximation. 
- uint8_t weight_quantize = static_cast(weight * 8.0f); - *new_multi_factor = weight_quantize; - *old_multi_factor = (8 - weight_quantize); -} - -static cv::Mat FastVisSegmentationNEON( - const cv::Mat& im, const SegmentationResult& result, - float weight, bool quantize_weight = true) { - int64_t height = result.shape[0]; - int64_t width = result.shape[1]; - auto vis_img = cv::Mat(height, width, CV_8UC3); - - int32_t size = static_cast(height * width); - uint8_t *vis_ptr = static_cast(vis_img.data); - const uint8_t *label_ptr = static_cast(result.label_map.data()); - const uint8_t *im_ptr = static_cast(im.data); - - if (!quantize_weight) { - uint8x16_t zerox16 = vdupq_n_u8(0); - #pragma omp parallel for proc_bind(close) \ - num_threads(VIS_SEG_OMP_NUM_THREADS) schedule(static) - for (int i = 0; i < size - 15; i += 16) { - uint8x16x3_t bgrx16x3 = vld3q_u8(im_ptr + i * 3); // 48 bytes - uint8x16_t labelx16 = vld1q_u8(label_ptr + i); // 16 bytes - uint8x16_t ibx16 = bgrx16x3.val[0]; - uint8x16_t igx16 = bgrx16x3.val[1]; - uint8x16_t irx16 = bgrx16x3.val[2]; - // e.g 0b00000001 << 7 -> 0b10000000 128; - uint8x16_t mbx16 = vshlq_n_u8(labelx16, 7); - uint8x16_t mgx16 = vshlq_n_u8(labelx16, 4); - uint8x16_t mrx16 = vshlq_n_u8(labelx16, 3); - uint8x16x3_t vbgrx16x3; - // Keep the pixels of input im if mask = 0 - uint8x16_t cezx16 = vceqq_u8(labelx16, zerox16); - vbgrx16x3.val[0] = vorrq_u8(vandq_u8(cezx16, ibx16), mbx16); - vbgrx16x3.val[1] = vorrq_u8(vandq_u8(cezx16, igx16), mgx16); - vbgrx16x3.val[2] = vorrq_u8(vandq_u8(cezx16, irx16), mrx16); - vst3q_u8(vis_ptr + i * 3, vbgrx16x3); - } - for (int i = size - 15; i < size; i++) { - uint8_t label = label_ptr[i]; - vis_ptr[i * 3 + 0] = (label << 7); - vis_ptr[i * 3 + 1] = (label << 4); - vis_ptr[i * 3 + 2] = (label << 3); - } - // Blend the colors use OpenCV - cv::addWeighted(im, 1.0 - weight, vis_img, weight, 0, vis_img); - return vis_img; - } - - // Quantize the weight to boost blending performance. 
- // After that, we can directly use shift instructions - // to blend the colors from input im and mask. Please - // check QuantizeBlendingWeight8 for more details. - uint8_t old_multi_factor, new_multi_factor; - QuantizeBlendingWeight8(weight, &old_multi_factor, - &new_multi_factor); - if (new_multi_factor == 0) { - return im; // Only keep origin image. - } - - if (new_multi_factor == 8) { - // Only keep mask, no need to blending with origin image. - #pragma omp parallel for proc_bind(close) \ - num_threads(VIS_SEG_OMP_NUM_THREADS) schedule(static) - for (int i = 0; i < size - 15; i += 16) { - uint8x16_t labelx16 = vld1q_u8(label_ptr + i); // 16 bytes - // e.g 0b00000001 << 7 -> 0b10000000 128; - uint8x16_t mbx16 = vshlq_n_u8(labelx16, 7); - uint8x16_t mgx16 = vshlq_n_u8(labelx16, 4); - uint8x16_t mrx16 = vshlq_n_u8(labelx16, 3); - uint8x16x3_t vbgr16x3; - vbgr16x3.val[0] = mbx16; - vbgr16x3.val[1] = mgx16; - vbgr16x3.val[2] = mrx16; - vst3q_u8(vis_ptr + i * 3, vbgr16x3); - } - for (int i = size - 15; i < size; i++) { - uint8_t label = label_ptr[i]; - vis_ptr[i * 3 + 0] = (label << 7); - vis_ptr[i * 3 + 1] = (label << 4); - vis_ptr[i * 3 + 2] = (label << 3); - } - return vis_img; - } - - uint8x16_t zerox16 = vdupq_n_u8(0); - uint8x16_t old_fx16 = vdupq_n_u8(old_multi_factor); - uint8x16_t new_fx16 = vdupq_n_u8(new_multi_factor); - // Blend the two colors together with quantize 'weight'. 
- #pragma omp parallel for proc_bind(close) \ - num_threads(VIS_SEG_OMP_NUM_THREADS) schedule(static) - for (int i = 0; i < size - 15; i += 16) { - uint8x16x3_t bgrx16x3 = vld3q_u8(im_ptr + i * 3); // 48 bytes - uint8x16_t labelx16 = vld1q_u8(label_ptr + i); // 16 bytes - uint8x16_t ibx16 = bgrx16x3.val[0]; - uint8x16_t igx16 = bgrx16x3.val[1]; - uint8x16_t irx16 = bgrx16x3.val[2]; - // e.g 0b00000001 << 7 -> 0b10000000 128; - uint8x16_t mbx16 = vshlq_n_u8(labelx16, 7); - uint8x16_t mgx16 = vshlq_n_u8(labelx16, 4); - uint8x16_t mrx16 = vshlq_n_u8(labelx16, 3); - // Moving 7 bits to the right tends to result in zero, - // So, We choose to shift 3 bits to get an approximation - uint8x16_t ibx16_mshr = vmulq_u8(vshrq_n_u8(ibx16, 3), old_fx16); - uint8x16_t igx16_mshr = vmulq_u8(vshrq_n_u8(igx16, 3), old_fx16); - uint8x16_t irx16_mshr = vmulq_u8(vshrq_n_u8(irx16, 3), old_fx16); - uint8x16_t mbx16_mshr = vmulq_u8(vshrq_n_u8(mbx16, 3), new_fx16); - uint8x16_t mgx16_mshr = vmulq_u8(vshrq_n_u8(mgx16, 3), new_fx16); - uint8x16_t mrx16_mshr = vmulq_u8(vshrq_n_u8(mrx16, 3), new_fx16); - uint8x16_t qbx16 = vqaddq_u8(ibx16_mshr, mbx16_mshr); - uint8x16_t qgx16 = vqaddq_u8(igx16_mshr, mgx16_mshr); - uint8x16_t qrx16 = vqaddq_u8(irx16_mshr, mrx16_mshr); - // Keep the pixels of input im if label = 0 (means mask = 0) - uint8x16_t cezx16 = vceqq_u8(labelx16, zerox16); - uint8x16_t abx16 = vandq_u8(cezx16, ibx16); - uint8x16_t agx16 = vandq_u8(cezx16, igx16); - uint8x16_t arx16 = vandq_u8(cezx16, irx16); - uint8x16x3_t vbgr16x3; - // Reset qx values to 0 if label is 0, then, keep mask values - // if label is not 0 - uint8x16_t ncezx16 = vmvnq_u8(cezx16); - vbgr16x3.val[0] = vorrq_u8(abx16, vandq_u8(ncezx16, qbx16)); - vbgr16x3.val[1] = vorrq_u8(agx16, vandq_u8(ncezx16, qgx16)); - vbgr16x3.val[2] = vorrq_u8(arx16, vandq_u8(ncezx16, qrx16)); - // Store the blended pixels to vis img - vst3q_u8(vis_ptr + i * 3, vbgr16x3); - } - for (int i = size - 15; i < size; i++) { - uint8_t label = 
label_ptr[i]; - vis_ptr[i * 3 + 0] = (im_ptr[i * 3 + 0] >> 3) * old_multi_factor - + ((label << 7) >> 3) * new_multi_factor; - vis_ptr[i * 3 + 1] = (im_ptr[i * 3 + 1] >> 3) * old_multi_factor - + ((label << 4) >> 3) * new_multi_factor; - vis_ptr[i * 3 + 2] = (im_ptr[i * 3 + 2] >> 3) * old_multi_factor - + ((label << 3) >> 3) * new_multi_factor; - } - return vis_img; -} -#endif - static cv::Mat VisSegmentationCommonCpu( const cv::Mat& im, const SegmentationResult& result, float weight) { @@ -210,7 +54,7 @@ cv::Mat VisSegmentation(const cv::Mat& im, const SegmentationResult& result, float weight) { // TODO: Support SSE/AVX on x86_64 platforms #ifdef __ARM_NEON - return FastVisSegmentationNEON(im, result, weight, true); + return VisSegmentationNEON(im, result, weight, true); #else return VisSegmentationCommonCpu(im, result, weight); #endif @@ -223,7 +67,7 @@ cv::Mat Visualize::VisSegmentation(const cv::Mat& im, "function instead." << std::endl; #ifdef __ARM_NEON - return FastVisSegmentationNEON(im, result, 0.5f, true); + return VisSegmentationNEON(im, result, 0.5f, true); #else return VisSegmentationCommonCpu(im, result, 0.5f); #endif diff --git a/fastdeploy/vision/visualize/segmentation_arm.cc b/fastdeploy/vision/visualize/segmentation_arm.cc new file mode 100644 index 0000000000..154883f77d --- /dev/null +++ b/fastdeploy/vision/visualize/segmentation_arm.cc @@ -0,0 +1,181 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifdef ENABLE_VISION_VISUALIZE + +#include "fastdeploy/vision/visualize/segmentation_arm.h" +#ifdef __ARM_NEON +#include +#endif + +namespace fastdeploy { +namespace vision { + +static constexpr int _OMP_THREADS = 2; + +static inline void QuantizeBlendingWeight8( + float weight, uint8_t* old_multi_factor, uint8_t* new_multi_factor) { + // Quantize the weight to boost blending performance. + // if 0.0 < w <= 1/8, w ~ 1/8=1/(2^3) shift right 3 mul 1, 7 + // if 1/8 < w <= 2/8, w ~ 2/8=1/(2^3) shift right 3 mul 2, 6 + // if 2/8 < w <= 3/8, w ~ 3/8=1/(2^3) shift right 3 mul 3, 5 + // if 3/8 < w <= 4/8, w ~ 4/8=1/(2^3) shift right 3 mul 4, 4 + // Shift factor is always 3, but the mul factor is different. + // Moving 7 bits to the right tends to result in a zero value, + // So, We choose to shift 3 bits to get an approximation. + uint8_t weight_quantize = static_cast(weight * 8.0f); + *new_multi_factor = weight_quantize; + *old_multi_factor = (8 - weight_quantize); +} + +cv::Mat VisSegmentationNEON( + const cv::Mat& im, const SegmentationResult& result, + float weight, bool quantize_weight) { +#ifndef __ARM_NEON + FDASSERT(false, "FastDeploy was not compiled with Arm NEON support!") +#else + int64_t height = result.shape[0]; + int64_t width = result.shape[1]; + auto vis_img = cv::Mat(height, width, CV_8UC3); + + int32_t size = static_cast(height * width); + uint8_t *vis_ptr = static_cast(vis_img.data); + const uint8_t *label_ptr = static_cast(result.label_map.data()); + const uint8_t *im_ptr = static_cast(im.data); + + if (!quantize_weight) { + uint8x16_t zerox16 = vdupq_n_u8(0); + #pragma omp parallel for proc_bind(close) num_threads(_OMP_THREADS) + for (int i = 0; i < size - 15; i += 16) { + uint8x16x3_t bgrx16x3 = vld3q_u8(im_ptr + i * 3); // 48 bytes + uint8x16_t labelx16 = vld1q_u8(label_ptr + i); // 16 bytes + uint8x16_t ibx16 = bgrx16x3.val[0]; + uint8x16_t 
igx16 = bgrx16x3.val[1]; + uint8x16_t irx16 = bgrx16x3.val[2]; + // e.g 0b00000001 << 7 -> 0b10000000 128; + uint8x16_t mbx16 = vshlq_n_u8(labelx16, 7); + uint8x16_t mgx16 = vshlq_n_u8(labelx16, 4); + uint8x16_t mrx16 = vshlq_n_u8(labelx16, 3); + uint8x16x3_t vbgrx16x3; + // Keep the pixels of input im if mask = 0 + uint8x16_t cezx16 = vceqq_u8(labelx16, zerox16); + vbgrx16x3.val[0] = vorrq_u8(vandq_u8(cezx16, ibx16), mbx16); + vbgrx16x3.val[1] = vorrq_u8(vandq_u8(cezx16, igx16), mgx16); + vbgrx16x3.val[2] = vorrq_u8(vandq_u8(cezx16, irx16), mrx16); + vst3q_u8(vis_ptr + i * 3, vbgrx16x3); + } + for (int i = size - 15; i < size; i++) { + uint8_t label = label_ptr[i]; + vis_ptr[i * 3 + 0] = (label << 7); + vis_ptr[i * 3 + 1] = (label << 4); + vis_ptr[i * 3 + 2] = (label << 3); + } + // Blend the colors use OpenCV + cv::addWeighted(im, 1.0 - weight, vis_img, weight, 0, vis_img); + return vis_img; + } + + // Quantize the weight to boost blending performance. + // After that, we can directly use shift instructions + // to blend the colors from input im and mask. Please + // check QuantizeBlendingWeight8 for more details. + uint8_t old_multi_factor, new_multi_factor; + QuantizeBlendingWeight8(weight, &old_multi_factor, + &new_multi_factor); + if (new_multi_factor == 0) { + return im; // Only keep origin image. + } + + if (new_multi_factor == 8) { + // Only keep mask, no need to blending with origin image. 
+ #pragma omp parallel for proc_bind(close) num_threads(_OMP_THREADS) + for (int i = 0; i < size - 15; i += 16) { + uint8x16_t labelx16 = vld1q_u8(label_ptr + i); // 16 bytes + // e.g 0b00000001 << 7 -> 0b10000000 128; + uint8x16_t mbx16 = vshlq_n_u8(labelx16, 7); + uint8x16_t mgx16 = vshlq_n_u8(labelx16, 4); + uint8x16_t mrx16 = vshlq_n_u8(labelx16, 3); + uint8x16x3_t vbgr16x3; + vbgr16x3.val[0] = mbx16; + vbgr16x3.val[1] = mgx16; + vbgr16x3.val[2] = mrx16; + vst3q_u8(vis_ptr + i * 3, vbgr16x3); + } + for (int i = size - 15; i < size; i++) { + uint8_t label = label_ptr[i]; + vis_ptr[i * 3 + 0] = (label << 7); + vis_ptr[i * 3 + 1] = (label << 4); + vis_ptr[i * 3 + 2] = (label << 3); + } + return vis_img; + } + + uint8x16_t zerox16 = vdupq_n_u8(0); + uint8x16_t old_fx16 = vdupq_n_u8(old_multi_factor); + uint8x16_t new_fx16 = vdupq_n_u8(new_multi_factor); + // Blend the two colors together with quantize 'weight'. + #pragma omp parallel for proc_bind(close) num_threads(_OMP_THREADS) + for (int i = 0; i < size - 15; i += 16) { + uint8x16x3_t bgrx16x3 = vld3q_u8(im_ptr + i * 3); // 48 bytes + uint8x16_t labelx16 = vld1q_u8(label_ptr + i); // 16 bytes + uint8x16_t ibx16 = bgrx16x3.val[0]; + uint8x16_t igx16 = bgrx16x3.val[1]; + uint8x16_t irx16 = bgrx16x3.val[2]; + // e.g 0b00000001 << 7 -> 0b10000000 128; + uint8x16_t mbx16 = vshlq_n_u8(labelx16, 7); + uint8x16_t mgx16 = vshlq_n_u8(labelx16, 4); + uint8x16_t mrx16 = vshlq_n_u8(labelx16, 3); + // Moving 7 bits to the right tends to result in zero, + // So, We choose to shift 3 bits to get an approximation + uint8x16_t ibx16_mshr = vmulq_u8(vshrq_n_u8(ibx16, 3), old_fx16); + uint8x16_t igx16_mshr = vmulq_u8(vshrq_n_u8(igx16, 3), old_fx16); + uint8x16_t irx16_mshr = vmulq_u8(vshrq_n_u8(irx16, 3), old_fx16); + uint8x16_t mbx16_mshr = vmulq_u8(vshrq_n_u8(mbx16, 3), new_fx16); + uint8x16_t mgx16_mshr = vmulq_u8(vshrq_n_u8(mgx16, 3), new_fx16); + uint8x16_t mrx16_mshr = vmulq_u8(vshrq_n_u8(mrx16, 3), new_fx16); + uint8x16_t 
qbx16 = vqaddq_u8(ibx16_mshr, mbx16_mshr); + uint8x16_t qgx16 = vqaddq_u8(igx16_mshr, mgx16_mshr); + uint8x16_t qrx16 = vqaddq_u8(irx16_mshr, mrx16_mshr); + // Keep the pixels of input im if label = 0 (means mask = 0) + uint8x16_t cezx16 = vceqq_u8(labelx16, zerox16); + uint8x16_t abx16 = vandq_u8(cezx16, ibx16); + uint8x16_t agx16 = vandq_u8(cezx16, igx16); + uint8x16_t arx16 = vandq_u8(cezx16, irx16); + uint8x16x3_t vbgr16x3; + // Reset qx values to 0 if label is 0, then, keep mask values + // if label is not 0 + uint8x16_t ncezx16 = vmvnq_u8(cezx16); + vbgr16x3.val[0] = vorrq_u8(abx16, vandq_u8(ncezx16, qbx16)); + vbgr16x3.val[1] = vorrq_u8(agx16, vandq_u8(ncezx16, qgx16)); + vbgr16x3.val[2] = vorrq_u8(arx16, vandq_u8(ncezx16, qrx16)); + // Store the blended pixels to vis img + vst3q_u8(vis_ptr + i * 3, vbgr16x3); + } + for (int i = size - 15; i < size; i++) { + uint8_t label = label_ptr[i]; + vis_ptr[i * 3 + 0] = (im_ptr[i * 3 + 0] >> 3) * old_multi_factor + + ((label << 7) >> 3) * new_multi_factor; + vis_ptr[i * 3 + 1] = (im_ptr[i * 3 + 1] >> 3) * old_multi_factor + + ((label << 4) >> 3) * new_multi_factor; + vis_ptr[i * 3 + 2] = (im_ptr[i * 3 + 2] >> 3) * old_multi_factor + + ((label << 3) >> 3) * new_multi_factor; + } + return vis_img; +#endif +} + +} // namespace vision +} // namespace fastdeploy + +#endif \ No newline at end of file diff --git a/fastdeploy/vision/visualize/segmentation_arm.h b/fastdeploy/vision/visualize/segmentation_arm.h new file mode 100644 index 0000000000..15c91eb54b --- /dev/null +++ b/fastdeploy/vision/visualize/segmentation_arm.h @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifdef ENABLE_VISION_VISUALIZE +#pragma once + +#include "fastdeploy/vision/common/result.h" +#include "opencv2/imgproc/imgproc.hpp" + +namespace fastdeploy { +namespace vision { + +cv::Mat VisSegmentationNEON(const cv::Mat& im, const SegmentationResult& result, + float weight, bool quantize_weight = true); + +} // namespace vision +} // namespace fastdeploy + +#endif + diff --git a/fastdeploy/vision/visualize/swap_background.cc b/fastdeploy/vision/visualize/swap_background.cc index fba8b1bbbc..c7669332b4 100644 --- a/fastdeploy/vision/visualize/swap_background.cc +++ b/fastdeploy/vision/visualize/swap_background.cc @@ -15,15 +15,17 @@ #ifdef ENABLE_VISION_VISUALIZE #include "fastdeploy/vision/visualize/visualize.h" +#include "fastdeploy/vision/visualize/swap_background_arm.h" #include "opencv2/highgui.hpp" #include "opencv2/imgproc/imgproc.hpp" +#include "fastdeploy/utils/utils.h" namespace fastdeploy { namespace vision { -cv::Mat SwapBackground(const cv::Mat& im, const cv::Mat& background, - const MattingResult& result, - bool remove_small_connected_area) { +static cv::Mat SwapBackgroundCommonCpu( + const cv::Mat& im, const cv::Mat& background, + const MattingResult& result, bool remove_small_connected_area) { FDASSERT((!im.empty()), "Image can't be empty!"); FDASSERT((im.channels() == 3), "Only support 3 channels image mat!"); FDASSERT((!background.empty()), "Background image can't be empty!"); @@ -60,6 +62,7 @@ cv::Mat SwapBackground(const cv::Mat& im, const cv::Mat& background, uchar* background_data = static_cast(background_copy.data); 
uchar* im_data = static_cast(im.data); float* alpha_data = reinterpret_cast(alpha.data); + for (size_t i = 0; i < height; ++i) { for (size_t j = 0; j < width; ++j) { float alpha_val = alpha_data[i * width + j]; @@ -74,8 +77,9 @@ cv::Mat SwapBackground(const cv::Mat& im, const cv::Mat& background, return vis_img; } -cv::Mat SwapBackground(const cv::Mat& im, const cv::Mat& background, - const SegmentationResult& result, int background_label) { +static cv::Mat SwapBackgroundCommonCpu( + const cv::Mat& im, const cv::Mat& background, + const SegmentationResult& result, int background_label) { FDASSERT((!im.empty()), "Image can't be empty!"); FDASSERT((im.channels() == 3), "Only support 3 channels image mat!"); FDASSERT((!background.empty()), "Background image can't be empty!"); @@ -100,6 +104,7 @@ cv::Mat SwapBackground(const cv::Mat& im, const cv::Mat& background, uchar* background_data = static_cast(background_copy.data); uchar* im_data = static_cast(im.data); float keep_value = 0.f; + for (size_t i = 0; i < height; ++i) { for (size_t j = 0; j < width; ++j) { int category_id = result.label_map[i * width + j]; @@ -116,107 +121,59 @@ cv::Mat SwapBackground(const cv::Mat& im, const cv::Mat& background, } } } + return vis_img; } +// Public interfaces for SwapBackground. 
+cv::Mat SwapBackground(const cv::Mat& im, const cv::Mat& background, + const MattingResult& result, + bool remove_small_connected_area) { + // TODO: Support SSE/AVX on x86_64 platforms +#ifdef __ARM_NEON + return SwapBackgroundNEON(im, background, result, + remove_small_connected_area); +#else + return SwapBackgroundCommonCpu(im, background, result, + remove_small_connected_area); +#endif +} + +cv::Mat SwapBackground(const cv::Mat& im, const cv::Mat& background, + const SegmentationResult& result, int background_label) { + // TODO: Support SSE/AVX on x86_64 platforms +#ifdef __ARM_NEON + // return SwapBackgroundNEON(im, background, result, background_label); + return SwapBackgroundNEON(im, background, result, background_label); +#else + return SwapBackgroundCommonCpu(im, background, result, background_label); +#endif +} + +// DEPRECATED cv::Mat Visualize::SwapBackgroundMatting(const cv::Mat& im, const cv::Mat& background, const MattingResult& result, bool remove_small_connected_area) { - FDASSERT((!im.empty()), "Image can't be empty!"); - FDASSERT((im.channels() == 3), "Only support 3 channels image mat!"); - FDASSERT((!background.empty()), "Background image can't be empty!"); - FDASSERT((background.channels() == 3), - "Only support 3 channels background image mat!"); - auto vis_img = im.clone(); - auto background_copy = background.clone(); - int out_h = static_cast(result.shape[0]); - int out_w = static_cast(result.shape[1]); - int height = im.rows; - int width = im.cols; - int bg_height = background.rows; - int bg_width = background.cols; - std::vector alpha_copy; - alpha_copy.assign(result.alpha.begin(), result.alpha.end()); - float* alpha_ptr = static_cast(alpha_copy.data()); - cv::Mat alpha(out_h, out_w, CV_32FC1, alpha_ptr); - if (remove_small_connected_area) { - alpha = Visualize::RemoveSmallConnectedArea(alpha, 0.05f); - } - if ((vis_img).type() != CV_8UC3) { - (vis_img).convertTo((vis_img), CV_8UC3); - } - if ((background_copy).type() != CV_8UC3) { - 
(background_copy).convertTo((background_copy), CV_8UC3); - } - if ((bg_height != height) || (bg_width != width)) { - cv::resize(background, background_copy, cv::Size(width, height)); - } - if ((out_h != height) || (out_w != width)) { - cv::resize(alpha, alpha, cv::Size(width, height)); - } - uchar* vis_data = static_cast(vis_img.data); - uchar* background_data = static_cast(background_copy.data); - uchar* im_data = static_cast(im.data); - float* alpha_data = reinterpret_cast(alpha.data); - for (size_t i = 0; i < height; ++i) { - for (size_t j = 0; j < width; ++j) { - float alpha_val = alpha_data[i * width + j]; - for (size_t c = 0; c < 3; ++c) { - vis_data[i * width * 3 + j * 3 + c] = cv::saturate_cast( - static_cast(im_data[i * width * 3 + j * 3 + c]) * alpha_val + - (1.f - alpha_val) * background_data[i * width * 3 + j * 3 + c]); - } - } - } - - return vis_img; +// TODO: Support SSE/AVX on x86_64 platforms +#ifdef __ARM_NEON + return SwapBackgroundNEON(im, background, result, + remove_small_connected_area); +#else + return SwapBackgroundCommonCpu(im, background, result, + remove_small_connected_area); +#endif } cv::Mat Visualize::SwapBackgroundSegmentation( const cv::Mat& im, const cv::Mat& background, int background_label, const SegmentationResult& result) { - FDASSERT((!im.empty()), "Image can't be empty!"); - FDASSERT((im.channels() == 3), "Only support 3 channels image mat!"); - FDASSERT((!background.empty()), "Background image can't be empty!"); - FDASSERT((background.channels() == 3), - "Only support 3 channels background image mat!"); - auto vis_img = im.clone(); - auto background_copy = background.clone(); - int height = im.rows; - int width = im.cols; - int bg_height = background.rows; - int bg_width = background.cols; - if ((vis_img).type() != CV_8UC3) { - (vis_img).convertTo((vis_img), CV_8UC3); - } - if ((background_copy).type() != CV_8UC3) { - (background_copy).convertTo((background_copy), CV_8UC3); - } - if ((bg_height != height) || (bg_width != 
width)) { - cv::resize(background, background_copy, cv::Size(width, height)); - } - uchar* vis_data = static_cast(vis_img.data); - uchar* background_data = static_cast(background_copy.data); - uchar* im_data = static_cast(im.data); - float keep_value = 0.f; - for (size_t i = 0; i < height; ++i) { - for (size_t j = 0; j < width; ++j) { - int category_id = result.label_map[i * width + j]; - if (background_label != category_id) { - keep_value = 1.0f; - } else { - keep_value = 0.f; - } - for (size_t c = 0; c < 3; ++c) { - vis_data[i * width * 3 + j * 3 + c] = cv::saturate_cast( - static_cast(im_data[i * width * 3 + j * 3 + c]) * - keep_value + - (1.f - keep_value) * background_data[i * width * 3 + j * 3 + c]); - } - } - } - return vis_img; + // TODO: Support SSE/AVX on x86_64 platforms +#ifdef __ARM_NEON + return SwapBackgroundNEON(im, background, result, background_label); +#else + return SwapBackgroundCommonCpu(im, background, result, background_label); +#endif } } // namespace vision diff --git a/fastdeploy/vision/visualize/swap_background_arm.cc b/fastdeploy/vision/visualize/swap_background_arm.cc new file mode 100644 index 0000000000..3abbffd95b --- /dev/null +++ b/fastdeploy/vision/visualize/swap_background_arm.cc @@ -0,0 +1,233 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifdef ENABLE_VISION_VISUALIZE +#include "fastdeploy/vision/visualize/visualize.h" +#include "fastdeploy/vision/visualize/swap_background_arm.h" +#ifdef __ARM_NEON +#include +#endif +#include "fastdeploy/utils/utils.h" + +namespace fastdeploy { +namespace vision { + +static constexpr int _OMP_THREADS = 2; + +cv::Mat SwapBackgroundNEON(const cv::Mat& im, + const cv::Mat& background, + const MattingResult& result, + bool remove_small_connected_area) { +#ifndef __ARM_NEON + FDASSERT(false, "FastDeploy was not compiled with Arm NEON support!"); +#else + FDASSERT((!im.empty()), "Image can't be empty!"); + FDASSERT((im.channels() == 3), "Only support 3 channels image mat!"); + FDASSERT((!background.empty()), "Background image can't be empty!"); + FDASSERT((background.channels() == 3), + "Only support 3 channels background image mat!"); + int out_h = static_cast(result.shape[0]); + int out_w = static_cast(result.shape[1]); + int height = im.rows; + int width = im.cols; + int bg_height = background.rows; + int bg_width = background.cols; + + // WARN: may change the original alpha + float* alpha_ptr = const_cast(result.alpha.data()); + + cv::Mat alpha(out_h, out_w, CV_32FC1, alpha_ptr); + if (remove_small_connected_area) { + alpha = Visualize::RemoveSmallConnectedArea(alpha, 0.05f); + } + auto vis_img = cv::Mat(height, width, CV_8UC3); + + cv::Mat background_ref; + if ((bg_height != height) || (bg_width != width)) { + cv::resize(background, background_ref, cv::Size(width, height)); + } else { + background_ref = background; // ref only + } + if ((background_ref).type() != CV_8UC3) { + (background_ref).convertTo((background_ref), CV_8UC3); + } + + if ((out_h != height) || (out_w != width)) { + cv::resize(alpha, alpha, cv::Size(width, height)); + } + + uint8_t* vis_data = static_cast(vis_img.data); + const uint8_t* background_data = static_cast(background_ref.data); + const uint8_t* im_data = static_cast(im.data); + const float* alpha_data = reinterpret_cast(alpha.data); + 
+ const int32_t size = static_cast(height * width); + #pragma omp parallel for proc_bind(close) num_threads(_OMP_THREADS) + for(int i = 0; i < size - 7; i += 8) { + uint8x8x3_t ibgrx8x3 = vld3_u8(im_data + i * 3); // 24 bytes + // u8 -> u16 -> u32 -> f32 + uint16x8_t ibx8 = vmovl_u8(ibgrx8x3.val[0]); + uint16x8_t igx8 = vmovl_u8(ibgrx8x3.val[1]); + uint16x8_t irx8 = vmovl_u8(ibgrx8x3.val[2]); + uint8x8x3_t bbgrx8x3 = vld3_u8(background_data + i * 3); // 24 bytes + uint16x8_t bbx8 = vmovl_u8(bbgrx8x3.val[0]); + uint16x8_t bgx8 = vmovl_u8(bbgrx8x3.val[1]); + uint16x8_t brx8 = vmovl_u8(bbgrx8x3.val[2]); + + uint32x4_t hibx4 = vmovl_u16(vget_high_u16(ibx8)); + uint32x4_t higx4 = vmovl_u16(vget_high_u16(igx8)); + uint32x4_t hirx4 = vmovl_u16(vget_high_u16(irx8)); + uint32x4_t libx4 = vmovl_u16(vget_low_u16(ibx8)); + uint32x4_t ligx4 = vmovl_u16(vget_low_u16(igx8)); + uint32x4_t lirx4 = vmovl_u16(vget_low_u16(irx8)); + + uint32x4_t hbbx4 = vmovl_u16(vget_high_u16(bbx8)); + uint32x4_t hbgx4 = vmovl_u16(vget_high_u16(bgx8)); + uint32x4_t hbrx4 = vmovl_u16(vget_high_u16(brx8)); + uint32x4_t lbbx4 = vmovl_u16(vget_low_u16(bbx8)); + uint32x4_t lbgx4 = vmovl_u16(vget_low_u16(bgx8)); + uint32x4_t lbrx4 = vmovl_u16(vget_low_u16(brx8)); + + float32x4_t fhibx4 = vcvtq_f32_u32(hibx4); + float32x4_t fhigx4 = vcvtq_f32_u32(higx4); + float32x4_t fhirx4 = vcvtq_f32_u32(hirx4); + float32x4_t flibx4 = vcvtq_f32_u32(libx4); + float32x4_t fligx4 = vcvtq_f32_u32(ligx4); + float32x4_t flirx4 = vcvtq_f32_u32(lirx4); + + float32x4_t fhbbx4 = vcvtq_f32_u32(hbbx4); + float32x4_t fhbgx4 = vcvtq_f32_u32(hbgx4); + float32x4_t fhbrx4 = vcvtq_f32_u32(hbrx4); + float32x4_t flbbx4 = vcvtq_f32_u32(lbbx4); + float32x4_t flbgx4 = vcvtq_f32_u32(lbgx4); + float32x4_t flbrx4 = vcvtq_f32_u32(lbrx4); + + // alpha load from little end + float32x4_t lalpx4 = vld1q_f32(alpha_data + i); // low bits + float32x4_t halpx4 = vld1q_f32(alpha_data + i + 4); // high bits + float32x4_t rlalpx4 = 
vsubq_f32(vdupq_n_f32(1.0f), lalpx4); + float32x4_t rhalpx4 = vsubq_f32(vdupq_n_f32(1.0f), halpx4); + + // blending + float32x4_t fhvbx4 = vaddq_f32(vmulq_f32(fhibx4, halpx4), vmulq_f32(fhbbx4, rhalpx4)); + float32x4_t fhvgx4 = vaddq_f32(vmulq_f32(fhigx4, halpx4), vmulq_f32(fhbgx4, rhalpx4)); + float32x4_t fhvrx4 = vaddq_f32(vmulq_f32(fhirx4, halpx4), vmulq_f32(fhbrx4, rhalpx4)); + float32x4_t flvbx4 = vaddq_f32(vmulq_f32(flibx4, lalpx4), vmulq_f32(flbbx4, rlalpx4)); + float32x4_t flvgx4 = vaddq_f32(vmulq_f32(fligx4, lalpx4), vmulq_f32(flbgx4, rlalpx4)); + float32x4_t flvrx4 = vaddq_f32(vmulq_f32(flirx4, lalpx4), vmulq_f32(flbrx4, rlalpx4)); + + // f32 -> u32 -> u16 -> u8 + uint8x8x3_t vbgrx8x3; + // combine low 64 bits and high 64 bits into one 128 neon register + vbgrx8x3.val[0] = vmovn_u16(vcombine_u16(vmovn_u32(vcvtq_u32_f32(flvbx4)), + vmovn_u32(vcvtq_u32_f32(fhvbx4)))); + vbgrx8x3.val[1] = vmovn_u16(vcombine_u16(vmovn_u32(vcvtq_u32_f32(flvgx4)), + vmovn_u32(vcvtq_u32_f32(fhvgx4)))); + vbgrx8x3.val[2] = vmovn_u16(vcombine_u16(vmovn_u32(vcvtq_u32_f32(flvrx4)), + vmovn_u32(vcvtq_u32_f32(fhvrx4)))); + vst3_u8(vis_data + i * 3, vbgrx8x3); + } + + for (int i = size - 7; i < size; i++) { + float alp = alpha_data[i]; + for (int c = 0; c < 3; ++c) { + vis_data[i * 3 + c] = cv::saturate_cast( + static_cast(im_data[i * 3 + c]) * alp + (1.0f - alp) + * static_cast(background_data[i * 3 + c])); + } + } + + return vis_img; +#endif +} + +cv::Mat SwapBackgroundNEON(const cv::Mat& im, + const cv::Mat& background, + const SegmentationResult& result, + int background_label) { +#ifndef __ARM_NEON + FDASSERT(false, "FastDeploy was not compiled with Arm NEON support!") +#else + FDASSERT((!im.empty()), "Image can't be empty!"); + FDASSERT((im.channels() == 3), "Only support 3 channels image mat!"); + FDASSERT((!background.empty()), "Background image can't be empty!"); + FDASSERT((background.channels() == 3), + "Only support 3 channels background image mat!"); + int out_h = 
static_cast(result.shape[0]); + int out_w = static_cast(result.shape[1]); + int height = im.rows; + int width = im.cols; + int bg_height = background.rows; + int bg_width = background.cols; + auto vis_img = cv::Mat(height, width, CV_8UC3); + + cv::Mat background_ref; + if ((bg_height != height) || (bg_width != width)) { + cv::resize(background, background_ref, cv::Size(width, height)); + } else { + background_ref = background; // ref only + } + if ((background_ref).type() != CV_8UC3) { + (background_ref).convertTo((background_ref), CV_8UC3); + } + + uint8_t* vis_data = static_cast(vis_img.data); + const uint8_t* background_data = static_cast(background_ref.data); + const uint8_t* im_data = static_cast(im.data); + const uint8_t *label_data = static_cast(result.label_map.data()); + + const uint8_t background_label_ = static_cast(background_label); + const int32_t size = static_cast(height * width); + + uint8x16_t backgroundx16 = vdupq_n_u8(background_label_); + #pragma omp parallel for proc_bind(close) num_threads(_OMP_THREADS) + for (int i = 0; i < size - 15; i += 16) { + uint8x16x3_t ibgr16x3 = vld3q_u8(im_data + i * 3); // 48 bytes + uint8x16x3_t bbgr16x3 = vld3q_u8(background_data + i * 3); + uint8x16_t labelx16 = vld1q_u8(label_data + i); // 16 bytes + // Set mask bit = 1 if label != background_label + uint8x16_t nkeepx16 = vceqq_u8(labelx16, backgroundx16); + uint8x16_t keepx16 = vmvnq_u8(nkeepx16); // keep_value = 1 + uint8x16x3_t vbgr16x3; + vbgr16x3.val[0] = vorrq_u8(vandq_u8(ibgr16x3.val[0], keepx16), + vandq_u8(bbgr16x3.val[0], nkeepx16)); + vbgr16x3.val[1] = vorrq_u8(vandq_u8(ibgr16x3.val[1], keepx16), + vandq_u8(bbgr16x3.val[1], nkeepx16)); + vbgr16x3.val[2] = vorrq_u8(vandq_u8(ibgr16x3.val[2], keepx16), + vandq_u8(bbgr16x3.val[2], nkeepx16)); + // Store the blended pixels to vis img + vst3q_u8(vis_data + i * 3, vbgr16x3); + } + + for (int i = size - 15; i < size; i++) { + uint8_t label = label_data[i]; + if (label != background_label_) { + vis_data[i * 
3 + 0] = im_data[i * 3 + 0]; + vis_data[i * 3 + 1] = im_data[i * 3 + 1]; + vis_data[i * 3 + 2] = im_data[i * 3 + 2]; + } else { + vis_data[i * 3 + 0] = background_data[i * 3 + 0]; + vis_data[i * 3 + 1] = background_data[i * 3 + 1]; + vis_data[i * 3 + 2] = background_data[i * 3 + 2]; + } + } + + return vis_img; +#endif +} + +} // namespace vision +} // namespace fastdeploy + +#endif \ No newline at end of file diff --git a/fastdeploy/vision/visualize/swap_background_arm.h b/fastdeploy/vision/visualize/swap_background_arm.h new file mode 100644 index 0000000000..eb401e656a --- /dev/null +++ b/fastdeploy/vision/visualize/swap_background_arm.h @@ -0,0 +1,38 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifdef ENABLE_VISION_VISUALIZE +#pragma once + +#include "fastdeploy/vision/common/result.h" +#include "opencv2/imgproc/imgproc.hpp" + +namespace fastdeploy { +namespace vision { + +cv::Mat SwapBackgroundNEON(const cv::Mat& im, + const cv::Mat& background, + const MattingResult& result, + bool remove_small_connected_area = false); + +cv::Mat SwapBackgroundNEON(const cv::Mat& im, + const cv::Mat& background, + const SegmentationResult& result, + int background_label); + +} // namespace vision +} // namespace fastdeploy + +#endif +