diff --git a/benchmarking/benchmark_top_hat.ipynb b/benchmarking/benchmark_top_hat.ipynb
new file mode 100644
index 00000000..df4ea461
--- /dev/null
+++ b/benchmarking/benchmark_top_hat.ipynb
@@ -0,0 +1,409 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Background subtraction using top-hat in scikit-image and pyclesperanto\n",
+    "This notebook compares different implementations of a background subtraction method.\n",
+    "\n",
+    "**Note:** benchmarking results vary heavily depending on image size, kernel size, used operations, parameters and used hardware. Use this notebook to adapt it to your use-case scenario and benchmark on your target hardware. If you have different scenarios or use-cases, you are very welcome to submit your notebook as pull-request!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<NVIDIA GeForce RTX 4090 on Platform: NVIDIA CUDA (1 refs)>"
+      ]
+     },
+     "execution_count": 1,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import pyclesperanto_prototype as cle\n",
+    "import pyclesperanto as pcle\n",
+    "\n",
+    "from skimage import morphology\n",
+    "import time\n",
+    "\n",
+    "# to measure kernel execution duration properly, we need to set this flag. It will slow down exection of workflows a bit though\n",
+    "cle.set_wait_for_kernel_finish(True)\n",
+    "\n",
+    "# selet a GPU with the following in the name. This will fallback to any other GPU if none with this name is found\n",
+    "cle.select_device('RTX')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(OpenCL) NVIDIA GeForce RTX 4090 (OpenCL 3.0 CUDA)\n",
+       "\tVendor:                      NVIDIA Corporation\n",
+       "\tDriver Version:              535.183.06\n",
+       "\tDevice Type:                 GPU\n",
+       "\tCompute Units:               128\n",
+       "\tGlobal Memory Size:          24183 MB\n",
+       "\tMaximum Object Size:         6045 MB\n",
+       "\tMax Clock Frequency:         2520 MHz\n",
+       "\tImage Support:               Yes"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pcle.wait_for_kernel_to_finish(True)\n",
+    "pcle.select_device('RTX')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "radius = 10\n",
+    "disk_kernel = morphology.ball(radius)\n",
+    "square_kernel = morphology.cube(radius)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# test data\n",
+    "import numpy as np\n",
+    "\n",
+    "test_image = np.random.random([50, 1024, 1024]).astype(np.uint8)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "4 warnings generated.\n",
+      "4 warnings generated.\n",
+      "4 warnings generated.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "pyclesperanto_prototype top-hat-shere duration: 0.25440526008605957\n",
+      "pyclesperanto_prototype top-hat-shere duration: 0.07837772369384766\n",
+      "pyclesperanto_prototype top-hat-shere duration: 0.07501721382141113\n",
+      "pyclesperanto_prototype top-hat-shere duration: 0.06390213966369629\n",
+      "pyclesperanto_prototype top-hat-shere duration: 0.06378054618835449\n"
+     ]
+    }
+   ],
+   "source": [
+    "# top-hat (disk) with pyclesperanto_prototype\n",
+    "result_image = None\n",
+    "\n",
+    "test_image_gpu = cle.push(test_image)\n",
+    "\n",
+    "for i in range(0, 5):\n",
+    "    start_time = time.time()\n",
+    "    result_image = cle.top_hat_sphere(test_image_gpu, result_image, radius_x=radius, radius_y=radius)\n",
+    "    print(\"pyclesperanto_prototype top-hat-shere duration: \" + str(time.time() - start_time))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "pyclesperanto top-hat-shere duration: 0.16746950149536133\n",
+      "pyclesperanto top-hat-shere duration: 0.06676363945007324\n",
+      "pyclesperanto top-hat-shere duration: 0.06292200088500977\n",
+      "pyclesperanto top-hat-shere duration: 0.061110734939575195\n",
+      "pyclesperanto top-hat-shere duration: 0.061188459396362305\n"
+     ]
+    }
+   ],
+   "source": [
+    "# top-hat (disk) with pyclesperanto\n",
+    "result_image = None\n",
+    "\n",
+    "test_image_gpu = pcle.push(test_image)\n",
+    "\n",
+    "for i in range(0, 5):\n",
+    "    start_time = time.time()\n",
+    "    result_image = pcle.top_hat(test_image_gpu, result_image, radius_x=radius, radius_y=radius, connectivity=\"sphere\")\n",
+    "    print(\"pyclesperanto top-hat-shere duration: \" + str(time.time() - start_time))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "pyclesperanto_prototype top-hat-box duration: 0.11823725700378418\n",
+      "pyclesperanto_prototype top-hat-box duration: 0.020470619201660156\n",
+      "pyclesperanto_prototype top-hat-box duration: 0.021088123321533203\n",
+      "pyclesperanto_prototype top-hat-box duration: 0.021039724349975586\n",
+      "pyclesperanto_prototype top-hat-box duration: 0.021093130111694336\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "4 warnings generated.\n",
+      "4 warnings generated.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# top-hat (square) using pyclesperanto_prototype\n",
+    "result_image = None\n",
+    "\n",
+    "test_image_gpu = cle.push(test_image)\n",
+    "\n",
+    "for i in range(0, 5):\n",
+    "    start_time = time.time()\n",
+    "    result_image = cle.top_hat_box(test_image_gpu, result_image, radius_x=radius, radius_y=radius)\n",
+    "    print(\"pyclesperanto_prototype top-hat-box duration: \" + str(time.time() - start_time))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "pyclesperanto top-hat-box duration: 0.11615204811096191\n",
+      "pyclesperanto top-hat-box duration: 0.009797096252441406\n",
+      "pyclesperanto top-hat-box duration: 0.009183645248413086\n",
+      "pyclesperanto top-hat-box duration: 0.00987696647644043\n",
+      "pyclesperanto top-hat-box duration: 0.009579658508300781\n"
+     ]
+    }
+   ],
+   "source": [
+    "# top-hat (square) using pyclesperanto\n",
+    "result_image = None\n",
+    "\n",
+    "test_image_gpu = pcle.push(test_image)\n",
+    "\n",
+    "for i in range(0, 5):\n",
+    "    start_time = time.time()\n",
+    "    result_image = pcle.top_hat(test_image_gpu, result_image, radius_x=radius, radius_y=radius, connectivity=\"box\")\n",
+    "    print(\"pyclesperanto top-hat-box duration: \" + str(time.time() - start_time))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "skimage top-hat disk duration: 226.4084393978119\n",
+      "skimage top-hat disk duration: 226.01446318626404\n",
+      "skimage top-hat disk duration: 228.47509145736694\n",
+      "skimage top-hat disk duration: 225.7498869895935\n",
+      "skimage top-hat disk duration: 225.4377100467682\n"
+     ]
+    }
+   ],
+   "source": [
+    "# top-hat (disk) with scikit-image\n",
+    "result_image = None\n",
+    "\n",
+    "for i in range(0, 5):\n",
+    "    start_time = time.time()\n",
+    "    result_image = morphology.white_tophat(test_image, footprint=disk_kernel)\n",
+    "    print(\"skimage top-hat disk duration: \" + str(time.time() - start_time))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "skimage top-hat square duration: 1.4154345989227295\n",
+      "skimage top-hat square duration: 1.4211006164550781\n",
+      "skimage top-hat square duration: 1.422464370727539\n",
+      "skimage top-hat square duration: 1.4164481163024902\n",
+      "skimage top-hat square duration: 1.419248342514038\n"
+     ]
+    }
+   ],
+   "source": [
+    "# top-hat (square) with scikit-image`\n",
+    "result_image = None\n",
+    "\n",
+    "for i in range(0, 5):\n",
+    "    start_time = time.time()\n",
+    "    result_image = morphology.white_tophat(test_image, footprint=square_kernel)\n",
+    "    print(\"skimage top-hat square duration: \" + str(time.time() - start_time))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Impact of `dtype` on speed\n",
+    "\n",
+    "We notice that `pyclesperanto` is significatively faster than the `prototype` when running the top-hat-box filter. \n",
+    "Both code are fairly similar and rely on the same kernel code, hence their is no justification to this speed gain.\n",
+    "\n",
+    "Indeed, from the `gaussian blur` benchmark, we can see that we do not have a speed gain even thow both rely on distributed filters.\n",
+    "The main change is from the `dtype`. Here, we are processing `uint8` (or `unsigned char`) while the `propotype` only manipulate `float32` type (x4 the memory than `uint8`).\n",
+    "\n",
+    "If we replicate the benchmarking with `float32` type, we see similar processing speed."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test_image = np.random.random([50, 1024, 1024]).astype(np.float32)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "pyclesperanto_prototype top-hat-box duration: 0.019962787628173828\n",
+      "pyclesperanto_prototype top-hat-box duration: 0.02262401580810547\n",
+      "pyclesperanto_prototype top-hat-box duration: 0.0244443416595459\n",
+      "pyclesperanto_prototype top-hat-box duration: 0.023203611373901367\n",
+      "pyclesperanto_prototype top-hat-box duration: 0.02309703826904297\n"
+     ]
+    }
+   ],
+   "source": [
+    "# top-hat (square) using prototype\n",
+    "result_image = None\n",
+    "\n",
+    "test_image_gpu = cle.push(test_image)\n",
+    "\n",
+    "for i in range(0, 5):\n",
+    "    start_time = time.time()\n",
+    "    result_image = cle.top_hat_box(test_image_gpu, result_image, radius_x=radius, radius_y=radius)\n",
+    "    print(\"pyclesperanto_prototype top-hat-box duration: \" + str(time.time() - start_time))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "pyclesperanto top-hat-box duration: 0.15494489669799805\n",
+      "pyclesperanto top-hat-box duration: 0.017775535583496094\n",
+      "pyclesperanto top-hat-box duration: 0.018291234970092773\n",
+      "pyclesperanto top-hat-box duration: 0.01889801025390625\n",
+      "pyclesperanto top-hat-box duration: 0.01804947853088379\n"
+     ]
+    }
+   ],
+   "source": [
+    "# top-hat (square) using pyclesperanto\n",
+    "result_image = None\n",
+    "\n",
+    "test_image_gpu = pcle.push(test_image)\n",
+    "\n",
+    "for i in range(0, 5):\n",
+    "    start_time = time.time()\n",
+    "    result_image = pcle.top_hat(test_image_gpu, result_image, radius_x=radius, radius_y=radius, connectivity=\"box\")\n",
+    "    print(\"pyclesperanto top-hat-box duration: \" + str(time.time() - start_time))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/benchmarking/voronoi_otsu_labeling.ipynb b/benchmarking/benchmark_voronoi_otsu_labeling.ipynb
similarity index 100%
rename from benchmarking/voronoi_otsu_labeling.ipynb
rename to benchmarking/benchmark_voronoi_otsu_labeling.ipynb