From 0ecbf77e306727d6cab08fdf2e2d1b9715f20fa7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Verschelde?= Date: Thu, 22 Apr 2021 15:30:52 +0200 Subject: [PATCH 01/84] Bump version to 3.3.1-rc Directly RC as we'll keep changes conservative to keep this branch usable in production at any time. --- misc/dist/osx_tools.app/Contents/Info.plist | 4 ++-- misc/dist/windows/godot.iss | 2 +- version.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/misc/dist/osx_tools.app/Contents/Info.plist b/misc/dist/osx_tools.app/Contents/Info.plist index 0870473e03a7..3e7257eed8af 100644 --- a/misc/dist/osx_tools.app/Contents/Info.plist +++ b/misc/dist/osx_tools.app/Contents/Info.plist @@ -19,11 +19,11 @@ CFBundlePackageType APPL CFBundleShortVersionString - 3.3 + 3.3.1 CFBundleSignature godot CFBundleVersion - 3.3 + 3.3.1 NSMicrophoneUsageDescription Microphone access is required to capture audio. NSCameraUsageDescription diff --git a/misc/dist/windows/godot.iss b/misc/dist/windows/godot.iss index 722c6263b0d9..637b5044c8e3 100644 --- a/misc/dist/windows/godot.iss +++ b/misc/dist/windows/godot.iss @@ -1,5 +1,5 @@ #define MyAppName "Godot Engine" -#define MyAppVersion "3.3" +#define MyAppVersion "3.3.1" #define MyAppPublisher "Godot Engine contributors" #define MyAppURL "https://godotengine.org/" #define MyAppExeName "godot.exe" diff --git a/version.py b/version.py index f6eae3eb14fc..296e325a7bca 100644 --- a/version.py +++ b/version.py @@ -2,8 +2,8 @@ name = "Godot Engine" major = 3 minor = 3 -patch = 0 -status = "stable" +patch = 1 +status = "rc" module_config = "" year = 2021 website = "https://godotengine.org" From e86d086573d0c9c78c2f2dbf26265cc0d5bcf5fd Mon Sep 17 00:00:00 2001 From: bruvzg <7645683+bruvzg@users.noreply.github.com> Date: Thu, 22 Apr 2021 10:24:04 +0300 Subject: [PATCH 02/84] Fix crash on GDNative API json generator exit. (cherry picked from commit a4423c82f87ad0eaa9ffc3842407d003321ac22d) --- modules/gdnative/nativescript/nativescript.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/modules/gdnative/nativescript/nativescript.cpp b/modules/gdnative/nativescript/nativescript.cpp index cc46a1fff30a..2a1a3af7818b 100644 --- a/modules/gdnative/nativescript/nativescript.cpp +++ b/modules/gdnative/nativescript/nativescript.cpp @@ -39,6 +39,8 @@ #include "core/os/os.h" #include "core/project_settings.h" +#include "main/main.h" + #include "scene/main/scene_tree.h" #include "scene/resources/resource_format_text.h" @@ -1055,6 +1057,7 @@ void NativeScriptLanguage::init() { if (generate_c_api(E->next()->get()) != OK) { ERR_PRINT("Failed to generate C API\n"); } + Main::cleanup(true); exit(0); } #endif From b616f41573ce56e872f73f160345da8a86504375 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Verschelde?= Date: Thu, 22 Apr 2021 02:18:44 +0200 Subject: [PATCH 03/84] fbx: Fix include for zlib that broke unbundling It's possible to link against system zlib on Linux, so we should use system paths. 
(cherry picked from commit 93b74061387075909d1b4d29b0e5b2924e06f7d7) --- modules/fbx/SCsub | 3 +++ modules/fbx/fbx_parser/FBXParser.cpp | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/modules/fbx/SCsub b/modules/fbx/SCsub index 84220a66fa47..0311fddfee6c 100644 --- a/modules/fbx/SCsub +++ b/modules/fbx/SCsub @@ -8,6 +8,9 @@ env_fbx = env_modules.Clone() # Make includes relative to the folder path specified here so our includes are clean env_fbx.Prepend(CPPPATH=["#modules/fbx/"]) +if env["builtin_zlib"]: + env_fbx.Prepend(CPPPATH=["#thirdparty/zlib/"]) + # Godot's own source files env_fbx.add_source_files(env.modules_sources, "tools/*.cpp") env_fbx.add_source_files(env.modules_sources, "data/*.cpp") diff --git a/modules/fbx/fbx_parser/FBXParser.cpp b/modules/fbx/fbx_parser/FBXParser.cpp index 0d737eb272ca..e9e44d94bf98 100644 --- a/modules/fbx/fbx_parser/FBXParser.cpp +++ b/modules/fbx/fbx_parser/FBXParser.cpp @@ -74,8 +74,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * @brief Implementation of the FBX parser and the rudimentary DOM that we use */ -#include "thirdparty/zlib/zlib.h" #include /* strtol */ +#include #include "ByteSwapper.h" #include "FBXParseTools.h" From bcbf7ce3b65a68e40d69baa111fe26fe5c464a4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Verschelde?= Date: Thu, 22 Apr 2021 16:23:13 +0200 Subject: [PATCH 04/84] Add type_traits include for `std::is_trivially_destructible` (cherry picked from commit 3d46f2855860e93f1cd7d8cd73a35fe34bac64dd) --- core/safe_refcount.h | 1 + 1 file changed, 1 insertion(+) diff --git a/core/safe_refcount.h b/core/safe_refcount.h index 7519329f2a58..c34d84f55b31 100644 --- a/core/safe_refcount.h +++ b/core/safe_refcount.h @@ -36,6 +36,7 @@ #if !defined(NO_THREADS) #include +#include // Design goals for these classes: // - No automatic conversions or arithmetic operators, From 33d6b1f68f807e0d25430c269f4c34e134bbdfa4 Mon Sep 17 00:00:00 2001 From: JFonS Date: Thu, 22 Apr 2021 15:01:25 +0200 Subject: [PATCH 05/84] CPU lightmapper environment energy fixes. * Better handling of the scene's environment energy in the lightmapper bakes. * Fixed a bug where ProceduralSky::get_panorama() returned a reference instead of a copy. * Removed includes to Embree's internal header files. (cherry picked from commit 2db2d1153d2deb8490c0ca5ad0f094077f382f28) --- modules/raycast/lightmap_raycaster.cpp | 30 +++++++++++--------------- scene/3d/baked_lightmap.cpp | 19 ++++++++++++---- scene/3d/baked_lightmap.h | 2 +- scene/resources/sky.cpp | 4 ++-- scene/resources/sky.h | 2 +- 5 files changed, 31 insertions(+), 26 deletions(-) diff --git a/modules/raycast/lightmap_raycaster.cpp b/modules/raycast/lightmap_raycaster.cpp index 29334b7cb02c..fac4385d7711 100644 --- a/modules/raycast/lightmap_raycaster.cpp +++ b/modules/raycast/lightmap_raycaster.cpp @@ -30,12 +30,7 @@ #include "lightmap_raycaster.h" -// From Embree. 
-#include -#include -#include - -using namespace embree; +#include LightmapRaycaster *LightmapRaycasterEmbree::create_embree_raycaster() { return memnew(LightmapRaycasterEmbree); @@ -135,25 +130,24 @@ void LightmapRaycasterEmbree::add_mesh(const Vector &p_vertices, const ERR_FAIL_COND(vertex_count % 3 != 0); ERR_FAIL_COND(vertex_count != p_uv2s.size()); + ERR_FAIL_COND(!p_normals.empty() && vertex_count != p_normals.size()); - Vec3fa *embree_vertices = (Vec3fa *)rtcSetNewGeometryBuffer(embree_mesh, RTC_BUFFER_TYPE_VERTEX, 0, RTC_FORMAT_FLOAT3, sizeof(Vec3fa), vertex_count); - Vec2fa *embree_light_uvs = (Vec2fa *)rtcSetNewGeometryBuffer(embree_mesh, RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE, 0, RTC_FORMAT_FLOAT2, sizeof(Vec2fa), vertex_count); - uint32_t *embree_triangles = (uint32_t *)rtcSetNewGeometryBuffer(embree_mesh, RTC_BUFFER_TYPE_INDEX, 0, RTC_FORMAT_UINT3, sizeof(uint32_t) * 3, vertex_count / 3); + Vector3 *embree_vertices = (Vector3 *)rtcSetNewGeometryBuffer(embree_mesh, RTC_BUFFER_TYPE_VERTEX, 0, RTC_FORMAT_FLOAT3, sizeof(Vector3), vertex_count); + copymem(embree_vertices, p_vertices.ptr(), sizeof(Vector3) * vertex_count); - Vec3fa *embree_normals = nullptr; - if (!p_normals.empty()) { - embree_normals = (Vec3fa *)rtcSetNewGeometryBuffer(embree_mesh, RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE, 1, RTC_FORMAT_FLOAT3, sizeof(Vec3fa), vertex_count); - } + Vector2 *embree_light_uvs = (Vector2 *)rtcSetNewGeometryBuffer(embree_mesh, RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE, 0, RTC_FORMAT_FLOAT2, sizeof(Vector2), vertex_count); + copymem(embree_light_uvs, p_uv2s.ptr(), sizeof(Vector2) * vertex_count); + uint32_t *embree_triangles = (uint32_t *)rtcSetNewGeometryBuffer(embree_mesh, RTC_BUFFER_TYPE_INDEX, 0, RTC_FORMAT_UINT3, sizeof(uint32_t) * 3, vertex_count / 3); for (int i = 0; i < vertex_count; i++) { - embree_vertices[i] = Vec3fa(p_vertices[i].x, p_vertices[i].y, p_vertices[i].z); - embree_light_uvs[i] = Vec2fa(p_uv2s[i].x, p_uv2s[i].y); - if (embree_normals != nullptr) { - embree_normals[i] = Vec3fa(p_normals[i].x, p_normals[i].y, p_normals[i].z); - } embree_triangles[i] = i; } + if (!p_normals.empty()) { + Vector3 *embree_normals = (Vector3 *)rtcSetNewGeometryBuffer(embree_mesh, RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE, 1, RTC_FORMAT_FLOAT3, sizeof(Vector3), vertex_count); + copymem(embree_normals, p_normals.ptr(), sizeof(Vector3) * vertex_count); + } + rtcCommitGeometry(embree_mesh); rtcSetGeometryIntersectFilterFunction(embree_mesh, filter_function); rtcSetGeometryUserData(embree_mesh, this); diff --git a/scene/3d/baked_lightmap.cpp b/scene/3d/baked_lightmap.cpp index 67c201c3e5d1..af044fccdb48 100644 --- a/scene/3d/baked_lightmap.cpp +++ b/scene/3d/baked_lightmap.cpp @@ -810,7 +810,7 @@ BakedLightmap::BakeError BakedLightmap::bake(Node *p_from_node, String p_data_sa } break; case ENVIRONMENT_MODE_CUSTOM_SKY: { if (environment_custom_sky.is_valid()) { - environment_image = _get_irradiance_from_sky(environment_custom_sky, Vector2i(128, 64)); + environment_image = _get_irradiance_from_sky(environment_custom_sky, environment_custom_energy, Vector2i(128, 64)); environment_xform.set_euler(environment_custom_sky_rotation_degrees * Math_PI / 180.0); } @@ -1233,7 +1233,7 @@ void BakedLightmap::_clear_lightmaps() { } } -Ref BakedLightmap::_get_irradiance_from_sky(Ref p_sky, Vector2i p_size) { +Ref BakedLightmap::_get_irradiance_from_sky(Ref p_sky, float p_energy, Vector2i p_size) { if (p_sky.is_null()) { return Ref(); } @@ -1245,7 +1245,7 @@ Ref BakedLightmap::_get_irradiance_from_sky(Ref p_sky, Vector2i p_si } Ref 
procedural = p_sky; if (procedural.is_valid()) { - sky_image = procedural->get_panorama(); + sky_image = procedural->get_data(); } if (sky_image.is_null()) { @@ -1254,6 +1254,17 @@ Ref BakedLightmap::_get_irradiance_from_sky(Ref p_sky, Vector2i p_si sky_image->convert(Image::FORMAT_RGBF); sky_image->resize(p_size.x, p_size.y, Image::INTERPOLATE_CUBIC); + + if (p_energy != 1.0) { + sky_image->lock(); + for (int i = 0; i < p_size.y; i++) { + for (int j = 0; j < p_size.x; j++) { + sky_image->set_pixel(j, i, sky_image->get_pixel(j, i) * p_energy); + } + } + sky_image->unlock(); + } + return sky_image; } @@ -1261,7 +1272,7 @@ Ref BakedLightmap::_get_irradiance_map(Ref p_env, Vector2i p Environment::BGMode bg_mode = p_env->get_background(); switch (bg_mode) { case Environment::BG_SKY: { - return _get_irradiance_from_sky(p_env->get_sky(), Vector2i(128, 64)); + return _get_irradiance_from_sky(p_env->get_sky(), p_env->get_bg_energy(), Vector2i(128, 64)); } case Environment::BG_CLEAR_COLOR: case Environment::BG_COLOR: { diff --git a/scene/3d/baked_lightmap.h b/scene/3d/baked_lightmap.h index 6f8f0b8f980c..09791b5d2ab2 100644 --- a/scene/3d/baked_lightmap.h +++ b/scene/3d/baked_lightmap.h @@ -187,7 +187,7 @@ class BakedLightmap : public VisualInstance { void _clear_lightmaps(); void _get_material_images(const MeshesFound &p_found_mesh, Lightmapper::MeshData &r_mesh_data, Vector > &r_albedo_textures, Vector > &r_emission_textures); - Ref _get_irradiance_from_sky(Ref p_sky, Vector2i p_size); + Ref _get_irradiance_from_sky(Ref p_sky, float p_energy, Vector2i p_size); Ref _get_irradiance_map(Ref p_env, Vector2i p_size); void _find_meshes_and_lights(Node *p_at_node, Vector &meshes, Vector &lights); Vector2i _compute_lightmap_size(const MeshesFound &p_mesh); diff --git a/scene/resources/sky.cpp b/scene/resources/sky.cpp index 0db92f213252..0bad4a2ed0cc 100644 --- a/scene/resources/sky.cpp +++ b/scene/resources/sky.cpp @@ -390,8 +390,8 @@ ProceduralSky::TextureSize ProceduralSky::get_texture_size() const { return texture_size; } -Ref ProceduralSky::get_panorama() const { - return panorama; +Ref ProceduralSky::get_data() const { + return panorama->duplicate(); } RID ProceduralSky::get_rid() const { diff --git a/scene/resources/sky.h b/scene/resources/sky.h index 5b5e6c701761..0cfd00bcda47 100644 --- a/scene/resources/sky.h +++ b/scene/resources/sky.h @@ -190,7 +190,7 @@ class ProceduralSky : public Sky { void set_texture_size(TextureSize p_size); TextureSize get_texture_size() const; - Ref get_panorama() const; + Ref get_data() const; virtual RID get_rid() const; From 66625962bff3b83caae324d6e9d402e2bfacc48f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Verschelde?= Date: Thu, 22 Apr 2021 02:03:43 +0200 Subject: [PATCH 06/84] embree: Allow building against system library on Linux (cherry picked from commit b266cc2315f2d8fb1615e2c4f18558e602a7a700) --- SConstruct | 1 + modules/raycast/SCsub | 165 +++++++++++++++++++++-------------------- platform/x11/detect.py | 4 + 3 files changed, 90 insertions(+), 80 deletions(-) diff --git a/SConstruct b/SConstruct index c30f53308b78..7abe58ba49c1 100644 --- a/SConstruct +++ b/SConstruct @@ -151,6 +151,7 @@ opts.Add(BoolVariable("use_precise_math_checks", "Math checks use very precise e # Thirdparty libraries opts.Add(BoolVariable("builtin_bullet", "Use the built-in Bullet library", True)) opts.Add(BoolVariable("builtin_certs", "Use the built-in SSL certificates bundles", True)) +opts.Add(BoolVariable("builtin_embree", "Use the built-in Embree library", True)) 
opts.Add(BoolVariable("builtin_enet", "Use the built-in ENet library", True)) opts.Add(BoolVariable("builtin_freetype", "Use the built-in FreeType library", True)) opts.Add(BoolVariable("builtin_libogg", "Use the built-in libogg library", True)) diff --git a/modules/raycast/SCsub b/modules/raycast/SCsub index 789d7d32514a..e6808d76ba1c 100644 --- a/modules/raycast/SCsub +++ b/modules/raycast/SCsub @@ -3,92 +3,97 @@ Import("env") Import("env_modules") -embree_src = [ - "common/sys/sysinfo.cpp", - "common/sys/alloc.cpp", - "common/sys/filename.cpp", - "common/sys/library.cpp", - "common/sys/thread.cpp", - "common/sys/string.cpp", - "common/sys/regression.cpp", - "common/sys/mutex.cpp", - "common/sys/condition.cpp", - "common/sys/barrier.cpp", - "common/math/constants.cpp", - "common/simd/sse.cpp", - "common/lexers/stringstream.cpp", - "common/lexers/tokenstream.cpp", - "common/tasking/taskschedulerinternal.cpp", - "common/algorithms/parallel_for.cpp", - "common/algorithms/parallel_reduce.cpp", - "common/algorithms/parallel_prefix_sum.cpp", - "common/algorithms/parallel_for_for.cpp", - "common/algorithms/parallel_for_for_prefix_sum.cpp", - "common/algorithms/parallel_partition.cpp", - "common/algorithms/parallel_sort.cpp", - "common/algorithms/parallel_set.cpp", - "common/algorithms/parallel_map.cpp", - "common/algorithms/parallel_filter.cpp", - "kernels/common/device.cpp", - "kernels/common/stat.cpp", - "kernels/common/acceln.cpp", - "kernels/common/accelset.cpp", - "kernels/common/state.cpp", - "kernels/common/rtcore.cpp", - "kernels/common/rtcore_builder.cpp", - "kernels/common/scene.cpp", - "kernels/common/alloc.cpp", - "kernels/common/geometry.cpp", - "kernels/common/scene_triangle_mesh.cpp", - "kernels/geometry/primitive4.cpp", - "kernels/builders/primrefgen.cpp", - "kernels/bvh/bvh.cpp", - "kernels/bvh/bvh_statistics.cpp", - "kernels/bvh/bvh4_factory.cpp", - "kernels/bvh/bvh8_factory.cpp", - "kernels/bvh/bvh_collider.cpp", - "kernels/bvh/bvh_rotate.cpp", - "kernels/bvh/bvh_refit.cpp", - "kernels/bvh/bvh_builder.cpp", - "kernels/bvh/bvh_builder_morton.cpp", - "kernels/bvh/bvh_builder_sah.cpp", - "kernels/bvh/bvh_builder_sah_spatial.cpp", - "kernels/bvh/bvh_builder_sah_mb.cpp", - "kernels/bvh/bvh_builder_twolevel.cpp", - "kernels/bvh/bvh_intersector1_bvh4.cpp", -] +env_raycast = env_modules.Clone() + +# Thirdparty source files -embree_dir = "#thirdparty/embree/" +if env["builtin_embree"]: + thirdparty_dir = "#thirdparty/embree/" -env_embree = env_modules.Clone() -embree_sources = [embree_dir + file for file in embree_src] -env_embree.Prepend(CPPPATH=[embree_dir, embree_dir + "include"]) -env_embree.Append( - CPPFLAGS=[ - "-DEMBREE_TARGET_SSE2", - "-DEMBREE_LOWEST_ISA", - "-DTASKING_INTERNAL", - "-DNDEBUG", - "-D__SSE2__", - "-D__SSE__", + embree_src = [ + "common/sys/sysinfo.cpp", + "common/sys/alloc.cpp", + "common/sys/filename.cpp", + "common/sys/library.cpp", + "common/sys/thread.cpp", + "common/sys/string.cpp", + "common/sys/regression.cpp", + "common/sys/mutex.cpp", + "common/sys/condition.cpp", + "common/sys/barrier.cpp", + "common/math/constants.cpp", + "common/simd/sse.cpp", + "common/lexers/stringstream.cpp", + "common/lexers/tokenstream.cpp", + "common/tasking/taskschedulerinternal.cpp", + "common/algorithms/parallel_for.cpp", + "common/algorithms/parallel_reduce.cpp", + "common/algorithms/parallel_prefix_sum.cpp", + "common/algorithms/parallel_for_for.cpp", + "common/algorithms/parallel_for_for_prefix_sum.cpp", + "common/algorithms/parallel_partition.cpp", + 
"common/algorithms/parallel_sort.cpp", + "common/algorithms/parallel_set.cpp", + "common/algorithms/parallel_map.cpp", + "common/algorithms/parallel_filter.cpp", + "kernels/common/device.cpp", + "kernels/common/stat.cpp", + "kernels/common/acceln.cpp", + "kernels/common/accelset.cpp", + "kernels/common/state.cpp", + "kernels/common/rtcore.cpp", + "kernels/common/rtcore_builder.cpp", + "kernels/common/scene.cpp", + "kernels/common/alloc.cpp", + "kernels/common/geometry.cpp", + "kernels/common/scene_triangle_mesh.cpp", + "kernels/geometry/primitive4.cpp", + "kernels/builders/primrefgen.cpp", + "kernels/bvh/bvh.cpp", + "kernels/bvh/bvh_statistics.cpp", + "kernels/bvh/bvh4_factory.cpp", + "kernels/bvh/bvh8_factory.cpp", + "kernels/bvh/bvh_collider.cpp", + "kernels/bvh/bvh_rotate.cpp", + "kernels/bvh/bvh_refit.cpp", + "kernels/bvh/bvh_builder.cpp", + "kernels/bvh/bvh_builder_morton.cpp", + "kernels/bvh/bvh_builder_sah.cpp", + "kernels/bvh/bvh_builder_sah_spatial.cpp", + "kernels/bvh/bvh_builder_sah_mb.cpp", + "kernels/bvh/bvh_builder_twolevel.cpp", + "kernels/bvh/bvh_intersector1_bvh4.cpp", ] -) -if not env_embree.msvc: - env_embree.Append(CPPFLAGS=["-msse2", "-mxsave"]) - if env["platform"] == "windows": - env_embree.Append(CPPFLAGS=["-mstackrealign"]) + thirdparty_sources = [thirdparty_dir + file for file in embree_src] -if env["platform"] == "windows": - if env.msvc: - env.Append(LINKFLAGS=["psapi.lib"]) - else: - env.Append(LIBS=["psapi"]) + env_raycast.Prepend(CPPPATH=[thirdparty_dir, thirdparty_dir + "include"]) + env_raycast.Append( + CPPDEFINES=[ + "EMBREE_TARGET_SSE2", + "EMBREE_LOWEST_ISA", + "TASKING_INTERNAL", + "NDEBUG", + "__SSE2__", + "__SSE__", + ] + ) -env_embree.disable_warnings() -env_embree.add_source_files(env.modules_sources, embree_sources) + if not env.msvc: + env_raycast.Append(CPPFLAGS=["-msse2", "-mxsave"]) + if env["platform"] == "windows": + env_raycast.Append(CPPFLAGS=["-mstackrealign"]) + + if env["platform"] == "windows": + if env.msvc: + env.Append(LINKFLAGS=["psapi.lib"]) + else: + env.Append(LIBS=["psapi"]) + + env_thirdparty = env_raycast.Clone() + env_thirdparty.disable_warnings() + env_thirdparty.add_source_files(env.modules_sources, thirdparty_sources) -env_raycast = env_modules.Clone() -env_raycast.Prepend(CPPPATH=[embree_dir, embree_dir + "include", embree_dir + "common"]) +# Godot source files env_raycast.add_source_files(env.modules_sources, "*.cpp") diff --git a/platform/x11/detect.py b/platform/x11/detect.py index ba5fb30d2edf..7513bd701fd8 100644 --- a/platform/x11/detect.py +++ b/platform/x11/detect.py @@ -310,6 +310,10 @@ def configure(env): if not env["builtin_pcre2"]: env.ParseConfig("pkg-config libpcre2-32 --cflags --libs") + if not env["builtin_embree"]: + # No pkgconfig file so far, hardcode expected lib name. 
+ env.Append(LIBS=["embree3"]) + ## Flags if os.system("pkg-config --exists alsa") == 0: # 0 means found From 1e3166115acf9980cfbcc0d7ea760fc38e6dcb65 Mon Sep 17 00:00:00 2001 From: Hugo Locurcio Date: Thu, 22 Apr 2021 15:56:47 +0200 Subject: [PATCH 07/84] Tweak lightmapper warning message to mention Rosetta emulation on macOS (cherry picked from commit 47f869b731d7821f02eb601e409bd3ce347ed30b) --- editor/plugins/baked_lightmap_editor_plugin.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/editor/plugins/baked_lightmap_editor_plugin.cpp b/editor/plugins/baked_lightmap_editor_plugin.cpp index 0064f8293f78..847ce717856d 100644 --- a/editor/plugins/baked_lightmap_editor_plugin.cpp +++ b/editor/plugins/baked_lightmap_editor_plugin.cpp @@ -70,7 +70,11 @@ void BakedLightmapEditorPlugin::_bake_select_file(const String &p_file) { EditorNode::get_singleton()->show_warning(TTR("Some mesh is invalid. Make sure the UV2 channel values are contained within the [0.0,1.0] square region.")); break; case BakedLightmap::BAKE_ERROR_NO_LIGHTMAPPER: - EditorNode::get_singleton()->show_warning(TTR("Godot editor was built without ray tracing support, lightmaps can't be baked.")); +#ifdef OSX_ENABLED + EditorNode::get_singleton()->show_warning(TTR("Godot editor was built without ray tracing support; lightmaps can't be baked.\nIf you are using an Apple Silicon-based Mac, try forcing Rosetta emulation on Godot.app in the application settings\nthen restart the editor.")); +#else + EditorNode::get_singleton()->show_warning(TTR("Godot editor was built without ray tracing support; lightmaps can't be baked.")); +#endif break; default: { } From aa84787fb4f44ea7bd4c7449cd050b3b95d33e78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Verschelde?= Date: Thu, 22 Apr 2021 18:53:43 +0200 Subject: [PATCH 08/84] lightmapper: Disable build if raycast module can't build We need to propagate the hacky checks from the raycast config to the lightmapper config, as the failure of a `can_build()` check is not notified to other modules (which might even be checked further depending on the processing order in SConstruct). A more thorough fix would be to change SConstruct to do two loops on modules: one to check `can_build()` and disable modules which can't build, then another one to rechecked `can_build()` with the new lineup and do further config. But there would be more risk for regressions than with this ad hoc hack. Similar story for the `platform/x11/detect.py` change... oh my eyes :( (cherry picked from commit a2c68d9da71053efa3ca7de6162aa71bc3651b92) --- modules/lightmapper_cpu/config.py | 25 ++++++++++++++++++++++++- platform/x11/detect.py | 5 ++++- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/modules/lightmapper_cpu/config.py b/modules/lightmapper_cpu/config.py index d01c1726dd34..0b8837aa4edd 100644 --- a/modules/lightmapper_cpu/config.py +++ b/modules/lightmapper_cpu/config.py @@ -1,5 +1,28 @@ def can_build(env, platform): - return env["tools"] and env["module_raycast_enabled"] + if not env["tools"] or not env["module_raycast_enabled"]: + return False + + # Depends on raycast module (embree), but we can't have access to the result of + # `can_build()` for that module, so we need to duplicate that code as a short-term + # solution. + + # Embree requires at least SSE2 to be available, so 32-bit and ARM64 builds are + # not supported. + # It's also only relevant for tools build and desktop platforms, + # as doing lightmap generation on Android or HTML5 would be a bit far-fetched. 
+ supported_platform = platform in ["x11", "osx", "windows", "server"] + supported_bits = env["bits"] == "64" + supported_arch = env["arch"] != "arm64" + + # Hack to disable on Linux arm64. This won't work well for cross-compilation (checks + # host, not target) and would need a more thorough fix by refactoring our arch and + # bits-handling code. + from platform import machine + + if platform == "x11" and machine() != "x86_64": + supported_arch = False + + return supported_platform and supported_bits and supported_arch def configure(env): diff --git a/platform/x11/detect.py b/platform/x11/detect.py index 7513bd701fd8..ada0669a69c0 100644 --- a/platform/x11/detect.py +++ b/platform/x11/detect.py @@ -310,7 +310,10 @@ def configure(env): if not env["builtin_pcre2"]: env.ParseConfig("pkg-config libpcre2-32 --cflags --libs") - if not env["builtin_embree"]: + # Embree is only compatible with x86_64. Yet another unreliable hack that will break + # cross-compilation, this will really need to be handle better. Thankfully only affects + # people who disable builtin_embree (likely distro packagers). + if not env["builtin_embree"] and (is64 and platform.machine() == "x86_64"): # No pkgconfig file so far, hardcode expected lib name. env.Append(LIBS=["embree3"]) From 162c78f9dca3669f84c837356730a5d33a63060b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Verschelde?= Date: Fri, 23 Apr 2021 10:25:51 +0200 Subject: [PATCH 09/84] Linux: Don't attempt linking embree3 on non-tools, link it for headless too `tech_debt++`, that's what we get for not taking the time to cleanup all this and do it right... Follow-up to #48073 and #48102. (cherry picked from commit a14b51df924cbdd915f46571b396d6b9ac6e84ff) --- platform/server/detect.py | 7 +++++++ platform/x11/detect.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/platform/server/detect.py b/platform/server/detect.py index 59eb3dc5cbc2..49fbdedd3f4b 100644 --- a/platform/server/detect.py +++ b/platform/server/detect.py @@ -222,6 +222,13 @@ def configure(env): if not env["builtin_pcre2"]: env.ParseConfig("pkg-config libpcre2-32 --cflags --libs") + # Embree is only compatible with x86_64. Yet another unreliable hack that will break + # cross-compilation, this will really need to be handle better. Thankfully only affects + # people who disable builtin_embree (likely distro packagers). + if env["tools"] and not env["builtin_embree"] and (is64 and platform.machine() == "x86_64"): + # No pkgconfig file so far, hardcode expected lib name. + env.Append(LIBS=["embree3"]) + ## Flags # Linkflags below this line should typically stay the last ones diff --git a/platform/x11/detect.py b/platform/x11/detect.py index ada0669a69c0..1a5204415090 100644 --- a/platform/x11/detect.py +++ b/platform/x11/detect.py @@ -313,7 +313,7 @@ def configure(env): # Embree is only compatible with x86_64. Yet another unreliable hack that will break # cross-compilation, this will really need to be handle better. Thankfully only affects # people who disable builtin_embree (likely distro packagers). - if not env["builtin_embree"] and (is64 and platform.machine() == "x86_64"): + if env["tools"] and not env["builtin_embree"] and (is64 and platform.machine() == "x86_64"): # No pkgconfig file so far, hardcode expected lib name. 
env.Append(LIBS=["embree3"]) From 5e16b108244e2d6c12bfbd39ec4716437ff2b6bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Verschelde?= Date: Fri, 23 Apr 2021 21:32:46 +0200 Subject: [PATCH 10/84] Android: Fix get_buffer false positive on empty dest buffer Follow-up to #46810, this was missed in #47079 when fixing the issue for other platforms. Fixes #48135. (cherry picked from commit a09f3833bda5c1c695a137d2eb153aeb06141484) --- platform/android/file_access_android.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/platform/android/file_access_android.cpp b/platform/android/file_access_android.cpp index 7396c5a108cb..f6e131b6f01e 100644 --- a/platform/android/file_access_android.cpp +++ b/platform/android/file_access_android.cpp @@ -125,7 +125,7 @@ uint8_t FileAccessAndroid::get_8() const { } int FileAccessAndroid::get_buffer(uint8_t *p_dst, int p_length) const { - ERR_FAIL_COND_V(!p_dst, -1); + ERR_FAIL_COND_V(!p_dst && p_length > 0, -1); ERR_FAIL_COND_V(p_length < 0, -1); off_t r = AAsset_read(a, p_dst, p_length); From 3b44829fbdae077597dced3670aade484b32ec73 Mon Sep 17 00:00:00 2001 From: Arthur Paulino Date: Mon, 12 Oct 2020 20:24:19 -0300 Subject: [PATCH 11/84] Updating KinematicBody2D "is_on" functions' descriptions (cherry picked from commit 0c9a1a1cd4c5ebfbabe5b04e81a5ddb24b3dcb06) --- doc/classes/KinematicBody.xml | 6 +++--- doc/classes/KinematicBody2D.xml | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/classes/KinematicBody.xml b/doc/classes/KinematicBody.xml index d340d9194c62..fafd23dd9d48 100644 --- a/doc/classes/KinematicBody.xml +++ b/doc/classes/KinematicBody.xml @@ -59,21 +59,21 @@ - Returns [code]true[/code] if the body is on the ceiling. Only updates when calling [method move_and_slide] or [method move_and_slide_with_snap]. + Returns [code]true[/code] if the body collided with the ceiling on the last call of [method move_and_slide] or [method move_and_slide_with_snap]. Otherwise, returns [code]false[/code]. - Returns [code]true[/code] if the body is on the floor. Only updates when calling [method move_and_slide] or [method move_and_slide_with_snap]. + Returns [code]true[/code] if the body collided with the floor on the last call of [method move_and_slide] or [method move_and_slide_with_snap]. Otherwise, returns [code]false[/code]. - Returns [code]true[/code] if the body is on a wall. Only updates when calling [method move_and_slide] or [method move_and_slide_with_snap]. + Returns [code]true[/code] if the body collided with a wall on the last call of [method move_and_slide] or [method move_and_slide_with_snap]. Otherwise, returns [code]false[/code]. diff --git a/doc/classes/KinematicBody2D.xml b/doc/classes/KinematicBody2D.xml index 6d98617943b5..feb3f5fe5b0a 100644 --- a/doc/classes/KinematicBody2D.xml +++ b/doc/classes/KinematicBody2D.xml @@ -55,21 +55,21 @@ - Returns [code]true[/code] if the body is on the ceiling. Only updates when calling [method move_and_slide] or [method move_and_slide_with_snap]. + Returns [code]true[/code] if the body collided with the ceiling on the last call of [method move_and_slide] or [method move_and_slide_with_snap]. Otherwise, returns [code]false[/code]. - Returns [code]true[/code] if the body is on the floor. Only updates when calling [method move_and_slide] or [method move_and_slide_with_snap]. + Returns [code]true[/code] if the body collided with the floor on the last call of [method move_and_slide] or [method move_and_slide_with_snap]. Otherwise, returns [code]false[/code]. 
- Returns [code]true[/code] if the body is on a wall. Only updates when calling [method move_and_slide] or [method move_and_slide_with_snap]. + Returns [code]true[/code] if the body collided with a wall on the last call of [method move_and_slide] or [method move_and_slide_with_snap]. Otherwise, returns [code]false[/code]. From dc98144b99f35cc6446e297e2c6082d8c441d540 Mon Sep 17 00:00:00 2001 From: Hugo Locurcio Date: Thu, 22 Apr 2021 20:07:54 +0200 Subject: [PATCH 12/84] Link to Feature tags more explicitly in ProjectSettings documentation (cherry picked from commit 188bd5638c9c9f1807b79c1d67f15915f477be91) --- doc/classes/ProjectSettings.xml | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/classes/ProjectSettings.xml b/doc/classes/ProjectSettings.xml index 1ac058a64fbb..83e1cf1d1226 100644 --- a/doc/classes/ProjectSettings.xml +++ b/doc/classes/ProjectSettings.xml @@ -6,6 +6,7 @@ Contains global variables accessible from everywhere. Use [method get_setting], [method set_setting] or [method has_setting] to access them. Variables stored in [code]project.godot[/code] are also loaded into ProjectSettings, making this object very useful for reading custom game configuration options. When naming a Project Settings property, use the full path to the setting including the category. For example, [code]"application/config/name"[/code] for the project name. Category and property names can be viewed in the Project Settings dialog. + [b]Feature tags:[/b] Project settings can be overriden for specific platforms and configurations (debug, release, ...) using [url=https://docs.godotengine.org/en/latest/tutorials/export/feature_tags.html]feature tags[/url]. [b]Overriding:[/b] Any project setting can be overridden by creating a file named [code]override.cfg[/code] in the project's root directory. This can also be used in exported projects by placing this file in the same directory as the project binary. Overriding will still take the base project settings' [url=https://docs.godotengine.org/en/latest/tutorials/export/feature_tags.html]feature tags[/url] in account. Therefore, make sure to [i]also[/i] override the setting with the desired feature tags if you want them to override base project settings on all platforms and configurations. From bceaef6500bf6941cd23856d0eeb810101c26f25 Mon Sep 17 00:00:00 2001 From: Yuri Sizov Date: Thu, 22 Apr 2021 21:54:35 +0300 Subject: [PATCH 13/84] Correct pre-deprication warning message regarding linuxbsd/x11 platform (cherry picked from commit dbd4b45ca23a05ee373f3037b178dca40089a588) --- SConstruct | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SConstruct b/SConstruct index 7abe58ba49c1..d138c7b250e5 100644 --- a/SConstruct +++ b/SConstruct @@ -213,7 +213,7 @@ else: if selected_platform in ["linux", "bsd", "linuxbsd"]: if selected_platform == "linuxbsd": # Alias for forward compatibility. - print('Platform "linuxbsd" is still called "x11" in Godot 3.2.x. Building for platform "x11".') + print('Platform "linuxbsd" is still called "x11" in Godot 3.x. Building for platform "x11".') # Alias for convenience. 
selected_platform = "x11" From 9201ffa9a26b8bb62094c780c3affdf40a7f4da7 Mon Sep 17 00:00:00 2001 From: kleonc <9283098+kleonc@users.noreply.github.com> Date: Thu, 22 Apr 2021 19:59:16 +0200 Subject: [PATCH 14/84] Improve some argument names for core types (cherry picked from commit 4d7f642fb3c14c8d7f130a56f1af81bff29f33fc) --- core/color.h | 10 +-- core/math/basis.cpp | 12 ++-- core/math/basis.h | 2 +- core/math/quat.cpp | 72 +++++++++---------- core/math/quat.h | 60 ++++++++-------- core/math/vector2.cpp | 8 +-- core/math/vector2.h | 34 ++++----- core/math/vector3.cpp | 8 +-- core/math/vector3.h | 52 +++++++------- core/variant_call.cpp | 22 +++--- doc/classes/Basis.xml | 4 +- doc/classes/Color.xml | 6 +- doc/classes/Quat.xml | 12 ++-- doc/classes/Transform2D.xml | 2 +- doc/classes/Vector2.xml | 16 ++--- doc/classes/Vector3.xml | 16 ++--- .../glue/GodotSharp/GodotSharp/Core/Quat.cs | 5 +- .../GodotSharp/GodotSharp/Core/Vector2.cs | 5 +- .../GodotSharp/GodotSharp/Core/Vector3.cs | 5 +- 19 files changed, 177 insertions(+), 174 deletions(-) diff --git a/core/color.h b/core/color.h index f1b8de3a571b..a9c63a349ff9 100644 --- a/core/color.h +++ b/core/color.h @@ -93,14 +93,14 @@ struct Color { Color inverted() const; Color contrasted() const; - _FORCE_INLINE_ Color linear_interpolate(const Color &p_b, float p_t) const { + _FORCE_INLINE_ Color linear_interpolate(const Color &p_to, float p_weight) const { Color res = *this; - res.r += (p_t * (p_b.r - r)); - res.g += (p_t * (p_b.g - g)); - res.b += (p_t * (p_b.b - b)); - res.a += (p_t * (p_b.a - a)); + res.r += (p_weight * (p_to.r - r)); + res.g += (p_weight * (p_to.g - g)); + res.b += (p_weight * (p_to.b - b)); + res.a += (p_weight * (p_to.a - a)); return res; } diff --git a/core/math/basis.cpp b/core/math/basis.cpp index 5885d531a0ab..a00e703cdb45 100644 --- a/core/math/basis.cpp +++ b/core/math/basis.cpp @@ -1033,16 +1033,16 @@ void Basis::set_diagonal(const Vector3 &p_diag) { elements[2][2] = p_diag.z; } -Basis Basis::slerp(const Basis &target, const real_t &t) const { +Basis Basis::slerp(const Basis &p_to, const real_t &p_weight) const { //consider scale Quat from(*this); - Quat to(target); + Quat to(p_to); - Basis b(from.slerp(to, t)); - b.elements[0] *= Math::lerp(elements[0].length(), target.elements[0].length(), t); - b.elements[1] *= Math::lerp(elements[1].length(), target.elements[1].length(), t); - b.elements[2] *= Math::lerp(elements[2].length(), target.elements[2].length(), t); + Basis b(from.slerp(to, p_weight)); + b.elements[0] *= Math::lerp(elements[0].length(), p_to.elements[0].length(), p_weight); + b.elements[1] *= Math::lerp(elements[1].length(), p_to.elements[1].length(), p_weight); + b.elements[2] *= Math::lerp(elements[2].length(), p_to.elements[2].length(), p_weight); return b; } diff --git a/core/math/basis.h b/core/math/basis.h index 01dbd72a833f..83b51e5abbdc 100644 --- a/core/math/basis.h +++ b/core/math/basis.h @@ -172,7 +172,7 @@ class Basis { bool is_diagonal() const; bool is_rotation() const; - Basis slerp(const Basis &target, const real_t &t) const; + Basis slerp(const Basis &p_to, const real_t &p_weight) const; operator String() const; diff --git a/core/math/quat.cpp b/core/math/quat.cpp index 8df56ee5a8a6..72d0a9923c2a 100644 --- a/core/math/quat.cpp +++ b/core/math/quat.cpp @@ -106,18 +106,18 @@ Vector3 Quat::get_euler_yxz() const { return m.get_euler_yxz(); } -void Quat::operator*=(const Quat &q) { +void Quat::operator*=(const Quat &p_q) { - set(w * q.x + x * q.w + y * q.z - z * q.y, - w * q.y + y * q.w + z * 
q.x - x * q.z, - w * q.z + z * q.w + x * q.y - y * q.x, - w * q.w - x * q.x - y * q.y - z * q.z); + set(w * p_q.x + x * p_q.w + y * p_q.z - z * p_q.y, + w * p_q.y + y * p_q.w + z * p_q.x - x * p_q.z, + w * p_q.z + z * p_q.w + x * p_q.y - y * p_q.x, + w * p_q.w - x * p_q.x - y * p_q.y - z * p_q.z); } -Quat Quat::operator*(const Quat &q) const { +Quat Quat::operator*(const Quat &p_q) const { Quat r = *this; - r *= q; + r *= p_q; return r; } @@ -150,29 +150,29 @@ Quat Quat::inverse() const { return Quat(-x, -y, -z, w); } -Quat Quat::slerp(const Quat &q, const real_t &t) const { +Quat Quat::slerp(const Quat &p_to, const real_t &p_weight) const { #ifdef MATH_CHECKS ERR_FAIL_COND_V_MSG(!is_normalized(), Quat(), "The start quaternion must be normalized."); - ERR_FAIL_COND_V_MSG(!q.is_normalized(), Quat(), "The end quaternion must be normalized."); + ERR_FAIL_COND_V_MSG(!p_to.is_normalized(), Quat(), "The end quaternion must be normalized."); #endif Quat to1; real_t omega, cosom, sinom, scale0, scale1; // calc cosine - cosom = dot(q); + cosom = dot(p_to); // adjust signs (if necessary) if (cosom < 0.0) { cosom = -cosom; - to1.x = -q.x; - to1.y = -q.y; - to1.z = -q.z; - to1.w = -q.w; + to1.x = -p_to.x; + to1.y = -p_to.y; + to1.z = -p_to.z; + to1.w = -p_to.w; } else { - to1.x = q.x; - to1.y = q.y; - to1.z = q.z; - to1.w = q.w; + to1.x = p_to.x; + to1.y = p_to.y; + to1.z = p_to.z; + to1.w = p_to.w; } // calculate coefficients @@ -181,13 +181,13 @@ Quat Quat::slerp(const Quat &q, const real_t &t) const { // standard case (slerp) omega = Math::acos(cosom); sinom = Math::sin(omega); - scale0 = Math::sin((1.0 - t) * omega) / sinom; - scale1 = Math::sin(t * omega) / sinom; + scale0 = Math::sin((1.0 - p_weight) * omega) / sinom; + scale1 = Math::sin(p_weight * omega) / sinom; } else { // "from" and "to" quaternions are very close // ... 
so we can do a linear interpolation - scale0 = 1.0 - t; - scale1 = t; + scale0 = 1.0 - p_weight; + scale1 = p_weight; } // calculate final values return Quat( @@ -197,37 +197,37 @@ Quat Quat::slerp(const Quat &q, const real_t &t) const { scale0 * w + scale1 * to1.w); } -Quat Quat::slerpni(const Quat &q, const real_t &t) const { +Quat Quat::slerpni(const Quat &p_to, const real_t &p_weight) const { #ifdef MATH_CHECKS ERR_FAIL_COND_V_MSG(!is_normalized(), Quat(), "The start quaternion must be normalized."); - ERR_FAIL_COND_V_MSG(!q.is_normalized(), Quat(), "The end quaternion must be normalized."); + ERR_FAIL_COND_V_MSG(!p_to.is_normalized(), Quat(), "The end quaternion must be normalized."); #endif const Quat &from = *this; - real_t dot = from.dot(q); + real_t dot = from.dot(p_to); if (Math::absf(dot) > 0.9999) return from; real_t theta = Math::acos(dot), sinT = 1.0 / Math::sin(theta), - newFactor = Math::sin(t * theta) * sinT, - invFactor = Math::sin((1.0 - t) * theta) * sinT; + newFactor = Math::sin(p_weight * theta) * sinT, + invFactor = Math::sin((1.0 - p_weight) * theta) * sinT; - return Quat(invFactor * from.x + newFactor * q.x, - invFactor * from.y + newFactor * q.y, - invFactor * from.z + newFactor * q.z, - invFactor * from.w + newFactor * q.w); + return Quat(invFactor * from.x + newFactor * p_to.x, + invFactor * from.y + newFactor * p_to.y, + invFactor * from.z + newFactor * p_to.z, + invFactor * from.w + newFactor * p_to.w); } -Quat Quat::cubic_slerp(const Quat &q, const Quat &prep, const Quat &postq, const real_t &t) const { +Quat Quat::cubic_slerp(const Quat &p_b, const Quat &p_pre_a, const Quat &p_post_b, const real_t &p_weight) const { #ifdef MATH_CHECKS ERR_FAIL_COND_V_MSG(!is_normalized(), Quat(), "The start quaternion must be normalized."); - ERR_FAIL_COND_V_MSG(!q.is_normalized(), Quat(), "The end quaternion must be normalized."); + ERR_FAIL_COND_V_MSG(!p_b.is_normalized(), Quat(), "The end quaternion must be normalized."); #endif //the only way to do slerp :| - real_t t2 = (1.0 - t) * t * 2; - Quat sp = this->slerp(q, t); - Quat sq = prep.slerpni(postq, t); + real_t t2 = (1.0 - p_weight) * p_weight * 2; + Quat sp = this->slerp(p_b, p_weight); + Quat sq = p_pre_a.slerpni(p_post_b, p_weight); return sp.slerpni(sq, t2); } diff --git a/core/math/quat.h b/core/math/quat.h index f0042f4496ed..20a94b9df464 100644 --- a/core/math/quat.h +++ b/core/math/quat.h @@ -49,7 +49,7 @@ class Quat { Quat normalized() const; bool is_normalized() const; Quat inverse() const; - _FORCE_INLINE_ real_t dot(const Quat &q) const; + _FORCE_INLINE_ real_t dot(const Quat &p_q) const; void set_euler_xyz(const Vector3 &p_euler); Vector3 get_euler_xyz() const; @@ -59,9 +59,9 @@ class Quat { void set_euler(const Vector3 &p_euler) { set_euler_yxz(p_euler); }; Vector3 get_euler() const { return get_euler_yxz(); }; - Quat slerp(const Quat &q, const real_t &t) const; - Quat slerpni(const Quat &q, const real_t &t) const; - Quat cubic_slerp(const Quat &q, const Quat &prep, const Quat &postq, const real_t &t) const; + Quat slerp(const Quat &p_to, const real_t &p_weight) const; + Quat slerpni(const Quat &p_to, const real_t &p_weight) const; + Quat cubic_slerp(const Quat &p_b, const Quat &p_pre_a, const Quat &p_post_b, const real_t &p_weight) const; void set_axis_angle(const Vector3 &axis, const real_t &angle); _FORCE_INLINE_ void get_axis_angle(Vector3 &r_axis, real_t &r_angle) const { @@ -72,8 +72,8 @@ class Quat { r_axis.z = z * r; } - void operator*=(const Quat &q); - Quat operator*(const Quat &q) const; + 
void operator*=(const Quat &p_q); + Quat operator*(const Quat &p_q) const; Quat operator*(const Vector3 &v) const { return Quat(w * v.x + y * v.z - z * v.y, @@ -91,8 +91,8 @@ class Quat { return v + ((uv * w) + u.cross(uv)) * ((real_t)2); } - _FORCE_INLINE_ void operator+=(const Quat &q); - _FORCE_INLINE_ void operator-=(const Quat &q); + _FORCE_INLINE_ void operator+=(const Quat &p_q); + _FORCE_INLINE_ void operator-=(const Quat &p_q); _FORCE_INLINE_ void operator*=(const real_t &s); _FORCE_INLINE_ void operator/=(const real_t &s); _FORCE_INLINE_ Quat operator+(const Quat &q2) const; @@ -121,18 +121,18 @@ class Quat { Quat(const Vector3 &axis, const real_t &angle) { set_axis_angle(axis, angle); } Quat(const Vector3 &euler) { set_euler(euler); } - Quat(const Quat &q) : - x(q.x), - y(q.y), - z(q.z), - w(q.w) { + Quat(const Quat &p_q) : + x(p_q.x), + y(p_q.y), + z(p_q.z), + w(p_q.w) { } - Quat operator=(const Quat &q) { - x = q.x; - y = q.y; - z = q.z; - w = q.w; + Quat operator=(const Quat &p_q) { + x = p_q.x; + y = p_q.y; + z = p_q.z; + w = p_q.w; return *this; } @@ -166,26 +166,26 @@ class Quat { } }; -real_t Quat::dot(const Quat &q) const { - return x * q.x + y * q.y + z * q.z + w * q.w; +real_t Quat::dot(const Quat &p_q) const { + return x * p_q.x + y * p_q.y + z * p_q.z + w * p_q.w; } real_t Quat::length_squared() const { return dot(*this); } -void Quat::operator+=(const Quat &q) { - x += q.x; - y += q.y; - z += q.z; - w += q.w; +void Quat::operator+=(const Quat &p_q) { + x += p_q.x; + y += p_q.y; + z += p_q.z; + w += p_q.w; } -void Quat::operator-=(const Quat &q) { - x -= q.x; - y -= q.y; - z -= q.z; - w -= q.w; +void Quat::operator-=(const Quat &p_q) { + x -= p_q.x; + y -= p_q.y; + z -= p_q.z; + w -= p_q.w; } void Quat::operator*=(const real_t &s) { diff --git a/core/math/vector2.cpp b/core/math/vector2.cpp index b9c83d924617..3f91f201fd5a 100644 --- a/core/math/vector2.cpp +++ b/core/math/vector2.cpp @@ -134,8 +134,8 @@ Vector2 Vector2::posmodv(const Vector2 &p_modv) const { return Vector2(Math::fposmod(x, p_modv.x), Math::fposmod(y, p_modv.y)); } -Vector2 Vector2::project(const Vector2 &p_b) const { - return p_b * (dot(p_b) / p_b.length_squared()); +Vector2 Vector2::project(const Vector2 &p_to) const { + return p_to * (dot(p_to) / p_to.length_squared()); } Vector2 Vector2::snapped(const Vector2 &p_by) const { @@ -158,14 +158,14 @@ Vector2 Vector2::clamped(real_t p_len) const { return v; } -Vector2 Vector2::cubic_interpolate(const Vector2 &p_b, const Vector2 &p_pre_a, const Vector2 &p_post_b, real_t p_t) const { +Vector2 Vector2::cubic_interpolate(const Vector2 &p_b, const Vector2 &p_pre_a, const Vector2 &p_post_b, real_t p_weight) const { Vector2 p0 = p_pre_a; Vector2 p1 = *this; Vector2 p2 = p_b; Vector2 p3 = p_post_b; - real_t t = p_t; + real_t t = p_weight; real_t t2 = t * t; real_t t3 = t2 * t; diff --git a/core/math/vector2.h b/core/math/vector2.h index 4190f0f5b117..33c8360d19bc 100644 --- a/core/math/vector2.h +++ b/core/math/vector2.h @@ -70,22 +70,22 @@ struct Vector2 { real_t distance_squared_to(const Vector2 &p_vector2) const; real_t angle_to(const Vector2 &p_vector2) const; real_t angle_to_point(const Vector2 &p_vector2) const; - _FORCE_INLINE_ Vector2 direction_to(const Vector2 &p_b) const; + _FORCE_INLINE_ Vector2 direction_to(const Vector2 &p_to) const; real_t dot(const Vector2 &p_other) const; real_t cross(const Vector2 &p_other) const; Vector2 posmod(const real_t p_mod) const; Vector2 posmodv(const Vector2 &p_modv) const; - Vector2 project(const Vector2 &p_b) 
const; + Vector2 project(const Vector2 &p_to) const; Vector2 plane_project(real_t p_d, const Vector2 &p_vec) const; Vector2 clamped(real_t p_len) const; - _FORCE_INLINE_ static Vector2 linear_interpolate(const Vector2 &p_a, const Vector2 &p_b, real_t p_t); - _FORCE_INLINE_ Vector2 linear_interpolate(const Vector2 &p_b, real_t p_t) const; - _FORCE_INLINE_ Vector2 slerp(const Vector2 &p_b, real_t p_t) const; - Vector2 cubic_interpolate(const Vector2 &p_b, const Vector2 &p_pre_a, const Vector2 &p_post_b, real_t p_t) const; + _FORCE_INLINE_ static Vector2 linear_interpolate(const Vector2 &p_a, const Vector2 &p_b, real_t p_weight); + _FORCE_INLINE_ Vector2 linear_interpolate(const Vector2 &p_to, real_t p_weight) const; + _FORCE_INLINE_ Vector2 slerp(const Vector2 &p_to, real_t p_weight) const; + Vector2 cubic_interpolate(const Vector2 &p_b, const Vector2 &p_pre_a, const Vector2 &p_post_b, real_t p_weight) const; Vector2 move_toward(const Vector2 &p_to, const real_t p_delta) const; Vector2 slide(const Vector2 &p_normal) const; @@ -230,36 +230,36 @@ _FORCE_INLINE_ bool Vector2::operator!=(const Vector2 &p_vec2) const { return x != p_vec2.x || y != p_vec2.y; } -Vector2 Vector2::linear_interpolate(const Vector2 &p_b, real_t p_t) const { +Vector2 Vector2::linear_interpolate(const Vector2 &p_to, real_t p_weight) const { Vector2 res = *this; - res.x += (p_t * (p_b.x - x)); - res.y += (p_t * (p_b.y - y)); + res.x += (p_weight * (p_to.x - x)); + res.y += (p_weight * (p_to.y - y)); return res; } -Vector2 Vector2::slerp(const Vector2 &p_b, real_t p_t) const { +Vector2 Vector2::slerp(const Vector2 &p_to, real_t p_weight) const { #ifdef MATH_CHECKS ERR_FAIL_COND_V_MSG(!is_normalized(), Vector2(), "The start Vector2 must be normalized."); #endif - real_t theta = angle_to(p_b); - return rotated(theta * p_t); + real_t theta = angle_to(p_to); + return rotated(theta * p_weight); } -Vector2 Vector2::direction_to(const Vector2 &p_b) const { - Vector2 ret(p_b.x - x, p_b.y - y); +Vector2 Vector2::direction_to(const Vector2 &p_to) const { + Vector2 ret(p_to.x - x, p_to.y - y); ret.normalize(); return ret; } -Vector2 Vector2::linear_interpolate(const Vector2 &p_a, const Vector2 &p_b, real_t p_t) { +Vector2 Vector2::linear_interpolate(const Vector2 &p_a, const Vector2 &p_b, real_t p_weight) { Vector2 res = p_a; - res.x += (p_t * (p_b.x - p_a.x)); - res.y += (p_t * (p_b.y - p_a.y)); + res.x += (p_weight * (p_b.x - p_a.x)); + res.y += (p_weight * (p_b.y - p_a.y)); return res; } diff --git a/core/math/vector3.cpp b/core/math/vector3.cpp index 37aa7d23e26e..c30acc0ea919 100644 --- a/core/math/vector3.cpp +++ b/core/math/vector3.cpp @@ -76,7 +76,7 @@ Vector3 Vector3::snapped(Vector3 p_val) const { return v; } -Vector3 Vector3::cubic_interpolaten(const Vector3 &p_b, const Vector3 &p_pre_a, const Vector3 &p_post_b, real_t p_t) const { +Vector3 Vector3::cubic_interpolaten(const Vector3 &p_b, const Vector3 &p_pre_a, const Vector3 &p_post_b, real_t p_weight) const { Vector3 p0 = p_pre_a; Vector3 p1 = *this; @@ -96,7 +96,7 @@ Vector3 Vector3::cubic_interpolaten(const Vector3 &p_b, const Vector3 &p_pre_a, p3 = p2 + (p3 - p2) * (bc / cd); } - real_t t = p_t; + real_t t = p_weight; real_t t2 = t * t; real_t t3 = t2 * t; @@ -108,14 +108,14 @@ Vector3 Vector3::cubic_interpolaten(const Vector3 &p_b, const Vector3 &p_pre_a, return out; } -Vector3 Vector3::cubic_interpolate(const Vector3 &p_b, const Vector3 &p_pre_a, const Vector3 &p_post_b, real_t p_t) const { +Vector3 Vector3::cubic_interpolate(const Vector3 &p_b, const Vector3 
&p_pre_a, const Vector3 &p_post_b, real_t p_weight) const { Vector3 p0 = p_pre_a; Vector3 p1 = *this; Vector3 p2 = p_b; Vector3 p3 = p_post_b; - real_t t = p_t; + real_t t = p_weight; real_t t2 = t * t; real_t t3 = t2 * t; diff --git a/core/math/vector3.h b/core/math/vector3.h index fdd3a0087f94..c2930fbdc6bf 100644 --- a/core/math/vector3.h +++ b/core/math/vector3.h @@ -88,10 +88,10 @@ struct Vector3 { /* Static Methods between 2 vector3s */ - _FORCE_INLINE_ Vector3 linear_interpolate(const Vector3 &p_b, real_t p_t) const; - _FORCE_INLINE_ Vector3 slerp(const Vector3 &p_b, real_t p_t) const; - Vector3 cubic_interpolate(const Vector3 &p_b, const Vector3 &p_pre_a, const Vector3 &p_post_b, real_t p_t) const; - Vector3 cubic_interpolaten(const Vector3 &p_b, const Vector3 &p_pre_a, const Vector3 &p_post_b, real_t p_t) const; + _FORCE_INLINE_ Vector3 linear_interpolate(const Vector3 &p_to, real_t p_weight) const; + _FORCE_INLINE_ Vector3 slerp(const Vector3 &p_to, real_t p_weight) const; + Vector3 cubic_interpolate(const Vector3 &p_b, const Vector3 &p_pre_a, const Vector3 &p_post_b, real_t p_weight) const; + Vector3 cubic_interpolaten(const Vector3 &p_b, const Vector3 &p_pre_a, const Vector3 &p_post_b, real_t p_weight) const; Vector3 move_toward(const Vector3 &p_to, const real_t p_delta) const; _FORCE_INLINE_ Vector3 cross(const Vector3 &p_b) const; @@ -105,15 +105,15 @@ struct Vector3 { _FORCE_INLINE_ Vector3 ceil() const; _FORCE_INLINE_ Vector3 round() const; - _FORCE_INLINE_ real_t distance_to(const Vector3 &p_b) const; - _FORCE_INLINE_ real_t distance_squared_to(const Vector3 &p_b) const; + _FORCE_INLINE_ real_t distance_to(const Vector3 &p_to) const; + _FORCE_INLINE_ real_t distance_squared_to(const Vector3 &p_to) const; _FORCE_INLINE_ Vector3 posmod(const real_t p_mod) const; _FORCE_INLINE_ Vector3 posmodv(const Vector3 &p_modv) const; - _FORCE_INLINE_ Vector3 project(const Vector3 &p_b) const; + _FORCE_INLINE_ Vector3 project(const Vector3 &p_to) const; - _FORCE_INLINE_ real_t angle_to(const Vector3 &p_b) const; - _FORCE_INLINE_ Vector3 direction_to(const Vector3 &p_b) const; + _FORCE_INLINE_ real_t angle_to(const Vector3 &p_to) const; + _FORCE_INLINE_ Vector3 direction_to(const Vector3 &p_to) const; _FORCE_INLINE_ Vector3 slide(const Vector3 &p_normal) const; _FORCE_INLINE_ Vector3 bounce(const Vector3 &p_normal) const; @@ -196,27 +196,27 @@ Vector3 Vector3::round() const { return Vector3(Math::round(x), Math::round(y), Math::round(z)); } -Vector3 Vector3::linear_interpolate(const Vector3 &p_b, real_t p_t) const { +Vector3 Vector3::linear_interpolate(const Vector3 &p_to, real_t p_weight) const { return Vector3( - x + (p_t * (p_b.x - x)), - y + (p_t * (p_b.y - y)), - z + (p_t * (p_b.z - z))); + x + (p_weight * (p_to.x - x)), + y + (p_weight * (p_to.y - y)), + z + (p_weight * (p_to.z - z))); } -Vector3 Vector3::slerp(const Vector3 &p_b, real_t p_t) const { - real_t theta = angle_to(p_b); - return rotated(cross(p_b).normalized(), theta * p_t); +Vector3 Vector3::slerp(const Vector3 &p_to, real_t p_weight) const { + real_t theta = angle_to(p_to); + return rotated(cross(p_to).normalized(), theta * p_weight); } -real_t Vector3::distance_to(const Vector3 &p_b) const { +real_t Vector3::distance_to(const Vector3 &p_to) const { - return (p_b - *this).length(); + return (p_to - *this).length(); } -real_t Vector3::distance_squared_to(const Vector3 &p_b) const { +real_t Vector3::distance_squared_to(const Vector3 &p_to) const { - return (p_b - *this).length_squared(); + return (p_to - 
*this).length_squared(); } Vector3 Vector3::posmod(const real_t p_mod) const { @@ -227,17 +227,17 @@ Vector3 Vector3::posmodv(const Vector3 &p_modv) const { return Vector3(Math::fposmod(x, p_modv.x), Math::fposmod(y, p_modv.y), Math::fposmod(z, p_modv.z)); } -Vector3 Vector3::project(const Vector3 &p_b) const { - return p_b * (dot(p_b) / p_b.length_squared()); +Vector3 Vector3::project(const Vector3 &p_to) const { + return p_to * (dot(p_to) / p_to.length_squared()); } -real_t Vector3::angle_to(const Vector3 &p_b) const { +real_t Vector3::angle_to(const Vector3 &p_to) const { - return Math::atan2(cross(p_b).length(), dot(p_b)); + return Math::atan2(cross(p_to).length(), dot(p_to)); } -Vector3 Vector3::direction_to(const Vector3 &p_b) const { - Vector3 ret(p_b.x - x, p_b.y - y, p_b.z - z); +Vector3 Vector3::direction_to(const Vector3 &p_to) const { + Vector3 ret(p_to.x - x, p_to.y - y, p_to.z - z); ret.normalize(); return ret; } diff --git a/core/variant_call.cpp b/core/variant_call.cpp index eb0e76684200..b7296de1c2c6 100644 --- a/core/variant_call.cpp +++ b/core/variant_call.cpp @@ -1681,9 +1681,9 @@ void register_variant_methods() { ADDFUNC1R(VECTOR2, VECTOR2, Vector2, posmod, REAL, "mod", varray()); ADDFUNC1R(VECTOR2, VECTOR2, Vector2, posmodv, VECTOR2, "modv", varray()); ADDFUNC1R(VECTOR2, VECTOR2, Vector2, project, VECTOR2, "b", varray()); - ADDFUNC2R(VECTOR2, VECTOR2, Vector2, linear_interpolate, VECTOR2, "b", REAL, "t", varray()); - ADDFUNC2R(VECTOR2, VECTOR2, Vector2, slerp, VECTOR2, "b", REAL, "t", varray()); - ADDFUNC4R(VECTOR2, VECTOR2, Vector2, cubic_interpolate, VECTOR2, "b", VECTOR2, "pre_a", VECTOR2, "post_b", REAL, "t", varray()); + ADDFUNC2R(VECTOR2, VECTOR2, Vector2, linear_interpolate, VECTOR2, "to", REAL, "weight", varray()); + ADDFUNC2R(VECTOR2, VECTOR2, Vector2, slerp, VECTOR2, "to", REAL, "weight", varray()); + ADDFUNC4R(VECTOR2, VECTOR2, Vector2, cubic_interpolate, VECTOR2, "b", VECTOR2, "pre_a", VECTOR2, "post_b", REAL, "weight", varray()); ADDFUNC2R(VECTOR2, VECTOR2, Vector2, move_toward, VECTOR2, "to", REAL, "delta", varray()); ADDFUNC1R(VECTOR2, VECTOR2, Vector2, rotated, REAL, "phi", varray()); ADDFUNC0R(VECTOR2, VECTOR2, Vector2, tangent, varray()); @@ -1729,9 +1729,9 @@ void register_variant_methods() { ADDFUNC0R(VECTOR3, VECTOR3, Vector3, inverse, varray()); ADDFUNC1R(VECTOR3, VECTOR3, Vector3, snapped, VECTOR3, "by", varray()); ADDFUNC2R(VECTOR3, VECTOR3, Vector3, rotated, VECTOR3, "axis", REAL, "phi", varray()); - ADDFUNC2R(VECTOR3, VECTOR3, Vector3, linear_interpolate, VECTOR3, "b", REAL, "t", varray()); - ADDFUNC2R(VECTOR3, VECTOR3, Vector3, slerp, VECTOR3, "b", REAL, "t", varray()); - ADDFUNC4R(VECTOR3, VECTOR3, Vector3, cubic_interpolate, VECTOR3, "b", VECTOR3, "pre_a", VECTOR3, "post_b", REAL, "t", varray()); + ADDFUNC2R(VECTOR3, VECTOR3, Vector3, linear_interpolate, VECTOR3, "to", REAL, "weight", varray()); + ADDFUNC2R(VECTOR3, VECTOR3, Vector3, slerp, VECTOR3, "to", REAL, "weight", varray()); + ADDFUNC4R(VECTOR3, VECTOR3, Vector3, cubic_interpolate, VECTOR3, "b", VECTOR3, "pre_a", VECTOR3, "post_b", REAL, "weight", varray()); ADDFUNC2R(VECTOR3, VECTOR3, Vector3, move_toward, VECTOR3, "to", REAL, "delta", varray()); ADDFUNC1R(VECTOR3, REAL, Vector3, dot, VECTOR3, "b", varray()); ADDFUNC1R(VECTOR3, VECTOR3, Vector3, cross, VECTOR3, "b", varray()); @@ -1769,9 +1769,9 @@ void register_variant_methods() { ADDFUNC0R(QUAT, QUAT, Quat, inverse, varray()); ADDFUNC1R(QUAT, REAL, Quat, dot, QUAT, "b", varray()); ADDFUNC1R(QUAT, VECTOR3, Quat, xform, VECTOR3, 
"v", varray()); - ADDFUNC2R(QUAT, QUAT, Quat, slerp, QUAT, "b", REAL, "t", varray()); - ADDFUNC2R(QUAT, QUAT, Quat, slerpni, QUAT, "b", REAL, "t", varray()); - ADDFUNC4R(QUAT, QUAT, Quat, cubic_slerp, QUAT, "b", QUAT, "pre_a", QUAT, "post_b", REAL, "t", varray()); + ADDFUNC2R(QUAT, QUAT, Quat, slerp, QUAT, "to", REAL, "weight", varray()); + ADDFUNC2R(QUAT, QUAT, Quat, slerpni, QUAT, "to", REAL, "weight", varray()); + ADDFUNC4R(QUAT, QUAT, Quat, cubic_slerp, QUAT, "b", QUAT, "pre_a", QUAT, "post_b", REAL, "weight", varray()); ADDFUNC0R(QUAT, VECTOR3, Quat, get_euler, varray()); ADDFUNC1(QUAT, NIL, Quat, set_euler, VECTOR3, "euler", varray()); ADDFUNC2(QUAT, NIL, Quat, set_axis_angle, VECTOR3, "axis", REAL, "angle", varray()); @@ -1785,7 +1785,7 @@ void register_variant_methods() { ADDFUNC0R(COLOR, REAL, Color, gray, varray()); ADDFUNC0R(COLOR, COLOR, Color, inverted, varray()); ADDFUNC0R(COLOR, COLOR, Color, contrasted, varray()); - ADDFUNC2R(COLOR, COLOR, Color, linear_interpolate, COLOR, "b", REAL, "t", varray()); + ADDFUNC2R(COLOR, COLOR, Color, linear_interpolate, COLOR, "to", REAL, "weight", varray()); ADDFUNC1R(COLOR, COLOR, Color, blend, COLOR, "over", varray()); ADDFUNC1R(COLOR, COLOR, Color, lightened, REAL, "amount", varray()); ADDFUNC1R(COLOR, COLOR, Color, darkened, REAL, "amount", varray()); @@ -1988,7 +1988,7 @@ void register_variant_methods() { ADDFUNC1R(BASIS, VECTOR3, Basis, xform, VECTOR3, "v", varray()); ADDFUNC1R(BASIS, VECTOR3, Basis, xform_inv, VECTOR3, "v", varray()); ADDFUNC0R(BASIS, INT, Basis, get_orthogonal_index, varray()); - ADDFUNC2R(BASIS, BASIS, Basis, slerp, BASIS, "b", REAL, "t", varray()); + ADDFUNC2R(BASIS, BASIS, Basis, slerp, BASIS, "to", REAL, "weight", varray()); // For complicated reasons, the epsilon argument is always discarded. See #45062. ADDFUNC2R(BASIS, BOOL, Basis, is_equal_approx, BASIS, "b", REAL, "epsilon", varray(CMP_EPSILON)); ADDFUNC0R(BASIS, QUAT, Basis, get_rotation_quat, varray()); diff --git a/doc/classes/Basis.xml b/doc/classes/Basis.xml index e8bf7d401150..aae0f2b47239 100644 --- a/doc/classes/Basis.xml +++ b/doc/classes/Basis.xml @@ -148,9 +148,9 @@ - + - + Assuming that the matrix is a proper rotation matrix, slerp performs a spherical-linear interpolation with another rotation matrix. diff --git a/doc/classes/Color.xml b/doc/classes/Color.xml index 439c0bb77fb3..7ce169ed30bc 100644 --- a/doc/classes/Color.xml +++ b/doc/classes/Color.xml @@ -182,12 +182,12 @@ - + - + - Returns the linear interpolation with another color. The interpolation factor [code]t[/code] is between 0 and 1. + Returns the linear interpolation with another color. The interpolation factor [code]weight[/code] is between 0 and 1. [codeblock] var c1 = Color(1.0, 0.0, 0.0) var c2 = Color(0.0, 1.0, 0.0) diff --git a/doc/classes/Quat.xml b/doc/classes/Quat.xml index 05f38310dfbf..22cbe45f0a51 100644 --- a/doc/classes/Quat.xml +++ b/doc/classes/Quat.xml @@ -66,10 +66,10 @@ - + - Performs a cubic spherical interpolation between quaternions [code]preA[/code], this vector, [code]b[/code], and [code]postB[/code], by the given amount [code]t[/code]. + Performs a cubic spherical interpolation between quaternions [code]pre_a[/code], this vector, [code]b[/code], and [code]post_b[/code], by the given amount [code]weight[/code]. @@ -151,9 +151,9 @@ - + - + Returns the result of the spherical linear interpolation between this quaternion and [code]to[/code] by amount [code]weight[/code]. 
@@ -163,9 +163,9 @@ - + - + Returns the result of the spherical linear interpolation between this quaternion and [code]to[/code] by amount [code]weight[/code], but without checking if the rotation path is not bigger than 90 degrees. diff --git a/doc/classes/Transform2D.xml b/doc/classes/Transform2D.xml index 5151fe64e1b9..7a33eac10f0f 100644 --- a/doc/classes/Transform2D.xml +++ b/doc/classes/Transform2D.xml @@ -103,7 +103,7 @@ - Returns a transform interpolated between this transform and another by a given weight (on the range of 0.0 to 1.0). + Returns a transform interpolated between this transform and another by a given [code]weight[/code] (on the range of 0.0 to 1.0). diff --git a/doc/classes/Vector2.xml b/doc/classes/Vector2.xml index 2776461a584b..3247bbe9df7b 100644 --- a/doc/classes/Vector2.xml +++ b/doc/classes/Vector2.xml @@ -111,10 +111,10 @@ - + - Cubically interpolates between this vector and [code]b[/code] using [code]pre_a[/code] and [code]post_b[/code] as handles, and returns the result at position [code]t[/code]. [code]t[/code] is on the range of 0.0 to 1.0, representing the amount of interpolation. + Cubically interpolates between this vector and [code]b[/code] using [code]pre_a[/code] and [code]post_b[/code] as handles, and returns the result at position [code]weight[/code]. [code]weight[/code] is on the range of 0.0 to 1.0, representing the amount of interpolation. @@ -198,12 +198,12 @@ - + - + - Returns the result of the linear interpolation between this vector and [code]b[/code] by amount [code]t[/code]. [code]t[/code] is on the range of 0.0 to 1.0, representing the amount of interpolation. + Returns the result of the linear interpolation between this vector and [code]to[/code] by amount [code]weight[/code]. [code]weight[/code] is on the range of 0.0 to 1.0, representing the amount of interpolation. @@ -286,12 +286,12 @@ - + - + - Returns the result of spherical linear interpolation between this vector and [code]b[/code], by amount [code]t[/code]. [code]t[/code] is on the range of 0.0 to 1.0, representing the amount of interpolation. + Returns the result of spherical linear interpolation between this vector and [code]to[/code], by amount [code]weight[/code]. [code]weight[/code] is on the range of 0.0 to 1.0, representing the amount of interpolation. [b]Note:[/b] Both vectors must be normalized. diff --git a/doc/classes/Vector3.xml b/doc/classes/Vector3.xml index e79095572c3c..a6f11a6ee3c6 100644 --- a/doc/classes/Vector3.xml +++ b/doc/classes/Vector3.xml @@ -79,10 +79,10 @@ - + - Performs a cubic interpolation between vectors [code]pre_a[/code], [code]a[/code], [code]b[/code], [code]post_b[/code] ([code]a[/code] is current), by the given amount [code]t[/code]. [code]t[/code] is on the range of 0.0 to 1.0, representing the amount of interpolation. + Performs a cubic interpolation between vectors [code]pre_a[/code], [code]a[/code], [code]b[/code], [code]post_b[/code] ([code]a[/code] is current), by the given amount [code]weight[/code]. [code]weight[/code] is on the range of 0.0 to 1.0, representing the amount of interpolation. @@ -173,12 +173,12 @@ - + - + - Returns the result of the linear interpolation between this vector and [code]b[/code] by amount [code]t[/code]. [code]t[/code] is on the range of 0.0 to 1.0, representing the amount of interpolation. + Returns the result of the linear interpolation between this vector and [code]to[/code] by amount [code]t[/code]. [code]weight[/code] is on the range of 0.0 to 1.0, representing the amount of interpolation. 
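Since these class reference hunks only rename the interpolation argument, a minimal GDScript sketch of the calls they describe may help; the vectors and rotations below are arbitrary sample values, not something defined by this patch.
[codeblock]
var start = Vector3(0, 0, 0)
var end = Vector3(10, 0, 0)
# The second argument is the renamed "weight" (0.0 - 1.0); 0.5 returns the midpoint (5, 0, 0).
var midpoint = start.linear_interpolate(end, 0.5)

# The same renaming applies to the rotation helpers, e.g. Quat.slerp().
var blended = Quat(Vector3.UP, 0.0).slerp(Quat(Vector3.UP, PI * 0.5), 0.25)
[/codeblock]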
@@ -286,12 +286,12 @@ - + - + - Returns the result of spherical linear interpolation between this vector and [code]b[/code], by amount [code]t[/code]. [code]t[/code] is on the range of 0.0 to 1.0, representing the amount of interpolation. + Returns the result of spherical linear interpolation between this vector and [code]to[/code], by amount [code]weight[/code]. [code]weight[/code] is on the range of 0.0 to 1.0, representing the amount of interpolation. [b]Note:[/b] Both vectors must be normalized. diff --git a/modules/mono/glue/GodotSharp/GodotSharp/Core/Quat.cs b/modules/mono/glue/GodotSharp/GodotSharp/Core/Quat.cs index 0b676c1211d8..91c27c39f474 100644 --- a/modules/mono/glue/GodotSharp/GodotSharp/Core/Quat.cs +++ b/modules/mono/glue/GodotSharp/GodotSharp/Core/Quat.cs @@ -120,10 +120,11 @@ public real_t LengthSquared /// The destination quaternion. /// A quaternion before this quaternion. /// A quaternion after `b`. - /// A value on the range of 0.0 to 1.0, representing the amount of interpolation. + /// A value on the range of 0.0 to 1.0, representing the amount of interpolation. /// The interpolated quaternion. - public Quat CubicSlerp(Quat b, Quat preA, Quat postB, real_t t) + public Quat CubicSlerp(Quat b, Quat preA, Quat postB, real_t weight) { + real_t t = weight; real_t t2 = (1.0f - t) * t * 2f; Quat sp = Slerp(b, t); Quat sq = preA.Slerpni(postB, t); diff --git a/modules/mono/glue/GodotSharp/GodotSharp/Core/Vector2.cs b/modules/mono/glue/GodotSharp/GodotSharp/Core/Vector2.cs index 61cbbc865450..99b16aa64090 100644 --- a/modules/mono/glue/GodotSharp/GodotSharp/Core/Vector2.cs +++ b/modules/mono/glue/GodotSharp/GodotSharp/Core/Vector2.cs @@ -194,15 +194,16 @@ public real_t Cross(Vector2 b) /// The destination vector. /// A vector before this vector. /// A vector after `b`. - /// A value on the range of 0.0 to 1.0, representing the amount of interpolation. + /// A value on the range of 0.0 to 1.0, representing the amount of interpolation. /// The interpolated vector. - public Vector2 CubicInterpolate(Vector2 b, Vector2 preA, Vector2 postB, real_t t) + public Vector2 CubicInterpolate(Vector2 b, Vector2 preA, Vector2 postB, real_t weight) { Vector2 p0 = preA; Vector2 p1 = this; Vector2 p2 = b; Vector2 p3 = postB; + real_t t = weight; real_t t2 = t * t; real_t t3 = t2 * t; diff --git a/modules/mono/glue/GodotSharp/GodotSharp/Core/Vector3.cs b/modules/mono/glue/GodotSharp/GodotSharp/Core/Vector3.cs index af51ad5d9990..1fd7dc76e14c 100644 --- a/modules/mono/glue/GodotSharp/GodotSharp/Core/Vector3.cs +++ b/modules/mono/glue/GodotSharp/GodotSharp/Core/Vector3.cs @@ -161,15 +161,16 @@ public Vector3 Cross(Vector3 b) /// The destination vector. /// A vector before this vector. /// A vector after `b`. - /// A value on the range of 0.0 to 1.0, representing the amount of interpolation. + /// A value on the range of 0.0 to 1.0, representing the amount of interpolation. /// The interpolated vector. 
- public Vector3 CubicInterpolate(Vector3 b, Vector3 preA, Vector3 postB, real_t t) + public Vector3 CubicInterpolate(Vector3 b, Vector3 preA, Vector3 postB, real_t weight) { Vector3 p0 = preA; Vector3 p1 = this; Vector3 p2 = b; Vector3 p3 = postB; + real_t t = weight; real_t t2 = t * t; real_t t3 = t2 * t; From 7a6a150bc2cf2cbfa2212169477048a706a02a59 Mon Sep 17 00:00:00 2001 From: Yuri Roubinsky Date: Mon, 12 Apr 2021 13:29:44 +0300 Subject: [PATCH 15/84] [3.2] Prevents default values of VSNodeCustom from overriding by a script (cherry picked from commit ac91e2ca0d44d5102d50c5b68c0177684bddcc2e) --- .../plugins/visual_shader_editor_plugin.cpp | 16 ++++++++-- scene/resources/visual_shader.cpp | 30 +++++++++++++++++++ scene/resources/visual_shader.h | 13 ++++++-- 3 files changed, 54 insertions(+), 5 deletions(-) diff --git a/editor/plugins/visual_shader_editor_plugin.cpp b/editor/plugins/visual_shader_editor_plugin.cpp index 3e1a20f1c705..6d3ad9178898 100644 --- a/editor/plugins/visual_shader_editor_plugin.cpp +++ b/editor/plugins/visual_shader_editor_plugin.cpp @@ -552,6 +552,11 @@ void VisualShaderEditor::_update_graph() { bool is_group = !group_node.is_null(); Size2 size = Size2(0, 0); + VisualShaderNodeCustom *custom = Object::cast_to(vsnode.ptr()); + if (custom) { + custom->_set_initialized(true); + } + Ref expression_node = Object::cast_to(group_node.ptr()); bool is_expression = !expression_node.is_null(); String expression = ""; @@ -1293,10 +1298,15 @@ void VisualShaderEditor::_port_edited() { Variant value = property_editor->get_variant(); Ref vsn = visual_shader->get_node(type, editing_node); ERR_FAIL_COND(!vsn.is_valid()); - undo_redo->create_action(TTR("Set Input Default Port")); - undo_redo->add_do_method(vsn.ptr(), "set_input_port_default_value", editing_port, value); - undo_redo->add_undo_method(vsn.ptr(), "set_input_port_default_value", editing_port, vsn->get_input_port_default_value(editing_port)); + Ref custom = Object::cast_to(vsn.ptr()); + if (custom.is_valid()) { + undo_redo->add_do_method(custom.ptr(), "_set_input_port_default_value", editing_port, value); + undo_redo->add_undo_method(custom.ptr(), "_set_input_port_default_value", editing_port, vsn->get_input_port_default_value(editing_port)); + } else { + undo_redo->add_do_method(vsn.ptr(), "set_input_port_default_value", editing_port, value); + undo_redo->add_undo_method(vsn.ptr(), "set_input_port_default_value", editing_port, vsn->get_input_port_default_value(editing_port)); + } undo_redo->add_do_method(this, "_update_graph"); undo_redo->add_undo_method(this, "_update_graph"); undo_redo->commit_action(); diff --git a/scene/resources/visual_shader.cpp b/scene/resources/visual_shader.cpp index c9d094d0dd26..a2d288cfa7d4 100644 --- a/scene/resources/visual_shader.cpp +++ b/scene/resources/visual_shader.cpp @@ -295,6 +295,30 @@ String VisualShaderNodeCustom::generate_global_per_node(Shader::Mode p_mode, Vis return ""; } +void VisualShaderNodeCustom::set_input_port_default_value(int p_port, const Variant &p_value) { + if (!is_initialized) { + VisualShaderNode::set_input_port_default_value(p_port, p_value); + } +} + +void VisualShaderNodeCustom::set_default_input_values(const Array &p_values) { + if (!is_initialized) { + VisualShaderNode::set_default_input_values(p_values); + } +} + +void VisualShaderNodeCustom::_set_input_port_default_value(int p_port, const Variant &p_value) { + VisualShaderNode::set_input_port_default_value(p_port, p_value); +} + +bool VisualShaderNodeCustom::_is_initialized() { + return 
is_initialized; +} + +void VisualShaderNodeCustom::_set_initialized(bool p_enabled) { + is_initialized = p_enabled; +} + void VisualShaderNodeCustom::_bind_methods() { BIND_VMETHOD(MethodInfo(Variant::STRING, "_get_name")); @@ -310,6 +334,12 @@ void VisualShaderNodeCustom::_bind_methods() { BIND_VMETHOD(MethodInfo(Variant::STRING, "_get_output_port_name", PropertyInfo(Variant::INT, "port"))); BIND_VMETHOD(MethodInfo(Variant::STRING, "_get_code", PropertyInfo(Variant::ARRAY, "input_vars"), PropertyInfo(Variant::ARRAY, "output_vars"), PropertyInfo(Variant::INT, "mode"), PropertyInfo(Variant::INT, "type"))); BIND_VMETHOD(MethodInfo(Variant::STRING, "_get_global_code", PropertyInfo(Variant::INT, "mode"))); + + ClassDB::bind_method(D_METHOD("_set_initialized", "enabled"), &VisualShaderNodeCustom::_set_initialized); + ClassDB::bind_method(D_METHOD("_is_initialized"), &VisualShaderNodeCustom::_is_initialized); + ClassDB::bind_method(D_METHOD("_set_input_port_default_value", "port", "value"), &VisualShaderNodeCustom::_set_input_port_default_value); + + ADD_PROPERTY(PropertyInfo(Variant::BOOL, "initialized", PROPERTY_HINT_NONE, "", PROPERTY_USAGE_NOEDITOR | PROPERTY_USAGE_INTERNAL), "_set_initialized", "_is_initialized"); } VisualShaderNodeCustom::VisualShaderNodeCustom() { diff --git a/scene/resources/visual_shader.h b/scene/resources/visual_shader.h index 61fd96a5c27d..e519421415ce 100644 --- a/scene/resources/visual_shader.h +++ b/scene/resources/visual_shader.h @@ -199,10 +199,10 @@ class VisualShaderNode : public Resource { virtual PortType get_input_port_type(int p_port) const = 0; virtual String get_input_port_name(int p_port) const = 0; - void set_input_port_default_value(int p_port, const Variant &p_value); + virtual void set_input_port_default_value(int p_port, const Variant &p_value); Variant get_input_port_default_value(int p_port) const; // if NIL (default if node does not set anything) is returned, it means no default value is wanted if disconnected, thus no input var must be supplied (empty string will be supplied) Array get_default_input_values() const; - void set_default_input_values(const Array &p_values); + virtual void set_default_input_values(const Array &p_values); virtual int get_output_port_count() const = 0; virtual PortType get_output_port_type(int p_port) const = 0; @@ -246,6 +246,7 @@ class VisualShaderNodeCustom : public VisualShaderNode { int type; }; + bool is_initialized = false; List input_ports; List output_ports; @@ -262,7 +263,12 @@ class VisualShaderNodeCustom : public VisualShaderNode { virtual PortType get_output_port_type(int p_port) const; virtual String get_output_port_name(int p_port) const; + virtual void set_input_port_default_value(int p_port, const Variant &p_value); + virtual void set_default_input_values(const Array &p_values); + protected: + void _set_input_port_default_value(int p_port, const Variant &p_value); + virtual String generate_code(Shader::Mode p_mode, VisualShader::Type p_type, int p_id, const String *p_input_vars, const String *p_output_vars, bool p_for_preview = false) const; virtual String generate_global_per_node(Shader::Mode p_mode, VisualShader::Type p_type, int p_id) const; @@ -271,6 +277,9 @@ class VisualShaderNodeCustom : public VisualShaderNode { public: VisualShaderNodeCustom(); void update_ports(); + + bool _is_initialized(); + void _set_initialized(bool p_enabled); }; ///// From 3ab33d3ce9953ce891ecec63dfb521f711964568 Mon Sep 17 00:00:00 2001 From: rafallus Date: Tue, 30 Mar 2021 00:36:30 -0600 Subject: [PATCH 16/84] Check 
if _direct_state_changed() argument is valid - Modified classes: RigidBody, PhysicalBone, VehicleBody, RigidBody2D, KinematicBody2D - The input argument is untrusted even in release mode (cherry picked from commit e075b6b411f1c1a5cf63e7fb0e66104d6a62c33f) --- scene/2d/physics_body_2d.cpp | 8 ++------ scene/3d/physics_body.cpp | 15 +++------------ scene/3d/vehicle_body.cpp | 2 +- 3 files changed, 6 insertions(+), 19 deletions(-) diff --git a/scene/2d/physics_body_2d.cpp b/scene/2d/physics_body_2d.cpp index f62804eb5392..f989a5c67337 100644 --- a/scene/2d/physics_body_2d.cpp +++ b/scene/2d/physics_body_2d.cpp @@ -440,12 +440,8 @@ bool RigidBody2D::_test_motion(const Vector2 &p_motion, bool p_infinite_inertia, } void RigidBody2D::_direct_state_changed(Object *p_state) { - -#ifdef DEBUG_ENABLED state = Object::cast_to(p_state); -#else - state = (Physics2DDirectBodyState *)p_state; //trust it -#endif + ERR_FAIL_COND_MSG(!state, "Method '_direct_state_changed' must receive a valid Physics2DDirectBodyState object as argument"); set_block_transform_notify(true); // don't want notify (would feedback loop) if (mode != MODE_KINEMATIC) @@ -1437,11 +1433,11 @@ bool KinematicBody2D::is_sync_to_physics_enabled() const { } void KinematicBody2D::_direct_state_changed(Object *p_state) { - if (!sync_to_physics) return; Physics2DDirectBodyState *state = Object::cast_to(p_state); + ERR_FAIL_COND_MSG(!state, "Method '_direct_state_changed' must receive a valid Physics2DDirectBodyState object as argument"); last_valid_transform = state->get_transform(); set_notify_local_transform(false); diff --git a/scene/3d/physics_body.cpp b/scene/3d/physics_body.cpp index 3bf0c57e36ff..0d21fbe1b609 100644 --- a/scene/3d/physics_body.cpp +++ b/scene/3d/physics_body.cpp @@ -442,12 +442,8 @@ struct _RigidBodyInOut { }; void RigidBody::_direct_state_changed(Object *p_state) { - -#ifdef DEBUG_ENABLED state = Object::cast_to(p_state); -#else - state = (PhysicsDirectBodyState *)p_state; //trust it -#endif + ERR_FAIL_COND_MSG(!state, "Method '_direct_state_changed' must receive a valid PhysicsDirectBodyState object as argument"); set_ignore_transform_notification(true); set_global_transform(state->get_transform()); @@ -2220,13 +2216,8 @@ void PhysicalBone::_direct_state_changed(Object *p_state) { /// Update bone transform - PhysicsDirectBodyState *state; - -#ifdef DEBUG_ENABLED - state = Object::cast_to(p_state); -#else - state = (PhysicsDirectBodyState *)p_state; //trust it -#endif + PhysicsDirectBodyState *state = Object::cast_to(p_state); + ERR_FAIL_COND_MSG(!state, "Method '_direct_state_changed' must receive a valid PhysicsDirectBodyState object as argument"); Transform global_transform(state->get_transform()); diff --git a/scene/3d/vehicle_body.cpp b/scene/3d/vehicle_body.cpp index 8d3206e714cb..8f18720c1dba 100644 --- a/scene/3d/vehicle_body.cpp +++ b/scene/3d/vehicle_body.cpp @@ -857,10 +857,10 @@ void VehicleBody::_update_friction(PhysicsDirectBodyState *s) { } void VehicleBody::_direct_state_changed(Object *p_state) { - RigidBody::_direct_state_changed(p_state); state = Object::cast_to(p_state); + ERR_FAIL_COND_MSG(!state, "Method '_direct_state_changed' must receive a valid PhysicsDirectBodyState object as argument"); float step = state->get_step(); From 3111910331b5ce2f0321d4f66ab07824be7273ed Mon Sep 17 00:00:00 2001 From: Francois Belair Date: Wed, 14 Apr 2021 13:11:38 -0400 Subject: [PATCH 17/84] Make LSP update the filesystem of changed scripts This updates global classes and exposes base member variables. 
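The change itself is a single call into the editor's file system cache; the same API is exposed to tool scripts, so a rough GDScript sketch of the equivalent editor-side call is shown below (the plugin context and helper name are assumptions for illustration, not part of this patch).
[codeblock]
tool
extends EditorPlugin

# Hypothetical helper: re-scan one script so its class_name and exported members are refreshed.
func refresh_script(path):
    get_editor_interface().get_resource_filesystem().update_file(path)
[/codeblock]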
Fixes #39713 (cherry picked from commit b16bb33a5b462750c70ae1fec791dac8e50fe3ef) --- modules/gdscript/language_server/gdscript_text_document.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/gdscript/language_server/gdscript_text_document.cpp b/modules/gdscript/language_server/gdscript_text_document.cpp index 9238762954e6..d57aca5b66ba 100644 --- a/modules/gdscript/language_server/gdscript_text_document.cpp +++ b/modules/gdscript/language_server/gdscript_text_document.cpp @@ -416,6 +416,7 @@ GDScriptTextDocument::~GDScriptTextDocument() { void GDScriptTextDocument::sync_script_content(const String &p_path, const String &p_content) { String path = GDScriptLanguageProtocol::get_singleton()->get_workspace()->get_file_path(p_path); GDScriptLanguageProtocol::get_singleton()->get_workspace()->parse_script(path, p_content); + EditorFileSystem::get_singleton()->update_file(path); } void GDScriptTextDocument::show_native_symbol_in_editor(const String &p_symbol_id) { From d696d89bc8966cbb3d20e480f1fae630400b9562 Mon Sep 17 00:00:00 2001 From: lawnjelly Date: Sat, 24 Apr 2021 10:21:33 +0100 Subject: [PATCH 18/84] Batching - GLES3 fix light pass modulates The final_modulate was incorrectly being set in the uniform on light passes in GLES3 in situations where color was baked in the vertices. This was already correct in GLES2. This PR makes prevents setting final_modulate in this situation. (cherry picked from commit 35c5ccce9e3f098de1a305027899c36074089b06) --- drivers/gles3/rasterizer_canvas_gles3.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gles3/rasterizer_canvas_gles3.cpp b/drivers/gles3/rasterizer_canvas_gles3.cpp index 0b480e43e77a..967149ae557a 100644 --- a/drivers/gles3/rasterizer_canvas_gles3.cpp +++ b/drivers/gles3/rasterizer_canvas_gles3.cpp @@ -1493,7 +1493,11 @@ void RasterizerCanvasGLES3::render_joined_item(const BItemJoined &p_bij, RenderI Light *light = r_ris.item_group_light; bool light_used = false; VS::CanvasLightMode mode = VS::CANVAS_LIGHT_MODE_ADD; - state.canvas_item_modulate = p_ci->final_modulate; // remove the canvas modulate + + // we leave this set to 1, 1, 1, 1 if using software because the colors are baked into the vertices + if (p_bij.is_single_item()) { + state.canvas_item_modulate = p_ci->final_modulate; // remove the canvas modulate + } while (light) { From 1d8d7b0d4bb731e8ac009b3808b835c90bc14435 Mon Sep 17 00:00:00 2001 From: lawnjelly Date: Fri, 23 Apr 2021 09:04:45 +0100 Subject: [PATCH 19/84] Batching - fix number of verts in translation The translation to larger vertex formats was assuming that batches were rects, and not accounting that the num_commands had a different meaning for lines and polys, so the calculation for number of vertices to translate was incorrect in these cases. Also prevents infinite loop if a single polygon has too many vertices to fit in the batch buffer. (cherry picked from commit d08cf5f434013c60080c49c6387bbe7cffc39978) --- .../gles_common/rasterizer_canvas_batcher.h | 45 ++++++++++++++++--- 1 file changed, 38 insertions(+), 7 deletions(-) diff --git a/drivers/gles_common/rasterizer_canvas_batcher.h b/drivers/gles_common/rasterizer_canvas_batcher.h index 37a078d53dcd..f1e96f74a67b 100644 --- a/drivers/gles_common/rasterizer_canvas_batcher.h +++ b/drivers/gles_common/rasterizer_canvas_batcher.h @@ -179,6 +179,7 @@ class RasterizerCanvasBatcher { // in the case of DEFAULT, this is num commands. // with rects, is number of command and rects. 
// with lines, is number of lines + // with polys, is number of indices (actual rendered verts) uint32_t num_commands; // first vertex of this batch in the vertex lists @@ -193,6 +194,29 @@ class RasterizerCanvasBatcher { // for default batches we will store the parent item const RasterizerCanvas::Item *item; }; + + uint32_t get_num_verts() const { + switch (type) { + default: { + } break; + case RasterizerStorageCommon::BT_RECT: { + return num_commands * 4; + } break; + case RasterizerStorageCommon::BT_LINE: { + return num_commands * 2; + } break; + case RasterizerStorageCommon::BT_LINE_AA: { + return num_commands * 2; + } break; + case RasterizerStorageCommon::BT_POLY: { + return num_commands; + } break; + } + + // error condition + WARN_PRINT_ONCE("reading num_verts from incorrect batch type"); + return 0; + } }; struct BatchTex { @@ -1596,7 +1620,15 @@ bool C_PREAMBLE::_prefill_polygon(RasterizerCanvas::Item::CommandPolygon *p_poly // could be done with a temporary vertex buffer BatchVertex *bvs = bdata.vertices.request(num_inds); if (!bvs) { - // run out of space in the vertex buffer .. finish this function and draw what we have so far + // run out of space in the vertex buffer + // check for special case where the batching buffer is simply not big enough to fit this primitive. + if (!bdata.vertices.size()) { + // can't draw, ignore the primitive, otherwise we would enter an infinite loop + WARN_PRINT_ONCE("poly has too many indices to draw, increase batch buffer size"); + return false; + } + + // .. finish this function and draw what we have so far // return where we got to r_command_start = command_num; return true; @@ -2952,10 +2984,9 @@ void C_PREAMBLE::_translate_batches_to_larger_FVF(uint32_t p_sequence_batch_type needs_new_batch = false; // create the colored verts (only if not default) - //int first_vert = source_batch.first_quad * 4; - //int end_vert = 4 * (source_batch.first_quad + source_batch.num_commands); int first_vert = source_batch.first_vert; - int end_vert = first_vert + (4 * source_batch.num_commands); + int num_verts = source_batch.get_num_verts(); + int end_vert = first_vert + num_verts; for (int v = first_vert; v < end_vert; v++) { RAST_DEV_DEBUG_ASSERT(bdata.vertices.size()); @@ -3012,10 +3043,10 @@ void C_PREAMBLE::_translate_batches_to_larger_FVF(uint32_t p_sequence_batch_type // create the colored verts (only if not default) if (source_batch.type != RasterizerStorageCommon::BT_DEFAULT) { - // int first_vert = source_batch.first_quad * 4; - // int end_vert = 4 * (source_batch.first_quad + source_batch.num_commands); + int first_vert = source_batch.first_vert; - int end_vert = first_vert + (4 * source_batch.num_commands); + int num_verts = source_batch.get_num_verts(); + int end_vert = first_vert + num_verts; for (int v = first_vert; v < end_vert; v++) { RAST_DEV_DEBUG_ASSERT(bdata.vertices.size()); From 3f3108006ff3ac8e51783dca042a9a77602bb708 Mon Sep 17 00:00:00 2001 From: Maganty Rushyendra Date: Sat, 24 Apr 2021 11:06:28 -0400 Subject: [PATCH 20/84] Fix AudioServer Crash when bus count equals 0 (cherry picked from commit ccc375f16334d0eb8d56d5e5f2b248ef1ee55258) --- servers/audio_server.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/servers/audio_server.cpp b/servers/audio_server.cpp index abe3c08870de..1a78f479636f 100644 --- a/servers/audio_server.cpp +++ b/servers/audio_server.cpp @@ -246,6 +246,7 @@ void AudioServer::_driver_process(int p_frames, int32_t *p_buffer) { init_channels_and_buffers(); } + ERR_FAIL_COND_MSG(buses.empty() && todo, 
"AudioServer bus count is less than 1."); while (todo) { if (to_mix == 0) { From 84dc716d0d773c69926c1ba1b06adf5f6396b146 Mon Sep 17 00:00:00 2001 From: Marcel Admiraal Date: Sun, 25 Apr 2021 08:01:14 +0100 Subject: [PATCH 21/84] Fix empty CSGShape error (cherry picked from commit decdf4fcbc51d3e3cff765bdecb5418f87fa3847) --- modules/csg/csg_gizmos.cpp | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/modules/csg/csg_gizmos.cpp b/modules/csg/csg_gizmos.cpp index 43d7fcb79764..d5c63ce9ca55 100644 --- a/modules/csg/csg_gizmos.cpp +++ b/modules/csg/csg_gizmos.cpp @@ -317,27 +317,16 @@ bool CSGShapeSpatialGizmoPlugin::is_selectable_when_hidden() const { void CSGShapeSpatialGizmoPlugin::redraw(EditorSpatialGizmo *p_gizmo) { - CSGShape *cs = Object::cast_to(p_gizmo->get_spatial_node()); - p_gizmo->clear(); - Ref material; - switch (cs->get_operation()) { - case CSGShape::OPERATION_UNION: - material = get_material("shape_union_material", p_gizmo); - break; - case CSGShape::OPERATION_INTERSECTION: - material = get_material("shape_intersection_material", p_gizmo); - break; - case CSGShape::OPERATION_SUBTRACTION: - material = get_material("shape_subtraction_material", p_gizmo); - break; - } - - Ref handles_material = get_material("handles"); + CSGShape *cs = Object::cast_to(p_gizmo->get_spatial_node()); PoolVector faces = cs->get_brush_faces(); + if (faces.size() == 0) { + return; + } + Vector lines; lines.resize(faces.size() * 2); { @@ -353,6 +342,21 @@ void CSGShapeSpatialGizmoPlugin::redraw(EditorSpatialGizmo *p_gizmo) { } } + Ref material; + switch (cs->get_operation()) { + case CSGShape::OPERATION_UNION: + material = get_material("shape_union_material", p_gizmo); + break; + case CSGShape::OPERATION_INTERSECTION: + material = get_material("shape_intersection_material", p_gizmo); + break; + case CSGShape::OPERATION_SUBTRACTION: + material = get_material("shape_subtraction_material", p_gizmo); + break; + } + + Ref handles_material = get_material("handles"); + p_gizmo->add_lines(lines, material); p_gizmo->add_collision_segments(lines); From d8607d80453a7538202cfb6e5ba54d756b12a6e3 Mon Sep 17 00:00:00 2001 From: Marcel Admiraal Date: Sun, 25 Apr 2021 09:07:26 +0100 Subject: [PATCH 22/84] Fix CSGMesh undo not refreshing gizmo (cherry picked from commit 4311c2f66e4a1c9b7b77e8d3cdce4be8fa3ec7fa) --- modules/csg/csg_shape.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/csg/csg_shape.cpp b/modules/csg/csg_shape.cpp index 0949b38a9ed1..392b38d696a3 100644 --- a/modules/csg/csg_shape.cpp +++ b/modules/csg/csg_shape.cpp @@ -911,7 +911,7 @@ void CSGMesh::set_mesh(const Ref &p_mesh) { mesh->connect("changed", this, "_mesh_changed"); } - _make_dirty(); + _mesh_changed(); } Ref CSGMesh::get_mesh() { From 5a26bcda128915274ef823bd921e34ff12aed848 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Verschelde?= Date: Tue, 27 Apr 2021 09:50:48 +0200 Subject: [PATCH 23/84] OSX: Clarify min version requirement (10.12) in Info.plist The min requirement was upped by #45618 to have proper support for C++14. Related to #48222. 
(cherry picked from commit 8851fa78084b1ed4f4a3f5a028d7ab3f72213db1) --- misc/dist/osx_template.app/Contents/Info.plist | 4 ++-- misc/dist/osx_tools.app/Contents/Info.plist | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/misc/dist/osx_template.app/Contents/Info.plist b/misc/dist/osx_template.app/Contents/Info.plist index 300dc28e526c..530fb6eada50 100644 --- a/misc/dist/osx_template.app/Contents/Info.plist +++ b/misc/dist/osx_template.app/Contents/Info.plist @@ -37,11 +37,11 @@ NSPrincipalClass NSApplication LSMinimumSystemVersion - 10.9 + 10.12 LSMinimumSystemVersionByArchitecture x86_64 - 10.9 + 10.12 NSHighResolutionCapable $highres diff --git a/misc/dist/osx_tools.app/Contents/Info.plist b/misc/dist/osx_tools.app/Contents/Info.plist index 3e7257eed8af..1fe0069a6ffd 100644 --- a/misc/dist/osx_tools.app/Contents/Info.plist +++ b/misc/dist/osx_tools.app/Contents/Info.plist @@ -39,11 +39,11 @@ NSPrincipalClass NSApplication LSMinimumSystemVersion - 10.9 + 10.12 LSMinimumSystemVersionByArchitecture x86_64 - 10.9 + 10.12 NSHighResolutionCapable From feac30b96a450bce6b90bb16d34b803af3f94f33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Verschelde?= Date: Mon, 26 Apr 2021 13:34:35 +0200 Subject: [PATCH 24/84] Linux: Remove use_static_cpp override on x86_32 After further testing it seems to work fine now when building binaries with GCC 5 on Ubuntu 16.04 (previously we were using GCC 9 on Ubuntu 14.04). Follow-up to #45629. (cherry picked from commit aa15ad72ee4c80d3e3af959a74ae1fbbf58f48d2) --- platform/x11/detect.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/platform/x11/detect.py b/platform/x11/detect.py index 1a5204415090..91652aad552e 100644 --- a/platform/x11/detect.py +++ b/platform/x11/detect.py @@ -389,10 +389,7 @@ def configure(env): # Link those statically for portability if env["use_static_cpp"]: - # Workaround for GH-31743, Ubuntu 18.04 i386 crashes when it's used. - # That doesn't make any sense but it's likely a Ubuntu bug? - if is64 or env["bits"] == "64": - env.Append(LINKFLAGS=["-static-libgcc", "-static-libstdc++"]) + env.Append(LINKFLAGS=["-static-libgcc", "-static-libstdc++"]) if env["use_llvm"]: env["LINKCOM"] = env["LINKCOM"] + " -l:libatomic.a" From f172123a7af2887cd6b14c60bcbf900fe262296e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Verschelde?= Date: Wed, 28 Apr 2021 10:59:57 +0200 Subject: [PATCH 25/84] CI: Upgrade Emscripten to 2.0.15 (same as official standard builds) We still use Emscripten 1.39.9 for official Mono builds so ideally we want to test against an old Emscripten version to ensure we don't break compatibility. But then google-closure-compiler-linux broke compatibility for us and is not properly pinned, so we need to use a more recent version for now to fix CI. Cf. 
https://github.com/emscripten-core/emsdk/issues/802 (cherry picked from commit 9571ae3a3390958a721d89a20fd6c42fb4cad7a1) --- .github/workflows/javascript_builds.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/javascript_builds.yml b/.github/workflows/javascript_builds.yml index b86607ef220b..597694663702 100644 --- a/.github/workflows/javascript_builds.yml +++ b/.github/workflows/javascript_builds.yml @@ -6,7 +6,7 @@ env: GODOT_BASE_BRANCH: 3.x SCONSFLAGS: platform=javascript verbose=yes warnings=all werror=yes debug_symbols=no --jobs=2 SCONS_CACHE_LIMIT: 4096 - EM_VERSION: 1.39.20 + EM_VERSION: 2.0.15 EM_CACHE_FOLDER: 'emsdk-cache' jobs: From cde16a994c68b6c568a411577716ad20c68e6e4e Mon Sep 17 00:00:00 2001 From: ray90514 Date: Thu, 8 Apr 2021 23:43:50 +0800 Subject: [PATCH 26/84] Fix LineEdit undo behaves strangely (cherry picked from commit 7501c7f48aae218b693b5b37af6a00380457240e) --- scene/gui/line_edit.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/scene/gui/line_edit.cpp b/scene/gui/line_edit.cpp index 45c9b81737ff..40b799a5a29b 100644 --- a/scene/gui/line_edit.cpp +++ b/scene/gui/line_edit.cpp @@ -1258,6 +1258,7 @@ void LineEdit::set_text(String p_text) { clear_internal(); append_at_cursor(p_text); + _create_undo_state(); if (expand_to_text_length) { minimum_size_changed(); From 88cfde0aba38c7a530a589d7d8bf7632ddbea45f Mon Sep 17 00:00:00 2001 From: Thomas ten Cate Date: Sat, 24 Apr 2021 13:15:40 +0200 Subject: [PATCH 27/84] Add OpenSimplexNoise output change to changelog (cherry picked from commit a980bad0b0d13e9b3f758fb98fdaa367b84fbeb6) --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index de7f1f46d599..f05ba90793cd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -148,6 +148,8 @@ See the [release announcement](https://godotengine.org/article/godot-3-3-has-arr - [SVG images can now be used as a project icon](https://github.com/godotengine/godot/pull/43369). - [Tweaked log file names for consistency between Mono and non-Mono builds](https://github.com/godotengine/godot/pull/44148). - [Tweaked command line `--print-fps` display to display milliseconds per frame timings in addition to FPS](https://github.com/godotengine/godot/pull/47735). +- [OpenSimplexNoise is now guaranteed to give consistent results across platforms](https://github.com/godotengine/godot/issues/47211). + - This change breaks compatibility: you get different results even for the same seed. #### Editor From 278af7d02312f9cc04bee84cecfe30ee75162cf7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Verschelde?= Date: Wed, 28 Apr 2021 16:21:43 +0200 Subject: [PATCH 28/84] doc: Mark LargeTexture as deprecated (removed in 4.0) Cf. https://github.com/godotengine/godot/pull/48269. (cherry picked from commit 562c6bd9ab4c57c811c3a04a9ac4944918c2b6df) --- doc/classes/LargeTexture.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/classes/LargeTexture.xml b/doc/classes/LargeTexture.xml index 7d0a19a57e9c..de2da7ee7966 100644 --- a/doc/classes/LargeTexture.xml +++ b/doc/classes/LargeTexture.xml @@ -1,10 +1,10 @@ - A [Texture] capable of storing many smaller textures with offsets. + [i]Deprecated.[/i] A [Texture] capable of storing many smaller textures with offsets. - A [Texture] capable of storing many smaller textures with offsets. + [i]Deprecated (will be removed in Godot 4.0).[/i] A [Texture] capable of storing many smaller textures with offsets. 
You can dynamically add pieces ([Texture]s) to this [LargeTexture] using different offsets. From dbf71c1b70fbef71897e616449835704cabc78a2 Mon Sep 17 00:00:00 2001 From: skyace65 Date: Thu, 10 Sep 2020 13:41:58 -0400 Subject: [PATCH 29/84] Improve SpriteFrames get_animation_loop description (cherry picked from commit d311c48d6a3cabd434a5dda85efc21f3ef8243d0) --- doc/classes/SpriteFrames.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/classes/SpriteFrames.xml b/doc/classes/SpriteFrames.xml index de3601792c08..374233ca982a 100644 --- a/doc/classes/SpriteFrames.xml +++ b/doc/classes/SpriteFrames.xml @@ -54,7 +54,7 @@ - If [code]true[/code], the given animation will loop. + Returns [code]true[/code] if the given animation is configured to loop when it finishes playing. Otherwise, returns [code]false[/code]. From 890ec034315ee506d082afd506045436d11f44e6 Mon Sep 17 00:00:00 2001 From: Fabio Alessandrelli Date: Mon, 26 Apr 2021 07:23:39 +0200 Subject: [PATCH 30/84] [Net] Fix socket poll timeout on Windows. Now correctly computes the timeout value in milliseconds. (cherry picked from commit 46f7b0f74bc8907fe988eb55169203a095babaf3) --- drivers/unix/net_socket_posix.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/unix/net_socket_posix.cpp b/drivers/unix/net_socket_posix.cpp index 92cd69755f60..4bae6d3002f2 100644 --- a/drivers/unix/net_socket_posix.cpp +++ b/drivers/unix/net_socket_posix.cpp @@ -452,7 +452,7 @@ Error NetSocketPosix::poll(PollType p_type, int p_timeout) const { FD_ZERO(&wr); FD_ZERO(&ex); FD_SET(_sock, &ex); - struct timeval timeout = { p_timeout, 0 }; + struct timeval timeout = { p_timeout / 1000, (p_timeout % 1000) * 1000 }; // For blocking operation, pass NULL timeout pointer to select. struct timeval *tp = NULL; if (p_timeout >= 0) { From 46a5f3a18e91e88fb67aa0c7951562c8b0776a58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Verschelde?= Date: Mon, 26 Apr 2021 17:22:46 +0200 Subject: [PATCH 31/84] Android: Upgrade buildTools from 30.0.1 to 30.0.3 It seems 30.0.1 had issues with compatibility with JDK 8 and 11, which appear to be solved in 30.0.3 as per godotengine/godot-docs#4796. 
(cherry picked from commit d88e1f04df69976479928d03cd957fc77fe73478) --- platform/android/java/app/config.gradle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/platform/android/java/app/config.gradle b/platform/android/java/app/config.gradle index c0ae4007d27c..294398c80327 100644 --- a/platform/android/java/app/config.gradle +++ b/platform/android/java/app/config.gradle @@ -3,7 +3,7 @@ ext.versions = [ compileSdk : 29, minSdk : 18, targetSdk : 29, - buildTools : '30.0.1', + buildTools : '30.0.3', supportCoreUtils : '1.0.0', kotlinVersion : '1.4.10', v4Support : '1.0.0', From 00b70f60bb9da53fc291f11c1ad9fbdf5bfc689c Mon Sep 17 00:00:00 2001 From: Johannes Witt Date: Tue, 27 Apr 2021 13:58:01 +0200 Subject: [PATCH 32/84] Fix CSG Path Polygon cache being removed after connect fixes #30229 (cherry picked from commit bab36f12737769dd5f5c876731bc334323a4c5d2) --- modules/csg/csg_shape.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/csg/csg_shape.cpp b/modules/csg/csg_shape.cpp index 392b38d696a3..fefd1dd4beaa 100644 --- a/modules/csg/csg_shape.cpp +++ b/modules/csg/csg_shape.cpp @@ -1832,7 +1832,6 @@ CSGBrush *CSGPolygon::_build_brush() { path_cache->connect("tree_exited", this, "_path_exited"); path_cache->connect("curve_changed", this, "_path_changed"); - path_cache = NULL; } curve = path->get_curve(); if (curve.is_null()) { From 72122564962269ce2ecc2e092b3cf502d5910a17 Mon Sep 17 00:00:00 2001 From: David Snopek Date: Wed, 28 Apr 2021 08:26:05 -0500 Subject: [PATCH 33/84] Fixes #48178: WebXR broken when built with Emscripten 2.0.13 or later (cherry picked from commit b77925d24624599c8db319d602b6c1d2e3071ff9) --- modules/webxr/native/library_godot_webxr.js | 2 -- 1 file changed, 2 deletions(-) diff --git a/modules/webxr/native/library_godot_webxr.js b/modules/webxr/native/library_godot_webxr.js index a8643e241473..13057aea98b5 100644 --- a/modules/webxr/native/library_godot_webxr.js +++ b/modules/webxr/native/library_godot_webxr.js @@ -72,10 +72,8 @@ const GodotWebXR = { // enabled or disabled. When using the WebXR API Emulator, this // gets picked up automatically, however, in the Oculus Browser // on the Quest, we need to pause and resume the main loop. - Browser.pauseAsyncCallbacks(); Browser.mainLoop.pause(); window.setTimeout(function () { - Browser.resumeAsyncCallbacks(); Browser.mainLoop.resume(); }, 0); }, From 05a8ddf3d2589b901fb7c6343590efc17337ade1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Verschelde?= Date: Wed, 28 Apr 2021 09:40:47 +0200 Subject: [PATCH 34/84] SceneTree: Fix type hints for `global_menu_action` signal Fixes https://github.com/godotengine/godot-headers/issues/89. 
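Both arguments stay Variant-typed, so a GDScript handler simply takes untyped parameters. A small sketch, assuming a macOS global menu item was registered elsewhere with `OS.global_menu_add_item()`; the callback name is illustrative only.
[codeblock]
func _ready():
    get_tree().connect("global_menu_action", self, "_on_global_menu_action")

func _on_global_menu_action(id, meta):
    # "id" and "meta" arrive as Variants of whatever values were registered with the menu item.
    print("Global menu action: ", id, " / ", meta)
[/codeblock]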
(cherry picked from commit 6ba10c6c1ffef8bbd5eb02233c5dad91c0e99469) --- scene/main/scene_tree.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scene/main/scene_tree.cpp b/scene/main/scene_tree.cpp index 79c071832ed2..05da85dc48e5 100644 --- a/scene/main/scene_tree.cpp +++ b/scene/main/scene_tree.cpp @@ -1940,7 +1940,9 @@ void SceneTree::_bind_methods() { ADD_SIGNAL(MethodInfo("physics_frame")); ADD_SIGNAL(MethodInfo("files_dropped", PropertyInfo(Variant::POOL_STRING_ARRAY, "files"), PropertyInfo(Variant::INT, "screen"))); - ADD_SIGNAL(MethodInfo("global_menu_action", PropertyInfo(Variant::NIL, "id"), PropertyInfo(Variant::NIL, "meta"))); + ADD_SIGNAL(MethodInfo("global_menu_action", + PropertyInfo(Variant::NIL, "id", PROPERTY_HINT_NONE, "", PROPERTY_USAGE_NIL_IS_VARIANT), + PropertyInfo(Variant::NIL, "meta", PROPERTY_HINT_NONE, "", PROPERTY_USAGE_NIL_IS_VARIANT))); ADD_SIGNAL(MethodInfo("network_peer_connected", PropertyInfo(Variant::INT, "id"))); ADD_SIGNAL(MethodInfo("network_peer_disconnected", PropertyInfo(Variant::INT, "id"))); ADD_SIGNAL(MethodInfo("connected_to_server")); From 42c88d99aeb6277fcf8db0f469960b6b33e90b2c Mon Sep 17 00:00:00 2001 From: smix8 <> Date: Thu, 15 Apr 2021 23:00:40 +0200 Subject: [PATCH 35/84] Add SkeletonIK function documentation Add SkeletonIK function documentation (cherry picked from commit 84e603ec7e271d711342c8280ec2e98ba3c8056f) --- doc/classes/SkeletonIK.xml | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/doc/classes/SkeletonIK.xml b/doc/classes/SkeletonIK.xml index d8c083cdaa1f..7935675273f5 100644 --- a/doc/classes/SkeletonIK.xml +++ b/doc/classes/SkeletonIK.xml @@ -1,8 +1,29 @@ + SkeletonIK is used to place the end bone of a [Skeleton] bone chain at a certain point in 3D by rotating all bones in the chain accordingly. + SkeletonIK is used to place the end bone of a [Skeleton] bone chain at a certain point in 3D by rotating all bones in the chain accordingly. A typical scenario for IK in games is to place a characters feet on the ground or a characters hands on a currently hold object. SkeletonIK uses FabrikInverseKinematic internally to solve the bone chain and applies the results to the [Skeleton] [code]bones_global_pose_override[/code] property for all affected bones in the chain. If fully applied this overwrites any bone transform from [Animation]s or bone custom poses set by users. The applied amount can be controlled with the [code]interpolation[/code] property. + [codeblock] + # Apply IK effect automatically on every new frame (not the current) + skeleton_ik_node.start() + + # Apply IK effect only on the current frame + skeleton_ik_node.start(true) + + # Stop IK effect and reset bones_global_pose_override on Skeleton + skeleton_ik_node.stop() + + # Apply full IK effect + skeleton_ik_node.set_interpolation(1.0) + + # Apply half IK effect + skeleton_ik_node.set_interpolation(0.5) + + # Apply zero IK effect (a value at or below 0.01 also removes bones_global_pose_override on Skeleton) + skeleton_ik_node.set_interpolation(0.0) + [/codeblock] https://godotengine.org/asset-library/asset/523 @@ -12,12 +33,14 @@ + Returns the parent [Skeleton] Node that was present when SkeletonIK entered the [SceneTree]. Returns null if the parent node was not a [Skeleton] Node when SkeletonIK entered the [SceneTree]. + Returns [code]true[/code] if SkeletonIK is applying IK effects on continues frames to the [Skeleton] bones. 
Returns [code]false[/code] if SkeletonIK is stopped or [method start] was used with the [code]one_time[/code] parameter set to [code]true[/code]. @@ -26,35 +49,47 @@ + Starts applying IK effects on each frame to the [Skeleton] bones but will only take effect starting on the next frame. If [code]one_time[/code] is [code]true[/code], this will take effect immediately but also reset on the next frame. + Stops applying IK effects on each frame to the [Skeleton] bones and also calls [method Skeleton.clear_bones_global_pose_override] to remove existing overrides on all bones. + Interpolation value for how much the IK results are applied to the current skeleton bone chain. A value of [code]1.0[/code] will overwrite all skeleton bone transforms completely while a value of [code]0.0[/code] will visually disable the SkeletonIK. A value at or below [code]0.01[/code] also calls [method Skeleton.clear_bones_global_pose_override]. + Secondary target position (first is [member target] property or [member target_node]) for the IK chain. Use magnet position (pole target) to control the bending of the IK chain. Only works if the bone chain has more than 2 bones. The middle chain bone position will be linearly interpolated with the magnet position. + Number of iteration loops used by the IK solver to produce more accurate (and elegant) bone chain results. + The minimum distance between bone and goal target. If the distance is below this value, the IK solver stops further iterations. + If [code]true[/code] overwrites the rotation of the tip bone with the rotation of the [member target] (or [member target_node] if defined). + The name of the current root bone, the first bone in the IK chain. + First target of the IK chain where the tip bone is placed and, if [member override_tip_basis] is [code]true[/code], how the tip bone is rotated. If a [member target_node] path is available the nodes transform is used instead and this property is ignored. + Target node [NodePath] for the IK chain. If available, the node's current [Transform] is used instead of the [member target] property. + The name of the current tip bone, the last bone in the IK chain placed at the [member target] transform (or [member target_node] if defined). + If [code]true[/code], instructs the IK solver to consider the secondary magnet target (pole target) when calculating the bone chain. Use the magnet position (pole target) to control the bending of the IK chain. From 08bedba3411a5392fe947f3d625c8c933e724b13 Mon Sep 17 00:00:00 2001 From: Hugo Locurcio Date: Wed, 28 Apr 2021 17:40:14 +0200 Subject: [PATCH 36/84] Document that clearcoat/rim lighting is not visible on unshaded materials (cherry picked from commit 8df0e61cc22fcb34d2ea2bd5099f0b503d9b7d6a) --- doc/classes/SpatialMaterial.xml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/classes/SpatialMaterial.xml b/doc/classes/SpatialMaterial.xml index 8b80e883e02c..777420832f21 100644 --- a/doc/classes/SpatialMaterial.xml +++ b/doc/classes/SpatialMaterial.xml @@ -107,6 +107,7 @@ If [code]true[/code], clearcoat rendering is enabled. Adds a secondary transparent pass to the lighting calculation resulting in an added specular blob. This makes materials appear as if they have a clear layer on them that can be either glossy or rough. + [b]Note:[/b] Clearcoat rendering is not visible if the material has [member flags_unshaded] set to [code]true[/code]. Sets the roughness of the clearcoat pass. A higher value results in a smoother clearcoat while a lower value results in a rougher clearcoat. 
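A minimal GDScript sketch of the interaction these notes document; the material setup is an assumed example rather than code from this patch.
[codeblock]
var mat = SpatialMaterial.new()
mat.clearcoat_enabled = true
mat.rim_enabled = true
# With unshaded rendering enabled, neither the clearcoat nor the rim pass is visible.
mat.flags_unshaded = true
[/codeblock]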
@@ -320,6 +321,7 @@ If [code]true[/code], rim effect is enabled. Rim lighting increases the brightness at glancing angles on an object. + [b]Note:[/b] Rim lighting is not visible if the material has [member flags_unshaded] set to [code]true[/code]. Texture used to set the strength of the rim lighting effect per-pixel. Multiplied by [member rim]. From 31bc9d859b29152df6a02e62530a59e13b5666d6 Mon Sep 17 00:00:00 2001 From: thebestnom Date: Thu, 11 Mar 2021 01:03:17 +0200 Subject: [PATCH 37/84] [Android] Allow to build dev template with symbols (cherry picked from commit fd7141fc03cae3a50b5669760ae765f26a85315e) --- platform/android/java/app/config.gradle | 2 +- platform/android/java/build.gradle | 36 ++++++++++++++++++++----- platform/android/java/gradle.properties | 2 +- 3 files changed, 32 insertions(+), 8 deletions(-) diff --git a/platform/android/java/app/config.gradle b/platform/android/java/app/config.gradle index 294398c80327..b278d15bdfcc 100644 --- a/platform/android/java/app/config.gradle +++ b/platform/android/java/app/config.gradle @@ -239,5 +239,5 @@ ext.shouldSign = { -> } ext.shouldNotStrip = { -> - return isAndroidStudio() + return isAndroidStudio() || project.hasProperty("doNotStrip") } diff --git a/platform/android/java/build.gradle b/platform/android/java/build.gradle index ec02b0fc7a7c..81570d9d8645 100644 --- a/platform/android/java/build.gradle +++ b/platform/android/java/build.gradle @@ -122,16 +122,17 @@ task zipCustomBuild(type: Zip) { destinationDir(file(binDir)) } -/** - * Master task used to coordinate the tasks defined above to generate the set of Godot templates. - */ -task generateGodotTemplates(type: GradleBuild) { +def templateExcludedBuildTask() { // We exclude these gradle tasks so we can run the scons command manually. + def excludedTasks = [] for (String buildType : supportedTargets) { - startParameter.excludedTaskNames += ":lib:" + getSconsTaskName(buildType) + excludedTasks += ":lib:" + getSconsTaskName(buildType) } + return excludedTasks +} - tasks = [] +def templateBuildTasks() { + def tasks = [] // Only build the apks and aar files for which we have native shared libraries. for (String target : supportedTargets) { @@ -152,6 +153,29 @@ task generateGodotTemplates(type: GradleBuild) { } } + return tasks +} + +/** + * Master task used to coordinate the tasks defined above to generate the set of Godot templates. + */ +task generateGodotTemplates(type: GradleBuild) { + startParameter.excludedTaskNames = templateExcludedBuildTask() + tasks = templateBuildTasks() + + finalizedBy 'zipCustomBuild' +} + +/** + * Generates the same output as generateGodotTemplates but with dev symbols + */ +task generateDevTemplate (type: GradleBuild) { + // add parameter to set symbols to true + startParameter.projectProperties += [doNotStrip: true] + + startParameter.excludedTaskNames = templateExcludedBuildTask() + tasks = templateBuildTasks() + finalizedBy 'zipCustomBuild' } diff --git a/platform/android/java/gradle.properties b/platform/android/java/gradle.properties index 2dc069ad2fb7..6b3b62a9da4c 100644 --- a/platform/android/java/gradle.properties +++ b/platform/android/java/gradle.properties @@ -12,7 +12,7 @@ android.useAndroidX=true # Specifies the JVM arguments used for the daemon process. # The setting is particularly useful for tweaking memory settings. -org.gradle.jvmargs=-Xmx1536m +org.gradle.jvmargs=-Xmx4536m # When configured, Gradle will run in incubating parallel mode. # This option should only be used with decoupled projects. 
More details, visit From 8dc3d5eb7af3d3762bd34397fcf639e3702fd39e Mon Sep 17 00:00:00 2001 From: thebestnom Date: Wed, 28 Apr 2021 21:59:42 +0300 Subject: [PATCH 38/84] [Android] fix generateDevTemplate (cherry picked from commit 04772567935d14a6568373f2f96bcff4c122d142) --- platform/android/java/build.gradle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/platform/android/java/build.gradle b/platform/android/java/build.gradle index 81570d9d8645..a7fe500be2a1 100644 --- a/platform/android/java/build.gradle +++ b/platform/android/java/build.gradle @@ -112,7 +112,7 @@ task copyReleaseAARToBin(type: Copy) { * The zip file also includes some gradle tools to allow building of the custom build. */ task zipCustomBuild(type: Zip) { - dependsOn ':generateGodotTemplates' + onlyIf { generateGodotTemplates.state.executed || generateDevTemplate.state.executed } doFirst { logger.lifecycle("Generating Godot custom build template") } From e4cbf9c58a5ffd296a2284c04d8b80af7c880576 Mon Sep 17 00:00:00 2001 From: Fabio Alessandrelli Date: Fri, 30 Apr 2021 07:40:10 +0200 Subject: [PATCH 39/84] [HTML5] Fix build for recent emscripten versions. Library suffix should be `.a`, the `EXTRA_` in `EXTRA_EXPORTED_RUNTIME_METHODS` is deprecated. (cherry picked from commit b0f6dec02e6b6cc51949a47f9ad943a68d099cc1) --- platform/javascript/detect.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/platform/javascript/detect.py b/platform/javascript/detect.py index 7562cf26dc8b..49502a9e464d 100644 --- a/platform/javascript/detect.py +++ b/platform/javascript/detect.py @@ -172,7 +172,7 @@ def configure(env): # Program() output consists of multiple files, so specify suffixes manually at builder. env["PROGSUFFIX"] = "" env["LIBPREFIX"] = "lib" - env["LIBSUFFIX"] = ".bc" + env["LIBSUFFIX"] = ".a" env["LIBPREFIXES"] = ["$LIBPREFIX"] env["LIBSUFFIXES"] = ["$LIBSUFFIX"] @@ -226,8 +226,8 @@ def configure(env): # Allow use to take control of swapping WebGL buffers. env.Append(LINKFLAGS=["-s", "OFFSCREEN_FRAMEBUFFER=1"]) - # callMain for manual start. - env.Append(LINKFLAGS=["-s", "EXTRA_EXPORTED_RUNTIME_METHODS=['callMain','cwrap']"]) + # callMain for manual start, cwrap for the mono version. + env.Append(LINKFLAGS=["-s", "EXPORTED_RUNTIME_METHODS=['callMain','cwrap']"]) # Add code that allow exiting runtime. env.Append(LINKFLAGS=["-s", "EXIT_RUNTIME=1"]) From 2bd40b4c9c18562c7e082b972a39d005d1a1cc2c Mon Sep 17 00:00:00 2001 From: Sergey Minakov Date: Fri, 30 Apr 2021 11:40:12 +0300 Subject: [PATCH 40/84] [iOS] Nonnegative start index for virtual keyboard range (cherry picked from commit 275772818ddc6b03cee4b72435828dca01b291e8) --- platform/iphone/keyboard_input_view.mm | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/platform/iphone/keyboard_input_view.mm b/platform/iphone/keyboard_input_view.mm index abf9cae77303..a7834cabef1b 100644 --- a/platform/iphone/keyboard_input_view.mm +++ b/platform/iphone/keyboard_input_view.mm @@ -87,13 +87,15 @@ - (BOOL)becomeFirstResponderWithString:(NSString *)existingString multiline:(BOO self.text = existingString; self.previousText = existingString; + NSInteger safeStartIndex = MAX(start, 0); + NSRange textRange; // Either a simple cursor or a selection. 
if (end > 0) { - textRange = NSMakeRange(start, end - start); + textRange = NSMakeRange(safeStartIndex, end - start); } else { - textRange = NSMakeRange(start, 0); + textRange = NSMakeRange(safeStartIndex, 0); } self.selectedRange = textRange; From edf5a037149f4fd14170570f015060c6316a2884 Mon Sep 17 00:00:00 2001 From: Hugo Locurcio Date: Thu, 29 Apr 2021 21:22:32 +0200 Subject: [PATCH 41/84] Document that `SceneTree.call_group()` is deferred (cherry picked from commit 7516ff380586f66ff5333ec81fd25172fbbd4182) --- doc/classes/SceneTree.xml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/classes/SceneTree.xml b/doc/classes/SceneTree.xml index 4e4fcddeb5bf..9a6ea40bf55f 100644 --- a/doc/classes/SceneTree.xml +++ b/doc/classes/SceneTree.xml @@ -23,6 +23,7 @@ Calls [code]method[/code] on each member of the given group. You can pass arguments to [code]method[/code] by specifying them at the end of the method call. [b]Note:[/b] [code]method[/code] may only have 5 arguments at most (7 arguments passed to this method in total). + [b]Note:[/b] [method call_group] will always call methods with an one-frame delay, in a way similar to [method Object.call_deferred]. To call methods immediately, use [method call_group_flags] with the [constant GROUP_CALL_REALTIME] flag. @@ -37,6 +38,7 @@ Calls [code]method[/code] on each member of the given group, respecting the given [enum GroupCallFlags]. You can pass arguments to [code]method[/code] by specifying them at the end of the method call. [b]Note:[/b] [code]method[/code] may only have 5 arguments at most (8 arguments passed to this method in total). + [b]Note:[/b] Group call flags are used to control the method calling behavior. If the [constant GROUP_CALL_REALTIME] flag is present in the [code]flags[/code] argument, methods will be called immediately. If this flag isn't present in [code]flags[/code], methods will be called with a one-frame delay in a way similar to [method call_group]. From 87aa694ae5003af797b3a1de56f6d4579997bb47 Mon Sep 17 00:00:00 2001 From: bruvzg <7645683+bruvzg@users.noreply.github.com> Date: Thu, 29 Apr 2021 23:27:52 +0300 Subject: [PATCH 42/84] Duplicate DynamicFontData resources in the editor preview generation thread to avoid race condition. 
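A short GDScript sketch contrasting the two behaviours described in these notes; the group and method names are placeholders.
[codeblock]
# Deferred (default): queued and run with a one-frame delay.
get_tree().call_group("enemies", "hide")

# Immediate: runs before this call returns.
get_tree().call_group_flags(SceneTree.GROUP_CALL_REALTIME, "enemies", "hide")
[/codeblock]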
(cherry picked from commit 301bedd5d4ae303d19dffb07380590e885e6fca6) --- editor/plugins/editor_preview_plugins.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/editor/plugins/editor_preview_plugins.cpp b/editor/plugins/editor_preview_plugins.cpp index f24feae7e702..647dd6b6e429 100644 --- a/editor/plugins/editor_preview_plugins.cpp +++ b/editor/plugins/editor_preview_plugins.cpp @@ -838,14 +838,15 @@ Ref EditorFontPreviewPlugin::generate_from_path(const String &p_path, c ril.ptr()->wait(); RES res = ril.ptr()->get_resource(); Ref sampled_font; + sampled_font.instance(); if (res->is_class("DynamicFont")) { - sampled_font = res->duplicate(); - if (sampled_font->get_outline_color() == Color(1, 1, 1, 1)) { - sampled_font->set_outline_color(Color(0, 0, 0, 1)); + Ref font = res; + sampled_font->set_font_data(font->get_font_data()->duplicate()); + for (int i = 0; i < font->get_fallback_count(); i++) { + sampled_font->add_fallback(font->get_fallback(i)->duplicate()); } } else if (res->is_class("DynamicFontData")) { - sampled_font.instance(); - sampled_font->set_font_data(res); + sampled_font->set_font_data(res->duplicate()); } sampled_font->set_size(50); From dacd16fd337f0f2c584afbf9c74e7315c189e556 Mon Sep 17 00:00:00 2001 From: lawnjelly Date: Mon, 3 May 2021 12:18:56 +0100 Subject: [PATCH 43/84] Fix 2d software skinning relative transforms All my earlier test cases for software skinning had the polys parent transform to be identity. This works fine until you had cases where the user had moved the transform of the parent nodes of skinned polys. This PR fixes this situation by taking into account the final (concatenated) transform of the polys RELATIVE to the skeleton base transform. It does this by applying the inverse skeleton base transform to the poly final transform. (cherry picked from commit f33e22001f184f17522bcb3342dba9c639049396) --- drivers/gles2/rasterizer_canvas_gles2.cpp | 6 ++-- drivers/gles3/rasterizer_canvas_gles3.cpp | 6 ++-- .../gles_common/rasterizer_canvas_batcher.h | 33 +++++++++++++++---- 3 files changed, 33 insertions(+), 12 deletions(-) diff --git a/drivers/gles2/rasterizer_canvas_gles2.cpp b/drivers/gles2/rasterizer_canvas_gles2.cpp index eefdde514874..c7573d561e84 100644 --- a/drivers/gles2/rasterizer_canvas_gles2.cpp +++ b/drivers/gles2/rasterizer_canvas_gles2.cpp @@ -2194,7 +2194,7 @@ void RasterizerCanvasGLES2::render_joined_item(const BItemJoined &p_bij, RenderI _set_uniforms(); if (unshaded || (state.uniforms.final_modulate.a > 0.001 && (!r_ris.shader_cache || r_ris.shader_cache->canvas_item.light_mode != RasterizerStorageGLES2::Shader::CanvasItem::LIGHT_MODE_LIGHT_ONLY) && !ci->light_masked)) - render_joined_item_commands(p_bij, NULL, reclip, material_ptr, false); + render_joined_item_commands(p_bij, NULL, reclip, material_ptr, false, r_ris); r_ris.rebind_shader = true; // hacked in for now. @@ -2288,10 +2288,10 @@ void RasterizerCanvasGLES2::render_joined_item(const BItemJoined &p_bij, RenderI // this can greatly reduce fill rate .. 
// at the cost of glScissor commands, so is optional if (!bdata.settings_scissor_lights || r_ris.current_clip) { - render_joined_item_commands(p_bij, NULL, reclip, material_ptr, true); + render_joined_item_commands(p_bij, NULL, reclip, material_ptr, true, r_ris); } else { bool scissor = _light_scissor_begin(p_bij.bounding_rect, light->xform_cache, light->rect_cache); - render_joined_item_commands(p_bij, NULL, reclip, material_ptr, true); + render_joined_item_commands(p_bij, NULL, reclip, material_ptr, true, r_ris); if (scissor) { glDisable(GL_SCISSOR_TEST); } diff --git a/drivers/gles3/rasterizer_canvas_gles3.cpp b/drivers/gles3/rasterizer_canvas_gles3.cpp index 967149ae557a..cc9f0b884c6d 100644 --- a/drivers/gles3/rasterizer_canvas_gles3.cpp +++ b/drivers/gles3/rasterizer_canvas_gles3.cpp @@ -1485,7 +1485,7 @@ void RasterizerCanvasGLES3::render_joined_item(const BItemJoined &p_bij, RenderI } if (unshaded || (state.canvas_item_modulate.a > 0.001 && (!r_ris.shader_cache || r_ris.shader_cache->canvas_item.light_mode != RasterizerStorageGLES3::Shader::CanvasItem::LIGHT_MODE_LIGHT_ONLY) && !p_ci->light_masked)) { RasterizerStorageGLES3::Material *material_ptr = nullptr; - render_joined_item_commands(p_bij, NULL, reclip, material_ptr, false); + render_joined_item_commands(p_bij, NULL, reclip, material_ptr, false, r_ris); } if ((blend_mode == RasterizerStorageGLES3::Shader::CanvasItem::BLEND_MODE_MIX || blend_mode == RasterizerStorageGLES3::Shader::CanvasItem::BLEND_MODE_PMALPHA) && r_ris.item_group_light && !unshaded) { @@ -1600,10 +1600,10 @@ void RasterizerCanvasGLES3::render_joined_item(const BItemJoined &p_bij, RenderI // this can greatly reduce fill rate .. // at the cost of glScissor commands, so is optional if (!bdata.settings_scissor_lights || r_ris.current_clip) { - render_joined_item_commands(p_bij, NULL, reclip, nullptr, true); + render_joined_item_commands(p_bij, NULL, reclip, nullptr, true, r_ris); } else { bool scissor = _light_scissor_begin(p_bij.bounding_rect, light->xform_cache, light->rect_cache); - render_joined_item_commands(p_bij, NULL, reclip, nullptr, true); + render_joined_item_commands(p_bij, NULL, reclip, nullptr, true, r_ris); if (scissor) { glDisable(GL_SCISSOR_TEST); } diff --git a/drivers/gles_common/rasterizer_canvas_batcher.h b/drivers/gles_common/rasterizer_canvas_batcher.h index f1e96f74a67b..b4c68c243590 100644 --- a/drivers/gles_common/rasterizer_canvas_batcher.h +++ b/drivers/gles_common/rasterizer_canvas_batcher.h @@ -504,6 +504,7 @@ class RasterizerCanvasBatcher { bool extra_matrix_sent; // whether sent on this item (in which case sofware transform can't be used untl end of item) int transform_extra_command_number_p1; // plus one to allow fast checking against zero Transform2D transform_combined; // final * extra + Transform2D skeleton_base_inverse_xform; // used in software skinning }; // used during try_join @@ -587,7 +588,7 @@ class RasterizerCanvasBatcher { bool _detect_item_batch_break(RenderItemState &r_ris, RasterizerCanvas::Item *p_ci, bool &r_batch_break); // drives the loop filling batches and flushing - void render_joined_item_commands(const BItemJoined &p_bij, RasterizerCanvas::Item *p_current_clip, bool &r_reclip, typename T_STORAGE::Material *p_material, bool p_lit); + void render_joined_item_commands(const BItemJoined &p_bij, RasterizerCanvas::Item *p_current_clip, bool &r_reclip, typename T_STORAGE::Material *p_material, bool p_lit, const RenderItemState &p_ris); private: // flush once full or end of joined item @@ -1824,9 +1825,12 @@ 
PREAMBLE(bool)::_software_skin_poly(RasterizerCanvas::Item::CommandPolygon *p_po Vector2 *pTemps = (Vector2 *)alloca(num_verts * sizeof(Vector2)); memset((void *)pTemps, 0, num_verts * sizeof(Vector2)); - // these are used in the shader but don't appear to be needed for software transform - // const Transform2D &skel_trans = get_this()->state.skeleton_transform; - // const Transform2D &skel_trans_inv = get_this()->state.skeleton_transform_inverse; + // only the inverse appears to be needed + const Transform2D &skel_trans_inv = p_fill_state.skeleton_base_inverse_xform; + // we can't get this from the state, because more than one skeleton item may have been joined together.. + // we need to handle the base skeleton on a per item basis as the joined item is rendered. + // const Transform2D &skel_trans = get_this()->state.skeleton_transform; + // const Transform2D &skel_trans_inv = get_this()->state.skeleton_transform_inverse; // get the bone transforms. // this is not ideal because we don't know in advance which bones are needed @@ -1838,7 +1842,10 @@ PREAMBLE(bool)::_software_skin_poly(RasterizerCanvas::Item::CommandPolygon *p_po if (num_verts && (p_poly->bones.size() == num_verts * 4) && (p_poly->weights.size() == p_poly->bones.size())) { - const Transform2D &item_transform = p_item->xform; + // instead of using the p_item->xform we use the final transform, + // because we want the poly transform RELATIVE to the base skeleton. + Transform2D item_transform = skel_trans_inv * p_item->final_transform; + Transform2D item_transform_inv = item_transform.affine_inverse(); for (int n = 0; n < num_verts; n++) { @@ -2534,7 +2541,7 @@ PREAMBLE(void)::flush_render_batches(RasterizerCanvas::Item *p_first_item, Raste #endif } -PREAMBLE(void)::render_joined_item_commands(const BItemJoined &p_bij, RasterizerCanvas::Item *p_current_clip, bool &r_reclip, typename T_STORAGE::Material *p_material, bool p_lit) { +PREAMBLE(void)::render_joined_item_commands(const BItemJoined &p_bij, RasterizerCanvas::Item *p_current_clip, bool &r_reclip, typename T_STORAGE::Material *p_material, bool p_lit, const RenderItemState &p_ris) { RasterizerCanvas::Item *item = 0; RasterizerCanvas::Item *first_item = bdata.item_refs[p_bij.first_item_ref].item; @@ -2581,6 +2588,20 @@ PREAMBLE(void)::render_joined_item_commands(const BItemJoined &p_bij, Rasterizer // prefill_joined_item() fill_state.transform_combined = item->final_transform; + // calculate skeleton base inverse transform if required for software skinning + // put in the fill state as this is readily accessible from the software skinner + if (item->skeleton.is_valid() && bdata.settings_use_software_skinning && get_storage()->skeleton_owner.owns(item->skeleton)) { + typename T_STORAGE::Skeleton *skeleton = nullptr; + skeleton = get_storage()->skeleton_owner.get(item->skeleton); + + if (skeleton->use_2d) { + // with software skinning we still need to know the skeleton inverse transform, the other two aren't needed + // but are left in for simplicity here + Transform2D skeleton_transform = p_ris.item_group_base_transform * skeleton->base_transform_2d; + fill_state.skeleton_base_inverse_xform = skeleton_transform.affine_inverse(); + } + } + // decide the initial transform mode, and make a backup // in orig_transform_mode in case we need to switch back if (fill_state.use_software_transform) { From a93f52ee7e493e4dc74746364708ae92b809d98c Mon Sep 17 00:00:00 2001 From: MaxStgs Date: Fri, 30 Apr 2021 18:36:14 +0500 Subject: [PATCH 44/84] Add WebSocketMultiplayerPeer 
_incoming_packets check bound (cherry picked from commit 05ad08941b93c513dfd8667148204f10fe20cd8e) --- modules/websocket/websocket_multiplayer_peer.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/websocket/websocket_multiplayer_peer.cpp b/modules/websocket/websocket_multiplayer_peer.cpp index e4a998172f52..5cfac0f0db6a 100644 --- a/modules/websocket/websocket_multiplayer_peer.cpp +++ b/modules/websocket/websocket_multiplayer_peer.cpp @@ -114,6 +114,8 @@ Error WebSocketMultiplayerPeer::get_packet(const uint8_t **r_buffer, int &r_buff _current_packet.data = NULL; } + ERR_FAIL_COND_V(_incoming_packets.size() == 0, ERR_UNAVAILABLE); + _current_packet = _incoming_packets.front()->get(); _incoming_packets.pop_front(); From 839f602859fc15094f0ad5ef18fb29fd2dca8162 Mon Sep 17 00:00:00 2001 From: kleonc <9283098+kleonc@users.noreply.github.com> Date: Mon, 3 May 2021 14:49:52 +0200 Subject: [PATCH 45/84] TileMapEditor::_bucket_fill Check autotile coordinates only if autotile is selected (cherry picked from commit a1b903066e7b216176e3f0d646fc36c1720b5792) --- editor/plugins/tile_map_editor_plugin.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/editor/plugins/tile_map_editor_plugin.cpp b/editor/plugins/tile_map_editor_plugin.cpp index 448e3ed7b4df..9cd386c90be5 100644 --- a/editor/plugins/tile_map_editor_plugin.cpp +++ b/editor/plugins/tile_map_editor_plugin.cpp @@ -662,11 +662,15 @@ PoolVector TileMapEditor::_bucket_fill(const Point2i &p_start, bool era } // Check if the tile variation is the same - Vector2 prev_position = node->get_cell_autotile_coord(p_start.x, p_start.y); if (ids.size() == 1 && ids[0] == prev_id) { int current = manual_palette->get_current(); - Vector2 position = manual_palette->get_item_metadata(current); - if (prev_position == position) { + if (current == -1) { + // Same ID, no variation selected, nothing to change + return PoolVector(); + } + Vector2 prev_autotile_coord = node->get_cell_autotile_coord(p_start.x, p_start.y); + Vector2 autotile_coord = manual_palette->get_item_metadata(current); + if (autotile_coord == prev_autotile_coord) { // Same ID and variation, nothing to change return PoolVector(); } From 44a4df047605e921ac53c88758fa78d0747e4371 Mon Sep 17 00:00:00 2001 From: kleonc <9283098+kleonc@users.noreply.github.com> Date: Mon, 3 May 2021 14:58:02 +0200 Subject: [PATCH 46/84] TileMapEditor Modulate autotile previews (cherry picked from commit 3f1b95cfb1d7c89d5e4ca07e00afe4fbbf2ffdbc) --- editor/plugins/tile_map_editor_plugin.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/editor/plugins/tile_map_editor_plugin.cpp b/editor/plugins/tile_map_editor_plugin.cpp index 9cd386c90be5..30a67ba8f6ed 100644 --- a/editor/plugins/tile_map_editor_plugin.cpp +++ b/editor/plugins/tile_map_editor_plugin.cpp @@ -578,6 +578,7 @@ void TileMapEditor::_update_palette() { entries2.sort_custom(); Ref tex = tileset->tile_get_texture(sel_tile); + Color modulate = tileset->tile_get_modulate(sel_tile); for (int i = 0; i < entries2.size(); i++) { @@ -594,6 +595,7 @@ void TileMapEditor::_update_palette() { manual_palette->set_item_icon_region(manual_palette->get_item_count() - 1, region); manual_palette->set_item_icon(manual_palette->get_item_count() - 1, tex); + manual_palette->set_item_icon_modulate(manual_palette->get_item_count() - 1, modulate); } manual_palette->set_item_metadata(manual_palette->get_item_count() - 1, entries2[i]); From 387d2a69cac8732ee3101a2fbb08806de1e51aa5 Mon Sep 17 00:00:00 2001 From: MaxStgs Date: Fri, 30 Apr 
2021 14:08:45 +0500 Subject: [PATCH 47/84] Fix BakedLightmap bias bound check (cherry picked from commit b4cc8ed6f2c3080f1c5e58d5b085b90e3564095e) --- scene/3d/baked_lightmap.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scene/3d/baked_lightmap.cpp b/scene/3d/baked_lightmap.cpp index af044fccdb48..deae87aa5811 100644 --- a/scene/3d/baked_lightmap.cpp +++ b/scene/3d/baked_lightmap.cpp @@ -1457,7 +1457,7 @@ int BakedLightmap::get_bounces() const { } void BakedLightmap::set_bias(float p_bias) { - ERR_FAIL_COND(p_bias < 0.00001); + ERR_FAIL_COND(p_bias < 0.00001f); bias = p_bias; } From 703c290b71e9fa3a9677d8f3f1d01db52c640225 Mon Sep 17 00:00:00 2001 From: PouleyKetchoupp Date: Mon, 26 Apr 2021 11:42:46 -0700 Subject: [PATCH 48/84] Fix skinning initialization in MeshInstance when loaded from thread Fix for a regression from software skinning support: instance_attach_skeleton wasn't called in set_mesh before, and it's causing issues when the mesh instance is loaded from a thread. 1. Call from a thread queues instance_attach_skeleton with RID() in the visual server. 2. Call from the main thread when entering tree calls instance_attach_skeleton immediately with a valid skeleton 3. Queued instance_attach_skeleton resets the attached skeleton This change prevents that to happen by making sure instance_attach_skeleton is not called on set_mesh as it was doing before, but there might be a more general problem to solve in how visual server commands are executed when resources are loaded from a different thread. (cherry picked from commit feee9f9695e988c0e7192f9c3cc452349e400e4d) --- scene/3d/mesh_instance.cpp | 16 +++++++++++----- scene/3d/mesh_instance.h | 2 +- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/scene/3d/mesh_instance.cpp b/scene/3d/mesh_instance.cpp index 1e6886c148ec..09225df9b761 100644 --- a/scene/3d/mesh_instance.cpp +++ b/scene/3d/mesh_instance.cpp @@ -143,7 +143,7 @@ void MeshInstance::set_mesh(const Ref &p_mesh) { mesh->connect(CoreStringNames::get_singleton()->changed, this, SceneStringNames::get_singleton()->_mesh_changed); materials.resize(mesh->get_surface_count()); - _initialize_skinning(); + _initialize_skinning(false, false); } else { set_base(RID()); @@ -208,7 +208,7 @@ bool MeshInstance::_is_software_skinning_enabled() const { return global_software_skinning; } -void MeshInstance::_initialize_skinning(bool p_force_reset) { +void MeshInstance::_initialize_skinning(bool p_force_reset, bool p_call_attach_skeleton) { if (mesh.is_null()) { return; } @@ -324,7 +324,9 @@ void MeshInstance::_initialize_skinning(bool p_force_reset) { update_mesh = true; } - visual_server->instance_attach_skeleton(get_instance(), RID()); + if (p_call_attach_skeleton) { + visual_server->instance_attach_skeleton(get_instance(), RID()); + } if (is_visible_in_tree() && (software_skinning_flags & SoftwareSkinning::FLAG_BONES_READY)) { // Intialize from current skeleton pose. 
@@ -336,7 +338,9 @@ void MeshInstance::_initialize_skinning(bool p_force_reset) { skin_ref->get_skeleton_node()->disconnect("skeleton_updated", this, "_update_skinning"); } - visual_server->instance_attach_skeleton(get_instance(), skin_ref->get_skeleton()); + if (p_call_attach_skeleton) { + visual_server->instance_attach_skeleton(get_instance(), skin_ref->get_skeleton()); + } if (software_skinning) { memdelete(software_skinning); @@ -345,7 +349,9 @@ void MeshInstance::_initialize_skinning(bool p_force_reset) { } } } else { - visual_server->instance_attach_skeleton(get_instance(), RID()); + if (p_call_attach_skeleton) { + visual_server->instance_attach_skeleton(get_instance(), RID()); + } if (software_skinning) { memdelete(software_skinning); software_skinning = nullptr; diff --git a/scene/3d/mesh_instance.h b/scene/3d/mesh_instance.h index 151dbd601948..fe874b25c6e6 100644 --- a/scene/3d/mesh_instance.h +++ b/scene/3d/mesh_instance.h @@ -93,7 +93,7 @@ class MeshInstance : public GeometryInstance { bool _is_software_skinning_enabled() const; static bool _is_global_software_skinning_enabled(); - void _initialize_skinning(bool p_force_reset = false); + void _initialize_skinning(bool p_force_reset = false, bool p_call_attach_skeleton = true); void _update_skinning(); protected: From 62d80ba15d7ea3a5f2a28b925058c91e63171325 Mon Sep 17 00:00:00 2001 From: PouleyKetchoupp Date: Mon, 3 May 2021 17:48:23 -0700 Subject: [PATCH 49/84] Allow values > 1 for friction and bounce in PhysicsMaterial (cherry picked from commit 67987be6448c612b116188e088bef07e29fe5faa) --- scene/resources/physics_material.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scene/resources/physics_material.cpp b/scene/resources/physics_material.cpp index aa35276af963..d89e61ee3a4d 100644 --- a/scene/resources/physics_material.cpp +++ b/scene/resources/physics_material.cpp @@ -44,9 +44,9 @@ void PhysicsMaterial::_bind_methods() { ClassDB::bind_method(D_METHOD("set_absorbent", "absorbent"), &PhysicsMaterial::set_absorbent); ClassDB::bind_method(D_METHOD("is_absorbent"), &PhysicsMaterial::is_absorbent); - ADD_PROPERTY(PropertyInfo(Variant::REAL, "friction", PROPERTY_HINT_RANGE, "0,1,0.01"), "set_friction", "get_friction"); + ADD_PROPERTY(PropertyInfo(Variant::REAL, "friction", PROPERTY_HINT_RANGE, "0,1,0.01,or_greater"), "set_friction", "get_friction"); ADD_PROPERTY(PropertyInfo(Variant::BOOL, "rough"), "set_rough", "is_rough"); - ADD_PROPERTY(PropertyInfo(Variant::REAL, "bounce", PROPERTY_HINT_RANGE, "0,1,0.01"), "set_bounce", "get_bounce"); + ADD_PROPERTY(PropertyInfo(Variant::REAL, "bounce", PROPERTY_HINT_RANGE, "0,1,0.01,or_greater"), "set_bounce", "get_bounce"); ADD_PROPERTY(PropertyInfo(Variant::BOOL, "absorbent"), "set_absorbent", "is_absorbent"); } From 045b85b6e064f1229d2b5d0c522037a2473b5f4f Mon Sep 17 00:00:00 2001 From: kleonc <9283098+kleonc@users.noreply.github.com> Date: Mon, 3 May 2021 21:16:29 +0200 Subject: [PATCH 50/84] Make posmod use int64_t instead of int (cherry picked from commit f04a964627d469e6de4227ebe6301ba18757b9de) --- core/math/math_funcs.h | 4 ++-- modules/gdscript/gdscript_functions.cpp | 2 +- modules/visual_script/visual_script_builtin_funcs.cpp | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/core/math/math_funcs.h b/core/math/math_funcs.h index f4259e809086..332e3d344858 100644 --- a/core/math/math_funcs.h +++ b/core/math/math_funcs.h @@ -199,8 +199,8 @@ class Math { value += 0.0; return value; } - static _ALWAYS_INLINE_ int posmod(int p_x, int p_y) { 
- int value = p_x % p_y; + static _ALWAYS_INLINE_ int64_t posmod(int64_t p_x, int64_t p_y) { + int64_t value = p_x % p_y; if ((value < 0 && p_y > 0) || (value > 0 && p_y < 0)) { value += p_y; } diff --git a/modules/gdscript/gdscript_functions.cpp b/modules/gdscript/gdscript_functions.cpp index 50a4ecea1bdb..06f361070d34 100644 --- a/modules/gdscript/gdscript_functions.cpp +++ b/modules/gdscript/gdscript_functions.cpp @@ -250,7 +250,7 @@ void GDScriptFunctions::call(Function p_func, const Variant **p_args, int p_arg_ VALIDATE_ARG_COUNT(2); VALIDATE_ARG_NUM(0); VALIDATE_ARG_NUM(1); - r_ret = Math::posmod((int)*p_args[0], (int)*p_args[1]); + r_ret = Math::posmod((int64_t)*p_args[0], (int64_t)*p_args[1]); } break; case MATH_FLOOR: { VALIDATE_ARG_COUNT(1); diff --git a/modules/visual_script/visual_script_builtin_funcs.cpp b/modules/visual_script/visual_script_builtin_funcs.cpp index 3cdd62368c35..cb868a9a15fd 100644 --- a/modules/visual_script/visual_script_builtin_funcs.cpp +++ b/modules/visual_script/visual_script_builtin_funcs.cpp @@ -760,7 +760,7 @@ void VisualScriptBuiltinFunc::exec_func(BuiltinFunc p_func, const Variant **p_in VALIDATE_ARG_NUM(0); VALIDATE_ARG_NUM(1); - *r_return = Math::posmod((int)*p_inputs[0], (int)*p_inputs[1]); + *r_return = Math::posmod((int64_t)*p_inputs[0], (int64_t)*p_inputs[1]); } break; case VisualScriptBuiltinFunc::MATH_FLOOR: { From 508cd0bb124a3f5dfc0a0913f5606e845f9d57d3 Mon Sep 17 00:00:00 2001 From: Koala Date: Sun, 25 Apr 2021 20:59:37 +0100 Subject: [PATCH 51/84] Fix indent left line selection (cherry picked from commit 2c64008718ecb92b464e71b0981163068b52a74c) --- scene/gui/text_edit.cpp | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/scene/gui/text_edit.cpp b/scene/gui/text_edit.cpp index effeb1812795..2e4701d3f0ce 100644 --- a/scene/gui/text_edit.cpp +++ b/scene/gui/text_edit.cpp @@ -2102,6 +2102,7 @@ void TextEdit::indent_left() { if (is_selection_active() && get_selection_to_column() == 0) { end_line--; } + String first_line_text = get_line(start_line); String last_line_text = get_line(end_line); for (int i = start_line; i <= end_line; i++) { @@ -2126,10 +2127,17 @@ void TextEdit::indent_left() { } } - // Fix selection and cursor being off by one on the last line. - if (is_selection_active() && last_line_text != get_line(end_line)) { - select(selection.from_line, selection.from_column - removed_characters, - selection.to_line, initial_selection_end_column - removed_characters); + if (is_selection_active()) { + // Fix selection being off by one on the first line. + if (first_line_text != get_line(start_line)) { + select(selection.from_line, selection.from_column - removed_characters, + selection.to_line, initial_selection_end_column); + } + // Fix selection being off by one on the last line. 
+ if (last_line_text != get_line(end_line)) { + select(selection.from_line, selection.from_column, + selection.to_line, initial_selection_end_column - removed_characters); + } } cursor_set_column(initial_cursor_column - removed_characters, false); end_complex_operation(); From 3c089f6a4f77e1bd95fb1fdfbea6c158f13a8759 Mon Sep 17 00:00:00 2001 From: Mateo Kuruk Miccino Date: Sun, 28 Feb 2021 17:41:11 -0300 Subject: [PATCH 52/84] LineEdit: Now double click to select a word, and triple click to select all the content (cherry picked from commit 74b30216910076243828e56b8583f66c45246030) --- scene/gui/line_edit.cpp | 34 ++++++++++++++++++++++++++++------ scene/gui/line_edit.h | 1 + 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/scene/gui/line_edit.cpp b/scene/gui/line_edit.cpp index 40b799a5a29b..20fbac27eab5 100644 --- a/scene/gui/line_edit.cpp +++ b/scene/gui/line_edit.cpp @@ -88,12 +88,34 @@ void LineEdit::_gui_input(Ref p_event) { } else { - if (b->is_doubleclick() && selecting_enabled) { - - selection.enabled = true; - selection.begin = 0; - selection.end = text.length(); - selection.doubleclick = true; + if (selecting_enabled) { + if (!b->is_doubleclick() && (OS::get_singleton()->get_ticks_msec() - selection.last_dblclk) < 600) { + // Triple-click select all. + selection.enabled = true; + selection.begin = 0; + selection.end = text.length(); + selection.doubleclick = true; + selection.last_dblclk = 0; + } else if (b->is_doubleclick()) { + // Double-click select word. + selection.enabled = true; + int beg = cursor_pos; + int end = beg; + bool symbol = beg < text.length() && is_symbol(text[beg]); + while (beg > 0 && text[beg - 1] > 32 && (symbol == is_symbol(text[beg - 1]))) { + beg--; + } + while (end < text.length() && text[end + 1] > 32 && (symbol == is_symbol(text[end + 1]))) { + end++; + } + if (end < text.length()) { + end += 1; + } + selection.begin = beg; + selection.end = end; + selection.doubleclick = true; + selection.last_dblclk = OS::get_singleton()->get_ticks_msec(); + } } selection.drag_attempt = false; diff --git a/scene/gui/line_edit.h b/scene/gui/line_edit.h index d1ebda9bf605..85df7cbb0359 100644 --- a/scene/gui/line_edit.h +++ b/scene/gui/line_edit.h @@ -104,6 +104,7 @@ class LineEdit : public Control { bool creating; bool doubleclick; bool drag_attempt; + uint64_t last_dblclk = 0; } selection; struct TextOperation { From c5332b1d7a354a48de0f8d83663c7a3eddbde18a Mon Sep 17 00:00:00 2001 From: MaxStgs Date: Mon, 3 May 2021 19:54:03 +0500 Subject: [PATCH 53/84] Check PHashTranslation generate p_from is valid (cherry picked from commit 0bb40df4bb65e2c784f29baf4e98d7bf84233804) --- core/compressed_translation.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/core/compressed_translation.cpp b/core/compressed_translation.cpp index 19851c4fad8b..9085802d56cf 100644 --- a/core/compressed_translation.cpp +++ b/core/compressed_translation.cpp @@ -45,6 +45,7 @@ struct _PHashTranslationCmp { void PHashTranslation::generate(const Ref &p_from) { #ifdef TOOLS_ENABLED + ERR_FAIL_COND(p_from.is_null()); List keys; p_from->get_message_list(&keys); From 06136d433bb55401505a51f0e18303647cfae15c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Verschelde?= Date: Thu, 17 Dec 2020 16:01:36 +0100 Subject: [PATCH 54/84] SCons: Add explicit dependencies on thirdparty code in cloned env Since we clone the environments to build thirdparty code, we don't get an explicit dependency on the build objects produced by that environment. 
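Concretely, the pattern this commit adds to each affected SCsub boils down to the simplified sketch below (thirdparty_obj and module_obj follow the naming in the diff; env_module stands in for the per-module environment and is an assumption of the sketch):

    # Build the bundled thirdparty sources into their own object list.
    thirdparty_obj = []
    env_thirdparty = env_module.Clone()
    env_thirdparty.disable_warnings()
    env_thirdparty.add_source_files(thirdparty_obj, thirdparty_sources)
    env.modules_sources += thirdparty_obj

    # Build Godot's own sources into a separate list...
    module_obj = []
    env_module.add_source_files(module_obj, "*.cpp")
    env.modules_sources += module_obj

    # ...and declare the dependency explicitly, so SCons rebuilds the module
    # objects whenever the thirdparty objects change.
    env.Depends(module_obj, thirdparty_obj)

Without that explicit Depends edge, the objects produced in the cloned environment are effectively invisible to the objects built from the parent environment.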
So when we update thirdparty code, Godot code using it is not necessarily rebuilt (I think it is for changed headers, but not for changed .c/.cpp files), which can lead to an invalid compilation output (linking old Godot .o files with a newer, potentially ABI breaking version of thirdparty code). This was only seen as really problematic with bullet updates (leading to crashes when rebuilding Godot after a bullet update without cleaning .o files), but it's safer to fix it everywhere, even if it's a LOT of hacky boilerplate. (cherry picked from commit c7b53c03ae7f7feb45a6023ee5cf764025ebb5e1) (cherry picked from commit e94161dada6d68bdab447114bc7faaef9e87099b) --- core/SCsub | 20 +++++++++++++++----- core/crypto/SCsub | 15 +++++++++++++-- drivers/gl_context/SCsub | 15 +++++++++++++-- drivers/png/SCsub | 19 +++++++++++++++---- modules/bullet/SCsub | 14 ++++++++++++-- modules/cvtt/SCsub | 15 +++++++++++++-- modules/denoise/SCsub | 18 +++++++++++++++--- modules/enet/SCsub | 16 ++++++++++++++-- modules/etc/SCsub | 15 +++++++++++++-- modules/freetype/SCsub | 15 ++++++++++++++- modules/jpg/SCsub | 17 ++++++++++++++--- modules/mbedtls/SCsub | 21 +++++++++++++++++---- modules/ogg/SCsub | 16 ++++++++++++++-- modules/opensimplex/SCsub | 17 ++++++++++++++--- modules/opus/SCsub | 19 ++++++++++++++++--- modules/pvr/SCsub | 15 +++++++++++++-- modules/raycast/SCsub | 14 ++++++++++++-- modules/recast/SCsub | 15 +++++++++++++-- modules/regex/SCsub | 18 ++++++++++++++++-- modules/squish/SCsub | 16 ++++++++++++++-- modules/stb_vorbis/SCsub | 17 ++++++++++++++--- modules/svg/SCsub | 17 ++++++++++++++--- modules/theora/SCsub | 16 ++++++++++++++-- modules/upnp/SCsub | 15 +++++++++++++-- modules/vhacd/SCsub | 17 ++++++++++++++--- modules/vorbis/SCsub | 20 +++++++++++++++----- modules/webm/SCsub | 15 +++++++++++++-- modules/webp/SCsub | 16 ++++++++++++++-- modules/webrtc/SCsub | 2 -- modules/websocket/SCsub | 32 ++++++++++++++++++++++++-------- modules/xatlas_unwrap/SCsub | 16 ++++++++++++++-- platform/android/SCsub | 6 +++++- scene/SCsub | 18 +----------------- scene/animation/SCsub | 21 ++++++++++++++++++++- scene/resources/SCsub | 21 ++++++++++++++++++++- servers/camera/SCsub | 2 -- 36 files changed, 475 insertions(+), 106 deletions(-) diff --git a/core/SCsub b/core/SCsub index 579c99b81245..aaa59618f36e 100644 --- a/core/SCsub +++ b/core/SCsub @@ -40,6 +40,9 @@ with open("script_encryption_key.gen.cpp", "w") as f: # Add required thirdparty code. 
+ +thirdparty_obj = [] + env_thirdparty = env.Clone() env_thirdparty.disable_warnings() @@ -57,7 +60,7 @@ thirdparty_misc_sources = [ "clipper.cpp", ] thirdparty_misc_sources = [thirdparty_misc_dir + file for file in thirdparty_misc_sources] -env_thirdparty.add_source_files(env.core_sources, thirdparty_misc_sources) +env_thirdparty.add_source_files(thirdparty_obj, thirdparty_misc_sources) # Zlib library, can be unbundled if env["builtin_zlib"]: @@ -83,7 +86,7 @@ if env["builtin_zlib"]: if env["target"] == "debug": env_thirdparty.Append(CPPDEFINES=["ZLIB_DEBUG"]) - env_thirdparty.add_source_files(env.core_sources, thirdparty_zlib_sources) + env_thirdparty.add_source_files(thirdparty_obj, thirdparty_zlib_sources) # Minizip library, could be unbundled in theory # However, our version has some custom modifications, so it won't compile with the system one @@ -94,7 +97,7 @@ thirdparty_minizip_sources = [ "zip.c", ] thirdparty_minizip_sources = [thirdparty_minizip_dir + file for file in thirdparty_minizip_sources] -env_thirdparty.add_source_files(env.core_sources, thirdparty_minizip_sources) +env_thirdparty.add_source_files(thirdparty_obj, thirdparty_minizip_sources) # Zstd library, can be unbundled in theory # though we currently use some private symbols @@ -136,10 +139,14 @@ if env["builtin_zstd"]: # Also needed in main env includes will trigger warnings env.Append(CPPDEFINES=["ZSTD_STATIC_LINKING_ONLY"]) - env_thirdparty.add_source_files(env.core_sources, thirdparty_zstd_sources) + env_thirdparty.add_source_files(thirdparty_obj, thirdparty_zstd_sources) + +env.core_sources += thirdparty_obj + + +# Godot source files -# Godot's own sources env.add_source_files(env.core_sources, "*.cpp") # Certificates @@ -185,3 +192,6 @@ SConscript("bind/SCsub") # Build it all as a library lib = env.add_library("core", env.core_sources) env.Prepend(LIBS=[lib]) + +# Needed to force rebuilding the core files when the thirdparty code is updated. +env.Depends(lib, thirdparty_obj) diff --git a/core/crypto/SCsub b/core/crypto/SCsub index da4a9c9381d9..4f3104d84be7 100644 --- a/core/crypto/SCsub +++ b/core/crypto/SCsub @@ -6,6 +6,7 @@ env_crypto = env.Clone() is_builtin = env["builtin_mbedtls"] has_module = env["module_mbedtls_enabled"] +thirdparty_obj = [] if is_builtin or not has_module: # Use our headers for builtin or if the module is not going to be compiled. @@ -35,6 +36,16 @@ if not has_module: "godot_core_mbedtls_platform.c", ] thirdparty_mbedtls_sources = [thirdparty_mbedtls_dir + file for file in thirdparty_mbedtls_sources] - env_thirdparty.add_source_files(env.core_sources, thirdparty_mbedtls_sources) + env_thirdparty.add_source_files(thirdparty_obj, thirdparty_mbedtls_sources) + env.core_sources += thirdparty_obj -env_crypto.add_source_files(env.core_sources, "*.cpp") + +# Godot source files + +core_obj = [] + +env_crypto.add_source_files(core_obj, "*.cpp") +env.core_sources += core_obj + +# Needed to force rebuilding the core files when the thirdparty library is updated. 
+env.Depends(core_obj, thirdparty_obj) diff --git a/drivers/gl_context/SCsub b/drivers/gl_context/SCsub index e2e499f5c2af..3084ddd9591d 100644 --- a/drivers/gl_context/SCsub +++ b/drivers/gl_context/SCsub @@ -2,6 +2,8 @@ Import("env") +thirdparty_obj = [] + if env["platform"] in ["haiku", "osx", "windows", "x11"]: # Thirdparty source files thirdparty_dir = "#thirdparty/glad/" @@ -17,7 +19,16 @@ if env["platform"] in ["haiku", "osx", "windows", "x11"]: env_thirdparty = env.Clone() env_thirdparty.disable_warnings() - env_thirdparty.add_source_files(env.drivers_sources, thirdparty_sources) + env_thirdparty.add_source_files(thirdparty_obj, thirdparty_sources) + env.drivers_sources += thirdparty_obj + # Godot source files -env.add_source_files(env.drivers_sources, "*.cpp") + +driver_obj = [] + +env.add_source_files(driver_obj, "*.cpp") +env.drivers_sources += driver_obj + +# Needed to force rebuilding the driver files when the thirdparty code is updated. +env.Depends(driver_obj, thirdparty_obj) diff --git a/drivers/png/SCsub b/drivers/png/SCsub index db08be0c47c0..26508dc6121d 100644 --- a/drivers/png/SCsub +++ b/drivers/png/SCsub @@ -5,6 +5,9 @@ Import("env") env_png = env.Clone() # Thirdparty source files + +thirdparty_obj = [] + if env["builtin_libpng"]: thirdparty_dir = "#thirdparty/libpng/" thirdparty_sources = [ @@ -41,7 +44,7 @@ if env["builtin_libpng"]: env_thirdparty = env_png.Clone() env_thirdparty.disable_warnings() - env_thirdparty.add_source_files(env.drivers_sources, thirdparty_sources) + env_thirdparty.add_source_files(thirdparty_obj, thirdparty_sources) if use_neon: env_neon = env_thirdparty.Clone() @@ -52,9 +55,17 @@ if env["builtin_libpng"]: neon_sources.append(env_neon.Object(thirdparty_dir + "/arm/filter_neon_intrinsics.c")) neon_sources.append(env_neon.Object(thirdparty_dir + "/arm/filter_neon.S")) neon_sources.append(env_neon.Object(thirdparty_dir + "/arm/palette_neon_intrinsics.c")) - env.drivers_sources += neon_sources + thirdparty_obj += neon_sources + + env.drivers_sources += thirdparty_obj + # Godot source files -env_png.add_source_files(env.drivers_sources, "*.cpp") -Export("env") +driver_obj = [] + +env_png.add_source_files(driver_obj, "*.cpp") +env.drivers_sources += driver_obj + +# Needed to force rebuilding the driver files when the thirdparty library is updated. +env.Depends(driver_obj, thirdparty_obj) diff --git a/modules/bullet/SCsub b/modules/bullet/SCsub index 14ee85e05146..6fce7d9ee166 100644 --- a/modules/bullet/SCsub +++ b/modules/bullet/SCsub @@ -7,6 +7,8 @@ env_bullet = env_modules.Clone() # Thirdparty source files +thirdparty_obj = [] + if env["builtin_bullet"]: # Build only version 2 for now (as of 2.89) # Sync file list with relevant upstream CMakeLists.txt for each folder. @@ -207,8 +209,16 @@ if env["builtin_bullet"]: env_thirdparty = env_bullet.Clone() env_thirdparty.disable_warnings() - env_thirdparty.add_source_files(env.modules_sources, thirdparty_sources) + env_thirdparty.add_source_files(thirdparty_obj, thirdparty_sources) + env.modules_sources += thirdparty_obj # Godot source files -env_bullet.add_source_files(env.modules_sources, "*.cpp") + +module_obj = [] + +env_bullet.add_source_files(module_obj, "*.cpp") +env.modules_sources += module_obj + +# Needed to force rebuilding the module files when the thirdparty library is updated. 
+env.Depends(module_obj, thirdparty_obj) diff --git a/modules/cvtt/SCsub b/modules/cvtt/SCsub index 3a27a5994568..bd83ce0032ac 100644 --- a/modules/cvtt/SCsub +++ b/modules/cvtt/SCsub @@ -6,6 +6,9 @@ Import("env_modules") env_cvtt = env_modules.Clone() # Thirdparty source files + +thirdparty_obj = [] + thirdparty_dir = "#thirdparty/cvtt/" thirdparty_sources = ["ConvectionKernels.cpp"] @@ -15,7 +18,15 @@ env_cvtt.Prepend(CPPPATH=[thirdparty_dir]) env_thirdparty = env_cvtt.Clone() env_thirdparty.disable_warnings() -env_thirdparty.add_source_files(env.modules_sources, thirdparty_sources) +env_thirdparty.add_source_files(thirdparty_obj, thirdparty_sources) +env.modules_sources += thirdparty_obj # Godot source files -env_cvtt.add_source_files(env.modules_sources, "*.cpp") + +module_obj = [] + +env_cvtt.add_source_files(module_obj, "*.cpp") +env.modules_sources += module_obj + +# Needed to force rebuilding the module files when the thirdparty library is updated. +env.Depends(module_obj, thirdparty_obj) diff --git a/modules/denoise/SCsub b/modules/denoise/SCsub index a202d1c7c04c..83193f5222da 100644 --- a/modules/denoise/SCsub +++ b/modules/denoise/SCsub @@ -8,6 +8,9 @@ Import("env_modules") env_oidn = env_modules.Clone() # Thirdparty source files + +thirdparty_obj = [] + thirdparty_dir = "#thirdparty/oidn/" thirdparty_sources = [ "core/api.cpp", @@ -106,7 +109,8 @@ env_oidn.Append( env_thirdparty = env_oidn.Clone() env_thirdparty.disable_warnings() -env_thirdparty.add_source_files(env.modules_sources, thirdparty_sources) +env_thirdparty.add_source_files(thirdparty_obj, thirdparty_sources) +env.modules_sources += thirdparty_obj if env["platform"] == "windows" and not env.msvc: env_thirdparty.Append(CPPFLAGS=["-mstackrealign"]) @@ -117,5 +121,13 @@ weights_out_path = thirdparty_dir + "weights/rtlightmap_hdr.gen.cpp" env_thirdparty.Depends(weights_out_path, weights_in_path) env_thirdparty.CommandNoCache(weights_out_path, weights_in_path, resource_to_cpp.tza_to_cpp) -env_oidn.add_source_files(env.modules_sources, "denoise_wrapper.cpp") -env_modules.add_source_files(env.modules_sources, ["register_types.cpp", "lightmap_denoiser.cpp"]) +# Godot source files + +module_obj = [] + +env_oidn.add_source_files(module_obj, "denoise_wrapper.cpp") +env_modules.add_source_files(module_obj, ["register_types.cpp", "lightmap_denoiser.cpp"]) +env.modules_sources += module_obj + +# Needed to force rebuilding the module files when the thirdparty library is updated. +env.Depends(module_obj, thirdparty_obj) diff --git a/modules/enet/SCsub b/modules/enet/SCsub index c8f4b3885e40..580e5a3eb021 100644 --- a/modules/enet/SCsub +++ b/modules/enet/SCsub @@ -7,6 +7,8 @@ env_enet = env_modules.Clone() # Thirdparty source files +thirdparty_obj = [] + if env["builtin_enet"]: thirdparty_dir = "#thirdparty/enet/" thirdparty_sources = [ @@ -26,6 +28,16 @@ if env["builtin_enet"]: env_thirdparty = env_enet.Clone() env_thirdparty.disable_warnings() - env_thirdparty.add_source_files(env.modules_sources, thirdparty_sources) + env_thirdparty.add_source_files(thirdparty_obj, thirdparty_sources) + env.modules_sources += thirdparty_obj + + +# Godot source files + +module_obj = [] + +env_enet.add_source_files(module_obj, "*.cpp") +env.modules_sources += module_obj -env_enet.add_source_files(env.modules_sources, "*.cpp") +# Needed to force rebuilding the module files when the thirdparty library is updated. 
+env.Depends(module_obj, thirdparty_obj) diff --git a/modules/etc/SCsub b/modules/etc/SCsub index 383bbf83c3e0..9b46f1791694 100644 --- a/modules/etc/SCsub +++ b/modules/etc/SCsub @@ -6,6 +6,9 @@ Import("env_modules") env_etc = env_modules.Clone() # Thirdparty source files + +thirdparty_obj = [] + # Not unbundled so far since not widespread as shared library thirdparty_dir = "#thirdparty/etc2comp/" thirdparty_sources = [ @@ -31,7 +34,15 @@ env_etc.Prepend(CPPPATH=[thirdparty_dir]) env_thirdparty = env_etc.Clone() env_thirdparty.disable_warnings() -env_thirdparty.add_source_files(env.modules_sources, thirdparty_sources) +env_thirdparty.add_source_files(thirdparty_obj, thirdparty_sources) +env.modules_sources += thirdparty_obj # Godot source files -env_etc.add_source_files(env.modules_sources, "*.cpp") + +module_obj = [] + +env_etc.add_source_files(module_obj, "*.cpp") +env.modules_sources += module_obj + +# Needed to force rebuilding the module files when the thirdparty library is updated. +env.Depends(module_obj, thirdparty_obj) diff --git a/modules/freetype/SCsub b/modules/freetype/SCsub index cb9eb36b410c..cc0fcaa1169e 100644 --- a/modules/freetype/SCsub +++ b/modules/freetype/SCsub @@ -8,6 +8,9 @@ from compat import isbasestring env_freetype = env_modules.Clone() # Thirdparty source files + +thirdparty_obj = [] + if env["builtin_freetype"]: thirdparty_dir = "#thirdparty/freetype/" thirdparty_sources = [ @@ -86,6 +89,7 @@ if env["builtin_freetype"]: env_thirdparty = env_freetype.Clone() env_thirdparty.disable_warnings() lib = env_thirdparty.add_library("freetype_builtin", thirdparty_sources) + thirdparty_obj += lib # Needs to be appended to arrive after libscene in the linker call, # but we don't want it to arrive *after* system libs, so manual hack @@ -100,7 +104,16 @@ if env["builtin_freetype"]: if not inserted: env.Append(LIBS=[lib]) + # Godot source files -env_freetype.add_source_files(env.modules_sources, "*.cpp") + +module_obj = [] + +env_freetype.add_source_files(module_obj, "*.cpp") +env.modules_sources += module_obj + # Used in scene/, needs to be in main env env.Append(CPPDEFINES=["FREETYPE_ENABLED"]) + +# Needed to force rebuilding the module files when the thirdparty library is updated. +env.Depends(module_obj, thirdparty_obj) diff --git a/modules/jpg/SCsub b/modules/jpg/SCsub index 8ee8e6dd6ee6..7c6ceeea29fc 100644 --- a/modules/jpg/SCsub +++ b/modules/jpg/SCsub @@ -6,6 +6,9 @@ Import("env_modules") env_jpg = env_modules.Clone() # Thirdparty source files + +thirdparty_obj = [] + # Not unbundled for now as they are not commonly available as shared library thirdparty_dir = "#thirdparty/jpeg-compressor/" thirdparty_sources = [ @@ -17,7 +20,15 @@ env_jpg.Prepend(CPPPATH=[thirdparty_dir]) env_thirdparty = env_jpg.Clone() env_thirdparty.disable_warnings() -env_thirdparty.add_source_files(env.modules_sources, thirdparty_sources) +env_thirdparty.add_source_files(thirdparty_obj, thirdparty_sources) +env.modules_sources += thirdparty_obj + +# Godot source files + +module_obj = [] + +env_jpg.add_source_files(module_obj, "*.cpp") +env.modules_sources += module_obj -# Godot's own source files -env_jpg.add_source_files(env.modules_sources, "*.cpp") +# Needed to force rebuilding the module files when the thirdparty library is updated. 
+env.Depends(module_obj, thirdparty_obj) diff --git a/modules/mbedtls/SCsub b/modules/mbedtls/SCsub index 5f5d25a3ee2e..295fad49b439 100644 --- a/modules/mbedtls/SCsub +++ b/modules/mbedtls/SCsub @@ -5,8 +5,11 @@ Import("env_modules") env_mbed_tls = env_modules.Clone() +# Thirdparty source files + +thirdparty_obj = [] + if env["builtin_mbedtls"]: - # Thirdparty source files thirdparty_sources = [ "aes.c", "aesni.c", @@ -96,7 +99,17 @@ if env["builtin_mbedtls"]: env_thirdparty = env_mbed_tls.Clone() env_thirdparty.disable_warnings() - env_thirdparty.add_source_files(env.modules_sources, thirdparty_sources) + env_thirdparty.add_source_files(thirdparty_obj, thirdparty_sources) + env.modules_sources += thirdparty_obj + + +# Godot source files + +module_obj = [] + +env_mbed_tls.add_source_files(module_obj, "*.cpp") + +env.modules_sources += module_obj -# Module sources -env_mbed_tls.add_source_files(env.modules_sources, "*.cpp") +# Needed to force rebuilding the module files when the thirdparty library is updated. +env.Depends(module_obj, thirdparty_obj) diff --git a/modules/ogg/SCsub b/modules/ogg/SCsub index e768fb4ae848..e415d9249852 100644 --- a/modules/ogg/SCsub +++ b/modules/ogg/SCsub @@ -9,6 +9,9 @@ Import("env_modules") env_ogg = env_modules.Clone() # Thirdparty source files + +thirdparty_obj = [] + if env["builtin_libogg"]: thirdparty_dir = "#thirdparty/libogg/" thirdparty_sources = [ @@ -21,7 +24,16 @@ if env["builtin_libogg"]: env_thirdparty = env_ogg.Clone() env_thirdparty.disable_warnings() - env_thirdparty.add_source_files(env.modules_sources, thirdparty_sources) + env_thirdparty.add_source_files(thirdparty_obj, thirdparty_sources) + env.modules_sources += thirdparty_obj + # Godot source files -env_ogg.add_source_files(env.modules_sources, "*.cpp") + +module_obj = [] + +env_ogg.add_source_files(module_obj, "*.cpp") +env.modules_sources += module_obj + +# Needed to force rebuilding the module files when the thirdparty library is updated. +env.Depends(module_obj, thirdparty_obj) diff --git a/modules/opensimplex/SCsub b/modules/opensimplex/SCsub index 52d8b145ef30..86d77c3dfbd0 100644 --- a/modules/opensimplex/SCsub +++ b/modules/opensimplex/SCsub @@ -6,6 +6,9 @@ Import("env_modules") env_opensimplex = env_modules.Clone() # Thirdparty source files + +thirdparty_obj = [] + thirdparty_dir = "#thirdparty/misc/" thirdparty_sources = [ "open-simplex-noise.c", @@ -16,7 +19,15 @@ env_opensimplex.Prepend(CPPPATH=[thirdparty_dir]) env_thirdparty = env_opensimplex.Clone() env_thirdparty.disable_warnings() -env_thirdparty.add_source_files(env.modules_sources, thirdparty_sources) +env_thirdparty.add_source_files(thirdparty_obj, thirdparty_sources) +env.modules_sources += thirdparty_obj + +# Godot source files + +module_obj = [] + +env_opensimplex.add_source_files(module_obj, "*.cpp") +env.modules_sources += module_obj -# Godot's own source files -env_opensimplex.add_source_files(env.modules_sources, "*.cpp") +# Needed to force rebuilding the module files when the thirdparty library is updated. 
+env.Depends(module_obj, thirdparty_obj) diff --git a/modules/opus/SCsub b/modules/opus/SCsub index 52c61fa70802..1437cd86df2f 100644 --- a/modules/opus/SCsub +++ b/modules/opus/SCsub @@ -9,6 +9,10 @@ Import("env_modules") env_opus = env_modules.Clone() +# Thirdparty source files + +thirdparty_obj = [] + # Thirdparty source files if env["builtin_opus"]: thirdparty_dir = "#thirdparty/opus/" @@ -233,7 +237,16 @@ if env["builtin_opus"]: env_thirdparty = env_opus.Clone() env_thirdparty.disable_warnings() - env_thirdparty.add_source_files(env.modules_sources, thirdparty_sources) + env_thirdparty.add_source_files(thirdparty_obj, thirdparty_sources) + env.modules_sources += thirdparty_obj + + +# Godot source files + +module_obj = [] + +env_opus.add_source_files(module_obj, "*.cpp") +env.modules_sources += module_obj -# Module files -env_opus.add_source_files(env.modules_sources, "register_types.cpp") +# Needed to force rebuilding the module files when the thirdparty library is updated. +env.Depends(module_obj, thirdparty_obj) diff --git a/modules/pvr/SCsub b/modules/pvr/SCsub index e0baf851f18d..36052cffed5b 100644 --- a/modules/pvr/SCsub +++ b/modules/pvr/SCsub @@ -6,6 +6,9 @@ Import("env_modules") env_pvr = env_modules.Clone() # Thirdparty source files + +thirdparty_obj = [] + # Not unbundled so far since not widespread as shared library thirdparty_dir = "#thirdparty/pvrtccompressor/" thirdparty_sources = [ @@ -21,7 +24,15 @@ env_pvr.Prepend(CPPPATH=[thirdparty_dir]) env_thirdparty = env_pvr.Clone() env_thirdparty.disable_warnings() -env_thirdparty.add_source_files(env.modules_sources, thirdparty_sources) +env_thirdparty.add_source_files(thirdparty_obj, thirdparty_sources) +env.modules_sources += thirdparty_obj # Godot source files -env_pvr.add_source_files(env.modules_sources, "*.cpp") + +module_obj = [] + +env_pvr.add_source_files(module_obj, "*.cpp") +env.modules_sources += module_obj + +# Needed to force rebuilding the module files when the thirdparty library is updated. +env.Depends(module_obj, thirdparty_obj) diff --git a/modules/raycast/SCsub b/modules/raycast/SCsub index e6808d76ba1c..978428e85cce 100644 --- a/modules/raycast/SCsub +++ b/modules/raycast/SCsub @@ -7,6 +7,8 @@ env_raycast = env_modules.Clone() # Thirdparty source files +thirdparty_obj = [] + if env["builtin_embree"]: thirdparty_dir = "#thirdparty/embree/" @@ -92,8 +94,16 @@ if env["builtin_embree"]: env_thirdparty = env_raycast.Clone() env_thirdparty.disable_warnings() - env_thirdparty.add_source_files(env.modules_sources, thirdparty_sources) + env_thirdparty.add_source_files(thirdparty_obj, thirdparty_sources) + env.modules_sources += thirdparty_obj # Godot source files -env_raycast.add_source_files(env.modules_sources, "*.cpp") + +module_obj = [] + +env_raycast.add_source_files(module_obj, "*.cpp") +env.modules_sources += module_obj + +# Needed to force rebuilding the module files when the thirdparty library is updated. 
+env.Depends(module_obj, thirdparty_obj) diff --git a/modules/recast/SCsub b/modules/recast/SCsub index 5ef7e0b489cd..050ca23082e2 100644 --- a/modules/recast/SCsub +++ b/modules/recast/SCsub @@ -6,6 +6,9 @@ Import("env_modules") env_recast = env_modules.Clone() # Thirdparty source files + +thirdparty_obj = [] + if env["builtin_recast"]: thirdparty_dir = "#thirdparty/recastnavigation/Recast/" thirdparty_sources = [ @@ -27,7 +30,15 @@ if env["builtin_recast"]: env_thirdparty = env_recast.Clone() env_thirdparty.disable_warnings() - env_thirdparty.add_source_files(env.modules_sources, thirdparty_sources) + env_thirdparty.add_source_files(thirdparty_obj, thirdparty_sources) + env.modules_sources += thirdparty_obj # Godot source files -env_recast.add_source_files(env.modules_sources, "*.cpp") + +module_obj = [] + +env_recast.add_source_files(module_obj, "*.cpp") +env.modules_sources += module_obj + +# Needed to force rebuilding the module files when the thirdparty library is updated. +env.Depends(module_obj, thirdparty_obj) diff --git a/modules/regex/SCsub b/modules/regex/SCsub index 2afacc1d9c77..deb9db7591f5 100644 --- a/modules/regex/SCsub +++ b/modules/regex/SCsub @@ -5,6 +5,10 @@ Import("env_modules") env_regex = env_modules.Clone() +# Thirdparty source files + +thirdparty_obj = [] + if env["builtin_pcre2"]: thirdparty_dir = "#thirdparty/pcre2/src/" thirdparty_flags = ["PCRE2_STATIC", "HAVE_CONFIG_H", "SUPPORT_UNICODE"] @@ -52,11 +56,21 @@ if env["builtin_pcre2"]: env_pcre2 = env_regex.Clone() env_pcre2.disable_warnings() env_pcre2["OBJSUFFIX"] = "_" + width + env_pcre2["OBJSUFFIX"] - env_pcre2.add_source_files(env.modules_sources, thirdparty_sources) env_pcre2.Append(CPPDEFINES=[("PCRE2_CODE_UNIT_WIDTH", width)]) + env_pcre2.add_source_files(thirdparty_obj, thirdparty_sources) + env.modules_sources += thirdparty_obj pcre2_builtin("16") pcre2_builtin("32") + +# Godot source files + +module_obj = [] + env_regex.Append(CPPDEFINES=[("PCRE2_CODE_UNIT_WIDTH", 0)]) -env_regex.add_source_files(env.modules_sources, "*.cpp") +env_regex.add_source_files(module_obj, "*.cpp") +env.modules_sources += module_obj + +# Needed to force rebuilding the module files when the thirdparty library is updated. +env.Depends(module_obj, thirdparty_obj) diff --git a/modules/squish/SCsub b/modules/squish/SCsub index b31032403f24..c9e29911d859 100644 --- a/modules/squish/SCsub +++ b/modules/squish/SCsub @@ -6,6 +6,9 @@ Import("env_modules") env_squish = env_modules.Clone() # Thirdparty source files + +thirdparty_obj = [] + if env["builtin_squish"]: thirdparty_dir = "#thirdparty/squish/" thirdparty_sources = [ @@ -26,7 +29,16 @@ if env["builtin_squish"]: env_thirdparty = env_squish.Clone() env_thirdparty.disable_warnings() - env_thirdparty.add_source_files(env.modules_sources, thirdparty_sources) + env_thirdparty.add_source_files(thirdparty_obj, thirdparty_sources) + env.modules_sources += thirdparty_obj + # Godot source files -env_squish.add_source_files(env.modules_sources, "*.cpp") + +module_obj = [] + +env_squish.add_source_files(module_obj, "*.cpp") +env.modules_sources += module_obj + +# Needed to force rebuilding the module files when the thirdparty library is updated. 
+env.Depends(module_obj, thirdparty_obj) diff --git a/modules/stb_vorbis/SCsub b/modules/stb_vorbis/SCsub index 266c87c80246..8fddb23dc865 100644 --- a/modules/stb_vorbis/SCsub +++ b/modules/stb_vorbis/SCsub @@ -6,11 +6,22 @@ Import("env_modules") env_stb_vorbis = env_modules.Clone() # Thirdparty source files + +thirdparty_obj = [] + thirdparty_sources = ["#thirdparty/misc/stb_vorbis.c"] env_thirdparty = env_stb_vorbis.Clone() env_thirdparty.disable_warnings() -env_thirdparty.add_source_files(env.modules_sources, thirdparty_sources) +env_thirdparty.add_source_files(thirdparty_obj, thirdparty_sources) +env.modules_sources += thirdparty_obj + +# Godot source files + +module_obj = [] + +env_stb_vorbis.add_source_files(module_obj, "*.cpp") +env.modules_sources += module_obj -# Godot's own source files -env_stb_vorbis.add_source_files(env.modules_sources, "*.cpp") +# Needed to force rebuilding the module files when the thirdparty library is updated. +env.Depends(module_obj, thirdparty_obj) diff --git a/modules/svg/SCsub b/modules/svg/SCsub index 3d17f2dcf855..09f8ad94d4f6 100644 --- a/modules/svg/SCsub +++ b/modules/svg/SCsub @@ -6,6 +6,9 @@ Import("env_modules") env_svg = env_modules.Clone() # Thirdparty source files + +thirdparty_obj = [] + thirdparty_dir = "#thirdparty/nanosvg/" thirdparty_sources = ["nanosvg.cc"] thirdparty_sources = [thirdparty_dir + file for file in thirdparty_sources] @@ -18,7 +21,15 @@ env.Append(CPPDEFINES=["SVG_ENABLED"]) env_thirdparty = env_svg.Clone() env_thirdparty.disable_warnings() -env_thirdparty.add_source_files(env.modules_sources, thirdparty_sources) +env_thirdparty.add_source_files(thirdparty_obj, thirdparty_sources) +env.modules_sources += thirdparty_obj + +# Godot source files + +module_obj = [] + +env_svg.add_source_files(module_obj, "*.cpp") +env.modules_sources += module_obj -# Godot's own source files -env_svg.add_source_files(env.modules_sources, "*.cpp") +# Needed to force rebuilding the module files when the thirdparty library is updated. +env.Depends(module_obj, thirdparty_obj) diff --git a/modules/theora/SCsub b/modules/theora/SCsub index a01e65b4b075..6038ea086a27 100644 --- a/modules/theora/SCsub +++ b/modules/theora/SCsub @@ -6,6 +6,9 @@ Import("env_modules") env_theora = env_modules.Clone() # Thirdparty source files + +thirdparty_obj = [] + if env["builtin_libtheora"]: thirdparty_dir = "#thirdparty/libtheora/" thirdparty_sources = [ @@ -80,7 +83,16 @@ if env["builtin_libtheora"]: env_thirdparty = env_theora.Clone() env_thirdparty.disable_warnings() - env_thirdparty.add_source_files(env.modules_sources, thirdparty_sources) + env_thirdparty.add_source_files(thirdparty_obj, thirdparty_sources) + env.modules_sources += thirdparty_obj + # Godot source files -env_theora.add_source_files(env.modules_sources, "*.cpp") + +module_obj = [] + +env_theora.add_source_files(module_obj, "*.cpp") +env.modules_sources += module_obj + +# Needed to force rebuilding the module files when the thirdparty library is updated. 
+env.Depends(module_obj, thirdparty_obj) diff --git a/modules/upnp/SCsub b/modules/upnp/SCsub index 9ec99a9e99a9..b2fed0cb23ba 100644 --- a/modules/upnp/SCsub +++ b/modules/upnp/SCsub @@ -7,6 +7,8 @@ env_upnp = env_modules.Clone() # Thirdparty source files +thirdparty_obj = [] + if env["builtin_miniupnpc"]: thirdparty_dir = "#thirdparty/miniupnpc/" thirdparty_sources = [ @@ -32,7 +34,16 @@ if env["builtin_miniupnpc"]: env_thirdparty = env_upnp.Clone() env_thirdparty.disable_warnings() - env_thirdparty.add_source_files(env.modules_sources, thirdparty_sources) + env_thirdparty.add_source_files(thirdparty_obj, thirdparty_sources) + env.modules_sources += thirdparty_obj + # Godot source files -env_upnp.add_source_files(env.modules_sources, "*.cpp") + +module_obj = [] + +env_upnp.add_source_files(module_obj, "*.cpp") +env.modules_sources += module_obj + +# Needed to force rebuilding the module files when the thirdparty library is updated. +env.Depends(module_obj, thirdparty_obj) diff --git a/modules/vhacd/SCsub b/modules/vhacd/SCsub index ecd432b275e0..1ff4122114ef 100644 --- a/modules/vhacd/SCsub +++ b/modules/vhacd/SCsub @@ -7,6 +7,8 @@ env_vhacd = env_modules.Clone() # Thirdparty source files +thirdparty_obj = [] + thirdparty_dir = "#thirdparty/vhacd/" thirdparty_sources = [ @@ -24,10 +26,19 @@ thirdparty_sources = [ thirdparty_sources = [thirdparty_dir + file for file in thirdparty_sources] -env_vhacd.Prepend(CPPPATH=[thirdparty_dir + "/inc"]) +env_vhacd.Prepend(CPPPATH=[thirdparty_dir + "inc"]) env_thirdparty = env_vhacd.Clone() env_thirdparty.disable_warnings() -env_thirdparty.add_source_files(env.modules_sources, thirdparty_sources) +env_thirdparty.add_source_files(thirdparty_obj, thirdparty_sources) +env.modules_sources += thirdparty_obj + +# Godot source files + +module_obj = [] + +env_vhacd.add_source_files(module_obj, "*.cpp") +env.modules_sources += module_obj -env_vhacd.add_source_files(env.modules_sources, "*.cpp") +# Needed to force rebuilding the module files when the thirdparty library is updated. +env.Depends(module_obj, thirdparty_obj) diff --git a/modules/vorbis/SCsub b/modules/vorbis/SCsub index 05d46757d3fd..bc31fff066f8 100644 --- a/modules/vorbis/SCsub +++ b/modules/vorbis/SCsub @@ -8,9 +8,10 @@ Import("env_modules") env_vorbis = env_modules.Clone() -stub = True - # Thirdparty source files + +thirdparty_obj = [] + if env["builtin_libvorbis"]: thirdparty_dir = "#thirdparty/libvorbis/" thirdparty_sources = [ @@ -51,7 +52,16 @@ if env["builtin_libvorbis"]: env_thirdparty = env_vorbis.Clone() env_thirdparty.disable_warnings() - env_thirdparty.add_source_files(env.modules_sources, thirdparty_sources) + env_thirdparty.add_source_files(thirdparty_obj, thirdparty_sources) + env.modules_sources += thirdparty_obj + + +# Godot source files + +module_obj = [] + +env_vorbis.add_source_files(module_obj, "*.cpp") +env.modules_sources += module_obj -# Module files -env_vorbis.add_source_files(env.modules_sources, "register_types.cpp") +# Needed to force rebuilding the module files when the thirdparty library is updated. 
+env.Depends(module_obj, thirdparty_obj) diff --git a/modules/webm/SCsub b/modules/webm/SCsub index 247b4ead37cc..44e80e2870e5 100644 --- a/modules/webm/SCsub +++ b/modules/webm/SCsub @@ -6,6 +6,9 @@ Import("env_modules") env_webm = env_modules.Clone() # Thirdparty source files + +thirdparty_obj = [] + thirdparty_dir = "#thirdparty/libsimplewebm/" thirdparty_sources = [ "libwebm/mkvparser/mkvparser.cc", @@ -31,7 +34,15 @@ if env["builtin_libvpx"]: env_thirdparty = env_webm.Clone() env_thirdparty.disable_warnings() -env_thirdparty.add_source_files(env.modules_sources, thirdparty_sources) +env_thirdparty.add_source_files(thirdparty_obj, thirdparty_sources) +env.modules_sources += thirdparty_obj # Godot source files -env_webm.add_source_files(env.modules_sources, "*.cpp") + +module_obj = [] + +env_webm.add_source_files(module_obj, "*.cpp") +env.modules_sources += module_obj + +# Needed to force rebuilding the module files when the thirdparty library is updated. +env.Depends(module_obj, thirdparty_obj) diff --git a/modules/webp/SCsub b/modules/webp/SCsub index 58f2bb35e615..4c0c2f78937e 100644 --- a/modules/webp/SCsub +++ b/modules/webp/SCsub @@ -6,6 +6,9 @@ Import("env_modules") env_webp = env_modules.Clone() # Thirdparty source files + +thirdparty_obj = [] + if env["builtin_libwebp"]: thirdparty_dir = "#thirdparty/libwebp/" thirdparty_sources = [ @@ -130,7 +133,16 @@ if env["builtin_libwebp"]: env_thirdparty = env_webp.Clone() env_thirdparty.disable_warnings() - env_thirdparty.add_source_files(env.modules_sources, thirdparty_sources) + env_thirdparty.add_source_files(thirdparty_obj, thirdparty_sources) + env.modules_sources += thirdparty_obj + # Godot source files -env_webp.add_source_files(env.modules_sources, "*.cpp") + +module_obj = [] + +env_webp.add_source_files(module_obj, "*.cpp") +env.modules_sources += module_obj + +# Needed to force rebuilding the module files when the thirdparty library is updated. +env.Depends(module_obj, thirdparty_obj) diff --git a/modules/webrtc/SCsub b/modules/webrtc/SCsub index 4f870ddb2f27..31b8a73bf2ba 100644 --- a/modules/webrtc/SCsub +++ b/modules/webrtc/SCsub @@ -3,8 +3,6 @@ Import("env") Import("env_modules") -# Thirdparty source files - env_webrtc = env_modules.Clone() use_gdnative = env_webrtc["module_gdnative_enabled"] diff --git a/modules/websocket/SCsub b/modules/websocket/SCsub index 13e51a39c05c..4c022c43cfdc 100644 --- a/modules/websocket/SCsub +++ b/modules/websocket/SCsub @@ -5,28 +5,44 @@ Import("env_modules") env_ws = env_modules.Clone() +thirdparty_obj = [] + if env["platform"] == "javascript": # Our JavaScript/C++ interface. 
env.AddJSLibraries(["library_godot_websocket.js"]) + elif env["builtin_wslay"]: # Thirdparty source files - wslay_dir = "#thirdparty/wslay/" - wslay_sources = [ + thirdparty_dir = "#thirdparty/wslay/" + thirdparty_sources = [ "wslay_net.c", "wslay_event.c", "wslay_queue.c", "wslay_stack.c", "wslay_frame.c", ] - wslay_sources = [wslay_dir + s for s in wslay_sources] - env_ws.Prepend(CPPPATH=[wslay_dir + "includes/"]) + thirdparty_sources = [thirdparty_dir + s for s in thirdparty_sources] + + env_ws.Prepend(CPPPATH=[thirdparty_dir + "includes/"]) env_ws.Append(CPPDEFINES=["HAVE_CONFIG_H"]) + if env["platform"] == "windows" or env["platform"] == "uwp": env_ws.Append(CPPDEFINES=["HAVE_WINSOCK2_H"]) else: env_ws.Append(CPPDEFINES=["HAVE_NETINET_IN_H"]) - env_wslay = env_ws.Clone() - env_wslay.disable_warnings() - env_wslay.add_source_files(env.modules_sources, wslay_sources) -env_ws.add_source_files(env.modules_sources, "*.cpp") + env_thirdparty = env_ws.Clone() + env_thirdparty.disable_warnings() + env_thirdparty.add_source_files(thirdparty_obj, thirdparty_sources) + env.modules_sources += thirdparty_obj + + +# Godot source files + +module_obj = [] + +env_ws.add_source_files(module_obj, "*.cpp") +env.modules_sources += module_obj + +# Needed to force rebuilding the module files when the thirdparty library is updated. +env.Depends(module_obj, thirdparty_obj) diff --git a/modules/xatlas_unwrap/SCsub b/modules/xatlas_unwrap/SCsub index c659349d05f2..aa6bdaea3349 100644 --- a/modules/xatlas_unwrap/SCsub +++ b/modules/xatlas_unwrap/SCsub @@ -6,6 +6,9 @@ Import("env_modules") env_xatlas_unwrap = env_modules.Clone() # Thirdparty source files + +thirdparty_obj = [] + if env["builtin_xatlas"]: thirdparty_dir = "#thirdparty/xatlas/" thirdparty_sources = [ @@ -17,7 +20,16 @@ if env["builtin_xatlas"]: env_thirdparty = env_xatlas_unwrap.Clone() env_thirdparty.disable_warnings() - env_thirdparty.add_source_files(env.modules_sources, thirdparty_sources) + env_thirdparty.add_source_files(thirdparty_obj, thirdparty_sources) + env.modules_sources += thirdparty_obj + # Godot source files -env_xatlas_unwrap.add_source_files(env.modules_sources, "*.cpp") + +module_obj = [] + +env_xatlas_unwrap.add_source_files(module_obj, "*.cpp") +env.modules_sources += module_obj + +# Needed to force rebuilding the module files when the thirdparty library is updated. +env.Depends(module_obj, thirdparty_obj) diff --git a/platform/android/SCsub b/platform/android/SCsub index 6dd9dc04101c..f2cea80611c1 100644 --- a/platform/android/SCsub +++ b/platform/android/SCsub @@ -28,10 +28,14 @@ for x in android_files: env_thirdparty = env_android.Clone() env_thirdparty.disable_warnings() -android_objects.append(env_thirdparty.SharedObject("#thirdparty/misc/ifaddrs-android.cc")) +thirdparty_obj = env_thirdparty.SharedObject("#thirdparty/misc/ifaddrs-android.cc") +android_objects.append(thirdparty_obj) lib = env_android.add_shared_library("#bin/libgodot", [android_objects], SHLIBSUFFIX=env["SHLIBSUFFIX"]) +# Needed to force rebuilding the platform files when the thirdparty code is updated. 
+env.Depends(lib, thirdparty_obj) + lib_arch_dir = "" if env["android_arch"] == "armv7": lib_arch_dir = "armeabi-v7a" diff --git a/scene/SCsub b/scene/SCsub index f9fc00f3f241..ccd2bab8ffe1 100644 --- a/scene/SCsub +++ b/scene/SCsub @@ -4,24 +4,9 @@ Import("env") env.scene_sources = [] -# Thirdparty code -thirdparty_dir = "#thirdparty/misc/" -thirdparty_sources = [ - # C++ sources - "easing_equations.cpp", - # C sources - "mikktspace.c", -] -thirdparty_sources = [thirdparty_dir + file for file in thirdparty_sources] - -env_thirdparty = env.Clone() -env_thirdparty.disable_warnings() -env_thirdparty.add_source_files(env.scene_sources, thirdparty_sources) - -# Godot's own sources +# Godot source files env.add_source_files(env.scene_sources, "*.cpp") - # Chain load SCsubs SConscript("main/SCsub") SConscript("gui/SCsub") @@ -32,7 +17,6 @@ SConscript("audio/SCsub") SConscript("resources/SCsub") SConscript("debugger/SCsub") - # Build it all as a library lib = env.add_library("scene", env.scene_sources) env.Prepend(LIBS=[lib]) diff --git a/scene/animation/SCsub b/scene/animation/SCsub index fc61250247d2..cc33a5af84fd 100644 --- a/scene/animation/SCsub +++ b/scene/animation/SCsub @@ -2,4 +2,23 @@ Import("env") -env.add_source_files(env.scene_sources, "*.cpp") +# Thirdparty code + +thirdparty_obj = [] + +thirdparty_sources = "#thirdparty/misc/easing_equations.cpp" + +env_thirdparty = env.Clone() +env_thirdparty.disable_warnings() +env_thirdparty.add_source_files(thirdparty_obj, thirdparty_sources) +env.scene_sources += thirdparty_obj + +# Godot source files + +scene_obj = [] + +env.add_source_files(scene_obj, "*.cpp") +env.scene_sources += scene_obj + +# Needed to force rebuilding the scene files when the thirdparty code is updated. +env.Depends(scene_obj, thirdparty_obj) diff --git a/scene/resources/SCsub b/scene/resources/SCsub index 3a86b228351f..f4dc7a46fb09 100644 --- a/scene/resources/SCsub +++ b/scene/resources/SCsub @@ -2,6 +2,25 @@ Import("env") -env.add_source_files(env.scene_sources, "*.cpp") +# Thirdparty code + +thirdparty_obj = [] + +thirdparty_sources = "#thirdparty/misc/mikktspace.c" + +env_thirdparty = env.Clone() +env_thirdparty.disable_warnings() +env_thirdparty.add_source_files(thirdparty_obj, thirdparty_sources) +env.scene_sources += thirdparty_obj + +# Godot source files + +scene_obj = [] + +env.add_source_files(scene_obj, "*.cpp") +env.scene_sources += scene_obj + +# Needed to force rebuilding the scene files when the thirdparty code is updated. +env.Depends(scene_obj, thirdparty_obj) SConscript("default_theme/SCsub") diff --git a/servers/camera/SCsub b/servers/camera/SCsub index c949f3bb2598..86681f9c74d3 100644 --- a/servers/camera/SCsub +++ b/servers/camera/SCsub @@ -3,5 +3,3 @@ Import("env") env.add_source_files(env.servers_sources, "*.cpp") - -Export("env") From 4b8a1d21f31e1d447de36b5fdaf1c4407d38f969 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Verschelde?= Date: Mon, 3 May 2021 21:41:42 +0200 Subject: [PATCH 55/84] CI: Add `--doctool` check to find missing classref updates This will enforce that PRs properly sync the class reference templates to match their changes to the public API, and help notice binding bugs in the process (e.g. missing enum bindings, unexpected API changes or missing argument names). This should also serve as a reminder to contributors that their changes impact the scripting API and might warrant actually filling the descriptions for the new methods/properties/etc. 
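Contributors can run an equivalent check locally before pushing. A minimal
sketch, assuming an editor (tools) build is already present in `bin/` (the
exact binary name here is illustrative and depends on the platform and build
options used; the two commands mirror the CI step added below):

    bin/godot.x11.tools.64 --doctool . > /dev/null
    git diff --exit-code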
(cherry picked from commit b3884122701232fb76a4a72ce2352d913d364e2d) --- .github/workflows/linux_builds.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/linux_builds.yml b/.github/workflows/linux_builds.yml index b77cea475610..85418032f11d 100644 --- a/.github/workflows/linux_builds.yml +++ b/.github/workflows/linux_builds.yml @@ -208,3 +208,11 @@ jobs: echo "----- Run and test project -----" DRI_PRIME=0 xvfb-run bin/godot.x11.tools.64s 30 --video-driver GLES3 --audio-driver Dummy --path test_project 2>&1 | tee sanitizers_log.txt || true misc/scripts/check_ci_log.py sanitizers_log.txt + + # Check class reference + - name: Check for class reference updates + run: | + echo "Running --doctool to see if this changes the public API without updating the documentation." + echo -e "If a diff is shown, it means that your code/doc changes are incomplete and you should update the class reference with --doctool.\n\n" + DRI_PRIME=0 xvfb-run bin/godot.x11.tools.64s --doctool . 2>&1 > /dev/null || true + git diff --color --exit-code From 1cfed0d58365bfaad92c5e395468621896e5bd30 Mon Sep 17 00:00:00 2001 From: JFonS Date: Tue, 4 May 2021 11:07:12 +0200 Subject: [PATCH 56/84] Switch to embree-aarch64 (cherry picked from commit 73e2ccd60309ab598d1817ebf9678ea3587513e0) --- COPYRIGHT.txt | 5 + modules/lightmapper_cpu/config.py | 20 +- modules/raycast/SCsub | 16 +- modules/raycast/config.py | 21 +- modules/raycast/godot_update_embree.py | 9 +- modules/raycast/lightmap_raycaster.cpp | 7 +- thirdparty/README.md | 14 +- .../embree/common/algorithms/parallel_for.h | 95 +- .../common/algorithms/parallel_reduce.h | 14 +- .../embree/common/algorithms/parallel_sort.h | 7 +- .../embree/common/lexers/stringstream.cpp | 5 +- thirdparty/embree/common/math/AVX2NEON.h | 986 +++++++++ thirdparty/embree/common/math/SSE2NEON.h | 1753 +++++++++++++++++ thirdparty/embree/common/math/bbox.h | 6 +- thirdparty/embree/common/math/col3.h | 2 +- thirdparty/embree/common/math/col4.h | 2 +- thirdparty/embree/common/math/color.h | 42 +- thirdparty/embree/common/math/constants.cpp | 34 + thirdparty/embree/common/math/constants.h | 58 +- thirdparty/embree/common/math/math.h | 119 +- thirdparty/embree/common/math/vec2.h | 6 +- thirdparty/embree/common/math/vec2fa.h | 26 +- thirdparty/embree/common/math/vec3.h | 11 +- thirdparty/embree/common/math/vec3fa.h | 123 +- thirdparty/embree/common/math/vec3ia.h | 38 +- thirdparty/embree/common/math/vec4.h | 8 +- thirdparty/embree/common/simd/simd.h | 2 +- thirdparty/embree/common/simd/sse.h | 2 +- thirdparty/embree/common/simd/vboold4_avx.h | 9 +- thirdparty/embree/common/simd/vboolf4_sse2.h | 33 +- thirdparty/embree/common/simd/vboolf8_avx.h | 5 +- thirdparty/embree/common/simd/vdouble4_avx.h | 9 +- thirdparty/embree/common/simd/vfloat4_sse2.h | 359 +++- thirdparty/embree/common/simd/vfloat8_avx.h | 177 +- thirdparty/embree/common/simd/vint16_avx512.h | 4 +- thirdparty/embree/common/simd/vint4_sse2.h | 226 ++- thirdparty/embree/common/simd/vint8_avx.h | 75 +- thirdparty/embree/common/simd/vint8_avx2.h | 47 +- .../embree/common/simd/vllong8_avx512.h | 2 +- .../embree/common/simd/vuint16_avx512.h | 2 +- thirdparty/embree/common/simd/vuint4_sse2.h | 123 +- thirdparty/embree/common/simd/vuint8_avx.h | 76 +- thirdparty/embree/common/simd/vuint8_avx2.h | 45 +- thirdparty/embree/common/sys/alloc.cpp | 35 +- thirdparty/embree/common/sys/array.h | 4 +- thirdparty/embree/common/sys/intrinsics.h | 303 +-- thirdparty/embree/common/sys/library.cpp | 6 +- 
thirdparty/embree/common/sys/mutex.cpp | 1 + thirdparty/embree/common/sys/mutex.h | 4 +- thirdparty/embree/common/sys/platform.h | 29 +- thirdparty/embree/common/sys/sysinfo.cpp | 109 +- thirdparty/embree/common/sys/sysinfo.h | 20 +- thirdparty/embree/common/sys/thread.cpp | 72 +- thirdparty/embree/common/sys/thread.h | 3 - .../embree/common/tasking/taskscheduler.h | 2 + .../embree/common/tasking/taskschedulergcd.h | 49 + .../common/tasking/taskschedulerinternal.cpp | 44 +- .../common/tasking/taskschedulerinternal.h | 18 +- .../embree/common/tasking/taskschedulertbb.h | 6 - .../embree/include/embree3/rtcore_common.h | 22 +- .../embree/kernels/builders/bvh_builder_sah.h | 2 +- thirdparty/embree/kernels/bvh/bvh.cpp | 4 +- thirdparty/embree/kernels/bvh/bvh.h | 2 +- .../embree/kernels/bvh/bvh_builder_morton.cpp | 2 +- .../kernels/bvh/bvh_intersector_stream.h | 11 + thirdparty/embree/kernels/bvh/bvh_node_ref.h | 4 +- .../embree/kernels/bvh/bvh_statistics.cpp | 7 +- .../embree/kernels/bvh/node_intersector1.h | 123 +- .../kernels/bvh/node_intersector_frustum.h | 22 +- .../kernels/bvh/node_intersector_packet.h | 54 +- .../bvh/node_intersector_packet_stream.h | 26 + thirdparty/embree/kernels/common/accel.h | 4 +- thirdparty/embree/kernels/common/acceln.cpp | 6 +- thirdparty/embree/kernels/common/alloc.cpp | 3 + thirdparty/embree/kernels/common/alloc.h | 70 +- thirdparty/embree/kernels/common/default.h | 5 + thirdparty/embree/kernels/common/device.cpp | 7 + thirdparty/embree/kernels/common/isa.h | 2 +- thirdparty/embree/kernels/common/primref.h | 4 +- thirdparty/embree/kernels/common/primref_mb.h | 10 +- thirdparty/embree/kernels/common/rtcore.cpp | 77 +- thirdparty/embree/kernels/common/rtcore.h | 136 +- thirdparty/embree/kernels/common/scene.cpp | 171 +- .../embree/kernels/common/scene_subdiv_mesh.h | 6 +- thirdparty/embree/kernels/common/state.cpp | 15 + thirdparty/embree/kernels/geometry/curveNi.h | 128 +- .../embree/kernels/geometry/curveNi_mb.h | 162 +- .../curve_intersector_virtual_bezier_curve.h | 21 + .../curve_intersector_virtual_bspline_curve.h | 21 + ...rve_intersector_virtual_catmullrom_curve.h | 21 + .../curve_intersector_virtual_hermite_curve.h | 21 + .../curve_intersector_virtual_linear_curve.h | 21 + .../curve_intersector_virtual_point.h | 22 + thirdparty/embree/kernels/geometry/grid_soa.h | 10 +- thirdparty/embree/kernels/hash.h | 2 +- .../kernels/subdiv/tessellation_cache.h | 2 +- thirdparty/embree/patches/godot-changes.patch | 787 ++++++-- 97 files changed, 6055 insertions(+), 1286 deletions(-) create mode 100644 thirdparty/embree/common/math/AVX2NEON.h create mode 100644 thirdparty/embree/common/math/SSE2NEON.h create mode 100644 thirdparty/embree/common/tasking/taskschedulergcd.h create mode 100644 thirdparty/embree/kernels/geometry/curve_intersector_virtual_bezier_curve.h create mode 100644 thirdparty/embree/kernels/geometry/curve_intersector_virtual_bspline_curve.h create mode 100644 thirdparty/embree/kernels/geometry/curve_intersector_virtual_catmullrom_curve.h create mode 100644 thirdparty/embree/kernels/geometry/curve_intersector_virtual_hermite_curve.h create mode 100644 thirdparty/embree/kernels/geometry/curve_intersector_virtual_linear_curve.h create mode 100644 thirdparty/embree/kernels/geometry/curve_intersector_virtual_point.h diff --git a/COPYRIGHT.txt b/COPYRIGHT.txt index ad5aaf33082d..70f8297846ef 100644 --- a/COPYRIGHT.txt +++ b/COPYRIGHT.txt @@ -133,6 +133,11 @@ Copyright: 2018, Eric Lasota 2018, Microsoft Corp. 
License: Expat +Files: ./thirdparty/embree/ +Comment: Embree +Copyright: 2009-2021 Intel Corporation +License: Apache-2.0 + Files: ./thirdparty/enet/ Comment: ENet Copyright: 2002-2020, Lee Salzman diff --git a/modules/lightmapper_cpu/config.py b/modules/lightmapper_cpu/config.py index 0b8837aa4edd..96efd47d9fb2 100644 --- a/modules/lightmapper_cpu/config.py +++ b/modules/lightmapper_cpu/config.py @@ -6,23 +6,13 @@ def can_build(env, platform): # `can_build()` for that module, so we need to duplicate that code as a short-term # solution. - # Embree requires at least SSE2 to be available, so 32-bit and ARM64 builds are - # not supported. - # It's also only relevant for tools build and desktop platforms, - # as doing lightmap generation on Android or HTML5 would be a bit far-fetched. - supported_platform = platform in ["x11", "osx", "windows", "server"] - supported_bits = env["bits"] == "64" - supported_arch = env["arch"] != "arm64" + if platform == "android": + return env["android_arch"] in ["arm64v8", "x86", "x86_64"] - # Hack to disable on Linux arm64. This won't work well for cross-compilation (checks - # host, not target) and would need a more thorough fix by refactoring our arch and - # bits-handling code. - from platform import machine - - if platform == "x11" and machine() != "x86_64": - supported_arch = False + if platform in ["javascript", "server"]: + return False - return supported_platform and supported_bits and supported_arch + return True def configure(env): diff --git a/modules/raycast/SCsub b/modules/raycast/SCsub index 978428e85cce..834e6e2afa7a 100644 --- a/modules/raycast/SCsub +++ b/modules/raycast/SCsub @@ -70,25 +70,19 @@ if env["builtin_embree"]: thirdparty_sources = [thirdparty_dir + file for file in embree_src] env_raycast.Prepend(CPPPATH=[thirdparty_dir, thirdparty_dir + "include"]) - env_raycast.Append( - CPPDEFINES=[ - "EMBREE_TARGET_SSE2", - "EMBREE_LOWEST_ISA", - "TASKING_INTERNAL", - "NDEBUG", - "__SSE2__", - "__SSE__", - ] - ) + env_raycast.Append(CPPDEFINES=["EMBREE_TARGET_SSE2", "EMBREE_LOWEST_ISA", "TASKING_INTERNAL", "NDEBUG"]) if not env.msvc: - env_raycast.Append(CPPFLAGS=["-msse2", "-mxsave"]) + if env["arch"] in ["x86", "x86_64"]: + env_raycast.Append(CPPFLAGS=["-msse2", "-mxsave"]) + if env["platform"] == "windows": env_raycast.Append(CPPFLAGS=["-mstackrealign"]) if env["platform"] == "windows": if env.msvc: env.Append(LINKFLAGS=["psapi.lib"]) + env_raycast.Append(CPPDEFINES=["__SSE2__", "__SSE__"]) else: env.Append(LIBS=["psapi"]) diff --git a/modules/raycast/config.py b/modules/raycast/config.py index 5307c62d0660..a2692d3612ed 100644 --- a/modules/raycast/config.py +++ b/modules/raycast/config.py @@ -1,21 +1,14 @@ def can_build(env, platform): - # Embree requires at least SSE2 to be available, so 32-bit and ARM64 builds are - # not supported. - # It's also only relevant for tools build and desktop platforms, - # as doing lightmap generation on Android or HTML5 would be a bit far-fetched. - supported_platform = platform in ["x11", "osx", "windows", "server"] - supported_bits = env["bits"] == "64" - supported_arch = env["arch"] != "arm64" + if not env["tools"]: + return False - # Hack to disable on Linux arm64. This won't work well for cross-compilation (checks - # host, not target) and would need a more thorough fix by refactoring our arch and - # bits-handling code. 
- from platform import machine + if platform == "android": + return env["android_arch"] in ["arm64v8", "x86", "x86_64"] - if platform == "x11" and machine() != "x86_64": - supported_arch = False + if platform in ["javascript", "server"]: + return False - return env["tools"] and supported_platform and supported_bits and supported_arch + return True def configure(env): diff --git a/modules/raycast/godot_update_embree.py b/modules/raycast/godot_update_embree.py index 92649bbf7427..787ea2a0e42c 100644 --- a/modules/raycast/godot_update_embree.py +++ b/modules/raycast/godot_update_embree.py @@ -74,17 +74,18 @@ os.chdir("../../thirdparty") -if os.path.exists("embree"): - shutil.rmtree("embree") +dir_name = "embree" +if os.path.exists(dir_name): + shutil.rmtree(dir_name) -subprocess.run(["git", "clone", "https://github.com/embree/embree.git", "embree-tmp"]) +subprocess.run(["git", "clone", "https://github.com/lighttransport/embree-aarch64.git", "embree-tmp"]) os.chdir("embree-tmp") commit_hash = str(subprocess.check_output(["git", "rev-parse", "HEAD"], universal_newlines=True)).strip() -dest_dir = "../embree" all_files = set(cpp_files) +dest_dir = os.path.join("..", dir_name) for include_dir in include_dirs: headers = glob.iglob(os.path.join(include_dir, "*.h")) all_files.update(headers) diff --git a/modules/raycast/lightmap_raycaster.cpp b/modules/raycast/lightmap_raycaster.cpp index fac4385d7711..6f51ff582f83 100644 --- a/modules/raycast/lightmap_raycaster.cpp +++ b/modules/raycast/lightmap_raycaster.cpp @@ -192,8 +192,11 @@ LightmapRaycasterEmbree::~LightmapRaycasterEmbree() { _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF); _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF); - if (embree_scene != nullptr) + if (embree_scene != nullptr) { rtcReleaseScene(embree_scene); - if (embree_device != nullptr) + } + + if (embree_device != nullptr) { rtcReleaseDevice(embree_device); + } } diff --git a/thirdparty/README.md b/thirdparty/README.md index 90ae4ce34ce2..ffc97810f176 100644 --- a/thirdparty/README.md +++ b/thirdparty/README.md @@ -41,19 +41,19 @@ Files extracted from upstream source: ## embree -- Upstream: https://github.com/embree/embree -- Version: 3.12.1 (69bd4c272f1ed608494f233ecfff3feec516880b, 2020) +- Upstream: https://github.com/lighttransport/embree-aarch64 +- Version: 3.12.1 (6ef362f99af80c9dfe8dd2bfc582d9067897edc6, 2020) - License: Apache 2.0 Files extracted from upstream: -- All cpp files listed in `modules/raytrace/godot_update_embree.py` -- All header files in the directories listed in `modules/raytrace/godot_update_embree.py` +- All cpp files listed in `modules/raycast/godot_update_embree.py` +- All header files in the directories listed in `modules/raycast/godot_update_embree.py` -The `modules/raytrace/godot_update_embree.py`script can be used to pull the -relevant files from the latest Embree release and apply some automatic changes. +The `modules/raycast/godot_update_embree.py`script can be used to pull the +relevant files from the latest Embree-aarch64 release and apply some automatic changes. -Some minor changes have been made in order to fix build errors. +Some changes have been made in order to remove exceptions and fix minor build errors. They are marked with `// -- GODOT start --` and `// -- GODOT end --` comments. Apply the patches in the `patches/` folder when syncing on newer upstream commits. 
diff --git a/thirdparty/embree/common/algorithms/parallel_for.h b/thirdparty/embree/common/algorithms/parallel_for.h index cd2d5b0b7e74..51d296fb161e 100644 --- a/thirdparty/embree/common/algorithms/parallel_for.h +++ b/thirdparty/embree/common/algorithms/parallel_for.h @@ -8,6 +8,12 @@ #include "../math/math.h" #include "../math/range.h" +#if defined(TASKING_GCD) && defined(BUILD_IOS) +#include +#include +#include +#endif + namespace embree { /* parallel_for without range */ @@ -21,9 +27,30 @@ namespace embree func(r.begin()); }); if (!TaskScheduler::wait()) - throw std::runtime_error("task cancelled"); + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- } - +#elif defined(TASKING_GCD) && defined(BUILD_IOS) + + const size_t baselineNumBlocks = (TaskScheduler::threadCount() > 1)? TaskScheduler::threadCount() : 1; + const size_t length = N; + const size_t blockSize = (length + baselineNumBlocks-1) / baselineNumBlocks; + const size_t numBlocks = (length + blockSize-1) / blockSize; + + dispatch_apply(numBlocks, DISPATCH_APPLY_AUTO, ^(size_t currentBlock) { + + const size_t start = (currentBlock * blockSize); + const size_t blockLength = std::min(length - start, blockSize); + const size_t end = start + blockLength; + + for(size_t i=start; i < end; i++) + { + func(i); + } + }); + #elif defined(TASKING_TBB) #if TBB_INTERFACE_VERSION >= 12002 tbb::task_group_context context; @@ -31,13 +58,19 @@ namespace embree func(i); },context); if (context.is_group_execution_cancelled()) - throw std::runtime_error("task cancelled"); + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- #else tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { func(i); }); if (tbb::task::self().is_cancelled()) - throw std::runtime_error("task cancelled"); + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- #endif #elif defined(TASKING_PPL) @@ -57,7 +90,29 @@ namespace embree #if defined(TASKING_INTERNAL) TaskScheduler::spawn(first,last,minStepSize,func); if (!TaskScheduler::wait()) - throw std::runtime_error("task cancelled"); + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + +#elif defined(TASKING_GCD) && defined(BUILD_IOS) + + const size_t baselineNumBlocks = (TaskScheduler::threadCount() > 1)? 
4*TaskScheduler::threadCount() : 1; + const size_t length = last - first; + const size_t blockSizeByThreads = (length + baselineNumBlocks-1) / baselineNumBlocks; + size_t blockSize = std::max(minStepSize,blockSizeByThreads); + blockSize += blockSize % 4; + + const size_t numBlocks = (length + blockSize-1) / blockSize; + + dispatch_apply(numBlocks, DISPATCH_APPLY_AUTO, ^(size_t currentBlock) { + + const size_t start = first + (currentBlock * blockSize); + const size_t end = std::min(last, start + blockSize); + + func( embree::range(start,end) ); + }); + #elif defined(TASKING_TBB) #if TBB_INTERFACE_VERSION >= 12002 @@ -66,13 +121,19 @@ namespace embree func(range(r.begin(),r.end())); },context); if (context.is_group_execution_cancelled()) - throw std::runtime_error("task cancelled"); + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- #else tbb::parallel_for(tbb::blocked_range(first,last,minStepSize),[&](const tbb::blocked_range& r) { func(range(r.begin(),r.end())); }); if (tbb::task::self().is_cancelled()) - throw std::runtime_error("task cancelled"); + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- #endif #elif defined(TASKING_PPL) @@ -104,13 +165,19 @@ namespace embree func(i); },tbb::simple_partitioner(),context); if (context.is_group_execution_cancelled()) - throw std::runtime_error("task cancelled"); + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- #else tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { func(i); },tbb::simple_partitioner()); if (tbb::task::self().is_cancelled()) - throw std::runtime_error("task cancelled"); + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- #endif } @@ -125,13 +192,19 @@ namespace embree func(i); },ap,context); if (context.is_group_execution_cancelled()) - throw std::runtime_error("task cancelled"); + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- #else tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { func(i); },ap); if (tbb::task::self().is_cancelled()) - throw std::runtime_error("task cancelled"); + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- #endif } diff --git a/thirdparty/embree/common/algorithms/parallel_reduce.h b/thirdparty/embree/common/algorithms/parallel_reduce.h index 9c6be67f070d..0daf94e50e04 100644 --- a/thirdparty/embree/common/algorithms/parallel_reduce.h +++ b/thirdparty/embree/common/algorithms/parallel_reduce.h @@ -43,7 +43,7 @@ namespace embree template __forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction ) { -#if defined(TASKING_INTERNAL) +#if defined(TASKING_INTERNAL) || (defined(TASKING_GCD) && defined(BUILD_IOS)) /* fast path for small number of iterations */ Index taskCount = (last-first+minStepSize-1)/minStepSize; @@ -58,15 +58,19 @@ namespace embree const Value v = tbb::parallel_reduce(tbb::blocked_range(first,last,minStepSize),identity, [&](const tbb::blocked_range& r, const Value& start) { return reduction(start,func(range(r.begin(),r.end()))); }, reduction,context); - if (context.is_group_execution_cancelled()) - throw std::runtime_error("task cancelled"); + // -- GODOT start -- + // if (context.is_group_execution_cancelled()) + // throw std::runtime_error("task cancelled"); + // 
-- GODOT end -- return v; #else const Value v = tbb::parallel_reduce(tbb::blocked_range(first,last,minStepSize),identity, [&](const tbb::blocked_range& r, const Value& start) { return reduction(start,func(range(r.begin(),r.end()))); }, reduction); - if (tbb::task::self().is_cancelled()) - throw std::runtime_error("task cancelled"); + // -- GODOT start -- + // if (tbb::task::self().is_cancelled()) + // throw std::runtime_error("task cancelled"); + // -- GODOT end -- return v; #endif #else // TASKING_PPL diff --git a/thirdparty/embree/common/algorithms/parallel_sort.h b/thirdparty/embree/common/algorithms/parallel_sort.h index 5a3382079375..a758227c1b83 100644 --- a/thirdparty/embree/common/algorithms/parallel_sort.h +++ b/thirdparty/embree/common/algorithms/parallel_sort.h @@ -5,6 +5,9 @@ #include "../simd/simd.h" #include "parallel_for.h" +#if defined(TASKING_GCD) && defined(BUILD_IOS) +#include "../sys/alloc.h" +#endif #include namespace embree @@ -320,7 +323,7 @@ namespace embree #pragma nounroll #endif for (size_t i=startID; i> (size_t)shift) & (size_t)mask; #else const Key index = ((Key)src[i] >> shift) & mask; @@ -382,7 +385,7 @@ namespace embree #endif for (size_t i=startID; i> (size_t)shift) & (size_t)mask; #else const size_t index = ((Key)src[i] >> shift) & mask; diff --git a/thirdparty/embree/common/lexers/stringstream.cpp b/thirdparty/embree/common/lexers/stringstream.cpp index 7e7b9faef800..98dc80ad59a3 100644 --- a/thirdparty/embree/common/lexers/stringstream.cpp +++ b/thirdparty/embree/common/lexers/stringstream.cpp @@ -39,7 +39,10 @@ namespace embree std::vector str; str.reserve(64); while (cin->peek() != EOF && !isSeparator(cin->peek())) { int c = cin->get(); - if (!isValidChar(c)) throw std::runtime_error("invalid character "+std::string(1,c)+" in input"); + // -- GODOT start -- + // if (!isValidChar(c)) throw std::runtime_error("invalid character "+std::string(1,c)+" in input"); + if (!isValidChar(c)) abort(); + // -- GODOT end -- str.push_back((char)c); } str.push_back(0); diff --git a/thirdparty/embree/common/math/AVX2NEON.h b/thirdparty/embree/common/math/AVX2NEON.h new file mode 100644 index 000000000000..e8698ac56d1f --- /dev/null +++ b/thirdparty/embree/common/math/AVX2NEON.h @@ -0,0 +1,986 @@ +#pragma once + +#include "SSE2NEON.h" + + +#define AVX2NEON_ABI static inline __attribute__((always_inline)) + + +struct __m256d; + +struct __m256 { + __m128 lo,hi; + __m256() {} +}; + + + + +struct __m256i { + __m128i lo,hi; + explicit __m256i(const __m256 a) : lo(__m128i(a.lo)),hi(__m128i(a.hi)) {} + operator __m256() const {__m256 res; res.lo = __m128(lo);res.hi = __m128(hi); return res;} + __m256i() {} +}; + + + + +struct __m256d { + float64x2_t lo,hi; + __m256d() {} + __m256d(const __m256& a) : lo(float64x2_t(a.lo)),hi(float64x2_t(a.hi)) {} + __m256d(const __m256i& a) : lo(float64x2_t(a.lo)),hi(float64x2_t(a.hi)) {} +}; + +#define UNARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a) {type res;res.lo=basic_func(a.lo);res.hi=basic_func(a.hi);return res;} + + +#define BINARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a,const type& b) {type res;res.lo=basic_func(a.lo,b.lo);res.hi=basic_func(a.hi,b.hi);return res;} +#define BINARY_AVX_OP_CAST(type,func,basic_func,bdst,bsrc) AVX2NEON_ABI type func(const type& a,const type& b) {type res;res.lo=bdst(basic_func(bsrc(a.lo),bsrc(b.lo)));res.hi=bdst(basic_func(bsrc(a.hi),bsrc(b.hi)));return res;} + +#define TERNARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a,const type& 
b,const type& c) {type res;res.lo=basic_func(a.lo,b.lo,c.lo);res.hi=basic_func(a.hi,b.hi,c.hi);return res;} + + +#define CAST_SIMD_TYPE(to,name,from,basic_dst) AVX2NEON_ABI to name(const from& a) { to res; res.lo = basic_dst(a.lo); res.hi=basic_dst(a.hi); return res;} + + + +#define _mm_stream_load_si128 _mm_load_si128 +#define _mm256_stream_load_si256 _mm256_load_si256 + + +AVX2NEON_ABI +__m128 _mm_blend_ps (__m128 a, __m128 b, const int imm8) +{ + __m128 res; + for (int i=0;i<4;i++) + { + if (imm8 & (1< +AVX2NEON_ABI float32x4_t dpps_neon(const float32x4_t& a,const float32x4_t& b) +{ + float v; + v = 0; + v += (code & 0x10) ? a[0]*b[0] : 0; + v += (code & 0x20) ? a[1]*b[1] : 0; + v += (code & 0x40) ? a[2]*b[2] : 0; + v += (code & 0x80) ? a[3]*b[3] : 0; + float32x4_t res; + res[0] = (code & 0x1) ? v : 0; + res[1] = (code & 0x2) ? v : 0; + res[2] = (code & 0x4) ? v : 0; + res[3] = (code & 0x8) ? v : 0; + return res; +} + +template<> +inline float32x4_t dpps_neon<0x7f>(const float32x4_t& a,const float32x4_t& b) +{ + float v; + float32x4_t m = _mm_mul_ps(a,b); + m[3] = 0; + v = vaddvq_f32(m); + return _mm_set1_ps(v); +} + +template<> +inline float32x4_t dpps_neon<0xff>(const float32x4_t& a,const float32x4_t& b) +{ + float v; + float32x4_t m = _mm_mul_ps(a,b); + v = vaddvq_f32(m); + return _mm_set1_ps(v); +} + +#define _mm_dp_ps(a,b,c) dpps_neon((a),(b)) + + + +AVX2NEON_ABI +__m128 _mm_cmpnge_ps (__m128 a, __m128 b) +{ + return __m128(vmvnq_s32(__m128i(_mm_cmpge_ps(a,b)))); +} + + +AVX2NEON_ABI +__m128 _mm_permutevar_ps (__m128 a, __m128i b) +{ + __m128 x; + for (int i=0;i<4;i++) + { + x[i] = a[b[i&3]]; + } + return x; +} + +AVX2NEON_ABI +__m256i _mm256_setzero_si256() +{ + __m256i res; + res.lo = res.hi = vdupq_n_s32(0); + return res; +} + +AVX2NEON_ABI +__m256 _mm256_setzero_ps() +{ + __m256 res; + res.lo = res.hi = vdupq_n_f32(0.0f); + return res; +} + +AVX2NEON_ABI +__m256i _mm256_undefined_si256() +{ + return _mm256_setzero_si256(); +} + +AVX2NEON_ABI +__m256 _mm256_undefined_ps() +{ + return _mm256_setzero_ps(); +} + +CAST_SIMD_TYPE(__m256d,_mm256_castps_pd,__m256,float64x2_t) +CAST_SIMD_TYPE(__m256i,_mm256_castps_si256,__m256,__m128i) +CAST_SIMD_TYPE(__m256, _mm256_castsi256_ps, __m256i,__m128) +CAST_SIMD_TYPE(__m256, _mm256_castpd_ps ,__m256d,__m128) +CAST_SIMD_TYPE(__m256d, _mm256_castsi256_pd, __m256i,float64x2_t) +CAST_SIMD_TYPE(__m256i, _mm256_castpd_si256, __m256d,__m128i) + + + + +AVX2NEON_ABI +__m128 _mm256_castps256_ps128 (__m256 a) +{ + return a.lo; +} + +AVX2NEON_ABI +__m256i _mm256_castsi128_si256 (__m128i a) +{ + __m256i res; + res.lo = a ; + res.hi = vdupq_n_s32(0); + return res; +} + +AVX2NEON_ABI +__m128i _mm256_castsi256_si128 (__m256i a) +{ + return a.lo; +} + +AVX2NEON_ABI +__m256 _mm256_castps128_ps256 (__m128 a) +{ + __m256 res; + res.lo = a; + res.hi = vdupq_n_f32(0); + return res; +} + + +AVX2NEON_ABI +__m256 _mm256_broadcast_ss (float const * mem_addr) +{ + __m256 res; + res.lo = res.hi = vdupq_n_f32(*mem_addr); + return res; +} + + + +AVX2NEON_ABI +__m256i _mm256_set_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) +{ + __m128i lo = {e0,e1,e2,e3}, hi = {e4,e5,e6,e7}; + __m256i res; + res.lo = lo; res.hi = hi; + return res; + +} + +AVX2NEON_ABI +__m256i _mm256_set1_epi32 (int a) +{ + __m256i res; + res.lo = res.hi = vdupq_n_s32(a); + return res; +} + + + + +AVX2NEON_ABI +int _mm256_movemask_ps(const __m256& v) +{ + return (_mm_movemask_ps(v.hi) << 4) | _mm_movemask_ps(v.lo); +} + +template +AVX2NEON_ABI +__m256 __mm256_permute_ps (const 
__m256& a) +{ + __m256 res; + res.lo = _mm_shuffle_ps(a.lo,a.lo,imm8); + res.hi = _mm_shuffle_ps(a.hi,a.hi,imm8); + return res; + +} + +#define _mm256_permute_ps(a,c) __mm256_permute_ps(a) + + +template +AVX2NEON_ABI +__m256 __mm256_shuffle_ps (const __m256 a,const __m256& b) +{ + __m256 res; + res.lo = _mm_shuffle_ps(a.lo,b.lo,imm8); + res.hi = _mm_shuffle_ps(a.hi,b.hi,imm8); + return res; + +} + +#define _mm256_shuffle_ps(a,b,c) __mm256_shuffle_ps(a,b) + +AVX2NEON_ABI +__m256i _mm256_set1_epi64x (long long a) +{ + __m256i res; + int64x2_t t = vdupq_n_s64(a); + res.lo = res.hi = __m128i(t); + return res; +} + + +AVX2NEON_ABI +__m256 _mm256_permute2f128_ps (__m256 a, __m256 b, int imm8) +{ + __m256 res; + __m128 tmp; + switch (imm8 & 0x7) + { + case 0: tmp = a.lo; break; + case 1: tmp = a.hi; break; + case 2: tmp = b.lo; break; + case 3: tmp = b.hi; break; + } + if (imm8 & 0x8) + tmp = _mm_setzero_ps(); + + + + res.lo = tmp; + imm8 >>= 4; + + switch (imm8 & 0x7) + { + case 0: tmp = a.lo; break; + case 1: tmp = a.hi; break; + case 2: tmp = b.lo; break; + case 3: tmp = b.hi; break; + } + if (imm8 & 0x8) + tmp = _mm_setzero_ps(); + + res.hi = tmp; + + return res; +} + +AVX2NEON_ABI +__m256 _mm256_moveldup_ps (__m256 a) +{ + __m256 res; + res.lo[0] = res.lo[1] = a.lo[0]; + res.lo[2] = res.lo[3] = a.lo[2]; + res.hi[0] = res.hi[1] = a.hi[0]; + res.hi[2] = res.hi[3] = a.hi[2]; + return res; + +} + +AVX2NEON_ABI +__m256 _mm256_movehdup_ps (__m256 a) +{ + __m256 res; + res.lo[0] = res.lo[1] = a.lo[1]; + res.lo[2] = res.lo[3] = a.lo[3]; + res.hi[0] = res.hi[1] = a.hi[1]; + res.hi[2] = res.hi[3] = a.hi[3]; + return res; +} + +AVX2NEON_ABI +__m256 _mm256_insertf128_ps (__m256 a, __m128 b, int imm8) +{ + __m256 res = a; + if (imm8 & 1) res.hi = b; + else res.lo = b; + return res; +} + + +AVX2NEON_ABI +__m128 _mm256_extractf128_ps (__m256 a, const int imm8) +{ + if (imm8 & 1) return a.hi; + return a.lo; +} + + +AVX2NEON_ABI +__m256d _mm256_movedup_pd (__m256d a) +{ + __m256d res; + res.hi = a.hi; + res.lo[0] = res.lo[1] = a.lo[0]; + return res; +} + +AVX2NEON_ABI +__m256i _mm256_abs_epi32(__m256i a) +{ + __m256i res; + res.lo = vabsq_s32(a.lo); + res.hi = vabsq_s32(a.hi); + return res; +} + +UNARY_AVX_OP(__m256,_mm256_sqrt_ps,_mm_sqrt_ps) +UNARY_AVX_OP(__m256,_mm256_rsqrt_ps,_mm_rsqrt_ps) +UNARY_AVX_OP(__m256,_mm256_rcp_ps,_mm_rcp_ps) +UNARY_AVX_OP(__m256,_mm256_floor_ps,vrndmq_f32) +UNARY_AVX_OP(__m256,_mm256_ceil_ps,vrndpq_f32) + + +BINARY_AVX_OP(__m256i,_mm256_add_epi32,_mm_add_epi32) +BINARY_AVX_OP(__m256i,_mm256_sub_epi32,_mm_sub_epi32) +BINARY_AVX_OP(__m256i,_mm256_mullo_epi32,_mm_mullo_epi32) + +BINARY_AVX_OP(__m256i,_mm256_min_epi32,_mm_min_epi32) +BINARY_AVX_OP(__m256i,_mm256_max_epi32,_mm_max_epi32) +BINARY_AVX_OP_CAST(__m256i,_mm256_min_epu32,vminq_u32,__m128i,uint32x4_t) +BINARY_AVX_OP_CAST(__m256i,_mm256_max_epu32,vmaxq_u32,__m128i,uint32x4_t) + +BINARY_AVX_OP(__m256,_mm256_min_ps,_mm_min_ps) +BINARY_AVX_OP(__m256,_mm256_max_ps,_mm_max_ps) + +BINARY_AVX_OP(__m256,_mm256_add_ps,_mm_add_ps) +BINARY_AVX_OP(__m256,_mm256_mul_ps,_mm_mul_ps) +BINARY_AVX_OP(__m256,_mm256_sub_ps,_mm_sub_ps) +BINARY_AVX_OP(__m256,_mm256_div_ps,_mm_div_ps) + +BINARY_AVX_OP(__m256,_mm256_and_ps,_mm_and_ps) +BINARY_AVX_OP(__m256,_mm256_andnot_ps,_mm_andnot_ps) +BINARY_AVX_OP(__m256,_mm256_or_ps,_mm_or_ps) +BINARY_AVX_OP(__m256,_mm256_xor_ps,_mm_xor_ps) + +BINARY_AVX_OP_CAST(__m256d,_mm256_and_pd,vandq_s64,float64x2_t,int64x2_t) +BINARY_AVX_OP_CAST(__m256d,_mm256_or_pd,vorrq_s64,float64x2_t,int64x2_t) 
+BINARY_AVX_OP_CAST(__m256d,_mm256_xor_pd,veorq_s64,float64x2_t,int64x2_t) + + + +BINARY_AVX_OP(__m256i,_mm256_and_si256,_mm_and_si128) +BINARY_AVX_OP(__m256i,_mm256_or_si256,_mm_or_si128) +BINARY_AVX_OP(__m256i,_mm256_xor_si256,_mm_xor_si128) + + +BINARY_AVX_OP(__m256,_mm256_unpackhi_ps,_mm_unpackhi_ps) +BINARY_AVX_OP(__m256,_mm256_unpacklo_ps,_mm_unpacklo_ps) +TERNARY_AVX_OP(__m256,_mm256_blendv_ps,_mm_blendv_ps) + + +TERNARY_AVX_OP(__m256,_mm256_fmadd_ps,_mm_fmadd_ps) +TERNARY_AVX_OP(__m256,_mm256_fnmadd_ps,_mm_fnmadd_ps) +TERNARY_AVX_OP(__m256,_mm256_fmsub_ps,_mm_fmsub_ps) +TERNARY_AVX_OP(__m256,_mm256_fnmsub_ps,_mm_fnmsub_ps) + + +BINARY_AVX_OP(__m256i,_mm256_unpackhi_epi32,_mm_unpackhi_epi32) +BINARY_AVX_OP(__m256i,_mm256_unpacklo_epi32,_mm_unpacklo_epi32) + + +BINARY_AVX_OP(__m256i,_mm256_cmpeq_epi32,_mm_cmpeq_epi32) +BINARY_AVX_OP(__m256i,_mm256_cmpgt_epi32,_mm_cmpgt_epi32) +BINARY_AVX_OP(__m256,_mm256_cmpeq_ps,_mm_cmpeq_ps) +BINARY_AVX_OP(__m256,_mm256_cmpneq_ps,_mm_cmpneq_ps) +BINARY_AVX_OP(__m256,_mm256_cmpnlt_ps,_mm_cmpnlt_ps) +BINARY_AVX_OP(__m256,_mm256_cmpngt_ps,_mm_cmpngt_ps) +BINARY_AVX_OP(__m256,_mm256_cmpge_ps,_mm_cmpge_ps) +BINARY_AVX_OP(__m256,_mm256_cmpnge_ps,_mm_cmpnge_ps) +BINARY_AVX_OP(__m256,_mm256_cmplt_ps,_mm_cmplt_ps) +BINARY_AVX_OP(__m256,_mm256_cmple_ps,_mm_cmple_ps) +BINARY_AVX_OP(__m256,_mm256_cmpgt_ps,_mm_cmpgt_ps) +BINARY_AVX_OP(__m256,_mm256_cmpnle_ps,_mm_cmpnle_ps) + + +AVX2NEON_ABI +__m256i _mm256_cvtps_epi32 (__m256 a) +{ + __m256i res; + res.lo = _mm_cvtps_epi32(a.lo); + res.hi = _mm_cvtps_epi32(a.hi); + return res; + +} + +AVX2NEON_ABI +__m256i _mm256_cvttps_epi32 (__m256 a) +{ + __m256i res; + res.lo = _mm_cvttps_epi32(a.lo); + res.hi = _mm_cvttps_epi32(a.hi); + return res; + +} + +AVX2NEON_ABI +__m256 _mm256_loadu_ps (float const * mem_addr) +{ + __m256 res; + res.lo = *(__m128 *)(mem_addr + 0); + res.hi = *(__m128 *)(mem_addr + 4); + return res; +} +#define _mm256_load_ps _mm256_loadu_ps + + +AVX2NEON_ABI +int _mm256_testz_ps (const __m256& a, const __m256& b) +{ + __m256 t = a; + if (&a != &b) + t = _mm256_and_ps(a,b); + + __m128i l = vshrq_n_s32(__m128i(t.lo),31); + __m128i h = vshrq_n_s32(__m128i(t.hi),31); + return vaddvq_s32(vaddq_s32(l,h)) == 0; +} + + +AVX2NEON_ABI +__m256i _mm256_set_epi64x (int64_t e3, int64_t e2, int64_t e1, int64_t e0) +{ + __m256i res; + int64x2_t t0 = {e0,e1}; + int64x2_t t1 = {e2,e3}; + res.lo = __m128i(t0); + res.hi = __m128i(t1); + return res; +} + +AVX2NEON_ABI +__m256d _mm256_setzero_pd () +{ + __m256d res; + res.lo = res.hi = vdupq_n_f64(0); + return res; +} + +AVX2NEON_ABI +int _mm256_movemask_pd (__m256d a) +{ + int res = 0; + uint64x2_t x; + x = uint64x2_t(a.lo); + res |= (x[0] >> 63) ? 1 : 0; + res |= (x[0] >> 63) ? 2 : 0; + x = uint64x2_t(a.hi); + res |= (x[0] >> 63) ? 4 : 0; + res |= (x[0] >> 63) ? 
8 : 0; + return res; +} + +AVX2NEON_ABI +__m256i _mm256_cmpeq_epi64 (__m256i a, __m256i b) +{ + __m256i res; + res.lo = __m128i(vceqq_s64(int64x2_t(a.lo),int64x2_t(b.lo))); + res.hi = __m128i(vceqq_s64(int64x2_t(a.hi),int64x2_t(b.hi))); + return res; +} + +AVX2NEON_ABI +__m256i _mm256_cmpeq_pd (__m256d a, __m256d b) +{ + __m256i res; + res.lo = __m128i(vceqq_f64(a.lo,b.lo)); + res.hi = __m128i(vceqq_f64(a.hi,b.hi)); + return res; +} + + +AVX2NEON_ABI +int _mm256_testz_pd (const __m256d& a, const __m256d& b) +{ + __m256d t = a; + + if (&a != &b) + t = _mm256_and_pd(a,b); + + return _mm256_movemask_pd(t) == 0; +} + +AVX2NEON_ABI +__m256d _mm256_blendv_pd (__m256d a, __m256d b, __m256d mask) +{ + __m256d res; + uint64x2_t t = uint64x2_t(mask.lo); + res.lo[0] = (t[0] >> 63) ? b.lo[0] : a.lo[0]; + res.lo[1] = (t[1] >> 63) ? b.lo[1] : a.lo[1]; + t = uint64x2_t(mask.hi); + res.hi[0] = (t[0] >> 63) ? b.hi[0] : a.hi[0]; + res.hi[1] = (t[1] >> 63) ? b.hi[1] : a.hi[1]; + return res; +} + +template +__m256 __mm256_dp_ps (__m256 a, __m256 b) +{ + __m256 res; + res.lo = _mm_dp_ps(a.lo,b.lo,imm8); + res.hi = _mm_dp_ps(a.hi,b.hi,imm8); + return res; +} + +#define _mm256_dp_ps(a,b,c) __mm256_dp_ps(a,b) + +AVX2NEON_ABI +double _mm256_permute4x64_pd_select(__m256d a, const int imm8) +{ + switch (imm8 & 3) { + case 0: + return a.lo[0]; + case 1: + return a.lo[1]; + case 2: + return a.hi[0]; + case 3: + return a.hi[1]; + } + __builtin_unreachable(); + return 0; +} + +AVX2NEON_ABI +__m256d _mm256_permute4x64_pd (__m256d a, const int imm8) +{ + __m256d res; + res.lo[0] = _mm256_permute4x64_pd_select(a,imm8 >> 0); + res.lo[1] = _mm256_permute4x64_pd_select(a,imm8 >> 2); + res.hi[0] = _mm256_permute4x64_pd_select(a,imm8 >> 4); + res.hi[1] = _mm256_permute4x64_pd_select(a,imm8 >> 6); + + return res; +} + +AVX2NEON_ABI +__m256i _mm256_insertf128_si256 (__m256i a, __m128i b, int imm8) +{ + return __m256i(_mm256_insertf128_ps((__m256)a,(__m128)b,imm8)); +} + + +AVX2NEON_ABI +__m256i _mm256_loadu_si256 (__m256i const * mem_addr) +{ + __m256i res; + res.lo = *(__m128i *)((int32_t *)mem_addr + 0); + res.hi = *(__m128i *)((int32_t *)mem_addr + 4); + return res; +} + +#define _mm256_load_si256 _mm256_loadu_si256 + +AVX2NEON_ABI +void _mm256_storeu_ps (float * mem_addr, __m256 a) +{ + *(__m128 *)(mem_addr + 0) = a.lo; + *(__m128 *)(mem_addr + 4) = a.hi; + +} + +#define _mm256_store_ps _mm256_storeu_ps +#define _mm256_stream_ps _mm256_storeu_ps + + +AVX2NEON_ABI +void _mm256_storeu_si256 (__m256i * mem_addr, __m256i a) +{ + *(__m128i *)((int *)mem_addr + 0) = a.lo; + *(__m128i *)((int *)mem_addr + 4) = a.hi; + +} + +#define _mm256_store_si256 _mm256_storeu_si256 + + + +AVX2NEON_ABI +__m256 _mm256_maskload_ps (float const * mem_addr, __m256i mask) +{ + __m256 res; + res.lo = _mm_maskload_ps(mem_addr,mask.lo); + res.hi = _mm_maskload_ps(mem_addr + 4,mask.hi); + return res; + +} + + +AVX2NEON_ABI +__m256i _mm256_cvtepu8_epi32 (__m128i a) +{ + __m256i res; + uint8x16_t x = uint8x16_t(a); + for (int i=0;i<4;i++) + { + res.lo[i] = x[i]; + res.hi[i] = x[i+4]; + } + return res; +} + + +AVX2NEON_ABI +__m256i _mm256_cvtepi8_epi32 (__m128i a) +{ + __m256i res; + int8x16_t x = int8x16_t(a); + for (int i=0;i<4;i++) + { + res.lo[i] = x[i]; + res.hi[i] = x[i+4]; + } + return res; +} + + +AVX2NEON_ABI +__m256i _mm256_cvtepu16_epi32 (__m128i a) +{ + __m256i res; + uint16x8_t x = uint16x8_t(a); + for (int i=0;i<4;i++) + { + res.lo[i] = x[i]; + res.hi[i] = x[i+4]; + } + return res; +} + +AVX2NEON_ABI +__m256i _mm256_cvtepi16_epi32 (__m128i a) 
+{ + __m256i res; + int16x8_t x = int16x8_t(a); + for (int i=0;i<4;i++) + { + res.lo[i] = x[i]; + res.hi[i] = x[i+4]; + } + return res; +} + + + +AVX2NEON_ABI +void _mm256_maskstore_epi32 (int* mem_addr, __m256i mask, __m256i a) +{ + _mm_maskstore_epi32(mem_addr,mask.lo,a.lo); + _mm_maskstore_epi32(mem_addr + 4,mask.hi,a.hi); +} + +AVX2NEON_ABI +__m256i _mm256_slli_epi32 (__m256i a, int imm8) +{ + __m256i res; + res.lo = _mm_slli_epi32(a.lo,imm8); + res.hi = _mm_slli_epi32(a.hi,imm8); + return res; +} + + +AVX2NEON_ABI +__m256i _mm256_srli_epi32 (__m256i a, int imm8) +{ + __m256i res; + res.lo = _mm_srli_epi32(a.lo,imm8); + res.hi = _mm_srli_epi32(a.hi,imm8); + return res; +} + +AVX2NEON_ABI +__m256i _mm256_srai_epi32 (__m256i a, int imm8) +{ + __m256i res; + res.lo = _mm_srai_epi32(a.lo,imm8); + res.hi = _mm_srai_epi32(a.hi,imm8); + return res; +} + + +AVX2NEON_ABI +__m256i _mm256_sllv_epi32 (__m256i a, __m256i count) +{ + __m256i res; + res.lo = vshlq_s32(a.lo,count.lo); + res.hi = vshlq_s32(a.hi,count.hi); + return res; + +} + + +AVX2NEON_ABI +__m256i _mm256_srav_epi32 (__m256i a, __m256i count) +{ + __m256i res; + res.lo = vshlq_s32(a.lo,vnegq_s32(count.lo)); + res.hi = vshlq_s32(a.hi,vnegq_s32(count.hi)); + return res; + +} + +AVX2NEON_ABI +__m256i _mm256_srlv_epi32 (__m256i a, __m256i count) +{ + __m256i res; + res.lo = __m128i(vshlq_u32(uint32x4_t(a.lo),vnegq_s32(count.lo))); + res.hi = __m128i(vshlq_u32(uint32x4_t(a.hi),vnegq_s32(count.hi))); + return res; + +} + + +AVX2NEON_ABI +__m256i _mm256_permute2f128_si256 (__m256i a, __m256i b, int imm8) +{ + return __m256i(_mm256_permute2f128_ps(__m256(a),__m256(b),imm8)); +} + + +AVX2NEON_ABI +__m128i _mm256_extractf128_si256 (__m256i a, const int imm8) +{ + if (imm8 & 1) return a.hi; + return a.lo; +} + +AVX2NEON_ABI +__m256 _mm256_set1_ps(float x) +{ + __m256 res; + res.lo = res.hi = vdupq_n_f32(x); + return res; +} + +AVX2NEON_ABI +__m256 _mm256_set_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0) +{ + __m256 res; + res.lo = _mm_set_ps(e3,e2,e1,e0); + res.hi = _mm_set_ps(e7,e6,e5,e4); + return res; +} + +AVX2NEON_ABI +__m256 _mm256_broadcast_ps (__m128 const * mem_addr) +{ + __m256 res; + res.lo = res.hi = *mem_addr; + return res; +} + +AVX2NEON_ABI +__m256 _mm256_cvtepi32_ps (__m256i a) +{ + __m256 res; + res.lo = _mm_cvtepi32_ps(a.lo); + res.hi = _mm_cvtepi32_ps(a.hi); + return res; +} +AVX2NEON_ABI +void _mm256_maskstore_ps (float * mem_addr, __m256i mask, __m256 a) +{ + for (int i=0;i<4;i++) { + if (mask.lo[i] & 0x80000000) mem_addr[i] = a.lo[i]; + if (mask.hi[i] & 0x80000000) mem_addr[i+4] = a.hi[i]; + } +} + +AVX2NEON_ABI +__m256d _mm256_andnot_pd (__m256d a, __m256d b) +{ + __m256d res; + res.lo = float64x2_t(_mm_andnot_ps(__m128(a.lo),__m128(b.lo))); + res.hi = float64x2_t(_mm_andnot_ps(__m128(a.hi),__m128(b.hi))); + return res; +} + +AVX2NEON_ABI +__m256 _mm256_blend_ps (__m256 a, __m256 b, const int imm8) +{ + __m256 res; + res.lo = _mm_blend_ps(a.lo,b.lo,imm8 & 0xf); + res.hi = _mm_blend_ps(a.hi,b.hi,imm8 >> 4); + return res; + +} + + +AVX2NEON_ABI +__m256i _mm256_blend_epi32 (__m256i a, __m256i b, const int imm8) +{ + __m256i res; + res.lo = _mm_blend_epi32(a.lo,b.lo,imm8 & 0xf); + res.hi = _mm_blend_epi32(a.hi,b.hi,imm8 >> 4); + return res; + +} + +AVX2NEON_ABI +__m256i _mm256_i32gather_epi32 (int const* base_addr, __m256i vindex, const int scale) +{ + __m256i res; + for (int i=0;i<4;i++) + { + res.lo[i] = *(int *)((char *) base_addr + (vindex.lo[i]*scale)); + res.hi[i] = *(int *)((char 
*) base_addr + (vindex.hi[i]*scale)); + } + return res; +} + + +AVX2NEON_ABI +__m256i _mm256_mask_i32gather_epi32 (__m256i src, int const* base_addr, __m256i vindex, __m256i mask, const int scale) +{ + __m256i res = _mm256_setzero_si256(); + for (int i=0;i<4;i++) + { + if (mask.lo[i] >> 31) res.lo[i] = *(int *)((char *) base_addr + (vindex.lo[i]*scale)); + if (mask.hi[i] >> 31) res.hi[i] = *(int *)((char *) base_addr + (vindex.hi[i]*scale)); + } + + return res; + +} + + diff --git a/thirdparty/embree/common/math/SSE2NEON.h b/thirdparty/embree/common/math/SSE2NEON.h new file mode 100644 index 000000000000..2013151d31fa --- /dev/null +++ b/thirdparty/embree/common/math/SSE2NEON.h @@ -0,0 +1,1753 @@ +#ifndef SSE2NEON_H +#define SSE2NEON_H + +// This header file provides a simple API translation layer +// between SSE intrinsics to their corresponding ARM NEON versions +// +// This header file does not (yet) translate *all* of the SSE intrinsics. +// Since this is in support of a specific porting effort, I have only +// included the intrinsics I needed to get my port to work. +// +// Questions/Comments/Feedback send to: jratcliffscarab@gmail.com +// +// If you want to improve or add to this project, send me an +// email and I will probably approve your access to the depot. +// +// Project is located here: +// +// https://github.com/jratcliff63367/sse2neon +// +// Show your appreciation for open source by sending me a bitcoin tip to the following +// address. +// +// TipJar: 1PzgWDSyq4pmdAXRH8SPUtta4SWGrt4B1p : +// https://blockchain.info/address/1PzgWDSyq4pmdAXRH8SPUtta4SWGrt4B1p +// +// +// Contributors to this project are: +// +// John W. Ratcliff : jratcliffscarab@gmail.com +// Brandon Rowlett : browlett@nvidia.com +// Ken Fast : kfast@gdeb.com +// Eric van Beurden : evanbeurden@nvidia.com +// +// +// ********************************************************************************************************************* +// Release notes for January 20, 2017 version: +// +// The unit tests have been refactored. They no longer assert on an error, instead they return a pass/fail condition +// The unit-tests now test 10,000 random float and int values against each intrinsic. +// +// SSE2NEON now supports 95 SSE intrinsics. 39 of them have formal unit tests which have been implemented and +// fully tested on NEON/ARM. The remaining 56 still need unit tests implemented. +// +// A struct is now defined in this header file called 'SIMDVec' which can be used by applications which +// attempt to access the contents of an _m128 struct directly. It is important to note that accessing the __m128 +// struct directly is bad coding practice by Microsoft: @see: https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx +// +// However, some legacy source code may try to access the contents of an __m128 struct directly so the developer +// can use the SIMDVec as an alias for it. Any casting must be done manually by the developer, as you cannot +// cast or otherwise alias the base NEON data type for intrinsic operations. +// +// A bug was found with the _mm_shuffle_ps intrinsic. If the shuffle permutation was not one of the ones with +// a custom/unique implementation causing it to fall through to the default shuffle implementation it was failing +// to return the correct value. This is now fixed. +// +// A bug was found with the _mm_cvtps_epi32 intrinsic. This converts floating point values to integers. +// It was not honoring the correct rounding mode. 
In SSE the default rounding mode when converting from float to int +// is to use 'round to even' otherwise known as 'bankers rounding'. ARMv7 did not support this feature but ARMv8 does. +// As it stands today, this header file assumes ARMv8. If you are trying to target really old ARM devices, you may get +// a build error. +// +// Support for a number of new intrinsics was added, however, none of them yet have unit-tests to 100% confirm they are +// producing the correct results on NEON. These unit tests will be added as soon as possible. +// +// Here is the list of new instrinsics which have been added: +// +// _mm_cvtss_f32 : extracts the lower order floating point value from the parameter +// _mm_add_ss : adds the scalar single - precision floating point values of a and b +// _mm_div_ps : Divides the four single - precision, floating - point values of a and b. +// _mm_div_ss : Divides the scalar single - precision floating point value of a by b. +// _mm_sqrt_ss : Computes the approximation of the square root of the scalar single - precision floating point value of in. +// _mm_rsqrt_ps : Computes the approximations of the reciprocal square roots of the four single - precision floating point values of in. +// _mm_comilt_ss : Compares the lower single - precision floating point scalar values of a and b using a less than operation +// _mm_comigt_ss : Compares the lower single - precision floating point scalar values of a and b using a greater than operation. +// _mm_comile_ss : Compares the lower single - precision floating point scalar values of a and b using a less than or equal operation. +// _mm_comige_ss : Compares the lower single - precision floating point scalar values of a and b using a greater than or equal operation. +// _mm_comieq_ss : Compares the lower single - precision floating point scalar values of a and b using an equality operation. +// _mm_comineq_s : Compares the lower single - precision floating point scalar values of a and b using an inequality operation +// _mm_unpackhi_epi8 : Interleaves the upper 8 signed or unsigned 8 - bit integers in a with the upper 8 signed or unsigned 8 - bit integers in b. +// _mm_unpackhi_epi16: Interleaves the upper 4 signed or unsigned 16 - bit integers in a with the upper 4 signed or unsigned 16 - bit integers in b. +// +// ********************************************************************************************************************* +/* +** The MIT license: +** +** Permission is hereby granted, free of charge, to any person obtaining a copy +** of this software and associated documentation files (the "Software"), to deal +** in the Software without restriction, including without limitation the rights +** to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +** copies of the Software, and to permit persons to whom the Software is furnished +** to do so, subject to the following conditions: +** +** The above copyright notice and this permission notice shall be included in all +** copies or substantial portions of the Software. + +** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +** AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +** WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +** CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +#pragma once + +#define GCC 1 +#define ENABLE_CPP_VERSION 0 + +// enable precise emulation of _mm_min_ps and _mm_max_ps? +// This would slow down the computation a bit, but gives consistent result with x86 SSE2. +// (e.g. would solve a hole or NaN pixel in the rendering result) +#define USE_PRECISE_MINMAX_IMPLEMENTATION (1) + +#if GCC +#define FORCE_INLINE inline __attribute__((always_inline)) +#define ALIGN_STRUCT(x) __attribute__((aligned(x))) +#else +#define FORCE_INLINE inline +#define ALIGN_STRUCT(x) __declspec(align(x)) +#endif + +#include +#include "arm_neon.h" +#if defined(__aarch64__) +#include "constants.h" +#endif + + +#if !defined(__has_builtin) +#define __has_builtin(x) (0) +#endif + +/*******************************************************/ +/* MACRO for shuffle parameter for _mm_shuffle_ps(). */ +/* Argument fp3 is a digit[0123] that represents the fp*/ +/* from argument "b" of mm_shuffle_ps that will be */ +/* placed in fp3 of result. fp2 is the same for fp2 in */ +/* result. fp1 is a digit[0123] that represents the fp */ +/* from argument "a" of mm_shuffle_ps that will be */ +/* places in fp1 of result. fp0 is the same for fp0 of */ +/* result */ +/*******************************************************/ +#if defined(__aarch64__) +#define _MN_SHUFFLE(fp3,fp2,fp1,fp0) ( (uint8x16_t){ (((fp3)*4)+0), (((fp3)*4)+1), (((fp3)*4)+2), (((fp3)*4)+3), (((fp2)*4)+0), (((fp2)*4)+1), (((fp2)*4)+2), (((fp2)*4)+3), (((fp1)*4)+0), (((fp1)*4)+1), (((fp1)*4)+2), (((fp1)*4)+3), (((fp0)*4)+0), (((fp0)*4)+1), (((fp0)*4)+2), (((fp0)*4)+3) } ) +#define _MF_SHUFFLE(fp3,fp2,fp1,fp0) ( (uint8x16_t){ (((fp3)*4)+0), (((fp3)*4)+1), (((fp3)*4)+2), (((fp3)*4)+3), (((fp2)*4)+0), (((fp2)*4)+1), (((fp2)*4)+2), (((fp2)*4)+3), (((fp1)*4)+16+0), (((fp1)*4)+16+1), (((fp1)*4)+16+2), (((fp1)*4)+16+3), (((fp0)*4)+16+0), (((fp0)*4)+16+1), (((fp0)*4)+16+2), (((fp0)*4)+16+3) } ) +#endif + +#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) (((fp3) << 6) | ((fp2) << 4) | \ + ((fp1) << 2) | ((fp0))) + +typedef float32x4_t __m128; +typedef int32x4_t __m128i; + +// union intended to allow direct access to an __m128 variable using the names that the MSVC +// compiler provides. This union should really only be used when trying to access the members +// of the vector as integer values. GCC/clang allow native access to the float members through +// a simple array access operator (in C since 4.6, in C++ since 4.8). +// +// Ideally direct accesses to SIMD vectors should not be used since it can cause a performance +// hit. If it really is needed however, the original __m128 variable can be aliased with a +// pointer to this union and used to access individual components. The use of this union should +// be hidden behind a macro that is used throughout the codebase to access the members instead +// of always declaring this type of variable. +typedef union ALIGN_STRUCT(16) SIMDVec +{ + float m128_f32[4]; // as floats - do not to use this. Added for convenience. + int8_t m128_i8[16]; // as signed 8-bit integers. + int16_t m128_i16[8]; // as signed 16-bit integers. + int32_t m128_i32[4]; // as signed 32-bit integers. + int64_t m128_i64[2]; // as signed 64-bit integers. + uint8_t m128_u8[16]; // as unsigned 8-bit integers. + uint16_t m128_u16[8]; // as unsigned 16-bit integers. + uint32_t m128_u32[4]; // as unsigned 32-bit integers. + uint64_t m128_u64[2]; // as unsigned 64-bit integers. 
+ double m128_f64[2]; // as signed double +} SIMDVec; + +// ****************************************** +// CPU stuff +// ****************************************** + +typedef SIMDVec __m128d; + +#include + +#ifndef _MM_MASK_MASK +#define _MM_MASK_MASK 0x1f80 +#define _MM_MASK_DIV_ZERO 0x200 +#define _MM_FLUSH_ZERO_ON 0x8000 +#define _MM_DENORMALS_ZERO_ON 0x40 +#define _MM_MASK_DENORM 0x100 +#endif +#define _MM_SET_EXCEPTION_MASK(x) +#define _MM_SET_FLUSH_ZERO_MODE(x) +#define _MM_SET_DENORMALS_ZERO_MODE(x) + +FORCE_INLINE void _mm_pause() +{ +} + +FORCE_INLINE void _mm_mfence() +{ + __sync_synchronize(); +} + +#define _MM_HINT_T0 3 +#define _MM_HINT_T1 2 +#define _MM_HINT_T2 1 +#define _MM_HINT_NTA 0 + +FORCE_INLINE void _mm_prefetch(const void* ptr, unsigned int level) +{ + __builtin_prefetch(ptr); + +} + +FORCE_INLINE void* _mm_malloc(int size, int align) +{ + void *ptr; + // align must be multiple of sizeof(void *) for posix_memalign. + if (align < sizeof(void *)) { + align = sizeof(void *); + } + + if ((align % sizeof(void *)) != 0) { + // fallback to malloc + ptr = malloc(size); + } else { + if (posix_memalign(&ptr, align, size)) { + return 0; + } + } + + return ptr; +} + +FORCE_INLINE void _mm_free(void* ptr) +{ + free(ptr); +} + +FORCE_INLINE int _mm_getcsr() +{ + return 0; +} + +FORCE_INLINE void _mm_setcsr(int val) +{ + return; +} + +// ****************************************** +// Set/get methods +// ****************************************** + +// extracts the lower order floating point value from the parameter : https://msdn.microsoft.com/en-us/library/bb514059%28v=vs.120%29.aspx?f=255&MSPPError=-2147217396 +#if defined(__aarch64__) +FORCE_INLINE float _mm_cvtss_f32(const __m128& x) +{ + return x[0]; +} +#else +FORCE_INLINE float _mm_cvtss_f32(__m128 a) +{ + return vgetq_lane_f32(a, 0); +} +#endif + +// Sets the 128-bit value to zero https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx +FORCE_INLINE __m128i _mm_setzero_si128() +{ + return vdupq_n_s32(0); +} + +// Clears the four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx +FORCE_INLINE __m128 _mm_setzero_ps(void) +{ + return vdupq_n_f32(0); +} + +// Sets the four single-precision, floating-point values to w. https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx +FORCE_INLINE __m128 _mm_set1_ps(float _w) +{ + return vdupq_n_f32(_w); +} + +// Sets the four single-precision, floating-point values to w. https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx +FORCE_INLINE __m128 _mm_set_ps1(float _w) +{ + return vdupq_n_f32(_w); +} + +// Sets the four single-precision, floating-point values to the four inputs. https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx +#if defined(__aarch64__) +FORCE_INLINE __m128 _mm_set_ps(const float w, const float z, const float y, const float x) +{ + float32x4_t t = { x, y, z, w }; + return t; +} + +// Sets the four single-precision, floating-point values to the four inputs in reverse order. 
https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx +FORCE_INLINE __m128 _mm_setr_ps(const float w, const float z , const float y , const float x ) +{ + float32x4_t t = { w, z, y, x }; + return t; +} +#else +FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x) +{ + float __attribute__((aligned(16))) data[4] = { x, y, z, w }; + return vld1q_f32(data); +} + +// Sets the four single-precision, floating-point values to the four inputs in reverse order. https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx +FORCE_INLINE __m128 _mm_setr_ps(float w, float z , float y , float x ) +{ + float __attribute__ ((aligned (16))) data[4] = { w, z, y, x }; + return vld1q_f32(data); +} +#endif + +// Sets the 4 signed 32-bit integer values to i. https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx +FORCE_INLINE __m128i _mm_set1_epi32(int _i) +{ + return vdupq_n_s32(_i); +} + +//Set the first lane to of 4 signed single-position, floating-point number to w +#if defined(__aarch64__) +FORCE_INLINE __m128 _mm_set_ss(float _w) +{ + float32x4_t res = {_w, 0, 0, 0}; + return res; +} + +// Sets the 4 signed 32-bit integer values. https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx +FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0) +{ + int32x4_t t = {i0,i1,i2,i3}; + return t; +} +#else +FORCE_INLINE __m128 _mm_set_ss(float _w) +{ + __m128 val = _mm_setzero_ps(); + return vsetq_lane_f32(_w, val, 0); +} + +// Sets the 4 signed 32-bit integer values. https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx +FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0) +{ + int32_t __attribute__((aligned(16))) data[4] = { i0, i1, i2, i3 }; + return vld1q_s32(data); +} +#endif + +// Stores four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx +FORCE_INLINE void _mm_store_ps(float *p, __m128 a) +{ + vst1q_f32(p, a); +} + +// Stores four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx +FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a) +{ + vst1q_f32(p, a); +} + +FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a) +{ + vst1q_s32((int32_t*) p,a); +} + +// Stores four 32-bit integer values as (as a __m128i value) at the address p. https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx +FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a ) +{ + vst1q_s32((int32_t*) p,a); +} + +// Stores the lower single - precision, floating - point value. https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx +FORCE_INLINE void _mm_store_ss(float *p, __m128 a) +{ + vst1q_lane_f32(p, a, 0); +} + +// Reads the lower 64 bits of b and stores them into the lower 64 bits of a. https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx +FORCE_INLINE void _mm_storel_epi64(__m128i* a, __m128i b) +{ + *a = (__m128i)vsetq_lane_s64((int64_t)vget_low_s32(b), *(int64x2_t*)a, 0); +} + +// Loads a single single-precision, floating-point value, copying it into all four words https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx +FORCE_INLINE __m128 _mm_load1_ps(const float * p) +{ + return vld1q_dup_f32(p); +} + +// Loads four single-precision, floating-point values. 
https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx +FORCE_INLINE __m128 _mm_load_ps(const float * p) +{ + return vld1q_f32(p); +} + +// Loads four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx +FORCE_INLINE __m128 _mm_loadu_ps(const float * p) +{ + // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are equivalent for neon + return vld1q_f32(p); +} + +// Loads an single - precision, floating - point value into the low word and clears the upper three words. https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx +FORCE_INLINE __m128 _mm_load_ss(const float * p) +{ + __m128 result = vdupq_n_f32(0); + return vsetq_lane_f32(*p, result, 0); +} + +FORCE_INLINE __m128i _mm_loadu_si128(__m128i *p) +{ + return (__m128i)vld1q_s32((const int32_t*) p); +} + + +// ****************************************** +// Logic/Binary operations +// ****************************************** + +// Compares for inequality. https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b) +{ + return (__m128)vmvnq_s32((__m128i)vceqq_f32(a, b)); +} + +// Computes the bitwise AND-NOT of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx +FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b) +{ + return (__m128)vbicq_s32((__m128i)b, (__m128i)a); // *NOTE* argument swap +} + +// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the 128-bit value in a. https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx +FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b) +{ + return (__m128i)vbicq_s32(b, a); // *NOTE* argument swap +} + +// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in b. https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx +FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b) +{ + return (__m128i)vandq_s32(a, b); +} + +// Computes the bitwise AND of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx +FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b) +{ + return (__m128)vandq_s32((__m128i)a, (__m128i)b); +} + +// Computes the bitwise OR of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx +FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b) +{ + return (__m128)vorrq_s32((__m128i)a, (__m128i)b); +} + +// Computes bitwise EXOR (exclusive-or) of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx +FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b) +{ + return (__m128)veorq_s32((__m128i)a, (__m128i)b); +} + +// Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b. https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx +FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b) +{ + return (__m128i)vorrq_s32(a, b); +} + +// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in b. 
https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx +FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b) +{ + return veorq_s32(a, b); +} + +// NEON does not provide this method +// Creates a 4-bit mask from the most significant bits of the four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx +FORCE_INLINE int _mm_movemask_ps(__m128 a) +{ +#if ENABLE_CPP_VERSION // I am not yet convinced that the NEON version is faster than the C version of this + uint32x4_t &ia = *(uint32x4_t *)&a; + return (ia[0] >> 31) | ((ia[1] >> 30) & 2) | ((ia[2] >> 29) & 4) | ((ia[3] >> 28) & 8); +#else + +#if defined(__aarch64__) + uint32x4_t t2 = vandq_u32(vreinterpretq_u32_f32(a), embree::movemask_mask); + return vaddvq_u32(t2); +#else + static const uint32x4_t movemask = { 1, 2, 4, 8 }; + static const uint32x4_t highbit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; + uint32x4_t t0 = vreinterpretq_u32_f32(a); + uint32x4_t t1 = vtstq_u32(t0, highbit); + uint32x4_t t2 = vandq_u32(t1, movemask); + uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2)); + return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1); +#endif + +#endif +} + +#if defined(__aarch64__) +FORCE_INLINE int _mm_movemask_popcnt_ps(__m128 a) +{ + uint32x4_t t2 = vandq_u32(vreinterpretq_u32_f32(a), embree::movemask_mask); + t2 = vreinterpretq_u32_u8(vcntq_u8(vreinterpretq_u8_u32(t2))); + return vaddvq_u32(t2); + +} +#endif + +// Takes the upper 64 bits of a and places it in the low end of the result +// Takes the lower 64 bits of b and places it into the high end of the result. +FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b) +{ + return vcombine_f32(vget_high_f32(a), vget_low_f32(b)); +} + +// takes the lower two 32-bit values from a and swaps them and places in high end of result +// takes the higher two 32 bit values from b and swaps them and places in low end of result. 
+FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b) +{ + return vcombine_f32(vrev64_f32(vget_low_f32(a)), vrev64_f32(vget_high_f32(b))); +} + +// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the high +FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b) +{ + return vcombine_f32(vget_low_f32(a), vget_high_f32(b)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b) +{ + return vcombine_f32(vdup_n_f32(vgetq_lane_f32(a, 1)), vdup_n_f32(vgetq_lane_f32(b, 0))); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b) +{ + return vcombine_f32(vdup_n_f32(vgetq_lane_f32(a, 2)), vdup_n_f32(vgetq_lane_f32(b, 0))); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b) +{ + return vcombine_f32(vdup_n_f32(vgetq_lane_f32(a, 0)), vdup_n_f32(vgetq_lane_f32(b, 2))); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b) +{ + float32_t a0 = vgetq_lane_f32(a, 0); + float32_t a2 = vgetq_lane_f32(a, 2); + float32x2_t aVal = vdup_n_f32(a2); + aVal = vset_lane_f32(a0, aVal, 1); + return vcombine_f32(aVal, vget_high_f32(b)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b) +{ + return vcombine_f32(vdup_n_f32(vgetq_lane_f32(a, 3)), vdup_n_f32(vgetq_lane_f32(b, 1))); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b) +{ + float32_t b0 = vgetq_lane_f32(b, 0); + float32_t b2 = vgetq_lane_f32(b, 2); + float32x2_t bVal = vdup_n_f32(b0); + bVal = vset_lane_f32(b2, bVal, 1); + return vcombine_f32(vget_low_f32(a), bVal); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b) +{ + float32_t b0 = vgetq_lane_f32(b, 0); + float32_t b2 = vgetq_lane_f32(b, 2); + float32x2_t bVal = vdup_n_f32(b0); + bVal = vset_lane_f32(b2, bVal, 1); + return vcombine_f32(vrev64_f32(vget_low_f32(a)), bVal); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b) +{ + float32_t b0 = vgetq_lane_f32(b, 0); + float32_t b2 = vgetq_lane_f32(b, 2); + float32x2_t bVal = vdup_n_f32(b0); + bVal = vset_lane_f32(b2, bVal, 1); + return vcombine_f32(vget_high_f32(a), bVal); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b) +{ + float32x2_t a21 = vget_high_f32(vextq_f32(a, a, 3)); + float32x2_t b03 = vget_low_f32(vextq_f32(b, b, 3)); + return vcombine_f32(a21, b03); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b) +{ + float32x2_t a03 = vget_low_f32(vextq_f32(a, a, 3)); + float32x2_t b21 = vget_high_f32(vextq_f32(b, b, 3)); + return vcombine_f32(a03, b21); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b) +{ + float32x2_t a10 = vget_low_f32(a); + float32x2_t b10 = vget_low_f32(b); + return vcombine_f32(a10, b10); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(a)); + float32x2_t b10 = vget_low_f32(b); + return vcombine_f32(a01, b10); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(a)); + float32x2_t b01 = vrev64_f32(vget_low_f32(b)); + return vcombine_f32(a01, b01); +} + +// NEON does not support a general purpose permute intrinsic +// Currently I am not sure whether the C implementation is faster or slower than the NEON version. +// Note, this has to be expanded as a template because the shuffle value must be an immediate value. +// The same is true on SSE as well. +// Selects four specific single-precision, floating-point values from a and b, based on the mask i. 
https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx +template +FORCE_INLINE __m128 _mm_shuffle_ps_default(const __m128& a, const __m128& b) +{ +#if ENABLE_CPP_VERSION // I am not convinced that the NEON version is faster than the C version yet. + __m128 ret; + ret[0] = a[i & 0x3]; + ret[1] = a[(i >> 2) & 0x3]; + ret[2] = b[(i >> 4) & 0x03]; + ret[3] = b[(i >> 6) & 0x03]; + return ret; +#else +# if __has_builtin(__builtin_shufflevector) + return __builtin_shufflevector( \ + a, b, (i) & (0x3), ((i) >> 2) & 0x3, + (((i) >> 4) & 0x3) + 4, (((i) >> 6) & 0x3) + 4); +# else + const int i0 = (i >> 0)&0x3; + const int i1 = (i >> 2)&0x3; + const int i2 = (i >> 4)&0x3; + const int i3 = (i >> 6)&0x3; + + if (&a == &b) + { + if (i0 == i1 && i0 == i2 && i0 == i3) + { + return (float32x4_t)vdupq_laneq_f32(a,i0); + } + static const uint8_t tbl[16] = { + (i0*4) + 0,(i0*4) + 1,(i0*4) + 2,(i0*4) + 3, + (i1*4) + 0,(i1*4) + 1,(i1*4) + 2,(i1*4) + 3, + (i2*4) + 0,(i2*4) + 1,(i2*4) + 2,(i2*4) + 3, + (i3*4) + 0,(i3*4) + 1,(i3*4) + 2,(i3*4) + 3 + }; + + return (float32x4_t)vqtbl1q_s8(int8x16_t(b),*(uint8x16_t *)tbl); + + } + else + { + + static const uint8_t tbl[16] = { + (i0*4) + 0,(i0*4) + 1,(i0*4) + 2,(i0*4) + 3, + (i1*4) + 0,(i1*4) + 1,(i1*4) + 2,(i1*4) + 3, + (i2*4) + 0 + 16,(i2*4) + 1 + 16,(i2*4) + 2 + 16,(i2*4) + 3 + 16, + (i3*4) + 0 + 16,(i3*4) + 1 + 16,(i3*4) + 2 + 16,(i3*4) + 3 + 16 + }; + + return float32x4_t(vqtbl2q_s8((int8x16x2_t){int8x16_t(a),int8x16_t(b)},*(uint8x16_t *)tbl)); + } +# endif //builtin(shufflevector) +#endif +} + +template +FORCE_INLINE __m128 _mm_shuffle_ps_function(const __m128& a, const __m128& b) +{ + switch (i) + { + case _MM_SHUFFLE(1, 0, 3, 2): + return _mm_shuffle_ps_1032(a, b); + break; + case _MM_SHUFFLE(2, 3, 0, 1): + return _mm_shuffle_ps_2301(a, b); + break; + case _MM_SHUFFLE(3, 2, 1, 0): + return _mm_shuffle_ps_3210(a, b); + break; + case _MM_SHUFFLE(0, 0, 1, 1): + return _mm_shuffle_ps_0011(a, b); + break; + case _MM_SHUFFLE(0, 0, 2, 2): + return _mm_shuffle_ps_0022(a, b); + break; + case _MM_SHUFFLE(2, 2, 0, 0): + return _mm_shuffle_ps_2200(a, b); + break; + case _MM_SHUFFLE(3, 2, 0, 2): + return _mm_shuffle_ps_3202(a, b); + break; + case _MM_SHUFFLE(1, 1, 3, 3): + return _mm_shuffle_ps_1133(a, b); + break; + case _MM_SHUFFLE(2, 0, 1, 0): + return _mm_shuffle_ps_2010(a, b); + break; + case _MM_SHUFFLE(2, 0, 0, 1): + return _mm_shuffle_ps_2001(a, b); + break; + case _MM_SHUFFLE(2, 0, 3, 2): + return _mm_shuffle_ps_2032(a, b); + break; + case _MM_SHUFFLE(0, 3, 2, 1): + return _mm_shuffle_ps_0321(a, b); + break; + case _MM_SHUFFLE(2, 1, 0, 3): + return _mm_shuffle_ps_2103(a, b); + break; + case _MM_SHUFFLE(1, 0, 1, 0): + return _mm_shuffle_ps_1010(a, b); + break; + case _MM_SHUFFLE(1, 0, 0, 1): + return _mm_shuffle_ps_1001(a, b); + break; + case _MM_SHUFFLE(0, 1, 0, 1): + return _mm_shuffle_ps_0101(a, b); + break; + } + return _mm_shuffle_ps_default(a, b); +} + +# if __has_builtin(__builtin_shufflevector) +#define _mm_shuffle_ps(a,b,i) _mm_shuffle_ps_default(a,b) +# else +#define _mm_shuffle_ps(a,b,i) _mm_shuffle_ps_function(a,b) +#endif + +// Takes the upper 64 bits of a and places it in the low end of the result +// Takes the lower 64 bits of b and places it into the high end of the result. 
+FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a, __m128i b) +{ + return vcombine_s32(vget_high_s32(a), vget_low_s32(b)); +} + +// takes the lower two 32-bit values from a and swaps them and places in low end of result +// takes the higher two 32 bit values from b and swaps them and places in high end of result. +FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a, __m128i b) +{ + return vcombine_s32(vrev64_s32(vget_low_s32(a)), vrev64_s32(vget_high_s32(b))); +} + +// shift a right by 32 bits, and put the lower 32 bits of a into the upper 32 bits of b +// when a and b are the same, rotates the least significant 32 bits into the most signficant 32 bits, and shifts the rest down +FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a, __m128i b) +{ + return vextq_s32(a, b, 1); +} + +// shift a left by 32 bits, and put the upper 32 bits of b into the lower 32 bits of a +// when a and b are the same, rotates the most significant 32 bits into the least signficant 32 bits, and shifts the rest up +FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a, __m128i b) +{ + return vextq_s32(a, b, 3); +} + +// gets the lower 64 bits of a, and places it in the upper 64 bits +// gets the lower 64 bits of b and places it in the lower 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a, __m128i b) +{ + return vcombine_s32(vget_low_s32(a), vget_low_s32(a)); +} + +// gets the lower 64 bits of a, and places it in the upper 64 bits +// gets the lower 64 bits of b, swaps the 0 and 1 elements, and places it in the lower 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a, __m128i b) +{ + return vcombine_s32(vrev64_s32(vget_low_s32(a)), vget_low_s32(b)); +} + +// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the upper 64 bits +// gets the lower 64 bits of b, swaps the 0 and 1 elements, and places it in the lower 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a, __m128i b) +{ + return vcombine_s32(vrev64_s32(vget_low_s32(a)), vrev64_s32(vget_low_s32(b))); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a, __m128i b) +{ + return vcombine_s32(vdup_n_s32(vgetq_lane_s32(a, 1)), vdup_n_s32(vgetq_lane_s32(b, 2))); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a, __m128i b) +{ + return vcombine_s32(vdup_n_s32(vgetq_lane_s32(a, 2)), vrev64_s32(vget_low_s32(b))); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a, __m128i b) +{ + return vcombine_s32(vget_high_s32(a), vdup_n_s32(vgetq_lane_s32(b, 3))); +} + +template +FORCE_INLINE __m128i _mm_shuffle_epi32_default(__m128i a, __m128i b) +{ +#if ENABLE_CPP_VERSION + __m128i ret; + ret[0] = a[i & 0x3]; + ret[1] = a[(i >> 2) & 0x3]; + ret[2] = b[(i >> 4) & 0x03]; + ret[3] = b[(i >> 6) & 0x03]; + return ret; +#else + __m128i ret = vmovq_n_s32(vgetq_lane_s32(a, i & 0x3)); + ret = vsetq_lane_s32(vgetq_lane_s32(a, (i >> 2) & 0x3), ret, 1); + ret = vsetq_lane_s32(vgetq_lane_s32(b, (i >> 4) & 0x3), ret, 2); + ret = vsetq_lane_s32(vgetq_lane_s32(b, (i >> 6) & 0x3), ret, 3); + return ret; +#endif +} + +template +FORCE_INLINE __m128i _mm_shuffle_epi32_function(__m128i a, __m128i b) +{ + switch (i) + { + case _MM_SHUFFLE(1, 0, 3, 2): return _mm_shuffle_epi_1032(a, b); break; + case _MM_SHUFFLE(2, 3, 0, 1): return _mm_shuffle_epi_2301(a, b); break; + case _MM_SHUFFLE(0, 3, 2, 1): return _mm_shuffle_epi_0321(a, b); break; + case _MM_SHUFFLE(2, 1, 0, 3): return _mm_shuffle_epi_2103(a, b); break; + case _MM_SHUFFLE(1, 0, 1, 0): return _mm_shuffle_epi_1010(a, b); break; + case _MM_SHUFFLE(1, 0, 0, 1): return 
_mm_shuffle_epi_1001(a, b); break; + case _MM_SHUFFLE(0, 1, 0, 1): return _mm_shuffle_epi_0101(a, b); break; + case _MM_SHUFFLE(2, 2, 1, 1): return _mm_shuffle_epi_2211(a, b); break; + case _MM_SHUFFLE(0, 1, 2, 2): return _mm_shuffle_epi_0122(a, b); break; + case _MM_SHUFFLE(3, 3, 3, 2): return _mm_shuffle_epi_3332(a, b); break; + default: return _mm_shuffle_epi32_default(a, b); + } +} + +template +FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a) +{ + return vdupq_n_s32(vgetq_lane_s32(a, i)); +} + +template +FORCE_INLINE __m128i _mm_shuffle_epi32_single(__m128i a) +{ + switch (i) + { + case _MM_SHUFFLE(0, 0, 0, 0): return _mm_shuffle_epi32_splat<0>(a); break; + case _MM_SHUFFLE(1, 1, 1, 1): return _mm_shuffle_epi32_splat<1>(a); break; + case _MM_SHUFFLE(2, 2, 2, 2): return _mm_shuffle_epi32_splat<2>(a); break; + case _MM_SHUFFLE(3, 3, 3, 3): return _mm_shuffle_epi32_splat<3>(a); break; + default: return _mm_shuffle_epi32_function(a, a); + } +} + +// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm. https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx +#define _mm_shuffle_epi32(a,i) _mm_shuffle_epi32_single(a) + +template +FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a) +{ + int16x8_t ret = (int16x8_t)a; + int16x4_t highBits = vget_high_s16(ret); + ret = vsetq_lane_s16(vget_lane_s16(highBits, i & 0x3), ret, 4); + ret = vsetq_lane_s16(vget_lane_s16(highBits, (i >> 2) & 0x3), ret, 5); + ret = vsetq_lane_s16(vget_lane_s16(highBits, (i >> 4) & 0x3), ret, 6); + ret = vsetq_lane_s16(vget_lane_s16(highBits, (i >> 6) & 0x3), ret, 7); + return (__m128i)ret; +} + +// Shuffles the upper 4 signed or unsigned 16 - bit integers in a as specified by imm. https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx +#define _mm_shufflehi_epi16(a,i) _mm_shufflehi_epi16_function(a) + +// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while shifting in zeros. : https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx +//#define _mm_slli_epi32(a, imm) (__m128i)vshlq_n_s32(a,imm) + +// Based on SIMDe +FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, const int imm8) +{ +#if defined(__aarch64__) + const int32x4_t s = vdupq_n_s32(imm8); + return vshlq_s32(a, s); +#else + int32_t __attribute__((aligned(16))) data[4]; + vst1q_s32(data, a); + const int s = (imm8 > 31) ? 0 : imm8; + data[0] = data[0] << s; + data[1] = data[1] << s; + data[2] = data[2] << s; + data[3] = data[3] << s; + + return vld1q_s32(data); +#endif +} + + +//Shifts the 4 signed or unsigned 32-bit integers in a right by count bits while shifting in zeros. https://msdn.microsoft.com/en-us/library/w486zcfa(v=vs.100).aspx +//#define _mm_srli_epi32( a, imm ) (__m128i)vshrq_n_u32((uint32x4_t)a, imm) + +// Based on SIMDe +FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, const int imm8) +{ +#if defined(__aarch64__) + const int shift = (imm8 > 31) ? 0 : imm8; // Unfortunately, we need to check for this case for embree. + const int32x4_t s = vdupq_n_s32(-shift); + return vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(a), s)); +#else + int32_t __attribute__((aligned(16))) data[4]; + vst1q_s32(data, a); + + const int s = (imm8 > 31) ? 0 : imm8; + + data[0] = data[0] >> s; + data[1] = data[1] >> s; + data[2] = data[2] >> s; + data[3] = data[3] >> s; + + return vld1q_s32(data); +#endif +} + + +// Shifts the 4 signed 32 - bit integers in a right by count bits while shifting in the sign bit. 
https://msdn.microsoft.com/en-us/library/z1939387(v=vs.100).aspx +//#define _mm_srai_epi32( a, imm ) vshrq_n_s32(a, imm) + +// Based on SIMDe +FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, const int imm8) +{ +#if defined(__aarch64__) + const int32x4_t s = vdupq_n_s32(-imm8); + return vshlq_s32(a, s); +#else + int32_t __attribute__((aligned(16))) data[4]; + vst1q_s32(data, a); + const uint32_t m = (uint32_t) ((~0U) << (32 - imm8)); + + for (int i = 0; i < 4; i++) { + uint32_t is_neg = ((uint32_t) (((data[i]) >> 31))); + data[i] = (data[i] >> imm8) | (m * is_neg); + } + + return vld1q_s32(data); +#endif +} + +// Shifts the 128 - bit value in a right by imm bytes while shifting in zeros.imm must be an immediate. https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx +//#define _mm_srli_si128( a, imm ) (__m128i)vmaxq_s8((int8x16_t)a, vextq_s8((int8x16_t)a, vdupq_n_s8(0), imm)) +#define _mm_srli_si128( a, imm ) (__m128i)vextq_s8((int8x16_t)a, vdupq_n_s8(0), (imm)) + +// Shifts the 128-bit value in a left by imm bytes while shifting in zeros. imm must be an immediate. https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx +#define _mm_slli_si128( a, imm ) (__m128i)vextq_s8(vdupq_n_s8(0), (int8x16_t)a, 16 - (imm)) + +// NEON does not provide a version of this function, here is an article about some ways to repro the results. +// http://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon +// Creates a 16-bit mask from the most significant bits of the 16 signed or unsigned 8-bit integers in a and zero extends the upper bits. https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx +FORCE_INLINE int _mm_movemask_epi8(__m128i _a) +{ + uint8x16_t input = (uint8x16_t)_a; + const int8_t __attribute__((aligned(16))) xr[8] = { -7, -6, -5, -4, -3, -2, -1, 0 }; + uint8x8_t mask_and = vdup_n_u8(0x80); + int8x8_t mask_shift = vld1_s8(xr); + + uint8x8_t lo = vget_low_u8(input); + uint8x8_t hi = vget_high_u8(input); + + lo = vand_u8(lo, mask_and); + lo = vshl_u8(lo, mask_shift); + + hi = vand_u8(hi, mask_and); + hi = vshl_u8(hi, mask_shift); + + lo = vpadd_u8(lo, lo); + lo = vpadd_u8(lo, lo); + lo = vpadd_u8(lo, lo); + + hi = vpadd_u8(hi, hi); + hi = vpadd_u8(hi, hi); + hi = vpadd_u8(hi, hi); + + return ((hi[0] << 8) | (lo[0] & 0xFF)); +} + + +// ****************************************** +// Math operations +// ****************************************** + +// Subtracts the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx +FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b) +{ + return vsubq_f32(a, b); +} + +FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b) +{ + return vsubq_f32(a, b); +} + +// Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or unsigned 32-bit integers of a. https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx +FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b) +{ + return vsubq_s32(a, b); +} + +// Adds the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx +FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b) +{ + return vaddq_f32(a, b); +} + +// adds the scalar single-precision floating point values of a and b. 
https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx +FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b) +{ + const float32_t b0 = vgetq_lane_f32(b, 0); + float32x4_t value = vdupq_n_f32(0); + + // The upper values in the result must be the remnants of a. + value = vsetq_lane_f32(b0, value, 0); + return vaddq_f32(a, value); +} + +// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or unsigned 32-bit integers in b. https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx +FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b) +{ + return vaddq_s32(a, b); +} + +// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or unsigned 16-bit integers in b. https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx +FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b) +{ + return (__m128i)vaddq_s16((int16x8_t)a, (int16x8_t)b); +} + +// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or unsigned 16-bit integers from b. https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx +FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b) +{ + return (__m128i)vmulq_s16((int16x8_t)a, (int16x8_t)b); +} + +// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or unsigned 32-bit integers from b. https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx +FORCE_INLINE __m128i _mm_mullo_epi32 (__m128i a, __m128i b) +{ + return (__m128i)vmulq_s32((int32x4_t)a,(int32x4_t)b); +} + +// Multiplies the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx +FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b) +{ + return vmulq_f32(a, b); +} + +FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b) +{ + return vmulq_f32(a, b); +} + +// Computes the approximations of reciprocals of the four single-precision, floating-point values of a. https://msdn.microsoft.com/en-us/library/vstudio/796k1tty(v=vs.100).aspx +FORCE_INLINE __m128 _mm_rcp_ps(__m128 in) +{ +#if defined(BUILD_IOS) + return vdivq_f32(vdupq_n_f32(1.0f),in); + +#endif + // Get an initial estimate of 1/in. + float32x4_t reciprocal = vrecpeq_f32(in); + + // We only return the estimated 1/in. + // Newton-Raphson iteration should be done outside of _mm_rcp_ps(). + + // TODO(LTE): We could delete these ifdef? + reciprocal = vmulq_f32(vrecpsq_f32(in, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(in, reciprocal), reciprocal); + return reciprocal; + +} + +FORCE_INLINE __m128 _mm_rcp_ss(__m128 in) +{ + float32x4_t value; + float32x4_t result = in; + + value = _mm_rcp_ps(in); + return vsetq_lane_f32(vgetq_lane_f32(value, 0), result, 0); +} + +// Divides the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx +FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b) +{ +#if defined(BUILD_IOS) + return vdivq_f32(a,b); +#else + float32x4_t reciprocal = _mm_rcp_ps(b); + + reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal); + + // Add one more round of Newton-Raphson since NEON's reciprocal estimation has less accuracy compared to SSE2's rcp.
+ reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal); + + // Another round for safety + reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal); + + + return vmulq_f32(a, reciprocal); +#endif +} + +// Divides the scalar single-precision floating point value of a by b. https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx +FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b) +{ + float32x4_t value; + float32x4_t result = a; + value = _mm_div_ps(a, b); + return vsetq_lane_f32(vgetq_lane_f32(value, 0), result, 0); +} + +// Computes the approximations of the reciprocal square roots of the four single-precision floating point values of in. https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx +FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in) +{ + + float32x4_t value = vrsqrteq_f32(in); + + // TODO: We must debug and ensure that rsqrt(0) and rsqrt(-0) yield proper values. + // Related code snippets can be found here: https://cpp.hotexamples.com/examples/-/-/vrsqrteq_f32/cpp-vrsqrteq_f32-function-examples.html + // If we adapt this function, we might be able to avoid special zero treatment in _mm_sqrt_ps. + + value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(in, value), value)); + value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(in, value), value)); + + // one more round to get better precision + value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(in, value), value)); + + // another round for safety + value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(in, value), value)); + + return value; +} + +FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in) +{ + float32x4_t result = in; + + __m128 value = _mm_rsqrt_ps(in); + + return vsetq_lane_f32(vgetq_lane_f32(value, 0), result, 0); +} + + +// Computes the approximations of square roots of the four single-precision, floating-point values of a. First computes reciprocal square roots and then reciprocals of the four values. https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx +FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in) +{ +#if defined(BUILD_IOS) + return vsqrtq_f32(in); +#else + __m128 reciprocal = _mm_rsqrt_ps(in); + + // We must treat sqrt(in == 0) in a special way. At this point reciprocal contains garbage due to vrsqrteq_f32(0) returning +inf. + // We assign 0 to reciprocal wherever required. + const float32x4_t vzero = vdupq_n_f32(0.0f); + const uint32x4_t mask = vceqq_f32(in, vzero); + reciprocal = vbslq_f32(mask, vzero, reciprocal); + + // sqrt(x) = x * (1 / sqrt(x)) + return vmulq_f32(in, reciprocal); +#endif +} + +// Computes the approximation of the square root of the scalar single-precision floating point value of in. https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx +FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in) +{ + float32x4_t value; + float32x4_t result = in; + + value = _mm_sqrt_ps(in); + return vsetq_lane_f32(vgetq_lane_f32(value, 0), result, 0); +} + + +// Computes the maximums of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx +FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b) +{ +#if USE_PRECISE_MINMAX_IMPLEMENTATION + return vbslq_f32(vcltq_f32(b,a),a,b); +#else + // Faster, but would give inconsistent rendering (e.g. holes, NaN pixels) + return vmaxq_f32(a, b); +#endif +} + +// Computes the minima of the four single-precision, floating-point values of a and b.
https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx +FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b) +{ +#if USE_PRECISE_MINMAX_IMPLEMENTATION + return vbslq_f32(vcltq_f32(a,b),a,b); +#else + // Faster, but would give inconsistent rendering (e.g. holes, NaN pixels) + return vminq_f32(a, b); +#endif +} + +// Computes the maximum of the two lower scalar single-precision floating point values of a and b. https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx +FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b) +{ + float32x4_t value; + float32x4_t result = a; + + value = _mm_max_ps(a, b); + return vsetq_lane_f32(vgetq_lane_f32(value, 0), result, 0); +} + +// Computes the minimum of the two lower scalar single-precision floating point values of a and b. https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx +FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b) +{ + float32x4_t value; + float32x4_t result = a; + + + value = _mm_min_ps(a, b); + return vsetq_lane_f32(vgetq_lane_f32(value, 0), result, 0); +} + +// Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8 signed 16-bit integers from b. https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx +FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b) +{ + return (__m128i)vminq_s16((int16x8_t)a, (int16x8_t)b); +} + +// epi versions of min/max +// Computes the pairwise maximums of the four signed 32-bit integer values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx +FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b ) +{ + return vmaxq_s32(a,b); +} + +// Computes the pairwise minima of the four signed 32-bit integer values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx +FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b ) +{ + return vminq_s32(a,b); +} + +// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit integers from b. https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx +FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b) +{ + int16x8_t ret = vqdmulhq_s16((int16x8_t)a, (int16x8_t)b); + ret = vshrq_n_s16(ret, 1); + return (__m128i)ret; +} + +// Computes pairwise add of each argument as single-precision, floating-point values a and b. +//https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx +FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b ) +{ +#if defined(__aarch64__) + return vpaddq_f32(a,b); +#else +// This does not work, no vpaddq... +// return (__m128) vpaddq_f32(a,b); + // + // get two f32x2_t values from a + // do vpadd + // put result in low half of f32x4 result + // + // get two f32x2_t values from b + // do vpadd + // put result in high half of f32x4 result + // + // combine + return vcombine_f32( vpadd_f32( vget_low_f32(a), vget_high_f32(a) ), vpadd_f32( vget_low_f32(b), vget_high_f32(b) ) ); +#endif +} + +// ****************************************** +// Compare operations +// ****************************************** + +// Compares for less than. https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b) +{ + return (__m128)vcltq_f32(a, b); +} + +FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b) +{ + return (__m128) vmvnq_s32((__m128i)_mm_cmplt_ps(a,b)); +} + +// Compares for greater than.
https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b) +{ + return (__m128)vcgtq_f32(a, b); +} + +FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b) +{ + return (__m128) _mm_cmpgt_ps(a,b); +} + + +// Compares for greater than or equal. https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b) +{ + return (__m128)vcgeq_f32(a, b); +} + +// Compares for less than or equal. https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b) +{ + return (__m128)vcleq_f32(a, b); +} + +// Compares for equality. https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b) +{ + return (__m128)vceqq_f32(a, b); +} + +// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers in b for less than. https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b) +{ + return (__m128i)vcltq_s32(a, b); +} + +FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b) +{ + return (__m128i) vceqq_s32(a,b); +} + +// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers in b for greater than. https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b) +{ + return (__m128i)vcgtq_s32(a, b); +} + +// Compares the four 32-bit floats in a and b to check if any values are NaN. Ordered compare between each value returns true for "orderable" and false for "not orderable" (NaN). https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx +// see also: +// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean +// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics +FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b ) +{ + // Note: NEON does not have ordered compare builtin + // Need to compare a eq a and b eq b to check for NaN + // Do AND of results to get final + return (__m128) vreinterpretq_f32_u32( vandq_u32( vceqq_f32(a,a), vceqq_f32(b,b) ) ); +} + +// Compares the lower single-precision floating point scalar values of a and b using a less than operation. : https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx +FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b) +{ + uint32x4_t value; + + value = vcltq_f32(a, b); + return vgetq_lane_u32(value, 0); +} + +// Compares the lower single-precision floating point scalar values of a and b using a greater than operation. : https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx +FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b) +{ + uint32x4_t value; + + value = vcgtq_f32(a, b); + return vgetq_lane_u32(value, 0); +} + +// Compares the lower single-precision floating point scalar values of a and b using a less than or equal operation. : https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx +FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b) +{ + uint32x4_t value; + + value = vcleq_f32(a, b); + return vgetq_lane_u32(value, 0); +} + +// Compares the lower single-precision floating point scalar values of a and b using a greater than or equal operation. 
: https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx +FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b) +{ + uint32x4_t value; + + value = vcgeq_f32(a, b); + return vgetq_lane_u32(value, 0); +} + +// Compares the lower single-precision floating point scalar values of a and b using an equality operation. : https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx +FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b) +{ + uint32x4_t value; + + value = vceqq_f32(a, b); + return vgetq_lane_u32(value, 0); +} + +// Compares the lower single-precision floating point scalar values of a and b using an inequality operation. : https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx +FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b) +{ + uint32x4_t value; + + value = vceqq_f32(a, b); + return !vgetq_lane_u32(value, 0); +} + +// according to the documentation, these intrinsics behave the same as the non-'u' versions. We'll just alias them here. +#define _mm_ucomilt_ss _mm_comilt_ss +#define _mm_ucomile_ss _mm_comile_ss +#define _mm_ucomigt_ss _mm_comigt_ss +#define _mm_ucomige_ss _mm_comige_ss +#define _mm_ucomieq_ss _mm_comieq_ss +#define _mm_ucomineq_ss _mm_comineq_ss + +// ****************************************** +// Conversions +// ****************************************** + +// Converts the four single-precision, floating-point values of a to signed 32-bit integer values using truncate. https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a) +{ + return vcvtq_s32_f32(a); +} + +// Converts the four signed 32-bit integer values of a to single-precision, floating-point values https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a) +{ + return vcvtq_f32_s32(a); +} + +// Converts the four single-precision, floating-point values of a to signed 32-bit integer values. https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx +// *NOTE*. The default rounding mode on SSE is 'round to even', which ArmV7 does not support! +// It is supported on ARMv8 however. +FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a) +{ +#if 1 + return vcvtnq_s32_f32(a); +#else + __m128 half = vdupq_n_f32(0.5f); + const __m128 sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(a), 31))); + const __m128 aPlusHalf = vaddq_f32(a, half); + const __m128 aRound = vsubq_f32(aPlusHalf, sign); + return vcvtq_s32_f32(aRound); +#endif +} + +// Moves the least significant 32 bits of a to a 32-bit integer. https://msdn.microsoft.com/en-us/library/5z7a9642%28v=vs.90%29.aspx +FORCE_INLINE int _mm_cvtsi128_si32(__m128i a) +{ + return vgetq_lane_s32(a, 0); +} + +// Moves 32-bit integer a to the least significant 32 bits of an __m128 object, zero extending the upper bits. https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_cvtsi32_si128(int a) +{ + __m128i result = vdupq_n_s32(0); + return vsetq_lane_s32(a, result, 0); +} + + +// Applies a type cast to reinterpret four 32-bit floating point values passed in as a 128-bit parameter as packed 32-bit integers. https://msdn.microsoft.com/en-us/library/bb514099.aspx +FORCE_INLINE __m128i _mm_castps_si128(__m128 a) +{ +#if defined(__aarch64__) + return (__m128i)a; +#else + return *(const __m128i *)&a; +#endif +} + +// Applies a type cast to reinterpret four 32-bit integers passed in as a 128-bit parameter as packed 32-bit floating point values. 
https://msdn.microsoft.com/en-us/library/bb514029.aspx +FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a) +{ +#if defined(__aarch64__) + return (__m128)a; +#else + return *(const __m128 *)&a; +#endif +} + +// Loads 128-bit value. : https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx +FORCE_INLINE __m128i _mm_load_si128(const __m128i *p) +{ + return vld1q_s32((int32_t *)p); +} + +FORCE_INLINE __m128d _mm_castps_pd(const __m128 a) +{ + return *(const __m128d *)&a; +} + +FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a) +{ + return *(const __m128d *)&a; +} +// ****************************************** +// Miscellaneous Operations +// ****************************************** + +// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and saturates. https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b) +{ + return (__m128i)vcombine_s8(vqmovn_s16((int16x8_t)a), vqmovn_s16((int16x8_t)b)); +} + +// Packs the 16 signed 16 - bit integers from a and b into 8 - bit unsigned integers and saturates. https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx +FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b) +{ + return (__m128i)vcombine_u8(vqmovun_s16((int16x8_t)a), vqmovun_s16((int16x8_t)b)); +} + +// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers and saturates. https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b) +{ + return (__m128i)vcombine_s16(vqmovn_s32(a), vqmovn_s32(b)); +} + +// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower 8 signed or unsigned 8-bit integers in b. https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b) +{ + int8x8_t a1 = (int8x8_t)vget_low_s16((int16x8_t)a); + int8x8_t b1 = (int8x8_t)vget_low_s16((int16x8_t)b); + + int8x8x2_t result = vzip_s8(a1, b1); + + return (__m128i)vcombine_s8(result.val[0], result.val[1]); +} + +// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the lower 4 signed or unsigned 16-bit integers in b. https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b) +{ + int16x4_t a1 = vget_low_s16((int16x8_t)a); + int16x4_t b1 = vget_low_s16((int16x8_t)b); + + int16x4x2_t result = vzip_s16(a1, b1); + + return (__m128i)vcombine_s16(result.val[0], result.val[1]); +} + +// Interleaves the lower 2 signed or unsigned 32 - bit integers in a with the lower 2 signed or unsigned 32 - bit integers in b. https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) +{ + int32x2_t a1 = vget_low_s32(a); + int32x2_t b1 = vget_low_s32(b); + + int32x2x2_t result = vzip_s32(a1, b1); + + return vcombine_s32(result.val[0], result.val[1]); +} + +// Selects and interleaves the lower two single-precision, floating-point values from a and b. https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx +FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b) +{ + float32x2x2_t result = vzip_f32(vget_low_f32(a), vget_low_f32(b)); + return vcombine_f32(result.val[0], result.val[1]); +} + +// Selects and interleaves the upper two single-precision, floating-point values from a and b. 
https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx +FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b) +{ + float32x2x2_t result = vzip_f32(vget_high_f32(a), vget_high_f32(b)); + return vcombine_f32(result.val[0], result.val[1]); +} + +// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper 8 signed or unsigned 8-bit integers in b. https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b) +{ + int8x8_t a1 = (int8x8_t)vget_high_s16((int16x8_t)a); + int8x8_t b1 = (int8x8_t)vget_high_s16((int16x8_t)b); + + int8x8x2_t result = vzip_s8(a1, b1); + + return (__m128i)vcombine_s8(result.val[0], result.val[1]); +} + +// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the upper 4 signed or unsigned 16-bit integers in b. https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b) +{ + int16x4_t a1 = vget_high_s16((int16x8_t)a); + int16x4_t b1 = vget_high_s16((int16x8_t)b); + + int16x4x2_t result = vzip_s16(a1, b1); + + return (__m128i)vcombine_s16(result.val[0], result.val[1]); +} + +// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the upper 2 signed or unsigned 32-bit integers in b. https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b) +{ + int32x2_t a1 = vget_high_s32(a); + int32x2_t b1 = vget_high_s32(b); + + int32x2x2_t result = vzip_s32(a1, b1); + + return vcombine_s32(result.val[0], result.val[1]); +} + +// Extracts the selected signed or unsigned 16-bit integer from a and zero extends. https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx +#define _mm_extract_epi16( a, imm ) vgetq_lane_s16((int16x8_t)a, imm) + +// ****************************************** +// Streaming Extensions +// ****************************************** + +// Guarantees that every preceding store is globally visible before any subsequent store. https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx +FORCE_INLINE void _mm_sfence(void) +{ + __sync_synchronize(); +} + +// Stores the data in a to the address p without polluting the caches. If the cache line containing address p is already in the cache, the cache will be updated.Address p must be 16 - byte aligned. https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx +FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a) +{ + *p = a; +} + +// Cache line containing p is flushed and invalidated from all caches in the coherency domain. : https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx +FORCE_INLINE void _mm_clflush(void const*p) +{ + // no corollary for Neon? +} + +FORCE_INLINE __m128i _mm_set_epi64x(int64_t a, int64_t b) +{ + // Stick to the flipped behavior of x86. 
+ int64_t __attribute__((aligned(16))) data[2] = { b, a }; + return (__m128i)vld1q_s64(data); +} + +FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i) +{ + return (__m128i)vmovq_n_s64(_i); +} + +#if defined(__aarch64__) +FORCE_INLINE __m128 _mm_blendv_ps(__m128 a, __m128 b, __m128 c) +{ + int32x4_t mask = vshrq_n_s32(__m128i(c),31); + return vbslq_f32( uint32x4_t(mask), b, a); +} + +FORCE_INLINE __m128i _mm_load4epu8_epi32(__m128i *ptr) +{ + uint8x8_t t0 = vld1_u8((uint8_t*)ptr); + uint16x8_t t1 = vmovl_u8(t0); + uint32x4_t t2 = vmovl_u16(vget_low_u16(t1)); + return vreinterpretq_s32_u32(t2); +} + +FORCE_INLINE __m128i _mm_load4epu16_epi32(__m128i *ptr) +{ + uint16x8_t t0 = vld1q_u16((uint16_t*)ptr); + uint32x4_t t1 = vmovl_u16(vget_low_u16(t0)); + return vreinterpretq_s32_u32(t1); +} + +FORCE_INLINE __m128i _mm_load4epi8_f32(__m128i *ptr) +{ + int8x8_t t0 = vld1_s8((int8_t*)ptr); + int16x8_t t1 = vmovl_s8(t0); + int32x4_t t2 = vmovl_s16(vget_low_s16(t1)); + float32x4_t t3 = vcvtq_f32_s32(t2); + return vreinterpretq_s32_f32(t3); +} + +FORCE_INLINE __m128i _mm_load4epu8_f32(__m128i *ptr) +{ + uint8x8_t t0 = vld1_u8((uint8_t*)ptr); + uint16x8_t t1 = vmovl_u8(t0); + uint32x4_t t2 = vmovl_u16(vget_low_u16(t1)); + return vreinterpretq_s32_u32(t2); +} + +FORCE_INLINE __m128i _mm_load4epi16_f32(__m128i *ptr) +{ + int16x8_t t0 = vld1q_s16((int16_t*)ptr); + int32x4_t t1 = vmovl_s16(vget_low_s16(t0)); + float32x4_t t2 = vcvtq_f32_s32(t1); + return vreinterpretq_s32_f32(t2); +} + +FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b) +{ + return (__m128i)vcombine_u8(vqmovun_s16((int16x8_t)a), vqmovun_s16((int16x8_t)b)); +} + +FORCE_INLINE __m128i _mm_stream_load_si128(__m128i* ptr) +{ + // No non-temporal load on a single register on ARM. + return vreinterpretq_s32_u8(vld1q_u8((uint8_t*)ptr)); +} + +FORCE_INLINE void _mm_stream_ps(float* ptr, __m128i a) +{ + // No non-temporal store on a single register on ARM. + vst1q_f32((float*)ptr, vreinterpretq_f32_s32(a)); +} + +FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b) +{ + return vreinterpretq_s32_u32(vminq_u32(vreinterpretq_u32_s32(a), vreinterpretq_u32_s32(b))); +} + +FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b) +{ + return vreinterpretq_s32_u32(vmaxq_u32(vreinterpretq_u32_s32(a), vreinterpretq_u32_s32(b))); +} + +FORCE_INLINE __m128 _mm_abs_ps(__m128 a) +{ + return vabsq_f32(a); +} + +FORCE_INLINE __m128 _mm_madd_ps(__m128 a, __m128 b, __m128 c) +{ + return vmlaq_f32(c, a, b); +} + +FORCE_INLINE __m128 _mm_msub_ps(__m128 a, __m128 b, __m128 c) +{ + return vmlsq_f32(c, a, b); +} + +FORCE_INLINE __m128i _mm_abs_epi32(__m128i a) +{ + return vabsq_s32(a); +} +#endif //defined(__aarch64__) + +// Count the number of bits set to 1 in unsigned 32-bit integer a, and +// return that count in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32 +FORCE_INLINE int _mm_popcnt_u32(unsigned int a) +{ + return (int)vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t)a))); +} + +// Count the number of bits set to 1 in unsigned 64-bit integer a, and +// return that count in dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64 +FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a) +{ + return (int64_t)vaddlv_u8(vcnt_u8(vcreate_u8(a))); +} + +#endif diff --git a/thirdparty/embree/common/math/bbox.h b/thirdparty/embree/common/math/bbox.h index 24d5b87223a2..29bb13912b13 100644 --- a/thirdparty/embree/common/math/bbox.h +++ b/thirdparty/embree/common/math/bbox.h @@ -77,7 +77,7 @@ namespace embree return lower > upper; } -#if defined(__SSE__) +#if defined(__SSE__) || defined(__ARM_NEON) template<> __forceinline bool BBox::empty() const { return !all(le_mask(lower,upper)); } @@ -228,11 +228,11 @@ namespace embree /// SSE / AVX / MIC specializations //////////////////////////////////////////////////////////////////////////////// -#if defined __SSE__ +#if defined (__SSE__) || defined(__ARM_NEON) #include "../simd/sse.h" #endif -#if defined __AVX__ +#if defined (__AVX__) #include "../simd/avx.h" #endif diff --git a/thirdparty/embree/common/math/col3.h b/thirdparty/embree/common/math/col3.h index 2a477ec1319b..f52015fb88eb 100644 --- a/thirdparty/embree/common/math/col3.h +++ b/thirdparty/embree/common/math/col3.h @@ -42,6 +42,6 @@ namespace embree } /*! default template instantiations */ - typedef Col3 Col3uc; + typedef Col3 Col3uc; typedef Col3 Col3f; } diff --git a/thirdparty/embree/common/math/col4.h b/thirdparty/embree/common/math/col4.h index 27849840ec5b..90df293f8e41 100644 --- a/thirdparty/embree/common/math/col4.h +++ b/thirdparty/embree/common/math/col4.h @@ -42,6 +42,6 @@ namespace embree } /*! default template instantiations */ - typedef Col4 Col4uc; + typedef Col4 Col4uc; typedef Col4 Col4f; } diff --git a/thirdparty/embree/common/math/color.h b/thirdparty/embree/common/math/color.h index eae7b72ecfe0..c3083e4fc0f1 100644 --- a/thirdparty/embree/common/math/color.h +++ b/thirdparty/embree/common/math/color.h @@ -52,17 +52,17 @@ namespace embree __forceinline void set(Col3uc& d) const { vfloat4 s = clamp(vfloat4(m128))*255.0f; - d.r = (unsigned char)(s[0]); - d.g = (unsigned char)(s[1]); - d.b = (unsigned char)(s[2]); + d.r = (uint8_t)(s[0]); + d.g = (uint8_t)(s[1]); + d.b = (uint8_t)(s[2]); } __forceinline void set(Col4uc& d) const { vfloat4 s = clamp(vfloat4(m128))*255.0f; - d.r = (unsigned char)(s[0]); - d.g = (unsigned char)(s[1]); - d.b = (unsigned char)(s[2]); - d.a = (unsigned char)(s[3]); + d.r = (uint8_t)(s[0]); + d.g = (uint8_t)(s[1]); + d.b = (uint8_t)(s[2]); + d.a = (uint8_t)(s[3]); } //////////////////////////////////////////////////////////////////////////////// @@ -114,16 +114,16 @@ namespace embree __forceinline void set(Col3uc& d) const { vfloat4 s = clamp(vfloat4(m128))*255.0f; - d.r = (unsigned char)(s[0]); - d.g = (unsigned char)(s[1]); - d.b = (unsigned char)(s[2]); + d.r = (uint8_t)(s[0]); + d.g = (uint8_t)(s[1]); + d.b = (uint8_t)(s[2]); } __forceinline void set(Col4uc& d) const { vfloat4 s = clamp(vfloat4(m128))*255.0f; - d.r = (unsigned char)(s[0]); - d.g = (unsigned char)(s[1]); - d.b = (unsigned char)(s[2]); + d.r = (uint8_t)(s[0]); + d.g = (uint8_t)(s[1]); + d.b = (uint8_t)(s[2]); d.a = 255; } @@ -152,21 +152,37 @@ namespace embree } __forceinline const Color rcp ( const Color& a ) { +#if defined(__aarch64__) && defined(BUILD_IOS) + __m128 reciprocal = _mm_rcp_ps(a.m128); + reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); + return (const Color)reciprocal; +#else #if defined(__AVX512VL__) const Color r = 
_mm_rcp14_ps(a.m128); #else const Color r = _mm_rcp_ps(a.m128); #endif return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); +#endif //defined(__aarch64__) && defined(BUILD_IOS) } __forceinline const Color rsqrt( const Color& a ) { +#if defined(__aarch64__) && defined(BUILD_IOS) + __m128 r = _mm_rsqrt_ps(a.m128); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); + return r; +#else + #if defined(__AVX512VL__) __m128 r = _mm_rsqrt14_ps(a.m128); #else __m128 r = _mm_rsqrt_ps(a.m128); #endif return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); + +#endif //defined(__aarch64__) && defined(BUILD_IOS) } __forceinline const Color sqrt ( const Color& a ) { return _mm_sqrt_ps(a.m128); } diff --git a/thirdparty/embree/common/math/constants.cpp b/thirdparty/embree/common/math/constants.cpp index 26968297d951..eeff131664b5 100644 --- a/thirdparty/embree/common/math/constants.cpp +++ b/thirdparty/embree/common/math/constants.cpp @@ -1,6 +1,10 @@ // Copyright 2009-2020 Intel Corporation // SPDX-License-Identifier: Apache-2.0 +#if defined(__aarch64__) +#include +#endif + #include "constants.h" namespace embree @@ -24,4 +28,34 @@ namespace embree ReverseStepTy reverse_step; EmptyTy empty; UndefinedTy undefined; + +#if defined(__aarch64__) +const uint32x4_t movemask_mask = { 1, 2, 4, 8 }; +const uint32x4_t vzero = { 0, 0, 0, 0 }; +const uint32x4_t v0x80000000 = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; +const uint32x4_t v0x7fffffff = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }; +const uint32x4_t v000F = { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF }; +const uint32x4_t v00F0 = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000 }; +const uint32x4_t v00FF = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }; +const uint32x4_t v0F00 = { 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 }; +const uint32x4_t v0F0F = { 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF }; +const uint32x4_t v0FF0 = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }; +const uint32x4_t v0FFF = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }; +const uint32x4_t vF000 = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 }; +const uint32x4_t vF00F = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF }; +const uint32x4_t vF0F0 = { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 }; +const uint32x4_t vF0FF = { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }; +const uint32x4_t vFF00 = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 }; +const uint32x4_t vFF0F = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF }; +const uint32x4_t vFFF0 = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }; +const uint32x4_t vFFFF = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }; +const uint8x16_t v0022 = {0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11}; +const uint8x16_t v1133 = {4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15}; +const uint8x16_t v0101 = {0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7}; +const float32x4_t vOne = { 1.0f, 1.0f, 1.0f, 1.0f }; +const float32x4_t vmOne = { -1.0f, -1.0f, -1.0f, -1.0f }; +const float32x4_t vInf = { INFINITY, INFINITY, INFINITY, INFINITY }; +const float32x4_t vmInf = { -INFINITY, -INFINITY, -INFINITY, -INFINITY }; +#endif + } diff --git a/thirdparty/embree/common/math/constants.h b/thirdparty/embree/common/math/constants.h index 77c2b7aec2bf..e80abec80f53 100644 --- a/thirdparty/embree/common/math/constants.h +++ b/thirdparty/embree/common/math/constants.h @@ -12,6 
+12,19 @@ #include #include +// Math constants may not be defined in libcxx + mingw + strict C++ standard +#if defined(__MINGW32__) + +// TODO(LTE): use constexpr +#ifndef M_PI +#define M_PI 3.14159265358979323846 +#endif +#ifndef M_1_PI +#define M_1_PI 0.31830988618379067154 +#endif + +#endif // __MINGW32__ + namespace embree { static MAYBE_UNUSED const float one_over_255 = 1.0f/255.0f; @@ -44,8 +57,8 @@ namespace embree __forceinline operator unsigned int ( ) const { return 0; } __forceinline operator short ( ) const { return 0; } __forceinline operator unsigned short ( ) const { return 0; } - __forceinline operator char ( ) const { return 0; } - __forceinline operator unsigned char ( ) const { return 0; } + __forceinline operator int8_t ( ) const { return 0; } + __forceinline operator uint8_t ( ) const { return 0; } }; extern MAYBE_UNUSED ZeroTy zero; @@ -62,8 +75,8 @@ namespace embree __forceinline operator unsigned int ( ) const { return 1; } __forceinline operator short ( ) const { return 1; } __forceinline operator unsigned short ( ) const { return 1; } - __forceinline operator char ( ) const { return 1; } - __forceinline operator unsigned char ( ) const { return 1; } + __forceinline operator int8_t ( ) const { return 1; } + __forceinline operator uint8_t ( ) const { return 1; } }; extern MAYBE_UNUSED OneTy one; @@ -80,8 +93,8 @@ namespace embree __forceinline operator unsigned int ( ) const { return std::numeric_limits::min(); } __forceinline operator short ( ) const { return std::numeric_limits::min(); } __forceinline operator unsigned short ( ) const { return std::numeric_limits::min(); } - __forceinline operator char ( ) const { return std::numeric_limits::min(); } - __forceinline operator unsigned char ( ) const { return std::numeric_limits::min(); } + __forceinline operator int8_t ( ) const { return std::numeric_limits::min(); } + __forceinline operator uint8_t ( ) const { return std::numeric_limits::min(); } }; @@ -99,8 +112,8 @@ namespace embree __forceinline operator unsigned int ( ) const { return std::numeric_limits::max(); } __forceinline operator short ( ) const { return std::numeric_limits::max(); } __forceinline operator unsigned short ( ) const { return std::numeric_limits::max(); } - __forceinline operator char ( ) const { return std::numeric_limits::max(); } - __forceinline operator unsigned char ( ) const { return std::numeric_limits::max(); } + __forceinline operator int8_t ( ) const { return std::numeric_limits::max(); } + __forceinline operator uint8_t ( ) const { return std::numeric_limits::max(); } }; extern MAYBE_UNUSED PosInfTy inf; @@ -194,4 +207,33 @@ namespace embree }; extern MAYBE_UNUSED UndefinedTy undefined; + +#if defined(__aarch64__) + extern const uint32x4_t movemask_mask; + extern const uint32x4_t vzero; + extern const uint32x4_t v0x80000000; + extern const uint32x4_t v0x7fffffff; + extern const uint32x4_t v000F; + extern const uint32x4_t v00F0; + extern const uint32x4_t v00FF; + extern const uint32x4_t v0F00; + extern const uint32x4_t v0F0F; + extern const uint32x4_t v0FF0; + extern const uint32x4_t v0FFF; + extern const uint32x4_t vF000; + extern const uint32x4_t vF00F; + extern const uint32x4_t vF0F0; + extern const uint32x4_t vF0FF; + extern const uint32x4_t vFF00; + extern const uint32x4_t vFF0F; + extern const uint32x4_t vFFF0; + extern const uint32x4_t vFFFF; + extern const uint8x16_t v0022; + extern const uint8x16_t v1133; + extern const uint8x16_t v0101; + extern const float32x4_t vOne; + extern const float32x4_t vmOne; + extern const 
float32x4_t vInf; + extern const float32x4_t vmInf; +#endif } diff --git a/thirdparty/embree/common/math/math.h b/thirdparty/embree/common/math/math.h index 91e7a56608dd..6d54abd44dec 100644 --- a/thirdparty/embree/common/math/math.h +++ b/thirdparty/embree/common/math/math.h @@ -8,12 +8,19 @@ #include "constants.h" #include +#if defined(__ARM_NEON) +#include "SSE2NEON.h" +#if defined(NEON_AVX2_EMULATION) +#include "AVX2NEON.h" +#endif +#else #include #include #include +#endif -#if defined(__WIN32__) -#if defined(_MSC_VER) && (_MSC_VER <= 1700) +#if defined(__WIN32__) && !defined(__MINGW32__) +#if (__MSV_VER <= 1700) namespace std { __forceinline bool isinf ( const float x ) { return _finite(x) == 0; } @@ -40,7 +47,7 @@ namespace embree __forceinline int toInt (const float& a) { return int(a); } __forceinline float toFloat(const int& a) { return float(a); } -#if defined(__WIN32__) +#if defined(__WIN32__) && !defined(__MINGW32__) __forceinline bool finite ( const float x ) { return _finite(x) != 0; } #endif @@ -49,6 +56,16 @@ namespace embree __forceinline float rcp ( const float x ) { +#if defined(__aarch64__) + // Move scalar to vector register and do rcp. + __m128 a; + a[0] = x; + float32x4_t reciprocal = vrecpeq_f32(a); + reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal); + return reciprocal[0]; +#else + const __m128 a = _mm_set_ss(x); #if defined(__AVX512VL__) @@ -62,19 +79,61 @@ namespace embree #else return _mm_cvtss_f32(_mm_mul_ss(r,_mm_sub_ss(_mm_set_ss(2.0f), _mm_mul_ss(r, a)))); #endif + +#endif //defined(__aarch64__) } __forceinline float signmsk ( const float x ) { +#if defined(__aarch64__) + // FP and Neon shares same vector register in arm64 + __m128 a; + __m128i b; + a[0] = x; + b[0] = 0x80000000; + a = _mm_and_ps(a, vreinterpretq_f32_s32(b)); + return a[0]; +#else return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(0x80000000)))); +#endif } __forceinline float xorf( const float x, const float y ) { +#if defined(__aarch64__) + // FP and Neon shares same vector register in arm64 + __m128 a; + __m128 b; + a[0] = x; + b[0] = y; + a = _mm_xor_ps(a, b); + return a[0]; +#else return _mm_cvtss_f32(_mm_xor_ps(_mm_set_ss(x),_mm_set_ss(y))); +#endif } __forceinline float andf( const float x, const unsigned y ) { +#if defined(__aarch64__) + // FP and Neon shares same vector register in arm64 + __m128 a; + __m128i b; + a[0] = x; + b[0] = y; + a = _mm_and_ps(a, vreinterpretq_f32_s32(b)); + return a[0]; +#else return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(y)))); +#endif } __forceinline float rsqrt( const float x ) { +#if defined(__aarch64__) + // FP and Neon shares same vector register in arm64 + __m128 a; + a[0] = x; + __m128 value = _mm_rsqrt_ps(a); + value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value)); + value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value)); + return value[0]; +#else + const __m128 a = _mm_set_ss(x); #if defined(__AVX512VL__) const __m128 r = _mm_rsqrt14_ss(_mm_set_ss(0.0f),a); @@ -84,9 +143,10 @@ namespace embree const __m128 c = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r), _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r))); return _mm_cvtss_f32(c); +#endif } -#if defined(__WIN32__) && defined(_MSC_VER) && (_MSC_VER <= 1700) +#if defined(__WIN32__) && (__MSC_VER <= 1700) && !defined(__MINGW32__) __forceinline float nextafter(float x, float y) { if ((x0)) return 
x*(1.1f+float(ulp)); else return x*(0.9f-float(ulp)); } __forceinline double nextafter(double x, double y) { return _nextafter(x, y); } __forceinline int roundf(float f) { return (int)(f + 0.5f); } @@ -140,7 +200,17 @@ namespace embree __forceinline double floor( const double x ) { return ::floor (x); } __forceinline double ceil ( const double x ) { return ::ceil (x); } -#if defined(__SSE4_1__) +#if defined(__aarch64__) + __forceinline float mini(float a, float b) { + // FP and Neon shares same vector register in arm64 + __m128 x; + __m128 y; + x[0] = a; + y[0] = b; + x = _mm_min_ps(x, y); + return x[0]; + } +#elif defined(__SSE4_1__) __forceinline float mini(float a, float b) { const __m128i ai = _mm_castps_si128(_mm_set_ss(a)); const __m128i bi = _mm_castps_si128(_mm_set_ss(b)); @@ -149,7 +219,17 @@ namespace embree } #endif -#if defined(__SSE4_1__) +#if defined(__aarch64__) + __forceinline float maxi(float a, float b) { + // FP and Neon shares same vector register in arm64 + __m128 x; + __m128 y; + x[0] = a; + y[0] = b; + x = _mm_max_ps(x, y); + return x[0]; + } +#elif defined(__SSE4_1__) __forceinline float maxi(float a, float b) { const __m128i ai = _mm_castps_si128(_mm_set_ss(a)); const __m128i bi = _mm_castps_si128(_mm_set_ss(b)); @@ -166,7 +246,7 @@ namespace embree __forceinline int64_t min(int64_t a, int64_t b) { return a __forceinline void xchg ( T& a, T& b ) { const T tmp = a; a = b; b = tmp; } + + template __forceinline T prod_diff(const T& a,const T& b,const T& c,const T& d) { +#if 1//!defined(__aarch64__) + return msub(a,b,c*d); +#else + return nmadd(c,d,a*b); +#endif + } + /*! bit reverse operation */ template __forceinline T bitReverse(const T& vin) @@ -290,7 +389,7 @@ namespace embree template __forceinline T bitInterleave(const T& xin, const T& yin, const T& zin) { - T x = xin, y = yin, z = zin; + T x = xin, y = yin, z = zin; x = (x | (x << 16)) & 0x030000FF; x = (x | (x << 8)) & 0x0300F00F; x = (x | (x << 4)) & 0x030C30C3; @@ -309,7 +408,7 @@ namespace embree return x | (y << 1) | (z << 2); } -#if defined(__AVX2__) +#if defined(__AVX2__) && !defined(__aarch64__) template<> __forceinline unsigned int bitInterleave(const unsigned int &xi, const unsigned int& yi, const unsigned int& zi) diff --git a/thirdparty/embree/common/math/vec2.h b/thirdparty/embree/common/math/vec2.h index 0ecf8c6384d5..a619459e9c17 100644 --- a/thirdparty/embree/common/math/vec2.h +++ b/thirdparty/embree/common/math/vec2.h @@ -205,11 +205,11 @@ namespace embree #include "vec2fa.h" -#if defined __SSE__ +#if defined(__SSE__) || defined(__ARM_NEON) #include "../simd/sse.h" #endif -#if defined __AVX__ +#if defined(__AVX__) #include "../simd/avx.h" #endif @@ -221,7 +221,7 @@ namespace embree { template<> __forceinline Vec2::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {} -#if defined(__SSE__) +#if defined(__SSE__) || defined(__ARM_NEON) template<> __forceinline Vec2::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {} #endif diff --git a/thirdparty/embree/common/math/vec2fa.h b/thirdparty/embree/common/math/vec2fa.h index 6b1b6f33f237..451ecd556cfa 100644 --- a/thirdparty/embree/common/math/vec2fa.h +++ b/thirdparty/embree/common/math/vec2fa.h @@ -97,6 +97,12 @@ namespace embree __forceinline Vec2fa rcp ( const Vec2fa& a ) { +#if defined(__aarch64__) + __m128 reciprocal = _mm_rcp_ps(a.m128); + reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); + return (const Vec2fa)reciprocal; +#else #if defined(__AVX512VL__) const Vec2fa r = 
_mm_rcp14_ps(a.m128); #else @@ -111,6 +117,7 @@ namespace embree #endif return res; +#endif //defined(__aarch64__) } __forceinline Vec2fa sqrt ( const Vec2fa& a ) { return _mm_sqrt_ps(a.m128); } @@ -118,12 +125,21 @@ namespace embree __forceinline Vec2fa rsqrt( const Vec2fa& a ) { +#if defined(__aarch64__) + __m128 r = _mm_rsqrt_ps(a.m128); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); + return r; +#else + #if defined(__AVX512VL__) __m128 r = _mm_rsqrt14_ps(a.m128); #else __m128 r = _mm_rsqrt_ps(a.m128); #endif return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); + +#endif } __forceinline Vec2fa zero_fix(const Vec2fa& a) { @@ -156,7 +172,7 @@ namespace embree __forceinline Vec2fa min( const Vec2fa& a, const Vec2fa& b ) { return _mm_min_ps(a.m128,b.m128); } __forceinline Vec2fa max( const Vec2fa& a, const Vec2fa& b ) { return _mm_max_ps(a.m128,b.m128); } -#if defined(__SSE4_1__) +#if defined(__aarch64__) || defined(__SSE4_1__) __forceinline Vec2fa mini(const Vec2fa& a, const Vec2fa& b) { const vint4 ai = _mm_castps_si128(a); const vint4 bi = _mm_castps_si128(b); @@ -165,7 +181,7 @@ namespace embree } #endif -#if defined(__SSE4_1__) +#if defined(__aarch64__) || defined(__SSE4_1__) __forceinline Vec2fa maxi(const Vec2fa& a, const Vec2fa& b) { const vint4 ai = _mm_castps_si128(a); const vint4 bi = _mm_castps_si128(b); @@ -275,7 +291,11 @@ namespace embree /// Rounding Functions //////////////////////////////////////////////////////////////////////////////// -#if defined (__SSE4_1__) +#if defined(__aarch64__) +__forceinline Vec2fa floor(const Vec2fa& a) { return vrndmq_f32(a); } +__forceinline Vec2fa ceil (const Vec2fa& a) { return vrndpq_f32(a); } +//__forceinline Vec2fa trunc(const Vec2fa& a) { return vrndq_f32(a); } +#elif defined (__SSE4_1__) //__forceinline Vec2fa trunc( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); } __forceinline Vec2fa floor( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); } __forceinline Vec2fa ceil ( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); } diff --git a/thirdparty/embree/common/math/vec3.h b/thirdparty/embree/common/math/vec3.h index ab4753545b80..187032171517 100644 --- a/thirdparty/embree/common/math/vec3.h +++ b/thirdparty/embree/common/math/vec3.h @@ -206,8 +206,7 @@ namespace embree template __forceinline T rcp_length( const Vec3& a ) { return rsqrt(sqr(a)); } template __forceinline Vec3 normalize( const Vec3& a ) { return a*rsqrt(sqr(a)); } template __forceinline T distance ( const Vec3& a, const Vec3& b ) { return length(a-b); } - template __forceinline Vec3 cross ( const Vec3& a, const Vec3& b ) { return Vec3(msub(a.y,b.z,a.z*b.y), msub(a.z,b.x,a.x*b.z), msub(a.x,b.y,a.y*b.x)); } - + template __forceinline Vec3 cross ( const Vec3& a, const Vec3& b ) { return Vec3(prod_diff(a.y,b.z,a.z,b.y), prod_diff(a.z,b.x,a.x,b.z), prod_diff(a.x,b.y,a.y,b.x)); } template __forceinline Vec3 stable_triangle_normal( const Vec3& a, const Vec3& b, const Vec3& c ) { const T ab_x = a.z*b.y, ab_y = a.x*b.z, ab_z = a.y*b.x; @@ -266,11 +265,11 @@ namespace embree /// SSE / AVX / MIC specializations //////////////////////////////////////////////////////////////////////////////// -#if defined __SSE__ +#if defined(__SSE__) || defined(__ARM_NEON) #include "../simd/sse.h" #endif -#if defined __AVX__ +#if defined(__AVX__) #include "../simd/avx.h" #endif @@ -291,14 
+290,14 @@ namespace embree template<> __forceinline Vec3::Vec3(const Vec3fa& a) { x = a.x; y = a.y; z = a.z; } -#elif defined(__SSE__) +#elif defined(__SSE__) || defined(__ARM_NEON) template<> __forceinline Vec3::Vec3(const Vec3fa& a) { const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v); } #endif -#if defined(__SSE__) +#if defined(__SSE__) || defined(__ARM_NEON) __forceinline Vec3 broadcast4f(const Vec3& a, const size_t k) { return Vec3(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k])); } diff --git a/thirdparty/embree/common/math/vec3fa.h b/thirdparty/embree/common/math/vec3fa.h index 6576a15b4f24..6163cfb59662 100644 --- a/thirdparty/embree/common/math/vec3fa.h +++ b/thirdparty/embree/common/math/vec3fa.h @@ -55,7 +55,13 @@ namespace embree //////////////////////////////////////////////////////////////////////////////// static __forceinline Vec3fa load( const void* const a ) { +#if defined(__aarch64__) + __m128 t = _mm_load_ps((float*)a); + t[3] = 0.0f; + return Vec3fa(t); +#else return Vec3fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)))); +#endif } static __forceinline Vec3fa loadu( const void* const a ) { @@ -89,19 +95,42 @@ namespace embree __forceinline Vec3fa operator +( const Vec3fa& a ) { return a; } __forceinline Vec3fa operator -( const Vec3fa& a ) { +#if defined(__aarch64__) + return vnegq_f32(a.m128); +#else const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); + return _mm_xor_ps(a.m128, mask); +#endif } __forceinline Vec3fa abs ( const Vec3fa& a ) { +#if defined(__aarch64__) + return _mm_abs_ps(a.m128); +#else const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); return _mm_and_ps(a.m128, mask); +#endif } __forceinline Vec3fa sign ( const Vec3fa& a ) { +#if defined(__aarch64__) + Vec3fa r = blendv_ps(vOne, vmOne, _mm_cmplt_ps (a.m128,vdupq_n_f32(0.0f))); + return r; +#else return blendv_ps(Vec3fa(one).m128, (-Vec3fa(one)).m128, _mm_cmplt_ps (a.m128,Vec3fa(zero).m128)); +#endif } __forceinline Vec3fa rcp ( const Vec3fa& a ) { +#if defined(__aarch64__) && defined(BUILD_IOS) + return vdivq_f32(vdupq_n_f32(1.0f),a.m128); +#elif defined(__aarch64__) + __m128 reciprocal = _mm_rcp_ps(a.m128); + reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); + return (const Vec3fa)reciprocal; +#else + #if defined(__AVX512VL__) const Vec3fa r = _mm_rcp14_ps(a.m128); #else @@ -116,6 +145,7 @@ namespace embree #endif return res; +#endif //defined(__aarch64__) } __forceinline Vec3fa sqrt ( const Vec3fa& a ) { return _mm_sqrt_ps(a.m128); } @@ -123,12 +153,20 @@ namespace embree __forceinline Vec3fa rsqrt( const Vec3fa& a ) { +#if defined(__aarch64__) + __m128 r = _mm_rsqrt_ps(a.m128); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); + return r; +#else + #if defined(__AVX512VL__) __m128 r = _mm_rsqrt14_ps(a.m128); #else __m128 r = _mm_rsqrt_ps(a.m128); #endif return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); +#endif } __forceinline Vec3fa zero_fix(const Vec3fa& a) { @@ -161,7 +199,7 @@ namespace embree __forceinline Vec3fa min( const Vec3fa& a, const Vec3fa& b ) { return _mm_min_ps(a.m128,b.m128); } __forceinline Vec3fa max( const Vec3fa& a, const Vec3fa& b ) { return _mm_max_ps(a.m128,b.m128); } -#if 
defined(__SSE4_1__) +#if defined(__aarch64__) || defined(__SSE4_1__) __forceinline Vec3fa mini(const Vec3fa& a, const Vec3fa& b) { const vint4 ai = _mm_castps_si128(a.m128); const vint4 bi = _mm_castps_si128(b.m128); @@ -170,7 +208,7 @@ namespace embree } #endif -#if defined(__SSE4_1__) +#if defined(__aarch64__) || defined(__SSE4_1__) __forceinline Vec3fa maxi(const Vec3fa& a, const Vec3fa& b) { const vint4 ai = _mm_castps_si128(a.m128); const vint4 bi = _mm_castps_si128(b.m128); @@ -192,11 +230,30 @@ namespace embree __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); } __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); } __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); } +#else + +#if defined(__aarch64__) + __forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { + return _mm_madd_ps(a.m128, b.m128, c.m128); //a*b+c; + } + __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { + return _mm_msub_ps(a.m128, b.m128, c.m128); //-a*b+c; + } + __forceinline Vec3fa nmsub( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { + Vec3fa t = _mm_madd_ps(a.m128, b.m128, c.m128); + return -t; + } + __forceinline Vec3fa msub( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { + return _mm_madd_ps(a.m128,b.m128,vnegq_f32(c.m128)); //a*b-c + } + #else __forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b+c; } - __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b-c; } __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b+c;} __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b-c; } + __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b-c; } +#endif + #endif __forceinline Vec3fa madd ( const float a, const Vec3fa& b, const Vec3fa& c) { return madd(Vec3fa(a),b,c); } @@ -218,18 +275,37 @@ namespace embree //////////////////////////////////////////////////////////////////////////////// /// Reductions //////////////////////////////////////////////////////////////////////////////// - - __forceinline float reduce_add(const Vec3fa& v) { +#if defined(__aarch64__) && defined(BUILD_IOS) + __forceinline float reduce_add(const Vec3fa& v) { + float32x4_t t = v.m128; + t[3] = 0.0f; + return vaddvq_f32(t); + } + + __forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; } + __forceinline float reduce_min(const Vec3fa& v) { + float32x4_t t = v.m128; + t[3] = t[2]; + return vminvq_f32(t); + } + __forceinline float reduce_max(const Vec3fa& v) { + float32x4_t t = v.m128; + t[3] = t[2]; + return vmaxvq_f32(t); + } +#else + __forceinline float reduce_add(const Vec3fa& v) { const vfloat4 a(v.m128); const vfloat4 b = shuffle<1>(a); const vfloat4 c = shuffle<2>(a); - return _mm_cvtss_f32(a+b+c); + return _mm_cvtss_f32(a+b+c); } __forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; } __forceinline float reduce_min(const Vec3fa& v) { return min(v.x,v.y,v.z); } __forceinline float reduce_max(const Vec3fa& v) { return max(v.x,v.y,v.z); } - +#endif + //////////////////////////////////////////////////////////////////////////////// /// Comparison Operators //////////////////////////////////////////////////////////////////////////////// @@ 
-241,8 +317,13 @@ namespace embree __forceinline Vec3ba neq_mask(const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpneq_ps(a.m128, b.m128); } __forceinline Vec3ba lt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmplt_ps (a.m128, b.m128); } __forceinline Vec3ba le_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmple_ps (a.m128, b.m128); } - __forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpnle_ps(a.m128, b.m128); } - __forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpnlt_ps(a.m128, b.m128); } + #if defined(__aarch64__) + __forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpgt_ps (a.m128, b.m128); } + __forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpge_ps (a.m128, b.m128); } +#else + __forceinline Vec3ba gt_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnle_ps(a.m128, b.m128); } + __forceinline Vec3ba ge_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnlt_ps(a.m128, b.m128); } +#endif __forceinline bool isvalid ( const Vec3fa& v ) { return all(gt_mask(v,Vec3fa(-FLT_LARGE)) & lt_mask(v,Vec3fa(+FLT_LARGE))); @@ -280,7 +361,7 @@ namespace embree vfloat4 b0 = shuffle<1,2,0,3>(vfloat4(b.m128)); vfloat4 a1 = shuffle<1,2,0,3>(vfloat4(a.m128)); vfloat4 b1 = vfloat4(b.m128); - return Vec3fa(shuffle<1,2,0,3>(msub(a0,b0,a1*b1))); + return Vec3fa(shuffle<1,2,0,3>(prod_diff(a0,b0,a1,b1))); } __forceinline float sqr_length ( const Vec3fa& a ) { return dot(a,a); } @@ -335,7 +416,11 @@ namespace embree /// Rounding Functions //////////////////////////////////////////////////////////////////////////////// -#if defined (__SSE4_1__) +#if defined(__aarch64__) + __forceinline Vec3fa floor(const Vec3fa& a) { return vrndmq_f32(a.m128); } + __forceinline Vec3fa ceil (const Vec3fa& a) { return vrndpq_f32(a.m128); } + __forceinline Vec3fa trunc(const Vec3fa& a) { return vrndq_f32(a.m128); } +#elif defined (__SSE4_1__) __forceinline Vec3fa trunc( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); } __forceinline Vec3fa floor( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); } __forceinline Vec3fa ceil ( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); } @@ -393,8 +478,10 @@ namespace embree __forceinline Vec3fx( const Vec3fa& other, const int a1) { m128 = other.m128; a = a1; } __forceinline Vec3fx( const Vec3fa& other, const unsigned a1) { m128 = other.m128; u = a1; } - __forceinline Vec3fx( const Vec3fa& other, const float w1) { -#if defined (__SSE4_1__) + __forceinline Vec3fx( const Vec3fa& other, const float w1) { +#if defined (__aarch64__) + m128 = other.m128; m128[3] = w1; +#elif defined (__SSE4_1__) m128 = _mm_insert_ps(other.m128, _mm_set_ss(w1),3 << 4); #else const vint4 mask(-1,-1,-1,0); @@ -526,7 +613,7 @@ namespace embree __forceinline Vec3fx min( const Vec3fx& a, const Vec3fx& b ) { return _mm_min_ps(a.m128,b.m128); } __forceinline Vec3fx max( const Vec3fx& a, const Vec3fx& b ) { return _mm_max_ps(a.m128,b.m128); } -#if defined(__SSE4_1__) +#if defined(__SSE4_1__) || defined(__aarch64__) __forceinline Vec3fx mini(const Vec3fx& a, const Vec3fx& b) { const vint4 ai = _mm_castps_si128(a.m128); const vint4 bi = _mm_castps_si128(b.m128); @@ -535,7 +622,7 @@ namespace embree } #endif -#if defined(__SSE4_1__) +#if defined(__SSE4_1__) || defined(__aarch64__) __forceinline Vec3fx maxi(const Vec3fx& a, const Vec3fx& b) { const vint4 ai = _mm_castps_si128(a.m128); const vint4 bi = 
_mm_castps_si128(b.m128); @@ -584,11 +671,11 @@ namespace embree /// Reductions //////////////////////////////////////////////////////////////////////////////// - __forceinline float reduce_add(const Vec3fx& v) { + __forceinline float reduce_add(const Vec3fx& v) { const vfloat4 a(v.m128); const vfloat4 b = shuffle<1>(a); const vfloat4 c = shuffle<2>(a); - return _mm_cvtss_f32(a+b+c); + return _mm_cvtss_f32(a+b+c); } __forceinline float reduce_mul(const Vec3fx& v) { return v.x*v.y*v.z; } @@ -700,7 +787,7 @@ namespace embree /// Rounding Functions //////////////////////////////////////////////////////////////////////////////// -#if defined (__SSE4_1__) +#if defined (__SSE4_1__) && !defined(__aarch64__) __forceinline Vec3fx trunc( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); } __forceinline Vec3fx floor( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); } __forceinline Vec3fx ceil ( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); } diff --git a/thirdparty/embree/common/math/vec3ia.h b/thirdparty/embree/common/math/vec3ia.h index e1c997299400..737f67fd725e 100644 --- a/thirdparty/embree/common/math/vec3ia.h +++ b/thirdparty/embree/common/math/vec3ia.h @@ -65,7 +65,9 @@ namespace embree __forceinline Vec3ia operator +( const Vec3ia& a ) { return a; } __forceinline Vec3ia operator -( const Vec3ia& a ) { return _mm_sub_epi32(_mm_setzero_si128(), a.m128); } -#if defined(__SSSE3__) +#if (defined(__aarch64__)) + __forceinline Vec3ia abs ( const Vec3ia& a ) { return vabsq_s32(a.m128); } +#elif defined(__SSSE3__) __forceinline Vec3ia abs ( const Vec3ia& a ) { return _mm_abs_epi32(a.m128); } #endif @@ -81,7 +83,7 @@ namespace embree __forceinline Vec3ia operator -( const Vec3ia& a, const int b ) { return a-Vec3ia(b); } __forceinline Vec3ia operator -( const int a, const Vec3ia& b ) { return Vec3ia(a)-b; } -#if defined(__SSE4_1__) +#if defined(__aarch64__) || defined(__SSE4_1__) __forceinline Vec3ia operator *( const Vec3ia& a, const Vec3ia& b ) { return _mm_mullo_epi32(a.m128, b.m128); } __forceinline Vec3ia operator *( const Vec3ia& a, const int b ) { return a * Vec3ia(b); } __forceinline Vec3ia operator *( const int a, const Vec3ia& b ) { return Vec3ia(a) * b; } @@ -99,12 +101,14 @@ namespace embree __forceinline Vec3ia operator ^( const Vec3ia& a, const int b ) { return a ^ Vec3ia(b); } __forceinline Vec3ia operator ^( const int a, const Vec3ia& b ) { return Vec3ia(a) ^ b; } +#if !defined(__ARM_NEON) __forceinline Vec3ia operator <<( const Vec3ia& a, const int n ) { return _mm_slli_epi32(a.m128, n); } __forceinline Vec3ia operator >>( const Vec3ia& a, const int n ) { return _mm_srai_epi32(a.m128, n); } __forceinline Vec3ia sll ( const Vec3ia& a, const int b ) { return _mm_slli_epi32(a.m128, b); } __forceinline Vec3ia sra ( const Vec3ia& a, const int b ) { return _mm_srai_epi32(a.m128, b); } __forceinline Vec3ia srl ( const Vec3ia& a, const int b ) { return _mm_srli_epi32(a.m128, b); } +#endif //////////////////////////////////////////////////////////////////////////////// /// Assignment Operators @@ -116,7 +120,7 @@ namespace embree __forceinline Vec3ia& operator -=( Vec3ia& a, const Vec3ia& b ) { return a = a - b; } __forceinline Vec3ia& operator -=( Vec3ia& a, const int& b ) { return a = a - b; } -#if defined(__SSE4_1__) +#if defined(__aarch64__) || defined(__SSE4_1__) __forceinline Vec3ia& operator *=( Vec3ia& a, const Vec3ia& b ) { return a = a * b; } __forceinline Vec3ia& operator *=( Vec3ia& a, const int& b ) { 
return a = a * b; } #endif @@ -127,18 +131,38 @@ namespace embree __forceinline Vec3ia& operator |=( Vec3ia& a, const Vec3ia& b ) { return a = a | b; } __forceinline Vec3ia& operator |=( Vec3ia& a, const int& b ) { return a = a | b; } +#if !defined(__ARM_NEON) __forceinline Vec3ia& operator <<=( Vec3ia& a, const int& b ) { return a = a << b; } __forceinline Vec3ia& operator >>=( Vec3ia& a, const int& b ) { return a = a >> b; } +#endif //////////////////////////////////////////////////////////////////////////////// /// Reductions //////////////////////////////////////////////////////////////////////////////// - +#if defined(__aarch64__) + __forceinline int reduce_add(const Vec3ia& v) { + int32x4_t t = v.m128; + t[3] = 0; + return vaddvq_s32(t); + + } + __forceinline int reduce_mul(const Vec3ia& v) { return v.x*v.y*v.z; } + __forceinline int reduce_min(const Vec3ia& v) { + int32x4_t t = (__m128i)blendv_ps((__m128)v0x7fffffff, (__m128)v.m128, (__m128)vFFF0); + return vminvq_s32(t); + + } + __forceinline int reduce_max(const Vec3ia& v) { + int32x4_t t = (__m128i)blendv_ps((__m128)v0x80000000, (__m128)v.m128, (__m128)vFFF0); + return vmaxvq_s32(t); + + } +#else __forceinline int reduce_add(const Vec3ia& v) { return v.x+v.y+v.z; } __forceinline int reduce_mul(const Vec3ia& v) { return v.x*v.y*v.z; } __forceinline int reduce_min(const Vec3ia& v) { return min(v.x,v.y,v.z); } __forceinline int reduce_max(const Vec3ia& v) { return max(v.x,v.y,v.z); } - +#endif //////////////////////////////////////////////////////////////////////////////// /// Comparison Operators //////////////////////////////////////////////////////////////////////////////// @@ -161,14 +185,14 @@ namespace embree //////////////////////////////////////////////////////////////////////////////// __forceinline Vec3ia select( const Vec3ba& m, const Vec3ia& t, const Vec3ia& f ) { -#if defined(__SSE4_1__) +#if defined(__aarch64__) || defined(__SSE4_1__) return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m)); #else return _mm_or_si128(_mm_and_si128(_mm_castps_si128(m), t), _mm_andnot_si128(_mm_castps_si128(m), f)); #endif } -#if defined(__SSE4_1__) +#if defined(__aarch64__) || defined(__SSE4_1__) __forceinline Vec3ia min( const Vec3ia& a, const Vec3ia& b ) { return _mm_min_epi32(a.m128,b.m128); } __forceinline Vec3ia max( const Vec3ia& a, const Vec3ia& b ) { return _mm_max_epi32(a.m128,b.m128); } #else diff --git a/thirdparty/embree/common/math/vec4.h b/thirdparty/embree/common/math/vec4.h index 3354b443178a..d16542f507f7 100644 --- a/thirdparty/embree/common/math/vec4.h +++ b/thirdparty/embree/common/math/vec4.h @@ -192,7 +192,7 @@ namespace embree //////////////////////////////////////////////////////////////////////////////// typedef Vec4 Vec4b; - typedef Vec4 Vec4uc; + typedef Vec4 Vec4uc; typedef Vec4 Vec4i; typedef Vec4 Vec4f; } @@ -205,7 +205,7 @@ namespace embree /// SSE / AVX / MIC specializations //////////////////////////////////////////////////////////////////////////////// -#if defined __SSE__ +#if defined(__SSE__) || defined(__ARM_NEON) #include "../simd/sse.h" #endif @@ -225,13 +225,13 @@ namespace embree template<> __forceinline Vec4::Vec4( const Vec3fx& a ) { x = a.x; y = a.y; z = a.z; w = a.w; } -#elif defined(__SSE__) +#elif defined(__SSE__) || defined(__ARM_NEON) template<> __forceinline Vec4::Vec4( const Vec3fx& a ) { const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v); w = shuffle<3,3,3,3>(v); } #endif -#if defined(__SSE__) +#if 
defined(__SSE__) || defined(__ARM_NEON) __forceinline Vec4 broadcast4f( const Vec4& a, const size_t k ) { return Vec4(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k]), vfloat4::broadcast(&a.w[k])); } diff --git a/thirdparty/embree/common/simd/simd.h b/thirdparty/embree/common/simd/simd.h index c1351c2c8884..647851110b87 100644 --- a/thirdparty/embree/common/simd/simd.h +++ b/thirdparty/embree/common/simd/simd.h @@ -6,7 +6,7 @@ #include "../math/math.h" /* include SSE wrapper classes */ -#if defined(__SSE__) +#if defined(__SSE__) || defined(__ARM_NEON) # include "sse.h" #endif diff --git a/thirdparty/embree/common/simd/sse.h b/thirdparty/embree/common/simd/sse.h index 67df3ec00980..6bc818b55bb5 100644 --- a/thirdparty/embree/common/simd/sse.h +++ b/thirdparty/embree/common/simd/sse.h @@ -11,7 +11,7 @@ namespace embree { -#if defined(__SSE4_1__) +#if (defined(__aarch64__) && defined(BUILD_IOS)) || defined(__SSE4_1__) __forceinline __m128 blendv_ps(__m128 f, __m128 t, __m128 mask) { return _mm_blendv_ps(f,t,mask); } diff --git a/thirdparty/embree/common/simd/vboold4_avx.h b/thirdparty/embree/common/simd/vboold4_avx.h index 44e423b001be..6505ee56f38f 100644 --- a/thirdparty/embree/common/simd/vboold4_avx.h +++ b/thirdparty/embree/common/simd/vboold4_avx.h @@ -56,8 +56,12 @@ namespace embree //////////////////////////////////////////////////////////////////////////////// __forceinline vboold(FalseTy) : v(_mm256_setzero_pd()) {} +#if !defined(__aarch64__) __forceinline vboold(TrueTy) : v(_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), _CMP_EQ_OQ)) {} - +#else + __forceinline vboold(TrueTy) : v(_mm256_cmpeq_pd(_mm256_setzero_pd(), _mm256_setzero_pd())) {} +#endif + //////////////////////////////////////////////////////////////////////////////// /// Array Access //////////////////////////////////////////////////////////////////////////////// @@ -101,9 +105,10 @@ namespace embree /// Movement/Shifting/Shuffling Functions //////////////////////////////////////////////////////////////////////////////// +#if !defined(__aarch64__) __forceinline vboold4 unpacklo(const vboold4& a, const vboold4& b) { return _mm256_unpacklo_pd(a, b); } __forceinline vboold4 unpackhi(const vboold4& a, const vboold4& b) { return _mm256_unpackhi_pd(a, b); } - +#endif #if defined(__AVX2__) template diff --git a/thirdparty/embree/common/simd/vboolf4_sse2.h b/thirdparty/embree/common/simd/vboolf4_sse2.h index afec10fd499e..ed53b3c783f8 100644 --- a/thirdparty/embree/common/simd/vboolf4_sse2.h +++ b/thirdparty/embree/common/simd/vboolf4_sse2.h @@ -37,9 +37,13 @@ namespace embree : v(mm_lookupmask_ps[(size_t(b) << 3) | (size_t(a) << 2) | (size_t(b) << 1) | size_t(a)]) {} __forceinline vboolf(bool a, bool b, bool c, bool d) : v(mm_lookupmask_ps[(size_t(d) << 3) | (size_t(c) << 2) | (size_t(b) << 1) | size_t(a)]) {} +#if defined(__aarch64__) && defined(BUILD_IOS) + __forceinline vboolf(int mask) { v = mm_lookupmask_ps[mask]; } + __forceinline vboolf(unsigned int mask) { v = mm_lookupmask_ps[mask]; } +#else __forceinline vboolf(int mask) { assert(mask >= 0 && mask < 16); v = mm_lookupmask_ps[mask]; } __forceinline vboolf(unsigned int mask) { assert(mask < 16); v = mm_lookupmask_ps[mask]; } - +#endif /* return int32 mask */ __forceinline __m128i mask32() const { return _mm_castps_si128(v); @@ -56,8 +60,13 @@ namespace embree /// Array Access //////////////////////////////////////////////////////////////////////////////// +#if defined(__aarch64__) && defined(BUILD_IOS) + __forceinline bool 
operator [](size_t index) const { return (_mm_movemask_ps(v) >> index) & 1; } + __forceinline int& operator [](size_t index) { return i[index]; } +#else __forceinline bool operator [](size_t index) const { assert(index < 4); return (_mm_movemask_ps(v) >> index) & 1; } __forceinline int& operator [](size_t index) { assert(index < 4); return i[index]; } +#endif }; //////////////////////////////////////////////////////////////////////////////// @@ -92,7 +101,7 @@ namespace embree __forceinline vboolf4 operator ==(const vboolf4& a, const vboolf4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); } __forceinline vboolf4 select(const vboolf4& m, const vboolf4& t, const vboolf4& f) { -#if defined(__SSE4_1__) +#if (defined(__aarch64__) && defined(BUILD_IOS)) || defined(__SSE4_1__) return _mm_blendv_ps(f, t, m); #else return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f)); @@ -106,6 +115,17 @@ namespace embree __forceinline vboolf4 unpacklo(const vboolf4& a, const vboolf4& b) { return _mm_unpacklo_ps(a, b); } __forceinline vboolf4 unpackhi(const vboolf4& a, const vboolf4& b) { return _mm_unpackhi_ps(a, b); } +#if defined(__aarch64__) + template + __forceinline vboolf4 shuffle(const vboolf4& v) { + return vreinterpretq_f32_u8(vqtbl1q_u8( vreinterpretq_u8_s32(v), _MN_SHUFFLE(i0, i1, i2, i3))); + } + + template + __forceinline vboolf4 shuffle(const vboolf4& a, const vboolf4& b) { + return vreinterpretq_f32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3))); + } +#else template __forceinline vboolf4 shuffle(const vboolf4& v) { return _mm_castsi128_ps(_mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0))); @@ -115,7 +135,8 @@ namespace embree __forceinline vboolf4 shuffle(const vboolf4& a, const vboolf4& b) { return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); } - +#endif + template __forceinline vboolf4 shuffle(const vboolf4& v) { return shuffle(v); @@ -127,7 +148,7 @@ namespace embree template<> __forceinline vboolf4 shuffle<0, 1, 0, 1>(const vboolf4& v) { return _mm_castpd_ps(_mm_movedup_pd(v)); } #endif -#if defined(__SSE4_1__) +#if defined(__SSE4_1__) && !defined(__aarch64__) template __forceinline vboolf4 insert(const vboolf4& a, const vboolf4& b) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); } template __forceinline vboolf4 insert(const vboolf4& a, const vboolf4& b) { return insert(a, b); } template __forceinline vboolf4 insert(const vboolf4& a, const bool b) { return insert(a, vboolf4(b)); } @@ -149,10 +170,14 @@ namespace embree __forceinline bool none(const vboolf4& valid, const vboolf4& b) { return none(valid & b); } __forceinline size_t movemask(const vboolf4& a) { return _mm_movemask_ps(a); } +#if defined(__aarch64__) && defined(BUILD_IOS) +__forceinline size_t popcnt(const vboolf4& a) { return _mm_movemask_popcnt_ps(a); } +#else #if defined(__SSE4_2__) __forceinline size_t popcnt(const vboolf4& a) { return popcnt((size_t)_mm_movemask_ps(a)); } #else __forceinline size_t popcnt(const vboolf4& a) { return bool(a[0])+bool(a[1])+bool(a[2])+bool(a[3]); } +#endif #endif //////////////////////////////////////////////////////////////////////////////// diff --git a/thirdparty/embree/common/simd/vboolf8_avx.h b/thirdparty/embree/common/simd/vboolf8_avx.h index 5d7c0d68c1b0..4f64741b55ab 100644 --- a/thirdparty/embree/common/simd/vboolf8_avx.h +++ b/thirdparty/embree/common/simd/vboolf8_avx.h @@ -68,8 +68,11 @@ namespace embree //////////////////////////////////////////////////////////////////////////////// __forceinline vboolf(FalseTy) 
: v(_mm256_setzero_ps()) {} +#if !defined(__aarch64__) __forceinline vboolf(TrueTy) : v(_mm256_cmp_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), _CMP_EQ_OQ)) {} - +#else + __forceinline vboolf(TrueTy) : v(_mm256_cmpeq_ps(_mm256_setzero_ps(), _mm256_setzero_ps())) {} +#endif //////////////////////////////////////////////////////////////////////////////// /// Array Access //////////////////////////////////////////////////////////////////////////////// diff --git a/thirdparty/embree/common/simd/vdouble4_avx.h b/thirdparty/embree/common/simd/vdouble4_avx.h index eedb04aafbe6..1f65b45d7e7e 100644 --- a/thirdparty/embree/common/simd/vdouble4_avx.h +++ b/thirdparty/embree/common/simd/vdouble4_avx.h @@ -181,13 +181,20 @@ namespace embree __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_GE); } __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_GT); } __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_LE); } -#else +#elif !defined(__aarch64__) __forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_EQ_OQ); } __forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NEQ_UQ); } __forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_LT_OS); } __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NLT_US); } __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NLE_US); } __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_LE_OS); } +#else + __forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmpeq_pd(a, b); } + __forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmpneq_pd(a, b); } + __forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmplt_pd(a, b); } + __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmpnlt_pd(a, b); } + __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmpnle_pd(a, b); } + __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmple_pd(a, b); } #endif __forceinline vboold4 operator ==(const vdouble4& a, double b) { return a == vdouble4(b); } diff --git a/thirdparty/embree/common/simd/vfloat4_sse2.h b/thirdparty/embree/common/simd/vfloat4_sse2.h index 96f984cebd5e..5732c0fbc850 100644 --- a/thirdparty/embree/common/simd/vfloat4_sse2.h +++ b/thirdparty/embree/common/simd/vfloat4_sse2.h @@ -10,18 +10,18 @@ namespace embree struct vfloat<4> { ALIGNED_STRUCT_(16); - + typedef vboolf4 Bool; typedef vint4 Int; typedef vfloat4 Float; - + enum { size = 4 }; // number of SIMD elements union { __m128 v; float f[4]; int i[4]; }; // data //////////////////////////////////////////////////////////////////////////////// /// Constructors, Assignment & Cast Operators //////////////////////////////////////////////////////////////////////////////// - + __forceinline vfloat() {} __forceinline vfloat(const vfloat4& other) { v = other.v; } __forceinline vfloat4& operator =(const vfloat4& other) { v = other.v; return *this; } @@ -34,14 +34,19 @@ namespace embree __forceinline vfloat(float a, float 
b, float c, float d) : v(_mm_set_ps(d, c, b, a)) {} __forceinline explicit vfloat(const vint4& a) : v(_mm_cvtepi32_ps(a)) {} +#if defined(__aarch64__) + __forceinline explicit vfloat(const vuint4& x) { + v = vcvtq_f32_u32(vreinterpretq_u32_s32(x.v)); + } +#else __forceinline explicit vfloat(const vuint4& x) { const __m128i a = _mm_and_si128(x,_mm_set1_epi32(0x7FFFFFFF)); - const __m128i b = _mm_and_si128(_mm_srai_epi32(x,31),_mm_set1_epi32(0x4F000000)); //0x4F000000 = 2^31 + const __m128i b = _mm_and_si128(_mm_srai_epi32(x,31),_mm_set1_epi32(0x4F000000)); //0x4F000000 = 2^31 const __m128 af = _mm_cvtepi32_ps(a); - const __m128 bf = _mm_castsi128_ps(b); + const __m128 bf = _mm_castsi128_ps(b); v = _mm_add_ps(af,bf); } - +#endif //////////////////////////////////////////////////////////////////////////////// /// Constants //////////////////////////////////////////////////////////////////////////////// @@ -102,32 +107,44 @@ namespace embree #if defined (__SSE4_1__) return _mm_castsi128_ps(_mm_stream_load_si128((__m128i*)ptr)); #else - return _mm_load_ps(ptr); + return _mm_load_ps(ptr); #endif } -#if defined(__SSE4_1__) - static __forceinline vfloat4 load(const char* ptr) { +#if defined(__aarch64__) + static __forceinline vfloat4 load(const int8_t* ptr) { + return __m128(_mm_load4epi8_f32(((__m128i*)ptr))); + } +#elif defined(__SSE4_1__) + static __forceinline vfloat4 load(const int8_t* ptr) { return _mm_cvtepi32_ps(_mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)ptr))); } #else - static __forceinline vfloat4 load(const char* ptr) { + static __forceinline vfloat4 load(const int8_t* ptr) { return vfloat4(ptr[0],ptr[1],ptr[2],ptr[3]); } #endif -#if defined(__SSE4_1__) - static __forceinline vfloat4 load(const unsigned char* ptr) { +#if defined(__aarch64__) + static __forceinline vfloat4 load(const uint8_t* ptr) { + return __m128(_mm_load4epu8_f32(((__m128i*)ptr))); + } +#elif defined(__SSE4_1__) + static __forceinline vfloat4 load(const uint8_t* ptr) { return _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr))); } #else - static __forceinline vfloat4 load(const unsigned char* ptr) { + static __forceinline vfloat4 load(const uint8_t* ptr) { //return _mm_cvtpu8_ps(*(__m64*)ptr); // don't enable, will use MMX instructions return vfloat4(ptr[0],ptr[1],ptr[2],ptr[3]); } #endif -#if defined(__SSE4_1__) +#if defined(__aarch64__) + static __forceinline vfloat4 load(const short* ptr) { + return __m128(_mm_load4epi16_f32(((__m128i*)ptr))); + } +#elif defined(__SSE4_1__) static __forceinline vfloat4 load(const short* ptr) { return _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_loadu_si128((__m128i*)ptr))); } @@ -140,11 +157,15 @@ namespace embree static __forceinline vfloat4 load(const unsigned short* ptr) { return _mm_mul_ps(vfloat4(vint4::load(ptr)),vfloat4(1.0f/65535.0f)); } - + static __forceinline void store_nt(void* ptr, const vfloat4& v) { #if defined (__SSE4_1__) +#if defined(__aarch64__) + _mm_stream_ps((float*)ptr,vreinterpretq_s32_f32(v.v)); +#else _mm_stream_ps((float*)ptr,v); +#endif #else _mm_store_ps((float*)ptr,v); #endif @@ -152,14 +173,14 @@ namespace embree template static __forceinline vfloat4 gather(const float* ptr, const vint4& index) { -#if defined(__AVX2__) +#if defined(__AVX2__) && !defined(__aarch64__) return _mm_i32gather_ps(ptr, index, scale); #else return vfloat4( - *(float*)(((char*)ptr)+scale*index[0]), - *(float*)(((char*)ptr)+scale*index[1]), - *(float*)(((char*)ptr)+scale*index[2]), - *(float*)(((char*)ptr)+scale*index[3])); + *(float*)(((int8_t*)ptr)+scale*index[0]), + 
*(float*)(((int8_t*)ptr)+scale*index[1]), + *(float*)(((int8_t*)ptr)+scale*index[2]), + *(float*)(((int8_t*)ptr)+scale*index[3])); #endif } @@ -168,13 +189,13 @@ namespace embree vfloat4 r = zero; #if defined(__AVX512VL__) return _mm_mmask_i32gather_ps(r, mask, index, ptr, scale); -#elif defined(__AVX2__) +#elif defined(__AVX2__) && !defined(__aarch64__) return _mm_mask_i32gather_ps(r, ptr, index, mask, scale); #else - if (likely(mask[0])) r[0] = *(float*)(((char*)ptr)+scale*index[0]); - if (likely(mask[1])) r[1] = *(float*)(((char*)ptr)+scale*index[1]); - if (likely(mask[2])) r[2] = *(float*)(((char*)ptr)+scale*index[2]); - if (likely(mask[3])) r[3] = *(float*)(((char*)ptr)+scale*index[3]); + if (likely(mask[0])) r[0] = *(float*)(((int8_t*)ptr)+scale*index[0]); + if (likely(mask[1])) r[1] = *(float*)(((int8_t*)ptr)+scale*index[1]); + if (likely(mask[2])) r[2] = *(float*)(((int8_t*)ptr)+scale*index[2]); + if (likely(mask[3])) r[3] = *(float*)(((int8_t*)ptr)+scale*index[3]); return r; #endif } @@ -185,10 +206,10 @@ namespace embree #if defined(__AVX512VL__) _mm_i32scatter_ps((float*)ptr, index, v, scale); #else - *(float*)(((char*)ptr)+scale*index[0]) = v[0]; - *(float*)(((char*)ptr)+scale*index[1]) = v[1]; - *(float*)(((char*)ptr)+scale*index[2]) = v[2]; - *(float*)(((char*)ptr)+scale*index[3]) = v[3]; + *(float*)(((int8_t*)ptr)+scale*index[0]) = v[0]; + *(float*)(((int8_t*)ptr)+scale*index[1]) = v[1]; + *(float*)(((int8_t*)ptr)+scale*index[2]) = v[2]; + *(float*)(((int8_t*)ptr)+scale*index[3]) = v[3]; #endif } @@ -198,20 +219,20 @@ namespace embree #if defined(__AVX512VL__) _mm_mask_i32scatter_ps((float*)ptr ,mask, index, v, scale); #else - if (likely(mask[0])) *(float*)(((char*)ptr)+scale*index[0]) = v[0]; - if (likely(mask[1])) *(float*)(((char*)ptr)+scale*index[1]) = v[1]; - if (likely(mask[2])) *(float*)(((char*)ptr)+scale*index[2]) = v[2]; - if (likely(mask[3])) *(float*)(((char*)ptr)+scale*index[3]) = v[3]; + if (likely(mask[0])) *(float*)(((int8_t*)ptr)+scale*index[0]) = v[0]; + if (likely(mask[1])) *(float*)(((int8_t*)ptr)+scale*index[1]) = v[1]; + if (likely(mask[2])) *(float*)(((int8_t*)ptr)+scale*index[2]) = v[2]; + if (likely(mask[3])) *(float*)(((int8_t*)ptr)+scale*index[3]) = v[3]; #endif } - static __forceinline void store(const vboolf4& mask, char* ptr, const vint4& ofs, const vfloat4& v) { + static __forceinline void store(const vboolf4& mask, int8_t* ptr, const vint4& ofs, const vfloat4& v) { scatter<1>(mask,ptr,ofs,v); } static __forceinline void store(const vboolf4& mask, float* ptr, const vint4& ofs, const vfloat4& v) { scatter<4>(mask,ptr,ofs,v); } - + //////////////////////////////////////////////////////////////////////////////// /// Array Access //////////////////////////////////////////////////////////////////////////////// @@ -222,10 +243,10 @@ namespace embree friend __forceinline vfloat4 select(const vboolf4& m, const vfloat4& t, const vfloat4& f) { #if defined(__AVX512VL__) return _mm_mask_blend_ps(m, f, t); -#elif defined(__SSE4_1__) - return _mm_blendv_ps(f, t, m); +#elif defined(__SSE4_1__) || (defined(__aarch64__)) + return _mm_blendv_ps(f, t, m); #else - return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f)); + return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f)); #endif } }; @@ -243,18 +264,47 @@ namespace embree __forceinline vfloat4 toFloat(const vint4& a) { return vfloat4(a); } __forceinline vfloat4 operator +(const vfloat4& a) { return a; } +#if defined(__aarch64__) + __forceinline vfloat4 operator -(const vfloat4& a) { + return vnegq_f32(a); + 
} +#else __forceinline vfloat4 operator -(const vfloat4& a) { return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); } +#endif +#if defined(__aarch64__) + __forceinline vfloat4 abs(const vfloat4& a) { return _mm_abs_ps(a); } +#else __forceinline vfloat4 abs(const vfloat4& a) { return _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))); } +#endif + #if defined(__AVX512VL__) __forceinline vfloat4 sign(const vfloat4& a) { return _mm_mask_blend_ps(_mm_cmp_ps_mask(a, vfloat4(zero), _CMP_LT_OQ), vfloat4(one), -vfloat4(one)); } #else __forceinline vfloat4 sign(const vfloat4& a) { return blendv_ps(vfloat4(one), -vfloat4(one), _mm_cmplt_ps(a, vfloat4(zero))); } #endif + +#if defined(__aarch64__) + __forceinline vfloat4 signmsk(const vfloat4& a) { return _mm_and_ps(a, vreinterpretq_f32_u32(v0x80000000)); } +#else __forceinline vfloat4 signmsk(const vfloat4& a) { return _mm_and_ps(a,_mm_castsi128_ps(_mm_set1_epi32(0x80000000))); } - +#endif + __forceinline vfloat4 rcp(const vfloat4& a) { +#if defined(__aarch64__) +#if defined(BUILD_IOS) + return vfloat4(vdivq_f32(vdupq_n_f32(1.0f),a.v)); +#else //BUILD_IOS + __m128 reciprocal = _mm_rcp_ps(a); + reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal); + // +1 round since NEON's reciprocal estimate instruction has less accuracy than SSE2's rcp. + reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal); + return (const vfloat4)reciprocal; +#endif // BUILD_IOS +#else + #if defined(__AVX512VL__) const vfloat4 r = _mm_rcp14_ps(a); #else @@ -266,12 +316,22 @@ namespace embree #else return _mm_mul_ps(r,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r, a))); #endif + +#endif //defined(__aarch64__) } __forceinline vfloat4 sqr (const vfloat4& a) { return _mm_mul_ps(a,a); } __forceinline vfloat4 sqrt(const vfloat4& a) { return _mm_sqrt_ps(a); } __forceinline vfloat4 rsqrt(const vfloat4& a) { +#if defined(__aarch64__) + vfloat4 r = _mm_rsqrt_ps(a); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r)); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r)); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r)); + return r; +#else + #if defined(__AVX512VL__) const vfloat4 r = _mm_rsqrt14_ps(a); #else @@ -284,11 +344,17 @@ namespace embree #else return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f), r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); +#endif + #endif } __forceinline vboolf4 isnan(const vfloat4& a) { +#if defined(__aarch64__) + const vfloat4 b = _mm_and_ps(a, vreinterpretq_f32_u32(v0x7fffffff)); +#else const vfloat4 b = _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))); +#endif #if defined(__AVX512VL__) return _mm_cmp_epi32_mask(_mm_castps_si128(b), _mm_set1_epi32(0x7f800000), _MM_CMPINT_GT); #else @@ -329,7 +395,8 @@ namespace embree __forceinline vfloat4 max(const vfloat4& a, float b) { return _mm_max_ps(a,vfloat4(b)); } __forceinline vfloat4 max(float a, const vfloat4& b) { return _mm_max_ps(vfloat4(a),b); } -#if defined(__SSE4_1__) +#if defined(__SSE4_1__) || defined(__aarch64__) + __forceinline vfloat4 mini(const vfloat4& a, const vfloat4& b) { const vint4 ai = _mm_castps_si128(a); const vint4 bi = _mm_castps_si128(b); @@ -377,10 +444,24 @@ namespace embree __forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fnmadd_ps(a,b,c); } __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fnmsub_ps(a,b,c); } #else + +#if defined(__aarch64__) + 
__forceinline vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c) { + return _mm_madd_ps(a, b, c); //a*b+c; + } + __forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) { + return _mm_msub_ps(a, b, c); //-a*b+c; + } + __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { + return vnegq_f32(vfmaq_f32(c,a, b)); + } +#else __forceinline vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b+c; } - __forceinline vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b-c; } __forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return -a*b+c;} __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return -a*b-c; } +#endif + __forceinline vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b-c; } + #endif //////////////////////////////////////////////////////////////////////////////// @@ -414,8 +495,13 @@ namespace embree __forceinline vboolf4 operator ==(const vfloat4& a, const vfloat4& b) { return _mm_cmpeq_ps (a, b); } __forceinline vboolf4 operator !=(const vfloat4& a, const vfloat4& b) { return _mm_cmpneq_ps(a, b); } __forceinline vboolf4 operator < (const vfloat4& a, const vfloat4& b) { return _mm_cmplt_ps (a, b); } +#if defined(__aarch64__) + __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmpge_ps (a, b); } + __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmpgt_ps (a, b); } +#else __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmpnlt_ps(a, b); } __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmpnle_ps(a, b); } +#endif __forceinline vboolf4 operator <=(const vfloat4& a, const vfloat4& b) { return _mm_cmple_ps (a, b); } #endif @@ -427,7 +513,7 @@ namespace embree __forceinline vboolf4 operator < (const vfloat4& a, float b) { return a < vfloat4(b); } __forceinline vboolf4 operator < (float a, const vfloat4& b) { return vfloat4(a) < b; } - + __forceinline vboolf4 operator >=(const vfloat4& a, float b) { return a >= vfloat4(b); } __forceinline vboolf4 operator >=(float a, const vfloat4& b) { return vfloat4(a) >= b; } @@ -463,17 +549,68 @@ namespace embree template __forceinline vfloat4 select(const vfloat4& t, const vfloat4& f) { -#if defined(__SSE4_1__) +#if defined(__SSE4_1__) return _mm_blend_ps(f, t, mask); #else return select(vboolf4(mask), t, f); #endif } - + +#if defined(__aarch64__) + template<> __forceinline vfloat4 select<0>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vzero)); + } + template<> __forceinline vfloat4 select<1>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v000F)); + } + template<> __forceinline vfloat4 select<2>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v00F0)); + } + template<> __forceinline vfloat4 select<3>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v00FF)); + } + template<> __forceinline vfloat4 select<4>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v0F00)); + } + template<> __forceinline vfloat4 select<5>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v0F0F)); + } + template<> __forceinline vfloat4 select<6>(const vfloat4& t, const vfloat4& f) { + return 
_mm_blendv_ps(f, t, vreinterpretq_f32_u32(v0FF0)); + } + template<> __forceinline vfloat4 select<7>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v0FFF)); + } + template<> __forceinline vfloat4 select<8>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vF000)); + } + template<> __forceinline vfloat4 select<9>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vF00F)); + } + template<> __forceinline vfloat4 select<10>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vF0F0)); + } + template<> __forceinline vfloat4 select<11>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vF0FF)); + } + template<> __forceinline vfloat4 select<12>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vFF00)); + } + template<> __forceinline vfloat4 select<13>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vFF0F)); + } + template<> __forceinline vfloat4 select<14>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vFFF0)); + } + template<> __forceinline vfloat4 select<15>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vFFFF)); + } +#endif + __forceinline vfloat4 lerp(const vfloat4& a, const vfloat4& b, const vfloat4& t) { return madd(t,b-a,a); } - + __forceinline bool isvalid(const vfloat4& v) { return all((v > vfloat4(-FLT_LARGE)) & (v < vfloat4(+FLT_LARGE))); } @@ -485,16 +622,21 @@ namespace embree __forceinline bool is_finite(const vboolf4& valid, const vfloat4& a) { return all(valid, (a >= vfloat4(-FLT_MAX)) & (a <= vfloat4(+FLT_MAX))); } - + //////////////////////////////////////////////////////////////////////////////// /// Rounding Functions //////////////////////////////////////////////////////////////////////////////// -#if defined (__SSE4_1__) - __forceinline vfloat4 floor(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); } - __forceinline vfloat4 ceil (const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); } - __forceinline vfloat4 trunc(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_ZERO ); } - __forceinline vfloat4 round(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); } +#if defined(__aarch64__) + __forceinline vfloat4 floor(const vfloat4& a) { return vrndmq_f32(a.v); } // towards -inf + __forceinline vfloat4 ceil (const vfloat4& a) { return vrndpq_f32(a.v); } // toward +inf + __forceinline vfloat4 trunc(const vfloat4& a) { return vrndq_f32(a.v); } // towards 0 + __forceinline vfloat4 round(const vfloat4& a) { return vrndnq_f32(a.v); } // to nearest, ties to even. NOTE(LTE): arm clang uses vrndnq, old gcc uses vrndqn? 
+#elif defined (__SSE4_1__) + __forceinline vfloat4 floor(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); } + __forceinline vfloat4 ceil (const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); } + __forceinline vfloat4 trunc(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_ZERO ); } + __forceinline vfloat4 round(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); } // (even) https://www.felixcloutier.com/x86/roundpd #else __forceinline vfloat4 floor(const vfloat4& a) { return vfloat4(floorf(a[0]),floorf(a[1]),floorf(a[2]),floorf(a[3])); } __forceinline vfloat4 ceil (const vfloat4& a) { return vfloat4(ceilf (a[0]),ceilf (a[1]),ceilf (a[2]),ceilf (a[3])); } @@ -504,7 +646,9 @@ namespace embree __forceinline vfloat4 frac(const vfloat4& a) { return a-floor(a); } __forceinline vint4 floori(const vfloat4& a) { -#if defined(__SSE4_1__) +#if defined(__aarch64__) + return vcvtq_s32_f32(floor(a)); +#elif defined(__SSE4_1__) return vint4(floor(a)); #else return vint4(a-vfloat4(0.5f)); @@ -518,6 +662,16 @@ namespace embree __forceinline vfloat4 unpacklo(const vfloat4& a, const vfloat4& b) { return _mm_unpacklo_ps(a, b); } __forceinline vfloat4 unpackhi(const vfloat4& a, const vfloat4& b) { return _mm_unpackhi_ps(a, b); } +#if defined(__aarch64__) + template + __forceinline vfloat4 shuffle(const vfloat4& v) { + return vreinterpretq_f32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3))); + } + template + __forceinline vfloat4 shuffle(const vfloat4& a, const vfloat4& b) { + return vreinterpretq_f32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3))); + } +#else template __forceinline vfloat4 shuffle(const vfloat4& v) { return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v), _MM_SHUFFLE(i3, i2, i1, i0))); @@ -527,14 +681,19 @@ namespace embree __forceinline vfloat4 shuffle(const vfloat4& a, const vfloat4& b) { return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); } +#endif #if defined (__SSSE3__) __forceinline vfloat4 shuffle8(const vfloat4& a, const vint4& shuf) { - return _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(a), shuf)); + return _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(a), shuf)); } #endif -#if defined(__SSE3__) +#if defined(__aarch64__) + template<> __forceinline vfloat4 shuffle<0, 0, 2, 2>(const vfloat4& v) { return __m128(vqtbl1q_u8( uint8x16_t(v.v), v0022 )); } + template<> __forceinline vfloat4 shuffle<1, 1, 3, 3>(const vfloat4& v) { return __m128(vqtbl1q_u8( uint8x16_t(v.v), v1133)); } + template<> __forceinline vfloat4 shuffle<0, 1, 0, 1>(const vfloat4& v) { return __m128(vqtbl1q_u8( uint8x16_t(v.v), v0101)); } +#elif defined(__SSE3__) template<> __forceinline vfloat4 shuffle<0, 0, 2, 2>(const vfloat4& v) { return _mm_moveldup_ps(v); } template<> __forceinline vfloat4 shuffle<1, 1, 3, 3>(const vfloat4& v) { return _mm_movehdup_ps(v); } template<> __forceinline vfloat4 shuffle<0, 1, 0, 1>(const vfloat4& v) { return _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(v))); } @@ -545,14 +704,56 @@ namespace embree return shuffle(v); } -#if defined (__SSE4_1__) && !defined(__GNUC__) +#if defined(__aarch64__) + template __forceinline float extract(const vfloat4& a); + template<> __forceinline float extract<0>(const vfloat4& b) { + return b[0]; + } + template<> __forceinline float extract<1>(const vfloat4& b) { + return b[1]; + } + template<> __forceinline float extract<2>(const vfloat4& b) { + return b[2]; + } + template<> __forceinline float extract<3>(const 
vfloat4& b) { + return b[3]; + } +#elif defined (__SSE4_1__) && !defined(__GNUC__) template __forceinline float extract(const vfloat4& a) { return _mm_cvtss_f32(_mm_extract_ps(a,i)); } + template<> __forceinline float extract<0>(const vfloat4& a) { return _mm_cvtss_f32(a); } #else template __forceinline float extract(const vfloat4& a) { return _mm_cvtss_f32(shuffle(a)); } -#endif template<> __forceinline float extract<0>(const vfloat4& a) { return _mm_cvtss_f32(a); } +#endif -#if defined (__SSE4_1__) + +#if defined(__aarch64__) + template __forceinline vfloat4 insert(const vfloat4& a, float b); + template<> __forceinline vfloat4 insert<0>(const vfloat4& a, float b) + { + vfloat4 c = a; + c[0] = b; + return c; + } + template<> __forceinline vfloat4 insert<1>(const vfloat4& a, float b) + { + vfloat4 c = a; + c[1] = b; + return c; + } + template<> __forceinline vfloat4 insert<2>(const vfloat4& a, float b) + { + vfloat4 c = a; + c[2] = b; + return c; + } + template<> __forceinline vfloat4 insert<3>(const vfloat4& a, float b) + { + vfloat4 c = a; + c[3] = b; + return c; + } +#elif defined (__SSE4_1__) template __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); } template __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { return insert(a, b); } template __forceinline vfloat4 insert(const vfloat4& a, const float b) { return insert(a, _mm_set_ss(b)); } @@ -561,14 +762,19 @@ namespace embree template __forceinline vfloat4 insert(const vfloat4& a, float b) { vfloat4 c = a; c[dst&3] = b; return c; } #endif +#if defined(__aarch64__) + __forceinline float toScalar(const vfloat4& v) { + return v[0]; + } +#else __forceinline float toScalar(const vfloat4& v) { return _mm_cvtss_f32(v); } - +#endif __forceinline vfloat4 broadcast4f(const vfloat4& a, size_t k) { return vfloat4::broadcast(&a[k]); } __forceinline vfloat4 shift_right_1(const vfloat4& x) { - return _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(x), 4)); + return _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(x), 4)); } #if defined (__AVX2__) @@ -584,7 +790,7 @@ namespace embree template __forceinline vfloat4 align_shift_right(const vfloat4& a, const vfloat4& b) { return _mm_castsi128_ps(_mm_alignr_epi32(_mm_castps_si128(a), _mm_castps_si128(b), i)); - } + } #endif @@ -658,28 +864,39 @@ namespace embree //////////////////////////////////////////////////////////////////////////////// /// Reductions //////////////////////////////////////////////////////////////////////////////// - +#if defined(__aarch64__) + __forceinline vfloat4 vreduce_min(const vfloat4& v) { float h = vminvq_f32(v); return vdupq_n_f32(h); } + __forceinline vfloat4 vreduce_max(const vfloat4& v) { float h = vmaxvq_f32(v); return vdupq_n_f32(h); } + __forceinline vfloat4 vreduce_add(const vfloat4& v) { float h = vaddvq_f32(v); return vdupq_n_f32(h); } +#else __forceinline vfloat4 vreduce_min(const vfloat4& v) { vfloat4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); } __forceinline vfloat4 vreduce_max(const vfloat4& v) { vfloat4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); } __forceinline vfloat4 vreduce_add(const vfloat4& v) { vfloat4 h = shuffle<1,0,3,2>(v) + v ; return shuffle<2,3,0,1>(h) + h ; } +#endif +#if defined(__aarch64__) + __forceinline float reduce_min(const vfloat4& v) { return vminvq_f32(v); } + __forceinline float reduce_max(const vfloat4& v) { return vmaxvq_f32(v); } + __forceinline float reduce_add(const vfloat4& v) { return 
vaddvq_f32(v); } +#else __forceinline float reduce_min(const vfloat4& v) { return _mm_cvtss_f32(vreduce_min(v)); } __forceinline float reduce_max(const vfloat4& v) { return _mm_cvtss_f32(vreduce_max(v)); } __forceinline float reduce_add(const vfloat4& v) { return _mm_cvtss_f32(vreduce_add(v)); } +#endif - __forceinline size_t select_min(const vboolf4& valid, const vfloat4& v) - { - const vfloat4 a = select(valid,v,vfloat4(pos_inf)); + __forceinline size_t select_min(const vboolf4& valid, const vfloat4& v) + { + const vfloat4 a = select(valid,v,vfloat4(pos_inf)); const vbool4 valid_min = valid & (a == vreduce_min(a)); - return bsf(movemask(any(valid_min) ? valid_min : valid)); + return bsf(movemask(any(valid_min) ? valid_min : valid)); } - __forceinline size_t select_max(const vboolf4& valid, const vfloat4& v) - { - const vfloat4 a = select(valid,v,vfloat4(neg_inf)); + __forceinline size_t select_max(const vboolf4& valid, const vfloat4& v) + { + const vfloat4 a = select(valid,v,vfloat4(neg_inf)); const vbool4 valid_max = valid & (a == vreduce_max(a)); - return bsf(movemask(any(valid_max) ? valid_max : valid)); + return bsf(movemask(any(valid_max) ? valid_max : valid)); } - + //////////////////////////////////////////////////////////////////////////////// /// Euclidian Space Operators //////////////////////////////////////////////////////////////////////////////// @@ -694,7 +911,7 @@ namespace embree const vfloat4 b0 = shuffle<1,2,0,3>(b); const vfloat4 a1 = shuffle<1,2,0,3>(a); const vfloat4 b1 = b; - return shuffle<1,2,0,3>(msub(a0,b0,a1*b1)); + return shuffle<1,2,0,3>(prod_diff(a0,b0,a1,b1)); } //////////////////////////////////////////////////////////////////////////////// diff --git a/thirdparty/embree/common/simd/vfloat8_avx.h b/thirdparty/embree/common/simd/vfloat8_avx.h index 09d7ccc71e96..3c7e4a8cdcbb 100644 --- a/thirdparty/embree/common/simd/vfloat8_avx.h +++ b/thirdparty/embree/common/simd/vfloat8_avx.h @@ -33,7 +33,7 @@ namespace embree __forceinline explicit vfloat(const vfloat4& a) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),a,1)) {} __forceinline vfloat(const vfloat4& a, const vfloat4& b) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),b,1)) {} - __forceinline explicit vfloat(const char* a) : v(_mm256_loadu_ps((const float*)a)) {} + __forceinline explicit vfloat(const int8_t* a) : v(_mm256_loadu_ps((const float*)a)) {} __forceinline vfloat(float a) : v(_mm256_set1_ps(a)) {} __forceinline vfloat(float a, float b) : v(_mm256_set_ps(b, a, b, a, b, a, b, a)) {} __forceinline vfloat(float a, float b, float c, float d) : v(_mm256_set_ps(d, c, b, a, d, c, b, a)) {} @@ -75,7 +75,7 @@ namespace embree return _mm256_broadcast_ps((__m128*)ptr); } - static __forceinline vfloat8 load(const char* ptr) { + static __forceinline vfloat8 load(const int8_t* ptr) { #if defined(__AVX2__) return _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(_mm_loadu_si128((__m128i*)ptr))); #else @@ -83,7 +83,7 @@ namespace embree #endif } - static __forceinline vfloat8 load(const unsigned char* ptr) { + static __forceinline vfloat8 load(const uint8_t* ptr) { #if defined(__AVX2__) return _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr))); #else @@ -119,6 +119,12 @@ namespace embree static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_mask_store_ps ((float*)ptr,mask,v); } static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_mask_storeu_ps((float*)ptr,mask,v); } +#elif defined(__aarch64__) + static __forceinline 
vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask.v); } + static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask.v); } + + static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,v); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,v); } #else static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask); } static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask); } @@ -139,18 +145,18 @@ namespace embree template static __forceinline vfloat8 gather(const float* ptr, const vint8& index) { -#if defined(__AVX2__) +#if defined(__AVX2__) && !defined(__aarch64__) return _mm256_i32gather_ps(ptr, index ,scale); #else return vfloat8( - *(float*)(((char*)ptr)+scale*index[0]), - *(float*)(((char*)ptr)+scale*index[1]), - *(float*)(((char*)ptr)+scale*index[2]), - *(float*)(((char*)ptr)+scale*index[3]), - *(float*)(((char*)ptr)+scale*index[4]), - *(float*)(((char*)ptr)+scale*index[5]), - *(float*)(((char*)ptr)+scale*index[6]), - *(float*)(((char*)ptr)+scale*index[7])); + *(float*)(((int8_t*)ptr)+scale*index[0]), + *(float*)(((int8_t*)ptr)+scale*index[1]), + *(float*)(((int8_t*)ptr)+scale*index[2]), + *(float*)(((int8_t*)ptr)+scale*index[3]), + *(float*)(((int8_t*)ptr)+scale*index[4]), + *(float*)(((int8_t*)ptr)+scale*index[5]), + *(float*)(((int8_t*)ptr)+scale*index[6]), + *(float*)(((int8_t*)ptr)+scale*index[7])); #endif } @@ -159,17 +165,17 @@ namespace embree vfloat8 r = zero; #if defined(__AVX512VL__) return _mm256_mmask_i32gather_ps(r, mask, index, ptr, scale); -#elif defined(__AVX2__) +#elif defined(__AVX2__) && !defined(__aarch64__) return _mm256_mask_i32gather_ps(r, ptr, index, mask, scale); #else - if (likely(mask[0])) r[0] = *(float*)(((char*)ptr)+scale*index[0]); - if (likely(mask[1])) r[1] = *(float*)(((char*)ptr)+scale*index[1]); - if (likely(mask[2])) r[2] = *(float*)(((char*)ptr)+scale*index[2]); - if (likely(mask[3])) r[3] = *(float*)(((char*)ptr)+scale*index[3]); - if (likely(mask[4])) r[4] = *(float*)(((char*)ptr)+scale*index[4]); - if (likely(mask[5])) r[5] = *(float*)(((char*)ptr)+scale*index[5]); - if (likely(mask[6])) r[6] = *(float*)(((char*)ptr)+scale*index[6]); - if (likely(mask[7])) r[7] = *(float*)(((char*)ptr)+scale*index[7]); + if (likely(mask[0])) r[0] = *(float*)(((int8_t*)ptr)+scale*index[0]); + if (likely(mask[1])) r[1] = *(float*)(((int8_t*)ptr)+scale*index[1]); + if (likely(mask[2])) r[2] = *(float*)(((int8_t*)ptr)+scale*index[2]); + if (likely(mask[3])) r[3] = *(float*)(((int8_t*)ptr)+scale*index[3]); + if (likely(mask[4])) r[4] = *(float*)(((int8_t*)ptr)+scale*index[4]); + if (likely(mask[5])) r[5] = *(float*)(((int8_t*)ptr)+scale*index[5]); + if (likely(mask[6])) r[6] = *(float*)(((int8_t*)ptr)+scale*index[6]); + if (likely(mask[7])) r[7] = *(float*)(((int8_t*)ptr)+scale*index[7]); return r; #endif } @@ -180,14 +186,14 @@ namespace embree #if defined(__AVX512VL__) _mm256_i32scatter_ps((float*)ptr, ofs, v, scale); #else - *(float*)(((char*)ptr)+scale*ofs[0]) = v[0]; - *(float*)(((char*)ptr)+scale*ofs[1]) = v[1]; - *(float*)(((char*)ptr)+scale*ofs[2]) = v[2]; - *(float*)(((char*)ptr)+scale*ofs[3]) = v[3]; - *(float*)(((char*)ptr)+scale*ofs[4]) = 
v[4]; - *(float*)(((char*)ptr)+scale*ofs[5]) = v[5]; - *(float*)(((char*)ptr)+scale*ofs[6]) = v[6]; - *(float*)(((char*)ptr)+scale*ofs[7]) = v[7]; + *(float*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; + *(float*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; + *(float*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; + *(float*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; + *(float*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; + *(float*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; + *(float*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; + *(float*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; #endif } @@ -197,18 +203,18 @@ namespace embree #if defined(__AVX512VL__) _mm256_mask_i32scatter_ps((float*)ptr, mask, ofs, v, scale); #else - if (likely(mask[0])) *(float*)(((char*)ptr)+scale*ofs[0]) = v[0]; - if (likely(mask[1])) *(float*)(((char*)ptr)+scale*ofs[1]) = v[1]; - if (likely(mask[2])) *(float*)(((char*)ptr)+scale*ofs[2]) = v[2]; - if (likely(mask[3])) *(float*)(((char*)ptr)+scale*ofs[3]) = v[3]; - if (likely(mask[4])) *(float*)(((char*)ptr)+scale*ofs[4]) = v[4]; - if (likely(mask[5])) *(float*)(((char*)ptr)+scale*ofs[5]) = v[5]; - if (likely(mask[6])) *(float*)(((char*)ptr)+scale*ofs[6]) = v[6]; - if (likely(mask[7])) *(float*)(((char*)ptr)+scale*ofs[7]) = v[7]; + if (likely(mask[0])) *(float*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; + if (likely(mask[1])) *(float*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; + if (likely(mask[2])) *(float*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; + if (likely(mask[3])) *(float*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; + if (likely(mask[4])) *(float*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; + if (likely(mask[5])) *(float*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; + if (likely(mask[6])) *(float*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; + if (likely(mask[7])) *(float*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; #endif } - static __forceinline void store(const vboolf8& mask, char* ptr, const vint8& ofs, const vfloat8& v) { + static __forceinline void store(const vboolf8& mask, int8_t* ptr, const vint8& ofs, const vfloat8& v) { scatter<1>(mask,ptr,ofs,v); } static __forceinline void store(const vboolf8& mask, float* ptr, const vint8& ofs, const vfloat8& v) { @@ -235,27 +241,60 @@ namespace embree __forceinline vfloat8 toFloat(const vint8& a) { return vfloat8(a); } __forceinline vfloat8 operator +(const vfloat8& a) { return a; } +#if !defined(__aarch64__) __forceinline vfloat8 operator -(const vfloat8& a) { const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); return _mm256_xor_ps(a, mask); } - __forceinline vfloat8 abs(const vfloat8& a) { - const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)); - return _mm256_and_ps(a, mask); - } +#else + __forceinline vfloat8 operator -(const vfloat8& a) { + __m256 res; + res.lo = vnegq_f32(a.v.lo); + res.hi = vnegq_f32(a.v.hi); + return res; +} +#endif + +#if !defined(__aarch64__) +__forceinline vfloat8 abs(const vfloat8& a) { + const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)); + return _mm256_and_ps(a, mask); +} +#else +__forceinline vfloat8 abs(const vfloat8& a) { + __m256 res; + res.lo = vabsq_f32(a.v.lo); + res.hi = vabsq_f32(a.v.hi); + return res; +} +#endif + +#if !defined(__aarch64__) __forceinline vfloat8 sign (const vfloat8& a) { return _mm256_blendv_ps(vfloat8(one), -vfloat8(one), _mm256_cmp_ps(a, vfloat8(zero), _CMP_NGE_UQ)); } +#else + __forceinline vfloat8 sign (const vfloat8& a) { return _mm256_blendv_ps(vfloat8(one), -vfloat8(one), _mm256_cmplt_ps(a, vfloat8(zero))); } +#endif __forceinline vfloat8 signmsk(const vfloat8& a) { return 
_mm256_and_ps(a,_mm256_castsi256_ps(_mm256_set1_epi32(0x80000000))); } static __forceinline vfloat8 rcp(const vfloat8& a) { +#if defined(BUILD_IOS) && defined(__aarch64__) + // ios devices are faster doing full divide, no need for NR fixup + vfloat8 ret; + const float32x4_t one = vdupq_n_f32(1.0f); + ret.v.lo = vdivq_f32(one, a.v.lo); + ret.v.hi = vdivq_f32(one, a.v.hi); + return ret; +#endif + #if defined(__AVX512VL__) const vfloat8 r = _mm256_rcp14_ps(a); #else const vfloat8 r = _mm256_rcp_ps(a); #endif - -#if defined(__AVX2__) + +#if defined(__AVX2__) //&& !defined(aarch64) return _mm256_mul_ps(r, _mm256_fnmadd_ps(r, a, vfloat8(2.0f))); #else return _mm256_mul_ps(r, _mm256_sub_ps(vfloat8(2.0f), _mm256_mul_ps(r, a))); @@ -404,17 +443,29 @@ namespace embree static __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) { return _mm256_mask_blend_ps(m, f, t); } -#else - static __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_EQ_OQ); } - static __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NEQ_UQ); } - static __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_LT_OS); } - static __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NLT_US); } - static __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NLE_US); } - static __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_LE_OS); } +#elif !defined(__aarch64__) + __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_EQ_OQ); } + __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NEQ_UQ); } + __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_LT_OS); } + __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NLT_US); } + __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NLE_US); } + __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_LE_OS); } - static __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) { + __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) { return _mm256_blendv_ps(f, t, m); } +#else + __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmpeq_ps(a, b); } + __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmpneq_ps(a, b); } + __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmplt_ps(a, b); } + __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmpge_ps(a, b); } + __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmpgt_ps(a, b); } + __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmple_ps(a, b); } + + __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) { + return _mm256_blendv_ps(f, t, m); + } + #endif template @@ -483,10 +534,17 @@ namespace embree /// Rounding Functions //////////////////////////////////////////////////////////////////////////////// +#if !defined(__aarch64__) 
__forceinline vfloat8 floor(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_NEG_INF ); } __forceinline vfloat8 ceil (const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_POS_INF ); } __forceinline vfloat8 trunc(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_ZERO ); } __forceinline vfloat8 round(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_NEAREST_INT); } +#else + __forceinline vfloat8 floor(const vfloat8& a) { return _mm256_floor_ps(a); } + __forceinline vfloat8 ceil (const vfloat8& a) { return _mm256_ceil_ps(a); } +#endif + + __forceinline vfloat8 frac (const vfloat8& a) { return a-floor(a); } //////////////////////////////////////////////////////////////////////////////// @@ -521,9 +579,11 @@ namespace embree return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); } +#if !defined(__aarch64__) template<> __forceinline vfloat8 shuffle<0, 0, 2, 2>(const vfloat8& v) { return _mm256_moveldup_ps(v); } template<> __forceinline vfloat8 shuffle<1, 1, 3, 3>(const vfloat8& v) { return _mm256_movehdup_ps(v); } template<> __forceinline vfloat8 shuffle<0, 1, 0, 1>(const vfloat8& v) { return _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(v))); } +#endif __forceinline vfloat8 broadcast(const float* ptr) { return _mm256_broadcast_ss(ptr); } template __forceinline vfloat8 insert4(const vfloat8& a, const vfloat4& b) { return _mm256_insertf128_ps(a, b, i); } @@ -534,8 +594,8 @@ namespace embree __forceinline vfloat8 assign(const vfloat4& a) { return _mm256_castps128_ps256(a); } -#if defined (__AVX2__) - static __forceinline vfloat8 permute(const vfloat8& a, const __m256i& index) { +#if defined (__AVX2__) && !defined(__aarch64__) + __forceinline vfloat8 permute(const vfloat8& a, const __m256i& index) { return _mm256_permutevar8x32_ps(a, index); } #endif @@ -639,7 +699,7 @@ namespace embree //////////////////////////////////////////////////////////////////////////////// /// Reductions //////////////////////////////////////////////////////////////////////////////// - +#if !defined(__aarch64__) __forceinline vfloat8 vreduce_min2(const vfloat8& v) { return min(v,shuffle<1,0,3,2>(v)); } __forceinline vfloat8 vreduce_min4(const vfloat8& v) { vfloat8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); } __forceinline vfloat8 vreduce_min (const vfloat8& v) { vfloat8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); } @@ -655,7 +715,14 @@ namespace embree __forceinline float reduce_min(const vfloat8& v) { return toScalar(vreduce_min(v)); } __forceinline float reduce_max(const vfloat8& v) { return toScalar(vreduce_max(v)); } __forceinline float reduce_add(const vfloat8& v) { return toScalar(vreduce_add(v)); } +#else + __forceinline float reduce_min(const vfloat8& v) { return vminvq_f32(_mm_min_ps(v.v.lo,v.v.hi)); } + __forceinline float reduce_max(const vfloat8& v) { return vmaxvq_f32(_mm_max_ps(v.v.lo,v.v.hi)); } + __forceinline vfloat8 vreduce_min(const vfloat8& v) { return vfloat8(reduce_min(v)); } + __forceinline vfloat8 vreduce_max(const vfloat8& v) { return vfloat8(reduce_max(v)); } + __forceinline float reduce_add(const vfloat8& v) { return vaddvq_f32(_mm_add_ps(v.v.lo,v.v.hi)); } +#endif __forceinline size_t select_min(const vboolf8& valid, const vfloat8& v) { const vfloat8 a = select(valid,v,vfloat8(pos_inf)); diff --git a/thirdparty/embree/common/simd/vint16_avx512.h b/thirdparty/embree/common/simd/vint16_avx512.h index 34e3d5ca0785..3249bc2b4555 100644 --- a/thirdparty/embree/common/simd/vint16_avx512.h +++ 
b/thirdparty/embree/common/simd/vint16_avx512.h @@ -90,10 +90,10 @@ namespace embree static __forceinline vint16 load (const void* addr) { return _mm512_load_si512((int*)addr); } - static __forceinline vint16 load(const unsigned char* ptr) { return _mm512_cvtepu8_epi32(_mm_load_si128((__m128i*)ptr)); } + static __forceinline vint16 load(const uint8_t* ptr) { return _mm512_cvtepu8_epi32(_mm_load_si128((__m128i*)ptr)); } static __forceinline vint16 load(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_load_si256((__m256i*)ptr)); } - static __forceinline vint16 loadu(const unsigned char* ptr) { return _mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)); } + static __forceinline vint16 loadu(const uint8_t* ptr) { return _mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)); } static __forceinline vint16 loadu(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)ptr)); } static __forceinline vint16 loadu(const void* addr) { return _mm512_loadu_si512(addr); } diff --git a/thirdparty/embree/common/simd/vint4_sse2.h b/thirdparty/embree/common/simd/vint4_sse2.h index 458f8cfaa670..96f105a7c531 100644 --- a/thirdparty/embree/common/simd/vint4_sse2.h +++ b/thirdparty/embree/common/simd/vint4_sse2.h @@ -23,7 +23,7 @@ namespace embree //////////////////////////////////////////////////////////////////////////////// /// Constructors, Assignment & Cast Operators //////////////////////////////////////////////////////////////////////////////// - + __forceinline vint() {} __forceinline vint(const vint4& a) { v = a.v; } __forceinline vint4& operator =(const vint4& a) { v = a.v; return *this; } @@ -68,7 +68,7 @@ namespace embree static __forceinline void store (void* ptr, const vint4& v) { _mm_store_si128((__m128i*)ptr,v); } static __forceinline void storeu(void* ptr, const vint4& v) { _mm_storeu_si128((__m128i*)ptr,v); } - + #if defined(__AVX512VL__) static __forceinline vint4 compact(const vboolf4& mask, vint4 &v) { @@ -98,61 +98,81 @@ namespace embree #endif -#if defined(__SSE4_1__) - static __forceinline vint4 load(const unsigned char* ptr) { +#if defined(__aarch64__) + static __forceinline vint4 load(const uint8_t* ptr) { + return _mm_load4epu8_epi32(((__m128i*)ptr)); + } + static __forceinline vint4 loadu(const uint8_t* ptr) { + return _mm_load4epu8_epi32(((__m128i*)ptr)); + } +#elif defined(__SSE4_1__) + static __forceinline vint4 load(const uint8_t* ptr) { return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } - static __forceinline vint4 loadu(const unsigned char* ptr) { + static __forceinline vint4 loadu(const uint8_t* ptr) { return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } #else - static __forceinline vint4 load(const unsigned char* ptr) { + static __forceinline vint4 load(const uint8_t* ptr) { return vint4(ptr[0],ptr[1],ptr[2],ptr[3]); - } + } - static __forceinline vint4 loadu(const unsigned char* ptr) { + static __forceinline vint4 loadu(const uint8_t* ptr) { return vint4(ptr[0],ptr[1],ptr[2],ptr[3]); } #endif static __forceinline vint4 load(const unsigned short* ptr) { -#if defined (__SSE4_1__) +#if defined(__aarch64__) + return __m128i(vmovl_u16(vld1_u16(ptr))); +#elif defined (__SSE4_1__) return _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); #else return vint4(ptr[0],ptr[1],ptr[2],ptr[3]); #endif - } + } - static __forceinline void store(unsigned char* ptr, const vint4& v) { -#if defined(__SSE4_1__) + static __forceinline void store(uint8_t* ptr, const vint4& v) { +#if defined(__aarch64__) + int32x4_t x = v; + uint16x4_t y 
= vqmovn_u32(uint32x4_t(x)); + uint8x8_t z = vqmovn_u16(vcombine_u16(y, y)); + vst1_lane_u32((uint32_t *)ptr,uint32x2_t(z), 0); +#elif defined(__SSE4_1__) __m128i x = v; x = _mm_packus_epi32(x, x); x = _mm_packus_epi16(x, x); *(int*)ptr = _mm_cvtsi128_si32(x); #else for (size_t i=0;i<4;i++) - ptr[i] = (unsigned char)v[i]; + ptr[i] = (uint8_t)v[i]; #endif } static __forceinline void store(unsigned short* ptr, const vint4& v) { +#if defined(__aarch64__) + uint32x4_t x = uint32x4_t(v.v); + uint16x4_t y = vqmovn_u32(x); + vst1_u16(ptr, y); +#else for (size_t i=0;i<4;i++) ptr[i] = (unsigned short)v[i]; +#endif } static __forceinline vint4 load_nt(void* ptr) { -#if defined(__SSE4_1__) - return _mm_stream_load_si128((__m128i*)ptr); +#if defined(__aarch64__) || defined(__SSE4_1__) + return _mm_stream_load_si128((__m128i*)ptr); #else - return _mm_load_si128((__m128i*)ptr); + return _mm_load_si128((__m128i*)ptr); #endif } - + static __forceinline void store_nt(void* ptr, const vint4& v) { -#if defined(__SSE4_1__) +#if !defined(__aarch64__) && defined(__SSE4_1__) _mm_stream_ps((float*)ptr, _mm_castsi128_ps(v)); #else _mm_store_si128((__m128i*)ptr,v); @@ -161,14 +181,14 @@ namespace embree template static __forceinline vint4 gather(const int* ptr, const vint4& index) { -#if defined(__AVX2__) +#if defined(__AVX2__) && !defined(__aarch64__) return _mm_i32gather_epi32(ptr, index, scale); #else return vint4( - *(int*)(((char*)ptr)+scale*index[0]), - *(int*)(((char*)ptr)+scale*index[1]), - *(int*)(((char*)ptr)+scale*index[2]), - *(int*)(((char*)ptr)+scale*index[3])); + *(int*)(((int8_t*)ptr)+scale*index[0]), + *(int*)(((int8_t*)ptr)+scale*index[1]), + *(int*)(((int8_t*)ptr)+scale*index[2]), + *(int*)(((int8_t*)ptr)+scale*index[3])); #endif } @@ -177,13 +197,13 @@ namespace embree vint4 r = zero; #if defined(__AVX512VL__) return _mm_mmask_i32gather_epi32(r, mask, index, ptr, scale); -#elif defined(__AVX2__) +#elif defined(__AVX2__) && !defined(__aarch64__) return _mm_mask_i32gather_epi32(r, ptr, index, mask, scale); #else - if (likely(mask[0])) r[0] = *(int*)(((char*)ptr)+scale*index[0]); - if (likely(mask[1])) r[1] = *(int*)(((char*)ptr)+scale*index[1]); - if (likely(mask[2])) r[2] = *(int*)(((char*)ptr)+scale*index[2]); - if (likely(mask[3])) r[3] = *(int*)(((char*)ptr)+scale*index[3]); + if (likely(mask[0])) r[0] = *(int*)(((int8_t*)ptr)+scale*index[0]); + if (likely(mask[1])) r[1] = *(int*)(((int8_t*)ptr)+scale*index[1]); + if (likely(mask[2])) r[2] = *(int*)(((int8_t*)ptr)+scale*index[2]); + if (likely(mask[3])) r[3] = *(int*)(((int8_t*)ptr)+scale*index[3]); return r; #endif } @@ -194,10 +214,10 @@ namespace embree #if defined(__AVX512VL__) _mm_i32scatter_epi32((int*)ptr, index, v, scale); #else - *(int*)(((char*)ptr)+scale*index[0]) = v[0]; - *(int*)(((char*)ptr)+scale*index[1]) = v[1]; - *(int*)(((char*)ptr)+scale*index[2]) = v[2]; - *(int*)(((char*)ptr)+scale*index[3]) = v[3]; + *(int*)(((int8_t*)ptr)+scale*index[0]) = v[0]; + *(int*)(((int8_t*)ptr)+scale*index[1]) = v[1]; + *(int*)(((int8_t*)ptr)+scale*index[2]) = v[2]; + *(int*)(((int8_t*)ptr)+scale*index[3]) = v[3]; #endif } @@ -207,14 +227,14 @@ namespace embree #if defined(__AVX512VL__) _mm_mask_i32scatter_epi32((int*)ptr, mask, index, v, scale); #else - if (likely(mask[0])) *(int*)(((char*)ptr)+scale*index[0]) = v[0]; - if (likely(mask[1])) *(int*)(((char*)ptr)+scale*index[1]) = v[1]; - if (likely(mask[2])) *(int*)(((char*)ptr)+scale*index[2]) = v[2]; - if (likely(mask[3])) *(int*)(((char*)ptr)+scale*index[3]) = v[3]; + if (likely(mask[0])) 
*(int*)(((int8_t*)ptr)+scale*index[0]) = v[0]; + if (likely(mask[1])) *(int*)(((int8_t*)ptr)+scale*index[1]) = v[1]; + if (likely(mask[2])) *(int*)(((int8_t*)ptr)+scale*index[2]) = v[2]; + if (likely(mask[3])) *(int*)(((int8_t*)ptr)+scale*index[3]) = v[3]; #endif } -#if defined(__x86_64__) +#if defined(__x86_64__) || defined(__aarch64__) static __forceinline vint4 broadcast64(long long a) { return _mm_set1_epi64x(a); } #endif @@ -228,10 +248,12 @@ namespace embree friend __forceinline vint4 select(const vboolf4& m, const vint4& t, const vint4& f) { #if defined(__AVX512VL__) return _mm_mask_blend_epi32(m, (__m128i)f, (__m128i)t); +#elif defined(__aarch64__) + return _mm_castps_si128(_mm_blendv_ps((__m128)f.v,(__m128) t.v, (__m128)m.v)); #elif defined(__SSE4_1__) - return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m)); + return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m)); #else - return _mm_or_si128(_mm_and_si128(m, t), _mm_andnot_si128(m, f)); + return _mm_or_si128(_mm_and_si128(m, t), _mm_andnot_si128(m, f)); #endif } }; @@ -248,7 +270,9 @@ namespace embree __forceinline vint4 operator +(const vint4& a) { return a; } __forceinline vint4 operator -(const vint4& a) { return _mm_sub_epi32(_mm_setzero_si128(), a); } -#if defined(__SSSE3__) +#if defined(__aarch64__) + __forceinline vint4 abs(const vint4& a) { return vabsq_s32(a.v); } +#elif defined(__SSSE3__) __forceinline vint4 abs(const vint4& a) { return _mm_abs_epi32(a); } #endif @@ -264,7 +288,7 @@ namespace embree __forceinline vint4 operator -(const vint4& a, int b) { return a - vint4(b); } __forceinline vint4 operator -(int a, const vint4& b) { return vint4(a) - b; } -#if defined(__SSE4_1__) +#if (defined(__aarch64__)) || defined(__SSE4_1__) __forceinline vint4 operator *(const vint4& a, const vint4& b) { return _mm_mullo_epi32(a, b); } #else __forceinline vint4 operator *(const vint4& a, const vint4& b) { return vint4(a[0]*b[0],a[1]*b[1],a[2]*b[2],a[3]*b[3]); } @@ -284,34 +308,34 @@ namespace embree __forceinline vint4 operator ^(const vint4& a, int b) { return a ^ vint4(b); } __forceinline vint4 operator ^(int a, const vint4& b) { return vint4(a) ^ b; } - __forceinline vint4 operator <<(const vint4& a, int n) { return _mm_slli_epi32(a, n); } - __forceinline vint4 operator >>(const vint4& a, int n) { return _mm_srai_epi32(a, n); } + __forceinline vint4 operator <<(const vint4& a, const int n) { return _mm_slli_epi32(a, n); } + __forceinline vint4 operator >>(const vint4& a, const int n) { return _mm_srai_epi32(a, n); } __forceinline vint4 sll (const vint4& a, int b) { return _mm_slli_epi32(a, b); } __forceinline vint4 sra (const vint4& a, int b) { return _mm_srai_epi32(a, b); } __forceinline vint4 srl (const vint4& a, int b) { return _mm_srli_epi32(a, b); } - + //////////////////////////////////////////////////////////////////////////////// /// Assignment Operators //////////////////////////////////////////////////////////////////////////////// __forceinline vint4& operator +=(vint4& a, const vint4& b) { return a = a + b; } __forceinline vint4& operator +=(vint4& a, int b) { return a = a + b; } - + __forceinline vint4& operator -=(vint4& a, const vint4& b) { return a = a - b; } __forceinline vint4& operator -=(vint4& a, int b) { return a = a - b; } -#if defined(__SSE4_1__) +#if (defined(__aarch64__)) || defined(__SSE4_1__) __forceinline vint4& operator *=(vint4& a, const vint4& b) { return a = a * b; } __forceinline vint4& operator *=(vint4& a, int b) { return a = a * b; } 
#endif - + __forceinline vint4& operator &=(vint4& a, const vint4& b) { return a = a & b; } __forceinline vint4& operator &=(vint4& a, int b) { return a = a & b; } - + __forceinline vint4& operator |=(vint4& a, const vint4& b) { return a = a | b; } __forceinline vint4& operator |=(vint4& a, int b) { return a = a | b; } - + __forceinline vint4& operator <<=(vint4& a, int b) { return a = a << b; } __forceinline vint4& operator >>=(vint4& a, int b) { return a = a >> b; } @@ -378,14 +402,15 @@ namespace embree template __forceinline vint4 select(const vint4& t, const vint4& f) { -#if defined(__SSE4_1__) +#if defined(__SSE4_1__) return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), mask)); #else return select(vboolf4(mask), t, f); -#endif +#endif } -#if defined(__SSE4_1__) + +#if defined(__aarch64__) || defined(__SSE4_1__) __forceinline vint4 min(const vint4& a, const vint4& b) { return _mm_min_epi32(a, b); } __forceinline vint4 max(const vint4& a, const vint4& b) { return _mm_max_epi32(a, b); } @@ -409,16 +434,25 @@ namespace embree __forceinline vint4 unpacklo(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } __forceinline vint4 unpackhi(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } +#if defined(__aarch64__) + template + __forceinline vint4 shuffle(const vint4& v) { + return vreinterpretq_s32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3))); + } + template + __forceinline vint4 shuffle(const vint4& a, const vint4& b) { + return vreinterpretq_s32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3))); + } +#else template __forceinline vint4 shuffle(const vint4& v) { return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0)); } - template __forceinline vint4 shuffle(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); } - +#endif #if defined(__SSE3__) template<> __forceinline vint4 shuffle<0, 0, 2, 2>(const vint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v))); } template<> __forceinline vint4 shuffle<1, 1, 3, 3>(const vint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v))); } @@ -430,7 +464,10 @@ namespace embree return shuffle(v); } -#if defined(__SSE4_1__) +#if defined(__aarch64__) + template __forceinline int extract(const vint4& b); + template __forceinline vint4 insert(const vint4& a, const int b); +#elif defined(__SSE4_1__) template __forceinline int extract(const vint4& b) { return _mm_extract_epi32(b, src); } template __forceinline vint4 insert(const vint4& a, const int b) { return _mm_insert_epi32(a, b, dst); } #else @@ -438,19 +475,69 @@ namespace embree template __forceinline vint4 insert(const vint4& a, int b) { vint4 c = a; c[dst&3] = b; return c; } #endif - +#if defined(__aarch64__) + template<> __forceinline int extract<0>(const vint4& b) { + return b.v[0]; + } + template<> __forceinline int extract<1>(const vint4& b) { + return b.v[1]; + } + template<> __forceinline int extract<2>(const vint4& b) { + return b.v[2]; + } + template<> __forceinline int extract<3>(const vint4& b) { + return b.v[3]; + } + template<> __forceinline vint4 insert<0>(const vint4& a, int b) + { + vint4 c = a; + c[0] = b; + return c; + } + template<> __forceinline vint4 insert<1>(const vint4& a, int b) + { + vint4 c = a; + c[1] = b; + return c; + } + template<> 
__forceinline vint4 insert<2>(const vint4& a, int b) + { + vint4 c = a; + c[2] = b; + return c; + } + template<> __forceinline vint4 insert<3>(const vint4& a, int b) + { + vint4 c = a; + c[3] = b; + return c; + } + + __forceinline int toScalar(const vint4& v) { + return v[0]; + } + + __forceinline size_t toSizeT(const vint4& v) { + uint64x2_t x = uint64x2_t(v.v); + return x[0]; + } +#else template<> __forceinline int extract<0>(const vint4& b) { return _mm_cvtsi128_si32(b); } __forceinline int toScalar(const vint4& v) { return _mm_cvtsi128_si32(v); } - __forceinline size_t toSizeT(const vint4& v) { + __forceinline size_t toSizeT(const vint4& v) { #if defined(__WIN32__) && !defined(__X86_64__) // win32 workaround return toScalar(v); +#elif defined(__ARM_NEON) + // FIXME(LTE): Do we need a swap(i.e. use lane 1)? + return vgetq_lane_u64(*(reinterpret_cast(&v)), 0); #else - return _mm_cvtsi128_si64(v); + return _mm_cvtsi128_si64(v); #endif } - +#endif + #if defined(__AVX512VL__) __forceinline vint4 permute(const vint4 &a, const vint4 &index) { @@ -459,15 +546,25 @@ namespace embree template __forceinline vint4 align_shift_right(const vint4& a, const vint4& b) { - return _mm_alignr_epi32(a, b, i); - } + return _mm_alignr_epi32(a, b, i); + } #endif //////////////////////////////////////////////////////////////////////////////// /// Reductions //////////////////////////////////////////////////////////////////////////////// -#if defined(__SSE4_1__) +#if defined(__aarch64__) || defined(__SSE4_1__) + +#if defined(__aarch64__) + __forceinline vint4 vreduce_min(const vint4& v) { int h = vminvq_s32(v); return vdupq_n_s32(h); } + __forceinline vint4 vreduce_max(const vint4& v) { int h = vmaxvq_s32(v); return vdupq_n_s32(h); } + __forceinline vint4 vreduce_add(const vint4& v) { int h = vaddvq_s32(v); return vdupq_n_s32(h); } + + __forceinline int reduce_min(const vint4& v) { return vminvq_s32(v); } + __forceinline int reduce_max(const vint4& v) { return vmaxvq_s32(v); } + __forceinline int reduce_add(const vint4& v) { return vaddvq_s32(v); } +#else __forceinline vint4 vreduce_min(const vint4& v) { vint4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); } __forceinline vint4 vreduce_max(const vint4& v) { vint4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); } __forceinline vint4 vreduce_add(const vint4& v) { vint4 h = shuffle<1,0,3,2>(v) + v ; return shuffle<2,3,0,1>(h) + h ; } @@ -475,7 +572,8 @@ namespace embree __forceinline int reduce_min(const vint4& v) { return toScalar(vreduce_min(v)); } __forceinline int reduce_max(const vint4& v) { return toScalar(vreduce_max(v)); } __forceinline int reduce_add(const vint4& v) { return toScalar(vreduce_add(v)); } - +#endif + __forceinline size_t select_min(const vint4& v) { return bsf(movemask(v == vreduce_min(v))); } __forceinline size_t select_max(const vint4& v) { return bsf(movemask(v == vreduce_max(v))); } @@ -494,7 +592,7 @@ namespace embree /// Sorting networks //////////////////////////////////////////////////////////////////////////////// -#if defined(__SSE4_1__) +#if (defined(__aarch64__)) || defined(__SSE4_1__) __forceinline vint4 usort_ascending(const vint4& v) { diff --git a/thirdparty/embree/common/simd/vint8_avx.h b/thirdparty/embree/common/simd/vint8_avx.h index c373907e9c32..25a771284da8 100644 --- a/thirdparty/embree/common/simd/vint8_avx.h +++ b/thirdparty/embree/common/simd/vint8_avx.h @@ -71,20 +71,25 @@ namespace embree static __forceinline void store (void* ptr, const vint8& f) { 
_mm256_store_ps((float*)ptr,_mm256_castsi256_ps(f)); } static __forceinline void storeu(void* ptr, const vint8& f) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(f)); } +#if !defined(__aarch64__) static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); } static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); } +#else + static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,_mm256_castsi256_ps(f)); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,_mm256_castsi256_ps(f)); } +#endif static __forceinline void store_nt(void* ptr, const vint8& v) { _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v)); } - static __forceinline vint8 load(const unsigned char* ptr) { + static __forceinline vint8 load(const uint8_t* ptr) { vint4 il = vint4::load(ptr+0); vint4 ih = vint4::load(ptr+4); return vint8(il,ih); } - static __forceinline vint8 loadu(const unsigned char* ptr) { + static __forceinline vint8 loadu(const uint8_t* ptr) { vint4 il = vint4::loadu(ptr+0); vint4 ih = vint4::loadu(ptr+4); return vint8(il,ih); @@ -102,7 +107,7 @@ namespace embree return vint8(il,ih); } - static __forceinline void store(unsigned char* ptr, const vint8& i) { + static __forceinline void store(uint8_t* ptr, const vint8& i) { vint4 il(i.vl); vint4 ih(i.vh); vint4::store(ptr + 0,il); @@ -117,54 +122,54 @@ namespace embree template static __forceinline vint8 gather(const int* ptr, const vint8& index) { return vint8( - *(int*)(((char*)ptr)+scale*index[0]), - *(int*)(((char*)ptr)+scale*index[1]), - *(int*)(((char*)ptr)+scale*index[2]), - *(int*)(((char*)ptr)+scale*index[3]), - *(int*)(((char*)ptr)+scale*index[4]), - *(int*)(((char*)ptr)+scale*index[5]), - *(int*)(((char*)ptr)+scale*index[6]), - *(int*)(((char*)ptr)+scale*index[7])); + *(int*)(((int8_t*)ptr)+scale*index[0]), + *(int*)(((int8_t*)ptr)+scale*index[1]), + *(int*)(((int8_t*)ptr)+scale*index[2]), + *(int*)(((int8_t*)ptr)+scale*index[3]), + *(int*)(((int8_t*)ptr)+scale*index[4]), + *(int*)(((int8_t*)ptr)+scale*index[5]), + *(int*)(((int8_t*)ptr)+scale*index[6]), + *(int*)(((int8_t*)ptr)+scale*index[7])); } template static __forceinline vint8 gather(const vboolf8& mask, const int* ptr, const vint8& index) { vint8 r = zero; - if (likely(mask[0])) r[0] = *(int*)(((char*)ptr)+scale*index[0]); - if (likely(mask[1])) r[1] = *(int*)(((char*)ptr)+scale*index[1]); - if (likely(mask[2])) r[2] = *(int*)(((char*)ptr)+scale*index[2]); - if (likely(mask[3])) r[3] = *(int*)(((char*)ptr)+scale*index[3]); - if (likely(mask[4])) r[4] = *(int*)(((char*)ptr)+scale*index[4]); - if (likely(mask[5])) r[5] = *(int*)(((char*)ptr)+scale*index[5]); - if (likely(mask[6])) r[6] = *(int*)(((char*)ptr)+scale*index[6]); - if (likely(mask[7])) r[7] = *(int*)(((char*)ptr)+scale*index[7]); + if (likely(mask[0])) r[0] = *(int*)(((int8_t*)ptr)+scale*index[0]); + if (likely(mask[1])) r[1] = *(int*)(((int8_t*)ptr)+scale*index[1]); + if (likely(mask[2])) r[2] = *(int*)(((int8_t*)ptr)+scale*index[2]); + if (likely(mask[3])) r[3] = *(int*)(((int8_t*)ptr)+scale*index[3]); + if (likely(mask[4])) r[4] = *(int*)(((int8_t*)ptr)+scale*index[4]); + if (likely(mask[5])) r[5] = *(int*)(((int8_t*)ptr)+scale*index[5]); + if (likely(mask[6])) r[6] = 
*(int*)(((int8_t*)ptr)+scale*index[6]); + if (likely(mask[7])) r[7] = *(int*)(((int8_t*)ptr)+scale*index[7]); return r; } template static __forceinline void scatter(void* ptr, const vint8& ofs, const vint8& v) { - *(int*)(((char*)ptr)+scale*ofs[0]) = v[0]; - *(int*)(((char*)ptr)+scale*ofs[1]) = v[1]; - *(int*)(((char*)ptr)+scale*ofs[2]) = v[2]; - *(int*)(((char*)ptr)+scale*ofs[3]) = v[3]; - *(int*)(((char*)ptr)+scale*ofs[4]) = v[4]; - *(int*)(((char*)ptr)+scale*ofs[5]) = v[5]; - *(int*)(((char*)ptr)+scale*ofs[6]) = v[6]; - *(int*)(((char*)ptr)+scale*ofs[7]) = v[7]; + *(int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; + *(int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; + *(int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; + *(int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; + *(int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; + *(int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; + *(int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; + *(int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; } template static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vint8& v) { - if (likely(mask[0])) *(int*)(((char*)ptr)+scale*ofs[0]) = v[0]; - if (likely(mask[1])) *(int*)(((char*)ptr)+scale*ofs[1]) = v[1]; - if (likely(mask[2])) *(int*)(((char*)ptr)+scale*ofs[2]) = v[2]; - if (likely(mask[3])) *(int*)(((char*)ptr)+scale*ofs[3]) = v[3]; - if (likely(mask[4])) *(int*)(((char*)ptr)+scale*ofs[4]) = v[4]; - if (likely(mask[5])) *(int*)(((char*)ptr)+scale*ofs[5]) = v[5]; - if (likely(mask[6])) *(int*)(((char*)ptr)+scale*ofs[6]) = v[6]; - if (likely(mask[7])) *(int*)(((char*)ptr)+scale*ofs[7]) = v[7]; + if (likely(mask[0])) *(int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; + if (likely(mask[1])) *(int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; + if (likely(mask[2])) *(int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; + if (likely(mask[3])) *(int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; + if (likely(mask[4])) *(int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; + if (likely(mask[5])) *(int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; + if (likely(mask[6])) *(int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; + if (likely(mask[7])) *(int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; } diff --git a/thirdparty/embree/common/simd/vint8_avx2.h b/thirdparty/embree/common/simd/vint8_avx2.h index ea97d3eb346e..4937d972cfdc 100644 --- a/thirdparty/embree/common/simd/vint8_avx2.h +++ b/thirdparty/embree/common/simd/vint8_avx2.h @@ -67,8 +67,8 @@ namespace embree /// Loads and Stores //////////////////////////////////////////////////////////////////////////////// - static __forceinline vint8 load(const unsigned char* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } - static __forceinline vint8 loadu(const unsigned char* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } + static __forceinline vint8 load(const uint8_t* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } + static __forceinline vint8 loadu(const uint8_t* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } static __forceinline vint8 load(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_load_si128((__m128i*)ptr)); } static __forceinline vint8 loadu(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); } @@ -108,7 +108,7 @@ namespace embree _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v)); } - static __forceinline void store(unsigned char* ptr, const vint8& i) + static __forceinline void store(uint8_t* ptr, const vint8& i) { for (size_t j=0; j<8; j++) ptr[j] = i[j]; @@ -140,14 +140,14 @@ namespace 
embree #if defined(__AVX512VL__) _mm256_i32scatter_epi32((int*)ptr, ofs, v, scale); #else - *(int*)(((char*)ptr)+scale*ofs[0]) = v[0]; - *(int*)(((char*)ptr)+scale*ofs[1]) = v[1]; - *(int*)(((char*)ptr)+scale*ofs[2]) = v[2]; - *(int*)(((char*)ptr)+scale*ofs[3]) = v[3]; - *(int*)(((char*)ptr)+scale*ofs[4]) = v[4]; - *(int*)(((char*)ptr)+scale*ofs[5]) = v[5]; - *(int*)(((char*)ptr)+scale*ofs[6]) = v[6]; - *(int*)(((char*)ptr)+scale*ofs[7]) = v[7]; + *(int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; + *(int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; + *(int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; + *(int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; + *(int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; + *(int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; + *(int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; + *(int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; #endif } @@ -157,14 +157,14 @@ namespace embree #if defined(__AVX512VL__) _mm256_mask_i32scatter_epi32((int*)ptr, mask, ofs, v, scale); #else - if (likely(mask[0])) *(int*)(((char*)ptr)+scale*ofs[0]) = v[0]; - if (likely(mask[1])) *(int*)(((char*)ptr)+scale*ofs[1]) = v[1]; - if (likely(mask[2])) *(int*)(((char*)ptr)+scale*ofs[2]) = v[2]; - if (likely(mask[3])) *(int*)(((char*)ptr)+scale*ofs[3]) = v[3]; - if (likely(mask[4])) *(int*)(((char*)ptr)+scale*ofs[4]) = v[4]; - if (likely(mask[5])) *(int*)(((char*)ptr)+scale*ofs[5]) = v[5]; - if (likely(mask[6])) *(int*)(((char*)ptr)+scale*ofs[6]) = v[6]; - if (likely(mask[7])) *(int*)(((char*)ptr)+scale*ofs[7]) = v[7]; + if (likely(mask[0])) *(int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; + if (likely(mask[1])) *(int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; + if (likely(mask[2])) *(int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; + if (likely(mask[3])) *(int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; + if (likely(mask[4])) *(int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; + if (likely(mask[5])) *(int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; + if (likely(mask[6])) *(int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; + if (likely(mask[7])) *(int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; #endif } @@ -385,7 +385,9 @@ namespace embree __forceinline int toScalar(const vint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); } - __forceinline vint8 permute(const vint8& v, const __m256i& index) { +#if !defined(__aarch64__) + +__forceinline vint8 permute(const vint8& v, const __m256i& index) { return _mm256_permutevar8x32_epi32(v, index); } @@ -393,6 +395,8 @@ namespace embree return _mm256_castps_si256(_mm256_permutevar_ps(_mm256_castsi256_ps(v), index)); } + + template static __forceinline vint8 align_shift_right(const vint8& a, const vint8& b) { #if defined(__AVX512VL__) @@ -402,6 +406,9 @@ namespace embree #endif } +#endif + + //////////////////////////////////////////////////////////////////////////////// /// Reductions //////////////////////////////////////////////////////////////////////////////// diff --git a/thirdparty/embree/common/simd/vllong8_avx512.h b/thirdparty/embree/common/simd/vllong8_avx512.h index 4a724de0621b..76dddd8991dc 100644 --- a/thirdparty/embree/common/simd/vllong8_avx512.h +++ b/thirdparty/embree/common/simd/vllong8_avx512.h @@ -78,7 +78,7 @@ namespace embree return _mm512_load_si512(addr); } - static __forceinline vllong8 load(const unsigned char* ptr) { + static __forceinline vllong8 load(const uint8_t* ptr) { return _mm512_cvtepu8_epi64(*(__m128i*)ptr); } diff --git a/thirdparty/embree/common/simd/vuint16_avx512.h b/thirdparty/embree/common/simd/vuint16_avx512.h index c5a2bb047840..39752611bbb1 100644 --- 
a/thirdparty/embree/common/simd/vuint16_avx512.h +++ b/thirdparty/embree/common/simd/vuint16_avx512.h @@ -83,7 +83,7 @@ namespace embree return _mm512_loadu_si512(addr); } - static __forceinline vuint16 loadu(const unsigned char* ptr) { return _mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)); } + static __forceinline vuint16 loadu(const uint8_t* ptr) { return _mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)); } static __forceinline vuint16 loadu(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)ptr)); } static __forceinline vuint16 load(const vuint16* addr) { diff --git a/thirdparty/embree/common/simd/vuint4_sse2.h b/thirdparty/embree/common/simd/vuint4_sse2.h index 396eb45d5dfe..a3f393ebf2f6 100644 --- a/thirdparty/embree/common/simd/vuint4_sse2.h +++ b/thirdparty/embree/common/simd/vuint4_sse2.h @@ -87,44 +87,64 @@ namespace embree static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& i) { storeu(ptr,select(mask,i,loadu(ptr))); } #endif -#if defined(__SSE4_1__) - static __forceinline vuint4 load(const unsigned char* ptr) { +#if defined(__aarch64__) + static __forceinline vuint4 load(const uint8_t* ptr) { + return _mm_load4epu8_epi32(((__m128i*)ptr)); + } + static __forceinline vuint4 loadu(const uint8_t* ptr) { + return _mm_load4epu8_epi32(((__m128i*)ptr)); + } +#elif defined(__SSE4_1__) + static __forceinline vuint4 load(const uint8_t* ptr) { return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } - static __forceinline vuint4 loadu(const unsigned char* ptr) { + static __forceinline vuint4 loadu(const uint8_t* ptr) { return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } #endif static __forceinline vuint4 load(const unsigned short* ptr) { -#if defined (__SSE4_1__) +#if defined(__aarch64__) + return _mm_load4epu16_epi32(((__m128i*)ptr)); +#elif defined (__SSE4_1__) return _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); #else return vuint4(ptr[0],ptr[1],ptr[2],ptr[3]); #endif } - static __forceinline void store_uchar(unsigned char* ptr, const vuint4& v) { -#if defined(__SSE4_1__) + static __forceinline void store_uint8(uint8_t* ptr, const vuint4& v) { +#if defined(__aarch64__) + uint32x4_t x = uint32x4_t(v.v); + uint16x4_t y = vqmovn_u32(x); + uint8x8_t z = vqmovn_u16(vcombine_u16(y, y)); + vst1_lane_u32((uint32_t *)ptr, uint32x2_t(z), 0); +#elif defined(__SSE4_1__) __m128i x = v; x = _mm_packus_epi32(x, x); x = _mm_packus_epi16(x, x); *(unsigned*)ptr = _mm_cvtsi128_si32(x); #else for (size_t i=0;i<4;i++) - ptr[i] = (unsigned char)v[i]; + ptr[i] = (uint8_t)v[i]; #endif } - static __forceinline void store_uchar(unsigned short* ptr, const vuint4& v) { + static __forceinline void store_uint8(unsigned short* ptr, const vuint4& v) { +#if defined(__aarch64__) + uint32x4_t x = (uint32x4_t)v.v; + uint16x4_t y = vqmovn_u32(x); + vst1_u16(ptr, y); +#else for (size_t i=0;i<4;i++) ptr[i] = (unsigned short)v[i]; +#endif } static __forceinline vuint4 load_nt(void* ptr) { -#if defined(__SSE4_1__) +#if (defined(__aarch64__)) || defined(__SSE4_1__) return _mm_stream_load_si128((__m128i*)ptr); #else return _mm_load_si128((__m128i*)ptr); @@ -132,8 +152,8 @@ namespace embree } static __forceinline void store_nt(void* ptr, const vuint4& v) { -#if defined(__SSE4_1__) - _mm_stream_ps((float*)ptr,_mm_castsi128_ps(v)); +#if !defined(__aarch64__) && defined(__SSE4_1__) + _mm_stream_ps((float*)ptr, _mm_castsi128_ps(v)); #else _mm_store_si128((__m128i*)ptr,v); #endif @@ -141,14 +161,14 @@ namespace embree template static __forceinline 
vuint4 gather(const unsigned int* ptr, const vint4& index) { -#if defined(__AVX2__) +#if defined(__AVX2__) && !defined(__aarch64__) return _mm_i32gather_epi32((const int*)ptr, index, scale); #else return vuint4( - *(unsigned int*)(((char*)ptr)+scale*index[0]), - *(unsigned int*)(((char*)ptr)+scale*index[1]), - *(unsigned int*)(((char*)ptr)+scale*index[2]), - *(unsigned int*)(((char*)ptr)+scale*index[3])); + *(unsigned int*)(((int8_t*)ptr)+scale*index[0]), + *(unsigned int*)(((int8_t*)ptr)+scale*index[1]), + *(unsigned int*)(((int8_t*)ptr)+scale*index[2]), + *(unsigned int*)(((int8_t*)ptr)+scale*index[3])); #endif } @@ -157,13 +177,13 @@ namespace embree vuint4 r = zero; #if defined(__AVX512VL__) return _mm_mmask_i32gather_epi32(r, mask, index, ptr, scale); -#elif defined(__AVX2__) +#elif defined(__AVX2__) && !defined(__aarch64__) return _mm_mask_i32gather_epi32(r, (const int*)ptr, index, mask, scale); #else - if (likely(mask[0])) r[0] = *(unsigned int*)(((char*)ptr)+scale*index[0]); - if (likely(mask[1])) r[1] = *(unsigned int*)(((char*)ptr)+scale*index[1]); - if (likely(mask[2])) r[2] = *(unsigned int*)(((char*)ptr)+scale*index[2]); - if (likely(mask[3])) r[3] = *(unsigned int*)(((char*)ptr)+scale*index[3]); + if (likely(mask[0])) r[0] = *(unsigned int*)(((int8_t*)ptr)+scale*index[0]); + if (likely(mask[1])) r[1] = *(unsigned int*)(((int8_t*)ptr)+scale*index[1]); + if (likely(mask[2])) r[2] = *(unsigned int*)(((int8_t*)ptr)+scale*index[2]); + if (likely(mask[3])) r[3] = *(unsigned int*)(((int8_t*)ptr)+scale*index[3]); return r; #endif } @@ -353,16 +373,25 @@ namespace embree __forceinline vuint4 unpacklo(const vuint4& a, const vuint4& b) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } __forceinline vuint4 unpackhi(const vuint4& a, const vuint4& b) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } +#if defined(__aarch64__) + template + __forceinline vuint4 shuffle(const vuint4& v) { + return vreinterpretq_s32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3))); + } + template + __forceinline vuint4 shuffle(const vuint4& a, const vuint4& b) { + return vreinterpretq_s32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3))); + } +#else template __forceinline vuint4 shuffle(const vuint4& v) { return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0)); } - template __forceinline vuint4 shuffle(const vuint4& a, const vuint4& b) { return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); } - +#endif #if defined(__SSE3__) template<> __forceinline vuint4 shuffle<0, 0, 2, 2>(const vuint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v))); } template<> __forceinline vuint4 shuffle<1, 1, 3, 3>(const vuint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v))); } @@ -374,7 +403,10 @@ namespace embree return shuffle(v); } -#if defined(__SSE4_1__) +#if defined(__aarch64__) + template __forceinline unsigned int extract(const vuint4& b); + template __forceinline vuint4 insert(const vuint4& a, const unsigned b); +#elif defined(__SSE4_1__) template __forceinline unsigned int extract(const vuint4& b) { return _mm_extract_epi32(b, src); } template __forceinline vuint4 insert(const vuint4& a, const unsigned b) { return _mm_insert_epi32(a, b, dst); } #else @@ -382,11 +414,50 @@ namespace embree template __forceinline vuint4 insert(const vuint4& a, const unsigned b) { vuint4 c = a; c[dst&3] = b; return c; 
} #endif - +#if defined(__aarch64__) + template<> __forceinline unsigned int extract<0>(const vuint4& b) { + return b[0]; + } + template<> __forceinline unsigned int extract<1>(const vuint4& b) { + return b[1]; + } + template<> __forceinline unsigned int extract<2>(const vuint4& b) { + return b[2]; + } + template<> __forceinline unsigned int extract<3>(const vuint4& b) { + return b[3]; + } + + template<> __forceinline vuint4 insert<0>(const vuint4& a, unsigned b){ + vuint4 c = a; + c[0] = b; + return c; + } + template<> __forceinline vuint4 insert<1>(const vuint4& a, unsigned b){ + vuint4 c = a; + c[1] = b; + return c; + } + template<> __forceinline vuint4 insert<2>(const vuint4& a, unsigned b){ + vuint4 c = a; + c[2] = b; + return c; + } + template<> __forceinline vuint4 insert<3>(const vuint4& a, unsigned b){ + vuint4 c = a; + c[3] = b; + return c; + } + + __forceinline unsigned int toScalar(const vuint4& v) { + return v[0]; + } +#else template<> __forceinline unsigned int extract<0>(const vuint4& b) { return _mm_cvtsi128_si32(b); } __forceinline unsigned int toScalar(const vuint4& v) { return _mm_cvtsi128_si32(v); } - +#endif + //////////////////////////////////////////////////////////////////////////////// /// Reductions //////////////////////////////////////////////////////////////////////////////// diff --git a/thirdparty/embree/common/simd/vuint8_avx.h b/thirdparty/embree/common/simd/vuint8_avx.h index 437e73c7fbf3..d4e86ae92d78 100644 --- a/thirdparty/embree/common/simd/vuint8_avx.h +++ b/thirdparty/embree/common/simd/vuint8_avx.h @@ -69,20 +69,24 @@ namespace embree static __forceinline void store (void* ptr, const vuint8& f) { _mm256_store_ps((float*)ptr,_mm256_castsi256_ps(f)); } static __forceinline void storeu(void* ptr, const vuint8& f) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(f)); } +#if !defined(__aarch64__) static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); } static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); } - +#else + static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,_mm256_castsi256_ps(f)); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,_mm256_castsi256_ps(f)); } +#endif static __forceinline void store_nt(void* ptr, const vuint8& v) { _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v)); } - static __forceinline vuint8 load(const unsigned char* ptr) { + static __forceinline vuint8 load(const uint8_t* ptr) { vuint4 il = vuint4::load(ptr+0); vuint4 ih = vuint4::load(ptr+4); return vuint8(il,ih); } - static __forceinline vuint8 loadu(const unsigned char* ptr) { + static __forceinline vuint8 loadu(const uint8_t* ptr) { vuint4 il = vuint4::loadu(ptr+0); vuint4 ih = vuint4::loadu(ptr+4); return vuint8(il,ih); @@ -100,7 +104,7 @@ namespace embree return vuint8(il,ih); } - static __forceinline void store(unsigned char* ptr, const vuint8& i) { + static __forceinline void store(uint8_t* ptr, const vuint8& i) { vuint4 il(i.vl); vuint4 ih(i.vh); vuint4::store(ptr + 0,il); @@ -115,54 +119,54 @@ namespace embree template static __forceinline vuint8 gather(const unsigned int* ptr, const vint8& index) { return vuint8( - *(unsigned int*)(((char*)ptr)+scale*index[0]), - *(unsigned 
int*)(((char*)ptr)+scale*index[1]), - *(unsigned int*)(((char*)ptr)+scale*index[2]), - *(unsigned int*)(((char*)ptr)+scale*index[3]), - *(unsigned int*)(((char*)ptr)+scale*index[4]), - *(unsigned int*)(((char*)ptr)+scale*index[5]), - *(unsigned int*)(((char*)ptr)+scale*index[6]), - *(unsigned int*)(((char*)ptr)+scale*index[7])); + *(unsigned int*)(((int8_t*)ptr)+scale*index[0]), + *(unsigned int*)(((int8_t*)ptr)+scale*index[1]), + *(unsigned int*)(((int8_t*)ptr)+scale*index[2]), + *(unsigned int*)(((int8_t*)ptr)+scale*index[3]), + *(unsigned int*)(((int8_t*)ptr)+scale*index[4]), + *(unsigned int*)(((int8_t*)ptr)+scale*index[5]), + *(unsigned int*)(((int8_t*)ptr)+scale*index[6]), + *(unsigned int*)(((int8_t*)ptr)+scale*index[7])); } template static __forceinline vuint8 gather(const vboolf8& mask, const unsigned int* ptr, const vint8& index) { vuint8 r = zero; - if (likely(mask[0])) r[0] = *(unsigned int*)(((char*)ptr)+scale*index[0]); - if (likely(mask[1])) r[1] = *(unsigned int*)(((char*)ptr)+scale*index[1]); - if (likely(mask[2])) r[2] = *(unsigned int*)(((char*)ptr)+scale*index[2]); - if (likely(mask[3])) r[3] = *(unsigned int*)(((char*)ptr)+scale*index[3]); - if (likely(mask[4])) r[4] = *(unsigned int*)(((char*)ptr)+scale*index[4]); - if (likely(mask[5])) r[5] = *(unsigned int*)(((char*)ptr)+scale*index[5]); - if (likely(mask[6])) r[6] = *(unsigned int*)(((char*)ptr)+scale*index[6]); - if (likely(mask[7])) r[7] = *(unsigned int*)(((char*)ptr)+scale*index[7]); + if (likely(mask[0])) r[0] = *(unsigned int*)(((int8_t*)ptr)+scale*index[0]); + if (likely(mask[1])) r[1] = *(unsigned int*)(((int8_t*)ptr)+scale*index[1]); + if (likely(mask[2])) r[2] = *(unsigned int*)(((int8_t*)ptr)+scale*index[2]); + if (likely(mask[3])) r[3] = *(unsigned int*)(((int8_t*)ptr)+scale*index[3]); + if (likely(mask[4])) r[4] = *(unsigned int*)(((int8_t*)ptr)+scale*index[4]); + if (likely(mask[5])) r[5] = *(unsigned int*)(((int8_t*)ptr)+scale*index[5]); + if (likely(mask[6])) r[6] = *(unsigned int*)(((int8_t*)ptr)+scale*index[6]); + if (likely(mask[7])) r[7] = *(unsigned int*)(((int8_t*)ptr)+scale*index[7]); return r; } template static __forceinline void scatter(void* ptr, const vint8& ofs, const vuint8& v) { - *(unsigned int*)(((char*)ptr)+scale*ofs[0]) = v[0]; - *(unsigned int*)(((char*)ptr)+scale*ofs[1]) = v[1]; - *(unsigned int*)(((char*)ptr)+scale*ofs[2]) = v[2]; - *(unsigned int*)(((char*)ptr)+scale*ofs[3]) = v[3]; - *(unsigned int*)(((char*)ptr)+scale*ofs[4]) = v[4]; - *(unsigned int*)(((char*)ptr)+scale*ofs[5]) = v[5]; - *(unsigned int*)(((char*)ptr)+scale*ofs[6]) = v[6]; - *(unsigned int*)(((char*)ptr)+scale*ofs[7]) = v[7]; + *(unsigned int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; + *(unsigned int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; + *(unsigned int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; + *(unsigned int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; + *(unsigned int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; + *(unsigned int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; + *(unsigned int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; + *(unsigned int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; } template static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vuint8& v) { - if (likely(mask[0])) *(unsigned int*)(((char*)ptr)+scale*ofs[0]) = v[0]; - if (likely(mask[1])) *(unsigned int*)(((char*)ptr)+scale*ofs[1]) = v[1]; - if (likely(mask[2])) *(unsigned int*)(((char*)ptr)+scale*ofs[2]) = v[2]; - if (likely(mask[3])) *(unsigned int*)(((char*)ptr)+scale*ofs[3]) = v[3]; - if (likely(mask[4])) *(unsigned 
int*)(((char*)ptr)+scale*ofs[4]) = v[4]; - if (likely(mask[5])) *(unsigned int*)(((char*)ptr)+scale*ofs[5]) = v[5]; - if (likely(mask[6])) *(unsigned int*)(((char*)ptr)+scale*ofs[6]) = v[6]; - if (likely(mask[7])) *(unsigned int*)(((char*)ptr)+scale*ofs[7]) = v[7]; + if (likely(mask[0])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; + if (likely(mask[1])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; + if (likely(mask[2])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; + if (likely(mask[3])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; + if (likely(mask[4])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; + if (likely(mask[5])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; + if (likely(mask[6])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; + if (likely(mask[7])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; } diff --git a/thirdparty/embree/common/simd/vuint8_avx2.h b/thirdparty/embree/common/simd/vuint8_avx2.h index ae243ddfb173..b2a965448d58 100644 --- a/thirdparty/embree/common/simd/vuint8_avx2.h +++ b/thirdparty/embree/common/simd/vuint8_avx2.h @@ -66,8 +66,8 @@ namespace embree /// Loads and Stores //////////////////////////////////////////////////////////////////////////////// - static __forceinline vuint8 load(const unsigned char* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } - static __forceinline vuint8 loadu(const unsigned char* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } + static __forceinline vuint8 load(const uint8_t* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } + static __forceinline vuint8 loadu(const uint8_t* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } static __forceinline vuint8 load(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_load_si128((__m128i*)ptr)); } static __forceinline vuint8 loadu(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); } @@ -107,7 +107,7 @@ namespace embree _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v)); } - static __forceinline void store(unsigned char* ptr, const vuint8& i) + static __forceinline void store(uint8_t* ptr, const vuint8& i) { for (size_t j=0; j<8; j++) ptr[j] = i[j]; @@ -139,14 +139,14 @@ namespace embree #if defined(__AVX512VL__) _mm256_i32scatter_epi32((int*)ptr, ofs, v, scale); #else - *(unsigned int*)(((char*)ptr)+scale*ofs[0]) = v[0]; - *(unsigned int*)(((char*)ptr)+scale*ofs[1]) = v[1]; - *(unsigned int*)(((char*)ptr)+scale*ofs[2]) = v[2]; - *(unsigned int*)(((char*)ptr)+scale*ofs[3]) = v[3]; - *(unsigned int*)(((char*)ptr)+scale*ofs[4]) = v[4]; - *(unsigned int*)(((char*)ptr)+scale*ofs[5]) = v[5]; - *(unsigned int*)(((char*)ptr)+scale*ofs[6]) = v[6]; - *(unsigned int*)(((char*)ptr)+scale*ofs[7]) = v[7]; + *(unsigned int*)(((int8_t*)ptr) + scale * ofs[0]) = v[0]; + *(unsigned int*)(((int8_t*)ptr) + scale * ofs[1]) = v[1]; + *(unsigned int*)(((int8_t*)ptr) + scale * ofs[2]) = v[2]; + *(unsigned int*)(((int8_t*)ptr) + scale * ofs[3]) = v[3]; + *(unsigned int*)(((int8_t*)ptr) + scale * ofs[4]) = v[4]; + *(unsigned int*)(((int8_t*)ptr) + scale * ofs[5]) = v[5]; + *(unsigned int*)(((int8_t*)ptr) + scale * ofs[6]) = v[6]; + *(unsigned int*)(((int8_t*)ptr) + scale * ofs[7]) = v[7]; #endif } @@ -156,14 +156,14 @@ namespace embree #if defined(__AVX512VL__) _mm256_mask_i32scatter_epi32((int*)ptr, mask, ofs, v, scale); #else - if (likely(mask[0])) *(unsigned int*)(((char*)ptr)+scale*ofs[0]) = v[0]; - if 
(likely(mask[1])) *(unsigned int*)(((char*)ptr)+scale*ofs[1]) = v[1]; - if (likely(mask[2])) *(unsigned int*)(((char*)ptr)+scale*ofs[2]) = v[2]; - if (likely(mask[3])) *(unsigned int*)(((char*)ptr)+scale*ofs[3]) = v[3]; - if (likely(mask[4])) *(unsigned int*)(((char*)ptr)+scale*ofs[4]) = v[4]; - if (likely(mask[5])) *(unsigned int*)(((char*)ptr)+scale*ofs[5]) = v[5]; - if (likely(mask[6])) *(unsigned int*)(((char*)ptr)+scale*ofs[6]) = v[6]; - if (likely(mask[7])) *(unsigned int*)(((char*)ptr)+scale*ofs[7]) = v[7]; + if (likely(mask[0])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; + if (likely(mask[1])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; + if (likely(mask[2])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; + if (likely(mask[3])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; + if (likely(mask[4])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; + if (likely(mask[5])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; + if (likely(mask[6])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; + if (likely(mask[7])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; #endif } @@ -379,6 +379,8 @@ namespace embree __forceinline int toScalar(const vuint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); } +#if !defined(__aarch64__) + __forceinline vuint8 permute(const vuint8& v, const __m256i& index) { return _mm256_permutevar8x32_epi32(v, index); } @@ -394,7 +396,10 @@ namespace embree #else return _mm256_alignr_epi8(a, b, 4*i); #endif - } + } + +#endif + //////////////////////////////////////////////////////////////////////////////// /// Reductions diff --git a/thirdparty/embree/common/sys/alloc.cpp b/thirdparty/embree/common/sys/alloc.cpp index 4e8928242eb8..12f143f13174 100644 --- a/thirdparty/embree/common/sys/alloc.cpp +++ b/thirdparty/embree/common/sys/alloc.cpp @@ -21,7 +21,10 @@ namespace embree void* ptr = _mm_malloc(size,align); if (size != 0 && ptr == nullptr) - throw std::bad_alloc(); + // -- GODOT start -- + // throw std::bad_alloc(); + abort(); + // -- GODOT end -- return ptr; } @@ -128,7 +131,10 @@ namespace embree /* fall back to 4k pages */ int flags = MEM_COMMIT | MEM_RESERVE; char* ptr = (char*) VirtualAlloc(nullptr,bytes,flags,PAGE_READWRITE); - if (ptr == nullptr) throw std::bad_alloc(); + // -- GODOT start -- + // if (ptr == nullptr) throw std::bad_alloc(); + if (ptr == nullptr) abort(); + // -- GODOT end -- hugepages = false; return ptr; } @@ -145,7 +151,10 @@ namespace embree return bytesOld; if (!VirtualFree((char*)ptr+bytesNew,bytesOld-bytesNew,MEM_DECOMMIT)) - throw std::bad_alloc(); + // -- GODOT start -- + // throw std::bad_alloc(); + abort(); + // -- GODOT end -- return bytesNew; } @@ -156,7 +165,10 @@ namespace embree return; if (!VirtualFree(ptr,0,MEM_RELEASE)) - throw std::bad_alloc(); + // -- GODOT start -- + // throw std::bad_alloc(); + abort(); + // -- GODOT end -- } void os_advise(void *ptr, size_t bytes) @@ -260,7 +272,10 @@ namespace embree /* fallback to 4k pages */ void* ptr = (char*) mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); - if (ptr == MAP_FAILED) throw std::bad_alloc(); + // -- GODOT start -- + // if (ptr == MAP_FAILED) throw std::bad_alloc(); + if (ptr == MAP_FAILED) abort(); + // -- GODOT end -- hugepages = false; /* advise huge page hint for THP */ @@ -277,7 +292,10 @@ namespace embree return bytesOld; if (munmap((char*)ptr+bytesNew,bytesOld-bytesNew) == -1) - throw std::bad_alloc(); + // -- GODOT start -- + // throw std::bad_alloc(); + abort(); + // -- 
GODOT end -- return bytesNew; } @@ -291,7 +309,10 @@ namespace embree const size_t pageSize = hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K; bytes = (bytes+pageSize-1) & ~(pageSize-1); if (munmap(ptr,bytes) == -1) - throw std::bad_alloc(); + // -- GODOT start -- + // throw std::bad_alloc(); + abort(); + // -- GODOT end -- } /* hint for transparent huge pages (THP) */ diff --git a/thirdparty/embree/common/sys/array.h b/thirdparty/embree/common/sys/array.h index 6f6f98eac816..77722a39f670 100644 --- a/thirdparty/embree/common/sys/array.h +++ b/thirdparty/embree/common/sys/array.h @@ -139,7 +139,7 @@ namespace embree __forceinline Ty& operator[](const unsigned i) { assert(i=0 && i #endif +#if defined(__ARM_NEON) +#include "../math/SSE2NEON.h" +#if defined(NEON_AVX2_EMULATION) +#include "../math/AVX2NEON.h" +#endif +#else #include +#endif #if defined(__BMI__) && defined(__GNUC__) && !defined(__INTEL_COMPILER) #if !defined(_tzcnt_u32) @@ -20,6 +27,14 @@ #endif #endif +#if defined(__aarch64__) +#if !defined(_lzcnt_u32) + #define _lzcnt_u32 __builtin_clz +#endif +#if !defined(_lzcnt_u32) + #define _lzcnt_u32 __builtin_clzll +#endif +#else #if defined(__LZCNT__) #if !defined(_lzcnt_u32) #define _lzcnt_u32 __lzcnt32 @@ -28,16 +43,13 @@ #define _lzcnt_u64 __lzcnt64 #endif #endif +#endif #if defined(__WIN32__) -// -- GODOT start -- -#if !defined(NOMINMAX) -// -- GODOT end -- -#define NOMINMAX -// -- GODOT start -- -#endif -#include "windows.h" -// -- GODOT end -- +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include #endif /* normally defined in pmmintrin.h, but we always need this */ @@ -50,133 +62,133 @@ namespace embree { - + //////////////////////////////////////////////////////////////////////////////// /// Windows Platform //////////////////////////////////////////////////////////////////////////////// - + #if defined(__WIN32__) - - __forceinline size_t read_tsc() + + __forceinline size_t read_tsc() { LARGE_INTEGER li; QueryPerformanceCounter(&li); return (size_t)li.QuadPart; } - + __forceinline int bsf(int v) { -#if defined(__AVX2__) +#if defined(__AVX2__) && !defined(__aarch64__) return _tzcnt_u32(v); #else unsigned long r = 0; _BitScanForward(&r,v); return r; #endif } - + __forceinline unsigned bsf(unsigned v) { -#if defined(__AVX2__) +#if defined(__AVX2__) && !defined(__aarch64__) return _tzcnt_u32(v); #else unsigned long r = 0; _BitScanForward(&r,v); return r; #endif } - + #if defined(__X86_64__) __forceinline size_t bsf(size_t v) { -#if defined(__AVX2__) +#if defined(__AVX2__) return _tzcnt_u64(v); #else unsigned long r = 0; _BitScanForward64(&r,v); return r; #endif } #endif - - __forceinline int bscf(int& v) + + __forceinline int bscf(int& v) { int i = bsf(v); v &= v-1; return i; } - - __forceinline unsigned bscf(unsigned& v) + + __forceinline unsigned bscf(unsigned& v) { unsigned i = bsf(v); v &= v-1; return i; } - + #if defined(__X86_64__) - __forceinline size_t bscf(size_t& v) + __forceinline size_t bscf(size_t& v) { size_t i = bsf(v); v &= v-1; return i; } #endif - + __forceinline int bsr(int v) { -#if defined(__AVX2__) +#if defined(__AVX2__) && !defined(__aarch64__) return 31 - _lzcnt_u32(v); #else unsigned long r = 0; _BitScanReverse(&r,v); return r; #endif } - + __forceinline unsigned bsr(unsigned v) { -#if defined(__AVX2__) +#if defined(__AVX2__) && !defined(__aarch64__) return 31 - _lzcnt_u32(v); #else unsigned long r = 0; _BitScanReverse(&r,v); return r; #endif } - + #if defined(__X86_64__) __forceinline size_t bsr(size_t v) { -#if defined(__AVX2__) +#if defined(__AVX2__) return 
63 -_lzcnt_u64(v); #else unsigned long r = 0; _BitScanReverse64(&r, v); return r; #endif } #endif - + __forceinline int lzcnt(const int x) { -#if defined(__AVX2__) +#if defined(__AVX2__) && !defined(__aarch64__) return _lzcnt_u32(x); #else if (unlikely(x == 0)) return 32; - return 31 - bsr(x); + return 31 - bsr(x); #endif } - + __forceinline int btc(int v, int i) { long r = v; _bittestandcomplement(&r,i); return r; } - + __forceinline int bts(int v, int i) { long r = v; _bittestandset(&r,i); return r; } - + __forceinline int btr(int v, int i) { long r = v; _bittestandreset(&r,i); return r; } - + #if defined(__X86_64__) - + __forceinline size_t btc(size_t v, size_t i) { size_t r = v; _bittestandcomplement64((__int64*)&r,i); return r; } - + __forceinline size_t bts(size_t v, size_t i) { __int64 r = v; _bittestandset64(&r,i); return r; } - + __forceinline size_t btr(size_t v, size_t i) { __int64 r = v; _bittestandreset64(&r,i); return r; } - + #endif - + __forceinline int32_t atomic_cmpxchg(volatile int32_t* p, const int32_t c, const int32_t v) { return _InterlockedCompareExchange((volatile long*)p,v,c); } @@ -184,143 +196,174 @@ namespace embree //////////////////////////////////////////////////////////////////////////////// /// Unix Platform //////////////////////////////////////////////////////////////////////////////// - + #else - + #if defined(__i386__) && defined(__PIC__) - - __forceinline void __cpuid(int out[4], int op) + + __forceinline void __cpuid(int out[4], int op) { asm volatile ("xchg{l}\t{%%}ebx, %1\n\t" "cpuid\n\t" "xchg{l}\t{%%}ebx, %1\n\t" - : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) - : "0"(op)); + : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) + : "0"(op)); } - - __forceinline void __cpuid_count(int out[4], int op1, int op2) + + __forceinline void __cpuid_count(int out[4], int op1, int op2) { asm volatile ("xchg{l}\t{%%}ebx, %1\n\t" "cpuid\n\t" "xchg{l}\t{%%}ebx, %1\n\t" : "=a" (out[0]), "=r" (out[1]), "=c" (out[2]), "=d" (out[3]) - : "0" (op1), "2" (op2)); + : "0" (op1), "2" (op2)); } - + #else - + __forceinline void __cpuid(int out[4], int op) { - asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op)); +#if defined(__ARM_NEON) + if (op == 0) { // Get CPU name + out[0] = 0x41524d20; + out[1] = 0x41524d20; + out[2] = 0x41524d20; + out[3] = 0x41524d20; + } +#else + asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op)); +#endif } - + +#if !defined(__ARM_NEON) __forceinline void __cpuid_count(int out[4], int op1, int op2) { - asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op1), "c"(op2)); + asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op1), "c"(op2)); } - #endif - + +#endif + __forceinline uint64_t read_tsc() { +#if defined(__ARM_NEON) + return 0; // FIXME(LTE): mimic rdtsc +#else uint32_t high,low; asm volatile ("rdtsc" : "=d"(high), "=a"(low)); return (((uint64_t)high) << 32) + (uint64_t)low; +#endif } - + __forceinline int bsf(int v) { -#if defined(__AVX2__) +#if defined(__ARM_NEON) + return __builtin_ctz(v); +#else +#if defined(__AVX2__) return _tzcnt_u32(v); #else int r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r; +#endif #endif } - -#if defined(__X86_64__) - __forceinline unsigned bsf(unsigned v) + +#if defined(__X86_64__) || defined(__aarch64__) + __forceinline unsigned bsf(unsigned v) { -#if defined(__AVX2__) +#if defined(__ARM_NEON) + return __builtin_ctz(v); +#else +#if defined(__AVX2__) 
return _tzcnt_u32(v); #else unsigned r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r; +#endif #endif } #endif - + __forceinline size_t bsf(size_t v) { -#if defined(__AVX2__) +#if defined(__AVX2__) && !defined(__aarch64__) #if defined(__X86_64__) return _tzcnt_u64(v); #else return _tzcnt_u32(v); #endif +#elif defined(__ARM_NEON) + return __builtin_ctzl(v); #else size_t r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r; #endif } - __forceinline int bscf(int& v) + __forceinline int bscf(int& v) { int i = bsf(v); v &= v-1; return i; } - -#if defined(__X86_64__) - __forceinline unsigned int bscf(unsigned int& v) + +#if defined(__X86_64__) || defined(__aarch64__) + __forceinline unsigned int bscf(unsigned int& v) { unsigned int i = bsf(v); v &= v-1; return i; } #endif - - __forceinline size_t bscf(size_t& v) + + __forceinline size_t bscf(size_t& v) { size_t i = bsf(v); v &= v-1; return i; } - + __forceinline int bsr(int v) { -#if defined(__AVX2__) +#if defined(__AVX2__) && !defined(__aarch64__) return 31 - _lzcnt_u32(v); +#elif defined(__ARM_NEON) + return __builtin_clz(v)^31; #else int r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r; #endif } - -#if defined(__X86_64__) + +#if defined(__X86_64__) || defined(__aarch64__) __forceinline unsigned bsr(unsigned v) { -#if defined(__AVX2__) +#if defined(__AVX2__) return 31 - _lzcnt_u32(v); +#elif defined(__ARM_NEON) + return __builtin_clz(v)^31; #else unsigned r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r; #endif } #endif - + __forceinline size_t bsr(size_t v) { -#if defined(__AVX2__) +#if defined(__AVX2__) && !defined(__aarch64__) #if defined(__X86_64__) return 63 - _lzcnt_u64(v); #else return 31 - _lzcnt_u32(v); #endif +#elif defined(__aarch64__) + return (sizeof(v) * 8 - 1) - __builtin_clzl(v); #else size_t r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r; #endif } - + __forceinline int lzcnt(const int x) { -#if defined(__AVX2__) +#if defined(__AVX2__) && !defined(__aarch64__) return _lzcnt_u32(x); #else if (unlikely(x == 0)) return 32; - return 31 - bsr(x); + return 31 - bsr(x); #endif } __forceinline size_t blsr(size_t v) { -#if defined(__AVX2__) +#if defined(__AVX2__) && !defined(__aarch64__) #if defined(__INTEL_COMPILER) return _blsr_u64(v); #else @@ -334,41 +377,79 @@ namespace embree return v & (v-1); #endif } - + __forceinline int btc(int v, int i) { +#if defined(__aarch64__) + // _bittestandcomplement(long *a, long b) { + // unsigned char x = (*a >> b) & 1; + // *a = *a ^ (1 << b); + // return x; + + // We only need `*a` + return (v ^ (1 << i)); +#else int r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r; +#endif } - + __forceinline int bts(int v, int i) { +#if defined(__aarch64__) + // _bittestandset(long *a, long b) { + // unsigned char x = (*a >> b) & 1; + // *a = *a | (1 << b); + // return x; + return (v | (v << i)); +#else int r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; +#endif } - + __forceinline int btr(int v, int i) { +#if defined(__aarch64__) + // _bittestandreset(long *a, long b) { + // unsigned char x = (*a >> b) & 1; + // *a = *a & ~(1 << b); + // return x; + return (v & ~(v << i)); +#else int r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; +#endif } - + __forceinline size_t btc(size_t v, size_t i) { +#if defined(__aarch64__) + return (v ^ (1 << i)); +#else size_t r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r; +#endif } - + __forceinline size_t bts(size_t v, size_t i) { +#if defined(__aarch64__) + 
return (v | (v << i)); +#else size_t r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; +#endif } - + __forceinline size_t btr(size_t v, size_t i) { +#if defined(__ARM_NEON) + return (v & ~(v << i)); +#else size_t r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; +#endif } __forceinline int32_t atomic_cmpxchg(int32_t volatile* value, int32_t comparand, const int32_t input) { return __sync_val_compare_and_swap(value, comparand, input); } - + #endif - + //////////////////////////////////////////////////////////////////////////////// /// All Platforms //////////////////////////////////////////////////////////////////////////////// - + #if defined(__clang__) || defined(__GNUC__) #if !defined(_mm_undefined_ps) __forceinline __m128 _mm_undefined_ps() { return _mm_setzero_ps(); } @@ -390,39 +471,39 @@ namespace embree #endif #endif -#if defined(__SSE4_2__) - +#if defined(__SSE4_2__) || defined(__ARM_NEON) + __forceinline int popcnt(int in) { return _mm_popcnt_u32(in); } - + __forceinline unsigned popcnt(unsigned in) { return _mm_popcnt_u32(in); } - -#if defined(__X86_64__) + +#if defined(__X86_64__) || defined(__ARM_NEON) __forceinline size_t popcnt(size_t in) { return _mm_popcnt_u64(in); } #endif - + #endif __forceinline uint64_t rdtsc() { - int dummy[4]; - __cpuid(dummy,0); - uint64_t clock = read_tsc(); - __cpuid(dummy,0); + int dummy[4]; + __cpuid(dummy,0); + uint64_t clock = read_tsc(); + __cpuid(dummy,0); return clock; } - + __forceinline void pause_cpu(const size_t N = 8) { for (size_t i=0; i(GetProcAddress(HMODULE(lib),sym.c_str())); } /* closes the shared library */ @@ -63,7 +61,7 @@ namespace embree lib = dlopen((executable.path() + fullName).c_str(),RTLD_NOW); if (lib == nullptr) { const char* error = dlerror(); - if (error) { + if (error) { THROW_RUNTIME_ERROR(error); } else { THROW_RUNTIME_ERROR("could not load library "+executable.str()); diff --git a/thirdparty/embree/common/sys/mutex.cpp b/thirdparty/embree/common/sys/mutex.cpp index 57ef360981ac..11779bc9b9ee 100644 --- a/thirdparty/embree/common/sys/mutex.cpp +++ b/thirdparty/embree/common/sys/mutex.cpp @@ -36,6 +36,7 @@ namespace embree MAYBE_UNUSED bool ok = pthread_mutex_destroy((pthread_mutex_t*)mutex) == 0; assert(ok); delete (pthread_mutex_t*)mutex; + mutex = nullptr; } void MutexSys::lock() diff --git a/thirdparty/embree/common/sys/mutex.h b/thirdparty/embree/common/sys/mutex.h index e10ab19acf4b..1164210f23aa 100644 --- a/thirdparty/embree/common/sys/mutex.h +++ b/thirdparty/embree/common/sys/mutex.h @@ -47,7 +47,7 @@ namespace embree { while (flag.load()) { - _mm_pause(); + _mm_pause(); _mm_pause(); } @@ -74,7 +74,7 @@ namespace embree { while(flag.load()) { - _mm_pause(); + _mm_pause(); _mm_pause(); } } diff --git a/thirdparty/embree/common/sys/platform.h b/thirdparty/embree/common/sys/platform.h index 61a622a6b495..737f14aa6e37 100644 --- a/thirdparty/embree/common/sys/platform.h +++ b/thirdparty/embree/common/sys/platform.h @@ -88,10 +88,10 @@ #define dll_import __declspec(dllimport) #else #define dll_export __attribute__ ((visibility ("default"))) -#define dll_import +#define dll_import #endif -#if defined(__WIN32__) && !defined(__MINGW32__) +#ifdef __WIN32__ #if !defined(__noinline) #define __noinline __declspec(noinline) #endif @@ -103,11 +103,16 @@ #define __restrict__ //__restrict // causes issues with MSVC #endif #if !defined(__thread) +// NOTE: Require `-fms-extensions` for clang #define __thread __declspec(thread) #endif #if !defined(__aligned) +#if 
defined(__MINGW32__) +#define __aligned(...) __attribute__((aligned(__VA_ARGS__))) +#else #define __aligned(...) __declspec(align(__VA_ARGS__)) #endif +#endif //#define __FUNCTION__ __FUNCTION__ #define debugbreak() __debugbreak() @@ -142,7 +147,7 @@ #endif // -- GODOT start -- -#if !defined(likely) +#ifndef likely // -- GODOT end -- #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) #define likely(expr) (expr) @@ -169,11 +174,19 @@ #define PRINT4(x,y,z,w) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << embree_endl #if defined(DEBUG) // only report file and line in debug mode + // -- GODOT start -- + // #define THROW_RUNTIME_ERROR(str) + // throw std::runtime_error(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); #define THROW_RUNTIME_ERROR(str) \ - throw std::runtime_error(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); + printf(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)), abort(); + // -- GODOT end -- #else + // -- GODOT start -- + // #define THROW_RUNTIME_ERROR(str) + // throw std::runtime_error(str); #define THROW_RUNTIME_ERROR(str) \ - throw std::runtime_error(str); + abort(); + // -- GODOT end -- #endif #define FATAL(x) THROW_RUNTIME_ERROR(x) @@ -192,7 +205,7 @@ namespace embree { /* windows does not have ssize_t */ #if defined(__WIN32__) -#if defined(__X86_64__) +#if defined(__X86_64__) || defined(__aarch64__) typedef int64_t ssize_t; #else typedef int32_t ssize_t; @@ -316,7 +329,7 @@ __forceinline std::string toString(long long value) { /// Some macros for static profiling //////////////////////////////////////////////////////////////////////////////// -#if defined (__GNUC__) +#if defined (__GNUC__) #define IACA_SSC_MARK( MARK_ID ) \ __asm__ __volatile__ ( \ "\n\t movl $"#MARK_ID", %%ebx" \ @@ -355,7 +368,7 @@ namespace embree bool active; const Closure f; }; - + template OnScopeExitHelper OnScopeExit(const Closure f) { return OnScopeExitHelper(f); diff --git a/thirdparty/embree/common/sys/sysinfo.cpp b/thirdparty/embree/common/sys/sysinfo.cpp index 74438260dbe4..1d11436770c5 100644 --- a/thirdparty/embree/common/sys/sysinfo.cpp +++ b/thirdparty/embree/common/sys/sysinfo.cpp @@ -18,10 +18,16 @@ typedef cpuset_t cpu_set_t; namespace embree { NullTy null; - - std::string getPlatformName() + + std::string getPlatformName() { -#if defined(__LINUX__) && !defined(__X86_64__) +#if defined(__LINUX__) && defined(__ANDROID__) && defined(__aarch64__) && defined(__ARM_NEON) + return "Android Linux (aarch64 / arm64)"; +#elif defined(__LINUX__) && defined(__ANDROID__) && defined(__X86_64__) + return "Android Linux (x64)"; +#elif defined(__LINUX__) && defined(__ANDROID__) && (defined(_X86_) || defined(__X86__) || defined(_M_IX86)) + return "Android Linux (x86)"; +#elif defined(__LINUX__) && !defined(__X86_64__) return "Linux (32bit)"; #elif defined(__LINUX__) && defined(__X86_64__) return "Linux (64bit)"; @@ -37,10 +43,16 @@ namespace embree return "Windows (32bit)"; #elif defined(__WIN32__) && defined(__X86_64__) return "Windows (64bit)"; +#elif defined(TARGET_IPHONE_SIMULATOR) && defined(__X86_64__) + return "iOS Simulator (x64)"; +#elif defined(TARGET_OS_IPHONE) && defined(__aarch64__) && defined(__ARM_NEON) + return "iOS (aarch64 / arm64)"; #elif defined(__MACOSX__) && !defined(__X86_64__) return "Mac OS X (32bit)"; #elif defined(__MACOSX__) && defined(__X86_64__) return "Mac OS X (64bit)"; 
+#elif defined(__UNIX__) && defined(__aarch64__) + return "Unix (aarch64)"; #elif defined(__UNIX__) && !defined(__X86_64__) return "Unix (32bit)"; #elif defined(__UNIX__) && defined(__X86_64__) @@ -79,8 +91,8 @@ namespace embree std::string getCPUVendor() { - int cpuinfo[4]; - __cpuid (cpuinfo, 0); + int cpuinfo[4]; + __cpuid (cpuinfo, 0); int name[4]; name[0] = cpuinfo[1]; name[1] = cpuinfo[3]; @@ -89,11 +101,11 @@ namespace embree return (char*)name; } - CPU getCPUModel() + CPU getCPUModel() { if (getCPUVendor() != "GenuineIntel") return CPU::UNKNOWN; - + int out[4]; __cpuid(out, 0); if (out[0] < 1) return CPU::UNKNOWN; @@ -183,11 +195,13 @@ namespace embree case CPU::NEHALEM : return "Nehalem"; case CPU::CORE2 : return "Core2"; case CPU::CORE1 : return "Core"; + case CPU::ARM : return "Arm"; case CPU::UNKNOWN : return "Unknown CPU"; } return "Unknown CPU (error)"; } +#if !defined(__ARM_NEON) /* constants to access destination registers of CPUID instruction */ static const int EAX = 0; static const int EBX = 1; @@ -227,13 +241,16 @@ namespace embree static const int CPU_FEATURE_BIT_AVX512BW = 1 << 30; // AVX512BW (byte and word instructions) static const int CPU_FEATURE_BIT_AVX512VL = 1 << 31; // AVX512VL (vector length extensions) static const int CPU_FEATURE_BIT_AVX512IFMA = 1 << 21; // AVX512IFMA (integer fused multiple-add instructions) - + /* cpuid[eax=7,ecx=0].ecx */ static const int CPU_FEATURE_BIT_AVX512VBMI = 1 << 1; // AVX512VBMI (vector bit manipulation instructions) +#endif - __noinline int64_t get_xcr0() +#if !defined(__ARM_NEON) + __noinline int64_t get_xcr0() { -#if defined (__WIN32__) /* -- GODOT start -- */ && !defined (__MINGW32__) /* -- GODOT end -- */ + // https://github.com/opencv/opencv/blob/master/modules/core/src/system.cpp#L466 +#if defined (__WIN32__) && defined(_XCR_XFEATURE_ENABLED_MASK) int64_t xcr0 = 0; // int64_t is workaround for compiler bug under VS2013, Win32 xcr0 = _xgetbv(0); return xcr0; @@ -243,21 +260,44 @@ namespace embree return xcr0; #endif } +#endif int getCPUFeatures() { +#if defined(__ARM_NEON) + int cpu_features = CPU_FEATURE_NEON|CPU_FEATURE_SSE|CPU_FEATURE_SSE2; +#if defined(NEON_AVX2_EMULATION) + cpu_features |= CPU_FEATURE_SSE3|CPU_FEATURE_SSSE3|CPU_FEATURE_SSE42; + cpu_features |= CPU_FEATURE_XMM_ENABLED; + cpu_features |= CPU_FEATURE_YMM_ENABLED; + cpu_features |= CPU_FEATURE_SSE41 | CPU_FEATURE_RDRAND | CPU_FEATURE_F16C; + cpu_features |= CPU_FEATURE_POPCNT; + cpu_features |= CPU_FEATURE_AVX; + cpu_features |= CPU_FEATURE_AVX2; + cpu_features |= CPU_FEATURE_FMA3; + cpu_features |= CPU_FEATURE_LZCNT; + cpu_features |= CPU_FEATURE_BMI1; + cpu_features |= CPU_FEATURE_BMI2; + cpu_features |= CPU_FEATURE_NEON_2X; + + + +#endif + return cpu_features; + +#else /* cache CPU features access */ static int cpu_features = 0; - if (cpu_features) + if (cpu_features) return cpu_features; /* get number of CPUID leaves */ - int cpuid_leaf0[4]; + int cpuid_leaf0[4]; __cpuid(cpuid_leaf0, 0x00000000); - unsigned nIds = cpuid_leaf0[EAX]; + unsigned nIds = cpuid_leaf0[EAX]; /* get number of extended CPUID leaves */ - int cpuid_leafe[4]; + int cpuid_leafe[4]; __cpuid(cpuid_leafe, 0x80000000); unsigned nExIds = cpuid_leafe[EAX]; @@ -289,7 +329,7 @@ namespace embree if (xmm_enabled) cpu_features |= CPU_FEATURE_XMM_ENABLED; if (ymm_enabled) cpu_features |= CPU_FEATURE_YMM_ENABLED; if (zmm_enabled) cpu_features |= CPU_FEATURE_ZMM_ENABLED; - + if (cpuid_leaf_1[EDX] & CPU_FEATURE_BIT_SSE ) cpu_features |= CPU_FEATURE_SSE; if (cpuid_leaf_1[EDX] & 
CPU_FEATURE_BIT_SSE2 ) cpu_features |= CPU_FEATURE_SSE2; if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE3 ) cpu_features |= CPU_FEATURE_SSE3; @@ -297,8 +337,8 @@ namespace embree if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE4_1) cpu_features |= CPU_FEATURE_SSE41; if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE4_2) cpu_features |= CPU_FEATURE_SSE42; if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_POPCNT) cpu_features |= CPU_FEATURE_POPCNT; - if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_AVX ) cpu_features |= CPU_FEATURE_AVX; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_F16C ) cpu_features |= CPU_FEATURE_F16C; if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_RDRAND) cpu_features |= CPU_FEATURE_RDRAND; if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX2 ) cpu_features |= CPU_FEATURE_AVX2; @@ -310,7 +350,7 @@ namespace embree if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512F ) cpu_features |= CPU_FEATURE_AVX512F; if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512DQ ) cpu_features |= CPU_FEATURE_AVX512DQ; if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512PF ) cpu_features |= CPU_FEATURE_AVX512PF; - if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512ER ) cpu_features |= CPU_FEATURE_AVX512ER; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512ER ) cpu_features |= CPU_FEATURE_AVX512ER; if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512CD ) cpu_features |= CPU_FEATURE_AVX512CD; if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512BW ) cpu_features |= CPU_FEATURE_AVX512BW; if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512IFMA) cpu_features |= CPU_FEATURE_AVX512IFMA; @@ -318,6 +358,7 @@ namespace embree if (cpuid_leaf_7[ECX] & CPU_FEATURE_BIT_AVX512VBMI) cpu_features |= CPU_FEATURE_AVX512VBMI; return cpu_features; +#endif } std::string stringOfCPUFeatures(int features) @@ -350,9 +391,11 @@ namespace embree if (features & CPU_FEATURE_AVX512VL) str += "AVX512VL "; if (features & CPU_FEATURE_AVX512IFMA) str += "AVX512IFMA "; if (features & CPU_FEATURE_AVX512VBMI) str += "AVX512VBMI "; + if (features & CPU_FEATURE_NEON) str += "NEON "; + if (features & CPU_FEATURE_NEON_2X) str += "2xNEON "; return str; } - + std::string stringOfISA (int isa) { if (isa == SSE) return "SSE"; @@ -365,13 +408,15 @@ namespace embree if (isa == AVX2) return "AVX2"; if (isa == AVX512KNL) return "AVX512KNL"; if (isa == AVX512SKX) return "AVX512SKX"; + if (isa == NEON) return "NEON"; + if (isa == NEON_2X) return "2xNEON"; return "UNKNOWN"; } bool hasISA(int features, int isa) { return (features & isa) == isa; } - + std::string supportedTargetList (int features) { std::string v; @@ -386,6 +431,8 @@ namespace embree if (hasISA(features,AVX2)) v += "AVX2 "; if (hasISA(features,AVX512KNL)) v += "AVX512KNL "; if (hasISA(features,AVX512SKX)) v += "AVX512SKX "; + if (hasISA(features,NEON)) v += "NEON "; + if (hasISA(features,NEON_2X)) v += "2xNEON "; return v; } } @@ -409,7 +456,7 @@ namespace embree return std::string(filename); } - unsigned int getNumberOfLogicalThreads() + unsigned int getNumberOfLogicalThreads() { static int nThreads = -1; if (nThreads != -1) return nThreads; @@ -420,11 +467,11 @@ namespace embree GetActiveProcessorGroupCountFunc pGetActiveProcessorGroupCount = (GetActiveProcessorGroupCountFunc)GetProcAddress(hlib, "GetActiveProcessorGroupCount"); GetActiveProcessorCountFunc pGetActiveProcessorCount = (GetActiveProcessorCountFunc) GetProcAddress(hlib, "GetActiveProcessorCount"); - if (pGetActiveProcessorGroupCount && pGetActiveProcessorCount) + if (pGetActiveProcessorGroupCount && pGetActiveProcessorCount) { int groups = pGetActiveProcessorGroupCount(); int totalProcessors = 0; - for (int 
i = 0; i < groups; i++) + for (int i = 0; i < groups; i++) totalProcessors += pGetActiveProcessorCount(i); nThreads = totalProcessors; } @@ -438,7 +485,7 @@ namespace embree return nThreads; } - int getTerminalWidth() + int getTerminalWidth() { HANDLE handle = GetStdHandle(STD_OUTPUT_HANDLE); if (handle == INVALID_HANDLE_VALUE) return 80; @@ -448,7 +495,7 @@ namespace embree return info.dwSize.X; } - double getSeconds() + double getSeconds() { LARGE_INTEGER freq, val; QueryPerformanceFrequency(&freq); @@ -487,7 +534,7 @@ namespace embree namespace embree { - std::string getExecutableFileName() + std::string getExecutableFileName() { std::string pid = "/proc/" + toString(getpid()) + "/exe"; char buf[4096]; @@ -540,7 +587,7 @@ namespace embree size_t getVirtualMemoryBytes() { return 0; } - + size_t getResidentMemoryBytes() { return 0; } @@ -570,7 +617,7 @@ namespace embree size_t getVirtualMemoryBytes() { return 0; } - + size_t getResidentMemoryBytes() { return 0; } @@ -591,12 +638,12 @@ namespace embree namespace embree { - unsigned int getNumberOfLogicalThreads() + unsigned int getNumberOfLogicalThreads() { static int nThreads = -1; if (nThreads != -1) return nThreads; -#if defined(__MACOSX__) +#if defined(__MACOSX__) || defined(__ANDROID__) nThreads = sysconf(_SC_NPROCESSORS_ONLN); // does not work in Linux LXC container assert(nThreads); #else @@ -604,12 +651,12 @@ namespace embree if (pthread_getaffinity_np(pthread_self(), sizeof(set), &set) == 0) nThreads = CPU_COUNT(&set); #endif - + assert(nThreads); return nThreads; } - int getTerminalWidth() + int getTerminalWidth() { struct winsize info; if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &info) < 0) return 80; diff --git a/thirdparty/embree/common/sys/sysinfo.h b/thirdparty/embree/common/sys/sysinfo.h index cee1017ddecf..8e313a59b32e 100644 --- a/thirdparty/embree/common/sys/sysinfo.h +++ b/thirdparty/embree/common/sys/sysinfo.h @@ -59,7 +59,12 @@ # define isa sse # define ISA SSE # define ISA_STR "SSE" -#else +#elif defined(__ARM_NEON) +// NOTE(LTE): Use sse2 for `isa` for the compatibility at the moment. +#define isa sse2 +#define ISA NEON +#define ISA_STR "NEON" +#else #error Unknown ISA #endif @@ -87,6 +92,7 @@ namespace embree NEHALEM, CORE2, CORE1, + ARM, UNKNOWN, }; @@ -114,7 +120,7 @@ namespace embree static const int CPU_FEATURE_SSE3 = 1 << 2; static const int CPU_FEATURE_SSSE3 = 1 << 3; static const int CPU_FEATURE_SSE41 = 1 << 4; - static const int CPU_FEATURE_SSE42 = 1 << 5; + static const int CPU_FEATURE_SSE42 = 1 << 5; static const int CPU_FEATURE_POPCNT = 1 << 6; static const int CPU_FEATURE_AVX = 1 << 7; static const int CPU_FEATURE_F16C = 1 << 8; @@ -125,7 +131,7 @@ namespace embree static const int CPU_FEATURE_BMI1 = 1 << 13; static const int CPU_FEATURE_BMI2 = 1 << 14; static const int CPU_FEATURE_AVX512F = 1 << 16; - static const int CPU_FEATURE_AVX512DQ = 1 << 17; + static const int CPU_FEATURE_AVX512DQ = 1 << 17; static const int CPU_FEATURE_AVX512PF = 1 << 18; static const int CPU_FEATURE_AVX512ER = 1 << 19; static const int CPU_FEATURE_AVX512CD = 1 << 20; @@ -136,7 +142,9 @@ namespace embree static const int CPU_FEATURE_XMM_ENABLED = 1 << 25; static const int CPU_FEATURE_YMM_ENABLED = 1 << 26; static const int CPU_FEATURE_ZMM_ENABLED = 1 << 27; - + static const int CPU_FEATURE_NEON = 1 << 28; + static const int CPU_FEATURE_NEON_2X = 1 << 29; + /*! get CPU features */ int getCPUFeatures(); @@ -147,7 +155,7 @@ namespace embree std::string supportedTargetList (int isa); /*! 
ISAs */ - static const int SSE = CPU_FEATURE_SSE | CPU_FEATURE_XMM_ENABLED; + static const int SSE = CPU_FEATURE_SSE | CPU_FEATURE_XMM_ENABLED; static const int SSE2 = SSE | CPU_FEATURE_SSE2; static const int SSE3 = SSE2 | CPU_FEATURE_SSE3; static const int SSSE3 = SSE3 | CPU_FEATURE_SSSE3; @@ -158,6 +166,8 @@ namespace embree static const int AVX2 = AVXI | CPU_FEATURE_AVX2 | CPU_FEATURE_FMA3 | CPU_FEATURE_BMI1 | CPU_FEATURE_BMI2 | CPU_FEATURE_LZCNT; static const int AVX512KNL = AVX2 | CPU_FEATURE_AVX512F | CPU_FEATURE_AVX512PF | CPU_FEATURE_AVX512ER | CPU_FEATURE_AVX512CD | CPU_FEATURE_ZMM_ENABLED; static const int AVX512SKX = AVX2 | CPU_FEATURE_AVX512F | CPU_FEATURE_AVX512DQ | CPU_FEATURE_AVX512CD | CPU_FEATURE_AVX512BW | CPU_FEATURE_AVX512VL | CPU_FEATURE_ZMM_ENABLED; + static const int NEON = CPU_FEATURE_NEON | CPU_FEATURE_SSE | CPU_FEATURE_SSE2; + static const int NEON_2X = CPU_FEATURE_NEON_2X | AVX2; /*! converts ISA bitvector into a string */ std::string stringOfISA(int features); diff --git a/thirdparty/embree/common/sys/thread.cpp b/thirdparty/embree/common/sys/thread.cpp index 4d86853c47c2..f9ea5b7d9668 100644 --- a/thirdparty/embree/common/sys/thread.cpp +++ b/thirdparty/embree/common/sys/thread.cpp @@ -6,7 +6,11 @@ #include "string.h" #include +#if defined(__ARM_NEON) +#include "../math/SSE2NEON.h" +#else #include +#endif #if defined(PTHREADS_WIN32) #pragma comment (lib, "pthreadVC.lib") @@ -35,7 +39,7 @@ namespace embree GetActiveProcessorCountFunc pGetActiveProcessorCount = (GetActiveProcessorCountFunc)GetProcAddress(hlib, "GetActiveProcessorCount"); SetThreadGroupAffinityFunc pSetThreadGroupAffinity = (SetThreadGroupAffinityFunc)GetProcAddress(hlib, "SetThreadGroupAffinity"); SetThreadIdealProcessorExFunc pSetThreadIdealProcessorEx = (SetThreadIdealProcessorExFunc)GetProcAddress(hlib, "SetThreadIdealProcessorEx"); - if (pGetActiveProcessorGroupCount && pGetActiveProcessorCount && pSetThreadGroupAffinity && pSetThreadIdealProcessorEx) + if (pGetActiveProcessorGroupCount && pGetActiveProcessorCount && pSetThreadGroupAffinity && pSetThreadIdealProcessorEx) { int groups = pGetActiveProcessorGroupCount(); int totalProcessors = 0, group = 0, number = 0; @@ -48,7 +52,7 @@ namespace embree } totalProcessors += processors; } - + GROUP_AFFINITY groupAffinity; groupAffinity.Group = (WORD)group; groupAffinity.Mask = (KAFFINITY)(uint64_t(1) << number); @@ -57,15 +61,15 @@ namespace embree groupAffinity.Reserved[2] = 0; if (!pSetThreadGroupAffinity(thread, &groupAffinity, nullptr)) WARNING("SetThreadGroupAffinity failed"); // on purpose only a warning - + PROCESSOR_NUMBER processorNumber; processorNumber.Group = group; processorNumber.Number = number; processorNumber.Reserved = 0; if (!pSetThreadIdealProcessorEx(thread, &processorNumber, nullptr)) WARNING("SetThreadIdealProcessorEx failed"); // on purpose only a warning - } - else + } + else { if (!SetThreadAffinityMask(thread, DWORD_PTR(uint64_t(1) << affinity))) WARNING("SetThreadAffinityMask failed"); // on purpose only a warning @@ -79,10 +83,10 @@ namespace embree setAffinity(GetCurrentThread(), affinity); } - struct ThreadStartupData + struct ThreadStartupData { public: - ThreadStartupData (thread_func f, void* arg) + ThreadStartupData (thread_func f, void* arg) : f(f), arg(arg) {} public: thread_func f; @@ -95,6 +99,7 @@ namespace embree _mm_setcsr(_mm_getcsr() | /*FTZ:*/ (1<<15) | /*DAZ:*/ (1<<6)); parg->f(parg->arg); delete parg; + parg = nullptr; return 0; } @@ -120,12 +125,6 @@ namespace embree CloseHandle(HANDLE(tid)); } - /*! 
destroy a hardware thread by its handle */ - void destroyThread(thread_t tid) { - TerminateThread(HANDLE(tid),0); - CloseHandle(HANDLE(tid)); - } - /*! creates thread local storage */ tls_t createTls() { return tls_t(size_t(TlsAlloc())); @@ -160,16 +159,21 @@ namespace embree #include #include +#if defined(__ANDROID__) +#include +#endif + namespace embree { static MutexSys mutex; static std::vector threadIDs; - + +#if !defined(__ANDROID__) // TODO(LTE): Implement for Android target /* changes thread ID mapping such that we first fill up all thread on one core */ size_t mapThreadID(size_t threadID) { Lock lock(mutex); - + if (threadIDs.size() == 0) { /* parse thread/CPU topology */ @@ -181,11 +185,11 @@ namespace embree if (fs.fail()) break; int i; - while (fs >> i) + while (fs >> i) { if (std::none_of(threadIDs.begin(),threadIDs.end(),[&] (int id) { return id == i; })) threadIDs.push_back(i); - if (fs.peek() == ',') + if (fs.peek() == ',') fs.ignore(); } fs.close(); @@ -229,16 +233,21 @@ namespace embree return ID; } +#endif /*! set affinity of the calling thread */ void setAffinity(ssize_t affinity) { +#if defined(__ANDROID__) + // TODO(LTE): Implement +#else cpu_set_t cset; CPU_ZERO(&cset); size_t threadID = mapThreadID(affinity); CPU_SET(threadID, &cset); pthread_setaffinity_np(pthread_self(), sizeof(cset), &cset); +#endif } } #endif @@ -303,21 +312,21 @@ namespace embree namespace embree { - struct ThreadStartupData + struct ThreadStartupData { public: - ThreadStartupData (thread_func f, void* arg, int affinity) + ThreadStartupData (thread_func f, void* arg, int affinity) : f(f), arg(arg), affinity(affinity) {} - public: + public: thread_func f; void* arg; ssize_t affinity; }; - + static void* threadStartup(ThreadStartupData* parg) { _mm_setcsr(_mm_getcsr() | /*FTZ:*/ (1<<15) | /*DAZ:*/ (1<<6)); - + /*! Mac OS X does not support setting affinity at thread creation time */ #if defined(__MACOSX__) if (parg->affinity >= 0) @@ -326,6 +335,7 @@ namespace embree parg->f(parg->arg); delete parg; + parg = nullptr; return nullptr; } @@ -341,13 +351,13 @@ namespace embree pthread_t* tid = new pthread_t; if (pthread_create(tid,&attr,(void*(*)(void*))threadStartup,new ThreadStartupData(f,arg,threadID)) != 0) { pthread_attr_destroy(&attr); - delete tid; + delete tid; FATAL("pthread_create failed"); } pthread_attr_destroy(&attr); /* set affinity */ -#if defined(__LINUX__) +#if defined(__LINUX__) && !defined(__ANDROID__) if (threadID >= 0) { cpu_set_t cset; CPU_ZERO(&cset); @@ -379,14 +389,8 @@ namespace embree delete (pthread_t*)tid; } - /*! destroy a hardware thread by its handle */ - void destroyThread(thread_t tid) { - pthread_cancel(*(pthread_t*)tid); - delete (pthread_t*)tid; - } - /*! creates thread local storage */ - tls_t createTls() + tls_t createTls() { pthread_key_t* key = new pthread_key_t; if (pthread_key_create(key,nullptr) != 0) { @@ -398,14 +402,14 @@ namespace embree } /*! return the thread local storage pointer */ - void* getTls(tls_t tls) + void* getTls(tls_t tls) { assert(tls); return pthread_getspecific(*(pthread_key_t*)tls); } /*! set the thread local storage pointer */ - void setTls(tls_t tls, void* const ptr) + void setTls(tls_t tls, void* const ptr) { assert(tls); if (pthread_setspecific(*(pthread_key_t*)tls, ptr) != 0) @@ -413,7 +417,7 @@ namespace embree } /*! 
destroys thread local storage identifier */ - void destroyTls(tls_t tls) + void destroyTls(tls_t tls) { assert(tls); if (pthread_key_delete(*(pthread_key_t*)tls) != 0) diff --git a/thirdparty/embree/common/sys/thread.h b/thirdparty/embree/common/sys/thread.h index 5261a985eefa..45da6e6a704e 100644 --- a/thirdparty/embree/common/sys/thread.h +++ b/thirdparty/embree/common/sys/thread.h @@ -29,9 +29,6 @@ namespace embree /*! waits until the given thread has terminated */ void join(thread_t tid); - /*! destroy handle of a thread */ - void destroyThread(thread_t tid); - /*! type for handle to thread local storage */ typedef struct opaque_tls_t* tls_t; diff --git a/thirdparty/embree/common/tasking/taskscheduler.h b/thirdparty/embree/common/tasking/taskscheduler.h index 298d09255b2c..9940e068d064 100644 --- a/thirdparty/embree/common/tasking/taskscheduler.h +++ b/thirdparty/embree/common/tasking/taskscheduler.h @@ -5,6 +5,8 @@ #if defined(TASKING_INTERNAL) # include "taskschedulerinternal.h" +#elif defined(TASKING_GCD) && defined(BUILD_IOS) +# include "taskschedulergcd.h" #elif defined(TASKING_TBB) # include "taskschedulertbb.h" #elif defined(TASKING_PPL) diff --git a/thirdparty/embree/common/tasking/taskschedulergcd.h b/thirdparty/embree/common/tasking/taskschedulergcd.h new file mode 100644 index 000000000000..d31f8bb478fb --- /dev/null +++ b/thirdparty/embree/common/tasking/taskschedulergcd.h @@ -0,0 +1,49 @@ +#pragma once + +#include "../sys/platform.h" +#include "../sys/alloc.h" +#include "../sys/barrier.h" +#include "../sys/thread.h" +#include "../sys/mutex.h" +#include "../sys/condition.h" +#include "../sys/ref.h" + +#include + +namespace embree +{ + struct TaskScheduler + { + /*! initializes the task scheduler */ + static void create(size_t numThreads, bool set_affinity, bool start_threads); + + /*! destroys the task scheduler again */ + static void destroy() {} + + /* returns the ID of the current thread */ + static __forceinline size_t threadID() + { + return threadIndex(); + } + + /* returns the index (0..threadCount-1) of the current thread */ + static __forceinline size_t threadIndex() + { + currentThreadIndex = (currentThreadIndex + 1) % GCDNumThreads; + return currentThreadIndex; + } + + /* returns the total number of threads */ + static __forceinline size_t threadCount() + { + return GCDNumThreads; + } + + private: + static size_t GCDNumThreads; + static size_t currentThreadIndex; + + }; + +}; + diff --git a/thirdparty/embree/common/tasking/taskschedulerinternal.cpp b/thirdparty/embree/common/tasking/taskschedulerinternal.cpp index 2152e92f4462..ebf656d1a0c3 100644 --- a/thirdparty/embree/common/tasking/taskschedulerinternal.cpp +++ b/thirdparty/embree/common/tasking/taskschedulerinternal.cpp @@ -48,13 +48,15 @@ namespace embree { Task* prevTask = thread.task; thread.task = this; - try { - if (thread.scheduler->cancellingException == nullptr) + // -- GODOT start -- + // try { + // if (thread.scheduler->cancellingException == nullptr) closure->execute(); - } catch (...) { - if (thread.scheduler->cancellingException == nullptr) - thread.scheduler->cancellingException = std::current_exception(); - } + // } catch (...) 
{ + // if (thread.scheduler->cancellingException == nullptr) + // thread.scheduler->cancellingException = std::current_exception(); + // } + // -- GODOT end -- thread.task = prevTask; add_dependencies(-1); } @@ -152,6 +154,12 @@ namespace embree assert(newNumThreads); newNumThreads = min(newNumThreads, (size_t) getNumberOfLogicalThreads()); + // We are observing a few % gain by increasing number threads by 2 on aarch64. +#if defined(__aarch64__) && defined(BUILD_IOS) + numThreads = newNumThreads*2; +#else + numThreads = newNumThreads; +#endif numThreads = newNumThreads; if (!startThreads && !running) return; running = true; @@ -291,8 +299,11 @@ namespace embree size_t threadIndex = allocThreadIndex(); condition.wait(mutex, [&] () { return hasRootTask.load(); }); mutex.unlock(); - std::exception_ptr except = thread_loop(threadIndex); - if (except != nullptr) std::rethrow_exception(except); + // -- GODOT start -- + // std::exception_ptr except = thread_loop(threadIndex); + // if (except != nullptr) std::rethrow_exception(except); + thread_loop(threadIndex); + // -- GODOT end -- } void TaskScheduler::reset() { @@ -324,7 +335,10 @@ namespace embree return thread->scheduler->cancellingException == nullptr; } - std::exception_ptr TaskScheduler::thread_loop(size_t threadIndex) +// -- GODOT start -- +// std::exception_ptr TaskScheduler::thread_loop(size_t threadIndex) + void TaskScheduler::thread_loop(size_t threadIndex) +// -- GODOT end -- { /* allocate thread structure */ std::unique_ptr mthread(new Thread(threadIndex,this)); // too large for stack allocation @@ -347,9 +361,10 @@ namespace embree swapThread(oldThread); /* remember exception to throw */ - std::exception_ptr except = nullptr; - if (cancellingException != nullptr) except = cancellingException; - + // -- GODOT start -- + // std::exception_ptr except = nullptr; + // if (cancellingException != nullptr) except = cancellingException; + // -- GODOT end -- /* wait for all threads to terminate */ threadCounter--; #if defined(__WIN32__) @@ -367,7 +382,10 @@ namespace embree yield(); #endif } - return except; + // -- GODOT start -- + // return except; + return; + // -- GODOT end -- } bool TaskScheduler::steal_from_other_threads(Thread& thread) diff --git a/thirdparty/embree/common/tasking/taskschedulerinternal.h b/thirdparty/embree/common/tasking/taskschedulerinternal.h index ef4d65f6fd3f..8bd70b2b8cfd 100644 --- a/thirdparty/embree/common/tasking/taskschedulerinternal.h +++ b/thirdparty/embree/common/tasking/taskschedulerinternal.h @@ -123,7 +123,10 @@ namespace embree { size_t ofs = bytes + ((align - stackPtr) & (align-1)); if (stackPtr + ofs > CLOSURE_STACK_SIZE) - throw std::runtime_error("closure stack overflow"); + // -- GODOT start -- + // throw std::runtime_error("closure stack overflow"); + abort(); + // -- GODOT end -- stackPtr += ofs; return &stack[stackPtr-bytes]; } @@ -132,12 +135,16 @@ namespace embree __forceinline void push_right(Thread& thread, const size_t size, const Closure& closure) { if (right >= TASK_STACK_SIZE) - throw std::runtime_error("task stack overflow"); + // -- GODOT start -- + // throw std::runtime_error("task stack overflow"); + abort(); + // -- GODOT end -- /* allocate new task on right side of stack */ size_t oldStackPtr = stackPtr; TaskFunction* func = new (alloc(sizeof(ClosureTaskFunction))) ClosureTaskFunction(closure); - new (&tasks[right]) Task(func,thread.task,oldStackPtr,size); + /* gcc 8 or later fails to compile without explicit .load() */ + new (&(tasks[right.load()])) 
Task(func,thread.task,oldStackPtr,size); right++; /* also move left pointer */ @@ -238,7 +245,10 @@ namespace embree void wait_for_threads(size_t threadCount); /*! thread loop for all worker threads */ - std::exception_ptr thread_loop(size_t threadIndex); + // -- GODOT start -- + // std::exception_ptr thread_loop(size_t threadIndex); + void thread_loop(size_t threadIndex); + // -- GODOT end -- /*! steals a task from a different thread */ bool steal_from_other_threads(Thread& thread); diff --git a/thirdparty/embree/common/tasking/taskschedulertbb.h b/thirdparty/embree/common/tasking/taskschedulertbb.h index 369e5edf076b..98dba2687194 100644 --- a/thirdparty/embree/common/tasking/taskschedulertbb.h +++ b/thirdparty/embree/common/tasking/taskschedulertbb.h @@ -12,13 +12,7 @@ #include "../sys/ref.h" #if defined(__WIN32__) -// -- GODOT start -- -#if !defined(NOMINMAX) -// -- GODOT end -- # define NOMINMAX -// -- GODOT start -- -#endif -// -- GODOT end -- #endif // We need to define these to avoid implicit linkage against diff --git a/thirdparty/embree/include/embree3/rtcore_common.h b/thirdparty/embree/include/embree3/rtcore_common.h index bd2e7144dd26..890e06faa3c3 100644 --- a/thirdparty/embree/include/embree3/rtcore_common.h +++ b/thirdparty/embree/include/embree3/rtcore_common.h @@ -19,7 +19,7 @@ typedef int ssize_t; #endif #endif -#if defined(_WIN32) && defined(_MSC_VER) +#if defined(_WIN32) && !defined(__MINGW32__) # define RTC_ALIGN(...) __declspec(align(__VA_ARGS__)) #else # define RTC_ALIGN(...) __attribute__((aligned(__VA_ARGS__))) @@ -35,7 +35,7 @@ typedef int ssize_t; #endif #endif -#if defined(_WIN32) +#if defined(_WIN32) # define RTC_FORCEINLINE __forceinline #else # define RTC_FORCEINLINE inline __attribute__((always_inline)) @@ -224,13 +224,13 @@ RTC_FORCEINLINE void rtcInitIntersectContext(struct RTCIntersectContext* context } /* Point query structure for closest point query */ -struct RTC_ALIGN(16) RTCPointQuery +struct RTC_ALIGN(16) RTCPointQuery { float x; // x coordinate of the query point float y; // y coordinate of the query point float z; // z coordinate of the query point float time; // time of the point query - float radius; // radius of the point query + float radius; // radius of the point query }; /* Structure of a packet of 4 query points */ @@ -250,7 +250,7 @@ struct RTC_ALIGN(32) RTCPointQuery8 float y[8]; // y coordinate of the query point float z[8]; // z coordinate of the query point float time[8]; // time of the point query - float radius[8]; // radius ofr the point query + float radius[8]; // radius ofr the point query }; /* Structure of a packet of 16 query points */ @@ -269,11 +269,11 @@ struct RTC_ALIGN(16) RTCPointQueryContext { // accumulated 4x4 column major matrices from world space to instance space. // undefined if size == 0. - float world2inst[RTC_MAX_INSTANCE_LEVEL_COUNT][16]; + float world2inst[RTC_MAX_INSTANCE_LEVEL_COUNT][16]; // accumulated 4x4 column major matrices from instance space to world space. // undefined if size == 0. - float inst2world[RTC_MAX_INSTANCE_LEVEL_COUNT][16]; + float inst2world[RTC_MAX_INSTANCE_LEVEL_COUNT][16]; // instance ids. 
unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; @@ -301,13 +301,13 @@ struct RTC_ALIGN(16) RTCPointQueryFunctionArguments void* userPtr; // primitive and geometry ID of primitive - unsigned int primID; - unsigned int geomID; + unsigned int primID; + unsigned int geomID; // the context with transformation and instance ID stack struct RTCPointQueryContext* context; - // If the current instance transform M (= context->world2inst[context->instStackSize]) + // If the current instance transform M (= context->world2inst[context->instStackSize]) // is a similarity matrix, i.e there is a constant factor similarityScale such that, // for all x,y: dist(Mx, My) = similarityScale * dist(x, y), // The similarity scale is 0, if the current instance transform is not a @@ -322,5 +322,5 @@ struct RTC_ALIGN(16) RTCPointQueryFunctionArguments }; typedef bool (*RTCPointQueryFunction)(struct RTCPointQueryFunctionArguments* args); - + RTC_NAMESPACE_END diff --git a/thirdparty/embree/kernels/builders/bvh_builder_sah.h b/thirdparty/embree/kernels/builders/bvh_builder_sah.h index 79ccdf946f5e..3f7e678a1010 100644 --- a/thirdparty/embree/kernels/builders/bvh_builder_sah.h +++ b/thirdparty/embree/kernels/builders/bvh_builder_sah.h @@ -43,7 +43,7 @@ namespace embree { if (RTC_BUILD_ARGUMENTS_HAS(settings,maxBranchingFactor)) branchingFactor = settings.maxBranchingFactor; if (RTC_BUILD_ARGUMENTS_HAS(settings,maxDepth )) maxDepth = settings.maxDepth; - if (RTC_BUILD_ARGUMENTS_HAS(settings,sahBlockSize )) logBlockSize = bsr(settings.sahBlockSize); + if (RTC_BUILD_ARGUMENTS_HAS(settings,sahBlockSize )) logBlockSize = bsr(static_cast(settings.sahBlockSize)); if (RTC_BUILD_ARGUMENTS_HAS(settings,minLeafSize )) minLeafSize = settings.minLeafSize; if (RTC_BUILD_ARGUMENTS_HAS(settings,maxLeafSize )) maxLeafSize = settings.maxLeafSize; if (RTC_BUILD_ARGUMENTS_HAS(settings,traversalCost )) travCost = settings.traversalCost; diff --git a/thirdparty/embree/kernels/bvh/bvh.cpp b/thirdparty/embree/kernels/bvh/bvh.cpp index 9dbb3bcd732e..bd102bd6ef53 100644 --- a/thirdparty/embree/kernels/bvh/bvh.cpp +++ b/thirdparty/embree/kernels/bvh/bvh.cpp @@ -51,7 +51,7 @@ namespace embree template void BVHN::layoutLargeNodes(size_t num) { -#if defined(__X86_64__) // do not use tree rotations on 32 bit platforms, barrier bit in NodeRef will cause issues +#if defined(__X86_64__) || defined(__aarch64__) // do not use tree rotations on 32 bit platforms, barrier bit in NodeRef will cause issues struct NodeArea { __forceinline NodeArea() {} @@ -183,7 +183,7 @@ namespace embree template class BVHN<8>; #endif -#if !defined(__AVX__) || !defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42) +#if !defined(__AVX__) || !defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42) || defined(__aarch64__) template class BVHN<4>; #endif } diff --git a/thirdparty/embree/kernels/bvh/bvh.h b/thirdparty/embree/kernels/bvh/bvh.h index 7c1a45b63288..8fdf912e520b 100644 --- a/thirdparty/embree/kernels/bvh/bvh.h +++ b/thirdparty/embree/kernels/bvh/bvh.h @@ -81,7 +81,7 @@ namespace embree struct CreateAlloc : public FastAllocator::Create { __forceinline CreateAlloc (BVHN* bvh) : FastAllocator::Create(&bvh->alloc) {} }; - + typedef BVHNodeRecord NodeRecord; typedef BVHNodeRecordMB NodeRecordMB; typedef BVHNodeRecordMB4D NodeRecordMB4D; diff --git a/thirdparty/embree/kernels/bvh/bvh_builder_morton.cpp b/thirdparty/embree/kernels/bvh/bvh_builder_morton.cpp index fa1710428378..64759c1294a8 100644 --- a/thirdparty/embree/kernels/bvh/bvh_builder_morton.cpp +++ 
b/thirdparty/embree/kernels/bvh/bvh_builder_morton.cpp @@ -18,7 +18,7 @@ #include "../geometry/object.h" #include "../geometry/instance.h" -#if defined(__X86_64__) +#if defined(__X86_64__) || defined(__aarch64__) # define ROTATE_TREE 1 // specifies number of tree rotation rounds to perform #else # define ROTATE_TREE 0 // do not use tree rotations on 32 bit platforms, barrier bit in NodeRef will cause issues diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_stream.h b/thirdparty/embree/kernels/bvh/bvh_intersector_stream.h index f5beb6ca9110..83d1fb4d3dfc 100644 --- a/thirdparty/embree/kernels/bvh/bvh_intersector_stream.h +++ b/thirdparty/embree/kernels/bvh/bvh_intersector_stream.h @@ -172,12 +172,23 @@ namespace embree TravRayKStream &p = packets[rayID / K]; const size_t i = rayID % K; const vint bitmask(shiftTable[rayID]); + +#if defined (__aarch64__) + const vfloat tNearX = madd(bminX, p.rdir.x[i], p.neg_org_rdir.x[i]); + const vfloat tNearY = madd(bminY, p.rdir.y[i], p.neg_org_rdir.y[i]); + const vfloat tNearZ = madd(bminZ, p.rdir.z[i], p.neg_org_rdir.z[i]); + const vfloat tFarX = madd(bmaxX, p.rdir.x[i], p.neg_org_rdir.x[i]); + const vfloat tFarY = madd(bmaxY, p.rdir.y[i], p.neg_org_rdir.y[i]); + const vfloat tFarZ = madd(bmaxZ, p.rdir.z[i], p.neg_org_rdir.z[i]); +#else const vfloat tNearX = msub(bminX, p.rdir.x[i], p.org_rdir.x[i]); const vfloat tNearY = msub(bminY, p.rdir.y[i], p.org_rdir.y[i]); const vfloat tNearZ = msub(bminZ, p.rdir.z[i], p.org_rdir.z[i]); const vfloat tFarX = msub(bmaxX, p.rdir.x[i], p.org_rdir.x[i]); const vfloat tFarY = msub(bmaxY, p.rdir.y[i], p.org_rdir.y[i]); const vfloat tFarZ = msub(bmaxZ, p.rdir.z[i], p.org_rdir.z[i]); +#endif + const vfloat tNear = maxi(tNearX, tNearY, tNearZ, vfloat(p.tnear[i])); const vfloat tFar = mini(tFarX , tFarY , tFarZ, vfloat(p.tfar[i])); diff --git a/thirdparty/embree/kernels/bvh/bvh_node_ref.h b/thirdparty/embree/kernels/bvh/bvh_node_ref.h index 5efc9c72c7c0..0f6d4dac7e7a 100644 --- a/thirdparty/embree/kernels/bvh/bvh_node_ref.h +++ b/thirdparty/embree/kernels/bvh/bvh_node_ref.h @@ -102,7 +102,7 @@ namespace embree /*! Sets the barrier bit. */ __forceinline void setBarrier() { -#if defined(__X86_64__) +#if defined(__X86_64__) || defined(__aarch64__) assert(!isBarrier()); ptr |= barrier_mask; #else @@ -112,7 +112,7 @@ namespace embree /*! Clears the barrier bit. 
*/ __forceinline void clearBarrier() { -#if defined(__X86_64__) +#if defined(__X86_64__) || defined(__aarch64__) ptr &= ~barrier_mask; #else assert(false); diff --git a/thirdparty/embree/kernels/bvh/bvh_statistics.cpp b/thirdparty/embree/kernels/bvh/bvh_statistics.cpp index 05460843af22..aa5603502691 100644 --- a/thirdparty/embree/kernels/bvh/bvh_statistics.cpp +++ b/thirdparty/embree/kernels/bvh/bvh_statistics.cpp @@ -150,7 +150,10 @@ namespace embree } } else { - throw std::runtime_error("not supported node type in bvh_statistics"); + // -- GODOT start -- + // throw std::runtime_error("not supported node type in bvh_statistics"); + abort(); + // -- GODOT end -- } return s; } @@ -159,7 +162,7 @@ namespace embree template class BVHNStatistics<8>; #endif -#if !defined(__AVX__) || !defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42) +#if !defined(__AVX__) || (!defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42)) || defined(__aarch64__) template class BVHNStatistics<4>; #endif } diff --git a/thirdparty/embree/kernels/bvh/node_intersector1.h b/thirdparty/embree/kernels/bvh/node_intersector1.h index b1e63ce345ce..aa0d4ba4d746 100644 --- a/thirdparty/embree/kernels/bvh/node_intersector1.h +++ b/thirdparty/embree/kernels/bvh/node_intersector1.h @@ -5,6 +5,15 @@ #include "node_intersector.h" +#if defined(__AVX2__) +#define __FMA_X4__ +#endif + +#if defined(__aarch64__) +#define __FMA_X4__ +#endif + + namespace embree { namespace isa @@ -29,9 +38,15 @@ namespace embree org = Vec3vf(ray_org.x,ray_org.y,ray_org.z); dir = Vec3vf(ray_dir.x,ray_dir.y,ray_dir.z); rdir = Vec3vf(ray_rdir.x,ray_rdir.y,ray_rdir.z); -#if defined(__AVX2__) +#if defined(__FMA_X4__) const Vec3fa ray_org_rdir = ray_org*ray_rdir; +#if !defined(__aarch64__) org_rdir = Vec3vf(ray_org_rdir.x,ray_org_rdir.y,ray_org_rdir.z); +#else + //for aarch64, we do not have msub equal instruction, so we negeate orig and use madd + //x86 will use msub + neg_org_rdir = Vec3vf(-ray_org_rdir.x,-ray_org_rdir.y,-ray_org_rdir.z); +#endif #endif nearX = ray_rdir.x >= 0.0f ? 0*sizeof(vfloat) : 1*sizeof(vfloat); nearY = ray_rdir.y >= 0.0f ? 2*sizeof(vfloat) : 3*sizeof(vfloat); @@ -59,8 +74,12 @@ namespace embree org = Vec3vf(ray_org.x[k], ray_org.y[k], ray_org.z[k]); dir = Vec3vf(ray_dir.x[k], ray_dir.y[k], ray_dir.z[k]); rdir = Vec3vf(ray_rdir.x[k], ray_rdir.y[k], ray_rdir.z[k]); -#if defined(__AVX2__) - org_rdir = org*rdir; +#if defined(__FMA_X4__) +#if !defined(__aarch64__) + org_rdir = org*rdir; +#else + neg_org_rdir = -(org*rdir); +#endif #endif nearX = nearXYZ.x[k]; nearY = nearXYZ.y[k]; @@ -81,8 +100,14 @@ namespace embree Vec3fa org_xyz, dir_xyz; Vec3vf org, dir, rdir; -#if defined(__AVX2__) +#if defined(__FMA_X4__) +#if !defined(__aarch64__) Vec3vf org_rdir; +#else + //aarch64 version are keeping negation of the org_rdir and use madd + //x86 uses msub + Vec3vf neg_org_rdir; +#endif #endif #if defined(__AVX512ER__) // KNL+ vint16 permX, permY, permZ; @@ -110,7 +135,6 @@ namespace embree dir = Vec3vf(ray_dir.x,ray_dir.y,ray_dir.z); rdir_near = Vec3vf(ray_rdir_near.x,ray_rdir_near.y,ray_rdir_near.z); rdir_far = Vec3vf(ray_rdir_far .x,ray_rdir_far .y,ray_rdir_far .z); - nearX = ray_rdir_near.x >= 0.0f ? 0*sizeof(vfloat) : 1*sizeof(vfloat); nearY = ray_rdir_near.y >= 0.0f ? 2*sizeof(vfloat) : 3*sizeof(vfloat); nearZ = ray_rdir_near.z >= 0.0f ? 
4*sizeof(vfloat) : 5*sizeof(vfloat); @@ -447,13 +471,22 @@ namespace embree template<> __forceinline size_t intersectNode<4,4>(const typename BVH4::AABBNode* node, const TravRay<4,4,false>& ray, vfloat4& dist) { -#if defined(__AVX2__) +#if defined(__FMA_X4__) +#if defined(__aarch64__) + const vfloat4 tNearX = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.neg_org_rdir.x); + const vfloat4 tNearY = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.neg_org_rdir.y); + const vfloat4 tNearZ = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.neg_org_rdir.z); + const vfloat4 tFarX = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.neg_org_rdir.x); + const vfloat4 tFarY = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.neg_org_rdir.y); + const vfloat4 tFarZ = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.neg_org_rdir.z); +#else const vfloat4 tNearX = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.org_rdir.x); const vfloat4 tNearY = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.org_rdir.y); const vfloat4 tNearZ = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.org_rdir.z); const vfloat4 tFarX = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.org_rdir.x); const vfloat4 tFarY = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.org_rdir.y); const vfloat4 tFarZ = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.org_rdir.z); +#endif #else const vfloat4 tNearX = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir.x; const vfloat4 tNearY = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir.y; @@ -462,8 +495,13 @@ namespace embree const vfloat4 tFarY = (vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )) - ray.org.y) * ray.rdir.y; const vfloat4 tFarZ = (vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )) - ray.org.z) * ray.rdir.z; #endif - -#if defined(__SSE4_1__) && !defined(__AVX512F__) // up to HSW + +#if defined(__aarch64__) + const vfloat4 tNear = maxi(tNearX, tNearY, tNearZ, ray.tnear); + const vfloat4 tFar = mini(tFarX, tFarY, tFarZ, ray.tfar); + const vbool4 vmask = asInt(tNear) <= asInt(tFar); + const size_t mask = movemask(vmask); +#elif defined(__SSE4_1__) && !defined(__AVX512F__) // up to HSW const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); const vfloat4 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); const vbool4 vmask = asInt(tNear) > asInt(tFar); @@ -489,12 +527,22 @@ namespace embree __forceinline size_t intersectNode<8,8>(const typename BVH8::AABBNode* node, const TravRay<8,8,false>& ray, vfloat8& dist) { #if defined(__AVX2__) +#if defined(__aarch64__) + const vfloat8 tNearX = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.neg_org_rdir.x); + const vfloat8 tNearY = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.neg_org_rdir.y); + const vfloat8 tNearZ = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.neg_org_rdir.z); + const vfloat8 tFarX = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.neg_org_rdir.x); + 
const vfloat8 tFarY = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.neg_org_rdir.y); + const vfloat8 tFarZ = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.neg_org_rdir.z); +#else const vfloat8 tNearX = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.org_rdir.x); const vfloat8 tNearY = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.org_rdir.y); const vfloat8 tNearZ = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.org_rdir.z); const vfloat8 tFarX = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.org_rdir.x); const vfloat8 tFarY = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.org_rdir.y); const vfloat8 tFarZ = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.org_rdir.z); +#endif + #else const vfloat8 tNearX = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir.x; const vfloat8 tNearY = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir.y; @@ -638,13 +686,22 @@ namespace embree const vfloat* pFarX = (const vfloat*)((const char*)&node->lower_x+ray.farX); const vfloat* pFarY = (const vfloat*)((const char*)&node->lower_x+ray.farY); const vfloat* pFarZ = (const vfloat*)((const char*)&node->lower_x+ray.farZ); -#if defined(__AVX2__) +#if defined(__FMA_X4__) +#if defined(__aarch64__) + const vfloat tNearX = madd(madd(time,pNearX[6],vfloat(pNearX[0])), ray.rdir.x, ray.neg_org_rdir.x); + const vfloat tNearY = madd(madd(time,pNearY[6],vfloat(pNearY[0])), ray.rdir.y, ray.neg_org_rdir.y); + const vfloat tNearZ = madd(madd(time,pNearZ[6],vfloat(pNearZ[0])), ray.rdir.z, ray.neg_org_rdir.z); + const vfloat tFarX = madd(madd(time,pFarX [6],vfloat(pFarX [0])), ray.rdir.x, ray.neg_org_rdir.x); + const vfloat tFarY = madd(madd(time,pFarY [6],vfloat(pFarY [0])), ray.rdir.y, ray.neg_org_rdir.y); + const vfloat tFarZ = madd(madd(time,pFarZ [6],vfloat(pFarZ [0])), ray.rdir.z, ray.neg_org_rdir.z); +#else const vfloat tNearX = msub(madd(time,pNearX[6],vfloat(pNearX[0])), ray.rdir.x, ray.org_rdir.x); const vfloat tNearY = msub(madd(time,pNearY[6],vfloat(pNearY[0])), ray.rdir.y, ray.org_rdir.y); const vfloat tNearZ = msub(madd(time,pNearZ[6],vfloat(pNearZ[0])), ray.rdir.z, ray.org_rdir.z); const vfloat tFarX = msub(madd(time,pFarX [6],vfloat(pFarX [0])), ray.rdir.x, ray.org_rdir.x); const vfloat tFarY = msub(madd(time,pFarY [6],vfloat(pFarY [0])), ray.rdir.y, ray.org_rdir.y); const vfloat tFarZ = msub(madd(time,pFarZ [6],vfloat(pFarZ [0])), ray.rdir.z, ray.org_rdir.z); +#endif #else const vfloat tNearX = (madd(time,pNearX[6],vfloat(pNearX[0])) - ray.org.x) * ray.rdir.x; const vfloat tNearY = (madd(time,pNearY[6],vfloat(pNearY[0])) - ray.org.y) * ray.rdir.y; @@ -653,7 +710,7 @@ namespace embree const vfloat tFarY = (madd(time,pFarY [6],vfloat(pFarY [0])) - ray.org.y) * ray.rdir.y; const vfloat tFarZ = (madd(time,pFarZ [6],vfloat(pFarZ [0])) - ray.org.z) * ray.rdir.z; #endif -#if defined(__AVX2__) && !defined(__AVX512F__) // HSW +#if defined(__FMA_X4__) && !defined(__AVX512F__) // HSW const vfloat tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); const vfloat tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); const vbool vmask = asInt(tNear) > asInt(tFar); @@ -714,13 +771,22 @@ namespace embree const vfloat* pFarX = (const vfloat*)((const 
char*)&node->lower_x+ray.farX); const vfloat* pFarY = (const vfloat*)((const char*)&node->lower_x+ray.farY); const vfloat* pFarZ = (const vfloat*)((const char*)&node->lower_x+ray.farZ); -#if defined (__AVX2__) +#if defined (__FMA_X4__) +#if defined(__aarch64__) + const vfloat tNearX = madd(madd(time,pNearX[6],vfloat(pNearX[0])), ray.rdir.x, ray.neg_org_rdir.x); + const vfloat tNearY = madd(madd(time,pNearY[6],vfloat(pNearY[0])), ray.rdir.y, ray.neg_org_rdir.y); + const vfloat tNearZ = madd(madd(time,pNearZ[6],vfloat(pNearZ[0])), ray.rdir.z, ray.neg_org_rdir.z); + const vfloat tFarX = madd(madd(time,pFarX [6],vfloat(pFarX [0])), ray.rdir.x, ray.neg_org_rdir.x); + const vfloat tFarY = madd(madd(time,pFarY [6],vfloat(pFarY [0])), ray.rdir.y, ray.neg_org_rdir.y); + const vfloat tFarZ = madd(madd(time,pFarZ [6],vfloat(pFarZ [0])), ray.rdir.z, ray.neg_org_rdir.z); +#else const vfloat tNearX = msub(madd(time,pNearX[6],vfloat(pNearX[0])), ray.rdir.x, ray.org_rdir.x); const vfloat tNearY = msub(madd(time,pNearY[6],vfloat(pNearY[0])), ray.rdir.y, ray.org_rdir.y); const vfloat tNearZ = msub(madd(time,pNearZ[6],vfloat(pNearZ[0])), ray.rdir.z, ray.org_rdir.z); const vfloat tFarX = msub(madd(time,pFarX [6],vfloat(pFarX [0])), ray.rdir.x, ray.org_rdir.x); const vfloat tFarY = msub(madd(time,pFarY [6],vfloat(pFarY [0])), ray.rdir.y, ray.org_rdir.y); const vfloat tFarZ = msub(madd(time,pFarZ [6],vfloat(pFarZ [0])), ray.rdir.z, ray.org_rdir.z); +#endif #else const vfloat tNearX = (madd(time,pNearX[6],vfloat(pNearX[0])) - ray.org.x) * ray.rdir.x; const vfloat tNearY = (madd(time,pNearY[6],vfloat(pNearY[0])) - ray.org.y) * ray.rdir.y; @@ -729,7 +795,7 @@ namespace embree const vfloat tFarY = (madd(time,pFarY [6],vfloat(pFarY [0])) - ray.org.y) * ray.rdir.y; const vfloat tFarZ = (madd(time,pFarZ [6],vfloat(pFarZ [0])) - ray.org.z) * ray.rdir.z; #endif -#if defined(__AVX2__) && !defined(__AVX512F__) +#if defined(__FMA_X4__) && !defined(__AVX512F__) const vfloat tNear = maxi(maxi(tNearX,tNearY),maxi(tNearZ,ray.tnear)); const vfloat tFar = mini(mini(tFarX ,tFarY ),mini(tFarZ ,ray.tfar )); #else @@ -803,13 +869,22 @@ namespace embree const vfloat4 lower_z = madd(node->dequantize<4>(ray.nearZ >> 2),scale_z,start_z); const vfloat4 upper_z = madd(node->dequantize<4>(ray.farZ >> 2),scale_z,start_z); -#if defined(__AVX2__) +#if defined(__FMA_X4__) +#if defined(__aarch64__) + const vfloat4 tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat4 tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat4 tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z); + const vfloat4 tFarX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat4 tFarY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat4 tFarZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z); +#else const vfloat4 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x); const vfloat4 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y); const vfloat4 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z); const vfloat4 tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x); const vfloat4 tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y); const vfloat4 tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z); +#endif #else const vfloat4 tNearX = (lower_x - ray.org.x) * ray.rdir.x; const vfloat4 tNearY = (lower_y - ray.org.y) * ray.rdir.y; @@ -819,7 +894,7 @@ namespace embree const vfloat4 tFarZ = (upper_z - ray.org.z) * ray.rdir.z; #endif -#if defined(__SSE4_1__) && !defined(__AVX512F__) // up to HSW +#if (defined(__aarch64__) && 
defined(BUILD_IOS)) || defined(__SSE4_1__) && !defined(__AVX512F__) // up to HSW const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); const vfloat4 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); const vbool4 vmask = asInt(tNear) > asInt(tFar); @@ -892,12 +967,21 @@ namespace embree const vfloat8 upper_z = madd(node->dequantize<8>(ray.farZ >> 2),scale_z,start_z); #if defined(__AVX2__) +#if defined(__aarch64__) + const vfloat8 tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat8 tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat8 tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z); + const vfloat8 tFarX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat8 tFarY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat8 tFarZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z); +#else const vfloat8 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x); const vfloat8 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y); const vfloat8 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z); const vfloat8 tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x); const vfloat8 tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y); const vfloat8 tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z); +#endif #else const vfloat8 tNearX = (lower_x - ray.org.x) * ray.rdir.x; const vfloat8 tNearY = (lower_y - ray.org.y) * ray.rdir.y; @@ -1078,13 +1162,22 @@ namespace embree const vfloat upper_y = node->dequantizeUpperY(time); const vfloat lower_z = node->dequantizeLowerZ(time); const vfloat upper_z = node->dequantizeUpperZ(time); -#if defined(__AVX2__) +#if defined(__FMA_X4__) +#if defined(__aarch64__) + const vfloat tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z); + const vfloat tFarX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat tFarY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat tFarZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z); +#else const vfloat tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x); const vfloat tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y); const vfloat tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z); const vfloat tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x); const vfloat tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y); const vfloat tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z); +#endif #else const vfloat tNearX = (lower_x - ray.org.x) * ray.rdir.x; const vfloat tNearY = (lower_y - ray.org.y) * ray.rdir.y; diff --git a/thirdparty/embree/kernels/bvh/node_intersector_frustum.h b/thirdparty/embree/kernels/bvh/node_intersector_frustum.h index dbce46932495..800ac8b47824 100644 --- a/thirdparty/embree/kernels/bvh/node_intersector_frustum.h +++ b/thirdparty/embree/kernels/bvh/node_intersector_frustum.h @@ -81,9 +81,13 @@ namespace embree min_rdir = select(pos_rdir, reduced_min_rdir, reduced_max_rdir); max_rdir = select(pos_rdir, reduced_max_rdir, reduced_min_rdir); +#if defined (__aarch64__) + neg_min_org_rdir = -(min_rdir * select(pos_rdir, reduced_max_org, reduced_min_org)); + neg_max_org_rdir = -(max_rdir * select(pos_rdir, reduced_min_org, reduced_max_org)); +#else min_org_rdir = min_rdir * select(pos_rdir, reduced_max_org, reduced_min_org); max_org_rdir = max_rdir * select(pos_rdir, reduced_min_org, reduced_max_org); - +#endif min_dist = reduced_min_dist; max_dist = reduced_max_dist; @@ -101,9 +105,13 @@ namespace embree Vec3fa min_rdir; Vec3fa 
max_rdir; +#if defined (__aarch64__) + Vec3fa neg_min_org_rdir; + Vec3fa neg_max_org_rdir; +#else Vec3fa min_org_rdir; Vec3fa max_org_rdir; - +#endif float min_dist; float max_dist; }; @@ -203,13 +211,21 @@ namespace embree const vfloat bmaxY = *(const vfloat*)((const char*)&node->lower_x + frustum.nf.farY); const vfloat bmaxZ = *(const vfloat*)((const char*)&node->lower_x + frustum.nf.farZ); +#if defined (__aarch64__) + const vfloat fminX = madd(bminX, vfloat(frustum.min_rdir.x), vfloat(frustum.neg_min_org_rdir.x)); + const vfloat fminY = madd(bminY, vfloat(frustum.min_rdir.y), vfloat(frustum.neg_min_org_rdir.y)); + const vfloat fminZ = madd(bminZ, vfloat(frustum.min_rdir.z), vfloat(frustum.neg_min_org_rdir.z)); + const vfloat fmaxX = madd(bmaxX, vfloat(frustum.max_rdir.x), vfloat(frustum.neg_max_org_rdir.x)); + const vfloat fmaxY = madd(bmaxY, vfloat(frustum.max_rdir.y), vfloat(frustum.neg_max_org_rdir.y)); + const vfloat fmaxZ = madd(bmaxZ, vfloat(frustum.max_rdir.z), vfloat(frustum.neg_max_org_rdir.z)); +#else const vfloat fminX = msub(bminX, vfloat(frustum.min_rdir.x), vfloat(frustum.min_org_rdir.x)); const vfloat fminY = msub(bminY, vfloat(frustum.min_rdir.y), vfloat(frustum.min_org_rdir.y)); const vfloat fminZ = msub(bminZ, vfloat(frustum.min_rdir.z), vfloat(frustum.min_org_rdir.z)); const vfloat fmaxX = msub(bmaxX, vfloat(frustum.max_rdir.x), vfloat(frustum.max_org_rdir.x)); const vfloat fmaxY = msub(bmaxY, vfloat(frustum.max_rdir.y), vfloat(frustum.max_org_rdir.y)); const vfloat fmaxZ = msub(bmaxZ, vfloat(frustum.max_rdir.z), vfloat(frustum.max_org_rdir.z)); - +#endif const vfloat fmin = maxi(fminX, fminY, fminZ, vfloat(frustum.min_dist)); dist = fmin; const vfloat fmax = mini(fmaxX, fmaxY, fmaxZ, vfloat(frustum.max_dist)); diff --git a/thirdparty/embree/kernels/bvh/node_intersector_packet.h b/thirdparty/embree/kernels/bvh/node_intersector_packet.h index 1cc0d47fabd1..0543e56f8ea4 100644 --- a/thirdparty/embree/kernels/bvh/node_intersector_packet.h +++ b/thirdparty/embree/kernels/bvh/node_intersector_packet.h @@ -39,10 +39,11 @@ namespace embree org = ray_org; dir = ray_dir; rdir = rcp_safe(ray_dir); -#if defined(__AVX2__) +#if defined(__aarch64__) + neg_org_rdir = -(org * rdir); +#elif defined(__AVX2__) org_rdir = org * rdir; #endif - if (N) { const int size = sizeof(float)*N; @@ -55,7 +56,9 @@ namespace embree Vec3vf org; Vec3vf dir; Vec3vf rdir; -#if defined(__AVX2__) +#if defined(__aarch64__) + Vec3vf neg_org_rdir; +#elif defined(__AVX2__) Vec3vf org_rdir; #endif Vec3vi nearXYZ; @@ -119,7 +122,14 @@ namespace embree const TravRayKFast& ray, vfloat& dist) { - #if defined(__AVX2__) +#if defined(__aarch64__) + const vfloat lclipMinX = madd(node->lower_x[i], ray.rdir.x, ray.neg_org_rdir.x); + const vfloat lclipMinY = madd(node->lower_y[i], ray.rdir.y, ray.neg_org_rdir.y); + const vfloat lclipMinZ = madd(node->lower_z[i], ray.rdir.z, ray.neg_org_rdir.z); + const vfloat lclipMaxX = madd(node->upper_x[i], ray.rdir.x, ray.neg_org_rdir.x); + const vfloat lclipMaxY = madd(node->upper_y[i], ray.rdir.y, ray.neg_org_rdir.y); + const vfloat lclipMaxZ = madd(node->upper_z[i], ray.rdir.z, ray.neg_org_rdir.z); +#elif defined(__AVX2__) const vfloat lclipMinX = msub(node->lower_x[i], ray.rdir.x, ray.org_rdir.x); const vfloat lclipMinY = msub(node->lower_y[i], ray.rdir.y, ray.org_rdir.y); const vfloat lclipMinZ = msub(node->lower_z[i], ray.rdir.z, ray.org_rdir.z); @@ -199,7 +209,14 @@ namespace embree const vfloat vupper_y = madd(time, vfloat(node->upper_dy[i]), vfloat(node->upper_y[i])); const 
vfloat vupper_z = madd(time, vfloat(node->upper_dz[i]), vfloat(node->upper_z[i])); -#if defined(__AVX2__) +#if defined(__aarch64__) + const vfloat lclipMinX = madd(vlower_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat lclipMinY = madd(vlower_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat lclipMinZ = madd(vlower_z, ray.rdir.z, ray.neg_org_rdir.z); + const vfloat lclipMaxX = madd(vupper_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat lclipMaxY = madd(vupper_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat lclipMaxZ = madd(vupper_z, ray.rdir.z, ray.neg_org_rdir.z); +#elif defined(__AVX2__) const vfloat lclipMinX = msub(vlower_x, ray.rdir.x, ray.org_rdir.x); const vfloat lclipMinY = msub(vlower_y, ray.rdir.y, ray.org_rdir.y); const vfloat lclipMinZ = msub(vlower_z, ray.rdir.z, ray.org_rdir.z); @@ -302,7 +319,14 @@ namespace embree const vfloat vupper_y = madd(time, vfloat(node->upper_dy[i]), vfloat(node->upper_y[i])); const vfloat vupper_z = madd(time, vfloat(node->upper_dz[i]), vfloat(node->upper_z[i])); -#if defined(__AVX2__) +#if defined(__aarch64__) + const vfloat lclipMinX = madd(vlower_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat lclipMinY = madd(vlower_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat lclipMinZ = madd(vlower_z, ray.rdir.z, ray.neg_org_rdir.z); + const vfloat lclipMaxX = madd(vupper_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat lclipMaxY = madd(vupper_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat lclipMaxZ = madd(vupper_z, ray.rdir.z, ray.neg_org_rdir.z); +#elif defined(__AVX2__) const vfloat lclipMinX = msub(vlower_x, ray.rdir.x, ray.org_rdir.x); const vfloat lclipMinY = msub(vlower_y, ray.rdir.y, ray.org_rdir.y); const vfloat lclipMinZ = msub(vlower_z, ray.rdir.z, ray.org_rdir.z); @@ -464,7 +488,14 @@ namespace embree const vfloat lower_z = node->dequantizeLowerZ(); const vfloat upper_z = node->dequantizeUpperZ(); - #if defined(__AVX2__) + #if defined(__aarch64__) + const vfloat lclipMinX = madd(lower_x[i], ray.rdir.x, ray.neg_org_rdir.x); + const vfloat lclipMinY = madd(lower_y[i], ray.rdir.y, ray.neg_org_rdir.y); + const vfloat lclipMinZ = madd(lower_z[i], ray.rdir.z, ray.neg_org_rdir.z); + const vfloat lclipMaxX = madd(upper_x[i], ray.rdir.x, ray.neg_org_rdir.x); + const vfloat lclipMaxY = madd(upper_y[i], ray.rdir.y, ray.neg_org_rdir.y); + const vfloat lclipMaxZ = madd(upper_z[i], ray.rdir.z, ray.neg_org_rdir.z); + #elif defined(__AVX2__) const vfloat lclipMinX = msub(lower_x[i], ray.rdir.x, ray.org_rdir.x); const vfloat lclipMinY = msub(lower_y[i], ray.rdir.y, ray.org_rdir.y); const vfloat lclipMinZ = msub(lower_z[i], ray.rdir.z, ray.org_rdir.z); @@ -549,7 +580,14 @@ namespace embree const vfloat lower_z = node->dequantizeLowerZ(i,time); const vfloat upper_z = node->dequantizeUpperZ(i,time); -#if defined(__AVX2__) +#if defined(__aarch64__) + const vfloat lclipMinX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat lclipMinY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat lclipMinZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z); + const vfloat lclipMaxX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat lclipMaxY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat lclipMaxZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z); +#elif defined(__AVX2__) const vfloat lclipMinX = msub(lower_x, ray.rdir.x, ray.org_rdir.x); const vfloat lclipMinY = msub(lower_y, ray.rdir.y, ray.org_rdir.y); const vfloat lclipMinZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z); diff --git 
a/thirdparty/embree/kernels/bvh/node_intersector_packet_stream.h b/thirdparty/embree/kernels/bvh/node_intersector_packet_stream.h index c2b5b0cb7a47..f379b57aeac8 100644 --- a/thirdparty/embree/kernels/bvh/node_intersector_packet_stream.h +++ b/thirdparty/embree/kernels/bvh/node_intersector_packet_stream.h @@ -32,11 +32,19 @@ namespace embree __forceinline void init(const Vec3vf& ray_org, const Vec3vf& ray_dir) { rdir = rcp_safe(ray_dir); +#if defined(__aarch64__) + neg_org_rdir = -(ray_org * rdir); +#else org_rdir = ray_org * rdir; +#endif } Vec3vf rdir; +#if defined(__aarch64__) + Vec3vf neg_org_rdir; +#else Vec3vf org_rdir; +#endif vfloat tnear; vfloat tfar; }; @@ -87,12 +95,21 @@ namespace embree const vfloat bmaxY = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.farY)); const vfloat bmaxZ = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.farZ)); +#if defined (__aarch64__) + const vfloat rminX = madd(bminX, vfloat(ray.rdir.x[k]), vfloat(ray.neg_org_rdir.x[k])); + const vfloat rminY = madd(bminY, vfloat(ray.rdir.y[k]), vfloat(ray.neg_org_rdir.y[k])); + const vfloat rminZ = madd(bminZ, vfloat(ray.rdir.z[k]), vfloat(ray.neg_org_rdir.z[k])); + const vfloat rmaxX = madd(bmaxX, vfloat(ray.rdir.x[k]), vfloat(ray.neg_org_rdir.x[k])); + const vfloat rmaxY = madd(bmaxY, vfloat(ray.rdir.y[k]), vfloat(ray.neg_org_rdir.y[k])); + const vfloat rmaxZ = madd(bmaxZ, vfloat(ray.rdir.z[k]), vfloat(ray.neg_org_rdir.z[k])); +#else const vfloat rminX = msub(bminX, vfloat(ray.rdir.x[k]), vfloat(ray.org_rdir.x[k])); const vfloat rminY = msub(bminY, vfloat(ray.rdir.y[k]), vfloat(ray.org_rdir.y[k])); const vfloat rminZ = msub(bminZ, vfloat(ray.rdir.z[k]), vfloat(ray.org_rdir.z[k])); const vfloat rmaxX = msub(bmaxX, vfloat(ray.rdir.x[k]), vfloat(ray.org_rdir.x[k])); const vfloat rmaxY = msub(bmaxY, vfloat(ray.rdir.y[k]), vfloat(ray.org_rdir.y[k])); const vfloat rmaxZ = msub(bmaxZ, vfloat(ray.rdir.z[k]), vfloat(ray.org_rdir.z[k])); +#endif const vfloat rmin = maxi(rminX, rminY, rminZ, vfloat(ray.tnear[k])); const vfloat rmax = mini(rmaxX, rmaxY, rmaxZ, vfloat(ray.tfar[k])); @@ -113,12 +130,21 @@ namespace embree const vfloat bmaxY = *(const float*)(ptr + nf.farY); const vfloat bmaxZ = *(const float*)(ptr + nf.farZ); +#if defined (__aarch64__) + const vfloat rminX = madd(bminX, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat rminY = madd(bminY, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat rminZ = madd(bminZ, ray.rdir.z, ray.neg_org_rdir.z); + const vfloat rmaxX = madd(bmaxX, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat rmaxY = madd(bmaxY, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat rmaxZ = madd(bmaxZ, ray.rdir.z, ray.neg_org_rdir.z); +#else const vfloat rminX = msub(bminX, ray.rdir.x, ray.org_rdir.x); const vfloat rminY = msub(bminY, ray.rdir.y, ray.org_rdir.y); const vfloat rminZ = msub(bminZ, ray.rdir.z, ray.org_rdir.z); const vfloat rmaxX = msub(bmaxX, ray.rdir.x, ray.org_rdir.x); const vfloat rmaxY = msub(bmaxY, ray.rdir.y, ray.org_rdir.y); const vfloat rmaxZ = msub(bmaxZ, ray.rdir.z, ray.org_rdir.z); +#endif const vfloat rmin = maxi(rminX, rminY, rminZ, ray.tnear); const vfloat rmax = mini(rmaxX, rmaxY, rmaxZ, ray.tfar); diff --git a/thirdparty/embree/kernels/common/accel.h b/thirdparty/embree/kernels/common/accel.h index f332d3655516..c038d3cf21e3 100644 --- a/thirdparty/embree/kernels/common/accel.h +++ b/thirdparty/embree/kernels/common/accel.h @@ -332,7 +332,7 @@ namespace embree intersectorN.intersect(this,rayN,N,context); } -#if defined(__SSE__) +#if defined(__SSE__) || 
defined(__ARM_NEON) __forceinline void intersect(const vbool4& valid, RayHitK<4>& ray, IntersectContext* context) { const vint<4> mask = valid.mask32(); intersect4(&mask,(RTCRayHit4&)ray,context); @@ -388,7 +388,7 @@ namespace embree intersectorN.occluded(this,rayN,N,context); } -#if defined(__SSE__) +#if defined(__SSE__) || defined(__ARM_NEON) __forceinline void occluded(const vbool4& valid, RayK<4>& ray, IntersectContext* context) { const vint<4> mask = valid.mask32(); occluded4(&mask,(RTCRay4&)ray,context); diff --git a/thirdparty/embree/kernels/common/acceln.cpp b/thirdparty/embree/kernels/common/acceln.cpp index c9f7e921932e..aadb4a64efb1 100644 --- a/thirdparty/embree/kernels/common/acceln.cpp +++ b/thirdparty/embree/kernels/common/acceln.cpp @@ -97,7 +97,7 @@ namespace embree for (size_t i=0; iaccels.size(); i++) { if (This->accels[i]->isEmpty()) continue; This->accels[i]->intersectors.occluded4(valid,ray,context); -#if defined(__SSE2__) +#if defined(__SSE2__) || defined(__ARM_NEON) vbool4 valid0 = asBool(((vint4*)valid)[0]); vbool4 hit0 = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero); if (unlikely(none(valid0 & hit0))) break; @@ -111,7 +111,7 @@ namespace embree for (size_t i=0; iaccels.size(); i++) { if (This->accels[i]->isEmpty()) continue; This->accels[i]->intersectors.occluded8(valid,ray,context); -#if defined(__SSE2__) // FIXME: use higher ISA +#if defined(__SSE2__) || defined(__ARM_NEON) // FIXME: use higher ISA vbool4 valid0 = asBool(((vint4*)valid)[0]); vbool4 hit0 = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero); vbool4 valid1 = asBool(((vint4*)valid)[1]); @@ -127,7 +127,7 @@ namespace embree for (size_t i=0; iaccels.size(); i++) { if (This->accels[i]->isEmpty()) continue; This->accels[i]->intersectors.occluded16(valid,ray,context); -#if defined(__SSE2__) // FIXME: use higher ISA +#if defined(__SSE2__) || defined(__ARM_NEON) // FIXME: use higher ISA vbool4 valid0 = asBool(((vint4*)valid)[0]); vbool4 hit0 = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero); vbool4 valid1 = asBool(((vint4*)valid)[1]); diff --git a/thirdparty/embree/kernels/common/alloc.cpp b/thirdparty/embree/kernels/common/alloc.cpp index f958a16f5653..6fa406f03a6c 100644 --- a/thirdparty/embree/kernels/common/alloc.cpp +++ b/thirdparty/embree/kernels/common/alloc.cpp @@ -3,6 +3,9 @@ #include "alloc.h" #include "../../common/sys/thread.h" +#if defined(__aarch64__) && defined(BUILD_IOS) +#include "../../common/sys/barrier.h" +#endif namespace embree { diff --git a/thirdparty/embree/kernels/common/alloc.h b/thirdparty/embree/kernels/common/alloc.h index 3a5bb966b880..488fa707eff2 100644 --- a/thirdparty/embree/kernels/common/alloc.h +++ b/thirdparty/embree/kernels/common/alloc.h @@ -8,6 +8,10 @@ #include "scene.h" #include "primref.h" +#if defined(__aarch64__) && defined(BUILD_IOS) +#include +#endif + namespace embree { class FastAllocator @@ -26,7 +30,7 @@ namespace embree public: struct ThreadLocal2; - enum AllocationType { ALIGNED_MALLOC, OS_MALLOC, SHARED, ANY_TYPE }; + enum AllocationType { ALIGNED_MALLOC, EMBREE_OS_MALLOC, SHARED, ANY_TYPE }; /*! Per thread structure holding the current memory block. 
*/ struct __aligned(64) ThreadLocal @@ -132,7 +136,11 @@ namespace embree { assert(alloc_i); if (alloc.load() == alloc_i) return; +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(mutex); +#else Lock lock(mutex); +#endif //if (alloc.load() == alloc_i) return; // not required as only one thread calls bind if (alloc.load()) { alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes(); @@ -150,7 +158,11 @@ namespace embree { assert(alloc_i); if (alloc.load() != alloc_i) return; +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(mutex); +#else Lock lock(mutex); +#endif if (alloc.load() != alloc_i) return; // required as a different thread calls unbind alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes(); alloc.load()->bytesFree += alloc0.getFreeBytes() + alloc1.getFreeBytes(); @@ -161,7 +173,11 @@ namespace embree } public: +#if defined(__aarch64__) && defined(BUILD_IOS) + std::mutex mutex; +#else SpinLock mutex; //!< required as unbind is called from other threads +#endif std::atomic alloc; //!< parent allocator ThreadLocal alloc0; ThreadLocal alloc1; @@ -169,7 +185,7 @@ namespace embree FastAllocator (Device* device, bool osAllocation) : device(device), slotMask(0), usedBlocks(nullptr), freeBlocks(nullptr), use_single_mode(false), defaultBlockSize(PAGE_SIZE), estimatedSize(0), - growSize(PAGE_SIZE), maxGrowSize(maxAllocationSize), log2_grow_size_scale(0), bytesUsed(0), bytesFree(0), bytesWasted(0), atype(osAllocation ? OS_MALLOC : ALIGNED_MALLOC), + growSize(PAGE_SIZE), maxGrowSize(maxAllocationSize), log2_grow_size_scale(0), bytesUsed(0), bytesFree(0), bytesWasted(0), atype(osAllocation ? EMBREE_OS_MALLOC : ALIGNED_MALLOC), primrefarray(device,0) { for (size_t i=0; i lock(s_thread_local_allocators_lock); +#endif s_thread_local_allocators.push_back(make_unique(alloc)); } return alloc; @@ -227,7 +247,11 @@ namespace embree __forceinline void join(ThreadLocal2* alloc) { +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(s_thread_local_allocators_lock); +#else Lock lock(thread_local_allocators_lock); +#endif thread_local_allocators.push_back(alloc); } @@ -496,7 +520,11 @@ namespace embree /* parallel block creation in case of no freeBlocks, avoids single global mutex */ if (likely(freeBlocks.load() == nullptr)) { +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(slotMutex[slot]); +#else Lock lock(slotMutex[slot]); +#endif if (myUsedBlocks == threadUsedBlocks[slot]) { const size_t alignedBytes = (bytes+(align-1)) & ~(align-1); const size_t allocSize = max(min(growSize,maxGrowSize),alignedBytes); @@ -509,7 +537,11 @@ namespace embree /* if this fails allocate new block */ { - Lock lock(mutex); +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(mutex); +#else + Lock lock(mutex); +#endif if (myUsedBlocks == threadUsedBlocks[slot]) { if (freeBlocks.load() != nullptr) { @@ -531,7 +563,11 @@ namespace embree /*! 
add new block */ void addBlock(void* ptr, ssize_t bytes) { +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(mutex); +#else Lock lock(mutex); +#endif const size_t sizeof_Header = offsetof(Block,data[0]); void* aptr = (void*) ((((size_t)ptr)+maxAlignment-1) & ~(maxAlignment-1)); size_t ofs = (size_t) aptr - (size_t) ptr; @@ -617,8 +653,8 @@ namespace embree bytesWasted(alloc->bytesWasted), stat_all(alloc,ANY_TYPE), stat_malloc(alloc,ALIGNED_MALLOC), - stat_4K(alloc,OS_MALLOC,false), - stat_2M(alloc,OS_MALLOC,true), + stat_4K(alloc,EMBREE_OS_MALLOC,false), + stat_2M(alloc,EMBREE_OS_MALLOC,true), stat_shared(alloc,SHARED) {} AllStatistics (size_t bytesUsed, @@ -711,7 +747,7 @@ namespace embree /* We avoid using os_malloc for small blocks as this could * cause a risk of fragmenting the virtual address space and * reach the limit of vm.max_map_count = 65k under Linux. */ - if (atype == OS_MALLOC && bytesAllocate < maxAllocationSize) + if (atype == EMBREE_OS_MALLOC && bytesAllocate < maxAllocationSize) atype = ALIGNED_MALLOC; /* we need to additionally allocate some header */ @@ -720,7 +756,7 @@ namespace embree bytesReserve = sizeof_Header+bytesReserve; /* consume full 4k pages with using os_malloc */ - if (atype == OS_MALLOC) { + if (atype == EMBREE_OS_MALLOC) { bytesAllocate = ((bytesAllocate+PAGE_SIZE-1) & ~(PAGE_SIZE-1)); bytesReserve = ((bytesReserve +PAGE_SIZE-1) & ~(PAGE_SIZE-1)); } @@ -752,11 +788,11 @@ namespace embree return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment); } } - else if (atype == OS_MALLOC) + else if (atype == EMBREE_OS_MALLOC) { if (device) device->memoryMonitor(bytesAllocate,false); bool huge_pages; ptr = os_malloc(bytesReserve,huge_pages); - return new (ptr) Block(OS_MALLOC,bytesAllocate-sizeof_Header,bytesReserve-sizeof_Header,next,0,huge_pages); + return new (ptr) Block(EMBREE_OS_MALLOC,bytesAllocate-sizeof_Header,bytesReserve-sizeof_Header,next,0,huge_pages); } else assert(false); @@ -800,7 +836,7 @@ namespace embree if (device) device->memoryMonitor(-sizeof_Alloced,true); } - else if (atype == OS_MALLOC) { + else if (atype == EMBREE_OS_MALLOC) { size_t sizeof_This = sizeof_Header+reserveEnd; os_free(this,sizeof_This,huge_pages); if (device) device->memoryMonitor(-sizeof_Alloced,true); @@ -861,7 +897,7 @@ namespace embree bool hasType(AllocationType atype_i, bool huge_pages_i) const { if (atype_i == ANY_TYPE ) return true; - else if (atype == OS_MALLOC) return atype_i == atype && huge_pages_i == huge_pages; + else if (atype == EMBREE_OS_MALLOC) return atype_i == atype && huge_pages_i == huge_pages; else return atype_i == atype; } @@ -910,7 +946,7 @@ namespace embree void print_block() const { if (atype == ALIGNED_MALLOC) std::cout << "A"; - else if (atype == OS_MALLOC) std::cout << "O"; + else if (atype == EMBREE_OS_MALLOC) std::cout << "O"; else if (atype == SHARED) std::cout << "S"; if (huge_pages) std::cout << "H"; size_t bytesUsed = getBlockUsedBytes(); @@ -940,7 +976,11 @@ namespace embree std::atomic freeBlocks; std::atomic threadBlocks[MAX_THREAD_USED_BLOCK_SLOTS]; +#if defined(__aarch64__) && defined(BUILD_IOS) + std::mutex slotMutex[MAX_THREAD_USED_BLOCK_SLOTS]; +#else SpinLock slotMutex[MAX_THREAD_USED_BLOCK_SLOTS]; +#endif bool use_single_mode; size_t defaultBlockSize; @@ -954,7 +994,11 @@ namespace embree static __thread ThreadLocal2* thread_local_allocator2; static SpinLock s_thread_local_allocators_lock; static std::vector> s_thread_local_allocators; +#if 
defined(__aarch64__) && defined(BUILD_IOS) + std::mutex thread_local_allocators_lock; +#else SpinLock thread_local_allocators_lock; +#endif std::vector thread_local_allocators; AllocationType atype; mvector primrefarray; //!< primrefarray used to allocate nodes diff --git a/thirdparty/embree/kernels/common/default.h b/thirdparty/embree/kernels/common/default.h index 3db53413bc57..709119163b8e 100644 --- a/thirdparty/embree/kernels/common/default.h +++ b/thirdparty/embree/kernels/common/default.h @@ -55,6 +55,11 @@ #include #include +#if !defined(_DEBUG) && defined(BUILD_IOS) +#undef assert +#define assert(_EXPR) +#endif + namespace embree { //////////////////////////////////////////////////////////////////////////////// diff --git a/thirdparty/embree/kernels/common/device.cpp b/thirdparty/embree/kernels/common/device.cpp index d8c3d9c748f5..16ec11b89295 100644 --- a/thirdparty/embree/kernels/common/device.cpp +++ b/thirdparty/embree/kernels/common/device.cpp @@ -221,6 +221,9 @@ namespace embree #if defined(TASKING_INTERNAL) std::cout << "internal_tasking_system "; #endif +#if defined(TASKING_GCD) && defined(BUILD_IOS) + std::cout << "GCD tasking system "; +#endif #if defined(TASKING_PPL) std::cout << "PPL "; #endif @@ -503,6 +506,10 @@ namespace embree #if defined(TASKING_PPL) case RTC_DEVICE_PROPERTY_TASKING_SYSTEM: return 2; #endif + +#if defined(TASKING_GCD) && defined(BUILD_IOS) + case RTC_DEVICE_PROPERTY_TASKING_SYSTEM: return 3; +#endif #if defined(EMBREE_GEOMETRY_TRIANGLE) case RTC_DEVICE_PROPERTY_TRIANGLE_GEOMETRY_SUPPORTED: return 1; diff --git a/thirdparty/embree/kernels/common/isa.h b/thirdparty/embree/kernels/common/isa.h index 9fd1ea58b749..63fb8d335106 100644 --- a/thirdparty/embree/kernels/common/isa.h +++ b/thirdparty/embree/kernels/common/isa.h @@ -46,7 +46,7 @@ namespace embree #define SELECT_SYMBOL_DEFAULT(features,intersector) \ intersector = isa::intersector; -#if defined(__SSE__) +#if defined(__SSE__) || defined(__ARM_NEON) #if !defined(EMBREE_TARGET_SIMD4) #define EMBREE_TARGET_SIMD4 #endif diff --git a/thirdparty/embree/kernels/common/primref.h b/thirdparty/embree/kernels/common/primref.h index 3d4f9c0d44d8..ce75c982bbe7 100644 --- a/thirdparty/embree/kernels/common/primref.h +++ b/thirdparty/embree/kernels/common/primref.h @@ -29,7 +29,7 @@ namespace embree __forceinline PrimRef (const BBox3fa& bounds, size_t id) { -#if defined(__X86_64__) +#if defined(__X86_64__) || defined(__aarch64__) lower = Vec3fx(bounds.lower, (unsigned)(id & 0xFFFFFFFF)); upper = Vec3fx(bounds.upper, (unsigned)((id >> 32) & 0xFFFFFFFF)); #else @@ -79,7 +79,7 @@ namespace embree /*! 
returns an size_t sized ID */ __forceinline size_t ID() const { -#if defined(__X86_64__) +#if defined(__X86_64__) || defined(__aarch64__) return size_t(lower.u) + (size_t(upper.u) << 32); #else return size_t(lower.u); diff --git a/thirdparty/embree/kernels/common/primref_mb.h b/thirdparty/embree/kernels/common/primref_mb.h index 97a3fbe4e6b7..b6c1ad57121f 100644 --- a/thirdparty/embree/kernels/common/primref_mb.h +++ b/thirdparty/embree/kernels/common/primref_mb.h @@ -32,7 +32,7 @@ namespace embree : lbounds((LBBox3fx)lbounds_i), time_range(time_range) { assert(activeTimeSegments > 0); -#if defined(__X86_64__) +#if defined(__X86_64__) || defined(__aarch64__) lbounds.bounds0.lower.a = id & 0xFFFFFFFF; lbounds.bounds0.upper.a = (id >> 32) & 0xFFFFFFFF; #else @@ -47,7 +47,7 @@ namespace embree : lbounds((LBBox3fx)lbounds_i), time_range(time_range) { assert(activeTimeSegments > 0); -#if defined(__X86_64__) +#if defined(__X86_64__) || defined(__aarch64__) lbounds.bounds0.lower.u = id & 0xFFFFFFFF; lbounds.bounds0.upper.u = (id >> 32) & 0xFFFFFFFF; #else @@ -115,7 +115,7 @@ namespace embree /*! returns an size_t sized ID */ __forceinline size_t ID() const { -#if defined(__X86_64__) +#if defined(__X86_64__) || defined(__aarch64__) return size_t(lbounds.bounds0.lower.u) + (size_t(lbounds.bounds0.upper.u) << 32); #else return size_t(lbounds.bounds0.lower.u); @@ -163,7 +163,7 @@ namespace embree : bbox(bounds.interpolate(0.5f)), _activeTimeSegments(activeTimeSegments), _totalTimeSegments(totalTimeSegments), time_range(time_range) { assert(activeTimeSegments > 0); -#if defined(__X86_64__) +#if defined(__X86_64__) || defined(__aarch64__) bbox.lower.u = id & 0xFFFFFFFF; bbox.upper.u = (id >> 32) & 0xFFFFFFFF; #else @@ -229,7 +229,7 @@ namespace embree /*! returns an size_t sized ID */ __forceinline size_t ID() const { -#if defined(__X86_64__) +#if defined(__X86_64__) || defined(__aarch64__) return size_t(bbox.lower.u) + (size_t(bbox.upper.u) << 32); #else return size_t(bbox.lower.u); diff --git a/thirdparty/embree/kernels/common/rtcore.cpp b/thirdparty/embree/kernels/common/rtcore.cpp index 2cfb466a60c5..625fbf6d4f20 100644 --- a/thirdparty/embree/kernels/common/rtcore.cpp +++ b/thirdparty/embree/kernels/common/rtcore.cpp @@ -8,18 +8,31 @@ #include "scene.h" #include "context.h" #include "../../include/embree3/rtcore_ray.h" + +#if defined(__aarch64__) && defined(BUILD_IOS) +#include +#endif + using namespace embree; RTC_NAMESPACE_BEGIN; /* mutex to make API thread safe */ - static MutexSys g_mutex; +#if defined(__aarch64__) && defined(BUILD_IOS) + static std::mutex g_mutex; +#else + static MutexSys g_mutex; +#endif RTC_API RTCDevice rtcNewDevice(const char* config) { RTC_CATCH_BEGIN; RTC_TRACE(rtcNewDevice); +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(g_mutex); +#else Lock lock(g_mutex); +#endif Device* device = new Device(config); return (RTCDevice) device->refInc(); RTC_CATCH_END(nullptr); @@ -32,7 +45,11 @@ RTC_NAMESPACE_BEGIN; RTC_CATCH_BEGIN; RTC_TRACE(rtcRetainDevice); RTC_VERIFY_HANDLE(hdevice); +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(g_mutex); +#else Lock lock(g_mutex); +#endif device->refInc(); RTC_CATCH_END(nullptr); } @@ -43,7 +60,11 @@ RTC_NAMESPACE_BEGIN; RTC_CATCH_BEGIN; RTC_TRACE(rtcReleaseDevice); RTC_VERIFY_HANDLE(hdevice); +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(g_mutex); +#else Lock lock(g_mutex); +#endif device->refDec(); RTC_CATCH_END(nullptr); } @@ -54,7 +75,11 @@ RTC_NAMESPACE_BEGIN; 
RTC_CATCH_BEGIN; RTC_TRACE(rtcGetDeviceProperty); RTC_VERIFY_HANDLE(hdevice); +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(g_mutex); +#else Lock lock(g_mutex); +#endif return device->getProperty(prop); RTC_CATCH_END(device); return 0; @@ -67,7 +92,11 @@ RTC_NAMESPACE_BEGIN; RTC_TRACE(rtcSetDeviceProperty); const bool internal_prop = (size_t)prop >= 1000000 && (size_t)prop < 1000004; if (!internal_prop) RTC_VERIFY_HANDLE(hdevice); // allow NULL device for special internal settings +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(g_mutex); +#else Lock lock(g_mutex); +#endif device->setProperty(prop,val); RTC_CATCH_END(device); } @@ -183,7 +212,11 @@ RTC_NAMESPACE_BEGIN; RTC_CATCH_BEGIN; RTC_TRACE(rtcSetSceneProgressMonitorFunction); RTC_VERIFY_HANDLE(hscene); +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(g_mutex); +#else Lock lock(g_mutex); +#endif scene->setProgressMonitorFunction(progress,ptr); RTC_CATCH_END2(scene); } @@ -197,7 +230,10 @@ RTC_NAMESPACE_BEGIN; if (quality != RTC_BUILD_QUALITY_LOW && quality != RTC_BUILD_QUALITY_MEDIUM && quality != RTC_BUILD_QUALITY_HIGH) - throw std::runtime_error("invalid build quality"); + // -- GODOT start -- + // throw std::runtime_error("invalid build quality"); + abort(); + // -- GODOT end -- scene->setBuildQuality(quality); RTC_CATCH_END2(scene); } @@ -479,12 +515,12 @@ RTC_NAMESPACE_BEGIN; IntersectContext context(scene,user_context); #if !defined(EMBREE_RAY_PACKETS) - Ray4* ray4 = (Ray4*) rayhit; + RayHit4* rayhit4 = (RayHit4*)rayhit; for (size_t i=0; i<4; i++) { if (!valid[i]) continue; - RayHit ray1; ray4->get(i,ray1); + RayHit ray1; rayhit4->get(i,ray1); scene->intersectors.intersect((RTCRayHit&)ray1,&context); - ray4->set(i,ray1); + rayhit4->set(i,ray1); } #else scene->intersectors.intersect4(valid,*rayhit,&context); @@ -510,12 +546,12 @@ RTC_NAMESPACE_BEGIN; IntersectContext context(scene,user_context); #if !defined(EMBREE_RAY_PACKETS) - Ray8* ray8 = (Ray8*) rayhit; + RayHit8* rayhit8 = (RayHit8*) rayhit; for (size_t i=0; i<8; i++) { if (!valid[i]) continue; - RayHit ray1; ray8->get(i,ray1); + RayHit ray1; rayhit8->get(i,ray1); scene->intersectors.intersect((RTCRayHit&)ray1,&context); - ray8->set(i,ray1); + rayhit8->set(i,ray1); } #else if (likely(scene->intersectors.intersector8)) @@ -543,12 +579,12 @@ RTC_NAMESPACE_BEGIN; IntersectContext context(scene,user_context); #if !defined(EMBREE_RAY_PACKETS) - Ray16* ray16 = (Ray16*) rayhit; + RayHit16* rayhit16 = (RayHit16*) rayhit; for (size_t i=0; i<16; i++) { if (!valid[i]) continue; - RayHit ray1; ray16->get(i,ray1); + RayHit ray1; rayhit16->get(i,ray1); scene->intersectors.intersect((RTCRayHit&)ray1,&context); - ray16->set(i,ray1); + rayhit16->set(i,ray1); } #else if (likely(scene->intersectors.intersector16)) @@ -730,12 +766,12 @@ RTC_NAMESPACE_BEGIN; IntersectContext context(scene,user_context); #if !defined(EMBREE_RAY_PACKETS) - RayHit4* ray4 = (RayHit4*) ray; + Ray4* ray4 = (Ray4*) ray; for (size_t i=0; i<4; i++) { if (!valid[i]) continue; - RayHit ray1; ray4->get(i,ray1); + Ray ray1; ray4->get(i,ray1); scene->intersectors.occluded((RTCRay&)ray1,&context); - ray4->geomID[i] = ray1.geomID; + ray4->set(i,ray1); } #else scene->intersectors.occluded4(valid,*ray,&context); @@ -761,10 +797,10 @@ RTC_NAMESPACE_BEGIN; IntersectContext context(scene,user_context); #if !defined(EMBREE_RAY_PACKETS) - RayHit8* ray8 = (RayHit8*) ray; + Ray8* ray8 = (Ray8*) ray; for (size_t i=0; i<8; i++) { if (!valid[i]) continue; - RayHit 
ray1; ray8->get(i,ray1); + Ray ray1; ray8->get(i,ray1); scene->intersectors.occluded((RTCRay&)ray1,&context); ray8->set(i,ray1); } @@ -795,10 +831,10 @@ RTC_NAMESPACE_BEGIN; IntersectContext context(scene,user_context); #if !defined(EMBREE_RAY_PACKETS) - RayHit16* ray16 = (RayHit16*) ray; + Ray16* ray16 = (Ray16*) ray; for (size_t i=0; i<16; i++) { if (!valid[i]) continue; - RayHit ray1; ray16->get(i,ray1); + Ray ray1; ray16->get(i,ray1); scene->intersectors.occluded((RTCRay&)ray1,&context); ray16->set(i,ray1); } @@ -1350,7 +1386,10 @@ RTC_NAMESPACE_BEGIN; quality != RTC_BUILD_QUALITY_MEDIUM && quality != RTC_BUILD_QUALITY_HIGH && quality != RTC_BUILD_QUALITY_REFIT) - throw std::runtime_error("invalid build quality"); + // -- GODOT start -- + // throw std::runtime_error("invalid build quality"); + abort(); + // -- GODOT end -- geometry->setBuildQuality(quality); RTC_CATCH_END2(geometry); } diff --git a/thirdparty/embree/kernels/common/rtcore.h b/thirdparty/embree/kernels/common/rtcore.h index 6583d12d57cc..4b070e122ba3 100644 --- a/thirdparty/embree/kernels/common/rtcore.h +++ b/thirdparty/embree/kernels/common/rtcore.h @@ -25,52 +25,58 @@ namespace embree #endif /*! Macros used in the rtcore API implementation */ -#define RTC_CATCH_BEGIN try { +// -- GODOT start -- +// #define RTC_CATCH_BEGIN try { +#define RTC_CATCH_BEGIN -#define RTC_CATCH_END(device) \ - } catch (std::bad_alloc&) { \ - Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ - } catch (rtcore_error& e) { \ - Device::process_error(device,e.error,e.what()); \ - } catch (std::exception& e) { \ - Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ - } catch (...) { \ - Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ - } +// #define RTC_CATCH_END(device) \ +// } catch (std::bad_alloc&) { \ +// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ +// } catch (rtcore_error& e) { \ +// Device::process_error(device,e.error,e.what()); \ +// } catch (std::exception& e) { \ +// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ +// } catch (...) { \ +// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ +// } +#define RTC_CATCH_END(device) -#define RTC_CATCH_END2(scene) \ - } catch (std::bad_alloc&) { \ - Device* device = scene ? scene->device : nullptr; \ - Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ - } catch (rtcore_error& e) { \ - Device* device = scene ? scene->device : nullptr; \ - Device::process_error(device,e.error,e.what()); \ - } catch (std::exception& e) { \ - Device* device = scene ? scene->device : nullptr; \ - Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ - } catch (...) { \ - Device* device = scene ? scene->device : nullptr; \ - Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ - } +// #define RTC_CATCH_END2(scene) \ +// } catch (std::bad_alloc&) { \ +// Device* device = scene ? scene->device : nullptr; \ +// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ +// } catch (rtcore_error& e) { \ +// Device* device = scene ? scene->device : nullptr; \ +// Device::process_error(device,e.error,e.what()); \ +// } catch (std::exception& e) { \ +// Device* device = scene ? scene->device : nullptr; \ +// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ +// } catch (...) { \ +// Device* device = scene ? 
scene->device : nullptr; \ +// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ +// } +#define RTC_CATCH_END2(scene) -#define RTC_CATCH_END2_FALSE(scene) \ - } catch (std::bad_alloc&) { \ - Device* device = scene ? scene->device : nullptr; \ - Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ - return false; \ - } catch (rtcore_error& e) { \ - Device* device = scene ? scene->device : nullptr; \ - Device::process_error(device,e.error,e.what()); \ - return false; \ - } catch (std::exception& e) { \ - Device* device = scene ? scene->device : nullptr; \ - Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ - return false; \ - } catch (...) { \ - Device* device = scene ? scene->device : nullptr; \ - Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ - return false; \ - } +// #define RTC_CATCH_END2_FALSE(scene) \ +// } catch (std::bad_alloc&) { \ +// Device* device = scene ? scene->device : nullptr; \ +// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ +// return false; \ +// } catch (rtcore_error& e) { \ +// Device* device = scene ? scene->device : nullptr; \ +// Device::process_error(device,e.error,e.what()); \ +// return false; \ +// } catch (std::exception& e) { \ +// Device* device = scene ? scene->device : nullptr; \ +// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ +// return false; \ +// } catch (...) { \ +// Device* device = scene ? scene->device : nullptr; \ +// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ +// return false; \ +// } +#define RTC_CATCH_END2_FALSE(scene) return false; +// -- GODOT end -- #define RTC_VERIFY_HANDLE(handle) \ if (handle == nullptr) { \ @@ -97,28 +103,38 @@ namespace embree #define RTC_TRACE(x) #endif - /*! used to throw embree API errors */ - struct rtcore_error : public std::exception - { - __forceinline rtcore_error(RTCError error, const std::string& str) - : error(error), str(str) {} - - ~rtcore_error() throw() {} - - const char* what () const throw () { - return str.c_str(); - } - - RTCError error; - std::string str; - }; +// -- GODOT begin -- +// /*! 
used to throw embree API errors */ +// struct rtcore_error : public std::exception +// { +// __forceinline rtcore_error(RTCError error, const std::string& str) +// : error(error), str(str) {} +// +// ~rtcore_error() throw() {} +// +// const char* what () const throw () { +// return str.c_str(); +// } +// +// RTCError error; +// std::string str; +// }; +// -- GODOT end -- #if defined(DEBUG) // only report file and line in debug mode + // -- GODOT begin -- + // #define throw_RTCError(error,str) \ + // throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); #define throw_RTCError(error,str) \ - throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); + printf(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)), abort(); + // -- GODOT end -- #else + // -- GODOT begin -- + // #define throw_RTCError(error,str) \ + // throw rtcore_error(error,str); #define throw_RTCError(error,str) \ - throw rtcore_error(error,str); + abort(); + // -- GODOT end -- #endif #define RTC_BUILD_ARGUMENTS_HAS(settings,member) \ diff --git a/thirdparty/embree/kernels/common/scene.cpp b/thirdparty/embree/kernels/common/scene.cpp index b0d1a723d12e..1e23aeb415de 100644 --- a/thirdparty/embree/kernels/common/scene.cpp +++ b/thirdparty/embree/kernels/common/scene.cpp @@ -6,7 +6,7 @@ #include "../bvh/bvh4_factory.h" #include "../bvh/bvh8_factory.h" #include "../../common/algorithms/parallel_reduce.h" - + namespace embree { /* error raising rtcIntersect and rtcOccluded functions */ @@ -40,7 +40,7 @@ namespace embree { device->refDec(); } - + void Scene::printStatistics() { /* calculate maximum number of time segments */ @@ -56,12 +56,12 @@ namespace embree statistics[i].resize(max_time_steps); /* gather statistics */ - for (size_t i=0; igetType(); + int ty = get(i)->getType(); assert(tynumTimeSegments(); + int timesegments = get(i)->numTimeSegments(); assert((unsigned int)timesegments < max_time_steps); statistics[ty][timesegments] += get(i)->size(); } @@ -76,7 +76,7 @@ namespace embree for (size_t t=0; ttri_accel == "default") + if (device->tri_accel == "default") { if (quality_flags != RTC_BUILD_QUALITY_LOW) { - int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); + int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); switch (mode) { - case /*0b00*/ 0: + case /*0b00*/ 0: #if defined (EMBREE_TARGET_SIMD8) if (device->canUseAVX()) { - if (quality_flags == RTC_BUILD_QUALITY_HIGH) + if (quality_flags == RTC_BUILD_QUALITY_HIGH) accels_add(device->bvh8_factory->BVH8Triangle4(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST)); else accels_add(device->bvh8_factory->BVH8Triangle4(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); } - else + else #endif - { - if (quality_flags == RTC_BUILD_QUALITY_HIGH) + { + if (quality_flags == RTC_BUILD_QUALITY_HIGH) accels_add(device->bvh4_factory->BVH4Triangle4(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST)); - else + else accels_add(device->bvh4_factory->BVH4Triangle4(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); } break; - case /*0b01*/ 1: + case /*0b01*/ 1: #if defined (EMBREE_TARGET_SIMD8) - if (device->canUseAVX()) + if (device->canUseAVX()) accels_add(device->bvh8_factory->BVH8Triangle4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); else #endif @@ -175,8 +175,8 @@ namespace embree #if 
defined(EMBREE_GEOMETRY_TRIANGLE) if (device->tri_accel_mb == "default") { - int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); - + int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); + #if defined (EMBREE_TARGET_SIMD8) if (device->canUseAVX2()) // BVH8 reduces performance on AVX only-machines { @@ -211,18 +211,18 @@ namespace embree void Scene::createQuadAccel() { #if defined(EMBREE_GEOMETRY_QUAD) - if (device->quad_accel == "default") + if (device->quad_accel == "default") { if (quality_flags != RTC_BUILD_QUALITY_LOW) { /* static */ - int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); + int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); switch (mode) { case /*0b00*/ 0: #if defined (EMBREE_TARGET_SIMD8) if (device->canUseAVX()) { - if (quality_flags == RTC_BUILD_QUALITY_HIGH) + if (quality_flags == RTC_BUILD_QUALITY_HIGH) accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST)); else accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); @@ -230,7 +230,7 @@ namespace embree else #endif { - if (quality_flags == RTC_BUILD_QUALITY_HIGH) + if (quality_flags == RTC_BUILD_QUALITY_HIGH) accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST)); else accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); @@ -292,9 +292,9 @@ namespace embree void Scene::createQuadMBAccel() { #if defined(EMBREE_GEOMETRY_QUAD) - if (device->quad_accel_mb == "default") + if (device->quad_accel_mb == "default") { - int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); + int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); switch (mode) { case /*0b00*/ 0: #if defined (EMBREE_TARGET_SIMD8) @@ -416,7 +416,7 @@ namespace embree void Scene::createUserGeometryAccel() { #if defined(EMBREE_GEOMETRY_USER) - if (device->object_accel == "default") + if (device->object_accel == "default") { #if defined (EMBREE_TARGET_SIMD8) if (device->canUseAVX() && !isCompactAccel()) @@ -554,7 +554,7 @@ namespace embree { BVHFactory::IntersectVariant ivariant = isRobustAccel() ? 
BVHFactory::IntersectVariant::ROBUST : BVHFactory::IntersectVariant::FAST; #if defined(EMBREE_GEOMETRY_GRID) - if (device->grid_accel == "default") + if (device->grid_accel == "default") { #if defined (EMBREE_TARGET_SIMD8) if (device->canUseAVX() && !isCompactAccel()) @@ -579,7 +579,7 @@ namespace embree void Scene::createGridMBAccel() { #if defined(EMBREE_GEOMETRY_GRID) - if (device->grid_accel_mb == "default") + if (device->grid_accel_mb == "default") { accels_add(device->bvh4_factory->BVH4GridMB(this,BVHFactory::BuildVariant::STATIC)); } @@ -588,13 +588,17 @@ namespace embree #endif } - + void Scene::clear() { } - unsigned Scene::bind(unsigned geomID, Ref geometry) + unsigned Scene::bind(unsigned geomID, Ref geometry) { +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(geometriesMutex); +#else Lock lock(geometriesMutex); +#endif if (geomID == RTC_INVALID_GEOMETRY_ID) { geomID = id_pool.allocate(); if (geomID == RTC_INVALID_GEOMETRY_ID) @@ -620,15 +624,19 @@ namespace embree void Scene::detachGeometry(size_t geomID) { +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(geometriesMutex); +#else Lock lock(geometriesMutex); - +#endif + if (geomID >= geometries.size()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid geometry ID"); Ref& geometry = geometries[geomID]; if (geometry == null) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid geometry"); - + if (geometry->isEnabled()) { setModified (); } @@ -650,21 +658,21 @@ namespace embree if (!isModified()) { return; } - + /* print scene statistics */ if (device->verbosity(2)) printStatistics(); progress_monitor_counter = 0; - + /* gather scene stats and call preCommit function of each geometry */ - this->world = parallel_reduce (size_t(0), geometries.size(), GeometryCounts (), + this->world = parallel_reduce (size_t(0), geometries.size(), GeometryCounts (), [this](const range& r)->GeometryCounts { GeometryCounts c; - for (auto i=r.begin(); iisEnabled()) + if (geometries[i] && geometries[i]->isEnabled()) { geometries[i]->preCommit(); geometries[i]->addElementsToCount (c); @@ -675,19 +683,19 @@ namespace embree }, std::plus() ); - + /* select acceleration structures to build */ unsigned int new_enabled_geometry_types = world.enabledGeometryTypesMask(); if (flags_modified || new_enabled_geometry_types != enabled_geometry_types) { accels_init(); - /* we need to make all geometries modified, otherwise two level builder will + /* we need to make all geometries modified, otherwise two level builder will not rebuild currently not modified geometries */ parallel_for(geometryModCounters_.size(), [&] ( const size_t i ) { geometryModCounters_[i] = 0; }); - + if (getNumPrimitives(TriangleMesh::geom_type,false)) createTriangleAccel(); if (getNumPrimitives(TriangleMesh::geom_type,true)) createTriangleMBAccel(); if (getNumPrimitives(QuadMesh::geom_type,false)) createQuadAccel(); @@ -704,14 +712,14 @@ namespace embree if (getNumPrimitives(Geometry::MTY_INSTANCE_CHEAP,true)) createInstanceMBAccel(); if (getNumPrimitives(Geometry::MTY_INSTANCE_EXPENSIVE,false)) createInstanceExpensiveAccel(); if (getNumPrimitives(Geometry::MTY_INSTANCE_EXPENSIVE,true)) createInstanceExpensiveMBAccel(); - + flags_modified = false; enabled_geometry_types = new_enabled_geometry_types; } - + /* select fast code path if no filter function is present */ accels_select(hasFilterFunction()); - + /* build all hierarchies of this scene */ accels_build(); @@ -729,7 +737,7 @@ namespace embree geometryModCounters_[i] = 
geometries[i]->getModCounter(); } }); - + updateInterface(); if (device->verbosity(2)) { @@ -738,7 +746,7 @@ namespace embree std::cout << "selected scene intersector" << std::endl; intersectors.print(2); } - + setModified(false); } @@ -763,16 +771,16 @@ namespace embree RTCSceneFlags Scene::getSceneFlags() const { return scene_flags; } - + #if defined(TASKING_INTERNAL) - void Scene::commit (bool join) + void Scene::commit (bool join) { Lock buildLock(buildMutex,false); /* allocates own taskscheduler for each build */ Ref scheduler = nullptr; - { + { Lock lock(schedulerMutex); scheduler = this->scheduler; if (scheduler == null) { @@ -784,31 +792,33 @@ namespace embree /* worker threads join build */ if (!buildLock.isLocked()) { - if (!join) + if (!join) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"use rtcJoinCommitScene to join a build operation"); - + scheduler->join(); return; } /* initiate build */ - try { + // -- GODOT start -- + // try { scheduler->spawn_root([&]() { commit_task(); Lock lock(schedulerMutex); this->scheduler = nullptr; }, 1, !join); - } - catch (...) { - accels_clear(); - updateInterface(); - Lock lock(schedulerMutex); - this->scheduler = nullptr; - throw; - } + // } + // catch (...) { + // accels_clear(); + // updateInterface(); + // Lock lock(schedulerMutex); + // this->scheduler = nullptr; + // throw; + // } + // -- GODOT end -- } #endif -#if defined(TASKING_TBB) +#if defined(TASKING_TBB) || defined(TASKING_GCD) - void Scene::commit (bool join) + void Scene::commit (bool join) { #if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION_MAJOR < 8) if (join) @@ -822,12 +832,15 @@ namespace embree if (!lock.isLocked()) { #if !TASKING_TBB_USE_TASK_ISOLATION - if (!join) + if (!join) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invoking rtcCommitScene from multiple threads is not supported with this TBB version"); #endif - + do { +#if defined(TASKING_GCD) + // Do Nothing +#else #if USE_TASK_ARENA if (join) { device->arena->execute([&]{ group.wait(); }); @@ -837,21 +850,24 @@ namespace embree { group.wait(); } +#endif pause_cpu(); yield(); + } while (!buildMutex.try_lock()); - + buildMutex.unlock(); return; - } + } /* for best performance set FTZ and DAZ flags in the MXCSR control and status register */ const unsigned int mxcsr = _mm_getcsr(); _mm_setcsr(mxcsr | /* FTZ */ (1<<15) | /* DAZ */ (1<<6)); - + try { -#if TBB_INTERFACE_VERSION_MAJOR < 8 +#if defined(TASKING_TBB) +#if TBB_INTERFACE_VERSION_MAJOR < 8 tbb::task_group_context ctx( tbb::task_group_context::isolated, tbb::task_group_context::default_traits); #else tbb::task_group_context ctx( tbb::task_group_context::isolated, tbb::task_group_context::default_traits | tbb::task_group_context::fp_settings ); @@ -876,15 +892,22 @@ namespace embree }); group.wait(); } - + /* reset MXCSR register again */ _mm_setcsr(mxcsr); - } + +#elif defined(TASKING_GCD) + + commit_task(); + +#endif // #if defined(TASKING_TBB) + + } catch (...) 
{ /* reset MXCSR register again */ _mm_setcsr(mxcsr); - + accels_clear(); updateInterface(); throw; @@ -894,7 +917,7 @@ namespace embree #if defined(TASKING_PPL) - void Scene::commit (bool join) + void Scene::commit (bool join) { #if defined(TASKING_PPL) if (join) @@ -912,7 +935,7 @@ namespace embree /* for best performance set FTZ and DAZ flags in the MXCSR control and status register */ const unsigned int mxcsr = _mm_getcsr(); _mm_setcsr(mxcsr | /* FTZ */ (1<<15) | /* DAZ */ (1<<6)); - + try { group.run([&]{ @@ -922,12 +945,12 @@ namespace embree /* reset MXCSR register again */ _mm_setcsr(mxcsr); - } + } catch (...) { /* reset MXCSR register again */ _mm_setcsr(mxcsr); - + accels_clear(); updateInterface(); throw; @@ -935,7 +958,7 @@ namespace embree } #endif - void Scene::setProgressMonitorFunction(RTCProgressMonitorFunction func, void* ptr) + void Scene::setProgressMonitorFunction(RTCProgressMonitorFunction func, void* ptr) { progress_monitor_function = func; progress_monitor_ptr = ptr; diff --git a/thirdparty/embree/kernels/common/scene_subdiv_mesh.h b/thirdparty/embree/kernels/common/scene_subdiv_mesh.h index 25ee8e8efade..d0246009dbd3 100644 --- a/thirdparty/embree/kernels/common/scene_subdiv_mesh.h +++ b/thirdparty/embree/kernels/common/scene_subdiv_mesh.h @@ -275,11 +275,11 @@ namespace embree parallel_set holeSet; /*! fast lookup table to detect invalid faces */ - mvector invalid_face; + mvector invalid_face; /*! test if face i is invalid in timestep j */ - __forceinline char& invalidFace(size_t i, size_t j = 0) { return invalid_face[i*numTimeSteps+j]; } - __forceinline const char& invalidFace(size_t i, size_t j = 0) const { return invalid_face[i*numTimeSteps+j]; } + __forceinline int8_t& invalidFace(size_t i, size_t j = 0) { return invalid_face[i*numTimeSteps+j]; } + __forceinline const int8_t& invalidFace(size_t i, size_t j = 0) const { return invalid_face[i*numTimeSteps+j]; } /*! 
interpolation cache */ public: diff --git a/thirdparty/embree/kernels/common/state.cpp b/thirdparty/embree/kernels/common/state.cpp index e9a912d7667d..51fc9b782620 100644 --- a/thirdparty/embree/kernels/common/state.cpp +++ b/thirdparty/embree/kernels/common/state.cpp @@ -147,7 +147,20 @@ namespace embree } bool State::checkISASupport() { +#if defined(__ARM_NEON) + /* + * NEON CPU type is a mixture of NEON and SSE2 + */ + + bool hasSSE2 = (getCPUFeatures() & enabled_cpu_features) & CPU_FEATURE_SSE2; + + /* this will be true when explicitly initialize Device with `isa=neon` config */ + bool hasNEON = (getCPUFeatures() & enabled_cpu_features) & CPU_FEATURE_NEON; + + return hasSSE2 || hasNEON; +#else return (getCPUFeatures() & enabled_cpu_features) == enabled_cpu_features; +#endif } void State::verify() @@ -160,8 +173,10 @@ namespace embree * functions */ #if defined(DEBUG) #if defined(EMBREE_TARGET_SSE2) +#if !defined(__ARM_NEON) assert(sse2::getISA() <= SSE2); #endif +#endif #if defined(EMBREE_TARGET_SSE42) assert(sse42::getISA() <= SSE42); #endif diff --git a/thirdparty/embree/kernels/geometry/curveNi.h b/thirdparty/embree/kernels/geometry/curveNi.h index 00ca9b8b6503..51384f19590c 100644 --- a/thirdparty/embree/kernels/geometry/curveNi.h +++ b/thirdparty/embree/kernels/geometry/curveNi.h @@ -43,10 +43,10 @@ namespace embree __forceinline void fill(const PrimRef* prims, size_t& begin, size_t _end, Scene* scene) { size_t end = min(begin+M,_end); - N = (unsigned char)(end-begin); + N = (uint8_t)(end-begin); const unsigned int geomID0 = prims[begin].geomID(); this->geomID(N) = geomID0; - ty = (unsigned char) scene->get(geomID0)->getType(); + ty = (uint8_t) scene->get(geomID0)->getType(); /* encode all primitives */ BBox3fa bounds = empty; @@ -76,25 +76,25 @@ namespace embree const LinearSpace3fa space3(trunc(126.0f*space2.vx),trunc(126.0f*space2.vy),trunc(126.0f*space2.vz)); const BBox3fa bounds = scene->get(geomID)->vbounds(loffset,lscale,max(length(space3.vx),length(space3.vy),length(space3.vz)),space3.transposed(),primID); - bounds_vx_x(N)[i] = (char) space3.vx.x; - bounds_vx_y(N)[i] = (char) space3.vx.y; - bounds_vx_z(N)[i] = (char) space3.vx.z; + bounds_vx_x(N)[i] = (int8_t) space3.vx.x; + bounds_vx_y(N)[i] = (int8_t) space3.vx.y; + bounds_vx_z(N)[i] = (int8_t) space3.vx.z; bounds_vx_lower(N)[i] = (short) clamp(floor(bounds.lower.x),-32767.0f,32767.0f); bounds_vx_upper(N)[i] = (short) clamp(ceil (bounds.upper.x),-32767.0f,32767.0f); assert(-32767.0f <= floor(bounds.lower.x) && floor(bounds.lower.x) <= 32767.0f); assert(-32767.0f <= ceil (bounds.upper.x) && ceil (bounds.upper.x) <= 32767.0f); - bounds_vy_x(N)[i] = (char) space3.vy.x; - bounds_vy_y(N)[i] = (char) space3.vy.y; - bounds_vy_z(N)[i] = (char) space3.vy.z; + bounds_vy_x(N)[i] = (int8_t) space3.vy.x; + bounds_vy_y(N)[i] = (int8_t) space3.vy.y; + bounds_vy_z(N)[i] = (int8_t) space3.vy.z; bounds_vy_lower(N)[i] = (short) clamp(floor(bounds.lower.y),-32767.0f,32767.0f); bounds_vy_upper(N)[i] = (short) clamp(ceil (bounds.upper.y),-32767.0f,32767.0f); assert(-32767.0f <= floor(bounds.lower.y) && floor(bounds.lower.y) <= 32767.0f); assert(-32767.0f <= ceil (bounds.upper.y) && ceil (bounds.upper.y) <= 32767.0f); - bounds_vz_x(N)[i] = (char) space3.vz.x; - bounds_vz_y(N)[i] = (char) space3.vz.y; - bounds_vz_z(N)[i] = (char) space3.vz.z; + bounds_vz_x(N)[i] = (int8_t) space3.vz.x; + bounds_vz_y(N)[i] = (int8_t) space3.vz.y; + bounds_vz_z(N)[i] = (int8_t) space3.vz.z; bounds_vz_lower(N)[i] = (short) 
clamp(floor(bounds.lower.z),-32767.0f,32767.0f); bounds_vz_upper(N)[i] = (short) clamp(ceil (bounds.upper.z),-32767.0f,32767.0f); assert(-32767.0f <= floor(bounds.lower.z) && floor(bounds.lower.z) <= 32767.0f); @@ -114,15 +114,15 @@ namespace embree for (size_t i=0; iscene); } - return bvh->encodeLeaf((char*)accel,items); + return bvh->encodeLeaf((int8_t*)accel,items); }; public: // 27.6 - 46 bytes per primitive - unsigned char ty; - unsigned char N; - unsigned char data[4+25*M+16]; + uint8_t ty; + uint8_t N; + uint8_t data[4+25*M+16]; /* struct Layout @@ -130,21 +130,21 @@ namespace embree unsigned int geomID; unsigned int primID[N]; - char bounds_vx_x[N]; - char bounds_vx_y[N]; - char bounds_vx_z[N]; + int8_t bounds_vx_x[N]; + int8_t bounds_vx_y[N]; + int8_t bounds_vx_z[N]; short bounds_vx_lower[N]; short bounds_vx_upper[N]; - char bounds_vy_x[N]; - char bounds_vy_y[N]; - char bounds_vy_z[N]; + int8_t bounds_vy_x[N]; + int8_t bounds_vy_y[N]; + int8_t bounds_vy_z[N]; short bounds_vy_lower[N]; short bounds_vy_upper[N]; - char bounds_vz_x[N]; - char bounds_vz_y[N]; - char bounds_vz_z[N]; + int8_t bounds_vz_x[N]; + int8_t bounds_vz_y[N]; + int8_t bounds_vz_z[N]; short bounds_vz_lower[N]; short bounds_vz_upper[N]; @@ -153,65 +153,65 @@ namespace embree }; */ - __forceinline unsigned int& geomID(size_t N) { return *(unsigned int*)((char*)this+2); } - __forceinline const unsigned int& geomID(size_t N) const { return *(unsigned int*)((char*)this+2); } + __forceinline unsigned int& geomID(size_t N) { return *(unsigned int*)((int8_t*)this+2); } + __forceinline const unsigned int& geomID(size_t N) const { return *(unsigned int*)((int8_t*)this+2); } - __forceinline unsigned int* primID(size_t N) { return (unsigned int*)((char*)this+6); } - __forceinline const unsigned int* primID(size_t N) const { return (unsigned int*)((char*)this+6); } + __forceinline unsigned int* primID(size_t N) { return (unsigned int*)((int8_t*)this+6); } + __forceinline const unsigned int* primID(size_t N) const { return (unsigned int*)((int8_t*)this+6); } - __forceinline char* bounds_vx_x(size_t N) { return (char*)((char*)this+6+4*N); } - __forceinline const char* bounds_vx_x(size_t N) const { return (char*)((char*)this+6+4*N); } + __forceinline int8_t* bounds_vx_x(size_t N) { return (int8_t*)((int8_t*)this+6+4*N); } + __forceinline const int8_t* bounds_vx_x(size_t N) const { return (int8_t*)((int8_t*)this+6+4*N); } - __forceinline char* bounds_vx_y(size_t N) { return (char*)((char*)this+6+5*N); } - __forceinline const char* bounds_vx_y(size_t N) const { return (char*)((char*)this+6+5*N); } + __forceinline int8_t* bounds_vx_y(size_t N) { return (int8_t*)((int8_t*)this+6+5*N); } + __forceinline const int8_t* bounds_vx_y(size_t N) const { return (int8_t*)((int8_t*)this+6+5*N); } - __forceinline char* bounds_vx_z(size_t N) { return (char*)((char*)this+6+6*N); } - __forceinline const char* bounds_vx_z(size_t N) const { return (char*)((char*)this+6+6*N); } + __forceinline int8_t* bounds_vx_z(size_t N) { return (int8_t*)((int8_t*)this+6+6*N); } + __forceinline const int8_t* bounds_vx_z(size_t N) const { return (int8_t*)((int8_t*)this+6+6*N); } - __forceinline short* bounds_vx_lower(size_t N) { return (short*)((char*)this+6+7*N); } - __forceinline const short* bounds_vx_lower(size_t N) const { return (short*)((char*)this+6+7*N); } + __forceinline short* bounds_vx_lower(size_t N) { return (short*)((int8_t*)this+6+7*N); } + __forceinline const short* bounds_vx_lower(size_t N) const { return (short*)((int8_t*)this+6+7*N); } - 
__forceinline short* bounds_vx_upper(size_t N) { return (short*)((char*)this+6+9*N); } - __forceinline const short* bounds_vx_upper(size_t N) const { return (short*)((char*)this+6+9*N); } + __forceinline short* bounds_vx_upper(size_t N) { return (short*)((int8_t*)this+6+9*N); } + __forceinline const short* bounds_vx_upper(size_t N) const { return (short*)((int8_t*)this+6+9*N); } - __forceinline char* bounds_vy_x(size_t N) { return (char*)((char*)this+6+11*N); } - __forceinline const char* bounds_vy_x(size_t N) const { return (char*)((char*)this+6+11*N); } + __forceinline int8_t* bounds_vy_x(size_t N) { return (int8_t*)((int8_t*)this+6+11*N); } + __forceinline const int8_t* bounds_vy_x(size_t N) const { return (int8_t*)((int8_t*)this+6+11*N); } - __forceinline char* bounds_vy_y(size_t N) { return (char*)((char*)this+6+12*N); } - __forceinline const char* bounds_vy_y(size_t N) const { return (char*)((char*)this+6+12*N); } + __forceinline int8_t* bounds_vy_y(size_t N) { return (int8_t*)((int8_t*)this+6+12*N); } + __forceinline const int8_t* bounds_vy_y(size_t N) const { return (int8_t*)((int8_t*)this+6+12*N); } - __forceinline char* bounds_vy_z(size_t N) { return (char*)((char*)this+6+13*N); } - __forceinline const char* bounds_vy_z(size_t N) const { return (char*)((char*)this+6+13*N); } + __forceinline int8_t* bounds_vy_z(size_t N) { return (int8_t*)((int8_t*)this+6+13*N); } + __forceinline const int8_t* bounds_vy_z(size_t N) const { return (int8_t*)((int8_t*)this+6+13*N); } - __forceinline short* bounds_vy_lower(size_t N) { return (short*)((char*)this+6+14*N); } - __forceinline const short* bounds_vy_lower(size_t N) const { return (short*)((char*)this+6+14*N); } + __forceinline short* bounds_vy_lower(size_t N) { return (short*)((int8_t*)this+6+14*N); } + __forceinline const short* bounds_vy_lower(size_t N) const { return (short*)((int8_t*)this+6+14*N); } - __forceinline short* bounds_vy_upper(size_t N) { return (short*)((char*)this+6+16*N); } - __forceinline const short* bounds_vy_upper(size_t N) const { return (short*)((char*)this+6+16*N); } + __forceinline short* bounds_vy_upper(size_t N) { return (short*)((int8_t*)this+6+16*N); } + __forceinline const short* bounds_vy_upper(size_t N) const { return (short*)((int8_t*)this+6+16*N); } - __forceinline char* bounds_vz_x(size_t N) { return (char*)((char*)this+6+18*N); } - __forceinline const char* bounds_vz_x(size_t N) const { return (char*)((char*)this+6+18*N); } + __forceinline int8_t* bounds_vz_x(size_t N) { return (int8_t*)((int8_t*)this+6+18*N); } + __forceinline const int8_t* bounds_vz_x(size_t N) const { return (int8_t*)((int8_t*)this+6+18*N); } - __forceinline char* bounds_vz_y(size_t N) { return (char*)((char*)this+6+19*N); } - __forceinline const char* bounds_vz_y(size_t N) const { return (char*)((char*)this+6+19*N); } + __forceinline int8_t* bounds_vz_y(size_t N) { return (int8_t*)((int8_t*)this+6+19*N); } + __forceinline const int8_t* bounds_vz_y(size_t N) const { return (int8_t*)((int8_t*)this+6+19*N); } - __forceinline char* bounds_vz_z(size_t N) { return (char*)((char*)this+6+20*N); } - __forceinline const char* bounds_vz_z(size_t N) const { return (char*)((char*)this+6+20*N); } + __forceinline int8_t* bounds_vz_z(size_t N) { return (int8_t*)((int8_t*)this+6+20*N); } + __forceinline const int8_t* bounds_vz_z(size_t N) const { return (int8_t*)((int8_t*)this+6+20*N); } - __forceinline short* bounds_vz_lower(size_t N) { return (short*)((char*)this+6+21*N); } - __forceinline const short* bounds_vz_lower(size_t N) const { return 
(short*)((char*)this+6+21*N); } + __forceinline short* bounds_vz_lower(size_t N) { return (short*)((int8_t*)this+6+21*N); } + __forceinline const short* bounds_vz_lower(size_t N) const { return (short*)((int8_t*)this+6+21*N); } - __forceinline short* bounds_vz_upper(size_t N) { return (short*)((char*)this+6+23*N); } - __forceinline const short* bounds_vz_upper(size_t N) const { return (short*)((char*)this+6+23*N); } + __forceinline short* bounds_vz_upper(size_t N) { return (short*)((int8_t*)this+6+23*N); } + __forceinline const short* bounds_vz_upper(size_t N) const { return (short*)((int8_t*)this+6+23*N); } - __forceinline Vec3f* offset(size_t N) { return (Vec3f*)((char*)this+6+25*N); } - __forceinline const Vec3f* offset(size_t N) const { return (Vec3f*)((char*)this+6+25*N); } + __forceinline Vec3f* offset(size_t N) { return (Vec3f*)((int8_t*)this+6+25*N); } + __forceinline const Vec3f* offset(size_t N) const { return (Vec3f*)((int8_t*)this+6+25*N); } - __forceinline float* scale(size_t N) { return (float*)((char*)this+6+25*N+12); } - __forceinline const float* scale(size_t N) const { return (float*)((char*)this+6+25*N+12); } + __forceinline float* scale(size_t N) { return (float*)((int8_t*)this+6+25*N+12); } + __forceinline const float* scale(size_t N) const { return (float*)((int8_t*)this+6+25*N+12); } - __forceinline char* end(size_t N) { return (char*)this+6+25*N+16; } - __forceinline const char* end(size_t N) const { return (char*)this+6+25*N+16; } + __forceinline int8_t* end(size_t N) { return (int8_t*)this+6+25*N+16; } + __forceinline const int8_t* end(size_t N) const { return (int8_t*)this+6+25*N+16; } }; template diff --git a/thirdparty/embree/kernels/geometry/curveNi_mb.h b/thirdparty/embree/kernels/geometry/curveNi_mb.h index d2e1926220f2..0cd8f833fd42 100644 --- a/thirdparty/embree/kernels/geometry/curveNi_mb.h +++ b/thirdparty/embree/kernels/geometry/curveNi_mb.h @@ -43,10 +43,10 @@ namespace embree __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& begin, size_t _end, Scene* scene, const BBox1f time_range) { size_t end = min(begin+M,_end); - N = (unsigned char)(end-begin); + N = (uint8_t)(end-begin); const unsigned int geomID0 = prims[begin].geomID(); this->geomID(N) = geomID0; - ty = (unsigned char) scene->get(geomID0)->getType(); + ty = (uint8_t) scene->get(geomID0)->getType(); /* encode all primitives */ LBBox3fa lbounds = empty; @@ -79,10 +79,10 @@ namespace embree const LinearSpace3fa space3(trunc(126.0f*space2.vx),trunc(126.0f*space2.vy),trunc(126.0f*space2.vz)); const LBBox3fa bounds = scene->get(geomID)->vlinearBounds(loffset,lscale,max(length(space3.vx),length(space3.vy),length(space3.vz)),space3.transposed(),primID,time_range); - // NOTE: this weird (char) (short) cast works around VS2015 Win32 compiler bug - bounds_vx_x(N)[i] = (char) (short) space3.vx.x; - bounds_vx_y(N)[i] = (char) (short) space3.vx.y; - bounds_vx_z(N)[i] = (char) (short) space3.vx.z; + // NOTE: this weird (int8_t) (short) cast works around VS2015 Win32 compiler bug + bounds_vx_x(N)[i] = (int8_t) (short) space3.vx.x; + bounds_vx_y(N)[i] = (int8_t) (short) space3.vx.y; + bounds_vx_z(N)[i] = (int8_t) (short) space3.vx.z; bounds_vx_lower0(N)[i] = (short) clamp(floor(bounds.bounds0.lower.x),-32767.0f,32767.0f); bounds_vx_upper0(N)[i] = (short) clamp(ceil (bounds.bounds0.upper.x),-32767.0f,32767.0f); bounds_vx_lower1(N)[i] = (short) clamp(floor(bounds.bounds1.lower.x),-32767.0f,32767.0f); @@ -92,9 +92,9 @@ namespace embree assert(-32767.0f <= floor(bounds.bounds1.lower.x) && 
floor(bounds.bounds1.lower.x) <= 32767.0f); assert(-32767.0f <= ceil (bounds.bounds1.upper.x) && ceil (bounds.bounds1.upper.x) <= 32767.0f); - bounds_vy_x(N)[i] = (char) (short) space3.vy.x; - bounds_vy_y(N)[i] = (char) (short) space3.vy.y; - bounds_vy_z(N)[i] = (char) (short) space3.vy.z; + bounds_vy_x(N)[i] = (int8_t) (short) space3.vy.x; + bounds_vy_y(N)[i] = (int8_t) (short) space3.vy.y; + bounds_vy_z(N)[i] = (int8_t) (short) space3.vy.z; bounds_vy_lower0(N)[i] = (short) clamp(floor(bounds.bounds0.lower.y),-32767.0f,32767.0f); bounds_vy_upper0(N)[i] = (short) clamp(ceil (bounds.bounds0.upper.y),-32767.0f,32767.0f); bounds_vy_lower1(N)[i] = (short) clamp(floor(bounds.bounds1.lower.y),-32767.0f,32767.0f); @@ -104,9 +104,9 @@ namespace embree assert(-32767.0f <= floor(bounds.bounds1.lower.y) && floor(bounds.bounds1.lower.y) <= 32767.0f); assert(-32767.0f <= ceil (bounds.bounds1.upper.y) && ceil (bounds.bounds1.upper.y) <= 32767.0f); - bounds_vz_x(N)[i] = (char) (short) space3.vz.x; - bounds_vz_y(N)[i] = (char) (short) space3.vz.y; - bounds_vz_z(N)[i] = (char) (short) space3.vz.z; + bounds_vz_x(N)[i] = (int8_t) (short) space3.vz.x; + bounds_vz_y(N)[i] = (int8_t) (short) space3.vz.y; + bounds_vz_z(N)[i] = (int8_t) (short) space3.vz.z; bounds_vz_lower0(N)[i] = (short) clamp(floor(bounds.bounds0.lower.z),-32767.0f,32767.0f); bounds_vz_upper0(N)[i] = (short) clamp(ceil (bounds.bounds0.upper.z),-32767.0f,32767.0f); bounds_vz_lower1(N)[i] = (short) clamp(floor(bounds.bounds1.lower.z),-32767.0f,32767.0f); @@ -130,7 +130,7 @@ namespace embree size_t items = CurveNiMB::blocks(prims.size()); size_t numbytes = CurveNiMB::bytes(prims.size()); CurveNiMB* accel = (CurveNiMB*) alloc.malloc1(numbytes,BVH::byteAlignment); - const typename BVH::NodeRef node = bvh->encodeLeaf((char*)accel,items); + const typename BVH::NodeRef node = bvh->encodeLeaf((int8_t*)accel,items); LBBox3fa bounds = empty; for (size_t i=0; i diff --git a/thirdparty/embree/kernels/geometry/curve_intersector_virtual_bezier_curve.h b/thirdparty/embree/kernels/geometry/curve_intersector_virtual_bezier_curve.h new file mode 100644 index 000000000000..69cf612275f8 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/curve_intersector_virtual_bezier_curve.h @@ -0,0 +1,21 @@ +// Copyright 2020 Light Transport Entertainment Inc. +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "curve_intersector_virtual.h" + +namespace embree +{ + namespace isa + { + void AddVirtualCurveBezierCurveInterector4i(VirtualCurveIntersector &prim); + void AddVirtualCurveBezierCurveInterector4v(VirtualCurveIntersector &prim); + void AddVirtualCurveBezierCurveInterector4iMB(VirtualCurveIntersector &prim); +#if defined(__AVX__) + void AddVirtualCurveBezierCurveInterector8i(VirtualCurveIntersector &prim); + void AddVirtualCurveBezierCurveInterector8v(VirtualCurveIntersector &prim); + void AddVirtualCurveBezierCurveInterector8iMB(VirtualCurveIntersector &prim); +#endif + } +} diff --git a/thirdparty/embree/kernels/geometry/curve_intersector_virtual_bspline_curve.h b/thirdparty/embree/kernels/geometry/curve_intersector_virtual_bspline_curve.h new file mode 100644 index 000000000000..d37e41098ed7 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/curve_intersector_virtual_bspline_curve.h @@ -0,0 +1,21 @@ +// Copyright 2020 Light Transport Entertainment Inc. 
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "curve_intersector_virtual.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    void AddVirtualCurveBSplineCurveInterector4i(VirtualCurveIntersector &prim);
+    void AddVirtualCurveBSplineCurveInterector4v(VirtualCurveIntersector &prim);
+    void AddVirtualCurveBSplineCurveInterector4iMB(VirtualCurveIntersector &prim);
+#if defined(__AVX__)
+    void AddVirtualCurveBSplineCurveInterector8i(VirtualCurveIntersector &prim);
+    void AddVirtualCurveBSplineCurveInterector8v(VirtualCurveIntersector &prim);
+    void AddVirtualCurveBSplineCurveInterector8iMB(VirtualCurveIntersector &prim);
+#endif
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/curve_intersector_virtual_catmullrom_curve.h b/thirdparty/embree/kernels/geometry/curve_intersector_virtual_catmullrom_curve.h
new file mode 100644
index 000000000000..a133a11d63e4
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/curve_intersector_virtual_catmullrom_curve.h
@@ -0,0 +1,21 @@
+// Copyright 2020 Light Transport Entertainment Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "curve_intersector_virtual.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    void AddVirtualCurveCatmullRomCurveInterector4i(VirtualCurveIntersector &prim);
+    void AddVirtualCurveCatmullRomCurveInterector4v(VirtualCurveIntersector &prim);
+    void AddVirtualCurveCatmullRomCurveInterector4iMB(VirtualCurveIntersector &prim);
+#if defined(__AVX__)
+    void AddVirtualCurveCatmullRomCurveInterector8i(VirtualCurveIntersector &prim);
+    void AddVirtualCurveCatmullRomCurveInterector8v(VirtualCurveIntersector &prim);
+    void AddVirtualCurveCatmullRomCurveInterector8iMB(VirtualCurveIntersector &prim);
+#endif
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/curve_intersector_virtual_hermite_curve.h b/thirdparty/embree/kernels/geometry/curve_intersector_virtual_hermite_curve.h
new file mode 100644
index 000000000000..9aec35da4524
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/curve_intersector_virtual_hermite_curve.h
@@ -0,0 +1,21 @@
+// Copyright 2020 Light Transport Entertainment Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "curve_intersector_virtual.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    void AddVirtualCurveHermiteCurveInterector4i(VirtualCurveIntersector &prim);
+    void AddVirtualCurveHermiteCurveInterector4v(VirtualCurveIntersector &prim);
+    void AddVirtualCurveHermiteCurveInterector4iMB(VirtualCurveIntersector &prim);
+#if defined(__AVX__)
+    void AddVirtualCurveHermiteCurveInterector8i(VirtualCurveIntersector &prim);
+    void AddVirtualCurveHermiteCurveInterector8v(VirtualCurveIntersector &prim);
+    void AddVirtualCurveHermiteCurveInterector8iMB(VirtualCurveIntersector &prim);
+#endif
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/curve_intersector_virtual_linear_curve.h b/thirdparty/embree/kernels/geometry/curve_intersector_virtual_linear_curve.h
new file mode 100644
index 000000000000..dd37d194f5dd
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/curve_intersector_virtual_linear_curve.h
@@ -0,0 +1,21 @@
+// Copyright 2020 Light Transport Entertainment Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "curve_intersector_virtual.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    void AddVirtualCurveLinearCurveInterector4i(VirtualCurveIntersector &prim);
+    void AddVirtualCurveLinearCurveInterector4v(VirtualCurveIntersector &prim);
+    void AddVirtualCurveLinearCurveInterector4iMB(VirtualCurveIntersector &prim);
+#if defined(__AVX__)
+    void AddVirtualCurveLinearCurveInterector8i(VirtualCurveIntersector &prim);
+    void AddVirtualCurveLinearCurveInterector8v(VirtualCurveIntersector &prim);
+    void AddVirtualCurveLinearCurveInterector8iMB(VirtualCurveIntersector &prim);
+#endif
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/curve_intersector_virtual_point.h b/thirdparty/embree/kernels/geometry/curve_intersector_virtual_point.h
new file mode 100644
index 000000000000..fe5ceed840a5
--- /dev/null
+++ b/thirdparty/embree/kernels/geometry/curve_intersector_virtual_point.h
@@ -0,0 +1,22 @@
+// Copyright 2020 Light Transport Entertainment Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "curve_intersector_virtual.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    void AddVirtualCurvePointInterector4i(VirtualCurveIntersector &prim);
+    void AddVirtualCurvePointInterector4v(VirtualCurveIntersector &prim);
+    void AddVirtualCurvePointInterector4iMB(VirtualCurveIntersector &prim);
+
+#if defined (__AVX__)
+    void AddVirtualCurvePointInterector8i(VirtualCurveIntersector &prim);
+    void AddVirtualCurvePointInterector8v(VirtualCurveIntersector &prim);
+    void AddVirtualCurvePointInterector8iMB(VirtualCurveIntersector &prim);
+#endif
+  }
+}
diff --git a/thirdparty/embree/kernels/geometry/grid_soa.h b/thirdparty/embree/kernels/geometry/grid_soa.h
index 02edbbed5e1d..d3b275586c97 100644
--- a/thirdparty/embree/kernels/geometry/grid_soa.h
+++ b/thirdparty/embree/kernels/geometry/grid_soa.h
@@ -41,7 +41,7 @@ namespace embree
       }
       const size_t gridBytes = 4*size_t(width)*size_t(height)*sizeof(float);
       size_t rootBytes = time_steps*sizeof(BVH4::NodeRef);
-#if !defined(__X86_64__)
+#if !defined(__X86_64__) && !defined(__aarch64__)
       rootBytes += 4; // We read 2 elements behind the grid. As we store at least 8 root bytes after the grid we are fine in 64 bit mode. But in 32 bit mode we have to do additional padding.
 #endif
       void* data = alloc(offsetof(GridSOA,data)+bvhBytes+time_steps*gridBytes+rootBytes);
@@ -62,8 +62,8 @@ namespace embree
      __forceinline const BVH4::NodeRef& root(size_t t = 0) const { return (BVH4::NodeRef&)data[rootOffset + t*sizeof(BVH4::NodeRef)]; }
 
      /*! returns pointer to BVH array */
-     __forceinline char* bvhData() { return &data[0]; }
-     __forceinline const char* bvhData() const { return &data[0]; }
+     __forceinline int8_t* bvhData() { return &data[0]; }
+     __forceinline const int8_t* bvhData() const { return &data[0]; }
 
      /*!
returns pointer to Grid array */ __forceinline float* gridData(size_t t = 0) { return (float*) &data[gridOffset + t*gridBytes]; } @@ -253,7 +253,7 @@ namespace embree public: BVH4::NodeRef troot; -#if !defined(__X86_64__) +#if !defined(__X86_64__) && !defined(__aarch64__) unsigned align1; #endif unsigned time_steps; @@ -269,7 +269,7 @@ namespace embree unsigned gridBytes; unsigned rootOffset; - char data[1]; //!< after the struct we first store the BVH, then the grid, and finally the roots + int8_t data[1]; //!< after the struct we first store the BVH, then the grid, and finally the roots }; } } diff --git a/thirdparty/embree/kernels/hash.h b/thirdparty/embree/kernels/hash.h index f11598b7ab89..4abbe203d6bb 100644 --- a/thirdparty/embree/kernels/hash.h +++ b/thirdparty/embree/kernels/hash.h @@ -2,4 +2,4 @@ // Copyright 2009-2020 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#define RTC_HASH "69bd4c272f1ed608494f233ecfff3feec516880b" +#define RTC_HASH "6ef362f99af80c9dfe8dd2bfc582d9067897edc6" diff --git a/thirdparty/embree/kernels/subdiv/tessellation_cache.h b/thirdparty/embree/kernels/subdiv/tessellation_cache.h index 116b4db88bc0..5c215288b637 100644 --- a/thirdparty/embree/kernels/subdiv/tessellation_cache.h +++ b/thirdparty/embree/kernels/subdiv/tessellation_cache.h @@ -63,7 +63,7 @@ namespace embree static const size_t NUM_CACHE_SEGMENTS = 8; static const size_t NUM_PREALLOC_THREAD_WORK_STATES = 512; static const size_t COMMIT_INDEX_SHIFT = 32+8; -#if defined(__X86_64__) +#if defined(__X86_64__) || defined(__aarch64__) static const size_t REF_TAG_MASK = 0xffffffffff; #else static const size_t REF_TAG_MASK = 0x7FFFFFFF; diff --git a/thirdparty/embree/patches/godot-changes.patch b/thirdparty/embree/patches/godot-changes.patch index 6ccfd8eb1214..79a60ce3a593 100644 --- a/thirdparty/embree/patches/godot-changes.patch +++ b/thirdparty/embree/patches/godot-changes.patch @@ -1,215 +1,630 @@ -diff --git a/common/math/math.h b/common/math/math.h -index 5af0691a2..1982c27c1 100644 ---- a/common/math/math.h -+++ b/common/math/math.h -@@ -13,7 +13,7 @@ - #include +diff --git a/thirdparty/embree/common/algorithms/parallel_for.h b/thirdparty/embree/common/algorithms/parallel_for.h +index 76c6b740aa..51d296fb16 100644 +--- a/thirdparty/embree/common/algorithms/parallel_for.h ++++ b/thirdparty/embree/common/algorithms/parallel_for.h +@@ -27,7 +27,10 @@ namespace embree + func(r.begin()); + }); + if (!TaskScheduler::wait()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // throw std::runtime_error("task cancelled"); ++ abort(); ++ // -- GODOT end -- + } + #elif defined(TASKING_GCD) && defined(BUILD_IOS) + +@@ -55,13 +58,19 @@ namespace embree + func(i); + },context); + if (context.is_group_execution_cancelled()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // throw std::runtime_error("task cancelled"); ++ abort(); ++ // -- GODOT end -- + #else + tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + }); + if (tbb::task::self().is_cancelled()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // throw std::runtime_error("task cancelled"); ++ abort(); ++ // -- GODOT end -- + #endif - #if defined(__WIN32__) --#if (__MSV_VER <= 1700) -+#if defined(_MSC_VER) && (_MSC_VER <= 1700) - namespace std - { - __forceinline bool isinf ( const float x ) { return _finite(x) == 0; } -@@ -86,7 +86,7 @@ - return _mm_cvtss_f32(c); + #elif defined(TASKING_PPL) +@@ -81,7 +90,10 @@ namespace embree + #if 
defined(TASKING_INTERNAL) + TaskScheduler::spawn(first,last,minStepSize,func); + if (!TaskScheduler::wait()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // throw std::runtime_error("task cancelled"); ++ abort(); ++ // -- GODOT end -- + + #elif defined(TASKING_GCD) && defined(BUILD_IOS) + +@@ -109,13 +121,19 @@ namespace embree + func(range(r.begin(),r.end())); + },context); + if (context.is_group_execution_cancelled()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // throw std::runtime_error("task cancelled"); ++ abort(); ++ // -- GODOT end -- + #else + tbb::parallel_for(tbb::blocked_range(first,last,minStepSize),[&](const tbb::blocked_range& r) { + func(range(r.begin(),r.end())); + }); + if (tbb::task::self().is_cancelled()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // throw std::runtime_error("task cancelled"); ++ abort(); ++ // -- GODOT end -- + #endif + + #elif defined(TASKING_PPL) +@@ -147,13 +165,19 @@ namespace embree + func(i); + },tbb::simple_partitioner(),context); + if (context.is_group_execution_cancelled()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // throw std::runtime_error("task cancelled"); ++ abort(); ++ // -- GODOT end -- + #else + tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + },tbb::simple_partitioner()); + if (tbb::task::self().is_cancelled()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // throw std::runtime_error("task cancelled"); ++ abort(); ++ // -- GODOT end -- + #endif } --#if defined(__WIN32__) && (__MSC_VER <= 1700) -+#if defined(__WIN32__) && defined(_MSC_VER) && (_MSC_VER <= 1700) - __forceinline float nextafter(float x, float y) { if ((x0)) return x*(1.1f+float(ulp)); else return x*(0.9f-float(ulp)); } - __forceinline double nextafter(double x, double y) { return _nextafter(x, y); } - __forceinline int roundf(float f) { return (int)(f + 0.5f); } -diff --git a/common/sys/intrinsics.h b/common/sys/intrinsics.h -index 3f0619cac..58f5c3bb4 100644 ---- a/common/sys/intrinsics.h -+++ b/common/sys/intrinsics.h -@@ -11,6 +11,12 @@ +@@ -168,13 +192,19 @@ namespace embree + func(i); + },ap,context); + if (context.is_group_execution_cancelled()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // throw std::runtime_error("task cancelled"); ++ abort(); ++ // -- GODOT end -- + #else + tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + },ap); + if (tbb::task::self().is_cancelled()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // throw std::runtime_error("task cancelled"); ++ abort(); ++ // -- GODOT end -- + #endif + } - #include +diff --git a/thirdparty/embree/common/algorithms/parallel_reduce.h b/thirdparty/embree/common/algorithms/parallel_reduce.h +index d444b6a2e4..0daf94e50e 100644 +--- a/thirdparty/embree/common/algorithms/parallel_reduce.h ++++ b/thirdparty/embree/common/algorithms/parallel_reduce.h +@@ -58,15 +58,19 @@ namespace embree + const Value v = tbb::parallel_reduce(tbb::blocked_range(first,last,minStepSize),identity, + [&](const tbb::blocked_range& r, const Value& start) { return reduction(start,func(range(r.begin(),r.end()))); }, + reduction,context); +- if (context.is_group_execution_cancelled()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // if (context.is_group_execution_cancelled()) ++ // throw std::runtime_error("task cancelled"); ++ // -- GODOT end -- + return v; + #else + const 
Value v = tbb::parallel_reduce(tbb::blocked_range(first,last,minStepSize),identity, + [&](const tbb::blocked_range& r, const Value& start) { return reduction(start,func(range(r.begin(),r.end()))); }, + reduction); +- if (tbb::task::self().is_cancelled()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // if (tbb::task::self().is_cancelled()) ++ // throw std::runtime_error("task cancelled"); ++ // -- GODOT end -- + return v; + #endif + #else // TASKING_PPL +diff --git a/thirdparty/embree/common/lexers/stringstream.cpp b/thirdparty/embree/common/lexers/stringstream.cpp +index 7e7b9faef8..98dc80ad59 100644 +--- a/thirdparty/embree/common/lexers/stringstream.cpp ++++ b/thirdparty/embree/common/lexers/stringstream.cpp +@@ -39,7 +39,10 @@ namespace embree + std::vector str; str.reserve(64); + while (cin->peek() != EOF && !isSeparator(cin->peek())) { + int c = cin->get(); +- if (!isValidChar(c)) throw std::runtime_error("invalid character "+std::string(1,c)+" in input"); ++ // -- GODOT start -- ++ // if (!isValidChar(c)) throw std::runtime_error("invalid character "+std::string(1,c)+" in input"); ++ if (!isValidChar(c)) abort(); ++ // -- GODOT end -- + str.push_back((char)c); + } + str.push_back(0); +diff --git a/thirdparty/embree/common/sys/alloc.cpp b/thirdparty/embree/common/sys/alloc.cpp +index 4e8928242e..12f143f131 100644 +--- a/thirdparty/embree/common/sys/alloc.cpp ++++ b/thirdparty/embree/common/sys/alloc.cpp +@@ -21,7 +21,10 @@ namespace embree + void* ptr = _mm_malloc(size,align); -+// -- GODOT start -- -+#if defined(__WIN32__) && defined(__MINGW32__) -+#include -+#endif -+// -- GODOT end -- -+ - #if defined(__BMI__) && defined(__GNUC__) && !defined(__INTEL_COMPILER) - #if !defined(_tzcnt_u32) - #define _tzcnt_u32 __tzcnt_u32 -@@ -30,8 +36,14 @@ - #endif + if (size != 0 && ptr == nullptr) +- throw std::bad_alloc(); ++ // -- GODOT start -- ++ // throw std::bad_alloc(); ++ abort(); ++ // -- GODOT end -- + + return ptr; + } +@@ -128,7 +131,10 @@ namespace embree + /* fall back to 4k pages */ + int flags = MEM_COMMIT | MEM_RESERVE; + char* ptr = (char*) VirtualAlloc(nullptr,bytes,flags,PAGE_READWRITE); +- if (ptr == nullptr) throw std::bad_alloc(); ++ // -- GODOT start -- ++ // if (ptr == nullptr) throw std::bad_alloc(); ++ if (ptr == nullptr) abort(); ++ // -- GODOT end -- + hugepages = false; + return ptr; + } +@@ -145,7 +151,10 @@ namespace embree + return bytesOld; - #if defined(__WIN32__) --# define NOMINMAX --# include -+// -- GODOT start -- -+#if !defined(NOMINMAX) -+// -- GODOT end -- -+#define NOMINMAX -+// -- GODOT start -- -+#endif -+#include "windows.h" -+// -- GODOT end -- + if (!VirtualFree((char*)ptr+bytesNew,bytesOld-bytesNew,MEM_DECOMMIT)) +- throw std::bad_alloc(); ++ // -- GODOT start -- ++ // throw std::bad_alloc(); ++ abort(); ++ // -- GODOT end -- + + return bytesNew; + } +@@ -156,7 +165,10 @@ namespace embree + return; + + if (!VirtualFree(ptr,0,MEM_RELEASE)) +- throw std::bad_alloc(); ++ // -- GODOT start -- ++ // throw std::bad_alloc(); ++ abort(); ++ // -- GODOT end -- + } + + void os_advise(void *ptr, size_t bytes) +@@ -260,7 +272,10 @@ namespace embree + + /* fallback to 4k pages */ + void* ptr = (char*) mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); +- if (ptr == MAP_FAILED) throw std::bad_alloc(); ++ // -- GODOT start -- ++ // if (ptr == MAP_FAILED) throw std::bad_alloc(); ++ if (ptr == MAP_FAILED) abort(); ++ // -- GODOT end -- + hugepages = false; + + /* advise huge page hint for THP */ +@@ -277,7 +292,10 @@ 
namespace embree + return bytesOld; + + if (munmap((char*)ptr+bytesNew,bytesOld-bytesNew) == -1) +- throw std::bad_alloc(); ++ // -- GODOT start -- ++ // throw std::bad_alloc(); ++ abort(); ++ // -- GODOT end -- + + return bytesNew; + } +@@ -291,7 +309,10 @@ namespace embree + const size_t pageSize = hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K; + bytes = (bytes+pageSize-1) & ~(pageSize-1); + if (munmap(ptr,bytes) == -1) +- throw std::bad_alloc(); ++ // -- GODOT start -- ++ // throw std::bad_alloc(); ++ abort(); ++ // -- GODOT end -- + } + + /* hint for transparent huge pages (THP) */ +diff --git a/thirdparty/embree/common/sys/platform.h b/thirdparty/embree/common/sys/platform.h +index 7914eb7a52..737f14aa6e 100644 +--- a/thirdparty/embree/common/sys/platform.h ++++ b/thirdparty/embree/common/sys/platform.h +@@ -174,11 +174,19 @@ + #define PRINT4(x,y,z,w) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << embree_endl + + #if defined(DEBUG) // only report file and line in debug mode ++ // -- GODOT start -- ++ // #define THROW_RUNTIME_ERROR(str) ++ // throw std::runtime_error(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); + #define THROW_RUNTIME_ERROR(str) \ +- throw std::runtime_error(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); ++ printf(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)), abort(); ++ // -- GODOT end -- + #else ++ // -- GODOT start -- ++ // #define THROW_RUNTIME_ERROR(str) ++ // throw std::runtime_error(str); + #define THROW_RUNTIME_ERROR(str) \ +- throw std::runtime_error(str); ++ abort(); ++ // -- GODOT end -- #endif - /* normally defined in pmmintrin.h, but we always need this */ -@@ -413,8 +425,16 @@ namespace embree - - __forceinline void pause_cpu(const size_t N = 8) - { -+// -- GODOT start -- - for (size_t i=0; icancellingException == nullptr) ++ // -- GODOT start -- ++ // try { ++ // if (thread.scheduler->cancellingException == nullptr) + closure->execute(); +- } catch (...) { +- if (thread.scheduler->cancellingException == nullptr) +- thread.scheduler->cancellingException = std::current_exception(); +- } ++ // } catch (...) 
{ ++ // if (thread.scheduler->cancellingException == nullptr) ++ // thread.scheduler->cancellingException = std::current_exception(); ++ // } ++ // -- GODOT end -- + thread.task = prevTask; + add_dependencies(-1); + } +@@ -297,8 +299,11 @@ namespace embree + size_t threadIndex = allocThreadIndex(); + condition.wait(mutex, [&] () { return hasRootTask.load(); }); + mutex.unlock(); +- std::exception_ptr except = thread_loop(threadIndex); +- if (except != nullptr) std::rethrow_exception(except); ++ // -- GODOT start -- ++ // std::exception_ptr except = thread_loop(threadIndex); ++ // if (except != nullptr) std::rethrow_exception(except); ++ thread_loop(threadIndex); ++ // -- GODOT end -- + } + + void TaskScheduler::reset() { +@@ -330,7 +335,10 @@ namespace embree + return thread->scheduler->cancellingException == nullptr; + } + +- std::exception_ptr TaskScheduler::thread_loop(size_t threadIndex) +// -- GODOT start -- -+#else -+ usleep(1); -+#endif ++// std::exception_ptr TaskScheduler::thread_loop(size_t threadIndex) ++ void TaskScheduler::thread_loop(size_t threadIndex) +// -- GODOT end -- - } - - /* prefetches */ -diff --git a/common/sys/library.cpp b/common/sys/library.cpp -index e448b195d..8ec918660 100644 ---- a/common/sys/library.cpp -+++ b/common/sys/library.cpp -@@ -27,7 +27,9 @@ namespace embree - - /* returns address of a symbol from the library */ - void* getSymbol(lib_t lib, const std::string& sym) { -- return GetProcAddress(HMODULE(lib),sym.c_str()); + { + /* allocate thread structure */ + std::unique_ptr mthread(new Thread(threadIndex,this)); // too large for stack allocation +@@ -353,9 +361,10 @@ namespace embree + swapThread(oldThread); + + /* remember exception to throw */ +- std::exception_ptr except = nullptr; +- if (cancellingException != nullptr) except = cancellingException; +- + // -- GODOT start -- -+ return (void*) GetProcAddress(HMODULE(lib),sym.c_str()); ++ // std::exception_ptr except = nullptr; ++ // if (cancellingException != nullptr) except = cancellingException; ++ // -- GODOT end -- + /* wait for all threads to terminate */ + threadCounter--; + #if defined(__WIN32__) +@@ -373,7 +382,10 @@ namespace embree + yield(); + #endif + } +- return except; ++ // -- GODOT start -- ++ // return except; ++ return; + // -- GODOT end -- } - /* closes the shared library */ -diff --git a/common/sys/mutex.h b/common/sys/mutex.h -index 1164210f2..f0f55340a 100644 ---- a/common/sys/mutex.h -+++ b/common/sys/mutex.h -@@ -47,8 +47,17 @@ namespace embree + bool TaskScheduler::steal_from_other_threads(Thread& thread) +diff --git a/thirdparty/embree/common/tasking/taskschedulerinternal.h b/thirdparty/embree/common/tasking/taskschedulerinternal.h +index c2a9391aea..8bd70b2b8c 100644 +--- a/thirdparty/embree/common/tasking/taskschedulerinternal.h ++++ b/thirdparty/embree/common/tasking/taskschedulerinternal.h +@@ -123,7 +123,10 @@ namespace embree { - while (flag.load()) - { -+// -- GODOT start -- -+#if !(defined (__WIN32__) && defined (__MINGW32__)) -+// -- GODOT end -- - _mm_pause(); - _mm_pause(); -+// -- GODOT start -- -+#else -+ __builtin_ia32_pause(); -+ __builtin_ia32_pause(); -+#endif -+// -- GODOT end -- - } - - bool expected = false; -@@ -74,8 +82,17 @@ namespace embree - { - while(flag.load()) + size_t ofs = bytes + ((align - stackPtr) & (align-1)); + if (stackPtr + ofs > CLOSURE_STACK_SIZE) +- throw std::runtime_error("closure stack overflow"); ++ // -- GODOT start -- ++ // throw std::runtime_error("closure stack overflow"); ++ abort(); ++ // -- GODOT end -- + stackPtr += 
ofs; + return &stack[stackPtr-bytes]; + } +@@ -132,7 +135,10 @@ namespace embree + __forceinline void push_right(Thread& thread, const size_t size, const Closure& closure) { -+// -- GODOT start -- -+#if !(defined (__WIN32__) && defined(__MINGW32__)) -+// -- GODOT end -- - _mm_pause(); - _mm_pause(); -+// -- GODOT start -- -+#else -+ __builtin_ia32_pause(); -+ __builtin_ia32_pause(); -+#endif -+// -- GODOT end -- + if (right >= TASK_STACK_SIZE) +- throw std::runtime_error("task stack overflow"); ++ // -- GODOT start -- ++ // throw std::runtime_error("task stack overflow"); ++ abort(); ++ // -- GODOT end -- + + /* allocate new task on right side of stack */ + size_t oldStackPtr = stackPtr; +@@ -239,7 +245,10 @@ namespace embree + void wait_for_threads(size_t threadCount); + + /*! thread loop for all worker threads */ +- std::exception_ptr thread_loop(size_t threadIndex); ++ // -- GODOT start -- ++ // std::exception_ptr thread_loop(size_t threadIndex); ++ void thread_loop(size_t threadIndex); ++ // -- GODOT end -- + + /*! steals a task from a different thread */ + bool steal_from_other_threads(Thread& thread); +diff --git a/thirdparty/embree/kernels/bvh/bvh_statistics.cpp b/thirdparty/embree/kernels/bvh/bvh_statistics.cpp +index 20cdd2d320..aa56035026 100644 +--- a/thirdparty/embree/kernels/bvh/bvh_statistics.cpp ++++ b/thirdparty/embree/kernels/bvh/bvh_statistics.cpp +@@ -150,7 +150,10 @@ namespace embree } } - -diff --git a/common/sys/platform.h b/common/sys/platform.h -index 96f9aab01..08617452f 100644 ---- a/common/sys/platform.h -+++ b/common/sys/platform.h -@@ -141,6 +141,9 @@ - #define DELETED = delete + else { +- throw std::runtime_error("not supported node type in bvh_statistics"); ++ // -- GODOT start -- ++ // throw std::runtime_error("not supported node type in bvh_statistics"); ++ abort(); ++ // -- GODOT end -- + } + return s; + } +diff --git a/thirdparty/embree/kernels/common/rtcore.cpp b/thirdparty/embree/kernels/common/rtcore.cpp +index ee5c37b238..625fbf6d4f 100644 +--- a/thirdparty/embree/kernels/common/rtcore.cpp ++++ b/thirdparty/embree/kernels/common/rtcore.cpp +@@ -230,7 +230,10 @@ RTC_NAMESPACE_BEGIN; + if (quality != RTC_BUILD_QUALITY_LOW && + quality != RTC_BUILD_QUALITY_MEDIUM && + quality != RTC_BUILD_QUALITY_HIGH) +- throw std::runtime_error("invalid build quality"); ++ // -- GODOT start -- ++ // throw std::runtime_error("invalid build quality"); ++ abort(); ++ // -- GODOT end -- + scene->setBuildQuality(quality); + RTC_CATCH_END2(scene); + } +@@ -1383,7 +1386,10 @@ RTC_NAMESPACE_BEGIN; + quality != RTC_BUILD_QUALITY_MEDIUM && + quality != RTC_BUILD_QUALITY_HIGH && + quality != RTC_BUILD_QUALITY_REFIT) +- throw std::runtime_error("invalid build quality"); ++ // -- GODOT start -- ++ // throw std::runtime_error("invalid build quality"); ++ abort(); ++ // -- GODOT end -- + geometry->setBuildQuality(quality); + RTC_CATCH_END2(geometry); + } +diff --git a/thirdparty/embree/kernels/common/rtcore.h b/thirdparty/embree/kernels/common/rtcore.h +index 6583d12d57..4b070e122b 100644 +--- a/thirdparty/embree/kernels/common/rtcore.h ++++ b/thirdparty/embree/kernels/common/rtcore.h +@@ -25,52 +25,58 @@ namespace embree #endif + /*! 
Macros used in the rtcore API implementation */ +-#define RTC_CATCH_BEGIN try { +// -- GODOT start -- -+#if !defined(likely) -+// -- GODOT end -- - #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) - #define likely(expr) (expr) - #define unlikely(expr) (expr) -@@ -148,6 +151,9 @@ - #define likely(expr) __builtin_expect((bool)(expr),true ) - #define unlikely(expr) __builtin_expect((bool)(expr),false) - #endif -+// -- GODOT start -- -+#endif ++// #define RTC_CATCH_BEGIN try { ++#define RTC_CATCH_BEGIN + +-#define RTC_CATCH_END(device) \ +- } catch (std::bad_alloc&) { \ +- Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ +- } catch (rtcore_error& e) { \ +- Device::process_error(device,e.error,e.what()); \ +- } catch (std::exception& e) { \ +- Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ +- } catch (...) { \ +- Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ +- } ++// #define RTC_CATCH_END(device) \ ++// } catch (std::bad_alloc&) { \ ++// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ ++// } catch (rtcore_error& e) { \ ++// Device::process_error(device,e.error,e.what()); \ ++// } catch (std::exception& e) { \ ++// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ ++// } catch (...) { \ ++// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ ++// } ++#define RTC_CATCH_END(device) + +-#define RTC_CATCH_END2(scene) \ +- } catch (std::bad_alloc&) { \ +- Device* device = scene ? scene->device : nullptr; \ +- Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ +- } catch (rtcore_error& e) { \ +- Device* device = scene ? scene->device : nullptr; \ +- Device::process_error(device,e.error,e.what()); \ +- } catch (std::exception& e) { \ +- Device* device = scene ? scene->device : nullptr; \ +- Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ +- } catch (...) { \ +- Device* device = scene ? scene->device : nullptr; \ +- Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ +- } ++// #define RTC_CATCH_END2(scene) \ ++// } catch (std::bad_alloc&) { \ ++// Device* device = scene ? scene->device : nullptr; \ ++// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ ++// } catch (rtcore_error& e) { \ ++// Device* device = scene ? scene->device : nullptr; \ ++// Device::process_error(device,e.error,e.what()); \ ++// } catch (std::exception& e) { \ ++// Device* device = scene ? scene->device : nullptr; \ ++// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ ++// } catch (...) { \ ++// Device* device = scene ? scene->device : nullptr; \ ++// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ ++// } ++#define RTC_CATCH_END2(scene) + +-#define RTC_CATCH_END2_FALSE(scene) \ +- } catch (std::bad_alloc&) { \ +- Device* device = scene ? scene->device : nullptr; \ +- Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ +- return false; \ +- } catch (rtcore_error& e) { \ +- Device* device = scene ? scene->device : nullptr; \ +- Device::process_error(device,e.error,e.what()); \ +- return false; \ +- } catch (std::exception& e) { \ +- Device* device = scene ? scene->device : nullptr; \ +- Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ +- return false; \ +- } catch (...) { \ +- Device* device = scene ? 
scene->device : nullptr; \ +- Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ +- return false; \ +- } ++// #define RTC_CATCH_END2_FALSE(scene) \ ++// } catch (std::bad_alloc&) { \ ++// Device* device = scene ? scene->device : nullptr; \ ++// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ ++// return false; \ ++// } catch (rtcore_error& e) { \ ++// Device* device = scene ? scene->device : nullptr; \ ++// Device::process_error(device,e.error,e.what()); \ ++// return false; \ ++// } catch (std::exception& e) { \ ++// Device* device = scene ? scene->device : nullptr; \ ++// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ ++// return false; \ ++// } catch (...) { \ ++// Device* device = scene ? scene->device : nullptr; \ ++// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ ++// return false; \ ++// } ++#define RTC_CATCH_END2_FALSE(scene) return false; +// -- GODOT end -- - //////////////////////////////////////////////////////////////////////////////// - /// Error handling and debugging -diff --git a/common/sys/sysinfo.cpp b/common/sys/sysinfo.cpp -index eb0a10eaf..74438260d 100644 ---- a/common/sys/sysinfo.cpp -+++ b/common/sys/sysinfo.cpp -@@ -233,7 +233,7 @@ namespace embree + #define RTC_VERIFY_HANDLE(handle) \ + if (handle == nullptr) { \ +@@ -97,28 +103,38 @@ namespace embree + #define RTC_TRACE(x) + #endif - __noinline int64_t get_xcr0() - { --#if defined (__WIN32__) -+#if defined (__WIN32__) /* -- GODOT start -- */ && !defined (__MINGW32__) /* -- GODOT end -- */ - int64_t xcr0 = 0; // int64_t is workaround for compiler bug under VS2013, Win32 - xcr0 = _xgetbv(0); - return xcr0; -diff --git a/common/tasking/taskschedulerinternal.cpp b/common/tasking/taskschedulerinternal.cpp -index 2152e92f4..923d62f83 100644 ---- a/common/tasking/taskschedulerinternal.cpp -+++ b/common/tasking/taskschedulerinternal.cpp -@@ -361,7 +361,15 @@ namespace embree - if ((loopIndex % LOOP_YIELD_THRESHOLD) == 0) - yield(); - else -+// -- GODOT start -- -+#if !defined(__MINGW32__) +- /*! used to throw embree API errors */ +- struct rtcore_error : public std::exception +- { +- __forceinline rtcore_error(RTCError error, const std::string& str) +- : error(error), str(str) {} +- +- ~rtcore_error() throw() {} +- +- const char* what () const throw () { +- return str.c_str(); +- } +- +- RTCError error; +- std::string str; +- }; ++// -- GODOT begin -- ++// /*! 
used to throw embree API errors */ ++// struct rtcore_error : public std::exception ++// { ++// __forceinline rtcore_error(RTCError error, const std::string& str) ++// : error(error), str(str) {} ++// ++// ~rtcore_error() throw() {} ++// ++// const char* what () const throw () { ++// return str.c_str(); ++// } ++// ++// RTCError error; ++// std::string str; ++// }; +// -- GODOT end -- - _mm_pause(); -+// -- GODOT start -- -+#else -+ usleep(1); -+#endif -+// -- GODOT end -- - loopIndex++; - #else - yield(); -diff --git a/common/tasking/taskschedulertbb.h b/common/tasking/taskschedulertbb.h -index 98dba2687..369e5edf0 100644 ---- a/common/tasking/taskschedulertbb.h -+++ b/common/tasking/taskschedulertbb.h -@@ -12,7 +12,13 @@ - #include "../sys/ref.h" - #if defined(__WIN32__) -+// -- GODOT start -- -+#if !defined(NOMINMAX) -+// -- GODOT end -- - # define NOMINMAX -+// -- GODOT start -- -+#endif -+// -- GODOT end -- + #if defined(DEBUG) // only report file and line in debug mode ++ // -- GODOT begin -- ++ // #define throw_RTCError(error,str) \ ++ // throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); + #define throw_RTCError(error,str) \ +- throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); ++ printf(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)), abort(); ++ // -- GODOT end -- + #else ++ // -- GODOT begin -- ++ // #define throw_RTCError(error,str) \ ++ // throw rtcore_error(error,str); + #define throw_RTCError(error,str) \ +- throw rtcore_error(error,str); ++ abort(); ++ // -- GODOT end -- #endif - // We need to define these to avoid implicit linkage against -diff a/include/embree3/rtcore_common.h b/include/embree3/rtcore_common.h ---- a/include/embree3/rtcore_common.h -+++ b/include/embree3/rtcore_common.h -@@ -19,7 +19,7 @@ - #endif - #endif + #define RTC_BUILD_ARGUMENTS_HAS(settings,member) \ +diff --git a/thirdparty/embree/kernels/common/scene.cpp b/thirdparty/embree/kernels/common/scene.cpp +index e75aa968f9..1e23aeb415 100644 +--- a/thirdparty/embree/kernels/common/scene.cpp ++++ b/thirdparty/embree/kernels/common/scene.cpp +@@ -800,16 +800,18 @@ namespace embree + } --#ifdef _WIN32 -+#if defined(_WIN32) && defined(_MSC_VER) - # define RTC_ALIGN(...) __declspec(align(__VA_ARGS__)) - #else - # define RTC_ALIGN(...) __attribute__((aligned(__VA_ARGS__))) + /* initiate build */ +- try { ++ // -- GODOT start -- ++ // try { + scheduler->spawn_root([&]() { commit_task(); Lock lock(schedulerMutex); this->scheduler = nullptr; }, 1, !join); +- } +- catch (...) { +- accels_clear(); +- updateInterface(); +- Lock lock(schedulerMutex); +- this->scheduler = nullptr; +- throw; +- } ++ // } ++ // catch (...) { ++ // accels_clear(); ++ // updateInterface(); ++ // Lock lock(schedulerMutex); ++ // this->scheduler = nullptr; ++ // throw; ++ // } ++ // -- GODOT end -- + } + + #endif From b97b37f91c2efb53211fded07e44cd66887743e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Verschelde?= Date: Wed, 5 May 2021 18:01:39 +0200 Subject: [PATCH 57/84] SCons: Disable embree-based modules on x86 (32-bit) Fixes #48482. 
(cherry picked from commit e53422c8f96770c9a9b7497955c84f4b742fdd73) --- modules/lightmapper_cpu/config.py | 5 ++++- modules/raycast/config.py | 8 +++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/modules/lightmapper_cpu/config.py b/modules/lightmapper_cpu/config.py index 96efd47d9fb2..a3a33b3443e3 100644 --- a/modules/lightmapper_cpu/config.py +++ b/modules/lightmapper_cpu/config.py @@ -7,11 +7,14 @@ def can_build(env, platform): # solution. if platform == "android": - return env["android_arch"] in ["arm64v8", "x86", "x86_64"] + return env["android_arch"] in ["arm64v8", "x86_64"] if platform in ["javascript", "server"]: return False + if env["bits"] == "32": + return False + return True diff --git a/modules/raycast/config.py b/modules/raycast/config.py index a2692d3612ed..6ea8e0a5de7f 100644 --- a/modules/raycast/config.py +++ b/modules/raycast/config.py @@ -2,12 +2,18 @@ def can_build(env, platform): if not env["tools"]: return False + # Depends on Embree library, which supports only x86_64 (originally) + # and aarch64 (thanks to the embree-aarch64 fork). + if platform == "android": - return env["android_arch"] in ["arm64v8", "x86", "x86_64"] + return env["android_arch"] in ["arm64v8", "x86_64"] if platform in ["javascript", "server"]: return False + if env["bits"] == "32": + return False + return True From 6a84390cd62170bafbdc8c2188feb9912618b96a Mon Sep 17 00:00:00 2001 From: JFonS Date: Wed, 5 May 2021 18:24:13 +0200 Subject: [PATCH 58/84] Add checks for __SSE2__ in the lightmap raycaster (cherry picked from commit 20717990fd2a7ad300fd9c6fab0394f25e3b7294) --- modules/raycast/lightmap_raycaster.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/modules/raycast/lightmap_raycaster.cpp b/modules/raycast/lightmap_raycaster.cpp index 6f51ff582f83..5cfd976e33af 100644 --- a/modules/raycast/lightmap_raycaster.cpp +++ b/modules/raycast/lightmap_raycaster.cpp @@ -30,7 +30,9 @@ #include "lightmap_raycaster.h" +#ifdef __SSE2__ #include +#endif LightmapRaycaster *LightmapRaycasterEmbree::create_embree_raycaster() { return memnew(LightmapRaycasterEmbree); @@ -180,8 +182,10 @@ void embree_error_handler(void *p_user_data, RTCError p_code, const char *p_str) } LightmapRaycasterEmbree::LightmapRaycasterEmbree() { +#ifdef __SSE2__ _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); +#endif embree_device = rtcNewDevice(nullptr); rtcSetDeviceErrorFunction(embree_device, &embree_error_handler, nullptr); @@ -189,8 +193,10 @@ LightmapRaycasterEmbree::LightmapRaycasterEmbree() { } LightmapRaycasterEmbree::~LightmapRaycasterEmbree() { +#ifdef __SSE2__ _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF); _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF); +#endif if (embree_scene != nullptr) { rtcReleaseScene(embree_scene); From ba4f15b37618a334a81edf39abe70c3e21832afe Mon Sep 17 00:00:00 2001 From: Brian Semrau Date: Tue, 4 May 2021 10:48:58 -0400 Subject: [PATCH 59/84] Batching fix polygon basis polarity Changes based on fix in #46898 (cherry picked from commit 57e3f357dc4f7fd0fd774c6a4f3ce0cc22e8bc8b) --- drivers/gles_common/rasterizer_canvas_batcher.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/gles_common/rasterizer_canvas_batcher.h b/drivers/gles_common/rasterizer_canvas_batcher.h index b4c68c243590..4b55a15564a8 100644 --- a/drivers/gles_common/rasterizer_canvas_batcher.h +++ b/drivers/gles_common/rasterizer_canvas_batcher.h @@ -1661,9 +1661,8 @@ bool 
C_PREAMBLE::_prefill_polygon(RasterizerCanvas::Item::CommandPolygon *p_poly const Transform2D &tr = r_fill_state.transform_combined; pBT[0].translate.set(tr.elements[2]); - // could do swizzling in shader? - pBT[0].basis[0].set(tr.elements[0][0], tr.elements[1][0]); - pBT[0].basis[1].set(tr.elements[0][1], tr.elements[1][1]); + pBT[0].basis[0].set(tr.elements[0][0], tr.elements[0][1]); + pBT[0].basis[1].set(tr.elements[1][0], tr.elements[1][1]); } //////////////////////////////////// From b4529c7e8db41eefdf321a5623ff9b3afee95d37 Mon Sep 17 00:00:00 2001 From: CaptainProton42 Date: Fri, 19 Mar 2021 20:54:29 +0100 Subject: [PATCH 60/84] Fix 3D scene preview generation. File system dock previews will now be generated for 3D scenes when no editor feature profile is set. (cherry picked from commit 16304aaa3b381cfe391acbb6ab884e3f9596bff1) --- editor/editor_node.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/editor/editor_node.cpp b/editor/editor_node.cpp index b9eeec508c5e..db904bbc70ed 100644 --- a/editor/editor_node.cpp +++ b/editor/editor_node.cpp @@ -1321,8 +1321,9 @@ void EditorNode::_save_scene_with_preview(String p_file, int p_idx) { } else { // The 3D editor may be disabled as a feature, but scenes can still be opened. // This check prevents the preview from regenerating in case those scenes are then saved. + // The preview will be generated if no feature profile is set (as the 3D editor is enabled by default). Ref profile = feature_profile_manager->get_current_profile(); - if (profile.is_valid() && !profile->is_feature_disabled(EditorFeatureProfile::FEATURE_3D)) { + if (!profile.is_valid() || !profile->is_feature_disabled(EditorFeatureProfile::FEATURE_3D)) { img = SpatialEditor::get_singleton()->get_editor_viewport(0)->get_viewport_node()->get_texture()->get_data(); } } From b1cb84b452e6b41b781b8e361782938de7d59a60 Mon Sep 17 00:00:00 2001 From: Hugo Locurcio Date: Thu, 29 Apr 2021 00:47:16 +0200 Subject: [PATCH 61/84] Document that `File.open_compressed()` can only open files saved by Godot (cherry picked from commit 5f098d6db6da62deb77d6d555ab67d6cb15f80c2) --- doc/classes/File.xml | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/classes/File.xml b/doc/classes/File.xml index d98765ea10be..09181b7770e9 100644 --- a/doc/classes/File.xml +++ b/doc/classes/File.xml @@ -255,6 +255,7 @@ Opens a compressed file for reading or writing. + [b]Note:[/b] [method open_compressed] can only read files that were saved by Godot, not third-party compression formats. See [url=https://github.com/godotengine/godot/issues/28999]GitHub issue #28999[/url] for a workaround. From 7c5633c032d472323fee1bc47d723b06927fb9ac Mon Sep 17 00:00:00 2001 From: PouleyKetchoupp Date: Thu, 29 Apr 2021 18:20:29 -0700 Subject: [PATCH 62/84] Expose get_debug_mesh in Shape to scripting API Can be useful for custom drawing of physics shapes without having to add a collision object node to the tree. (cherry picked from commit 0ba5001fb62bc0330b9e29c70694ef13e01f22a3) --- doc/classes/Shape.xml | 7 +++++++ scene/resources/shape.cpp | 2 ++ 2 files changed, 9 insertions(+) diff --git a/doc/classes/Shape.xml b/doc/classes/Shape.xml index a728df1eb829..66ce155d52fc 100644 --- a/doc/classes/Shape.xml +++ b/doc/classes/Shape.xml @@ -10,6 +10,13 @@ https://docs.godotengine.org/en/3.3/tutorials/physics/physics_introduction.html + + + + + Returns the [ArrayMesh] used to draw the debug collision for this [Shape]. 
+ + diff --git a/scene/resources/shape.cpp b/scene/resources/shape.cpp index 3361016de5bc..849243757688 100644 --- a/scene/resources/shape.cpp +++ b/scene/resources/shape.cpp @@ -106,6 +106,8 @@ void Shape::_bind_methods() { ClassDB::bind_method(D_METHOD("set_margin", "margin"), &Shape::set_margin); ClassDB::bind_method(D_METHOD("get_margin"), &Shape::get_margin); + ClassDB::bind_method(D_METHOD("get_debug_mesh"), &Shape::get_debug_mesh); + ADD_PROPERTY(PropertyInfo(Variant::REAL, "margin", PROPERTY_HINT_RANGE, "0.001,10,0.001"), "set_margin", "get_margin"); } From 7e22dfdd05dec9ac9fd61f5a2d4586f87200ea75 Mon Sep 17 00:00:00 2001 From: MaxStgs Date: Sat, 1 May 2021 23:01:23 +0500 Subject: [PATCH 63/84] Add PackedDataContainer data pointer check for non nullable (cherry picked from commit 94d0c4182ba8bd166c96c200573891ea76000738) --- core/packed_data_container.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/core/packed_data_container.cpp b/core/packed_data_container.cpp index 3fcdf37b806f..10dfe3aca3b2 100644 --- a/core/packed_data_container.cpp +++ b/core/packed_data_container.cpp @@ -128,6 +128,7 @@ Variant PackedDataContainer::_get_at_ofs(uint32_t p_ofs, const uint8_t *p_buf, b uint32_t PackedDataContainer::_type_at_ofs(uint32_t p_ofs) const { PoolVector::Read rd = data.read(); + ERR_FAIL_COND_V(!rd.ptr(), 0); const uint8_t *r = &rd[p_ofs]; uint32_t type = decode_uint32(r); @@ -158,6 +159,10 @@ int PackedDataContainer::_size(uint32_t p_ofs) const { Variant PackedDataContainer::_key_at_ofs(uint32_t p_ofs, const Variant &p_key, bool &err) const { PoolVector::Read rd = data.read(); + if (!rd.ptr()) { + err = true; + ERR_FAIL_COND_V(!rd.ptr(), Variant()); + } const uint8_t *r = &rd[p_ofs]; uint32_t type = decode_uint32(r); From 476bc5191bafcdea99e918100394479c06f0e5b4 Mon Sep 17 00:00:00 2001 From: kobewi Date: Fri, 26 Mar 2021 14:10:41 +0100 Subject: [PATCH 64/84] Save project after opening (cherry picked from commit 76240515d8a4def96eb1a409259ce70fa653ae73) --- editor/editor_node.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/editor/editor_node.cpp b/editor/editor_node.cpp index db904bbc70ed..7d242da6ed0d 100644 --- a/editor/editor_node.cpp +++ b/editor/editor_node.cpp @@ -509,6 +509,9 @@ void EditorNode::_notification(int p_what) { _update_debug_options(); + // Save the project after opening to mark it as last modified. 
+ ProjectSettings::get_singleton()->save(); + /* DO NOT LOAD SCENES HERE, WAIT FOR FILE SCANNING AND REIMPORT TO COMPLETE */ } break; From edd63aeefaa8aba7a61f9663c6f8b3d148a5ce83 Mon Sep 17 00:00:00 2001 From: rafallus Date: Sun, 11 Apr 2021 17:06:27 -0500 Subject: [PATCH 65/84] Check input mesh is valid in SurfaceTool methods (cherry picked from commit 0ad0f71ba0d564a8383197ef2febb45d3506758f) --- scene/resources/surface_tool.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scene/resources/surface_tool.cpp b/scene/resources/surface_tool.cpp index a8c4a74e7c6d..ccf9a7b89bba 100644 --- a/scene/resources/surface_tool.cpp +++ b/scene/resources/surface_tool.cpp @@ -526,7 +526,7 @@ void SurfaceTool::deindex() { } void SurfaceTool::_create_list(const Ref &p_existing, int p_surface, List *r_vertex, List *r_index, int &lformat) { - + ERR_FAIL_COND_MSG(p_existing.is_null(), "First argument in SurfaceTool::_create_list() must be a valid object of type Mesh"); Array arr = p_existing->surface_get_arrays(p_surface); ERR_FAIL_COND(arr.size() != VS::ARRAY_MAX); _create_list_from_arrays(arr, r_vertex, r_index, lformat); @@ -763,7 +763,7 @@ void SurfaceTool::create_from_triangle_arrays(const Array &p_arrays) { } void SurfaceTool::create_from(const Ref &p_existing, int p_surface) { - + ERR_FAIL_COND_MSG(p_existing.is_null(), "First argument in SurfaceTool::create_from() must be a valid object of type Mesh"); clear(); primitive = p_existing->surface_get_primitive_type(p_surface); _create_list(p_existing, p_surface, &vertex_array, &index_array, format); @@ -771,6 +771,7 @@ void SurfaceTool::create_from(const Ref &p_existing, int p_surface) { } void SurfaceTool::create_from_blend_shape(const Ref &p_existing, int p_surface, const String &p_blend_shape_name) { + ERR_FAIL_COND_MSG(p_existing.is_null(), "First argument in SurfaceTool::create_from_blend_shape() must be a valid object of type Mesh"); clear(); primitive = p_existing->surface_get_primitive_type(p_surface); Array arr = p_existing->surface_get_blend_shape_arrays(p_surface); @@ -791,7 +792,7 @@ void SurfaceTool::create_from_blend_shape(const Ref &p_existing, int p_sur } void SurfaceTool::append_from(const Ref &p_existing, int p_surface, const Transform &p_xform) { - + ERR_FAIL_COND_MSG(p_existing.is_null(), "First argument in SurfaceTool::append_from() must be a valid object of type Mesh"); if (vertex_array.size() == 0) { primitive = p_existing->surface_get_primitive_type(p_surface); format = 0; From feaf4e620745547764ad19f953ab2ca996be995a Mon Sep 17 00:00:00 2001 From: Lightning_A Date: Thu, 22 Apr 2021 18:17:01 -0600 Subject: [PATCH 66/84] Fix Array.max() navigating to @GDScript.max() etc. 
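For illustration only (not part of the patch): a minimal GDScript snippet showing the two distinct `max` symbols the editor lookup now has to tell apart, the global @GDScript built-in versus the Array method.

    # Hypothetical example script; names and values are illustrative.
    extends Node

    func _ready():
        var a = max(3, 5)        # @GDScript.max(): the global built-in function
        var b = [3, 5, 1].max()  # Array.max(): a method on Array, resolved only after parsing
        print(a, " ", b)         # prints "5 5"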
(cherry picked from commit 2c4aa50648c86f83dd11abc20cb2d4f34d49dbad) --- modules/gdscript/gdscript_editor.cpp | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/modules/gdscript/gdscript_editor.cpp b/modules/gdscript/gdscript_editor.cpp index 7bf7666434fb..edcbe22a636a 100644 --- a/modules/gdscript/gdscript_editor.cpp +++ b/modules/gdscript/gdscript_editor.cpp @@ -3342,15 +3342,6 @@ Error GDScriptLanguage::lookup_code(const String &p_code, const String &p_symbol } } - for (int i = 0; i < GDScriptFunctions::FUNC_MAX; i++) { - if (GDScriptFunctions::get_func_name(GDScriptFunctions::Function(i)) == p_symbol) { - r_result.type = ScriptLanguage::LookupResult::RESULT_CLASS_METHOD; - r_result.class_name = "@GDScript"; - r_result.class_member = p_symbol; - return OK; - } - } - if ("PI" == p_symbol || "TAU" == p_symbol || "INF" == p_symbol || "NAN" == p_symbol) { r_result.type = ScriptLanguage::LookupResult::RESULT_CLASS_CONSTANT; r_result.class_name = "@GDScript"; @@ -3566,6 +3557,16 @@ Error GDScriptLanguage::lookup_code(const String &p_code, const String &p_symbol } } + for (int i = 0; i < GDScriptFunctions::FUNC_MAX; i++) { + // this has to get run after parsing because otherwise functions like Array.max() will trigger it + if (GDScriptFunctions::get_func_name(GDScriptFunctions::Function(i)) == p_symbol) { + r_result.type = ScriptLanguage::LookupResult::RESULT_CLASS_METHOD; + r_result.class_name = "@GDScript"; + r_result.class_member = p_symbol; + return OK; + } + } + return ERR_CANT_RESOLVE; } From e9c8889ae8d08f4003be48cc7ddfa939efa593e5 Mon Sep 17 00:00:00 2001 From: TwistedTwigleg Date: Tue, 27 Apr 2021 17:56:19 -0400 Subject: [PATCH 67/84] Fixes the SkeletonIK twisting issue by using the skeleton global pose without overrides (cherry picked from commit c1bc87ed0dcd12ae36e84a6a9134f76de8d08480) --- doc/classes/Skeleton.xml | 9 +++ scene/3d/skeleton.cpp | 97 ++++++++++++++++----------------- scene/3d/skeleton.h | 2 + scene/animation/skeleton_ik.cpp | 72 ++++++------------------ 4 files changed, 76 insertions(+), 104 deletions(-) diff --git a/doc/classes/Skeleton.xml b/doc/classes/Skeleton.xml index a699c573f801..10075de7c589 100644 --- a/doc/classes/Skeleton.xml +++ b/doc/classes/Skeleton.xml @@ -80,6 +80,15 @@ Returns the overall transform of the specified bone, with respect to the skeleton. Being relative to the skeleton frame, this is not the actual "global" transform of the bone. + + + + + + + Returns the overall transform of the specified bone, with respect to the skeleton, but without any global pose overrides. Being relative to the skeleton frame, this is not the actual "global" transform of the bone. 
+ + diff --git a/scene/3d/skeleton.cpp b/scene/3d/skeleton.cpp index 5a44cdac239c..2bc4905dd642 100644 --- a/scene/3d/skeleton.cpp +++ b/scene/3d/skeleton.cpp @@ -243,63 +243,54 @@ void Skeleton::_notification(int p_what) { Bone &b = bonesptr[order[i]]; - if (b.global_pose_override_amount >= 0.999) { - b.pose_global = b.global_pose_override; + if (b.disable_rest) { + if (b.enabled) { + Transform pose = b.pose; + if (b.custom_pose_enable) { + pose = b.custom_pose * pose; + } + if (b.parent >= 0) { + b.pose_global = bonesptr[b.parent].pose_global * pose; + b.pose_global_no_override = bonesptr[b.parent].pose_global_no_override * pose; + } else { + b.pose_global = pose; + b.pose_global_no_override = pose; + } + } else { + if (b.parent >= 0) { + b.pose_global = bonesptr[b.parent].pose_global; + b.pose_global_no_override = bonesptr[b.parent].pose_global_no_override; + } else { + b.pose_global = Transform(); + b.pose_global_no_override = Transform(); + } + } } else { - if (b.disable_rest) { - if (b.enabled) { - - Transform pose = b.pose; - if (b.custom_pose_enable) { - pose = b.custom_pose * pose; - } - if (b.parent >= 0) { - - b.pose_global = bonesptr[b.parent].pose_global * pose; - } else { - - b.pose_global = pose; - } + if (b.enabled) { + Transform pose = b.pose; + if (b.custom_pose_enable) { + pose = b.custom_pose * pose; + } + if (b.parent >= 0) { + b.pose_global = bonesptr[b.parent].pose_global * (b.rest * pose); + b.pose_global_no_override = bonesptr[b.parent].pose_global_no_override * (b.rest * pose); } else { - - if (b.parent >= 0) { - - b.pose_global = bonesptr[b.parent].pose_global; - } else { - - b.pose_global = Transform(); - } + b.pose_global = b.rest * pose; + b.pose_global_no_override = b.rest * pose; } - } else { - if (b.enabled) { - - Transform pose = b.pose; - if (b.custom_pose_enable) { - pose = b.custom_pose * pose; - } - if (b.parent >= 0) { - - b.pose_global = bonesptr[b.parent].pose_global * (b.rest * pose); - } else { - - b.pose_global = b.rest * pose; - } + if (b.parent >= 0) { + b.pose_global = bonesptr[b.parent].pose_global * b.rest; + b.pose_global_no_override = bonesptr[b.parent].pose_global_no_override * b.rest; } else { - - if (b.parent >= 0) { - - b.pose_global = bonesptr[b.parent].pose_global * b.rest; - } else { - - b.pose_global = b.rest; - } + b.pose_global = b.rest; + b.pose_global_no_override = b.rest; } } + } - if (b.global_pose_override_amount >= CMP_EPSILON) { - b.pose_global = b.pose_global.interpolate_with(b.global_pose_override, b.global_pose_override_amount); - } + if (b.global_pose_override_amount >= CMP_EPSILON) { + b.pose_global = b.pose_global.interpolate_with(b.global_pose_override, b.global_pose_override_amount); } if (b.global_pose_override_reset) { @@ -405,6 +396,13 @@ Transform Skeleton::get_bone_global_pose(int p_bone) const { return bones[p_bone].pose_global; } +Transform Skeleton::get_bone_global_pose_no_override(int p_bone) const { + ERR_FAIL_INDEX_V(p_bone, bones.size(), Transform()); + if (dirty) + const_cast(this)->notification(NOTIFICATION_UPDATE_SKELETON); + return bones[p_bone].pose_global_no_override; +} + // skeleton creation api void Skeleton::add_bone(const String &p_name) { @@ -885,6 +883,7 @@ void Skeleton::_bind_methods() { ClassDB::bind_method(D_METHOD("clear_bones_global_pose_override"), &Skeleton::clear_bones_global_pose_override); ClassDB::bind_method(D_METHOD("set_bone_global_pose_override", "bone_idx", "pose", "amount", "persistent"), &Skeleton::set_bone_global_pose_override, DEFVAL(false)); 
ClassDB::bind_method(D_METHOD("get_bone_global_pose", "bone_idx"), &Skeleton::get_bone_global_pose); + ClassDB::bind_method(D_METHOD("get_bone_global_pose_no_override", "bone_idx"), &Skeleton::get_bone_global_pose_no_override); ClassDB::bind_method(D_METHOD("get_bone_custom_pose", "bone_idx"), &Skeleton::get_bone_custom_pose); ClassDB::bind_method(D_METHOD("set_bone_custom_pose", "bone_idx", "custom_pose"), &Skeleton::set_bone_custom_pose); diff --git a/scene/3d/skeleton.h b/scene/3d/skeleton.h index fb5d1367d4ed..8819fde0770f 100644 --- a/scene/3d/skeleton.h +++ b/scene/3d/skeleton.h @@ -90,6 +90,7 @@ class Skeleton : public Spatial { Transform pose; Transform pose_global; + Transform pose_global_no_override; bool custom_pose_enable; Transform custom_pose; @@ -177,6 +178,7 @@ class Skeleton : public Spatial { void set_bone_rest(int p_bone, const Transform &p_rest); Transform get_bone_rest(int p_bone) const; Transform get_bone_global_pose(int p_bone) const; + Transform get_bone_global_pose_no_override(int p_bone) const; void clear_bones_global_pose_override(); void set_bone_global_pose_override(int p_bone, const Transform &p_pose, float p_amount, bool p_persistent = false); diff --git a/scene/animation/skeleton_ik.cpp b/scene/animation/skeleton_ik.cpp index 25eee0a7636b..8e0784582982 100644 --- a/scene/animation/skeleton_ik.cpp +++ b/scene/animation/skeleton_ik.cpp @@ -257,7 +257,7 @@ void FabrikInverseKinematic::make_goal(Task *p_task, const Transform &p_inverse_ } else { // End effector in local transform - const Transform end_effector_pose(p_task->skeleton->get_bone_global_pose(p_task->end_effectors[0].tip_bone)); + const Transform end_effector_pose(p_task->skeleton->get_bone_global_pose_no_override(p_task->end_effectors[0].tip_bone)); // Update the end_effector (local transform) by blending with current pose p_task->end_effectors.write[0].goal_transform = end_effector_pose.interpolate_with(p_inverse_transf * p_task->goal_global_transform, blending_delta); @@ -282,18 +282,7 @@ void FabrikInverseKinematic::solve(Task *p_task, real_t blending_delta, bool ove return; // Skip solving } - p_task->skeleton->set_bone_global_pose_override(p_task->chain.chain_root.bone, Transform(), 0.0, true); - - if (p_task->chain.middle_chain_item) { - p_task->skeleton->set_bone_global_pose_override(p_task->chain.middle_chain_item->bone, Transform(), 0.0, true); - } - - for (int i = 0; i < p_task->chain.tips.size(); i += 1) { - p_task->skeleton->set_bone_global_pose_override(p_task->chain.tips[i].chain_item->bone, Transform(), 0.0, true); - } - - // Update the initial root transform - // (Needed to sync IK with animation) + // Update the initial root transform so its synced with any animation changes _update_chain(p_task->skeleton, &p_task->chain.chain_root); make_goal(p_task, p_task->skeleton->get_global_transform().affine_inverse(), blending_delta); @@ -310,49 +299,22 @@ void FabrikInverseKinematic::solve(Task *p_task, real_t blending_delta, bool ove Transform new_bone_pose(ci->initial_transform); new_bone_pose.origin = ci->current_pos; - // The root bone needs to be rotated differently so it isn't frozen in place - if (ci == &p_task->chain.chain_root && !ci->children.empty()) { - new_bone_pose = new_bone_pose.looking_at(ci->children[0].current_pos, Vector3(0, 1, 0)); - const Vector3 bone_rest_dir = p_task->skeleton->get_bone_rest(ci->children[0].bone).origin.normalized().abs(); - const Vector3 bone_rest_dir_abs = bone_rest_dir.abs(); - if (bone_rest_dir_abs.x > bone_rest_dir_abs.y && bone_rest_dir_abs.x > 
bone_rest_dir_abs.z) { - if (bone_rest_dir.x < 0) { - new_bone_pose.basis.rotate_local(Vector3(0, 1, 0), -Math_PI / 2.0f); - } else { - new_bone_pose.basis.rotate_local(Vector3(0, 1, 0), Math_PI / 2.0f); - } - } else if (bone_rest_dir_abs.y > bone_rest_dir_abs.x && bone_rest_dir_abs.y > bone_rest_dir_abs.z) { - if (bone_rest_dir.y < 0) { - new_bone_pose.basis.rotate_local(Vector3(1, 0, 0), Math_PI / 2.0f); - } else { - new_bone_pose.basis.rotate_local(Vector3(1, 0, 0), -Math_PI / 2.0f); - } - } else { - if (bone_rest_dir.z < 0) { - // Do nothing! - } else { - new_bone_pose.basis.rotate_local(Vector3(0, 0, 1), Math_PI); - } - } - } else { - if (!ci->children.empty()) { - - /// Rotate basis - const Vector3 initial_ori((ci->children[0].initial_transform.origin - ci->initial_transform.origin).normalized()); - const Vector3 rot_axis(initial_ori.cross(ci->current_ori).normalized()); + if (!ci->children.empty()) { + /// Rotate basis + const Vector3 initial_ori((ci->children[0].initial_transform.origin - ci->initial_transform.origin).normalized()); + const Vector3 rot_axis(initial_ori.cross(ci->current_ori).normalized()); - if (rot_axis[0] != 0 && rot_axis[1] != 0 && rot_axis[2] != 0) { - const real_t rot_angle(Math::acos(CLAMP(initial_ori.dot(ci->current_ori), -1, 1))); - new_bone_pose.basis.rotate(rot_axis, rot_angle); - } - - } else { - // Set target orientation to tip - if (override_tip_basis) - new_bone_pose.basis = p_task->chain.tips[0].end_effector->goal_transform.basis; - else - new_bone_pose.basis = new_bone_pose.basis * p_task->chain.tips[0].end_effector->goal_transform.basis; + if (rot_axis[0] != 0 && rot_axis[1] != 0 && rot_axis[2] != 0) { + const real_t rot_angle(Math::acos(CLAMP(initial_ori.dot(ci->current_ori), -1, 1))); + new_bone_pose.basis.rotate(rot_axis, rot_angle); } + + } else { + // Set target orientation to tip + if (override_tip_basis) + new_bone_pose.basis = p_task->chain.tips[0].end_effector->goal_transform.basis; + else + new_bone_pose.basis = new_bone_pose.basis * p_task->chain.tips[0].end_effector->goal_transform.basis; } // IK should not affect scale, so undo any scaling @@ -373,7 +335,7 @@ void FabrikInverseKinematic::_update_chain(const Skeleton *p_sk, ChainItem *p_ch return; } - p_chain_item->initial_transform = p_sk->get_bone_global_pose(p_chain_item->bone); + p_chain_item->initial_transform = p_sk->get_bone_global_pose_no_override(p_chain_item->bone); p_chain_item->current_pos = p_chain_item->initial_transform.origin; ChainItem *items = p_chain_item->children.ptrw(); From e6186dad5915097310e2fadd06a9ad366fdde646 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Verschelde?= Date: Wed, 5 May 2021 10:26:34 +0200 Subject: [PATCH 68/84] Fix crash with user-defined `ResourceFormatLoader.load` There's still some fishy recursive relationship between `load_interactive` and `load` which needs to be investigated here, but this patch solves the crash when returning an error code in user-defined `load`. Fixes #48463. 
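For illustration only (assumed typical usage, not code from this patch): a script-defined loader along the following lines, whose load() returns an error constant instead of a Resource, is the case that used to crash and now fails cleanly.

    # Hypothetical GDScript loader; the class name and file extension are made up.
    extends ResourceFormatLoader
    class_name MyDataFormatLoader

    func get_recognized_extensions():
        return PoolStringArray(["mydata"])

    func get_resource_type(path):
        return "Resource" if path.get_extension() == "mydata" else ""

    func handles_type(typename):
        return typename == "Resource"

    func load(path, original_path):
        var file = File.new()
        if file.open(path, File.READ) != OK:
            return ERR_CANT_OPEN  # returning an error code here previously crashed the engine
        var res = Resource.new()
        res.set_meta("text", file.get_as_text())
        file.close()
        return res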
(cherry picked from commit bf9f288c7dc1f03232077408a5ac49b88a5e0718) --- core/io/resource_loader.cpp | 46 ++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/core/io/resource_loader.cpp b/core/io/resource_loader.cpp index 19cd36d2a953..2c2a8602a830 100644 --- a/core/io/resource_loader.cpp +++ b/core/io/resource_loader.cpp @@ -138,18 +138,6 @@ class ResourceInteractiveLoaderDefault : public ResourceInteractiveLoader { ResourceInteractiveLoaderDefault() {} }; -Ref ResourceFormatLoader::load_interactive(const String &p_path, const String &p_original_path, Error *r_error) { - - //either this - Ref res = load(p_path, p_original_path, r_error); - if (res.is_null()) - return Ref(); - - Ref ril = Ref(memnew(ResourceInteractiveLoaderDefault)); - ril->resource = res; - return ril; -} - bool ResourceFormatLoader::exists(const String &p_path) const { return FileAccess::exists(p_path); //by default just check file } @@ -168,25 +156,41 @@ void ResourceFormatLoader::get_recognized_extensions(List *p_extensions) } } -RES ResourceFormatLoader::load(const String &p_path, const String &p_original_path, Error *r_error) { +// Warning: Derived classes must override either `load` or `load_interactive`. The base code +// here can trigger an infinite recursion otherwise, since `load` calls `load_interactive` +// vice versa. + +Ref ResourceFormatLoader::load_interactive(const String &p_path, const String &p_original_path, Error *r_error) { + // Warning: See previous note about the risk of infinite recursion. + Ref res = load(p_path, p_original_path, r_error); + if (res.is_null()) { + return Ref(); + } + + Ref ril = Ref(memnew(ResourceInteractiveLoaderDefault)); + ril->resource = res; + return ril; +} +RES ResourceFormatLoader::load(const String &p_path, const String &p_original_path, Error *r_error) { + // Check user-defined loader if there's any. Hard fail if it returns an error. if (get_script_instance() && get_script_instance()->has_method("load")) { Variant res = get_script_instance()->call("load", p_path, p_original_path); - if (res.get_type() == Variant::INT) { - - if (r_error) + if (res.get_type() == Variant::INT) { // Error code, abort. + if (r_error) { *r_error = (Error)res.operator int64_t(); - - } else { - - if (r_error) + } + return RES(); + } else { // Success, pass on result. + if (r_error) { *r_error = OK; + } return res; } } - //or this must be implemented + // Warning: See previous note about the risk of infinite recursion. 
Ref ril = load_interactive(p_path, p_original_path, r_error); if (!ril.is_valid()) return RES(); From bb6b38680c56cf4ccfa9d6a81401ff0abed52d3e Mon Sep 17 00:00:00 2001 From: clayjohn Date: Wed, 5 May 2021 22:35:21 -0700 Subject: [PATCH 69/84] Only set base in Sprite3D when needed (cherry picked from commit 3dd2e5d8703dba0f80eb7ef7dbd9869a9281f51f) --- scene/3d/sprite_3d.cpp | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/scene/3d/sprite_3d.cpp b/scene/3d/sprite_3d.cpp index e686ce0b74d6..5e96f8a8c941 100644 --- a/scene/3d/sprite_3d.cpp +++ b/scene/3d/sprite_3d.cpp @@ -437,6 +437,7 @@ SpriteBase3D::SpriteBase3D() { mesh_surface_format = VS::get_singleton()->mesh_surface_get_format(mesh, 0); mesh_buffer = VS::get_singleton()->mesh_surface_get_array(mesh, 0); mesh_stride = VS::get_singleton()->mesh_surface_make_offsets_from_format(mesh_surface_format, surface_vertex_len, surface_index_len, mesh_surface_offsets); + set_base(mesh); } SpriteBase3D::~SpriteBase3D() { @@ -448,11 +449,13 @@ SpriteBase3D::~SpriteBase3D() { /////////////////////////////////////////// void Sprite3D::_draw() { - - set_base(RID()); - - if (!texture.is_valid()) + if (get_base() != get_mesh()) { + set_base(get_mesh()); + } + if (!texture.is_valid()) { + set_base(RID()); return; + } Vector2 tsize = texture->get_size(); if (tsize.x == 0 || tsize.y == 0) return; @@ -604,8 +607,6 @@ void Sprite3D::_draw() { VS::get_singleton()->mesh_set_custom_aabb(mesh, aabb); set_aabb(aabb); - set_base(mesh); - RID mat = SpatialMaterial::get_material_rid_for_2d(get_draw_flag(FLAG_SHADED), get_draw_flag(FLAG_TRANSPARENT), get_draw_flag(FLAG_DOUBLE_SIDED), get_alpha_cut_mode() == ALPHA_CUT_DISCARD, get_alpha_cut_mode() == ALPHA_CUT_OPAQUE_PREPASS, get_billboard_mode() == SpatialMaterial::BILLBOARD_ENABLED, get_billboard_mode() == SpatialMaterial::BILLBOARD_FIXED_Y); VS::get_singleton()->material_set_shader(get_material(), VS::get_singleton()->material_get_shader(mat)); VS::get_singleton()->material_set_param(get_material(), "texture_albedo", texture->get_rid()); @@ -802,8 +803,9 @@ Sprite3D::Sprite3D() { //////////////////////////////////////// void AnimatedSprite3D::_draw() { - - set_base(RID()); + if (get_base() != get_mesh()) { + set_base(get_mesh()); + } if (frames.is_null()) { return; @@ -818,8 +820,10 @@ void AnimatedSprite3D::_draw() { } Ref texture = frames->get_frame(animation, frame); - if (!texture.is_valid()) + if (!texture.is_valid()) { + set_base(RID()); return; //no texuture no life + } Vector2 tsize = texture->get_size(); if (tsize.x == 0 || tsize.y == 0) return; @@ -966,8 +970,6 @@ void AnimatedSprite3D::_draw() { VS::get_singleton()->mesh_set_custom_aabb(mesh, aabb); set_aabb(aabb); - set_base(mesh); - RID mat = SpatialMaterial::get_material_rid_for_2d(get_draw_flag(FLAG_SHADED), get_draw_flag(FLAG_TRANSPARENT), get_draw_flag(FLAG_DOUBLE_SIDED), get_alpha_cut_mode() == ALPHA_CUT_DISCARD, get_alpha_cut_mode() == ALPHA_CUT_OPAQUE_PREPASS, get_billboard_mode() == SpatialMaterial::BILLBOARD_ENABLED, get_billboard_mode() == SpatialMaterial::BILLBOARD_FIXED_Y); VS::get_singleton()->material_set_shader(get_material(), VS::get_singleton()->material_get_shader(mat)); VS::get_singleton()->material_set_param(get_material(), "texture_albedo", texture->get_rid()); From 523faf04448fac04f78309933efc6c568746de39 Mon Sep 17 00:00:00 2001 From: paru Date: Fri, 7 May 2021 18:33:35 +0200 Subject: [PATCH 70/84] Fixed usage of proxy textures on GLES2 sky (cherry picked from commit 
eed4655644986c6e7e1957a08fbd8b1ef4bc0af1) --- drivers/gles2/rasterizer_scene_gles2.cpp | 2 ++ drivers/gles2/rasterizer_storage_gles2.cpp | 2 ++ 2 files changed, 4 insertions(+) diff --git a/drivers/gles2/rasterizer_scene_gles2.cpp b/drivers/gles2/rasterizer_scene_gles2.cpp index 2cfb4b07a744..794732213705 100644 --- a/drivers/gles2/rasterizer_scene_gles2.cpp +++ b/drivers/gles2/rasterizer_scene_gles2.cpp @@ -2639,6 +2639,8 @@ void RasterizerSceneGLES2::_draw_sky(RasterizerStorageGLES2::Sky *p_sky, const C RasterizerStorageGLES2::Texture *tex = storage->texture_owner.getornull(p_sky->panorama); ERR_FAIL_COND(!tex); + tex = tex->get_ptr(); //resolve for proxies + glActiveTexture(GL_TEXTURE0); glBindTexture(tex->target, tex->tex_id); diff --git a/drivers/gles2/rasterizer_storage_gles2.cpp b/drivers/gles2/rasterizer_storage_gles2.cpp index 1e7f27a2b8e7..39a6db83e3c9 100644 --- a/drivers/gles2/rasterizer_storage_gles2.cpp +++ b/drivers/gles2/rasterizer_storage_gles2.cpp @@ -1229,6 +1229,8 @@ void RasterizerStorageGLES2::sky_set_texture(RID p_sky, RID p_panorama, int p_ra ERR_FAIL_COND(!texture); } + texture = texture->get_ptr(); //resolve for proxies + // glBindVertexArray(0) and more { glBindBuffer(GL_ARRAY_BUFFER, 0); From 5514b169500e7e87d9662693a2f7518885b35f31 Mon Sep 17 00:00:00 2001 From: Hugo Locurcio Date: Wed, 5 May 2021 20:37:35 +0200 Subject: [PATCH 71/84] Improve the AudioStreamPlayer(2D/3D) class descriptions (cherry picked from commit b90adec417b304738303d218e85aa9e319f94d4d) --- doc/classes/AudioStreamPlayer.xml | 1 + doc/classes/AudioStreamPlayer2D.xml | 4 +++- doc/classes/AudioStreamPlayer3D.xml | 4 +++- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/doc/classes/AudioStreamPlayer.xml b/doc/classes/AudioStreamPlayer.xml index e68607b6277a..0489116e3b6f 100644 --- a/doc/classes/AudioStreamPlayer.xml +++ b/doc/classes/AudioStreamPlayer.xml @@ -5,6 +5,7 @@ Plays an audio stream non-positionally. + To play audio positionally, use [AudioStreamPlayer2D] or [AudioStreamPlayer3D] instead of [AudioStreamPlayer]. https://docs.godotengine.org/en/3.3/tutorials/audio/audio_streams.html diff --git a/doc/classes/AudioStreamPlayer2D.xml b/doc/classes/AudioStreamPlayer2D.xml index 023d65036b17..ddf44bb50e3a 100644 --- a/doc/classes/AudioStreamPlayer2D.xml +++ b/doc/classes/AudioStreamPlayer2D.xml @@ -1,10 +1,12 @@ - Plays audio in 2D. + Plays positional sound in 2D space. Plays audio that dampens with distance from screen center. + See also [AudioStreamPlayer] to play a sound non-positionally. + [b]Note:[/b] Hiding an [AudioStreamPlayer2D] node does not disable its audio output. To temporarily disable an [AudioStreamPlayer2D]'s audio output, set [member volume_db] to a very low value like [code]-100[/code] (which isn't audible to human hearing). https://docs.godotengine.org/en/3.3/tutorials/audio/audio_streams.html diff --git a/doc/classes/AudioStreamPlayer3D.xml b/doc/classes/AudioStreamPlayer3D.xml index 6af3f0d4526f..fd404f90999a 100644 --- a/doc/classes/AudioStreamPlayer3D.xml +++ b/doc/classes/AudioStreamPlayer3D.xml @@ -1,11 +1,13 @@ - Plays 3D sound in 3D space. + Plays positional sound in 3D space. Plays a sound effect with directed sound effects, dampens with distance if needed, generates effect of hearable position in space. For greater realism, a low-pass filter is automatically applied to distant sounds. This can be disabled by setting [member attenuation_filter_cutoff_hz] to [code]20500[/code]. By default, audio is heard from the camera position. 
This can be changed by adding a [Listener] node to the scene and enabling it by calling [method Listener.make_current] on it. + See also [AudioStreamPlayer] to play a sound non-positionally. + [b]Note:[/b] Hiding an [AudioStreamPlayer3D] node does not disable its audio output. To temporarily disable an [AudioStreamPlayer3D]'s audio output, set [member unit_db] to a very low value like [code]-100[/code] (which isn't audible to human hearing). https://docs.godotengine.org/en/3.3/tutorials/audio/audio_streams.html From 88210227dc7a4b07aca9b136a7eda54a32565444 Mon Sep 17 00:00:00 2001 From: Fabio Alessandrelli Date: Thu, 6 May 2021 14:00:52 +0200 Subject: [PATCH 72/84] [HTML5] Use 64KiB chunk size in JS HTTPClient. For consistency with the native one, and the documentation. (cherry picked from commit 6243835619b38134a78a98283ac9a350a990348e) --- platform/javascript/http_client.h.inc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/platform/javascript/http_client.h.inc b/platform/javascript/http_client.h.inc index 842a93fcbae3..6544d41c98e6 100644 --- a/platform/javascript/http_client.h.inc +++ b/platform/javascript/http_client.h.inc @@ -34,7 +34,8 @@ Error make_request(Method p_method, const String &p_url, const Vector &p static void _parse_headers(int p_len, const char **p_headers, void *p_ref); int js_id = 0; -int read_limit = 4096; +// 64 KiB by default (favors fast download speeds at the cost of memory usage). +int read_limit = 65536; Status status = STATUS_DISCONNECTED; String host; From e6cbb4d460af51220642aff98bbc44e8bd7913df Mon Sep 17 00:00:00 2001 From: Kyle Date: Thu, 6 May 2021 16:01:36 -0400 Subject: [PATCH 73/84] Fixed cut/copy/paste visibility Fixes #48514 by moving the visibility of these buttons into their own if statement that depends on if scene tree editing is allowed. Previously it was under the script editing setting which is unexpected as it works with nodes and the scene tree. (cherry picked from commit 10d5d4d3cd95a30d7196d07cac7dedeeda37a2db) --- editor/scene_tree_dock.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/editor/scene_tree_dock.cpp b/editor/scene_tree_dock.cpp index a5571968b3ea..f067b9db3ca9 100644 --- a/editor/scene_tree_dock.cpp +++ b/editor/scene_tree_dock.cpp @@ -2610,14 +2610,16 @@ void SceneTreeDock::_tree_rmb(const Vector2 &p_menu_pos) { } } - if (profile_allow_script_editing) { + if (profile_allow_editing) { menu->add_shortcut(ED_GET_SHORTCUT("scene_tree/cut_node"), TOOL_CUT); menu->add_shortcut(ED_GET_SHORTCUT("scene_tree/copy_node"), TOOL_COPY); if (selection.size() == 1 && !node_clipboard.empty()) { menu->add_shortcut(ED_GET_SHORTCUT("scene_tree/paste_node"), TOOL_PASTE); } menu->add_separator(); + } + if (profile_allow_script_editing) { bool add_separator = false; if (full_selection.size() == 1) { From b622a9e35997785edde77e3748747ac8f2f0735e Mon Sep 17 00:00:00 2001 From: Kyle Date: Thu, 6 May 2021 17:37:12 -0400 Subject: [PATCH 74/84] Remove extra separator Removes an extra separator when Scene Tree Editing is disabled. 
Discussed in #48518 (cherry picked from commit e168baf433fee964d103693e227cce7f27b8905e) --- editor/scene_tree_dock.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/editor/scene_tree_dock.cpp b/editor/scene_tree_dock.cpp index f067b9db3ca9..2b79311babd9 100644 --- a/editor/scene_tree_dock.cpp +++ b/editor/scene_tree_dock.cpp @@ -2647,7 +2647,7 @@ void SceneTreeDock::_tree_rmb(const Vector2 &p_menu_pos) { } } - if (add_separator) { + if (add_separator && profile_allow_editing) { menu->add_separator(); } } From 488f448fbb4c5b00df5f36f2032eed24637be742 Mon Sep 17 00:00:00 2001 From: Fabio Alessandrelli Date: Fri, 7 May 2021 13:17:54 +0200 Subject: [PATCH 75/84] [HTML5] Remove "fixed-size.html". No longer used in 3.3+. (cherry picked from commit 3faf8d6e407d8f93d98d505e7d5f74c9404d8fd3) --- misc/dist/html/fixed-size.html | 393 --------------------------------- 1 file changed, 393 deletions(-) delete mode 100644 misc/dist/html/fixed-size.html diff --git a/misc/dist/html/fixed-size.html b/misc/dist/html/fixed-size.html deleted file mode 100644 index e7a23b3f296c..000000000000 --- a/misc/dist/html/fixed-size.html +++ /dev/null @@ -1,393 +0,0 @@ - - - - - - $GODOT_PROJECT_NAME - -$GODOT_HEAD_INCLUDE - - -
- [remaining deleted markup of fixed-size.html: the canvas fallback notice ("HTML5 canvas appears to be unsupported in the current browser. Please try updating or use a different browser.") and the debug "Output:" console area]
- - - - - From 3fc59fbcc73350bb4ab248ace837b0f0514a9ebb Mon Sep 17 00:00:00 2001 From: Hugo Locurcio Date: Fri, 7 May 2021 16:24:37 +0200 Subject: [PATCH 76/84] Tweak the setting hint for the custom editor theme setting The custom editor theme is only visible after restarting the editor. (cherry picked from commit 027301fec79d6cd8143f25c95c23c6007279ec23) --- editor/editor_settings.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/editor/editor_settings.cpp b/editor/editor_settings.cpp index 0ac460a4bf95..5e50a88e0f84 100644 --- a/editor/editor_settings.cpp +++ b/editor/editor_settings.cpp @@ -395,7 +395,7 @@ void EditorSettings::_load_defaults(Ref p_extra_config) { _initial_set("interface/theme/additional_spacing", 0); hints["interface/theme/additional_spacing"] = PropertyInfo(Variant::REAL, "interface/theme/additional_spacing", PROPERTY_HINT_RANGE, "0,5,0.1", PROPERTY_USAGE_DEFAULT); _initial_set("interface/theme/custom_theme", ""); - hints["interface/theme/custom_theme"] = PropertyInfo(Variant::STRING, "interface/theme/custom_theme", PROPERTY_HINT_GLOBAL_FILE, "*.res,*.tres,*.theme", PROPERTY_USAGE_DEFAULT); + hints["interface/theme/custom_theme"] = PropertyInfo(Variant::STRING, "interface/theme/custom_theme", PROPERTY_HINT_GLOBAL_FILE, "*.res,*.tres,*.theme", PROPERTY_USAGE_DEFAULT | PROPERTY_USAGE_RESTART_IF_CHANGED); // Scene tabs _initial_set("interface/scene_tabs/show_extension", false); From 0a7193c5f42aab58b6e0701d7ddc1ae7567c5ef7 Mon Sep 17 00:00:00 2001 From: Fabio Alessandrelli Date: Fri, 7 May 2021 14:06:46 +0200 Subject: [PATCH 77/84] [HTML5] Fix target_fps when window loses focus. We don't get updates when the window is unfocused/minimized, so we must detect the situation where the counted ticks start drifting away resulting in more frames drawn than needed. This commit adds a check to ensure that the target ticks do not drift away more than one second. (cherry picked from commit a1fe6d6899c5ed4cf13c16f9d6bcd64958ab8254) --- platform/javascript/javascript_main.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/platform/javascript/javascript_main.cpp b/platform/javascript/javascript_main.cpp index e1fba262de61..b86e41555dd1 100644 --- a/platform/javascript/javascript_main.cpp +++ b/platform/javascript/javascript_main.cpp @@ -65,6 +65,11 @@ void main_loop_callback() { int target_fps = Engine::get_singleton()->get_target_fps(); if (target_fps > 0) { + if (current_ticks - target_ticks > 1000000) { + // When the window loses focus, we stop getting updates and accumulate delay. + // For this reason, if the difference is too big, we reset target ticks to the current ticks. 
+ target_ticks = current_ticks; + } target_ticks += (uint64_t)(1000000 / target_fps); } if (os->main_loop_iterate()) { From 7eacb604b1dbe9f712303116783b399e4af5917c Mon Sep 17 00:00:00 2001 From: Hugo Locurcio Date: Sun, 9 May 2021 13:11:16 +0200 Subject: [PATCH 78/84] Fix EditorPropertyResource focus outline being drawn behind the preview (cherry picked from commit 0b47f1be8c94a4a037980d7d3f1f79559c09ce58) --- editor/editor_properties.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/editor/editor_properties.cpp b/editor/editor_properties.cpp index 73e3e3b7b7ed..4d814bfd677f 100644 --- a/editor/editor_properties.cpp +++ b/editor/editor_properties.cpp @@ -3022,6 +3022,8 @@ EditorPropertyResource::EditorPropertyResource() { preview->set_margin(MARGIN_TOP, 1); preview->set_margin(MARGIN_BOTTOM, -1); preview->set_margin(MARGIN_RIGHT, -1); + // This is required to draw the focus outline in front of the preview, rather than behind. + preview->set_draw_behind_parent(true); assign->add_child(preview); assign->connect("gui_input", this, "_button_input"); From 32996000932944c249dc56ccc711d274e094e934 Mon Sep 17 00:00:00 2001 From: Hugo Locurcio Date: Sun, 9 May 2021 13:23:53 +0200 Subject: [PATCH 79/84] Document caveats of `OS.get_unique_id()` (cherry picked from commit 7350f90c579ce6db0be3cf67175778546459322e) --- doc/classes/OS.xml | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/classes/OS.xml b/doc/classes/OS.xml index 8db8db901485..79a7811e93da 100644 --- a/doc/classes/OS.xml +++ b/doc/classes/OS.xml @@ -530,6 +530,7 @@ Returns a string that is unique to the device. + [b]Note:[/b] This string may change without notice if the user reinstalls/upgrades their operating system or changes their hardware. This means it should generally not be used to encrypt persistent data as the data saved prior to an unexpected ID change would become inaccessible. The returned string may also be falsified using external programs, so do not rely on the string returned by [method get_unique_id] for security purposes. [b]Note:[/b] Returns an empty string on HTML5 and UWP, as this method isn't implemented on those platforms yet.
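For illustration only (a sketch of the defensive pattern this note implies, not engine code): never use the raw ID as a secret, and handle the platforms where it comes back empty.

    # Hypothetical helper; the fallback behaviour is an assumption, not engine API.
    func get_install_fingerprint():
        var id = OS.get_unique_id()
        if id.empty():
            randomize()
            # HTML5/UWP return an empty string, so fall back to a random token
            # (persist it yourself if it needs to stay stable across runs).
            id = str(randi())
        # Hash before storing or sending it anywhere; the raw ID may change and can be spoofed.
        return id.sha256_text()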
From dd2013990106472cfc6fa0e0d801931c9412d4ba Mon Sep 17 00:00:00 2001 From: Hugo Locurcio Date: Fri, 7 May 2021 18:14:52 +0200 Subject: [PATCH 80/84] Allow negative contrast values in the editor theme settings When using a negative contrast value, the base color will be lightened to create the derivative colors instead of being darkened. This can lead to better-looking themes, especially for light themes. (cherry picked from commit e7e2ef07676822d848b8916121033fae559b68b2) --- editor/editor_settings.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/editor/editor_settings.cpp b/editor/editor_settings.cpp index 5e50a88e0f84..75be1ccc2037 100644 --- a/editor/editor_settings.cpp +++ b/editor/editor_settings.cpp @@ -385,7 +385,7 @@ void EditorSettings::_load_defaults(Ref p_extra_config) { _initial_set("interface/theme/accent_color", Color(0.41, 0.61, 0.91)); hints["interface/theme/accent_color"] = PropertyInfo(Variant::COLOR, "interface/theme/accent_color", PROPERTY_HINT_NONE, "", PROPERTY_USAGE_DEFAULT); _initial_set("interface/theme/contrast", 0.25); - hints["interface/theme/contrast"] = PropertyInfo(Variant::REAL, "interface/theme/contrast", PROPERTY_HINT_RANGE, "0.01, 1, 0.01"); + hints["interface/theme/contrast"] = PropertyInfo(Variant::REAL, "interface/theme/contrast", PROPERTY_HINT_RANGE, "-1, 1, 0.01"); _initial_set("interface/theme/relationship_line_opacity", 0.1); hints["interface/theme/relationship_line_opacity"] = PropertyInfo(Variant::REAL, "interface/theme/relationship_line_opacity", PROPERTY_HINT_RANGE, "0.00, 1, 0.01"); _initial_set("interface/theme/highlight_tabs", false); From 6f780d7e3aba7252bb89bc67b0e381f1457e55a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Verschelde?= Date: Sun, 9 May 2021 16:56:52 +0200 Subject: [PATCH 81/84] TileSet: Improve error message for invalid IDs Supersedes #47321. Fixes #47313. 
(cherry picked from commit ee86dc3011c1b8ca83faa617902341ffc1441e0c) --- scene/resources/tile_set.cpp | 194 ++++++++++++----------------------- 1 file changed, 67 insertions(+), 127 deletions(-) diff --git a/scene/resources/tile_set.cpp b/scene/resources/tile_set.cpp index b33edc0129ef..db65a364657a 100644 --- a/scene/resources/tile_set.cpp +++ b/scene/resources/tile_set.cpp @@ -367,7 +367,7 @@ void TileSet::_get_property_list(List *p_list) const { } void TileSet::create_tile(int p_id) { - ERR_FAIL_COND(tile_map.has(p_id)); + ERR_FAIL_COND_MSG(tile_map.has(p_id), vformat("The TileSet already has a tile with ID '%d'.", p_id)); tile_map[p_id] = TileData(); tile_map[p_id].autotile_data = AutotileData(); _change_notify(""); @@ -375,168 +375,145 @@ void TileSet::create_tile(int p_id) { } void TileSet::autotile_set_bitmask_mode(int p_id, BitmaskMode p_mode) { - ERR_FAIL_COND(!tile_map.has(p_id)); + ERR_FAIL_COND_MSG(!tile_map.has(p_id), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); tile_map[p_id].autotile_data.bitmask_mode = p_mode; _change_notify(""); emit_changed(); } TileSet::BitmaskMode TileSet::autotile_get_bitmask_mode(int p_id) const { - - ERR_FAIL_COND_V(!tile_map.has(p_id), BITMASK_2X2); + ERR_FAIL_COND_V_MSG(!tile_map.has(p_id), BITMASK_2X2, vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); return tile_map[p_id].autotile_data.bitmask_mode; } void TileSet::tile_set_texture(int p_id, const Ref &p_texture) { - - ERR_FAIL_COND(!tile_map.has(p_id)); + ERR_FAIL_COND_MSG(!tile_map.has(p_id), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); tile_map[p_id].texture = p_texture; emit_changed(); _change_notify("texture"); } Ref TileSet::tile_get_texture(int p_id) const { - - ERR_FAIL_COND_V(!tile_map.has(p_id), Ref()); + ERR_FAIL_COND_V_MSG(!tile_map.has(p_id), Ref(), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); return tile_map[p_id].texture; } void TileSet::tile_set_normal_map(int p_id, const Ref &p_normal_map) { - - ERR_FAIL_COND(!tile_map.has(p_id)); + ERR_FAIL_COND_MSG(!tile_map.has(p_id), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); tile_map[p_id].normal_map = p_normal_map; emit_changed(); } Ref TileSet::tile_get_normal_map(int p_id) const { - - ERR_FAIL_COND_V(!tile_map.has(p_id), Ref()); + ERR_FAIL_COND_V_MSG(!tile_map.has(p_id), Ref(), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); return tile_map[p_id].normal_map; } void TileSet::tile_set_material(int p_id, const Ref &p_material) { - - ERR_FAIL_COND(!tile_map.has(p_id)); + ERR_FAIL_COND_MSG(!tile_map.has(p_id), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); tile_map[p_id].material = p_material; emit_changed(); } Ref TileSet::tile_get_material(int p_id) const { - - ERR_FAIL_COND_V(!tile_map.has(p_id), Ref()); + ERR_FAIL_COND_V_MSG(!tile_map.has(p_id), Ref(), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); return tile_map[p_id].material; } void TileSet::tile_set_modulate(int p_id, const Color &p_modulate) { - - ERR_FAIL_COND(!tile_map.has(p_id)); + ERR_FAIL_COND_MSG(!tile_map.has(p_id), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); tile_map[p_id].modulate = p_modulate; emit_changed(); _change_notify("modulate"); } Color TileSet::tile_get_modulate(int p_id) const { - - ERR_FAIL_COND_V(!tile_map.has(p_id), Color(1, 1, 1)); + ERR_FAIL_COND_V_MSG(!tile_map.has(p_id), Color(1, 1, 1), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); return tile_map[p_id].modulate; } void 
TileSet::tile_set_texture_offset(int p_id, const Vector2 &p_offset) { - - ERR_FAIL_COND(!tile_map.has(p_id)); + ERR_FAIL_COND_MSG(!tile_map.has(p_id), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); tile_map[p_id].offset = p_offset; emit_changed(); } Vector2 TileSet::tile_get_texture_offset(int p_id) const { - - ERR_FAIL_COND_V(!tile_map.has(p_id), Vector2()); + ERR_FAIL_COND_V_MSG(!tile_map.has(p_id), Vector2(), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); return tile_map[p_id].offset; } void TileSet::tile_set_region(int p_id, const Rect2 &p_region) { - - ERR_FAIL_COND(!tile_map.has(p_id)); + ERR_FAIL_COND_MSG(!tile_map.has(p_id), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); tile_map[p_id].region = p_region; emit_changed(); _change_notify("region"); } Rect2 TileSet::tile_get_region(int p_id) const { - - ERR_FAIL_COND_V(!tile_map.has(p_id), Rect2()); + ERR_FAIL_COND_V_MSG(!tile_map.has(p_id), Rect2(), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); return tile_map[p_id].region; } void TileSet::tile_set_tile_mode(int p_id, TileMode p_tile_mode) { - ERR_FAIL_COND(!tile_map.has(p_id)); + ERR_FAIL_COND_MSG(!tile_map.has(p_id), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); tile_map[p_id].tile_mode = p_tile_mode; emit_changed(); _change_notify("tile_mode"); } TileSet::TileMode TileSet::tile_get_tile_mode(int p_id) const { - - ERR_FAIL_COND_V(!tile_map.has(p_id), SINGLE_TILE); + ERR_FAIL_COND_V_MSG(!tile_map.has(p_id), SINGLE_TILE, vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); return tile_map[p_id].tile_mode; } void TileSet::autotile_set_icon_coordinate(int p_id, Vector2 coord) { - - ERR_FAIL_COND(!tile_map.has(p_id)); + ERR_FAIL_COND_MSG(!tile_map.has(p_id), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); tile_map[p_id].autotile_data.icon_coord = coord; emit_changed(); } Vector2 TileSet::autotile_get_icon_coordinate(int p_id) const { - - ERR_FAIL_COND_V(!tile_map.has(p_id), Vector2()); + ERR_FAIL_COND_V_MSG(!tile_map.has(p_id), Vector2(), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); return tile_map[p_id].autotile_data.icon_coord; } void TileSet::autotile_set_spacing(int p_id, int p_spacing) { - - ERR_FAIL_COND(!tile_map.has(p_id)); + ERR_FAIL_COND_MSG(!tile_map.has(p_id), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); ERR_FAIL_COND(p_spacing < 0); tile_map[p_id].autotile_data.spacing = p_spacing; emit_changed(); } int TileSet::autotile_get_spacing(int p_id) const { - - ERR_FAIL_COND_V(!tile_map.has(p_id), 0); + ERR_FAIL_COND_V_MSG(!tile_map.has(p_id), 0, vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); return tile_map[p_id].autotile_data.spacing; } void TileSet::autotile_set_size(int p_id, Size2 p_size) { - - ERR_FAIL_COND(!tile_map.has(p_id)); + ERR_FAIL_COND_MSG(!tile_map.has(p_id), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); ERR_FAIL_COND(p_size.x <= 0 || p_size.y <= 0); tile_map[p_id].autotile_data.size = p_size; } Size2 TileSet::autotile_get_size(int p_id) const { - - ERR_FAIL_COND_V(!tile_map.has(p_id), Size2()); + ERR_FAIL_COND_V_MSG(!tile_map.has(p_id), Size2(), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); return tile_map[p_id].autotile_data.size; } void TileSet::autotile_clear_bitmask_map(int p_id) { - - ERR_FAIL_COND(!tile_map.has(p_id)); + ERR_FAIL_COND_MSG(!tile_map.has(p_id), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); 
tile_map[p_id].autotile_data.flags.clear(); } void TileSet::autotile_set_subtile_priority(int p_id, const Vector2 &p_coord, int p_priority) { - - ERR_FAIL_COND(!tile_map.has(p_id)); + ERR_FAIL_COND_MSG(!tile_map.has(p_id), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); ERR_FAIL_COND(p_priority <= 0); tile_map[p_id].autotile_data.priority_map[p_coord] = p_priority; } int TileSet::autotile_get_subtile_priority(int p_id, const Vector2 &p_coord) { - - ERR_FAIL_COND_V(!tile_map.has(p_id), 1); + ERR_FAIL_COND_V_MSG(!tile_map.has(p_id), 1, vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); if (tile_map[p_id].autotile_data.priority_map.has(p_coord)) { return tile_map[p_id].autotile_data.priority_map[p_coord]; } @@ -547,20 +524,18 @@ int TileSet::autotile_get_subtile_priority(int p_id, const Vector2 &p_coord) { const Map &TileSet::autotile_get_priority_map(int p_id) const { static Map dummy; - ERR_FAIL_COND_V(!tile_map.has(p_id), dummy); + ERR_FAIL_COND_V_MSG(!tile_map.has(p_id), dummy, vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); return tile_map[p_id].autotile_data.priority_map; } void TileSet::autotile_set_z_index(int p_id, const Vector2 &p_coord, int p_z_index) { - - ERR_FAIL_COND(!tile_map.has(p_id)); + ERR_FAIL_COND_MSG(!tile_map.has(p_id), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); tile_map[p_id].autotile_data.z_index_map[p_coord] = p_z_index; emit_changed(); } int TileSet::autotile_get_z_index(int p_id, const Vector2 &p_coord) { - - ERR_FAIL_COND_V(!tile_map.has(p_id), 1); + ERR_FAIL_COND_V_MSG(!tile_map.has(p_id), 1, vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); if (tile_map[p_id].autotile_data.z_index_map.has(p_coord)) { return tile_map[p_id].autotile_data.z_index_map[p_coord]; } @@ -571,13 +546,12 @@ int TileSet::autotile_get_z_index(int p_id, const Vector2 &p_coord) { const Map &TileSet::autotile_get_z_index_map(int p_id) const { static Map dummy; - ERR_FAIL_COND_V(!tile_map.has(p_id), dummy); + ERR_FAIL_COND_V_MSG(!tile_map.has(p_id), dummy, vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); return tile_map[p_id].autotile_data.z_index_map; } void TileSet::autotile_set_bitmask(int p_id, Vector2 p_coord, uint32_t p_flag) { - - ERR_FAIL_COND(!tile_map.has(p_id)); + ERR_FAIL_COND_MSG(!tile_map.has(p_id), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); if (p_flag == 0) { if (tile_map[p_id].autotile_data.flags.has(p_coord)) tile_map[p_id].autotile_data.flags.erase(p_coord); @@ -587,8 +561,7 @@ void TileSet::autotile_set_bitmask(int p_id, Vector2 p_coord, uint32_t p_flag) { } uint32_t TileSet::autotile_get_bitmask(int p_id, Vector2 p_coord) { - - ERR_FAIL_COND_V(!tile_map.has(p_id), 0); + ERR_FAIL_COND_V_MSG(!tile_map.has(p_id), 0, vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); if (!tile_map[p_id].autotile_data.flags.has(p_coord)) { return 0; } @@ -599,7 +572,7 @@ const Map &TileSet::autotile_get_bitmask_map(int p_id) { static Map dummy; static Map dummy_atlas; - ERR_FAIL_COND_V(!tile_map.has(p_id), dummy); + ERR_FAIL_COND_V_MSG(!tile_map.has(p_id), dummy, vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); if (tile_get_tile_mode(p_id) == ATLAS_TILE) { dummy_atlas = Map(); Rect2 region = tile_get_region(p_id); @@ -616,8 +589,7 @@ const Map &TileSet::autotile_get_bitmask_map(int p_id) { } Vector2 TileSet::autotile_get_subtile_for_bitmask(int p_id, uint16_t p_bitmask, const Node *p_tilemap_node, const Vector2 &p_tile_location) { - - 
ERR_FAIL_COND_V(!tile_map.has(p_id), Vector2()); + ERR_FAIL_COND_V_MSG(!tile_map.has(p_id), Vector2(), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); //First try to forward selection to script if (p_tilemap_node->get_class_name() == "TileMap") { if (get_script_instance() != NULL) { @@ -678,8 +650,7 @@ Vector2 TileSet::autotile_get_subtile_for_bitmask(int p_id, uint16_t p_bitmask, } Vector2 TileSet::atlastile_get_subtile_by_priority(int p_id, const Node *p_tilemap_node, const Vector2 &p_tile_location) { - - ERR_FAIL_COND_V(!tile_map.has(p_id), Vector2()); + ERR_FAIL_COND_V_MSG(!tile_map.has(p_id), Vector2(), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); //First try to forward selection to script if (get_script_instance() != NULL) { if (get_script_instance()->has_method("_forward_atlas_subtile_selection")) { @@ -708,28 +679,24 @@ Vector2 TileSet::atlastile_get_subtile_by_priority(int p_id, const Node *p_tilem } void TileSet::tile_set_name(int p_id, const String &p_name) { - - ERR_FAIL_COND(!tile_map.has(p_id)); + ERR_FAIL_COND_MSG(!tile_map.has(p_id), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); tile_map[p_id].name = p_name; emit_changed(); _change_notify("name"); } String TileSet::tile_get_name(int p_id) const { - - ERR_FAIL_COND_V(!tile_map.has(p_id), String()); + ERR_FAIL_COND_V_MSG(!tile_map.has(p_id), String(), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); return tile_map[p_id].name; } void TileSet::tile_clear_shapes(int p_id) { - - ERR_FAIL_COND(!tile_map.has(p_id)); + ERR_FAIL_COND_MSG(!tile_map.has(p_id), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); tile_map[p_id].shapes_data.clear(); } void TileSet::tile_add_shape(int p_id, const Ref &p_shape, const Transform2D &p_transform, bool p_one_way, const Vector2 &p_autotile_coord) { - - ERR_FAIL_COND(!tile_map.has(p_id)); + ERR_FAIL_COND_MSG(!tile_map.has(p_id), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); ShapeData new_data = ShapeData(); new_data.shape = p_shape; @@ -741,14 +708,12 @@ void TileSet::tile_add_shape(int p_id, const Ref &p_shape, const Transf } int TileSet::tile_get_shape_count(int p_id) const { - - ERR_FAIL_COND_V(!tile_map.has(p_id), 0); + ERR_FAIL_COND_V_MSG(!tile_map.has(p_id), 0, vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); return tile_map[p_id].shapes_data.size(); } void TileSet::tile_set_shape(int p_id, int p_shape_id, const Ref &p_shape) { - - ERR_FAIL_COND(!tile_map.has(p_id)); + ERR_FAIL_COND_MSG(!tile_map.has(p_id), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); ERR_FAIL_COND(p_shape_id < 0); if (p_shape_id >= tile_map[p_id].shapes_data.size()) @@ -759,8 +724,7 @@ void TileSet::tile_set_shape(int p_id, int p_shape_id, const Ref &p_sha } Ref TileSet::tile_get_shape(int p_id, int p_shape_id) const { - - ERR_FAIL_COND_V(!tile_map.has(p_id), Ref()); + ERR_FAIL_COND_V_MSG(!tile_map.has(p_id), Ref(), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); ERR_FAIL_COND_V(p_shape_id < 0, Ref()); if (p_shape_id < tile_map[p_id].shapes_data.size()) @@ -770,8 +734,7 @@ Ref TileSet::tile_get_shape(int p_id, int p_shape_id) const { } void TileSet::tile_set_shape_transform(int p_id, int p_shape_id, const Transform2D &p_offset) { - - ERR_FAIL_COND(!tile_map.has(p_id)); + ERR_FAIL_COND_MSG(!tile_map.has(p_id), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); ERR_FAIL_COND(p_shape_id < 0); if (p_shape_id >= tile_map[p_id].shapes_data.size()) @@ -781,8 
+744,7 @@ void TileSet::tile_set_shape_transform(int p_id, int p_shape_id, const Transform } Transform2D TileSet::tile_get_shape_transform(int p_id, int p_shape_id) const { - - ERR_FAIL_COND_V(!tile_map.has(p_id), Transform2D()); + ERR_FAIL_COND_V_MSG(!tile_map.has(p_id), Transform2D(), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); ERR_FAIL_COND_V(p_shape_id < 0, Transform2D()); if (p_shape_id < tile_map[p_id].shapes_data.size()) @@ -802,8 +764,7 @@ Vector2 TileSet::tile_get_shape_offset(int p_id, int p_shape_id) const { } void TileSet::tile_set_shape_one_way(int p_id, int p_shape_id, const bool p_one_way) { - - ERR_FAIL_COND(!tile_map.has(p_id)); + ERR_FAIL_COND_MSG(!tile_map.has(p_id), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); ERR_FAIL_COND(p_shape_id < 0); if (p_shape_id >= tile_map[p_id].shapes_data.size()) @@ -813,8 +774,7 @@ void TileSet::tile_set_shape_one_way(int p_id, int p_shape_id, const bool p_one_ } bool TileSet::tile_get_shape_one_way(int p_id, int p_shape_id) const { - - ERR_FAIL_COND_V(!tile_map.has(p_id), false); + ERR_FAIL_COND_V_MSG(!tile_map.has(p_id), false, vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); ERR_FAIL_COND_V(p_shape_id < 0, false); if (p_shape_id < tile_map[p_id].shapes_data.size()) @@ -824,8 +784,7 @@ bool TileSet::tile_get_shape_one_way(int p_id, int p_shape_id) const { } void TileSet::tile_set_shape_one_way_margin(int p_id, int p_shape_id, float p_margin) { - - ERR_FAIL_COND(!tile_map.has(p_id)); + ERR_FAIL_COND_MSG(!tile_map.has(p_id), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); ERR_FAIL_COND(p_shape_id < 0); if (p_shape_id >= tile_map[p_id].shapes_data.size()) @@ -835,8 +794,7 @@ void TileSet::tile_set_shape_one_way_margin(int p_id, int p_shape_id, float p_ma } float TileSet::tile_get_shape_one_way_margin(int p_id, int p_shape_id) const { - - ERR_FAIL_COND_V(!tile_map.has(p_id), 0); + ERR_FAIL_COND_V_MSG(!tile_map.has(p_id), 0, vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); ERR_FAIL_COND_V(p_shape_id < 0, 0); if (p_shape_id < tile_map[p_id].shapes_data.size()) @@ -846,19 +804,17 @@ float TileSet::tile_get_shape_one_way_margin(int p_id, int p_shape_id) const { } void TileSet::tile_set_light_occluder(int p_id, const Ref &p_light_occluder) { - - ERR_FAIL_COND(!tile_map.has(p_id)); + ERR_FAIL_COND_MSG(!tile_map.has(p_id), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); tile_map[p_id].occluder = p_light_occluder; } Ref TileSet::tile_get_light_occluder(int p_id) const { - - ERR_FAIL_COND_V(!tile_map.has(p_id), Ref()); + ERR_FAIL_COND_V_MSG(!tile_map.has(p_id), Ref(), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); return tile_map[p_id].occluder; } void TileSet::autotile_set_light_occluder(int p_id, const Ref &p_light_occluder, const Vector2 &p_coord) { - ERR_FAIL_COND(!tile_map.has(p_id)); + ERR_FAIL_COND_MSG(!tile_map.has(p_id), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); if (p_light_occluder.is_null()) { if (tile_map[p_id].autotile_data.occluder_map.has(p_coord)) { tile_map[p_id].autotile_data.occluder_map.erase(p_coord); @@ -869,8 +825,7 @@ void TileSet::autotile_set_light_occluder(int p_id, const Ref } Ref TileSet::autotile_get_light_occluder(int p_id, const Vector2 &p_coord) const { - - ERR_FAIL_COND_V(!tile_map.has(p_id), Ref()); + ERR_FAIL_COND_V_MSG(!tile_map.has(p_id), Ref(), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); if 
(!tile_map[p_id].autotile_data.occluder_map.has(p_coord)) { return Ref(); @@ -880,39 +835,33 @@ Ref TileSet::autotile_get_light_occluder(int p_id, const Vect } void TileSet::tile_set_navigation_polygon_offset(int p_id, const Vector2 &p_offset) { - - ERR_FAIL_COND(!tile_map.has(p_id)); + ERR_FAIL_COND_MSG(!tile_map.has(p_id), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); tile_map[p_id].navigation_polygon_offset = p_offset; } Vector2 TileSet::tile_get_navigation_polygon_offset(int p_id) const { - - ERR_FAIL_COND_V(!tile_map.has(p_id), Vector2()); + ERR_FAIL_COND_V_MSG(!tile_map.has(p_id), Vector2(), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); return tile_map[p_id].navigation_polygon_offset; } void TileSet::tile_set_navigation_polygon(int p_id, const Ref &p_navigation_polygon) { - - ERR_FAIL_COND(!tile_map.has(p_id)); + ERR_FAIL_COND_MSG(!tile_map.has(p_id), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); tile_map[p_id].navigation_polygon = p_navigation_polygon; } Ref TileSet::tile_get_navigation_polygon(int p_id) const { - - ERR_FAIL_COND_V(!tile_map.has(p_id), Ref()); + ERR_FAIL_COND_V_MSG(!tile_map.has(p_id), Ref(), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); return tile_map[p_id].navigation_polygon; } const Map > &TileSet::autotile_get_light_oclusion_map(int p_id) const { - static Map > dummy; - ERR_FAIL_COND_V(!tile_map.has(p_id), dummy); + ERR_FAIL_COND_V_MSG(!tile_map.has(p_id), dummy, vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); return tile_map[p_id].autotile_data.occluder_map; } void TileSet::autotile_set_navigation_polygon(int p_id, const Ref &p_navigation_polygon, const Vector2 &p_coord) { - - ERR_FAIL_COND(!tile_map.has(p_id)); + ERR_FAIL_COND_MSG(!tile_map.has(p_id), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); if (p_navigation_polygon.is_null()) { if (tile_map[p_id].autotile_data.navpoly_map.has(p_coord)) { tile_map[p_id].autotile_data.navpoly_map.erase(p_coord); @@ -923,8 +872,7 @@ void TileSet::autotile_set_navigation_polygon(int p_id, const Ref TileSet::autotile_get_navigation_polygon(int p_id, const Vector2 &p_coord) const { - - ERR_FAIL_COND_V(!tile_map.has(p_id), Ref()); + ERR_FAIL_COND_V_MSG(!tile_map.has(p_id), Ref(), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); if (!tile_map[p_id].autotile_data.navpoly_map.has(p_coord)) { return Ref(); } else { @@ -933,27 +881,23 @@ Ref TileSet::autotile_get_navigation_polygon(int p_id, const } const Map > &TileSet::autotile_get_navigation_map(int p_id) const { - static Map > dummy; - ERR_FAIL_COND_V(!tile_map.has(p_id), dummy); + ERR_FAIL_COND_V_MSG(!tile_map.has(p_id), dummy, vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); return tile_map[p_id].autotile_data.navpoly_map; } void TileSet::tile_set_occluder_offset(int p_id, const Vector2 &p_offset) { - - ERR_FAIL_COND(!tile_map.has(p_id)); + ERR_FAIL_COND_MSG(!tile_map.has(p_id), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); tile_map[p_id].occluder_offset = p_offset; } Vector2 TileSet::tile_get_occluder_offset(int p_id) const { - - ERR_FAIL_COND_V(!tile_map.has(p_id), Vector2()); + ERR_FAIL_COND_V_MSG(!tile_map.has(p_id), Vector2(), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); return tile_map[p_id].occluder_offset; } void TileSet::tile_set_shapes(int p_id, const Vector &p_shapes) { - - ERR_FAIL_COND(!tile_map.has(p_id)); + ERR_FAIL_COND_MSG(!tile_map.has(p_id), vformat("The TileSet doesn't have a 
tile with ID '%d'.", p_id)); tile_map[p_id].shapes_data = p_shapes; for (int i = 0; i < p_shapes.size(); i++) { _decompose_convex_shape(p_shapes[i].shape); @@ -962,21 +906,18 @@ void TileSet::tile_set_shapes(int p_id, const Vector &p_shapes) { } Vector TileSet::tile_get_shapes(int p_id) const { - - ERR_FAIL_COND_V(!tile_map.has(p_id), Vector()); + ERR_FAIL_COND_V_MSG(!tile_map.has(p_id), Vector(), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); return tile_map[p_id].shapes_data; } int TileSet::tile_get_z_index(int p_id) const { - - ERR_FAIL_COND_V(!tile_map.has(p_id), 0); + ERR_FAIL_COND_V_MSG(!tile_map.has(p_id), 0, vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); return tile_map[p_id].z_index; } void TileSet::tile_set_z_index(int p_id, int p_z_index) { - - ERR_FAIL_COND(!tile_map.has(p_id)); + ERR_FAIL_COND_MSG(!tile_map.has(p_id), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); tile_map[p_id].z_index = p_z_index; emit_changed(); } @@ -1120,8 +1061,7 @@ bool TileSet::is_tile_bound(int p_drawn_id, int p_neighbor_id) { } void TileSet::remove_tile(int p_id) { - - ERR_FAIL_COND(!tile_map.has(p_id)); + ERR_FAIL_COND_MSG(!tile_map.has(p_id), vformat("The TileSet doesn't have a tile with ID '%d'.", p_id)); tile_map.erase(p_id); _change_notify(""); emit_changed(); From 5193c3c8eb8ad8cb5a35e9dbca1cffa27f507a90 Mon Sep 17 00:00:00 2001 From: Kyle Date: Wed, 31 Mar 2021 19:14:29 -0400 Subject: [PATCH 82/84] Add ctrl+shift+a to instance scene in scenetree dock Adds another key shortcut to instance a scene in the scentree dock. Complements ctrl+a to add a node. (cherry picked from commit ea5445655ced4568a37d596cd0af71ce8c7809d4) --- editor/scene_tree_dock.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/editor/scene_tree_dock.cpp b/editor/scene_tree_dock.cpp index 2b79311babd9..48b7df315047 100644 --- a/editor/scene_tree_dock.cpp +++ b/editor/scene_tree_dock.cpp @@ -3088,7 +3088,7 @@ SceneTreeDock::SceneTreeDock(EditorNode *p_editor, Node *p_scene_root, EditorSel ED_SHORTCUT("scene_tree/rename", TTR("Rename"), KEY_F2); ED_SHORTCUT("scene_tree/batch_rename", TTR("Batch Rename"), KEY_MASK_SHIFT | KEY_F2); ED_SHORTCUT("scene_tree/add_child_node", TTR("Add Child Node"), KEY_MASK_CMD | KEY_A); - ED_SHORTCUT("scene_tree/instance_scene", TTR("Instance Child Scene")); + ED_SHORTCUT("scene_tree/instance_scene", TTR("Instance Child Scene"), KEY_MASK_CMD | KEY_MASK_SHIFT | KEY_A); ED_SHORTCUT("scene_tree/expand_collapse_all", TTR("Expand/Collapse All")); ED_SHORTCUT("scene_tree/cut_node", TTR("Cut"), KEY_MASK_CMD | KEY_X); ED_SHORTCUT("scene_tree/copy_node", TTR("Copy"), KEY_MASK_CMD | KEY_C); From 140cf0f2cb7b51d7866e63aba1aa6d8029cf540b Mon Sep 17 00:00:00 2001 From: trollodel <33117082+trollodel@users.noreply.github.com> Date: Tue, 27 Apr 2021 11:23:08 +0200 Subject: [PATCH 83/84] Create CollisionObject debug shapes using VS (cherry picked from commit 60ee8c9639d5abacdc6dfb7bd96b0c51d337ad43) --- scene/3d/collision_object.cpp | 121 +++++++++++++++++++++++++--------- scene/3d/collision_object.h | 12 ++-- scene/3d/collision_shape.cpp | 17 ++--- scene/3d/collision_shape.h | 2 - scene/3d/physics_body.cpp | 2 + 5 files changed, 103 insertions(+), 51 deletions(-) diff --git a/scene/3d/collision_object.cpp b/scene/3d/collision_object.cpp index 5331a060088f..25ad315fd853 100644 --- a/scene/3d/collision_object.cpp +++ b/scene/3d/collision_object.cpp @@ -38,6 +38,21 @@ void CollisionObject::_notification(int p_what) { switch (p_what) { + case 
NOTIFICATION_ENTER_TREE: { + if (_are_collision_shapes_visible()) { + debug_shape_old_transform = get_global_transform(); + for (Map::Element *E = shapes.front(); E; E = E->next()) { + debug_shapes_to_update.insert(E->key()); + } + _update_debug_shapes(); + } + } break; + + case NOTIFICATION_EXIT_TREE: { + if (debug_shapes_count > 0) { + _clear_debug_shapes(); + } + } break; case NOTIFICATION_ENTER_WORLD: { @@ -63,6 +78,8 @@ void CollisionObject::_notification(int p_what) { else PhysicsServer::get_singleton()->body_set_state(rid, PhysicsServer::BODY_STATE_TRANSFORM, get_global_transform()); + _on_transform_changed(); + } break; case NOTIFICATION_VISIBILITY_CHANGED: { @@ -77,11 +94,6 @@ void CollisionObject::_notification(int p_what) { PhysicsServer::get_singleton()->body_set_space(rid, RID()); } break; - case NOTIFICATION_PREDELETE: { - if (debug_shape_count > 0) { - _clear_debug_shapes(); - } - } break; } } @@ -120,6 +132,33 @@ void CollisionObject::_update_pickable() { PhysicsServer::get_singleton()->body_set_ray_pickable(rid, pickable); } +bool CollisionObject::_are_collision_shapes_visible() { + return is_inside_tree() && get_tree()->is_debugging_collisions_hint() && !Engine::get_singleton()->is_editor_hint(); +} + +void CollisionObject::_update_shape_data(uint32_t p_owner) { + if (_are_collision_shapes_visible()) { + if (debug_shapes_to_update.empty()) { + call_deferred("_update_debug_shapes"); + } + debug_shapes_to_update.insert(p_owner); + } +} + +void CollisionObject::_shape_changed(const Ref &p_shape) { + for (Map::Element *E = shapes.front(); E; E = E->next()) { + ShapeData &shapedata = E->get(); + ShapeData::ShapeBase *shapes = shapedata.shapes.ptrw(); + for (int i = 0; i < shapedata.shapes.size(); i++) { + ShapeData::ShapeBase &s = shapes[i]; + if (s.shape == p_shape && s.debug_shape.is_valid()) { + Ref mesh = s.shape->get_debug_mesh(); + VS::get_singleton()->instance_set_base(s.debug_shape, mesh->get_rid()); + } + } + } +} + void CollisionObject::_update_debug_shapes() { for (Set::Element *shapedata_idx = debug_shapes_to_update.front(); shapedata_idx; shapedata_idx = shapedata_idx->next()) { if (shapes.has(shapedata_idx->get())) { @@ -127,24 +166,27 @@ void CollisionObject::_update_debug_shapes() { ShapeData::ShapeBase *shapes = shapedata.shapes.ptrw(); for (int i = 0; i < shapedata.shapes.size(); i++) { ShapeData::ShapeBase &s = shapes[i]; - if (s.debug_shape) { - s.debug_shape->queue_delete(); - s.debug_shape = nullptr; - --debug_shape_count; - } if (s.shape.is_null() || shapedata.disabled) { - continue; + if (s.debug_shape.is_valid()) { + VS::get_singleton()->free(s.debug_shape); + s.debug_shape = RID(); + --debug_shapes_count; + } + } + if (!s.debug_shape.is_valid()) { + s.debug_shape = VS::get_singleton()->instance_create(); + VS::get_singleton()->instance_set_scenario(s.debug_shape, get_world()->get_scenario()); + + if (!s.shape->is_connected("changed", this, "_shape_changed")) { + s.shape->connect("changed", this, "_shape_changed", varray(s.shape), CONNECT_DEFERRED); + } + + ++debug_shapes_count; } Ref mesh = s.shape->get_debug_mesh(); - MeshInstance *mi = memnew(MeshInstance); - mi->set_transform(shapedata.xform); - mi->set_mesh(mesh); - add_child(mi); - - mi->force_update_transform(); - s.debug_shape = mi; - ++debug_shape_count; + VS::get_singleton()->instance_set_base(s.debug_shape, mesh->get_rid()); + VS::get_singleton()->instance_set_transform(s.debug_shape, get_global_transform() * shapedata.xform); } } } @@ -157,22 +199,29 @@ void 
CollisionObject::_clear_debug_shapes() { ShapeData::ShapeBase *shapes = shapedata.shapes.ptrw(); for (int i = 0; i < shapedata.shapes.size(); i++) { ShapeData::ShapeBase &s = shapes[i]; - if (s.debug_shape) { - s.debug_shape->queue_delete(); - s.debug_shape = nullptr; + if (s.debug_shape.is_valid()) { + VS::get_singleton()->free(s.debug_shape); + s.debug_shape = RID(); + if (s.shape.is_valid() && s.shape->is_connected("changed", this, "_shape_changed")) { + s.shape->disconnect("changed", this, "_shape_changed"); + } } } } - debug_shape_count = 0; + debug_shapes_count = 0; } -void CollisionObject::_update_shape_data(uint32_t p_owner) { - if (is_inside_tree() && get_tree()->is_debugging_collisions_hint() && !Engine::get_singleton()->is_editor_hint()) { - if (debug_shapes_to_update.empty()) { - call_deferred("_update_debug_shapes"); +void CollisionObject::_on_transform_changed() { + if (debug_shapes_count > 0 && !debug_shape_old_transform.is_equal_approx(get_global_transform())) { + debug_shape_old_transform = get_global_transform(); + for (Map::Element *E = shapes.front(); E; E = E->next()) { + ShapeData &shapedata = E->get(); + const ShapeData::ShapeBase *shapes = shapedata.shapes.ptr(); + for (int i = 0; i < shapedata.shapes.size(); i++) { + VS::get_singleton()->instance_set_transform(shapes[i].debug_shape, debug_shape_old_transform * shapedata.xform); + } } - debug_shapes_to_update.insert(p_owner); } } @@ -211,6 +260,7 @@ void CollisionObject::_bind_methods() { ClassDB::bind_method(D_METHOD("shape_find_owner", "shape_index"), &CollisionObject::shape_find_owner); ClassDB::bind_method(D_METHOD("_update_debug_shapes"), &CollisionObject::_update_debug_shapes); + ClassDB::bind_method(D_METHOD("_shape_changed", "shape"), &CollisionObject::_shape_changed); BIND_VMETHOD(MethodInfo("_input_event", PropertyInfo(Variant::OBJECT, "camera"), PropertyInfo(Variant::OBJECT, "event", PROPERTY_HINT_RESOURCE_TYPE, "InputEvent"), PropertyInfo(Variant::VECTOR3, "click_position"), PropertyInfo(Variant::VECTOR3, "click_normal"), PropertyInfo(Variant::INT, "shape_idx"))); @@ -253,7 +303,11 @@ void CollisionObject::shape_owner_set_disabled(uint32_t p_owner, bool p_disabled ERR_FAIL_COND(!shapes.has(p_owner)); ShapeData &sd = shapes[p_owner]; + if (sd.disabled == p_disabled) { + return; + } sd.disabled = p_disabled; + for (int i = 0; i < sd.shapes.size(); i++) { if (area) { PhysicsServer::get_singleton()->area_set_shape_disabled(rid, sd.shapes[i].index, p_disabled); @@ -365,7 +419,7 @@ void CollisionObject::shape_owner_remove_shape(uint32_t p_owner, int p_shape) { ERR_FAIL_COND(!shapes.has(p_owner)); ERR_FAIL_INDEX(p_shape, shapes[p_owner].shapes.size()); - const ShapeData::ShapeBase &s = shapes[p_owner].shapes[p_shape]; + ShapeData::ShapeBase &s = shapes[p_owner].shapes.write[p_shape]; int index_to_remove = s.index; if (area) { @@ -374,9 +428,12 @@ void CollisionObject::shape_owner_remove_shape(uint32_t p_owner, int p_shape) { PhysicsServer::get_singleton()->body_remove_shape(rid, index_to_remove); } - if (s.debug_shape) { - s.debug_shape->queue_delete(); - --debug_shape_count; + if (s.debug_shape.is_valid()) { + VS::get_singleton()->free(s.debug_shape); + if (s.shape.is_valid() && s.shape->is_connected("changed", this, "_shape_changed")) { + s.shape->disconnect("changed", this, "_shape_changed"); + } + --debug_shapes_count; } shapes[p_owner].shapes.remove(p_shape); diff --git a/scene/3d/collision_object.h b/scene/3d/collision_object.h index 02f5c8bef943..f8bf8d44703e 100644 --- a/scene/3d/collision_object.h +++ 
b/scene/3d/collision_object.h @@ -47,7 +47,7 @@ class CollisionObject : public Spatial { Object *owner; Transform xform; struct ShapeBase { - Node *debug_shape = nullptr; + RID debug_shape; Ref shape; int index; }; @@ -69,11 +69,16 @@ class CollisionObject : public Spatial { bool ray_pickable; Set debug_shapes_to_update; - int debug_shape_count = 0; + int debug_shapes_count = 0; + Transform debug_shape_old_transform; void _update_pickable(); + bool _are_collision_shapes_visible(); void _update_shape_data(uint32_t p_owner); + void _shape_changed(const Ref &p_shape); + void _update_debug_shapes(); + void _clear_debug_shapes(); protected: CollisionObject(RID p_rid, bool p_area); @@ -85,8 +90,7 @@ class CollisionObject : public Spatial { virtual void _mouse_enter(); virtual void _mouse_exit(); - void _update_debug_shapes(); - void _clear_debug_shapes(); + void _on_transform_changed(); public: uint32_t create_shape_owner(Object *p_owner); diff --git a/scene/3d/collision_shape.cpp b/scene/3d/collision_shape.cpp index 63c9a0373919..09b785dcdbf2 100644 --- a/scene/3d/collision_shape.cpp +++ b/scene/3d/collision_shape.cpp @@ -158,8 +158,6 @@ void CollisionShape::_bind_methods() { ClassDB::bind_method(D_METHOD("make_convex_from_brothers"), &CollisionShape::make_convex_from_brothers); ClassDB::set_method_flags("CollisionShape", "make_convex_from_brothers", METHOD_FLAGS_DEFAULT | METHOD_FLAG_EDITOR); - ClassDB::bind_method(D_METHOD("_shape_changed"), &CollisionShape::_shape_changed); - ADD_PROPERTY(PropertyInfo(Variant::OBJECT, "shape", PROPERTY_HINT_RESOURCE_TYPE, "Shape"), "set_shape", "get_shape"); ADD_PROPERTY(PropertyInfo(Variant::BOOL, "disabled"), "set_disabled", "is_disabled"); } @@ -170,12 +168,10 @@ void CollisionShape::set_shape(const Ref &p_shape) { } if (!shape.is_null()) { shape->unregister_owner(this); - shape->disconnect("changed", this, "_shape_changed"); } shape = p_shape; if (!shape.is_null()) { shape->register_owner(this); - shape->connect("changed", this, "_shape_changed"); } update_gizmo(); if (parent) { @@ -185,8 +181,10 @@ void CollisionShape::set_shape(const Ref &p_shape) { } } - if (is_inside_tree()) - _shape_changed(); + if (is_inside_tree() && parent) { + // If this is a heightfield shape our center may have changed + _update_in_shape_owner(true); + } update_configuration_warning(); } @@ -223,10 +221,3 @@ CollisionShape::~CollisionShape() { shape->unregister_owner(this); //VisualServer::get_singleton()->free(indicator); } - -void CollisionShape::_shape_changed() { - // If this is a heightfield shape our center may have changed - if (parent) { - _update_in_shape_owner(true); - } -} diff --git a/scene/3d/collision_shape.h b/scene/3d/collision_shape.h index fc331315d299..5f265389cc84 100644 --- a/scene/3d/collision_shape.h +++ b/scene/3d/collision_shape.h @@ -48,8 +48,6 @@ class CollisionShape : public Spatial { bool disabled; protected: - void _shape_changed(); - void _update_in_shape_owner(bool p_xform_only = false); protected: diff --git a/scene/3d/physics_body.cpp b/scene/3d/physics_body.cpp index 0d21fbe1b609..03cf645e3899 100644 --- a/scene/3d/physics_body.cpp +++ b/scene/3d/physics_body.cpp @@ -457,6 +457,7 @@ void RigidBody::_direct_state_changed(Object *p_state) { if (get_script_instance()) get_script_instance()->call("_integrate_forces", state); set_ignore_transform_notification(false); + _on_transform_changed(); if (contact_monitor) { @@ -2224,6 +2225,7 @@ void PhysicalBone::_direct_state_changed(Object *p_state) { set_ignore_transform_notification(true); 
set_global_transform(global_transform); set_ignore_transform_notification(false); + _on_transform_changed(); // Update skeleton if (parent_skeleton) { From 04ebfaa3897d5d96396c34b3255da1840e1fc20b Mon Sep 17 00:00:00 2001 From: Will Whitty Date: Sun, 9 May 2021 14:05:32 +0300 Subject: [PATCH 84/84] Work on porting HTTPRequest compression to 3.3 Fix doc issues --- core/io/compression.cpp | 86 ++++++++++++++++++++++++++ core/io/compression.h | 3 + core/variant_call.cpp | 19 ++++++ doc/classes/HTTPRequest.xml | 40 ++++++++++-- doc/classes/PoolByteArray.xml | 14 +++++ scene/main/http_request.cpp | 113 ++++++++++++++++++++++++++++++++-- scene/main/http_request.h | 14 ++++- 7 files changed, 276 insertions(+), 13 deletions(-) diff --git a/core/io/compression.cpp b/core/io/compression.cpp index 09983ae3d6e9..ad24fed82e0b 100644 --- a/core/io/compression.cpp +++ b/core/io/compression.cpp @@ -187,8 +187,94 @@ int Compression::decompress(uint8_t *p_dst, int p_dst_max_size, const uint8_t *p ERR_FAIL_V(-1); } +/** + This will handle both Gzip and Deflat streams. It will automatically allocate the output buffer into the provided p_dst_vect Vector. + This is required for compressed data who's final uncompressed size is unknown, as is the case for HTTP response bodies. + This is much slower however than using Compression::decompress because it may result in multiple full copies of the output buffer. +*/ +int Compression::decompress_dynamic(PoolVector *p_dst, int p_max_dst_size, const uint8_t *p_src, int p_src_size, Mode p_mode) { + int ret; + uint8_t *dst = nullptr; + int out_mark = 0; + z_stream strm; + + ERR_FAIL_COND_V(p_src_size <= 0, Z_DATA_ERROR); + + // This function only supports GZip and Deflate + int window_bits = p_mode == MODE_DEFLATE ? 15 : 15 + 16; + ERR_FAIL_COND_V(p_mode != MODE_DEFLATE && p_mode != MODE_GZIP, Z_ERRNO); + + // Initialize the stream + strm.zalloc = Z_NULL; + strm.zfree = Z_NULL; + strm.opaque = Z_NULL; + strm.avail_in = 0; + strm.next_in = Z_NULL; + + int err = inflateInit2(&strm, window_bits); + ERR_FAIL_COND_V(err != Z_OK, -1); + + // Setup the stream inputs + strm.next_in = (Bytef *)p_src; + strm.avail_in = p_src_size; + + // Ensure the destination buffer is empty + p_dst->resize(0); + + // decompress until deflate stream ends or end of file + do { + // Add another chunk size to the output buffer + // This forces a copy of the whole buffer + p_dst->resize(p_dst->size() + gzip_chunk); + // Get pointer to the actual output buffer + dst = p_dst->write().ptr(); + + // Set the stream to the new output stream + // Since it was copied, we need to reset the stream to the new buffer + strm.next_out = &(dst[out_mark]); + strm.avail_out = gzip_chunk; + + // run inflate() on input until output buffer is full and needs to be resized + // or input runs out + do { + ret = inflate(&strm, Z_SYNC_FLUSH); + + switch (ret) { + case Z_NEED_DICT: + ret = Z_DATA_ERROR; + case Z_DATA_ERROR: + case Z_MEM_ERROR: + case Z_STREAM_ERROR: + WARN_PRINT(strm.msg); + (void)inflateEnd(&strm); + p_dst->resize(0); + return ret; + } + } while (strm.avail_out > 0 && strm.avail_in > 0); + + out_mark += gzip_chunk; + + // Encorce max output size + if (p_max_dst_size > -1 && strm.total_out > (uint64_t)p_max_dst_size) { + (void)inflateEnd(&strm); + p_dst->resize(0); + return Z_BUF_ERROR; + } + } while (ret != Z_STREAM_END); + + // If all done successfully, resize the output if it's larger than the actual output + if (ret == Z_STREAM_END && (unsigned long)p_dst->size() > strm.total_out) { + 
p_dst->resize(strm.total_out); + } + + // clean up and return + (void)inflateEnd(&strm); + return ret == Z_STREAM_END ? Z_OK : Z_DATA_ERROR; +} + int Compression::zlib_level = Z_DEFAULT_COMPRESSION; int Compression::gzip_level = Z_DEFAULT_COMPRESSION; int Compression::zstd_level = 3; bool Compression::zstd_long_distance_matching = false; int Compression::zstd_window_log_size = 27; // ZSTD_WINDOWLOG_LIMIT_DEFAULT +int Compression::gzip_chunk = 16384; diff --git a/core/io/compression.h b/core/io/compression.h index fa49440677c1..7dc0e20448d0 100644 --- a/core/io/compression.h +++ b/core/io/compression.h @@ -31,6 +31,7 @@ #ifndef COMPRESSION_H #define COMPRESSION_H +#include "core/pool_vector.h" #include "core/typedefs.h" class Compression { @@ -41,6 +42,7 @@ class Compression { static int zstd_level; static bool zstd_long_distance_matching; static int zstd_window_log_size; + static int gzip_chunk; enum Mode { MODE_FASTLZ, @@ -52,6 +54,7 @@ class Compression { static int compress(uint8_t *p_dst, const uint8_t *p_src, int p_src_size, Mode p_mode = MODE_ZSTD); static int get_max_compressed_buffer_size(int p_src_size, Mode p_mode = MODE_ZSTD); static int decompress(uint8_t *p_dst, int p_dst_max_size, const uint8_t *p_src, int p_src_size, Mode p_mode = MODE_ZSTD); + static int decompress_dynamic(PoolVector *p_dst, int p_max_dst_size, const uint8_t *p_src, int p_src_size, Mode p_mode); Compression(); }; diff --git a/core/variant_call.cpp b/core/variant_call.cpp index b7296de1c2c6..ffece2dc3328 100644 --- a/core/variant_call.cpp +++ b/core/variant_call.cpp @@ -642,6 +642,24 @@ struct _VariantCall { r_ret = decompressed; } + static void _call_PoolByteArray_decompress_dynamic(Variant &r_ret, Variant &p_self, const Variant **p_args) { + PoolByteArray *ba = reinterpret_cast(p_self._data._mem); + PoolByteArray *decompressed = memnew(PoolByteArray); + int max_output_size = (int)(*p_args[0]); + Compression::Mode mode = (Compression::Mode)(int)(*p_args[1]); + + decompressed->resize(1024); + int result = Compression::decompress_dynamic(decompressed, max_output_size, ba->read().ptr(), ba->size(), mode); + + if (result == OK) { + r_ret = decompressed; + } else { + decompressed->resize(0); + r_ret = decompressed; + ERR_FAIL_MSG("Decompression failed."); + } + } + static void _call_PoolByteArray_hex_encode(Variant &r_ret, Variant &p_self, const Variant **p_args) { PoolByteArray *ba = reinterpret_cast(p_self._data._mem); if (ba->size() == 0) { @@ -1865,6 +1883,7 @@ void register_variant_methods() { ADDFUNC0R(POOL_BYTE_ARRAY, STRING, PoolByteArray, hex_encode, varray()); ADDFUNC1R(POOL_BYTE_ARRAY, POOL_BYTE_ARRAY, PoolByteArray, compress, INT, "compression_mode", varray(0)); ADDFUNC2R(POOL_BYTE_ARRAY, POOL_BYTE_ARRAY, PoolByteArray, decompress, INT, "buffer_size", INT, "compression_mode", varray(0)); + ADDFUNC2R(POOL_BYTE_ARRAY, POOL_BYTE_ARRAY, PoolByteArray, decompress_dynamic, INT, "max_output_size", INT, "compression_mode", varray(0)); ADDFUNC0R(POOL_INT_ARRAY, INT, PoolIntArray, size, varray()); ADDFUNC0R(POOL_INT_ARRAY, BOOL, PoolIntArray, empty, varray()); diff --git a/doc/classes/HTTPRequest.xml b/doc/classes/HTTPRequest.xml index 280350d7bdb3..869fd4767b27 100644 --- a/doc/classes/HTTPRequest.xml +++ b/doc/classes/HTTPRequest.xml @@ -64,6 +64,10 @@ add_child(texture_rect) texture_rect.texture = texture [/codeblock] + [b]Gzipped response bodies[/b] + HttpRequest will automatically handle decompression of response bodies. 
+ An "Accept-Encoding" header will be automatically added to each of your requests, unless one is already specified. + Any response with a "Content-Encoding: gzip" header will automatically be decompressed and delivered to you as uncompressed bytes. [b]Note:[/b] When performing HTTP requests from a project exported to HTML5, keep in mind the remote server may not allow requests from foreign origins due to [url=https://developer.mozilla.org/en-US/docs/Web/HTTP/CORS]CORS[/url]. If you host the server in question, you should modify its backend to allow requests from foreign origins by adding the [code]Access-Control-Allow-Origin: *[/code] HTTP header. [b]Note:[/b] SSL/TLS support is currently limited to TLS 1.0, TLS 1.1, and TLS 1.2. Attempting to connect to a TLS 1.3-only server will return an error. @@ -120,10 +124,34 @@ [b]Note:[/b] The [code]request_data[/code] parameter is ignored if [code]method[/code] is [constant HTTPClient.METHOD_GET]. This is because GET methods can't contain request data. As a workaround, you can pass request data as a query string in the URL. See [method String.http_escape] for an example. + + + + + + + + + + + + + + + Creates a request on the underlying [HTTPClient] using a raw array of bytes for the request body. If there are no configuration errors, it tries to connect using [method HTTPClient.connect_to_host] and passes parameters onto [method HTTPClient.request]. + Returns [constant OK] if the request is successfully created. (Does not imply that the server has responded), [constant ERR_UNCONFIGURED] if not in the tree, [constant ERR_BUSY] if still processing previous request, [constant ERR_INVALID_PARAMETER] if given string is not a valid URL format, or [constant ERR_CANT_CONNECT] if not using threads and the [HTTPClient] cannot connect to host. + + + + If [code]true[/code], this header will be added to each request: [code]Accept-Encoding: gzip, deflate[/code], telling servers that it's okay to compress response bodies. + Any response body declaring a [code]Content-Encoding[/code] of either [code]gzip[/code] or [code]deflate[/code] will then be automatically decompressed, and the uncompressed bytes will be delivered via [code]request_completed[/code]. + If the user has specified their own [code]Accept-Encoding[/code] header, then no header will be added regardless of [code]accept_gzip[/code]. + If [code]false[/code], no header will be added, and no decompression will be performed on response bodies. The raw bytes of the response body will be returned via [code]request_completed[/code]. + - Maximum allowed size for response bodies. + Maximum allowed size for response bodies. If the response body is compressed, this will be used as the maximum allowed size for the decompressed body. The size of the buffer used and maximum bytes to read per iteration. See [member HTTPClient.read_chunk_size]. @@ -180,19 +208,19 @@ Request exceeded its maximum size limit, see [member body_size_limit]. - + Request failed (currently unused). - + HTTPRequest couldn't open the download file. - + HTTPRequest couldn't write to the download file. - + Request reached its maximum redirect limit, see [member max_redirects]. - + 
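To make the decompression flow documented above concrete, here is a minimal sketch of how a gzip-encoded response body can be decompressed with a hard output cap using the Compression::decompress_dynamic API added by this patch. It is only an illustration, not the patch's own code path (which lives in HTTPRequest::_request_done); the helper name decompress_response_body and its standalone framing are assumptions for this sketch.

#include "core/error_list.h"
#include "core/io/compression.h"
#include "core/pool_vector.h"

// Illustrative helper (assumed, not part of the patch): decompress a gzip-encoded
// body, refusing to produce more than p_body_size_limit bytes of output.
static PoolVector<uint8_t> decompress_response_body(const PoolVector<uint8_t> &p_body, int p_body_size_limit) {
	PoolVector<uint8_t> out;
	// decompress_dynamic grows the output buffer itself, which is why it suits HTTP
	// bodies: the uncompressed size is not known up front. Passing -1 as the limit
	// would allow unbounded output.
	int err = Compression::decompress_dynamic(&out, p_body_size_limit, p_body.read().ptr(), p_body.size(), Compression::MODE_GZIP);
	if (err != OK) {
		// Clear the output on failure, as the PoolByteArray binding in this patch does.
		out.resize(0);
	}
	return out;
}

A positive size limit here provides the same protection against highly compressed ("zip bomb") payloads that HTTPRequest itself gets from body_size_limit.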
diff --git a/doc/classes/PoolByteArray.xml b/doc/classes/PoolByteArray.xml index 9f72802ecd49..0f33510f38c9 100644 --- a/doc/classes/PoolByteArray.xml +++ b/doc/classes/PoolByteArray.xml @@ -53,6 +53,20 @@ Returns a new [PoolByteArray] with the data decompressed. Set [code]buffer_size[/code] to the size of the uncompressed data. Set the compression mode using one of [enum File.CompressionMode]'s constants. + + + + + + + + + Returns a new [PoolByteArray] with the data decompressed. Set the compression mode using one of [enum File.CompressionMode]'s constants. [b]This method only accepts gzip and deflate compression modes.[/b] + This method is potentially slower than [code]decompress[/code], as it may have to re-allocate its output buffer multiple times while decompressing, whereas [code]decompress[/code] knows its output buffer size from the beginning. + + GZIP has a maximal compression ratio of 1032:1, meaning it's very possible for a small compressed payload to decompress to a potentially very large output. To guard against this, you may provide a maximum size this function is allowed to allocate in bytes via [code]max_output_size[/code]. Passing -1 will allow for unbounded output. If any positive value is passed, and the decompression exceeds that amount in bytes, then an error will be returned. + + diff --git a/scene/main/http_request.cpp b/scene/main/http_request.cpp index ec49689acaa4..cef92947dde7 100644 --- a/scene/main/http_request.cpp +++ b/scene/main/http_request.cpp @@ -29,6 +29,8 @@ /*************************************************************************/ #include "http_request.h" +#include "core/io/compression.h" +#include "core/ustring.h" void HTTPRequest::_redirect_request(const String &p_new_url) { } @@ -84,7 +86,50 @@ Error HTTPRequest::_parse_url(const String &p_url) { return OK; } +bool HTTPRequest::has_header(const Vector<String> &p_headers, const String &p_header_name) { + bool exists = false; + + String lower_case_header_name = p_header_name.to_lower(); + for (int i = 0; i < p_headers.size() && !exists; i++) { + String sanitized = p_headers[i].strip_edges().to_lower(); + if (sanitized.begins_with(lower_case_header_name)) { + exists = true; + } + } + + return exists; +} + +String HTTPRequest::get_header_value(const PoolStringArray &p_headers, const String &p_header_name) { + String value = ""; + + String lowwer_case_header_name = p_header_name.to_lower(); + for (int i = 0; i < p_headers.size(); i++) { + if (p_headers[i].find(":", 0) >= 0) { + Vector<String> parts = p_headers[i].split(":", false, 1); + if (parts[0].strip_edges().to_lower() == lowwer_case_header_name) { + value = parts[1].strip_edges(); + break; + } + } + } + + return value; +} + Error HTTPRequest::request(const String &p_url, const Vector<String> &p_custom_headers, bool p_ssl_validate_domain, HTTPClient::Method p_method, const String &p_request_data) { + // Copy the string into a raw buffer + PoolVector<uint8_t> raw_data; + + CharString charstr = p_request_data.utf8(); + size_t len = charstr.length(); + raw_data.resize(len); + copymem(raw_data.write().ptr(), charstr.ptr(), len); + + return request_raw(p_url, p_custom_headers, p_ssl_validate_domain, p_method, raw_data); +} + +Error HTTPRequest::request_raw(const String &p_url, const Vector<String> &p_custom_headers, bool p_ssl_validate_domain, HTTPClient::Method p_method, const PoolVector<uint8_t> &p_request_data_raw) { ERR_FAIL_COND_V(!is_inside_tree(), ERR_UNCONFIGURED); ERR_FAIL_COND_V_MSG(requesting, ERR_BUSY, "HTTPRequest is processing a request. 
Wait for completion or cancel it before attempting a new one."); @@ -104,7 +149,14 @@ Error HTTPRequest::request(const String &p_url, const Vector &p_custom_h headers = p_custom_headers; - request_data = p_request_data; + if (accept_gzip) { + // If the user has specified a different Accept-Encoding, don't overwrite it + if (!has_header(headers, "Accept-Encoding")) { + headers.push_back("Accept-Encoding: gzip, deflate"); + } + } + + request_data = p_request_data_raw; requesting = true; @@ -296,7 +348,7 @@ bool HTTPRequest::_update_connection() { } else { // Did not request yet, do request - Error err = client->request(method, request_string, headers, request_data); + Error err = client->request_raw(method, request_string, headers, request_data); if (err != OK) { call_deferred("_request_done", RESULT_CONNECTION_ERROR, 0, PoolStringArray(), PoolByteArray()); return true; @@ -398,10 +450,47 @@ bool HTTPRequest::_update_connection() { ERR_FAIL_V(false); } -void HTTPRequest::_request_done(int p_status, int p_code, const PoolStringArray &headers, const PoolByteArray &p_data) { - +void HTTPRequest::_request_done(int p_status, int p_code, const PoolStringArray &p_headers, const PoolByteArray &p_data) { cancel_request(); - emit_signal("request_completed", p_status, p_code, headers, p_data); + + // Determine if the request body is compressed + bool is_compressed; + String content_encoding = get_header_value(p_headers, "Content-Encoding").to_lower(); + Compression::Mode mode; + if (content_encoding == "gzip") { + mode = Compression::Mode::MODE_GZIP; + is_compressed = true; + } else if (content_encoding == "deflate") { + mode = Compression::Mode::MODE_DEFLATE; + is_compressed = true; + } else { + is_compressed = false; + } + + const PoolByteArray *data = NULL; + + if (accept_gzip && is_compressed && p_data.size() > 0) { + // Decompress request body + PoolByteArray *decompressed = memnew(PoolByteArray); + int result = Compression::decompress_dynamic(decompressed, body_size_limit, p_data.read().ptr(), p_data.size(), mode); + if (result == OK) { + data = decompressed; + } else if (result == -5) { + WARN_PRINT("Decompressed size of HTTP response body exceeded body_size_limit"); + p_status = RESULT_BODY_SIZE_LIMIT_EXCEEDED; + // Just return the raw data if we failed to decompress it + data = &p_data; + } else { + WARN_PRINT("Failed to decompress HTTP response body"); + p_status = RESULT_BODY_DECOMPRESS_FAILED; + // Just return the raw data if we failed to decompress it + data = &p_data; + } + } else { + data = &p_data; + } + + emit_signal("request_completed", p_status, p_code, p_headers, *data); } void HTTPRequest::_notification(int p_what) { @@ -425,6 +514,14 @@ void HTTPRequest::_notification(int p_what) { } } +void HTTPRequest::set_accept_gzip(bool p_gzip) { + accept_gzip = p_gzip; +} + +bool HTTPRequest::is_accepting_gzip() const { + return accept_gzip; +} + void HTTPRequest::set_use_threads(bool p_use) { ERR_FAIL_COND(get_http_client_status() != HTTPClient::STATUS_DISCONNECTED); @@ -513,6 +610,7 @@ void HTTPRequest::_timeout() { void HTTPRequest::_bind_methods() { ClassDB::bind_method(D_METHOD("request", "url", "custom_headers", "ssl_validate_domain", "method", "request_data"), &HTTPRequest::request, DEFVAL(PoolStringArray()), DEFVAL(true), DEFVAL(HTTPClient::METHOD_GET), DEFVAL(String())); + ClassDB::bind_method(D_METHOD("request_raw", "url", "custom_headers", "ssl_validate_domain", "method", "request_data_raw"), &HTTPRequest::request_raw, DEFVAL(PoolStringArray()), DEFVAL(true), 
DEFVAL(HTTPClient::METHOD_GET), DEFVAL(PoolVector())); ClassDB::bind_method(D_METHOD("cancel_request"), &HTTPRequest::cancel_request); ClassDB::bind_method(D_METHOD("get_http_client_status"), &HTTPRequest::get_http_client_status); @@ -520,6 +618,9 @@ void HTTPRequest::_bind_methods() { ClassDB::bind_method(D_METHOD("set_use_threads", "enable"), &HTTPRequest::set_use_threads); ClassDB::bind_method(D_METHOD("is_using_threads"), &HTTPRequest::is_using_threads); + ClassDB::bind_method(D_METHOD("set_accept_gzip", "enable"), &HTTPRequest::set_accept_gzip); + ClassDB::bind_method(D_METHOD("is_accepting_gzip"), &HTTPRequest::is_accepting_gzip); + ClassDB::bind_method(D_METHOD("set_body_size_limit", "bytes"), &HTTPRequest::set_body_size_limit); ClassDB::bind_method(D_METHOD("get_body_size_limit"), &HTTPRequest::get_body_size_limit); @@ -546,6 +647,7 @@ void HTTPRequest::_bind_methods() { ADD_PROPERTY(PropertyInfo(Variant::STRING, "download_file", PROPERTY_HINT_FILE), "set_download_file", "get_download_file"); ADD_PROPERTY(PropertyInfo(Variant::INT, "download_chunk_size", PROPERTY_HINT_RANGE, "256,16777216"), "set_download_chunk_size", "get_download_chunk_size"); ADD_PROPERTY(PropertyInfo(Variant::BOOL, "use_threads"), "set_use_threads", "is_using_threads"); + ADD_PROPERTY(PropertyInfo(Variant::BOOL, "accept_gzip"), "set_accept_gzip", "is_accepting_gzip"); ADD_PROPERTY(PropertyInfo(Variant::INT, "body_size_limit", PROPERTY_HINT_RANGE, "-1,2000000000"), "set_body_size_limit", "get_body_size_limit"); ADD_PROPERTY(PropertyInfo(Variant::INT, "max_redirects", PROPERTY_HINT_RANGE, "-1,64"), "set_max_redirects", "get_max_redirects"); ADD_PROPERTY(PropertyInfo(Variant::INT, "timeout", PROPERTY_HINT_RANGE, "0,86400"), "set_timeout", "get_timeout"); @@ -577,6 +679,7 @@ HTTPRequest::HTTPRequest() { got_response = false; validate_ssl = false; use_ssl = false; + accept_gzip = true; response_code = 0; request_sent = false; requesting = false; diff --git a/scene/main/http_request.h b/scene/main/http_request.h index d5cdbcbed41b..6715a8eb6075 100644 --- a/scene/main/http_request.h +++ b/scene/main/http_request.h @@ -52,6 +52,7 @@ class HTTPRequest : public Node { RESULT_SSL_HANDSHAKE_ERROR, RESULT_NO_RESPONSE, RESULT_BODY_SIZE_LIMIT_EXCEEDED, + RESULT_BODY_DECOMPRESS_FAILED, RESULT_REQUEST_FAILED, RESULT_DOWNLOAD_FILE_CANT_OPEN, RESULT_DOWNLOAD_FILE_WRITE_ERROR, @@ -70,13 +71,14 @@ class HTTPRequest : public Node { bool validate_ssl; bool use_ssl; HTTPClient::Method method; - String request_data; + PoolVector request_data; bool request_sent; Ref client; PoolByteArray body; SafeFlag use_threads; + bool accept_gzip; bool got_response; int response_code; PoolVector response_headers; @@ -104,12 +106,15 @@ class HTTPRequest : public Node { Error _parse_url(const String &p_url); Error _request(); + bool has_header(const Vector &p_headers, const String &p_header_name); + String get_header_value(const PoolStringArray &p_headers, const String &header_name); + SafeFlag thread_done; SafeFlag thread_request_quit; Thread thread; - void _request_done(int p_status, int p_code, const PoolStringArray &headers, const PoolByteArray &p_data); + void _request_done(int p_status, int p_code, const PoolStringArray &p_headers, const PoolByteArray &p_data); static void _thread_func(void *p_userdata); protected: @@ -118,12 +123,17 @@ class HTTPRequest : public Node { public: Error request(const String &p_url, const Vector &p_custom_headers = Vector(), bool p_ssl_validate_domain = true, HTTPClient::Method p_method = HTTPClient::METHOD_GET, 
const String &p_request_data = ""); //connects to a full url and perform request + Error request_raw(const String &p_url, const Vector &p_custom_headers = Vector(), bool p_ssl_validate_domain = true, HTTPClient::Method p_method = HTTPClient::METHOD_GET, const PoolVector &p_request_data_raw = PoolVector()); //connects to a full url and perform request void cancel_request(); + HTTPClient::Status get_http_client_status() const; void set_use_threads(bool p_use); bool is_using_threads() const; + void set_accept_gzip(bool p_gzip); + bool is_accepting_gzip() const; + void set_download_file(const String &p_file); String get_download_file() const;