From be83510ca1a4cee2b539e5de65eb1187138f3860 Mon Sep 17 00:00:00 2001
From: Juan Linietsky <reduzio@gmail.com>
Date: Sun, 25 Feb 2024 10:43:08 +0100
Subject: [PATCH] Fixes to divergent control flow on Intel.

---
 .../renderer_rd/shaders/environment/gi.glsl   | 287 ++++++------
 .../shaders/environment/hddagi_integrate.glsl | 353 ++++++++-------
 .../environment/hddagi_preprocess.glsl        | 424 +++++++++---------
 3 files changed, 545 insertions(+), 519 deletions(-)

diff --git a/servers/rendering/renderer_rd/shaders/environment/gi.glsl b/servers/rendering/renderer_rd/shaders/environment/gi.glsl
index 8b65b5bdc801..bb4b9ab65d67 100644
--- a/servers/rendering/renderer_rd/shaders/environment/gi.glsl
+++ b/servers/rendering/renderer_rd/shaders/environment/gi.glsl
@@ -1100,6 +1100,7 @@ void main() {
 #if defined(USE_HDDAGI) || defined(USE_VOXEL_GI_INSTANCES)
 
 	uint vrs_x, vrs_y;
+	bool thread_active = true;
 #ifdef USE_VRS
 	if (sc_use_vrs) {
 		ivec2 vrs_pos;
@@ -1117,198 +1118,208 @@ void main() {
 		vrs_y = 1 << (vrs_texel & 3);
 
 		if (mod(pos.x, vrs_x) != 0) {
-			return;
+			thread_active = false;
 		}
 
 		if (mod(pos.y, vrs_y) != 0) {
-			return;
+			thread_active = false;
 		}
 	}
 #endif
 
-	if (sc_half_res) {
-		pos <<= 1;
-	}
+	if (thread_active) {
+		if (sc_half_res) {
+			pos <<= 1;
+		}
 
-	if (any(greaterThanEqual(pos, scene_data.screen_size))) { //too large, do nothing
-		return;
+		if (any(greaterThanEqual(pos, scene_data.screen_size))) { //too large, do nothing
+			thread_active = false;
+		}
 	}
 
 	vec4 ambient_light = vec4(0.0);
 	vec4 reflection_light = vec4(0.0);
-
-	vec3 vertex;
-	vec3 normal;
 	float roughness;
 
-	bool found_vertex = false;
+	if (thread_active) {
+		vec3 vertex;
+		vec3 normal;
 
-	vertex = reconstruct_position(pos);
-	vec4 normal_roughness = fetch_normal_and_roughness(pos);
-	found_vertex = length(normal_roughness.xyz) > 0.5;
-	normal = normal_roughness.xyz;
-	roughness = normal_roughness.w;
-	bool dynamic_object = roughness > 0.5;
-	if (dynamic_object) {
-		roughness = 1.0 - roughness;
-	}
-	roughness /= (127.0 / 255.0);
-	vertex.y = -vertex.y;
+		bool found_vertex = false;
 
-	if (found_vertex) {
-		process_gi(pos, vertex, normal, roughness, dynamic_object, ambient_light, reflection_light);
-	}
+		vertex = reconstruct_position(pos);
+		vec4 normal_roughness = fetch_normal_and_roughness(pos);
+		found_vertex = length(normal_roughness.xyz) > 0.5;
+		normal = normal_roughness.xyz;
+		roughness = normal_roughness.w;
+		bool dynamic_object = roughness > 0.5;
+		if (dynamic_object) {
+			roughness = 1.0 - roughness;
+		}
+		roughness /= (127.0 / 255.0);
+		vertex.y = -vertex.y;
+
+		if (found_vertex) {
+			process_gi(pos, vertex, normal, roughness, dynamic_object, ambient_light, reflection_light);
+		}
 
 #ifdef USE_HDDAGI
 
-	// If using reflections, blend the 4 adjacent pixels to get rid of dither
-	uint group_pos = gl_LocalInvocationID.y * GROUP_SIZE + gl_LocalInvocationID.x;
-	group_positions[group_pos] = vertex;
-	group_normals[group_pos] = normal;
-	group_reflections[group_pos] = reflection_light;
+		// If using reflections, blend the 4 adjacent pixels to get rid of dither
+		uint group_pos = gl_LocalInvocationID.y * GROUP_SIZE + gl_LocalInvocationID.x;
+		group_positions[group_pos] = vertex;
+		group_normals[group_pos] = normal;
+		group_reflections[group_pos] = reflection_light;
+#endif
+	}
 
 	memoryBarrierShared();
 	barrier();
 
-	if (roughness < ROUGHNESS_TO_REFLECTION_TRESHOOLD) {
-		uvec2 local_group_pos_base = gl_LocalInvocationID.xy - (gl_LocalInvocationID.xy % DITHER_SIZE);
-		uint local_group_pos = local_group_pos_base.y * GROUP_SIZE + local_group_pos_base.x;
-
-		vec3 positions[DITHER_SIZE * DITHER_SIZE];
-		vec3 normals[DITHER_SIZE * DITHER_SIZE];
+#ifdef USE_HDDAGI
 
-		vec4 average = vec4(0.0);
-		for (int i = 0; i < DITHER_SIZE; i++) {
-			for (int j = 0; j < DITHER_SIZE; j++) {
-				uint src_pos = local_group_pos + i * GROUP_SIZE + j;
-				normals[i * DITHER_SIZE + j] = group_normals[src_pos];
-				positions[i * DITHER_SIZE + j] = group_positions[src_pos];
-				average += group_reflections[src_pos];
+	if (thread_active) {
+		if (roughness < ROUGHNESS_TO_REFLECTION_TRESHOOLD) {
+			uvec2 local_group_pos_base = gl_LocalInvocationID.xy - (gl_LocalInvocationID.xy % DITHER_SIZE);
+			uint local_group_pos = local_group_pos_base.y * GROUP_SIZE + local_group_pos_base.x;
+
+			vec3 positions[DITHER_SIZE * DITHER_SIZE];
+			vec3 normals[DITHER_SIZE * DITHER_SIZE];
+
+			vec4 average = vec4(0.0);
+			for (int i = 0; i < DITHER_SIZE; i++) {
+				for (int j = 0; j < DITHER_SIZE; j++) {
+					uint src_pos = local_group_pos + i * GROUP_SIZE + j;
+					normals[i * DITHER_SIZE + j] = group_normals[src_pos];
+					positions[i * DITHER_SIZE + j] = group_positions[src_pos];
+					average += group_reflections[src_pos];
+				}
 			}
-		}
 
-		average /= 4.0;
+			average /= float(DITHER_SIZE * DITHER_SIZE); // Normalize by the number of samples summed in the loop above (4 when DITHER_SIZE == 2).
 
-		const int subgroup_count = (DITHER_SIZE - 1) * (DITHER_SIZE - 1);
-		uvec4 subgroups[subgroup_count] = uvec4[](
+			const int subgroup_count = (DITHER_SIZE - 1) * (DITHER_SIZE - 1);
+			uvec4 subgroups[subgroup_count] = uvec4[](
 #if DITHER_SIZE == 2
-				uvec4(0, 1, 2, 3)
+					uvec4(0, 1, 2, 3)
 #elif DITHER_SIZE == 3
-				uvec4(0, 1, 3, 4), uvec4(1, 2, 4, 5), uvec4(3, 4, 6, 7), uvec4(4, 5, 7, 8)
+					uvec4(0, 1, 3, 4), uvec4(1, 2, 4, 5), uvec4(3, 4, 6, 7), uvec4(4, 5, 7, 8)
 #endif
-		);
-
-		const float same_plane_threshold = 0.9659258262890683; // 15 degrees tolerance
+			);
+
+			const float same_plane_threshold = 0.9659258262890683; // 15 degrees tolerance
+
+			float weight = 1.0;
+			for (int i = 0; i < subgroup_count; i++) {
+				uvec4 sg = subgroups[i];
+				// Weight positions in plane.
+				vec3 p[4] = vec3[](positions[sg.x], positions[sg.y], positions[sg.z], positions[sg.w]);
+				vec3 n1 = normalize(cross(p[0] - p[2], p[0] - p[1]));
+				vec3 n2 = normalize(cross(p[2] - p[3], p[2] - p[1]));
+				weight *= max(0.0, smoothstep(same_plane_threshold, 1, dot(n1, n2)));
+
+				// Weight normal difference.
+				vec3 n[4] = vec3[](normals[sg.x], normals[sg.y], normals[sg.z], normals[sg.w]);
+				weight *= max(0.0, smoothstep(same_plane_threshold, 1, length((n[0] + n[1] + n[2] + n[3]) / 4.0)));
+			}
 
-		float weight = 1.0;
-		for (int i = 0; i < subgroup_count; i++) {
-			uvec4 sg = subgroups[i];
-			// Weight positions in plane.
-			vec3 p[4] = vec3[](positions[sg.x], positions[sg.y], positions[sg.z], positions[sg.w]);
-			vec3 n1 = normalize(cross(p[0] - p[2], p[0] - p[1]));
-			vec3 n2 = normalize(cross(p[2] - p[3], p[2] - p[1]));
-			weight *= max(0.0, smoothstep(same_plane_threshold, 1, dot(n1, n2)));
-
-			// Weight normal difference.
-			vec3 n[4] = vec3[](normals[sg.x], normals[sg.y], normals[sg.z], normals[sg.w]);
-			weight *= max(0.0, smoothstep(same_plane_threshold, 1, length((n[0] + n[1] + n[2] + n[3]) / 4.0)));
+			reflection_light = mix(reflection_light, average, weight);
 		}
-
-		reflection_light = mix(reflection_light, average, weight);
 	}
 #endif
 
-	if (sc_half_res) {
-		pos >>= 1;
-	}
+	if (thread_active) {
+		if (sc_half_res) {
+			pos >>= 1;
+		}
 
-	uint ambient_rgbe = rgbe_encode(ambient_light.rgb);
-	uint reflection_rgbe = rgbe_encode(reflection_light.rgb);
-	uint blend = uint(clamp(reflection_light.a * 0xF, 0, 0xF)) | (uint(clamp(ambient_light.a * 0xF, 0, 0xF)) << 4);
+		uint ambient_rgbe = rgbe_encode(ambient_light.rgb);
+		uint reflection_rgbe = rgbe_encode(reflection_light.rgb);
+		uint blend = uint(clamp(reflection_light.a * 0xF, 0, 0xF)) | (uint(clamp(ambient_light.a * 0xF, 0, 0xF)) << 4);
 
-	imageStore(ambient_buffer, pos, uvec4(ambient_rgbe));
-	imageStore(reflection_buffer, pos, uvec4(reflection_rgbe));
-	imageStore(blend_buffer, pos, vec4(ambient_light.a, reflection_light.a, 0, 0));
+		imageStore(ambient_buffer, pos, uvec4(ambient_rgbe));
+		imageStore(reflection_buffer, pos, uvec4(reflection_rgbe));
+		imageStore(blend_buffer, pos, vec4(ambient_light.a, reflection_light.a, 0, 0));
 
 #ifdef USE_VRS
-	if (sc_use_vrs) {
-		if (vrs_x > 1) {
-			imageStore(ambient_buffer, pos + ivec2(1, 0), uvec4(ambient_rgbe));
-			imageStore(reflection_buffer, pos + ivec2(1, 0), uvec4(reflection_rgbe));
-			imageStore(blend_buffer, pos + ivec2(1, 0), uvec4(blend));
-		}
+		if (sc_use_vrs) {
+			if (vrs_x > 1) {
+				imageStore(ambient_buffer, pos + ivec2(1, 0), uvec4(ambient_rgbe));
+				imageStore(reflection_buffer, pos + ivec2(1, 0), uvec4(reflection_rgbe));
+				imageStore(blend_buffer, pos + ivec2(1, 0), uvec4(blend));
+			}
 
-		if (vrs_x > 2) {
-			imageStore(ambient_buffer, pos + ivec2(2, 0), uvec4(ambient_rgbe));
-			imageStore(reflection_buffer, pos + ivec2(2, 0), uvec4(reflection_rgbe));
-			imageStore(blend_buffer, pos + ivec2(2, 0), uvec4(blend));
+			if (vrs_x > 2) {
+				imageStore(ambient_buffer, pos + ivec2(2, 0), uvec4(ambient_rgbe));
+				imageStore(reflection_buffer, pos + ivec2(2, 0), uvec4(reflection_rgbe));
+				imageStore(blend_buffer, pos + ivec2(2, 0), uvec4(blend));
 
-			imageStore(ambient_buffer, pos + ivec2(3, 0), uvec4(ambient_rgbe));
-			imageStore(reflection_buffer, pos + ivec2(3, 0), uvec4(reflection_rgbe));
-			imageStore(blend_buffer, pos + ivec2(3, 0), uvec4(blend));
-		}
+				imageStore(ambient_buffer, pos + ivec2(3, 0), uvec4(ambient_rgbe));
+				imageStore(reflection_buffer, pos + ivec2(3, 0), uvec4(reflection_rgbe));
+				imageStore(blend_buffer, pos + ivec2(3, 0), uvec4(blend));
+			}
 
-		if (vrs_y > 1) {
-			imageStore(ambient_buffer, pos + ivec2(0, 1), uvec4(ambient_rgbe));
-			imageStore(reflection_buffer, pos + ivec2(0, 1), uvec4(reflection_rgbe));
-			imageStore(blend_buffer, pos + ivec2(0, 1), uvec4(blend));
-		}
+			if (vrs_y > 1) {
+				imageStore(ambient_buffer, pos + ivec2(0, 1), uvec4(ambient_rgbe));
+				imageStore(reflection_buffer, pos + ivec2(0, 1), uvec4(reflection_rgbe));
+				imageStore(blend_buffer, pos + ivec2(0, 1), uvec4(blend));
+			}
 
-		if (vrs_y > 1 && vrs_x > 1) {
-			imageStore(ambient_buffer, pos + ivec2(1, 1), uvec4(ambient_rgbe));
-			imageStore(reflection_buffer, pos + ivec2(1, 1), uvec4(reflection_rgbe));
-			imageStore(blend_buffer, pos + ivec2(1, 1), uvec4(blend));
-		}
+			if (vrs_y > 1 && vrs_x > 1) {
+				imageStore(ambient_buffer, pos + ivec2(1, 1), uvec4(ambient_rgbe));
+				imageStore(reflection_buffer, pos + ivec2(1, 1), uvec4(reflection_rgbe));
+				imageStore(blend_buffer, pos + ivec2(1, 1), uvec4(blend));
+			}
 
-		if (vrs_y > 1 && vrs_x > 2) {
-			imageStore(ambient_buffer, pos + ivec2(2, 1), uvec4(ambient_rgbe));
-			imageStore(reflection_buffer, pos + ivec2(2, 1), uvec4(reflection_rgbe));
-			imageStore(blend_buffer, pos + ivec2(2, 1), uvec4(blend));
+			if (vrs_y > 1 && vrs_x > 2) {
+				imageStore(ambient_buffer, pos + ivec2(2, 1), uvec4(ambient_rgbe));
+				imageStore(reflection_buffer, pos + ivec2(2, 1), uvec4(reflection_rgbe));
+				imageStore(blend_buffer, pos + ivec2(2, 1), uvec4(blend));
 
-			imageStore(ambient_buffer, pos + ivec2(3, 1), uvec4(ambient_rgbe));
-			imageStore(reflection_buffer, pos + ivec2(3, 1), uvec4(reflection_rgbe));
-			imageStore(blend_buffer, pos + ivec2(3, 1), uvec4(blend));
-		}
+				imageStore(ambient_buffer, pos + ivec2(3, 1), uvec4(ambient_rgbe));
+				imageStore(reflection_buffer, pos + ivec2(3, 1), uvec4(reflection_rgbe));
+				imageStore(blend_buffer, pos + ivec2(3, 1), uvec4(blend));
+			}
 
-		if (vrs_y > 2) {
-			imageStore(ambient_buffer, pos + ivec2(0, 2), uvec4(ambient_rgbe));
-			imageStore(reflection_buffer, pos + ivec2(0, 2), uvec4(reflection_rgbe));
-			imageStore(blend_buffer, pos + ivec2(0, 2), uvec4(blend));
+			if (vrs_y > 2) {
+				imageStore(ambient_buffer, pos + ivec2(0, 2), uvec4(ambient_rgbe));
+				imageStore(reflection_buffer, pos + ivec2(0, 2), uvec4(reflection_rgbe));
+				imageStore(blend_buffer, pos + ivec2(0, 2), uvec4(blend));
 
-			imageStore(ambient_buffer, pos + ivec2(0, 3), uvec4(ambient_rgbe));
-			imageStore(reflection_buffer, pos + ivec2(0, 3), uvec4(reflection_rgbe));
-			imageStore(blend_buffer, pos + ivec2(0, 3), uvec4(blend));
-		}
+				imageStore(ambient_buffer, pos + ivec2(0, 3), uvec4(ambient_rgbe));
+				imageStore(reflection_buffer, pos + ivec2(0, 3), uvec4(reflection_rgbe));
+				imageStore(blend_buffer, pos + ivec2(0, 3), uvec4(blend));
+			}
 
-		if (vrs_y > 2 && vrs_x > 1) {
-			imageStore(ambient_buffer, pos + ivec2(1, 2), uvec4(ambient_rgbe));
-			imageStore(reflection_buffer, pos + ivec2(1, 2), uvec4(reflection_rgbe));
-			imageStore(blend_buffer, pos + ivec2(1, 2), uvec4(blend));
+			if (vrs_y > 2 && vrs_x > 1) {
+				imageStore(ambient_buffer, pos + ivec2(1, 2), uvec4(ambient_rgbe));
+				imageStore(reflection_buffer, pos + ivec2(1, 2), uvec4(reflection_rgbe));
+				imageStore(blend_buffer, pos + ivec2(1, 2), uvec4(blend));
 
-			imageStore(ambient_buffer, pos + ivec2(1, 3), uvec4(ambient_rgbe));
-			imageStore(reflection_buffer, pos + ivec2(1, 3), uvec4(reflection_rgbe));
-			imageStore(blend_buffer, pos + ivec2(1, 3), uvec4(blend));
-		}
+				imageStore(ambient_buffer, pos + ivec2(1, 3), uvec4(ambient_rgbe));
+				imageStore(reflection_buffer, pos + ivec2(1, 3), uvec4(reflection_rgbe));
+				imageStore(blend_buffer, pos + ivec2(1, 3), uvec4(blend));
+			}
 
-		if (vrs_y > 2 && vrs_x > 2) {
-			imageStore(ambient_buffer, pos + ivec2(2, 2), uvec4(ambient_rgbe));
-			imageStore(reflection_buffer, pos + ivec2(2, 2), uvec4(reflection_rgbe));
-			imageStore(blend_buffer, pos + ivec2(2, 2), uvec4(blend));
+			if (vrs_y > 2 && vrs_x > 2) {
+				imageStore(ambient_buffer, pos + ivec2(2, 2), uvec4(ambient_rgbe));
+				imageStore(reflection_buffer, pos + ivec2(2, 2), uvec4(reflection_rgbe));
+				imageStore(blend_buffer, pos + ivec2(2, 2), uvec4(blend));
 
-			imageStore(ambient_buffer, pos + ivec2(2, 3), uvec4(ambient_rgbe));
-			imageStore(reflection_buffer, pos + ivec2(2, 3), uvec4(reflection_rgbe));
-			imageStore(blend_buffer, pos + ivec2(2, 3), uvec4(blend));
+				imageStore(ambient_buffer, pos + ivec2(2, 3), uvec4(ambient_rgbe));
+				imageStore(reflection_buffer, pos + ivec2(2, 3), uvec4(reflection_rgbe));
+				imageStore(blend_buffer, pos + ivec2(2, 3), uvec4(blend));
 
-			imageStore(ambient_buffer, pos + ivec2(3, 2), uvec4(ambient_rgbe));
-			imageStore(reflection_buffer, pos + ivec2(3, 2), uvec4(reflection_rgbe));
-			imageStore(blend_buffer, pos + ivec2(3, 2), uvec4(blend));
+				imageStore(ambient_buffer, pos + ivec2(3, 2), uvec4(ambient_rgbe));
+				imageStore(reflection_buffer, pos + ivec2(3, 2), uvec4(reflection_rgbe));
+				imageStore(blend_buffer, pos + ivec2(3, 2), uvec4(blend));
 
-			imageStore(ambient_buffer, pos + ivec2(3, 3), uvec4(ambient_rgbe));
-			imageStore(reflection_buffer, pos + ivec2(3, 3), uvec4(reflection_rgbe));
-			imageStore(blend_buffer, pos + ivec2(3, 3), uvec4(blend));
+				imageStore(ambient_buffer, pos + ivec2(3, 3), uvec4(ambient_rgbe));
+				imageStore(reflection_buffer, pos + ivec2(3, 3), uvec4(reflection_rgbe));
+				imageStore(blend_buffer, pos + ivec2(3, 3), uvec4(blend));
+			}
 		}
-	}
 #endif
-
+	}
 #endif
 }
diff --git a/servers/rendering/renderer_rd/shaders/environment/hddagi_integrate.glsl b/servers/rendering/renderer_rd/shaders/environment/hddagi_integrate.glsl
index 352b2607f87c..699073cac6ab 100644
--- a/servers/rendering/renderer_rd/shaders/environment/hddagi_integrate.glsl
+++ b/servers/rendering/renderer_rd/shaders/environment/hddagi_integrate.glsl
@@ -488,155 +488,164 @@ void main() {
 	memoryBarrierShared();
 	barrier();
 
+	bool thread_active = true;
+	vec3 light;
+	ivec3 cache_texture_pos;
+	vec3 ray_dir;
+	vec2 sample_ofs;
+	vec3 ray_pos;
+	bool hit;
+	ivec3 hit_cell;
+	int hit_cascade;
+	bool cache_valid;
+	vec3 cache_invalidated_debug;
+	uint cache_entry;
+
 	if (probe_history_index < 0) {
-		return;
-	}
+		thread_active = false;
+	} else {
+		float probe_cell_size = float(params.grid_size.x) / float(params.probe_axis_size.x - 1) / cascades.data[params.cascade].to_cell;
 
-	float probe_cell_size = float(params.grid_size.x) / float(params.probe_axis_size.x - 1) / cascades.data[params.cascade].to_cell;
+		ray_pos = cascades.data[params.cascade].offset + vec3(probe_cell) * probe_cell_size;
 
-	vec3 ray_pos = cascades.data[params.cascade].offset + vec3(probe_cell) * probe_cell_size;
+		// Ensure a unique hash that includes the probe world position, the local octahedron pixel, and the history frame index
+		uvec3 h3 = hash3(uvec3((uvec3(probe_world_pos) * LIGHTPROBE_OCT_SIZE * LIGHTPROBE_OCT_SIZE + uvec3(probe_index)) * uvec3(params.history_size) + uvec3(probe_history_index)));
+		uint h = (h3.x ^ h3.y) ^ h3.z;
+		sample_ofs = vec2(ivec2(h >> 16, h & 0xFFFF)) / vec2(0xFFFF);
+		ray_dir = octahedron_decode((vec2(local_pos) + sample_ofs) / vec2(LIGHTPROBE_OCT_SIZE));
 
-	// Ensure a unique hash that includes the probe world position, the local octahedron pixel, and the history frame index
-	uvec3 h3 = hash3(uvec3((uvec3(probe_world_pos) * LIGHTPROBE_OCT_SIZE * LIGHTPROBE_OCT_SIZE + uvec3(probe_index)) * uvec3(params.history_size) + uvec3(probe_history_index)));
-	uint h = (h3.x ^ h3.y) ^ h3.z;
-	vec2 sample_ofs = vec2(ivec2(h >> 16, h & 0xFFFF)) / vec2(0xFFFF);
-	vec3 ray_dir = octahedron_decode((vec2(local_pos) + sample_ofs) / vec2(LIGHTPROBE_OCT_SIZE));
+		ray_dir.y *= params.y_mult;
+		ray_dir = normalize(ray_dir);
 
-	ray_dir.y *= params.y_mult;
-	ray_dir = normalize(ray_dir);
+		// Apply bias (by a cell)
+		float bias = params.ray_bias;
+		vec3 abs_ray_dir = abs(ray_dir);
+		ray_pos += ray_dir * 1.0 / max(abs_ray_dir.x, max(abs_ray_dir.y, abs_ray_dir.z)) * bias / cascades.data[params.cascade].to_cell;
 
-	// Apply bias (by a cell)
-	float bias = params.ray_bias;
-	vec3 abs_ray_dir = abs(ray_dir);
-	ray_pos += ray_dir * 1.0 / max(abs_ray_dir.x, max(abs_ray_dir.y, abs_ray_dir.z)) * bias / cascades.data[params.cascade].to_cell;
+		cache_texture_pos = ivec3(probe_texture_pos.xy * LIGHTPROBE_OCT_SIZE + local_pos, probe_texture_pos.z * params.history_size + probe_history_index);
+		cache_entry = imageLoad(ray_hit_cache, cache_texture_pos).r;
 
-	ivec3 cache_texture_pos = ivec3(probe_texture_pos.xy * LIGHTPROBE_OCT_SIZE + local_pos, probe_texture_pos.z * params.history_size + probe_history_index);
-	uint cache_entry = imageLoad(ray_hit_cache, cache_texture_pos).r;
+		cache_valid = bool(cache_entry & CACHE_IS_VALID);
 
-	bool hit;
-	ivec3 hit_cell;
-	int hit_cascade;
+		cache_invalidated_debug = vec3(0.0);
 
-	bool cache_valid = bool(cache_entry & CACHE_IS_VALID);
-
-	vec3 cache_invalidated_debug = vec3(0.0);
-
-	if (cache_valid) {
-		// Make sure the cache is really valid
-		hit = bool(cache_entry & CACHE_IS_HIT);
-		uvec4 uhit = (uvec4(cache_entry) >> uvec4(0, 8, 16, 24)) & uvec4(0xFF, 0xFF, 0xFF, 0x7);
-		hit_cell = ivec3(uhit.xyz);
-		hit_cascade = int(uhit.w);
-		uint axis = (cache_entry >> 27) & 0x3;
-		if (bool((1 << axis) & params.motion_accum)) {
-			// There was motion in this axis, cache is no longer valid.
-			cache_valid = false;
-			cache_invalidated_debug = vec3(0, 0, 4.0);
-		} else if (hit) {
-			// Check if the region pointed to is still valid.
-			uint version = imageLoad(ray_hit_cache_version, cache_texture_pos).r;
-			uint region_version = imageLoad(region_versions, (hit_cell / REGION_SIZE) + ivec3(0, hit_cascade * (params.grid_size.y / REGION_SIZE), 0)).r;
-
-			if (region_version != version) {
+		if (cache_valid) {
+			// Make sure the cache is really valid
+			hit = bool(cache_entry & CACHE_IS_HIT);
+			uvec4 uhit = (uvec4(cache_entry) >> uvec4(0, 8, 16, 24)) & uvec4(0xFF, 0xFF, 0xFF, 0x7);
+			hit_cell = ivec3(uhit.xyz);
+			hit_cascade = int(uhit.w);
+			uint axis = (cache_entry >> 27) & 0x3;
+			if (bool((1 << axis) & params.motion_accum)) {
+				// There was motion in this axis, cache is no longer valid.
 				cache_valid = false;
-				cache_invalidated_debug = (hit_cascade == params.cascade) ? vec3(0.0, 4.00, 0.0) : vec3(4.0, 0, 0.0);
+				cache_invalidated_debug = vec3(0, 0, 4.0);
+			} else if (hit) {
+				// Check if the region pointed to is still valid.
+				uint version = imageLoad(ray_hit_cache_version, cache_texture_pos).r;
+				uint region_version = imageLoad(region_versions, (hit_cell / REGION_SIZE) + ivec3(0, hit_cascade * (params.grid_size.y / REGION_SIZE), 0)).r;
+
+				if (region_version != version) {
+					cache_valid = false;
+					cache_invalidated_debug = (hit_cascade == params.cascade) ? vec3(0.0, 4.00, 0.0) : vec3(4.0, 0, 0.0);
+				}
 			}
 		}
-	}
 
-	if (!cache_valid) {
-		ivec3 hit_face;
-		hit = trace_ray_hdda(ray_pos, ray_dir, params.cascade, hit_cell, hit_face, hit_cascade);
-		if (hit) {
-			hit_cell += hit_face;
+		if (!cache_valid) {
+			ivec3 hit_face;
+			hit = trace_ray_hdda(ray_pos, ray_dir, params.cascade, hit_cell, hit_face, hit_cascade);
+			if (hit) {
+				hit_cell += hit_face;
 
-			ivec3 reg_cell_offset = cascades.data[hit_cascade].region_world_offset * REGION_SIZE;
-			hit_cell = (hit_cell + reg_cell_offset) & (params.grid_size - 1); // Read from wrapped world coordinates
+				ivec3 reg_cell_offset = cascades.data[hit_cascade].region_world_offset * REGION_SIZE;
+				hit_cell = (hit_cell + reg_cell_offset) & (params.grid_size - 1); // Read from wrapped world coordinates
+			}
 		}
-	}
-
-	vec3 light;
 
-	if (hit) {
-		ivec3 spos = hit_cell;
-		spos.y += hit_cascade * params.grid_size.y;
-		light = texelFetch(sampler3D(light_cascades, linear_sampler), spos, 0).rgb;
-	} else if (params.sky_mode == SKY_MODE_SKY) {
+		if (hit) {
+			ivec3 spos = hit_cell;
+			spos.y += hit_cascade * params.grid_size.y;
+			light = texelFetch(sampler3D(light_cascades, linear_sampler), spos, 0).rgb;
+		} else if (params.sky_mode == SKY_MODE_SKY) {
 #ifdef USE_CUBEMAP_ARRAY
-		light = textureLod(samplerCubeArray(sky_irradiance, linear_sampler_mipmaps), vec4(ray_dir, 0.0), 2.0).rgb; // Use second mipmap because we don't usually throw a lot of rays, so this compensates.
+			light = textureLod(samplerCubeArray(sky_irradiance, linear_sampler_mipmaps), vec4(ray_dir, 0.0), 2.0).rgb; // Use second mipmap because we don't usually throw a lot of rays, so this compensates.
 #else
-		light = textureLod(samplerCube(sky_irradiance, linear_sampler_mipmaps), ray_dir, 2.0).rgb; // Use second mipmap because we don't usually throw a lot of rays, so this compensates.
+			light = textureLod(samplerCube(sky_irradiance, linear_sampler_mipmaps), ray_dir, 2.0).rgb; // Use second mipmap because we don't usually throw a lot of rays, so this compensates.
 #endif
-		light *= params.sky_energy;
-	} else if (params.sky_mode == SKY_MODE_COLOR) {
-		light = params.sky_color;
-		light *= params.sky_energy;
-	} else {
-		light = vec3(0);
+			light *= params.sky_energy;
+		} else if (params.sky_mode == SKY_MODE_COLOR) {
+			light = params.sky_color;
+			light *= params.sky_energy;
+		} else {
+			light = vec3(0);
+		}
 	}
 
 	memoryBarrierShared();
 	barrier();
 
-	// Plot the light to the octahedron using bilinear filtering
+	if (thread_active) {
+		// Plot the light to the octahedron using bilinear filtering
 #ifdef TRACE_SUBPIXEL
-	sample_ofs = sample_ofs * 2.0 - 1.0;
-	ivec2 bilinear_base = ivec2(1) + local_pos - mix(ivec2(0), ivec2(1), lessThan(sample_ofs, vec2(0)));
-	vec2 blend = mix(sample_ofs, 1.0 + sample_ofs, lessThan(sample_ofs, vec2(0)));
-	for (int i = 0; i < 2; i++) {
-		float i_w = i == 0 ? 1.0 - blend.y : blend.y;
-		for (int j = 0; j < 2; j++) {
-			float j_w = j == 0 ? 1.0 - blend.x : blend.x;
-			uint wrap_neighbour = wrap_neighbours[(bilinear_base.y + i) * (LIGHTPROBE_OCT_SIZE + 2) + (bilinear_base.x + j)];
-			ivec2 write_to = ivec2(wrap_neighbour & 0xFFFF, wrap_neighbour >> 16);
-			int write_offset = write_to.y * LIGHTPROBE_OCT_SIZE + write_to.x;
-			float write_weight = i_w * j_w;
-
-			uvec3 lightu = uvec3(clamp((light * write_weight) * float(1 << FP_BITS), 0, float(FP_MAX)));
-			atomicAdd(neighbours_accum[write_offset].r, lightu.r);
-			atomicAdd(neighbours_accum[write_offset].g, lightu.g);
-			atomicAdd(neighbours_accum[write_offset].b, lightu.b);
+		sample_ofs = sample_ofs * 2.0 - 1.0;
+		ivec2 bilinear_base = ivec2(1) + local_pos - mix(ivec2(0), ivec2(1), lessThan(sample_ofs, vec2(0)));
+		vec2 blend = mix(sample_ofs, 1.0 + sample_ofs, lessThan(sample_ofs, vec2(0)));
+		for (int i = 0; i < 2; i++) {
+			float i_w = i == 0 ? 1.0 - blend.y : blend.y;
+			for (int j = 0; j < 2; j++) {
+				float j_w = j == 0 ? 1.0 - blend.x : blend.x;
+				uint wrap_neighbour = wrap_neighbours[(bilinear_base.y + i) * (LIGHTPROBE_OCT_SIZE + 2) + (bilinear_base.x + j)];
+				ivec2 write_to = ivec2(wrap_neighbour & 0xFFFF, wrap_neighbour >> 16);
+				int write_offset = write_to.y * LIGHTPROBE_OCT_SIZE + write_to.x;
+				float write_weight = i_w * j_w;
+
+				uvec3 lightu = uvec3(clamp((light * write_weight) * float(1 << FP_BITS), 0, float(FP_MAX)));
+				atomicAdd(neighbours_accum[write_offset].r, lightu.r);
+				atomicAdd(neighbours_accum[write_offset].g, lightu.g);
+				atomicAdd(neighbours_accum[write_offset].b, lightu.b);
+			}
 		}
-	}
 #else
 
-	neighbours[probe_index] = light;
+		neighbours[probe_index] = light;
 #endif
 
-	if (!cache_valid) {
-		cache_entry = CACHE_IS_VALID;
-		if (hit) {
-			// Determine the side of the cascade box this ray exited through, this is important for invalidation purposes.
+		if (!cache_valid) {
+			cache_entry = CACHE_IS_VALID;
+			if (hit) {
+				// Determine the side of the cascade box this ray exited through, this is important for invalidation purposes.
+
+				vec3 unit_pos = ray_pos - cascades.data[params.cascade].offset;
+				unit_pos *= cascades.data[params.cascade].to_cell;
+
+				vec3 t0 = -unit_pos / ray_dir;
+				vec3 t1 = (vec3(params.grid_size) - unit_pos) / ray_dir;
+				vec3 tmax = max(t0, t1);
+
+				uint axis;
+				float m;
+				if (tmax.x < tmax.y) {
+					axis = 0;
+					m = tmax.x;
+				} else {
+					axis = 1;
+					m = tmax.y;
+				}
+				if (tmax.z < m) {
+					axis = 2;
+				}
 
-			vec3 unit_pos = ray_pos - cascades.data[params.cascade].offset;
-			unit_pos *= cascades.data[params.cascade].to_cell;
+				uvec3 ucell = (uvec3(hit_cell) & uvec3(0xFF)) << uvec3(0, 8, 16);
+				cache_entry |= CACHE_IS_HIT | ucell.x | ucell.y | ucell.z | (uint(min(7, hit_cascade)) << 24) | (axis << 27);
 
-			vec3 t0 = -unit_pos / ray_dir;
-			vec3 t1 = (vec3(params.grid_size) - unit_pos) / ray_dir;
-			vec3 tmax = max(t0, t1);
+				uint region_version = imageLoad(region_versions, (hit_cell / REGION_SIZE) + ivec3(0, hit_cascade * (params.grid_size.y / REGION_SIZE), 0)).r; // Cell -> region index by division, the same lookup the cache validation path uses.
 
-			uint axis;
-			float m;
-			if (tmax.x < tmax.y) {
-				axis = 0;
-				m = tmax.x;
-			} else {
-				axis = 1;
-				m = tmax.y;
+				imageStore(ray_hit_cache_version, cache_texture_pos, uvec4(region_version));
 			}
-			if (tmax.z < m) {
-				axis = 2;
-			}
-
-			uvec3 ucell = (uvec3(hit_cell) & uvec3(0xFF)) << uvec3(0, 8, 16);
-			cache_entry |= CACHE_IS_HIT | ucell.x | ucell.y | ucell.z | (uint(min(7, hit_cascade)) << 24) | (axis << 27);
 
-			uint region_version = imageLoad(region_versions, (hit_cell >> REGION_SIZE) + ivec3(0, hit_cascade * (params.grid_size.y / REGION_SIZE), 0)).r;
-
-			imageStore(ray_hit_cache_version, cache_texture_pos, uvec4(region_version));
+			imageStore(ray_hit_cache, cache_texture_pos, uvec4(cache_entry));
 		}
-
-		imageStore(ray_hit_cache, cache_texture_pos, uvec4(cache_entry));
 	}
 
 	groupMemoryBarrier();
@@ -644,7 +653,7 @@ void main() {
 
 	// convert back to float and do moving average
 
-	{
+	if (thread_active) {
 #ifdef TRACE_SUBPIXEL
 		light = vec3(neighbours_accum[probe_index]) / float(1 << FP_BITS);
 #else
@@ -693,69 +702,71 @@ void main() {
 
 	// Compute specular, diffuse, ambient
 
-	vec3 diffuse_light = vec3(0);
-	vec3 specular_light = light;
-
-	for (uint i = 0; i < neighbour_max_weights; i++) {
-		uint n = neighbour_weights[probe_index * neighbour_max_weights + i];
-		uint index = n >> 16;
-		float weight = float(n & 0xFFFF) / float(0xFFFF);
-		diffuse_light += neighbours[index] * weight;
-	}
-
-	ivec3 store_texture_pos = ivec3(probe_texture_pos.xy * (LIGHTPROBE_OCT_SIZE + 2) + ivec2(1), probe_texture_pos.z);
-	ivec3 probe_read_pos = store_texture_pos + ivec3(local_pos, 0);
-
-	//if (cache_invalidated_debug!=vec3(0.0)) {
-	//	diffuse_light = cache_invalidated_debug;
-	//}
+	if (thread_active) {
+		vec3 diffuse_light = vec3(0);
+		vec3 specular_light = light;
 
-	// Store in octahedral map
-
-	ivec3 copy_to[4] = ivec3[](ivec3(-2, -2, -2), ivec3(-2, -2, -2), ivec3(-2, -2, -2), ivec3(-2, -2, -2));
-	copy_to[0] = probe_read_pos;
+		for (uint i = 0; i < neighbour_max_weights; i++) {
+			uint n = neighbour_weights[probe_index * neighbour_max_weights + i];
+			uint index = n >> 16;
+			float weight = float(n & 0xFFFF) / float(0xFFFF);
+			diffuse_light += neighbours[index] * weight;
+		}
 
-	if (local_pos == ivec2(0, 0)) {
-		copy_to[1] = store_texture_pos + ivec3(LIGHTPROBE_OCT_SIZE - 1, -1, 0);
-		copy_to[2] = store_texture_pos + ivec3(-1, LIGHTPROBE_OCT_SIZE - 1, 0);
-		copy_to[3] = store_texture_pos + ivec3(LIGHTPROBE_OCT_SIZE, LIGHTPROBE_OCT_SIZE, 0);
-	} else if (local_pos == ivec2(LIGHTPROBE_OCT_SIZE - 1, 0)) {
-		copy_to[1] = store_texture_pos + ivec3(0, -1, 0);
-		copy_to[2] = store_texture_pos + ivec3(LIGHTPROBE_OCT_SIZE, LIGHTPROBE_OCT_SIZE - 1, 0);
-		copy_to[3] = store_texture_pos + ivec3(-1, LIGHTPROBE_OCT_SIZE, 0);
-	} else if (local_pos == ivec2(0, LIGHTPROBE_OCT_SIZE - 1)) {
-		copy_to[1] = store_texture_pos + ivec3(-1, 0, 0);
-		copy_to[2] = store_texture_pos + ivec3(LIGHTPROBE_OCT_SIZE - 1, LIGHTPROBE_OCT_SIZE, 0);
-		copy_to[3] = store_texture_pos + ivec3(LIGHTPROBE_OCT_SIZE, -1, 0);
-	} else if (local_pos == ivec2(LIGHTPROBE_OCT_SIZE - 1, LIGHTPROBE_OCT_SIZE - 1)) {
-		copy_to[1] = store_texture_pos + ivec3(0, LIGHTPROBE_OCT_SIZE, 0);
-		copy_to[2] = store_texture_pos + ivec3(LIGHTPROBE_OCT_SIZE, 0, 0);
-		copy_to[3] = store_texture_pos + ivec3(-1, -1, 0);
-	} else if (local_pos.y == 0) {
-		copy_to[1] = store_texture_pos + ivec3(LIGHTPROBE_OCT_SIZE - local_pos.x - 1, local_pos.y - 1, 0);
-	} else if (local_pos.x == 0) {
-		copy_to[1] = store_texture_pos + ivec3(local_pos.x - 1, LIGHTPROBE_OCT_SIZE - local_pos.y - 1, 0);
-	} else if (local_pos.y == LIGHTPROBE_OCT_SIZE - 1) {
-		copy_to[1] = store_texture_pos + ivec3(LIGHTPROBE_OCT_SIZE - local_pos.x - 1, local_pos.y + 1, 0);
-	} else if (local_pos.x == LIGHTPROBE_OCT_SIZE - 1) {
-		copy_to[1] = store_texture_pos + ivec3(local_pos.x + 1, LIGHTPROBE_OCT_SIZE - local_pos.y - 1, 0);
-	}
+		ivec3 store_texture_pos = ivec3(probe_texture_pos.xy * (LIGHTPROBE_OCT_SIZE + 2) + ivec2(1), probe_texture_pos.z);
+		ivec3 probe_read_pos = store_texture_pos + ivec3(local_pos, 0);
+
+		//if (cache_invalidated_debug!=vec3(0.0)) {
+		//	diffuse_light = cache_invalidated_debug;
+		//}
+
+		// Store in octahedral map
+
+		ivec3 copy_to[4] = ivec3[](ivec3(-2, -2, -2), ivec3(-2, -2, -2), ivec3(-2, -2, -2), ivec3(-2, -2, -2));
+		copy_to[0] = probe_read_pos;
+
+		if (local_pos == ivec2(0, 0)) {
+			copy_to[1] = store_texture_pos + ivec3(LIGHTPROBE_OCT_SIZE - 1, -1, 0);
+			copy_to[2] = store_texture_pos + ivec3(-1, LIGHTPROBE_OCT_SIZE - 1, 0);
+			copy_to[3] = store_texture_pos + ivec3(LIGHTPROBE_OCT_SIZE, LIGHTPROBE_OCT_SIZE, 0);
+		} else if (local_pos == ivec2(LIGHTPROBE_OCT_SIZE - 1, 0)) {
+			copy_to[1] = store_texture_pos + ivec3(0, -1, 0);
+			copy_to[2] = store_texture_pos + ivec3(LIGHTPROBE_OCT_SIZE, LIGHTPROBE_OCT_SIZE - 1, 0);
+			copy_to[3] = store_texture_pos + ivec3(-1, LIGHTPROBE_OCT_SIZE, 0);
+		} else if (local_pos == ivec2(0, LIGHTPROBE_OCT_SIZE - 1)) {
+			copy_to[1] = store_texture_pos + ivec3(-1, 0, 0);
+			copy_to[2] = store_texture_pos + ivec3(LIGHTPROBE_OCT_SIZE - 1, LIGHTPROBE_OCT_SIZE, 0);
+			copy_to[3] = store_texture_pos + ivec3(LIGHTPROBE_OCT_SIZE, -1, 0);
+		} else if (local_pos == ivec2(LIGHTPROBE_OCT_SIZE - 1, LIGHTPROBE_OCT_SIZE - 1)) {
+			copy_to[1] = store_texture_pos + ivec3(0, LIGHTPROBE_OCT_SIZE, 0);
+			copy_to[2] = store_texture_pos + ivec3(LIGHTPROBE_OCT_SIZE, 0, 0);
+			copy_to[3] = store_texture_pos + ivec3(-1, -1, 0);
+		} else if (local_pos.y == 0) {
+			copy_to[1] = store_texture_pos + ivec3(LIGHTPROBE_OCT_SIZE - local_pos.x - 1, local_pos.y - 1, 0);
+		} else if (local_pos.x == 0) {
+			copy_to[1] = store_texture_pos + ivec3(local_pos.x - 1, LIGHTPROBE_OCT_SIZE - local_pos.y - 1, 0);
+		} else if (local_pos.y == LIGHTPROBE_OCT_SIZE - 1) {
+			copy_to[1] = store_texture_pos + ivec3(LIGHTPROBE_OCT_SIZE - local_pos.x - 1, local_pos.y + 1, 0);
+		} else if (local_pos.x == LIGHTPROBE_OCT_SIZE - 1) {
+			copy_to[1] = store_texture_pos + ivec3(local_pos.x + 1, LIGHTPROBE_OCT_SIZE - local_pos.y - 1, 0);
+		}
 
-	uint light_rgbe = rgbe_encode(specular_light);
-	uint diffuse_rgbe = rgbe_encode(diffuse_light);
+		uint light_rgbe = rgbe_encode(specular_light);
+		uint diffuse_rgbe = rgbe_encode(diffuse_light);
 
-	for (int i = 0; i < 4; i++) {
-		if (copy_to[i] == ivec3(-2, -2, -2)) {
-			continue;
+		for (int i = 0; i < 4; i++) {
+			if (copy_to[i] == ivec3(-2, -2, -2)) {
+				continue;
+			}
+			imageStore(lightprobe_texture_data, copy_to[i], uvec4(light_rgbe));
+			imageStore(lightprobe_diffuse_data, copy_to[i], uvec4(diffuse_rgbe));
+			// also to diffuse
 		}
-		imageStore(lightprobe_texture_data, copy_to[i], uvec4(light_rgbe));
-		imageStore(lightprobe_diffuse_data, copy_to[i], uvec4(diffuse_rgbe));
-		// also to diffuse
-	}
 
-	if (params.store_ambient_texture && probe_index == 0) {
-		vec3 ambient_light = vec3(ambient_accum) / float(1 << FP_BITS);
-		imageStore(lightprobe_ambient_data, ivec3(probe_texture_pos.xy, params.cascade), uvec4(rgbe_encode(ambient_light)));
+		if (params.store_ambient_texture && probe_index == 0) {
+			vec3 ambient_light = vec3(ambient_accum) / float(1 << FP_BITS);
+			imageStore(lightprobe_ambient_data, ivec3(probe_texture_pos.xy, params.cascade), uvec4(rgbe_encode(ambient_light)));
+		}
 	}
 
 #endif
diff --git a/servers/rendering/renderer_rd/shaders/environment/hddagi_preprocess.glsl b/servers/rendering/renderer_rd/shaders/environment/hddagi_preprocess.glsl
index e247a26661be..b56e224e2d5b 100644
--- a/servers/rendering/renderer_rd/shaders/environment/hddagi_preprocess.glsl
+++ b/servers/rendering/renderer_rd/shaders/environment/hddagi_preprocess.glsl
@@ -378,16 +378,13 @@ void main() {
 	int src_index = int(gl_GlobalInvocationID).x;
 	int local = int(gl_LocalInvocationID).x;
 
-	if (src_index >= src_dispatch_data.total_count) {
-		// Do not process.
-		return;
-	}
-
-	if (src_index >= params.maximum_light_cells) {
-		return;
-	}
+	bool thread_active = true; // Returning before the barrier() calls below deadlocks on Intel, so threads are flagged inactive instead.
 
-	if (local == 0) {
+	if (src_index >= src_dispatch_data.total_count) {
+		thread_active = false;
+	} else if (src_index >= params.maximum_light_cells) {
+		thread_active = false;
+	} else if (local == 0) {
 		store_position_count = 0; // Base one stores as zero, the others wait
 		if (src_index == 0) {
 			// This lone thread clears y and z.
@@ -399,52 +396,57 @@ void main() {
 	groupMemoryBarrier();
 	barrier();
 
-	ivec3 src_pos = (ivec3(src_process_voxels.data[src_index].position) >> ivec3(0, 7, 14)) & ivec3(0x7F);
-	bool inside_area = all(greaterThanEqual(src_pos, params.offset)) && all(lessThan(src_pos, params.limit));
+	bool inside_area = false;
+	uint index = 0;
+	ivec3 src_pos;
 
-	if (!inside_area) {
-		ivec3 light_pos = src_pos + params.scroll;
-		light_pos = (light_pos + (params.region_world_pos * REGION_SIZE)) & (params.grid_size - 1);
-		light_pos.y += params.grid_size.y * params.cascade;
+	if (thread_active) {
+		src_pos = (ivec3(src_process_voxels.data[src_index].position) >> ivec3(0, 7, 14)) & ivec3(0x7F);
+		inside_area = all(greaterThanEqual(src_pos, params.offset)) && all(lessThan(src_pos, params.limit));
 
-		// As this will be a new area, clear the new region from the old values.
-		imageStore(light_tex, light_pos, uvec4(0));
-	}
-	uint index;
+		if (!inside_area) {
+			ivec3 light_pos = src_pos + params.scroll;
+			light_pos = (light_pos + (params.region_world_pos * REGION_SIZE)) & (params.grid_size - 1);
+			light_pos.y += params.grid_size.y * params.cascade;
 
-	if (inside_area) {
-		index = atomicAdd(store_position_count, 1);
-	}
+			// As this will be a new area, clear the new region from the old values.
+			imageStore(light_tex, light_pos, uvec4(0));
+		}
 
+		if (inside_area) {
+			index = atomicAdd(store_position_count, 1);
+		}
+	}
 	groupMemoryBarrier();
 	barrier();
 
 	// global increment only once per group, to reduce pressure
 
-	if (!inside_area || store_position_count == 0) {
-		return;
-	}
-
-	if (index == 0) {
-		store_from_index = atomicAdd(dispatch_data.total_count, store_position_count);
-		uint group_count = (store_from_index + store_position_count - 1) / 64 + 1;
-		atomicMax(dispatch_data.x, group_count);
+	if (thread_active) {
+		if (!inside_area || store_position_count == 0) {
+			thread_active = false;
+		} else if (index == 0) {
+			store_from_index = atomicAdd(dispatch_data.total_count, store_position_count);
+			uint group_count = (store_from_index + store_position_count - 1) / 64 + 1;
+			atomicMax(dispatch_data.x, group_count);
+		}
 	}
 
 	groupMemoryBarrier();
 	barrier();
 
-	index += store_from_index;
-
-	ivec3 dst_pos = src_pos + params.scroll;
+	if (thread_active) {
+		index += store_from_index;
 
-	uint src_pending_bits = src_process_voxels.data[src_index].position & ~uint((1 << 21) - 1);
+		ivec3 dst_pos = src_pos + params.scroll;
 
-	dst_process_voxels.data[index].position = uint(dst_pos.x | (dst_pos.y << 7) | (dst_pos.z << 14)) | src_pending_bits;
-	dst_process_voxels.data[index].albedo_normal = src_process_voxels.data[src_index].albedo_normal;
-	dst_process_voxels.data[index].emission = src_process_voxels.data[src_index].emission;
-	dst_process_voxels.data[index].occlusion = src_process_voxels.data[src_index].occlusion;
+		uint src_pending_bits = src_process_voxels.data[src_index].position & ~uint((1 << 21) - 1);
 
+		dst_process_voxels.data[index].position = uint(dst_pos.x | (dst_pos.y << 7) | (dst_pos.z << 14)) | src_pending_bits;
+		dst_process_voxels.data[index].albedo_normal = src_process_voxels.data[src_index].albedo_normal;
+		dst_process_voxels.data[index].emission = src_process_voxels.data[src_index].emission;
+		dst_process_voxels.data[index].occlusion = src_process_voxels.data[src_index].occlusion;
+	}
 #endif
 
 #ifdef MODE_LIGHT_STORE
@@ -472,190 +474,193 @@ void main() {
 		}
 	}
 
+	bool thread_active = true; // Returning before the barrier() calls below deadlocks on Intel, so threads are flagged inactive instead.
+
 	if (any(greaterThanEqual(pos, params.limit))) {
 		// Storing is not a multiple of the workgroup, so invalid threads can happen.
-		return;
+		thread_active = false;
 	}
 
 	groupMemoryBarrier();
 	barrier();
 
-	uint solid = get_normal_facing(local);
-
-	if (local == ivec3(0)) {
-		store_position_count = 0; // Base one stores as zero, the others wait
-		if (pos == params.offset) {
-			// This lone thread clears y and z.
-			dispatch_data.y = 1;
-			dispatch_data.z = 1;
-		}
-	}
-
 	vec4 albedo_accum = vec4(0.0);
 	vec4 emission_accum = vec4(0.0);
 	vec3 normal_accum = vec3(0.0);
 	uint occlusionu = 0;
+	bool voxels_found = false;
+	ivec3 base_dst_pos;
 
-	//opposite to aniso dir
-	const ivec3 offsets[6] = ivec3[](
-			ivec3(1, 0, 0),
-			ivec3(-1, 0, 0),
-			ivec3(0, 1, 0),
-			ivec3(0, -1, 0),
-			ivec3(0, 0, 1),
-			ivec3(0, 0, -1));
-
-	const vec3 aniso_dir[6] = vec3[](
-			vec3(-1, 0, 0),
-			vec3(1, 0, 0),
-			vec3(0, -1, 0),
-			vec3(0, 1, 0),
-			vec3(0, 0, -1),
-			vec3(0, 0, 1));
-
-	// aniso dir in bitform
-	const uint aniso_mask[6] = uint[](
-			(1 << 0),
-			(1 << 1),
-			(1 << 2),
-			(1 << 3),
-			(1 << 4),
-			(1 << 5));
-
-	const uint aniso_offset_mask[6] = uint[](
-			(1 << 1),
-			(1 << 0),
-			(1 << 3),
-			(1 << 2),
-			(1 << 5),
-			(1 << 4));
+	if (thread_active) {
+		uint solid = get_normal_facing(local);
 
-	bool voxels_found = false;
-	uint disocclusion = 0;
+		if (local == ivec3(0)) {
+			store_position_count = 0; // Base one stores as zero, the others wait
+			if (pos == params.offset) {
+				// This lone thread clears y and z.
+				dispatch_data.y = 1;
+				dispatch_data.z = 1;
+			}
+		}
 
-	const int facing_direction_count = 26;
-	const vec3 facing_directions[26] = vec3[](vec3(-1.0, 0.0, 0.0), vec3(1.0, 0.0, 0.0), vec3(0.0, -1.0, 0.0), vec3(0.0, 1.0, 0.0), vec3(0.0, 0.0, -1.0), vec3(0.0, 0.0, 1.0), vec3(-0.5773502691896258, -0.5773502691896258, -0.5773502691896258), vec3(-0.7071067811865475, -0.7071067811865475, 0.0), vec3(-0.5773502691896258, -0.5773502691896258, 0.5773502691896258), vec3(-0.7071067811865475, 0.0, -0.7071067811865475), vec3(-0.7071067811865475, 0.0, 0.7071067811865475), vec3(-0.5773502691896258, 0.5773502691896258, -0.5773502691896258), vec3(-0.7071067811865475, 0.7071067811865475, 0.0), vec3(-0.5773502691896258, 0.5773502691896258, 0.5773502691896258), vec3(0.0, -0.7071067811865475, -0.7071067811865475), vec3(0.0, -0.7071067811865475, 0.7071067811865475), vec3(0.0, 0.7071067811865475, -0.7071067811865475), vec3(0.0, 0.7071067811865475, 0.7071067811865475), vec3(0.5773502691896258, -0.5773502691896258, -0.5773502691896258), vec3(0.7071067811865475, -0.7071067811865475, 0.0), vec3(0.5773502691896258, -0.5773502691896258, 0.5773502691896258), vec3(0.7071067811865475, 0.0, -0.7071067811865475), vec3(0.7071067811865475, 0.0, 0.7071067811865475), vec3(0.5773502691896258, 0.5773502691896258, -0.5773502691896258), vec3(0.7071067811865475, 0.7071067811865475, 0.0), vec3(0.5773502691896258, 0.5773502691896258, 0.5773502691896258));
+		// Each offset is the opposite of the matching aniso_dir entry.
+		const ivec3 offsets[6] = ivec3[](
+				ivec3(1, 0, 0),
+				ivec3(-1, 0, 0),
+				ivec3(0, 1, 0),
+				ivec3(0, -1, 0),
+				ivec3(0, 0, 1),
+				ivec3(0, 0, -1));
 
-	bool use_for_filter = false;
+		const vec3 aniso_dir[6] = vec3[](
+				vec3(-1, 0, 0),
+				vec3(1, 0, 0),
+				vec3(0, -1, 0),
+				vec3(0, 1, 0),
+				vec3(0, 0, -1),
+				vec3(0, 0, 1));
 
-	for (int i = 0; i < 6; i++) {
-		uint n = get_normal_facing(local + offsets[i]);
-		if (n == 0) {
-			disocclusion |= aniso_offset_mask[i];
-		} else if (solid == 0) {
-			use_for_filter = true;
-		}
+		// aniso dir in bitform
+		const uint aniso_mask[6] = uint[](
+				(1 << 0),
+				(1 << 1),
+				(1 << 2),
+				(1 << 3),
+				(1 << 4),
+				(1 << 5));
 
-		if (solid != 0 || !bool(n & aniso_mask[i])) {
-			// Not solid, continue.
-			continue;
-		}
+		const uint aniso_offset_mask[6] = uint[](
+				(1 << 1),
+				(1 << 0),
+				(1 << 3),
+				(1 << 2),
+				(1 << 5),
+				(1 << 4));
 
-		voxels_found = true;
+		uint disocclusion = 0;
 
-		for (int j = 0; j < facing_direction_count; j++) {
-			if (bool(n & uint((1 << (j + 6))))) {
-				normal_accum += facing_directions[j];
-			}
-		}
+		const int facing_direction_count = 26;
+		const vec3 facing_directions[26] = vec3[](vec3(-1.0, 0.0, 0.0), vec3(1.0, 0.0, 0.0), vec3(0.0, -1.0, 0.0), vec3(0.0, 1.0, 0.0), vec3(0.0, 0.0, -1.0), vec3(0.0, 0.0, 1.0), vec3(-0.5773502691896258, -0.5773502691896258, -0.5773502691896258), vec3(-0.7071067811865475, -0.7071067811865475, 0.0), vec3(-0.5773502691896258, -0.5773502691896258, 0.5773502691896258), vec3(-0.7071067811865475, 0.0, -0.7071067811865475), vec3(-0.7071067811865475, 0.0, 0.7071067811865475), vec3(-0.5773502691896258, 0.5773502691896258, -0.5773502691896258), vec3(-0.7071067811865475, 0.7071067811865475, 0.0), vec3(-0.5773502691896258, 0.5773502691896258, 0.5773502691896258), vec3(0.0, -0.7071067811865475, -0.7071067811865475), vec3(0.0, -0.7071067811865475, 0.7071067811865475), vec3(0.0, 0.7071067811865475, -0.7071067811865475), vec3(0.0, 0.7071067811865475, 0.7071067811865475), vec3(0.5773502691896258, -0.5773502691896258, -0.5773502691896258), vec3(0.7071067811865475, -0.7071067811865475, 0.0), vec3(0.5773502691896258, -0.5773502691896258, 0.5773502691896258), vec3(0.7071067811865475, 0.0, -0.7071067811865475), vec3(0.7071067811865475, 0.0, 0.7071067811865475), vec3(0.5773502691896258, 0.5773502691896258, -0.5773502691896258), vec3(0.7071067811865475, 0.7071067811865475, 0.0), vec3(0.5773502691896258, 0.5773502691896258, 0.5773502691896258));
 
-		ivec3 ofs = pos + offsets[i];
-		//normal_accum += aniso_dir[i];
+		bool use_for_filter = false;
 
-		ivec3 albedo_ofs = ofs >> 1;
-		albedo_ofs.z *= 6;
-		albedo_ofs.z += i;
+		for (int i = 0; i < 6; i++) {
+			uint n = get_normal_facing(local + offsets[i]);
+			if (n == 0) {
+				disocclusion |= aniso_offset_mask[i];
+			} else if (solid == 0) {
+				use_for_filter = true;
+			}
 
-		uint a = imageLoad(src_albedo, albedo_ofs).r;
-		albedo_accum += vec4(vec3((ivec3(a) >> ivec3(0, 5, 11)) & ivec3(0x1f, 0x3f, 0x1f)) / vec3(31.0, 63.0, 31.0), 1.0);
+			if (solid != 0 || !bool(n & aniso_mask[i])) {
+				// Not solid, continue.
+				continue;
+			}
 
-		uint rgbe = imageLoad(src_emission, ofs >> 1).r;
+			voxels_found = true;
 
-		vec3 emission = rgbe_decode(rgbe);
+			for (int j = 0; j < facing_direction_count; j++) {
+				if (bool(n & uint((1 << (j + 6))))) {
+					normal_accum += facing_directions[j];
+				}
+			}
 
-		uint rgbe_aniso = imageLoad(src_emission_aniso, ofs >> 1).r;
-		float strength = ((rgbe_aniso >> (i * 5)) & 0x1F) / float(0x1F);
-		emission_accum += vec4(emission * strength, 1.0);
-	}
+			ivec3 ofs = pos + offsets[i];
+			//normal_accum += aniso_dir[i];
 
-	ivec3 base_dst_pos = (pos + params.region_world_pos * REGION_SIZE) & (params.grid_size - 1);
-	ivec3 dst_pos = base_dst_pos + params.grid_size.y * params.cascade;
-	imageStore(dst_disocclusion, dst_pos, uvec4(disocclusion));
+			ivec3 albedo_ofs = ofs >> 1;
+			albedo_ofs.z *= 6;
+			albedo_ofs.z += i;
 
-	if (solid != 0) {
-		return; // No further use for this.
-	}
+			uint a = imageLoad(src_albedo, albedo_ofs).r;
+			albedo_accum += vec4(vec3((ivec3(a) >> ivec3(0, 5, 11)) & ivec3(0x1f, 0x3f, 0x1f)) / vec3(31.0, 63.0, 31.0), 1.0);
 
-	if (use_for_filter) {
-		uint neighbour_voxels = 0;
+			uint rgbe = imageLoad(src_emission, ofs >> 1).r;
 
-		for (int i = 0; i < facing_direction_count; i++) {
-			ivec3 neighbour = ivec3(sign(facing_directions[i]));
-			ivec3 neighbour_pos = local + neighbour;
-			uint n = get_normal_facing(neighbour_pos);
-			if (n == 0) {
-				continue; // Nothing here
-			}
+			vec3 emission = rgbe_decode(rgbe);
 
-			for (int j = 0; j < 6; j++) {
-				//if (!bool(n&(1<<j))) {
-				//	continue; // Nothing here either.
-				//}
-				ivec3 neighbour_neighbour = neighbour_pos + ivec3(aniso_dir[j]);
-				ivec3 nn_rel = neighbour_neighbour - local;
+			uint rgbe_aniso = imageLoad(src_emission_aniso, ofs >> 1).r;
+			float strength = ((rgbe_aniso >> (i * 5)) & 0x1F) / float(0x1F);
+			emission_accum += vec4(emission * strength, 1.0);
+		}
 
-				if (any(lessThan(nn_rel, -ivec3(1))) || any(greaterThan(nn_rel, +ivec3(1)))) {
-					continue; // Too far away, ignore.
+		base_dst_pos = (pos + params.region_world_pos * REGION_SIZE) & (params.grid_size - 1);
+		ivec3 dst_pos = base_dst_pos + params.grid_size.y * params.cascade;
+		imageStore(dst_disocclusion, dst_pos, uvec4(disocclusion));
+
+		if (solid != 0) {
+			thread_active = false; // This is a solid voxel, so the thread has no further work to do.
+		} else if (use_for_filter) {
+			uint neighbour_voxels = 0;
+
+			for (int i = 0; i < facing_direction_count; i++) {
+				ivec3 neighbour = ivec3(sign(facing_directions[i]));
+				ivec3 neighbour_pos = local + neighbour;
+				uint n = get_normal_facing(neighbour_pos);
+				if (n == 0) {
+					continue; // Nothing here
 				}
 
-				if (nn_rel == ivec3(0)) {
-					continue; // Point to itself, ignore.
-				}
+				for (int j = 0; j < 6; j++) {
+					//if (!bool(n&(1<<j))) {
+					//	continue; // Nothing here either.
+					//}
+					ivec3 neighbour_neighbour = neighbour_pos + ivec3(aniso_dir[j]);
+					ivec3 nn_rel = neighbour_neighbour - local;
 
-				uint q = get_normal_facing(local + nn_rel);
-				if (q != 0) {
-					continue; // Points to a solid block (can happen), Ignore.
-				}
+					if (any(lessThan(nn_rel, -ivec3(1))) || any(greaterThan(nn_rel, +ivec3(1)))) {
+						continue; // Too far away, ignore.
+					}
 
-				ivec3 nn_rel_abs = abs(nn_rel);
+					if (nn_rel == ivec3(0)) {
+						continue; // Point to itself, ignore.
+					}
 
-				int nn_steps = nn_rel_abs.x + nn_rel_abs.y + nn_rel_abs.z;
-				if (nn_steps == 3) {
-					continue;
-				}
-				if (nn_steps > 1) {
-					// must make sure we are not occluded towards this
-					ivec3 test_dirs[3] = ivec3[](ivec3(nn_rel.x, 0, 0), ivec3(0, nn_rel.y, 0), ivec3(0, 0, nn_rel.z));
-					int occlusions = 0;
-					for (int k = 0; k < 3; k++) {
-						if (test_dirs[k] == ivec3(0)) {
-							continue; // Direction not used
+					uint q = get_normal_facing(local + nn_rel);
+					if (q != 0) {
+						continue; // Points to a solid block (can happen), Ignore.
+					}
+
+					ivec3 nn_rel_abs = abs(nn_rel);
+
+					int nn_steps = nn_rel_abs.x + nn_rel_abs.y + nn_rel_abs.z;
+					if (nn_steps == 3) {
+						continue;
+					}
+					if (nn_steps > 1) {
+						// must make sure we are not occluded towards this
+						ivec3 test_dirs[3] = ivec3[](ivec3(nn_rel.x, 0, 0), ivec3(0, nn_rel.y, 0), ivec3(0, 0, nn_rel.z));
+						int occlusions = 0;
+						for (int k = 0; k < 3; k++) {
+							if (test_dirs[k] == ivec3(0)) {
+								continue; // Direction not used
+							}
+
+							q = get_normal_facing(local + test_dirs[k]);
+							if (q != 0) {
+								occlusions++;
+							}
 						}
 
-						q = get_normal_facing(local + test_dirs[k]);
-						if (q != 0) {
-							occlusions++;
+						if (occlusions >= 2) {
+							continue; // Occluded from here, ignore. May be unoccluded from another neighbour.
 						}
 					}
 
-					if (occlusions >= 2) {
-						continue; // Occluded from here, ignore. May be unoccluded from another neighbour.
-					}
+					const uint reverse_map[27] = uint[](6, 14, 18, 9, 4, 21, 11, 16, 23, 7, 2, 19, 0, 0, 1, 12, 3, 24, 8, 15, 20, 10, 5, 22, 13, 17, 25);
+					ivec3 abs_pos = nn_rel + ivec3(1);
+					// All good, this is a valid neighbour!
+					neighbour_voxels |= 1 << reverse_map[abs_pos.z * 3 * 3 + abs_pos.y * 3 + abs_pos.x];
 				}
-
-				const uint reverse_map[27] = uint[](6, 14, 18, 9, 4, 21, 11, 16, 23, 7, 2, 19, 0, 0, 1, 12, 3, 24, 8, 15, 20, 10, 5, 22, 13, 17, 25);
-				ivec3 abs_pos = nn_rel + ivec3(1);
-				// All good, this is a valid neighbour!
-				neighbour_voxels |= 1 << reverse_map[abs_pos.z * 3 * 3 + abs_pos.y * 3 + abs_pos.x];
 			}
-		}
 
-		ivec3 store_pos = (pos + params.region_world_pos * REGION_SIZE) & (params.grid_size - ivec3(1));
-		store_pos.y += params.grid_size.y * params.cascade;
-		imageStore(voxel_neighbours, store_pos, uvec4(neighbour_voxels));
-		if (!voxels_found) {
-			// Light voxels won't be stored here, but still ensure this is black to avoid light leaking from outside.
-			imageStore(light_tex, store_pos, uvec4(0));
+			ivec3 store_pos = (pos + params.region_world_pos * REGION_SIZE) & (params.grid_size - ivec3(1));
+			store_pos.y += params.grid_size.y * params.cascade;
+			imageStore(voxel_neighbours, store_pos, uvec4(neighbour_voxels));
+			if (!voxels_found) {
+				// Light voxels won't be stored here, but still ensure this is black to avoid light leaking from outside.
+				imageStore(light_tex, store_pos, uvec4(0));
+			}
 		}
 	}
 
@@ -664,29 +669,30 @@ void main() {
 
 	uint index;
 
-	if (voxels_found) {
+	if (thread_active && voxels_found) {
 		index = atomicAdd(store_position_count, 1);
 	}
 
 	groupMemoryBarrier();
 	barrier();
 
-	if (!voxels_found || store_position_count == 0) {
-		return;
-	}
-
-	// global increment only once per group, to reduce pressure
-
-	if (index == 0) {
-		store_from_index = atomicAdd(dispatch_data.total_count, store_position_count);
-		uint group_count = (store_from_index + store_position_count - 1) / 64 + 1;
-		atomicMax(dispatch_data.x, group_count);
+	if (thread_active) {
+		if (!voxels_found || store_position_count == 0) {
+			thread_active = false;
+		} else {
+			// global increment only once per group, to reduce pressure
+			if (thread_active && index == 0) {
+				store_from_index = atomicAdd(dispatch_data.total_count, store_position_count);
+				uint group_count = (store_from_index + store_position_count - 1) / 64 + 1;
+				atomicMax(dispatch_data.x, group_count);
+			}
+		}
 	}
 
 	groupMemoryBarrier();
 	barrier();
 
-	{
+	if (thread_active) {
 		// compute occlusion
 
 		ivec3 base_probe = params.region_world_pos + pos / PROBE_CELLS;
@@ -731,33 +737,31 @@ void main() {
 			}
 			occlusionu |= uint(clamp(w, 0.0, 15.0)) << (i * 4);
 		}
-	}
 
-	index += store_from_index;
+		index += store_from_index;
 
-	if (index >= params.maximum_light_cells) {
-		return;
-	}
-
-	normal_accum = normalize(normal_accum);
-	albedo_accum.rgb /= albedo_accum.a;
-	emission_accum.rgb /= emission_accum.a;
+		if (index < params.maximum_light_cells) {
+			normal_accum = normalize(normal_accum);
+			albedo_accum.rgb /= albedo_accum.a;
+			emission_accum.rgb /= emission_accum.a;
 
-	dst_process_voxels.data[index].position = uint(pos.x | (pos.y << 7) | (pos.z << 14)) | PROCESS_STATIC_PENDING_BIT | PROCESS_DYNAMIC_PENDING_BIT;
+			dst_process_voxels.data[index].position = uint(pos.x | (pos.y << 7) | (pos.z << 14)) | PROCESS_STATIC_PENDING_BIT | PROCESS_DYNAMIC_PENDING_BIT;
 
-	uint albedo_norm = 0;
-	albedo_norm |= clamp(uint(albedo_accum.r * 31.0), 0, 31) << 0;
-	albedo_norm |= clamp(uint(albedo_accum.g * 63.0), 0, 63) << 5;
-	albedo_norm |= clamp(uint(albedo_accum.b * 31.0), 0, 31) << 11;
+			uint albedo_norm = 0;
+			albedo_norm |= clamp(uint(albedo_accum.r * 31.0), 0, 31) << 0;
+			albedo_norm |= clamp(uint(albedo_accum.g * 63.0), 0, 63) << 5;
+			albedo_norm |= clamp(uint(albedo_accum.b * 31.0), 0, 31) << 11;
 
-	vec2 octa_normal = octahedron_encode(normal_accum);
-	uvec2 octa_unormal = clamp(uvec2(octa_normal * 255), uvec2(0), uvec2(255));
-	albedo_norm |= (octa_unormal.x << 16) | (octa_unormal.y << 24);
+			vec2 octa_normal = octahedron_encode(normal_accum);
+			uvec2 octa_unormal = clamp(uvec2(octa_normal * 255), uvec2(0), uvec2(255));
+			albedo_norm |= (octa_unormal.x << 16) | (octa_unormal.y << 24);
 
-	dst_process_voxels.data[index].albedo_normal = albedo_norm;
-	dst_process_voxels.data[index].emission = rgbe_encode(emission_accum.rgb);
+			dst_process_voxels.data[index].albedo_normal = albedo_norm;
+			dst_process_voxels.data[index].emission = rgbe_encode(emission_accum.rgb);
 
-	dst_process_voxels.data[index].occlusion = occlusionu;
+			dst_process_voxels.data[index].occlusion = occlusionu;
+		}
+	}
 
 	// Compute probe neighbours