From be83510ca1a4cee2b539e5de65eb1187138f3860 Mon Sep 17 00:00:00 2001 From: Juan Linietsky Date: Sun, 25 Feb 2024 10:43:08 +0100 Subject: [PATCH] Fixes to divergent control flow on Intel. --- .../renderer_rd/shaders/environment/gi.glsl | 287 ++++++------ .../shaders/environment/hddagi_integrate.glsl | 353 ++++++++------- .../environment/hddagi_preprocess.glsl | 424 +++++++++--------- 3 files changed, 545 insertions(+), 519 deletions(-) diff --git a/servers/rendering/renderer_rd/shaders/environment/gi.glsl b/servers/rendering/renderer_rd/shaders/environment/gi.glsl index 8b65b5bdc801..bb4b9ab65d67 100644 --- a/servers/rendering/renderer_rd/shaders/environment/gi.glsl +++ b/servers/rendering/renderer_rd/shaders/environment/gi.glsl @@ -1100,6 +1100,7 @@ void main() { #if defined(USE_HDDAGI) || defined(USE_VOXEL_GI_INSTANCES) uint vrs_x, vrs_y; + bool thread_active = true; #ifdef USE_VRS if (sc_use_vrs) { ivec2 vrs_pos; @@ -1117,198 +1118,208 @@ void main() { vrs_y = 1 << (vrs_texel & 3); if (mod(pos.x, vrs_x) != 0) { - return; + thread_active = false; } if (mod(pos.y, vrs_y) != 0) { - return; + thread_active = false; } } #endif - if (sc_half_res) { - pos <<= 1; - } + if (thread_active) { + if (sc_half_res) { + pos <<= 1; + } - if (any(greaterThanEqual(pos, scene_data.screen_size))) { //too large, do nothing - return; + if (any(greaterThanEqual(pos, scene_data.screen_size))) { //too large, do nothing + thread_active = false; + } } vec4 ambient_light = vec4(0.0); vec4 reflection_light = vec4(0.0); - - vec3 vertex; - vec3 normal; float roughness; - bool found_vertex = false; + if (thread_active) { + vec3 vertex; + vec3 normal; - vertex = reconstruct_position(pos); - vec4 normal_roughness = fetch_normal_and_roughness(pos); - found_vertex = length(normal_roughness.xyz) > 0.5; - normal = normal_roughness.xyz; - roughness = normal_roughness.w; - bool dynamic_object = roughness > 0.5; - if (dynamic_object) { - roughness = 1.0 - roughness; - } - roughness /= (127.0 / 255.0); - vertex.y = -vertex.y; + bool found_vertex = false; - if (found_vertex) { - process_gi(pos, vertex, normal, roughness, dynamic_object, ambient_light, reflection_light); - } + vertex = reconstruct_position(pos); + vec4 normal_roughness = fetch_normal_and_roughness(pos); + found_vertex = length(normal_roughness.xyz) > 0.5; + normal = normal_roughness.xyz; + roughness = normal_roughness.w; + bool dynamic_object = roughness > 0.5; + if (dynamic_object) { + roughness = 1.0 - roughness; + } + roughness /= (127.0 / 255.0); + vertex.y = -vertex.y; + + if (found_vertex) { + process_gi(pos, vertex, normal, roughness, dynamic_object, ambient_light, reflection_light); + } #ifdef USE_HDDAGI - // If using reflections, blend the 4 adjacent pixels to get rid of dither - uint group_pos = gl_LocalInvocationID.y * GROUP_SIZE + gl_LocalInvocationID.x; - group_positions[group_pos] = vertex; - group_normals[group_pos] = normal; - group_reflections[group_pos] = reflection_light; + // If using reflections, blend the 4 adjacent pixels to get rid of dither + uint group_pos = gl_LocalInvocationID.y * GROUP_SIZE + gl_LocalInvocationID.x; + group_positions[group_pos] = vertex; + group_normals[group_pos] = normal; + group_reflections[group_pos] = reflection_light; +#endif + } memoryBarrierShared(); barrier(); - if (roughness < ROUGHNESS_TO_REFLECTION_TRESHOOLD) { - uvec2 local_group_pos_base = gl_LocalInvocationID.xy - (gl_LocalInvocationID.xy % DITHER_SIZE); - uint local_group_pos = local_group_pos_base.y * GROUP_SIZE + local_group_pos_base.x; - - vec3 positions[DITHER_SIZE * DITHER_SIZE]; - vec3 normals[DITHER_SIZE * DITHER_SIZE]; +#ifdef USE_HDDAGI - vec4 average = vec4(0.0); - for (int i = 0; i < DITHER_SIZE; i++) { - for (int j = 0; j < DITHER_SIZE; j++) { - uint src_pos = local_group_pos + i * GROUP_SIZE + j; - normals[i * DITHER_SIZE + j] = group_normals[src_pos]; - positions[i * DITHER_SIZE + j] = group_positions[src_pos]; - average += group_reflections[src_pos]; + if (thread_active) { + if (roughness < ROUGHNESS_TO_REFLECTION_TRESHOOLD) { + uvec2 local_group_pos_base = gl_LocalInvocationID.xy - (gl_LocalInvocationID.xy % DITHER_SIZE); + uint local_group_pos = local_group_pos_base.y * GROUP_SIZE + local_group_pos_base.x; + + vec3 positions[DITHER_SIZE * DITHER_SIZE]; + vec3 normals[DITHER_SIZE * DITHER_SIZE]; + + vec4 average = vec4(0.0); + for (int i = 0; i < DITHER_SIZE; i++) { + for (int j = 0; j < DITHER_SIZE; j++) { + uint src_pos = local_group_pos + i * GROUP_SIZE + j; + normals[i * DITHER_SIZE + j] = group_normals[src_pos]; + positions[i * DITHER_SIZE + j] = group_positions[src_pos]; + average += group_reflections[src_pos]; + } } - } - average /= 4.0; + average /= 4.0; - const int subgroup_count = (DITHER_SIZE - 1) * (DITHER_SIZE - 1); - uvec4 subgroups[subgroup_count] = uvec4[]( + const int subgroup_count = (DITHER_SIZE - 1) * (DITHER_SIZE - 1); + uvec4 subgroups[subgroup_count] = uvec4[]( #if DITHER_SIZE == 2 - uvec4(0, 1, 2, 3) + uvec4(0, 1, 2, 3) #elif DITHER_SIZE == 3 - uvec4(0, 1, 3, 4), uvec4(1, 2, 4, 5), uvec4(3, 4, 6, 7), uvec4(4, 5, 7, 8) + uvec4(0, 1, 3, 4), uvec4(1, 2, 4, 5), uvec4(3, 4, 6, 7), uvec4(4, 5, 7, 8) #endif - ); - - const float same_plane_threshold = 0.9659258262890683; // 15 degrees tolerance + ); + + const float same_plane_threshold = 0.9659258262890683; // 15 degrees tolerance + + float weight = 1.0; + for (int i = 0; i < subgroup_count; i++) { + uvec4 sg = subgroups[i]; + // Weight positions in plane. + vec3 p[4] = vec3[](positions[sg.x], positions[sg.y], positions[sg.z], positions[sg.w]); + vec3 n1 = normalize(cross(p[0] - p[2], p[0] - p[1])); + vec3 n2 = normalize(cross(p[2] - p[3], p[2] - p[1])); + weight *= max(0.0, smoothstep(same_plane_threshold, 1, dot(n1, n2))); + + // Weight normal difference. + vec3 n[4] = vec3[](normals[sg.x], normals[sg.y], normals[sg.z], normals[sg.w]); + weight *= max(0.0, smoothstep(same_plane_threshold, 1, length((n[0] + n[1] + n[2] + n[3]) / 4.0))); + } - float weight = 1.0; - for (int i = 0; i < subgroup_count; i++) { - uvec4 sg = subgroups[i]; - // Weight positions in plane. - vec3 p[4] = vec3[](positions[sg.x], positions[sg.y], positions[sg.z], positions[sg.w]); - vec3 n1 = normalize(cross(p[0] - p[2], p[0] - p[1])); - vec3 n2 = normalize(cross(p[2] - p[3], p[2] - p[1])); - weight *= max(0.0, smoothstep(same_plane_threshold, 1, dot(n1, n2))); - - // Weight normal difference. - vec3 n[4] = vec3[](normals[sg.x], normals[sg.y], normals[sg.z], normals[sg.w]); - weight *= max(0.0, smoothstep(same_plane_threshold, 1, length((n[0] + n[1] + n[2] + n[3]) / 4.0))); + reflection_light = mix(reflection_light, average, weight); } - - reflection_light = mix(reflection_light, average, weight); } #endif - if (sc_half_res) { - pos >>= 1; - } + if (thread_active) { + if (sc_half_res) { + pos >>= 1; + } - uint ambient_rgbe = rgbe_encode(ambient_light.rgb); - uint reflection_rgbe = rgbe_encode(reflection_light.rgb); - uint blend = uint(clamp(reflection_light.a * 0xF, 0, 0xF)) | (uint(clamp(ambient_light.a * 0xF, 0, 0xF)) << 4); + uint ambient_rgbe = rgbe_encode(ambient_light.rgb); + uint reflection_rgbe = rgbe_encode(reflection_light.rgb); + uint blend = uint(clamp(reflection_light.a * 0xF, 0, 0xF)) | (uint(clamp(ambient_light.a * 0xF, 0, 0xF)) << 4); - imageStore(ambient_buffer, pos, uvec4(ambient_rgbe)); - imageStore(reflection_buffer, pos, uvec4(reflection_rgbe)); - imageStore(blend_buffer, pos, vec4(ambient_light.a, reflection_light.a, 0, 0)); + imageStore(ambient_buffer, pos, uvec4(ambient_rgbe)); + imageStore(reflection_buffer, pos, uvec4(reflection_rgbe)); + imageStore(blend_buffer, pos, vec4(ambient_light.a, reflection_light.a, 0, 0)); #ifdef USE_VRS - if (sc_use_vrs) { - if (vrs_x > 1) { - imageStore(ambient_buffer, pos + ivec2(1, 0), uvec4(ambient_rgbe)); - imageStore(reflection_buffer, pos + ivec2(1, 0), uvec4(reflection_rgbe)); - imageStore(blend_buffer, pos + ivec2(1, 0), uvec4(blend)); - } + if (sc_use_vrs) { + if (vrs_x > 1) { + imageStore(ambient_buffer, pos + ivec2(1, 0), uvec4(ambient_rgbe)); + imageStore(reflection_buffer, pos + ivec2(1, 0), uvec4(reflection_rgbe)); + imageStore(blend_buffer, pos + ivec2(1, 0), uvec4(blend)); + } - if (vrs_x > 2) { - imageStore(ambient_buffer, pos + ivec2(2, 0), uvec4(ambient_rgbe)); - imageStore(reflection_buffer, pos + ivec2(2, 0), uvec4(reflection_rgbe)); - imageStore(blend_buffer, pos + ivec2(2, 0), uvec4(blend)); + if (vrs_x > 2) { + imageStore(ambient_buffer, pos + ivec2(2, 0), uvec4(ambient_rgbe)); + imageStore(reflection_buffer, pos + ivec2(2, 0), uvec4(reflection_rgbe)); + imageStore(blend_buffer, pos + ivec2(2, 0), uvec4(blend)); - imageStore(ambient_buffer, pos + ivec2(3, 0), uvec4(ambient_rgbe)); - imageStore(reflection_buffer, pos + ivec2(3, 0), uvec4(reflection_rgbe)); - imageStore(blend_buffer, pos + ivec2(3, 0), uvec4(blend)); - } + imageStore(ambient_buffer, pos + ivec2(3, 0), uvec4(ambient_rgbe)); + imageStore(reflection_buffer, pos + ivec2(3, 0), uvec4(reflection_rgbe)); + imageStore(blend_buffer, pos + ivec2(3, 0), uvec4(blend)); + } - if (vrs_y > 1) { - imageStore(ambient_buffer, pos + ivec2(0, 1), uvec4(ambient_rgbe)); - imageStore(reflection_buffer, pos + ivec2(0, 1), uvec4(reflection_rgbe)); - imageStore(blend_buffer, pos + ivec2(0, 1), uvec4(blend)); - } + if (vrs_y > 1) { + imageStore(ambient_buffer, pos + ivec2(0, 1), uvec4(ambient_rgbe)); + imageStore(reflection_buffer, pos + ivec2(0, 1), uvec4(reflection_rgbe)); + imageStore(blend_buffer, pos + ivec2(0, 1), uvec4(blend)); + } - if (vrs_y > 1 && vrs_x > 1) { - imageStore(ambient_buffer, pos + ivec2(1, 1), uvec4(ambient_rgbe)); - imageStore(reflection_buffer, pos + ivec2(1, 1), uvec4(reflection_rgbe)); - imageStore(blend_buffer, pos + ivec2(1, 1), uvec4(blend)); - } + if (vrs_y > 1 && vrs_x > 1) { + imageStore(ambient_buffer, pos + ivec2(1, 1), uvec4(ambient_rgbe)); + imageStore(reflection_buffer, pos + ivec2(1, 1), uvec4(reflection_rgbe)); + imageStore(blend_buffer, pos + ivec2(1, 1), uvec4(blend)); + } - if (vrs_y > 1 && vrs_x > 2) { - imageStore(ambient_buffer, pos + ivec2(2, 1), uvec4(ambient_rgbe)); - imageStore(reflection_buffer, pos + ivec2(2, 1), uvec4(reflection_rgbe)); - imageStore(blend_buffer, pos + ivec2(2, 1), uvec4(blend)); + if (vrs_y > 1 && vrs_x > 2) { + imageStore(ambient_buffer, pos + ivec2(2, 1), uvec4(ambient_rgbe)); + imageStore(reflection_buffer, pos + ivec2(2, 1), uvec4(reflection_rgbe)); + imageStore(blend_buffer, pos + ivec2(2, 1), uvec4(blend)); - imageStore(ambient_buffer, pos + ivec2(3, 1), uvec4(ambient_rgbe)); - imageStore(reflection_buffer, pos + ivec2(3, 1), uvec4(reflection_rgbe)); - imageStore(blend_buffer, pos + ivec2(3, 1), uvec4(blend)); - } + imageStore(ambient_buffer, pos + ivec2(3, 1), uvec4(ambient_rgbe)); + imageStore(reflection_buffer, pos + ivec2(3, 1), uvec4(reflection_rgbe)); + imageStore(blend_buffer, pos + ivec2(3, 1), uvec4(blend)); + } - if (vrs_y > 2) { - imageStore(ambient_buffer, pos + ivec2(0, 2), uvec4(ambient_rgbe)); - imageStore(reflection_buffer, pos + ivec2(0, 2), uvec4(reflection_rgbe)); - imageStore(blend_buffer, pos + ivec2(0, 2), uvec4(blend)); + if (vrs_y > 2) { + imageStore(ambient_buffer, pos + ivec2(0, 2), uvec4(ambient_rgbe)); + imageStore(reflection_buffer, pos + ivec2(0, 2), uvec4(reflection_rgbe)); + imageStore(blend_buffer, pos + ivec2(0, 2), uvec4(blend)); - imageStore(ambient_buffer, pos + ivec2(0, 3), uvec4(ambient_rgbe)); - imageStore(reflection_buffer, pos + ivec2(0, 3), uvec4(reflection_rgbe)); - imageStore(blend_buffer, pos + ivec2(0, 3), uvec4(blend)); - } + imageStore(ambient_buffer, pos + ivec2(0, 3), uvec4(ambient_rgbe)); + imageStore(reflection_buffer, pos + ivec2(0, 3), uvec4(reflection_rgbe)); + imageStore(blend_buffer, pos + ivec2(0, 3), uvec4(blend)); + } - if (vrs_y > 2 && vrs_x > 1) { - imageStore(ambient_buffer, pos + ivec2(1, 2), uvec4(ambient_rgbe)); - imageStore(reflection_buffer, pos + ivec2(1, 2), uvec4(reflection_rgbe)); - imageStore(blend_buffer, pos + ivec2(1, 2), uvec4(blend)); + if (vrs_y > 2 && vrs_x > 1) { + imageStore(ambient_buffer, pos + ivec2(1, 2), uvec4(ambient_rgbe)); + imageStore(reflection_buffer, pos + ivec2(1, 2), uvec4(reflection_rgbe)); + imageStore(blend_buffer, pos + ivec2(1, 2), uvec4(blend)); - imageStore(ambient_buffer, pos + ivec2(1, 3), uvec4(ambient_rgbe)); - imageStore(reflection_buffer, pos + ivec2(1, 3), uvec4(reflection_rgbe)); - imageStore(blend_buffer, pos + ivec2(1, 3), uvec4(blend)); - } + imageStore(ambient_buffer, pos + ivec2(1, 3), uvec4(ambient_rgbe)); + imageStore(reflection_buffer, pos + ivec2(1, 3), uvec4(reflection_rgbe)); + imageStore(blend_buffer, pos + ivec2(1, 3), uvec4(blend)); + } - if (vrs_y > 2 && vrs_x > 2) { - imageStore(ambient_buffer, pos + ivec2(2, 2), uvec4(ambient_rgbe)); - imageStore(reflection_buffer, pos + ivec2(2, 2), uvec4(reflection_rgbe)); - imageStore(blend_buffer, pos + ivec2(2, 2), uvec4(blend)); + if (vrs_y > 2 && vrs_x > 2) { + imageStore(ambient_buffer, pos + ivec2(2, 2), uvec4(ambient_rgbe)); + imageStore(reflection_buffer, pos + ivec2(2, 2), uvec4(reflection_rgbe)); + imageStore(blend_buffer, pos + ivec2(2, 2), uvec4(blend)); - imageStore(ambient_buffer, pos + ivec2(2, 3), uvec4(ambient_rgbe)); - imageStore(reflection_buffer, pos + ivec2(2, 3), uvec4(reflection_rgbe)); - imageStore(blend_buffer, pos + ivec2(2, 3), uvec4(blend)); + imageStore(ambient_buffer, pos + ivec2(2, 3), uvec4(ambient_rgbe)); + imageStore(reflection_buffer, pos + ivec2(2, 3), uvec4(reflection_rgbe)); + imageStore(blend_buffer, pos + ivec2(2, 3), uvec4(blend)); - imageStore(ambient_buffer, pos + ivec2(3, 2), uvec4(ambient_rgbe)); - imageStore(reflection_buffer, pos + ivec2(3, 2), uvec4(reflection_rgbe)); - imageStore(blend_buffer, pos + ivec2(3, 2), uvec4(blend)); + imageStore(ambient_buffer, pos + ivec2(3, 2), uvec4(ambient_rgbe)); + imageStore(reflection_buffer, pos + ivec2(3, 2), uvec4(reflection_rgbe)); + imageStore(blend_buffer, pos + ivec2(3, 2), uvec4(blend)); - imageStore(ambient_buffer, pos + ivec2(3, 3), uvec4(ambient_rgbe)); - imageStore(reflection_buffer, pos + ivec2(3, 3), uvec4(reflection_rgbe)); - imageStore(blend_buffer, pos + ivec2(3, 3), uvec4(blend)); + imageStore(ambient_buffer, pos + ivec2(3, 3), uvec4(ambient_rgbe)); + imageStore(reflection_buffer, pos + ivec2(3, 3), uvec4(reflection_rgbe)); + imageStore(blend_buffer, pos + ivec2(3, 3), uvec4(blend)); + } } - } #endif - + } #endif } diff --git a/servers/rendering/renderer_rd/shaders/environment/hddagi_integrate.glsl b/servers/rendering/renderer_rd/shaders/environment/hddagi_integrate.glsl index 352b2607f87c..699073cac6ab 100644 --- a/servers/rendering/renderer_rd/shaders/environment/hddagi_integrate.glsl +++ b/servers/rendering/renderer_rd/shaders/environment/hddagi_integrate.glsl @@ -488,155 +488,164 @@ void main() { memoryBarrierShared(); barrier(); + bool thread_active = true; + vec3 light; + ivec3 cache_texture_pos; + vec3 ray_dir; + vec2 sample_ofs; + vec3 ray_pos; + bool hit; + ivec3 hit_cell; + int hit_cascade; + bool cache_valid; + vec3 cache_invalidated_debug; + uint cache_entry; + if (probe_history_index < 0) { - return; - } + thread_active = false; + } else { + float probe_cell_size = float(params.grid_size.x) / float(params.probe_axis_size.x - 1) / cascades.data[params.cascade].to_cell; - float probe_cell_size = float(params.grid_size.x) / float(params.probe_axis_size.x - 1) / cascades.data[params.cascade].to_cell; + ray_pos = cascades.data[params.cascade].offset + vec3(probe_cell) * probe_cell_size; - vec3 ray_pos = cascades.data[params.cascade].offset + vec3(probe_cell) * probe_cell_size; + // Ensure a unique hash that includes the probe world position, the local octahedron pixel, and the history frame index + uvec3 h3 = hash3(uvec3((uvec3(probe_world_pos) * LIGHTPROBE_OCT_SIZE * LIGHTPROBE_OCT_SIZE + uvec3(probe_index)) * uvec3(params.history_size) + uvec3(probe_history_index))); + uint h = (h3.x ^ h3.y) ^ h3.z; + sample_ofs = vec2(ivec2(h >> 16, h & 0xFFFF)) / vec2(0xFFFF); + ray_dir = octahedron_decode((vec2(local_pos) + sample_ofs) / vec2(LIGHTPROBE_OCT_SIZE)); - // Ensure a unique hash that includes the probe world position, the local octahedron pixel, and the history frame index - uvec3 h3 = hash3(uvec3((uvec3(probe_world_pos) * LIGHTPROBE_OCT_SIZE * LIGHTPROBE_OCT_SIZE + uvec3(probe_index)) * uvec3(params.history_size) + uvec3(probe_history_index))); - uint h = (h3.x ^ h3.y) ^ h3.z; - vec2 sample_ofs = vec2(ivec2(h >> 16, h & 0xFFFF)) / vec2(0xFFFF); - vec3 ray_dir = octahedron_decode((vec2(local_pos) + sample_ofs) / vec2(LIGHTPROBE_OCT_SIZE)); + ray_dir.y *= params.y_mult; + ray_dir = normalize(ray_dir); - ray_dir.y *= params.y_mult; - ray_dir = normalize(ray_dir); + // Apply bias (by a cell) + float bias = params.ray_bias; + vec3 abs_ray_dir = abs(ray_dir); + ray_pos += ray_dir * 1.0 / max(abs_ray_dir.x, max(abs_ray_dir.y, abs_ray_dir.z)) * bias / cascades.data[params.cascade].to_cell; - // Apply bias (by a cell) - float bias = params.ray_bias; - vec3 abs_ray_dir = abs(ray_dir); - ray_pos += ray_dir * 1.0 / max(abs_ray_dir.x, max(abs_ray_dir.y, abs_ray_dir.z)) * bias / cascades.data[params.cascade].to_cell; + cache_texture_pos = ivec3(probe_texture_pos.xy * LIGHTPROBE_OCT_SIZE + local_pos, probe_texture_pos.z * params.history_size + probe_history_index); + cache_entry = imageLoad(ray_hit_cache, cache_texture_pos).r; - ivec3 cache_texture_pos = ivec3(probe_texture_pos.xy * LIGHTPROBE_OCT_SIZE + local_pos, probe_texture_pos.z * params.history_size + probe_history_index); - uint cache_entry = imageLoad(ray_hit_cache, cache_texture_pos).r; + cache_valid = bool(cache_entry & CACHE_IS_VALID); - bool hit; - ivec3 hit_cell; - int hit_cascade; + cache_invalidated_debug = vec3(0.0); - bool cache_valid = bool(cache_entry & CACHE_IS_VALID); - - vec3 cache_invalidated_debug = vec3(0.0); - - if (cache_valid) { - // Make sure the cache is really valid - hit = bool(cache_entry & CACHE_IS_HIT); - uvec4 uhit = (uvec4(cache_entry) >> uvec4(0, 8, 16, 24)) & uvec4(0xFF, 0xFF, 0xFF, 0x7); - hit_cell = ivec3(uhit.xyz); - hit_cascade = int(uhit.w); - uint axis = (cache_entry >> 27) & 0x3; - if (bool((1 << axis) & params.motion_accum)) { - // There was motion in this axis, cache is no longer valid. - cache_valid = false; - cache_invalidated_debug = vec3(0, 0, 4.0); - } else if (hit) { - // Check if the region pointed to is still valid. - uint version = imageLoad(ray_hit_cache_version, cache_texture_pos).r; - uint region_version = imageLoad(region_versions, (hit_cell / REGION_SIZE) + ivec3(0, hit_cascade * (params.grid_size.y / REGION_SIZE), 0)).r; - - if (region_version != version) { + if (cache_valid) { + // Make sure the cache is really valid + hit = bool(cache_entry & CACHE_IS_HIT); + uvec4 uhit = (uvec4(cache_entry) >> uvec4(0, 8, 16, 24)) & uvec4(0xFF, 0xFF, 0xFF, 0x7); + hit_cell = ivec3(uhit.xyz); + hit_cascade = int(uhit.w); + uint axis = (cache_entry >> 27) & 0x3; + if (bool((1 << axis) & params.motion_accum)) { + // There was motion in this axis, cache is no longer valid. cache_valid = false; - cache_invalidated_debug = (hit_cascade == params.cascade) ? vec3(0.0, 4.00, 0.0) : vec3(4.0, 0, 0.0); + cache_invalidated_debug = vec3(0, 0, 4.0); + } else if (hit) { + // Check if the region pointed to is still valid. + uint version = imageLoad(ray_hit_cache_version, cache_texture_pos).r; + uint region_version = imageLoad(region_versions, (hit_cell / REGION_SIZE) + ivec3(0, hit_cascade * (params.grid_size.y / REGION_SIZE), 0)).r; + + if (region_version != version) { + cache_valid = false; + cache_invalidated_debug = (hit_cascade == params.cascade) ? vec3(0.0, 4.00, 0.0) : vec3(4.0, 0, 0.0); + } } } - } - if (!cache_valid) { - ivec3 hit_face; - hit = trace_ray_hdda(ray_pos, ray_dir, params.cascade, hit_cell, hit_face, hit_cascade); - if (hit) { - hit_cell += hit_face; + if (!cache_valid) { + ivec3 hit_face; + hit = trace_ray_hdda(ray_pos, ray_dir, params.cascade, hit_cell, hit_face, hit_cascade); + if (hit) { + hit_cell += hit_face; - ivec3 reg_cell_offset = cascades.data[hit_cascade].region_world_offset * REGION_SIZE; - hit_cell = (hit_cell + reg_cell_offset) & (params.grid_size - 1); // Read from wrapped world coordinates + ivec3 reg_cell_offset = cascades.data[hit_cascade].region_world_offset * REGION_SIZE; + hit_cell = (hit_cell + reg_cell_offset) & (params.grid_size - 1); // Read from wrapped world coordinates + } } - } - - vec3 light; - if (hit) { - ivec3 spos = hit_cell; - spos.y += hit_cascade * params.grid_size.y; - light = texelFetch(sampler3D(light_cascades, linear_sampler), spos, 0).rgb; - } else if (params.sky_mode == SKY_MODE_SKY) { + if (hit) { + ivec3 spos = hit_cell; + spos.y += hit_cascade * params.grid_size.y; + light = texelFetch(sampler3D(light_cascades, linear_sampler), spos, 0).rgb; + } else if (params.sky_mode == SKY_MODE_SKY) { #ifdef USE_CUBEMAP_ARRAY - light = textureLod(samplerCubeArray(sky_irradiance, linear_sampler_mipmaps), vec4(ray_dir, 0.0), 2.0).rgb; // Use second mipmap because we don't usually throw a lot of rays, so this compensates. + light = textureLod(samplerCubeArray(sky_irradiance, linear_sampler_mipmaps), vec4(ray_dir, 0.0), 2.0).rgb; // Use second mipmap because we don't usually throw a lot of rays, so this compensates. #else - light = textureLod(samplerCube(sky_irradiance, linear_sampler_mipmaps), ray_dir, 2.0).rgb; // Use second mipmap because we don't usually throw a lot of rays, so this compensates. + light = textureLod(samplerCube(sky_irradiance, linear_sampler_mipmaps), ray_dir, 2.0).rgb; // Use second mipmap because we don't usually throw a lot of rays, so this compensates. #endif - light *= params.sky_energy; - } else if (params.sky_mode == SKY_MODE_COLOR) { - light = params.sky_color; - light *= params.sky_energy; - } else { - light = vec3(0); + light *= params.sky_energy; + } else if (params.sky_mode == SKY_MODE_COLOR) { + light = params.sky_color; + light *= params.sky_energy; + } else { + light = vec3(0); + } } memoryBarrierShared(); barrier(); - // Plot the light to the octahedron using bilinear filtering + if (thread_active) { + // Plot the light to the octahedron using bilinear filtering #ifdef TRACE_SUBPIXEL - sample_ofs = sample_ofs * 2.0 - 1.0; - ivec2 bilinear_base = ivec2(1) + local_pos - mix(ivec2(0), ivec2(1), lessThan(sample_ofs, vec2(0))); - vec2 blend = mix(sample_ofs, 1.0 + sample_ofs, lessThan(sample_ofs, vec2(0))); - for (int i = 0; i < 2; i++) { - float i_w = i == 0 ? 1.0 - blend.y : blend.y; - for (int j = 0; j < 2; j++) { - float j_w = j == 0 ? 1.0 - blend.x : blend.x; - uint wrap_neighbour = wrap_neighbours[(bilinear_base.y + i) * (LIGHTPROBE_OCT_SIZE + 2) + (bilinear_base.x + j)]; - ivec2 write_to = ivec2(wrap_neighbour & 0xFFFF, wrap_neighbour >> 16); - int write_offset = write_to.y * LIGHTPROBE_OCT_SIZE + write_to.x; - float write_weight = i_w * j_w; - - uvec3 lightu = uvec3(clamp((light * write_weight) * float(1 << FP_BITS), 0, float(FP_MAX))); - atomicAdd(neighbours_accum[write_offset].r, lightu.r); - atomicAdd(neighbours_accum[write_offset].g, lightu.g); - atomicAdd(neighbours_accum[write_offset].b, lightu.b); + sample_ofs = sample_ofs * 2.0 - 1.0; + ivec2 bilinear_base = ivec2(1) + local_pos - mix(ivec2(0), ivec2(1), lessThan(sample_ofs, vec2(0))); + vec2 blend = mix(sample_ofs, 1.0 + sample_ofs, lessThan(sample_ofs, vec2(0))); + for (int i = 0; i < 2; i++) { + float i_w = i == 0 ? 1.0 - blend.y : blend.y; + for (int j = 0; j < 2; j++) { + float j_w = j == 0 ? 1.0 - blend.x : blend.x; + uint wrap_neighbour = wrap_neighbours[(bilinear_base.y + i) * (LIGHTPROBE_OCT_SIZE + 2) + (bilinear_base.x + j)]; + ivec2 write_to = ivec2(wrap_neighbour & 0xFFFF, wrap_neighbour >> 16); + int write_offset = write_to.y * LIGHTPROBE_OCT_SIZE + write_to.x; + float write_weight = i_w * j_w; + + uvec3 lightu = uvec3(clamp((light * write_weight) * float(1 << FP_BITS), 0, float(FP_MAX))); + atomicAdd(neighbours_accum[write_offset].r, lightu.r); + atomicAdd(neighbours_accum[write_offset].g, lightu.g); + atomicAdd(neighbours_accum[write_offset].b, lightu.b); + } } - } #else - neighbours[probe_index] = light; + neighbours[probe_index] = light; #endif - if (!cache_valid) { - cache_entry = CACHE_IS_VALID; - if (hit) { - // Determine the side of the cascade box this ray exited through, this is important for invalidation purposes. + if (!cache_valid) { + cache_entry = CACHE_IS_VALID; + if (hit) { + // Determine the side of the cascade box this ray exited through, this is important for invalidation purposes. + + vec3 unit_pos = ray_pos - cascades.data[params.cascade].offset; + unit_pos *= cascades.data[params.cascade].to_cell; + + vec3 t0 = -unit_pos / ray_dir; + vec3 t1 = (vec3(params.grid_size) - unit_pos) / ray_dir; + vec3 tmax = max(t0, t1); + + uint axis; + float m; + if (tmax.x < tmax.y) { + axis = 0; + m = tmax.x; + } else { + axis = 1; + m = tmax.y; + } + if (tmax.z < m) { + axis = 2; + } - vec3 unit_pos = ray_pos - cascades.data[params.cascade].offset; - unit_pos *= cascades.data[params.cascade].to_cell; + uvec3 ucell = (uvec3(hit_cell) & uvec3(0xFF)) << uvec3(0, 8, 16); + cache_entry |= CACHE_IS_HIT | ucell.x | ucell.y | ucell.z | (uint(min(7, hit_cascade)) << 24) | (axis << 27); - vec3 t0 = -unit_pos / ray_dir; - vec3 t1 = (vec3(params.grid_size) - unit_pos) / ray_dir; - vec3 tmax = max(t0, t1); + uint region_version = imageLoad(region_versions, (hit_cell >> REGION_SIZE) + ivec3(0, hit_cascade * (params.grid_size.y / REGION_SIZE), 0)).r; - uint axis; - float m; - if (tmax.x < tmax.y) { - axis = 0; - m = tmax.x; - } else { - axis = 1; - m = tmax.y; + imageStore(ray_hit_cache_version, cache_texture_pos, uvec4(region_version)); } - if (tmax.z < m) { - axis = 2; - } - - uvec3 ucell = (uvec3(hit_cell) & uvec3(0xFF)) << uvec3(0, 8, 16); - cache_entry |= CACHE_IS_HIT | ucell.x | ucell.y | ucell.z | (uint(min(7, hit_cascade)) << 24) | (axis << 27); - uint region_version = imageLoad(region_versions, (hit_cell >> REGION_SIZE) + ivec3(0, hit_cascade * (params.grid_size.y / REGION_SIZE), 0)).r; - - imageStore(ray_hit_cache_version, cache_texture_pos, uvec4(region_version)); + imageStore(ray_hit_cache, cache_texture_pos, uvec4(cache_entry)); } - - imageStore(ray_hit_cache, cache_texture_pos, uvec4(cache_entry)); } groupMemoryBarrier(); @@ -644,7 +653,7 @@ void main() { // convert back to float and do moving average - { + if (thread_active) { #ifdef TRACE_SUBPIXEL light = vec3(neighbours_accum[probe_index]) / float(1 << FP_BITS); #else @@ -693,69 +702,71 @@ void main() { // Compute specular, diffuse, ambient - vec3 diffuse_light = vec3(0); - vec3 specular_light = light; - - for (uint i = 0; i < neighbour_max_weights; i++) { - uint n = neighbour_weights[probe_index * neighbour_max_weights + i]; - uint index = n >> 16; - float weight = float(n & 0xFFFF) / float(0xFFFF); - diffuse_light += neighbours[index] * weight; - } - - ivec3 store_texture_pos = ivec3(probe_texture_pos.xy * (LIGHTPROBE_OCT_SIZE + 2) + ivec2(1), probe_texture_pos.z); - ivec3 probe_read_pos = store_texture_pos + ivec3(local_pos, 0); - - //if (cache_invalidated_debug!=vec3(0.0)) { - // diffuse_light = cache_invalidated_debug; - //} + if (thread_active) { + vec3 diffuse_light = vec3(0); + vec3 specular_light = light; - // Store in octahedral map - - ivec3 copy_to[4] = ivec3[](ivec3(-2, -2, -2), ivec3(-2, -2, -2), ivec3(-2, -2, -2), ivec3(-2, -2, -2)); - copy_to[0] = probe_read_pos; + for (uint i = 0; i < neighbour_max_weights; i++) { + uint n = neighbour_weights[probe_index * neighbour_max_weights + i]; + uint index = n >> 16; + float weight = float(n & 0xFFFF) / float(0xFFFF); + diffuse_light += neighbours[index] * weight; + } - if (local_pos == ivec2(0, 0)) { - copy_to[1] = store_texture_pos + ivec3(LIGHTPROBE_OCT_SIZE - 1, -1, 0); - copy_to[2] = store_texture_pos + ivec3(-1, LIGHTPROBE_OCT_SIZE - 1, 0); - copy_to[3] = store_texture_pos + ivec3(LIGHTPROBE_OCT_SIZE, LIGHTPROBE_OCT_SIZE, 0); - } else if (local_pos == ivec2(LIGHTPROBE_OCT_SIZE - 1, 0)) { - copy_to[1] = store_texture_pos + ivec3(0, -1, 0); - copy_to[2] = store_texture_pos + ivec3(LIGHTPROBE_OCT_SIZE, LIGHTPROBE_OCT_SIZE - 1, 0); - copy_to[3] = store_texture_pos + ivec3(-1, LIGHTPROBE_OCT_SIZE, 0); - } else if (local_pos == ivec2(0, LIGHTPROBE_OCT_SIZE - 1)) { - copy_to[1] = store_texture_pos + ivec3(-1, 0, 0); - copy_to[2] = store_texture_pos + ivec3(LIGHTPROBE_OCT_SIZE - 1, LIGHTPROBE_OCT_SIZE, 0); - copy_to[3] = store_texture_pos + ivec3(LIGHTPROBE_OCT_SIZE, -1, 0); - } else if (local_pos == ivec2(LIGHTPROBE_OCT_SIZE - 1, LIGHTPROBE_OCT_SIZE - 1)) { - copy_to[1] = store_texture_pos + ivec3(0, LIGHTPROBE_OCT_SIZE, 0); - copy_to[2] = store_texture_pos + ivec3(LIGHTPROBE_OCT_SIZE, 0, 0); - copy_to[3] = store_texture_pos + ivec3(-1, -1, 0); - } else if (local_pos.y == 0) { - copy_to[1] = store_texture_pos + ivec3(LIGHTPROBE_OCT_SIZE - local_pos.x - 1, local_pos.y - 1, 0); - } else if (local_pos.x == 0) { - copy_to[1] = store_texture_pos + ivec3(local_pos.x - 1, LIGHTPROBE_OCT_SIZE - local_pos.y - 1, 0); - } else if (local_pos.y == LIGHTPROBE_OCT_SIZE - 1) { - copy_to[1] = store_texture_pos + ivec3(LIGHTPROBE_OCT_SIZE - local_pos.x - 1, local_pos.y + 1, 0); - } else if (local_pos.x == LIGHTPROBE_OCT_SIZE - 1) { - copy_to[1] = store_texture_pos + ivec3(local_pos.x + 1, LIGHTPROBE_OCT_SIZE - local_pos.y - 1, 0); - } + ivec3 store_texture_pos = ivec3(probe_texture_pos.xy * (LIGHTPROBE_OCT_SIZE + 2) + ivec2(1), probe_texture_pos.z); + ivec3 probe_read_pos = store_texture_pos + ivec3(local_pos, 0); + + //if (cache_invalidated_debug!=vec3(0.0)) { + // diffuse_light = cache_invalidated_debug; + //} + + // Store in octahedral map + + ivec3 copy_to[4] = ivec3[](ivec3(-2, -2, -2), ivec3(-2, -2, -2), ivec3(-2, -2, -2), ivec3(-2, -2, -2)); + copy_to[0] = probe_read_pos; + + if (local_pos == ivec2(0, 0)) { + copy_to[1] = store_texture_pos + ivec3(LIGHTPROBE_OCT_SIZE - 1, -1, 0); + copy_to[2] = store_texture_pos + ivec3(-1, LIGHTPROBE_OCT_SIZE - 1, 0); + copy_to[3] = store_texture_pos + ivec3(LIGHTPROBE_OCT_SIZE, LIGHTPROBE_OCT_SIZE, 0); + } else if (local_pos == ivec2(LIGHTPROBE_OCT_SIZE - 1, 0)) { + copy_to[1] = store_texture_pos + ivec3(0, -1, 0); + copy_to[2] = store_texture_pos + ivec3(LIGHTPROBE_OCT_SIZE, LIGHTPROBE_OCT_SIZE - 1, 0); + copy_to[3] = store_texture_pos + ivec3(-1, LIGHTPROBE_OCT_SIZE, 0); + } else if (local_pos == ivec2(0, LIGHTPROBE_OCT_SIZE - 1)) { + copy_to[1] = store_texture_pos + ivec3(-1, 0, 0); + copy_to[2] = store_texture_pos + ivec3(LIGHTPROBE_OCT_SIZE - 1, LIGHTPROBE_OCT_SIZE, 0); + copy_to[3] = store_texture_pos + ivec3(LIGHTPROBE_OCT_SIZE, -1, 0); + } else if (local_pos == ivec2(LIGHTPROBE_OCT_SIZE - 1, LIGHTPROBE_OCT_SIZE - 1)) { + copy_to[1] = store_texture_pos + ivec3(0, LIGHTPROBE_OCT_SIZE, 0); + copy_to[2] = store_texture_pos + ivec3(LIGHTPROBE_OCT_SIZE, 0, 0); + copy_to[3] = store_texture_pos + ivec3(-1, -1, 0); + } else if (local_pos.y == 0) { + copy_to[1] = store_texture_pos + ivec3(LIGHTPROBE_OCT_SIZE - local_pos.x - 1, local_pos.y - 1, 0); + } else if (local_pos.x == 0) { + copy_to[1] = store_texture_pos + ivec3(local_pos.x - 1, LIGHTPROBE_OCT_SIZE - local_pos.y - 1, 0); + } else if (local_pos.y == LIGHTPROBE_OCT_SIZE - 1) { + copy_to[1] = store_texture_pos + ivec3(LIGHTPROBE_OCT_SIZE - local_pos.x - 1, local_pos.y + 1, 0); + } else if (local_pos.x == LIGHTPROBE_OCT_SIZE - 1) { + copy_to[1] = store_texture_pos + ivec3(local_pos.x + 1, LIGHTPROBE_OCT_SIZE - local_pos.y - 1, 0); + } - uint light_rgbe = rgbe_encode(specular_light); - uint diffuse_rgbe = rgbe_encode(diffuse_light); + uint light_rgbe = rgbe_encode(specular_light); + uint diffuse_rgbe = rgbe_encode(diffuse_light); - for (int i = 0; i < 4; i++) { - if (copy_to[i] == ivec3(-2, -2, -2)) { - continue; + for (int i = 0; i < 4; i++) { + if (copy_to[i] == ivec3(-2, -2, -2)) { + continue; + } + imageStore(lightprobe_texture_data, copy_to[i], uvec4(light_rgbe)); + imageStore(lightprobe_diffuse_data, copy_to[i], uvec4(diffuse_rgbe)); + // also to diffuse } - imageStore(lightprobe_texture_data, copy_to[i], uvec4(light_rgbe)); - imageStore(lightprobe_diffuse_data, copy_to[i], uvec4(diffuse_rgbe)); - // also to diffuse - } - if (params.store_ambient_texture && probe_index == 0) { - vec3 ambient_light = vec3(ambient_accum) / float(1 << FP_BITS); - imageStore(lightprobe_ambient_data, ivec3(probe_texture_pos.xy, params.cascade), uvec4(rgbe_encode(ambient_light))); + if (params.store_ambient_texture && probe_index == 0) { + vec3 ambient_light = vec3(ambient_accum) / float(1 << FP_BITS); + imageStore(lightprobe_ambient_data, ivec3(probe_texture_pos.xy, params.cascade), uvec4(rgbe_encode(ambient_light))); + } } #endif diff --git a/servers/rendering/renderer_rd/shaders/environment/hddagi_preprocess.glsl b/servers/rendering/renderer_rd/shaders/environment/hddagi_preprocess.glsl index e247a26661be..b56e224e2d5b 100644 --- a/servers/rendering/renderer_rd/shaders/environment/hddagi_preprocess.glsl +++ b/servers/rendering/renderer_rd/shaders/environment/hddagi_preprocess.glsl @@ -378,16 +378,13 @@ void main() { int src_index = int(gl_GlobalInvocationID).x; int local = int(gl_LocalInvocationID).x; - if (src_index >= src_dispatch_data.total_count) { - // Do not process. - return; - } - - if (src_index >= params.maximum_light_cells) { - return; - } + bool thread_active = true; // Early return deadlocks Intel, so code must avoid it. - if (local == 0) { + if (src_index >= src_dispatch_data.total_count) { + thread_active = false; + } else if (src_index >= params.maximum_light_cells) { + thread_active = false; + } else if (local == 0) { store_position_count = 0; // Base one stores as zero, the others wait if (src_index == 0) { // This lone thread clears y and z. @@ -399,52 +396,57 @@ void main() { groupMemoryBarrier(); barrier(); - ivec3 src_pos = (ivec3(src_process_voxels.data[src_index].position) >> ivec3(0, 7, 14)) & ivec3(0x7F); - bool inside_area = all(greaterThanEqual(src_pos, params.offset)) && all(lessThan(src_pos, params.limit)); + bool inside_area = false; + uint index = 0; + ivec3 src_pos; - if (!inside_area) { - ivec3 light_pos = src_pos + params.scroll; - light_pos = (light_pos + (params.region_world_pos * REGION_SIZE)) & (params.grid_size - 1); - light_pos.y += params.grid_size.y * params.cascade; + if (thread_active) { + src_pos = (ivec3(src_process_voxels.data[src_index].position) >> ivec3(0, 7, 14)) & ivec3(0x7F); + inside_area = all(greaterThanEqual(src_pos, params.offset)) && all(lessThan(src_pos, params.limit)); - // As this will be a new area, clear the new region from the old values. - imageStore(light_tex, light_pos, uvec4(0)); - } - uint index; + if (!inside_area) { + ivec3 light_pos = src_pos + params.scroll; + light_pos = (light_pos + (params.region_world_pos * REGION_SIZE)) & (params.grid_size - 1); + light_pos.y += params.grid_size.y * params.cascade; - if (inside_area) { - index = atomicAdd(store_position_count, 1); - } + // As this will be a new area, clear the new region from the old values. + imageStore(light_tex, light_pos, uvec4(0)); + } + if (inside_area) { + index = atomicAdd(store_position_count, 1); + } + } groupMemoryBarrier(); barrier(); // global increment only once per group, to reduce pressure - if (!inside_area || store_position_count == 0) { - return; - } - - if (index == 0) { - store_from_index = atomicAdd(dispatch_data.total_count, store_position_count); - uint group_count = (store_from_index + store_position_count - 1) / 64 + 1; - atomicMax(dispatch_data.x, group_count); + if (thread_active) { + if (!inside_area || store_position_count == 0) { + thread_active = false; + } else if (index == 0) { + store_from_index = atomicAdd(dispatch_data.total_count, store_position_count); + uint group_count = (store_from_index + store_position_count - 1) / 64 + 1; + atomicMax(dispatch_data.x, group_count); + } } groupMemoryBarrier(); barrier(); - index += store_from_index; - - ivec3 dst_pos = src_pos + params.scroll; + if (thread_active) { + index += store_from_index; - uint src_pending_bits = src_process_voxels.data[src_index].position & ~uint((1 << 21) - 1); + ivec3 dst_pos = src_pos + params.scroll; - dst_process_voxels.data[index].position = uint(dst_pos.x | (dst_pos.y << 7) | (dst_pos.z << 14)) | src_pending_bits; - dst_process_voxels.data[index].albedo_normal = src_process_voxels.data[src_index].albedo_normal; - dst_process_voxels.data[index].emission = src_process_voxels.data[src_index].emission; - dst_process_voxels.data[index].occlusion = src_process_voxels.data[src_index].occlusion; + uint src_pending_bits = src_process_voxels.data[src_index].position & ~uint((1 << 21) - 1); + dst_process_voxels.data[index].position = uint(dst_pos.x | (dst_pos.y << 7) | (dst_pos.z << 14)) | src_pending_bits; + dst_process_voxels.data[index].albedo_normal = src_process_voxels.data[src_index].albedo_normal; + dst_process_voxels.data[index].emission = src_process_voxels.data[src_index].emission; + dst_process_voxels.data[index].occlusion = src_process_voxels.data[src_index].occlusion; + } #endif #ifdef MODE_LIGHT_STORE @@ -472,190 +474,193 @@ void main() { } } + bool thread_active = true; // Early return deadlocks Intel, so code must avoid it. + if (any(greaterThanEqual(pos, params.limit))) { // Storing is not a multiple of the workgroup, so invalid threads can happen. - return; + thread_active = false; } groupMemoryBarrier(); barrier(); - uint solid = get_normal_facing(local); - - if (local == ivec3(0)) { - store_position_count = 0; // Base one stores as zero, the others wait - if (pos == params.offset) { - // This lone thread clears y and z. - dispatch_data.y = 1; - dispatch_data.z = 1; - } - } - vec4 albedo_accum = vec4(0.0); vec4 emission_accum = vec4(0.0); vec3 normal_accum = vec3(0.0); uint occlusionu = 0; + bool voxels_found = false; + ivec3 base_dst_pos; - //opposite to aniso dir - const ivec3 offsets[6] = ivec3[]( - ivec3(1, 0, 0), - ivec3(-1, 0, 0), - ivec3(0, 1, 0), - ivec3(0, -1, 0), - ivec3(0, 0, 1), - ivec3(0, 0, -1)); - - const vec3 aniso_dir[6] = vec3[]( - vec3(-1, 0, 0), - vec3(1, 0, 0), - vec3(0, -1, 0), - vec3(0, 1, 0), - vec3(0, 0, -1), - vec3(0, 0, 1)); - - // aniso dir in bitform - const uint aniso_mask[6] = uint[]( - (1 << 0), - (1 << 1), - (1 << 2), - (1 << 3), - (1 << 4), - (1 << 5)); - - const uint aniso_offset_mask[6] = uint[]( - (1 << 1), - (1 << 0), - (1 << 3), - (1 << 2), - (1 << 5), - (1 << 4)); + if (thread_active) { + uint solid = get_normal_facing(local); - bool voxels_found = false; - uint disocclusion = 0; + if (local == ivec3(0)) { + store_position_count = 0; // Base one stores as zero, the others wait + if (pos == params.offset) { + // This lone thread clears y and z. + dispatch_data.y = 1; + dispatch_data.z = 1; + } + } - const int facing_direction_count = 26; - const vec3 facing_directions[26] = vec3[](vec3(-1.0, 0.0, 0.0), vec3(1.0, 0.0, 0.0), vec3(0.0, -1.0, 0.0), vec3(0.0, 1.0, 0.0), vec3(0.0, 0.0, -1.0), vec3(0.0, 0.0, 1.0), vec3(-0.5773502691896258, -0.5773502691896258, -0.5773502691896258), vec3(-0.7071067811865475, -0.7071067811865475, 0.0), vec3(-0.5773502691896258, -0.5773502691896258, 0.5773502691896258), vec3(-0.7071067811865475, 0.0, -0.7071067811865475), vec3(-0.7071067811865475, 0.0, 0.7071067811865475), vec3(-0.5773502691896258, 0.5773502691896258, -0.5773502691896258), vec3(-0.7071067811865475, 0.7071067811865475, 0.0), vec3(-0.5773502691896258, 0.5773502691896258, 0.5773502691896258), vec3(0.0, -0.7071067811865475, -0.7071067811865475), vec3(0.0, -0.7071067811865475, 0.7071067811865475), vec3(0.0, 0.7071067811865475, -0.7071067811865475), vec3(0.0, 0.7071067811865475, 0.7071067811865475), vec3(0.5773502691896258, -0.5773502691896258, -0.5773502691896258), vec3(0.7071067811865475, -0.7071067811865475, 0.0), vec3(0.5773502691896258, -0.5773502691896258, 0.5773502691896258), vec3(0.7071067811865475, 0.0, -0.7071067811865475), vec3(0.7071067811865475, 0.0, 0.7071067811865475), vec3(0.5773502691896258, 0.5773502691896258, -0.5773502691896258), vec3(0.7071067811865475, 0.7071067811865475, 0.0), vec3(0.5773502691896258, 0.5773502691896258, 0.5773502691896258)); + //opposite to aniso dir + const ivec3 offsets[6] = ivec3[]( + ivec3(1, 0, 0), + ivec3(-1, 0, 0), + ivec3(0, 1, 0), + ivec3(0, -1, 0), + ivec3(0, 0, 1), + ivec3(0, 0, -1)); - bool use_for_filter = false; + const vec3 aniso_dir[6] = vec3[]( + vec3(-1, 0, 0), + vec3(1, 0, 0), + vec3(0, -1, 0), + vec3(0, 1, 0), + vec3(0, 0, -1), + vec3(0, 0, 1)); - for (int i = 0; i < 6; i++) { - uint n = get_normal_facing(local + offsets[i]); - if (n == 0) { - disocclusion |= aniso_offset_mask[i]; - } else if (solid == 0) { - use_for_filter = true; - } + // aniso dir in bitform + const uint aniso_mask[6] = uint[]( + (1 << 0), + (1 << 1), + (1 << 2), + (1 << 3), + (1 << 4), + (1 << 5)); - if (solid != 0 || !bool(n & aniso_mask[i])) { - // Not solid, continue. - continue; - } + const uint aniso_offset_mask[6] = uint[]( + (1 << 1), + (1 << 0), + (1 << 3), + (1 << 2), + (1 << 5), + (1 << 4)); - voxels_found = true; + uint disocclusion = 0; - for (int j = 0; j < facing_direction_count; j++) { - if (bool(n & uint((1 << (j + 6))))) { - normal_accum += facing_directions[j]; - } - } + const int facing_direction_count = 26; + const vec3 facing_directions[26] = vec3[](vec3(-1.0, 0.0, 0.0), vec3(1.0, 0.0, 0.0), vec3(0.0, -1.0, 0.0), vec3(0.0, 1.0, 0.0), vec3(0.0, 0.0, -1.0), vec3(0.0, 0.0, 1.0), vec3(-0.5773502691896258, -0.5773502691896258, -0.5773502691896258), vec3(-0.7071067811865475, -0.7071067811865475, 0.0), vec3(-0.5773502691896258, -0.5773502691896258, 0.5773502691896258), vec3(-0.7071067811865475, 0.0, -0.7071067811865475), vec3(-0.7071067811865475, 0.0, 0.7071067811865475), vec3(-0.5773502691896258, 0.5773502691896258, -0.5773502691896258), vec3(-0.7071067811865475, 0.7071067811865475, 0.0), vec3(-0.5773502691896258, 0.5773502691896258, 0.5773502691896258), vec3(0.0, -0.7071067811865475, -0.7071067811865475), vec3(0.0, -0.7071067811865475, 0.7071067811865475), vec3(0.0, 0.7071067811865475, -0.7071067811865475), vec3(0.0, 0.7071067811865475, 0.7071067811865475), vec3(0.5773502691896258, -0.5773502691896258, -0.5773502691896258), vec3(0.7071067811865475, -0.7071067811865475, 0.0), vec3(0.5773502691896258, -0.5773502691896258, 0.5773502691896258), vec3(0.7071067811865475, 0.0, -0.7071067811865475), vec3(0.7071067811865475, 0.0, 0.7071067811865475), vec3(0.5773502691896258, 0.5773502691896258, -0.5773502691896258), vec3(0.7071067811865475, 0.7071067811865475, 0.0), vec3(0.5773502691896258, 0.5773502691896258, 0.5773502691896258)); - ivec3 ofs = pos + offsets[i]; - //normal_accum += aniso_dir[i]; + bool use_for_filter = false; - ivec3 albedo_ofs = ofs >> 1; - albedo_ofs.z *= 6; - albedo_ofs.z += i; + for (int i = 0; i < 6; i++) { + uint n = get_normal_facing(local + offsets[i]); + if (n == 0) { + disocclusion |= aniso_offset_mask[i]; + } else if (solid == 0) { + use_for_filter = true; + } - uint a = imageLoad(src_albedo, albedo_ofs).r; - albedo_accum += vec4(vec3((ivec3(a) >> ivec3(0, 5, 11)) & ivec3(0x1f, 0x3f, 0x1f)) / vec3(31.0, 63.0, 31.0), 1.0); + if (solid != 0 || !bool(n & aniso_mask[i])) { + // Not solid, continue. + continue; + } - uint rgbe = imageLoad(src_emission, ofs >> 1).r; + voxels_found = true; - vec3 emission = rgbe_decode(rgbe); + for (int j = 0; j < facing_direction_count; j++) { + if (bool(n & uint((1 << (j + 6))))) { + normal_accum += facing_directions[j]; + } + } - uint rgbe_aniso = imageLoad(src_emission_aniso, ofs >> 1).r; - float strength = ((rgbe_aniso >> (i * 5)) & 0x1F) / float(0x1F); - emission_accum += vec4(emission * strength, 1.0); - } + ivec3 ofs = pos + offsets[i]; + //normal_accum += aniso_dir[i]; - ivec3 base_dst_pos = (pos + params.region_world_pos * REGION_SIZE) & (params.grid_size - 1); - ivec3 dst_pos = base_dst_pos + params.grid_size.y * params.cascade; - imageStore(dst_disocclusion, dst_pos, uvec4(disocclusion)); + ivec3 albedo_ofs = ofs >> 1; + albedo_ofs.z *= 6; + albedo_ofs.z += i; - if (solid != 0) { - return; // No further use for this. - } + uint a = imageLoad(src_albedo, albedo_ofs).r; + albedo_accum += vec4(vec3((ivec3(a) >> ivec3(0, 5, 11)) & ivec3(0x1f, 0x3f, 0x1f)) / vec3(31.0, 63.0, 31.0), 1.0); - if (use_for_filter) { - uint neighbour_voxels = 0; + uint rgbe = imageLoad(src_emission, ofs >> 1).r; - for (int i = 0; i < facing_direction_count; i++) { - ivec3 neighbour = ivec3(sign(facing_directions[i])); - ivec3 neighbour_pos = local + neighbour; - uint n = get_normal_facing(neighbour_pos); - if (n == 0) { - continue; // Nothing here - } + vec3 emission = rgbe_decode(rgbe); - for (int j = 0; j < 6; j++) { - //if (!bool(n&(1<> 1).r; + float strength = ((rgbe_aniso >> (i * 5)) & 0x1F) / float(0x1F); + emission_accum += vec4(emission * strength, 1.0); + } - if (any(lessThan(nn_rel, -ivec3(1))) || any(greaterThan(nn_rel, +ivec3(1)))) { - continue; // Too far away, ignore. + base_dst_pos = (pos + params.region_world_pos * REGION_SIZE) & (params.grid_size - 1); + ivec3 dst_pos = base_dst_pos + params.grid_size.y * params.cascade; + imageStore(dst_disocclusion, dst_pos, uvec4(disocclusion)); + + if (solid != 0) { + thread_active = false; // No further use for this, this is a solid voxel. + } else if (use_for_filter) { + uint neighbour_voxels = 0; + + for (int i = 0; i < facing_direction_count; i++) { + ivec3 neighbour = ivec3(sign(facing_directions[i])); + ivec3 neighbour_pos = local + neighbour; + uint n = get_normal_facing(neighbour_pos); + if (n == 0) { + continue; // Nothing here } - if (nn_rel == ivec3(0)) { - continue; // Point to itself, ignore. - } + for (int j = 0; j < 6; j++) { + //if (!bool(n&(1< 1) { - // must make sure we are not occluded towards this - ivec3 test_dirs[3] = ivec3[](ivec3(nn_rel.x, 0, 0), ivec3(0, nn_rel.y, 0), ivec3(0, 0, nn_rel.z)); - int occlusions = 0; - for (int k = 0; k < 3; k++) { - if (test_dirs[k] == ivec3(0)) { - continue; // Direction not used + uint q = get_normal_facing(local + nn_rel); + if (q != 0) { + continue; // Points to a solid block (can happen), Ignore. + } + + ivec3 nn_rel_abs = abs(nn_rel); + + int nn_steps = nn_rel_abs.x + nn_rel_abs.y + nn_rel_abs.z; + if (nn_steps == 3) { + continue; + } + if (nn_steps > 1) { + // must make sure we are not occluded towards this + ivec3 test_dirs[3] = ivec3[](ivec3(nn_rel.x, 0, 0), ivec3(0, nn_rel.y, 0), ivec3(0, 0, nn_rel.z)); + int occlusions = 0; + for (int k = 0; k < 3; k++) { + if (test_dirs[k] == ivec3(0)) { + continue; // Direction not used + } + + q = get_normal_facing(local + test_dirs[k]); + if (q != 0) { + occlusions++; + } } - q = get_normal_facing(local + test_dirs[k]); - if (q != 0) { - occlusions++; + if (occlusions >= 2) { + continue; // Occluded from here, ignore. May be unoccluded from another neighbour. } } - if (occlusions >= 2) { - continue; // Occluded from here, ignore. May be unoccluded from another neighbour. - } + const uint reverse_map[27] = uint[](6, 14, 18, 9, 4, 21, 11, 16, 23, 7, 2, 19, 0, 0, 1, 12, 3, 24, 8, 15, 20, 10, 5, 22, 13, 17, 25); + ivec3 abs_pos = nn_rel + ivec3(1); + // All good, this is a valid neighbour! + neighbour_voxels |= 1 << reverse_map[abs_pos.z * 3 * 3 + abs_pos.y * 3 + abs_pos.x]; } - - const uint reverse_map[27] = uint[](6, 14, 18, 9, 4, 21, 11, 16, 23, 7, 2, 19, 0, 0, 1, 12, 3, 24, 8, 15, 20, 10, 5, 22, 13, 17, 25); - ivec3 abs_pos = nn_rel + ivec3(1); - // All good, this is a valid neighbour! - neighbour_voxels |= 1 << reverse_map[abs_pos.z * 3 * 3 + abs_pos.y * 3 + abs_pos.x]; } - } - ivec3 store_pos = (pos + params.region_world_pos * REGION_SIZE) & (params.grid_size - ivec3(1)); - store_pos.y += params.grid_size.y * params.cascade; - imageStore(voxel_neighbours, store_pos, uvec4(neighbour_voxels)); - if (!voxels_found) { - // Light voxels won't be stored here, but still ensure this is black to avoid light leaking from outside. - imageStore(light_tex, store_pos, uvec4(0)); + ivec3 store_pos = (pos + params.region_world_pos * REGION_SIZE) & (params.grid_size - ivec3(1)); + store_pos.y += params.grid_size.y * params.cascade; + imageStore(voxel_neighbours, store_pos, uvec4(neighbour_voxels)); + if (!voxels_found) { + // Light voxels won't be stored here, but still ensure this is black to avoid light leaking from outside. + imageStore(light_tex, store_pos, uvec4(0)); + } } } @@ -664,29 +669,30 @@ void main() { uint index; - if (voxels_found) { + if (thread_active && voxels_found) { index = atomicAdd(store_position_count, 1); } groupMemoryBarrier(); barrier(); - if (!voxels_found || store_position_count == 0) { - return; - } - - // global increment only once per group, to reduce pressure - - if (index == 0) { - store_from_index = atomicAdd(dispatch_data.total_count, store_position_count); - uint group_count = (store_from_index + store_position_count - 1) / 64 + 1; - atomicMax(dispatch_data.x, group_count); + if (thread_active) { + if (!voxels_found || store_position_count == 0) { + thread_active = false; + } else { + // global increment only once per group, to reduce pressure + if (thread_active && index == 0) { + store_from_index = atomicAdd(dispatch_data.total_count, store_position_count); + uint group_count = (store_from_index + store_position_count - 1) / 64 + 1; + atomicMax(dispatch_data.x, group_count); + } + } } groupMemoryBarrier(); barrier(); - { + if (thread_active) { // compute occlusion ivec3 base_probe = params.region_world_pos + pos / PROBE_CELLS; @@ -731,33 +737,31 @@ void main() { } occlusionu |= uint(clamp(w, 0.0, 15.0)) << (i * 4); } - } - index += store_from_index; + index += store_from_index; - if (index >= params.maximum_light_cells) { - return; - } - - normal_accum = normalize(normal_accum); - albedo_accum.rgb /= albedo_accum.a; - emission_accum.rgb /= emission_accum.a; + if (index < params.maximum_light_cells) { + normal_accum = normalize(normal_accum); + albedo_accum.rgb /= albedo_accum.a; + emission_accum.rgb /= emission_accum.a; - dst_process_voxels.data[index].position = uint(pos.x | (pos.y << 7) | (pos.z << 14)) | PROCESS_STATIC_PENDING_BIT | PROCESS_DYNAMIC_PENDING_BIT; + dst_process_voxels.data[index].position = uint(pos.x | (pos.y << 7) | (pos.z << 14)) | PROCESS_STATIC_PENDING_BIT | PROCESS_DYNAMIC_PENDING_BIT; - uint albedo_norm = 0; - albedo_norm |= clamp(uint(albedo_accum.r * 31.0), 0, 31) << 0; - albedo_norm |= clamp(uint(albedo_accum.g * 63.0), 0, 63) << 5; - albedo_norm |= clamp(uint(albedo_accum.b * 31.0), 0, 31) << 11; + uint albedo_norm = 0; + albedo_norm |= clamp(uint(albedo_accum.r * 31.0), 0, 31) << 0; + albedo_norm |= clamp(uint(albedo_accum.g * 63.0), 0, 63) << 5; + albedo_norm |= clamp(uint(albedo_accum.b * 31.0), 0, 31) << 11; - vec2 octa_normal = octahedron_encode(normal_accum); - uvec2 octa_unormal = clamp(uvec2(octa_normal * 255), uvec2(0), uvec2(255)); - albedo_norm |= (octa_unormal.x << 16) | (octa_unormal.y << 24); + vec2 octa_normal = octahedron_encode(normal_accum); + uvec2 octa_unormal = clamp(uvec2(octa_normal * 255), uvec2(0), uvec2(255)); + albedo_norm |= (octa_unormal.x << 16) | (octa_unormal.y << 24); - dst_process_voxels.data[index].albedo_normal = albedo_norm; - dst_process_voxels.data[index].emission = rgbe_encode(emission_accum.rgb); + dst_process_voxels.data[index].albedo_normal = albedo_norm; + dst_process_voxels.data[index].emission = rgbe_encode(emission_accum.rgb); - dst_process_voxels.data[index].occlusion = occlusionu; + dst_process_voxels.data[index].occlusion = occlusionu; + } + } // Compute probe neighbours