Proof of concept fix for Async GPU Readbacks #87850

granitrocky · 2024-02-02T02:59:29Z

I have an implementation and a basic demo to show how to achieve Asynchronous Compute Shader execution with Vulkan Timeline Semaphores.

Currently Vulkan specific just to get the ball rolling

And here's a repurposed ray tracing project that demonstrates how my changes can avoid locking up the CPU while waiting for Compute Shaders to finish execution:
https://github.com/granitrocky/Raytracing_Godot4

Bugsquad edit, related: Add support for asynchronous RenderingDevice.buffer_get_data() (hardware readbacks) godot-proposals#7886

AThousandShips · 2024-02-02T10:22:39Z

drivers/vulkan/vulkan_context.cpp

@@ -42,6 +42,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <cstdint>


Suggested change

#include <cstdint>

Already included (even in the header, only include what's necessary)

AThousandShips · 2024-02-02T10:22:55Z

drivers/vulkan/vulkan_context.h

@@ -40,6 +40,7 @@
 #include "rendering_device_driver_vulkan.h"
 #include "servers/display_server.h"
 #include "servers/rendering/renderer_rd/api_context_rd.h"
+#include <cstdint>


Suggested change

#include <cstdint>

AThousandShips · 2024-02-02T10:23:54Z

servers/rendering/rendering_device.cpp

@@ -29,6 +29,7 @@
 /**************************************************************************/

 #include "rendering_device.h"
+#include "core/os/thread_safe.h"


Suggested change

#include "core/os/thread_safe.h"

AThousandShips · 2024-02-02T10:24:45Z

servers/rendering/rendering_device.cpp

+bool RenderingDevice::check_status(){
+	_THREAD_SAFE_METHOD_
+
+        return context->local_device_check_status(local_device);


Suggested change

return context->local_device_check_status(local_device);

return context->local_device_check_status(local_device);

AThousandShips · 2024-02-02T10:25:16Z

servers/rendering/rendering_device.h

@@ -1333,6 +1333,7 @@ class RenderingDevice : public RenderingDeviceCommons {

 	void submit();
 	void sync();
+    bool check_status();


Suggested change

bool check_status();

bool check_status();

AThousandShips · 2024-02-02T10:29:19Z

drivers/vulkan/vulkan_context.cpp

+    uint64_t out = 0;
+    vkGetSemaphoreCounterValue(ld->device, compute_semaphore[0], &out);
+    return out >= compute_semaphore_signal_value;
+}
+


Suggested change

uint64_t out = 0;

vkGetSemaphoreCounterValue(ld->device, compute_semaphore[0], &out);

return out >= compute_semaphore_signal_value;

}

uint64_t out = 0;

vkGetSemaphoreCounterValue(ld->device, compute_semaphore[0], &out);

return out >= compute_semaphore_signal_value;

}

AThousandShips · 2024-02-02T10:29:55Z

drivers/vulkan/vulkan_context.h

@@ -133,6 +134,8 @@ class VulkanContext : public ApiContextRD {
 	VkFormat format;
 	VkSemaphore draw_complete_semaphores[FRAME_LAG];
 	VkSemaphore image_ownership_semaphores[FRAME_LAG];
+	VkSemaphore compute_semaphore[1];
+    uint64_t compute_semaphore_signal_value = 0;


Suggested change

uint64_t compute_semaphore_signal_value = 0;

uint64_t compute_semaphore_signal_value = 0;

AThousandShips · 2024-02-02T10:30:26Z

drivers/vulkan/vulkan_context.cpp

-	submit_info.pSignalSemaphores = nullptr;
+	submit_info.signalSemaphoreCount = 1;
+	submit_info.pSignalSemaphores = &compute_semaphore[0];
+


Suggested change

AThousandShips · 2024-02-02T10:30:59Z

drivers/vulkan/vulkan_context.cpp

+    err = vkCreateSemaphore(device, &computeSemaphoreCreateInfo, nullptr, &compute_semaphore[0]);
+		ERR_FAIL_COND_V(err, ERR_CANT_CREATE);


Suggested change

err = vkCreateSemaphore(device, &computeSemaphoreCreateInfo, nullptr, &compute_semaphore[0]);

ERR_FAIL_COND_V(err, ERR_CANT_CREATE);

err = vkCreateSemaphore(device, &computeSemaphoreCreateInfo, nullptr, &compute_semaphore[0]);

ERR_FAIL_COND_V(err, ERR_CANT_CREATE);

AThousandShips · 2024-02-02T10:32:06Z

drivers/vulkan/vulkan_context.cpp

+    // Every time a compute call finishes, we'll increment the value
+    // of this semaphore by one, so we can check the status
+    VkSemaphoreTypeCreateInfo timelineCreateInfo = {
+        /*sType*/ VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO,
+        /*pNext*/ nullptr,
+        /*semaphoreType*/ VK_SEMAPHORE_TYPE_TIMELINE,
+        /*initialValue*/ 0,
+    };


Suggested change

// Every time a compute call finishes, we'll increment the value

// of this semaphore by one, so we can check the status

VkSemaphoreTypeCreateInfo timelineCreateInfo = {

/*sType*/ VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO,

/*pNext*/ nullptr,

/*semaphoreType*/ VK_SEMAPHORE_TYPE_TIMELINE,

/*initialValue*/ 0,

};

// Every time a compute call finishes, we'll increment the value

// of this semaphore by one, so we can check the status.

VkSemaphoreTypeCreateInfo timelineCreateInfo = {

/*sType*/ VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO,

/*pNext*/ nullptr,

/*semaphoreType*/ VK_SEMAPHORE_TYPE_TIMELINE,

/*initialValue*/ 0,

};

AThousandShips · 2024-02-02T10:33:47Z

servers/rendering/rendering_device.cpp

@@ -4659,6 +4660,12 @@ void RenderingDevice::sync() {
 	local_device_processing = false;
 }

+bool RenderingDevice::check_status(){


Suggested change

bool RenderingDevice::check_status(){

bool RenderingDevice::check_status() {

AThousandShips · 2024-02-02T10:37:04Z

drivers/vulkan/vulkan_context.cpp

@@ -2813,8 +2843,18 @@ void VulkanContext::local_device_sync(RID p_local_device) {

 	vkDeviceWaitIdle(ld->device);
 	ld->waiting = false;
+


Suggested change

DarioSamo · 2024-02-02T12:12:05Z

This is an interesting PR but we might run into conflicts since we intend to merge #87340 and that changes the entire design around RD being able to create semaphores and fences freely. I'd recommend looking into it as it probably would save you quite a bit of work.

That said I find the use of timeline semaphores interesting. These are not guaranteed to be supported so you must check for support I'm afraid, so they can't just be created without checking for that. But judging by the way they're being used in the PR, I get the feeling they might not be strictly necessary.

If you feel the use of timeline semaphores will be required in the future we could probably take that into account in the redesign of the PR I mentioned (as they're guaranteed in D3D12 but in Vulkan they must be queried for support). That's why the RHI goes for the lowest common denominator. Do you think it might be possible to achieve the effect you're looking for without using them?

granitrocky · 2024-02-02T13:43:42Z

I tried to find a way of doing this with a simple boolean semaphore, but from what I could read in the API, there's no non-blocking way of reading a boolean semaphore value.

If there is, a lot of this complexity goes away, so I'd be happy for that.

granitrocky · 2024-02-02T14:42:36Z

@DarioSamo

I'm going to take some time and see if I can get this to sit on top of #87340

Hopefully they play nicely together.

DarioSamo · 2024-02-02T15:25:41Z

I tried to find a way of doing this with a simple boolean semaphore, but from what I could read in the API, there's no non-blocking way of reading a boolean semaphore value.

If there is, a lot of this complexity goes away, so I'd be happy for that.

Fences actually allow a non-blocking (although not free) way to check their status, so if you only want to check if a command buffer in particular is finished, that could be a pretty cheap way to go about it. Considering the fact a local device does not allow more than one simultaneous command buffer, if you have to wait for a fence you can assume all work assigned to the previous buffer must be complete already, so you wouldn't need a counter for it either.

Keep in mind this is not me being strict for the sake of it, it's just that since Godot aims to be as compatible as possible, you might find some resistance in using features newer than 1.0 as they might not be available in the devices it aims to support. Therefore anything that needs a particular version supported must be explicitly checked and a suitable alternative path must be taken if support is not present (if possible). For something as core as this, if we can find a way to work around it the better.

I'm going to take some time and see if I can get this to sit on top of #87340
Hopefully they play nicely together.

Thanks! My suggestion mostly comes from the fact if it sits on top of the RHI, you won't need to re-implement it for all backends (since D3D12 is already in master and Metal is also coming at a later point).

granitrocky · 2024-02-02T18:36:40Z

@DarioSamo I totally understand. This was just a first pass attempt to see what gets the wheels spinning in people's heads since I almost certainly don't have the best approach.

Here's a bit of an architecture question. Are compute shaders run in the same command buffer as the render/swap chains? Because the use case for this PR is a compute shader that runs over multiple frames needing to be synchronized only after its work has completed.

Calinou · 2024-02-02T18:42:12Z

Keep in mind this is not me being strict for the sake of it, it's just that since Godot aims to be as compatible as possible, you might find some resistance in using features newer than 1.0 as they might not be available in the devices it aims to support.

To clarify, it's more of an issue of vendor support (check gpuinfo.org) than Vulkan version. Pretty much all devices where Vulkan is worth using support the 1.1 specification, but they don't always support the extensions you wish to use. On desktop, 1.2 is pretty much everywhere (catering to core 1.1 is only needed on Android). If supporting old Intel IGPs or outdated graphics drivers isn't required, then core 1.3 is pretty much universal too.

There's also the issue that we need to use the "lowest common denominator" that works across Vulkan, Direct3D 12 and (soon) Metal. A feature that is only available in Vulkan without counterparts available in other APIs is much more difficult to use (see the workaround that was required for specialization constants in Direct3D 12).

DarioSamo · 2024-02-02T18:48:21Z

Here's a bit of an architecture question. Are compute shaders run in the same command buffer as the render/swap chains? Because the use case for this PR is a compute shader that runs over multiple frames needing to be synchronized only after its work has completed.

That is currently the case. There's no architecture for allowing compute lists to run in parallel at the moment as resource tracking and the render graph are only guaranteed to work correctly when called from the main thread. Any work queued up from those is run in the command buffer used for rendering and is expected to finish before presenting to the swap chain. Therefore, if you queue up some expensive compute work, it will stall the rendering.

The use case you describe can be implemented very easily on top of the PR I posted (and in fact #87590 takes an approach like that to implement asynchronous transfer queues that run in parallel to the main ones). I can't say however what the ideal interface for accessing this would be yet (we have a lot of thread restrictions to enforce and resource tracking would need to be kept local to the context).

EDIT: What I said above doesn't apply as such to local device, which carries the cost of creating a new Vulkan device altogether but allows you to run stuff independently of the main rendering driver. However, if you intend to share resources in GPU memory, that is not the ideal approach.

granitrocky · 2024-02-02T20:37:00Z

@DarioSamo I think I follow. The actual dispatch of the compute payload would need to be done in a separate command buffer altogether, which your PR allows.

So since the use case here is that we don't actually want to transfer anything, just run a check that the compute workload is done, that should be able to be done atomically. In the linked project, you can see that I'm just checking every frame whether the workload is done, and if so, call RD.sync()

Maybe my use case is too simplistic to even really offer many benefits, and my efforts may be better spent working with PR #87590

DarioSamo · 2024-02-02T20:48:20Z

So since the use case here is that we don't actually want to transfer anything, just run a check that the compute workload is done, that should be able to be done atomically. In the linked project, you can see that I'm just checking every frame whether the workload is done, and if so, call RD.sync()

Your linked project uses local device, so it's a bit of a different story. On local devices, the command buffers are local to it and are not shared with the Rendering Device used for drawing graphics to the screen at all.

Given your particular use case now that I see it, I don't think you'd need to use timeline semaphores at all. Your current logic could be implemented if you call vkGetFenceStatus on the vkFence object for a particular 'frame'. In the case of local devices, 'frames' only ever holds one frame, so you'd just be checking if the frame has finished executing without doing an active wait on it. Note that these frames have no actual relation to the frames in the queue of the main rendering device instance.

My other suggestions mostly apply if you wish to look into a solution that uses the existing RenderingDevice without creating a local one, as that comes with extra cost and limitations for sharing resources between them.

granitrocky · 2024-02-02T21:01:52Z

Ah okay. So "local device" here means that I'm creating my own local RenderingDevice? As opposed to using the singleton version?

DarioSamo · 2024-02-02T21:03:05Z

Ah okay. So "local device" here means that I'm creating my own local RenderingDevice? As opposed to using the singleton version?

Yup. Therefore all resources you create there can't be shared with the Singleton version. That may or may not be useful to you (we use this on the offline lightmapper for example).

granitrocky · 2024-02-02T21:06:01Z

Okay cool. I appreciate your notes. I'll try and see whether #87590 exposes what I'm looking for and whether I have anything to contribute.

QuinnLeavitt · 2024-06-20T21:36:21Z

Is this still being actively worked on?

granitrocky · 2024-07-11T21:21:08Z

@QuinnLeavitt Unfortunately I haven't had time to see how much work it would be to get this to work with #87590 .

I would love to take another crack at this when I have some time.

Proof of concept for Async GPU usage

b1a1e49

granitrocky requested a review from a team as a code owner February 2, 2024 02:59

AThousandShips changed the title ~~Proof of concept fix for Async GPU Readbacks https://github.com/godotengine/godot-proposals/issues/7886~~ Proof of concept fix for Async GPU Readbacks Feb 2, 2024

AThousandShips added enhancement feature proposal topic:rendering labels Feb 2, 2024

AThousandShips added this to the 4.x milestone Feb 2, 2024

AThousandShips reviewed Feb 2, 2024

View reviewed changes

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Proof of concept fix for Async GPU Readbacks #87850

Proof of concept fix for Async GPU Readbacks #87850

granitrocky commented Feb 2, 2024 •

edited by AThousandShips

Loading

AThousandShips Feb 2, 2024

AThousandShips Feb 2, 2024

AThousandShips Feb 2, 2024

AThousandShips Feb 2, 2024

AThousandShips Feb 2, 2024

AThousandShips Feb 2, 2024

AThousandShips Feb 2, 2024

AThousandShips Feb 2, 2024

AThousandShips Feb 2, 2024

AThousandShips Feb 2, 2024 •

edited

Loading

AThousandShips Feb 2, 2024

AThousandShips Feb 2, 2024

DarioSamo commented Feb 2, 2024 •

edited

Loading

granitrocky commented Feb 2, 2024

granitrocky commented Feb 2, 2024

DarioSamo commented Feb 2, 2024 •

edited

Loading

granitrocky commented Feb 2, 2024

Calinou commented Feb 2, 2024 •

edited

Loading

DarioSamo commented Feb 2, 2024 •

edited

Loading

granitrocky commented Feb 2, 2024

DarioSamo commented Feb 2, 2024 •

edited

Loading

granitrocky commented Feb 2, 2024

DarioSamo commented Feb 2, 2024

granitrocky commented Feb 2, 2024

QuinnLeavitt commented Jun 20, 2024

granitrocky commented Jul 11, 2024

	return context->local_device_check_status(local_device);
	return context->local_device_check_status(local_device);

	uint64_t compute_semaphore_signal_value = 0;
	uint64_t compute_semaphore_signal_value = 0;

		err = vkCreateSemaphore(device, &computeSemaphoreCreateInfo, nullptr, &compute_semaphore[0]);
		ERR_FAIL_COND_V(err, ERR_CANT_CREATE);

	bool RenderingDevice::check_status(){
	bool RenderingDevice::check_status() {

		@@ -2813,8 +2843,18 @@ void VulkanContext::local_device_sync(RID p_local_device) {

		vkDeviceWaitIdle(ld->device);
		ld->waiting = false;

Proof of concept fix for Async GPU Readbacks #87850

Are you sure you want to change the base?

Proof of concept fix for Async GPU Readbacks #87850

Conversation

granitrocky commented Feb 2, 2024 • edited by AThousandShips Loading

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

AThousandShips Feb 2, 2024 • edited Loading

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

DarioSamo commented Feb 2, 2024 • edited Loading

granitrocky commented Feb 2, 2024

granitrocky commented Feb 2, 2024

DarioSamo commented Feb 2, 2024 • edited Loading

granitrocky commented Feb 2, 2024

Calinou commented Feb 2, 2024 • edited Loading

DarioSamo commented Feb 2, 2024 • edited Loading

granitrocky commented Feb 2, 2024

DarioSamo commented Feb 2, 2024 • edited Loading

granitrocky commented Feb 2, 2024

DarioSamo commented Feb 2, 2024

granitrocky commented Feb 2, 2024

QuinnLeavitt commented Jun 20, 2024

granitrocky commented Jul 11, 2024

granitrocky commented Feb 2, 2024 •

edited by AThousandShips

Loading

AThousandShips Feb 2, 2024 •

edited

Loading

DarioSamo commented Feb 2, 2024 •

edited

Loading

DarioSamo commented Feb 2, 2024 •

edited

Loading

Calinou commented Feb 2, 2024 •

edited

Loading

DarioSamo commented Feb 2, 2024 •

edited

Loading

DarioSamo commented Feb 2, 2024 •

edited

Loading