Skip to content

Commit

Permalink
Bitmask lookup table based supersampling
Browse files Browse the repository at this point in the history
Many thanks to Venemo on #dri-devel for helping out with instruction-level parallelism.

The tiles can now be divided in both x and y directions, which allows for:
- Less divergence on typical inputs
- Faster texture stores through coalescing
  • Loading branch information
ishitatsuyuki committed Apr 4, 2021
1 parent 678bfed commit 2057f9c
Show file tree
Hide file tree
Showing 12 changed files with 417 additions and 233 deletions.
4 changes: 3 additions & 1 deletion piet-gpu-hal/src/hub.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ use crate::vulkan;
use crate::DescriptorSetBuilder as DescriptorSetBuilderTrait;
use crate::PipelineBuilder as PipelineBuilderTrait;
use crate::{Device, Error, SamplerParams};
use crate::vulkan::Format;

pub type MemFlags = <vulkan::VkDevice as Device>::MemFlags;
pub type Semaphore = <vulkan::VkDevice as Device>::Semaphore;
Expand Down Expand Up @@ -152,9 +153,10 @@ impl Session {
&self,
width: u32,
height: u32,
format: Format,
mem_flags: MemFlags,
) -> Result<Image, Error> {
let image = self.0.device.create_image2d(width, height, mem_flags)?;
let image = self.0.device.create_image2d(width, height, format, mem_flags)?;
Ok(Image(Arc::new(ImageInner {
image,
session: Arc::downgrade(&self.0),
Expand Down
3 changes: 3 additions & 0 deletions piet-gpu-hal/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
use crate::vulkan::Format;

/// The cross-platform abstraction for a GPU device.
///
/// This abstraction is inspired by gfx-hal, but is specialized to the needs of piet-gpu.
Expand Down Expand Up @@ -57,6 +59,7 @@ pub trait Device: Sized {
&self,
width: u32,
height: u32,
format: Format,
mem_flags: Self::MemFlags,
) -> Result<Self::Image, Error>;

Expand Down
6 changes: 4 additions & 2 deletions piet-gpu-hal/src/vulkan.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ use ash::{vk, Device, Entry, Instance};
use once_cell::sync::Lazy;

use crate::{Device as DeviceTrait, Error, ImageLayout, SamplerParams};
pub use ash::vk::Format;

pub struct VkInstance {
/// Retain the dynamic lib.
Expand Down Expand Up @@ -460,6 +461,7 @@ impl crate::Device for VkDevice {
&self,
width: u32,
height: u32,
format: Format,
mem_flags: Self::MemFlags,
) -> Result<Self::Image, Error> {
let device = &self.device.device;
Expand All @@ -476,7 +478,7 @@ impl crate::Device for VkDevice {
let image = device.create_image(
&vk::ImageCreateInfo::builder()
.image_type(vk::ImageType::TYPE_2D)
.format(vk::Format::R8G8B8A8_UNORM)
.format(format)
.extent(extent)
.mip_levels(1)
.array_layers(1)
Expand Down Expand Up @@ -505,7 +507,7 @@ impl crate::Device for VkDevice {
&vk::ImageViewCreateInfo::builder()
.view_type(vk::ImageViewType::TYPE_2D)
.image(image)
.format(vk::Format::R8G8B8A8_UNORM)
.format(format)
.subresource_range(vk::ImageSubresourceRange {
aspect_mask: vk::ImageAspectFlags::COLOR,
base_mip_level: 0,
Expand Down
6 changes: 2 additions & 4 deletions piet-gpu-types/src/ptcl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,6 @@ piet_gpu! {
index: u32,
offset: [i16; 2],
}
struct CmdAlpha {
alpha: f32,
}
struct CmdJump {
new_ref: u32,
}
Expand All @@ -32,12 +29,13 @@ piet_gpu! {
Fill(CmdFill),
Stroke(CmdStroke),
Solid,
Alpha(CmdAlpha),
Color(CmdColor),
Image(CmdImage),
BeginClip,
EndClip,
Jump(CmdJump),
SaveStencil,
RestoreStencil,
}
}
}
1 change: 0 additions & 1 deletion piet-gpu-types/src/scene.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ piet_gpu! {
}
struct Clip {
bbox: [f32; 4],
// TODO: add alpha?
}
struct SetFillMode {
fill_mode: u32,
Expand Down
171 changes: 106 additions & 65 deletions piet-gpu/shader/coarse.comp
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,16 @@ Alloc read_tile_alloc(uint el_ix) {
#endif

// The maximum number of commands per annotated element.
#define ANNO_COMMANDS 2
#define ANNO_COMMANDS 3

// Perhaps cmd_alloc should be a global? This is a style question.
bool alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit) {
Alloc cmd_alloc;
Alloc alpha_cmd_alloc;
CmdRef cmd_ref;
CmdRef alpha_cmd_ref;
uint cmd_limit;
uint alpha_cmd_limit;

bool alloc_cmd() {
if (cmd_ref.offset < cmd_limit) {
return true;
}
Expand All @@ -91,6 +97,62 @@ bool alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit
return true;
}

// Make sure the reverse ("alpha") command list has room for another element.
//
// This list is filled downwards: writers decrement alpha_cmd_ref.offset before
// storing a command. Returns true if the current chunk still has room or a new
// chunk was obtained, and false if malloc() failed.
bool alloc_cmd_rev() {
// Write cursor has not dropped below the limit yet: nothing to do.
if (alpha_cmd_ref.offset >= alpha_cmd_limit) {
return true;
}
MallocResult new_cmd = malloc(PTCL_INITIAL_ALLOC);
if (new_cmd.failed) {
return false;
}
// Remember the current cursor of the old chunk; the jump stored in the new
// chunk targets it.
CmdJump jump = CmdJump(alpha_cmd_ref.offset);
alpha_cmd_alloc = new_cmd.alloc;
// Place the cursor on the last Cmd-sized slot of the new chunk and store the
// jump there.
alpha_cmd_ref = CmdRef(alpha_cmd_alloc.offset + PTCL_INITIAL_ALLOC - Cmd_size);
Cmd_Jump_write(alpha_cmd_alloc, alpha_cmd_ref, jump);
// Lower bound for the cursor in this chunk: ANNO_COMMANDS command slots above
// the chunk start.
alpha_cmd_limit = alpha_cmd_alloc.offset + ANNO_COMMANDS * Cmd_size;
return true;
}

// Append the coverage command for one tile to the forward command list
// (cmd_alloc / cmd_ref), advancing cmd_ref.offset past what was written.
//
// - MODE_NONZERO with tile.tile.offset != 0: a Fill command carrying that
//   offset and the tile backdrop.
// - MODE_NONZERO with tile.tile.offset == 0: a Solid command.
// - any other fill mode: a Stroke command carrying half of stroke_width.
//
// This function does not check for space itself.
void write_fill(AnnotatedTag tag, Tile tile, float stroke_width) {
if (fill_mode_from_flags(tag.flags) == MODE_NONZERO) {
if (tile.tile.offset != 0) {
CmdFill cmd_fill = CmdFill(tile.tile.offset, tile.backdrop);
Cmd_Fill_write(cmd_alloc, cmd_ref, cmd_fill);
// 4 is presumably the size of the command tag word -- TODO confirm.
cmd_ref.offset += 4 + CmdFill_size;
} else {
Cmd_Solid_write(cmd_alloc, cmd_ref);
cmd_ref.offset += 4;
}
} else {
CmdStroke cmd_stroke = CmdStroke(tile.tile.offset, 0.5 * stroke_width);
Cmd_Stroke_write(cmd_alloc, cmd_ref, cmd_stroke);
cmd_ref.offset += 4 + CmdStroke_size;
}
}

// Reverse-list counterpart of the forward coverage writer: emits the coverage
// command for one tile into the alpha list (alpha_cmd_alloc / alpha_cmd_ref),
// which is filled downwards, so the cursor is decremented *before* each write.
//
// After the coverage command (Fill, Solid or Stroke, chosen by fill mode and
// tile.tile.offset) it also:
//   - writes a RestoreStencil command below it in the alpha list, and
//   - appends a SaveStencil command to the forward list (cmd_alloc / cmd_ref).
//
// NOTE(review): this function performs no space checks. The forward-list write
// at the end is not preceded by a forward-list space check here -- confirm
// callers guarantee room in both lists.
void write_fill_rev(AnnotatedTag tag, Tile tile, float stroke_width) {
if (fill_mode_from_flags(tag.flags) == MODE_NONZERO) {
if (tile.tile.offset != 0) {
CmdFill cmd_fill = CmdFill(tile.tile.offset, tile.backdrop);
alpha_cmd_ref.offset -= 4 + CmdFill_size;
Cmd_Fill_write(alpha_cmd_alloc, alpha_cmd_ref, cmd_fill);
} else {
alpha_cmd_ref.offset -= 4;
Cmd_Solid_write(alpha_cmd_alloc, alpha_cmd_ref);
}
} else {
CmdStroke cmd_stroke = CmdStroke(tile.tile.offset, 0.5 * stroke_width);
alpha_cmd_ref.offset -= 4 + CmdStroke_size;
Cmd_Stroke_write(alpha_cmd_alloc, alpha_cmd_ref, cmd_stroke);
}

// RestoreStencil sits at a lower address than the coverage command just
// written.
alpha_cmd_ref.offset -= 4;
Cmd_RestoreStencil_write(alpha_cmd_alloc, alpha_cmd_ref);

// Matching SaveStencil goes into the forward list.
Cmd_SaveStencil_write(cmd_alloc, cmd_ref);
cmd_ref.offset += 4;
}

void main() {
if (mem_error != NO_ERROR) {
return;
Expand All @@ -112,12 +174,18 @@ void main() {
uint tile_x = gl_LocalInvocationID.x % N_TILE_X;
uint tile_y = gl_LocalInvocationID.x / N_TILE_X;
uint this_tile_ix = (bin_tile_y + tile_y) * conf.width_in_tiles + bin_tile_x + tile_x;
Alloc cmd_alloc = slice_mem(conf.ptcl_alloc, this_tile_ix * PTCL_INITIAL_ALLOC, PTCL_INITIAL_ALLOC);
CmdRef cmd_ref = CmdRef(cmd_alloc.offset);
cmd_alloc = slice_mem(conf.ptcl_alloc, this_tile_ix * PTCL_INITIAL_ALLOC * 2, PTCL_INITIAL_ALLOC);
cmd_ref = CmdRef(cmd_alloc.offset);
// Reserve space for the maximum number of commands and a potential jump.
uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - (ANNO_COMMANDS + 1) * Cmd_size;
cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - (ANNO_COMMANDS + 1) * Cmd_size;
alpha_cmd_alloc = slice_mem(conf.ptcl_alloc, PTCL_INITIAL_ALLOC * (this_tile_ix * 2 + 1), PTCL_INITIAL_ALLOC);
alpha_cmd_ref = CmdRef(alpha_cmd_alloc.offset + PTCL_INITIAL_ALLOC - Cmd_size);
if (bin_tile_x + tile_x < conf.width_in_tiles && bin_tile_y + tile_y < conf.height_in_tiles) {
Cmd_End_write(alpha_cmd_alloc, alpha_cmd_ref);
}
alpha_cmd_limit = alpha_cmd_ref.offset + ANNO_COMMANDS * Cmd_size;
// The nesting depth of the clip stack
uint clip_depth = 0;
uint clip_depth = 1;
// State for the "clip zero" optimization. If it's nonzero, then we are
// currently in a clip for which the entire tile has an alpha of zero, and
// the value is the depth after the "begin clip" of that element.
Expand Down Expand Up @@ -277,7 +345,7 @@ void main() {
}
if (include_tile) {
uint el_slice = el_ix / 32;
uint el_mask = 1 << (el_ix & 31);
uint el_mask = 1 << (el_ix & 31u);
atomicOr(sh_bitmaps[el_slice][y * N_TILE_X + x], el_mask);
}
}
Expand Down Expand Up @@ -317,94 +385,66 @@ void main() {
Tile tile = Tile_read(read_tile_alloc(element_ref_ix), TileRef(sh_tile_base[element_ref_ix]
+ (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
AnnoColor fill = Annotated_Color_read(conf.anno_alloc, ref);
if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
break;
}
if (fill_mode_from_flags(tag.flags) == MODE_NONZERO) {
if (tile.tile.offset != 0) {
CmdFill cmd_fill = CmdFill(tile.tile.offset, tile.backdrop);
Cmd_Fill_write(cmd_alloc, cmd_ref, cmd_fill);
cmd_ref.offset += 4 + CmdFill_size;
} else {
Cmd_Solid_write(cmd_alloc, cmd_ref);
cmd_ref.offset += 4;
if (unpackUnorm4x8(fill.rgba_color).wzyx.a == 1.0) {
if (!alloc_cmd()) {
break;
}
write_fill(tag, tile, fill.linewidth);
Cmd_Color_write(cmd_alloc, cmd_ref, CmdColor(fill.rgba_color));
cmd_ref.offset += 4 + CmdColor_size;
if (tile.tile.offset == 0) {
// Tile is fully occluded. See include_tile logic above for the invariant.
clip_zero_depth = clip_depth;
}
} else {
CmdStroke cmd_stroke = CmdStroke(tile.tile.offset, 0.5 * fill.linewidth);
Cmd_Stroke_write(cmd_alloc, cmd_ref, cmd_stroke);
cmd_ref.offset += 4 + CmdStroke_size;
if (!alloc_cmd_rev()) {
break;
}
alpha_cmd_ref.offset -= 4 + CmdColor_size;
Cmd_Color_write(alpha_cmd_alloc, alpha_cmd_ref, CmdColor(fill.rgba_color));
write_fill_rev(tag, tile, fill.linewidth);
}
Cmd_Color_write(cmd_alloc, cmd_ref, CmdColor(fill.rgba_color));
cmd_ref.offset += 4 + CmdColor_size;
break;
case Annotated_Image:
tile = Tile_read(read_tile_alloc(element_ref_ix), TileRef(sh_tile_base[element_ref_ix]
+ (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
AnnoImage fill_img = Annotated_Image_read(conf.anno_alloc, ref);
if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
// For now, use back-to-front drawing for all images.
// Optimizations for opaque images can be done in the future.
if (!alloc_cmd_rev()) {
break;
}
if (fill_mode_from_flags(tag.flags) == MODE_NONZERO) {
if (tile.tile.offset != 0) {
CmdFill cmd_fill = CmdFill(tile.tile.offset, tile.backdrop);
Cmd_Fill_write(cmd_alloc, cmd_ref, cmd_fill);
cmd_ref.offset += 4 + CmdFill_size;
} else {
Cmd_Solid_write(cmd_alloc, cmd_ref);
cmd_ref.offset += 4;
}
} else {
CmdStroke cmd_stroke = CmdStroke(tile.tile.offset, 0.5 * fill_img.linewidth);
Cmd_Stroke_write(cmd_alloc, cmd_ref, cmd_stroke);
cmd_ref.offset += 4 + CmdStroke_size;
}
Cmd_Image_write(cmd_alloc, cmd_ref, CmdImage(fill_img.index, fill_img.offset));
cmd_ref.offset += 4 + CmdImage_size;
alpha_cmd_ref.offset -= 4 + CmdImage_size;
Cmd_Image_write(alpha_cmd_alloc, alpha_cmd_ref, CmdImage(fill_img.index, fill_img.offset));
write_fill_rev(tag, tile, fill_img.linewidth);
break;
case Annotated_BeginClip:
tile = Tile_read(read_tile_alloc(element_ref_ix), TileRef(sh_tile_base[element_ref_ix]
+ (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
if (tile.tile.offset == 0 && tile.backdrop == 0) {
clip_zero_depth = clip_depth + 1;
} else if (tile.tile.offset == 0 && clip_depth < 32) {
clip_one_mask |= (1 << clip_depth);
clip_one_mask |= (1u << clip_depth);
} else {
AnnoBeginClip begin_clip = Annotated_BeginClip_read(conf.anno_alloc, ref);
if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
if (!alloc_cmd()) {
break;
}
if (fill_mode_from_flags(tag.flags) == MODE_NONZERO) {
if (tile.tile.offset != 0) {
CmdFill cmd_fill = CmdFill(tile.tile.offset, tile.backdrop);
Cmd_Fill_write(cmd_alloc, cmd_ref, cmd_fill);
cmd_ref.offset += 4 + CmdFill_size;
} else {
// TODO: here is where a bunch of optimization magic should happen
float alpha = tile.backdrop == 0 ? 0.0 : 1.0;
Cmd_Alpha_write(cmd_alloc, cmd_ref, CmdAlpha(alpha));
cmd_ref.offset += 4 + CmdAlpha_size;
}
} else {
CmdStroke cmd_stroke = CmdStroke(tile.tile.offset, 0.5 * begin_clip.linewidth);
Cmd_Stroke_write(cmd_alloc, cmd_ref, cmd_stroke);
cmd_ref.offset += 4 + CmdStroke_size;
}
write_fill(tag, tile, begin_clip.linewidth);
Cmd_BeginClip_write(cmd_alloc, cmd_ref);
cmd_ref.offset += 4;
if (clip_depth < 32) {
clip_one_mask &= ~(1 << clip_depth);
clip_one_mask &= ~(1u << clip_depth);
}
}
clip_depth++;
break;
case Annotated_EndClip:
clip_depth--;
if (clip_depth >= 32 || (clip_one_mask & (1 << clip_depth)) == 0) {
if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
if (clip_depth >= 32 || (clip_one_mask & (1u << clip_depth)) == 0) {
if (!alloc_cmd()) {
break;
}
Cmd_Solid_write(cmd_alloc, cmd_ref);
cmd_ref.offset += 4;
Cmd_EndClip_write(cmd_alloc, cmd_ref);
cmd_ref.offset += 4;
}
Expand All @@ -431,6 +471,7 @@ void main() {
if (rd_ix >= ready_ix && partition_ix >= n_partitions) break;
}
if (bin_tile_x + tile_x < conf.width_in_tiles && bin_tile_y + tile_y < conf.height_in_tiles) {
Cmd_End_write(cmd_alloc, cmd_ref);
CmdJump jump = CmdJump(alpha_cmd_ref.offset);
Cmd_Jump_write(cmd_alloc, cmd_ref, jump);
}
}
Binary file modified piet-gpu/shader/coarse.spv
Binary file not shown.
Loading

0 comments on commit 2057f9c

Please sign in to comment.