Bitmask lookup table based supersampling

Many thanks to Venemo on #dri-devel for helping out with instruction-level parallelism. The tiles can now be divided in both the x and y directions, which allows for: - Less divergence on typical inputs - Faster texture stores through coalescing
linebender · Apr 4, 2021 · 2057f9c · 2057f9c
1 parent 678bfed
commit 2057f9c
Show file tree

Hide file tree

Showing 12 changed files with 417 additions and 233 deletions.
diff --git a/piet-gpu-hal/src/hub.rs b/piet-gpu-hal/src/hub.rs
@@ -11,6 +11,7 @@ use crate::vulkan;
 use crate::DescriptorSetBuilder as DescriptorSetBuilderTrait;
 use crate::PipelineBuilder as PipelineBuilderTrait;
 use crate::{Device, Error, SamplerParams};
+use crate::vulkan::Format;
 
 pub type MemFlags = <vulkan::VkDevice as Device>::MemFlags;
 pub type Semaphore = <vulkan::VkDevice as Device>::Semaphore;
@@ -152,9 +153,10 @@ impl Session {
         &self,
         width: u32,
         height: u32,
+        format: Format,
         mem_flags: MemFlags,
     ) -> Result<Image, Error> {
-        let image = self.0.device.create_image2d(width, height, mem_flags)?;
+        let image = self.0.device.create_image2d(width, height, format, mem_flags)?;
         Ok(Image(Arc::new(ImageInner {
             image,
             session: Arc::downgrade(&self.0),

diff --git a/piet-gpu-hal/src/lib.rs b/piet-gpu-hal/src/lib.rs
@@ -1,3 +1,5 @@
+use crate::vulkan::Format;
+
 /// The cross-platform abstraction for a GPU device.
 ///
 /// This abstraction is inspired by gfx-hal, but is specialized to the needs of piet-gpu.
@@ -57,6 +59,7 @@ pub trait Device: Sized {
         &self,
         width: u32,
         height: u32,
+        format: Format,
         mem_flags: Self::MemFlags,
     ) -> Result<Self::Image, Error>;
 

diff --git a/piet-gpu-hal/src/vulkan.rs b/piet-gpu-hal/src/vulkan.rs
@@ -10,6 +10,7 @@ use ash::{vk, Device, Entry, Instance};
 use once_cell::sync::Lazy;
 
 use crate::{Device as DeviceTrait, Error, ImageLayout, SamplerParams};
+pub use ash::vk::Format;
 
 pub struct VkInstance {
     /// Retain the dynamic lib.
@@ -460,6 +461,7 @@ impl crate::Device for VkDevice {
         &self,
         width: u32,
         height: u32,
+        format: Format,
         mem_flags: Self::MemFlags,
     ) -> Result<Self::Image, Error> {
         let device = &self.device.device;
@@ -476,7 +478,7 @@ impl crate::Device for VkDevice {
         let image = device.create_image(
             &vk::ImageCreateInfo::builder()
                 .image_type(vk::ImageType::TYPE_2D)
-                .format(vk::Format::R8G8B8A8_UNORM)
+                .format(format)
                 .extent(extent)
                 .mip_levels(1)
                 .array_layers(1)
@@ -505,7 +507,7 @@ impl crate::Device for VkDevice {
             &vk::ImageViewCreateInfo::builder()
                 .view_type(vk::ImageViewType::TYPE_2D)
                 .image(image)
-                .format(vk::Format::R8G8B8A8_UNORM)
+                .format(format)
                 .subresource_range(vk::ImageSubresourceRange {
                     aspect_mask: vk::ImageAspectFlags::COLOR,
                     base_mip_level: 0,

diff --git a/piet-gpu-types/src/ptcl.rs b/piet-gpu-types/src/ptcl.rs
@@ -21,9 +21,6 @@ piet_gpu! {
             index: u32,
             offset: [i16; 2],
         }
-        struct CmdAlpha {
-            alpha: f32,
-        }
         struct CmdJump {
             new_ref: u32,
         }
@@ -32,12 +29,13 @@ piet_gpu! {
             Fill(CmdFill),
             Stroke(CmdStroke),
             Solid,
-            Alpha(CmdAlpha),
             Color(CmdColor),
             Image(CmdImage),
             BeginClip,
             EndClip,
             Jump(CmdJump),
+            SaveStencil,
+            RestoreStencil,
         }
     }
 }
diff --git a/piet-gpu-types/src/scene.rs b/piet-gpu-types/src/scene.rs
@@ -38,7 +38,6 @@ piet_gpu! {
         }
         struct Clip {
             bbox: [f32; 4],
-            // TODO: add alpha?
         }
         struct SetFillMode {
             fill_mode: u32,

diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp
@@ -71,10 +71,16 @@ Alloc read_tile_alloc(uint el_ix) {
 #endif
 
 // The maximum number of commands per annotated element.
-#define ANNO_COMMANDS 2
+#define ANNO_COMMANDS 3
 
-// Perhaps cmd_alloc should be a global? This is a style question.
-bool alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit) {
+Alloc cmd_alloc; // Allocation that write_fill() and main() write the forward command list into.
+Alloc alpha_cmd_alloc; // Allocation that alloc_cmd_rev() and write_fill_rev() write the reverse (alpha) list into.
+CmdRef cmd_ref; // Write position in cmd_alloc; its offset is increased after each write.
+CmdRef alpha_cmd_ref; // Write position in alpha_cmd_alloc; its offset is decreased before each write.
+uint cmd_limit; // alloc_cmd() returns true right away while cmd_ref.offset is below this value.
+uint alpha_cmd_limit; // alloc_cmd_rev() returns true right away while alpha_cmd_ref.offset is at or above this value.
+
+bool alloc_cmd() {
     if (cmd_ref.offset < cmd_limit) {
         return true;
     }
@@ -91,6 +97,62 @@ bool alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit
     return true;
 }
 
+bool alloc_cmd_rev() { // Ensure the alpha command list has room before a reverse write; returns false only when malloc fails.
+    if (alpha_cmd_ref.offset >= alpha_cmd_limit) { // Write position has not dropped below the limit yet.
+        return true;
+    }
+    MallocResult new_cmd = malloc(PTCL_INITIAL_ALLOC); // Below the limit: request another PTCL_INITIAL_ALLOC-sized allocation.
+    if (new_cmd.failed) {
+        return false;
+    }
+    CmdJump jump = CmdJump(alpha_cmd_ref.offset); // Jump target is the current write position in the old allocation.
+    alpha_cmd_alloc = new_cmd.alloc;
+    alpha_cmd_ref = CmdRef(alpha_cmd_alloc.offset + PTCL_INITIAL_ALLOC - Cmd_size); // Last Cmd_size slot of the new allocation.
+    Cmd_Jump_write(alpha_cmd_alloc, alpha_cmd_ref, jump); // The new allocation ends with the jump into the old one.
+    alpha_cmd_limit = alpha_cmd_alloc.offset + ANNO_COMMANDS * Cmd_size; // Limit sits ANNO_COMMANDS * Cmd_size above the allocation start.
+    return true;
+}
+
+void write_fill(AnnotatedTag tag, Tile tile, float stroke_width) { // Write one Fill, Solid or Stroke command at cmd_ref and advance cmd_ref past it.
+    if (fill_mode_from_flags(tag.flags) == MODE_NONZERO) {
+        if (tile.tile.offset != 0) { // Nonzero tile offset: emit a Fill carrying that offset and the backdrop.
+            CmdFill cmd_fill = CmdFill(tile.tile.offset, tile.backdrop);
+            Cmd_Fill_write(cmd_alloc, cmd_ref, cmd_fill);
+            cmd_ref.offset += 4 + CmdFill_size; // The 4 is presumably the size of the command tag word -- TODO confirm.
+        } else { // Zero tile offset: emit Solid, which carries no payload.
+            Cmd_Solid_write(cmd_alloc, cmd_ref);
+            cmd_ref.offset += 4;
+        }
+    } else { // Any other fill mode is written as a Stroke with half of stroke_width.
+        CmdStroke cmd_stroke = CmdStroke(tile.tile.offset, 0.5 * stroke_width);
+        Cmd_Stroke_write(cmd_alloc, cmd_ref, cmd_stroke);
+        cmd_ref.offset += 4 + CmdStroke_size;
+    }
+}
+
+void write_fill_rev(AnnotatedTag tag, Tile tile, float stroke_width) { // Reverse-list variant of write_fill(): moves alpha_cmd_ref down first, then writes at the new position.
+    if (fill_mode_from_flags(tag.flags) == MODE_NONZERO) {
+        if (tile.tile.offset != 0) { // Nonzero tile offset: emit a Fill carrying that offset and the backdrop.
+            CmdFill cmd_fill = CmdFill(tile.tile.offset, tile.backdrop);
+            alpha_cmd_ref.offset -= 4 + CmdFill_size; // Step down by the command size before writing.
+            Cmd_Fill_write(alpha_cmd_alloc, alpha_cmd_ref, cmd_fill);
+        } else { // Zero tile offset: emit Solid, which carries no payload.
+            alpha_cmd_ref.offset -= 4;
+            Cmd_Solid_write(alpha_cmd_alloc, alpha_cmd_ref);
+        }
+    } else { // Any other fill mode is written as a Stroke with half of stroke_width.
+        CmdStroke cmd_stroke = CmdStroke(tile.tile.offset, 0.5 * stroke_width);
+        alpha_cmd_ref.offset -= 4 + CmdStroke_size;
+        Cmd_Stroke_write(alpha_cmd_alloc, alpha_cmd_ref, cmd_stroke);
+    }
+
+    alpha_cmd_ref.offset -= 4; // RestoreStencil lands at a lower offset than the coverage command just written.
+    Cmd_RestoreStencil_write(alpha_cmd_alloc, alpha_cmd_ref);
+
+    Cmd_SaveStencil_write(cmd_alloc, cmd_ref); // NOTE(review): written to the forward list; visible callers only call alloc_cmd_rev() first -- confirm cmd_ref has room.
+    cmd_ref.offset += 4;
+}
+
 void main() {
     if (mem_error != NO_ERROR) {
         return;
@@ -112,12 +174,18 @@ void main() {
     uint tile_x = gl_LocalInvocationID.x % N_TILE_X;
     uint tile_y = gl_LocalInvocationID.x / N_TILE_X;
     uint this_tile_ix = (bin_tile_y + tile_y) * conf.width_in_tiles + bin_tile_x + tile_x;
-    Alloc cmd_alloc = slice_mem(conf.ptcl_alloc, this_tile_ix * PTCL_INITIAL_ALLOC, PTCL_INITIAL_ALLOC);
-    CmdRef cmd_ref = CmdRef(cmd_alloc.offset);
+    cmd_alloc = slice_mem(conf.ptcl_alloc, this_tile_ix * PTCL_INITIAL_ALLOC * 2, PTCL_INITIAL_ALLOC);
+    cmd_ref = CmdRef(cmd_alloc.offset);
     // Reserve space for the maximum number of commands and a potential jump.
-    uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - (ANNO_COMMANDS + 1) * Cmd_size;
+    cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - (ANNO_COMMANDS + 1) * Cmd_size;
+    alpha_cmd_alloc = slice_mem(conf.ptcl_alloc, PTCL_INITIAL_ALLOC * (this_tile_ix * 2 + 1), PTCL_INITIAL_ALLOC);
+    alpha_cmd_ref = CmdRef(alpha_cmd_alloc.offset + PTCL_INITIAL_ALLOC - Cmd_size);
+    if (bin_tile_x + tile_x < conf.width_in_tiles && bin_tile_y + tile_y < conf.height_in_tiles) {
+        Cmd_End_write(alpha_cmd_alloc, alpha_cmd_ref);
+    }
+    alpha_cmd_limit = alpha_cmd_ref.offset + ANNO_COMMANDS * Cmd_size;
     // The nesting depth of the clip stack
-    uint clip_depth = 0;
+    uint clip_depth = 1;
     // State for the "clip zero" optimization. If it's nonzero, then we are
     // currently in a clip for which the entire tile has an alpha of zero, and
     // the value is the depth after the "begin clip" of that element.
@@ -277,7 +345,7 @@ void main() {
             }
             if (include_tile) {
                 uint el_slice = el_ix / 32;
-                uint el_mask = 1 << (el_ix & 31);
+                uint el_mask = 1 << (el_ix & 31u);
                 atomicOr(sh_bitmaps[el_slice][y * N_TILE_X + x], el_mask);
             }
         }
@@ -317,94 +385,66 @@ void main() {
                     Tile tile = Tile_read(read_tile_alloc(element_ref_ix), TileRef(sh_tile_base[element_ref_ix]
                         + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
                     AnnoColor fill = Annotated_Color_read(conf.anno_alloc, ref);
-                    if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
-                        break;
-                    }
-                    if (fill_mode_from_flags(tag.flags) == MODE_NONZERO) {
-                        if (tile.tile.offset != 0) {
-                            CmdFill cmd_fill = CmdFill(tile.tile.offset, tile.backdrop);
-                            Cmd_Fill_write(cmd_alloc, cmd_ref, cmd_fill);
-                            cmd_ref.offset += 4 + CmdFill_size;
-                        } else {
-                            Cmd_Solid_write(cmd_alloc, cmd_ref);
-                            cmd_ref.offset += 4;
+                    if (unpackUnorm4x8(fill.rgba_color).wzyx.a == 1.0) {
+                        if (!alloc_cmd()) {
+                            break;
+                        }
+                        write_fill(tag, tile, fill.linewidth);
+                        Cmd_Color_write(cmd_alloc, cmd_ref, CmdColor(fill.rgba_color));
+                        cmd_ref.offset += 4 + CmdColor_size;
+                        if (tile.tile.offset == 0) {
+                            // Tile is fully occluded. See include_tile logic above for the invariant.
+                            clip_zero_depth = clip_depth;
                         }
                     } else {
-                        CmdStroke cmd_stroke = CmdStroke(tile.tile.offset, 0.5 * fill.linewidth);
-                        Cmd_Stroke_write(cmd_alloc, cmd_ref, cmd_stroke);
-                        cmd_ref.offset += 4 + CmdStroke_size;
+                        if (!alloc_cmd_rev()) {
+                            break;
+                        }
+                        alpha_cmd_ref.offset -= 4 + CmdColor_size;
+                        Cmd_Color_write(alpha_cmd_alloc, alpha_cmd_ref, CmdColor(fill.rgba_color));
+                        write_fill_rev(tag, tile, fill.linewidth);
                     }
-                    Cmd_Color_write(cmd_alloc, cmd_ref, CmdColor(fill.rgba_color));
-                    cmd_ref.offset += 4 + CmdColor_size;
                     break;
                 case Annotated_Image:
                     tile = Tile_read(read_tile_alloc(element_ref_ix), TileRef(sh_tile_base[element_ref_ix]
                         + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
                     AnnoImage fill_img = Annotated_Image_read(conf.anno_alloc, ref);
-                    if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
+                    // For now, use back-to-front drawing for all images.
+                    // Optimizations for opaque images can be done in the future.
+                    if (!alloc_cmd_rev()) {
                         break;
                     }
-                    if (fill_mode_from_flags(tag.flags) == MODE_NONZERO) {
-                        if (tile.tile.offset != 0) {
-                            CmdFill cmd_fill = CmdFill(tile.tile.offset, tile.backdrop);
-                            Cmd_Fill_write(cmd_alloc, cmd_ref, cmd_fill);
-                            cmd_ref.offset += 4 + CmdFill_size;
-                        } else {
-                            Cmd_Solid_write(cmd_alloc, cmd_ref);
-                            cmd_ref.offset += 4;
-                        }
-                    } else {
-                        CmdStroke cmd_stroke = CmdStroke(tile.tile.offset, 0.5 * fill_img.linewidth);
-                        Cmd_Stroke_write(cmd_alloc, cmd_ref, cmd_stroke);
-                        cmd_ref.offset += 4 + CmdStroke_size;
-                    }
-                    Cmd_Image_write(cmd_alloc, cmd_ref, CmdImage(fill_img.index, fill_img.offset));
-                    cmd_ref.offset += 4 + CmdImage_size;
+                    alpha_cmd_ref.offset -= 4 + CmdImage_size;
+                    Cmd_Image_write(alpha_cmd_alloc, alpha_cmd_ref, CmdImage(fill_img.index, fill_img.offset));
+                    write_fill_rev(tag, tile, fill_img.linewidth);
                     break;
                 case Annotated_BeginClip:
                     tile = Tile_read(read_tile_alloc(element_ref_ix), TileRef(sh_tile_base[element_ref_ix]
                         + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
                     if (tile.tile.offset == 0 && tile.backdrop == 0) {
                         clip_zero_depth = clip_depth + 1;
                     } else if (tile.tile.offset == 0 && clip_depth < 32) {
-                        clip_one_mask |= (1 << clip_depth);
+                        clip_one_mask |= (1u << clip_depth);
                     } else {
                         AnnoBeginClip begin_clip = Annotated_BeginClip_read(conf.anno_alloc, ref);
-                        if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
+                        if (!alloc_cmd()) {
                             break;
                         }
-                        if (fill_mode_from_flags(tag.flags) == MODE_NONZERO) {
-                            if (tile.tile.offset != 0) {
-                                CmdFill cmd_fill = CmdFill(tile.tile.offset, tile.backdrop);
-                                Cmd_Fill_write(cmd_alloc, cmd_ref, cmd_fill);
-                                cmd_ref.offset += 4 + CmdFill_size;
-                            } else {
-                                // TODO: here is where a bunch of optimization magic should happen
-                                float alpha = tile.backdrop == 0 ? 0.0 : 1.0;
-                                Cmd_Alpha_write(cmd_alloc, cmd_ref, CmdAlpha(alpha));
-                                cmd_ref.offset += 4 + CmdAlpha_size;
-                            }
-                        } else {
-                            CmdStroke cmd_stroke = CmdStroke(tile.tile.offset, 0.5 * begin_clip.linewidth);
-                            Cmd_Stroke_write(cmd_alloc, cmd_ref, cmd_stroke);
-                            cmd_ref.offset += 4 + CmdStroke_size;
-                        }
+                        write_fill(tag, tile, begin_clip.linewidth);
                         Cmd_BeginClip_write(cmd_alloc, cmd_ref);
                         cmd_ref.offset += 4;
                         if (clip_depth < 32) {
-                            clip_one_mask &= ~(1 << clip_depth);
+                            clip_one_mask &= ~(1u << clip_depth);
                         }
                     }
                     clip_depth++;
                     break;
                 case Annotated_EndClip:
                     clip_depth--;
-                    if (clip_depth >= 32 || (clip_one_mask & (1 << clip_depth)) == 0) {
-                        if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
+                    if (clip_depth >= 32 || (clip_one_mask & (1u << clip_depth)) == 0) {
+                        if (!alloc_cmd()) {
                             break;
                         }
-                        Cmd_Solid_write(cmd_alloc, cmd_ref);
-                        cmd_ref.offset += 4;
                         Cmd_EndClip_write(cmd_alloc, cmd_ref);
                         cmd_ref.offset += 4;
                     }
@@ -431,6 +471,7 @@ void main() {
         if (rd_ix >= ready_ix && partition_ix >= n_partitions) break;
     }
     if (bin_tile_x + tile_x < conf.width_in_tiles && bin_tile_y + tile_y < conf.height_in_tiles) {
-        Cmd_End_write(cmd_alloc, cmd_ref);
+        CmdJump jump = CmdJump(alpha_cmd_ref.offset);
+        Cmd_Jump_write(cmd_alloc, cmd_ref, jump);
     }
 }
diff --git a/piet-gpu/shader/coarse.spv b/piet-gpu/shader/coarse.spv