binning.comp raw

   1  // SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
   2  
   3  // The binning stage of the pipeline.
   4  //
   5  // Each workgroup processes N_TILE paths.
   6  // Each thread processes one path and calculates a N_TILE_X x N_TILE_Y coverage mask
   7  // based on the path bounding box to bin the paths.
   8  
   9  #version 450
  10  #extension GL_GOOGLE_include_directive : enable
  11  
  12  #include "mem.h"
  13  #include "setup.h"
  14  
  15  layout(local_size_x = N_TILE, local_size_y = 1) in;
  16  
  17  layout(set = 0, binding = 1) readonly buffer ConfigBuf {
  18      Config conf;
  19  };
  20  
  21  #include "annotated.h"
  22  #include "bins.h"
  23  
  24  // scale factors useful for converting coordinates to bins
  25  #define SX (1.0 / float(N_TILE_X * TILE_WIDTH_PX))
  26  #define SY (1.0 / float(N_TILE_Y * TILE_HEIGHT_PX))
  27  
  28  // Constant not available in GLSL. Also consider uintBitsToFloat(0x7f800000)
  29  #define INFINITY (1.0 / 0.0)
  30  
  31  // Note: cudaraster has N_TILE + 1 to cut down on bank conflicts.
  32  // Bitmaps are sliced (256bit into 8 (N_SLICE) 32bit submaps)
  33  shared uint bitmaps[N_SLICE][N_TILE];
  34  shared uint count[N_SLICE][N_TILE];
  35  shared Alloc sh_chunk_alloc[N_TILE];
  36  shared bool sh_alloc_failed;
  37  
  38  void main() {
  39      uint my_n_elements = conf.n_elements;
  40      uint my_partition = gl_WorkGroupID.x;
  41  
  42      for (uint i = 0; i < N_SLICE; i++) {
  43          bitmaps[i][gl_LocalInvocationID.x] = 0;
  44      }
  45      if (gl_LocalInvocationID.x == 0) {
  46          sh_alloc_failed = false;
  47      }
  48      barrier();
  49  
  50      // Read inputs and determine coverage of bins
  51      uint element_ix = my_partition * N_TILE + gl_LocalInvocationID.x;
  52      AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
  53      uint tag = Annotated_Nop;
  54      if (element_ix < my_n_elements) {
  55          tag = Annotated_tag(conf.anno_alloc, ref).tag;
  56      }
  57      int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
  58      switch (tag) {
  59      case Annotated_Color:
  60      case Annotated_Image:
  61      case Annotated_BeginClip:
  62      case Annotated_EndClip:
  63          // Note: we take advantage of the fact that these drawing elements
  64          // have the bbox at the same place in their layout.
  65          AnnoEndClip clip = Annotated_EndClip_read(conf.anno_alloc, ref);
  66          x0 = int(floor(clip.bbox.x * SX));
  67          y0 = int(floor(clip.bbox.y * SY));
  68          x1 = int(ceil(clip.bbox.z * SX));
  69          y1 = int(ceil(clip.bbox.w * SY));
  70          break;
  71      }
  72  
  73      // At this point, we run an iterator over the coverage area,
  74      // trying to keep divergence low.
  75      // Right now, it's just a bbox, but we'll get finer with
  76      // segments.
  77      uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1)/N_TILE_X;
  78      uint height_in_bins = (conf.height_in_tiles + N_TILE_Y - 1)/N_TILE_Y;
  79      x0 = clamp(x0, 0, int(width_in_bins));
  80      x1 = clamp(x1, x0, int(width_in_bins));
  81      y0 = clamp(y0, 0, int(height_in_bins));
  82      y1 = clamp(y1, y0, int(height_in_bins));
  83      if (x0 == x1) y1 = y0;
  84      int x = x0, y = y0;
  85      uint my_slice = gl_LocalInvocationID.x / 32;
  86      uint my_mask = 1 << (gl_LocalInvocationID.x & 31);
  87      while (y < y1) {
  88          atomicOr(bitmaps[my_slice][y * width_in_bins + x], my_mask);
  89          x++;
  90          if (x == x1) {
  91              x = x0;
  92              y++;
  93          }
  94      }
  95  
  96      barrier();
  97      // Allocate output segments.
  98      uint element_count = 0;
  99      for (uint i = 0; i < N_SLICE; i++) {
 100          element_count += bitCount(bitmaps[i][gl_LocalInvocationID.x]);
 101          count[i][gl_LocalInvocationID.x] = element_count;
 102      }
 103      // element_count is number of elements covering bin for this invocation.
 104      Alloc chunk_alloc = new_alloc(0, 0, true);
 105      if (element_count != 0) {
 106          // TODO: aggregate atomic adds (subgroup is probably fastest)
 107          MallocResult chunk = malloc(element_count * BinInstance_size);
 108          chunk_alloc = chunk.alloc;
 109          sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc;
 110          if (chunk.failed) {
 111              sh_alloc_failed = true;
 112          }
 113      }
 114      // Note: it might be more efficient for reading to do this in the
 115      // other order (each bin is a contiguous sequence of partitions)
 116      uint out_ix = (conf.bin_alloc.offset >> 2) + (my_partition * N_TILE + gl_LocalInvocationID.x) * 2;
 117      write_mem(conf.bin_alloc, out_ix, element_count);
 118      write_mem(conf.bin_alloc, out_ix + 1, chunk_alloc.offset);
 119  
 120      barrier();
 121      if (sh_alloc_failed || mem_error != NO_ERROR) {
 122          return;
 123      }
 124  
 125      // Use similar strategy as Laine & Karras paper; loop over bbox of bins
 126      // touched by this element
 127      x = x0;
 128      y = y0;
 129      while (y < y1) {
 130          uint bin_ix = y * width_in_bins + x;
 131          uint out_mask = bitmaps[my_slice][bin_ix];
 132          if ((out_mask & my_mask) != 0) {
 133              uint idx = bitCount(out_mask & (my_mask - 1));
 134              if (my_slice > 0) {
 135                  idx += count[my_slice - 1][bin_ix];
 136              }
 137              Alloc out_alloc = sh_chunk_alloc[bin_ix];
 138              uint out_offset = out_alloc.offset + idx * BinInstance_size;
 139              BinInstance_write(out_alloc, BinInstanceRef(out_offset), BinInstance(element_ix));
 140          }
 141          x++;
 142          if (x == x1) {
 143              x = x0;
 144              y++;
 145          }
 146      }
 147  }
 148