elements.comp raw

   1  // SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
   2  
   3  // The element processing stage, first in the pipeline.
   4  //
   5  // This stage is primarily about applying transforms and computing bounding
   6  // boxes. It is organized as a scan over the input elements, producing
   7  // annotated output elements.
   8  
   9  #version 450
  10  #extension GL_GOOGLE_include_directive : enable
  11  
  12  #include "mem.h"
  13  #include "setup.h"
  14  
  15  #define N_ROWS 4
  16  #define WG_SIZE 32
  17  #define LG_WG_SIZE 5
  18  #define PARTITION_SIZE (WG_SIZE * N_ROWS)
  19  
  20  layout(local_size_x = WG_SIZE, local_size_y = 1) in;
  21  
  22  layout(set = 0, binding = 1) readonly buffer ConfigBuf {
  23      Config conf;
  24  };
  25  
  26  layout(set = 0, binding = 2) readonly buffer SceneBuf {
  27      uint[] scene;
  28  };
  29  
  30  // It would be better to use the Vulkan memory model than
  31  // "volatile" but shooting for compatibility here rather
  32  // than doing things right.
  33  layout(set = 0, binding = 3) volatile buffer StateBuf {
  34      uint part_counter;
  35      uint[] state;
  36  };
  37  
  38  #include "scene.h"
  39  #include "state.h"
  40  #include "annotated.h"
  41  #include "pathseg.h"
  42  #include "tile.h"
  43  
  44  #define StateBuf_stride (4 + 2 * State_size)
  45  
  46  StateRef state_aggregate_ref(uint partition_ix) {
  47      return StateRef(4 + partition_ix * StateBuf_stride);
  48  }
  49  
  50  StateRef state_prefix_ref(uint partition_ix) {
  51      return StateRef(4 + partition_ix * StateBuf_stride + State_size);
  52  }
  53  
  54  uint state_flag_index(uint partition_ix) {
  55      return partition_ix * (StateBuf_stride / 4);
  56  }
  57  
  58  // These correspond to X, A, P respectively in the prefix sum paper.
  59  #define FLAG_NOT_READY 0
  60  #define FLAG_AGGREGATE_READY 1
  61  #define FLAG_PREFIX_READY 2
  62  
  63  #define FLAG_SET_LINEWIDTH 1
  64  #define FLAG_SET_BBOX 2
  65  #define FLAG_RESET_BBOX 4
  66  #define FLAG_SET_FILL_MODE 8
  67  // Fill modes take up the next bit. Non-zero fill is 0, stroke is 1.
  68  #define LG_FILL_MODE 4
  69  #define FILL_MODE_BITS 1
  70  #define FILL_MODE_MASK (FILL_MODE_BITS << LG_FILL_MODE)
  71  
  72  // This is almost like a monoid (the interaction between transformation and
  73  // bounding boxes is approximate)
  74  State combine_state(State a, State b) {
  75      State c;
  76      c.bbox.x = min(a.mat.x * b.bbox.x, a.mat.x * b.bbox.z) + min(a.mat.z * b.bbox.y, a.mat.z * b.bbox.w) + a.translate.x;
  77      c.bbox.y = min(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z) + min(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y;
  78      c.bbox.z = max(a.mat.x * b.bbox.x, a.mat.x * b.bbox.z) + max(a.mat.z * b.bbox.y, a.mat.z * b.bbox.w) + a.translate.x;
  79      c.bbox.w = max(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z) + max(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y;
  80      if ((a.flags & FLAG_RESET_BBOX) == 0 && b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y) {
  81          c.bbox = a.bbox;
  82      } else if ((a.flags & FLAG_RESET_BBOX) == 0 && (b.flags & FLAG_SET_BBOX) == 0 &&
  83          (a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y))
  84      {
  85          c.bbox.xy = min(a.bbox.xy, c.bbox.xy);
  86          c.bbox.zw = max(a.bbox.zw, c.bbox.zw);
  87      }
  88      // It would be more concise to cast to matrix types; ah well.
  89      c.mat.x = a.mat.x * b.mat.x + a.mat.z * b.mat.y;
  90      c.mat.y = a.mat.y * b.mat.x + a.mat.w * b.mat.y;
  91      c.mat.z = a.mat.x * b.mat.z + a.mat.z * b.mat.w;
  92      c.mat.w = a.mat.y * b.mat.z + a.mat.w * b.mat.w;
  93      c.translate.x = a.mat.x * b.translate.x + a.mat.z * b.translate.y + a.translate.x;
  94      c.translate.y = a.mat.y * b.translate.x + a.mat.w * b.translate.y + a.translate.y;
  95      c.linewidth = (b.flags & FLAG_SET_LINEWIDTH) == 0 ? a.linewidth : b.linewidth;
  96      c.flags = (a.flags & (FLAG_SET_LINEWIDTH | FLAG_SET_BBOX | FLAG_SET_FILL_MODE)) | b.flags;
  97      c.flags |= (a.flags & FLAG_RESET_BBOX) >> 1;
  98      uint fill_mode = (b.flags & FLAG_SET_FILL_MODE) == 0 ? a.flags : b.flags;
  99      fill_mode &= FILL_MODE_MASK;
 100      c.flags = (c.flags & ~FILL_MODE_MASK) | fill_mode;
 101      c.path_count = a.path_count + b.path_count;
 102      c.pathseg_count = a.pathseg_count + b.pathseg_count;
 103      c.trans_count = a.trans_count + b.trans_count;
 104      return c;
 105  }
 106  
 107  State map_element(ElementRef ref) {
 108      // TODO: it would *probably* be more efficient to make the memory read patterns less
 109      // divergent, though it would be more wasted memory.
 110      uint tag = Element_tag(ref).tag;
 111      State c;
 112      c.bbox = vec4(0.0, 0.0, 0.0, 0.0);
 113      c.mat = vec4(1.0, 0.0, 0.0, 1.0);
 114      c.translate = vec2(0.0, 0.0);
 115      c.linewidth = 1.0; // TODO should be 0.0
 116      c.flags = 0;
 117      c.path_count = 0;
 118      c.pathseg_count = 0;
 119      c.trans_count = 0;
 120      switch (tag) {
 121      case Element_Line:
 122          LineSeg line = Element_Line_read(ref);
 123          c.bbox.xy = min(line.p0, line.p1);
 124          c.bbox.zw = max(line.p0, line.p1);
 125          c.pathseg_count = 1;
 126          break;
 127      case Element_Quad:
 128          QuadSeg quad = Element_Quad_read(ref);
 129          c.bbox.xy = min(min(quad.p0, quad.p1), quad.p2);
 130          c.bbox.zw = max(max(quad.p0, quad.p1), quad.p2);
 131          c.pathseg_count = 1;
 132          break;
 133      case Element_Cubic:
 134          CubicSeg cubic = Element_Cubic_read(ref);
 135          c.bbox.xy = min(min(cubic.p0, cubic.p1), min(cubic.p2, cubic.p3));
 136          c.bbox.zw = max(max(cubic.p0, cubic.p1), max(cubic.p2, cubic.p3));
 137          c.pathseg_count = 1;
 138          break;
 139      case Element_FillColor:
 140      case Element_FillImage:
 141      case Element_BeginClip:
 142          c.flags = FLAG_RESET_BBOX;
 143          c.path_count = 1;
 144          break;
 145      case Element_EndClip:
 146          c.path_count = 1;
 147          break;
 148      case Element_SetLineWidth:
 149          SetLineWidth lw = Element_SetLineWidth_read(ref);
 150          c.linewidth = lw.width;
 151          c.flags = FLAG_SET_LINEWIDTH;
 152          break;
 153      case Element_Transform:
 154          Transform t = Element_Transform_read(ref);
 155          c.mat = t.mat;
 156          c.translate = t.translate;
 157          c.trans_count = 1;
 158          break;
 159      case Element_SetFillMode:
 160          SetFillMode fm = Element_SetFillMode_read(ref);
 161          c.flags = FLAG_SET_FILL_MODE | (fm.fill_mode << LG_FILL_MODE);
 162          break;
 163      }
 164      return c;
 165  }
 166  
 167  // Get the bounding box of a circle transformed by the matrix into an ellipse.
 168  vec2 get_linewidth(State st) {
 169      // See https://www.iquilezles.org/www/articles/ellipses/ellipses.htm
 170      return 0.5 * st.linewidth * vec2(length(st.mat.xz), length(st.mat.yw));
 171  }
 172  
 173  shared State sh_state[WG_SIZE];
 174  
 175  shared uint sh_part_ix;
 176  shared State sh_prefix;
 177  
 178  void main() {
 179      State th_state[N_ROWS];
 180      // Determine partition to process by atomic counter (described in Section
 181      // 4.4 of prefix sum paper).
 182      if (gl_LocalInvocationID.x == 0) {
 183          sh_part_ix = atomicAdd(part_counter, 1);
 184      }
 185      barrier();
 186      uint part_ix = sh_part_ix;
 187  
 188      uint ix = part_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS;
 189      ElementRef ref = ElementRef(ix * Element_size);
 190  
 191      th_state[0] = map_element(ref);
 192      for (uint i = 1; i < N_ROWS; i++) {
 193          // discussion question: would it be faster to load using more coherent patterns
 194          // into thread memory? This is kinda strided.
 195          th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i)));
 196      }
 197      State agg = th_state[N_ROWS - 1];
 198      sh_state[gl_LocalInvocationID.x] = agg;
 199      for (uint i = 0; i < LG_WG_SIZE; i++) {
 200          barrier();
 201          if (gl_LocalInvocationID.x >= (1 << i)) {
 202              State other = sh_state[gl_LocalInvocationID.x - (1 << i)];
 203              agg = combine_state(other, agg);
 204          }
 205          barrier();
 206          sh_state[gl_LocalInvocationID.x] = agg;
 207      }
 208  
 209      State exclusive;
 210      exclusive.bbox = vec4(0.0, 0.0, 0.0, 0.0);
 211      exclusive.mat = vec4(1.0, 0.0, 0.0, 1.0);
 212      exclusive.translate = vec2(0.0, 0.0);
 213      exclusive.linewidth = 1.0; //TODO should be 0.0
 214      exclusive.flags = 0;
 215      exclusive.path_count = 0;
 216      exclusive.pathseg_count = 0;
 217      exclusive.trans_count = 0;
 218  
 219      // Publish aggregate for this partition
 220      if (gl_LocalInvocationID.x == WG_SIZE - 1) {
 221          // Note: with memory model, we'd want to generate the atomic store version of this.
 222          State_write(state_aggregate_ref(part_ix), agg);
 223          uint flag = FLAG_AGGREGATE_READY;
 224          memoryBarrierBuffer();
 225          if (part_ix == 0) {
 226              State_write(state_prefix_ref(part_ix), agg);
 227              flag = FLAG_PREFIX_READY;
 228          }
 229          state[state_flag_index(part_ix)] = flag;
 230          if (part_ix != 0) {
 231              // step 4 of paper: decoupled lookback
 232              uint look_back_ix = part_ix - 1;
 233  
 234              State their_agg;
 235              uint their_ix = 0;
 236              while (true) {
 237                  flag = state[state_flag_index(look_back_ix)];
 238                  if (flag == FLAG_PREFIX_READY) {
 239                      State their_prefix = State_read(state_prefix_ref(look_back_ix));
 240                      exclusive = combine_state(their_prefix, exclusive);
 241                      break;
 242                  } else if (flag == FLAG_AGGREGATE_READY) {
 243                      their_agg = State_read(state_aggregate_ref(look_back_ix));
 244                      exclusive = combine_state(their_agg, exclusive);
 245                      look_back_ix--;
 246                      their_ix = 0;
 247                      continue;
 248                  }
 249                  // else spin
 250  
 251                  // Unfortunately there's no guarantee of forward progress of other
 252                  // workgroups, so compute a bit of the aggregate before trying again.
 253                  // In the worst case, spinning stops when the aggregate is complete.
 254                  ElementRef ref = ElementRef((look_back_ix * PARTITION_SIZE + their_ix) * Element_size);
 255                  State s = map_element(ref);
 256                  if (their_ix == 0) {
 257                      their_agg = s;
 258                  } else {
 259                      their_agg = combine_state(their_agg, s);
 260                  }
 261                  their_ix++;
 262                  if (their_ix == PARTITION_SIZE) {
 263                      exclusive = combine_state(their_agg, exclusive);
 264                      if (look_back_ix == 0) {
 265                          break;
 266                      }
 267                      look_back_ix--;
 268                      their_ix = 0;
 269                  }
 270              }
 271  
 272              // step 5 of paper: compute inclusive prefix
 273              State inclusive_prefix = combine_state(exclusive, agg);
 274              sh_prefix = exclusive;
 275              State_write(state_prefix_ref(part_ix), inclusive_prefix);
 276              memoryBarrierBuffer();
 277              flag = FLAG_PREFIX_READY;
 278              state[state_flag_index(part_ix)] = flag;
 279          }
 280      }
 281      barrier();
 282      if (part_ix != 0) {
 283          exclusive = sh_prefix;
 284      }
 285  
 286      State row = exclusive;
 287      if (gl_LocalInvocationID.x > 0) {
 288          State other = sh_state[gl_LocalInvocationID.x - 1];
 289          row = combine_state(row, other);
 290      }
 291      for (uint i = 0; i < N_ROWS; i++) {
 292          State st = combine_state(row, th_state[i]);
 293  
 294          // Here we read again from the original scene. There may be
 295          // gains to be had from stashing in shared memory or possibly
 296          // registers (though register pressure is an issue).
 297          ElementRef this_ref = Element_index(ref, i);
 298          ElementTag tag = Element_tag(this_ref);
 299          uint fill_mode = fill_mode_from_flags(st.flags >> LG_FILL_MODE);
 300          bool is_stroke = fill_mode == MODE_STROKE;
 301          switch (tag.tag) {
 302          case Element_Line:
 303              LineSeg line = Element_Line_read(this_ref);
 304              PathCubic path_cubic;
 305              path_cubic.p0 = line.p0;
 306              path_cubic.p1 = mix(line.p0, line.p1, 1.0 / 3.0);
 307              path_cubic.p2 = mix(line.p1, line.p0, 1.0 / 3.0);
 308              path_cubic.p3 = line.p1;
 309              path_cubic.path_ix = st.path_count;
 310              path_cubic.trans_ix = st.trans_count;
 311              if (is_stroke) {
 312                  path_cubic.stroke = get_linewidth(st);
 313              } else {
 314                  path_cubic.stroke = vec2(0.0);
 315              }
 316              PathSegRef path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size);
 317              PathSeg_Cubic_write(conf.pathseg_alloc, path_out_ref, fill_mode, path_cubic);
 318              break;
 319          case Element_Quad:
 320              QuadSeg quad = Element_Quad_read(this_ref);
 321              path_cubic.p0 = quad.p0;
 322              path_cubic.p1 = mix(quad.p1, quad.p0, 1.0 / 3.0);
 323              path_cubic.p2 = mix(quad.p1, quad.p2, 1.0 / 3.0);
 324              path_cubic.p3 = quad.p2;
 325              path_cubic.path_ix = st.path_count;
 326              path_cubic.trans_ix = st.trans_count;
 327              if (is_stroke) {
 328                  path_cubic.stroke = get_linewidth(st);
 329              } else {
 330                  path_cubic.stroke = vec2(0.0);
 331              }
 332              path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size);
 333              PathSeg_Cubic_write(conf.pathseg_alloc, path_out_ref, fill_mode, path_cubic);
 334              break;
 335          case Element_Cubic:
 336              CubicSeg cubic = Element_Cubic_read(this_ref);
 337              path_cubic.p0 = cubic.p0;
 338              path_cubic.p1 = cubic.p1;
 339              path_cubic.p2 = cubic.p2;
 340              path_cubic.p3 = cubic.p3;
 341              path_cubic.path_ix = st.path_count;
 342              path_cubic.trans_ix = st.trans_count;
 343              if (is_stroke) {
 344                  path_cubic.stroke = get_linewidth(st);
 345              } else {
 346                  path_cubic.stroke = vec2(0.0);
 347              }
 348              path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size);
 349              PathSeg_Cubic_write(conf.pathseg_alloc, path_out_ref, fill_mode, path_cubic);
 350              break;
 351          case Element_FillColor:
 352              FillColor fill = Element_FillColor_read(this_ref);
 353              AnnoColor anno_fill;
 354              anno_fill.rgba_color = fill.rgba_color;
 355              if (is_stroke) {
 356                  vec2 lw = get_linewidth(st);
 357                  anno_fill.bbox = st.bbox + vec4(-lw, lw);
 358                  anno_fill.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z));
 359              } else {
 360                  anno_fill.bbox = st.bbox;
 361                  anno_fill.linewidth = 0.0;
 362              }
 363              AnnotatedRef out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
 364              Annotated_Color_write(conf.anno_alloc, out_ref, fill_mode, anno_fill);
 365              break;
 366          case Element_FillImage:
 367              FillImage fill_img = Element_FillImage_read(this_ref);
 368              AnnoImage anno_img;
 369              anno_img.index = fill_img.index;
 370              anno_img.offset = fill_img.offset;
 371              if (is_stroke) {
 372                  vec2 lw = get_linewidth(st);
 373                  anno_img.bbox = st.bbox + vec4(-lw, lw);
 374                  anno_img.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z));
 375              } else {
 376                  anno_img.bbox = st.bbox;
 377                  anno_img.linewidth = 0.0;
 378              }
 379              out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
 380              Annotated_Image_write(conf.anno_alloc, out_ref, fill_mode, anno_img);
 381              break;
 382          case Element_BeginClip:
 383              Clip begin_clip = Element_BeginClip_read(this_ref);
 384              AnnoBeginClip anno_begin_clip;
 385              // This is the absolute bbox, it's been transformed during encoding.
 386              anno_begin_clip.bbox = begin_clip.bbox;
 387              if (is_stroke) {
 388                  vec2 lw = get_linewidth(st);
 389                  anno_begin_clip.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z));
 390              } else {
 391                  anno_fill.linewidth = 0.0;
 392              }
 393              out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
 394              Annotated_BeginClip_write(conf.anno_alloc, out_ref, fill_mode, anno_begin_clip);
 395              break;
 396          case Element_EndClip:
 397              Clip end_clip = Element_EndClip_read(this_ref);
 398              // This bbox is expected to be the same as the begin one.
 399              AnnoEndClip anno_end_clip = AnnoEndClip(end_clip.bbox);
 400              out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
 401              Annotated_EndClip_write(conf.anno_alloc, out_ref, anno_end_clip);
 402              break;
 403          case Element_Transform:
 404              TransformSeg transform = TransformSeg(st.mat, st.translate);
 405              TransformSegRef trans_ref = TransformSegRef(conf.trans_alloc.offset + (st.trans_count - 1) * TransformSeg_size);
 406              TransformSeg_write(conf.trans_alloc, trans_ref, transform);
 407              break;
 408          }
 409      }
 410  }
 411