binning.comp raw
1 // SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
2
3 // The binning stage of the pipeline.
4 //
5 // Each workgroup processes N_TILE paths.
6 // Each thread processes one path and calculates a N_TILE_X x N_TILE_Y coverage mask
7 // based on the path bounding box to bin the paths.
8
9 #version 450
10 #extension GL_GOOGLE_include_directive : enable
11
12 #include "mem.h"
13 #include "setup.h"
14
15 layout(local_size_x = N_TILE, local_size_y = 1) in;
16
17 layout(set = 0, binding = 1) readonly buffer ConfigBuf {
18 Config conf;
19 };
20
21 #include "annotated.h"
22 #include "bins.h"
23
24 // scale factors useful for converting coordinates to bins
25 #define SX (1.0 / float(N_TILE_X * TILE_WIDTH_PX))
26 #define SY (1.0 / float(N_TILE_Y * TILE_HEIGHT_PX))
27
28 // Constant not available in GLSL. Also consider uintBitsToFloat(0x7f800000)
29 #define INFINITY (1.0 / 0.0)
30
31 // Note: cudaraster has N_TILE + 1 to cut down on bank conflicts.
32 // Bitmaps are sliced (256bit into 8 (N_SLICE) 32bit submaps)
33 shared uint bitmaps[N_SLICE][N_TILE];
34 shared uint count[N_SLICE][N_TILE];
35 shared Alloc sh_chunk_alloc[N_TILE];
36 shared bool sh_alloc_failed;
37
38 void main() {
39 uint my_n_elements = conf.n_elements;
40 uint my_partition = gl_WorkGroupID.x;
41
42 for (uint i = 0; i < N_SLICE; i++) {
43 bitmaps[i][gl_LocalInvocationID.x] = 0;
44 }
45 if (gl_LocalInvocationID.x == 0) {
46 sh_alloc_failed = false;
47 }
48 barrier();
49
50 // Read inputs and determine coverage of bins
51 uint element_ix = my_partition * N_TILE + gl_LocalInvocationID.x;
52 AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
53 uint tag = Annotated_Nop;
54 if (element_ix < my_n_elements) {
55 tag = Annotated_tag(conf.anno_alloc, ref).tag;
56 }
57 int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
58 switch (tag) {
59 case Annotated_Color:
60 case Annotated_Image:
61 case Annotated_BeginClip:
62 case Annotated_EndClip:
63 // Note: we take advantage of the fact that these drawing elements
64 // have the bbox at the same place in their layout.
65 AnnoEndClip clip = Annotated_EndClip_read(conf.anno_alloc, ref);
66 x0 = int(floor(clip.bbox.x * SX));
67 y0 = int(floor(clip.bbox.y * SY));
68 x1 = int(ceil(clip.bbox.z * SX));
69 y1 = int(ceil(clip.bbox.w * SY));
70 break;
71 }
72
73 // At this point, we run an iterator over the coverage area,
74 // trying to keep divergence low.
75 // Right now, it's just a bbox, but we'll get finer with
76 // segments.
77 uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1)/N_TILE_X;
78 uint height_in_bins = (conf.height_in_tiles + N_TILE_Y - 1)/N_TILE_Y;
79 x0 = clamp(x0, 0, int(width_in_bins));
80 x1 = clamp(x1, x0, int(width_in_bins));
81 y0 = clamp(y0, 0, int(height_in_bins));
82 y1 = clamp(y1, y0, int(height_in_bins));
83 if (x0 == x1) y1 = y0;
84 int x = x0, y = y0;
85 uint my_slice = gl_LocalInvocationID.x / 32;
86 uint my_mask = 1 << (gl_LocalInvocationID.x & 31);
87 while (y < y1) {
88 atomicOr(bitmaps[my_slice][y * width_in_bins + x], my_mask);
89 x++;
90 if (x == x1) {
91 x = x0;
92 y++;
93 }
94 }
95
96 barrier();
97 // Allocate output segments.
98 uint element_count = 0;
99 for (uint i = 0; i < N_SLICE; i++) {
100 element_count += bitCount(bitmaps[i][gl_LocalInvocationID.x]);
101 count[i][gl_LocalInvocationID.x] = element_count;
102 }
103 // element_count is number of elements covering bin for this invocation.
104 Alloc chunk_alloc = new_alloc(0, 0, true);
105 if (element_count != 0) {
106 // TODO: aggregate atomic adds (subgroup is probably fastest)
107 MallocResult chunk = malloc(element_count * BinInstance_size);
108 chunk_alloc = chunk.alloc;
109 sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc;
110 if (chunk.failed) {
111 sh_alloc_failed = true;
112 }
113 }
114 // Note: it might be more efficient for reading to do this in the
115 // other order (each bin is a contiguous sequence of partitions)
116 uint out_ix = (conf.bin_alloc.offset >> 2) + (my_partition * N_TILE + gl_LocalInvocationID.x) * 2;
117 write_mem(conf.bin_alloc, out_ix, element_count);
118 write_mem(conf.bin_alloc, out_ix + 1, chunk_alloc.offset);
119
120 barrier();
121 if (sh_alloc_failed || mem_error != NO_ERROR) {
122 return;
123 }
124
125 // Use similar strategy as Laine & Karras paper; loop over bbox of bins
126 // touched by this element
127 x = x0;
128 y = y0;
129 while (y < y1) {
130 uint bin_ix = y * width_in_bins + x;
131 uint out_mask = bitmaps[my_slice][bin_ix];
132 if ((out_mask & my_mask) != 0) {
133 uint idx = bitCount(out_mask & (my_mask - 1));
134 if (my_slice > 0) {
135 idx += count[my_slice - 1][bin_ix];
136 }
137 Alloc out_alloc = sh_chunk_alloc[bin_ix];
138 uint out_offset = out_alloc.offset + idx * BinInstance_size;
139 BinInstance_write(out_alloc, BinInstanceRef(out_offset), BinInstance(element_ix));
140 }
141 x++;
142 if (x == x1) {
143 x = x0;
144 y++;
145 }
146 }
147 }
148