compute.go raw

   1  // SPDX-License-Identifier: Unlicense OR MIT
   2  
   3  package gpu
   4  
   5  import (
   6  	"encoding/binary"
   7  	"errors"
   8  	"fmt"
   9  	"image"
  10  	"image/color"
  11  	"math/bits"
  12  	"time"
  13  	"unsafe"
  14  
  15  	"github.com/p9c/p9/pkg/gel/gio/f32"
  16  	"github.com/p9c/p9/pkg/gel/gio/gpu/internal/driver"
  17  	"github.com/p9c/p9/pkg/gel/gio/internal/byteslice"
  18  	"github.com/p9c/p9/pkg/gel/gio/internal/f32color"
  19  	"github.com/p9c/p9/pkg/gel/gio/internal/ops"
  20  	"github.com/p9c/p9/pkg/gel/gio/internal/scene"
  21  	"github.com/p9c/p9/pkg/gel/gio/layout"
  22  	"github.com/p9c/p9/pkg/gel/gio/op"
  23  )
  24  
// compute is the compute shader based renderer. It owns the device, the
// scene encoder, the compute programs and their buffers, the output texture
// and the image and material atlases.
type compute struct {
	// ctx is the device all GPU resources below are created on; enc holds
	// the encoded scene of the current frame.
	ctx driver.Device
	enc encoder

	// drawOps holds the operations gathered by Collect. texOps lists the
	// image operations of the current frame that need atlas space.
	// maxTextureDim is the upper bound for the side length of an atlas.
	drawOps       drawOps
	texOps        []textureOp
	cache         *resourceCache
	maxTextureDim int

	// programs holds the compute programs, listed in the order render
	// dispatches them.
	programs struct {
		elements   driver.Program
		tileAlloc  driver.Program
		pathCoarse driver.Program
		backdrop   driver.Program
		binning    driver.Program
		coarse     driver.Program
		kernel4    driver.Program
	}
	// buffers holds the storage buffers bound to the compute programs:
	// config receives the config struct, scene the encoded scene, state is
	// zeroed before every dispatch sequence, and memory starts with a
	// memoryHeader followed by the allocations described by config.
	buffers struct {
		config driver.Buffer
		scene  sizedBuffer
		state  sizedBuffer
		memory sizedBuffer
	}
	// output is the texture kernel4 writes to, the size it was created
	// with, and the program used to copy it to the default framebuffer.
	output struct {
		size image.Point
		// image is the output texture. Note that it is in RGBA format,
		// but contains data in sRGB. See blitOutput for more detail.
		image    driver.Texture
		blitProg driver.Program
	}
	// images contains ImageOp images packed into a texture atlas.
	images struct {
		packer packer
		// positions maps imageOpData.handles to positions inside tex.
		positions map[interface{}]image.Point
		tex       driver.Texture
	}
	// materials contains the pre-processed materials (transformed images for
	// now, gradients etc. later) packed in a texture atlas. The atlas is used
	// as source in kernel4.
	materials struct {
		// offsets maps texture ops to the offsets to put in their FillImage commands.
		offsets map[textureKey]image.Point

		prog   driver.Program
		layout driver.InputLayout

		packer packer

		// tex is the atlas texture and fbo the framebuffer rendering into
		// it. quads accumulates the vertices to draw in the current frame.
		tex   driver.Texture
		fbo   driver.Framebuffer
		quads []materialVertex

		// bufSize is the size in bytes of buffer, the vertex buffer that
		// quads is uploaded to.
		bufSize int
		buffer  driver.Buffer
	}
	// timers holds one timer per profiled stage and the most recent
	// formatted profile string. The timers are created lazily by Collect
	// when profiling is requested and the device supports timers.
	timers struct {
		profile         string
		t               *timers
		elements        *timer
		tileAlloc       *timer
		pathCoarse      *timer
		backdropBinning *timer
		coarse          *timer
		kernel4         *timer
	}

	// The following fields hold scratch space to avoid garbage.
	zeroSlice []byte
	memHeader *memoryHeader
	conf      *config
}
  98  
// materialVertex describes a vertex of a quad used to render a transformed
// material. posX and posY are the position of the vertex and u, v the
// texture coordinates sampled from the image atlas. The struct is uploaded
// as raw bytes to the vertex buffer, so its layout must match the input
// layout declared in newCompute (two float pairs).
type materialVertex struct {
	posX, posY float32
	u, v       float32
}
 105  
// textureKey identifies textureOp. It combines the handle of the source
// image with the transform applied to it, and is used as the key of the
// material offsets map.
type textureKey struct {
	handle    interface{}
	transform f32.Affine2D
}
 111  
// textureOp represents an imageOp that requires texture space.
type textureOp struct {
	// sceneIdx is the index in the scene that contains the fill image command
	// that corresponds to the operation.
	sceneIdx int
	// key identifies the transformed material; img is the source image data.
	key textureKey
	img imageOpData

	// pos is the position of the untransformed image in the images texture.
	// It is filled in by uploadImages.
	pos image.Point
}
 123  
// encoder accumulates scene commands together with the counts of paths,
// path segments and transforms that the commands contain. The counts are
// used by render to size the GPU allocations.
type encoder struct {
	scene    []scene.Command
	npath    int
	npathseg int
	ntrans   int
}
 130  
// encodeState pairs a transform with a clip rectangle.
// NOTE(review): not referenced in the visible part of this file; confirm
// whether it is still in use.
type encodeState struct {
	trans f32.Affine2D
	clip  f32.Rectangle
}
 135  
// sizedBuffer is a driver buffer together with the size in bytes it was
// created with. The zero value holds no buffer.
type sizedBuffer struct {
	size   int
	buffer driver.Buffer
}
 140  
// config matches Config in setup.h. It is uploaded to the config buffer as
// raw bytes, so the order and size of its fields must not change
// independently of the shader definition.
type config struct {
	n_elements      uint32 // paths
	n_pathseg       uint32
	width_in_tiles  uint32
	height_in_tiles uint32
	tile_alloc      memAlloc
	bin_alloc       memAlloc
	ptcl_alloc      memAlloc
	pathseg_alloc   memAlloc
	anno_alloc      memAlloc
	trans_alloc     memAlloc
}
 154  
// memAlloc matches Alloc in mem.h. Only the byte offset of the allocation
// is stored; the size field is commented out.
type memAlloc struct {
	offset uint32
	//size   uint32
}
 160  
// memoryHeader matches the header of Memory in mem.h. render uploads it
// with mem_offset set to the number of statically allocated bytes, and
// downloads it after the dispatches to read mem_error.
type memoryHeader struct {
	mem_offset uint32
	mem_error  uint32
}
 166  
// GPU structure sizes and constants.
const (
	// Tile dimensions in pixels, the initial per-tile size used for the
	// ptcl allocation, and the image units kernel4's output texture and
	// material atlas are bound to.
	tileWidthPx       = 32
	tileHeightPx      = 32
	ptclInitialAlloc  = 1024
	kernel4OutputUnit = 2
	kernel4AtlasUnit  = 3

	// Per-element sizes used by render to size the static allocations and
	// the state buffer.
	pathSize    = 12
	binSize     = 8
	pathsegSize = 52
	annoSize    = 32
	transSize   = 24
	stateSize   = 60
	stateStride = 4 + 2*stateSize
)
 183  
// mem.h constants. They are the values render expects to find in
// memoryHeader.mem_error after the compute programs have run.
const (
	memNoError      = 0 // NO_ERROR
	memMallocFailed = 1 // ERR_MALLOC_FAILED
)
 189  
 190  func newCompute(ctx driver.Device) (*compute, error) {
 191  	maxDim := ctx.Caps().MaxTextureSize
 192  	// Large atlas textures cause artifacts due to precision loss in
 193  	// shaders.
 194  	if cap := 8192; maxDim > cap {
 195  		maxDim = cap
 196  	}
 197  	g := &compute{
 198  		ctx:           ctx,
 199  		cache:         newResourceCache(),
 200  		maxTextureDim: maxDim,
 201  		conf:          new(config),
 202  		memHeader:     new(memoryHeader),
 203  	}
 204  
 205  	blitProg, err := ctx.NewProgram(shader_copy_vert, shader_copy_frag)
 206  	if err != nil {
 207  		g.Release()
 208  		return nil, err
 209  	}
 210  	g.output.blitProg = blitProg
 211  
 212  	materialProg, err := ctx.NewProgram(shader_material_vert, shader_material_frag)
 213  	if err != nil {
 214  		g.Release()
 215  		return nil, err
 216  	}
 217  	g.materials.prog = materialProg
 218  	progLayout, err := ctx.NewInputLayout(shader_material_vert, []driver.InputDesc{
 219  		{Type: driver.DataTypeFloat, Size: 2, Offset: 0},
 220  		{Type: driver.DataTypeFloat, Size: 2, Offset: 4 * 2},
 221  	})
 222  	if err != nil {
 223  		g.Release()
 224  		return nil, err
 225  	}
 226  	g.materials.layout = progLayout
 227  
 228  	g.drawOps.pathCache = newOpCache()
 229  	g.drawOps.compute = true
 230  
 231  	buf, err := ctx.NewBuffer(driver.BufferBindingShaderStorage, int(unsafe.Sizeof(config{})))
 232  	if err != nil {
 233  		g.Release()
 234  		return nil, err
 235  	}
 236  	g.buffers.config = buf
 237  
 238  	shaders := []struct {
 239  		prog *driver.Program
 240  		src  driver.ShaderSources
 241  	}{
 242  		{&g.programs.elements, shader_elements_comp},
 243  		{&g.programs.tileAlloc, shader_tile_alloc_comp},
 244  		{&g.programs.pathCoarse, shader_path_coarse_comp},
 245  		{&g.programs.backdrop, shader_backdrop_comp},
 246  		{&g.programs.binning, shader_binning_comp},
 247  		{&g.programs.coarse, shader_coarse_comp},
 248  		{&g.programs.kernel4, shader_kernel4_comp},
 249  	}
 250  	for _, shader := range shaders {
 251  		p, err := ctx.NewComputeProgram(shader.src)
 252  		if err != nil {
 253  			g.Release()
 254  			return nil, err
 255  		}
 256  		*shader.prog = p
 257  	}
 258  	return g, nil
 259  }
 260  
 261  func (g *compute) Collect(viewport image.Point, ops *op.Ops) {
 262  	g.drawOps.reset(g.cache, viewport)
 263  	g.drawOps.collect(g.ctx, g.cache, ops, viewport)
 264  	for _, img := range g.drawOps.allImageOps {
 265  		expandPathOp(img.path, img.clip)
 266  	}
 267  	if g.drawOps.profile && g.timers.t == nil && g.ctx.Caps().Features.Has(driver.FeatureTimers) {
 268  		t := &g.timers
 269  		t.t = newTimers(g.ctx)
 270  		t.elements = g.timers.t.newTimer()
 271  		t.tileAlloc = g.timers.t.newTimer()
 272  		t.pathCoarse = g.timers.t.newTimer()
 273  		t.backdropBinning = g.timers.t.newTimer()
 274  		t.coarse = g.timers.t.newTimer()
 275  		t.kernel4 = g.timers.t.newTimer()
 276  	}
 277  }
 278  
 279  func (g *compute) Clear(col color.NRGBA) {
 280  	g.drawOps.clear = true
 281  	g.drawOps.clearColor = f32color.LinearFromSRGB(col)
 282  }
 283  
 284  func (g *compute) Frame() error {
 285  	viewport := g.drawOps.viewport
 286  	tileDims := image.Point{
 287  		X: (viewport.X + tileWidthPx - 1) / tileWidthPx,
 288  		Y: (viewport.Y + tileHeightPx - 1) / tileHeightPx,
 289  	}
 290  
 291  	defFBO := g.ctx.BeginFrame()
 292  	defer g.ctx.EndFrame()
 293  
 294  	if err := g.encode(viewport); err != nil {
 295  		return err
 296  	}
 297  	if err := g.uploadImages(); err != nil {
 298  		return err
 299  	}
 300  	if err := g.renderMaterials(); err != nil {
 301  		return err
 302  	}
 303  	if err := g.render(tileDims); err != nil {
 304  		return err
 305  	}
 306  	g.ctx.BindFramebuffer(defFBO)
 307  	g.blitOutput(viewport)
 308  	g.cache.frame()
 309  	g.drawOps.pathCache.frame()
 310  	t := &g.timers
 311  	if g.drawOps.profile && t.t.ready() {
 312  		et, tat, pct, bbt := t.elements.Elapsed, t.tileAlloc.Elapsed, t.pathCoarse.Elapsed, t.backdropBinning.Elapsed
 313  		ct, k4t := t.coarse.Elapsed, t.kernel4.Elapsed
 314  		ft := et + tat + pct + bbt + ct + k4t
 315  		q := 100 * time.Microsecond
 316  		ft = ft.Round(q)
 317  		et, tat, pct, bbt = et.Round(q), tat.Round(q), pct.Round(q), bbt.Round(q)
 318  		ct, k4t = ct.Round(q), k4t.Round(q)
 319  		t.profile = fmt.Sprintf("ft:%7s et:%7s tat:%7s pct:%7s bbt:%7s ct:%7s k4t:%7s", ft, et, tat, pct, bbt, ct, k4t)
 320  	}
 321  	g.drawOps.clear = false
 322  	return nil
 323  }
 324  
 325  func (g *compute) Profile() string {
 326  	return g.timers.profile
 327  }
 328  
 329  // blitOutput copies the compute render output to the output FBO. We need to
 330  // copy because compute shaders can only write to textures, not FBOs. Compute
 331  // shader can only write to RGBA textures, but since we actually render in sRGB
 332  // format we can't use glBlitFramebuffer, because it does sRGB conversion.
 333  func (g *compute) blitOutput(viewport image.Point) {
 334  	if !g.drawOps.clear {
 335  		g.ctx.BlendFunc(driver.BlendFactorOne, driver.BlendFactorOneMinusSrcAlpha)
 336  		g.ctx.SetBlend(true)
 337  		defer g.ctx.SetBlend(false)
 338  	}
 339  	g.ctx.Viewport(0, 0, viewport.X, viewport.Y)
 340  	g.ctx.BindTexture(0, g.output.image)
 341  	g.ctx.BindProgram(g.output.blitProg)
 342  	g.ctx.DrawArrays(driver.DrawModeTriangleStrip, 0, 4)
 343  }
 344  
 345  func (g *compute) encode(viewport image.Point) error {
 346  	g.texOps = g.texOps[:0]
 347  	g.enc.reset()
 348  
 349  	// Flip Y-axis.
 350  	flipY := f32.Affine2D{}.Scale(f32.Pt(0, 0), f32.Pt(1, -1)).Offset(f32.Pt(0, float32(viewport.Y)))
 351  	g.enc.transform(flipY)
 352  	if g.drawOps.clear {
 353  		g.enc.rect(f32.Rectangle{Max: layout.FPt(viewport)})
 354  		g.enc.fillColor(f32color.NRGBAToRGBA(g.drawOps.clearColor.SRGB()))
 355  	}
 356  	return g.encodeOps(flipY, viewport, g.drawOps.allImageOps)
 357  }
 358  
// renderMaterials assigns every texture operation of the frame a place in
// the material atlas, records the resulting offset in the operation's fill
// image command, and draws the transformed images that are not yet in the
// atlas.
//
// When the atlas runs out of space, all existing placements are dropped and
// packing restarts from the first operation; if that is still not enough,
// the atlas is grown in steps of 256 pixels up to g.maxTextureDim, beyond
// which an error is returned.
func (g *compute) renderMaterials() error {
	m := &g.materials
	m.quads = m.quads[:0]
	// resize reports that the atlas dimension changed and the texture must
	// be recreated; reclaimed reports that existing placements were dropped.
	resize := false
	reclaimed := false
restart:
	for {
		for _, op := range g.texOps {
			// Materials already in the atlas only need their offset written.
			if off, exists := m.offsets[op.key]; exists {
				g.enc.setFillImageOffset(op.sceneIdx, off)
				continue
			}
			quad, bounds := g.materialQuad(op.key.transform, op.img, op.pos)

			// A material is clipped to avoid drawing outside its bounds inside the atlas. However,
			// imprecision in the clipping may cause a single pixel overflow. Be safe.
			size := bounds.Size().Add(image.Pt(1, 1))
			place, fits := m.packer.tryAdd(size)
			if !fits {
				// Drop every placement, including those made earlier in
				// this pass, and start over.
				m.offsets = nil
				m.quads = m.quads[:0]
				m.packer.clear()
				if !reclaimed {
					// Some images may no longer be in use, try again
					// after clearing existing maps.
					reclaimed = true
				} else {
					m.packer.maxDim += 256
					resize = true
					if m.packer.maxDim > g.maxTextureDim {
						return errors.New("compute: no space left in material atlas")
					}
				}
				m.packer.newPage()
				continue restart
			}
			// Position quad to match place.
			offset := place.Pos.Sub(bounds.Min)
			offsetf := layout.FPt(offset)
			for i := range quad {
				quad[i].posX += offsetf.X
				quad[i].posY += offsetf.Y
			}
			// Draw quad as two triangles.
			m.quads = append(m.quads, quad[0], quad[1], quad[3], quad[3], quad[1], quad[2])
			if m.offsets == nil {
				m.offsets = make(map[textureKey]image.Point)
			}
			m.offsets[op.key] = offset
			g.enc.setFillImageOffset(op.sceneIdx, offset)
		}
		break
	}
	// Nothing new to draw.
	if len(m.quads) == 0 {
		return nil
	}
	texSize := m.packer.maxDim
	if resize {
		// Recreate the atlas texture and its framebuffer at the new size.
		if m.fbo != nil {
			m.fbo.Release()
			m.fbo = nil
		}
		if m.tex != nil {
			m.tex.Release()
			m.tex = nil
		}
		handle, err := g.ctx.NewTexture(driver.TextureFormatRGBA8, texSize, texSize,
			driver.FilterNearest, driver.FilterNearest,
			driver.BufferBindingShaderStorage|driver.BufferBindingFramebuffer)
		if err != nil {
			return fmt.Errorf("compute: failed to create material atlas: %v", err)
		}
		m.tex = handle
		fbo, err := g.ctx.NewFramebuffer(handle, 0)
		if err != nil {
			return fmt.Errorf("compute: failed to create material framebuffer: %v", err)
		}
		m.fbo = fbo
	}
	// TODO: move to shaders.
	// Transform to clip space: [-1, -1] - [1, 1].
	clip := f32.Affine2D{}.Scale(f32.Pt(0, 0), f32.Pt(2/float32(texSize), 2/float32(texSize))).Offset(f32.Pt(-1, -1))
	for i, v := range m.quads {
		p := clip.Transform(f32.Pt(v.posX, v.posY))
		m.quads[i].posX = p.X
		m.quads[i].posY = p.Y
	}
	vertexData := byteslice.Slice(m.quads)
	// Grow the vertex buffer to the next power of two when it is too small.
	if len(vertexData) > m.bufSize {
		if m.buffer != nil {
			m.buffer.Release()
			m.buffer = nil
		}
		n := pow2Ceil(len(vertexData))
		buf, err := g.ctx.NewBuffer(driver.BufferBindingVertices, n)
		if err != nil {
			return err
		}
		m.bufSize = n
		m.buffer = buf
	}
	m.buffer.Upload(vertexData)
	// Draw from the image atlas into the material atlas.
	g.ctx.BindTexture(0, g.images.tex)
	g.ctx.BindFramebuffer(m.fbo)
	g.ctx.Viewport(0, 0, texSize, texSize)
	// Placements were dropped, so the previous atlas content is stale.
	if reclaimed {
		g.ctx.Clear(0, 0, 0, 0)
	}
	g.ctx.BindProgram(m.prog)
	g.ctx.BindVertexBuffer(m.buffer, int(unsafe.Sizeof(m.quads[0])), 0)
	g.ctx.BindInputLayout(m.layout)
	g.ctx.DrawArrays(driver.DrawModeTriangles, 0, len(m.quads))
	return nil
}
 473  
// uploadImages assigns every texture operation of the frame a position in
// the image atlas, stores that position in the operation, and uploads the
// images that are not yet in the atlas.
//
// When the atlas runs out of space, all existing placements are dropped and
// packing restarts from the first operation; if that is still not enough,
// the atlas is grown in steps of 256 pixels up to g.maxTextureDim, beyond
// which an error is returned.
func (g *compute) uploadImages() error {
	// padding is the number of pixels added to the right and below
	// images, to avoid atlas filtering artifacts.
	const padding = 1

	a := &g.images
	// uploads collects the images placed during this call, keyed by handle.
	var uploads map[interface{}]*image.RGBA
	// resize reports that the atlas dimension changed and the texture must
	// be recreated; reclaimed reports that existing placements were dropped.
	resize := false
	reclaimed := false
restart:
	for {
		for i, op := range g.texOps {
			// Images already in the atlas only need their position recorded.
			if pos, exists := a.positions[op.img.handle]; exists {
				g.texOps[i].pos = pos
				continue
			}
			size := op.img.src.Bounds().Size().Add(image.Pt(padding, padding))
			place, fits := a.packer.tryAdd(size)
			if !fits {
				// Drop every placement, including those made earlier in
				// this pass, and start over.
				a.positions = nil
				uploads = nil
				a.packer.clear()
				if !reclaimed {
					// Some images may no longer be in use, try again
					// after clearing existing maps.
					reclaimed = true
				} else {
					a.packer.maxDim += 256
					resize = true
					if a.packer.maxDim > g.maxTextureDim {
						return errors.New("compute: no space left in image atlas")
					}
				}
				a.packer.newPage()
				continue restart
			}
			if a.positions == nil {
				a.positions = make(map[interface{}]image.Point)
			}
			a.positions[op.img.handle] = place.Pos
			g.texOps[i].pos = place.Pos
			if uploads == nil {
				uploads = make(map[interface{}]*image.RGBA)
			}
			uploads[op.img.handle] = op.img.src
		}
		break
	}
	// Nothing new to upload.
	if len(uploads) == 0 {
		return nil
	}
	if resize {
		// Recreate the atlas texture at the new size.
		if a.tex != nil {
			a.tex.Release()
			a.tex = nil
		}
		sz := a.packer.maxDim
		handle, err := g.ctx.NewTexture(driver.TextureFormatSRGB, sz, sz, driver.FilterLinear, driver.FilterLinear, driver.BufferBindingTexture)
		if err != nil {
			return fmt.Errorf("compute: failed to create image atlas: %v", err)
		}
		a.tex = handle
	}
	for h, img := range uploads {
		pos, ok := a.positions[h]
		if !ok {
			panic("compute: internal error: image not placed")
		}
		size := img.Bounds().Size()
		driver.UploadImage(a.tex, pos, img)
		// Zero the padding strips to the right of and below the image.
		rightPadding := image.Pt(padding, size.Y)
		a.tex.Upload(image.Pt(pos.X+size.X, pos.Y), rightPadding, g.zeros(rightPadding.X*rightPadding.Y*4))
		bottomPadding := image.Pt(size.X, padding)
		a.tex.Upload(image.Pt(pos.X, pos.Y+size.Y), bottomPadding, g.zeros(bottomPadding.X*bottomPadding.Y*4))
	}
	return nil
}
 551  
 552  func pow2Ceil(v int) int {
 553  	exp := bits.Len(uint(v))
 554  	if bits.OnesCount(uint(v)) == 1 {
 555  		exp--
 556  	}
 557  	return 1 << exp
 558  }
 559  
 560  // materialQuad constructs a quad that represents the transformed image. It returns the quad
 561  // and its bounds.
 562  func (g *compute) materialQuad(M f32.Affine2D, img imageOpData, uvPos image.Point) ([4]materialVertex, image.Rectangle) {
 563  	imgSize := layout.FPt(img.src.Bounds().Size())
 564  	sx, hx, ox, hy, sy, oy := M.Elems()
 565  	transOff := f32.Pt(ox, oy)
 566  	// The 4 corners of the image rectangle transformed by M, excluding its offset, are:
 567  	//
 568  	// q0: M * (0, 0)   q3: M * (w, 0)
 569  	// q1: M * (0, h)   q2: M * (w, h)
 570  	//
 571  	// Note that q0 = M*0 = 0, q2 = q1 + q3.
 572  	q0 := f32.Pt(0, 0)
 573  	q1 := f32.Pt(hx*imgSize.Y, sy*imgSize.Y)
 574  	q3 := f32.Pt(sx*imgSize.X, hy*imgSize.X)
 575  	q2 := q1.Add(q3)
 576  	q0 = q0.Add(transOff)
 577  	q1 = q1.Add(transOff)
 578  	q2 = q2.Add(transOff)
 579  	q3 = q3.Add(transOff)
 580  
 581  	boundsf := f32.Rectangle{
 582  		Min: min(min(q0, q1), min(q2, q3)),
 583  		Max: max(max(q0, q1), max(q2, q3)),
 584  	}
 585  
 586  	bounds := boundRectF(boundsf)
 587  	uvPosf := layout.FPt(uvPos)
 588  	atlasScale := 1 / float32(g.images.packer.maxDim)
 589  	uvBounds := f32.Rectangle{
 590  		Min: uvPosf.Mul(atlasScale),
 591  		Max: uvPosf.Add(imgSize).Mul(atlasScale),
 592  	}
 593  	quad := [4]materialVertex{
 594  		{posX: q0.X, posY: q0.Y, u: uvBounds.Min.X, v: uvBounds.Min.Y},
 595  		{posX: q1.X, posY: q1.Y, u: uvBounds.Min.X, v: uvBounds.Max.Y},
 596  		{posX: q2.X, posY: q2.Y, u: uvBounds.Max.X, v: uvBounds.Max.Y},
 597  		{posX: q3.X, posY: q3.Y, u: uvBounds.Max.X, v: uvBounds.Min.Y},
 598  	}
 599  	return quad, bounds
 600  }
 601  
 602  func max(p1, p2 f32.Point) f32.Point {
 603  	p := p1
 604  	if p2.X > p.X {
 605  		p.X = p2.X
 606  	}
 607  	if p2.Y > p.Y {
 608  		p.Y = p2.Y
 609  	}
 610  	return p
 611  }
 612  
 613  func min(p1, p2 f32.Point) f32.Point {
 614  	p := p1
 615  	if p2.X < p.X {
 616  		p.X = p2.X
 617  	}
 618  	if p2.Y < p.Y {
 619  		p.Y = p2.Y
 620  	}
 621  	return p
 622  }
 623  
 624  func (g *compute) encodeOps(trans f32.Affine2D, viewport image.Point, ops []imageOp) error {
 625  	for _, op := range ops {
 626  		bounds := layout.FRect(op.clip)
 627  		// clip is the union of all drawing affected by the clipping
 628  		// operation. TODO: tighten.
 629  		clip := f32.Rect(0, 0, float32(viewport.X), float32(viewport.Y))
 630  		nclips := g.encodeClipStack(clip, bounds, op.path, false)
 631  		m := op.material
 632  		switch m.material {
 633  		case materialTexture:
 634  			t := trans.Mul(m.trans)
 635  			g.texOps = append(g.texOps, textureOp{
 636  				sceneIdx: len(g.enc.scene),
 637  				img:      m.data,
 638  				key: textureKey{
 639  					transform: t,
 640  					handle:    m.data.handle,
 641  				},
 642  			})
 643  			// Add fill command, its offset is resolved and filled in renderMaterials.
 644  			g.enc.fillImage(0)
 645  		case materialColor:
 646  			g.enc.fillColor(f32color.NRGBAToRGBA(op.material.color.SRGB()))
 647  		case materialLinearGradient:
 648  			// TODO: implement.
 649  			g.enc.fillColor(f32color.NRGBAToRGBA(op.material.color1.SRGB()))
 650  		default:
 651  			panic("not implemented")
 652  		}
 653  		if op.path != nil && op.path.path {
 654  			g.enc.fillMode(scene.FillModeNonzero)
 655  			g.enc.transform(op.path.trans.Invert())
 656  		}
 657  		// Pop the clip stack.
 658  		for i := 0; i < nclips; i++ {
 659  			g.enc.endClip(clip)
 660  		}
 661  	}
 662  	return nil
 663  }
 664  
 665  // encodeClips encodes a stack of clip paths and return the stack depth.
 666  func (g *compute) encodeClipStack(clip, bounds f32.Rectangle, p *pathOp, begin bool) int {
 667  	nclips := 0
 668  	if p != nil && p.parent != nil {
 669  		nclips += g.encodeClipStack(clip, bounds, p.parent, true)
 670  		nclips += 1
 671  	}
 672  	isStroke := p.stroke.Width > 0
 673  	if p != nil && p.path {
 674  		if isStroke {
 675  			g.enc.fillMode(scene.FillModeStroke)
 676  			g.enc.lineWidth(p.stroke.Width)
 677  		}
 678  		pathData, _ := g.drawOps.pathCache.get(p.pathKey)
 679  		g.enc.transform(p.trans)
 680  		g.enc.append(pathData.computePath)
 681  	} else {
 682  		g.enc.rect(bounds)
 683  	}
 684  	if begin {
 685  		g.enc.beginClip(clip)
 686  		if isStroke {
 687  			g.enc.fillMode(scene.FillModeNonzero)
 688  		}
 689  		if p != nil && p.path {
 690  			g.enc.transform(p.trans.Invert())
 691  		}
 692  	}
 693  	return nclips
 694  }
 695  
 696  func encodePath(verts []byte) encoder {
 697  	var enc encoder
 698  	for len(verts) >= scene.CommandSize+4 {
 699  		cmd := ops.DecodeCommand(verts[4:])
 700  		enc.scene = append(enc.scene, cmd)
 701  		enc.npathseg++
 702  		verts = verts[scene.CommandSize+4:]
 703  	}
 704  	return enc
 705  }
 706  
// render uploads the encoded scene and the configuration and runs the
// compute programs that rasterize it into the output texture. tileDims is
// the size of the output in tiles.
//
// The memory buffer has room for the static allocations described by the
// configuration plus space for allocations made by the shaders. If the
// shaders report an allocation failure, the memory buffer is grown and the
// whole dispatch sequence is run again.
func (g *compute) render(tileDims image.Point) error {
	const (
		// wgSize is the largest and most common workgroup size.
		wgSize = 128
		// PARTITION_SIZE from elements.comp
		partitionSize = 32 * 4
	)
	// A bin covers 16x8 tiles; the number of bins must not exceed wgSize.
	widthInBins := (tileDims.X + 15) / 16
	heightInBins := (tileDims.Y + 7) / 8
	if widthInBins*heightInBins > wgSize {
		return fmt.Errorf("gpu: output too large (%dx%d)", tileDims.X*tileWidthPx, tileDims.Y*tileHeightPx)
	}

	// Pad scene with zeroes to avoid reading garbage in elements.comp.
	scenePadding := partitionSize - len(g.enc.scene)%partitionSize
	g.enc.scene = append(g.enc.scene, make([]scene.Command, scenePadding)...)

	// realloced records that a buffer was recreated, which requires the
	// storage buffers to be bound again.
	realloced := false
	scene := byteslice.Slice(g.enc.scene)
	if s := len(scene); s > g.buffers.scene.size {
		realloced = true
		// Over-allocate by 10% to reduce the number of reallocations.
		paddedCap := s * 11 / 10
		if err := g.buffers.scene.ensureCapacity(g.ctx, paddedCap); err != nil {
			return err
		}
	}
	g.buffers.scene.buffer.Upload(scene)

	// The output texture covers whole tiles.
	w, h := tileDims.X*tileWidthPx, tileDims.Y*tileHeightPx
	if g.output.size.X != w || g.output.size.Y != h {
		if err := g.resizeOutput(image.Pt(w, h)); err != nil {
			return err
		}
	}
	g.ctx.BindImageTexture(kernel4OutputUnit, g.output.image, driver.AccessWrite, driver.TextureFormatRGBA8)
	if t := g.materials.tex; t != nil {
		g.ctx.BindImageTexture(kernel4AtlasUnit, t, driver.AccessRead, driver.TextureFormatRGBA8)
	}

	// alloc is the number of allocated bytes for static buffers.
	var alloc uint32
	// round rounds v up to a multiple of quantum, which must be a power of
	// two.
	round := func(v, quantum int) int {
		return (v + quantum - 1) &^ (quantum - 1)
	}
	// malloc reserves size bytes, rounded up to a multiple of 4, and
	// returns the allocation's offset.
	malloc := func(size int) memAlloc {
		size = round(size, 4)
		offset := alloc
		alloc += uint32(size)
		return memAlloc{offset /*, uint32(size)*/}
	}

	*g.conf = config{
		n_elements:      uint32(g.enc.npath),
		n_pathseg:       uint32(g.enc.npathseg),
		width_in_tiles:  uint32(tileDims.X),
		height_in_tiles: uint32(tileDims.Y),
		tile_alloc:      malloc(g.enc.npath * pathSize),
		bin_alloc:       malloc(round(g.enc.npath, wgSize) * binSize),
		ptcl_alloc:      malloc(tileDims.X * tileDims.Y * ptclInitialAlloc),
		pathseg_alloc:   malloc(g.enc.npathseg * pathsegSize),
		anno_alloc:      malloc(g.enc.npath * annoSize),
		trans_alloc:     malloc(g.enc.ntrans * transSize),
	}

	numPartitions := (g.enc.numElements() + 127) / 128
	// clearSize is the atomic partition counter plus flag and 2 states per partition.
	clearSize := 4 + numPartitions*stateStride
	if clearSize > g.buffers.state.size {
		realloced = true
		paddedCap := clearSize * 11 / 10
		if err := g.buffers.state.ensureCapacity(g.ctx, paddedCap); err != nil {
			return err
		}
	}

	g.buffers.config.Upload(byteslice.Struct(g.conf))

	// The memory buffer must hold its header and all static allocations.
	minSize := int(unsafe.Sizeof(memoryHeader{})) + int(alloc)
	if minSize > g.buffers.memory.size {
		realloced = true
		// Add space for dynamic GPU allocations.
		const sizeBump = 4 * 1024 * 1024
		minSize += sizeBump
		if err := g.buffers.memory.ensureCapacity(g.ctx, minSize); err != nil {
			return err
		}
	}
	for {
		// Reset the memory header and the state buffer before every attempt.
		*g.memHeader = memoryHeader{
			mem_offset: alloc,
		}
		g.buffers.memory.buffer.Upload(byteslice.Struct(g.memHeader))
		g.buffers.state.buffer.Upload(g.zeros(clearSize))

		if realloced {
			realloced = false
			g.bindBuffers()
		}
		t := &g.timers
		g.ctx.MemoryBarrier()
		t.elements.begin()
		g.ctx.BindProgram(g.programs.elements)
		g.ctx.DispatchCompute(numPartitions, 1, 1)
		g.ctx.MemoryBarrier()
		t.elements.end()
		t.tileAlloc.begin()
		g.ctx.BindProgram(g.programs.tileAlloc)
		g.ctx.DispatchCompute((g.enc.npath+wgSize-1)/wgSize, 1, 1)
		g.ctx.MemoryBarrier()
		t.tileAlloc.end()
		t.pathCoarse.begin()
		g.ctx.BindProgram(g.programs.pathCoarse)
		g.ctx.DispatchCompute((g.enc.npathseg+31)/32, 1, 1)
		g.ctx.MemoryBarrier()
		t.pathCoarse.end()
		t.backdropBinning.begin()
		g.ctx.BindProgram(g.programs.backdrop)
		g.ctx.DispatchCompute((g.enc.npath+wgSize-1)/wgSize, 1, 1)
		// No barrier needed between backdrop and binning.
		g.ctx.BindProgram(g.programs.binning)
		g.ctx.DispatchCompute((g.enc.npath+wgSize-1)/wgSize, 1, 1)
		g.ctx.MemoryBarrier()
		t.backdropBinning.end()
		t.coarse.begin()
		g.ctx.BindProgram(g.programs.coarse)
		g.ctx.DispatchCompute(widthInBins, heightInBins, 1)
		g.ctx.MemoryBarrier()
		t.coarse.end()
		t.kernel4.begin()
		g.ctx.BindProgram(g.programs.kernel4)
		g.ctx.DispatchCompute(tileDims.X, tileDims.Y, 1)
		g.ctx.MemoryBarrier()
		t.kernel4.end()

		// Read back the header to learn whether the shaders succeeded.
		if err := g.buffers.memory.buffer.Download(byteslice.Struct(g.memHeader)); err != nil {
			if err == driver.ErrContentLost {
				// Run the whole sequence again.
				continue
			}
			return err
		}
		switch errCode := g.memHeader.mem_error; errCode {
		case memNoError:
			return nil
		case memMallocFailed:
			// Resize memory and try again.
			realloced = true
			sz := g.buffers.memory.size * 15 / 10
			if err := g.buffers.memory.ensureCapacity(g.ctx, sz); err != nil {
				return err
			}
			continue
		default:
			return fmt.Errorf("compute: shader program failed with error %d", errCode)
		}
	}
}
 863  
 864  // zeros returns a byte slice with size bytes of zeros.
 865  func (g *compute) zeros(size int) []byte {
 866  	if cap(g.zeroSlice) < size {
 867  		g.zeroSlice = append(g.zeroSlice, make([]byte, size)...)
 868  	}
 869  	return g.zeroSlice[:size]
 870  }
 871  
 872  func (g *compute) resizeOutput(size image.Point) error {
 873  	if g.output.image != nil {
 874  		g.output.image.Release()
 875  		g.output.image = nil
 876  	}
 877  	img, err := g.ctx.NewTexture(driver.TextureFormatRGBA8, size.X, size.Y,
 878  		driver.FilterNearest,
 879  		driver.FilterNearest,
 880  		driver.BufferBindingShaderStorage|driver.BufferBindingTexture)
 881  	if err != nil {
 882  		return err
 883  	}
 884  	g.output.image = img
 885  	g.output.size = size
 886  	return nil
 887  }
 888  
 889  func (g *compute) Release() {
 890  	if g.drawOps.pathCache != nil {
 891  		g.drawOps.pathCache.release()
 892  	}
 893  	if g.cache != nil {
 894  		g.cache.release()
 895  	}
 896  	progs := []driver.Program{
 897  		g.programs.elements,
 898  		g.programs.tileAlloc,
 899  		g.programs.pathCoarse,
 900  		g.programs.backdrop,
 901  		g.programs.binning,
 902  		g.programs.coarse,
 903  		g.programs.kernel4,
 904  	}
 905  	if p := g.output.blitProg; p != nil {
 906  		p.Release()
 907  	}
 908  	for _, p := range progs {
 909  		if p != nil {
 910  			p.Release()
 911  		}
 912  	}
 913  	g.buffers.scene.release()
 914  	g.buffers.state.release()
 915  	g.buffers.memory.release()
 916  	if b := g.buffers.config; b != nil {
 917  		b.Release()
 918  	}
 919  	if g.output.image != nil {
 920  		g.output.image.Release()
 921  	}
 922  	if g.images.tex != nil {
 923  		g.images.tex.Release()
 924  	}
 925  	if g.materials.layout != nil {
 926  		g.materials.layout.Release()
 927  	}
 928  	if g.materials.prog != nil {
 929  		g.materials.prog.Release()
 930  	}
 931  	if g.materials.fbo != nil {
 932  		g.materials.fbo.Release()
 933  	}
 934  	if g.materials.tex != nil {
 935  		g.materials.tex.Release()
 936  	}
 937  	if g.materials.buffer != nil {
 938  		g.materials.buffer.Release()
 939  	}
 940  	if g.timers.t != nil {
 941  		g.timers.t.release()
 942  	}
 943  
 944  	*g = compute{}
 945  }
 946  
 947  func (g *compute) bindBuffers() {
 948  	bindStorageBuffers(g.programs.elements, g.buffers.memory.buffer, g.buffers.config, g.buffers.scene.buffer, g.buffers.state.buffer)
 949  	bindStorageBuffers(g.programs.tileAlloc, g.buffers.memory.buffer, g.buffers.config)
 950  	bindStorageBuffers(g.programs.pathCoarse, g.buffers.memory.buffer, g.buffers.config)
 951  	bindStorageBuffers(g.programs.backdrop, g.buffers.memory.buffer, g.buffers.config)
 952  	bindStorageBuffers(g.programs.binning, g.buffers.memory.buffer, g.buffers.config)
 953  	bindStorageBuffers(g.programs.coarse, g.buffers.memory.buffer, g.buffers.config)
 954  	bindStorageBuffers(g.programs.kernel4, g.buffers.memory.buffer, g.buffers.config)
 955  }
 956  
 957  func (b *sizedBuffer) release() {
 958  	if b.buffer == nil {
 959  		return
 960  	}
 961  	b.buffer.Release()
 962  	*b = sizedBuffer{}
 963  }
 964  
 965  func (b *sizedBuffer) ensureCapacity(ctx driver.Device, size int) error {
 966  	if b.size >= size {
 967  		return nil
 968  	}
 969  	if b.buffer != nil {
 970  		b.release()
 971  	}
 972  	buf, err := ctx.NewBuffer(driver.BufferBindingShaderStorage, size)
 973  	if err != nil {
 974  		return err
 975  	}
 976  	b.buffer = buf
 977  	b.size = size
 978  	return nil
 979  }
 980  
 981  func bindStorageBuffers(prog driver.Program, buffers ...driver.Buffer) {
 982  	for i, buf := range buffers {
 983  		prog.SetStorageBuffer(i, buf)
 984  	}
 985  }
 986  
// bo is the little-endian byte order.
// NOTE(review): not referenced in the visible part of this file; presumably
// used by other files of the package — confirm.
var bo = binary.LittleEndian
 988  
 989  func (e *encoder) reset() {
 990  	e.scene = e.scene[:0]
 991  	e.npath = 0
 992  	e.npathseg = 0
 993  	e.ntrans = 0
 994  }
 995  
 996  func (e *encoder) numElements() int {
 997  	return len(e.scene)
 998  }
 999  
1000  func (e *encoder) append(e2 encoder) {
1001  	e.scene = append(e.scene, e2.scene...)
1002  	e.npath += e2.npath
1003  	e.npathseg += e2.npathseg
1004  	e.ntrans += e2.ntrans
1005  }
1006  
1007  func (e *encoder) transform(m f32.Affine2D) {
1008  	e.scene = append(e.scene, scene.Transform(m))
1009  	e.ntrans++
1010  }
1011  
1012  func (e *encoder) lineWidth(width float32) {
1013  	e.scene = append(e.scene, scene.SetLineWidth(width))
1014  }
1015  
1016  func (e *encoder) fillMode(mode scene.FillMode) {
1017  	e.scene = append(e.scene, scene.SetFillMode(mode))
1018  }
1019  
1020  func (e *encoder) beginClip(bbox f32.Rectangle) {
1021  	e.scene = append(e.scene, scene.BeginClip(bbox))
1022  	e.npath++
1023  }
1024  
1025  func (e *encoder) endClip(bbox f32.Rectangle) {
1026  	e.scene = append(e.scene, scene.EndClip(bbox))
1027  	e.npath++
1028  }
1029  
1030  func (e *encoder) rect(r f32.Rectangle) {
1031  	// Rectangle corners, clock-wise.
1032  	c0, c1, c2, c3 := r.Min, f32.Pt(r.Min.X, r.Max.Y), r.Max, f32.Pt(r.Max.X, r.Min.Y)
1033  	e.line(c0, c1)
1034  	e.line(c1, c2)
1035  	e.line(c2, c3)
1036  	e.line(c3, c0)
1037  }
1038  
1039  func (e *encoder) fillColor(col color.RGBA) {
1040  	e.scene = append(e.scene, scene.FillColor(col))
1041  	e.npath++
1042  }
1043  
1044  func (e *encoder) setFillImageOffset(index int, offset image.Point) {
1045  	x := int16(offset.X)
1046  	y := int16(offset.Y)
1047  	e.scene[index][2] = uint32(uint16(x)) | uint32(uint16(y))<<16
1048  }
1049  
1050  func (e *encoder) fillImage(index int) {
1051  	e.scene = append(e.scene, scene.FillImage(index))
1052  	e.npath++
1053  }
1054  
1055  func (e *encoder) line(start, end f32.Point) {
1056  	e.scene = append(e.scene, scene.Line(start, end))
1057  	e.npathseg++
1058  }
1059  
1060  func (e *encoder) quad(start, ctrl, end f32.Point) {
1061  	e.scene = append(e.scene, scene.Quad(start, ctrl, end))
1062  	e.npathseg++
1063  }
1064