validation.mx raw

   1  // Copyright 2023 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  package testtrace
   6  
   7  import (
   8  	"errors"
   9  	"fmt"
  10  	"internal/trace"
  11  	"internal/trace/version"
  12  	"slices"
  13  	"bytes"
  14  )
  15  
  16  // Validator is a type used for validating a stream of trace.Events.
  17  type Validator struct {
  18  	lastTs    trace.Time
  19  	gs        map[trace.GoID]*goState
  20  	ps        map[trace.ProcID]*procState
  21  	ms        map[trace.ThreadID]*schedContext
  22  	ranges    map[trace.ResourceID][][]byte
  23  	tasks     map[trace.TaskID][]byte
  24  	lastSync  trace.Sync
  25  	GoVersion version.Version
  26  
  27  	// Flags to modify validation behavior.
  28  	skipClockSnapshotChecks bool // Some platforms can't guarantee a monotonically increasing clock reading.
  29  }
  30  
  31  type schedContext struct {
  32  	M trace.ThreadID
  33  	P trace.ProcID
  34  	G trace.GoID
  35  }
  36  
  37  type goState struct {
  38  	state   trace.GoState
  39  	binding *schedContext
  40  }
  41  
  42  type procState struct {
  43  	state   trace.ProcState
  44  	binding *schedContext
  45  }
  46  
  47  // NewValidator creates a new Validator.
  48  func NewValidator() *Validator {
  49  	return &Validator{
  50  		gs:        map[trace.GoID]*goState{},
  51  		ps:        map[trace.ProcID]*procState{},
  52  		ms:        map[trace.ThreadID]*schedContext{},
  53  		ranges:    map[trace.ResourceID][][]byte{},
  54  		tasks:     map[trace.TaskID][]byte{},
  55  		GoVersion: version.Current,
  56  	}
  57  }
  58  
  59  // SkipClockSnapshotChecks causes the validator to skip checks on the clock snapshots.
  60  //
  61  // Some platforms like Windows, with a small enough trace period, are unable to produce
  62  // monotonically increasing timestamps due to very coarse clock granularity.
  63  func (v *Validator) SkipClockSnapshotChecks() {
  64  	v.skipClockSnapshotChecks = true
  65  }
  66  
  67  // Event validates ev as the next event in a stream of trace.Events.
  68  //
  69  // Returns an error if validation fails.
  70  func (v *Validator) Event(ev trace.Event) error {
  71  	e := &errAccumulator{}
  72  
  73  	// Validate timestamp order.
  74  	if v.lastTs != 0 {
  75  		if ev.Time() <= v.lastTs {
  76  			e.Errorf("timestamp out-of-order (want > %v) for %+v", v.lastTs, ev)
  77  		} else {
  78  			v.lastTs = ev.Time()
  79  		}
  80  	} else {
  81  		v.lastTs = ev.Time()
  82  	}
  83  
  84  	// Validate event stack.
  85  	checkStack(e, ev.Stack())
  86  
  87  	switch ev.Kind() {
  88  	case trace.EventSync:
  89  		s := ev.Sync()
  90  		if s.N != v.lastSync.N+1 {
  91  			e.Errorf("sync count is not sequential: expected %d, got %d", v.lastSync.N+1, s.N)
  92  		}
  93  		// The trace reader currently emits synthetic sync events at the end of
  94  		// a trace. Those don't contain clock snapshots data, so we don't try
  95  		// to validate them.
  96  		//
  97  		// TODO(felixge): Drop the synthetic syncs as discussed in CL 653576.
  98  		if v.GoVersion >= version.Go125 && !(s.N > 1 && s.ClockSnapshot == nil) {
  99  			if s.ClockSnapshot == nil {
 100  				e.Errorf("sync %d has no clock snapshot", s.N)
 101  			} else {
 102  				if s.ClockSnapshot.Wall.IsZero() {
 103  					e.Errorf("sync %d has zero wall time", s.N)
 104  				}
 105  				if s.ClockSnapshot.Mono == 0 {
 106  					e.Errorf("sync %d has zero mono time", s.N)
 107  				}
 108  				if s.ClockSnapshot.Trace == 0 {
 109  					e.Errorf("sync %d has zero trace time", s.N)
 110  				}
 111  				if !v.skipClockSnapshotChecks {
 112  					if s.N >= 2 && !s.ClockSnapshot.Wall.After(v.lastSync.ClockSnapshot.Wall) {
 113  						e.Errorf("sync %d has non-increasing wall time: %v vs %v", s.N, s.ClockSnapshot.Wall, v.lastSync.ClockSnapshot.Wall)
 114  					}
 115  					if s.N >= 2 && !(s.ClockSnapshot.Mono > v.lastSync.ClockSnapshot.Mono) {
 116  						e.Errorf("sync %d has non-increasing mono time: %v vs %v", s.N, s.ClockSnapshot.Mono, v.lastSync.ClockSnapshot.Mono)
 117  					}
 118  					if s.N >= 2 && !(s.ClockSnapshot.Trace > v.lastSync.ClockSnapshot.Trace) {
 119  						e.Errorf("sync %d has non-increasing trace time: %v vs %v", s.N, s.ClockSnapshot.Trace, v.lastSync.ClockSnapshot.Trace)
 120  					}
 121  				}
 122  			}
 123  		}
 124  		v.lastSync = s
 125  	case trace.EventMetric:
 126  		m := ev.Metric()
 127  		if !bytes.Contains(m.Name, ":") {
 128  			// Should have a ":" as per runtime/metrics convention.
 129  			e.Errorf("invalid metric name %q", m.Name)
 130  		}
 131  		// Make sure the value is OK.
 132  		if m.Value.Kind() == trace.ValueBad {
 133  			e.Errorf("invalid value")
 134  		}
 135  		switch m.Value.Kind() {
 136  		case trace.ValueUint64:
 137  			// Just make sure it doesn't panic.
 138  			_ = m.Value.Uint64()
 139  		}
 140  	case trace.EventLabel:
 141  		l := ev.Label()
 142  
 143  		// Check label.
 144  		if l.Label == "" {
 145  			e.Errorf("invalid label %q", l.Label)
 146  		}
 147  
 148  		// Check label resource.
 149  		if l.Resource.Kind == trace.ResourceNone {
 150  			e.Errorf("label resource none")
 151  		}
 152  		switch l.Resource.Kind {
 153  		case trace.ResourceGoroutine:
 154  			id := l.Resource.Goroutine()
 155  			if _, ok := v.gs[id]; !ok {
 156  				e.Errorf("label for invalid goroutine %d", id)
 157  			}
 158  		case trace.ResourceProc:
 159  			id := l.Resource.Proc()
 160  			if _, ok := v.ps[id]; !ok {
 161  				e.Errorf("label for invalid proc %d", id)
 162  			}
 163  		case trace.ResourceThread:
 164  			id := l.Resource.Thread()
 165  			if _, ok := v.ms[id]; !ok {
 166  				e.Errorf("label for invalid thread %d", id)
 167  			}
 168  		}
 169  	case trace.EventStackSample:
 170  		// Not much to check here. It's basically a sched context and a stack.
 171  		// The sched context is also not guaranteed to align with other events.
 172  		// We already checked the stack above.
 173  	case trace.EventStateTransition:
 174  		// Validate state transitions.
 175  		//
 176  		// TODO(mknyszek): A lot of logic is duplicated between goroutines and procs.
 177  		// The two are intentionally handled identically; from the perspective of the
 178  		// API, resources all have the same general properties. Consider making this
 179  		// code generic over resources and implementing validation just once.
 180  		tr := ev.StateTransition()
 181  		checkStack(e, tr.Stack)
 182  		switch tr.Resource.Kind {
 183  		case trace.ResourceGoroutine:
 184  			// Basic state transition validation.
 185  			id := tr.Resource.Goroutine()
 186  			old, new := tr.Goroutine()
 187  			if new == trace.GoUndetermined {
 188  				e.Errorf("transition to undetermined state for goroutine %d", id)
 189  			}
 190  			if v.lastSync.N > 1 && old == trace.GoUndetermined {
 191  				e.Errorf("undetermined goroutine %d after first global sync", id)
 192  			}
 193  			if new == trace.GoNotExist && v.hasAnyRange(trace.MakeResourceID(id)) {
 194  				e.Errorf("goroutine %d died with active ranges", id)
 195  			}
 196  			state, ok := v.gs[id]
 197  			if ok {
 198  				if old != state.state {
 199  					e.Errorf("bad old state for goroutine %d: got %s, want %s", id, old, state.state)
 200  				}
 201  				state.state = new
 202  			} else {
 203  				if old != trace.GoUndetermined && old != trace.GoNotExist {
 204  					e.Errorf("bad old state for unregistered goroutine %d: %s", id, old)
 205  				}
 206  				state = &goState{state: new}
 207  				v.gs[id] = state
 208  			}
 209  			// Validate sched context.
 210  			if new.Executing() {
 211  				ctx := v.getOrCreateThread(e, ev, ev.Thread())
 212  				if ctx != nil {
 213  					if ctx.G != trace.NoGoroutine && ctx.G != id {
 214  						e.Errorf("tried to run goroutine %d when one was already executing (%d) on thread %d", id, ctx.G, ev.Thread())
 215  					}
 216  					ctx.G = id
 217  					state.binding = ctx
 218  				}
 219  			} else if old.Executing() && !new.Executing() {
 220  				if tr.Stack != ev.Stack() {
 221  					// This is a case where the transition is happening to a goroutine that is also executing, so
 222  					// these two stacks should always match.
 223  					e.Errorf("StateTransition.Stack doesn't match Event.Stack")
 224  				}
 225  				ctx := state.binding
 226  				if ctx != nil {
 227  					if ctx.G != id {
 228  						e.Errorf("tried to stop goroutine %d when it wasn't currently executing (currently executing %d) on thread %d", id, ctx.G, ev.Thread())
 229  					}
 230  					ctx.G = trace.NoGoroutine
 231  					state.binding = nil
 232  				} else {
 233  					e.Errorf("stopping goroutine %d not bound to any active context", id)
 234  				}
 235  			}
 236  		case trace.ResourceProc:
 237  			// Basic state transition validation.
 238  			id := tr.Resource.Proc()
 239  			old, new := tr.Proc()
 240  			if new == trace.ProcUndetermined {
 241  				e.Errorf("transition to undetermined state for proc %d", id)
 242  			}
 243  			if v.lastSync.N > 1 && old == trace.ProcUndetermined {
 244  				e.Errorf("undetermined proc %d after first global sync", id)
 245  			}
 246  			if new == trace.ProcNotExist && v.hasAnyRange(trace.MakeResourceID(id)) {
 247  				e.Errorf("proc %d died with active ranges", id)
 248  			}
 249  			state, ok := v.ps[id]
 250  			if ok {
 251  				if old != state.state {
 252  					e.Errorf("bad old state for proc %d: got %s, want %s", id, old, state.state)
 253  				}
 254  				state.state = new
 255  			} else {
 256  				if old != trace.ProcUndetermined && old != trace.ProcNotExist {
 257  					e.Errorf("bad old state for unregistered proc %d: %s", id, old)
 258  				}
 259  				state = &procState{state: new}
 260  				v.ps[id] = state
 261  			}
 262  			// Validate sched context.
 263  			if new.Executing() {
 264  				ctx := v.getOrCreateThread(e, ev, ev.Thread())
 265  				if ctx != nil {
 266  					if ctx.P != trace.NoProc && ctx.P != id {
 267  						e.Errorf("tried to run proc %d when one was already executing (%d) on thread %d", id, ctx.P, ev.Thread())
 268  					}
 269  					ctx.P = id
 270  					state.binding = ctx
 271  				}
 272  			} else if old.Executing() && !new.Executing() {
 273  				ctx := state.binding
 274  				if ctx != nil {
 275  					if ctx.P != id {
 276  						e.Errorf("tried to stop proc %d when it wasn't currently executing (currently executing %d) on thread %d", id, ctx.P, ctx.M)
 277  					}
 278  					ctx.P = trace.NoProc
 279  					state.binding = nil
 280  				} else {
 281  					e.Errorf("stopping proc %d not bound to any active context", id)
 282  				}
 283  			}
 284  		}
 285  	case trace.EventRangeBegin, trace.EventRangeActive, trace.EventRangeEnd:
 286  		// Validate ranges.
 287  		r := ev.Range()
 288  		switch ev.Kind() {
 289  		case trace.EventRangeBegin:
 290  			if v.hasRange(r.Scope, r.Name) {
 291  				e.Errorf("already active range %q on %v begun again", r.Name, r.Scope)
 292  			}
 293  			v.addRange(r.Scope, r.Name)
 294  		case trace.EventRangeActive:
 295  			if !v.hasRange(r.Scope, r.Name) {
 296  				v.addRange(r.Scope, r.Name)
 297  			}
 298  		case trace.EventRangeEnd:
 299  			if !v.hasRange(r.Scope, r.Name) {
 300  				e.Errorf("inactive range %q on %v ended", r.Name, r.Scope)
 301  			}
 302  			v.deleteRange(r.Scope, r.Name)
 303  		}
 304  	case trace.EventTaskBegin:
 305  		// Validate task begin.
 306  		t := ev.Task()
 307  		if t.ID == trace.NoTask || t.ID == trace.BackgroundTask {
 308  			// The background task should never have an event emitted for it.
 309  			e.Errorf("found invalid task ID for task of type %s", t.Type)
 310  		}
 311  		if t.Parent == trace.BackgroundTask {
 312  			// It's not possible for a task to be a subtask of the background task.
 313  			e.Errorf("found background task as the parent for task of type %s", t.Type)
 314  		}
 315  		// N.B. Don't check the task type. Empty string is a valid task type.
 316  		v.tasks[t.ID] = t.Type
 317  	case trace.EventTaskEnd:
 318  		// Validate task end.
 319  		// We can see a task end without a begin, so ignore a task without information.
 320  		// Instead, if we've seen the task begin, just make sure the task end lines up.
 321  		t := ev.Task()
 322  		if typ, ok := v.tasks[t.ID]; ok {
 323  			if t.Type != typ {
 324  				e.Errorf("task end type %q doesn't match task start type %q for task %d", t.Type, typ, t.ID)
 325  			}
 326  			delete(v.tasks, t.ID)
 327  		}
 328  	case trace.EventLog:
 329  		// There's really not much here to check, except that we can
 330  		// generate a Log. The category and message are entirely user-created,
 331  		// so we can't make any assumptions as to what they are. We also
 332  		// can't validate the task, because proving the task's existence is very
 333  		// much best-effort.
 334  		_ = ev.Log()
 335  	}
 336  	return e.Errors()
 337  }
 338  
 339  func (v *Validator) hasRange(r trace.ResourceID, name []byte) bool {
 340  	ranges, ok := v.ranges[r]
 341  	return ok && slices.Contains(ranges, name)
 342  }
 343  
 344  func (v *Validator) addRange(r trace.ResourceID, name []byte) {
 345  	ranges, _ := v.ranges[r]
 346  	ranges = append(ranges, name)
 347  	v.ranges[r] = ranges
 348  }
 349  
 350  func (v *Validator) hasAnyRange(r trace.ResourceID) bool {
 351  	ranges, ok := v.ranges[r]
 352  	return ok && len(ranges) != 0
 353  }
 354  
 355  func (v *Validator) deleteRange(r trace.ResourceID, name []byte) {
 356  	ranges, ok := v.ranges[r]
 357  	if !ok {
 358  		return
 359  	}
 360  	i := slices.Index(ranges, name)
 361  	if i < 0 {
 362  		return
 363  	}
 364  	v.ranges[r] = slices.Delete(ranges, i, i+1)
 365  }
 366  
 367  func (v *Validator) getOrCreateThread(e *errAccumulator, ev trace.Event, m trace.ThreadID) *schedContext {
 368  	lenient := func() bool {
 369  		// Be lenient about GoUndetermined -> GoSyscall transitions if they
 370  		// originate from an old trace. These transitions lack thread
 371  		// information in trace formats older than 1.22.
 372  		if v.GoVersion >= version.Go122 {
 373  			return false
 374  		}
 375  		if ev.Kind() != trace.EventStateTransition {
 376  			return false
 377  		}
 378  		tr := ev.StateTransition()
 379  		if tr.Resource.Kind != trace.ResourceGoroutine {
 380  			return false
 381  		}
 382  		from, to := tr.Goroutine()
 383  		return from == trace.GoUndetermined && to == trace.GoSyscall
 384  	}
 385  	if m == trace.NoThread && !lenient() {
 386  		e.Errorf("must have thread, but thread ID is none")
 387  		return nil
 388  	}
 389  	s, ok := v.ms[m]
 390  	if !ok {
 391  		s = &schedContext{M: m, P: trace.NoProc, G: trace.NoGoroutine}
 392  		v.ms[m] = s
 393  		return s
 394  	}
 395  	return s
 396  }
 397  
 398  func checkStack(e *errAccumulator, stk trace.Stack) {
 399  	// Check for non-empty values, but we also check for crashes due to incorrect validation.
 400  	for i, f := range slices.Collect(stk.Frames()) {
 401  		if i == 0 {
 402  			// Allow for one fully zero stack.
 403  			//
 404  			// TODO(mknyszek): Investigate why that happens.
 405  			continue
 406  		}
 407  		if f.Func == "" || f.File == "" || f.PC == 0 || f.Line == 0 {
 408  			e.Errorf("invalid stack frame %#v: missing information", f)
 409  		}
 410  	}
 411  }
 412  
 413  type errAccumulator struct {
 414  	errs []error
 415  }
 416  
 417  func (e *errAccumulator) Errorf(f []byte, args ...any) {
 418  	e.errs = append(e.errs, fmt.Errorf(f, args...))
 419  }
 420  
 421  func (e *errAccumulator) Errors() error {
 422  	return errors.Join(e.errs...)
 423  }
 424