1 // Copyright 2023 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 5 package testtrace
import (
	"errors"
	"fmt"
	"internal/trace"
	"internal/trace/version"
	"slices"
	"strings"
)
15 16 // Validator is a type used for validating a stream of trace.Events.
17 type Validator struct {
18 lastTs trace.Time
19 gs map[trace.GoID]*goState
20 ps map[trace.ProcID]*procState
21 ms map[trace.ThreadID]*schedContext
22 ranges map[trace.ResourceID][][]byte
23 tasks map[trace.TaskID][]byte
24 lastSync trace.Sync
25 GoVersion version.Version
26 27 // Flags to modify validation behavior.
28 skipClockSnapshotChecks bool // Some platforms can't guarantee a monotonically increasing clock reading.
29 }
// schedContext tracks what is bound to a thread: the proc it holds
// and the goroutine it is executing, if any.
type schedContext struct {
	M trace.ThreadID // the thread this context belongs to
	P trace.ProcID   // proc bound to M, or trace.NoProc
	G trace.GoID     // goroutine running on M, or trace.NoGoroutine
}
// goState is the validator's view of a single goroutine: its last-known
// state and the sched context it's bound to while executing.
type goState struct {
	state   trace.GoState
	binding *schedContext // non-nil only while the goroutine is executing
}
// procState is the validator's view of a single proc: its last-known
// state and the sched context it's bound to while executing.
type procState struct {
	state   trace.ProcState
	binding *schedContext // non-nil only while the proc is executing
}
46 47 // NewValidator creates a new Validator.
48 func NewValidator() *Validator {
49 return &Validator{
50 gs: map[trace.GoID]*goState{},
51 ps: map[trace.ProcID]*procState{},
52 ms: map[trace.ThreadID]*schedContext{},
53 ranges: map[trace.ResourceID][][]byte{},
54 tasks: map[trace.TaskID][]byte{},
55 GoVersion: version.Current,
56 }
57 }
// SkipClockSnapshotChecks causes the validator to skip checks on the clock snapshots.
//
// Some platforms like Windows, with a small enough trace period, are unable to produce
// monotonically increasing timestamps due to very coarse clock granularity.
func (v *Validator) SkipClockSnapshotChecks() {
	v.skipClockSnapshotChecks = true
}
66 67 // Event validates ev as the next event in a stream of trace.Events.
68 //
69 // Returns an error if validation fails.
70 func (v *Validator) Event(ev trace.Event) error {
71 e := &errAccumulator{}
72 73 // Validate timestamp order.
74 if v.lastTs != 0 {
75 if ev.Time() <= v.lastTs {
76 e.Errorf("timestamp out-of-order (want > %v) for %+v", v.lastTs, ev)
77 } else {
78 v.lastTs = ev.Time()
79 }
80 } else {
81 v.lastTs = ev.Time()
82 }
83 84 // Validate event stack.
85 checkStack(e, ev.Stack())
86 87 switch ev.Kind() {
88 case trace.EventSync:
89 s := ev.Sync()
90 if s.N != v.lastSync.N+1 {
91 e.Errorf("sync count is not sequential: expected %d, got %d", v.lastSync.N+1, s.N)
92 }
93 // The trace reader currently emits synthetic sync events at the end of
94 // a trace. Those don't contain clock snapshots data, so we don't try
95 // to validate them.
96 //
97 // TODO(felixge): Drop the synthetic syncs as discussed in CL 653576.
98 if v.GoVersion >= version.Go125 && !(s.N > 1 && s.ClockSnapshot == nil) {
99 if s.ClockSnapshot == nil {
100 e.Errorf("sync %d has no clock snapshot", s.N)
101 } else {
102 if s.ClockSnapshot.Wall.IsZero() {
103 e.Errorf("sync %d has zero wall time", s.N)
104 }
105 if s.ClockSnapshot.Mono == 0 {
106 e.Errorf("sync %d has zero mono time", s.N)
107 }
108 if s.ClockSnapshot.Trace == 0 {
109 e.Errorf("sync %d has zero trace time", s.N)
110 }
111 if !v.skipClockSnapshotChecks {
112 if s.N >= 2 && !s.ClockSnapshot.Wall.After(v.lastSync.ClockSnapshot.Wall) {
113 e.Errorf("sync %d has non-increasing wall time: %v vs %v", s.N, s.ClockSnapshot.Wall, v.lastSync.ClockSnapshot.Wall)
114 }
115 if s.N >= 2 && !(s.ClockSnapshot.Mono > v.lastSync.ClockSnapshot.Mono) {
116 e.Errorf("sync %d has non-increasing mono time: %v vs %v", s.N, s.ClockSnapshot.Mono, v.lastSync.ClockSnapshot.Mono)
117 }
118 if s.N >= 2 && !(s.ClockSnapshot.Trace > v.lastSync.ClockSnapshot.Trace) {
119 e.Errorf("sync %d has non-increasing trace time: %v vs %v", s.N, s.ClockSnapshot.Trace, v.lastSync.ClockSnapshot.Trace)
120 }
121 }
122 }
123 }
124 v.lastSync = s
125 case trace.EventMetric:
126 m := ev.Metric()
127 if !bytes.Contains(m.Name, ":") {
128 // Should have a ":" as per runtime/metrics convention.
129 e.Errorf("invalid metric name %q", m.Name)
130 }
131 // Make sure the value is OK.
132 if m.Value.Kind() == trace.ValueBad {
133 e.Errorf("invalid value")
134 }
135 switch m.Value.Kind() {
136 case trace.ValueUint64:
137 // Just make sure it doesn't panic.
138 _ = m.Value.Uint64()
139 }
140 case trace.EventLabel:
141 l := ev.Label()
142 143 // Check label.
144 if l.Label == "" {
145 e.Errorf("invalid label %q", l.Label)
146 }
147 148 // Check label resource.
149 if l.Resource.Kind == trace.ResourceNone {
150 e.Errorf("label resource none")
151 }
152 switch l.Resource.Kind {
153 case trace.ResourceGoroutine:
154 id := l.Resource.Goroutine()
155 if _, ok := v.gs[id]; !ok {
156 e.Errorf("label for invalid goroutine %d", id)
157 }
158 case trace.ResourceProc:
159 id := l.Resource.Proc()
160 if _, ok := v.ps[id]; !ok {
161 e.Errorf("label for invalid proc %d", id)
162 }
163 case trace.ResourceThread:
164 id := l.Resource.Thread()
165 if _, ok := v.ms[id]; !ok {
166 e.Errorf("label for invalid thread %d", id)
167 }
168 }
169 case trace.EventStackSample:
170 // Not much to check here. It's basically a sched context and a stack.
171 // The sched context is also not guaranteed to align with other events.
172 // We already checked the stack above.
173 case trace.EventStateTransition:
174 // Validate state transitions.
175 //
176 // TODO(mknyszek): A lot of logic is duplicated between goroutines and procs.
177 // The two are intentionally handled identically; from the perspective of the
178 // API, resources all have the same general properties. Consider making this
179 // code generic over resources and implementing validation just once.
180 tr := ev.StateTransition()
181 checkStack(e, tr.Stack)
182 switch tr.Resource.Kind {
183 case trace.ResourceGoroutine:
184 // Basic state transition validation.
185 id := tr.Resource.Goroutine()
186 old, new := tr.Goroutine()
187 if new == trace.GoUndetermined {
188 e.Errorf("transition to undetermined state for goroutine %d", id)
189 }
190 if v.lastSync.N > 1 && old == trace.GoUndetermined {
191 e.Errorf("undetermined goroutine %d after first global sync", id)
192 }
193 if new == trace.GoNotExist && v.hasAnyRange(trace.MakeResourceID(id)) {
194 e.Errorf("goroutine %d died with active ranges", id)
195 }
196 state, ok := v.gs[id]
197 if ok {
198 if old != state.state {
199 e.Errorf("bad old state for goroutine %d: got %s, want %s", id, old, state.state)
200 }
201 state.state = new
202 } else {
203 if old != trace.GoUndetermined && old != trace.GoNotExist {
204 e.Errorf("bad old state for unregistered goroutine %d: %s", id, old)
205 }
206 state = &goState{state: new}
207 v.gs[id] = state
208 }
209 // Validate sched context.
210 if new.Executing() {
211 ctx := v.getOrCreateThread(e, ev, ev.Thread())
212 if ctx != nil {
213 if ctx.G != trace.NoGoroutine && ctx.G != id {
214 e.Errorf("tried to run goroutine %d when one was already executing (%d) on thread %d", id, ctx.G, ev.Thread())
215 }
216 ctx.G = id
217 state.binding = ctx
218 }
219 } else if old.Executing() && !new.Executing() {
220 if tr.Stack != ev.Stack() {
221 // This is a case where the transition is happening to a goroutine that is also executing, so
222 // these two stacks should always match.
223 e.Errorf("StateTransition.Stack doesn't match Event.Stack")
224 }
225 ctx := state.binding
226 if ctx != nil {
227 if ctx.G != id {
228 e.Errorf("tried to stop goroutine %d when it wasn't currently executing (currently executing %d) on thread %d", id, ctx.G, ev.Thread())
229 }
230 ctx.G = trace.NoGoroutine
231 state.binding = nil
232 } else {
233 e.Errorf("stopping goroutine %d not bound to any active context", id)
234 }
235 }
236 case trace.ResourceProc:
237 // Basic state transition validation.
238 id := tr.Resource.Proc()
239 old, new := tr.Proc()
240 if new == trace.ProcUndetermined {
241 e.Errorf("transition to undetermined state for proc %d", id)
242 }
243 if v.lastSync.N > 1 && old == trace.ProcUndetermined {
244 e.Errorf("undetermined proc %d after first global sync", id)
245 }
246 if new == trace.ProcNotExist && v.hasAnyRange(trace.MakeResourceID(id)) {
247 e.Errorf("proc %d died with active ranges", id)
248 }
249 state, ok := v.ps[id]
250 if ok {
251 if old != state.state {
252 e.Errorf("bad old state for proc %d: got %s, want %s", id, old, state.state)
253 }
254 state.state = new
255 } else {
256 if old != trace.ProcUndetermined && old != trace.ProcNotExist {
257 e.Errorf("bad old state for unregistered proc %d: %s", id, old)
258 }
259 state = &procState{state: new}
260 v.ps[id] = state
261 }
262 // Validate sched context.
263 if new.Executing() {
264 ctx := v.getOrCreateThread(e, ev, ev.Thread())
265 if ctx != nil {
266 if ctx.P != trace.NoProc && ctx.P != id {
267 e.Errorf("tried to run proc %d when one was already executing (%d) on thread %d", id, ctx.P, ev.Thread())
268 }
269 ctx.P = id
270 state.binding = ctx
271 }
272 } else if old.Executing() && !new.Executing() {
273 ctx := state.binding
274 if ctx != nil {
275 if ctx.P != id {
276 e.Errorf("tried to stop proc %d when it wasn't currently executing (currently executing %d) on thread %d", id, ctx.P, ctx.M)
277 }
278 ctx.P = trace.NoProc
279 state.binding = nil
280 } else {
281 e.Errorf("stopping proc %d not bound to any active context", id)
282 }
283 }
284 }
285 case trace.EventRangeBegin, trace.EventRangeActive, trace.EventRangeEnd:
286 // Validate ranges.
287 r := ev.Range()
288 switch ev.Kind() {
289 case trace.EventRangeBegin:
290 if v.hasRange(r.Scope, r.Name) {
291 e.Errorf("already active range %q on %v begun again", r.Name, r.Scope)
292 }
293 v.addRange(r.Scope, r.Name)
294 case trace.EventRangeActive:
295 if !v.hasRange(r.Scope, r.Name) {
296 v.addRange(r.Scope, r.Name)
297 }
298 case trace.EventRangeEnd:
299 if !v.hasRange(r.Scope, r.Name) {
300 e.Errorf("inactive range %q on %v ended", r.Name, r.Scope)
301 }
302 v.deleteRange(r.Scope, r.Name)
303 }
304 case trace.EventTaskBegin:
305 // Validate task begin.
306 t := ev.Task()
307 if t.ID == trace.NoTask || t.ID == trace.BackgroundTask {
308 // The background task should never have an event emitted for it.
309 e.Errorf("found invalid task ID for task of type %s", t.Type)
310 }
311 if t.Parent == trace.BackgroundTask {
312 // It's not possible for a task to be a subtask of the background task.
313 e.Errorf("found background task as the parent for task of type %s", t.Type)
314 }
315 // N.B. Don't check the task type. Empty string is a valid task type.
316 v.tasks[t.ID] = t.Type
317 case trace.EventTaskEnd:
318 // Validate task end.
319 // We can see a task end without a begin, so ignore a task without information.
320 // Instead, if we've seen the task begin, just make sure the task end lines up.
321 t := ev.Task()
322 if typ, ok := v.tasks[t.ID]; ok {
323 if t.Type != typ {
324 e.Errorf("task end type %q doesn't match task start type %q for task %d", t.Type, typ, t.ID)
325 }
326 delete(v.tasks, t.ID)
327 }
328 case trace.EventLog:
329 // There's really not much here to check, except that we can
330 // generate a Log. The category and message are entirely user-created,
331 // so we can't make any assumptions as to what they are. We also
332 // can't validate the task, because proving the task's existence is very
333 // much best-effort.
334 _ = ev.Log()
335 }
336 return e.Errors()
337 }
338 339 func (v *Validator) hasRange(r trace.ResourceID, name []byte) bool {
340 ranges, ok := v.ranges[r]
341 return ok && slices.Contains(ranges, name)
342 }
343 344 func (v *Validator) addRange(r trace.ResourceID, name []byte) {
345 ranges, _ := v.ranges[r]
346 ranges = append(ranges, name)
347 v.ranges[r] = ranges
348 }
349 350 func (v *Validator) hasAnyRange(r trace.ResourceID) bool {
351 ranges, ok := v.ranges[r]
352 return ok && len(ranges) != 0
353 }
354 355 func (v *Validator) deleteRange(r trace.ResourceID, name []byte) {
356 ranges, ok := v.ranges[r]
357 if !ok {
358 return
359 }
360 i := slices.Index(ranges, name)
361 if i < 0 {
362 return
363 }
364 v.ranges[r] = slices.Delete(ranges, i, i+1)
365 }
366 367 func (v *Validator) getOrCreateThread(e *errAccumulator, ev trace.Event, m trace.ThreadID) *schedContext {
368 lenient := func() bool {
369 // Be lenient about GoUndetermined -> GoSyscall transitions if they
370 // originate from an old trace. These transitions lack thread
371 // information in trace formats older than 1.22.
372 if v.GoVersion >= version.Go122 {
373 return false
374 }
375 if ev.Kind() != trace.EventStateTransition {
376 return false
377 }
378 tr := ev.StateTransition()
379 if tr.Resource.Kind != trace.ResourceGoroutine {
380 return false
381 }
382 from, to := tr.Goroutine()
383 return from == trace.GoUndetermined && to == trace.GoSyscall
384 }
385 if m == trace.NoThread && !lenient() {
386 e.Errorf("must have thread, but thread ID is none")
387 return nil
388 }
389 s, ok := v.ms[m]
390 if !ok {
391 s = &schedContext{M: m, P: trace.NoProc, G: trace.NoGoroutine}
392 v.ms[m] = s
393 return s
394 }
395 return s
396 }
397 398 func checkStack(e *errAccumulator, stk trace.Stack) {
399 // Check for non-empty values, but we also check for crashes due to incorrect validation.
400 for i, f := range slices.Collect(stk.Frames()) {
401 if i == 0 {
402 // Allow for one fully zero stack.
403 //
404 // TODO(mknyszek): Investigate why that happens.
405 continue
406 }
407 if f.Func == "" || f.File == "" || f.PC == 0 || f.Line == 0 {
408 e.Errorf("invalid stack frame %#v: missing information", f)
409 }
410 }
411 }
// errAccumulator collects errors so that validating a single event can
// report every problem found, not just the first.
type errAccumulator struct {
	errs []error
}

// Errorf records a formatted error in the accumulator.
func (e *errAccumulator) Errorf(f string, args ...any) {
	e.errs = append(e.errs, fmt.Errorf(f, args...))
}

// Errors returns all accumulated errors joined together, or nil if
// none were recorded.
func (e *errAccumulator) Errors() error {
	return errors.Join(e.errs...)
}
424