1 // Copyright 2023 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 5 package trace
6 7 import (
8 "cmp"
9 "slices"
10 "bytes"
11 "time"
12 )
13 14 // Summary is the analysis result produced by the summarizer.
15 type Summary struct {
16 Goroutines map[GoID]*GoroutineSummary
17 Tasks map[TaskID]*UserTaskSummary
18 }
19 20 // GoroutineSummary contains statistics and execution details of a single goroutine.
21 // (For v2 traces.)
22 type GoroutineSummary struct {
23 ID GoID
24 Name []byte // A non-unique human-friendly identifier for the goroutine.
25 PC uint64 // The first PC we saw for the entry function of the goroutine
26 CreationTime Time // Timestamp of the first appearance in the trace.
27 StartTime Time // Timestamp of the first time it started running. 0 if the goroutine never ran.
28 EndTime Time // Timestamp of when the goroutine exited. 0 if the goroutine never exited.
29 30 // List of regions in the goroutine, sorted based on the start time.
31 Regions []*UserRegionSummary
32 33 // Statistics of execution time during the goroutine execution.
34 GoroutineExecStats
35 36 // goroutineSummary is state used just for computing this structure.
37 // It's dropped before being returned to the caller.
38 //
39 // More specifically, if it's nil, it indicates that this summary has
40 // already been finalized.
41 *goroutineSummary
42 }
43 44 // UserTaskSummary represents a task in the trace.
45 type UserTaskSummary struct {
46 ID TaskID
47 Name []byte
48 Parent *UserTaskSummary // nil if the parent is unknown.
49 Children []*UserTaskSummary
50 51 // Task begin event. An EventTaskBegin event or nil.
52 Start *Event
53 54 // End end event. Normally EventTaskEnd event or nil.
55 End *Event
56 57 // Logs is a list of EventLog events associated with the task.
58 Logs []*Event
59 60 // List of regions in the task, sorted based on the start time.
61 Regions []*UserRegionSummary
62 63 // Goroutines is the set of goroutines associated with this task.
64 Goroutines map[GoID]*GoroutineSummary
65 }
66 67 // Complete returns true if we have complete information about the task
68 // from the trace: both a start and an end.
69 func (s *UserTaskSummary) Complete() bool {
70 return s.Start != nil && s.End != nil
71 }
72 73 // Descendents returns a slice consisting of itself (always the first task returned),
74 // and the transitive closure of all of its children.
75 func (s *UserTaskSummary) Descendents() []*UserTaskSummary {
76 descendents := []*UserTaskSummary{s}
77 for _, child := range s.Children {
78 descendents = append(descendents, child.Descendents()...)
79 }
80 return descendents
81 }
82 83 // UserRegionSummary represents a region and goroutine execution stats
84 // while the region was active. (For v2 traces.)
85 type UserRegionSummary struct {
86 TaskID TaskID
87 Name []byte
88 89 // Region start event. Normally EventRegionBegin event or nil,
90 // but can be a state transition event from NotExist or Undetermined
91 // if the region is a synthetic region representing task inheritance
92 // from the parent goroutine.
93 Start *Event
94 95 // Region end event. Normally EventRegionEnd event or nil,
96 // but can be a state transition event to NotExist if the goroutine
97 // terminated without explicitly ending the region.
98 End *Event
99 100 GoroutineExecStats
101 }
102 103 // GoroutineExecStats contains statistics about a goroutine's execution
104 // during a period of time.
105 type GoroutineExecStats struct {
106 // These stats are all non-overlapping.
107 ExecTime time.Duration
108 SchedWaitTime time.Duration
109 BlockTimeByReason map[string]time.Duration
110 SyscallTime time.Duration
111 SyscallBlockTime time.Duration
112 113 // TotalTime is the duration of the goroutine's presence in the trace.
114 // Necessarily overlaps with other stats.
115 TotalTime time.Duration
116 117 // Total time the goroutine spent in certain ranges; may overlap
118 // with other stats.
119 RangeTime map[string]time.Duration
120 }
121 122 func (s GoroutineExecStats) NonOverlappingStats() map[string]time.Duration {
123 stats := map[string]time.Duration{
124 "Execution time": s.ExecTime,
125 "Sched wait time": s.SchedWaitTime,
126 "Syscall execution time": s.SyscallTime,
127 "Block time (syscall)": s.SyscallBlockTime,
128 "Unknown time": s.UnknownTime(),
129 }
130 for reason, dt := range s.BlockTimeByReason {
131 stats["Block time ("+reason+")"] += dt
132 }
133 // N.B. Don't include RangeTime or TotalTime; they overlap with these other
134 // stats.
135 return stats
136 }
137 138 // UnknownTime returns whatever isn't accounted for in TotalTime.
139 func (s GoroutineExecStats) UnknownTime() time.Duration {
140 sum := s.ExecTime + s.SchedWaitTime + s.SyscallTime +
141 s.SyscallBlockTime
142 for _, dt := range s.BlockTimeByReason {
143 sum += dt
144 }
145 // N.B. Don't include range time. Ranges overlap with
146 // other stats, whereas these stats are non-overlapping.
147 if sum < s.TotalTime {
148 return s.TotalTime - sum
149 }
150 return 0
151 }
152 153 // sub returns the stats v-s.
154 func (s GoroutineExecStats) sub(v GoroutineExecStats) (r GoroutineExecStats) {
155 r = s.clone()
156 r.ExecTime -= v.ExecTime
157 r.SchedWaitTime -= v.SchedWaitTime
158 for reason := range s.BlockTimeByReason {
159 r.BlockTimeByReason[reason] -= v.BlockTimeByReason[reason]
160 }
161 r.SyscallTime -= v.SyscallTime
162 r.SyscallBlockTime -= v.SyscallBlockTime
163 r.TotalTime -= v.TotalTime
164 for name := range s.RangeTime {
165 r.RangeTime[name] -= v.RangeTime[name]
166 }
167 return r
168 }
169 170 func (s GoroutineExecStats) clone() (r GoroutineExecStats) {
171 r = s
172 r.BlockTimeByReason = map[string]time.Duration{}
173 for reason, dt := range s.BlockTimeByReason {
174 r.BlockTimeByReason[reason] = dt
175 }
176 r.RangeTime = map[string]time.Duration{}
177 for name, dt := range s.RangeTime {
178 r.RangeTime[name] = dt
179 }
180 return r
181 }
182 183 // snapshotStat returns the snapshot of the goroutine execution statistics.
184 // This is called as we process the ordered trace event stream. lastTs is used
185 // to process pending statistics if this is called before any goroutine end event.
186 func (g *GoroutineSummary) snapshotStat(lastTs Time) (ret GoroutineExecStats) {
187 ret = g.GoroutineExecStats.clone()
188 189 if g.goroutineSummary == nil {
190 return ret // Already finalized; no pending state.
191 }
192 193 // Set the total time if necessary.
194 if g.TotalTime == 0 {
195 ret.TotalTime = lastTs.Sub(g.CreationTime)
196 }
197 198 // Add in time since lastTs.
199 if g.lastStartTime != 0 {
200 ret.ExecTime += lastTs.Sub(g.lastStartTime)
201 }
202 if g.lastRunnableTime != 0 {
203 ret.SchedWaitTime += lastTs.Sub(g.lastRunnableTime)
204 }
205 if g.lastBlockTime != 0 {
206 ret.BlockTimeByReason[g.lastBlockReason] += lastTs.Sub(g.lastBlockTime)
207 }
208 if g.lastSyscallTime != 0 {
209 ret.SyscallTime += lastTs.Sub(g.lastSyscallTime)
210 }
211 if g.lastSyscallBlockTime != 0 {
212 ret.SchedWaitTime += lastTs.Sub(g.lastSyscallBlockTime)
213 }
214 for name, ts := range g.lastRangeTime {
215 ret.RangeTime[name] += lastTs.Sub(ts)
216 }
217 return ret
218 }
219 220 // finalize is called when processing a goroutine end event or at
221 // the end of trace processing. This finalizes the execution stat
222 // and any active regions in the goroutine, in which case trigger is nil.
223 func (g *GoroutineSummary) finalize(lastTs Time, trigger *Event) {
224 if trigger != nil {
225 g.EndTime = trigger.Time()
226 }
227 finalStat := g.snapshotStat(lastTs)
228 229 g.GoroutineExecStats = finalStat
230 231 // System goroutines are never part of regions, even though they
232 // "inherit" a task due to creation (EvGoCreate) from within a region.
233 // This may happen e.g. if the first GC is triggered within a region,
234 // starting the GC worker goroutines.
235 if !IsSystemGoroutine(g.Name) {
236 for _, s := range g.activeRegions {
237 s.End = trigger
238 s.GoroutineExecStats = finalStat.sub(s.GoroutineExecStats)
239 g.Regions = append(g.Regions, s)
240 }
241 }
242 *(g.goroutineSummary) = goroutineSummary{}
243 }
244 245 // goroutineSummary is a private part of GoroutineSummary that is required only during analysis.
246 type goroutineSummary struct {
247 lastStartTime Time
248 lastRunnableTime Time
249 lastBlockTime Time
250 lastBlockReason []byte
251 lastSyscallTime Time
252 lastSyscallBlockTime Time
253 lastRangeTime map[string]Time
254 activeRegions []*UserRegionSummary // stack of active regions
255 }
256 257 // Summarizer constructs per-goroutine time statistics for v2 traces.
258 type Summarizer struct {
259 // gs contains the map of goroutine summaries we're building up to return to the caller.
260 gs map[GoID]*GoroutineSummary
261 262 // tasks contains the map of task summaries we're building up to return to the caller.
263 tasks map[TaskID]*UserTaskSummary
264 265 // syscallingP and syscallingG represent a binding between a P and G in a syscall.
266 // Used to correctly identify and clean up after syscalls (blocking or otherwise).
267 syscallingP map[ProcID]GoID
268 syscallingG map[GoID]ProcID
269 270 // rangesP is used for optimistic tracking of P-based ranges for goroutines.
271 //
272 // It's a best-effort mapping of an active range on a P to the goroutine we think
273 // is associated with it.
274 rangesP map[rangeP]GoID
275 276 lastTs Time // timestamp of the last event processed.
277 syncTs Time // timestamp of the last sync event processed (or the first timestamp in the trace).
278 }
279 280 // NewSummarizer creates a new struct to build goroutine stats from a trace.
281 func NewSummarizer() *Summarizer {
282 return &Summarizer{
283 gs: map[GoID]*GoroutineSummary{},
284 tasks: map[TaskID]*UserTaskSummary{},
285 syscallingP: map[ProcID]GoID{},
286 syscallingG: map[GoID]ProcID{},
287 rangesP: map[rangeP]GoID{},
288 }
289 }
290 291 type rangeP struct {
292 id ProcID
293 name []byte
294 }
295 296 // Event feeds a single event into the stats summarizer.
297 func (s *Summarizer) Event(ev *Event) {
298 if s.syncTs == 0 {
299 s.syncTs = ev.Time()
300 }
301 s.lastTs = ev.Time()
302 303 switch ev.Kind() {
304 // Record sync time for the RangeActive events.
305 case EventSync:
306 s.syncTs = ev.Time()
307 308 // Handle state transitions.
309 case EventStateTransition:
310 st := ev.StateTransition()
311 switch st.Resource.Kind {
312 // Handle goroutine transitions, which are the meat of this computation.
313 case ResourceGoroutine:
314 id := st.Resource.Goroutine()
315 old, new := st.Goroutine()
316 if old == new {
317 // Skip these events; they're not telling us anything new.
318 break
319 }
320 321 // Handle transition out.
322 g := s.gs[id]
323 switch old {
324 case GoUndetermined, GoNotExist:
325 g = &GoroutineSummary{ID: id, goroutineSummary: &goroutineSummary{}}
326 // If we're coming out of GoUndetermined, then the creation time is the
327 // time of the last sync.
328 if old == GoUndetermined {
329 g.CreationTime = s.syncTs
330 } else {
331 g.CreationTime = ev.Time()
332 }
333 // The goroutine is being created, or it's being named for the first time.
334 g.lastRangeTime = map[string]Time{}
335 g.BlockTimeByReason = map[string]time.Duration{}
336 g.RangeTime = map[string]time.Duration{}
337 338 // When a goroutine is newly created, inherit the task
339 // of the active region. For ease handling of this
340 // case, we create a fake region description with the
341 // task id. This isn't strictly necessary as this
342 // goroutine may not be associated with the task, but
343 // it can be convenient to see all children created
344 // during a region.
345 //
346 // N.B. ev.Goroutine() will always be NoGoroutine for the
347 // Undetermined case, so this is will simply not fire.
348 if creatorG := s.gs[ev.Goroutine()]; creatorG != nil && len(creatorG.activeRegions) > 0 {
349 regions := creatorG.activeRegions
350 s := regions[len(regions)-1]
351 g.activeRegions = []*UserRegionSummary{{TaskID: s.TaskID, Start: ev}}
352 }
353 s.gs[g.ID] = g
354 case GoRunning:
355 // Record execution time as we transition out of running
356 g.ExecTime += ev.Time().Sub(g.lastStartTime)
357 g.lastStartTime = 0
358 case GoWaiting:
359 // Record block time as we transition out of waiting.
360 if g.lastBlockTime != 0 {
361 g.BlockTimeByReason[g.lastBlockReason] += ev.Time().Sub(g.lastBlockTime)
362 g.lastBlockTime = 0
363 }
364 case GoRunnable:
365 // Record sched latency time as we transition out of runnable.
366 if g.lastRunnableTime != 0 {
367 g.SchedWaitTime += ev.Time().Sub(g.lastRunnableTime)
368 g.lastRunnableTime = 0
369 }
370 case GoSyscall:
371 // Record syscall execution time and syscall block time as we transition out of syscall.
372 if g.lastSyscallTime != 0 {
373 if g.lastSyscallBlockTime != 0 {
374 g.SyscallBlockTime += ev.Time().Sub(g.lastSyscallBlockTime)
375 g.SyscallTime += g.lastSyscallBlockTime.Sub(g.lastSyscallTime)
376 } else {
377 g.SyscallTime += ev.Time().Sub(g.lastSyscallTime)
378 }
379 g.lastSyscallTime = 0
380 g.lastSyscallBlockTime = 0
381 382 // Clear the syscall map.
383 delete(s.syscallingP, s.syscallingG[id])
384 delete(s.syscallingG, id)
385 }
386 }
387 388 // The goroutine hasn't been identified yet. Take the transition stack
389 // and identify the goroutine by the root frame of that stack.
390 // This root frame will be identical for all transitions on this
391 // goroutine, because it represents its immutable start point.
392 if g.Name == "" {
393 for frame := range st.Stack.Frames() {
394 // NB: this PC won't actually be consistent for
395 // goroutines which existed at the start of the
396 // trace. The UI doesn't use it directly; this
397 // mainly serves as an indication that we
398 // actually saw a call stack for the goroutine
399 g.PC = frame.PC
400 g.Name = frame.Func
401 }
402 }
403 404 // Handle transition in.
405 switch new {
406 case GoRunning:
407 // We started running. Record it.
408 g.lastStartTime = ev.Time()
409 if g.StartTime == 0 {
410 g.StartTime = ev.Time()
411 }
412 case GoRunnable:
413 g.lastRunnableTime = ev.Time()
414 case GoWaiting:
415 if st.Reason != "forever" {
416 g.lastBlockTime = ev.Time()
417 g.lastBlockReason = st.Reason
418 break
419 }
420 // "Forever" is like goroutine death.
421 g.finalize(ev.Time(), ev)
422 case GoNotExist:
423 g.finalize(ev.Time(), ev)
424 case GoSyscall:
425 s.syscallingP[ev.Proc()] = id
426 s.syscallingG[id] = ev.Proc()
427 g.lastSyscallTime = ev.Time()
428 }
429 430 // Handle procs to detect syscall blocking, which si identifiable as a
431 // proc going idle while the goroutine it was attached to is in a syscall.
432 case ResourceProc:
433 id := st.Resource.Proc()
434 old, new := st.Proc()
435 if old != new && new == ProcIdle {
436 if goid, ok := s.syscallingP[id]; ok {
437 g := s.gs[goid]
438 g.lastSyscallBlockTime = ev.Time()
439 delete(s.syscallingP, id)
440 }
441 }
442 }
443 444 // Handle ranges of all kinds.
445 case EventRangeBegin, EventRangeActive:
446 r := ev.Range()
447 var g *GoroutineSummary
448 switch r.Scope.Kind {
449 case ResourceGoroutine:
450 // Simple goroutine range. We attribute the entire range regardless of
451 // goroutine stats. Lots of situations are still identifiable, e.g. a
452 // goroutine blocked often in mark assist will have both high mark assist
453 // and high block times. Those interested in a deeper view can look at the
454 // trace viewer.
455 g = s.gs[r.Scope.Goroutine()]
456 case ResourceProc:
457 // N.B. These ranges are not actually bound to the goroutine, they're
458 // bound to the P. But if we happen to be on the P the whole time, let's
459 // try to attribute it to the goroutine. (e.g. GC sweeps are here.)
460 g = s.gs[ev.Goroutine()]
461 if g != nil {
462 s.rangesP[rangeP{id: r.Scope.Proc(), name: r.Name}] = ev.Goroutine()
463 }
464 }
465 if g == nil {
466 break
467 }
468 if ev.Kind() == EventRangeActive {
469 if ts := g.lastRangeTime[r.Name]; ts != 0 {
470 g.RangeTime[r.Name] += s.syncTs.Sub(ts)
471 }
472 g.lastRangeTime[r.Name] = s.syncTs
473 } else {
474 g.lastRangeTime[r.Name] = ev.Time()
475 }
476 case EventRangeEnd:
477 r := ev.Range()
478 var g *GoroutineSummary
479 switch r.Scope.Kind {
480 case ResourceGoroutine:
481 g = s.gs[r.Scope.Goroutine()]
482 case ResourceProc:
483 rp := rangeP{id: r.Scope.Proc(), name: r.Name}
484 if goid, ok := s.rangesP[rp]; ok {
485 if goid == ev.Goroutine() {
486 // As the comment in the RangeBegin case states, this is only OK
487 // if we finish on the same goroutine we started on.
488 g = s.gs[goid]
489 }
490 delete(s.rangesP, rp)
491 }
492 }
493 if g == nil {
494 break
495 }
496 ts := g.lastRangeTime[r.Name]
497 if ts == 0 {
498 break
499 }
500 g.RangeTime[r.Name] += ev.Time().Sub(ts)
501 delete(g.lastRangeTime, r.Name)
502 503 // Handle user-defined regions.
504 case EventRegionBegin:
505 g := s.gs[ev.Goroutine()]
506 r := ev.Region()
507 region := &UserRegionSummary{
508 Name: r.Type,
509 TaskID: r.Task,
510 Start: ev,
511 GoroutineExecStats: g.snapshotStat(ev.Time()),
512 }
513 g.activeRegions = append(g.activeRegions, region)
514 // Associate the region and current goroutine to the task.
515 task := s.getOrAddTask(r.Task)
516 task.Regions = append(task.Regions, region)
517 task.Goroutines[g.ID] = g
518 case EventRegionEnd:
519 g := s.gs[ev.Goroutine()]
520 r := ev.Region()
521 var sd *UserRegionSummary
522 if regionStk := g.activeRegions; len(regionStk) > 0 {
523 // Pop the top region from the stack since that's what must have ended.
524 n := len(regionStk)
525 sd = regionStk[n-1]
526 regionStk = regionStk[:n-1]
527 g.activeRegions = regionStk
528 // N.B. No need to add the region to a task; the EventRegionBegin already handled it.
529 } else {
530 // This is an "end" without a start. Just fabricate the region now.
531 sd = &UserRegionSummary{Name: r.Type, TaskID: r.Task}
532 // Associate the region and current goroutine to the task.
533 task := s.getOrAddTask(r.Task)
534 task.Goroutines[g.ID] = g
535 task.Regions = append(task.Regions, sd)
536 }
537 sd.GoroutineExecStats = g.snapshotStat(ev.Time()).sub(sd.GoroutineExecStats)
538 sd.End = ev
539 g.Regions = append(g.Regions, sd)
540 541 // Handle tasks and logs.
542 case EventTaskBegin, EventTaskEnd:
543 // Initialize the task.
544 t := ev.Task()
545 task := s.getOrAddTask(t.ID)
546 task.Name = t.Type
547 task.Goroutines[ev.Goroutine()] = s.gs[ev.Goroutine()]
548 if ev.Kind() == EventTaskBegin {
549 task.Start = ev
550 } else {
551 task.End = ev
552 }
553 // Initialize the parent, if one exists and it hasn't been done yet.
554 // We need to avoid doing it twice, otherwise we could appear twice
555 // in the parent's Children list.
556 if t.Parent != NoTask && task.Parent == nil {
557 parent := s.getOrAddTask(t.Parent)
558 task.Parent = parent
559 parent.Children = append(parent.Children, task)
560 }
561 case EventLog:
562 log := ev.Log()
563 // Just add the log to the task. We'll create the task if it
564 // doesn't exist (it's just been mentioned now).
565 task := s.getOrAddTask(log.Task)
566 task.Goroutines[ev.Goroutine()] = s.gs[ev.Goroutine()]
567 task.Logs = append(task.Logs, ev)
568 }
569 }
570 571 func (s *Summarizer) getOrAddTask(id TaskID) *UserTaskSummary {
572 task := s.tasks[id]
573 if task == nil {
574 task = &UserTaskSummary{ID: id, Goroutines: map[GoID]*GoroutineSummary{}}
575 s.tasks[id] = task
576 }
577 return task
578 }
579 580 // Finalize indicates to the summarizer that we're done processing the trace.
581 // It cleans up any remaining state and returns the full summary.
582 func (s *Summarizer) Finalize() *Summary {
583 for _, g := range s.gs {
584 g.finalize(s.lastTs, nil)
585 586 // Sort based on region start time.
587 slices.SortFunc(g.Regions, func(a, b *UserRegionSummary) int {
588 x := a.Start
589 y := b.Start
590 if x == nil {
591 if y == nil {
592 return 0
593 }
594 return -1
595 }
596 if y == nil {
597 return +1
598 }
599 return cmp.Compare(x.Time(), y.Time())
600 })
601 g.goroutineSummary = nil
602 }
603 return &Summary{
604 Goroutines: s.gs,
605 Tasks: s.tasks,
606 }
607 }
608 609 // RelatedGoroutinesV2 finds a set of goroutines related to goroutine goid for v2 traces.
610 // The association is based on whether they have synchronized with each other in the Go
611 // scheduler (one has unblocked another).
612 func RelatedGoroutinesV2(events []Event, goid GoID) map[GoID]struct{} {
613 // Process all the events, looking for transitions of goroutines
614 // out of GoWaiting. If there was an active goroutine when this
615 // happened, then we know that active goroutine unblocked another.
616 // Scribble all these down so we can process them.
617 type unblockEdge struct {
618 operator GoID
619 operand GoID
620 }
621 var unblockEdges []unblockEdge
622 for _, ev := range events {
623 if ev.Goroutine() == NoGoroutine {
624 continue
625 }
626 if ev.Kind() != EventStateTransition {
627 continue
628 }
629 st := ev.StateTransition()
630 if st.Resource.Kind != ResourceGoroutine {
631 continue
632 }
633 id := st.Resource.Goroutine()
634 old, new := st.Goroutine()
635 if old == new || old != GoWaiting {
636 continue
637 }
638 unblockEdges = append(unblockEdges, unblockEdge{
639 operator: ev.Goroutine(),
640 operand: id,
641 })
642 }
643 // Compute the transitive closure of depth 2 of goroutines that have unblocked each other
644 // (starting from goid).
645 gmap := map[GoID]struct{}{}
646 gmap[goid] = struct{}{}
647 for i := 0; i < 2; i++ {
648 // Copy the map.
649 gmap1 := map[GoID]struct{}{}
650 for g := range gmap {
651 gmap1[g] = struct{}{}
652 }
653 for _, edge := range unblockEdges {
654 if _, ok := gmap[edge.operand]; ok {
655 gmap1[edge.operator] = struct{}{}
656 }
657 }
658 gmap = gmap1
659 }
660 return gmap
661 }
662 663 func IsSystemGoroutine(entryFn []byte) bool {
664 // This mimics runtime.isSystemGoroutine as closely as
665 // possible.
666 // Also, locked g in extra M (with empty entryFn) is system goroutine.
667 return entryFn == "" || entryFn != "runtime.main" && bytes.HasPrefix(entryFn, "runtime.")
668 }
669