1 // Copyright 2025 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 5 package cgroup
6 7 import (
8 "internal/bytealg"
9 "internal/runtime/strconv"
10 "internal/runtime/syscall"
11 )
12 13 var (
14 ErrNoCgroup error = stringError("not in a cgroup")
15 16 errMalformedFile error = stringError("malformed file")
17 )
const (
	// Maximum length of a filesystem path, in bytes.
	_PATH_MAX = 4096

	// Required space to store a path of the cgroup in the filesystem.
	PathSize = _PATH_MAX

	// Escape sequences in /proc/self/mountinfo paths are 4 characters
	// long, so a path made up entirely of escaped characters could be 4
	// times larger than its unescaped form.
	escapedPathMax = 4 * _PATH_MAX

	// Required space to parse /proc/self/mountinfo and /proc/self/cgroup.
	// See parseCPUMount and parseCPURelativePath.
	ParseSize = 4 * escapedPathMax

	// Required length of the scratch buffer passed to OpenCPU: room for
	// the resulting path plus room for parsing.
	//
	// TODO(prattmic): This is shockingly large (~70KiB) due to the (very
	// unlikely) combination of extremely long paths consisting mostly of
	// escaped characters. The scratch buffer ends up in .bss in package
	// runtime, so it doesn't contribute to binary size and generally won't
	// be faulted in, but it would still be nice to shrink this. A more
	// complex parser that did not need to keep entire lines in memory
	// could get away with much less. Alternatively, we could do a one-off
	// mmap allocation for this buffer, which is only mapped larger if we
	// actually need the extra space.
	ScratchSize = PathSize + ParseSize
)
// Names of the CPU control files, written so they can be appended directly
// to a cgroup directory path. Each name carries an explicit trailing NUL so
// that the terminator is part of whatever slice the name is copied into.

// cgroup v1 quota and period files.
const v1QuotaFile = "/cpu.cfs_quota_us\x00"
const v1PeriodFile = "/cpu.cfs_period_us\x00"

// cgroup v2 combined quota/period file.
const v2MaxFile = "/cpu.max\x00"
// Version identifies a cgroup version.
type Version int

// Known cgroup versions. V1 and V2 have the numeric values 1 and 2, and the
// zero value is VersionUnknown.
const (
	VersionUnknown Version = 0
	V1             Version = 1
	V2             Version = 2
)
63 64 // CPU owns the FDs required to read the CPU limit from a cgroup.
65 type CPU struct {
66 version Version
67 68 // For cgroup v1, this is cpu.cfs_quota_us.
69 // For cgroup v2, this is cpu.max.
70 quotaFD int
71 72 // For cgroup v1, this is cpu.cfs_period_us.
73 // For cgroup v2, this is unused.
74 periodFD int
75 }
76 77 func (c CPU) Close() {
78 switch c.version {
79 case V1:
80 syscall.Close(c.quotaFD)
81 syscall.Close(c.periodFD)
82 case V2:
83 syscall.Close(c.quotaFD)
84 default:
85 throw("impossible cgroup version")
86 }
87 }
88 89 func checkBufferSize(s []byte, size int) {
90 if len(s) != size {
91 println("runtime: cgroup buffer length", len(s), "want", size)
92 throw("runtime: cgroup invalid buffer length")
93 }
94 }
95 96 // OpenCPU returns a CPU for the CPU cgroup containing the current process, or
97 // ErrNoCgroup if the process is not in a CPU cgroup.
98 //
99 // scratch must have length ScratchSize.
100 func OpenCPU(scratch []byte) (CPU, error) {
101 checkBufferSize(scratch, ScratchSize)
102 103 base := scratch[:PathSize]
104 scratch2 := scratch[PathSize:]
105 106 n, version, err := FindCPU(base, scratch2)
107 if err != nil {
108 return CPU{}, err
109 }
110 111 switch version {
112 case 1:
113 n2 := copy(base[n:], v1QuotaFile)
114 path := base[:n+n2]
115 quotaFD, errno := syscall.Open(&path[0], syscall.O_RDONLY|syscall.O_CLOEXEC, 0)
116 if errno != 0 {
117 // This may fail if this process was migrated out of
118 // the cgroup found by FindCPU and that cgroup has been
119 // deleted.
120 return CPU{}, errSyscallFailed
121 }
122 123 n2 = copy(base[n:], v1PeriodFile)
124 path = base[:n+n2]
125 periodFD, errno := syscall.Open(&path[0], syscall.O_RDONLY|syscall.O_CLOEXEC, 0)
126 if errno != 0 {
127 // This may fail if this process was migrated out of
128 // the cgroup found by FindCPU and that cgroup has been
129 // deleted.
130 return CPU{}, errSyscallFailed
131 }
132 133 c := CPU{
134 version: 1,
135 quotaFD: quotaFD,
136 periodFD: periodFD,
137 }
138 return c, nil
139 case 2:
140 n2 := copy(base[n:], v2MaxFile)
141 path := base[:n+n2]
142 maxFD, errno := syscall.Open(&path[0], syscall.O_RDONLY|syscall.O_CLOEXEC, 0)
143 if errno != 0 {
144 // This may fail if this process was migrated out of
145 // the cgroup found by FindCPU and that cgroup has been
146 // deleted.
147 return CPU{}, errSyscallFailed
148 }
149 150 c := CPU{
151 version: 2,
152 quotaFD: maxFD,
153 periodFD: -1,
154 }
155 return c, nil
156 default:
157 throw("impossible cgroup version")
158 panic("unreachable")
159 }
160 }
161 162 // Returns average CPU throughput limit from the cgroup, or ok false if there
163 // is no limit.
164 func ReadCPULimit(c CPU) (float64, bool, error) {
165 switch c.version {
166 case 1:
167 quota, err := readV1Number(c.quotaFD)
168 if err != nil {
169 return 0, false, errMalformedFile
170 }
171 172 if quota < 0 {
173 // No limit.
174 return 0, false, nil
175 }
176 177 period, err := readV1Number(c.periodFD)
178 if err != nil {
179 return 0, false, errMalformedFile
180 }
181 182 return float64(quota) / float64(period), true, nil
183 case 2:
184 // quotaFD is the cpu.max FD.
185 return readV2Limit(c.quotaFD)
186 default:
187 throw("impossible cgroup version")
188 panic("unreachable")
189 }
190 }
191 192 // Returns the value from the quota/period file.
193 func readV1Number(fd int) (int64, error) {
194 // The format of the file is "<value>\n" where the value is in
195 // int64 microseconds and, if quota, may be -1 to indicate no limit.
196 //
197 // MaxInt64 requires 19 bytes to display in base 10, thus the
198 // conservative max size of this file is 19 + 1 (newline) = 20 bytes.
199 // We'll provide a bit more for good measure.
200 //
201 // Always read from the beginning of the file to get a fresh value.
202 var b [64]byte
203 n, errno := syscall.Pread(fd, b[:], 0)
204 if errno != 0 {
205 return 0, errSyscallFailed
206 }
207 if n == len(b) {
208 return 0, errMalformedFile
209 }
210 211 buf := b[:n]
212 return parseV1Number(buf)
213 }
214 215 func parseV1Number(buf []byte) (int64, error) {
216 // Ignore trailing newline.
217 i := bytealg.IndexByte(buf, '\n')
218 if i < 0 {
219 return 0, errMalformedFile
220 }
221 buf = buf[:i]
222 223 val, ok := strconv.Atoi64(string(buf))
224 if !ok {
225 return 0, errMalformedFile
226 }
227 228 return val, nil
229 }
230 231 // Returns CPU throughput limit, or ok false if there is no limit.
232 func readV2Limit(fd int) (float64, bool, error) {
233 // The format of the file is "<quota> <period>\n" where quota and
234 // period are microseconds and quota may be "max" to indicate no limit.
235 //
236 // Note that the kernel is inconsistent about whether the values are
237 // uint64 or int64: values are parsed as uint64 but printed as int64.
238 // See kernel/sched/core.c:cpu_max_{show,write}.
239 //
240 // In practice, the kernel limits the period to 1s (1000000us) (see
241 // max_cfs_quota_period), and the quota to (1<<44)us (see
242 // max_cfs_runtime), so these values can't get large enough for the
243 // distinction to matter.
244 //
245 // MaxInt64 requires 19 bytes to display in base 10, thus the
246 // conservative max size of this file is 19 + 19 + 1 (space) + 1
247 // (newline) = 40 bytes. We'll provide a bit more for good measure.
248 //
249 // Always read from the beginning of the file to get a fresh value.
250 var b [64]byte
251 n, errno := syscall.Pread(fd, b[:], 0)
252 if errno != 0 {
253 return 0, false, errSyscallFailed
254 }
255 if n == len(b) {
256 return 0, false, errMalformedFile
257 }
258 259 buf := b[:n]
260 return parseV2Limit(buf)
261 }
262 263 func parseV2Limit(buf []byte) (float64, bool, error) {
264 i := bytealg.IndexByte(buf, ' ')
265 if i < 0 {
266 return 0, false, errMalformedFile
267 }
268 269 quotaStr := buf[:i]
270 if bytealg.Compare(quotaStr, []byte("max")) == 0 {
271 // No limit.
272 return 0, false, nil
273 }
274 275 periodStr := buf[i+1:]
276 // Ignore trailing newline, if any.
277 i = bytealg.IndexByte(periodStr, '\n')
278 if i < 0 {
279 return 0, false, errMalformedFile
280 }
281 periodStr = periodStr[:i]
282 283 quota, ok := strconv.Atoi64(string(quotaStr))
284 if !ok {
285 return 0, false, errMalformedFile
286 }
287 288 period, ok := strconv.Atoi64(string(periodStr))
289 if !ok {
290 return 0, false, errMalformedFile
291 }
292 293 return float64(quota) / float64(period), true, nil
294 }
295 296 // FindCPU finds the path to the CPU cgroup that this process is a member of
297 // and places it in out. scratch is a scratch buffer for internal use.
298 //
299 // out must have length PathSize. scratch must have length ParseSize.
300 //
301 // Returns the number of bytes written to out and the cgroup version (1 or 2).
302 //
303 // Returns ErrNoCgroup if the process is not in a CPU cgroup.
304 func FindCPU(out []byte, scratch []byte) (int, Version, error) {
305 checkBufferSize(out, PathSize)
306 checkBufferSize(scratch, ParseSize)
307 308 // The cgroup path is <cgroup mount point> + <relative path>.
309 //
310 // This is racy if our cgroup is changed while this runs. For example,
311 // initially there is only a cgroup v2 mount and we are not in a
312 // cgroup. After, there a cgroup v1 mount with a CPU controller and we
313 // are placed in a cgroup in this hierarchy. In that case, findCPUMount
314 // could pick the v2 mount, and findCPURelativePath could find the v2
315 // relative path.
316 //
317 // In this case we'll later fail to read the cgroup files and fall back
318 // to assuming no cgroup.
319 320 n, err := FindCPUMountPoint(out, scratch)
321 if err != nil {
322 return 0, 0, err
323 }
324 325 // The relative path always starts with /, so we can directly append it
326 // to the mount point.
327 n2, version, err := FindCPURelativePath(out[n:], scratch)
328 if err != nil {
329 return 0, 0, err
330 }
331 n += n2
332 333 return n, version, nil
334 }
335 336 // FindCPURelativePath finds the path to the CPU cgroup that this process is a member of
337 // relative to the root of the cgroup mount and places it in out. scratch is a
338 // scratch buffer for internal use.
339 //
340 // out must have length PathSize minus the size of the cgroup mount root (if
341 // known). scratch must have length ParseSize.
342 //
343 // Returns the number of bytes written to out and the cgroup version (1 or 2).
344 //
345 // Returns ErrNoCgroup if the process is not in a CPU cgroup.
346 func FindCPURelativePath(out []byte, scratch []byte) (int, Version, error) {
347 path := []byte("/proc/self/cgroup\x00")
348 fd, errno := syscall.Open(&path[0], syscall.O_RDONLY|syscall.O_CLOEXEC, 0)
349 if errno == syscall.ENOENT {
350 return 0, 0, ErrNoCgroup
351 } else if errno != 0 {
352 return 0, 0, errSyscallFailed
353 }
354 355 // The relative path always starts with /, so we can directly append it
356 // to the mount point.
357 n, version, err := parseCPURelativePath(fd, syscall.Read, out[:], scratch)
358 if err != nil {
359 syscall.Close(fd)
360 return 0, 0, err
361 }
362 363 syscall.Close(fd)
364 return n, version, nil
365 }
366 367 // Finds the path of the current process's CPU cgroup relative to the cgroup
368 // mount and writes it to out.
369 //
370 // Returns the number of bytes written and the cgroup version (1 or 2).
371 func parseCPURelativePath(fd int, read func(fd int, b []byte) (int, uintptr), out []byte, scratch []byte) (int, Version, error) {
372 // The format of each line is
373 //
374 // hierarchy-ID:controller-list:cgroup-path
375 //
376 // controller-list is comma-separated.
377 // See man 5 cgroup for more details.
378 //
379 // cgroup v2 has hierarchy-ID 0. If a v1 hierarchy contains "cpu", that
380 // is the CPU controller. Otherwise the v2 hierarchy (if any) is the
381 // CPU controller.
382 //
383 // hierarchy-ID and controller-list have relatively small maximum
384 // sizes, and the path can be up to _PATH_MAX, so we need a bit more
385 // than 1 _PATH_MAX of scratch space.
386 387 l := newLineReader(fd, scratch, read)
388 389 // Bytes written to out.
390 n := 0
391 392 for {
393 err := l.next()
394 if err == errIncompleteLine {
395 // Don't allow incomplete lines. While in theory the
396 // incomplete line may be for a controller we don't
397 // care about, in practice all lines should be of
398 // similar length, so we should just have a buffer big
399 // enough for any.
400 return 0, 0, err
401 } else if err == errEOF {
402 break
403 } else if err != nil {
404 return 0, 0, err
405 }
406 407 line := l.line()
408 409 // The format of each line is
410 //
411 // hierarchy-ID:controller-list:cgroup-path
412 //
413 // controller-list is comma-separated.
414 // See man 5 cgroup for more details.
415 i := bytealg.IndexByte(line, ':')
416 if i < 0 {
417 return 0, 0, errMalformedFile
418 }
419 420 hierarchy := line[:i]
421 line = line[i+1:]
422 423 i = bytealg.IndexByte(line, ':')
424 if i < 0 {
425 return 0, 0, errMalformedFile
426 }
427 428 controllers := line[:i]
429 line = line[i+1:]
430 431 path := line
432 433 if string(hierarchy) == "0" {
434 // v2 hierarchy.
435 n = copy(out, path)
436 // Keep searching, we might find a v1 hierarchy with a
437 // CPU controller, which takes precedence.
438 } else {
439 // v1 hierarchy
440 if containsCPU(controllers) {
441 // Found a v1 CPU controller. This must be the
442 // only one, so we're done.
443 return copy(out, path), V1, nil
444 }
445 }
446 }
447 448 if n == 0 {
449 // Found nothing.
450 return 0, 0, ErrNoCgroup
451 }
452 453 // Must be v2, v1 returns above.
454 return n, V2, nil
455 }
456 457 // Returns true if comma-separated list b contains "cpu".
458 func containsCPU(b []byte) bool {
459 for len(b) > 0 {
460 i := bytealg.IndexByte(b, ',')
461 if i < 0 {
462 // Neither cmd/compile nor gccgo allocates for these string conversions.
463 return string(b) == "cpu"
464 }
465 466 curr := b[:i]
467 rest := b[i+1:]
468 469 if string(curr) == "cpu" {
470 return true
471 }
472 473 b = rest
474 }
475 476 return false
477 }
478 479 // FindCPUMountPoint finds the root of the CPU cgroup mount places it in out.
480 // scratch is a scratch buffer for internal use.
481 //
482 // out must have length PathSize. scratch must have length ParseSize.
483 //
484 // Returns the number of bytes written to out.
485 //
486 // Returns ErrNoCgroup if the process is not in a CPU cgroup.
487 func FindCPUMountPoint(out []byte, scratch []byte) (int, error) {
488 checkBufferSize(out, PathSize)
489 checkBufferSize(scratch, ParseSize)
490 491 path := []byte("/proc/self/mountinfo\x00")
492 fd, errno := syscall.Open(&path[0], syscall.O_RDONLY|syscall.O_CLOEXEC, 0)
493 if errno == syscall.ENOENT {
494 return 0, ErrNoCgroup
495 } else if errno != 0 {
496 return 0, errSyscallFailed
497 }
498 499 n, err := parseCPUMount(fd, syscall.Read, out, scratch)
500 if err != nil {
501 syscall.Close(fd)
502 return 0, err
503 }
504 syscall.Close(fd)
505 506 return n, nil
507 }
508 509 // Returns the mount point for the cpu cgroup controller (v1 or v2) from
510 // /proc/self/mountinfo.
511 func parseCPUMount(fd int, read func(fd int, b []byte) (int, uintptr), out []byte, scratch []byte) (int, error) {
512 // The format of each line is:
513 //
514 // 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue
515 // (1)(2)(3) (4) (5) (6) (7) (8) (9) (10) (11)
516 //
517 // (1) mount ID: unique identifier of the mount (may be reused after umount)
518 // (2) parent ID: ID of parent (or of self for the top of the mount tree)
519 // (3) major:minor: value of st_dev for files on filesystem
520 // (4) root: root of the mount within the filesystem
521 // (5) mount point: mount point relative to the process's root
522 // (6) mount options: per mount options
523 // (7) optional fields: zero or more fields of the form "tag[:value]"
524 // (8) separator: marks the end of the optional fields
525 // (9) filesystem type: name of filesystem of the form "type[.subtype]"
526 // (10) mount source: filesystem specific information or "none"
527 // (11) super options: per super block options
528 //
529 // See man 5 proc_pid_mountinfo for more details.
530 //
531 // Note that emitted paths will not contain space, tab, newline, or
532 // carriage return. Those are escaped. See Linux show_mountinfo ->
533 // show_path. We must unescape before returning.
534 //
535 // We return the mount point (5) if the filesystem type (9) is cgroup2,
536 // or cgroup with "cpu" in the super options (11).
537 //
538 // (4), (5), and (10) are up to _PATH_MAX. The remaining fields have a
539 // small fixed maximum size, so 4*_PATH_MAX is plenty of scratch space.
540 // Note that non-cgroup mounts may have arbitrarily long (11), but we
541 // can skip those when parsing.
542 543 l := newLineReader(fd, scratch, read)
544 545 // Bytes written to out.
546 n := 0
547 548 for {
549 //incomplete := false
550 err := l.next()
551 if err == errIncompleteLine {
552 // An incomplete line is fine as long as it doesn't
553 // impede parsing the fields we need. It shouldn't be
554 // possible for any mount to use more than 3*PATH_MAX
555 // before (9) because there are two paths and all other
556 // earlier fields have bounded options. Only (11) has
557 // unbounded options.
558 } else if err == errEOF {
559 break
560 } else if err != nil {
561 return 0, err
562 }
563 564 line := l.line()
565 566 // Skip first four fields.
567 for range 4 {
568 i := bytealg.IndexByte(line, ' ')
569 if i < 0 {
570 return 0, errMalformedFile
571 }
572 line = line[i+1:]
573 }
574 575 // (5) mount point: mount point relative to the process's root
576 i := bytealg.IndexByte(line, ' ')
577 if i < 0 {
578 return 0, errMalformedFile
579 }
580 mnt := line[:i]
581 line = line[i+1:]
582 583 // Skip ahead past optional fields, delimited by " - ".
584 for {
585 i = bytealg.IndexByte(line, ' ')
586 if i < 0 {
587 return 0, errMalformedFile
588 }
589 if i+3 >= len(line) {
590 return 0, errMalformedFile
591 }
592 delim := line[i : i+3]
593 if string(delim) == " - " {
594 line = line[i+3:]
595 break
596 }
597 line = line[i+1:]
598 }
599 600 // (9) filesystem type: name of filesystem of the form "type[.subtype]"
601 i = bytealg.IndexByte(line, ' ')
602 if i < 0 {
603 return 0, errMalformedFile
604 }
605 ftype := line[:i]
606 line = line[i+1:]
607 608 if string(ftype) != "cgroup" && string(ftype) != "cgroup2" {
609 continue
610 }
611 612 // As in findCPUPath, cgroup v1 with a CPU controller takes
613 // precendence over cgroup v2.
614 if string(ftype) == "cgroup2" {
615 // v2 hierarchy.
616 n, err = unescapePath(out, mnt)
617 if err != nil {
618 // Don't keep searching on error. The kernel
619 // should never produce broken escaping.
620 return n, err
621 }
622 // Keep searching, we might find a v1 hierarchy with a
623 // CPU controller, which takes precedence.
624 continue
625 }
626 627 // (10) mount source: filesystem specific information or "none"
628 i = bytealg.IndexByte(line, ' ')
629 if i < 0 {
630 return 0, errMalformedFile
631 }
632 // Don't care about mount source.
633 line = line[i+1:]
634 635 // (11) super options: per super block options
636 superOpt := line
637 638 // v1 hierarchy
639 if containsCPU(superOpt) {
640 // Found a v1 CPU controller. This must be the
641 // only one, so we're done.
642 return unescapePath(out, mnt)
643 }
644 }
645 646 if n == 0 {
647 // Found nothing.
648 return 0, ErrNoCgroup
649 }
650 651 return n, nil
652 }
653 654 var errInvalidEscape error = stringError("invalid path escape sequence")
655 656 // unescapePath copies in to out, unescaping escape sequences generated by
657 // Linux's show_path.
658 //
659 // That is, '\', ' ', '\t', and '\n' are converted to octal escape sequences,
660 // like '\040' for space.
661 //
662 // out must be at least as large as in.
663 //
664 // Returns the number of bytes written to out.
665 //
666 // Also see escapePath in cgroup_linux_test.go.
667 func unescapePath(out []byte, in []byte) (int, error) {
668 // Not strictly necessary, but simplifies the implementation and will
669 // always hold in users.
670 if len(out) < len(in) {
671 throw("output too small")
672 }
673 674 var outi, ini int
675 for ini < len(in) {
676 c := in[ini]
677 if c != '\\' {
678 out[outi] = c
679 outi++
680 ini++
681 continue
682 }
683 684 // Start of escape sequence.
685 686 // Escape sequence is always 4 characters: one slash and three
687 // digits.
688 if ini+3 >= len(in) {
689 return outi, errInvalidEscape
690 }
691 692 var outc byte
693 for i := range 3 {
694 c := in[ini+1+i]
695 if c < '0' || c > '9' {
696 return outi, errInvalidEscape
697 }
698 699 outc *= 8
700 outc += c - '0'
701 }
702 703 out[outi] = outc
704 outi++
705 706 ini += 4
707 }
708 709 return outi, nil
710 }
711