cpu_x86.go raw
1 // Copyright 2017 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // +build 386 amd64 amd64p32
6
7 package cpu
8
9 import (
10 "fmt"
11 "strings"
12 )
13
// CacheLineSize is the cache line size this package assumes for the x86
// targets selected by the build tag at the top of this file
// (presumably expressed in bytes — confirm against callers).
const CacheLineSize = 64
15
// cpuid is implemented in cpu_x86.s.
// Callers in this file pass the CPUID leaf as eaxArg and the sub-leaf as
// ecxArg, and read the answer from the four returned register values.
func cpuid(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32)
18
// xgetbv with ecx = 0 is implemented in cpu_x86.s.
// doinit only calls it after the OSXSAVE bit has been confirmed, and
// inspects the returned eax for the register-state bits enabled by the OS.
func xgetbv() (eax, edx uint32)
21
// Feature bit masks for the registers returned by cpuid. Each group is named
// after the leaf and register that doinit (or hasInvariantTSC) tests the
// masks against; within a group the masks are sorted by bit position.
const (
	// Leaf 1, edx.
	cpuid_SSE2 = 1 << 26

	// Leaf 1, ecx.
	cpuid_SSE3       = 1 << 0
	cpuid_PCLMULQDQ  = 1 << 1
	cpuid_SSSE3      = 1 << 9
	cpuid_FMA        = 1 << 12
	cpuid_CMPXCHG16B = 1 << 13
	cpuid_SSE41      = 1 << 19
	cpuid_SSE42      = 1 << 20
	cpuid_POPCNT     = 1 << 23
	cpuid_AES        = 1 << 25
	cpuid_OSXSAVE    = 1 << 27
	cpuid_AVX        = 1 << 28

	// Leaf 7 (sub-leaf 0), ebx.
	cpuid_BMI1     = 1 << 3
	cpuid_AVX2     = 1 << 5
	cpuid_BMI2     = 1 << 8
	cpuid_ERMS     = 1 << 9
	cpuid_AVX512F  = 1 << 16
	cpuid_AVX512DQ = 1 << 17
	cpuid_ADX      = 1 << 19
	cpuid_AVX512BW = 1 << 30
	cpuid_AVX512VL = 1 << 31

	// Extended leaf 0x80000007, edx.
	cpuid_Invariant_TSC = 1 << 8
)
53
54 func doinit() {
55 options = []option{
56 {"adx", &X86.HasADX},
57 {"aes", &X86.HasAES},
58 {"avx", &X86.HasAVX},
59 {"avx2", &X86.HasAVX2},
60 {"bmi1", &X86.HasBMI1},
61 {"bmi2", &X86.HasBMI2},
62 {"erms", &X86.HasERMS},
63 {"fma", &X86.HasFMA},
64 {"pclmulqdq", &X86.HasPCLMULQDQ},
65 {"popcnt", &X86.HasPOPCNT},
66 {"sse3", &X86.HasSSE3},
67 {"sse41", &X86.HasSSE41},
68 {"sse42", &X86.HasSSE42},
69 {"ssse3", &X86.HasSSSE3},
70 {"avx512f", &X86.HasAVX512F},
71 {"avx512dq", &X86.HasAVX512DQ},
72 {"avx512bw", &X86.HasAVX512BW},
73 {"avx512vl", &X86.HasAVX512VL},
74 {"invariant_tsc", &X86.HasInvariantTSC},
75
76 // sse2 set as last element so it can easily be removed again. See code below.
77 {"sse2", &X86.HasSSE2},
78 }
79
80 // Remove sse2 from options on amd64(p32) because SSE2 is a mandatory feature for these GOARCHs.
81 if GOARCH == "amd64" || GOARCH == "amd64p32" {
82 options = options[:len(options)-1]
83 }
84
85 maxID, _, _, _ := cpuid(0, 0)
86
87 if maxID < 1 {
88 return
89 }
90
91 _, _, ecx1, edx1 := cpuid(1, 0)
92 X86.HasSSE2 = isSet(edx1, cpuid_SSE2)
93
94 X86.HasSSE3 = isSet(ecx1, cpuid_SSE3)
95 X86.HasPCLMULQDQ = isSet(ecx1, cpuid_PCLMULQDQ)
96 X86.HasSSSE3 = isSet(ecx1, cpuid_SSSE3)
97 X86.HasFMA = isSet(ecx1, cpuid_FMA)
98 X86.HasSSE41 = isSet(ecx1, cpuid_SSE41)
99 X86.HasSSE42 = isSet(ecx1, cpuid_SSE42)
100 X86.HasPOPCNT = isSet(ecx1, cpuid_POPCNT)
101 X86.HasAES = isSet(ecx1, cpuid_AES)
102 X86.HasCMPXCHG16B = isSet(ecx1, cpuid_CMPXCHG16B)
103 X86.HasOSXSAVE = isSet(ecx1, cpuid_OSXSAVE)
104
105 osSupportsAVX := false
106 osSupportsAVX512 := false
107 // For XGETBV, OSXSAVE bit is required and sufficient.
108 if X86.HasOSXSAVE {
109 eax, _ := xgetbv()
110 // Check if XMM and YMM registers have OS support.
111 osSupportsAVX = isSet(eax, 1<<1) && isSet(eax, 1<<2)
112 // Check is ZMM registers have OS support.
113 osSupportsAVX512 = isSet(eax>>5, 7) && isSet(eax>>1, 3)
114 }
115
116 X86.HasAVX = isSet(ecx1, cpuid_AVX) && osSupportsAVX
117
118 if maxID < 7 {
119 return
120 }
121
122 _, ebx7, _, _ := cpuid(7, 0)
123 X86.HasBMI1 = isSet(ebx7, cpuid_BMI1)
124 X86.HasAVX2 = isSet(ebx7, cpuid_AVX2) && osSupportsAVX
125 X86.HasAVX512F = isSet(ebx7, cpuid_AVX512F) && osSupportsAVX512
126 X86.HasAVX512DQ = isSet(ebx7, cpuid_AVX512DQ) && osSupportsAVX512
127 X86.HasAVX512BW = isSet(ebx7, cpuid_AVX512BW) && osSupportsAVX512
128 X86.HasAVX512VL = isSet(ebx7, cpuid_AVX512VL) && osSupportsAVX512
129 X86.HasBMI2 = isSet(ebx7, cpuid_BMI2)
130 X86.HasERMS = isSet(ebx7, cpuid_ERMS)
131 X86.HasADX = isSet(ebx7, cpuid_ADX)
132
133 X86.Cache = getCacheSize()
134
135 X86.HasInvariantTSC = hasInvariantTSC()
136
137 X86.Family, X86.Model, X86.SteppingID = getVersionInfo()
138
139 X86.Signature = makeSignature(X86.Family, X86.Model)
140
141 X86.Name = getName()
142
143 X86.TSCFrequency = getNativeTSCFrequency(X86.Name, X86.Signature, X86.SteppingID)
144 }
145
// isSet reports whether hwc and value have at least one set bit in common.
// For a single-bit value this is a plain "is this bit set" test; for a
// multi-bit value it is true as soon as any one of those bits is set.
func isSet(hwc uint32, value uint32) bool {
	common := hwc & value
	return common > 0
}
149
150 func hasInvariantTSC() bool {
151 if maxExtendedFunction() < 0x80000007 {
152 return false
153 }
154 _, _, _, edx := cpuid(0x80000007, 0)
155 return isSet(edx, cpuid_Invariant_TSC)
156 }
157
158 func getName() string {
159 if maxExtendedFunction() >= 0x80000004 {
160 v := make([]uint32, 0, 48)
161 for i := uint32(0); i < 3; i++ {
162 a, b, c, d := cpuid(0x80000002+i, 0)
163 v = append(v, a, b, c, d)
164 }
165 return strings.Trim(string(valAsString(v...)), " ")
166 }
167 return "unknown"
168 }
169
170 // getNativeTSCFrequency gets TSC frequency from CPUID,
171 // only supports Intel (Skylake or later microarchitecture) & key information is from Intel manual & kernel codes
172 // (especially this commit: https://github.com/torvalds/linux/commit/604dc9170f2435d27da5039a3efd757dceadc684).
173 func getNativeTSCFrequency(name, sign string, steppingID uint32) uint64 {
174
175 if vendorID() != Intel {
176 return 0
177 }
178
179 if maxFunctionID() < 0x15 {
180 return 0
181 }
182
183 // ApolloLake, GeminiLake, CannonLake (and presumably all new chipsets
184 // from this point) report the crystal frequency directly via CPUID.0x15.
185 // That's definitive data that we can rely upon.
186 eax, ebx, ecx, _ := cpuid(0x15, 0)
187
188 // If ebx is 0, the TSC/”core crystal clock” ratio is not enumerated.
189 // We won't provide TSC frequency detection in this situation.
190 if eax == 0 || ebx == 0 {
191 return 0
192 }
193
194 // Skylake, Kabylake and all variants of those two chipsets report a
195 // crystal frequency of zero.
196 if ecx == 0 { // Crystal clock frequency is not enumerated.
197 ecx = getCrystalClockFrequency(sign, steppingID)
198 }
199
200 // TSC frequency = “core crystal clock frequency” * EBX/EAX.
201 return uint64(ecx) * (uint64(ebx) / uint64(eax))
202 }
203
// CPUID signatures, written as "DisplayFamily_DisplayModel" in hex with an
// "H" suffix, of the Intel processors that getCrystalClockFrequency knows
// about. Listed in ascending model order.
//
// Copied from: CPUID Signature values of DisplayFamily and DisplayModel,
// in Intel 64 and IA-32 Architectures Software Developer's Manual
// Volume 4: Model-Specific Registers
// & https://github.com/torvalds/linux/blob/master/arch/x86/include/asm/intel-family.h
const (
	IntelFam6SkylakeL     = "06_4EH"
	IntelFam6XeonScalable = "06_55H"
	IntelFam6Skylake      = "06_5EH"
	IntelFam6KabylakeL    = "06_8EH"
	IntelFam6Kabylake     = "06_9EH"
)
215
216 // getCrystalClockFrequency gets crystal clock frequency
217 // for Intel processors in which CPUID.15H.EBX[31:0] ÷ CPUID.0x15.EAX[31:0] is enumerated
218 // but CPUID.15H.ECX is not enumerated using this function to get nominal core crystal clock frequency.
219 //
220 // Actually these crystal clock frequencies provided by Intel hardcoded tables are not so accurate in some cases,
221 // e.g. SkyLake server CPU may have issue (All SKX subject the crystal to an EMI reduction circuit that
222 //reduces its actual frequency by (approximately) -0.25%):
223 // see https://lore.kernel.org/lkml/ff6dcea166e8ff8f2f6a03c17beab2cb436aa779.1513920414.git.len.brown@intel.com/
224 // for more details.
225 // With this report, I set a coefficient (0.9975) for IntelFam6SkyLakeX.
226 //
227 // Unlike the kernel way (mentioned in https://github.com/torvalds/linux/commit/604dc9170f2435d27da5039a3efd757dceadc684),
228 // I prefer the Intel hardcoded tables, (in <Intel® 64 and IA-32 Architectures Software Developer’s Manual, Volume 3>
229 // 18.7.3 Determining the Processor Base Frequency, Table 18-85. Nominal Core Crystal Clock Frequency)
230 // because after some testing (comparing with wall clock, see https://github.com/templexxx/tsc/tsc_test.go for more details),
231 // I found hardcoded tables are more accurate.
232 func getCrystalClockFrequency(sign string, steppingID uint32) uint32 {
233
234 if maxFunctionID() < 0x16 {
235 return 0
236 }
237
238 switch sign {
239 case IntelFam6SkylakeL:
240 return 24 * 1000 * 1000
241 case IntelFam6Skylake:
242 return 24 * 1000 * 1000
243 case IntelFam6XeonScalable:
244 // SKL-SP.
245 // see: https://community.intel.com/t5/Software-Tuning-Performance/How-to-detect-microarchitecture-on-Xeon-Scalable/m-p/1205162#M7633.
246 if steppingID == 0x2 || steppingID == 0x3 || steppingID == 0x4 {
247 return 25 * 1000 * 1000 * 0.9975
248 }
249 return 25 * 1000 * 1000 // TODO check other Xeon Scalable has no slow down issue.
250 case IntelFam6KabylakeL:
251 return 24 * 1000 * 1000
252 case IntelFam6Kabylake:
253 return 24 * 1000 * 1000
254 }
255
256 return 0
257 }
258
259 func getVersionInfo() (uint32, uint32, uint32) {
260 if maxFunctionID() < 0x1 {
261 return 0, 0, 0
262 }
263 eax, _, _, _ := cpuid(1, 0)
264 family := (eax >> 8) & 0xf
265 displayFamily := family
266 if family == 0xf {
267 displayFamily = ((eax >> 20) & 0xff) + family
268 }
269 model := (eax >> 4) & 0xf
270 displayModel := model
271 if family == 0x6 || family == 0xf {
272 displayModel = ((eax >> 12) & 0xf0) + model
273 }
274 return displayFamily, displayModel, eax & 0x7
275 }
276
// makeSignature formats a display family and display model as the
// "DisplayFamily_DisplayModel" signature used by the IntelFam6* constants.
//
// signature format: XX_XXH — both parts are upper-case hexadecimal,
// zero-padded to at least two digits, and the model is followed by "H"
// (for example family 0x6, model 0x4e gives "06_4EH", and model 0xf gives
// "06_0FH").
func makeSignature(family, model uint32) string {
	return fmt.Sprintf("%02X_%02XH", family, model)
}
290
291 // getCacheSize is from
292 // https://github.com/klauspost/cpuid/blob/5a626f7029c910cc8329dae5405ee4f65034bce5/cpuid.go#L723
293 func getCacheSize() Cache {
294 c := Cache{
295 L1I: -1,
296 L1D: -1,
297 L2: -1,
298 L3: -1,
299 }
300
301 vendor := vendorID()
302 switch vendor {
303 case Intel:
304 if maxFunctionID() < 4 {
305 return c
306 }
307 for i := uint32(0); ; i++ {
308 eax, ebx, ecx, _ := cpuid(4, i)
309 cacheType := eax & 15
310 if cacheType == 0 {
311 break
312 }
313 cacheLevel := (eax >> 5) & 7
314 coherency := int(ebx&0xfff) + 1
315 partitions := int((ebx>>12)&0x3ff) + 1
316 associativity := int((ebx>>22)&0x3ff) + 1
317 sets := int(ecx) + 1
318 size := associativity * partitions * coherency * sets
319 switch cacheLevel {
320 case 1:
321 if cacheType == 1 {
322 // 1 = Data Cache
323 c.L1D = size
324 } else if cacheType == 2 {
325 // 2 = Instruction Cache
326 c.L1I = size
327 } else {
328 if c.L1D < 0 {
329 c.L1I = size
330 }
331 if c.L1I < 0 {
332 c.L1I = size
333 }
334 }
335 case 2:
336 c.L2 = size
337 case 3:
338 c.L3 = size
339 }
340 }
341 case AMD, Hygon:
342 // Untested.
343 if maxExtendedFunction() < 0x80000005 {
344 return c
345 }
346 _, _, ecx, edx := cpuid(0x80000005, 0)
347 c.L1D = int(((ecx >> 24) & 0xFF) * 1024)
348 c.L1I = int(((edx >> 24) & 0xFF) * 1024)
349
350 if maxExtendedFunction() < 0x80000006 {
351 return c
352 }
353 _, _, ecx, _ = cpuid(0x80000006, 0)
354 c.L2 = int(((ecx >> 16) & 0xFFFF) * 1024)
355 }
356
357 return c
358 }
359
360 func maxFunctionID() uint32 {
361 a, _, _, _ := cpuid(0, 0)
362 return a
363 }
364
365 func maxExtendedFunction() uint32 {
366 eax, _, _, _ := cpuid(0x80000000, 0)
367 return eax
368 }
369
// Vendor identifiers, as returned by vendorID and stored as the values of
// vendorMapping. The values are assigned by iota, so the order of this list
// is significant. Other (0) is what vendorID returns for a vendor string that
// is not present in vendorMapping.
const (
	Other = iota
	Intel
	AMD
	VIA
	Transmeta
	NSC
	KVM  // Kernel-based Virtual Machine
	MSVM // Microsoft Hyper-V or Windows Virtual PC
	VMware
	XenHVM
	Bhyve
	Hygon
)
384
385 // Except from http://en.wikipedia.org/wiki/CPUID#EAX.3D0:_Get_vendor_ID
386 var vendorMapping = map[string]int{
387 "AMDisbetter!": AMD,
388 "AuthenticAMD": AMD,
389 "CentaurHauls": VIA,
390 "GenuineIntel": Intel,
391 "TransmetaCPU": Transmeta,
392 "GenuineTMx86": Transmeta,
393 "Geode by NSC": NSC,
394 "VIA VIA VIA ": VIA,
395 "KVMKVMKVMKVM": KVM,
396 "Microsoft Hv": MSVM,
397 "VMwareVMware": VMware,
398 "XenVMMXenVMM": XenHVM,
399 "bhyve bhyve ": Bhyve,
400 "HygonGenuine": Hygon,
401 }
402
403 func vendorID() int {
404 _, b, c, d := cpuid(0, 0)
405 v := valAsString(b, d, c)
406 vend, ok := vendorMapping[string(v)]
407 if !ok {
408 return Other
409 }
410 return vend
411 }
412
// valAsString unpacks each value into four bytes, least significant byte
// first, and concatenates them in argument order. The output ends just
// before the first zero byte encountered, so the result never contains a
// NUL; without any zero byte all 4*len(values) bytes are returned.
func valAsString(values ...uint32) []byte {
	out := make([]byte, 0, 4*len(values))
	for _, word := range values {
		for shift := uint(0); shift < 32; shift += 8 {
			b := byte(word >> shift)
			if b == 0 {
				return out
			}
			out = append(out, b)
		}
	}
	return out
}
434