cpu_amd64.go raw

   1  // Copyright (c) 2024 The Decred developers
   2  // Use of this source code is governed by an ISC
   3  // license that can be found in the LICENSE file.
   4  //
   5  // Feature detection originally written by Dave Collins Feb 2019.  Additional
   6  // cleanup and comments added Jul 2024.
   7  
   8  //go:build !purego
   9  
  10  package compress
  11  
  12  import (
  13  	"os"
  14  )
  15  
  16  var (
  17  	// features houses the result of querying the CPU and OS for supported
  18  	// features.
  19  	features = querySupportedFeatures()
  20  
  21  	hasSSE2  = features.SSE2 && os.Getenv("BLAKE256_DISABLE_SSE2") != "1"
  22  	hasSSE41 = features.SSE41 && os.Getenv("BLAKE256_DISABLE_SSE41") != "1"
  23  	hasAVX   = features.AVX && os.Getenv("BLAKE256_DISABLE_AVX") != "1"
  24  )
  25  
// supportsCPUID returns true when the CPU supports the CPUID opcode.
//
// Only the declaration lives in this file; the body is provided outside of
// this Go source (presumably by an accompanying amd64 assembly file -- confirm
// against the package).  The result is consulted before any call to cpuid
// since the opcode must be known to exist before it is executed.
//
//go:noescape
func supportsCPUID() bool
  30  
// cpuid provides access to the CPUID opcode.
//
// The eaxIn parameter selects which information is queried and ecxIn provides
// the additional input that some queries depend on.  Every caller in this file
// passes 0 for ecxIn.  The four results are named after the EAX, EBX, ECX, and
// EDX registers in which CPUID reports its output.
//
// Only the declaration lives in this file; the body is provided outside of
// this Go source.  Callers are expected to check supportsCPUID first.
//
//go:noescape
func cpuid(eaxIn, ecxIn uint32) (eax, ebx, ecx, edx uint32)
  35  
// xgetbv provides access to the XGETBV opcode to read the contents of the
// extended control register with ECX = 0x00.
//
// Only the EAX portion of the result is returned, which is where the SSE and
// AVX state bits examined by this file reside.  The sole caller in this file
// invokes it only after confirming the OSXSAVE bit reported by CPUID is set.
//
// Only the declaration lives in this file; the body is provided outside of
// this Go source.
//
//go:noescape
func xgetbv() (eax uint32)
  41  
  42  // isBitSet returns whether or not the provided bit is set in the given test
  43  // value.
  44  func isBitSet(testVal uint32, bit uint8) bool {
  45  	return testVal>>bit&1 == 1
  46  }
  47  
// supportedFeatures houses flags that specify whether or not various features
// are supported by the CPU.
//
// The zero value reports that no features are supported, which is what
// querySupportedFeatures returns when the CPU cannot be queried.
type supportedFeatures struct {
	// SSE2 is set when the CPU reports Streaming SIMD Extensions 2.
	SSE2 bool

	// SSE41 is set when the CPU reports Streaming SIMD Extensions 4.1.
	SSE41 bool

	// AVX is set when the CPU reports Advanced Vector Extensions and the OS
	// has enabled saving of the SSE and AVX register state.
	AVX bool

	// AVX2 is set when the CPU reports Advanced Vector Extensions 2 and the
	// OS has enabled saving of the SSE and AVX register state.
	AVX2 bool
}
  56  
  57  // querySupportedFeatures returns the result of querying the CPU and OS to
  58  // determine supported features.
  59  func querySupportedFeatures() supportedFeatures {
  60  	// Per CPUID—CPU Identification in Chapter 3 of the Intel 64 and IA-32
  61  	// Architectures Software Developer's Manual, Volume 2A:
  62  	//
  63  	// "The ID flag (bit 21) in the EFLAGS register indicates support for the
  64  	// CPUID instruction. If a software procedure can set and clear this flag,
  65  	// the processor executing the procedure supports the CPUID instruction.
  66  	// This instruction operates the same in non-64-bit modes and 64-bit mode.
  67  	//
  68  	// CPUID returns processor identification and feature information in the
  69  	// EAX, EBX, ECX, and EDX registers.  The output is dependent on the
  70  	// contents of the EAX register upon execution (in some cases, ECX as
  71  	// well)."
  72  	//
  73  	// The inputs and outputs for determining various levels of SIMD support
  74  	// that are likely relevant to BLAKE are:
  75  	//
  76  	// Initial EAX Value | Output
  77  	// ------------------|------------------------------------------------
  78  	// 0x00              | EAX = Maximum Input Value for Basic CPUID Info.
  79  	// -------------------------------------------------------------------
  80  	// 0x01              | ECX = Feature Information
  81  	//                   |  Bit 0 = Streaming SIMD Extensions 3 (SSE3)
  82  	//                   |  Bit 9 = Supplemental SSE3 (SSSE3)
  83  	//                   |  Bit 19 = Streaming SIMD Extensions 4.1 (SSE4.1)
  84  	//                   |  Bit 20 = Streaming SIMD Extensions 4.2 (SSE4.2)
  85  	//                   |  Bit 27 = OS sets to enable XSAVE features (OSXSAVE)
  86  	//                   |  Bit 28 = Advanced Vector Extensions (AVX)
  87  	//                   | EDX = Feature Information
  88  	//                   |  Bit 25 = Streaming SIMD Extensions (SSE)
  89  	//                   |  Bit 26 = Streaming SIMD Extensions 2 (SSE2)
  90  	// -------------------------------------------------------------------
  91  	// 0x07              | EBX = Feature Information
  92  	//                   |  Bit 5 = Advanced Vector Extensions 2 (AVX2)
  93  	//                   |  Bit 16 = AVX-512 Foundation (AVX512F)
  94  	//                   |  Bit 17 = AVX-512 Double and Quadword (AVX512DQ)
  95  	//                   |  Bit 30 = AVX-512 Byte and Word (AVX512BW)
  96  	//                   |  Bit 31 = AVX-512 Vector Length Extensions (AVX512VL)
  97  	//
  98  	// Note that all SSE and AVX variants also require operating system support
  99  	// in order to properly save the additional state when doing context
 100  	// switches.  Starting with AVX, this is signaled by the OS by setting bits
 101  	// in the extended control register which itself requires CPU support as
 102  	// specified by the OSXSAVE bit in the table above.
 103  	//
 104  	// Per Chapter 13 of the Intel 64 and IA-32 Architectures Software
 105  	// Developer’s Manual, Volume 1, the XGETBV opcode is used to obtain the
 106  	// aforementioned extended control register (XCR).  Per the "XSAVE-SUPPORTED
 107  	// FEATURES AND STATE-COMPONENT BITMAPS" section, the relevant bits for
 108  	// AVX/AVX-512 support are:
 109  	//
 110  	// XCR  | Output
 111  	// -----|------------------------------------------------
 112  	// 0x00 | EAX
 113  	//      |  Bit 1 = SSE state (XMM registers)
 114  	//      |  Bit 2 = AVX state (YMM registers)
 115  	//      |  Bits 5-7 = AVX-512 state components
 116  	//      |   Bit 5 = Opmask state (K0-K7 registers)
 117  	//      |   Bit 6 = ZMM high 256 state (upper 256 bits of ZMM0-ZMM15 registers)
 118  	//      |   Bit 7 = High 16 ZMM state (ZMM16-ZMM31 registers)
 119  	const (
 120  		eaxInputQueryMax          = 0x00
 121  		eaxInputQueryFeatureInfo  = 0x01
 122  		eaxInputQueryExtFeatFlags = 0x07
 123  
 124  		ecx1OutputOSXSAVEBit = 27
 125  		edx1OutputSSE2Bit    = 26
 126  		ecx1OutputSSE41Bit   = 19
 127  		ecx1OutputAVXBit     = 28
 128  		ebx7OutputAVX2Bit    = 5
 129  
 130  		xgbvEaxOutputSSEStateBit = 1
 131  		xgbvEaxOutputAVXStateBit = 2
 132  	)
 133  
 134  	// Nothing to do if the CPU somehow does not support CPUID.  Go probably
 135  	// won't even run on such a CPU, but as the Intel manual states, it is
 136  	// technically required to check if CPUID is supported before querying it
 137  	// and it's best to be safe.
 138  	var features supportedFeatures
 139  	if !supportsCPUID() {
 140  		return features
 141  	}
 142  
 143  	// Perform initial query to determine the max CPUID input value since the
 144  	// remaining checks are only valid if the CPU at least supports querying
 145  	// them to begin with.
 146  	maxEAXInput, _, _, _ := cpuid(eaxInputQueryMax, 0)
 147  	if maxEAXInput < eaxInputQueryFeatureInfo {
 148  		return features
 149  	}
 150  
 151  	// Query basic feature info to determine if the CPU supports SSE2/SSE4.1.
 152  	//
 153  	// Note that SSE2 is always active on amd64, so checking for it could
 154  	// probably technically be skipped, but it doesn't really cost anything
 155  	// extra to check for it and checking is more correct.
 156  	_, _, ecx, edx := cpuid(eaxInputQueryFeatureInfo, 0)
 157  	features.SSE2 = isBitSet(edx, edx1OutputSSE2Bit)
 158  	features.SSE41 = isBitSet(ecx, ecx1OutputSSE41Bit)
 159  	hasOSXSAVE := isBitSet(ecx, ecx1OutputOSXSAVEBit)
 160  
 161  	// Query basic feature info to determine AVX support as well as if the OS
 162  	// supports AVX/AVX2.  See the description above for details.
 163  	var osSupportsAVX bool
 164  	if hasOSXSAVE {
 165  		eax := xgetbv()
 166  		osSupportsSSE := isBitSet(eax, xgbvEaxOutputSSEStateBit)
 167  		osSupportsAVX = osSupportsSSE && isBitSet(eax, xgbvEaxOutputAVXStateBit)
 168  	}
 169  	features.AVX = isBitSet(ecx, ecx1OutputAVXBit) && osSupportsAVX
 170  
 171  	// Querying the supported feature info for AVX2 is only valid if the CPU at
 172  	// least supports querying it to begin with.
 173  	if maxEAXInput < eaxInputQueryExtFeatFlags {
 174  		return features
 175  	}
 176  
 177  	// Query extended feature info to determine AVX2 support.
 178  	_, ebx, _, _ := cpuid(eaxInputQueryExtFeatFlags, 0)
 179  	features.AVX2 = isBitSet(ebx, ebx7OutputAVX2Bit) && osSupportsAVX
 180  
 181  	return features
 182  }
 183