x86 and AVX¶
x86 processors from Intel and AMD dominate data centre servers where most ML training happens. This file covers the evolution of x86 SIMD, AVX/AVX2 intrinsics programming, AVX-512, Intel AMX for matrix operations, memory alignment, performance pitfalls, and profiling -- the tools for squeezing maximum performance from the world's most common server CPUs.
- If your training runs on cloud VMs (AWS, GCP, Azure), it is almost certainly running on x86. Even GPU-heavy training has CPU bottlenecks: data loading, preprocessing, gradient aggregation, and checkpointing all run on the CPU. Optimising these with x86 SIMD can meaningfully reduce end-to-end training time.
The x86 SIMD Evolution¶
- x86 SIMD has evolved through increasingly wider vector registers:
| Generation | Year | Register Width | Registers | Key Features |
|---|---|---|---|---|
| MMX | 1997 | 64-bit | 8 (mm0-7) | Integer only, shared with FPU |
| SSE | 1999 | 128-bit | 8 (xmm0-7) | 4 floats, dedicated registers |
| SSE2 | 2001 | 128-bit | 8/16 | 2 doubles, integer operations |
| AVX | 2011 | 256-bit | 16 (ymm0-15) | 8 floats, 3-operand instructions |
| AVX2 | 2013 | 256-bit | 16 | Integer 256-bit, FMA, gather |
| AVX-512 | 2017 | 512-bit | 32 (zmm0-31) | 16 floats, mask registers, scatter |
| AMX | 2023 | Tile registers | 8 tiles | Matrix multiply (BF16, INT8) |
-
Each generation doubled the throughput for vectorised code. Code written with SSE intrinsics runs on every x86 CPU made since 2001. AVX2 requires a 2013+ CPU. AVX-512 is Intel Xeon and some consumer chips. AMX is the newest (Sapphire Rapids and later).
-
Backward compatibility: x86 SSE registers (xmm) are the lower 128 bits of AVX registers (ymm), which are the lower 256 bits of AVX-512 registers (zmm). Old SSE code runs on new CPUs without modification.
AVX2 Programming¶
- AVX2 operates on 256-bit registers (YMM), processing 8 floats or 4 doubles simultaneously. It is the sweet spot for portable high-performance code: available on virtually all modern x86 CPUs (2013+).
Intrinsics Naming Convention¶
-
All x86 intrinsics follow the pattern: `_mm[width]_[operation]_[type]`
- Width: `_mm` = MMX/SSE (128-bit), `_mm256` = AVX (256-bit), `_mm512` = AVX-512 (512-bit)
- Operation: `add`, `mul`, `fmadd`, `load`, `store`, `set`, etc.
- Type: `ps` = packed single (float32), `pd` = packed double (float64), `epi32` = packed int32, `si256` = 256-bit integer
#include <immintrin.h> // all x86 SIMD intrinsics
// Data types
__m256 a; // 256-bit register holding 8 float32s
__m256d b; // 256-bit register holding 4 float64s
__m256i c; // 256-bit register holding integers (8x32, 16x16, or 32x8)
Loading and Storing Data¶
// Load 8 floats from memory
__m256 v = _mm256_loadu_ps(ptr); // unaligned load (works with any address)
__m256 v = _mm256_load_ps(ptr); // aligned load (ptr must be 32-byte aligned, faster)
// Store 8 floats to memory
_mm256_storeu_ps(out_ptr, v); // unaligned store
_mm256_store_ps(out_ptr, v); // aligned store
// Broadcast a single value to all 8 lanes
__m256 ones = _mm256_set1_ps(1.0f); // [1, 1, 1, 1, 1, 1, 1, 1]
// Set individual values (rarely needed)
__m256 v = _mm256_set_ps(7,6,5,4,3,2,1,0); // note: reverse order!
// Zero register
__m256 z = _mm256_setzero_ps();
Arithmetic¶
__m256 c = _mm256_add_ps(a, b); // c[i] = a[i] + b[i]
__m256 d = _mm256_mul_ps(a, b); // d[i] = a[i] * b[i]
__m256 e = _mm256_sub_ps(a, b); // e[i] = a[i] - b[i]
__m256 f = _mm256_div_ps(a, b); // f[i] = a[i] / b[i] (slower than mul)
// Fused Multiply-Add: r = a * b + c (one instruction, one rounding)
__m256 r = _mm256_fmadd_ps(a, b, c); // THE most important instruction for ML
// Min and max
__m256 mn = _mm256_min_ps(a, b); // min(a[i], b[i]) — useful for clipping
__m256 mx = _mm256_max_ps(a, b); // max(a[i], b[i]) — useful for ReLU
Practical Example: AVX2 Dot Product¶
#include <immintrin.h>
// Dot product of two float32 arrays using AVX2 + FMA.
// a, b: input arrays of length n (no alignment requirement — unaligned loads are used).
// Returns sum over i of a[i] * b[i].
// Note: the vectorised accumulation order differs from a plain scalar loop,
// so the result can differ from a scalar dot product in the last few ULPs.
float dot_avx2(const float* a, const float* b, int n) {
    __m256 sum = _mm256_setzero_ps(); // one vector accumulator: 8 per-lane partial sums, all 0
    int i = 0;
    // Main loop: 8 elements per iteration.
    for (; i + 8 <= n; i += 8) {
        __m256 va = _mm256_loadu_ps(a + i);
        __m256 vb = _mm256_loadu_ps(b + i);
        sum = _mm256_fmadd_ps(va, vb, sum); // per lane: sum += va * vb (fused, single rounding)
    }
    // Horizontal reduction: sum the 8 elements of sum
    // Step 1: add upper 128 bits to lower 128 bits
    __m128 hi = _mm256_extractf128_ps(sum, 1); // lanes 4..7
    __m128 lo = _mm256_castps256_ps128(sum);   // lanes 0..3 (pure cast — no instruction emitted)
    __m128 sum128 = _mm_add_ps(hi, lo); // 4 partial sums
    // Step 2: horizontal add within 128-bit register
    sum128 = _mm_hadd_ps(sum128, sum128); // [a+b, c+d, a+b, c+d]
    sum128 = _mm_hadd_ps(sum128, sum128); // [a+b+c+d, ...]
    float result = _mm_cvtss_f32(sum128); // extract scalar from lane 0
    // Scalar cleanup for the trailing n % 8 elements.
    for (; i < n; i++) {
        result += a[i] * b[i];
    }
    return result;
}
-
Why the horizontal reduction is ugly: SIMD is designed for vertical operations (lane 0 with lane 0, lane 1 with lane 1). Horizontal operations (summing across lanes) fight the hardware. This is why dot products have the awkward reduction code at the end. The vectorised loop is clean; the reduction is boilerplate.
-
Performance: compared to the NEON version (file 02), AVX2 processes 8 floats per iteration vs NEON's 4. For long vectors, this gives up to a 2x throughput advantage over NEON at comparable clock speed and issue rate (ignoring memory bandwidth limits).
Practical Example: AVX2 Softmax (Simplified)¶
- Softmax requires: find the max, subtract it, exponentiate, sum, divide. Here is the max-finding step:
// Maximum element of a float32 array using AVX2.
// data: input array of length n (unaligned loads — no alignment requirement).
// Returns the largest element; returns -INFINITY when n == 0.
float vector_max_avx2(const float* data, int n) {
    __m256 max_vec = _mm256_set1_ps(-INFINITY); // identity element for max, broadcast to all 8 lanes
    int i = 0;
    // Main loop: running lane-wise maximum, 8 elements per iteration.
    for (; i + 8 <= n; i += 8) {
        __m256 v = _mm256_loadu_ps(data + i);
        max_vec = _mm256_max_ps(max_vec, v);
    }
    // Reduce 8 maxes to 1
    // Step 1: max the upper 128 bits against the lower 128 bits (8 -> 4 candidates).
    __m128 hi = _mm256_extractf128_ps(max_vec, 1);
    __m128 lo = _mm256_castps256_ps128(max_vec); // pure cast — no instruction emitted
    __m128 max128 = _mm_max_ps(hi, lo);
    // Shuffle and max to find the single maximum
    // 0b01001110 selects elements [2,3,0,1] (swaps the two 64-bit halves): 4 -> 2 candidates.
    max128 = _mm_max_ps(max128, _mm_shuffle_ps(max128, max128, 0b01001110));
    // 0b10110001 selects elements [1,0,3,2] (swaps adjacent pairs): 2 -> 1 candidate in lane 0.
    max128 = _mm_max_ps(max128, _mm_shuffle_ps(max128, max128, 0b10110001));
    float result = _mm_cvtss_f32(max128);
    // Scalar cleanup for the trailing n % 8 elements.
    for (; i < n; i++) {
        result = result > data[i] ? result : data[i];
    }
    return result;
}
- The `_mm_shuffle_ps` instruction rearranges elements within a register. The binary constant `0b01001110` controls which elements go where. This is called a permutation, and it connects directly to permutation matrices (chapter 2): shuffling SIMD lanes is the hardware equivalent of multiplying by a permutation matrix.
AVX-512¶
- AVX-512 doubles the width again: 512-bit registers (ZMM), processing 16 floats simultaneously.
__m512 a = _mm512_loadu_ps(ptr); // load 16 floats
__m512 c = _mm512_fmadd_ps(a, b, c); // 16 FMAs at once
float sum = _mm512_reduce_add_ps(a); // built-in horizontal sum (no manual reduction!)
// Mask operations: operate on a subset of lanes
__mmask16 mask = _mm512_cmpgt_ps_mask(a, zero); // which lanes are > 0?
__m512 relu = _mm512_maskz_mov_ps(mask, a); // zero out negative lanes = ReLU
-
Mask registers (
__mmask16) are AVX-512's most powerful feature. Each bit controls whether a lane participates in an operation. This replaces the scalar cleanup loop: the last iteration uses a mask where only the valid lanes are active, handling any vector length without a separate scalar loop. -
AVX-512 frequency throttling: on many Intel CPUs, using AVX-512 instructions causes the CPU to temporarily reduce its clock frequency (to stay within thermal limits). This means AVX-512 is not always faster than AVX2 for short bursts — the frequency penalty can outweigh the wider vectors. For sustained workloads (like matrix multiplication), AVX-512 wins. For mixed code (some SIMD, some scalar), the frequency transitions can hurt.
Intel AMX: Matrix Multiply Hardware¶
- AMX (Advanced Matrix eXtensions) adds dedicated matrix multiply units. Instead of SIMD vectors, AMX operates on tiles: 2D blocks of data (up to 16 rows × 64 bytes each).
#include <immintrin.h>
// AMX tile multiply: C += A * B (in BF16)
// A is 16x32 BF16, B is 32x16 BF16, C is 16x16 FP32
_tile_loadd(0, a_ptr, stride_a); // load tile 0 from A
_tile_loadd(1, b_ptr, stride_b); // load tile 1 from B
_tile_dpbf16ps(2, 0, 1); // tile 2 += tile 0 * tile 1 (BF16 matmul, FP32 accumulation)
_tile_stored(2, c_ptr, stride_c); // store tile 2 to C
-
AMX performs a full 16×32 × 32×16 matrix multiply in a single instruction. This is hundreds of FMA operations at once, specifically designed for the small matrix multiplies that dominate transformer inference (attention score computation, MLP layers).
-
AMX supports BF16 (bfloat16) and INT8, matching the precisions used in ML inference. Combined with AVX-512 for other operations, AMX-equipped CPUs (Intel Sapphire Rapids, Emerald Rapids) can run transformer inference competitively with entry-level GPUs.
Memory Alignment¶
- Aligned memory access is when the data address is a multiple of the vector register width (16 bytes for SSE, 32 bytes for AVX, 64 bytes for AVX-512). Aligned access is faster on some CPUs and required by
`_mm256_load_ps` (as opposed to `_mm256_loadu_ps`).
// Allocate aligned memory
float* data = (float*)aligned_alloc(32, n * sizeof(float)); // 32-byte aligned for AVX
// C++ aligned allocation
#include <new>
float* data = new (std::align_val_t(32)) float[n];
// Alternatively, use the compiler attribute
alignas(32) float data[1024];
- In practice: on modern CPUs (Haswell and later), unaligned loads (
loadu) are almost as fast as aligned loads when the data does not cross a cache line boundary. The performance penalty of unaligned access has largely disappeared, but cache line splits (data spanning two 64-byte cache lines) can still cause a ~2x slowdown for that specific load. Aligned allocation avoids this entirely.
Performance Pitfalls¶
-
AVX-SSE transition penalty: on older Intel CPUs (pre-Skylake), switching between AVX (256-bit) and SSE (128-bit) instructions caused a penalty (~70 cycles). This is why you should use
`_mm256_zeroupper()` (or the `vzeroupper` instruction) before returning from a function that uses AVX, to clear the upper 128 bits of YMM registers. On Skylake and later the large transition penalty is gone (replaced by a much cheaper false dependency on the upper register halves), and compilers emit `vzeroupper` automatically. -
Register pressure: AVX2 has 16 YMM registers. If your kernel uses too many variables, the compiler spills registers to the stack (memory), destroying performance. Keep inner loops simple with few live variables.
-
Data dependencies:
`sum = _mm256_fmadd_ps(a, b, sum)` has a dependency on `sum`: each iteration must wait for the previous FMA to complete (~4-5 cycles latency). The fix: use multiple independent accumulators and reduce at the end:
// Single accumulator: limited by FMA latency (4-5 cycles)
__m256 sum = _mm256_setzero_ps();
for (...) {
sum = _mm256_fmadd_ps(a, b, sum); // each depends on previous
}
// Four accumulators: 4x throughput (hide latency)
__m256 sum0 = _mm256_setzero_ps();
__m256 sum1 = _mm256_setzero_ps();
__m256 sum2 = _mm256_setzero_ps();
__m256 sum3 = _mm256_setzero_ps();
for (...) {
sum0 = _mm256_fmadd_ps(a0, b0, sum0); // independent
sum1 = _mm256_fmadd_ps(a1, b1, sum1); // independent
sum2 = _mm256_fmadd_ps(a2, b2, sum2); // independent
sum3 = _mm256_fmadd_ps(a3, b3, sum3); // independent
}
sum0 = _mm256_add_ps(sum0, sum1);
sum2 = _mm256_add_ps(sum2, sum3);
sum0 = _mm256_add_ps(sum0, sum2);
- This is loop unrolling to hide latency. The CPU can issue the FMAs back-to-back because they write to different registers. This is one of the most impactful micro-optimisations in numerical code.
Profiling¶
- Performance counters provide hardware-level measurements:
# Linux perf (requires kernel support)
perf stat ./my_program # basic counters: cycles, instructions, IPC
perf stat -e cache-misses,cache-references ./my_program # cache behaviour
perf record -g ./my_program && perf report # call graph profiling
# Intel VTune (detailed x86 profiling)
vtune -collect hotspots -- ./my_program
vtune -collect memory-access -- ./my_program # memory bandwidth analysis
- What to look for:
- IPC (Instructions Per Cycle): how efficiently the CPU is used. IPC > 2 is good. IPC < 1 suggests memory stalls or branch mispredictions.
- Cache miss rate: high L1/L2 miss rates indicate poor data locality. Restructure data access patterns.
- Branch misprediction rate: > 5% suggests unpredictable branches. Convert to branchless code (SIMD comparisons + blending) if possible.
- FLOPS achieved vs roofline: compare your measured FLOPS to the roofline model (file 01). If you are below the roofline, there is room for improvement.
Coding Tasks (compile with g++ or clang++ on x86 — Intel/AMD)¶
-
Write a scalar dot product and an AVX2 dot product. Benchmark both and measure the speedup from 8-wide SIMD.
// task1_avx_dot.cpp // Compile: g++ -O3 -mavx2 -mfma -o task1 task1_avx_dot.cpp #include <iostream> #include <chrono> #include <vector> #include <immintrin.h> float dot_scalar(const float* a, const float* b, int n) { float sum = 0.0f; for (int i = 0; i < n; i++) sum += a[i] * b[i]; return sum; } float dot_avx2(const float* a, const float* b, int n) { __m256 sum = _mm256_setzero_ps(); int i = 0; for (; i + 8 <= n; i += 8) { __m256 va = _mm256_loadu_ps(a + i); __m256 vb = _mm256_loadu_ps(b + i); sum = _mm256_fmadd_ps(va, vb, sum); } // Reduce: add upper 128 to lower 128, then horizontal add __m128 hi = _mm256_extractf128_ps(sum, 1); __m128 lo = _mm256_castps256_ps128(sum); __m128 r = _mm_add_ps(hi, lo); r = _mm_hadd_ps(r, r); r = _mm_hadd_ps(r, r); float result = _mm_cvtss_f32(r); for (; i < n; i++) result += a[i] * b[i]; return result; } int main() { const int N = 10'000'000; std::vector<float> a(N, 1.0f), b(N, 2.0f); volatile float s1 = dot_scalar(a.data(), b.data(), N); volatile float s2 = dot_avx2(a.data(), b.data(), N); auto bench = [&](auto fn, const char* name) { auto start = std::chrono::high_resolution_clock::now(); volatile float s; for (int t = 0; t < 100; t++) s = fn(a.data(), b.data(), N); auto end = std::chrono::high_resolution_clock::now(); double ms = std::chrono::duration<double, std::milli>(end - start).count() / 100; std::cout << name << ": " << ms << " ms (result: " << s << ")\n"; return ms; }; double t1 = bench(dot_scalar, "Scalar"); double t2 = bench(dot_avx2, "AVX2 "); std::cout << "Speedup: " << t1 / t2 << "x\n"; return 0; } -
Implement AVX2 ReLU using
_mm256_max_psand compare against a scalar loop. Then try with multiple accumulators (loop unrolling) to hide FMA latency.
// task2_avx_relu.cpp
// Compile: g++ -O3 -mavx2 -o task2 task2_avx_relu.cpp
#include <iostream>
#include <chrono>
#include <vector>
#include <immintrin.h>

// Scalar ReLU: out[i] = max(in[i], 0).
void relu_scalar(const float* in, float* out, int n) {
    for (int i = 0; i < n; i++) {
        out[i] = in[i] > 0.0f ? in[i] : 0.0f;
    }
}

// AVX2 ReLU: 8 floats per iteration via _mm256_max_ps against zero,
// plus a scalar cleanup loop for the n % 8 tail.
void relu_avx2(const float* in, float* out, int n) {
    __m256 zero = _mm256_setzero_ps();
    int i = 0;
    for (; i + 8 <= n; i += 8) {
        __m256 x = _mm256_loadu_ps(in + i);
        _mm256_storeu_ps(out + i, _mm256_max_ps(x, zero));
    }
    for (; i < n; i++) out[i] = in[i] > 0.0f ? in[i] : 0.0f;
}

// Unrolled: process 32 floats per iteration (4 x 8)
// ReLU has no loop-carried dependency, so the unroll mainly keeps the
// load/store pipeline busy; expect this to be memory-bandwidth bound.
void relu_avx2_unrolled(const float* in, float* out, int n) {
    __m256 zero = _mm256_setzero_ps();
    int i = 0;
    for (; i + 32 <= n; i += 32) {
        __m256 x0 = _mm256_loadu_ps(in + i);
        __m256 x1 = _mm256_loadu_ps(in + i + 8);
        __m256 x2 = _mm256_loadu_ps(in + i + 16);
        __m256 x3 = _mm256_loadu_ps(in + i + 24);
        _mm256_storeu_ps(out + i, _mm256_max_ps(x0, zero));
        _mm256_storeu_ps(out + i + 8, _mm256_max_ps(x1, zero));
        _mm256_storeu_ps(out + i + 16, _mm256_max_ps(x2, zero));
        _mm256_storeu_ps(out + i + 24, _mm256_max_ps(x3, zero));
    }
    // Leftover whole vectors of 8, then scalar tail.
    for (; i + 8 <= n; i += 8) {
        _mm256_storeu_ps(out + i, _mm256_max_ps(_mm256_loadu_ps(in + i), zero));
    }
    for (; i < n; i++) out[i] = in[i] > 0.0f ? in[i] : 0.0f;
}

int main() {
    const int N = 16'000'000;
    std::vector<float> in(N), out(N);
    // Test data alternates negative and positive values (half get zeroed).
    for (int i = 0; i < N; i++) in[i] = (float)(i % 200) - 100.0f;
    // Benchmark helper: average of 100 runs; bw counts one read + one write per element.
    auto bench = [&](auto fn, const char* name) {
        fn(in.data(), out.data(), N); // warm up
        auto start = std::chrono::high_resolution_clock::now();
        for (int t = 0; t < 100; t++) fn(in.data(), out.data(), N);
        auto end = std::chrono::high_resolution_clock::now();
        double ms = std::chrono::duration<double, std::milli>(end - start).count() / 100;
        double bw = 2.0 * N * sizeof(float) / ms / 1e6; // read + write
        std::cout << name << ": " << ms << " ms (" << bw << " GB/s)\n";
    };
    bench(relu_scalar, "Scalar ");
    bench(relu_avx2, "AVX2 ");
    bench(relu_avx2_unrolled, "AVX2 unrolled ");
    return 0;
}
-
Measure the effect of memory alignment. Compare aligned vs unaligned loads on large arrays.
// task3_alignment.cpp // Compile: g++ -O3 -mavx2 -o task3 task3_alignment.cpp #include <iostream> #include <chrono> #include <cstdlib> #include <immintrin.h> int main() { const int N = 16'000'000; // Aligned allocation (32-byte for AVX2) float* aligned = (float*)aligned_alloc(32, N * sizeof(float)); // Unaligned: offset by 4 bytes (1 float) from aligned boundary float* raw = (float*)malloc((N + 1) * sizeof(float)); float* unaligned = raw + 1; // guaranteed misaligned for (int i = 0; i < N; i++) { aligned[i] = 1.0f; unaligned[i] = 1.0f; } auto bench = [&](float* ptr, bool use_aligned, const char* name) { __m256 sum = _mm256_setzero_ps(); // Warm up for (int i = 0; i + 8 <= N; i += 8) { __m256 v = use_aligned ? _mm256_load_ps(ptr + i) : _mm256_loadu_ps(ptr + i); sum = _mm256_add_ps(sum, v); } auto start = std::chrono::high_resolution_clock::now(); for (int t = 0; t < 100; t++) { sum = _mm256_setzero_ps(); for (int i = 0; i + 8 <= N; i += 8) { __m256 v = use_aligned ? _mm256_load_ps(ptr + i) : _mm256_loadu_ps(ptr + i); sum = _mm256_add_ps(sum, v); } } auto end = std::chrono::high_resolution_clock::now(); double ms = std::chrono::duration<double, std::milli>(end - start).count() / 100; double bw = (double)N * sizeof(float) / ms / 1e6; std::cout << name << ": " << ms << " ms (" << bw << " GB/s)\n"; }; bench(aligned, true, "Aligned load "); bench(unaligned, false, "Unaligned load"); free(aligned); free(raw); return 0; }