Go - Benchmarking and Profiling & Tracing

Concept: Benchmarking helps analyze code performance for CPU and memory usage, revealing bottlenecks caused by allocations or inefficient algorithms.

Why it matters: Without benchmarks, performance claims are guesses — benchmarks provide reproducible numbers that prove or disprove optimization attempts.

// bench/string_bench_test.go
package bench

import (
	"strings"
	"testing"
)

// two implementations — benchmark reveals which one is actually faster
func joinPlus(parts []string) string {
	result := ""
	for _, p := range parts {
		result += p // new allocation every iteration — O(n²) copies
	}
	return result
}

func joinBuilder(parts []string) string {
	var b strings.Builder
	for _, p := range parts {
		b.WriteString(p) // amortized O(1) — single backing buffer
	}
	return b.String()
}

var parts = []string{"a", "b", "c", "d", "e", "f", "g", "h", "i", "j"}

func BenchmarkJoinPlus(b *testing.B) {
	for i := 0; i < b.N; i++ {
		_ = joinPlus(parts) // b.N adjusted by runtime to hit 1 second
	}
}

func BenchmarkJoinBuilder(b *testing.B) {
	for i := 0; i < b.N; i++ {
		_ = joinBuilder(parts)
	}
}
// go test -bench=. -benchmem ./bench
// -benchmem shows: allocs/op and bytes/op — the allocation story

Gotcha: Benchmarking without -benchmem — you see ns/op but miss the allocation count which is often the real bottleneck.


Concept: Go's benchmarking tools utilize the testing package and functions that begin with Benchmark.

Why it matters: The naming convention Benchmark* with *testing.B is enforced by the runner — any other name is silently ignored as a benchmark.

package bench

import "testing"

// correct signature: Benchmark prefix + *testing.B parameter
func BenchmarkMyFunction(b *testing.B) {
	// setup: NOT counted in the benchmark time
	data := make([]int, 1000)
	for i := range data { data[i] = i }

	b.ResetTimer() // reset: exclude setup time from measurement

	for i := 0; i < b.N; i++ {
		// b.N: adjusted by the runtime — run enough iterations for stable timing
		sum := 0
		for _, v := range data {
			sum += v
		}
		_ = sum // prevent dead code elimination
	}
}

// ignored by go test -bench: wrong prefix
func measureMyFunction(b *testing.B) { _ = b }

// ignored: missing *testing.B parameter
// func BenchmarkWrongSig() {}

// go test -bench=BenchmarkMyFunction  → run only this benchmark
// go test -bench=.                    → run all benchmarks
// go test -bench=. -benchtime=5s      → run for 5 seconds instead of 1

Gotcha: Not calling b.ResetTimer() after expensive setup — setup time is included in the measurement, making the benchmark unfairly slow.


Concept: Benchmarks measure execution time and memory allocation — b.N is adjusted by the testing tool to achieve a specified benchmark duration.

Why it matters: b.N is the tool's way of amortizing measurement noise — by running many iterations, individual timing jitter averages out to a stable ns/op number.

package bench

import (
	"testing"
	"fmt"
)

// observe how b.N grows across iterations
var iterations int

func BenchmarkObserveN(b *testing.B) {
	// b.N starts at 1 and grows until elapsed time reaches benchtime (default 1s)
	// typical growth: 1 → 100 → 10000 → 1000000 → ...
	for i := 0; i < b.N; i++ {
		// minimal work: measure the overhead of the loop itself
		_ = fmt.Sprintf("n=%d", b.N)
	}
}

func BenchmarkWithMemory(b *testing.B) {
	b.ReportAllocs() // equivalent to -benchmem for this benchmark only

	for i := 0; i < b.N; i++ {
		// b.N iterations — each alloc is counted
		s := make([]byte, 64)
		_ = s
	}
	// output: N ns/op   64 B/op   1 allocs/op
}

Gotcha: Putting the benchmark body outside the b.N loop — the code runs once and b.N iterations measure nothing; ALL work being benchmarked must be inside the loop.


Concept: Optimizations and dead code elimination by the compiler can influence benchmark results — need for accurate and realistic test scenarios.

Why it matters: The compiler may eliminate benchmark code entirely if the result is unused — benchmark results drop to 0 ns/op, producing a meaningless measurement.

package bench

import "testing"

// wrong: compiler may eliminate the entire function — result unused
func BenchmarkDeadCode(b *testing.B) {
	for i := 0; i < b.N; i++ {
		result := compute(i) // computed but never used — compiler eliminates
		_ = result           // blank identifier: still may be eliminated
	}
}

// correct: use a package-level sink to prevent dead code elimination
var globalSink int

func BenchmarkPreventElimination(b *testing.B) {
	for i := 0; i < b.N; i++ {
		globalSink = compute(i) // assign to package-level var — compiler cannot eliminate
	}
}

func compute(n int) int {
	result := 0
	for i := 0; i < n%100; i++ {
		result += i * i
	}
	return result
}

Gotcha: Using a local variable as sink instead of a package-level variable — the compiler may still prove the local is unused and eliminate the computation.


Concept: Subbenchmarking — allows grouping and running specific benchmarks within a larger function.

Why it matters: Subbenchmarks organize related performance scenarios in one function — they appear as BenchmarkParent/subtest and can be filtered individually.

package bench

import (
	"sort"
	"strconv"
	"testing"
)

// subbenchmarks: compare performance across different input sizes
func BenchmarkSort(b *testing.B) {
	sizes := []int{10, 100, 1000, 10000}

	for _, size := range sizes {
		size := size
		b.Run(strconv.Itoa(size), func(b *testing.B) {
			// setup: create input data — not counted
			data := make([]int, size)
			for i := range data { data[i] = size - i } // reverse order: worst case

			b.ResetTimer() // exclude setup from measurement

			for i := 0; i < b.N; i++ {
				// copy to avoid sorting sorted data on second iteration
				tmp := make([]int, len(data))
				copy(tmp, data)
				sort.Ints(tmp)
			}
		})
	}
}
// go test -bench=BenchmarkSort/1000   → only 1000-element case
// go test -bench=BenchmarkSort        → all sizes

Gotcha: Not resetting the timer inside subbenchmarks after per-case setup — the setup time is counted against the benchmark, distorting ns/op.


Concept: Benchmark Validation — ensuring benchmark results accurately reflect code performance by accounting for external factors.

Why it matters: A benchmark run on a CPU-throttled laptop during a Teams call produces numbers 3× worse than production — results must be validated against known baselines.

package bench

import (
	"testing"
	"time"
)

// validation technique: establish a known baseline and compare
// if the baseline changes unexpectedly, the measurement environment is suspect

// baseline: known-cost operation — measures benchmark harness overhead
func BenchmarkBaseline(b *testing.B) {
	for i := 0; i < b.N; i++ {
		time.Sleep(0) // known cost: ~100ns on Linux — sanity check
	}
}

// target: must be faster than N × baseline if N operations were done
func BenchmarkTarget(b *testing.B) {
	data := make([]int, 100)

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		sum := 0
		for _, v := range data { sum += v } // 100 additions: expect < 100ns
		_ = sum
	}
}

// validation workflow:
// 1. run benchmark 5 times: go test -bench=. -count=5 > results.txt
// 2. use benchstat: benchstat results.txt  — shows mean ± variance
// 3. variance > 5% = environment noise — rerun on idle machine
// install benchstat: go install golang.org/x/perf/cmd/benchstat@latest

Gotcha: Running benchmarks on a machine with active browser tabs, video calls, or background builds — CPU frequency scaling alone can cause 30% variance.


Concept: System Load — background processes and resource utilization from other applications can significantly skew benchmark results.

Why it matters: A CI machine running parallel jobs during a benchmark produces numbers that cannot be compared to the previous run — always isolate benchmark execution.

package bench

import (
	"runtime"
	"testing"
)

// record environment metadata alongside benchmark results
func BenchmarkWithEnvironment(b *testing.B) {
	// log environment: helps interpret results in context
	b.Logf("GOMAXPROCS=%d NumCPU=%d", runtime.GOMAXPROCS(0), runtime.NumCPU())

	// run with: go test -bench=. -v to see the log output

	work := func() int {
		sum := 0
		for i := 0; i < 10000; i++ { sum += i }
		return sum
	}

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		_ = work()
	}
}

// isolation checklist before benchmarking:
// - close all non-essential applications
// - disable CPU frequency scaling: cpupower frequency-set -g performance
// - pin process to cores: taskset -c 0,1 go test -bench=.
// - disable turbo boost: echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo
// - run multiple times and use benchstat for statistical significance

Gotcha: Comparing benchmark results across different machines without documenting CPU model, frequency, and OS — a 2× difference may just be different hardware.


Concept: CPU-bound workloads benefit significantly from parallelism — benchmarking shows the impact.

Why it matters: Benchmarks with b.RunParallel measure throughput under concurrent load — revealing whether the implementation scales with CPU count or serializes on a bottleneck.

package bench

import (
	"runtime"
	"sync"
	"testing"
)

// CPU-bound function: pure computation
func fibonacci(n int) int {
	if n <= 1 { return n }
	a, b := 0, 1
	for i := 2; i <= n; i++ { a, b = b, a+b }
	return b
}

// sequential baseline: establishes single-core performance
func BenchmarkFibSequential(b *testing.B) {
	for i := 0; i < b.N; i++ {
		_ = fibonacci(35)
	}
}

// parallel: RunParallel distributes b.N across GOMAXPROCS goroutines
func BenchmarkFibParallel(b *testing.B) {
	b.RunParallel(func(pb *testing.PB) {
		for pb.Next() { // pb.Next() is the parallel equivalent of i < b.N
			_ = fibonacci(35)
		}
	})
}

// manual fan out: controls goroutine count explicitly
func BenchmarkFibFanOut(b *testing.B) {
	workers := runtime.NumCPU()
	var wg sync.WaitGroup

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		wg.Add(workers)
		for w := 0; w < workers; w++ {
			go func() { defer wg.Done(); _ = fibonacci(35) }()
		}
		wg.Wait()
	}
}
// go test -bench=. -cpu=1,2,4,8  → vary GOMAXPROCS automatically

Gotcha: Using b.RunParallel for a function that accesses shared state without a mutex — the benchmark measures lock contention, not the function's real cost.


Concept: When constrained to a single thread, concurrent code may perform worse than sequential due to context switching overhead.

Why it matters: This is the key insight for CPU-bound work — adding concurrency on one core hurts throughput; only add goroutines when multiple cores are available.

package bench

import (
	"runtime"
	"sync"
	"testing"
)

func cpuWork(n int) int {
	sum := 0
	for i := 0; i < n; i++ { sum += i * i }
	return sum
}

// sequential: no goroutine overhead — best on single core
func BenchmarkCPU_Sequential(b *testing.B) {
	for i := 0; i < b.N; i++ {
		_ = cpuWork(10000)
	}
}

// concurrent on 1 core: goroutine creation + context switch COSTS MORE than work gained
func BenchmarkCPU_Concurrent_1Core(b *testing.B) {
	runtime.GOMAXPROCS(1) // force single core
	defer runtime.GOMAXPROCS(runtime.NumCPU())

	for i := 0; i < b.N; i++ {
		var wg sync.WaitGroup
		for j := 0; j < 4; j++ {
			wg.Add(1)
			go func() { defer wg.Done(); _ = cpuWork(10000) }()
		}
		wg.Wait()
	}
	// SLOWER than sequential on 1 core — context switch overhead dominates
}

// concurrent on multiple cores: parallelism wins for CPU-bound work
func BenchmarkCPU_Concurrent_AllCores(b *testing.B) {
	for i := 0; i < b.N; i++ {
		var wg sync.WaitGroup
		for j := 0; j < runtime.NumCPU(); j++ {
			wg.Add(1)
			go func() { defer wg.Done(); _ = cpuWork(10000) }()
		}
		wg.Wait()
	}
	// FASTER than sequential — each core does its share in parallel
}

Gotcha: Launching more goroutines than CPUs for CPU-bound work — beyond NumCPU workers, you add scheduling overhead with no additional throughput.


Concept: IO-Bound Workloads — concurrency can enhance performance without requiring parallelism.

Why it matters: IO-bound goroutines yield CPU while waiting — a single OS thread handles thousands of concurrent IO operations by switching between parked goroutines.

package bench

import (
	"sync"
	"testing"
	"time"
)

// simulates IO-bound work: goroutine parks during sleep
func ioWork(latency time.Duration) {
	time.Sleep(latency) // goroutine parks — OS thread handles other goroutines
}

// sequential IO: each operation waits for the previous to complete
func BenchmarkIO_Sequential(b *testing.B) {
	for i := 0; i < b.N; i++ {
		for j := 0; j < 10; j++ {
			ioWork(time.Millisecond) // 10ms total sequential
		}
	}
}

// concurrent IO: all operations in flight simultaneously
func BenchmarkIO_Concurrent(b *testing.B) {
	for i := 0; i < b.N; i++ {
		var wg sync.WaitGroup
		for j := 0; j < 10; j++ {
			wg.Add(1)
			go func() {
				defer wg.Done()
				ioWork(time.Millisecond) // parks — 1ms total concurrent
			}()
		}
		wg.Wait()
	}
}

// go test -bench=BenchmarkIO -cpu=1  → concurrent wins even on 1 core
// expected: concurrent ~10× faster — 1ms vs 10ms per operation

Gotcha: Using time.Sleep as a proxy for real I/O in production benchmarks — sleep doesn't exercise the network poller or system call stack; benchmark with real connections for realistic numbers.


Concept: Excessive goroutines beyond a certain point provide diminishing returns — overhead outweighs the benefits.

Why it matters: Past the optimal concurrency level, goroutine creation cost, scheduler overhead, and memory usage consume more resources than the parallelism gained.

package bench

import (
	"sync"
	"testing"
	"time"
)

func processItem(latency time.Duration) string {
	time.Sleep(latency)
	return "result"
}

// find optimal concurrency level through measurement
func BenchmarkConcurrencyLevel(b *testing.B) {
	levels := []int{1, 2, 4, 8, 16, 32, 64, 128}

	for _, level := range levels {
		level := level
		b.Run(func() string {
			if level == 1 { return "sequential" }
			return "concurrent_" + func(n int) string {
				s := ""
				for n > 0 { s = string(rune('0'+n%10)) + s; n /= 10 }
				return s
			}(level)
		}(), func(b *testing.B) {
			for i := 0; i < b.N; i++ {
				var wg sync.WaitGroup
				for j := 0; j < 10; j++ {
					wg.Add(1)
					sem := make(chan struct{}, level)
					go func() {
						defer wg.Done()
						sem <- struct{}{}
						defer func() { <-sem }()
						processItem(time.Millisecond)
					}()
				}
				wg.Wait()
			}
		})
	}
}

Gotcha: Picking a goroutine pool size based on intuition — always benchmark with different sizes; optimal concurrency is workload-specific and hardware-dependent.


Concept: Profiling Guidelines — ensure the machine is idle, focus on CPU and memory profiling.

Why it matters: A profile taken on a busy machine shows OS scheduling noise alongside application behavior — idle machine profiles reveal only your code's costs.

// profiling/main.go
package main

import (
	"fmt"
	"net/http"
	_ "net/http/pprof" // registers /debug/pprof/* endpoints automatically
	"os"
	"runtime/pprof"
	"time"
)

// file-based CPU profile: precise control over what is profiled
func profileToFile() {
	f, err := os.Create("cpu.prof")
	if err != nil { panic(err) }
	defer f.Close()

	pprof.StartCPUProfile(f)
	defer pprof.StopCPUProfile()

	// work to profile
	for i := 0; i < 1000000; i++ {
		_ = fmt.Sprintf("item-%d", i) // allocation-heavy work
	}
}

func main() {
	// HTTP endpoint: profile a running service
	go func() {
		http.ListenAndServe("localhost:6060", nil)
		// CPU:    go tool pprof http://localhost:6060/debug/pprof/profile?seconds=10
		// Memory: go tool pprof http://localhost:6060/debug/pprof/heap
		// Goroutines: http://localhost:6060/debug/pprof/goroutine?debug=1
	}()

	profileToFile()
	time.Sleep(100 * time.Millisecond)
	// analyze: go tool pprof cpu.prof → top → web (opens flame graph)
}

Gotcha: Profiling with seconds=30 on a machine running other services — include only the time window where your code is doing the work you want to measure.


Concept: Consistent benchmarking — run tests multiple times under similar conditions to ensure results are reliable.

Why it matters: A single benchmark run may hit a GC pause or CPU throttle — multiple runs with statistical analysis (benchstat) reveal true performance.

package bench

import (
	"sort"
	"testing"
)

// run this benchmark multiple times to check stability:
// go test -bench=BenchmarkStable -count=10 > results.txt
// benchstat results.txt  → shows mean ± std dev

func BenchmarkStable(b *testing.B) {
	data := make([]int, 1000)
	for i := range data { data[i] = 1000 - i }

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		tmp := make([]int, len(data))
		copy(tmp, data)
		sort.Ints(tmp) // deterministic work — should show low variance
	}
}

// validate: before/after optimization comparison
// step 1: go test -bench=BenchmarkStable -count=10 > before.txt
// step 2: implement optimization
// step 3: go test -bench=BenchmarkStable -count=10 > after.txt
// step 4: benchstat before.txt after.txt  → shows % improvement with confidence

Gotcha: Comparing single benchmark runs to claim a speedup — one run can differ by 20% due to GC timing; always use -count=5 or more and benchstat for statistical confidence.


Concept: Types of profiling — CPU profiling identifies hot functions; memory profiling analyzes allocation patterns.

Why it matters: CPU and memory profiles reveal different problems — a function may be fast but allocate heavily (hurting GC), or slow but allocation-free (bottleneck elsewhere).

package main

import (
	"fmt"
	"os"
	"runtime/pprof"
	"strings"
)

// CPU-hot function: pure computation — will appear in CPU profile
func countWords(text string) int {
	words := strings.Fields(text)
	return len(words)
}

// Allocation-heavy function: will appear in memory profile
func buildIndex(words []string) map[string]int {
	index := make(map[string]int, len(words))
	for i, w := range words {
		index[strings.ToLower(w)] = i // ToLower allocates a new string
	}
	return index
}

func main() {
	// CPU profile
	cpuF, _ := os.Create("cpu.prof")
	pprof.StartCPUProfile(cpuF)

	text := strings.Repeat("the quick brown fox ", 100000)
	for i := 0; i < 100; i++ {
		_ = countWords(text)
	}

	pprof.StopCPUProfile()
	cpuF.Close()

	// Memory profile
	memF, _ := os.Create("mem.prof")
	words := strings.Fields(text)
	for i := 0; i < 10; i++ {
		_ = buildIndex(words)
	}
	pprof.WriteHeapProfile(memF)
	memF.Close()

	fmt.Println("profiles written: cpu.prof mem.prof")
	// go tool pprof -http=:8080 cpu.prof  → interactive flame graph
	// go tool pprof -http=:8080 mem.prof  → allocation tree
}

Gotcha: Only looking at the CPU profile and missing that allocations are causing GC pressure — always profile both CPU and memory for a complete picture.


Concept: Stack Traces — provide information about the function call sequence leading to a panic or error, including line numbers and data values.

Why it matters: A stack trace shows exactly where and how a crash occurred — with goroutine IDs, function arguments, and file/line numbers, it pinpoints the bug without a debugger.

package main

import (
	"fmt"
	"runtime/debug"
)

func level3() {
	panic("something went critically wrong")
}

func level2() { level3() }
func level1() { level2() }

func safeExecute(fn func()) (err error) {
	defer func() {
		if r := recover(); r != nil {
			// capture full stack trace at the point of panic
			stack := debug.Stack()
			err = fmt.Errorf("panic: %v\nstack:\n%s", r, stack)
		}
	}()
	fn()
	return nil
}

func main() {
	err := safeExecute(level1)
	if err != nil {
		fmt.Println("recovered error with stack trace:")
		fmt.Println(err)
	}

	// to see raw Go stack trace: remove recover() and run
	// GOTRACEBACK=all go run main.go  → shows ALL goroutine stacks
	// GOTRACEBACK=crash               → also produces a core dump
}

Gotcha: Using recover() without debug.Stack() — you catch the panic but lose the origin location; always capture the stack trace alongside the recovered value.


Concept: Data values in a stack trace can be interpreted by understanding the size and order of bytes in the system.

Why it matters: Stack trace values are raw memory — knowing that a string is {ptr, len} and a slice is {ptr, len, cap} lets you reconstruct actual values from the hex output.

package main

import (
	"fmt"
	"runtime"
	"unsafe"
)

// understanding what appears in stack traces for different types
func main() {
	// in a stack trace: integer appears as decimal
	x := 42
	fmt.Printf("int: value=%d  size=%d bytes\n", x, unsafe.Sizeof(x))

	// in a stack trace: string appears as {ptr, len}
	s := "hello"
	fmt.Printf("string: value=%q  header_size=%d bytes\n", s, unsafe.Sizeof(s))

	// in a stack trace: slice appears as {ptr, len, cap}
	sl := []int{1, 2, 3}
	fmt.Printf("slice: len=%d cap=%d  header_size=%d bytes\n",
		len(sl), cap(sl), unsafe.Sizeof(sl))

	// in a stack trace: pointer appears as hex address
	p := &x
	fmt.Printf("pointer: addr=%p  size=%d bytes\n", p, unsafe.Sizeof(p))

	// goroutine ID: appears in "goroutine N [status]:" header
	buf := make([]byte, 64)
	n := runtime.Stack(buf, false)
	fmt.Printf("\nstack trace header:\n%s\n", buf[:n])
}

Gotcha: Seeing 0x0 for a pointer in a stack trace and assuming nil panic — it may be a real address on 32-bit systems; check the panic message for the nil dereference.


Concept: Instruction pointer offsets can be used to pinpoint the exact instruction where an issue occurred.

Why it matters: The PC offset +0x1a in a stack trace tells you the exact assembly instruction that failed — combined with go tool objdump, you can see the machine instruction.

package main

import (
	"fmt"
	"runtime"
)

// demonstrates how to read PC offset from a stack trace
func crashSite() {
	var p *int
	_ = *p // nil dereference — stack trace will show PC offset into this function
}

func capturePC() {
	pc := make([]uintptr, 10)
	n := runtime.Callers(1, pc)
	frames := runtime.CallersFrames(pc[:n])

	for {
		frame, more := frames.Next()
		fmt.Printf("func: %s\n", frame.Function)
		fmt.Printf("file: %s:%d\n", frame.File, frame.Line)
		fmt.Printf("PC:   %x\n", frame.PC)
		if !more { break }
	}
}

func main() {
	capturePC()

	// real stack trace analysis:
	// 1. note the PC offset: main.crashSite(...)+0x1a
	// 2. go build -o app . && go tool objdump -s main.crashSite app
	// 3. find the instruction at offset 0x1a — shows the nil dereference
}

Gotcha: Trying to decode PC offsets from a stripped binary — production binaries built with -ldflags="-s -w" have no debug symbols; keep a debug build for post-mortem analysis.


Concept: Micro-level Optimization — focuses on individual functions or code sections; benchmarking and profiling identify allocation sources.

Why it matters: Micro-optimization without profiling data is guesswork — profiles show exactly which function and which line is hot, eliminating wasted optimization effort.

package opt

import (
	"fmt"
	"strconv"
	"testing"
)

// version A: allocates on every call — interface conversion forces heap escape
func formatItemSlow(id int, name string) string {
	return fmt.Sprintf("item-%d:%s", id, name) // fmt.Sprintf always allocates
}

// version B: no allocation — strconv avoids interface{} boxing
func formatItemFast(id int, name string) string {
	b := make([]byte, 0, 32)
	b = append(b, "item-"...)
	b = strconv.AppendInt(b, int64(id), 10)
	b = append(b, ':')
	b = append(b, name...)
	return string(b) // one allocation: the final string
}

var sink string

func BenchmarkFormatSlow(b *testing.B) {
	for i := 0; i < b.N; i++ {
		sink = formatItemSlow(42, "widget")
	}
}

func BenchmarkFormatFast(b *testing.B) {
	for i := 0; i < b.N; i++ {
		sink = formatItemFast(42, "widget")
	}
}
// go test -bench=. -benchmem  → compare allocs/op
// go build -gcflags='-m' .    → see which variables escape to heap

Gotcha: Optimizing a function that isn't in the profile's top 10 — micro-optimization of cold code has no user-visible impact; start with what the profiler shows.


Concept: Inlining — a compiler optimization that replaces function calls with the function's code; reduces allocations particularly with pointer semantics.

Why it matters: Inlined functions eliminate call overhead AND allow the compiler to see through the abstraction for further optimizations like escape analysis.

package opt

import (
	"fmt"
	"testing"
)

// inlinable: small, no loops, no complex control flow — compiler inlines this
func add(a, b int) int { return a + b } // will be inlined

// not inlinable: too complex, or uses go:noinline directive
//go:noinline
func addNoInline(a, b int) int { return a + b } // forced out-of-line call

// check inlining decisions:
// go build -gcflags='-m -m' . 2>&1 | grep "inlining call"

func BenchmarkInlined(b *testing.B) {
	x, y := 3, 4
	var sink int
	for i := 0; i < b.N; i++ {
		sink = add(x, y) // inlined: zero function call overhead
	}
	_ = sink
}

func BenchmarkNotInlined(b *testing.B) {
	x, y := 3, 4
	var sink int
	for i := 0; i < b.N; i++ {
		sink = addNoInline(x, y) // real call: save/restore registers, stack frame
	}
	_ = sink
}

func main() { fmt.Println(add(3, 4)) }

Gotcha: Adding an error return to a simple function expecting it to stay inlined — functions with multiple return values and error handling often exceed the inlining budget.


Concept: Escape analysis reports pinpoint variables that escape to the heap — helping identify allocation sources.

Why it matters: Every heap allocation is a future GC event — escape analysis output shows which variables you thought were on the stack are actually causing allocations.

package main

import "fmt"

// stays on stack: value returned by copy — no escape
func stackAlloc() int {
	x := 42        // "x does not escape" — stays in this frame
	return x       // copy returned — x freed when function returns
}

// escapes to heap: address returned — must outlive the frame
func heapAlloc() *int {
	x := 42        // "x escapes to heap" — must survive after return
	return &x      // address returned — compiler moves x to heap
}

// interface conversion: escapes to heap — interface wraps the value in a pointer
func interfaceEscape(n int) interface{} {
	return n       // "n escapes to heap" — stored as interface{type, *data}
}

func main() {
	a := stackAlloc()
	b := heapAlloc()
	c := interfaceEscape(42)
	fmt.Println(a, *b, c)
}
// go build -gcflags='-m' .
// output lines to look for:
// "x escapes to heap"     → allocation you may be able to avoid
// "x does not escape"     → good: stays on stack
// "moved to heap: x"      → same as escapes to heap
// "inlining call to ..."  → inlining happening — good for small functions

Gotcha: Passing a value to fmt.Println and being surprised it escapes — fmt.Println takes interface{} which always causes the value to escape; use fmt.Fprintf with a concrete writer for hot paths.


Concept: Interface conversions involving concrete types can lead to allocations.

Why it matters: Every time you store a concrete value in an interface variable, the compiler may allocate a heap copy to store behind the interface's data pointer.

package opt

import "testing"

type Processor interface{ Process(n int) int }

type MyProcessor struct{ multiplier int }

func (p MyProcessor) Process(n int) int { return n * p.multiplier }

var sink int

// interface call: MyProcessor value stored in interface — may allocate
func BenchmarkInterfaceAlloc(b *testing.B) {
	var p Processor = MyProcessor{multiplier: 2} // allocation: concrete → interface
	for i := 0; i < b.N; i++ {
		sink = p.Process(i)
	}
}

// direct call: no interface — no allocation, compiler may inline
func BenchmarkDirectCall(b *testing.B) {
	p := MyProcessor{multiplier: 2} // stays as concrete type
	for i := 0; i < b.N; i++ {
		sink = p.Process(i) // direct method call — may be inlined
	}
}

// pointer in interface: no allocation — pointer fits in interface data word
func BenchmarkPointerInterface(b *testing.B) {
	var p Processor = &MyProcessor{multiplier: 2} // pointer stored — no value copy
	for i := 0; i < b.N; i++ {
		sink = p.Process(i)
	}
}
// go test -bench=. -benchmem  → compare allocs/op across all three

Gotcha: Storing a small struct (≤ pointer size) in an interface and expecting zero allocs — only scalar types that fit in a pointer word avoid allocation; structs almost always allocate.


Concept: Macro-level Optimization — examining overall application performance; observing GC traces; binding debug/pprof endpoint.

Why it matters: Macro-level issues (memory leaks, GC thrashing, goroutine accumulation) don't show up in micro-benchmarks — they only emerge when the full application runs under load.

package main

import (
	"fmt"
	"net/http"
	_ "net/http/pprof"
	"runtime"
	"time"
)

// simulate a gradual memory leak: retained references growing over time
var cache = make(map[int][]byte)

func handleRequest(id int) {
	// leak: items added but never removed — cache grows unbounded
	cache[id] = make([]byte, 1024) // 1KB per request
}

func main() {
	go func() { http.ListenAndServe(":6060", nil) }()

	// GODEBUG=gctrace=1 go run main.go
	// output format: gc N @Xs M%: P+Q+R ms clock, P+Q+R+S ms cpu, MB->MB->MB MB, MB goal, N P
	// fields: heap before GC → heap after GC → heap allocated
	// growing "heap after GC" = memory leak

	for i := 0; ; i++ {
		handleRequest(i)
		if i%1000 == 0 {
			var m runtime.MemStats
			runtime.ReadMemStats(&m)
			fmt.Printf("request=%d heapAlloc=%dMB numGC=%d\n",
				i, m.HeapAlloc/1024/1024, m.NumGC)
		}
		time.Sleep(time.Millisecond)
	}
	// memory profile: go tool pprof http://localhost:6060/debug/pprof/heap
	// look for: inuse_space — shows what is currently allocated
}

Gotcha: Profiling heap with /debug/pprof/heap and confusing inuse_space with alloc_spaceinuse_space shows current leaks; alloc_space shows historical allocations including already-freed ones.


Concept: Identifying and eliminating unnecessary allocations particularly within loops or frequently executed code paths.

Why it matters: One allocation per request at 10k RPS is 10k allocations per second — each one adds GC pressure proportional to its size.

package main

import (
	"fmt"
	"runtime"
	"strings"
)

// allocation in hot path: new Builder every call
func buildQueryNaive(filters []string) string {
	var b strings.Builder // allocated on heap every call
	b.WriteString("SELECT * FROM users WHERE ")
	for i, f := range filters {
		if i > 0 { b.WriteString(" AND ") }
		b.WriteString(f)
	}
	return b.String()
}

// zero allocation: reuse a pre-allocated buffer via sync.Pool
import "sync"

var builderPool = sync.Pool{
	New: func() interface{} {
		return &strings.Builder{}
	},
}

func buildQueryPooled(filters []string) string {
	b := builderPool.Get().(*strings.Builder)
	defer func() {
		b.Reset()
		builderPool.Put(b) // return to pool — reused by next call
	}()

	b.WriteString("SELECT * FROM users WHERE ")
	for i, f := range filters {
		if i > 0 { b.WriteString(" AND ") }
		b.WriteString(f)
	}
	return b.String()
}

func main() {
	filters := []string{"active=true", "age>18", "country='IN'"}

	var before, after runtime.MemStats
	runtime.ReadMemStats(&before)

	for i := 0; i < 10000; i++ {
		_ = buildQueryPooled(filters)
	}

	runtime.ReadMemStats(&after)
	fmt.Printf("allocs: %d\n", after.Mallocs-before.Mallocs) // near zero with pool
}

Gotcha: Putting large objects in sync.Pool — the GC may clear the pool at any time; pool objects may be evicted between requests under memory pressure.


Concept: Garbage collection traces — GODEBUG=gctrace=1 provides insights into heap stability, memory leaks, and allocation pace.

Why it matters: GC trace lines show exactly when GC runs, how long it takes, and whether the live heap is growing — a growing live heap proves a memory leak.

package main

import (
	"fmt"
	"runtime"
	"time"
)

// run with: GODEBUG=gctrace=1 go run main.go
// trace format: gc N @Ts M%: P+Q+R ms, P+Q+R+S ms cpu, MB->MB->MB MB, MB goal, N P
// fields:
//   N          = GC cycle number
//   @Ts        = seconds since program start
//   M%         = % of CPU used by GC
//   MB->MB->MB = heap before GC → heap after GC → current heap size
//   MB goal    = target heap size for next GC

func stableHeap() {
	// heap after GC stays constant → no leak
	for i := 0; i < 100000; i++ {
		_ = make([]byte, 1024) // allocated and immediately unreferenced
	}
}

func growingHeap() {
	// heap after GC grows each cycle → leak
	retained := make([][]byte, 0)
	for i := 0; i < 10000; i++ {
		retained = append(retained, make([]byte, 1024)) // held forever
	}
	_ = retained
}

func main() {
	var stats runtime.MemStats

	runtime.ReadMemStats(&stats)
	fmt.Printf("before: HeapAlloc=%dKB NumGC=%d\n", stats.HeapAlloc/1024, stats.NumGC)

	stableHeap()
	runtime.GC()
	runtime.ReadMemStats(&stats)
	fmt.Printf("stable: HeapAlloc=%dKB NumGC=%d\n", stats.HeapAlloc/1024, stats.NumGC)

	time.Sleep(10 * time.Millisecond)
	// GODEBUG=gctrace=1 output appears on stderr during the run
}

Gotcha: Reading HeapAlloc and thinking it represents the live heap — HeapAlloc includes objects queued for GC; HeapInuse is the more accurate live heap measurement.


Concept: Execution Tracing — go tool trace provides goroutine timelines, system calls, GC events, and user-defined regions.

Why it matters: The tracer shows time-based goroutine activity that profiling misses — you can see exactly when goroutines were blocked, for how long, and why.

package main

import (
	"context"
	"fmt"
	"os"
	"runtime/trace"
	"sync"
)

func processItems(ctx context.Context, items []string) {
	// user region: appears as a named span in the trace viewer
	defer trace.StartRegion(ctx, "processItems").End()

	var wg sync.WaitGroup
	for _, item := range items {
		item := item
		wg.Add(1)
		go func() {
			defer wg.Done()
			// nested region: shows individual item processing in the timeline
			r := trace.StartRegion(ctx, "processItem:"+item)
			defer r.End()

			fmt.Printf("processing: %s\n", item)
		}()
	}
	wg.Wait()
}

func main() {
	f, err := os.Create("trace.out")
	if err != nil { panic(err) }
	defer f.Close()

	trace.Start(f)
	defer trace.Stop()

	ctx := context.Background()

	// user task: top-level logical operation — groups related regions
	ctx, task := trace.NewTask(ctx, "batch-process")
	defer task.End()

	processItems(ctx, []string{"a", "b", "c", "d"})

	fmt.Println("trace written to trace.out")
	// go tool trace trace.out
	// → Goroutine analysis: see blocking points
	// → User-defined tasks: see your processItems span
	// → View trace: goroutine timeline with GC marks
}

Gotcha: Generating a trace for the entire program lifetime — trace files grow quickly; use trace.Start/trace.Stop around only the suspicious section.


Concept: Analyzing traces can reveal insights not apparent from profiling — such as goroutines blocked on syscalls or inefficient concurrency patterns.

Why it matters: A profile shows WHERE time is spent; a trace shows WHEN goroutines were blocked and on WHAT — essential for diagnosing latency spikes, not just throughput.

package main

import (
	"context"
	"fmt"
	"os"
	"runtime/trace"
	"sync"
	"time"
)

// two patterns: trace reveals which one has goroutines sitting idle
func patternSerial(ctx context.Context, n int) {
	defer trace.StartRegion(ctx, "serial").End()
	for i := 0; i < n; i++ {
		time.Sleep(time.Millisecond) // trace shows: one goroutine active at a time
	}
}

func patternParallel(ctx context.Context, n int) {
	defer trace.StartRegion(ctx, "parallel").End()
	var wg sync.WaitGroup
	for i := 0; i < n; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			time.Sleep(time.Millisecond) // trace shows: N goroutines sleeping concurrently
		}()
	}
	wg.Wait()
}

func main() {
	f, _ := os.Create("trace.out")
	defer f.Close()
	trace.Start(f)
	defer trace.Stop()

	ctx := context.Background()

	ctx, task := trace.NewTask(ctx, "comparison")
	patternSerial(ctx, 5)
	patternParallel(ctx, 5)
	task.End()

	fmt.Println("open with: go tool trace trace.out")
	fmt.Println("compare serial vs parallel regions in the goroutine view")
}

Gotcha: Ignoring the "Network blocking profile" in the trace viewer — if goroutines spend most time blocked on network, the fix is different from CPU or mutex blocking.


Concept: The go tool trace command provides a browser-based interface for visualizing execution traces.

Why it matters: The visual timeline makes goroutine scheduling visible — a goroutine that should be running but shows as blocked reveals the synchronization bottleneck.

package main

import (
	"fmt"
	"os"
	"runtime/trace"
	"sync"
	"time"
)

func main() {
	f, _ := os.Create("trace.out")
	trace.Start(f)

	// create a scenario with visible bottleneck: mutex contention
	var mu sync.Mutex
	var wg sync.WaitGroup

	for i := 0; i < 8; i++ {
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			for j := 0; j < 5; j++ {
				mu.Lock()
				time.Sleep(2 * time.Millisecond) // hold lock — others blocked
				mu.Unlock()
			}
		}(i)
	}
	wg.Wait()

	trace.Stop()
	f.Close()

	fmt.Println("view with: go tool trace trace.out")
	// in the trace viewer:
	// → View trace: 8 goroutines visible — most blocked on mutex (shown in red)
	// → Goroutine analysis: shows "sync.(*Mutex).Lock" as blocking reason
	// → Network blocking / Sync blocking: quantifies time waiting for mutex
}

Gotcha: Closing the trace file before calling trace.Stop()trace.Stop() flushes the final events; close the file after Stop completes, not before.


Concept: User-defined tasks and regions — developers can instrument code with trace.NewTask and trace.StartRegion for granular insights.

Why it matters: Default traces show goroutine scheduling — user tasks and regions overlay your business logic onto the timeline, showing which operation caused the latency spike.

package main

import (
	"context"
	"fmt"
	"os"
	"runtime/trace"
	"time"
)

// multi-stage pipeline: each stage is a named region in the trace
func ingest(ctx context.Context, data string) string {
	defer trace.StartRegion(ctx, "ingest").End()
	time.Sleep(5 * time.Millisecond) // simulate parsing
	return "parsed:" + data
}

func transform(ctx context.Context, data string) string {
	defer trace.StartRegion(ctx, "transform").End()
	time.Sleep(10 * time.Millisecond) // simulate enrichment
	return "enriched:" + data
}

func load(ctx context.Context, data string) error {
	defer trace.StartRegion(ctx, "load").End()
	time.Sleep(3 * time.Millisecond) // simulate DB write
	return nil
}

func processRecord(ctx context.Context, record string) error {
	// user task: groups all three regions as one logical operation
	ctx, task := trace.NewTask(ctx, "process:"+record)
	defer task.End()

	parsed := ingest(ctx, record)
	enriched := transform(ctx, parsed)
	return load(ctx, enriched)
}

func main() {
	f, _ := os.Create("trace.out")
	trace.Start(f)
	defer func() { trace.Stop(); f.Close() }()

	ctx := context.Background()

	for _, record := range []string{"rec-1", "rec-2", "rec-3"} {
		if err := processRecord(ctx, record); err != nil {
			fmt.Println("error:", err)
		}
	}

	fmt.Println("trace written — go tool trace trace.out")
	// User tasks view: shows each record's total time
	// User regions view: shows time breakdown per stage
	// immediately shows: transform (10ms) dominates — optimize there first
}

Gotcha: Creating tasks without ending them — call task.End() or use defer task.End() immediately after trace.NewTask; unclosed tasks appear as infinitely long spans.