Go - Benchmarking and Profiling & Tracing
Concept: Benchmarking helps analyze code performance for CPU and memory usage, revealing bottlenecks caused by allocations or inefficient algorithms.
Why it matters: Without benchmarks, performance claims are guesses — benchmarks provide reproducible numbers that prove or disprove optimization attempts.
// bench/string_bench_test.go
package bench
import (
"strings"
"testing"
)
// two implementations — benchmark reveals which one is actually faster
func joinPlus(parts []string) string {
result := ""
for _, p := range parts {
result += p // new allocation every iteration — O(n²) copies
}
return result
}
func joinBuilder(parts []string) string {
var b strings.Builder
for _, p := range parts {
b.WriteString(p) // amortized O(1) — single backing buffer
}
return b.String()
}
var parts = []string{"a", "b", "c", "d", "e", "f", "g", "h", "i", "j"}
func BenchmarkJoinPlus(b *testing.B) {
for i := 0; i < b.N; i++ {
_ = joinPlus(parts) // b.N adjusted by runtime to hit 1 second
}
}
func BenchmarkJoinBuilder(b *testing.B) {
for i := 0; i < b.N; i++ {
_ = joinBuilder(parts)
}
}
// go test -bench=. -benchmem ./bench
// -benchmem shows: allocs/op and bytes/op — the allocation story
Gotcha: Benchmarking without -benchmem — you see ns/op but miss the allocation count which is often the real bottleneck.
Concept: Go's benchmarking tools utilize the testing package and functions that begin with Benchmark.
Why it matters: The naming convention Benchmark* with *testing.B is enforced by the runner — any other name is silently ignored as a benchmark.
package bench
import "testing"
// correct signature: Benchmark prefix + *testing.B parameter
func BenchmarkMyFunction(b *testing.B) {
// setup: NOT counted in the benchmark time
data := make([]int, 1000)
for i := range data { data[i] = i }
b.ResetTimer() // reset: exclude setup time from measurement
for i := 0; i < b.N; i++ {
// b.N: adjusted by the runtime — run enough iterations for stable timing
sum := 0
for _, v := range data {
sum += v
}
_ = sum // prevent dead code elimination
}
}
// ignored by go test -bench: wrong prefix
func measureMyFunction(b *testing.B) { _ = b }
// ignored: missing *testing.B parameter
// func BenchmarkWrongSig() {}
// go test -bench=BenchmarkMyFunction → run only this benchmark
// go test -bench=. → run all benchmarks
// go test -bench=. -benchtime=5s → run for 5 seconds instead of 1
Gotcha: Not calling b.ResetTimer() after expensive setup — setup time is included in the measurement, making the benchmark unfairly slow.
Concept: Benchmarks measure execution time and memory allocation — b.N is adjusted by the testing tool to achieve a specified benchmark duration.
Why it matters: b.N is the tool's way of amortizing measurement noise — by running many iterations, individual timing jitter averages out to a stable ns/op number.
package bench
import (
"testing"
"fmt"
)
// observe how b.N grows across iterations
var iterations int
func BenchmarkObserveN(b *testing.B) {
// b.N starts at 1 and grows until elapsed time reaches benchtime (default 1s)
// typical growth: 1 → 100 → 10000 → 1000000 → ...
for i := 0; i < b.N; i++ {
// minimal work: measure the overhead of the loop itself
_ = fmt.Sprintf("n=%d", b.N)
}
}
func BenchmarkWithMemory(b *testing.B) {
b.ReportAllocs() // equivalent to -benchmem for this benchmark only
for i := 0; i < b.N; i++ {
// b.N iterations — each alloc is counted
s := make([]byte, 64)
_ = s
}
// output: N ns/op 64 B/op 1 allocs/op
}
Gotcha: Putting the benchmark body outside the b.N loop — the code runs once and b.N iterations measure nothing; ALL work being benchmarked must be inside the loop.
Concept: Optimizations and dead code elimination by the compiler can influence benchmark results — need for accurate and realistic test scenarios.
Why it matters: The compiler may eliminate benchmark code entirely if the result is unused — benchmark results drop to 0 ns/op, producing a meaningless measurement.
package bench
import "testing"
// wrong: compiler may eliminate the entire function — result unused
func BenchmarkDeadCode(b *testing.B) {
for i := 0; i < b.N; i++ {
result := compute(i) // computed but never used — compiler eliminates
_ = result // blank identifier: still may be eliminated
}
}
// correct: use a package-level sink to prevent dead code elimination
var globalSink int
func BenchmarkPreventElimination(b *testing.B) {
for i := 0; i < b.N; i++ {
globalSink = compute(i) // assign to package-level var — compiler cannot eliminate
}
}
func compute(n int) int {
result := 0
for i := 0; i < n%100; i++ {
result += i * i
}
return result
}
Gotcha: Using a local variable as sink instead of a package-level variable — the compiler may still prove the local is unused and eliminate the computation.
Concept: Subbenchmarking — allows grouping and running specific benchmarks within a larger function.
Why it matters: Subbenchmarks organize related performance scenarios in one function — they appear as BenchmarkParent/subtest and can be filtered individually.
package bench
import (
"sort"
"strconv"
"testing"
)
// subbenchmarks: compare performance across different input sizes
func BenchmarkSort(b *testing.B) {
sizes := []int{10, 100, 1000, 10000}
for _, size := range sizes {
size := size
b.Run(strconv.Itoa(size), func(b *testing.B) {
// setup: create input data — not counted
data := make([]int, size)
for i := range data { data[i] = size - i } // reverse order: worst case
b.ResetTimer() // exclude setup from measurement
for i := 0; i < b.N; i++ {
// copy to avoid sorting sorted data on second iteration
tmp := make([]int, len(data))
copy(tmp, data)
sort.Ints(tmp)
}
})
}
}
// go test -bench=BenchmarkSort/1000 → only 1000-element case
// go test -bench=BenchmarkSort → all sizes
Gotcha: Not resetting the timer inside subbenchmarks after per-case setup — the setup time is counted against the benchmark, distorting ns/op.
Concept: Benchmark Validation — ensuring benchmark results accurately reflect code performance by accounting for external factors.
Why it matters: A benchmark run on a CPU-throttled laptop during a Teams call produces numbers 3× worse than production — results must be validated against known baselines.
package bench
import (
"testing"
"time"
)
// validation technique: establish a known baseline and compare
// if the baseline changes unexpectedly, the measurement environment is suspect
// baseline: known-cost operation — measures benchmark harness overhead
func BenchmarkBaseline(b *testing.B) {
for i := 0; i < b.N; i++ {
time.Sleep(0) // known cost: ~100ns on Linux — sanity check
}
}
// target: must be faster than N × baseline if N operations were done
func BenchmarkTarget(b *testing.B) {
data := make([]int, 100)
b.ResetTimer()
for i := 0; i < b.N; i++ {
sum := 0
for _, v := range data { sum += v } // 100 additions: expect < 100ns
_ = sum
}
}
// validation workflow:
// 1. run benchmark 5 times: go test -bench=. -count=5 > results.txt
// 2. use benchstat: benchstat results.txt — shows mean ± variance
// 3. variance > 5% = environment noise — rerun on idle machine
// install benchstat: go install golang.org/x/perf/cmd/benchstat@latest
Gotcha: Running benchmarks on a machine with active browser tabs, video calls, or background builds — CPU frequency scaling alone can cause 30% variance.
Concept: System Load — background processes and resource utilization from other applications can significantly skew benchmark results.
Why it matters: A CI machine running parallel jobs during a benchmark produces numbers that cannot be compared to the previous run — always isolate benchmark execution.
package bench
import (
"runtime"
"testing"
)
// record environment metadata alongside benchmark results
func BenchmarkWithEnvironment(b *testing.B) {
// log environment: helps interpret results in context
b.Logf("GOMAXPROCS=%d NumCPU=%d", runtime.GOMAXPROCS(0), runtime.NumCPU())
// run with: go test -bench=. -v to see the log output
work := func() int {
sum := 0
for i := 0; i < 10000; i++ { sum += i }
return sum
}
b.ResetTimer()
for i := 0; i < b.N; i++ {
_ = work()
}
}
// isolation checklist before benchmarking:
// - close all non-essential applications
// - disable CPU frequency scaling: cpupower frequency-set -g performance
// - pin process to cores: taskset -c 0,1 go test -bench=.
// - disable turbo boost: echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo
// - run multiple times and use benchstat for statistical significance
Gotcha: Comparing benchmark results across different machines without documenting CPU model, frequency, and OS — a 2× difference may just be different hardware.
Concept: CPU-bound workloads benefit significantly from parallelism — benchmarking shows the impact.
Why it matters: Benchmarks with b.RunParallel measure throughput under concurrent load — revealing whether the implementation scales with CPU count or serializes on a bottleneck.
package bench
import (
"runtime"
"sync"
"testing"
)
// CPU-bound function: pure computation
func fibonacci(n int) int {
if n <= 1 { return n }
a, b := 0, 1
for i := 2; i <= n; i++ { a, b = b, a+b }
return b
}
// sequential baseline: establishes single-core performance
func BenchmarkFibSequential(b *testing.B) {
for i := 0; i < b.N; i++ {
_ = fibonacci(35)
}
}
// parallel: RunParallel distributes b.N across GOMAXPROCS goroutines
func BenchmarkFibParallel(b *testing.B) {
b.RunParallel(func(pb *testing.PB) {
for pb.Next() { // pb.Next() is the parallel equivalent of i < b.N
_ = fibonacci(35)
}
})
}
// manual fan out: controls goroutine count explicitly
func BenchmarkFibFanOut(b *testing.B) {
workers := runtime.NumCPU()
var wg sync.WaitGroup
b.ResetTimer()
for i := 0; i < b.N; i++ {
wg.Add(workers)
for w := 0; w < workers; w++ {
go func() { defer wg.Done(); _ = fibonacci(35) }()
}
wg.Wait()
}
}
// go test -bench=. -cpu=1,2,4,8 → vary GOMAXPROCS automatically
Gotcha: Using b.RunParallel for a function that accesses shared state without a mutex — the benchmark measures lock contention, not the function's real cost.
Concept: When constrained to a single thread, concurrent code may perform worse than sequential due to context switching overhead.
Why it matters: This is the key insight for CPU-bound work — adding concurrency on one core hurts throughput; only add goroutines when multiple cores are available.
package bench
import (
"runtime"
"sync"
"testing"
)
func cpuWork(n int) int {
sum := 0
for i := 0; i < n; i++ { sum += i * i }
return sum
}
// sequential: no goroutine overhead — best on single core
func BenchmarkCPU_Sequential(b *testing.B) {
for i := 0; i < b.N; i++ {
_ = cpuWork(10000)
}
}
// concurrent on 1 core: goroutine creation + context switch COSTS MORE than work gained
func BenchmarkCPU_Concurrent_1Core(b *testing.B) {
runtime.GOMAXPROCS(1) // force single core
defer runtime.GOMAXPROCS(runtime.NumCPU())
for i := 0; i < b.N; i++ {
var wg sync.WaitGroup
for j := 0; j < 4; j++ {
wg.Add(1)
go func() { defer wg.Done(); _ = cpuWork(10000) }()
}
wg.Wait()
}
// SLOWER than sequential on 1 core — context switch overhead dominates
}
// concurrent on multiple cores: parallelism wins for CPU-bound work
func BenchmarkCPU_Concurrent_AllCores(b *testing.B) {
for i := 0; i < b.N; i++ {
var wg sync.WaitGroup
for j := 0; j < runtime.NumCPU(); j++ {
wg.Add(1)
go func() { defer wg.Done(); _ = cpuWork(10000) }()
}
wg.Wait()
}
// FASTER than sequential — each core does its share in parallel
}
Gotcha: Launching more goroutines than CPUs for CPU-bound work — beyond NumCPU workers, you add scheduling overhead with no additional throughput.
Concept: IO-Bound Workloads — concurrency can enhance performance without requiring parallelism.
Why it matters: IO-bound goroutines yield CPU while waiting — a single OS thread handles thousands of concurrent IO operations by switching between parked goroutines.
package bench
import (
"sync"
"testing"
"time"
)
// simulates IO-bound work: goroutine parks during sleep
func ioWork(latency time.Duration) {
time.Sleep(latency) // goroutine parks — OS thread handles other goroutines
}
// sequential IO: each operation waits for the previous to complete
func BenchmarkIO_Sequential(b *testing.B) {
for i := 0; i < b.N; i++ {
for j := 0; j < 10; j++ {
ioWork(time.Millisecond) // 10ms total sequential
}
}
}
// concurrent IO: all operations in flight simultaneously
func BenchmarkIO_Concurrent(b *testing.B) {
for i := 0; i < b.N; i++ {
var wg sync.WaitGroup
for j := 0; j < 10; j++ {
wg.Add(1)
go func() {
defer wg.Done()
ioWork(time.Millisecond) // parks — 1ms total concurrent
}()
}
wg.Wait()
}
}
// go test -bench=BenchmarkIO -cpu=1 → concurrent wins even on 1 core
// expected: concurrent ~10× faster — 1ms vs 10ms per operation
Gotcha: Using time.Sleep as a proxy for real I/O in production benchmarks — sleep doesn't exercise the network poller or system call stack; benchmark with real connections for realistic numbers.
Concept: Excessive goroutines beyond a certain point provide diminishing returns — overhead outweighs the benefits.
Why it matters: Past the optimal concurrency level, goroutine creation cost, scheduler overhead, and memory usage consume more resources than the parallelism gained.
package bench
import (
"sync"
"testing"
"time"
)
func processItem(latency time.Duration) string {
time.Sleep(latency)
return "result"
}
// find optimal concurrency level through measurement
func BenchmarkConcurrencyLevel(b *testing.B) {
levels := []int{1, 2, 4, 8, 16, 32, 64, 128}
for _, level := range levels {
level := level
b.Run(func() string {
if level == 1 { return "sequential" }
return "concurrent_" + func(n int) string {
s := ""
for n > 0 { s = string(rune('0'+n%10)) + s; n /= 10 }
return s
}(level)
}(), func(b *testing.B) {
for i := 0; i < b.N; i++ {
var wg sync.WaitGroup
for j := 0; j < 10; j++ {
wg.Add(1)
sem := make(chan struct{}, level)
go func() {
defer wg.Done()
sem <- struct{}{}
defer func() { <-sem }()
processItem(time.Millisecond)
}()
}
wg.Wait()
}
})
}
}
Gotcha: Picking a goroutine pool size based on intuition — always benchmark with different sizes; optimal concurrency is workload-specific and hardware-dependent.
Concept: Profiling Guidelines — ensure the machine is idle, focus on CPU and memory profiling.
Why it matters: A profile taken on a busy machine shows OS scheduling noise alongside application behavior — idle machine profiles reveal only your code's costs.
// profiling/main.go
package main
import (
"fmt"
"net/http"
_ "net/http/pprof" // registers /debug/pprof/* endpoints automatically
"os"
"runtime/pprof"
"time"
)
// file-based CPU profile: precise control over what is profiled
func profileToFile() {
f, err := os.Create("cpu.prof")
if err != nil { panic(err) }
defer f.Close()
pprof.StartCPUProfile(f)
defer pprof.StopCPUProfile()
// work to profile
for i := 0; i < 1000000; i++ {
_ = fmt.Sprintf("item-%d", i) // allocation-heavy work
}
}
func main() {
// HTTP endpoint: profile a running service
go func() {
http.ListenAndServe("localhost:6060", nil)
// CPU: go tool pprof http://localhost:6060/debug/pprof/profile?seconds=10
// Memory: go tool pprof http://localhost:6060/debug/pprof/heap
// Goroutines: http://localhost:6060/debug/pprof/goroutine?debug=1
}()
profileToFile()
time.Sleep(100 * time.Millisecond)
// analyze: go tool pprof cpu.prof → top → web (opens flame graph)
}
Gotcha: Profiling with seconds=30 on a machine running other services — include only the time window where your code is doing the work you want to measure.
Concept: Consistent benchmarking — run tests multiple times under similar conditions to ensure results are reliable.
Why it matters: A single benchmark run may hit a GC pause or CPU throttle — multiple runs with statistical analysis (benchstat) reveal true performance.
package bench
import (
"sort"
"testing"
)
// run this benchmark multiple times to check stability:
// go test -bench=BenchmarkStable -count=10 > results.txt
// benchstat results.txt → shows mean ± std dev
func BenchmarkStable(b *testing.B) {
data := make([]int, 1000)
for i := range data { data[i] = 1000 - i }
b.ResetTimer()
for i := 0; i < b.N; i++ {
tmp := make([]int, len(data))
copy(tmp, data)
sort.Ints(tmp) // deterministic work — should show low variance
}
}
// validate: before/after optimization comparison
// step 1: go test -bench=BenchmarkStable -count=10 > before.txt
// step 2: implement optimization
// step 3: go test -bench=BenchmarkStable -count=10 > after.txt
// step 4: benchstat before.txt after.txt → shows % improvement with confidence
Gotcha: Comparing single benchmark runs to claim a speedup — one run can differ by 20% due to GC timing; always use -count=5 or more and benchstat for statistical confidence.
Concept: Types of profiling — CPU profiling identifies hot functions; memory profiling analyzes allocation patterns.
Why it matters: CPU and memory profiles reveal different problems — a function may be fast but allocate heavily (hurting GC), or slow but allocation-free (bottleneck elsewhere).
package main
import (
"fmt"
"os"
"runtime/pprof"
"strings"
)
// CPU-hot function: pure computation — will appear in CPU profile
func countWords(text string) int {
words := strings.Fields(text)
return len(words)
}
// Allocation-heavy function: will appear in memory profile
func buildIndex(words []string) map[string]int {
index := make(map[string]int, len(words))
for i, w := range words {
index[strings.ToLower(w)] = i // ToLower allocates a new string
}
return index
}
func main() {
// CPU profile
cpuF, _ := os.Create("cpu.prof")
pprof.StartCPUProfile(cpuF)
text := strings.Repeat("the quick brown fox ", 100000)
for i := 0; i < 100; i++ {
_ = countWords(text)
}
pprof.StopCPUProfile()
cpuF.Close()
// Memory profile
memF, _ := os.Create("mem.prof")
words := strings.Fields(text)
for i := 0; i < 10; i++ {
_ = buildIndex(words)
}
pprof.WriteHeapProfile(memF)
memF.Close()
fmt.Println("profiles written: cpu.prof mem.prof")
// go tool pprof -http=:8080 cpu.prof → interactive flame graph
// go tool pprof -http=:8080 mem.prof → allocation tree
}
Gotcha: Only looking at the CPU profile and missing that allocations are causing GC pressure — always profile both CPU and memory for a complete picture.
Concept: Stack Traces — provide information about the function call sequence leading to a panic or error, including line numbers and data values.
Why it matters: A stack trace shows exactly where and how a crash occurred — with goroutine IDs, function arguments, and file/line numbers, it pinpoints the bug without a debugger.
package main
import (
"fmt"
"runtime/debug"
)
func level3() {
panic("something went critically wrong")
}
func level2() { level3() }
func level1() { level2() }
func safeExecute(fn func()) (err error) {
defer func() {
if r := recover(); r != nil {
// capture full stack trace at the point of panic
stack := debug.Stack()
err = fmt.Errorf("panic: %v\nstack:\n%s", r, stack)
}
}()
fn()
return nil
}
func main() {
err := safeExecute(level1)
if err != nil {
fmt.Println("recovered error with stack trace:")
fmt.Println(err)
}
// to see raw Go stack trace: remove recover() and run
// GOTRACEBACK=all go run main.go → shows ALL goroutine stacks
// GOTRACEBACK=crash → also produces a core dump
}
Gotcha: Using recover() without debug.Stack() — you catch the panic but lose the origin location; always capture the stack trace alongside the recovered value.
Concept: Data values in a stack trace can be interpreted by understanding the size and order of bytes in the system.
Why it matters: Stack trace values are raw memory — knowing that a string is {ptr, len} and a slice is {ptr, len, cap} lets you reconstruct actual values from the hex output.
package main
import (
"fmt"
"runtime"
"unsafe"
)
// understanding what appears in stack traces for different types
func main() {
// in a stack trace: integer appears as decimal
x := 42
fmt.Printf("int: value=%d size=%d bytes\n", x, unsafe.Sizeof(x))
// in a stack trace: string appears as {ptr, len}
s := "hello"
fmt.Printf("string: value=%q header_size=%d bytes\n", s, unsafe.Sizeof(s))
// in a stack trace: slice appears as {ptr, len, cap}
sl := []int{1, 2, 3}
fmt.Printf("slice: len=%d cap=%d header_size=%d bytes\n",
len(sl), cap(sl), unsafe.Sizeof(sl))
// in a stack trace: pointer appears as hex address
p := &x
fmt.Printf("pointer: addr=%p size=%d bytes\n", p, unsafe.Sizeof(p))
// goroutine ID: appears in "goroutine N [status]:" header
buf := make([]byte, 64)
n := runtime.Stack(buf, false)
fmt.Printf("\nstack trace header:\n%s\n", buf[:n])
}
Gotcha: Seeing 0x0 for a pointer in a stack trace and assuming nil panic — it may be a real address on 32-bit systems; check the panic message for the nil dereference.
Concept: Instruction pointer offsets can be used to pinpoint the exact instruction where an issue occurred.
Why it matters: The PC offset +0x1a in a stack trace tells you the exact assembly instruction that failed — combined with go tool objdump, you can see the machine instruction.
package main
import (
"fmt"
"runtime"
)
// demonstrates how to read PC offset from a stack trace
func crashSite() {
var p *int
_ = *p // nil dereference — stack trace will show PC offset into this function
}
func capturePC() {
pc := make([]uintptr, 10)
n := runtime.Callers(1, pc)
frames := runtime.CallersFrames(pc[:n])
for {
frame, more := frames.Next()
fmt.Printf("func: %s\n", frame.Function)
fmt.Printf("file: %s:%d\n", frame.File, frame.Line)
fmt.Printf("PC: %x\n", frame.PC)
if !more { break }
}
}
func main() {
capturePC()
// real stack trace analysis:
// 1. note the PC offset: main.crashSite(...)+0x1a
// 2. go build -o app . && go tool objdump -s main.crashSite app
// 3. find the instruction at offset 0x1a — shows the nil dereference
}
Gotcha: Trying to decode PC offsets from a stripped binary — production binaries built with -ldflags="-s -w" have no debug symbols; keep a debug build for post-mortem analysis.
Concept: Micro-level Optimization — focuses on individual functions or code sections; benchmarking and profiling identify allocation sources.
Why it matters: Micro-optimization without profiling data is guesswork — profiles show exactly which function and which line is hot, eliminating wasted optimization effort.
package opt
import (
"fmt"
"strconv"
"testing"
)
// version A: allocates on every call — interface conversion forces heap escape
func formatItemSlow(id int, name string) string {
return fmt.Sprintf("item-%d:%s", id, name) // fmt.Sprintf always allocates
}
// version B: no allocation — strconv avoids interface{} boxing
func formatItemFast(id int, name string) string {
b := make([]byte, 0, 32)
b = append(b, "item-"...)
b = strconv.AppendInt(b, int64(id), 10)
b = append(b, ':')
b = append(b, name...)
return string(b) // one allocation: the final string
}
var sink string
func BenchmarkFormatSlow(b *testing.B) {
for i := 0; i < b.N; i++ {
sink = formatItemSlow(42, "widget")
}
}
func BenchmarkFormatFast(b *testing.B) {
for i := 0; i < b.N; i++ {
sink = formatItemFast(42, "widget")
}
}
// go test -bench=. -benchmem → compare allocs/op
// go build -gcflags='-m' . → see which variables escape to heap
Gotcha: Optimizing a function that isn't in the profile's top 10 — micro-optimization of cold code has no user-visible impact; start with what the profiler shows.
Concept: Inlining — a compiler optimization that replaces function calls with the function's code; reduces allocations particularly with pointer semantics.
Why it matters: Inlined functions eliminate call overhead AND allow the compiler to see through the abstraction for further optimizations like escape analysis.
package opt
import (
"fmt"
"testing"
)
// inlinable: small, no loops, no complex control flow — compiler inlines this
func add(a, b int) int { return a + b } // will be inlined
// not inlinable: too complex, or uses go:noinline directive
//go:noinline
func addNoInline(a, b int) int { return a + b } // forced out-of-line call
// check inlining decisions:
// go build -gcflags='-m -m' . 2>&1 | grep "inlining call"
func BenchmarkInlined(b *testing.B) {
x, y := 3, 4
var sink int
for i := 0; i < b.N; i++ {
sink = add(x, y) // inlined: zero function call overhead
}
_ = sink
}
func BenchmarkNotInlined(b *testing.B) {
x, y := 3, 4
var sink int
for i := 0; i < b.N; i++ {
sink = addNoInline(x, y) // real call: save/restore registers, stack frame
}
_ = sink
}
func main() { fmt.Println(add(3, 4)) }
Gotcha: Adding an error return to a simple function expecting it to stay inlined — functions with multiple return values and error handling often exceed the inlining budget.
Concept: Escape analysis reports pinpoint variables that escape to the heap — helping identify allocation sources.
Why it matters: Every heap allocation is a future GC event — escape analysis output shows which variables you thought were on the stack are actually causing allocations.
package main
import "fmt"
// stays on stack: value returned by copy — no escape
func stackAlloc() int {
x := 42 // "x does not escape" — stays in this frame
return x // copy returned — x freed when function returns
}
// escapes to heap: address returned — must outlive the frame
func heapAlloc() *int {
x := 42 // "x escapes to heap" — must survive after return
return &x // address returned — compiler moves x to heap
}
// interface conversion: escapes to heap — interface wraps the value in a pointer
func interfaceEscape(n int) interface{} {
return n // "n escapes to heap" — stored as interface{type, *data}
}
func main() {
a := stackAlloc()
b := heapAlloc()
c := interfaceEscape(42)
fmt.Println(a, *b, c)
}
// go build -gcflags='-m' .
// output lines to look for:
// "x escapes to heap" → allocation you may be able to avoid
// "x does not escape" → good: stays on stack
// "moved to heap: x" → same as escapes to heap
// "inlining call to ..." → inlining happening — good for small functions
Gotcha: Passing a value to fmt.Println and being surprised it escapes — fmt.Println takes interface{} which always causes the value to escape; use fmt.Fprintf with a concrete writer for hot paths.
Concept: Interface conversions involving concrete types can lead to allocations.
Why it matters: Every time you store a concrete value in an interface variable, the compiler may allocate a heap copy to store behind the interface's data pointer.
package opt
import "testing"
type Processor interface{ Process(n int) int }
type MyProcessor struct{ multiplier int }
func (p MyProcessor) Process(n int) int { return n * p.multiplier }
var sink int
// interface call: MyProcessor value stored in interface — may allocate
func BenchmarkInterfaceAlloc(b *testing.B) {
var p Processor = MyProcessor{multiplier: 2} // allocation: concrete → interface
for i := 0; i < b.N; i++ {
sink = p.Process(i)
}
}
// direct call: no interface — no allocation, compiler may inline
func BenchmarkDirectCall(b *testing.B) {
p := MyProcessor{multiplier: 2} // stays as concrete type
for i := 0; i < b.N; i++ {
sink = p.Process(i) // direct method call — may be inlined
}
}
// pointer in interface: no allocation — pointer fits in interface data word
func BenchmarkPointerInterface(b *testing.B) {
var p Processor = &MyProcessor{multiplier: 2} // pointer stored — no value copy
for i := 0; i < b.N; i++ {
sink = p.Process(i)
}
}
// go test -bench=. -benchmem → compare allocs/op across all three
Gotcha: Storing a small struct (≤ pointer size) in an interface and expecting zero allocs — only scalar types that fit in a pointer word avoid allocation; structs almost always allocate.
Concept: Macro-level Optimization — examining overall application performance; observing GC traces; binding debug/pprof endpoint.
Why it matters: Macro-level issues (memory leaks, GC thrashing, goroutine accumulation) don't show up in micro-benchmarks — they only emerge when the full application runs under load.
package main
import (
"fmt"
"net/http"
_ "net/http/pprof"
"runtime"
"time"
)
// simulate a gradual memory leak: retained references growing over time
var cache = make(map[int][]byte)
func handleRequest(id int) {
// leak: items added but never removed — cache grows unbounded
cache[id] = make([]byte, 1024) // 1KB per request
}
func main() {
go func() { http.ListenAndServe(":6060", nil) }()
// GODEBUG=gctrace=1 go run main.go
// output format: gc N @Xs M%: P+Q+R ms clock, P+Q+R+S ms cpu, MB->MB->MB MB, MB goal, N P
// fields: heap before GC → heap after GC → heap allocated
// growing "heap after GC" = memory leak
for i := 0; ; i++ {
handleRequest(i)
if i%1000 == 0 {
var m runtime.MemStats
runtime.ReadMemStats(&m)
fmt.Printf("request=%d heapAlloc=%dMB numGC=%d\n",
i, m.HeapAlloc/1024/1024, m.NumGC)
}
time.Sleep(time.Millisecond)
}
// memory profile: go tool pprof http://localhost:6060/debug/pprof/heap
// look for: inuse_space — shows what is currently allocated
}
Gotcha: Profiling heap with /debug/pprof/heap and confusing inuse_space with alloc_space — inuse_space shows current leaks; alloc_space shows historical allocations including already-freed ones.
Concept: Identifying and eliminating unnecessary allocations particularly within loops or frequently executed code paths.
Why it matters: One allocation per request at 10k RPS is 10k allocations per second — each one adds GC pressure proportional to its size.
package main
import (
"fmt"
"runtime"
"strings"
)
// allocation in hot path: new Builder every call
func buildQueryNaive(filters []string) string {
var b strings.Builder // allocated on heap every call
b.WriteString("SELECT * FROM users WHERE ")
for i, f := range filters {
if i > 0 { b.WriteString(" AND ") }
b.WriteString(f)
}
return b.String()
}
// zero allocation: reuse a pre-allocated buffer via sync.Pool
import "sync"
var builderPool = sync.Pool{
New: func() interface{} {
return &strings.Builder{}
},
}
func buildQueryPooled(filters []string) string {
b := builderPool.Get().(*strings.Builder)
defer func() {
b.Reset()
builderPool.Put(b) // return to pool — reused by next call
}()
b.WriteString("SELECT * FROM users WHERE ")
for i, f := range filters {
if i > 0 { b.WriteString(" AND ") }
b.WriteString(f)
}
return b.String()
}
func main() {
filters := []string{"active=true", "age>18", "country='IN'"}
var before, after runtime.MemStats
runtime.ReadMemStats(&before)
for i := 0; i < 10000; i++ {
_ = buildQueryPooled(filters)
}
runtime.ReadMemStats(&after)
fmt.Printf("allocs: %d\n", after.Mallocs-before.Mallocs) // near zero with pool
}
Gotcha: Putting large objects in sync.Pool — the GC may clear the pool at any time; pool objects may be evicted between requests under memory pressure.
Concept: Garbage collection traces — GODEBUG=gctrace=1 provides insights into heap stability, memory leaks, and allocation pace.
Why it matters: GC trace lines show exactly when GC runs, how long it takes, and whether the live heap is growing — a growing live heap proves a memory leak.
package main
import (
"fmt"
"runtime"
"time"
)
// run with: GODEBUG=gctrace=1 go run main.go
// trace format: gc N @Ts M%: P+Q+R ms, P+Q+R+S ms cpu, MB->MB->MB MB, MB goal, N P
// fields:
// N = GC cycle number
// @Ts = seconds since program start
// M% = % of CPU used by GC
// MB->MB->MB = heap before GC → heap after GC → current heap size
// MB goal = target heap size for next GC
func stableHeap() {
// heap after GC stays constant → no leak
for i := 0; i < 100000; i++ {
_ = make([]byte, 1024) // allocated and immediately unreferenced
}
}
func growingHeap() {
// heap after GC grows each cycle → leak
retained := make([][]byte, 0)
for i := 0; i < 10000; i++ {
retained = append(retained, make([]byte, 1024)) // held forever
}
_ = retained
}
func main() {
var stats runtime.MemStats
runtime.ReadMemStats(&stats)
fmt.Printf("before: HeapAlloc=%dKB NumGC=%d\n", stats.HeapAlloc/1024, stats.NumGC)
stableHeap()
runtime.GC()
runtime.ReadMemStats(&stats)
fmt.Printf("stable: HeapAlloc=%dKB NumGC=%d\n", stats.HeapAlloc/1024, stats.NumGC)
time.Sleep(10 * time.Millisecond)
// GODEBUG=gctrace=1 output appears on stderr during the run
}
Gotcha: Reading HeapAlloc and thinking it represents the live heap — HeapAlloc includes objects queued for GC; HeapInuse is the more accurate live heap measurement.
Concept: Execution Tracing — go tool trace provides goroutine timelines, system calls, GC events, and user-defined regions.
Why it matters: The tracer shows time-based goroutine activity that profiling misses — you can see exactly when goroutines were blocked, for how long, and why.
package main
import (
"context"
"fmt"
"os"
"runtime/trace"
"sync"
)
func processItems(ctx context.Context, items []string) {
// user region: appears as a named span in the trace viewer
defer trace.StartRegion(ctx, "processItems").End()
var wg sync.WaitGroup
for _, item := range items {
item := item
wg.Add(1)
go func() {
defer wg.Done()
// nested region: shows individual item processing in the timeline
r := trace.StartRegion(ctx, "processItem:"+item)
defer r.End()
fmt.Printf("processing: %s\n", item)
}()
}
wg.Wait()
}
func main() {
f, err := os.Create("trace.out")
if err != nil { panic(err) }
defer f.Close()
trace.Start(f)
defer trace.Stop()
ctx := context.Background()
// user task: top-level logical operation — groups related regions
ctx, task := trace.NewTask(ctx, "batch-process")
defer task.End()
processItems(ctx, []string{"a", "b", "c", "d"})
fmt.Println("trace written to trace.out")
// go tool trace trace.out
// → Goroutine analysis: see blocking points
// → User-defined tasks: see your processItems span
// → View trace: goroutine timeline with GC marks
}
Gotcha: Generating a trace for the entire program lifetime — trace files grow quickly; use trace.Start/trace.Stop around only the suspicious section.
Concept: Analyzing traces can reveal insights not apparent from profiling — such as goroutines blocked on syscalls or inefficient concurrency patterns.
Why it matters: A profile shows WHERE time is spent; a trace shows WHEN goroutines were blocked and on WHAT — essential for diagnosing latency spikes, not just throughput.
package main
import (
"context"
"fmt"
"os"
"runtime/trace"
"sync"
"time"
)
// two patterns: trace reveals which one has goroutines sitting idle
func patternSerial(ctx context.Context, n int) {
defer trace.StartRegion(ctx, "serial").End()
for i := 0; i < n; i++ {
time.Sleep(time.Millisecond) // trace shows: one goroutine active at a time
}
}
func patternParallel(ctx context.Context, n int) {
defer trace.StartRegion(ctx, "parallel").End()
var wg sync.WaitGroup
for i := 0; i < n; i++ {
wg.Add(1)
go func() {
defer wg.Done()
time.Sleep(time.Millisecond) // trace shows: N goroutines sleeping concurrently
}()
}
wg.Wait()
}
func main() {
f, _ := os.Create("trace.out")
defer f.Close()
trace.Start(f)
defer trace.Stop()
ctx := context.Background()
ctx, task := trace.NewTask(ctx, "comparison")
patternSerial(ctx, 5)
patternParallel(ctx, 5)
task.End()
fmt.Println("open with: go tool trace trace.out")
fmt.Println("compare serial vs parallel regions in the goroutine view")
}
Gotcha: Ignoring the "Network blocking profile" in the trace viewer — if goroutines spend most time blocked on network, the fix is different from CPU or mutex blocking.
Concept: The go tool trace command provides a browser-based interface for visualizing execution traces.
Why it matters: The visual timeline makes goroutine scheduling visible — a goroutine that should be running but shows as blocked reveals the synchronization bottleneck.
package main
import (
"fmt"
"os"
"runtime/trace"
"sync"
"time"
)
func main() {
f, _ := os.Create("trace.out")
trace.Start(f)
// create a scenario with visible bottleneck: mutex contention
var mu sync.Mutex
var wg sync.WaitGroup
for i := 0; i < 8; i++ {
wg.Add(1)
go func(id int) {
defer wg.Done()
for j := 0; j < 5; j++ {
mu.Lock()
time.Sleep(2 * time.Millisecond) // hold lock — others blocked
mu.Unlock()
}
}(i)
}
wg.Wait()
trace.Stop()
f.Close()
fmt.Println("view with: go tool trace trace.out")
// in the trace viewer:
// → View trace: 8 goroutines visible — most blocked on mutex (shown in red)
// → Goroutine analysis: shows "sync.(*Mutex).Lock" as blocking reason
// → Network blocking / Sync blocking: quantifies time waiting for mutex
}
Gotcha: Closing the trace file before calling trace.Stop() — trace.Stop() flushes the final events; close the file after Stop completes, not before.
Concept: User-defined tasks and regions — developers can instrument code with trace.NewTask and trace.StartRegion for granular insights.
Why it matters: Default traces show goroutine scheduling — user tasks and regions overlay your business logic onto the timeline, showing which operation caused the latency spike.
package main
import (
"context"
"fmt"
"os"
"runtime/trace"
"time"
)
// multi-stage pipeline: each stage is a named region in the trace
func ingest(ctx context.Context, data string) string {
defer trace.StartRegion(ctx, "ingest").End()
time.Sleep(5 * time.Millisecond) // simulate parsing
return "parsed:" + data
}
func transform(ctx context.Context, data string) string {
defer trace.StartRegion(ctx, "transform").End()
time.Sleep(10 * time.Millisecond) // simulate enrichment
return "enriched:" + data
}
func load(ctx context.Context, data string) error {
defer trace.StartRegion(ctx, "load").End()
time.Sleep(3 * time.Millisecond) // simulate DB write
return nil
}
func processRecord(ctx context.Context, record string) error {
// user task: groups all three regions as one logical operation
ctx, task := trace.NewTask(ctx, "process:"+record)
defer task.End()
parsed := ingest(ctx, record)
enriched := transform(ctx, parsed)
return load(ctx, enriched)
}
func main() {
f, _ := os.Create("trace.out")
trace.Start(f)
defer func() { trace.Stop(); f.Close() }()
ctx := context.Background()
for _, record := range []string{"rec-1", "rec-2", "rec-3"} {
if err := processRecord(ctx, record); err != nil {
fmt.Println("error:", err)
}
}
fmt.Println("trace written — go tool trace trace.out")
// User tasks view: shows each record's total time
// User regions view: shows time breakdown per stage
// immediately shows: transform (10ms) dominates — optimize there first
}
Gotcha: Creating tasks without ending them — call task.End() or use defer task.End() immediately after trace.NewTask; unclosed tasks appear as infinitely long spans.