Go - Goroutines

Last updated on May 25, 2026

Concept: Concurrency refers to the out-of-order execution of instructions.

Why it matters: Concurrency is a design property — it allows multiple tasks to make progress without requiring them to run at the same instant.

package main

import (
	"fmt"
	"sync"
)

// task simulates a small unit of work: it announces that it has started and
// then prints three numbered steps. Lines printed by different task goroutines
// may interleave in any order. wg.Done is deferred so the WaitGroup is
// released however the function returns.
func task(id int, wg *sync.WaitGroup) {
	const steps = 3
	defer wg.Done()

	fmt.Printf("task %d: started\n", id)
	for step := 0; step != steps; step++ {
		fmt.Printf("task %d: step %d\n", id, step)
	}
}

// main starts two task goroutines and blocks until both have signalled
// completion through the WaitGroup.
func main() {
	var wg sync.WaitGroup

	// Launch order does not determine execution order: the scheduler decides
	// how the two goroutines' output interleaves.
	for _, id := range []int{1, 2} {
		wg.Add(1)
		go task(id, &wg)
	}

	wg.Wait()
	fmt.Println("both tasks completed — order was non-deterministic")
}

Gotcha: Assuming concurrent goroutines run in the order they were launched — the scheduler decides order; never rely on launch order for correctness.

Concept: Parallelism means executing two or more instructions at the same time.

Why it matters: Parallelism requires multiple hardware threads — it's a hardware property that concurrency can exploit when GOMAXPROCS > 1.

package main

import (
	"fmt"
	"runtime"
	"sync"
	"time"
)

// cpuWork runs a purely computational loop (summing the integers below
// 50,000,000) and then prints the sum together with how long the loop took.
// Two such workers can only overlap in time when more than one P is available.
// wg.Done is deferred so the caller's Wait is always released.
func cpuWork(id int, wg *sync.WaitGroup) {
	defer wg.Done()

	const limit = 50_000_000
	began := time.Now()

	total := 0
	for i := 1; i < limit; i++ { // starting at 1: adding 0 changes nothing
		total += i
	}

	fmt.Printf("worker %d: sum=%d elapsed=%v\n", id, total, time.Since(began))
}

// main reports the current GOMAXPROCS value, runs two cpuWork goroutines, and
// prints the wall-clock time it took for both to finish.
func main() {
	procs := runtime.GOMAXPROCS(0) // an argument of 0 only queries the setting
	fmt.Printf("GOMAXPROCS=%d (parallel goroutines possible)\n", procs)

	var wg sync.WaitGroup
	began := time.Now()

	// With more than one P the runtime may execute both workers at the same
	// instant; which core each one lands on is not chosen by this code.
	for _, id := range []int{1, 2} {
		wg.Add(1)
		go cpuWork(id, &wg)
	}
	wg.Wait()

	fmt.Printf("total elapsed: %v (less than 2× single-worker time = parallelism)\n", time.Since(began))
}

Gotcha: Setting GOMAXPROCS=1 and expecting parallel execution — with one logical processor, goroutines are concurrent but not parallel.

Concept: Operating systems employ schedulers to create the illusion that multiple programs run simultaneously, even on single-threaded hardware.

Why it matters: Understanding OS scheduling explains why goroutines don't need OS threads 1:1 — the Go scheduler layers on top and multiplexes efficiently.

package main

import (
	"fmt"
	"runtime"
	"time"
)

// main launches 10,000 goroutines that each sleep for 100ms and then signal
// on a shared unbuffered channel; it returns once every signal has been
// received. The Go scheduler multiplexes these goroutines onto OS threads.
func main() {
	const count = 10_000

	fmt.Printf("OS threads (GOMAXPROCS): %d\n", runtime.GOMAXPROCS(0))

	done := make(chan struct{})
	for launched := 0; launched < count; launched++ {
		go func() {
			// A sleeping goroutine is parked by the runtime rather than
			// holding on to an OS thread.
			time.Sleep(100 * time.Millisecond)
			done <- struct{}{}
		}()
	}

	// Collect exactly one completion signal per goroutine.
	for remaining := count; remaining > 0; remaining-- {
		<-done
	}

	fmt.Printf("completed %d goroutines — OS would need far fewer threads\n", count)
}

Gotcha: Dedicating one OS thread to each request (the classic Java thread-per-request model) — in Go, use one goroutine per request instead; goroutines are designed for exactly this and cost far less than OS threads.

Concept: Schedulers use time slices to allocate CPU time to threads — context switching between threads incurs overhead.

Why it matters: Context switch overhead (~1–10µs on Linux) accumulates — too many threads wastes more CPU on switching than on real work.

package main

import (
	"fmt"
	"runtime"
	"time"
)

// pingPong bounces an empty struct between the calling goroutine and a helper
// goroutine n times over a single unbuffered channel, and returns how long the
// whole exchange took. Every send and receive on the unbuffered channel blocks
// until the peer is ready, so each round trip forces hand-offs between the two
// goroutines.
func pingPong(n int) time.Duration {
	ball := make(chan struct{})
	began := time.Now()

	// Peer: wait for the ball, then send it straight back.
	go func() {
		for left := n; left > 0; left-- {
			<-ball
			ball <- struct{}{}
		}
	}()

	// Caller: serve the ball, then wait for it to come back.
	for round := 0; round < n; round++ {
		ball <- struct{}{}
		<-ball
	}

	return time.Since(began)
}

// main restricts the runtime to a single P, runs 100,000 ping-pong round
// trips, and prints the total time plus the average per hand-off (two
// hand-offs are counted per round trip).
func main() {
	const rounds = 100_000

	runtime.GOMAXPROCS(1) // one P: both goroutines must take turns

	total := pingPong(rounds)
	fmt.Printf("%d ping-pongs in %v\n", rounds, total)

	perSwitch := total / time.Duration(rounds*2)
	fmt.Printf("avg per switch: %v\n", perSwitch)
}

Gotcha: Launching a separate goroutine for every trivial task — when the work itself is tiny, goroutine creation and context-switching overhead can cost more than the work.

Concept: Understanding the difference between CPU-bound and IO-bound workloads is crucial for optimizing performance.

Why it matters: The right concurrency strategy depends on workload type — CPU-bound needs parallelism, IO-bound needs concurrency (goroutines that yield while waiting).

package main

import (
	"fmt"
	"time"
)

// cpuBound is a pure-computation workload: it adds up i*i for every i below
// 10,000,000 and returns the total. The loop performs no I/O and never blocks.
// NOTE: the running total exceeds the range of int and wraps around, so the
// returned value is only useful as a sink that keeps the loop from being
// optimised away, not as a mathematically correct sum of squares.
func cpuBound() int {
	const limit = 10_000_000

	total := 0
	// Counting down gives the same wrapped total: integer addition is
	// commutative, and the i == 0 term contributes nothing.
	for i := limit - 1; i > 0; i-- {
		total += i * i
	}
	return total
}

// ioBound stands in for a network or disk call: it sleeps for 100ms (during
// which the goroutine is parked and its thread is free to run others) and
// then returns a completion message that includes id.
func ioBound(id int) string {
	const simulatedLatency = 100 * time.Millisecond

	time.Sleep(simulatedLatency)

	msg := fmt.Sprintf("io-%d done", id)
	return msg
}

// main times one CPU-bound call, then times 100 IO-bound calls that run
// concurrently in their own goroutines, and prints both durations.
func main() {
	// CPU-bound phase: a single call on the current goroutine.
	began := time.Now()
	_ = cpuBound() // result intentionally discarded; only the elapsed time matters
	fmt.Println("cpu-bound:", time.Since(began))

	// IO-bound phase: every call waits 100ms, but the waits overlap, so the
	// total is roughly one latency rather than one hundred.
	const fetches = 100
	began = time.Now()
	results := make(chan string, fetches) // buffered so no sender ever blocks
	for id := 0; id < fetches; id++ {
		go func(n int) { results <- ioBound(n) }(id)
	}
	for pending := fetches; pending > 0; pending-- {
		<-results
	}
	fmt.Println("io-bound (100 concurrent):", time.Since(began))
}

Gotcha: Using a worker pool sized to runtime.NumCPU() for IO-bound work — IO-bound work benefits from far more goroutines than CPU cores available.

Concept: Scheduler Period — time interval during which the scheduler aims to execute all runnable threads; as threads increase, each gets less time.

Why it matters: More goroutines competing for the same scheduler period means each gets a smaller slice — unbounded goroutine creation degrades throughput.

package main

import (
	"fmt"
	"runtime"
	"sync"
	"time"
)

// runWithWorkers starts `workers` goroutines that drain a shared queue, then
// enqueues `tasks` items, closes the queue, and waits for the workers to exit.
// Each item costs a one-microsecond sleep. The returned duration covers
// enqueueing and processing only; the workers are started before the clock.
func runWithWorkers(tasks int, workers int) time.Duration {
	// The queue is buffered to hold every task, so the producer never blocks.
	queue := make(chan int, tasks)

	var pool sync.WaitGroup
	drain := func() {
		defer pool.Done()
		for range queue {
			time.Sleep(time.Microsecond) // stand-in for a tiny piece of IO-like work
		}
	}
	for w := 0; w < workers; w++ {
		pool.Add(1)
		go drain()
	}

	began := time.Now()
	for id := 0; id < tasks; id++ {
		queue <- id
	}
	close(queue) // ends every worker's range loop once the queue is empty
	pool.Wait()

	return time.Since(began)
}

// main runs the same 10,000 tasks twice — once with one worker per CPU and
// once with one worker per task — and prints both timings for comparison.
func main() {
	const tasks = 10_000
	cpus := runtime.NumCPU()

	pooled := runWithWorkers(tasks, cpus)     // one worker per CPU
	unbounded := runWithWorkers(tasks, tasks) // one worker per task

	fmt.Printf("pool(%d workers):  %v\n", cpus, pooled)
	fmt.Printf("unbounded(%d goroutines): %v\n", tasks, unbounded)
}

Gotcha: Assuming more goroutines always means more throughput — past the optimal pool size, scheduler overhead reverses the gains.

Concept: Context Switch — switching execution from one thread to another; involves saving and loading state, resulting in performance overhead.

Why it matters: Each context switch saves and restores ~100 registers — for high-frequency operations this overhead dominates actual work time.

package main

import (
	"fmt"
	"runtime"
	"sync"
)

// contextSwitchCost performs n round trips between the caller and a helper
// goroutine over two unbuffered channels: the caller signals on one channel
// and waits on the other, while the helper does the reverse. Every operation
// blocks until the peer is ready, so each round trip forces hand-offs between
// the two goroutines. The function itself measures nothing; time it from
// outside (for example in a benchmark).
func contextSwitchCost(n int) {
	ping, pong := make(chan struct{}), make(chan struct{})

	var helper sync.WaitGroup
	helper.Add(1)
	go func() {
		defer helper.Done()
		for left := n; left > 0; left-- {
			<-ping             // block until the caller signals
			pong <- struct{}{} // answer, handing control back
		}
	}()

	for round := 0; round < n; round++ {
		ping <- struct{}{} // wake the helper
		<-pong             // block until it answers
	}

	helper.Wait()
}

// main pins the runtime to a single P and drives one million channel
// hand-offs through contextSwitchCost, printing a line before and after.
func main() {
	const roundTrips = 1_000_000

	runtime.GOMAXPROCS(1) // one P: every hand-off happens on the same processor

	fmt.Println("measuring context switch overhead...")
	contextSwitchCost(roundTrips)
	fmt.Println("done — run with -benchmem for precise measurement")
	// Suggested benchmark invocation:
	//   go test -bench=BenchmarkContextSwitch -benchmem -run=^$
}

Gotcha: Using unbuffered channels as mutexes in a hot path — each send+receive forces two context switches per critical section versus one lock/unlock.

Concept: CPU-Bound Workload — a workload where a thread continuously performs computations without waiting for external events; benefits from parallelism.

Why it matters: CPU-bound goroutines never yield voluntarily — they need separate hardware threads to achieve true speedup.

package main

import (
	"fmt"
	"runtime"
	"sync"
	"time"
)

// matMulRow computes one row of the matrix product a×b and stores it in
// result[row]. It is pure computation with no I/O.
//
// a is read at a[row][k] for k in [0, len(b)); b supplies the inner dimension
// (len(b)) and the column count (len(b[0])); result[row] must already have
// room for len(b[0]) values. When b has no rows or no columns there is
// nothing to compute and the function returns without touching result
// (previously an empty b panicked on b[0]).
func matMulRow(a, b [][]float64, result [][]float64, row int) {
	if len(b) == 0 || len(b[0]) == 0 {
		return
	}

	inner, cols := len(b), len(b[0])
	// Hoist the row lookups out of the hot loops.
	src, dst := a[row], result[row]

	for col := 0; col < cols; col++ {
		sum := 0.0
		for k := 0; k < inner; k++ {
			sum += src[k] * b[k][col]
		}
		dst[col] = sum
	}
}

// parallelMatMul returns the matrix product a×b, computing each output row in
// its own goroutine. A semaphore sized to runtime.NumCPU() caps how many rows
// are in flight at once, which suits CPU-bound work.
//
// The result has len(a) rows and len(b[0]) columns. When b has no rows or no
// columns the result rows are empty and no goroutines are started (previously
// a non-empty a combined with an empty b panicked on b[0]).
func parallelMatMul(a, b [][]float64) [][]float64 {
	rows := len(a)
	cols := 0
	if len(b) > 0 {
		cols = len(b[0])
	}

	result := make([][]float64, rows)
	for i := range result {
		result[i] = make([]float64, cols)
	}
	if cols == 0 {
		return result // nothing to compute
	}

	workers := runtime.NumCPU() // one in-flight row per core
	sem := make(chan struct{}, workers)
	var wg sync.WaitGroup

	for row := 0; row < rows; row++ {
		wg.Add(1)
		sem <- struct{}{} // blocks here while `workers` rows are already running
		go func(r int) {
			defer wg.Done()
			defer func() { <-sem }() // free the slot for the next row
			// Rows are disjoint, so concurrent writes to result never overlap.
			matMulRow(a, b, result, r)
		}(row)
	}
	wg.Wait()
	return result
}

// main builds two 100×100 matrices (a[i][j] = i+j, b[i][j] = i*j), multiplies
// them with parallelMatMul, and prints the elapsed time and the top-left cell.
func main() {
	const n = 100

	a, b := make([][]float64, n), make([][]float64, n)
	for i := 0; i < n; i++ {
		rowA, rowB := make([]float64, n), make([]float64, n)
		for j := 0; j < n; j++ {
			rowA[j] = float64(i + j)
			rowB[j] = float64(i * j)
		}
		a[i], b[i] = rowA, rowB
	}

	began := time.Now()
	product := parallelMatMul(a, b)
	fmt.Printf("matmul %dx%d in %v, result[0][0]=%.2f\n",
		n, n, time.Since(began), product[0][0])
}

Gotcha: Using more goroutines than CPUs for CPU-bound work — the extra goroutines compete for the same cores, adding scheduling overhead without extra throughput.

Concept: IO-Bound Workload — a workload where a thread frequently waits for I/O; can achieve concurrency even on single-threaded hardware.

Why it matters: IO-bound goroutines spend most of their time parked — one OS thread can drive thousands of concurrent IO operations through the network poller.

package main

import (
	"fmt"
	"sync"
	"time"
)

// fetchData simulates an I/O call: it sleeps for latency (the goroutine is
// parked on a timer while it waits) and then returns "data-<id>". The error
// result is always nil in this simulation.
func fetchData(id int, latency time.Duration) (string, error) {
	time.Sleep(latency)

	payload := fmt.Sprintf("data-%d", id)
	return payload, nil
}

// concurrentFetch issues count simulated fetches at once, one goroutine per
// fetch, and returns their results indexed by fetch id. Every goroutine
// writes only its own slot of the slice, so no locking is needed.
func concurrentFetch(count int, latency time.Duration) []string {
	out := make([]string, count)

	var pending sync.WaitGroup
	pending.Add(count)
	for id := 0; id < count; id++ {
		go func(slot int) {
			defer pending.Done()
			// The error is dropped: fetchData in this demo always returns nil.
			out[slot], _ = fetchData(slot, latency)
		}(id)
	}
	pending.Wait()

	return out
}

// main runs 50 simulated fetches of 100ms each concurrently and prints the
// measured time next to what a sequential run would have cost.
func main() {
	const count = 50
	latency := 100 * time.Millisecond

	began := time.Now()
	results := concurrentFetch(count, latency)
	elapsed := time.Since(began)

	fmt.Printf("fetched %d items in %v\n", len(results), elapsed)

	// Because the waits overlap, the concurrent figure should be close to a
	// single latency rather than count × latency.
	sequential := time.Duration(count) * latency
	fmt.Printf("expected sequential: %v, actual concurrent: %v\n",
		sequential, elapsed)
}

Gotcha: Applying a CPU-sized worker pool (NumCPU workers) to IO-bound work — IO-bound tasks benefit from 100s or 1000s of goroutines, not just 8.

Concept: Logical Processor (P) — an abstraction representing a resource that can execute goroutines; count typically equals hardware thread count.

Why it matters: Each P has a local run queue — goroutines are distributed across Ps for parallel execution; GOMAXPROCS sets the P count.

package main

import (
	"fmt"
	"runtime"
)

// main prints the current P count and the hardware thread count, lowers
// GOMAXPROCS to 2 to show the effect, restores the original value, and
// finally prints how many goroutines exist.
func main() {
	// Passing 0 leaves the setting untouched and returns its current value.
	original := runtime.GOMAXPROCS(0)
	fmt.Printf("current P count: %d\n", original)
	fmt.Printf("hardware threads: %d\n", runtime.NumCPU())

	// Temporarily cap the runtime at two Ps, then read the setting back.
	runtime.GOMAXPROCS(2)
	limited := runtime.GOMAXPROCS(0)
	fmt.Printf("after set: %d\n", limited)

	// Put the original P count back.
	runtime.GOMAXPROCS(original)

	fmt.Printf("goroutines running: %d\n", runtime.NumGoroutine())
}

Gotcha: Setting GOMAXPROCS lower than the number of blocking syscalls — blocked OS threads don't count against GOMAXPROCS; Go creates extra threads for syscalls automatically.

Concept: Machine (M) — an OS thread responsible for executing goroutines; each logical processor is assigned an OS thread.

Why it matters: M (OS thread) is the unit the OS schedules — when a goroutine makes a blocking syscall, the M blocks and Go creates a new M to keep the P busy.

package main

import (
	"fmt"
	"runtime"
	"runtime/debug"
	"sync"
)

// main demonstrates goroutines pinned to OS threads (Ms).
//
// It starts 2×NumCPU goroutines that each call runtime.LockOSThread and then
// stay alive until released. Holding them all at once is what makes the demo
// meaningful: a pinned goroutine keeps its thread to itself, so that many
// threads must exist at the same time. (Previously each goroutine exited
// immediately, so nothing required the pinned threads to coexist.)
func main() {
	blockCount := runtime.NumCPU() * 2

	var (
		pinned  sync.WaitGroup          // released once every goroutine holds its thread
		wg      sync.WaitGroup          // released once every goroutine has exited
		release = make(chan struct{})   // closed to let the pinned goroutines finish
	)
	pinned.Add(blockCount)
	wg.Add(blockCount)

	for i := 0; i < blockCount; i++ {
		go func() {
			defer wg.Done()
			// LockOSThread wires this goroutine to its current OS thread;
			// useful for cgo, thread-local storage, or OS-specific calls.
			runtime.LockOSThread()
			defer runtime.UnlockOSThread()
			fmt.Printf("locked to OS thread\n")

			pinned.Done()
			<-release // stay pinned until main has taken its reading
		}()
	}

	// Every goroutine is alive and pinned at this point.
	pinned.Wait()
	fmt.Printf("goroutines: %d\n", runtime.NumGoroutine())

	close(release)
	wg.Wait()

	// debug.GCStats carries garbage-collector data only, not thread counts;
	// report what it actually contains. (Thread-creation data is exposed by
	// the "threadcreate" profile in runtime/pprof.)
	var stats debug.GCStats
	debug.ReadGCStats(&stats)
	fmt.Printf("GC cycles so far: %d\n", stats.NumGC)
}

Gotcha: Calling runtime.LockOSThread() without a matching runtime.UnlockOSThread() — the goroutine permanently owns its M, preventing other goroutines from using it.

Concept: Goroutine (G) — a lightweight, application-level thread managed by the Go scheduler; multiplexed onto OS threads.

Why it matters: Goroutines start at 2KB vs OS threads at 1–8MB — you can run millions of goroutines where OS threads would exhaust memory at thousands.

package main

import (
	"fmt"
	"runtime"
	"sync"
)

// main launches 10,000 short-lived goroutines, records the largest value of
// runtime.NumGoroutine observed by any of them, and then prints that peak
// together with the current heap allocation.
func main() {
	const launches = 10000

	var (
		wg            sync.WaitGroup
		mu            sync.Mutex // guards maxGoroutines
		maxGoroutines int
	)

	// observe samples the live goroutine count and keeps the maximum.
	observe := func() {
		defer wg.Done()
		mu.Lock()
		defer mu.Unlock()
		if live := runtime.NumGoroutine(); live > maxGoroutines {
			maxGoroutines = live
		}
	}

	wg.Add(launches)
	for i := 0; i < launches; i++ {
		go observe()
	}
	wg.Wait()

	var mem runtime.MemStats
	runtime.ReadMemStats(&mem)
	fmt.Printf("peak goroutines: %d\n", maxGoroutines)
	fmt.Printf("heap alloc: %d MB\n", mem.HeapAlloc/1024/1024)
	// NOTE(review): HeapAlloc counts heap objects and is read after the
	// goroutines have exited; goroutine stack memory is reported separately
	// in MemStats (e.g. StackInuse), so this figure is only a rough indicator.
}

Gotcha: Treating goroutines as free — they start cheap but each has a stack, a scheduler entry, and cleanup cost; unbounded creation still causes OOM.

Concept: Network Poller — a mechanism that handles asynchronous network operations, preventing goroutines from blocking OS threads during network I/O.

Why it matters: The network poller allows thousands of goroutines to wait for network events using a handful of OS threads — the key to Go's scalable HTTP servers.

package main

import (
	"fmt"
	"net/http"
	"sync"
	"time"
)

// fetchConcurrently issues an HTTP GET for every URL at the same time, one
// goroutine per URL, and returns the response status codes in the same order
// as urls. A request that fails (bad URL, connection error, timeout) leaves 0
// in its slot.
//
// While a goroutine waits on the network it is parked and the network poller
// watches its socket, so the OS thread is free to run other goroutines.
//
// The requests go through a client with an explicit timeout: http.Get uses
// http.DefaultClient, which has none, so a single stalled server would
// otherwise block wg.Wait — and therefore this function — forever.
func fetchConcurrently(urls []string) []int {
	client := &http.Client{Timeout: 10 * time.Second} // shared by all requests

	results := make([]int, len(urls))
	var wg sync.WaitGroup

	for i, url := range urls {
		wg.Add(1)
		go func(idx int, u string) {
			defer wg.Done()
			resp, err := client.Get(u)
			if err != nil {
				return // results[idx] keeps its zero value
			}
			defer resp.Body.Close()
			// Each goroutine writes only its own slot, so no lock is needed.
			results[idx] = resp.StatusCode
		}(i, url)
	}
	wg.Wait()
	return results
}

// main only prints a short explanation; the simplified demo makes no real
// HTTP requests.
func main() {
	// Blank use of the time package, kept so its import is never flagged as
	// unused in this simplified demo.
	_ = time.Second

	// In real use: fetchConcurrently([]string{"https://go.dev", ...})
	for _, line := range []string{
		"network poller enables concurrent HTTP with minimal OS threads",
		"goroutine parks on read → network poller notifies on data available",
	} {
		fmt.Println(line)
	}
}

Gotcha: Using net.Dial with a blocking read in a goroutine and wondering why Go needs so many threads — it doesn't; the network poller handles this correctly via epoll/kqueue.

Concept: Work Stealing — a strategy employed by the Go scheduler to balance workload; idle Ps steal goroutines from busy Ps.

Why it matters: Work stealing prevents CPU cores from sitting idle while other cores have queued goroutines — automatic load balancing without programmer intervention.

package main

import (
	"fmt"
	"runtime"
	"sync"
	"sync/atomic"
	"time"
)

// unevenWork starts one goroutine per task and returns the combined result.
// A task of size w sums the integers below w*1000; tasks in the first half of
// the slice are made ten times heavier so that the load is deliberately
// uneven. Partial sums are merged with an atomic add. Which P runs which
// goroutine is decided by the scheduler, which may rebalance by work stealing.
func unevenWork(tasks []int) int64 {
	var (
		total   int64
		pending sync.WaitGroup
	)
	heavyBelow := len(tasks) / 2 // indices below this get the 10× weight

	// burn sums 0..iterations-1 and folds the result into total.
	burn := func(iterations int) {
		defer pending.Done()
		var partial int64
		for j := 0; j < iterations; j++ {
			partial += int64(j)
		}
		atomic.AddInt64(&total, partial)
	}

	for idx, size := range tasks {
		if idx < heavyBelow {
			size *= 10
		}
		pending.Add(1)
		go burn(size * 1000)
	}

	pending.Wait()
	return total
}

// main fixes the runtime at four Ps, builds 100 tasks of sizes 1..100, runs
// them through unevenWork, and prints the total with the elapsed time.
func main() {
	runtime.GOMAXPROCS(4)

	const count = 100
	tasks := make([]int, 0, count)
	for size := 1; size <= count; size++ {
		tasks = append(tasks, size)
	}

	began := time.Now()
	total := unevenWork(tasks)
	fmt.Printf("total=%d elapsed=%v (work stolen across Ps)\n", total, time.Since(began))
}

Gotcha: Assuming goroutines always run on the P that created them — work stealing means a goroutine may execute on any P, so never rely on goroutine-local storage via P identity.

Concept: Go effectively converts IO-bound work into CPU-bound work from the OS's perspective, maximizing hardware thread utilization.

Why it matters: Instead of the OS seeing hundreds of blocked threads (IO-bound), it sees a small number of always-busy threads (CPU-bound) — better OS scheduling and lower overhead.

package main

import (
	"fmt"
	"runtime"
	"sync"
	"time"
)

// simulateIOWorkload starts the given number of goroutines, each of which
// sleeps for 10ms to stand in for an I/O wait, and returns the wall-clock
// time until all of them have finished. A sleeping goroutine is parked, so
// its thread can pick up another runnable goroutine in the meantime.
func simulateIOWorkload(goroutines int) time.Duration {
	const ioWait = 10 * time.Millisecond

	began := time.Now()

	var pending sync.WaitGroup
	wait := func() {
		defer pending.Done()
		time.Sleep(ioWait)
	}
	for launched := 0; launched < goroutines; launched++ {
		pending.Add(1)
		go wait()
	}
	pending.Wait()

	return time.Since(began)
}

// main prints the GOMAXPROCS setting, runs 1000 simulated IO-bound
// goroutines, and reports how long they took.
func main() {
	const goroutines = 1000
	procs := runtime.GOMAXPROCS(0) // query only; the value is not changed

	fmt.Printf("GOMAXPROCS: %d OS threads\n", procs)
	fmt.Printf("running 1000 IO-bound goroutines...\n")

	elapsed := simulateIOWorkload(goroutines)
	fmt.Printf("elapsed: %v (OS sees %d busy threads, not 1000 blocked threads)\n",
		elapsed, procs)
}

Gotcha: Using runtime.GOMAXPROCS(runtime.NumCPU()) for IO-heavy services — this is the default; tuning it lower for IO services can actually improve throughput by reducing context switching.

Concept: Goroutines are created using the go keyword followed by a function call.

Why it matters: The go keyword is the only way to create a goroutine — it's syntactically minimal but semantically profound: the function runs independently and concurrently.

package main

import (
	"fmt"
	"sync"
)

// greet writes a one-line greeting for name to standard output.
func greet(name string) {
	const format = "hello, %s\n"
	fmt.Printf(format, name)
}

// main shows three ways of starting a goroutine with the go keyword and waits
// for all of them with a WaitGroup. The three greetings may print in any
// order.
//
// The second example used to be a copy of the first even though it was
// labelled "go with a named function"; it now really starts a named function
// value.
func main() {
	var wg sync.WaitGroup

	// 1) go + anonymous function literal: the literal wraps the call so that
	//    wg.Done can be deferred next to it. The go statement returns
	//    immediately.
	wg.Add(1)
	go func() {
		defer wg.Done()
		greet("Harish")
	}()

	// 2) go + named function value: the argument is evaluated right here, at
	//    the go statement, before the new goroutine starts running.
	greetAndSignal := func(name string) {
		defer wg.Done()
		greet(name)
	}
	wg.Add(1)
	go greetAndSignal("Alice")

	// 3) go + function literal with a parameter — the most common pattern:
	//    passing the value as an argument gives the goroutine its own copy
	//    instead of a captured variable.
	name := "Bob"
	wg.Add(1)
	go func(n string) {
		defer wg.Done()
		greet(n)
	}(name)

	wg.Wait()
}

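Gotcha: Capturing a loop variable inside a goroutine closure, e.g. go func() { greet(name) }() — before Go 1.22 every iteration shares one variable, so by the time the goroutine runs, the loop may have advanced; pass the value as an argument instead. (A direct go greet(name) is safe: its arguments are evaluated when the go statement executes.)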

Concept: Synchronization — coordinating access to shared resources among multiple goroutines to prevent data races.

Why it matters: Without synchronization, concurrent writes to shared memory produce non-deterministic results — the race detector catches these but doesn't fix them.

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// main contrasts an unsynchronised counter with an atomic one. Each phase
// starts 1000 goroutines that increment a counter once; only the atomic
// version is guaranteed to end at 1000.
func main() {
	const increments = 1000
	var wg sync.WaitGroup

	// Phase 1 — deliberately racy: ++ is a read-modify-write sequence, so
	// concurrent increments can overwrite each other and updates can be lost.
	unsafeCounter := 0
	wg.Add(increments)
	for n := 0; n < increments; n++ {
		go func() {
			defer wg.Done()
			unsafeCounter++ // DATA RACE (intentional)
		}()
	}
	wg.Wait()
	fmt.Printf("unsafe counter (racy): %d (should be 1000)\n", unsafeCounter)

	// Phase 2 — synchronised: atomic.AddInt64 performs each increment as one
	// indivisible operation, so the final value is always 1000.
	var safeCounter int64
	wg.Add(increments)
	for n := 0; n < increments; n++ {
		go func() {
			defer wg.Done()
			atomic.AddInt64(&safeCounter, 1)
		}()
	}
	wg.Wait()
	fmt.Printf("safe counter (atomic): %d\n", safeCounter)
}
// run with: go run -race main.go to detect the data race

Gotcha: Using atomic for counters but map for results — maps require a mutex; atomic only works for single numeric variables.

Concept: Orchestration — managing the interactions and communication between multiple goroutines.

Why it matters: Orchestration patterns (pipelines, fan-out, fan-in) are the structured vocabulary for coordinating goroutines — ad-hoc coordination leads to races and deadlocks.

package main

import (
	"fmt"
	"sync"
)

// generate is the source stage of the pipeline: it returns a channel that
// yields each of nums in order and is closed once the last value has been
// sent. The sending runs in its own goroutine, so generate returns at once.
func generate(nums ...int) <-chan int {
	out := make(chan int)
	go func() {
		defer close(out) // closing lets a downstream range loop terminate
		for i := range nums {
			out <- nums[i]
		}
	}()
	return out
}

// square is a transform stage: it reads integers from in, sends each value
// multiplied by itself on the returned channel, and closes that channel once
// in has been closed and drained.
func square(in <-chan int) <-chan int {
	out := make(chan int)
	go func() {
		defer close(out)
		for {
			n, ok := <-in
			if !ok {
				return // upstream closed: nothing more to transform
			}
			out <- n * n
		}
	}()
	return out
}

// fanIn merges any number of input channels into one. Every input gets its
// own forwarding goroutine; the returned channel is closed only after all
// inputs have been closed and fully drained. Values from different inputs
// arrive in no particular order.
func fanIn(channels ...<-chan int) <-chan int {
	merged := make(chan int)

	var forwarders sync.WaitGroup
	forward := func(src <-chan int) {
		defer forwarders.Done()
		for v := range src {
			merged <- v
		}
	}

	forwarders.Add(len(channels))
	for _, src := range channels {
		go forward(src)
	}

	// Close the merged channel from a separate goroutine so that fanIn can
	// return immediately.
	go func() {
		forwarders.Wait()
		close(merged)
	}()

	return merged
}

// main wires up the full pipeline: generate → square (fan-out to two
// workers) → fanIn → print.
//
// Both square stages read from the same source channel, so each number is
// squared by exactly one of them; fanIn merges their outputs back into a
// single stream. (fanIn was previously defined but never called, although the
// pipeline description mentioned it.)
func main() {
	nums := generate(1, 2, 3, 4, 5)

	// Fan-out: two workers compete for values from nums.
	first := square(nums)
	second := square(nums)

	// Fan-in: merge the workers' results. The range loop ends when fanIn
	// closes its output, which happens after both workers have finished.
	for v := range fanIn(first, second) {
		fmt.Println(v) // 1 4 9 16 25, in no guaranteed order
	}
}

Gotcha: Not closing output channels in pipeline stages — downstream range loops block forever waiting for more data that will never come.

Concept: WaitGroup — a synchronization primitive used to wait for a collection of goroutines to finish.

Why it matters: WaitGroup is the simplest correct way to block until N goroutines complete — it avoids the anti-pattern of sleeping for "long enough".

package main

import (
	"fmt"
	"sync"
)

// worker sends the square of id on results and then marks itself finished on
// wg. Done is deferred, so it runs after the send has completed and even if
// the send panics.
func worker(id int, results chan<- int, wg *sync.WaitGroup) {
	defer wg.Done()

	squared := id * id
	results <- squared
}

// main starts five workers, closes the results channel once all of them have
// finished, and prints every result it receives.
func main() {
	const n = 5

	// Buffered to n so that no worker ever blocks on its send.
	results := make(chan int, n)

	// The WaitGroup is incremented before any goroutine starts, so Wait can
	// never observe a zero counter too early.
	var wg sync.WaitGroup
	wg.Add(n)
	for id := 1; id <= n; id++ {
		go worker(id, results, &wg)
	}

	// Closer goroutine: closing after the last worker finishes is what lets
	// the receive loop below terminate.
	go func() {
		wg.Wait()
		close(results)
	}()

	for {
		r, ok := <-results
		if !ok {
			break
		}
		fmt.Println(r)
	}
}

Gotcha: Calling wg.Add(1) inside the goroutine — if the scheduler delays the goroutine, wg.Wait() may return before Add is called, missing the goroutine entirely.

Concept: Data Races — occur when multiple goroutines access the same memory location concurrently without proper synchronization.

Why it matters: Data races produce undefined behavior — the result depends on CPU cache state, scheduler timing, and memory ordering, making bugs non-reproducible.

package main

import (
	"fmt"
	"sync"
)

// raceMap is a deliberately broken example: it starts 100 goroutines that all
// write to the same plain map with no synchronisation, waits for them, and
// returns the map.
//
// Concurrent writes to a built-in map are a data race. The outcome is
// undefined; the runtime will typically detect it and abort the program.
// Do not "fix" this function — it exists to be run under the race detector.
func raceMap() map[string]int {
	m := map[string]int{}
	var wg sync.WaitGroup
	for i := 0; i < 100; i++ {
		wg.Add(1)
		// key is declared inside the loop body, so each goroutine captures
		// its own copy; the keys are distinct, the map itself is shared.
		key := fmt.Sprintf("key%d", i)
		go func() {
			defer wg.Done()
			m[key] = 1 // DATA RACE: unsynchronised concurrent write to a shared map
		}()
	}
	wg.Wait()
	return m
}

// safeMap is the race-free counterpart of the plain-map example: 100
// goroutines each store one distinct key ("key0" … "key99") with value 1 into
// a sync.Map, which is safe for concurrent use. The populated map is returned
// once every goroutine has finished.
func safeMap() *sync.Map {
	const entries = 100

	store := new(sync.Map)

	var pending sync.WaitGroup
	pending.Add(entries)
	for n := 0; n < entries; n++ {
		// The key is built here and handed over as an argument, so each
		// goroutine owns its own copy.
		go func(k string) {
			defer pending.Done()
			store.Store(k, 1)
		}(fmt.Sprintf("key%d", n))
	}
	pending.Wait()

	return store
}

// main fills a sync.Map concurrently via safeMap and prints every entry.
// Iteration order over the map is not defined.
func main() {
	// To see the race detector's report, uncomment the next line and run the
	// program with -race. DO NOT run it in production.
	// raceMap()

	printEntry := func(k, v interface{}) bool {
		fmt.Printf("%v=%v\n", k, v)
		return true // returning true continues the iteration
	}
	safeMap().Range(printEntry)
}
// detect races: go run -race main.go
// race detector adds ~10× overhead — only use in tests and staging

Gotcha: Thinking "it works in testing without -race" means there's no race — races are non-deterministic; they surface under load, not in unit tests.

Concept: Use appropriate synchronization and orchestration primitives for each problem to keep code complexity low.

Why it matters: Choosing the right primitive (mutex vs channel vs atomic vs once) directly determines code clarity — the wrong choice adds accidental complexity.

package main

import (
	"fmt"
	"sync"
)

// choose the right primitive for the problem:
// atomic:  single numeric counter with no dependent reads
// mutex:   guard access to a struct or map
// channel: communicate results between goroutines
// once:    initialize shared state exactly once

// FakeDB stands in for an expensive shared resource such as a database handle.
type FakeDB struct{ name string }

var (
	// once guards the one-time construction of sharedDB.
	once sync.Once
	// sharedDB is written exactly once, inside once.Do, and read by every
	// caller of initDB afterwards.
	sharedDB *FakeDB
)

// initDB returns the process-wide FakeDB, creating it on the first call.
// It is safe to call from many goroutines: sync.Once runs the setup function
// a single time, and every caller only proceeds after that run has completed.
func initDB() *FakeDB {
	once.Do(setupSharedDB)
	return sharedDB
}

// setupSharedDB builds the shared instance; it is only ever invoked through once.
func setupSharedDB() {
	fmt.Println("initializing DB — runs exactly once")
	sharedDB = &FakeDB{name: "production"}
}

// main has five goroutines request the shared DB at the same time. The
// initialisation message appears once; each goroutine then prints the name of
// the instance it received.
func main() {
	const callers = 5

	var wg sync.WaitGroup
	use := func() {
		defer wg.Done()
		// Every caller goes through initDB; only the first triggers setup.
		db := initDB()
		fmt.Println("using db:", db.name)
	}

	wg.Add(callers)
	for c := 0; c < callers; c++ {
		go use()
	}
	wg.Wait()
}

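Gotcha: Having each caller capture the value into a local variable from inside its own once.Do closure — the closure runs only for the first caller, so every later caller's local stays at its zero value. Instead, have once.Do set a package-level variable and read that variable after Do returns; the Once guarantees the write happens-before any call to Do returns.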