fix: Shard WAL consumer for higher throughput

Entire-Checkpoint: e583b7b11439
This commit is contained in:
2026-03-18 06:32:14 +01:00
parent 50aed595cf
commit bf1a8a174e
3 changed files with 269 additions and 135 deletions

View File

@@ -361,13 +361,11 @@ func DecodeLine(dec *lineprotocol.Decoder,
Value: metric.Value, Value: metric.Value,
Timestamp: time, Timestamp: time,
} }
select { if !SendWALMessage(msg) {
case WALMessages <- msg: // WAL shard channel full — metric is written to memory store but not WAL.
default:
// WAL channel full — metric is written to memory store but not WAL.
// Next binary snapshot will capture it. // Next binary snapshot will capture it.
if dropped := walDropped.Add(1); dropped%10000 == 1 { if dropped := walDropped.Add(1); dropped%10000 == 1 {
cclog.Warnf("[METRICSTORE]> WAL channel full, dropped %d messages (data safe in memory, next snapshot will capture)", dropped) cclog.Warnf("[METRICSTORE]> WAL shard channel full, dropped %d messages (data safe in memory, next snapshot will capture)", dropped)
} }
} }
} }

View File

@@ -277,7 +277,9 @@ func Shutdown() {
} }
if Keys.Checkpoints.FileFormat == "wal" { if Keys.Checkpoints.FileFormat == "wal" {
close(WALMessages) for _, ch := range walShardChs {
close(ch)
}
} }
cclog.Infof("[METRICSTORE]> Writing to '%s'...\n", Keys.Checkpoints.RootDir) cclog.Infof("[METRICSTORE]> Writing to '%s'...\n", Keys.Checkpoints.RootDir)

View File

@@ -61,6 +61,7 @@ import (
"encoding/binary" "encoding/binary"
"fmt" "fmt"
"hash/crc32" "hash/crc32"
"hash/fnv"
"io" "io"
"math" "math"
"os" "os"
@@ -80,13 +81,15 @@ const (
snapFileMagic = uint32(0xCC5B0001) // Binary snapshot magic snapFileMagic = uint32(0xCC5B0001) // Binary snapshot magic
) )
// WALMessages is the channel for sending metric writes to the WAL staging goroutine. // walShardChs holds per-shard channels for WAL messages.
// Buffered to allow burst writes without blocking the metric ingestion path. // Initialized by WALStaging; nil when WAL is not active.
var WALMessages = make(chan *WALMessage, 65536) var walShardChs []chan *WALMessage
// walRotateCh is used by the checkpoint goroutine to request WAL file rotation // walShardRotateChs holds per-shard channels for WAL rotation requests.
// (close, delete, reopen) after a binary snapshot has been written. var walShardRotateChs []chan walRotateReq
var walRotateCh = make(chan walRotateReq, 256)
// walNumShards stores the number of shards (set during WALStaging init).
var walNumShards int
// WALMessage represents a single metric write to be appended to the WAL. // WALMessage represents a single metric write to be appended to the WAL.
// Cluster and Node are NOT stored in the WAL record (inferred from file path). // Cluster and Node are NOT stored in the WAL record (inferred from file path).
@@ -112,15 +115,56 @@ type walFileState struct {
w *bufio.Writer w *bufio.Writer
} }
// WALStaging starts a background goroutine that receives WALMessage items // walShardIndex computes which shard a message belongs to based on cluster+node.
// and appends binary WAL records to per-host current.wal files. // Uses FNV-1a hash for fast, well-distributed mapping.
// Also handles WAL rotation requests from the checkpoint goroutine. func walShardIndex(cluster, node string) int {
h := fnv.New32a()
h.Write([]byte(cluster))
h.Write([]byte{0})
h.Write([]byte(node))
return int(h.Sum32()) % walNumShards
}
// SendWALMessage routes a WAL message to the appropriate shard channel.
// Returns false when WAL is inactive (shards not initialized) or the shard
// channel is full (message dropped).
func SendWALMessage(msg *WALMessage) bool {
if walShardChs == nil {
return false
}
shard := walShardIndex(msg.Cluster, msg.Node)
select {
case walShardChs[shard] <- msg:
return true
default:
return false
}
}
// WALStaging starts N sharded background goroutines that receive WALMessage items
// and append binary WAL records to per-host current.wal files.
// Each shard owns a subset of hosts (determined by hash of cluster+node),
// parallelizing serialization and I/O while keeping per-host writes serial.
func WALStaging(wg *sync.WaitGroup, ctx context.Context) { func WALStaging(wg *sync.WaitGroup, ctx context.Context) {
wg.Go(func() {
if Keys.Checkpoints.FileFormat == "json" { if Keys.Checkpoints.FileFormat == "json" {
return return
} }
walNumShards = max(Keys.NumWorkers, 1)
chBufSize := max(65536/walNumShards, 1024)
walShardChs = make([]chan *WALMessage, walNumShards)
walShardRotateChs = make([]chan walRotateReq, walNumShards)
for i := range walNumShards {
walShardChs[i] = make(chan *WALMessage, chBufSize)
walShardRotateChs[i] = make(chan walRotateReq, 64)
}
for i := range walNumShards {
msgCh := walShardChs[i]
rotateCh := walShardRotateChs[i]
wg.Go(func() {
hostFiles := make(map[string]*walFileState) hostFiles := make(map[string]*walFileState)
defer func() { defer func() {
@@ -175,7 +219,7 @@ func WALStaging(wg *sync.WaitGroup, ctx context.Context) {
if ws == nil { if ws == nil {
return return
} }
if err := writeWALRecord(ws.w, msg); err != nil { if err := writeWALRecordDirect(ws.w, msg); err != nil {
cclog.Errorf("[METRICSTORE]> WAL: write record: %v", err) cclog.Errorf("[METRICSTORE]> WAL: write record: %v", err)
} }
} }
@@ -194,23 +238,26 @@ func WALStaging(wg *sync.WaitGroup, ctx context.Context) {
close(req.done) close(req.done)
} }
drain := func() { flushAll := func() {
for {
select {
case msg, ok := <-WALMessages:
if !ok {
return
}
processMsg(msg)
case req := <-walRotateCh:
processRotate(req)
default:
// Flush all buffered writers after draining remaining messages.
for _, ws := range hostFiles { for _, ws := range hostFiles {
if ws.f != nil { if ws.f != nil {
ws.w.Flush() ws.w.Flush()
} }
} }
}
drain := func() {
for {
select {
case msg, ok := <-msgCh:
if !ok {
return
}
processMsg(msg)
case req := <-rotateCh:
processRotate(req)
default:
flushAll()
return return
} }
} }
@@ -221,7 +268,7 @@ func WALStaging(wg *sync.WaitGroup, ctx context.Context) {
case <-ctx.Done(): case <-ctx.Done():
drain() drain()
return return
case msg, ok := <-WALMessages: case msg, ok := <-msgCh:
if !ok { if !ok {
return return
} }
@@ -230,44 +277,56 @@ func WALStaging(wg *sync.WaitGroup, ctx context.Context) {
// Drain up to 256 more messages without blocking to batch writes. // Drain up to 256 more messages without blocking to batch writes.
for range 256 { for range 256 {
select { select {
case msg, ok := <-WALMessages: case msg, ok := <-msgCh:
if !ok { if !ok {
return return
} }
processMsg(msg) processMsg(msg)
case req := <-walRotateCh: case req := <-rotateCh:
processRotate(req) processRotate(req)
default: default:
goto flushed goto flushed
} }
} }
flushed: flushed:
// Flush all buffered writers after processing the batch. flushAll()
for _, ws := range hostFiles { case req := <-rotateCh:
if ws.f != nil {
ws.w.Flush()
}
}
case req := <-walRotateCh:
processRotate(req) processRotate(req)
} }
} }
}) })
}
} }
// RotateWALFiles sends rotation requests for the given host directories // RotateWALFiles sends rotation requests for the given host directories
// and blocks until all rotations complete. // and blocks until all rotations complete. Each request is routed to the
// shard that owns the host directory.
func RotateWALFiles(hostDirs []string) { func RotateWALFiles(hostDirs []string) {
if walShardRotateChs == nil {
return
}
dones := make([]chan struct{}, len(hostDirs)) dones := make([]chan struct{}, len(hostDirs))
for i, dir := range hostDirs { for i, dir := range hostDirs {
dones[i] = make(chan struct{}) dones[i] = make(chan struct{})
walRotateCh <- walRotateReq{hostDir: dir, done: dones[i]} // Extract cluster/node from hostDir to find the right shard.
// hostDir = rootDir/cluster/node
shard := walShardIndexFromDir(dir)
walShardRotateChs[shard] <- walRotateReq{hostDir: dir, done: dones[i]}
} }
for _, done := range dones { for _, done := range dones {
<-done <-done
} }
} }
// walShardIndexFromDir extracts cluster and node from a host directory path
// and returns the shard index. The path format is: rootDir/cluster/node
func walShardIndexFromDir(hostDir string) int {
// hostDir = .../cluster/node — extract last two path components
node := path.Base(hostDir)
cluster := path.Base(path.Dir(hostDir))
return walShardIndex(cluster, node)
}
// RotateWALFilesAfterShutdown sends rotation requests for the given host directories // RotateWALFilesAfterShutdown sends rotation requests for the given host directories
// and blocks until all rotations complete. // and blocks until all rotations complete.
func RotateWALFilesAfterShutdown(hostDirs []string) { func RotateWALFilesAfterShutdown(hostDirs []string) {
@@ -279,6 +338,81 @@ func RotateWALFilesAfterShutdown(hostDirs []string) {
} }
} }
// writeWALRecordDirect encodes a WAL record directly into the bufio.Writer,
// reducing heap allocations by using stack-allocated scratch buffers for
// the fixed-size header/trailer and computing the CRC incrementally.
func writeWALRecordDirect(w *bufio.Writer, msg *WALMessage) error {
// Compute payload size.
payloadSize := 8 + 2 + len(msg.MetricName) + 1 + 4
for _, s := range msg.Selector {
payloadSize += 1 + len(s)
}
// Write magic + payload length (8 bytes header).
var hdr [8]byte
binary.LittleEndian.PutUint32(hdr[0:4], walRecordMagic)
binary.LittleEndian.PutUint32(hdr[4:8], uint32(payloadSize))
if _, err := w.Write(hdr[:]); err != nil {
return err
}
// We need to compute CRC over the payload as we write it.
crc := crc32.NewIEEE()
// Timestamp (8 bytes).
var scratch [8]byte
binary.LittleEndian.PutUint64(scratch[:8], uint64(msg.Timestamp))
crc.Write(scratch[:8])
if _, err := w.Write(scratch[:8]); err != nil {
return err
}
// Metric name length (2 bytes) + metric name.
binary.LittleEndian.PutUint16(scratch[:2], uint16(len(msg.MetricName)))
crc.Write(scratch[:2])
if _, err := w.Write(scratch[:2]); err != nil {
return err
}
nameBytes := []byte(msg.MetricName)
crc.Write(nameBytes)
if _, err := w.Write(nameBytes); err != nil {
return err
}
// Selector count (1 byte).
scratch[0] = byte(len(msg.Selector))
crc.Write(scratch[:1])
if _, err := w.Write(scratch[:1]); err != nil {
return err
}
// Selectors (1-byte length + bytes each).
for _, sel := range msg.Selector {
scratch[0] = byte(len(sel))
crc.Write(scratch[:1])
if _, err := w.Write(scratch[:1]); err != nil {
return err
}
selBytes := []byte(sel)
crc.Write(selBytes)
if _, err := w.Write(selBytes); err != nil {
return err
}
}
// Value (4 bytes, float32 bits).
binary.LittleEndian.PutUint32(scratch[:4], math.Float32bits(float32(msg.Value)))
crc.Write(scratch[:4])
if _, err := w.Write(scratch[:4]); err != nil {
return err
}
// CRC32 (4 bytes).
binary.LittleEndian.PutUint32(scratch[:4], crc.Sum32())
_, err := w.Write(scratch[:4])
return err
}
// buildWALPayload encodes a WALMessage into a binary payload (without magic/length/CRC). // buildWALPayload encodes a WALMessage into a binary payload (without magic/length/CRC).
func buildWALPayload(msg *WALMessage) []byte { func buildWALPayload(msg *WALMessage) []byte {
size := 8 + 2 + len(msg.MetricName) + 1 + 4 size := 8 + 2 + len(msg.MetricName) + 1 + 4