parallel checkpointing

Lou Knauer 2022-02-17 11:00:30 +01:00
parent 6ab7b6879c
commit dea577424e


@@ -35,6 +35,8 @@ type CheckpointFile struct {
 var ErrNoNewData error = errors.New("all data already archived")
+
+var NumWorkers int = (runtime.NumCPU() / 4) + 1
 
 // Metrics stored at the lowest 2 levels are not stored away (root and cluster)!
 // On a per-host basis a new JSON file is created. I have no idea if this will scale.
 // The good thing: Only a host at a time is locked, so this function can run
@@ -53,27 +55,52 @@ func (m *MemoryStore) ToCheckpoint(dir string, from, to int64) (int, error) {
     }
     m.root.lock.RUnlock()
 
-    n, errs := 0, 0
-    for i := 0; i < len(levels); i++ {
-        dir := path.Join(dir, path.Join(selectors[i]...))
-        err := levels[i].toCheckpoint(dir, from, to, m)
-        if err != nil {
-            if err == ErrNoNewData {
-                continue
-            }
-            log.Printf("checkpointing %#v failed: %s", selectors[i], err.Error())
-            errs += 1
-            continue
-        }
-        n += 1
-    }
-    if errs > 0 {
-        return n, fmt.Errorf("%d errors happend while creating checkpoints (%d successes)", errs, n)
-    }
-    return n, nil
+    type workItem struct {
+        level    *level
+        dir      string
+        selector []string
+    }
+
+    n, errs := int32(0), int32(0)
+    var wg sync.WaitGroup
+    wg.Add(NumWorkers)
+    work := make(chan workItem, NumWorkers*2)
+    for worker := 0; worker < NumWorkers; worker++ {
+        go func() {
+            defer wg.Done()
+            for workItem := range work {
+                if err := workItem.level.toCheckpoint(workItem.dir, from, to, m); err != nil {
+                    if err == ErrNoNewData {
+                        continue
+                    }
+                    log.Printf("checkpointing %#v failed: %s", workItem.selector, err.Error())
+                    atomic.AddInt32(&errs, 1)
+                } else {
+                    atomic.AddInt32(&n, 1)
+                }
+            }
+        }()
+    }
+
+    for i := 0; i < len(levels); i++ {
+        dir := path.Join(dir, path.Join(selectors[i]...))
+        work <- workItem{
+            level:    levels[i],
+            dir:      dir,
+            selector: selectors[i],
+        }
+    }
+
+    close(work)
+    wg.Wait()
+
+    if errs > 0 {
+        return int(n), fmt.Errorf("%d errors happend while creating checkpoints (%d successes)", errs, n)
+    }
+    return int(n), nil
 }
 
 func (l *level) toCheckpointFile(from, to int64, m *MemoryStore) (*CheckpointFile, error) {
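ToCheckpoint now fans the per-host work out over a fixed pool of NumWorkers goroutines: the main goroutine pushes one workItem per host into a buffered channel, the workers call toCheckpoint on each item, and successes and failures are tallied with atomic counters. Reduced to a self-contained sketch of that worker-pool pattern (hostWorkItem and checkpointHost below are illustrative placeholders, not identifiers from this repository):

package main

import (
    "fmt"
    "runtime"
    "sync"
    "sync/atomic"
)

// hostWorkItem is a hypothetical stand-in for the commit's workItem struct.
type hostWorkItem struct {
    host string
}

// checkpointHost is a placeholder for level.toCheckpoint; here it always succeeds.
func checkpointHost(w hostWorkItem) error { return nil }

func main() {
    // Same sizing rule the commit introduces as NumWorkers.
    numWorkers := (runtime.NumCPU() / 4) + 1
    work := make(chan hostWorkItem, numWorkers*2)

    var ok, failed int32
    var wg sync.WaitGroup
    wg.Add(numWorkers)

    // Consumers: a fixed pool of goroutines drains the channel until it is closed.
    for i := 0; i < numWorkers; i++ {
        go func() {
            defer wg.Done()
            for item := range work {
                if err := checkpointHost(item); err != nil {
                    atomic.AddInt32(&failed, 1)
                    continue
                }
                atomic.AddInt32(&ok, 1)
            }
        }()
    }

    // Producer: enqueue one item per host, then close the channel so the workers exit.
    for _, h := range []string{"host1", "host2", "host3"} {
        work <- hostWorkItem{host: h}
    }
    close(work)
    wg.Wait()

    fmt.Printf("%d checkpoints written, %d failed\n", ok, failed)
}

Buffering the channel at NumWorkers*2 lets the producer stay slightly ahead of the pool without queueing the entire host list in memory, and closing the channel is what lets every worker's range loop terminate before wg.Wait() returns.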
@@ -176,75 +203,61 @@ func (l *level) toCheckpoint(dir string, from, to int64, m *MemoryStore) error {
 // This function can only be called once and before the very first write or read.
 // Different host's data is loaded to memory in parallel.
 func (m *MemoryStore) FromCheckpoint(dir string, from int64) (int, error) {
-    workers := (runtime.NumCPU() / 2) + 1
-    work := make(chan [2]string, workers*2)
-    errs := make(chan error, 1)
-    var wg sync.WaitGroup
-    wg.Add(workers + 1)
-
-    go func() {
-        defer close(work)
-        defer wg.Done()
-
-        clustersDir, err := os.ReadDir(dir)
-        if err != nil {
-            errs <- err
-            return
-        }
-        for _, clusterDir := range clustersDir {
-            if !clusterDir.IsDir() {
-                errs <- errors.New("expected only directories at first level of checkpoints/ directory")
-                return
-            }
-            hostsDir, err := os.ReadDir(filepath.Join(dir, clusterDir.Name()))
-            if err != nil {
-                errs <- err
-                return
-            }
-            for _, hostDir := range hostsDir {
-                if !hostDir.IsDir() {
-                    errs <- errors.New("expected only directories at second level of checkpoints/ directory")
-                    return
-                }
-                work <- [2]string{clusterDir.Name(), hostDir.Name()}
-            }
-        }
-    }()
-
-    loadedFiles := int32(0)
-    errors := int32(0)
-    for worker := 0; worker < workers; worker++ {
-        go func() {
-            defer wg.Done()
-            for host := range work {
-                lvl := m.root.findLevelOrCreate(host[:], len(m.metrics))
-                n, err := lvl.fromCheckpoint(filepath.Join(dir, host[0], host[1]), from, m)
-                if err != nil {
-                    log.Fatalf("error while loading checkpoints: %s", err.Error())
-                    atomic.AddInt32(&errors, int32(n))
-                    continue
-                }
-                atomic.AddInt32(&loadedFiles, int32(n))
-            }
-        }()
-    }
-
-    wg.Wait()
-
-    select {
-    case err := <-errs:
-        return int(loadedFiles), err
-    default:
-    }
-
-    if errors > 0 {
-        return int(loadedFiles), fmt.Errorf("%d errors happend while creating checkpoints (%d successes)", errors, loadedFiles)
-    }
-    return int(loadedFiles), nil
+    var wg sync.WaitGroup
+    work := make(chan [2]string, NumWorkers*2)
+    n, errs := int32(0), int32(0)
+
+    wg.Add(NumWorkers)
+    for worker := 0; worker < NumWorkers; worker++ {
+        go func() {
+            defer wg.Done()
+            for host := range work {
+                lvl := m.root.findLevelOrCreate(host[:], len(m.metrics))
+                nn, err := lvl.fromCheckpoint(filepath.Join(dir, host[0], host[1]), from, m)
+                if err != nil {
+                    log.Fatalf("error while loading checkpoints: %s", err.Error())
+                    atomic.AddInt32(&errs, int32(nn))
+                    continue
+                }
+                atomic.AddInt32(&n, int32(nn))
+            }
+        }()
+    }
+
+    clustersDir, err := os.ReadDir(dir)
+    for _, clusterDir := range clustersDir {
+        if !clusterDir.IsDir() {
+            err = errors.New("expected only directories at first level of checkpoints/ directory")
+            goto done
+        }
+        hostsDir, e := os.ReadDir(filepath.Join(dir, clusterDir.Name()))
+        if e != nil {
+            err = e
+            goto done
+        }
+        for _, hostDir := range hostsDir {
+            if !hostDir.IsDir() {
+                err = errors.New("expected only directories at second level of checkpoints/ directory")
+                goto done
+            }
+            work <- [2]string{clusterDir.Name(), hostDir.Name()}
+        }
+    }
+
+done:
+    close(work)
+    wg.Wait()
+
+    if err != nil {
+        return int(n), err
+    }
+    if errs > 0 {
+        return int(n), fmt.Errorf("%d errors happend while creating checkpoints (%d successes)", errs, n)
+    }
+    return int(n), nil
 }
 
 func (l *level) loadFile(cf *CheckpointFile, m *MemoryStore) error {
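FromCheckpoint reuses the same pool size, but here the calling goroutine is the producer: it walks checkpoints/<cluster>/<host>/, enqueues each cluster/host pair, and on the first unexpected directory entry jumps to the done: label so the channel is still closed, the workers drain and exit, and only then is the error returned. A minimal sketch of that producer/worker shutdown pattern, assuming the same two-level directory layout and a hypothetical readHost helper standing in for level.fromCheckpoint:

package main

import (
    "errors"
    "fmt"
    "os"
    "path/filepath"
    "sync"
    "sync/atomic"
)

// readHost stands in for level.fromCheckpoint: it would load one host's checkpoint files.
func readHost(path string) (int, error) { return 1, nil }

// loadAll walks dir (expected layout: dir/<cluster>/<host>/) and loads hosts in parallel.
func loadAll(dir string, numWorkers int) (int, error) {
    work := make(chan string, numWorkers*2)
    loaded := int32(0)

    var wg sync.WaitGroup
    wg.Add(numWorkers)
    for i := 0; i < numWorkers; i++ {
        go func() {
            defer wg.Done()
            for hostDir := range work {
                if n, err := readHost(hostDir); err == nil {
                    atomic.AddInt32(&loaded, int32(n))
                }
            }
        }()
    }

    // Single producer with a goto-style abort: stop enqueueing on the first error,
    // but always close the channel and wait so no worker goroutine leaks.
    clusters, err := os.ReadDir(dir)
    for _, c := range clusters {
        if !c.IsDir() {
            err = errors.New("expected only directories below " + dir)
            goto done
        }
        hosts, e := os.ReadDir(filepath.Join(dir, c.Name()))
        if e != nil {
            err = e
            goto done
        }
        for _, h := range hosts {
            work <- filepath.Join(dir, c.Name(), h.Name())
        }
    }
done:
    close(work)
    wg.Wait()
    return int(loaded), err
}

func main() {
    n, err := loadAll("./checkpoints", 4)
    fmt.Println("hosts loaded:", n, "err:", err)
}

The important ordering is close(work) before wg.Wait(): without the close, workers blocked in range work would never exit and the wait would deadlock; with it, an early error still lets every already-enqueued item finish before the function reports failure.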