mirror of
				https://github.com/ClusterCockpit/cc-metric-store.git
				synced 2025-11-03 18:25:08 +01:00 
			
		
		
		
	parallel checkpointing
This commit is contained in:
		
							
								
								
									
										127
									
								
								archive.go
									
									
									
									
									
								
							
							
						
						
									
										127
									
								
								archive.go
									
									
									
									
									
								
							@@ -35,6 +35,8 @@ type CheckpointFile struct {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
var ErrNoNewData error = errors.New("all data already archived")
 | 
					var ErrNoNewData error = errors.New("all data already archived")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					var NumWorkers int = (runtime.NumCPU() / 4) + 1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// Metrics stored at the lowest 2 levels are not stored away (root and cluster)!
 | 
					// Metrics stored at the lowest 2 levels are not stored away (root and cluster)!
 | 
				
			||||||
// On a per-host basis a new JSON file is created. I have no idea if this will scale.
 | 
					// On a per-host basis a new JSON file is created. I have no idea if this will scale.
 | 
				
			||||||
// The good thing: Only a host at a time is locked, so this function can run
 | 
					// The good thing: Only a host at a time is locked, so this function can run
 | 
				
			||||||
@@ -53,27 +55,52 @@ func (m *MemoryStore) ToCheckpoint(dir string, from, to int64) (int, error) {
 | 
				
			|||||||
	}
 | 
						}
 | 
				
			||||||
	m.root.lock.RUnlock()
 | 
						m.root.lock.RUnlock()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	n, errs := 0, 0
 | 
						type workItem struct {
 | 
				
			||||||
	for i := 0; i < len(levels); i++ {
 | 
							level    *level
 | 
				
			||||||
		dir := path.Join(dir, path.Join(selectors[i]...))
 | 
							dir      string
 | 
				
			||||||
		err := levels[i].toCheckpoint(dir, from, to, m)
 | 
							selector []string
 | 
				
			||||||
		if err != nil {
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						n, errs := int32(0), int32(0)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						var wg sync.WaitGroup
 | 
				
			||||||
 | 
						wg.Add(NumWorkers)
 | 
				
			||||||
 | 
						work := make(chan workItem, NumWorkers*2)
 | 
				
			||||||
 | 
						for worker := 0; worker < NumWorkers; worker++ {
 | 
				
			||||||
 | 
							go func() {
 | 
				
			||||||
 | 
								defer wg.Done()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
								for workItem := range work {
 | 
				
			||||||
 | 
									if err := workItem.level.toCheckpoint(workItem.dir, from, to, m); err != nil {
 | 
				
			||||||
					if err == ErrNoNewData {
 | 
										if err == ErrNoNewData {
 | 
				
			||||||
						continue
 | 
											continue
 | 
				
			||||||
					}
 | 
										}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
			log.Printf("checkpointing %#v failed: %s", selectors[i], err.Error())
 | 
										log.Printf("checkpointing %#v failed: %s", workItem.selector, err.Error())
 | 
				
			||||||
			errs += 1
 | 
										atomic.AddInt32(&errs, 1)
 | 
				
			||||||
			continue
 | 
									} else {
 | 
				
			||||||
 | 
										atomic.AddInt32(&n, 1)
 | 
				
			||||||
 | 
									}
 | 
				
			||||||
 | 
								}
 | 
				
			||||||
 | 
							}()
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		n += 1
 | 
						for i := 0; i < len(levels); i++ {
 | 
				
			||||||
 | 
							dir := path.Join(dir, path.Join(selectors[i]...))
 | 
				
			||||||
 | 
							work <- workItem{
 | 
				
			||||||
 | 
								level:    levels[i],
 | 
				
			||||||
 | 
								dir:      dir,
 | 
				
			||||||
 | 
								selector: selectors[i],
 | 
				
			||||||
		}
 | 
							}
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						close(work)
 | 
				
			||||||
 | 
						wg.Wait()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if errs > 0 {
 | 
						if errs > 0 {
 | 
				
			||||||
		return n, fmt.Errorf("%d errors happend while creating checkpoints (%d successes)", errs, n)
 | 
							return int(n), fmt.Errorf("%d errors happend while creating checkpoints (%d successes)", errs, n)
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
	return n, nil
 | 
						return int(n), nil
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
func (l *level) toCheckpointFile(from, to int64, m *MemoryStore) (*CheckpointFile, error) {
 | 
					func (l *level) toCheckpointFile(from, to int64, m *MemoryStore) (*CheckpointFile, error) {
 | 
				
			||||||
@@ -176,75 +203,61 @@ func (l *level) toCheckpoint(dir string, from, to int64, m *MemoryStore) error {
 | 
				
			|||||||
// This function can only be called once and before the very first write or read.
 | 
					// This function can only be called once and before the very first write or read.
 | 
				
			||||||
// Different host's data is loaded to memory in parallel.
 | 
					// Different host's data is loaded to memory in parallel.
 | 
				
			||||||
func (m *MemoryStore) FromCheckpoint(dir string, from int64) (int, error) {
 | 
					func (m *MemoryStore) FromCheckpoint(dir string, from int64) (int, error) {
 | 
				
			||||||
	workers := (runtime.NumCPU() / 2) + 1
 | 
					 | 
				
			||||||
	work := make(chan [2]string, workers*2)
 | 
					 | 
				
			||||||
	errs := make(chan error, 1)
 | 
					 | 
				
			||||||
	var wg sync.WaitGroup
 | 
						var wg sync.WaitGroup
 | 
				
			||||||
	wg.Add(workers + 1)
 | 
						work := make(chan [2]string, NumWorkers*2)
 | 
				
			||||||
 | 
						n, errs := int32(0), int32(0)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						wg.Add(NumWorkers)
 | 
				
			||||||
 | 
						for worker := 0; worker < NumWorkers; worker++ {
 | 
				
			||||||
		go func() {
 | 
							go func() {
 | 
				
			||||||
		defer close(work)
 | 
					 | 
				
			||||||
			defer wg.Done()
 | 
								defer wg.Done()
 | 
				
			||||||
 | 
								for host := range work {
 | 
				
			||||||
 | 
									lvl := m.root.findLevelOrCreate(host[:], len(m.metrics))
 | 
				
			||||||
 | 
									nn, err := lvl.fromCheckpoint(filepath.Join(dir, host[0], host[1]), from, m)
 | 
				
			||||||
 | 
									if err != nil {
 | 
				
			||||||
 | 
										log.Fatalf("error while loading checkpoints: %s", err.Error())
 | 
				
			||||||
 | 
										atomic.AddInt32(&errs, int32(nn))
 | 
				
			||||||
 | 
										continue
 | 
				
			||||||
 | 
									}
 | 
				
			||||||
 | 
									atomic.AddInt32(&n, int32(nn))
 | 
				
			||||||
 | 
								}
 | 
				
			||||||
 | 
							}()
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	clustersDir, err := os.ReadDir(dir)
 | 
						clustersDir, err := os.ReadDir(dir)
 | 
				
			||||||
		if err != nil {
 | 
					 | 
				
			||||||
			errs <- err
 | 
					 | 
				
			||||||
			return
 | 
					 | 
				
			||||||
		}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	for _, clusterDir := range clustersDir {
 | 
						for _, clusterDir := range clustersDir {
 | 
				
			||||||
		if !clusterDir.IsDir() {
 | 
							if !clusterDir.IsDir() {
 | 
				
			||||||
				errs <- errors.New("expected only directories at first level of checkpoints/ directory")
 | 
								err = errors.New("expected only directories at first level of checkpoints/ directory")
 | 
				
			||||||
				return
 | 
								goto done
 | 
				
			||||||
		}
 | 
							}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
			hostsDir, err := os.ReadDir(filepath.Join(dir, clusterDir.Name()))
 | 
							hostsDir, e := os.ReadDir(filepath.Join(dir, clusterDir.Name()))
 | 
				
			||||||
			if err != nil {
 | 
							if e != nil {
 | 
				
			||||||
				errs <- err
 | 
								err = e
 | 
				
			||||||
				return
 | 
								goto done
 | 
				
			||||||
		}
 | 
							}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		for _, hostDir := range hostsDir {
 | 
							for _, hostDir := range hostsDir {
 | 
				
			||||||
			if !hostDir.IsDir() {
 | 
								if !hostDir.IsDir() {
 | 
				
			||||||
					errs <- errors.New("expected only directories at second level of checkpoints/ directory")
 | 
									err = errors.New("expected only directories at second level of checkpoints/ directory")
 | 
				
			||||||
					return
 | 
									goto done
 | 
				
			||||||
			}
 | 
								}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
			work <- [2]string{clusterDir.Name(), hostDir.Name()}
 | 
								work <- [2]string{clusterDir.Name(), hostDir.Name()}
 | 
				
			||||||
		}
 | 
							}
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
	}()
 | 
					done:
 | 
				
			||||||
 | 
						close(work)
 | 
				
			||||||
	loadedFiles := int32(0)
 | 
					 | 
				
			||||||
	errors := int32(0)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	for worker := 0; worker < workers; worker++ {
 | 
					 | 
				
			||||||
		go func() {
 | 
					 | 
				
			||||||
			defer wg.Done()
 | 
					 | 
				
			||||||
			for host := range work {
 | 
					 | 
				
			||||||
				lvl := m.root.findLevelOrCreate(host[:], len(m.metrics))
 | 
					 | 
				
			||||||
				n, err := lvl.fromCheckpoint(filepath.Join(dir, host[0], host[1]), from, m)
 | 
					 | 
				
			||||||
				if err != nil {
 | 
					 | 
				
			||||||
					log.Fatalf("error while loading checkpoints: %s", err.Error())
 | 
					 | 
				
			||||||
					atomic.AddInt32(&errors, int32(n))
 | 
					 | 
				
			||||||
					continue
 | 
					 | 
				
			||||||
				}
 | 
					 | 
				
			||||||
				atomic.AddInt32(&loadedFiles, int32(n))
 | 
					 | 
				
			||||||
			}
 | 
					 | 
				
			||||||
		}()
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	wg.Wait()
 | 
						wg.Wait()
 | 
				
			||||||
	select {
 | 
					
 | 
				
			||||||
	case err := <-errs:
 | 
						if err != nil {
 | 
				
			||||||
		return int(loadedFiles), err
 | 
							return int(n), err
 | 
				
			||||||
	default:
 | 
					 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if errors > 0 {
 | 
						if errs > 0 {
 | 
				
			||||||
		return int(loadedFiles), fmt.Errorf("%d errors happend while creating checkpoints (%d successes)", errors, loadedFiles)
 | 
							return int(n), fmt.Errorf("%d errors happend while creating checkpoints (%d successes)", errs, n)
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
	return int(loadedFiles), nil
 | 
						return int(n), nil
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
func (l *level) loadFile(cf *CheckpointFile, m *MemoryStore) error {
 | 
					func (l *level) loadFile(cf *CheckpointFile, m *MemoryStore) error {
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user