Parallelize the Iter function in all archive backends

2025-12-16 09:11:09 +01:00
parent e6286768a7
commit 7fce6fa401
3 changed files with 144 additions and 84 deletions
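The diff below converts the S3 backend's sequential list-and-fetch loop into a producer/consumer pipeline: a bounded pool of goroutines fetches and decodes `meta.json` objects from a shared key channel, while the paginated listing acts as the producer. As a minimal, self-contained sketch of that pattern — all names here (`keys`, `out`, the fake producer loop) are illustrative, not taken from the repository:

	package main

	import (
		"fmt"
		"sync"
	)

	func main() {
		numWorkers := 4
		keys := make(chan string, numWorkers*2) // bounded buffer, as in the commit
		out := make(chan string)

		var wg sync.WaitGroup
		for range numWorkers { // Go 1.22+: range over an integer
			wg.Add(1)
			go func() {
				defer wg.Done()
				for key := range keys {
					// Stand-in for the per-key GetObject + JSON decode.
					out <- "fetched " + key
				}
			}()
		}

		go func() {
			// Producer: stand-in for the paginated ListObjectsV2 walk.
			for i := 0; i < 10; i++ {
				keys <- fmt.Sprintf("cluster/job%d/meta.json", i)
			}
			close(keys) // no more work; workers drain and exit
			wg.Wait()   // every result has been sent
			close(out)  // consumers see end-of-stream
		}()

		for s := range out {
			fmt.Println(s)
		}
	}

Buffering the key channel at `numWorkers*2`, as the commit does with `metaKeys`, lets the lister stay slightly ahead of the fetchers without holding the entire listing in memory.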

@@ -17,6 +17,7 @@ import (
 	"os"
 	"strconv"
 	"strings"
+	"sync"
 	"text/tabwriter"
 	"time"
@@ -813,29 +814,18 @@ func (s3a *S3Archive) Iter(loadMetricData bool) <-chan JobContainer {
 		ctx := context.Background()
 		defer close(ch)
-		for _, cluster := range s3a.clusters {
-			prefix := cluster + "/"
+		numWorkers := 4
+		metaKeys := make(chan string, numWorkers*2)
+		var wg sync.WaitGroup
-			paginator := s3.NewListObjectsV2Paginator(s3a.client, &s3.ListObjectsV2Input{
-				Bucket: aws.String(s3a.bucket),
-				Prefix: aws.String(prefix),
-			})
-			for paginator.HasMorePages() {
-				page, err := paginator.NextPage(ctx)
-				if err != nil {
-					cclog.Fatalf("S3Archive Iter() > list error: %s", err.Error())
-				}
-				for _, obj := range page.Contents {
-					if obj.Key == nil || !strings.HasSuffix(*obj.Key, "/meta.json") {
-						continue
-					}
-					// Load job metadata
+		for range numWorkers {
+			wg.Add(1)
+			go func() {
+				defer wg.Done()
+				for metaKey := range metaKeys {
 					result, err := s3a.client.GetObject(ctx, &s3.GetObjectInput{
 						Bucket: aws.String(s3a.bucket),
-						Key:    obj.Key,
+						Key:    aws.String(metaKey),
 					})
 					if err != nil {
 						cclog.Errorf("S3Archive Iter() > GetObject meta error: %v", err)
@@ -867,8 +857,34 @@ func (s3a *S3Archive) Iter(loadMetricData bool) <-chan JobContainer {
 						ch <- JobContainer{Meta: job, Data: nil}
 					}
 				}
+			}()
+		}
+		for _, cluster := range s3a.clusters {
+			prefix := cluster + "/"
+			paginator := s3.NewListObjectsV2Paginator(s3a.client, &s3.ListObjectsV2Input{
+				Bucket: aws.String(s3a.bucket),
+				Prefix: aws.String(prefix),
+			})
+			for paginator.HasMorePages() {
+				page, err := paginator.NextPage(ctx)
+				if err != nil {
+					cclog.Fatalf("S3Archive Iter() > list error: %s", err.Error())
+				}
+				for _, obj := range page.Contents {
+					if obj.Key == nil || !strings.HasSuffix(*obj.Key, "/meta.json") {
+						continue
+					}
+					metaKeys <- *obj.Key
+				}
+			}
+		}
+		close(metaKeys)
+		wg.Wait()
 	}()
 	return ch
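Note the shutdown ordering on the new path: the listing runs in the same goroutine that spawned the workers, so `close(metaKeys)` lets the workers drain and exit, `wg.Wait()` blocks until every `JobContainer` has been sent, and only then does the deferred `close(ch)` fire. A hypothetical caller (`ar` and `process` are stand-ins, not names from this repository) can therefore keep treating the result as a plain stream:

	// Drains the channel returned by Iter; the loop terminates only after
	// close(metaKeys), wg.Wait(), and the deferred close(ch) have all run.
	for container := range ar.Iter(false) {
		process(container.Meta) // stand-in for the caller's per-job logic
	}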