Massive speedup in job table initialization

Initializing the job table went from roughly 15 minutes on a ramdisk to
about 430 seconds on my SSD (~900,000 jobs inserted).

- Create indexes after inserts so that they do not need to be updated continually
- Use prepared statements for the job insert
- Bundle 200 job inserts into a single transaction
This commit is contained in:
Lou Knauer 2021-10-20 09:30:50 +02:00
parent 22e3e49970
commit 236f51ba9a

View File

@ -4,14 +4,20 @@ import (
"bufio" "bufio"
"database/sql" "database/sql"
"encoding/json" "encoding/json"
"github.com/jmoiron/sqlx" "fmt"
"log"
"os" "os"
"path/filepath" "path/filepath"
"strings" "strings"
"time"
"github.com/jmoiron/sqlx"
) )
func initDB(db *sqlx.DB, archive string) error { func initDB(db *sqlx.DB, archive string) error {
starttime := time.Now()
fmt.Println("Building database...")
// Basic database structure:
_, err := db.Exec(` _, err := db.Exec(`
DROP TABLE IF EXISTS job; DROP TABLE IF EXISTS job;
DROP TABLE IF EXISTS tag; DROP TABLE IF EXISTS tag;
@ -44,10 +50,7 @@ func initDB(db *sqlx.DB, archive string) error {
tag_id INTEGER, tag_id INTEGER,
PRIMARY KEY (job_id, tag_id), PRIMARY KEY (job_id, tag_id),
FOREIGN KEY (job_id) REFERENCES job (id) ON DELETE CASCADE ON UPDATE NO ACTION, FOREIGN KEY (job_id) REFERENCES job (id) ON DELETE CASCADE ON UPDATE NO ACTION,
FOREIGN KEY (tag_id) REFERENCES tag (id) ON DELETE CASCADE ON UPDATE NO ACTION); FOREIGN KEY (tag_id) REFERENCES tag (id) ON DELETE CASCADE ON UPDATE NO ACTION);`)
CREATE INDEX job_by_user ON job (user_id);
CREATE INDEX job_by_starttime ON job (start_time);`)
if err != nil { if err != nil {
return err return err
} }
@ -57,6 +60,15 @@ func initDB(db *sqlx.DB, archive string) error {
return err return err
} }
insertstmt, err := db.Prepare(`INSERT INTO job
(job_id, user_id, project_id, cluster_id, start_time, duration, job_state, num_nodes, node_list, metadata, flops_any_avg, mem_bw_avg, net_bw_avg, file_bw_avg, load_avg)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);`)
if err != nil {
return err
}
var tx *sql.Tx = nil
var i int = 0
tags := make(map[string]int64) tags := make(map[string]int64)
for _, entry0 := range entries0 { for _, entry0 := range entries0 {
entries1, err := os.ReadDir(filepath.Join(archive, entry0.Name())) entries1, err := os.ReadDir(filepath.Join(archive, entry0.Name()))
@ -75,13 +87,45 @@ func initDB(db *sqlx.DB, archive string) error {
} }
for _, entry2 := range entries2 { for _, entry2 := range entries2 {
if err = loadJob(db, tags, filepath.Join(archive, entry0.Name(), entry1.Name(), entry2.Name())); err != nil { // Bundle 200 inserts into one transaction for better performance:
if i%200 == 0 {
if tx != nil {
if err := tx.Commit(); err != nil {
return err return err
} }
} }
tx, err = db.Begin()
if err != nil {
return err
}
insertstmt = tx.Stmt(insertstmt)
fmt.Printf("%d jobs inserted...\r", i)
}
if err = loadJob(tx, insertstmt, tags, filepath.Join(archive, entry0.Name(), entry1.Name(), entry2.Name())); err != nil {
return err
}
i += 1
}
} }
} }
if err := tx.Commit(); err != nil {
return err
}
// Create indexes after inserts so that they do not
// need to be continually updated.
if _, err := db.Exec(`
CREATE INDEX job_by_user ON job (user_id);
CREATE INDEX job_by_starttime ON job (start_time);`); err != nil {
return err
}
fmt.Printf("A total of %d jobs have been registered in %.3f seconds.\n", i, time.Since(starttime).Seconds())
return nil return nil
} }
@ -107,7 +151,7 @@ type JobMetaFile struct {
} `json:"statistics"` } `json:"statistics"`
} }
func loadJob(db *sqlx.DB, tags map[string]int64, path string) error { func loadJob(tx *sql.Tx, stmt *sql.Stmt, tags map[string]int64, path string) error {
f, err := os.Open(filepath.Join(path, "meta.json")) f, err := os.Open(filepath.Join(path, "meta.json"))
if err != nil { if err != nil {
return err return err
@ -125,14 +169,8 @@ func loadJob(db *sqlx.DB, tags map[string]int64, path string) error {
fileBwAvg := loadJobStat(&job, "file_bw") fileBwAvg := loadJobStat(&job, "file_bw")
loadAvg := loadJobStat(&job, "load_one") loadAvg := loadJobStat(&job, "load_one")
res, err := db.Exec(` res, err := stmt.Exec(job.JobId, job.UserId, job.ProjectId, job.ClusterId, job.StartTime, job.Duration, job.JobState,
INSERT INTO job job.NumNodes, strings.Join(job.Nodes, ","), nil, flopsAnyAvg, memBwAvg, netBwAvg, fileBwAvg, loadAvg)
(job_id, user_id, project_id, cluster_id, start_time, duration, job_state, num_nodes, node_list, metadata,
flops_any_avg, mem_bw_avg, net_bw_avg, file_bw_avg, load_avg)
VALUES
(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);`,
job.JobId, job.UserId, job.ProjectId, job.ClusterId, job.StartTime, job.Duration, job.JobState, job.NumNodes, strings.Join(job.Nodes, ","), nil,
flopsAnyAvg, memBwAvg, netBwAvg, fileBwAvg, loadAvg)
if err != nil { if err != nil {
return err return err
} }
@ -142,15 +180,11 @@ func loadJob(db *sqlx.DB, tags map[string]int64, path string) error {
return err return err
} }
if id % 50 == 0 {
log.Printf("Inserting Job (id: %d, jobId: %s, clusterId: %s)\n", id, job.JobId, job.ClusterId)
}
for _, tag := range job.Tags { for _, tag := range job.Tags {
tagstr := tag.Name + ":" + tag.Type tagstr := tag.Name + ":" + tag.Type
tagId, ok := tags[tagstr] tagId, ok := tags[tagstr]
if !ok { if !ok {
res, err := db.Exec(`INSERT INTO tag (tag_name, tag_type) VALUES (?, ?)`, tag.Name, tag.Type) res, err := tx.Exec(`INSERT INTO tag (tag_name, tag_type) VALUES (?, ?)`, tag.Name, tag.Type)
if err != nil { if err != nil {
return err return err
} }
@ -161,7 +195,7 @@ func loadJob(db *sqlx.DB, tags map[string]int64, path string) error {
tags[tagstr] = tagId tags[tagstr] = tagId
} }
if _, err := db.Exec(`INSERT INTO jobtag (job_id, tag_id) VALUES (?, ?)`, id, tagId); err != nil { if _, err := tx.Exec(`INSERT INTO jobtag (job_id, tag_id) VALUES (?, ?)`, id, tagId); err != nil {
return err return err
} }
} }