Add documentation for importer

2025-12-04 15:07:09 +01:00
parent f3ea95535b
commit 7cff8bbfd2
6 changed files with 362 additions and 148 deletions


@@ -2,6 +2,15 @@
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// Package importer provides functionality for importing job data into the ClusterCockpit database.
//
// The package supports two primary use cases:
// 1. Bulk database initialization from archived jobs via InitDB()
// 2. Individual job import from file pairs via HandleImportFlag()
//
// Both operations enrich job metadata by calculating footprints and energy metrics
// before persisting to the database.
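//
// A minimal usage sketch (hypothetical caller; assumes configuration,
// repository, and archive are initialized at startup, and that the flag
// value uses the <meta.json>:<data.json> pair format):
//
//	if err := importer.HandleImportFlag("meta.json:data.json"); err != nil {
//		log.Fatalf("job import failed: %v", err)
//	}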
package importer
import (
@@ -22,8 +31,21 @@ const (
setTagQuery = "INSERT INTO jobtag (job_id, tag_id) VALUES (?, ?)"
)
// Delete the tables "job", "tag" and "jobtag" from the database and
// repopulate them using the jobs found in `archive`.
// InitDB reinitializes the job database from archived job data.
//
// This function performs the following operations:
// 1. Flushes existing job, tag, and jobtag tables
// 2. Iterates through all jobs in the archive
// 3. Enriches each job with calculated footprints and energy metrics
// 4. Inserts jobs and tags into the database in batched transactions
//
// Jobs are inserted in batches of 100 to keep transactions small and amortize
// commit overhead. The function continues processing even if individual jobs
// fail, logging each error and reporting a summary of failures at the end.
//
// Returns an error if database initialization, transaction management, or
// critical operations fail. Individual job failures are logged but do not
// stop the overall import process.
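//
// Minimal caller sketch (hypothetical; assumes the repository layer and job
// archive are already initialized):
//
//	if err := importer.InitDB(); err != nil {
//		log.Fatalf("database initialization failed: %v", err)
//	}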
func InitDB() error {
r := repository.GetJobRepository()
if err := r.Flush(); err != nil {
@@ -72,76 +94,7 @@ func InitDB() error {
jobMeta.MonitoringStatus = schema.MonitoringStatusArchivingSuccessful
sc, err := archive.GetSubCluster(jobMeta.Cluster, jobMeta.SubCluster)
if err != nil {
cclog.Errorf("cannot get subcluster: %s", err.Error())
return err
}
jobMeta.Footprint = make(map[string]float64)
for _, fp := range sc.Footprint {
statType := "avg"
if i, err := archive.MetricIndex(sc.MetricConfig, fp); err != nil {
statType = sc.MetricConfig[i].Footprint
}
name := fmt.Sprintf("%s_%s", fp, statType)
jobMeta.Footprint[name] = repository.LoadJobStat(jobMeta, fp, statType)
}
jobMeta.RawFootprint, err = json.Marshal(jobMeta.Footprint)
if err != nil {
cclog.Warn("Error while marshaling job footprint")
return err
}
jobMeta.EnergyFootprint = make(map[string]float64)
// Total Job Energy Outside Loop
totalEnergy := 0.0
for _, fp := range sc.EnergyFootprint {
// Always Init Metric Energy Inside Loop
metricEnergy := 0.0
if i, err := archive.MetricIndex(sc.MetricConfig, fp); err == nil {
// Note: For DB data, calculate and save as kWh
if sc.MetricConfig[i].Energy == "energy" { // this metric has energy as unit (Joules)
cclog.Warnf("Update EnergyFootprint for Job %d and Metric %s on cluster %s: Set to 'energy' in cluster.json: Not implemented, will return 0.0", jobMeta.JobID, jobMeta.Cluster, fp)
// FIXME: Needs sum as stats type
} else if sc.MetricConfig[i].Energy == "power" { // this metric has power as unit (Watt)
// Energy: Power (in Watts) * Time (in Seconds)
// Unit: (W * (s / 3600)) / 1000 = kWh
// Round 2 Digits: round(Energy * 100) / 100
// Here: (All-Node Metric Average * Number of Nodes) * (Job Duration in Seconds / 3600) / 1000
// Note: Shared Jobs handled correctly since "Node Average" is based on partial resources, while "numNodes" factor is 1
rawEnergy := ((repository.LoadJobStat(jobMeta, fp, "avg") * float64(jobMeta.NumNodes)) * (float64(jobMeta.Duration) / 3600.0)) / 1000.0
metricEnergy = math.Round(rawEnergy*100.0) / 100.0
}
} else {
cclog.Warnf("Error while collecting energy metric %s for job, DB ID '%v', return '0.0'", fp, jobMeta.ID)
}
jobMeta.EnergyFootprint[fp] = metricEnergy
totalEnergy += metricEnergy
}
jobMeta.Energy = (math.Round(totalEnergy*100.0) / 100.0)
if jobMeta.RawEnergyFootprint, err = json.Marshal(jobMeta.EnergyFootprint); err != nil {
cclog.Warnf("Error while marshaling energy footprint for job INTO BYTES, DB ID '%v'", jobMeta.ID)
return err
}
jobMeta.RawResources, err = json.Marshal(jobMeta.Resources)
if err != nil {
cclog.Errorf("repository initDB(): %v", err)
errorOccured++
continue
}
jobMeta.RawMetaData, err = json.Marshal(jobMeta.MetaData)
if err != nil {
if err := enrichJobMetadata(jobMeta); err != nil {
cclog.Errorf("repository initDB(): %v", err)
errorOccured++
continue
@@ -163,9 +116,9 @@ func InitDB() error {
for _, tag := range jobMeta.Tags {
tagstr := tag.Name + ":" + tag.Type
tagId, ok := tags[tagstr]
tagID, ok := tags[tagstr]
if !ok {
tagId, err = r.TransactionAdd(t,
tagID, err = r.TransactionAdd(t,
addTagQuery,
tag.Name, tag.Type)
if err != nil {
@@ -173,12 +126,12 @@ func InitDB() error {
errorOccured++
continue
}
tags[tagstr] = tagId
tags[tagstr] = tagID
}
r.TransactionAdd(t,
setTagQuery,
id, tagId)
id, tagID)
}
if err == nil {
@@ -195,7 +148,110 @@ func InitDB() error {
return nil
}
// This function also sets the subcluster if necessary!
// enrichJobMetadata calculates and populates job footprints, energy metrics, and serialized fields.
//
// This function performs the following enrichment operations:
// 1. Calculates job footprint metrics based on the subcluster configuration
// 2. Computes energy footprint and total energy consumption in kWh
// 3. Marshals footprints, resources, and metadata into JSON for database storage
//
// The function expects the job's MonitoringStatus and SubCluster fields to already be set.
// Energy calculations convert power metrics (Watts) to energy (kWh) using the formula:
//
// Energy (kWh) = (avg node power (W) * numNodes * duration (s) / 3600) / 1000
//
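// For example, a 4-node job averaging 250 W per node over a 7200 s runtime
// yields (250 * 4 * 7200 / 3600) / 1000 = 2.0 kWh, rounded to two decimals.
//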
// Returns an error if subcluster retrieval, metric indexing, or JSON marshaling fails.
func enrichJobMetadata(job *schema.Job) error {
sc, err := archive.GetSubCluster(job.Cluster, job.SubCluster)
if err != nil {
cclog.Errorf("cannot get subcluster: %s", err.Error())
return err
}
job.Footprint = make(map[string]float64)
for _, fp := range sc.Footprint {
statType := "avg"
// Use the metric's configured footprint stat type when the metric is found
// in the subcluster config; otherwise keep the "avg" default.
if i, err := archive.MetricIndex(sc.MetricConfig, fp); err == nil {
statType = sc.MetricConfig[i].Footprint
}
name := fmt.Sprintf("%s_%s", fp, statType)
job.Footprint[name] = repository.LoadJobStat(job, fp, statType)
}
job.RawFootprint, err = json.Marshal(job.Footprint)
if err != nil {
cclog.Warn("Error while marshaling job footprint")
return err
}
job.EnergyFootprint = make(map[string]float64)
// Total Job Energy Outside Loop
totalEnergy := 0.0
for _, fp := range sc.EnergyFootprint {
// Always Init Metric Energy Inside Loop
metricEnergy := 0.0
if i, err := archive.MetricIndex(sc.MetricConfig, fp); err == nil {
// Note: For DB data, calculate and save as kWh
switch sc.MetricConfig[i].Energy {
case "energy": // this metric has energy as unit (Joules)
cclog.Warnf("Update EnergyFootprint for Job %d and Metric %s on cluster %s: Set to 'energy' in cluster.json: Not implemented, will return 0.0", job.JobID, job.Cluster, fp)
// FIXME: Needs sum as stats type
case "power": // this metric has power as unit (Watt)
// Energy: Power (in Watts) * Time (in Seconds)
// Unit: (W * (s / 3600)) / 1000 = kWh
// Round 2 Digits: round(Energy * 100) / 100
// Here: (All-Node Metric Average * Number of Nodes) * (Job Duration in Seconds / 3600) / 1000
// Note: Shared Jobs handled correctly since "Node Average" is based on partial resources, while "numNodes" factor is 1
rawEnergy := ((repository.LoadJobStat(job, fp, "avg") * float64(job.NumNodes)) * (float64(job.Duration) / 3600.0)) / 1000.0
metricEnergy = math.Round(rawEnergy*100.0) / 100.0
}
} else {
cclog.Warnf("Error while collecting energy metric %s for job, DB ID '%v', return '0.0'", fp, job.ID)
}
job.EnergyFootprint[fp] = metricEnergy
totalEnergy += metricEnergy
}
job.Energy = (math.Round(totalEnergy*100.0) / 100.0)
if job.RawEnergyFootprint, err = json.Marshal(job.EnergyFootprint); err != nil {
cclog.Warnf("Error while marshaling energy footprint for job INTO BYTES, DB ID '%v'", job.ID)
return err
}
job.RawResources, err = json.Marshal(job.Resources)
if err != nil {
cclog.Warn("Error while marshaling job resources")
return err
}
job.RawMetaData, err = json.Marshal(job.MetaData)
if err != nil {
cclog.Warn("Error while marshaling job metadata")
return err
}
return nil
}
// SanityChecks validates job metadata and ensures cluster/subcluster configuration is valid.
//
// This function performs the following validations:
// 1. Verifies the cluster exists in the archive configuration
// 2. Assigns and validates the subcluster (may modify job.SubCluster)
// 3. Validates job state is a recognized value
// 4. Ensures resources and user fields are populated
// 5. Validates node counts and hardware thread counts are positive
// 6. Verifies the number of resources matches the declared node count
//
// The function may modify the job's SubCluster field if it needs to be assigned.
//
// Returns an error if any validation check fails.
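//
// Typical call site (illustrative sketch; assumes a hypothetical jobs slice of
// type []*schema.Job):
//
//	for _, job := range jobs {
//		if err := importer.SanityChecks(job); err != nil {
//			cclog.Errorf("skipping invalid job: %v", err)
//			continue
//		}
//		// ... proceed with import ...
//	}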
func SanityChecks(job *schema.Job) error {
if c := archive.GetCluster(job.Cluster); c == nil {
return fmt.Errorf("no such cluster: %v", job.Cluster)
@@ -220,6 +276,14 @@ func SanityChecks(job *schema.Job) error {
return nil
}
// checkJobData normalizes metric units in job data based on average values.
//
// NOTE: This function is currently unused and its implementation is incomplete.
// It was intended to normalize byte and file-related metrics to appropriate SI
// prefixes, but the normalization logic is commented out. Consider removing or
// completing this function based on project requirements.
//
// TODO: Either implement the metric normalization or remove this dead code.
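//
// If completed, the prefix normalization would roughly follow this pattern
// (hypothetical sketch only, not the commented-out code):
//
//	// scale an average value into [1, 1000) and track the applied factor;
//	// series values would then be divided by factor and the unit prefix updated
//	factor := 1.0
//	for avg >= 1000.0 {
//		avg /= 1000.0
//		factor *= 1000.0
//	}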
func checkJobData(d *schema.JobData) error {
for _, scopes := range *d {
// var newUnit schema.Unit