Add log messages to error events w/o log message, primaryly error level

- "log spam" to be controlled via loglevel flag on startup
This commit is contained in:
Christoph Kluge
2023-01-31 18:28:44 +01:00
parent 25eb3bb481
commit b77bd078e5
22 changed files with 214 additions and 14 deletions

View File

@@ -68,10 +68,12 @@ func scanJob(row interface{ Scan(...interface{}) error }) (*schema.Job, error) {
&job.ID, &job.JobID, &job.User, &job.Project, &job.Cluster, &job.SubCluster, &job.StartTimeUnix, &job.Partition, &job.ArrayJobId,
&job.NumNodes, &job.NumHWThreads, &job.NumAcc, &job.Exclusive, &job.MonitoringStatus, &job.SMT, &job.State,
&job.Duration, &job.Walltime, &job.RawResources /*&job.MetaData*/); err != nil {
log.Error("Error while scanning rows")
return nil, err
}
if err := json.Unmarshal(job.RawResources, &job.Resources); err != nil {
log.Error("Error while unmarhsaling raw resources json")
return nil, err
}
@@ -93,6 +95,7 @@ func (r *JobRepository) FetchMetadata(job *schema.Job) (map[string]string, error
if err := sq.Select("job.meta_data").From("job").Where("job.id = ?", job.ID).
RunWith(r.stmtCache).QueryRow().Scan(&job.RawMetaData); err != nil {
log.Error("Error while scanning for job metadata")
return nil, err
}
@@ -101,6 +104,7 @@ func (r *JobRepository) FetchMetadata(job *schema.Job) (map[string]string, error
}
if err := json.Unmarshal(job.RawMetaData, &job.MetaData); err != nil {
log.Error("Error while unmarshaling raw metadata json")
return nil, err
}
@@ -113,6 +117,7 @@ func (r *JobRepository) UpdateMetadata(job *schema.Job, key, val string) (err er
r.cache.Del(cachekey)
if job.MetaData == nil {
if _, err = r.FetchMetadata(job); err != nil {
log.Errorf("Error while fetching metadata for job, DB ID '%#v'", job.ID)
return err
}
}
@@ -129,10 +134,12 @@ func (r *JobRepository) UpdateMetadata(job *schema.Job, key, val string) (err er
}
if job.RawMetaData, err = json.Marshal(job.MetaData); err != nil {
log.Errorf("Error while marshaling metadata for job, DB ID '%#v'", job.ID)
return err
}
if _, err = sq.Update("job").Set("meta_data", job.RawMetaData).Where("job.id = ?", job.ID).RunWith(r.stmtCache).Exec(); err != nil {
log.Errorf("Error while updating metadata for job, DB ID '%#v'", job.ID)
return err
}
@@ -185,6 +192,7 @@ func (r *JobRepository) FindAll(
rows, err := q.RunWith(r.stmtCache).Query()
if err != nil {
log.Error("Error while running query")
return nil, err
}
@@ -192,6 +200,7 @@ func (r *JobRepository) FindAll(
for rows.Next() {
job, err := scanJob(rows)
if err != nil {
log.Error("Error while scanning rows")
return nil, err
}
jobs = append(jobs, job)
@@ -259,7 +268,7 @@ func (r *JobRepository) DeleteJobsBefore(startTime int64) (int, error) {
err := r.DB.Get(&cnt, qs) //ignore error as it will also occur in delete statement
_, err = r.DB.Exec(`DELETE FROM job WHERE job.start_time < ?`, startTime)
if err != nil {
log.Warnf(" DeleteJobsBefore(%d): error %v", startTime, err)
log.Errorf(" DeleteJobsBefore(%d): error %#v", startTime, err)
} else {
log.Infof("DeleteJobsBefore(%d): Deleted %d jobs", startTime, cnt)
}
@@ -269,7 +278,7 @@ func (r *JobRepository) DeleteJobsBefore(startTime int64) (int, error) {
func (r *JobRepository) DeleteJobById(id int64) error {
_, err := r.DB.Exec(`DELETE FROM job WHERE job.id = ?`, id)
if err != nil {
log.Warnf("DeleteJobById(%d): error %v", id, err)
log.Errorf("DeleteJobById(%d): error %#v", id, err)
} else {
log.Infof("DeleteJobById(%d): Success", id)
}
@@ -293,7 +302,7 @@ func (r *JobRepository) CountGroupedJobs(ctx context.Context, aggreg model.Aggre
count = fmt.Sprintf(`sum(job.num_nodes * (CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END)) as count`, now)
runner = r.DB
default:
log.Notef("CountGroupedJobs() Weight %v unknown.", *weight)
log.Notef("CountGroupedJobs() Weight %#v unknown.", *weight)
}
}
@@ -309,6 +318,7 @@ func (r *JobRepository) CountGroupedJobs(ctx context.Context, aggreg model.Aggre
counts := map[string]int{}
rows, err := q.RunWith(runner).Query()
if err != nil {
log.Error("Error while running query")
return nil, err
}
@@ -316,6 +326,7 @@ func (r *JobRepository) CountGroupedJobs(ctx context.Context, aggreg model.Aggre
var group string
var count int
if err := rows.Scan(&group, &count); err != nil {
log.Error("Error while scanning rows")
return nil, err
}
@@ -359,11 +370,12 @@ func (r *JobRepository) MarkArchived(
case "file_bw":
stmt = stmt.Set("file_bw_avg", stats.Avg)
default:
log.Notef("MarkArchived() Metric %s unknown.", metric)
log.Notef("MarkArchived() Metric '%#v' unknown", metric)
}
}
if _, err := stmt.RunWith(r.stmtCache).Exec(); err != nil {
log.Error("Error while marking job as archived")
return err
}
return nil
@@ -478,6 +490,7 @@ func (r *JobRepository) AllocatedNodes(cluster string) (map[string]map[string]in
Where("job.cluster = ?", cluster).
RunWith(r.stmtCache).Query()
if err != nil {
log.Error("Error while running query")
return nil, err
}
@@ -488,9 +501,11 @@ func (r *JobRepository) AllocatedNodes(cluster string) (map[string]map[string]in
var resources []*schema.Resource
var subcluster string
if err := rows.Scan(&raw, &subcluster); err != nil {
log.Error("Error while scanning rows")
return nil, err
}
if err := json.Unmarshal(raw, &resources); err != nil {
log.Error("Error while unmarshaling raw resources json")
return nil, err
}
@@ -518,16 +533,18 @@ func (r *JobRepository) StopJobsExceedingWalltimeBy(seconds int) error {
Where(fmt.Sprintf("(%d - job.start_time) > (job.walltime + %d)", time.Now().Unix(), seconds)).
RunWith(r.DB).Exec()
if err != nil {
log.Error("Error while stopping jobs exceeding walltime")
return err
}
rowsAffected, err := res.RowsAffected()
if err != nil {
log.Error("Error while fetching affected rows after stopping due to exceeded walltime")
return err
}
if rowsAffected > 0 {
log.Warnf("%d jobs have been marked as failed due to running too long", rowsAffected)
log.Notef("%d jobs have been marked as failed due to running too long", rowsAffected)
}
return nil
}