cc-metric-collector/collectors/slurmJobDetector.go
2024-01-05 17:24:06 +01:00

758 lines
22 KiB
Go

package collectors
import (
"encoding/json"
"fmt"
"os"
osuser "os/user"
filepath "path/filepath"
"strconv"
"strings"
"sync"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
)
type SlurmJobMetadata struct {
UID uint64 `json:"uid"`
JobId uint64 `json:"jobid"`
Timestamp uint64 `json:"timestamp"`
Status string `json:"status"`
Step string `json:"step,omitempty"`
Cpus []int `json:"cpus,omitempty"`
Memories []int `json:"memories,omitempty"`
MemoryLimitHard uint64 `json:"memory_limit_hard,omitempty"`
MemoryLimitSoft uint64 `json:"memory_limit_soft,omitempty"`
Devices []string `json:"devices,omitempty"`
}
type SlurmJobMetrics struct {
MemoryUsage int64
MaxMemoryUsage int64
LimitMemoryUsage int64
CpuUsageUser int64
CpuUsageSys int64
}
type SlurmJobStepData struct {
Metrics SlurmJobMetrics
Step string
}
type SlurmJobData struct {
Metrics SlurmJobMetrics
Steps []SlurmJobStepData
}
// These are the fields we read from the JSON configuration
type SlurmJobDetectorConfig struct {
Interval string `json:"interval"`
SendJobEvents bool `json:"send_job_events,omitempty"`
SendStepEvents bool `json:"send_step_events,omitempty"`
SendJobMetrics bool `json:"send_job_metrics,omitempty"`
SendStepMetrics bool `json:"send_step_metrics,omitempty"`
ExcludeUsers []string `json:"exclude_users,omitempty"`
BaseDirectory string `json:"sysfs_base,omitempty"`
}
// This contains all variables we need during execution and the variables
// defined by metricCollector (name, init, ...)
type SlurmJobDetector struct {
metricCollector
config SlurmJobDetectorConfig // the configuration structure
meta map[string]string // default meta information
tags map[string]string // default tags
//jobs map[string]map[string]SlurmJobData
interval time.Duration // the interval parsed from configuration
ticker *time.Ticker // own timer
output chan lp.CCMetric // own internal output channel
wg sync.WaitGroup // sync group for management
done chan bool // channel for management
files map[string]struct{}
}
const default_base_dir = "/sys/fs/cgroup"
var cpuacct_base = fmt.Sprintf("%s/cpuacct/slurm", default_base_dir)
var memory_base = fmt.Sprintf("%s/memory/slurm", default_base_dir)
var cpuset_base = fmt.Sprintf("%s/cpuset/slurm", default_base_dir)
var devices_base = fmt.Sprintf("%s/devices/slurm", default_base_dir)
func getSlurmJobs() []string {
out := make([]string, 0)
globpattern := filepath.Join(cpuacct_base, "uid_[0-9]*", "job_[0-9]*")
dirs, err := filepath.Glob(globpattern)
if err == nil {
for _, d := range dirs {
r, err := filepath.Rel(cpuacct_base, d)
if err == nil {
out = append(out, r)
}
}
}
return out
}
func getSlurmSteps() []string {
out := make([]string, 0)
globpattern := filepath.Join(cpuacct_base, "uid_[0-9]*", "job_[0-9]*", "step_*")
dirs, err := filepath.Glob(globpattern)
if err == nil {
out = append(out, dirs...)
}
return out
}
func getId(prefix, str string) (uint64, error) {
var s string
format := prefix + "_%s"
_, err := fmt.Sscanf(str, format, &s)
if err != nil {
return 0, err
}
id, err := strconv.ParseInt(s, 0, 64)
if err != nil {
return 0, err
}
return uint64(id), nil
}
func ExpandList(strlist string) []int {
out := make([]int, 0)
level1 := strings.Split(strlist, ",")
if len(level1) > 0 {
for _, entry := range level1 {
var s, e int
_, err := fmt.Sscanf(entry, "%d-%d", &s, &e)
if err == nil {
if s < e {
for i := s; i <= e; i++ {
out = append(out, i)
}
} else {
for i := e; i <= s; i-- {
out = append(out, i)
}
}
} else {
_, err := fmt.Sscanf(entry, "%d", &s)
if err == nil {
out = append(out, s)
}
}
}
}
return out
}
func ParseDevices(devlist string) []string {
out := make([]string, 0)
return out
}
// Init initializes the sample collector
// Called once by the collector manager
// All tags, meta data tags and metrics that do not change over the runtime should be set here
func (m *SlurmJobDetector) Init(config json.RawMessage) error {
var err error = nil
// Always set the name early in Init() to use it in cclog.Component* functions
m.name = "SlurmJobDetector"
// This is for later use, also call it early
m.setup()
// Tell whether the collector should be run in parallel with others (reading files, ...)
// or it should be run serially, mostly for collectors actually doing measurements
// because they should not measure the execution of the other collectors
m.parallel = true
// Define meta information sent with each metric
// (Can also be dynamic or this is the basic set with extension through AddMeta())
m.meta = map[string]string{"source": m.name, "group": "SLURM"}
// Define tags sent with each metric
// The 'type' tag is always needed, it defines the granularity of the metric
// node -> whole system
// socket -> CPU socket (requires socket ID as 'type-id' tag)
// die -> CPU die (requires CPU die ID as 'type-id' tag)
// memoryDomain -> NUMA domain (requires NUMA domain ID as 'type-id' tag)
// llc -> Last level cache (requires last level cache ID as 'type-id' tag)
// core -> single CPU core that may consist of multiple hardware threads (SMT) (requires core ID as 'type-id' tag)
// hwthtread -> single CPU hardware thread (requires hardware thread ID as 'type-id' tag)
// accelerator -> A accelerator device like GPU or FPGA (requires an accelerator ID as 'type-id' tag)
m.tags = map[string]string{"type": "node"}
// Read in the JSON configuration
m.config.SendJobEvents = false
m.config.SendJobMetrics = false
m.config.SendStepEvents = false
m.config.SendStepMetrics = false
m.config.BaseDirectory = default_base_dir
if len(config) > 0 {
err = json.Unmarshal(config, &m.config)
if err != nil {
cclog.ComponentError(m.name, "Error reading config:", err.Error())
return err
}
}
// Parse the read interval duration
m.interval, err = time.ParseDuration(m.config.Interval)
if err != nil {
cclog.ComponentError(m.name, "Error parsing interval:", err.Error())
return err
}
// Storage for output channel
m.output = nil
// Management channel for the timer function.
m.done = make(chan bool)
// Create the own ticker
m.ticker = time.NewTicker(m.interval)
// Create space for storing files
m.files = make(map[string]struct{})
cpuacct_base = fmt.Sprintf("%s/cpuacct/slurm", m.config.BaseDirectory)
memory_base = fmt.Sprintf("%s/memory/slurm", m.config.BaseDirectory)
cpuset_base = fmt.Sprintf("%s/cpuset/slurm", m.config.BaseDirectory)
devices_base = fmt.Sprintf("%s/devices/slurm", m.config.BaseDirectory)
cclog.ComponentDebug(m.name, "Using base directory", m.config.BaseDirectory)
// Start the timer loop with return functionality by sending 'true' to the done channel
m.wg.Add(1)
go func() {
for {
select {
case <-m.done:
// Exit the timer loop
cclog.ComponentDebug(m.name, "Closing...")
m.wg.Done()
return
case timestamp := <-m.ticker.C:
// This is executed every timer tick but we have to wait until the first
// Read() to get the output channel
cclog.ComponentDebug(m.name, "Checking events")
if m.output != nil {
m.CheckEvents(timestamp)
}
}
}
}()
// Set this flag only if everything is initialized properly, all required files exist, ...
m.init = true
return err
}
func ReadJobData(userdir, jobdir string) (SlurmJobMetrics, error) {
jobdata := SlurmJobMetrics{
MemoryUsage: 0,
MaxMemoryUsage: 0,
LimitMemoryUsage: 0,
CpuUsageUser: 0,
CpuUsageSys: 0,
}
job_mem := filepath.Join(memory_base, userdir, jobdir, "memory.usage_in_bytes")
mem_usage, err := os.ReadFile(job_mem)
if err == nil {
x, err := strconv.ParseInt(string(mem_usage), 0, 64)
if err == nil {
jobdata.MemoryUsage = x
}
}
job_mem = filepath.Join(memory_base, userdir, jobdir, "memory.max_usage_in_bytes")
mem_usage, err = os.ReadFile(job_mem)
if err == nil {
x, err := strconv.ParseInt(string(mem_usage), 0, 64)
if err == nil {
jobdata.MaxMemoryUsage = x
}
}
job_cpu := filepath.Join(cpuacct_base, userdir, jobdir, "cpuacct.usage")
total_usage, err := os.ReadFile(job_cpu)
if err == nil {
tu, err := strconv.ParseInt(string(total_usage), 0, 64)
if err == nil {
job_cpu = filepath.Join(cpuacct_base, userdir, jobdir, "cpuacct.usage_user")
user_usage, err := os.ReadFile(job_cpu)
if err == nil {
uu, err := strconv.ParseInt(string(user_usage), 0, 64)
if err == nil {
jobdata.CpuUsageUser = int64(uu/tu) * 100
jobdata.CpuUsageSys = 100 - jobdata.CpuUsageUser
}
}
}
}
return jobdata, nil
}
func ReadJobStepData(userdir, jobdir, stepdir string) (SlurmJobMetrics, error) {
jobdata := SlurmJobMetrics{
MemoryUsage: 0,
MaxMemoryUsage: 0,
LimitMemoryUsage: 0,
CpuUsageUser: 0,
CpuUsageSys: 0,
}
job_mem := filepath.Join(memory_base, userdir, jobdir, stepdir, "memory.usage_in_bytes")
mem_usage, err := os.ReadFile(job_mem)
if err == nil {
x, err := strconv.ParseInt(string(mem_usage), 0, 64)
if err == nil {
jobdata.MemoryUsage = x
}
}
job_mem = filepath.Join(memory_base, userdir, jobdir, stepdir, "memory.max_usage_in_bytes")
mem_usage, err = os.ReadFile(job_mem)
if err == nil {
x, err := strconv.ParseInt(string(mem_usage), 0, 64)
if err == nil {
jobdata.MaxMemoryUsage = x
}
}
job_cpu := filepath.Join(cpuacct_base, userdir, jobdir, stepdir, "cpuacct.usage")
total_usage, err := os.ReadFile(job_cpu)
if err == nil {
tu, err := strconv.ParseInt(string(total_usage), 0, 64)
if err == nil {
job_cpu = filepath.Join(cpuacct_base, userdir, jobdir, stepdir, "cpuacct.usage_user")
user_usage, err := os.ReadFile(job_cpu)
if err == nil {
uu, err := strconv.ParseInt(string(user_usage), 0, 64)
if err == nil {
jobdata.CpuUsageUser = int64(uu/tu) * 100
jobdata.CpuUsageSys = 100 - jobdata.CpuUsageUser
}
}
}
}
return jobdata, nil
}
func pathinfo(path string) (uint64, uint64, string, error) {
uid := uint64(0)
jobid := uint64(0)
step := ""
parts := strings.Split(path, "/")
for i := len(parts) - 1; i >= 0; i-- {
p := parts[i]
if strings.HasPrefix(p, "uid_") {
u, err := getId("uid", p)
if err == nil {
uid = u
}
} else if strings.HasPrefix(p, "job_") {
j, err := getId("job", p)
if err == nil {
jobid = j
}
} else if strings.HasPrefix(p, "step_") {
step = p[5:]
}
}
return uid, jobid, step, nil
}
func (m *SlurmJobDetector) CheckEvents(timestamp time.Time) {
globPattern := filepath.Join(cpuacct_base, "uid_[0-9]*", "job_[0-9]*")
if m.config.SendStepEvents {
globPattern = filepath.Join(cpuacct_base, "uid_[0-9]*", "job_[0-9]*", "step_*")
}
dirs, err := filepath.Glob(globPattern)
if err != nil {
cclog.ComponentError(m.name, "Cannot glob with pattern", globPattern)
return
}
for _, d := range dirs {
if _, ok := m.files[d]; !ok {
uid := uint64(0)
jobid := uint64(0)
step := ""
uid, jobid, step, err = pathinfo(d)
if err == nil {
if len(step) == 0 {
cclog.ComponentDebug(m.name, "New job for UID ", uid, " and JOBID ", jobid)
m.NewJobEvent(uint64(uid), uint64(jobid), timestamp, m.output)
} else {
cclog.ComponentDebug(m.name, "New job step for UID ", uid, ", JOBID ", jobid, " and step ", step)
m.NewJobStepEvent(uint64(uid), uint64(jobid), step, timestamp, m.output)
}
}
m.files[d] = struct{}{}
}
}
for d := range m.files {
if _, ok := stringArrayContains(dirs, d); !ok {
uid := uint64(0)
jobid := uint64(0)
step := ""
uid, jobid, step, err = pathinfo(d)
if err == nil {
if len(step) == 0 {
cclog.ComponentDebug(m.name, "Vanished job for UID ", uid, " and JOBID ", jobid)
m.EndJobEvent(uint64(uid), uint64(jobid), timestamp, m.output)
} else {
cclog.ComponentDebug(m.name, "Vanished job step for UID ", uid, ", JOBID ", jobid, " and step ", step)
m.EndJobStepEvent(uint64(uid), uint64(jobid), step, timestamp, m.output)
}
}
delete(m.files, d)
}
}
}
func (m *SlurmJobDetector) NewJobEvent(uid, jobid uint64, timestamp time.Time, output chan lp.CCMetric) {
jobtags := map[string]string{
"type": "job",
"type-id": fmt.Sprintf("%d", jobid),
}
userdir := fmt.Sprintf("uid_%d", uid)
jobdir := fmt.Sprintf("job_%d", jobid)
// Fill job JSON with data from cgroup
var md SlurmJobMetadata
job_cpus_file := filepath.Join(cpuset_base, userdir, jobdir, "cpuset.effective_cpus")
cclog.ComponentDebug(m.name, job_cpus_file)
job_cpus, err := os.ReadFile(job_cpus_file)
if err == nil {
cclog.ComponentDebug(m.name, string(job_cpus))
md.Cpus = ExpandList(string(job_cpus))
}
job_mems_file := filepath.Join(cpuset_base, userdir, jobdir, "cpuset.effective_mems")
job_mems, err := os.ReadFile(job_mems_file)
if err == nil {
md.Memories = ExpandList(string(job_mems))
}
job_devs_file := filepath.Join(devices_base, userdir, jobdir, "devices.list")
job_devs, err := os.ReadFile(job_devs_file)
if err == nil {
md.Devices = ParseDevices(string(job_devs))
}
job_mem_limit_hard_file := filepath.Join(memory_base, userdir, jobdir, "memory.limit_in_bytes")
job_mem_limit_hard, err := os.ReadFile(job_mem_limit_hard_file)
if err == nil {
x, err := strconv.ParseInt(string(job_mem_limit_hard), 0, 64)
if err == nil {
md.MemoryLimitHard = uint64(x)
}
}
job_mem_limit_soft_file := filepath.Join(memory_base, userdir, jobdir, "memory.soft_limit_in_bytes")
job_mem_limit_soft, err := os.ReadFile(job_mem_limit_soft_file)
if err == nil {
x, err := strconv.ParseInt(string(job_mem_limit_soft), 0, 64)
if err == nil {
md.MemoryLimitSoft = uint64(x)
}
}
md.UID = uid
md.JobId = jobid
md.Timestamp = uint64(timestamp.Unix())
md.Status = "start"
jobjson, err := json.Marshal(md)
if err == nil {
y, err := lp.New("slurm", jobtags, m.meta, map[string]interface{}{"value": string(jobjson)}, timestamp)
if err == nil {
suid := fmt.Sprintf("%d", uid)
y.AddMeta("uid", suid)
uname, err := osuser.LookupId(suid)
if err == nil {
y.AddMeta("username", uname.Username)
}
y.AddMeta("metric_type", "event")
output <- y
}
}
}
func (m *SlurmJobDetector) NewJobStepEvent(uid, jobid uint64, step string, timestamp time.Time, output chan lp.CCMetric) {
jobtags := map[string]string{
"type": "job",
"type-id": fmt.Sprintf("%d", jobid),
"stype": "step",
"stype-id": step,
}
userdir := fmt.Sprintf("uid_%d", uid)
jobdir := fmt.Sprintf("job_%d", jobid)
stepdir := fmt.Sprintf("step_%s", step)
// Fill job JSON with data from cgroup
var md SlurmJobMetadata
job_cpus_file := filepath.Join(cpuset_base, userdir, jobdir, stepdir, "cpuset.effective_cpus")
job_cpus, err := os.ReadFile(job_cpus_file)
if err == nil {
md.Cpus = ExpandList(string(job_cpus))
}
job_mems_file := filepath.Join(cpuset_base, userdir, jobdir, stepdir, "cpuset.effective_mems")
job_mems, err := os.ReadFile(job_mems_file)
if err == nil {
md.Memories = ExpandList(string(job_mems))
}
job_devs_file := filepath.Join(devices_base, userdir, jobdir, stepdir, "devices.list")
job_devs, err := os.ReadFile(job_devs_file)
if err == nil {
md.Devices = ParseDevices(string(job_devs))
}
job_mem_limit_hard_file := filepath.Join(memory_base, userdir, jobdir, stepdir, "memory.limit_in_bytes")
job_mem_limit_hard, err := os.ReadFile(job_mem_limit_hard_file)
if err == nil {
x, err := strconv.ParseInt(string(job_mem_limit_hard), 0, 64)
if err == nil {
md.MemoryLimitHard = uint64(x)
}
}
job_mem_limit_soft_file := filepath.Join(memory_base, userdir, jobdir, stepdir, "memory.soft_limit_in_bytes")
job_mem_limit_soft, err := os.ReadFile(job_mem_limit_soft_file)
if err == nil {
x, err := strconv.ParseInt(string(job_mem_limit_soft), 0, 64)
if err == nil {
md.MemoryLimitSoft = uint64(x)
}
}
md.UID = uid
md.JobId = jobid
md.Step = step
md.Timestamp = uint64(timestamp.Unix())
md.Status = "start"
jobjson, err := json.Marshal(md)
if err == nil {
y, err := lp.New("slurm", jobtags, m.meta, map[string]interface{}{"value": string(jobjson)}, timestamp)
if err == nil {
suid := fmt.Sprintf("%d", uid)
y.AddMeta("uid", suid)
uname, err := osuser.LookupId(suid)
if err == nil {
y.AddMeta("username", uname.Username)
}
y.AddMeta("metric_type", "event")
output <- y
}
}
}
func (m *SlurmJobDetector) EndJobEvent(uid, jobid uint64, timestamp time.Time, output chan lp.CCMetric) {
jobtags := map[string]string{
"type": "job",
"type-id": fmt.Sprintf("%d", jobid),
}
// Fill job JSON with data from cgroup
var md SlurmJobMetadata
md.UID = uid
md.JobId = jobid
md.Timestamp = uint64(timestamp.Unix())
md.Status = "end"
jobjson, err := json.Marshal(md)
if err == nil {
y, err := lp.New("slurm", jobtags, m.meta, map[string]interface{}{"value": string(jobjson)}, timestamp)
if err == nil {
suid := fmt.Sprintf("%d", uid)
y.AddMeta("uid", suid)
uname, err := osuser.LookupId(suid)
if err == nil {
y.AddMeta("username", uname.Username)
}
y.AddMeta("metric_type", "event")
output <- y
}
}
}
func (m *SlurmJobDetector) EndJobStepEvent(uid, jobid uint64, step string, timestamp time.Time, output chan lp.CCMetric) {
jobtags := map[string]string{
"type": "job",
"type-id": fmt.Sprintf("%d", jobid),
"stype": "step",
"stype-id": step,
}
// Fill job JSON with data from cgroup
var md SlurmJobMetadata
md.UID = uid
md.JobId = jobid
md.Step = step
md.Timestamp = uint64(timestamp.Unix())
md.Status = "end"
jobjson, err := json.Marshal(md)
if err == nil {
y, err := lp.New("slurm", jobtags, m.meta, map[string]interface{}{"value": string(jobjson)}, timestamp)
if err == nil {
suid := fmt.Sprintf("%d", uid)
y.AddMeta("uid", suid)
uname, err := osuser.LookupId(suid)
if err == nil {
y.AddMeta("username", uname.Username)
}
y.AddMeta("metric_type", "event")
output <- y
}
}
}
func (m *SlurmJobDetector) SendMetrics(jobtags map[string]string, jobmetrics SlurmJobMetrics, timestamp time.Time, output chan lp.CCMetric) {
y, err := lp.New("mem_used", jobtags, m.meta, map[string]interface{}{"value": jobmetrics.MemoryUsage}, timestamp)
if err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
y, err = lp.New("max_mem_used", jobtags, m.meta, map[string]interface{}{"value": jobmetrics.MaxMemoryUsage}, timestamp)
if err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
y, err = lp.New("user_cpu", jobtags, m.meta, map[string]interface{}{"value": jobmetrics.CpuUsageUser}, timestamp)
if err == nil {
y.AddMeta("unit", "%")
output <- y
}
y, err = lp.New("user_sys", jobtags, m.meta, map[string]interface{}{"value": jobmetrics.CpuUsageSys}, timestamp)
if err == nil {
y.AddMeta("unit", "%")
output <- y
}
}
// Read collects all metrics belonging to the sample collector
// and sends them through the output channel to the collector manager
func (m *SlurmJobDetector) Read(interval time.Duration, output chan lp.CCMetric) {
// Create a sample metric
timestamp := time.Now()
// Capture output channel
m.output = output
udirs, err := filepath.Glob(filepath.Join(cpuacct_base, "uid_[0-9]*"))
if err != nil {
return
}
for _, ud := range udirs {
jdirs, err := filepath.Glob(filepath.Join(ud, "job_[0-9]*"))
if err != nil {
continue
}
uKey := filepath.Base(ud)
for _, jd := range jdirs {
jKey := filepath.Base(jd)
jobid, err := getId("job", jKey)
if err != nil {
continue
}
jobmetrics, err := ReadJobData(uKey, jKey)
if err != nil {
jobtags := map[string]string{
"type": "job",
"type-id": fmt.Sprintf("%d", jobid),
}
m.SendMetrics(jobtags, jobmetrics, timestamp, output)
}
if m.config.SendStepMetrics {
sdirs, err := filepath.Glob(filepath.Join(jd, "step_*"))
if err != nil {
continue
}
for _, sd := range sdirs {
sKey := filepath.Base(sd)
stepmetrics, err := ReadJobStepData(uKey, jKey, sKey)
if err != nil {
continue
}
var stepname string
_, err = fmt.Sscanf(sKey, "step_%s", &stepname)
if err == nil {
jobtags := map[string]string{
"type": "job",
"type-id": fmt.Sprintf("%d", jobid),
"stype": "step",
"stype-id": stepname,
}
m.SendMetrics(jobtags, stepmetrics, timestamp, output)
}
}
}
}
}
// uid_pattern := "uid_[0-9]*"
// job_pattern := "job_[0-9]*"
// //step_pattern := "step_*"
// globPattern := filepath.Join(cpuacct_base, uid_pattern)
// uidDirs, err := filepath.Glob(globPattern)
// if err != nil {
// return
// }
// for _, udir := range uidDirs {
// uKey := filepath.Base(udir)
// if _, ok := m.jobs[uKey]; !ok {
// m.jobs[uKey] = make(map[string]SlurmJobData)
// }
// uid, _ := getId("uid", uKey)
// globPattern = filepath.Join(cpuacct_base, uKey, job_pattern)
// jobDirs, err := filepath.Glob(globPattern)
// if err != nil {
// continue
// }
// for _, jdir := range jobDirs {
// jKey := filepath.Base(jdir)
// jobid, _ := getId("job", jKey)
// if _, ok := m.jobs[uKey][jKey]; !ok {
// var steps []SlurmJobStepData = nil
// if m.config.SendStepEvents || m.config.SendStepMetrics {
// steps = make([]SlurmJobStepData, 0)
// }
// m.jobs[uKey][jKey] = SlurmJobData{
// Metrics: SlurmJobMetrics{
// MemoryUsage: 0,
// MaxMemoryUsage: 0,
// LimitMemoryUsage: 0,
// CpuUsageUser: 0,
// CpuUsageSys: 0,
// },
// Steps: steps,
// }
// m.NewJobEvent(uid, jobid, timestamp, output)
// }
// jdata := m.jobs[uKey][jKey]
// jobmetrics, err := ReadJobData(uKey, jKey)
// if err == nil {
// jdata.Metrics = jobmetrics
// m.SendMetrics(jobid, jobmetrics, timestamp, output)
// }
// m.jobs[uKey][jKey] = jdata
// }
// }
// for uKey, udata := range m.jobs {
// uid, _ := getId("uid", uKey)
// for jKey := range udata {
// jobid, _ := getId("job", jKey)
// p := filepath.Join(cpuset_base, uKey, jKey)
// if _, err := os.Stat(p); err != nil {
// m.EndJobEvent(uid, jobid, timestamp, output)
// delete(udata, jKey)
// }
// }
// p := filepath.Join(cpuset_base, uKey)
// if _, err := os.Stat(p); err != nil {
// delete(udata, uKey)
// }
// }
}
// Close metric collector: close network connection, close files, close libraries, ...
// Called once by the collector manager
func (m *SlurmJobDetector) Close() {
m.done <- true
m.wg.Wait()
// Unset flag
m.init = false
}