2024-08-07 16:09:40 +02:00
|
|
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
|
|
|
// All rights reserved.
|
|
|
|
// Use of this source code is governed by a MIT-style
|
|
|
|
// license that can be found in the LICENSE file.
|
|
|
|
package natsMessenger
|
|
|
|
|
|
|
|
import (
|
2024-08-12 09:03:53 +02:00
|
|
|
"database/sql"
|
2024-08-07 16:09:40 +02:00
|
|
|
"encoding/json"
|
2024-08-12 09:03:53 +02:00
|
|
|
"errors"
|
2024-08-07 16:09:40 +02:00
|
|
|
"fmt"
|
|
|
|
"time"
|
|
|
|
|
2024-08-12 09:03:53 +02:00
|
|
|
"github.com/ClusterCockpit/cc-backend/internal/importer"
|
|
|
|
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
2024-08-07 16:09:40 +02:00
|
|
|
"github.com/ClusterCockpit/cc-backend/pkg/log"
|
|
|
|
"github.com/ClusterCockpit/cc-backend/pkg/schema"
|
|
|
|
"github.com/nats-io/nats-server/v2/server"
|
|
|
|
"github.com/nats-io/nats.go"
|
|
|
|
)
|
|
|
|
|
|
|
|
// Authentication *auth.Authentication
|
|
|
|
type NatsMessenger struct {
|
|
|
|
Server *server.Server
|
|
|
|
Connection *nats.Conn
|
|
|
|
Subscriptions []*nats.Subscription
|
2024-08-12 09:03:53 +02:00
|
|
|
JobRepository *repository.JobRepository
|
2024-08-07 16:09:40 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
func New(config *schema.NatsConfig) (nm *NatsMessenger, err error) {
|
|
|
|
return SetupNatsMessenger(config)
|
|
|
|
}
|
|
|
|
|
|
|
|
// DevNatsMessage is the minimal development payload: a single free-form
// string transported under the JSON key "content".
type DevNatsMessage struct {
	// Content is the raw message text.
	Content string `json:"content"`
}
|
|
|
|
|
2024-08-12 09:03:53 +02:00
|
|
|
// StartJobNatsResponse model: the reply sent after a job start request was
// accepted.
type StartJobNatsResponse struct {
	// DBID is the database ID of the new job.
	DBID int64 `json:"id"`
}
|
|
|
|
|
2024-08-12 09:03:53 +02:00
|
|
|
// StopJobNatsRequest model
|
|
|
|
type StopJobNatsRequest struct {
|
2024-08-07 16:09:40 +02:00
|
|
|
JobId *int64 `json:"jobId" example:"123000"`
|
|
|
|
Cluster *string `json:"cluster" example:"fritz"`
|
|
|
|
StartTime *int64 `json:"startTime" example:"1649723812"`
|
|
|
|
State schema.JobState `json:"jobState" validate:"required" example:"completed"`
|
|
|
|
StopTime int64 `json:"stopTime" validate:"required" example:"1649763839"`
|
|
|
|
}
|
|
|
|
|
2024-08-12 09:03:53 +02:00
|
|
|
// DeleteJobNatsRequest model: identifies the job that should be deleted.
type DeleteJobNatsRequest struct {
	// JobId is the cluster job ID of the job.
	JobId *int64 `json:"jobId" validate:"required" example:"123000"`
	// Cluster is the cluster of the job.
	Cluster *string `json:"cluster" example:"fritz"`
	// StartTime is the start time of the job as epoch.
	StartTime *int64 `json:"startTime" example:"1649723812"`
}
|
|
|
|
|
2024-08-12 09:03:53 +02:00
|
|
|
// ReceiveEventNatsRequest model: describes an event that is attached to a
// metric of a job.
type ReceiveEventNatsRequest struct {
	// JobId is the cluster job ID of the job.
	JobId *int64 `json:"jobId" validate:"required" example:"123000"`
	// Cluster is the cluster of the job.
	Cluster *string `json:"cluster" example:"fritz"`
	// StartTime is the start time of the job as epoch.
	StartTime *int64 `json:"startTime" example:"1649723812"`
	// Metric is the event target metric for the job.
	Metric *string `json:"metric" example:"cpu_power"`
	// Timestamp is the event timestamp.
	Timestamp *int64 `json:"timestamp" example:"1649724000"`
	// Event is the event name / type.
	Event *string `json:"event" example:"powercap"`
	// Value is the optional value set for the event, e.g. for a powercap.
	Value *int64 `json:"value,omitempty" example:"150"`
}
|
|
|
|
|
|
|
|
// Check auth and setup listeners to channels
|
|
|
|
|
|
|
|
// ns *server.Server, nc *nats.Conn, subs []*nats.Subscription, err error
|
|
|
|
func SetupNatsMessenger(config *schema.NatsConfig) (nm *NatsMessenger, err error) {
|
|
|
|
// Check if Config present
|
|
|
|
if config == nil {
|
|
|
|
log.Info("No NATS config found: Skip NATS init.")
|
|
|
|
return nil, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Init Raw
|
|
|
|
nmr := NatsMessenger{
|
|
|
|
Server: nil,
|
|
|
|
Connection: nil,
|
|
|
|
Subscriptions: []*nats.Subscription{},
|
2024-08-12 09:03:53 +02:00
|
|
|
JobRepository: repository.GetJobRepository(),
|
2024-08-07 16:09:40 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// Start Nats Server
|
|
|
|
// Note: You can configure things like Host, Port, Authorization, and much more using server.Options.
|
|
|
|
opts := &server.Options{Port: config.Port}
|
|
|
|
nmr.Server, err = server.NewServer(opts)
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
log.Error("nats server error on creation")
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
go nmr.Server.Start()
|
|
|
|
|
2024-08-08 15:42:34 +02:00
|
|
|
if !nmr.Server.ReadyForConnections(3 * time.Second) {
|
2024-08-07 16:09:40 +02:00
|
|
|
log.Error("nats server not ready for connection")
|
|
|
|
return nil, fmt.Errorf("nats server not ready for connection")
|
|
|
|
}
|
|
|
|
|
|
|
|
// Connect
|
|
|
|
var copts []nats.Option
|
|
|
|
nmr.Connection, err = nats.Connect(nmr.Server.ClientURL(), copts...)
|
|
|
|
if nmr.Connection == nil {
|
|
|
|
nmr.Server.Shutdown()
|
|
|
|
log.Error("nats connection could not be established: nats shut down")
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
// Subscribe
|
2024-08-12 09:03:53 +02:00
|
|
|
if err = nmr.setupSubscriptions(); err != nil {
|
|
|
|
log.Error("error when subscribing to channels: nats shut down")
|
|
|
|
nmr.Connection.Close()
|
|
|
|
nmr.Server.Shutdown()
|
2024-08-07 16:09:40 +02:00
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
2024-08-08 15:42:34 +02:00
|
|
|
log.Infof("NATS server and subscriptions on port '%d' established\n", config.Port)
|
2024-08-07 16:09:40 +02:00
|
|
|
return &nmr, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (nm *NatsMessenger) StopNatsMessenger() {
|
|
|
|
for _, sub := range nm.Subscriptions {
|
|
|
|
err := sub.Unsubscribe()
|
|
|
|
if err != nil {
|
|
|
|
log.Errorf("NATS unsubscribe failed: %s", err.Error())
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
nm.Connection.Close()
|
|
|
|
nm.Server.Shutdown()
|
|
|
|
log.Info("NATS connections closed and server shut down")
|
|
|
|
}
|
|
|
|
|
2024-08-12 09:03:53 +02:00
|
|
|
func (nm *NatsMessenger) setupSubscriptions() (err error) {
|
2024-08-07 16:09:40 +02:00
|
|
|
|
2024-08-12 09:03:53 +02:00
|
|
|
if startSub, err := nm.startJobListener(); err != nil {
|
2024-08-08 15:42:34 +02:00
|
|
|
log.Infof("Subscription to 'start-job' failed: %s", err)
|
|
|
|
} else {
|
|
|
|
log.Info("Subscribed to 'start-job'")
|
2024-08-12 09:03:53 +02:00
|
|
|
nm.Subscriptions = append(nm.Subscriptions, startSub)
|
2024-08-08 15:42:34 +02:00
|
|
|
}
|
|
|
|
|
2024-08-12 09:03:53 +02:00
|
|
|
if stopSub, err := nm.stopJobListener(); err != nil {
|
2024-08-08 15:42:34 +02:00
|
|
|
log.Infof("Subscription to 'stop-job' failed: %s", err)
|
|
|
|
} else {
|
|
|
|
log.Info("Subscribed to 'stop-job'")
|
2024-08-12 09:03:53 +02:00
|
|
|
nm.Subscriptions = append(nm.Subscriptions, stopSub)
|
2024-08-08 15:42:34 +02:00
|
|
|
}
|
|
|
|
|
2024-08-12 09:03:53 +02:00
|
|
|
if deleteSub, err := nm.deleteJobListener(); err != nil {
|
2024-08-08 15:42:34 +02:00
|
|
|
log.Infof("Subscription to 'delete-job' failed: %s", err)
|
|
|
|
} else {
|
|
|
|
log.Info("Subscribed to 'delete-job'")
|
2024-08-12 09:03:53 +02:00
|
|
|
nm.Subscriptions = append(nm.Subscriptions, deleteSub)
|
2024-08-08 15:42:34 +02:00
|
|
|
}
|
2024-08-07 16:09:40 +02:00
|
|
|
|
2024-08-12 09:03:53 +02:00
|
|
|
if eventSub, err := nm.jobEventListener(); err != nil {
|
2024-08-08 15:42:34 +02:00
|
|
|
log.Infof("Subscription to 'job-event' failed: %s", err)
|
|
|
|
} else {
|
|
|
|
log.Info("Subscribed to 'job-event'")
|
2024-08-12 09:03:53 +02:00
|
|
|
nm.Subscriptions = append(nm.Subscriptions, eventSub)
|
2024-08-08 15:42:34 +02:00
|
|
|
}
|
|
|
|
|
2024-08-12 09:03:53 +02:00
|
|
|
return err
|
2024-08-08 15:42:34 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// Listeners: Subscribe to specified channels and handle with specific handler functions
|
|
|
|
|
2024-08-12 09:03:53 +02:00
|
|
|
func (nm *NatsMessenger) startJobListener() (sub *nats.Subscription, err error) {
|
|
|
|
return nm.Connection.Subscribe("start-job", func(m *nats.Msg) {
|
|
|
|
req := schema.JobMeta{BaseJob: schema.JobDefaults}
|
2024-08-08 15:42:34 +02:00
|
|
|
if err := json.Unmarshal(m.Data, &req); err != nil {
|
2024-08-12 09:03:53 +02:00
|
|
|
log.Warnf("Error while unmarshaling raw json nats message content on channel start-job: %s", err.Error())
|
|
|
|
m.Respond([]byte("Error while unmarshaling raw json nats message content on channel start-job: " + err.Error()))
|
2024-08-07 16:09:40 +02:00
|
|
|
}
|
|
|
|
|
2024-08-12 09:03:53 +02:00
|
|
|
m.Respond(nm.startJobHandler(req))
|
2024-08-07 16:09:40 +02:00
|
|
|
})
|
|
|
|
}
|
|
|
|
|
2024-08-12 09:03:53 +02:00
|
|
|
func (nm *NatsMessenger) stopJobListener() (sub *nats.Subscription, err error) {
|
|
|
|
return nm.Connection.Subscribe("stop-job", func(m *nats.Msg) {
|
|
|
|
var req StopJobNatsRequest
|
2024-08-08 15:42:34 +02:00
|
|
|
if err := json.Unmarshal(m.Data, &req); err != nil {
|
|
|
|
log.Error("Error while unmarshaling raw json nats message content: stopJob")
|
|
|
|
}
|
|
|
|
|
2024-08-12 09:03:53 +02:00
|
|
|
m.Respond(nm.stopJobHandler(req))
|
2024-08-08 15:42:34 +02:00
|
|
|
})
|
2024-08-07 16:09:40 +02:00
|
|
|
}
|
|
|
|
|
2024-08-12 09:03:53 +02:00
|
|
|
func (nm *NatsMessenger) deleteJobListener() (sub *nats.Subscription, err error) {
|
|
|
|
return nm.Connection.Subscribe("delete-job", func(m *nats.Msg) {
|
2024-08-08 15:42:34 +02:00
|
|
|
var req DevNatsMessage
|
|
|
|
if err := json.Unmarshal(m.Data, &req); err != nil {
|
|
|
|
log.Error("Error while unmarshaling raw json nats message content: deleteJob")
|
|
|
|
}
|
|
|
|
|
2024-08-12 09:03:53 +02:00
|
|
|
if err := nm.deleteJobHandler(req); err != nil {
|
2024-08-08 15:42:34 +02:00
|
|
|
log.Errorf("error: %s", err.Error())
|
|
|
|
}
|
|
|
|
})
|
2024-08-07 16:09:40 +02:00
|
|
|
}
|
|
|
|
|
2024-08-12 09:03:53 +02:00
|
|
|
func (nm *NatsMessenger) jobEventListener() (sub *nats.Subscription, err error) {
|
|
|
|
return nm.Connection.Subscribe("job-event", func(m *nats.Msg) {
|
2024-08-08 15:42:34 +02:00
|
|
|
var req DevNatsMessage
|
|
|
|
if err := json.Unmarshal(m.Data, &req); err != nil {
|
|
|
|
log.Error("Error while unmarshaling raw json nats message content: jobEvent")
|
|
|
|
}
|
|
|
|
|
2024-08-12 09:03:53 +02:00
|
|
|
if err := nm.jobEventHandler(req); err != nil {
|
2024-08-08 15:42:34 +02:00
|
|
|
log.Errorf("error: %s", err.Error())
|
|
|
|
}
|
|
|
|
})
|
2024-08-07 16:09:40 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// Handlers: Take content of message and perform action, e.g. adding job in db
|
|
|
|
|
2024-08-12 09:03:53 +02:00
|
|
|
func (nm *NatsMessenger) startJobHandler(req schema.JobMeta) []byte {
|
|
|
|
if req.State == "" {
|
|
|
|
req.State = schema.JobStateRunning
|
|
|
|
}
|
|
|
|
if err := importer.SanityChecks(&req.BaseJob); err != nil {
|
|
|
|
log.Error(err)
|
|
|
|
return handleErr(err)
|
|
|
|
}
|
|
|
|
|
|
|
|
// // aquire lock to avoid race condition between API calls --> for NATS required?
|
|
|
|
// var unlockOnce sync.Once
|
|
|
|
// api.RepositoryMutex.Lock()
|
|
|
|
// defer unlockOnce.Do(api.RepositoryMutex.Unlock)
|
|
|
|
|
|
|
|
// Check if combination of (job_id, cluster_id, start_time) already exists:
|
|
|
|
jobs, err := nm.JobRepository.FindAll(&req.JobID, &req.Cluster, nil)
|
|
|
|
if err != nil && err != sql.ErrNoRows {
|
|
|
|
log.Errorf("checking for duplicate failed: %s", err)
|
|
|
|
return handleErr(fmt.Errorf("checking for duplicate failed: %w", err))
|
|
|
|
} else if err == nil {
|
|
|
|
for _, job := range jobs {
|
|
|
|
if (req.StartTime - job.StartTimeUnix) < 86400 {
|
|
|
|
log.Errorf("a job with that jobId, cluster and startTime already exists: dbid: %d, jobid: %d", job.ID, job.JobID)
|
|
|
|
return handleErr(fmt.Errorf("a job with that jobId, cluster and startTime already exists: dbid: %d, jobid: %d", job.ID, job.JobID))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// id, err := nm.JobRepository.Start(&req)
|
|
|
|
// if err != nil {
|
|
|
|
// log.Errorf("insert into database failed: %s", err)
|
|
|
|
// return handleErr(fmt.Errorf("insert into database failed: %w", err))
|
|
|
|
// }
|
|
|
|
|
|
|
|
// // unlock here, adding Tags can be async
|
|
|
|
// unlockOnce.Do(api.RepositoryMutex.Unlock)
|
|
|
|
|
|
|
|
for _, tag := range req.Tags {
|
|
|
|
if _, err := nm.JobRepository.AddTagOrCreate(1337, tag.Type, tag.Name); err != nil {
|
|
|
|
log.Errorf("adding tag to new job %d failed: %s", 1337, err)
|
|
|
|
return handleErr(fmt.Errorf("adding tag to new job %d failed: %w", 1337, err))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
log.Infof("new job (id: %d): cluster=%s, jobId=%d, user=%s, startTime=%d", 1337, req.Cluster, req.JobID, req.User, req.StartTime)
|
|
|
|
|
|
|
|
result, _ := json.Marshal(StartJobNatsResponse{
|
|
|
|
DBID: 1337,
|
|
|
|
})
|
|
|
|
return result
|
2024-08-07 16:09:40 +02:00
|
|
|
}
|
|
|
|
|
2024-08-12 09:03:53 +02:00
|
|
|
func (nm *NatsMessenger) stopJobHandler(req StopJobNatsRequest) []byte {
|
|
|
|
// Fetch job (that will be stopped) from db
|
|
|
|
var job *schema.Job
|
|
|
|
var err error
|
|
|
|
if req.JobId == nil {
|
|
|
|
return handleErr(errors.New("the field 'jobId' is required"))
|
|
|
|
}
|
|
|
|
|
|
|
|
job, err = nm.JobRepository.Find(req.JobId, req.Cluster, req.StartTime)
|
|
|
|
if err != nil {
|
|
|
|
return handleErr(fmt.Errorf("finding job failed: %w", err))
|
|
|
|
}
|
|
|
|
|
|
|
|
// Sanity checks
|
|
|
|
if job == nil || job.StartTime.Unix() >= req.StopTime || job.State != schema.JobStateRunning {
|
|
|
|
return handleErr(errors.New("stopTime must be larger than startTime and only running jobs can be stopped"))
|
|
|
|
}
|
|
|
|
|
|
|
|
if req.State != "" && !req.State.Valid() {
|
|
|
|
return handleErr(fmt.Errorf("invalid job state: %#v", req.State))
|
|
|
|
} else if req.State == "" {
|
|
|
|
req.State = schema.JobStateCompleted
|
|
|
|
}
|
|
|
|
|
|
|
|
// Mark job as stopped in the database (update state and duration)
|
|
|
|
job.Duration = int32(req.StopTime - job.StartTime.Unix())
|
|
|
|
job.State = req.State
|
|
|
|
// if err := nm.JobRepository.Stop(job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil {
|
|
|
|
// return handleErr(fmt.Errorf("marking job as stopped failed: %w", err))
|
|
|
|
// }
|
|
|
|
|
|
|
|
log.Infof("archiving job... (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%s", job.ID, job.Cluster, job.JobID, job.User, job.StartTime)
|
|
|
|
|
|
|
|
// // Send a response (with status OK). This means that erros that happen from here on forward
|
|
|
|
// // can *NOT* be communicated to the client. If reading from a MetricDataRepository or
|
|
|
|
// // writing to the filesystem fails, the client will not know.
|
|
|
|
// rw.Header().Add("Content-Type", "application/json")
|
|
|
|
// rw.WriteHeader(http.StatusOK)
|
|
|
|
// json.NewEncoder(rw).Encode(job)
|
|
|
|
|
|
|
|
// Monitoring is disabled...
|
|
|
|
if job.MonitoringStatus == schema.MonitoringStatusDisabled {
|
|
|
|
return handleErr(fmt.Errorf("monitoring is disabled"))
|
|
|
|
}
|
|
|
|
|
|
|
|
// Trigger async archiving
|
|
|
|
// nm.JobRepository.TriggerArchiving(job)
|
|
|
|
|
|
|
|
result, _ := json.Marshal(job)
|
|
|
|
return result
|
2024-08-07 16:09:40 +02:00
|
|
|
}
|
|
|
|
|
2024-08-12 09:03:53 +02:00
|
|
|
func (nm *NatsMessenger) deleteJobHandler(req DevNatsMessage) (err error) {
|
|
|
|
// Allow via Nats?
|
2024-08-08 15:42:34 +02:00
|
|
|
log.Debugf("CALLED HANDLER FOR deleteJob: %s", req.Content)
|
|
|
|
return nil
|
2024-08-07 16:09:40 +02:00
|
|
|
}
|
|
|
|
|
2024-08-12 09:03:53 +02:00
|
|
|
func (nm *NatsMessenger) jobEventHandler(req DevNatsMessage) (err error) {
|
|
|
|
// Implement from scratch
|
2024-08-08 15:42:34 +02:00
|
|
|
log.Debugf("CALLED HANDLER FOR jobEvent: %s", req.Content)
|
|
|
|
return nil
|
2024-08-07 16:09:40 +02:00
|
|
|
}
|
2024-08-12 09:03:53 +02:00
|
|
|
|
|
|
|
// Helper
|
|
|
|
|
|
|
|
// handleErr turns an error into a reply payload: the error message encoded as
// a JSON string. err must not be nil.
func handleErr(err error) []byte {
	message := err.Error()
	// The marshal error is discarded; the input is always a plain string.
	encoded, _ := json.Marshal(message)
	return encoded
}
|