mirror of
https://gitlab.cs.uni-saarland.de/hpc/cc-condor-sync.git
synced 2024-12-26 05:29:06 +01:00
Don't start on JobStatus chng & stop on R -> IDLE.
This is handling edgecases, where a startd died and after a timeout of 1hr the job is requeued. The IDLE -> RUNNING state then already has all other attributes (like RemoteHost) set, from the last execution. We should wait for `CpusProvisioned`, though, so that all required values are already up-to-date. Additionally, since timed-out jobs are not held, but instead just requeued, stop jobs as well when the state changes from `RUNNING` to `IDLE`.
This commit is contained in:
parent
6128b58cbd
commit
e38e6fa5b9
@ -40,11 +40,8 @@ using namespace std;
|
||||
using namespace nlohmann;
|
||||
|
||||
static const std::unordered_map<int, std::string> jobStateMap = {
|
||||
{RUNNING, "running"},
|
||||
{REMOVED, "cancelled"},
|
||||
{HELD, "failed"},
|
||||
{COMPLETED, "completed"},
|
||||
{SUSPENDED, "stopped"}};
|
||||
{IDLE, "failed"}, {RUNNING, "running"}, {REMOVED, "cancelled"},
|
||||
{HELD, "failed"}, {COMPLETED, "completed"}, {SUSPENDED, "stopped"}};
|
||||
|
||||
void CCSyncPlugin::earlyInitialize() {
|
||||
lock_guard<std::mutex> guard(data_mutex);
|
||||
@ -186,13 +183,19 @@ void CCSyncPlugin::setAttribute(const char *key, const char *_name,
|
||||
for (auto &jobEntry : currentStatus[clusterId]) {
|
||||
int actualProcId = jobEntry.first;
|
||||
currentStatus[clusterId][actualProcId][name] = value;
|
||||
// dprintf(D_FULLDEBUG,"[%d.%d] ", clusterId, actualProcId);
|
||||
// dprintf(D_FULLDEBUG, "[%d.%d] %s: %s\n", clusterId, actualProcId,
|
||||
// name.c_str(), value.c_str());
|
||||
if (actualProcId != -1) {
|
||||
// dprintf(D_FULLDEBUG,"{%d.%d} ", clusterId, actualProcId);
|
||||
if (!initializing) { // HTCondor bug? We should'n be receiving events
|
||||
// before initialization is done. We will ignore
|
||||
// them if it happens.
|
||||
if (name == "CpusProvisioned" || name == "JobStatus")
|
||||
// before initialization is done. We will ignore them if it happens.
|
||||
|
||||
// don't trigger on change to RUNNING, since we want to wait for
|
||||
// RemoteHost and CpusProvisioned to be set as well
|
||||
// If a job is being requeued, this would otherwise potentially
|
||||
// start a job with the wrong host / CPU count..
|
||||
if (name == "CpusProvisioned" ||
|
||||
(name == "JobStatus" && std::stoi(value) != RUNNING))
|
||||
toConsider.push_back({clusterId, procId});
|
||||
}
|
||||
}
|
||||
@ -202,10 +205,18 @@ void CCSyncPlugin::setAttribute(const char *key, const char *_name,
|
||||
// dprintf(D_FULLDEBUG,"Assigning attribute to JobId %d.%d\n", clusterId,
|
||||
// procId);
|
||||
currentStatus[clusterId][procId][name] = value;
|
||||
// dprintf(D_FULLDEBUG, "[%d.%d] %s: %s\n", clusterId, procId, name.c_str(),
|
||||
// value.c_str());
|
||||
if (!initializing) { // HTCondor bug? We should'n be receiving events
|
||||
// before initialization is done. We will ignore them
|
||||
// if it happens.
|
||||
if (name == "CpusProvisioned" || name == "JobStatus")
|
||||
|
||||
// don't trigger on change to RUNNING, since we want to wait for
|
||||
// RemoteHost and CpusProvisioned to be set as well
|
||||
// If a job is being requeued, this would otherwise potentially start a
|
||||
// job with the wrong host / CPU count..
|
||||
if (name == "CpusProvisioned" ||
|
||||
(name == "JobStatus" && std::stoi(value) != RUNNING))
|
||||
toConsider.push_back({clusterId, procId});
|
||||
}
|
||||
}
|
||||
@ -403,7 +414,7 @@ void CCSyncPlugin::endTransaction() {
|
||||
auto jobBatchNameIt = job.find("JobBatchName");
|
||||
auto globalJobId = globalJobIdToInt(jobId);
|
||||
|
||||
auto hostname = getRemoteHost(job["RemoteHost"]);
|
||||
auto hostname = getRemoteHost(removeQuotes(job["RemoteHost"]));
|
||||
json resources = {{"hostname", hostname}};
|
||||
auto accs = getAccelerators(job, hostname);
|
||||
if (!accs.empty()) {
|
||||
@ -430,8 +441,8 @@ void CCSyncPlugin::endTransaction() {
|
||||
sendPostRequest("/api/jobs/start_job/", j.dump());
|
||||
|
||||
} else if (lastState == RUNNING &&
|
||||
(state == REMOVED || state == COMPLETED || state == HELD ||
|
||||
state == SUSPENDED)) {
|
||||
(state == IDLE || state == REMOVED || state == COMPLETED ||
|
||||
state == HELD || state == SUSPENDED)) {
|
||||
|
||||
std::uint64_t startTime = std::stoull(job["JobCurrentStartDate"]),
|
||||
stopTime = std::stoull(job["EnteredCurrentStatus"]);
|
||||
|
Loading…
Reference in New Issue
Block a user