From e38e6fa5b9eff8c4a87786ff574783486127308a Mon Sep 17 00:00:00 2001 From: Joachim Meyer Date: Mon, 19 Dec 2022 11:24:24 +0100 Subject: [PATCH] Don't start on JobStatus chng & stop on R -> IDLE. This is handling edgecases, where a startd died and after a timeout of 1hr the job is requeued. The IDLE -> RUNNING state then already has all other attributes (like RemoteHost) set, from the last execution. We should wait for `CpusProvisioned`, though, so that all required values are already up-to-date. Additionally, since timed-out jobs are not held, but instead just requeued, stop jobs as well when the state changes from `RUNNING` to `IDLE`. --- src/htcondor_cc_sync_plugin.cpp | 37 +++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/src/htcondor_cc_sync_plugin.cpp b/src/htcondor_cc_sync_plugin.cpp index 4061e53..9c155dd 100644 --- a/src/htcondor_cc_sync_plugin.cpp +++ b/src/htcondor_cc_sync_plugin.cpp @@ -40,11 +40,8 @@ using namespace std; using namespace nlohmann; static const std::unordered_map jobStateMap = { - {RUNNING, "running"}, - {REMOVED, "cancelled"}, - {HELD, "failed"}, - {COMPLETED, "completed"}, - {SUSPENDED, "stopped"}}; + {IDLE, "failed"}, {RUNNING, "running"}, {REMOVED, "cancelled"}, + {HELD, "failed"}, {COMPLETED, "completed"}, {SUSPENDED, "stopped"}}; void CCSyncPlugin::earlyInitialize() { lock_guard guard(data_mutex); @@ -186,13 +183,19 @@ void CCSyncPlugin::setAttribute(const char *key, const char *_name, for (auto &jobEntry : currentStatus[clusterId]) { int actualProcId = jobEntry.first; currentStatus[clusterId][actualProcId][name] = value; - // dprintf(D_FULLDEBUG,"[%d.%d] ", clusterId, actualProcId); + // dprintf(D_FULLDEBUG, "[%d.%d] %s: %s\n", clusterId, actualProcId, + // name.c_str(), value.c_str()); if (actualProcId != -1) { // dprintf(D_FULLDEBUG,"{%d.%d} ", clusterId, actualProcId); if (!initializing) { // HTCondor bug? We should'n be receiving events - // before initialization is done. We will ignore - // them if it happens. - if (name == "CpusProvisioned" || name == "JobStatus") + // before initialization is done. We will ignore them if it happens. + + // don't trigger on change to RUNNING, since we want to wait for + // RemoteHost and CpusProvisioned to be set as well + // If a job is being requeued, this would otherwise potentially + // start a job with the wrong host / CPU count.. + if (name == "CpusProvisioned" || + (name == "JobStatus" && std::stoi(value) != RUNNING)) toConsider.push_back({clusterId, procId}); } } @@ -202,10 +205,18 @@ void CCSyncPlugin::setAttribute(const char *key, const char *_name, // dprintf(D_FULLDEBUG,"Assigning attribute to JobId %d.%d\n", clusterId, // procId); currentStatus[clusterId][procId][name] = value; + // dprintf(D_FULLDEBUG, "[%d.%d] %s: %s\n", clusterId, procId, name.c_str(), + // value.c_str()); if (!initializing) { // HTCondor bug? We should'n be receiving events // before initialization is done. We will ignore them // if it happens. - if (name == "CpusProvisioned" || name == "JobStatus") + + // don't trigger on change to RUNNING, since we want to wait for + // RemoteHost and CpusProvisioned to be set as well + // If a job is being requeued, this would otherwise potentially start a + // job with the wrong host / CPU count.. + if (name == "CpusProvisioned" || + (name == "JobStatus" && std::stoi(value) != RUNNING)) toConsider.push_back({clusterId, procId}); } } @@ -403,7 +414,7 @@ void CCSyncPlugin::endTransaction() { auto jobBatchNameIt = job.find("JobBatchName"); auto globalJobId = globalJobIdToInt(jobId); - auto hostname = getRemoteHost(job["RemoteHost"]); + auto hostname = getRemoteHost(removeQuotes(job["RemoteHost"])); json resources = {{"hostname", hostname}}; auto accs = getAccelerators(job, hostname); if (!accs.empty()) { @@ -430,8 +441,8 @@ void CCSyncPlugin::endTransaction() { sendPostRequest("/api/jobs/start_job/", j.dump()); } else if (lastState == RUNNING && - (state == REMOVED || state == COMPLETED || state == HELD || - state == SUSPENDED)) { + (state == IDLE || state == REMOVED || state == COMPLETED || + state == HELD || state == SUSPENDED)) { std::uint64_t startTime = std::stoull(job["JobCurrentStartDate"]), stopTime = std::stoull(job["EnteredCurrentStatus"]);