mirror of
https://gitlab.cs.uni-saarland.de/hpc/cc-condor-sync.git
synced 2024-12-26 05:29:06 +01:00
Limit verbosity a bit (SCHEDD_DEBUG=D_FULLDEBUG).
This commit is contained in:
parent
b530d9034e
commit
80619b6154
@ -42,3 +42,5 @@ target_include_directories(htcondor_cc_sync_plugin PRIVATE ${CMAKE_SOURCE_DIR}/c
|
|||||||
|
|
||||||
target_compile_options(htcondor_cc_sync_plugin PRIVATE
|
target_compile_options(htcondor_cc_sync_plugin PRIVATE
|
||||||
-DCONDOR_VERSION=\"10.0.0\" -DENABLE_STATE_DUMP -DGLIBC219=GLIBC219 -DGLIBC=GLIBC -DHAVE_CONFIG_H -DLINUX=\"LINUX_3.13.0-30-GENERIC\" -DPLATFORM=\"X86_64-Linux_17\" -DPRE_RELEASE_STR=\"\" -DWITH_OPENSSL -DX86_64=X86_64 -Dcondorapi_shared_EXPORTS -DWITH_IPV6)
|
-DCONDOR_VERSION=\"10.0.0\" -DENABLE_STATE_DUMP -DGLIBC219=GLIBC219 -DGLIBC=GLIBC -DHAVE_CONFIG_H -DLINUX=\"LINUX_3.13.0-30-GENERIC\" -DPLATFORM=\"X86_64-Linux_17\" -DPRE_RELEASE_STR=\"\" -DWITH_OPENSSL -DX86_64=X86_64 -Dcondorapi_shared_EXPORTS -DWITH_IPV6)
|
||||||
|
|
||||||
|
target_compile_definitions(htcondor_cc_sync_plugin PRIVATE $<$<OR:$<STREQUAL:${CMAKE_BUILD_TYPE},Debug>,$<STREQUAL:${CMAKE_BUILD_TYPE},RelWithDebInfo>>:VERBOSE>)
|
||||||
|
@ -38,3 +38,5 @@ CCSYNC_SUBMIT_ID=<Unique submission node id, expected to be in 0..3 (see #global
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
For getting a debug dump of the class ads at the end of the `endTransaction`, build with `-DVERBOSE` (automatically set for `Debug` or `RelWithDebInfo` builds) and set `SCHEDD_DEBUG=D_FULLDEBUG` in the condor config.
|
||||||
|
@ -252,17 +252,16 @@ std::uint64_t CCSyncPlugin::globalJobIdToInt(const JobId jobId) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void printJobStatus(const JobId jobId, const CondorJob &job) {
|
void printJobStatus(const JobId jobId, const CondorJob &job) {
|
||||||
dprintf(D_ALWAYS, "=============== Job ClassAds: %d.%d\n", jobId.first,
|
dprintf(D_VERBOSE, "=============== Job ClassAds: %d.%d\n", jobId.first,
|
||||||
jobId.second);
|
jobId.second);
|
||||||
for (auto value : job) {
|
for (auto value : job) {
|
||||||
dprintf(D_ALWAYS, "%s: %s\n", value.first.c_str(), value.second.c_str());
|
dprintf(D_VERBOSE, "%s: %s\n", value.first.c_str(), value.second.c_str());
|
||||||
}
|
}
|
||||||
dprintf(D_ALWAYS, "=============== End Job ClassAds: %d.%d\n", jobId.first,
|
dprintf(D_VERBOSE, "=============== End Job ClassAds: %d.%d\n", jobId.first,
|
||||||
jobId.second);
|
jobId.second);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string getRemoteHost(std::string_view slot) {
|
std::string getRemoteHost(std::string_view slot) {
|
||||||
return "uller";
|
|
||||||
auto atIt = std::find(slot.begin(), slot.end(), '@');
|
auto atIt = std::find(slot.begin(), slot.end(), '@');
|
||||||
auto subIt = std::find(atIt, slot.end(), '.');
|
auto subIt = std::find(atIt, slot.end(), '.');
|
||||||
return {++atIt, subIt};
|
return {++atIt, subIt};
|
||||||
@ -323,7 +322,7 @@ std::string removeQuotes(const std::string &value) {
|
|||||||
|
|
||||||
void CCSyncPlugin::sendPostRequest(const std::string &route,
|
void CCSyncPlugin::sendPostRequest(const std::string &route,
|
||||||
const std::string &body) const noexcept {
|
const std::string &body) const noexcept {
|
||||||
dprintf(D_ALWAYS, "POST body: %s\n", body.c_str());
|
dprintf(D_VERBOSE, "POST body: %s\n", body.c_str());
|
||||||
try {
|
try {
|
||||||
curlpp::Easy request;
|
curlpp::Easy request;
|
||||||
request.setOpt(curlpp::options::Url(url + route));
|
request.setOpt(curlpp::options::Url(url + route));
|
||||||
@ -341,18 +340,20 @@ void CCSyncPlugin::sendPostRequest(const std::string &route,
|
|||||||
std::stringstream buf;
|
std::stringstream buf;
|
||||||
request.setOpt(curlpp::options::WriteStream(&buf));
|
request.setOpt(curlpp::options::WriteStream(&buf));
|
||||||
|
|
||||||
|
#ifdef CURL_DEBUG
|
||||||
request.setOpt(curlpp::options::DebugFunction(
|
request.setOpt(curlpp::options::DebugFunction(
|
||||||
[](curl_infotype type, char *ptr, unsigned long length) -> int {
|
[](curl_infotype type, char *ptr, unsigned long length) -> int {
|
||||||
// todo: eh.. if we need a length, are we potentially reading
|
// todo: eh.. if we need a length, are we potentially reading
|
||||||
// oob??
|
// oob??
|
||||||
dprintf(D_ALWAYS, "%d: %s\n", type, ptr);
|
dprintf(D_VERBOSE, "%d: %s\n", type, ptr);
|
||||||
return length;
|
return length;
|
||||||
}));
|
}));
|
||||||
// request.setOpt(curlpp::options::Verbose(true));
|
request.setOpt(curlpp::options::Verbose(true));
|
||||||
|
#endif
|
||||||
|
|
||||||
request.perform();
|
request.perform();
|
||||||
|
|
||||||
dprintf(D_ALWAYS, "response: %s\n", buf.str().c_str());
|
dprintf(D_VERBOSE, "response: %s\n", buf.str().c_str());
|
||||||
} catch (const curlpp::LogicError &e) {
|
} catch (const curlpp::LogicError &e) {
|
||||||
dprintf(D_ALWAYS, "LogicError: %s\n", e.what());
|
dprintf(D_ALWAYS, "LogicError: %s\n", e.what());
|
||||||
} catch (const curlpp::RuntimeError &e) {
|
} catch (const curlpp::RuntimeError &e) {
|
||||||
@ -364,41 +365,6 @@ void CCSyncPlugin::sendPostRequest(const std::string &route,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// std::unordered_map<std::string, std::string> parseToE(const std::string &toe)
|
|
||||||
// {
|
|
||||||
// // toe = "[ Who = "itself"; How = "OF_ITS_OWN_ACCORD"; ExitCode = 2;
|
|
||||||
// HowCode =
|
|
||||||
// // 0; When = 1671108732; ExitBySignal = false ]";
|
|
||||||
// auto start = toe.find_first_not_of("[ ");
|
|
||||||
// auto stop = toe.find_last_not_of("] ");
|
|
||||||
// std::istringstream toeStream{toe.substr(start + 1, stop - start)};
|
|
||||||
|
|
||||||
// std::unordered_map<std::string, std::string> map;
|
|
||||||
|
|
||||||
// std::string element;
|
|
||||||
// while (std::getline(toeStream, element, ';')) {
|
|
||||||
// auto splitter = element.find(" = ");
|
|
||||||
// auto key = element.substr(0, splitter);
|
|
||||||
// auto value = element.substr(splitter + sizeof(" = "));
|
|
||||||
// map.emplace(key, value);
|
|
||||||
// dprintf(D_ALWAYS, "ToE field '%s': '%s'\n", key.c_str(), value.c_str());
|
|
||||||
// }
|
|
||||||
|
|
||||||
// return map;
|
|
||||||
// }
|
|
||||||
|
|
||||||
// std::uint64_t getEndTime(const CondorJob &job) {
|
|
||||||
// if (auto toeIt = job.find("ToE"); toeIt != job.end()) {
|
|
||||||
// auto toe = parseToE(toeIt->second);
|
|
||||||
// if (auto whenIt = toe.find("When"); whenIt != toe.end())
|
|
||||||
// return std::stoull(whenIt->second); // todo: always an int here?
|
|
||||||
// } else if (auto enteredStateTimeIt = job.find("EnteredCurrentStatus");
|
|
||||||
// enteredStateTimeIt != job.end()) {
|
|
||||||
// return std::stoull(enteredStateTimeIt->second);
|
|
||||||
// }
|
|
||||||
// return 0;
|
|
||||||
// }
|
|
||||||
|
|
||||||
void CCSyncPlugin::endTransaction() {
|
void CCSyncPlugin::endTransaction() {
|
||||||
if (initializing) { // HTCondor bug? We should'n be receiving events before
|
if (initializing) { // HTCondor bug? We should'n be receiving events before
|
||||||
// initialization is done. We will ignore them if it
|
// initialization is done. We will ignore them if it
|
||||||
@ -411,70 +377,78 @@ void CCSyncPlugin::endTransaction() {
|
|||||||
// dprintf(D_FULLDEBUG,"Ending transaction.\n");
|
// dprintf(D_FULLDEBUG,"Ending transaction.\n");
|
||||||
|
|
||||||
for (auto jobId : toConsider) {
|
for (auto jobId : toConsider) {
|
||||||
if (jobId.second == -1)
|
try {
|
||||||
continue; // don't care about the "template"
|
if (jobId.second == -1)
|
||||||
|
continue; // don't care about the "template"
|
||||||
|
|
||||||
auto &job = currentStatus[jobId.first][jobId.second];
|
auto &job = currentStatus[jobId.first][jobId.second];
|
||||||
printJobStatus(jobId, job);
|
#ifdef VERBOSE
|
||||||
if (job.empty() || !jobHasRequiredClassAds(job))
|
printJobStatus(jobId, job);
|
||||||
continue; // not yet ready
|
#endif
|
||||||
|
if (job.empty() || !jobHasRequiredClassAds(job))
|
||||||
|
continue; // not yet ready
|
||||||
|
|
||||||
dprintf(D_ALWAYS, "JobStatus: %s\n", job["JobStatus"].c_str());
|
dprintf(D_VERBOSE, "JobStatus: %s\n", job["JobStatus"].c_str());
|
||||||
int state = std::stoi(job["JobStatus"]), lastState;
|
int state = std::stoi(job["JobStatus"]), lastState;
|
||||||
|
|
||||||
if (auto lastStateIt = job.find("LastJobStatus");
|
if (auto lastStateIt = job.find("LastJobStatus");
|
||||||
lastStateIt != job.end()) {
|
lastStateIt != job.end()) {
|
||||||
dprintf(D_ALWAYS, "LastJobStatus: %s\n", lastStateIt->second.c_str());
|
dprintf(D_VERBOSE, "LastJobStatus: %s\n", lastStateIt->second.c_str());
|
||||||
if (lastState = std::stoi(lastStateIt->second); lastState == state)
|
if (lastState = std::stoi(lastStateIt->second); lastState == state)
|
||||||
|
continue;
|
||||||
|
} else
|
||||||
continue;
|
continue;
|
||||||
} else
|
|
||||||
continue;
|
|
||||||
|
|
||||||
if (state == RUNNING) {
|
if (state == RUNNING) {
|
||||||
auto jobBatchNameIt = job.find("JobBatchName");
|
auto jobBatchNameIt = job.find("JobBatchName");
|
||||||
auto globalJobId = globalJobIdToInt(jobId);
|
auto globalJobId = globalJobIdToInt(jobId);
|
||||||
|
|
||||||
auto hostname = getRemoteHost(job["RemoteHost"]);
|
auto hostname = getRemoteHost(job["RemoteHost"]);
|
||||||
json resources = {{"hostname", hostname}};
|
json resources = {{"hostname", hostname}};
|
||||||
auto accs = getAccelerators(job, hostname);
|
auto accs = getAccelerators(job, hostname);
|
||||||
if (!accs.empty()) {
|
if (!accs.empty()) {
|
||||||
resources["accelerators"] = accs;
|
resources["accelerators"] = accs;
|
||||||
|
}
|
||||||
|
|
||||||
|
json j = {
|
||||||
|
{"jobId", globalJobId},
|
||||||
|
{"arrayJobId", jobId.first * 10 + submitNodeId},
|
||||||
|
{"user", removeQuotes(job["Owner"])},
|
||||||
|
{"cluster", clusterName},
|
||||||
|
{"numNodes", std::stoi(job["CurrentHosts"])},
|
||||||
|
{"numHwthreads", std::stoi(job["CpusProvisioned"])},
|
||||||
|
{"startTime", std::stoull(job["EnteredCurrentStatus"])},
|
||||||
|
{"project", removeQuotes(job["AcctGroup"])},
|
||||||
|
{"partition", "main"},
|
||||||
|
{"exclusive", 0},
|
||||||
|
{"resources", json::array({resources})},
|
||||||
|
{"numAcc", accs.size()},
|
||||||
|
{"metadata",
|
||||||
|
{{"jobName",
|
||||||
|
jobBatchNameIt != job.end() ? jobBatchNameIt->second : ""}}}};
|
||||||
|
|
||||||
|
sendPostRequest("/api/jobs/start_job/", j.dump());
|
||||||
|
|
||||||
|
} else if (lastState == RUNNING &&
|
||||||
|
(state == REMOVED || state == COMPLETED || state == HELD ||
|
||||||
|
state == SUSPENDED)) {
|
||||||
|
|
||||||
|
std::uint64_t startTime = std::stoull(job["JobCurrentStartDate"]),
|
||||||
|
stopTime = std::stoull(job["EnteredCurrentStatus"]);
|
||||||
|
|
||||||
|
json j = {{"jobId", globalJobIdToInt(jobId)},
|
||||||
|
{"cluster", clusterName},
|
||||||
|
{"jobState", jobStateMap.at(state)},
|
||||||
|
{"startTime", startTime},
|
||||||
|
{"stopTime",
|
||||||
|
(stopTime - startTime > 10) ? stopTime : startTime + 10}};
|
||||||
|
|
||||||
|
sendPostRequest("/api/jobs/stop_job/", j.dump());
|
||||||
}
|
}
|
||||||
|
} catch (const std::exception &e) {
|
||||||
json j = {
|
dprintf(D_ALWAYS, "exception: %s\n", e.what());
|
||||||
{"jobId", globalJobId},
|
} catch (...) {
|
||||||
{"arrayJobId", jobId.first * 10 + submitNodeId},
|
dprintf(D_ALWAYS, "unknown exception\n");
|
||||||
{"user", removeQuotes(job["Owner"])},
|
|
||||||
{"cluster", clusterName},
|
|
||||||
{"numNodes", std::stoi(job["CurrentHosts"])},
|
|
||||||
{"numHwthreads", std::stoi(job["CpusProvisioned"])},
|
|
||||||
{"startTime", std::stoull(job["EnteredCurrentStatus"])},
|
|
||||||
{"project", removeQuotes(job["AcctGroup"])},
|
|
||||||
{"partition", "main"},
|
|
||||||
{"exclusive", 0},
|
|
||||||
{"resources", json::array({resources})},
|
|
||||||
{"numAcc", accs.size()},
|
|
||||||
{"metadata",
|
|
||||||
{{"jobName",
|
|
||||||
jobBatchNameIt != job.end() ? jobBatchNameIt->second : ""}}}};
|
|
||||||
|
|
||||||
sendPostRequest("/api/jobs/start_job/", j.dump());
|
|
||||||
|
|
||||||
} else if (lastState == RUNNING &&
|
|
||||||
(state == REMOVED || state == COMPLETED || state == HELD ||
|
|
||||||
state == SUSPENDED)) {
|
|
||||||
|
|
||||||
std::uint64_t startTime = std::stoull(job["JobCurrentStartDate"]),
|
|
||||||
stopTime = std::stoull(job["EnteredCurrentStatus"]);
|
|
||||||
|
|
||||||
json j = {{"jobId", globalJobIdToInt(jobId)},
|
|
||||||
{"cluster", clusterName},
|
|
||||||
{"jobState", jobStateMap.at(state)},
|
|
||||||
{"startTime", startTime},
|
|
||||||
{"stopTime",
|
|
||||||
(stopTime - startTime > 10) ? stopTime : startTime + 10}};
|
|
||||||
|
|
||||||
sendPostRequest("/api/jobs/stop_job/", j.dump());
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user