Limit verbosity a bit (SCHEDD_DEBUG=D_FULLDEBUG).

This commit is contained in:
Joachim Meyer 2022-12-16 09:35:10 +01:00
parent b530d9034e
commit 80619b6154
3 changed files with 79 additions and 101 deletions

View File

@ -42,3 +42,5 @@ target_include_directories(htcondor_cc_sync_plugin PRIVATE ${CMAKE_SOURCE_DIR}/c
target_compile_options(htcondor_cc_sync_plugin PRIVATE target_compile_options(htcondor_cc_sync_plugin PRIVATE
-DCONDOR_VERSION=\"10.0.0\" -DENABLE_STATE_DUMP -DGLIBC219=GLIBC219 -DGLIBC=GLIBC -DHAVE_CONFIG_H -DLINUX=\"LINUX_3.13.0-30-GENERIC\" -DPLATFORM=\"X86_64-Linux_17\" -DPRE_RELEASE_STR=\"\" -DWITH_OPENSSL -DX86_64=X86_64 -Dcondorapi_shared_EXPORTS -DWITH_IPV6) -DCONDOR_VERSION=\"10.0.0\" -DENABLE_STATE_DUMP -DGLIBC219=GLIBC219 -DGLIBC=GLIBC -DHAVE_CONFIG_H -DLINUX=\"LINUX_3.13.0-30-GENERIC\" -DPLATFORM=\"X86_64-Linux_17\" -DPRE_RELEASE_STR=\"\" -DWITH_OPENSSL -DX86_64=X86_64 -Dcondorapi_shared_EXPORTS -DWITH_IPV6)
target_compile_definitions(htcondor_cc_sync_plugin PRIVATE $<$<OR:$<STREQUAL:${CMAKE_BUILD_TYPE},Debug>,$<STREQUAL:${CMAKE_BUILD_TYPE},RelWithDebInfo>>:VERBOSE>)

View File

@ -38,3 +38,5 @@ CCSYNC_SUBMIT_ID=<Unique submission node id, expected to be in 0..3 (see #global
} }
} }
``` ```
To get a debug dump of the job ClassAds at the end of `endTransaction`, build with `-DVERBOSE` (set automatically for `Debug` and `RelWithDebInfo` builds) and set `SCHEDD_DEBUG=D_FULLDEBUG` in the HTCondor configuration.

View File

@ -252,17 +252,16 @@ std::uint64_t CCSyncPlugin::globalJobIdToInt(const JobId jobId) {
} }
void printJobStatus(const JobId jobId, const CondorJob &job) { void printJobStatus(const JobId jobId, const CondorJob &job) {
dprintf(D_ALWAYS, "=============== Job ClassAds: %d.%d\n", jobId.first, dprintf(D_VERBOSE, "=============== Job ClassAds: %d.%d\n", jobId.first,
jobId.second); jobId.second);
for (auto value : job) { for (auto value : job) {
dprintf(D_ALWAYS, "%s: %s\n", value.first.c_str(), value.second.c_str()); dprintf(D_VERBOSE, "%s: %s\n", value.first.c_str(), value.second.c_str());
} }
dprintf(D_ALWAYS, "=============== End Job ClassAds: %d.%d\n", jobId.first, dprintf(D_VERBOSE, "=============== End Job ClassAds: %d.%d\n", jobId.first,
jobId.second); jobId.second);
} }
std::string getRemoteHost(std::string_view slot) { std::string getRemoteHost(std::string_view slot) {
return "uller";
auto atIt = std::find(slot.begin(), slot.end(), '@'); auto atIt = std::find(slot.begin(), slot.end(), '@');
auto subIt = std::find(atIt, slot.end(), '.'); auto subIt = std::find(atIt, slot.end(), '.');
return {++atIt, subIt}; return {++atIt, subIt};
@ -323,7 +322,7 @@ std::string removeQuotes(const std::string &value) {
void CCSyncPlugin::sendPostRequest(const std::string &route, void CCSyncPlugin::sendPostRequest(const std::string &route,
const std::string &body) const noexcept { const std::string &body) const noexcept {
dprintf(D_ALWAYS, "POST body: %s\n", body.c_str()); dprintf(D_VERBOSE, "POST body: %s\n", body.c_str());
try { try {
curlpp::Easy request; curlpp::Easy request;
request.setOpt(curlpp::options::Url(url + route)); request.setOpt(curlpp::options::Url(url + route));
@ -341,18 +340,20 @@ void CCSyncPlugin::sendPostRequest(const std::string &route,
std::stringstream buf; std::stringstream buf;
request.setOpt(curlpp::options::WriteStream(&buf)); request.setOpt(curlpp::options::WriteStream(&buf));
#ifdef CURL_DEBUG
request.setOpt(curlpp::options::DebugFunction( request.setOpt(curlpp::options::DebugFunction(
[](curl_infotype type, char *ptr, unsigned long length) -> int { [](curl_infotype type, char *ptr, unsigned long length) -> int {
// todo: eh.. if we need a length, are we potentially reading // todo: eh.. if we need a length, are we potentially reading
// oob?? // oob??
dprintf(D_ALWAYS, "%d: %s\n", type, ptr); dprintf(D_VERBOSE, "%d: %s\n", type, ptr);
return length; return length;
})); }));
// request.setOpt(curlpp::options::Verbose(true)); request.setOpt(curlpp::options::Verbose(true));
#endif
request.perform(); request.perform();
dprintf(D_ALWAYS, "response: %s\n", buf.str().c_str()); dprintf(D_VERBOSE, "response: %s\n", buf.str().c_str());
} catch (const curlpp::LogicError &e) { } catch (const curlpp::LogicError &e) {
dprintf(D_ALWAYS, "LogicError: %s\n", e.what()); dprintf(D_ALWAYS, "LogicError: %s\n", e.what());
} catch (const curlpp::RuntimeError &e) { } catch (const curlpp::RuntimeError &e) {
@ -364,41 +365,6 @@ void CCSyncPlugin::sendPostRequest(const std::string &route,
} }
} }
// std::unordered_map<std::string, std::string> parseToE(const std::string &toe)
// {
// // toe = "[ Who = "itself"; How = "OF_ITS_OWN_ACCORD"; ExitCode = 2;
// HowCode =
// // 0; When = 1671108732; ExitBySignal = false ]";
// auto start = toe.find_first_not_of("[ ");
// auto stop = toe.find_last_not_of("] ");
// std::istringstream toeStream{toe.substr(start + 1, stop - start)};
// std::unordered_map<std::string, std::string> map;
// std::string element;
// while (std::getline(toeStream, element, ';')) {
// auto splitter = element.find(" = ");
// auto key = element.substr(0, splitter);
// auto value = element.substr(splitter + sizeof(" = "));
// map.emplace(key, value);
// dprintf(D_ALWAYS, "ToE field '%s': '%s'\n", key.c_str(), value.c_str());
// }
// return map;
// }
// std::uint64_t getEndTime(const CondorJob &job) {
// if (auto toeIt = job.find("ToE"); toeIt != job.end()) {
// auto toe = parseToE(toeIt->second);
// if (auto whenIt = toe.find("When"); whenIt != toe.end())
// return std::stoull(whenIt->second); // todo: always an int here?
// } else if (auto enteredStateTimeIt = job.find("EnteredCurrentStatus");
// enteredStateTimeIt != job.end()) {
// return std::stoull(enteredStateTimeIt->second);
// }
// return 0;
// }
void CCSyncPlugin::endTransaction() { void CCSyncPlugin::endTransaction() {
if (initializing) { // HTCondor bug? We should'n be receiving events before if (initializing) { // HTCondor bug? We should'n be receiving events before
// initialization is done. We will ignore them if it // initialization is done. We will ignore them if it
@ -411,70 +377,78 @@ void CCSyncPlugin::endTransaction() {
// dprintf(D_FULLDEBUG,"Ending transaction.\n"); // dprintf(D_FULLDEBUG,"Ending transaction.\n");
for (auto jobId : toConsider) { for (auto jobId : toConsider) {
if (jobId.second == -1) try {
continue; // don't care about the "template" if (jobId.second == -1)
continue; // don't care about the "template"
auto &job = currentStatus[jobId.first][jobId.second]; auto &job = currentStatus[jobId.first][jobId.second];
printJobStatus(jobId, job); #ifdef VERBOSE
if (job.empty() || !jobHasRequiredClassAds(job)) printJobStatus(jobId, job);
continue; // not yet ready #endif
if (job.empty() || !jobHasRequiredClassAds(job))
continue; // not yet ready
dprintf(D_ALWAYS, "JobStatus: %s\n", job["JobStatus"].c_str()); dprintf(D_VERBOSE, "JobStatus: %s\n", job["JobStatus"].c_str());
int state = std::stoi(job["JobStatus"]), lastState; int state = std::stoi(job["JobStatus"]), lastState;
if (auto lastStateIt = job.find("LastJobStatus"); if (auto lastStateIt = job.find("LastJobStatus");
lastStateIt != job.end()) { lastStateIt != job.end()) {
dprintf(D_ALWAYS, "LastJobStatus: %s\n", lastStateIt->second.c_str()); dprintf(D_VERBOSE, "LastJobStatus: %s\n", lastStateIt->second.c_str());
if (lastState = std::stoi(lastStateIt->second); lastState == state) if (lastState = std::stoi(lastStateIt->second); lastState == state)
continue;
} else
continue; continue;
} else
continue;
if (state == RUNNING) { if (state == RUNNING) {
auto jobBatchNameIt = job.find("JobBatchName"); auto jobBatchNameIt = job.find("JobBatchName");
auto globalJobId = globalJobIdToInt(jobId); auto globalJobId = globalJobIdToInt(jobId);
auto hostname = getRemoteHost(job["RemoteHost"]); auto hostname = getRemoteHost(job["RemoteHost"]);
json resources = {{"hostname", hostname}}; json resources = {{"hostname", hostname}};
auto accs = getAccelerators(job, hostname); auto accs = getAccelerators(job, hostname);
if (!accs.empty()) { if (!accs.empty()) {
resources["accelerators"] = accs; resources["accelerators"] = accs;
}
json j = {
{"jobId", globalJobId},
{"arrayJobId", jobId.first * 10 + submitNodeId},
{"user", removeQuotes(job["Owner"])},
{"cluster", clusterName},
{"numNodes", std::stoi(job["CurrentHosts"])},
{"numHwthreads", std::stoi(job["CpusProvisioned"])},
{"startTime", std::stoull(job["EnteredCurrentStatus"])},
{"project", removeQuotes(job["AcctGroup"])},
{"partition", "main"},
{"exclusive", 0},
{"resources", json::array({resources})},
{"numAcc", accs.size()},
{"metadata",
{{"jobName",
jobBatchNameIt != job.end() ? jobBatchNameIt->second : ""}}}};
sendPostRequest("/api/jobs/start_job/", j.dump());
} else if (lastState == RUNNING &&
(state == REMOVED || state == COMPLETED || state == HELD ||
state == SUSPENDED)) {
std::uint64_t startTime = std::stoull(job["JobCurrentStartDate"]),
stopTime = std::stoull(job["EnteredCurrentStatus"]);
json j = {{"jobId", globalJobIdToInt(jobId)},
{"cluster", clusterName},
{"jobState", jobStateMap.at(state)},
{"startTime", startTime},
{"stopTime",
(stopTime - startTime > 10) ? stopTime : startTime + 10}};
sendPostRequest("/api/jobs/stop_job/", j.dump());
} }
} catch (const std::exception &e) {
json j = { dprintf(D_ALWAYS, "exception: %s\n", e.what());
{"jobId", globalJobId}, } catch (...) {
{"arrayJobId", jobId.first * 10 + submitNodeId}, dprintf(D_ALWAYS, "unknown exception\n");
{"user", removeQuotes(job["Owner"])},
{"cluster", clusterName},
{"numNodes", std::stoi(job["CurrentHosts"])},
{"numHwthreads", std::stoi(job["CpusProvisioned"])},
{"startTime", std::stoull(job["EnteredCurrentStatus"])},
{"project", removeQuotes(job["AcctGroup"])},
{"partition", "main"},
{"exclusive", 0},
{"resources", json::array({resources})},
{"numAcc", accs.size()},
{"metadata",
{{"jobName",
jobBatchNameIt != job.end() ? jobBatchNameIt->second : ""}}}};
sendPostRequest("/api/jobs/start_job/", j.dump());
} else if (lastState == RUNNING &&
(state == REMOVED || state == COMPLETED || state == HELD ||
state == SUSPENDED)) {
std::uint64_t startTime = std::stoull(job["JobCurrentStartDate"]),
stopTime = std::stoull(job["EnteredCurrentStatus"]);
json j = {{"jobId", globalJobIdToInt(jobId)},
{"cluster", clusterName},
{"jobState", jobStateMap.at(state)},
{"startTime", startTime},
{"stopTime",
(stopTime - startTime > 10) ? stopTime : startTime + 10}};
sendPostRequest("/api/jobs/stop_job/", j.dump());
} }
} }