mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2024-12-26 15:29:04 +01:00
8d85bd53f1
* Cleanup: Remove unused code * Use Golang duration parser for 'interval' and 'duration' in main config * Update handling of LIKWID headers. Download only if not already present in the system. Fixes #73 * Units with cc-units (#64) * Add option to normalize units with cc-unit * Add unit conversion to router * Add option to change unit prefix in the router * Add to MetricRouter README * Add order of operations in router to README * Use second add_tags/del_tags only if metric gets renamed * Skip disks in DiskstatCollector that have size=0 * Check readability of sensor files in TempCollector * Fix for --once option * Rename `cpu` type to `hwthread` (#69) * Rename 'cpu' type to 'hwthread' to avoid naming clashes with MetricStore and CC-Webfrontend * Collectors in parallel (#74) * Provide info to CollectorManager whether the collector can be executed in parallel with others * Split serial and parallel collectors. Read in parallel first * Update NvidiaCollector with new metrics, MIG and NvLink support (#75) * CC topology module update (#76) * Rename CPU to hardware thread, write some comments * Do renaming in other parts * Remove CpuList and SocketList function from metricCollector. 
Available in ccTopology * Option to use MIG UUID as subtype-id in NvidiaCollector * Option to use MIG slice name as subtype-id in NvidiaCollector * MetricRouter: Fix JSON in README * Fix for Github Action to really use the selected version * Remove Ganglia installation in runonce Action and add Go 1.18 * Fix daemon options in init script * Add separate go.mod files to use it with deprecated 1.16 * Minor updates for Makefiles * fix string comparison * AMD ROCm SMI collector (#77) * Add collector for AMD ROCm SMI metrics * Fix import path * Fix imports * Remove Board Number * store GPU index explicitly * Remove board number from description * Use http instead of ftp to download likwid * Fix serial number in rocmCollector * Improved http sink (#78) * automatic flush in NatsSink * tweak default options of HttpSink * shorter cirt. section and retries for HttpSink * fix error handling * Remove file added by mistake. * Use http instead of ftp to download likwid * Fix serial number in rocmCollector Co-authored-by: Thomas Roehl <thomas.roehl@fau.de> Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Co-authored-by: Lou <lou.knauer@gmx.de>
141 lines
2.9 KiB
Bash
Executable File
141 lines
2.9 KiB
Bash
Executable File
#! /usr/bin/env bash

# chkconfig: 2345 80 05
# description: ClusterCockpit metric collector
# processname: cc-metric-collector
# config: /etc/default/cc-metric-collector
# pidfile: /var/run/cc-metric-collector.pid

### BEGIN INIT INFO
# Provides: cc-metric-collector
# Required-Start: $all
# Required-Stop: $remote_fs $syslog
# Default-Start: 2 3 4 5
# Default-Stop: 0 1 6
# Short-Description: Start ClusterCockpit metric collector at boot time
### END INIT INFO


# ---------------------------------------------------------------------------
# Static configuration and environment setup for the init script.
# ---------------------------------------------------------------------------

# Fixed search path so the external tools used below resolve predictably.
PATH=/bin:/usr/bin:/sbin:/usr/sbin
NAME=cc-metric-collector
DESC="ClusterCockpit metric collector"
# Shell fragment with local overrides, sourced further down. This is the path
# announced in the "# config:" header of this script; the former
# "/etc/default/${NAME}.json" value is still honoured as a fallback below.
DEFAULT=/etc/default/${NAME}

# Built-in defaults. They are assigned before the defaults file is sourced,
# so that file can replace any of them.
CC_USER=clustercockpit
CC_GROUP=clustercockpit
CONF_DIR=/etc/cc-metric-collector
PID_FILE=/var/run/$NAME.pid
DAEMON=/usr/sbin/$NAME
CONF_FILE=${CONF_DIR}/cc-metric-collector.json

# Files created by this script (e.g. the PID file) are not world-accessible.
umask 0027

# Bail out early when the daemon binary is missing or not executable.
if [ ! -x "$DAEMON" ]; then
    echo "Program not installed or not executable"
    exit 5
fi

# LSB helper library; the log_*, pidofproc and status_of_proc calls used by
# the action handlers are expected to come from it.
. /lib/lsb/init-functions

if [ -r /etc/default/rcS ]; then
    . /etc/default/rcS
fi

# overwrite settings from default file
if [ -f "$DEFAULT" ]; then
    . "$DEFAULT"
elif [ -f "${DEFAULT}.json" ]; then
    # Backward compatibility: earlier revisions of this script sourced this
    # path. It must contain shell assignments, not JSON.
    . "${DEFAULT}.json"
fi

# Built after sourcing the defaults so that an overridden CONF_FILE is used.
CC_OPTS="--config=${CONF_FILE}"

#######################################
# Abort the script unless it is executed by root.
# Arguments: none
# Outputs:   prints a hint to stdout when the caller is not root
# Returns:   0 when the effective user id is 0; otherwise the whole script
#            exits with status 4
#######################################
function checkUser() {
    # Compare as a quoted string: if `id` prints nothing, an unquoted numeric
    # test would error out and be treated as "is root". Anything other than
    # a literal "0" now fails closed.
    if [ "$(id -u)" != "0" ]; then
        echo "You need root privileges to run this script"
        exit 4
    fi
}

# ---------------------------------------------------------------------------
# Action dispatcher: start | stop | status | restart | force-reload.
# Any other argument prints a usage line and exits with status 3.
# ---------------------------------------------------------------------------
case "$1" in
    start)
        checkUser
        log_daemon_msg "Starting $DESC"

        # Nothing to do when a process for the PID file already exists.
        pid=$(pidofproc -p "$PID_FILE" "$NAME")
        if [ -n "$pid" ]; then
            log_begin_msg "Already running."
            log_end_msg 0
            exit 0
        fi

        # Prepare environment: the PID file is created up front and handed to
        # the service account.
        # NOTE(review): start-stop-daemon is called without --make-pidfile, so
        # the daemon itself is presumably expected to fill this file - confirm.
        touch "$PID_FILE" && chown "$CC_USER":"$CC_GROUP" "$PID_FILE"

        if [ -n "$MAX_OPEN_FILES" ]; then
            ulimit -n "$MAX_OPEN_FILES"
        fi

        # WORK_DIR is not set by this script itself; only pass --chdir when a
        # defaults file provided a value, instead of handing over an empty path.
        chdir_opts=()
        if [ -n "$WORK_DIR" ]; then
            chdir_opts=(--chdir "$WORK_DIR")
        fi

        # Start Daemon
        # $CC_OPTS is deliberately unquoted so it splits into separate arguments.
        start-stop-daemon --start -b "${chdir_opts[@]}" --user "$CC_USER" -c "$CC_USER" --pidfile "$PID_FILE" --exec "$DAEMON" -- $CC_OPTS
        rc=$?
        if [ "$rc" -eq 0 ]; then
            sleep 1

            # check if pid file has been written to
            if ! [[ -s $PID_FILE ]]; then
                log_end_msg 1
                exit 1
            fi

            i=0
            timeout=10
            # Wait for the process to be properly started before exiting:
            # poll with signal 0 until the recorded PID is alive or the
            # timeout (in one-second steps) is exceeded.
            until { xargs kill -0 <"$PID_FILE"; } >/dev/null 2>&1; do
                sleep 1
                i=$((i + 1))
                if [ "$i" -gt "$timeout" ]; then
                    log_end_msg 1
                    exit 1
                fi
            done
        fi
        log_end_msg "$rc"
        ;;
    stop)
        checkUser
        log_daemon_msg "Stopping $DESC"

        if [ -f "$PID_FILE" ]; then
            start-stop-daemon --stop --pidfile "$PID_FILE" \
                --user "$CC_USER" \
                --retry=TERM/20/KILL/5 >/dev/null
            # Capture the status once; every later `[ ... ]` overwrites $?.
            rc=$?
            if [ "$rc" -eq 1 ]; then
                # Status 1: nothing was done, i.e. no matching process.
                log_progress_msg "$DESC is not running but pid file exists, cleaning up"
            elif [ "$rc" -ne 0 ]; then
                # Status 2 (still running after the retry schedule) or
                # 3 (other error): keep the PID file and report the failure.
                pid=$(cat "$PID_FILE")
                log_failure_msg "Failed to stop $DESC (pid $pid)"
                exit 1
            fi
            rm -f "$PID_FILE"
        else
            log_progress_msg "(not running)"
        fi
        log_end_msg 0
        ;;
    status)
        # Exit with exactly the status reported by status_of_proc.
        status_of_proc -p "$PID_FILE" "$NAME" "$NAME"
        exit $?
        ;;
    restart|force-reload)
        if [ -f "$PID_FILE" ]; then
            # Do not try to start again when the old instance could not be
            # stopped.
            "$0" stop || exit $?
            sleep 1
        fi
        "$0" start
        ;;
    *)
        log_success_msg "Usage: $0 {start|stop|restart|force-reload|status}"
        exit 3
        ;;
esac