package collectors

import (
	"encoding/json"
	"errors"
	"fmt"
	"slices"
	"strconv"
	"strings"
	"time"

	cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
	lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
	"github.com/NVIDIA/go-nvml/pkg/nvml"
)

// NvidiaGPMMetricDef describes one GPM metric the collector can request from NVML.
//
//   - name:    short metric name accepted in the "metrics" configuration list
//   - outname: name of the emitted metric; also accepted in the "metrics" list
//   - id:      NVML GPM metric identifier placed into the metrics request
//   - unit:    value attached as "unit" meta information to emitted metrics
type NvidiaGPMMetricDef struct {
	name    string
	outname string
	id      nvml.GpmMetricId
	unit    string
}

// NvidiaGPMMetrics is the table of GPM metrics this collector knows about.
// A metric can be selected in the configuration either by its short name or
// by its output name; all listed metrics are reported with unit "%".
var NvidiaGPMMetrics = []NvidiaGPMMetricDef{
	{name: "GRAPHICS_UTIL", outname: "nv_gpm_graphics_util", id: nvml.GPM_METRIC_GRAPHICS_UTIL, unit: "%"},
	{name: "SM_UTIL", outname: "nv_gpm_sm_util", id: nvml.GPM_METRIC_SM_UTIL, unit: "%"},
	{name: "SM_OCCUPANCY", outname: "nv_gpm_sm_occupancy", id: nvml.GPM_METRIC_SM_OCCUPANCY, unit: "%"},
	{name: "INTEGER_UTIL", outname: "nv_gpm_integer_util", id: nvml.GPM_METRIC_INTEGER_UTIL, unit: "%"},
	{name: "ANY_TENSOR_UTIL", outname: "nv_gpm_any_tensor_util", id: nvml.GPM_METRIC_ANY_TENSOR_UTIL, unit: "%"},
	{name: "DFMA_TENSOR_UTIL", outname: "nv_gpm_dfma_tensor_util", id: nvml.GPM_METRIC_DFMA_TENSOR_UTIL, unit: "%"},
	{name: "HMMA_TENSOR_UTIL", outname: "nv_gpm_hmma_tensor_util", id: nvml.GPM_METRIC_HMMA_TENSOR_UTIL, unit: "%"},
	{name: "IMMA_TENSOR_UTIL", outname: "nv_gpm_imma_tensor_util", id: nvml.GPM_METRIC_IMMA_TENSOR_UTIL, unit: "%"},
	{name: "DRAM_BW_UTIL", outname: "nv_gpm_dram_bw_util", id: nvml.GPM_METRIC_DRAM_BW_UTIL, unit: "%"},
	{name: "FP64_UTIL", outname: "nv_gpm_fp64_util", id: nvml.GPM_METRIC_FP64_UTIL, unit: "%"},
	{name: "FP32_UTIL", outname: "nv_gpm_fp32_util", id: nvml.GPM_METRIC_FP32_UTIL, unit: "%"},
	{name: "FP16_UTIL", outname: "nv_gpm_fp16_util", id: nvml.GPM_METRIC_FP16_UTIL, unit: "%"},
}

// NvidiaGPMCollectorConfig holds the JSON configuration of the NvidiaGPMCollector.
//
//   - Metrics: names of the GPM metrics to collect; each entry is compared
//     against the short name and the output name of the entries in
//     NvidiaGPMMetrics.
//   - ExcludeDevices: devices to skip, given either as the NVML device index
//     (as string) or as the PCI ID formatted with nvml.DEVICE_PCI_BUS_ID_FMT.
//   - AddPciInfoTag: add the PCI ID as tag "pci_identifier" (only when the
//     PCI ID is not already used as "type-id").
//   - UsePciInfoAsTypeId: use the PCI ID instead of the device index as "type-id".
//   - AddUuidMeta, AddBoardNumberMeta, AddSerialMeta: add the device UUID,
//     board part number and serial number as meta information.
//   - ProcessMigDevices, UseUuidForMigDevices, UseSliceForMigDevices: decoded
//     from the configuration but not read anywhere in the visible code of this
//     file. NOTE(review): presumably reserved for MIG support — confirm.
type NvidiaGPMCollectorConfig struct {
	Metrics               []string `json:"metrics,omitempty"`
	ExcludeDevices        []string `json:"exclude_devices,omitempty"`
	AddPciInfoTag         bool     `json:"add_pci_info_tag,omitempty"`
	UsePciInfoAsTypeId    bool     `json:"use_pci_info_as_type_id,omitempty"`
	AddUuidMeta           bool     `json:"add_uuid_meta,omitempty"`
	AddBoardNumberMeta    bool     `json:"add_board_number_meta,omitempty"`
	AddSerialMeta         bool     `json:"add_serial_meta,omitempty"`
	ProcessMigDevices     bool     `json:"process_mig_devices,omitempty"`
	UseUuidForMigDevices  bool     `json:"use_uuid_for_mig_device,omitempty"`
	UseSliceForMigDevices bool     `json:"use_slice_for_mig_device,omitempty"`
}

// NvidiaGPMCollectorDevice bundles the per-GPU state of the collector.
//
//   - device:        NVML handle of the GPU
//   - tags, meta:    tags and meta information attached to every metric of this GPU
//   - startTime:     assigned in Read right before the first sample is taken
//   - endTime:       assigned in Read right before the second sample is taken
//   - measurement:   NVML request holding both sample handles and the list of
//     requested metrics; NVML writes the metric values into it
//   - metricsLookup: maps an index into measurement.Metrics to the definition
//     of the metric requested at that position
type NvidiaGPMCollectorDevice struct {
	device        nvml.Device
	tags          map[string]string
	meta          map[string]string
	startTime     time.Time
	endTime       time.Time
	measurement   nvml.GpmMetricsGetType
	metricsLookup map[int]NvidiaGPMMetricDef
}

// NvidiaGPMCollector collects NVIDIA GPM (GPU Performance Monitoring) metrics
// through NVML. It embeds metricCollector (defined elsewhere in the package),
// whose name, parallel, meta and init fields and setup() method are used by
// the methods in this file.
//
//   - config:   decoded JSON configuration
//   - gpus:     devices that passed all checks in Init
//   - num_gpus: number of entries in gpus, set at the end of Init
type NvidiaGPMCollector struct {
	metricCollector

	config   NvidiaGPMCollectorConfig
	gpus     []NvidiaGPMCollectorDevice
	num_gpus int
}

// Init configures the collector and registers all usable NVIDIA GPUs.
//
// The optional JSON config is decoded into m.config (unknown fields are
// rejected). Afterwards NVML is initialized and every device reported by NVML
// is inspected. A device is skipped (with a log message) when it is excluded
// by index or PCI ID, when its handle or PCI info cannot be obtained, when it
// does not support GPM, or when the GPM samples cannot be allocated.
//
// Init returns an error only for fatal conditions: a failing setup(), an
// invalid configuration, or NVML being unavailable. Problems with individual
// devices are logged and do not cause Init to fail.
func (m *NvidiaGPMCollector) Init(config json.RawMessage) error {
	m.name = "NvidiaGPMCollector"
	m.parallel = true
	if err := m.setup(); err != nil {
		return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
	}
	if len(config) > 0 {
		d := json.NewDecoder(strings.NewReader(string(config)))
		d.DisallowUnknownFields()
		if err := d.Decode(&m.config); err != nil {
			return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
		}
	}
	m.meta = map[string]string{
		"source": m.name,
		"group":  "NvidiaGPM",
	}

	// Resolve the configured metric names once; the result is identical for
	// every device. A metric may be given by its short or its output name.
	selected := make([]NvidiaGPMMetricDef, 0, len(m.config.Metrics))
	for _, inmetric := range m.config.Metrics {
		pos := slices.IndexFunc(NvidiaGPMMetrics, func(def NvidiaGPMMetricDef) bool {
			return inmetric == def.outname || inmetric == def.name
		})
		if pos < 0 {
			cclog.ComponentErrorf(m.name, "Ignoring unknown GPM metric %q", inmetric)
			continue
		}
		selected = append(selected, NvidiaGPMMetrics[pos])
	}
	// The request structure holds a fixed-size metric array; never write past it.
	maxMetrics := len(nvml.GpmMetricsGetType{}.Metrics)
	if len(selected) > maxMetrics {
		cclog.ComponentErrorf(m.name, "Too many GPM metrics configured (%d), using only the first %d", len(selected), maxMetrics)
		selected = selected[:maxMetrics]
	}

	// Initialize NVIDIA Management Library (NVML)
	ret := nvml.Init()

	// Error: NVML library not found
	// (nvml.ErrorString can not be used in this case)
	if ret == nvml.ERROR_LIBRARY_NOT_FOUND {
		return fmt.Errorf("%s Init(): NVML library not found", m.name)
	}
	if ret != nvml.SUCCESS {
		return fmt.Errorf("%s Init(): Unable to initialize NVML: %w", m.name, errors.New(nvml.ErrorString(ret)))
	}

	// Number of NVIDIA GPUs
	numGPUs, ret := nvml.DeviceGetCount()
	if ret != nvml.SUCCESS {
		countErr := errors.New(nvml.ErrorString(ret))
		// NVML was initialized successfully above, so release it again
		// before giving up (Close() does nothing while m.init is false).
		if sret := nvml.Shutdown(); sret != nvml.SUCCESS {
			cclog.ComponentErrorf(m.name, "nvml.Shutdown() not successful: %s", nvml.ErrorString(sret))
		}
		return fmt.Errorf("%s Init(): Unable to get device count: %w", m.name, countErr)
	}

	// For all GPUs
	m.gpus = make([]NvidiaGPMCollectorDevice, 0, numGPUs)
	for i := range numGPUs {

		// Skip excluded devices by ID
		strIdx := strconv.Itoa(i)
		if slices.Contains(m.config.ExcludeDevices, strIdx) {
			cclog.ComponentDebugf(m.name, "Skipping excluded device %s", strIdx)
			continue
		}

		// Get device handle
		device, ret := nvml.DeviceGetHandleByIndex(i)
		if ret != nvml.SUCCESS {
			cclog.ComponentErrorf(m.name, "Unable to get device at index %d: %s", i, nvml.ErrorString(ret))
			continue
		}

		// Get device's PCI info
		pciInfo, ret := nvml.DeviceGetPciInfo(device)
		if ret != nvml.SUCCESS {
			cclog.ComponentErrorf(m.name, "Unable to get PCI info for device at index %d: %s", i, nvml.ErrorString(ret))
			continue
		}
		// Create PCI ID in the common format used by the NVML.
		pciID := fmt.Sprintf(
			nvml.DEVICE_PCI_BUS_ID_FMT,
			pciInfo.Domain,
			pciInfo.Bus,
			pciInfo.Device)

		// Skip excluded devices specified by PCI ID. This is checked before
		// any GPM call so that the state of an excluded device is not modified.
		if slices.Contains(m.config.ExcludeDevices, pciID) {
			cclog.ComponentDebugf(m.name, "Skipping excluded device %s", pciID)
			continue
		}

		// Only devices with GPM support can be used
		supportInfo, ret := nvml.GpmQueryDeviceSupport(device)
		if ret != nvml.SUCCESS {
			cclog.ComponentErrorf(m.name, "Unable to query GPM support for device at index %d: %s", i, nvml.ErrorString(ret))
			continue
		}
		if supportInfo.IsSupportedDevice == uint32(nvml.FEATURE_DISABLED) {
			cclog.ComponentErrorf(m.name, "Device at index %d does not support GPM metrics", i)
			continue
		}

		// Enable GPM streaming if it is not enabled yet. A failure to enable
		// it is only logged; the device is still registered and any resulting
		// sampling errors are reported by Read.
		stream, ret := nvml.GpmQueryIfStreamingEnabled(device)
		if ret != nvml.SUCCESS {
			cclog.ComponentErrorf(m.name, "Unable to query GPM streaming for device at index %d: %s", i, nvml.ErrorString(ret))
			continue
		}
		if stream == uint32(nvml.FEATURE_DISABLED) {
			ret = nvml.GpmSetStreamingEnabled(device, uint32(nvml.FEATURE_ENABLED))
			if ret != nvml.SUCCESS {
				cclog.ComponentErrorf(m.name, "Unable to set streaming mode for device at index %d: %s", i, nvml.ErrorString(ret))
			}
		}

		// Allocate the start and the end sample of a measurement
		ss, ret := nvml.GpmSampleAlloc()
		if ret != nvml.SUCCESS {
			cclog.ComponentErrorf(m.name, "Failed to allocate GPM sample for device %d: %s", i, nvml.ErrorString(ret))
			continue
		}
		es, ret := nvml.GpmSampleAlloc()
		if ret != nvml.SUCCESS {
			cclog.ComponentErrorf(m.name, "Failed to allocate GPM sample for device %d: %s", i, nvml.ErrorString(ret))
			// Do not leak the already allocated start sample
			if fret := ss.Free(); fret != nvml.SUCCESS {
				cclog.ComponentErrorf(m.name, "Unable to free start sample for device at index %d: %s", i, nvml.ErrorString(fret))
			}
			continue
		}

		// Select which value to use as 'type-id'.
		// The PCI ID is commonly required in SLURM environments because the
		// numeric IDs used by SLURM and the ones used by NVML might differ
		// depending on the job type. The PCI ID is more reliable but is commonly
		// not recorded for a job, so it must be added manually in prologue or epilogue
		// e.g. to the comment field
		tid := strIdx
		if m.config.UsePciInfoAsTypeId {
			tid = pciID
		}

		// Now we got all infos together, populate the device list
		g := NvidiaGPMCollectorDevice{
			device: device,
			tags: map[string]string{
				"type":    "accelerator",
				"type-id": tid,
			},
			meta: map[string]string{
				"source": m.name,
				"group":  "Nvidia",
			},
			metricsLookup: make(map[int]NvidiaGPMMetricDef, len(selected)),
		}

		// Add PCI info as tag if not already used as 'type-id'
		if m.config.AddPciInfoTag && !m.config.UsePciInfoAsTypeId {
			g.tags["pci_identifier"] = pciID
		}

		// Optional meta information; failures are logged and otherwise ignored
		if m.config.AddBoardNumberMeta {
			board, ret := nvml.DeviceGetBoardPartNumber(device)
			if ret != nvml.SUCCESS {
				cclog.ComponentError(m.name, "Unable to get board part number for device at index", i, ":", nvml.ErrorString(ret))
			} else {
				g.meta["board_number"] = board
			}
		}
		if m.config.AddSerialMeta {
			serial, ret := nvml.DeviceGetSerial(device)
			if ret != nvml.SUCCESS {
				cclog.ComponentError(m.name, "Unable to get serial number for device at index", i, ":", nvml.ErrorString(ret))
			} else {
				g.meta["serial"] = serial
			}
		}
		if m.config.AddUuidMeta {
			uuid, ret := nvml.DeviceGetUUID(device)
			if ret != nvml.SUCCESS {
				cclog.ComponentError(m.name, "Unable to get UUID for device at index", i, ":", nvml.ErrorString(ret))
			} else {
				g.meta["uuid"] = uuid
			}
		}

		// Prepare the metrics request: position idx in the request array
		// corresponds to entry idx in the lookup table.
		g.measurement.Sample1 = ss
		g.measurement.Sample2 = es
		g.measurement.Version = nvml.GPM_METRICS_GET_VERSION
		for idx, def := range selected {
			g.measurement.Metrics[idx] = nvml.GpmMetric{
				MetricId: uint32(def.id),
			}
			g.metricsLookup[idx] = def
		}
		g.measurement.NumMetrics = uint32(len(selected))
		m.gpus = append(m.gpus, g)
	}
	cclog.ComponentDebugf(m.name, "Found %d Nvidia GPUs with GPM support", len(m.gpus))
	m.num_gpus = len(m.gpus)
	m.init = true
	// Per-device problems were logged above and are not fatal
	return nil
}

// Read performs one GPM measurement on all registered GPUs and sends the
// resulting metrics to output.
//
// A measurement consists of a start sample, a pause of length interval and an
// end sample; NVML then derives the requested metrics from the sample pair.
// Consequently this method blocks for the whole interval. GPUs for which one
// of the samples cannot be taken are skipped for this measurement, and the
// failure is logged. The device index in log messages is the position in the
// collector's device list.
//
// Read does nothing if the collector is not initialized or no GPU is registered.
func (m *NvidiaGPMCollector) Read(interval time.Duration, output chan lp.CCMessage) {
	if !m.init || len(m.gpus) == 0 {
		return
	}

	// sampled[i] is true as long as all samples of GPU i were taken successfully
	sampled := make([]bool, len(m.gpus))

	// Start samples. The devices are accessed by index so that the
	// (large) device structure is not copied.
	for i := range m.gpus {
		gpu := &m.gpus[i]
		gpu.startTime = time.Now()
		if ret := gpu.measurement.Sample1.Get(gpu.device); ret != nvml.SUCCESS {
			cclog.ComponentError(m.name, "Unable to get start GPM sample for device at index", i, ":", nvml.ErrorString(ret))
			continue
		}
		sampled[i] = true
	}
	// Without any start sample there is nothing to measure
	if !slices.Contains(sampled, true) {
		return
	}

	time.Sleep(interval)

	// End samples
	for i := range m.gpus {
		if !sampled[i] {
			continue
		}
		gpu := &m.gpus[i]
		gpu.endTime = time.Now()
		if ret := gpu.measurement.Sample2.Get(gpu.device); ret != nvml.SUCCESS {
			cclog.ComponentError(m.name, "Unable to get stop GPM sample for device at index", i, ":", nvml.ErrorString(ret))
			sampled[i] = false
		}
	}

	// Evaluate the sample pairs and emit the metrics
	for i := range m.gpus {
		if !sampled[i] {
			continue
		}
		gpu := &m.gpus[i]
		if ret := nvml.GpmMetricsGet(&gpu.measurement); ret != nvml.SUCCESS {
			cclog.ComponentError(m.name, "Unable to get evaluate GPM sample for device at index", i, ":", nvml.ErrorString(ret))
			continue
		}
		// Walk the request array in order to get a deterministic output order
		for idx := range int(gpu.measurement.NumMetrics) {
			metricDef, ok := gpu.metricsLookup[idx]
			if !ok {
				continue
			}
			y, err := lp.NewMetric(metricDef.outname, gpu.tags, gpu.meta, gpu.measurement.Metrics[idx].Value, time.Now())
			if err == nil {
				y.AddMeta("unit", metricDef.unit)
				output <- y
			}
		}
	}
}

// Close releases all resources acquired by Init: the GPM samples of every
// registered GPU are freed and NVML is shut down. Failures are logged and do
// not stop the cleanup. Close does nothing if the collector is not initialized.
func (m *NvidiaGPMCollector) Close() {
	if !m.init {
		return
	}
	for i := range m.gpus {
		// Access by index to avoid copying the (large) device structure
		measurement := &m.gpus[i].measurement
		if ret := measurement.Sample1.Free(); ret != nvml.SUCCESS {
			cclog.ComponentErrorf(m.name, "Unable to free start sample for device at index %d: %s", i, nvml.ErrorString(ret))
		}
		if ret := measurement.Sample2.Free(); ret != nvml.SUCCESS {
			cclog.ComponentErrorf(m.name, "Unable to free stop sample for device at index %d: %s", i, nvml.ErrorString(ret))
		}
	}
	// Drop the device list so that the freed sample handles cannot be reused
	m.gpus = nil
	m.num_gpus = 0

	if ret := nvml.Shutdown(); ret != nvml.SUCCESS {
		cclog.ComponentErrorf(m.name, "nvml.Shutdown() not successful: %s", nvml.ErrorString(ret))
	}
	m.init = false
}