// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package metricstore

import (
	"bufio"
	"encoding/binary"
	"encoding/json"
	"fmt"
	"os"
	"path/filepath"
	"sort"

	cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
	pq "github.com/parquet-go/parquet-go"
)

// ParquetMetricRow is the long-format schema for archived metric data.
// One row per (host, metric, scope, scope_id, timestamp) data point.
// Sorted by (cluster, hostname, metric, timestamp) for optimal compression.
type ParquetMetricRow struct {
	Cluster   string  `parquet:"cluster"`
	Hostname  string  `parquet:"hostname"`
	Metric    string  `parquet:"metric"`
	Scope     string  `parquet:"scope"`
	ScopeID   string  `parquet:"scope_id"`
	Timestamp int64   `parquet:"timestamp"`
	Frequency int64   `parquet:"frequency"`
	Value     float32 `parquet:"value"`
}

// flattenCheckpointFile recursively converts a CheckpointFile tree into Parquet rows.
// The scope path is built from the hierarchy: the host level is "node", and child names
// map to scope/scope_id (e.g., "socket0" → scope="socket", scope_id="0").
func flattenCheckpointFile(cf *CheckpointFile, cluster, hostname, scope, scopeID string, rows []ParquetMetricRow) []ParquetMetricRow {
	for metricName, cm := range cf.Metrics {
		ts := cm.Start
		for _, v := range cm.Data {
			if !v.IsNaN() {
				rows = append(rows, ParquetMetricRow{
					Cluster:   cluster,
					Hostname:  hostname,
					Metric:    metricName,
					Scope:     scope,
					ScopeID:   scopeID,
					Timestamp: ts,
					Frequency: cm.Frequency,
					Value:     float32(v),
				})
			}
			ts += cm.Frequency
		}
	}
	for childName, childCf := range cf.Children {
		childScope, childScopeID := parseScopeFromName(childName)
		rows = flattenCheckpointFile(childCf, cluster, hostname, childScope, childScopeID, rows)
	}
	return rows
}

// parseScopeFromName infers scope and scope_id from a child level name.
// Examples: "socket0" → ("socket", "0"), "core12" → ("core", "12"),
// "a0" (accelerator) → ("accelerator", "0").
// If the name doesn't match a known pattern, it is used as-is for scope
// with an empty scope_id.
func parseScopeFromName(name string) (string, string) {
	prefixes := []struct {
		prefix string
		scope  string
	}{
		{"socket", "socket"},
		{"memoryDomain", "memoryDomain"},
		{"core", "core"},
		{"hwthread", "hwthread"},
		{"cpu", "hwthread"},
		{"accelerator", "accelerator"},
		// Keep the short "a" alias last so longer prefixes win first;
		// it covers the "a0" accelerator naming documented above.
		{"a", "accelerator"},
	}
	for _, p := range prefixes {
		if len(name) > len(p.prefix) && name[:len(p.prefix)] == p.prefix {
			id := name[len(p.prefix):]
			if len(id) > 0 && id[0] >= '0' && id[0] <= '9' {
				return p.scope, id
			}
		}
	}
	return name, ""
}

// parquetArchiveWriter supports incremental writes to a Parquet file.
// Each call to WriteHostRows writes one row group (typically one host's data),
// avoiding accumulation of all rows in memory.
type parquetArchiveWriter struct {
	writer *pq.GenericWriter[ParquetMetricRow]
	bw     *bufio.Writer
	f      *os.File
	count  int
}
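// Typical lifecycle of the writer above (illustrative sketch; error handling
// elided, and the archive path is a placeholder, not a path the archiver
// actually uses):
//
//	w, err := newParquetArchiveWriter("/path/to/cluster.parquet")
//	// handle err
//	for _, hostRows := range perHostRows {
//		err = w.WriteHostRows(hostRows) // one row group per host
//	}
//	err = w.Close() // writes the footer and releases the file handle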
// newParquetArchiveWriter creates a streaming Parquet writer with Zstd compression.
func newParquetArchiveWriter(filename string) (*parquetArchiveWriter, error) {
	if err := os.MkdirAll(filepath.Dir(filename), CheckpointDirPerms); err != nil {
		return nil, fmt.Errorf("creating archive directory: %w", err)
	}
	f, err := os.OpenFile(filename, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, CheckpointFilePerms)
	if err != nil {
		return nil, fmt.Errorf("creating parquet file: %w", err)
	}
	bw := bufio.NewWriterSize(f, 1<<20) // 1MB write buffer
	writer := pq.NewGenericWriter[ParquetMetricRow](bw,
		pq.Compression(&pq.Zstd),
	)
	return &parquetArchiveWriter{writer: writer, bw: bw, f: f}, nil
}

// WriteHostRows sorts rows by (metric, timestamp) in-place, writes them,
// and flushes to create a separate row group.
func (w *parquetArchiveWriter) WriteHostRows(rows []ParquetMetricRow) error {
	sortParquetRows(rows)
	if _, err := w.writer.Write(rows); err != nil {
		return fmt.Errorf("writing parquet rows: %w", err)
	}
	if err := w.writer.Flush(); err != nil {
		return fmt.Errorf("flushing parquet row group: %w", err)
	}
	w.count += len(rows)
	return nil
}

// Close finalises the Parquet file (footer, buffered I/O, file handle).
func (w *parquetArchiveWriter) Close() error {
	if err := w.writer.Close(); err != nil {
		w.f.Close()
		return fmt.Errorf("closing parquet writer: %w", err)
	}
	if err := w.bw.Flush(); err != nil {
		w.f.Close()
		return fmt.Errorf("flushing parquet file: %w", err)
	}
	return w.f.Close()
}

// sortParquetRows sorts rows by (metric, timestamp) in-place.
func sortParquetRows(rows []ParquetMetricRow) {
	sort.Slice(rows, func(i, j int) bool {
		if rows[i].Metric != rows[j].Metric {
			return rows[i].Metric < rows[j].Metric
		}
		return rows[i].Timestamp < rows[j].Timestamp
	})
}

// loadCheckpointFileFromDisk reads a JSON or binary checkpoint file and returns
// a CheckpointFile. Used by the Parquet archiver to read checkpoint data
// before converting it to Parquet format.
func loadCheckpointFileFromDisk(filename string) (*CheckpointFile, error) {
	f, err := os.Open(filename)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	ext := filepath.Ext(filename)
	switch ext {
	case ".json":
		cf := &CheckpointFile{}
		br := bufio.NewReader(f)
		if err := json.NewDecoder(br).Decode(cf); err != nil {
			return nil, fmt.Errorf("decoding JSON checkpoint %s: %w", filename, err)
		}
		return cf, nil
	case ".bin":
		br := bufio.NewReader(f)
		var magic uint32
		if err := binary.Read(br, binary.LittleEndian, &magic); err != nil {
			return nil, fmt.Errorf("reading magic from %s: %w", filename, err)
		}
		if magic != snapFileMagic {
			return nil, fmt.Errorf("invalid snapshot magic in %s: 0x%08X", filename, magic)
		}
		var fileFrom, fileTo int64
		if err := binary.Read(br, binary.LittleEndian, &fileFrom); err != nil {
			return nil, fmt.Errorf("reading from-timestamp from %s: %w", filename, err)
		}
		if err := binary.Read(br, binary.LittleEndian, &fileTo); err != nil {
			return nil, fmt.Errorf("reading to-timestamp from %s: %w", filename, err)
		}
		cf, err := readBinaryLevel(br)
		if err != nil {
			return nil, fmt.Errorf("reading binary level from %s: %w", filename, err)
		}
		cf.From = fileFrom
		cf.To = fileTo
		return cf, nil
	default:
		return nil, fmt.Errorf("unsupported checkpoint extension: %s", ext)
	}
}
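// Binary snapshot layout consumed by loadCheckpointFileFromDisk, as implied
// by the reads above (all fields little-endian; snapFileMagic and
// readBinaryLevel are defined elsewhere in this package):
//
//	uint32 magic // must equal snapFileMagic
//	int64  from  // checkpoint start timestamp (stored into cf.From)
//	int64  to    // checkpoint end timestamp (stored into cf.To)
//	...          // recursive level payload, decoded by readBinaryLevel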
// estimateRowCount estimates the number of Parquet rows a CheckpointFile will produce.
// Used for pre-allocating the rows slice to avoid repeated append doubling.
func estimateRowCount(cf *CheckpointFile) int {
	n := 0
	for _, cm := range cf.Metrics {
		n += len(cm.Data)
	}
	for _, child := range cf.Children {
		n += estimateRowCount(child)
	}
	return n
}

// archiveCheckpointsToParquet reads the checkpoint files in a host's directory
// and converts them to Parquet rows. It returns the rows and the filenames that
// were processed.
func archiveCheckpointsToParquet(dir, cluster, host string, from int64) ([]ParquetMetricRow, []string, error) {
	entries, err := os.ReadDir(dir)
	if err != nil {
		return nil, nil, err
	}
	files, err := findFiles(entries, from, false)
	if err != nil {
		return nil, nil, err
	}
	if len(files) == 0 {
		return nil, nil, nil
	}

	// First pass: load checkpoints and estimate total rows for pre-allocation.
	type loaded struct {
		cf       *CheckpointFile
		filename string
	}
	var checkpoints []loaded
	totalEstimate := 0
	for _, checkpoint := range files {
		filename := filepath.Join(dir, checkpoint)
		cf, err := loadCheckpointFileFromDisk(filename)
		if err != nil {
			cclog.Warnf("[METRICSTORE]> skipping unreadable checkpoint %s: %v", filename, err)
			continue
		}
		totalEstimate += estimateRowCount(cf)
		checkpoints = append(checkpoints, loaded{cf: cf, filename: checkpoint})
	}
	if len(checkpoints) == 0 {
		return nil, nil, nil
	}

	rows := make([]ParquetMetricRow, 0, totalEstimate)
	processedFiles := make([]string, 0, len(checkpoints))
	for _, cp := range checkpoints {
		rows = flattenCheckpointFile(cp.cf, cluster, host, "node", "", rows)
		processedFiles = append(processedFiles, cp.filename)
	}
	return rows, processedFiles, nil
}
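// archiveHostToParquetSketch is an illustrative, hypothetical composition of
// the helpers in this file; it is not part of the original archiver. It
// converts one host's checkpoints and writes them as a single row group.
// The caller owns the writer and is expected to Close it after all hosts.
func archiveHostToParquetSketch(w *parquetArchiveWriter, dir, cluster, host string, from int64) ([]string, error) {
	rows, files, err := archiveCheckpointsToParquet(dir, cluster, host, from)
	if err != nil {
		return nil, err
	}
	if len(rows) == 0 {
		return nil, nil // no readable checkpoints newer than 'from'
	}
	if err := w.WriteHostRows(rows); err != nil {
		return nil, err
	}
	return files, nil
}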