import pandas as pd
import numpy as np
import ast
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn import set_config
import joblib

set_config(enable_metadata_routing=True)

# ---------------------------------------------------------
# 1. Load & clean
# ---------------------------------------------------------
print("Loading data from roofline_features.h5...")
df = pd.read_hdf("roofline_features.h5", key="features")
print(f"Loaded {len(df)} samples with {len(df.columns)} columns")

print("Cleaning data...")
original_shape = df.shape

# Drop empty columns
df.dropna(axis=1, how="all", inplace=True)

# Treat empty strings as NaN, then drop rows with any NaN
df.replace(r"^\s*$", np.nan, regex=True, inplace=True)
df.dropna(axis=0, how="any", inplace=True)
print(f"After cleaning: {len(df)} samples remaining "
      f"(removed {original_shape[0] - len(df)} rows, "
      f"{original_shape[1] - len(df.columns)} columns)")

# ---------------------------------------------------------
# 2. Parse label column into Python lists
# ---------------------------------------------------------
print("Parsing labels...")

def parse_label(x):
    # Convert a string like '["OpenFOAM","Gaussian"]' to a list
    if isinstance(x, str):
        try:
            return list(ast.literal_eval(x))
        except Exception:
            return []
    elif isinstance(x, list):
        return x
    else:
        return []
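
# e.g. parse_label('["OpenFOAM","Gaussian"]') -> ['OpenFOAM', 'Gaussian']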
df["label"] = df["label"].apply(parse_label)

# Drop rows where the label list is empty after parsing
original_len = len(df)
df = df[df["label"].map(len) > 0]
print(f"Parsed labels: {len(df)} samples remaining "
      f"(removed {original_len - len(df)} samples with empty labels)")

# ---------------------------------------------------------
# 3. Features and multi-label target
# ---------------------------------------------------------
print("Preparing features and targets...")
X = df.drop(columns=["job_id", "label"])
y_lists = df["label"]

mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(y_lists)
all_classes = mlb.classes_
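# e.g. mlb.fit_transform([["OpenFOAM"], ["OpenFOAM", "Gaussian"]]) gives
# [[0, 1], [1, 1]], with classes_ == ['Gaussian', 'OpenFOAM'] (sorted)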

print(f"Feature matrix shape: {X.shape}")
print(f"Target matrix shape: {Y.shape}")
print(f"Number of unique classes: {len(all_classes)}")
print(f"Classes: {list(all_classes)}")

# ---------------------------------------------------------
# 4. Split (stratification isn't directly supported for multi-label)
# ---------------------------------------------------------
print("Splitting data into train/validation sets...")
X_train, X_val, Y_train, Y_val = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Validation set: {X_val.shape[0]} samples")

# ---------------------------------------------------------
# 5. Handle imbalance
# ---------------------------------------------------------
print("Calculating sample weights for class imbalance...")
# For each label column, compute a weight: N / (2 * count).
# Then weight each sample by the sum of its label weights (simple heuristic).
label_counts = Y_train.sum(axis=0)
weights_per_class = len(Y_train) / (2.0 * (label_counts + 1e-6))
sample_weights = (Y_train * weights_per_class).sum(axis=1)
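# e.g. with 1000 training samples, a label seen 50 times gets weight
# 1000 / (2 * 50) = 10.0, so samples carrying rare labels weigh more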

print("Label frequencies in training set:")
for i, class_name in enumerate(all_classes):
    count = label_counts[i]
    percentage = (count / len(Y_train)) * 100
    print(f"  {class_name}: {int(count)} ({percentage:.1f}%)")

print(f"Mean sample weight: {sample_weights.mean():.3f}")

# ---------------------------------------------------------
# 6. Train One-vs-Rest XGBoost
# ---------------------------------------------------------
print("Setting up XGBoost model...")
# Each label gets its own binary XGB classifier
xgb_base = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    tree_method="hist",  # or "gpu_hist" if a GPU is available
    learning_rate=0.1,
    max_depth=6,
    n_estimators=300,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)

# Enable metadata routing so sample_weight reaches each per-label estimator
xgb_base.set_fit_request(sample_weight=True)

model = OneVsRestClassifier(xgb_base, n_jobs=-1)
print("Training One-vs-Rest XGBoost model...")
|
|
# This may take a while depending on dataset size and number of classes
|
|
model.fit(X_train, Y_train, sample_weight=sample_weights)
|
|
print("Training completed!")
|
|
|

# ---------------------------------------------------------
# 7. Evaluate
# ---------------------------------------------------------
print("Evaluating model on validation set...")
Y_pred = model.predict(X_val)
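# predict() applies a 0.5 probability threshold per label;
# model.predict_proba(X_val) is available if thresholds ever need tuning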

# Only score classes that actually appear in y_true or y_pred
present = np.where((Y_val.sum(axis=0) + Y_pred.sum(axis=0)) > 0)[0]

print("Classification Report:")
print("=" * 50)
print(
    classification_report(
        Y_val,
        Y_pred,
        labels=present,
        target_names=all_classes[present],
        zero_division=0,
    )
)
print("Saving model...")
|
|
model_data = {
|
|
'model': model,
|
|
'mlb': mlb,
|
|
'feature_columns': list(X.columns)
|
|
}
|
|
joblib.dump(model_data, 'xgb_model.joblib')
|
|
print("Model saved to 'xgb_model.joblib'")
|
|
print("To load the model later, use: model_data = joblib.load('xgb_model.joblib')")
|
|
print("Processing complete!") |