Initial commit

Bole Ma
2025-12-10 12:17:41 +01:00
commit 739563f916
12 changed files with 3428 additions and 0 deletions

xgb_local.py (new file, 530 lines)

@@ -0,0 +1,530 @@
import pandas as pd
import numpy as np
import joblib
import json
from typing import Dict, List, Tuple, Union, Optional
import warnings
warnings.filterwarnings('ignore')
from scipy import stats
def compute_mad(data: np.ndarray) -> float:
"""Compute Median Absolute Deviation."""
median = np.median(data)
mad = np.median(np.abs(data - median))
return mad
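# Worked example (illustrative, not from the original source): for the samples
# [1, 2, 3, 4, 100] the median is 3, the absolute deviations are
# [2, 1, 0, 1, 97], and their median -- the MAD -- is 1. A single large outlier
# therefore barely moves the statistic, which is why it is used below in place
# of the variance.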
def df_aggregate(json_str: str, job_id_full: Optional[str] = None) -> Dict:
"""
Aggregate roofline data from JSON string into a single feature vector.
Args:
json_str: JSON string containing roofline data records
job_id_full: Optional job ID to include in the result
Returns:
Dictionary containing aggregated features
"""
# Parse JSON string to DataFrame
try:
data = json.loads(json_str)
if isinstance(data, list):
df = pd.DataFrame(data)
elif isinstance(data, dict):
df = pd.DataFrame([data])
else:
raise ValueError("JSON must contain a list of objects or a single object")
except json.JSONDecodeError as e:
raise ValueError(f"Invalid JSON string: {e}")
# Group data by node_num
if 'node_num' not in df.columns:
# If no node_num, treat all data as single node
df['node_num'] = 1
grouped = df.groupby('node_num')
all_features = []
for node_num, group in grouped:
features = {
'node_num': int(node_num)
}
if job_id_full is not None:
features['job_id'] = job_id_full
# Compute statistics for key metrics
for axis in ['bandwidth_raw', 'flops_raw', 'arith_intensity']:
data = group[axis].values
# Compute percentiles
p10 = np.percentile(data, 10)
p50 = np.median(data)
p90 = np.percentile(data, 90)
# Compute MAD (more robust than variance)
mad = compute_mad(data)
# Store features
features[f'{axis}_p10'] = p10
features[f'{axis}_median'] = p50
features[f'{axis}_p90'] = p90
features[f'{axis}_mad'] = mad
features[f'{axis}_range'] = p90 - p10
features[f'{axis}_iqr'] = np.percentile(data, 75) - np.percentile(data, 25)
# Compute covariance and correlation between bandwidth_raw and flops_raw
if len(group) > 1: # Need at least 2 points for correlation
cov = np.cov(group['bandwidth_raw'], group['flops_raw'])[0, 1]
features['bw_flops_covariance'] = cov
corr, _ = stats.pearsonr(group['bandwidth_raw'], group['flops_raw'])
features['bw_flops_correlation'] = corr
# Additional useful features for the classifier
# Performance metrics
features['avg_performance_gflops'] = group['performance_gflops'].mean()
features['median_performance_gflops'] = group['performance_gflops'].median()
features['performance_gflops_mad'] = compute_mad(group['performance_gflops'].values)
# # Efficiency metrics
# features['avg_efficiency'] = group['efficiency'].mean()
# features['median_efficiency'] = group['efficiency'].median()
# features['efficiency_mad'] = compute_mad(group['efficiency'].values)
# features['efficiency_p10'] = np.percentile(group['efficiency'].values, 10)
# features['efficiency_p90'] = np.percentile(group['efficiency'].values, 90)
# # Distribution of roofline regions (memory-bound vs compute-bound)
# if 'roofline_region' in group.columns:
# region_counts = group['roofline_region'].value_counts(normalize=True).to_dict()
# for region, ratio in region_counts.items():
# features[f'region_{region}_ratio'] = ratio
# System characteristics
if 'memory_bw_gbs' in group.columns:
features['avg_memory_bw_gbs'] = group['memory_bw_gbs'].mean()
if 'scalar_peak_gflops' in group.columns and len(group['scalar_peak_gflops'].unique()) > 0:
features['scalar_peak_gflops'] = group['scalar_peak_gflops'].iloc[0]
if 'simd_peak_gflops' in group.columns and len(group['simd_peak_gflops'].unique()) > 0:
features['simd_peak_gflops'] = group['simd_peak_gflops'].iloc[0]
# # Subcluster information if available
# if 'subcluster_name' in group.columns and not group['subcluster_name'].isna().all():
# features['subcluster_name'] = group['subcluster_name'].iloc[0]
# Duration information
if 'duration' in group.columns:
features['duration'] = group['duration'].iloc[0]
all_features.append(features)
# Return first node's features (or combine multiple nodes if needed)
if len(all_features) == 1:
return all_features[0]
else:
# If multiple nodes, return the first one or average across nodes
# For now, return the first node's features
return all_features[0]
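# For orientation (derived from the code above): a typical result contains, per
# node, 'node_num' (plus 'job_id' if supplied), the six statistics
# _p10/_median/_p90/_mad/_range/_iqr for each of bandwidth_raw, flops_raw and
# arith_intensity, 'bw_flops_covariance'/'bw_flops_correlation' when at least
# two records are present, the performance_gflops aggregates, and the optional
# system columns (avg_memory_bw_gbs, scalar_peak_gflops, simd_peak_gflops,
# duration) when they exist in the input.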
class XGBoostMultiLabelPredictor:
"""
Python API for XGBoost multi-label classification inference.
Provides methods to load a trained model and perform inference with
confidence scores for each class.
"""
def __init__(self, model_path: str = 'xgb_model.joblib'):
"""
Initialize the predictor by loading the trained model.
Args:
model_path: Path to the saved model file (.joblib)
"""
self.model_data = None
self.model = None
self.mlb = None
self.feature_columns = None
self.n_features = 0
self.classes = []
self.load_model(model_path)
def load_model(self, model_path: str) -> None:
"""
Load the trained XGBoost model from disk.
Args:
model_path: Path to the saved model file
"""
try:
print(f"Loading model from {model_path}...")
self.model_data = joblib.load(model_path)
self.model = self.model_data['model']
self.mlb = self.model_data['mlb']
self.feature_columns = self.model_data['feature_columns']
self.classes = list(self.mlb.classes_)
self.n_features = len(self.feature_columns)
print("Model loaded successfully!")
print(f" - {len(self.classes)} classes: {self.classes}")
print(f" - {self.n_features} features: {self.feature_columns[:5]}...")
print(f" - Model type: {type(self.model).__name__}")
except Exception as e:
raise ValueError(f"Failed to load model from {model_path}: {e}")
def predict(self, features: Union[pd.DataFrame, np.ndarray, List, Dict, str],
threshold: float = 0.5,
return_all_probabilities: bool = True,
is_json: bool = False,
job_id: Optional[str] = None) -> Dict:
"""
Perform multi-label prediction on input features.
Args:
features: Input features in various formats:
- pandas DataFrame
- numpy array (2D)
- list of lists/dicts
- single feature vector (list/dict)
- JSON string (if is_json=True): roofline data to aggregate
threshold: Probability threshold for binary classification (0.0-1.0)
return_all_probabilities: If True, return probabilities for all classes.
If False, return only classes above threshold.
is_json: If True, treat features as JSON string of roofline data
job_id: Optional job ID (used when is_json=True)
Returns:
Dictionary containing:
- 'predictions': List of predicted class names
- 'probabilities': Dict of {class_name: probability} for all classes
- 'confidences': Dict of {class_name: confidence_score} for predicted classes
- 'threshold': The threshold used
"""
# If input is JSON string, aggregate features first
if is_json:
if not isinstance(features, str):
raise ValueError("When is_json=True, features must be a JSON string")
features = df_aggregate(features, job_id_full=job_id)
# Convert input to proper format
X = self._prepare_features(features)
# Get probability predictions
probabilities = self.model.predict_proba(X)
# Convert to class probabilities
class_probabilities = {}
for i, class_name in enumerate(self.classes):
# For OneVsRest, predict_proba returns shape (n_samples, n_classes)
# Each column i contains probabilities for class i
if isinstance(probabilities, list):
# List of arrays (multiple samples)
prob_array = probabilities[i]
prob_positive = prob_array[0] if hasattr(prob_array, '__getitem__') else float(prob_array)
else:
# 2D numpy array (single sample or batch)
if len(probabilities.shape) == 2:
# Shape: (n_samples, n_classes)
prob_positive = float(probabilities[0, i])
else:
# 1D array
prob_positive = float(probabilities[i])
class_probabilities[class_name] = prob_positive
# Apply threshold for predictions
predictions = []
confidences = {}
for class_name, prob in class_probabilities.items():
if prob >= threshold:
predictions.append(class_name)
# Confidence score: distance from threshold as percentage
confidence = min(1.0, (prob - threshold) / (1.0 - threshold)) * 100
confidences[class_name] = round(confidence, 2)
# Sort predictions by probability
predictions.sort(key=lambda x: class_probabilities[x], reverse=True)
result = {
'predictions': predictions,
'probabilities': {k: round(v, 4) for k, v in class_probabilities.items()},
'confidences': confidences,
'threshold': threshold
}
if not return_all_probabilities:
result['probabilities'] = {k: v for k, v in result['probabilities'].items()
if k in predictions}
return result
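# Illustrative return value (class names and numbers are made up; the real
# labels come from mlb.classes_):
#   {'predictions': ['memory_bound', 'low_utilization'],
#    'probabilities': {'memory_bound': 0.91, 'low_utilization': 0.64, ...},
#    'confidences': {'memory_bound': 82.0, 'low_utilization': 28.0},
#    'threshold': 0.5}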
def predict_top_k(self, features: Union[pd.DataFrame, np.ndarray, List, Dict, str],
k: int = 5,
is_json: bool = False,
job_id: Optional[str] = None) -> Dict:
"""
Get top-k predictions with their probabilities.
Args:
features: Input features (various formats) or JSON string if is_json=True
k: Number of top predictions to return
is_json: If True, treat features as JSON string of roofline data
job_id: Optional job ID (used when is_json=True)
Returns:
Dictionary with top-k predictions and their details
"""
# If input is JSON string, aggregate features first
if is_json:
if not isinstance(features, str):
raise ValueError("When is_json=True, features must be a JSON string")
features = df_aggregate(features, job_id_full=job_id)
# Get all probabilities
X = self._prepare_features(features)
probabilities = self.model.predict_proba(X)
class_probabilities = {}
for i, class_name in enumerate(self.classes):
# For OneVsRest, predict_proba returns shape (n_samples, n_classes)
# Each column i contains probabilities for class i
if isinstance(probabilities, list):
# List of arrays (multiple samples)
prob_array = probabilities[i]
prob_positive = prob_array[0] if hasattr(prob_array, '__getitem__') else float(prob_array)
else:
# 2D numpy array (single sample or batch)
if len(probabilities.shape) == 2:
# Shape: (n_samples, n_classes)
prob_positive = float(probabilities[0, i])
else:
# 1D array
prob_positive = float(probabilities[i])
class_probabilities[class_name] = prob_positive
# Sort by probability
sorted_classes = sorted(class_probabilities.items(),
key=lambda x: x[1], reverse=True)
top_k_classes = sorted_classes[:k]
return {
'top_predictions': [cls for cls, _ in top_k_classes],
'top_probabilities': {cls: round(prob, 4) for cls, prob in top_k_classes},
'all_probabilities': {cls: round(prob, 4) for cls, prob in class_probabilities.items()}
}
def _prepare_features(self, features: Union[pd.DataFrame, np.ndarray, List, Dict]) -> pd.DataFrame:
"""
Convert various input formats to the expected feature format.
Args:
features: Input features in various formats
Returns:
pandas DataFrame with correct columns and order
"""
if isinstance(features, pd.DataFrame):
df = features.copy()
elif isinstance(features, np.ndarray):
if features.ndim == 1:
features = features.reshape(1, -1)
df = pd.DataFrame(features, columns=self.feature_columns[:features.shape[1]])
elif isinstance(features, list):
if isinstance(features[0], dict):
# List of dictionaries
df = pd.DataFrame(features)
else:
# List of lists
df = pd.DataFrame(features, columns=self.feature_columns[:len(features[0])])
elif isinstance(features, dict):
# Single feature dictionary
df = pd.DataFrame([features])
else:
raise ValueError(f"Unsupported feature format: {type(features)}")
# Ensure correct column order and fill missing columns with 0
for col in self.feature_columns:
if col not in df.columns:
df[col] = 0.0
df = df[self.feature_columns]
# Validate feature count
if df.shape[1] != self.n_features:
raise ValueError(f"Expected {self.n_features} features, got {df.shape[1]}")
return df
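# Note (behaviour of the method above): a partial input such as
# {'node_num': 4, 'bandwidth_raw_median': 210.0} (illustrative values) is
# accepted; every trained feature column missing from the input is silently
# filled with 0.0, so callers should pass the full aggregated dictionary from
# df_aggregate() whenever possible.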
def get_class_info(self) -> Dict:
"""
Get information about available classes.
Returns:
Dictionary with class information
"""
return {
'classes': self.classes,
'n_classes': len(self.classes),
'feature_columns': self.feature_columns,
'n_features': self.n_features
}
def batch_predict(self, features_list: List[Union[pd.DataFrame, np.ndarray, List, Dict, str]],
threshold: float = 0.5,
is_json: bool = False,
job_ids: Optional[List[str]] = None) -> List[Dict]:
"""
Perform batch prediction on multiple samples.
Args:
features_list: List of feature inputs (or JSON strings if is_json=True)
threshold: Probability threshold
is_json: If True, treat each item in features_list as JSON string
job_ids: Optional list of job IDs (used when is_json=True)
Returns:
List of prediction results
"""
results = []
for idx, features in enumerate(features_list):
try:
job_id = job_ids[idx] if job_ids and idx < len(job_ids) else None
result = self.predict(features, threshold=threshold, is_json=is_json, job_id=job_id)
results.append(result)
except Exception as e:
results.append({'error': str(e)})
return results
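# Example call pattern (hypothetical variable names), scoring several raw
# roofline JSON payloads in one pass:
#   results = predictor.batch_predict([json_str_a, json_str_b], threshold=0.4,
#                                     is_json=True, job_ids=['job_a', 'job_b'])
# A failure on one sample is returned as {'error': ...} without aborting the
# rest of the batch.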
def create_sample_data(n_samples: int = 5) -> List[Dict]:
"""
Create sample feature data for testing.
Args:
n_samples: Number of sample feature vectors to create
Returns:
List of feature dictionaries
"""
np.random.seed(42)
# Load feature columns from model if available
try:
model_data = joblib.load('xgb_model.joblib')
feature_columns = model_data['feature_columns']
except Exception:
# Fallback to some default features
feature_columns = [
'node_num', 'bandwidth_raw_p10', 'bandwidth_raw_median',
'bandwidth_raw_p90', 'bandwidth_raw_mad', 'bandwidth_raw_range',
'bandwidth_raw_iqr', 'flops_raw_p10', 'flops_raw_median',
'flops_raw_p90', 'flops_raw_mad', 'flops_raw_range'
]
samples = []
for _ in range(n_samples):
sample = {}
for col in feature_columns:
if 'bandwidth' in col:
sample[col] = np.random.uniform(50, 500)
elif 'flops' in col:
sample[col] = np.random.uniform(100, 5000)
elif 'node_num' in col:
sample[col] = np.random.randint(1, 16)
else:
sample[col] = np.random.uniform(0, 1000)
samples.append(sample)
return samples
if __name__ == "__main__":
print("XGBoost Multi-Label Inference API")
print("=" * 40)
# Initialize predictor
try:
predictor = XGBoostMultiLabelPredictor()
except Exception as e:
print(f"Error loading model: {e}")
raise SystemExit(1)
# Example usage of df_aggregate with JSON string
print("\n=== Example 0: JSON Aggregation ===")
sample_json = json.dumps([
{
"node_num": 1,
"bandwidth_raw": 150.5,
"flops_raw": 2500.0,
"arith_intensity": 16.6,
"performance_gflops": 1200.0,
"memory_bw_gbs": 450,
"scalar_peak_gflops": 600,
"duration": 3600
},
{
"node_num": 2,
"bandwidth_raw": 155.2,
"flops_raw": 2600.0,
"arith_intensity": 16.8,
"performance_gflops": 1250.0,
"memory_bw_gbs": 450,
"scalar_peak_gflops": 600,
"duration": 3600
}
])
try:
aggregated_features = df_aggregate(sample_json, job_id_full="test_job_123")
print(f"Aggregated features from JSON:")
for key, value in list(aggregated_features.items())[:10]:
print(f" {key}: {value}")
print(f" ... ({len(aggregated_features)} total features)")
# Use aggregated features for prediction
result = predictor.predict(aggregated_features, threshold=0.3)
print(f"\nPredictions from aggregated data: {result['predictions'][:3]}")
except Exception as e:
print(f"Error in aggregation: {e}")
# Create sample data
print("\n=== Generating sample data for other examples ===")
sample_data = create_sample_data(3)
# Example 1: Single prediction
print("\n=== Example 1: Single Prediction ===")
result = predictor.predict(sample_data[0], threshold=0.3)
print(f"Predictions: {result['predictions']}")
print(f"Confidences: {result['confidences']}")
print(f"Top probabilities:")
for class_name, prob in sorted(result['probabilities'].items(),
key=lambda x: x[1], reverse=True)[:5]:
print(".4f")
# Example 2: Top-K predictions
print("\n=== Example 2: Top-5 Predictions ===")
top_result = predictor.predict_top_k(sample_data[1], k=5)
for i, class_name in enumerate(top_result['top_predictions'], 1):
prob = top_result['top_probabilities'][class_name]
print(f"{i}. {class_name}: {prob:.4f}")
# Example 3: Batch prediction
print("\n=== Example 3: Batch Prediction ===")
batch_results = predictor.batch_predict(sample_data, threshold=0.4)
for i, result in enumerate(batch_results, 1):
if 'error' not in result:
print(f"Sample {i}: {len(result['predictions'])} predictions")
else:
print(f"Sample {i}: Error - {result['error']}")
print("\nAPI ready for use!")
print("Usage:")
print(" predictor = XGBoostMultiLabelPredictor()")
print(" result = predictor.predict(your_features)")
print(" top_k = predictor.predict_top_k(your_features, k=5)")