import pandas as pd
import numpy as np
import joblib
import json
from typing import Dict, List, Tuple, Union, Optional
import warnings
warnings.filterwarnings('ignore')
from scipy import stats


def compute_mad(data: np.ndarray) -> float:
    """Compute Median Absolute Deviation."""
    median = np.median(data)
    mad = np.median(np.abs(data - median))
    return mad
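

# Illustrative note (added, not part of the original API): MAD is robust to outliers.
# For data = [1, 2, 3, 4, 100] the median is 3, the absolute deviations are
# [2, 1, 0, 1, 97], so the MAD is 1.0, whereas np.std(data) is roughly 39.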


def df_aggregate(json_str: str, job_id_full: Optional[str] = None) -> Dict:
    """
    Aggregate roofline data from JSON string into a single feature vector.

    Args:
        json_str: JSON string containing roofline data records
        job_id_full: Optional job ID to include in the result

    Returns:
        Dictionary containing aggregated features
    """
    # Parse JSON string to DataFrame
    try:
        data = json.loads(json_str)
        if isinstance(data, list):
            df = pd.DataFrame(data)
        elif isinstance(data, dict):
            df = pd.DataFrame([data])
        else:
            raise ValueError("JSON must contain a list of objects or a single object")
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON string: {e}")

    # Group data by node_num
    if 'node_num' not in df.columns:
        # If no node_num, treat all data as single node
        df['node_num'] = 1

    grouped = df.groupby('node_num')

    all_features = []
    for node_num, group in grouped:
        features = {
            'node_num': int(node_num)
        }

        if job_id_full is not None:
            features['job_id'] = job_id_full

        # Compute statistics for key metrics
        for axis in ['bandwidth_raw', 'flops_raw', 'arith_intensity']:
            data = group[axis].values

            # Compute percentiles
            p10 = np.percentile(data, 10)
            p50 = np.median(data)
            p90 = np.percentile(data, 90)

            # Compute MAD (more robust than variance)
            mad = compute_mad(data)

            # Store features
            features[f'{axis}_p10'] = p10
            features[f'{axis}_median'] = p50
            features[f'{axis}_p90'] = p90
            features[f'{axis}_mad'] = mad
            features[f'{axis}_range'] = p90 - p10
            features[f'{axis}_iqr'] = np.percentile(data, 75) - np.percentile(data, 25)
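
        # Illustrative note (added): for axis='bandwidth_raw' the loop above yields
        # the keys 'bandwidth_raw_p10', 'bandwidth_raw_median', 'bandwidth_raw_p90',
        # 'bandwidth_raw_mad', 'bandwidth_raw_range' and 'bandwidth_raw_iqr'.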

        # Compute covariance and correlation between bandwidth_raw and flops_raw
        if len(group) > 1:  # Need at least 2 points for correlation
            cov = np.cov(group['bandwidth_raw'], group['flops_raw'])[0, 1]
            features['bw_flops_covariance'] = cov

            corr, _ = stats.pearsonr(group['bandwidth_raw'], group['flops_raw'])
            features['bw_flops_correlation'] = corr

        # Additional useful features for the classifier

        # Performance metrics
        features['avg_performance_gflops'] = group['performance_gflops'].mean()
        features['median_performance_gflops'] = group['performance_gflops'].median()
        features['performance_gflops_mad'] = compute_mad(group['performance_gflops'].values)

        # # Efficiency metrics
        # features['avg_efficiency'] = group['efficiency'].mean()
        # features['median_efficiency'] = group['efficiency'].median()
        # features['efficiency_mad'] = compute_mad(group['efficiency'].values)
        # features['efficiency_p10'] = np.percentile(group['efficiency'].values, 10)
        # features['efficiency_p90'] = np.percentile(group['efficiency'].values, 90)

        # # Distribution of roofline regions (memory-bound vs compute-bound)
        # if 'roofline_region' in group.columns:
        #     region_counts = group['roofline_region'].value_counts(normalize=True).to_dict()
        #     for region, ratio in region_counts.items():
        #         features[f'region_{region}_ratio'] = ratio

        # System characteristics
        if 'memory_bw_gbs' in group.columns:
            features['avg_memory_bw_gbs'] = group['memory_bw_gbs'].mean()
        if 'scalar_peak_gflops' in group.columns and len(group['scalar_peak_gflops'].unique()) > 0:
            features['scalar_peak_gflops'] = group['scalar_peak_gflops'].iloc[0]
        if 'simd_peak_gflops' in group.columns and len(group['simd_peak_gflops'].unique()) > 0:
            features['simd_peak_gflops'] = group['simd_peak_gflops'].iloc[0]

        # # Subcluster information if available
        # if 'subcluster_name' in group.columns and not group['subcluster_name'].isna().all():
        #     features['subcluster_name'] = group['subcluster_name'].iloc[0]

        # Duration information
        if 'duration' in group.columns:
            features['duration'] = group['duration'].iloc[0]

        all_features.append(features)

    # Return first node's features (or combine multiple nodes if needed)
    if len(all_features) == 1:
        return all_features[0]
    else:
        # If multiple nodes, return the first one or average across nodes
        # For now, return the first node's features
        return all_features[0]
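

# Note (added): df_aggregate assumes each record provides the numeric fields
# 'bandwidth_raw', 'flops_raw', 'arith_intensity' and 'performance_gflops';
# records missing any of them will raise a KeyError during aggregation.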


class XGBoostMultiLabelPredictor:
    """
    Python API for XGBoost multi-label classification inference.

    Provides methods to load trained models and perform inference with
    confidence scores for each class.
    """

    def __init__(self, model_path: str = 'xgb_model.joblib'):
        """
        Initialize the predictor by loading the trained model.

        Args:
            model_path: Path to the saved model file (.joblib)
        """
        self.model_data = None
        self.model = None
        self.mlb = None
        self.feature_columns = None
        self.n_features = 0
        self.classes = []

        self.load_model(model_path)

    def load_model(self, model_path: str) -> None:
        """
        Load the trained XGBoost model from disk.

        Args:
            model_path: Path to the saved model file
        """
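        # Note (added): the joblib payload is expected to be a dict with at least
        # the keys 'model' (fitted multi-label estimator), 'mlb' (a fitted label
        # binarizer exposing .classes_) and 'feature_columns' (ordered list of
        # training feature names), presumably produced by the training script.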
        try:
            print(f"Loading model from {model_path}...")
            self.model_data = joblib.load(model_path)
            self.model = self.model_data['model']
            self.mlb = self.model_data['mlb']
            self.feature_columns = self.model_data['feature_columns']

            self.classes = list(self.mlb.classes_)
            self.n_features = len(self.feature_columns)

            print("Model loaded successfully!")
            print(f" - {len(self.classes)} classes: {self.classes}")
            print(f" - {self.n_features} features: {self.feature_columns[:5]}...")
            print(f" - Model type: {type(self.model).__name__}")

        except Exception as e:
            raise ValueError(f"Failed to load model from {model_path}: {e}")

    def predict(self, features: Union[pd.DataFrame, np.ndarray, List, Dict, str],
                threshold: float = 0.5,
                return_all_probabilities: bool = True,
                is_json: bool = False,
                job_id: Optional[str] = None) -> Dict:
        """
        Perform multi-label prediction on input features.

        Args:
            features: Input features in various formats:
                - pandas DataFrame
                - numpy array (2D)
                - list of lists/dicts
                - single feature vector (list/dict)
                - JSON string (if is_json=True): roofline data to aggregate
            threshold: Probability threshold for binary classification (0.0-1.0)
            return_all_probabilities: If True, return probabilities for all classes.
                If False, return only classes above threshold.
            is_json: If True, treat features as JSON string of roofline data
            job_id: Optional job ID (used when is_json=True)

        Returns:
            Dictionary containing:
                - 'predictions': List of predicted class names
                - 'probabilities': Dict of {class_name: probability} for all classes
                - 'confidences': Dict of {class_name: confidence_score} for predicted classes
                - 'threshold': The threshold used
        """
        # If input is JSON string, aggregate features first
        if is_json:
            if not isinstance(features, str):
                raise ValueError("When is_json=True, features must be a JSON string")
            features = df_aggregate(features, job_id_full=job_id)

        # Convert input to proper format
        X = self._prepare_features(features)

        # Get probability predictions
        probabilities = self.model.predict_proba(X)

        # Convert to class probabilities
        class_probabilities = {}
        for i, class_name in enumerate(self.classes):
            # For OneVsRest, predict_proba returns shape (n_samples, n_classes)
            # Each column i contains probabilities for class i
            if isinstance(probabilities, list):
                # List of arrays (multiple samples)
                prob_array = probabilities[i]
                prob_positive = prob_array[0] if hasattr(prob_array, '__getitem__') else float(prob_array)
            else:
                # 2D numpy array (single sample or batch)
                if len(probabilities.shape) == 2:
                    # Shape: (n_samples, n_classes)
                    prob_positive = float(probabilities[0, i])
                else:
                    # 1D array
                    prob_positive = float(probabilities[i])
            class_probabilities[class_name] = prob_positive

        # Apply threshold for predictions
        predictions = []
        confidences = {}

        for class_name, prob in class_probabilities.items():
            if prob >= threshold:
                predictions.append(class_name)
                # Confidence score: distance from threshold as percentage
                confidence = min(1.0, (prob - threshold) / (1.0 - threshold)) * 100
                confidences[class_name] = round(confidence, 2)
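
        # Worked example (added): with threshold=0.5, a class probability of 0.8
        # maps to a confidence of min(1.0, (0.8 - 0.5) / (1.0 - 0.5)) * 100 = 60.0,
        # and a probability of 1.0 maps to 100.0.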

        # Sort predictions by probability
        predictions.sort(key=lambda x: class_probabilities[x], reverse=True)

        result = {
            'predictions': predictions,
            'probabilities': {k: round(v, 4) for k, v in class_probabilities.items()},
            'confidences': confidences,
            'threshold': threshold
        }

        if not return_all_probabilities:
            result['probabilities'] = {k: v for k, v in result['probabilities'].items()
                                       if k in predictions}

        return result

    def predict_top_k(self, features: Union[pd.DataFrame, np.ndarray, List, Dict, str],
                      k: int = 5,
                      is_json: bool = False,
                      job_id: Optional[str] = None) -> Dict:
        """
        Get top-k predictions with their probabilities.

        Args:
            features: Input features (various formats) or JSON string if is_json=True
            k: Number of top predictions to return
            is_json: If True, treat features as JSON string of roofline data
            job_id: Optional job ID (used when is_json=True)

        Returns:
            Dictionary with top-k predictions and their details
        """
        # If input is JSON string, aggregate features first
        if is_json:
            if not isinstance(features, str):
                raise ValueError("When is_json=True, features must be a JSON string")
            features = df_aggregate(features, job_id_full=job_id)

        # Get all probabilities
        X = self._prepare_features(features)
        probabilities = self.model.predict_proba(X)

        class_probabilities = {}
        for i, class_name in enumerate(self.classes):
            # For OneVsRest, predict_proba returns shape (n_samples, n_classes)
            # Each column i contains probabilities for class i
            if isinstance(probabilities, list):
                # List of arrays (multiple samples)
                prob_array = probabilities[i]
                prob_positive = prob_array[0] if hasattr(prob_array, '__getitem__') else float(prob_array)
            else:
                # 2D numpy array (single sample or batch)
                if len(probabilities.shape) == 2:
                    # Shape: (n_samples, n_classes)
                    prob_positive = float(probabilities[0, i])
                else:
                    # 1D array
                    prob_positive = float(probabilities[i])
            class_probabilities[class_name] = prob_positive

        # Sort by probability
        sorted_classes = sorted(class_probabilities.items(),
                                key=lambda x: x[1], reverse=True)

        top_k_classes = sorted_classes[:k]

        return {
            'top_predictions': [cls for cls, _ in top_k_classes],
            'top_probabilities': {cls: round(prob, 4) for cls, prob in top_k_classes},
            'all_probabilities': {cls: round(prob, 4) for cls, prob in class_probabilities.items()}
        }

    def _prepare_features(self, features: Union[pd.DataFrame, np.ndarray, List, Dict]) -> pd.DataFrame:
        """
        Convert various input formats to the expected feature format.

        Args:
            features: Input features in various formats

        Returns:
            pandas DataFrame with correct columns and order
        """
        if isinstance(features, pd.DataFrame):
            df = features.copy()
        elif isinstance(features, np.ndarray):
            if features.ndim == 1:
                features = features.reshape(1, -1)
            df = pd.DataFrame(features, columns=self.feature_columns[:features.shape[1]])
        elif isinstance(features, list):
            if isinstance(features[0], dict):
                # List of dictionaries
                df = pd.DataFrame(features)
            else:
                # List of lists
                df = pd.DataFrame(features, columns=self.feature_columns[:len(features[0])])
        elif isinstance(features, dict):
            # Single feature dictionary
            df = pd.DataFrame([features])
        else:
            raise ValueError(f"Unsupported feature format: {type(features)}")

        # Ensure correct column order and fill missing columns with 0
        for col in self.feature_columns:
            if col not in df.columns:
                df[col] = 0.0
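
        # Note (added): any training feature absent from the input is silently
        # filled with 0.0 here; e.g. a dict lacking a key such as 'flops_raw_mad'
        # would still be scored, but with that feature zeroed, which can shift the
        # predicted probabilities.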

        df = df[self.feature_columns]

        # Validate feature count
        if df.shape[1] != self.n_features:
            raise ValueError(f"Expected {self.n_features} features, got {df.shape[1]}")

        return df

    def get_class_info(self) -> Dict:
        """
        Get information about available classes.

        Returns:
            Dictionary with class information
        """
        return {
            'classes': self.classes,
            'n_classes': len(self.classes),
            'feature_columns': self.feature_columns,
            'n_features': self.n_features
        }

    def batch_predict(self, features_list: List[Union[pd.DataFrame, np.ndarray, List, Dict, str]],
                      threshold: float = 0.5,
                      is_json: bool = False,
                      job_ids: Optional[List[str]] = None) -> List[Dict]:
        """
        Perform batch prediction on multiple samples.

        Args:
            features_list: List of feature inputs (or JSON strings if is_json=True)
            threshold: Probability threshold
            is_json: If True, treat each item in features_list as a JSON string
            job_ids: Optional list of job IDs (used when is_json=True)

        Returns:
            List of prediction results
        """
        results = []
        for idx, features in enumerate(features_list):
            try:
                job_id = job_ids[idx] if job_ids and idx < len(job_ids) else None
                result = self.predict(features, threshold=threshold, is_json=is_json, job_id=job_id)
                results.append(result)
            except Exception as e:
                results.append({'error': str(e)})

        return results


def create_sample_data(n_samples: int = 5) -> List[Dict]:
    """
    Create sample feature data for testing.

    Args:
        n_samples: Number of sample feature vectors to create

    Returns:
        List of feature dictionaries
    """
    np.random.seed(42)

    # Load feature columns from model if available
    try:
        model_data = joblib.load('xgb_model.joblib')
        feature_columns = model_data['feature_columns']
    except Exception:
        # Fallback to some default features
        feature_columns = [
            'node_num', 'bandwidth_raw_p10', 'bandwidth_raw_median',
            'bandwidth_raw_p90', 'bandwidth_raw_mad', 'bandwidth_raw_range',
            'bandwidth_raw_iqr', 'flops_raw_p10', 'flops_raw_median',
            'flops_raw_p90', 'flops_raw_mad', 'flops_raw_range'
        ]

    samples = []
    for _ in range(n_samples):
        sample = {}
        for col in feature_columns:
            if 'bandwidth' in col:
                sample[col] = np.random.uniform(50, 500)
            elif 'flops' in col:
                sample[col] = np.random.uniform(100, 5000)
            elif 'node_num' in col:
                sample[col] = np.random.randint(1, 16)
            else:
                sample[col] = np.random.uniform(0, 1000)
        samples.append(sample)

    return samples


if __name__ == "__main__":
    print("XGBoost Multi-Label Inference API")
    print("=" * 40)

    # Initialize predictor
    try:
        predictor = XGBoostMultiLabelPredictor()
    except Exception as e:
        print(f"Error loading model: {e}")
        exit(1)

    # Example usage of df_aggregate with JSON string
    print("\n=== Example 0: JSON Aggregation ===")
    sample_json = json.dumps([
        {
            "node_num": 1,
            "bandwidth_raw": 150.5,
            "flops_raw": 2500.0,
            "arith_intensity": 16.6,
            "performance_gflops": 1200.0,
            "memory_bw_gbs": 450,
            "scalar_peak_gflops": 600,
            "duration": 3600
        },
        {
            "node_num": 2,
            "bandwidth_raw": 155.2,
            "flops_raw": 2600.0,
            "arith_intensity": 16.8,
            "performance_gflops": 1250.0,
            "memory_bw_gbs": 450,
            "scalar_peak_gflops": 600,
            "duration": 3600
        }
    ])

    try:
        aggregated_features = df_aggregate(sample_json, job_id_full="test_job_123")
        print("Aggregated features from JSON:")
        for key, value in list(aggregated_features.items())[:10]:
            print(f" {key}: {value}")
        print(f" ... ({len(aggregated_features)} total features)")

        # Use aggregated features for prediction
        result = predictor.predict(aggregated_features, threshold=0.3)
        print(f"\nPredictions from aggregated data: {result['predictions'][:3]}")
    except Exception as e:
        print(f"Error in aggregation: {e}")

    # Create sample data
    print("\n=== Generating sample data for other examples ===")
    sample_data = create_sample_data(3)

    # Example 1: Single prediction
    print("\n=== Example 1: Single Prediction ===")
    result = predictor.predict(sample_data[0], threshold=0.3)
    print(f"Predictions: {result['predictions']}")
    print(f"Confidences: {result['confidences']}")
    print("Top probabilities:")
    for class_name, prob in sorted(result['probabilities'].items(),
                                   key=lambda x: x[1], reverse=True)[:5]:
        print(f" {class_name}: {prob:.4f}")

    # Example 2: Top-K predictions
    print("\n=== Example 2: Top-5 Predictions ===")
    top_result = predictor.predict_top_k(sample_data[1], k=5)
    for i, class_name in enumerate(top_result['top_predictions'], 1):
        prob = top_result['top_probabilities'][class_name]
        print(f"{i}. {class_name}: {prob:.4f}")

    # Example 3: Batch prediction
    print("\n=== Example 3: Batch Prediction ===")
    batch_results = predictor.batch_predict(sample_data, threshold=0.4)
    for i, result in enumerate(batch_results, 1):
        if 'error' not in result:
            print(f"Sample {i}: {len(result['predictions'])} predictions")
        else:
            print(f"Sample {i}: Error - {result['error']}")

    print("\nAPI ready for use!")
    print("Usage:")
    print(" predictor = XGBoostMultiLabelPredictor()")
    print(" result = predictor.predict(your_features)")
    print(" top_k = predictor.predict_top_k(your_features, k=5)")