import json
import warnings
from typing import Dict, List, Optional, Union

import joblib
import numpy as np
import pandas as pd
from scipy import stats

warnings.filterwarnings('ignore')


def compute_mad(data: np.ndarray) -> float:
    """Compute the Median Absolute Deviation (MAD)."""
    median = np.median(data)
    return np.median(np.abs(data - median))


def df_aggregate(json_str: str, job_id_full: Optional[str] = None) -> Dict:
    """
    Aggregate roofline data from a JSON string into a single feature vector.

    Args:
        json_str: JSON string containing roofline data records
        job_id_full: Optional job ID to include in the result

    Returns:
        Dictionary containing aggregated features
    """
    # Parse the JSON string into a DataFrame
    try:
        data = json.loads(json_str)
        if isinstance(data, list):
            df = pd.DataFrame(data)
        elif isinstance(data, dict):
            df = pd.DataFrame([data])
        else:
            raise ValueError("JSON must contain a list of objects or a single object")
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON string: {e}")

    # Group data by node_num; if absent, treat all records as a single node
    if 'node_num' not in df.columns:
        df['node_num'] = 1
    grouped = df.groupby('node_num')

    all_features = []
    for node_num, group in grouped:
        features = {'node_num': int(node_num)}
        if job_id_full is not None:
            features['job_id'] = job_id_full

        # Robust summary statistics for the key roofline metrics
        for axis in ['bandwidth_raw', 'flops_raw', 'arith_intensity']:
            values = group[axis].values

            # Percentiles
            p10 = np.percentile(values, 10)
            p50 = np.median(values)
            p90 = np.percentile(values, 90)

            # MAD (more robust than the variance)
            mad = compute_mad(values)

            features[f'{axis}_p10'] = p10
            features[f'{axis}_median'] = p50
            features[f'{axis}_p90'] = p90
            features[f'{axis}_mad'] = mad
            features[f'{axis}_range'] = p90 - p10
            features[f'{axis}_iqr'] = np.percentile(values, 75) - np.percentile(values, 25)

        # Covariance and correlation between bandwidth_raw and flops_raw
        if len(group) > 1:  # need at least 2 points for correlation
            cov = np.cov(group['bandwidth_raw'], group['flops_raw'])[0, 1]
            features['bw_flops_covariance'] = cov
            corr, _ = stats.pearsonr(group['bandwidth_raw'], group['flops_raw'])
            features['bw_flops_correlation'] = corr

        # Additional performance metrics useful for the classifier
        features['avg_performance_gflops'] = group['performance_gflops'].mean()
        features['median_performance_gflops'] = group['performance_gflops'].median()
        features['performance_gflops_mad'] = compute_mad(group['performance_gflops'].values)

        # # Efficiency metrics
        # features['avg_efficiency'] = group['efficiency'].mean()
        # features['median_efficiency'] = group['efficiency'].median()
        # features['efficiency_mad'] = compute_mad(group['efficiency'].values)
        # features['efficiency_p10'] = np.percentile(group['efficiency'].values, 10)
        # features['efficiency_p90'] = np.percentile(group['efficiency'].values, 90)

        # # Distribution of roofline regions (memory-bound vs compute-bound)
        # if 'roofline_region' in group.columns:
        #     region_counts = group['roofline_region'].value_counts(normalize=True).to_dict()
        #     for region, ratio in region_counts.items():
        #         features[f'region_{region}_ratio'] = ratio

        # System characteristics
        if 'memory_bw_gbs' in group.columns:
            features['avg_memory_bw_gbs'] = group['memory_bw_gbs'].mean()
        if 'scalar_peak_gflops' in group.columns and len(group['scalar_peak_gflops'].unique()) > 0:
            features['scalar_peak_gflops'] = group['scalar_peak_gflops'].iloc[0]
        if 'simd_peak_gflops' in group.columns and len(group['simd_peak_gflops'].unique()) > 0:
            features['simd_peak_gflops'] = group['simd_peak_gflops'].iloc[0]

        # # Subcluster information if available
        # if 'subcluster_name' in group.columns and not group['subcluster_name'].isna().all():
        #     features['subcluster_name'] = group['subcluster_name'].iloc[0]

        # Duration information
        if 'duration' in group.columns:
            features['duration'] = group['duration'].iloc[0]

        all_features.append(features)

    # Return the first node's features. With multiple nodes one could also
    # combine them (e.g. average across nodes); for now the first node is used.
    return all_features[0]
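

# df_aggregate returns only the first node's feature vector. The comment above also
# mentions averaging across nodes as an alternative; the helper below is a minimal,
# optional sketch of that idea (it is not called anywhere in this module). It keeps
# the non-numeric fields of the first node (e.g. job_id) and averages every numeric
# field shared by the per-node dictionaries.
def average_features_across_nodes(all_features: List[Dict]) -> Dict:
    """Sketch: average numeric features across per-node feature dictionaries."""
    if len(all_features) == 1:
        return all_features[0]
    averaged = dict(all_features[0])
    numeric_keys = [key for key, value in all_features[0].items()
                    if isinstance(value, (int, float)) and key != 'node_num']
    for key in numeric_keys:
        values = [f[key] for f in all_features if key in f]
        averaged[key] = float(np.mean(values))
    return averaged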


class XGBoostMultiLabelPredictor:
    """
    Python API for XGBoost multi-label classification inference.

    Provides methods to load trained models and perform inference with
    confidence scores for each class.
    """

    def __init__(self, model_path: str = 'xgb_model.joblib'):
        """
        Initialize the predictor by loading the trained model.

        Args:
            model_path: Path to the saved model file (.joblib)
        """
        self.model_data = None
        self.model = None
        self.mlb = None
        self.feature_columns = None
        self.n_features = 0
        self.classes = []
        self.load_model(model_path)

    def load_model(self, model_path: str) -> None:
        """
        Load the trained XGBoost model from disk.

        Args:
            model_path: Path to the saved model file
        """
        try:
            print(f"Loading model from {model_path}...")
            self.model_data = joblib.load(model_path)
            self.model = self.model_data['model']
            self.mlb = self.model_data['mlb']
            self.feature_columns = self.model_data['feature_columns']
            self.classes = list(self.mlb.classes_)
            self.n_features = len(self.feature_columns)

            print("Model loaded successfully!")
            print(f"  - {len(self.classes)} classes: {self.classes}")
            print(f"  - {self.n_features} features: {self.feature_columns[:5]}...")
            print(f"  - Model type: {type(self.model).__name__}")
        except Exception as e:
            raise ValueError(f"Failed to load model from {model_path}: {e}")
    def predict(self,
                features: Union[pd.DataFrame, np.ndarray, List, Dict, str],
                threshold: float = 0.5,
                return_all_probabilities: bool = True,
                is_json: bool = False,
                job_id: Optional[str] = None) -> Dict:
        """
        Perform multi-label prediction on input features.

        Args:
            features: Input features in various formats:
                - pandas DataFrame
                - numpy array (2D)
                - list of lists/dicts
                - single feature vector (list/dict)
                - JSON string of roofline data to aggregate (if is_json=True)
            threshold: Probability threshold for binary classification (0.0-1.0)
            return_all_probabilities: If True, return probabilities for all classes;
                if False, return only classes above the threshold.
            is_json: If True, treat features as a JSON string of roofline data
            job_id: Optional job ID (used when is_json=True)

        Returns:
            Dictionary containing:
                - 'predictions': list of predicted class names
                - 'probabilities': dict of {class_name: probability} for all classes
                - 'confidences': dict of {class_name: confidence_score} for predicted classes
                - 'threshold': the threshold used
        """
        # If the input is a JSON string, aggregate it into a feature vector first
        if is_json:
            if not isinstance(features, str):
                raise ValueError("When is_json=True, features must be a JSON string")
            features = df_aggregate(features, job_id_full=job_id)

        # Convert the input to the expected feature format
        X = self._prepare_features(features)

        # Get probability predictions
        probabilities = self.model.predict_proba(X)

        # Convert to per-class probabilities
        class_probabilities = {}
        for i, class_name in enumerate(self.classes):
            # For OneVsRest, predict_proba returns shape (n_samples, n_classes):
            # column i contains the probabilities for class i.
            if isinstance(probabilities, list):
                # List of arrays (one array per class)
                prob_array = probabilities[i]
                prob_positive = prob_array[0] if hasattr(prob_array, '__getitem__') else float(prob_array)
            elif len(probabilities.shape) == 2:
                # 2D numpy array of shape (n_samples, n_classes)
                prob_positive = float(probabilities[0, i])
            else:
                # 1D array
                prob_positive = float(probabilities[i])
            class_probabilities[class_name] = prob_positive

        # Apply the threshold to obtain predictions
        predictions = []
        confidences = {}
        for class_name, prob in class_probabilities.items():
            if prob >= threshold:
                predictions.append(class_name)
                # Confidence score: distance above the threshold, scaled to a percentage
                confidence = min(1.0, (prob - threshold) / (1.0 - threshold)) * 100
                confidences[class_name] = round(confidence, 2)

        # Sort predictions by probability
        predictions.sort(key=lambda x: class_probabilities[x], reverse=True)

        result = {
            'predictions': predictions,
            'probabilities': {name: round(prob, 4) for name, prob in class_probabilities.items()},
            'confidences': confidences,
            'threshold': threshold
        }

        if not return_all_probabilities:
            result['probabilities'] = {name: prob for name, prob in result['probabilities'].items()
                                       if name in predictions}

        return result
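    # Illustrative shape of the dictionary returned by predict(); the class names
    # below are hypothetical placeholders, not labels from the trained model:
    #
    #     {
    #         'predictions': ['label_a', 'label_b'],
    #         'probabilities': {'label_a': 0.91, 'label_b': 0.62, 'label_c': 0.08},
    #         'confidences': {'label_a': 82.0, 'label_b': 24.0},
    #         'threshold': 0.5,
    #     }
    #
    # The confidence rescales how far a probability sits above the threshold, e.g.
    # with threshold=0.5 and probability 0.91: min(1.0, (0.91 - 0.5) / 0.5) * 100 = 82.0.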
    def predict_top_k(self,
                      features: Union[pd.DataFrame, np.ndarray, List, Dict, str],
                      k: int = 5,
                      is_json: bool = False,
                      job_id: Optional[str] = None) -> Dict:
        """
        Get the top-k predictions with their probabilities.

        Args:
            features: Input features (various formats) or JSON string if is_json=True
            k: Number of top predictions to return
            is_json: If True, treat features as a JSON string of roofline data
            job_id: Optional job ID (used when is_json=True)

        Returns:
            Dictionary with the top-k predictions and their details
        """
        # If the input is a JSON string, aggregate it into a feature vector first
        if is_json:
            if not isinstance(features, str):
                raise ValueError("When is_json=True, features must be a JSON string")
            features = df_aggregate(features, job_id_full=job_id)

        # Get all class probabilities
        X = self._prepare_features(features)
        probabilities = self.model.predict_proba(X)

        class_probabilities = {}
        for i, class_name in enumerate(self.classes):
            # For OneVsRest, predict_proba returns shape (n_samples, n_classes):
            # column i contains the probabilities for class i.
            if isinstance(probabilities, list):
                # List of arrays (one array per class)
                prob_array = probabilities[i]
                prob_positive = prob_array[0] if hasattr(prob_array, '__getitem__') else float(prob_array)
            elif len(probabilities.shape) == 2:
                # 2D numpy array of shape (n_samples, n_classes)
                prob_positive = float(probabilities[0, i])
            else:
                # 1D array
                prob_positive = float(probabilities[i])
            class_probabilities[class_name] = prob_positive

        # Sort by probability and keep the top k
        sorted_classes = sorted(class_probabilities.items(), key=lambda x: x[1], reverse=True)
        top_k_classes = sorted_classes[:k]

        return {
            'top_predictions': [cls for cls, _ in top_k_classes],
            'top_probabilities': {cls: round(prob, 4) for cls, prob in top_k_classes},
            'all_probabilities': {cls: round(prob, 4) for cls, prob in class_probabilities.items()}
        }

    def _prepare_features(self, features: Union[pd.DataFrame, np.ndarray, List, Dict]) -> pd.DataFrame:
        """
        Convert various input formats to the expected feature format.

        Args:
            features: Input features in various formats

        Returns:
            pandas DataFrame with the correct columns and column order
        """
        if isinstance(features, pd.DataFrame):
            df = features.copy()
        elif isinstance(features, np.ndarray):
            if features.ndim == 1:
                features = features.reshape(1, -1)
            df = pd.DataFrame(features, columns=self.feature_columns[:features.shape[1]])
        elif isinstance(features, list):
            if isinstance(features[0], dict):
                # List of dictionaries
                df = pd.DataFrame(features)
            else:
                # List of lists
                df = pd.DataFrame(features, columns=self.feature_columns[:len(features[0])])
        elif isinstance(features, dict):
            # Single feature dictionary
            df = pd.DataFrame([features])
        else:
            raise ValueError(f"Unsupported feature format: {type(features)}")

        # Ensure the correct column order and fill missing columns with 0
        for col in self.feature_columns:
            if col not in df.columns:
                df[col] = 0.0
        df = df[self.feature_columns]

        # Validate the feature count
        if df.shape[1] != self.n_features:
            raise ValueError(f"Expected {self.n_features} features, got {df.shape[1]}")

        return df

    def get_class_info(self) -> Dict:
        """
        Get information about the available classes.

        Returns:
            Dictionary with class information
        """
        return {
            'classes': self.classes,
            'n_classes': len(self.classes),
            'feature_columns': self.feature_columns,
            'n_features': self.n_features
        }

    def batch_predict(self,
                      features_list: List[Union[pd.DataFrame, np.ndarray, List, Dict, str]],
                      threshold: float = 0.5,
                      is_json: bool = False,
                      job_ids: Optional[List[str]] = None) -> List[Dict]:
        """
        Perform batch prediction on multiple samples.

        Args:
            features_list: List of feature inputs (or JSON strings if is_json=True)
            threshold: Probability threshold
            is_json: If True, treat each item in features_list as a JSON string
            job_ids: Optional list of job IDs (used when is_json=True)

        Returns:
            List of prediction results
        """
        results = []
        for idx, features in enumerate(features_list):
            try:
                job_id = job_ids[idx] if job_ids and idx < len(job_ids) else None
                result = self.predict(features, threshold=threshold, is_json=is_json, job_id=job_id)
                results.append(result)
            except Exception as e:
                results.append({'error': str(e)})
        return results
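

# load_model() expects a joblib file containing a dict with the keys 'model', 'mlb'
# and 'feature_columns'. The function below is a minimal sketch of how such a bundle
# might be produced at training time; the estimator choice (a OneVsRestClassifier
# wrapping an XGBClassifier) and the default file name are assumptions based on the
# comments in this module, not the actual training pipeline.
def save_model_bundle_example(X_train: pd.DataFrame,
                              labels: List[List[str]],
                              path: str = 'xgb_model.joblib') -> None:
    """Sketch: fit a multi-label model and save it in the format load_model() expects."""
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.preprocessing import MultiLabelBinarizer
    from xgboost import XGBClassifier

    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(labels)                 # binary indicator matrix, one column per class
    model = OneVsRestClassifier(XGBClassifier())  # one binary XGBoost model per class
    model.fit(X_train, y)

    joblib.dump({'model': model,
                 'mlb': mlb,
                 'feature_columns': list(X_train.columns)}, path)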


def create_sample_data(n_samples: int = 5) -> List[Dict]:
    """
    Create sample feature data for testing.

    Args:
        n_samples: Number of sample feature vectors to create

    Returns:
        List of feature dictionaries
    """
    np.random.seed(42)

    # Load the feature columns from the model if available
    try:
        model_data = joblib.load('xgb_model.joblib')
        feature_columns = model_data['feature_columns']
    except Exception:
        # Fall back to a default feature list
        feature_columns = [
            'node_num', 'bandwidth_raw_p10', 'bandwidth_raw_median', 'bandwidth_raw_p90',
            'bandwidth_raw_mad', 'bandwidth_raw_range', 'bandwidth_raw_iqr',
            'flops_raw_p10', 'flops_raw_median', 'flops_raw_p90', 'flops_raw_mad', 'flops_raw_range'
        ]

    samples = []
    for _ in range(n_samples):
        sample = {}
        for col in feature_columns:
            if 'bandwidth' in col:
                sample[col] = np.random.uniform(50, 500)
            elif 'flops' in col:
                sample[col] = np.random.uniform(100, 5000)
            elif 'node_num' in col:
                sample[col] = np.random.randint(1, 16)
            else:
                sample[col] = np.random.uniform(0, 1000)
        samples.append(sample)
    return samples


if __name__ == "__main__":
    print("XGBoost Multi-Label Inference API")
    print("=" * 40)

    # Initialize the predictor
    try:
        predictor = XGBoostMultiLabelPredictor()
    except Exception as e:
        print(f"Error loading model: {e}")
        raise SystemExit(1)

    # Example usage of df_aggregate with a JSON string
    print("\n=== Example 0: JSON Aggregation ===")
    sample_json = json.dumps([
        {
            "node_num": 1,
            "bandwidth_raw": 150.5,
            "flops_raw": 2500.0,
            "arith_intensity": 16.6,
            "performance_gflops": 1200.0,
            "memory_bw_gbs": 450,
            "scalar_peak_gflops": 600,
            "duration": 3600
        },
        {
            "node_num": 2,
            "bandwidth_raw": 155.2,
            "flops_raw": 2600.0,
            "arith_intensity": 16.8,
            "performance_gflops": 1250.0,
            "memory_bw_gbs": 450,
            "scalar_peak_gflops": 600,
            "duration": 3600
        }
    ])

    try:
        aggregated_features = df_aggregate(sample_json, job_id_full="test_job_123")
        print("Aggregated features from JSON:")
        for key, value in list(aggregated_features.items())[:10]:
            print(f"  {key}: {value}")
        print(f"  ... ({len(aggregated_features)} total features)")
        # Use the aggregated features for prediction
        result = predictor.predict(aggregated_features, threshold=0.3)
        print(f"\nPredictions from aggregated data: {result['predictions'][:3]}")
    except Exception as e:
        print(f"Error in aggregation: {e}")

    # Create sample data for the remaining examples
    print("\n=== Generating sample data for other examples ===")
    sample_data = create_sample_data(3)

    # Example 1: Single prediction
    print("\n=== Example 1: Single Prediction ===")
    result = predictor.predict(sample_data[0], threshold=0.3)
    print(f"Predictions: {result['predictions']}")
    print(f"Confidences: {result['confidences']}")
    print("Top probabilities:")
    for class_name, prob in sorted(result['probabilities'].items(), key=lambda x: x[1], reverse=True)[:5]:
        print(f"  {class_name}: {prob:.4f}")

    # Example 2: Top-K predictions
    print("\n=== Example 2: Top-5 Predictions ===")
    top_result = predictor.predict_top_k(sample_data[1], k=5)
    for i, class_name in enumerate(top_result['top_predictions'], 1):
        prob = top_result['top_probabilities'][class_name]
        print(f"{i}. {class_name}: {prob:.4f}")

    # Example 3: Batch prediction
    print("\n=== Example 3: Batch Prediction ===")
    batch_results = predictor.batch_predict(sample_data, threshold=0.4)
    for i, result in enumerate(batch_results, 1):
        if 'error' not in result:
            print(f"Sample {i}: {len(result['predictions'])} predictions")
        else:
            print(f"Sample {i}: Error - {result['error']}")

    print("\nAPI ready for use!")
    print("Usage:")
    print("  predictor = XGBoostMultiLabelPredictor()")
    print("  result = predictor.predict(your_features)")
    print("  top_k = predictor.predict_top_k(your_features, k=5)")