""" core/optimization/engine.py OptimizationEngine — 分析 Pipeline 執行統計,產生可執行的優化建議。 設計重點: - analyze_pipeline 接受來自 InferencePipeline.get_pipeline_statistics() 的 stats 字典。 - 三條優化規則(rebalance_devices、adjust_queue、add_devices)各自獨立, 可個別觸發,不互斥。 - apply_suggestion 對 rebalance_devices 呼叫 device_manager.assign_device; 其他類型(add_devices、adjust_queue)需要人工操作,僅記錄 log 後回傳 True。 - predict_performance 使用保守係數 0.6 的啟發式估算。 """ from __future__ import annotations import logging import uuid from dataclasses import dataclass, field from typing import Any, Dict, List, Tuple logger = logging.getLogger(__name__) # 優化規則閾值 _QUEUE_FILL_THRESHOLD = 0.70 # queue_fill_rate > 此值觸發 rebalance_devices _TIME_RATIO_THRESHOLD = 2.0 # max/min avg_processing_time > 此值觸發 adjust_queue _UTILIZATION_THRESHOLD = 85.0 # 所有裝置 utilization_pct > 此值觸發 add_devices _CONSERVATIVE_FACTOR = 0.6 # predict_performance 的保守係數 @dataclass class OptimizationSuggestion: """單一優化建議。 屬性: suggestion_id: 唯一識別碼(UUID 字串)。 type: 建議類型,如 "rebalance_devices" | "adjust_queue" | "add_devices"。 description: 使用者可讀的說明(避免技術術語)。 estimated_improvement_pct: 預估改善百分比(0.0–100.0)。 confidence: 信心程度,"high" | "medium" | "low"。 action_params: 執行建議所需的參數字典。 """ suggestion_id: str type: str description: str estimated_improvement_pct: float confidence: str action_params: Dict[str, Any] class OptimizationEngine: """分析 Pipeline 執行統計並產生優化建議。""" # ------------------------------------------------------------------ # 公開介面 # ------------------------------------------------------------------ def analyze_pipeline( self, stats: Dict[str, Any], ) -> List[OptimizationSuggestion]: """分析 Pipeline 執行統計,產生優化建議清單。 參數: stats: 來自 InferencePipeline.get_pipeline_statistics() 的字典, 格式詳見模組文件。 回傳: 可能為空的 OptimizationSuggestion 清單。 """ stages: Dict[str, Any] = stats.get("stages", {}) devices: Dict[str, Any] = stats.get("devices", {}) suggestions: List[OptimizationSuggestion] = [] suggestions.extend(self._check_rebalance_devices(stages)) suggestions.extend(self._check_adjust_queue(stages)) suggestions.extend(self._check_add_devices(devices)) return suggestions def predict_performance( self, config: List[Any], available_devices: List[Any], ) -> Dict[str, float]: """以啟發式方法估算 Pipeline 效能。 公式: estimated_fps = sum(device.gops for d in available_devices) / num_stages * 0.6 estimated_latency_ms = 1000 / estimated_fps confidence_range = (estimated_fps * 0.8, estimated_fps * 1.2) 參數: config: Stage 設定列表(每個元素代表一個 Stage)。 available_devices: DeviceInfo 物件列表(具備 gops 屬性)。 回傳: 包含 estimated_fps、estimated_latency_ms、confidence_range 的字典。 """ num_stages = len(config) total_gops = sum(getattr(d, "gops", 0) for d in available_devices) if num_stages == 0 or total_gops == 0: return { "estimated_fps": 0.0, "estimated_latency_ms": 0.0, "confidence_range": (0.0, 0.0), } estimated_fps = total_gops / num_stages * _CONSERVATIVE_FACTOR estimated_latency_ms = 1000.0 / estimated_fps confidence_range = (estimated_fps * 0.8, estimated_fps * 1.2) return { "estimated_fps": estimated_fps, "estimated_latency_ms": estimated_latency_ms, "confidence_range": confidence_range, } def apply_suggestion( self, suggestion: OptimizationSuggestion, device_manager: Any, ) -> bool: """執行優化建議。 - rebalance_devices:呼叫 device_manager.assign_device 並回傳其結果。 - add_devices / adjust_queue:記錄 log(需人工操作),回傳 True。 參數: suggestion: 要執行的優化建議。 device_manager: DeviceManager 實例。 回傳: 執行是否成功。 """ if suggestion.type == "rebalance_devices": device_id = suggestion.action_params.get("device_id", "") stage_id = suggestion.action_params.get("stage_id", "") success = device_manager.assign_device(device_id, stage_id) if success: logger.info( "已將裝置 %s 重新分配至 Stage %s", device_id, stage_id ) else: logger.warning( "無法將裝置 %s 分配至 Stage %s", device_id, stage_id ) return success if suggestion.type in ("add_devices", "adjust_queue"): logger.info( "優化建議 [%s]:%s(需要人工操作)", suggestion.type, suggestion.description, ) return True logger.warning("未知的建議類型:%s", suggestion.type) return False # ------------------------------------------------------------------ # 內部規則實作 # ------------------------------------------------------------------ def _check_rebalance_devices( self, stages: Dict[str, Any] ) -> List[OptimizationSuggestion]: """規則 1:queue_fill_rate > 0.70 → 建議重新分配裝置。""" suggestions = [] for stage_id, stage_data in stages.items(): fill_rate: float = stage_data.get("queue_fill_rate", 0.0) if fill_rate > _QUEUE_FILL_THRESHOLD: pct = round((fill_rate - _QUEUE_FILL_THRESHOLD) / _QUEUE_FILL_THRESHOLD * 100, 1) suggestions.append( OptimizationSuggestion( suggestion_id=str(uuid.uuid4()), type="rebalance_devices", description=( f"{stage_id} 的佇列使用率偏高({fill_rate:.0%})," "建議將算力較高的裝置分配給此階段以降低積壓。" ), estimated_improvement_pct=min(pct, 40.0), confidence="medium", action_params={"stage_id": stage_id, "device_id": ""}, ) ) return suggestions def _check_adjust_queue( self, stages: Dict[str, Any] ) -> List[OptimizationSuggestion]: """規則 2:avg_processing_time 最大/最小比值 > 2.0 → 建議調整佇列大小。""" if len(stages) < 2: return [] times = { sid: data.get("avg_processing_time", 0.0) for sid, data in stages.items() } max_time = max(times.values()) min_time = min(times.values()) if min_time <= 0 or max_time / min_time <= _TIME_RATIO_THRESHOLD: return [] ratio = max_time / min_time return [ OptimizationSuggestion( suggestion_id=str(uuid.uuid4()), type="adjust_queue", description=( f"各 Stage 的處理時間差異達 {ratio:.1f} 倍," "建議調整佇列大小以平衡各階段的吞吐量。" ), estimated_improvement_pct=min((ratio - 2.0) * 10.0, 30.0), confidence="low", action_params={"max_stage": max(times, key=times.get), "ratio": ratio}, ) ] def _check_add_devices( self, devices: Dict[str, Any] ) -> List[OptimizationSuggestion]: """規則 3:所有 Dongle 使用率 > 85% → 建議增加更多 Dongle。""" if not devices: return [] utilizations = [ data.get("utilization_pct", 0.0) for data in devices.values() ] if not all(u > _UTILIZATION_THRESHOLD for u in utilizations): return [] avg_util = sum(utilizations) / len(utilizations) return [ OptimizationSuggestion( suggestion_id=str(uuid.uuid4()), type="add_devices", description=( f"所有裝置的平均使用率已達 {avg_util:.1f}%," "系統已接近飽和,建議增加更多 NPU 裝置。" ), estimated_improvement_pct=min((avg_util - 85.0) * 2.0, 50.0), confidence="high", action_params={"current_avg_utilization": avg_util}, ) ]