forked from masonhuang/cluster4npu
Phase 1 — Performance Benchmarking: - PerformanceBenchmarker: sequential vs parallel benchmark with injectable runner - PerformanceHistory: JSON-backed benchmark history with regression support - PerformanceDashboard: real-time FPS/latency display widget - BenchmarkDialog: one-click benchmark with 3-phase progress bar Phase 2 — Device Management: - DeviceManager: NPU dongle scan, assign/unassign, load balance recommendation - DeviceManagementPanel: live device status cards with auto-refresh - BottleneckAlert: dataclass for pipeline bottleneck detection Phase 3 — Advanced Features: - OptimizationEngine: 3 optimization rules (rebalance/adjust_queue/add_devices) - TemplateManager: 3 built-in pipeline templates (YOLOv5, fire detection, dual-model) Phase 4 — Report Export: - ReportExporter: PDF (reportlab, optional) and CSV export - ExportReportDialog: format selection + path picker UI 192 unit tests, all passing. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
249 lines
9.1 KiB
Python
249 lines
9.1 KiB
Python
"""
|
||
core/optimization/engine.py
|
||
|
||
OptimizationEngine — 分析 Pipeline 執行統計,產生可執行的優化建議。
|
||
|
||
設計重點:
|
||
- analyze_pipeline 接受來自 InferencePipeline.get_pipeline_statistics() 的 stats 字典。
|
||
- 三條優化規則(rebalance_devices、adjust_queue、add_devices)各自獨立,
|
||
可個別觸發,不互斥。
|
||
- apply_suggestion 對 rebalance_devices 呼叫 device_manager.assign_device;
|
||
其他類型(add_devices、adjust_queue)需要人工操作,僅記錄 log 後回傳 True。
|
||
- predict_performance 使用保守係數 0.6 的啟發式估算。
|
||
"""
|
||
from __future__ import annotations
|
||
|
||
import logging
|
||
import uuid
|
||
from dataclasses import dataclass, field
|
||
from typing import Any, Dict, List, Tuple
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
# 優化規則閾值
|
||
_QUEUE_FILL_THRESHOLD = 0.70 # queue_fill_rate > 此值觸發 rebalance_devices
|
||
_TIME_RATIO_THRESHOLD = 2.0 # max/min avg_processing_time > 此值觸發 adjust_queue
|
||
_UTILIZATION_THRESHOLD = 85.0 # 所有裝置 utilization_pct > 此值觸發 add_devices
|
||
_CONSERVATIVE_FACTOR = 0.6 # predict_performance 的保守係數
|
||
|
||
|
||
@dataclass
|
||
class OptimizationSuggestion:
|
||
"""單一優化建議。
|
||
|
||
屬性:
|
||
suggestion_id: 唯一識別碼(UUID 字串)。
|
||
type: 建議類型,如 "rebalance_devices" | "adjust_queue" | "add_devices"。
|
||
description: 使用者可讀的說明(避免技術術語)。
|
||
estimated_improvement_pct: 預估改善百分比(0.0–100.0)。
|
||
confidence: 信心程度,"high" | "medium" | "low"。
|
||
action_params: 執行建議所需的參數字典。
|
||
"""
|
||
suggestion_id: str
|
||
type: str
|
||
description: str
|
||
estimated_improvement_pct: float
|
||
confidence: str
|
||
action_params: Dict[str, Any]
|
||
|
||
|
||
class OptimizationEngine:
|
||
"""分析 Pipeline 執行統計並產生優化建議。"""
|
||
|
||
# ------------------------------------------------------------------
|
||
# 公開介面
|
||
# ------------------------------------------------------------------
|
||
|
||
def analyze_pipeline(
|
||
self,
|
||
stats: Dict[str, Any],
|
||
) -> List[OptimizationSuggestion]:
|
||
"""分析 Pipeline 執行統計,產生優化建議清單。
|
||
|
||
參數:
|
||
stats: 來自 InferencePipeline.get_pipeline_statistics() 的字典,
|
||
格式詳見模組文件。
|
||
|
||
回傳:
|
||
可能為空的 OptimizationSuggestion 清單。
|
||
"""
|
||
stages: Dict[str, Any] = stats.get("stages", {})
|
||
devices: Dict[str, Any] = stats.get("devices", {})
|
||
|
||
suggestions: List[OptimizationSuggestion] = []
|
||
|
||
suggestions.extend(self._check_rebalance_devices(stages))
|
||
suggestions.extend(self._check_adjust_queue(stages))
|
||
suggestions.extend(self._check_add_devices(devices))
|
||
|
||
return suggestions
|
||
|
||
def predict_performance(
|
||
self,
|
||
config: List[Any],
|
||
available_devices: List[Any],
|
||
) -> Dict[str, float]:
|
||
"""以啟發式方法估算 Pipeline 效能。
|
||
|
||
公式:
|
||
estimated_fps = sum(device.gops for d in available_devices) / num_stages * 0.6
|
||
estimated_latency_ms = 1000 / estimated_fps
|
||
confidence_range = (estimated_fps * 0.8, estimated_fps * 1.2)
|
||
|
||
參數:
|
||
config: Stage 設定列表(每個元素代表一個 Stage)。
|
||
available_devices: DeviceInfo 物件列表(具備 gops 屬性)。
|
||
|
||
回傳:
|
||
包含 estimated_fps、estimated_latency_ms、confidence_range 的字典。
|
||
"""
|
||
num_stages = len(config)
|
||
total_gops = sum(getattr(d, "gops", 0) for d in available_devices)
|
||
|
||
if num_stages == 0 or total_gops == 0:
|
||
return {
|
||
"estimated_fps": 0.0,
|
||
"estimated_latency_ms": 0.0,
|
||
"confidence_range": (0.0, 0.0),
|
||
}
|
||
|
||
estimated_fps = total_gops / num_stages * _CONSERVATIVE_FACTOR
|
||
estimated_latency_ms = 1000.0 / estimated_fps
|
||
confidence_range = (estimated_fps * 0.8, estimated_fps * 1.2)
|
||
|
||
return {
|
||
"estimated_fps": estimated_fps,
|
||
"estimated_latency_ms": estimated_latency_ms,
|
||
"confidence_range": confidence_range,
|
||
}
|
||
|
||
def apply_suggestion(
|
||
self,
|
||
suggestion: OptimizationSuggestion,
|
||
device_manager: Any,
|
||
) -> bool:
|
||
"""執行優化建議。
|
||
|
||
- rebalance_devices:呼叫 device_manager.assign_device 並回傳其結果。
|
||
- add_devices / adjust_queue:記錄 log(需人工操作),回傳 True。
|
||
|
||
參數:
|
||
suggestion: 要執行的優化建議。
|
||
device_manager: DeviceManager 實例。
|
||
|
||
回傳:
|
||
執行是否成功。
|
||
"""
|
||
if suggestion.type == "rebalance_devices":
|
||
device_id = suggestion.action_params.get("device_id", "")
|
||
stage_id = suggestion.action_params.get("stage_id", "")
|
||
success = device_manager.assign_device(device_id, stage_id)
|
||
if success:
|
||
logger.info(
|
||
"已將裝置 %s 重新分配至 Stage %s", device_id, stage_id
|
||
)
|
||
else:
|
||
logger.warning(
|
||
"無法將裝置 %s 分配至 Stage %s", device_id, stage_id
|
||
)
|
||
return success
|
||
|
||
if suggestion.type in ("add_devices", "adjust_queue"):
|
||
logger.info(
|
||
"優化建議 [%s]:%s(需要人工操作)",
|
||
suggestion.type,
|
||
suggestion.description,
|
||
)
|
||
return True
|
||
|
||
logger.warning("未知的建議類型:%s", suggestion.type)
|
||
return False
|
||
|
||
# ------------------------------------------------------------------
|
||
# 內部規則實作
|
||
# ------------------------------------------------------------------
|
||
|
||
def _check_rebalance_devices(
|
||
self, stages: Dict[str, Any]
|
||
) -> List[OptimizationSuggestion]:
|
||
"""規則 1:queue_fill_rate > 0.70 → 建議重新分配裝置。"""
|
||
suggestions = []
|
||
for stage_id, stage_data in stages.items():
|
||
fill_rate: float = stage_data.get("queue_fill_rate", 0.0)
|
||
if fill_rate > _QUEUE_FILL_THRESHOLD:
|
||
pct = round((fill_rate - _QUEUE_FILL_THRESHOLD) / _QUEUE_FILL_THRESHOLD * 100, 1)
|
||
suggestions.append(
|
||
OptimizationSuggestion(
|
||
suggestion_id=str(uuid.uuid4()),
|
||
type="rebalance_devices",
|
||
description=(
|
||
f"{stage_id} 的佇列使用率偏高({fill_rate:.0%}),"
|
||
"建議將算力較高的裝置分配給此階段以降低積壓。"
|
||
),
|
||
estimated_improvement_pct=min(pct, 40.0),
|
||
confidence="medium",
|
||
action_params={"stage_id": stage_id, "device_id": ""},
|
||
)
|
||
)
|
||
return suggestions
|
||
|
||
def _check_adjust_queue(
|
||
self, stages: Dict[str, Any]
|
||
) -> List[OptimizationSuggestion]:
|
||
"""規則 2:avg_processing_time 最大/最小比值 > 2.0 → 建議調整佇列大小。"""
|
||
if len(stages) < 2:
|
||
return []
|
||
|
||
times = {
|
||
sid: data.get("avg_processing_time", 0.0)
|
||
for sid, data in stages.items()
|
||
}
|
||
max_time = max(times.values())
|
||
min_time = min(times.values())
|
||
|
||
if min_time <= 0 or max_time / min_time <= _TIME_RATIO_THRESHOLD:
|
||
return []
|
||
|
||
ratio = max_time / min_time
|
||
return [
|
||
OptimizationSuggestion(
|
||
suggestion_id=str(uuid.uuid4()),
|
||
type="adjust_queue",
|
||
description=(
|
||
f"各 Stage 的處理時間差異達 {ratio:.1f} 倍,"
|
||
"建議調整佇列大小以平衡各階段的吞吐量。"
|
||
),
|
||
estimated_improvement_pct=min((ratio - 2.0) * 10.0, 30.0),
|
||
confidence="low",
|
||
action_params={"max_stage": max(times, key=times.get), "ratio": ratio},
|
||
)
|
||
]
|
||
|
||
def _check_add_devices(
|
||
self, devices: Dict[str, Any]
|
||
) -> List[OptimizationSuggestion]:
|
||
"""規則 3:所有 Dongle 使用率 > 85% → 建議增加更多 Dongle。"""
|
||
if not devices:
|
||
return []
|
||
|
||
utilizations = [
|
||
data.get("utilization_pct", 0.0) for data in devices.values()
|
||
]
|
||
if not all(u > _UTILIZATION_THRESHOLD for u in utilizations):
|
||
return []
|
||
|
||
avg_util = sum(utilizations) / len(utilizations)
|
||
return [
|
||
OptimizationSuggestion(
|
||
suggestion_id=str(uuid.uuid4()),
|
||
type="add_devices",
|
||
description=(
|
||
f"所有裝置的平均使用率已達 {avg_util:.1f}%,"
|
||
"系統已接近飽和,建議增加更多 NPU 裝置。"
|
||
),
|
||
estimated_improvement_pct=min((avg_util - 85.0) * 2.0, 50.0),
|
||
confidence="high",
|
||
action_params={"current_avg_utilization": avg_util},
|
||
)
|
||
]
|