abin 55040733fe feat: implement Phase 1-4 performance visualization and device management
Phase 1 — Performance Benchmarking:
- PerformanceBenchmarker: sequential vs parallel benchmark with injectable runner
- PerformanceHistory: JSON-backed benchmark history with regression support
- PerformanceDashboard: real-time FPS/latency display widget
- BenchmarkDialog: one-click benchmark with 3-phase progress bar

Phase 2 — Device Management:
- DeviceManager: NPU dongle scan, assign/unassign, load balance recommendation
- DeviceManagementPanel: live device status cards with auto-refresh
- BottleneckAlert: dataclass for pipeline bottleneck detection

Phase 3 — Advanced Features:
- OptimizationEngine: 3 optimization rules (rebalance/adjust_queue/add_devices)
- TemplateManager: 3 built-in pipeline templates (YOLOv5, fire detection, dual-model)

Phase 4 — Report Export:
- ReportExporter: PDF (reportlab, optional) and CSV export
- ExportReportDialog: format selection + path picker UI

192 unit tests, all passing.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-06 19:32:05 +08:00

249 lines
9.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
core/optimization/engine.py
OptimizationEngine — 分析 Pipeline 執行統計,產生可執行的優化建議。
設計重點:
- analyze_pipeline 接受來自 InferencePipeline.get_pipeline_statistics() 的 stats 字典。
- 三條優化規則rebalance_devices、adjust_queue、add_devices各自獨立
可個別觸發,不互斥。
- apply_suggestion 對 rebalance_devices 呼叫 device_manager.assign_device
其他類型add_devices、adjust_queue需要人工操作僅記錄 log 後回傳 True。
- predict_performance 使用保守係數 0.6 的啟發式估算。
"""
from __future__ import annotations
import logging
import uuid
from dataclasses import dataclass, field
from typing import Any, Dict, List, Tuple
logger = logging.getLogger(__name__)
# 優化規則閾值
_QUEUE_FILL_THRESHOLD = 0.70 # queue_fill_rate > 此值觸發 rebalance_devices
_TIME_RATIO_THRESHOLD = 2.0 # max/min avg_processing_time > 此值觸發 adjust_queue
_UTILIZATION_THRESHOLD = 85.0 # 所有裝置 utilization_pct > 此值觸發 add_devices
_CONSERVATIVE_FACTOR = 0.6 # predict_performance 的保守係數
@dataclass
class OptimizationSuggestion:
"""單一優化建議。
屬性:
suggestion_id: 唯一識別碼UUID 字串)。
type: 建議類型,如 "rebalance_devices" | "adjust_queue" | "add_devices"
description: 使用者可讀的說明(避免技術術語)。
estimated_improvement_pct: 預估改善百分比0.0100.0)。
confidence: 信心程度,"high" | "medium" | "low"
action_params: 執行建議所需的參數字典。
"""
suggestion_id: str
type: str
description: str
estimated_improvement_pct: float
confidence: str
action_params: Dict[str, Any]
class OptimizationEngine:
"""分析 Pipeline 執行統計並產生優化建議。"""
# ------------------------------------------------------------------
# 公開介面
# ------------------------------------------------------------------
def analyze_pipeline(
self,
stats: Dict[str, Any],
) -> List[OptimizationSuggestion]:
"""分析 Pipeline 執行統計,產生優化建議清單。
參數:
stats: 來自 InferencePipeline.get_pipeline_statistics() 的字典,
格式詳見模組文件。
回傳:
可能為空的 OptimizationSuggestion 清單。
"""
stages: Dict[str, Any] = stats.get("stages", {})
devices: Dict[str, Any] = stats.get("devices", {})
suggestions: List[OptimizationSuggestion] = []
suggestions.extend(self._check_rebalance_devices(stages))
suggestions.extend(self._check_adjust_queue(stages))
suggestions.extend(self._check_add_devices(devices))
return suggestions
def predict_performance(
self,
config: List[Any],
available_devices: List[Any],
) -> Dict[str, float]:
"""以啟發式方法估算 Pipeline 效能。
公式:
estimated_fps = sum(device.gops for d in available_devices) / num_stages * 0.6
estimated_latency_ms = 1000 / estimated_fps
confidence_range = (estimated_fps * 0.8, estimated_fps * 1.2)
參數:
config: Stage 設定列表(每個元素代表一個 Stage
available_devices: DeviceInfo 物件列表(具備 gops 屬性)。
回傳:
包含 estimated_fps、estimated_latency_ms、confidence_range 的字典。
"""
num_stages = len(config)
total_gops = sum(getattr(d, "gops", 0) for d in available_devices)
if num_stages == 0 or total_gops == 0:
return {
"estimated_fps": 0.0,
"estimated_latency_ms": 0.0,
"confidence_range": (0.0, 0.0),
}
estimated_fps = total_gops / num_stages * _CONSERVATIVE_FACTOR
estimated_latency_ms = 1000.0 / estimated_fps
confidence_range = (estimated_fps * 0.8, estimated_fps * 1.2)
return {
"estimated_fps": estimated_fps,
"estimated_latency_ms": estimated_latency_ms,
"confidence_range": confidence_range,
}
def apply_suggestion(
self,
suggestion: OptimizationSuggestion,
device_manager: Any,
) -> bool:
"""執行優化建議。
- rebalance_devices呼叫 device_manager.assign_device 並回傳其結果。
- add_devices / adjust_queue記錄 log需人工操作回傳 True。
參數:
suggestion: 要執行的優化建議。
device_manager: DeviceManager 實例。
回傳:
執行是否成功。
"""
if suggestion.type == "rebalance_devices":
device_id = suggestion.action_params.get("device_id", "")
stage_id = suggestion.action_params.get("stage_id", "")
success = device_manager.assign_device(device_id, stage_id)
if success:
logger.info(
"已將裝置 %s 重新分配至 Stage %s", device_id, stage_id
)
else:
logger.warning(
"無法將裝置 %s 分配至 Stage %s", device_id, stage_id
)
return success
if suggestion.type in ("add_devices", "adjust_queue"):
logger.info(
"優化建議 [%s]%s(需要人工操作)",
suggestion.type,
suggestion.description,
)
return True
logger.warning("未知的建議類型:%s", suggestion.type)
return False
# ------------------------------------------------------------------
# 內部規則實作
# ------------------------------------------------------------------
def _check_rebalance_devices(
self, stages: Dict[str, Any]
) -> List[OptimizationSuggestion]:
"""規則 1queue_fill_rate > 0.70 → 建議重新分配裝置。"""
suggestions = []
for stage_id, stage_data in stages.items():
fill_rate: float = stage_data.get("queue_fill_rate", 0.0)
if fill_rate > _QUEUE_FILL_THRESHOLD:
pct = round((fill_rate - _QUEUE_FILL_THRESHOLD) / _QUEUE_FILL_THRESHOLD * 100, 1)
suggestions.append(
OptimizationSuggestion(
suggestion_id=str(uuid.uuid4()),
type="rebalance_devices",
description=(
f"{stage_id} 的佇列使用率偏高({fill_rate:.0%}"
"建議將算力較高的裝置分配給此階段以降低積壓。"
),
estimated_improvement_pct=min(pct, 40.0),
confidence="medium",
action_params={"stage_id": stage_id, "device_id": ""},
)
)
return suggestions
def _check_adjust_queue(
self, stages: Dict[str, Any]
) -> List[OptimizationSuggestion]:
"""規則 2avg_processing_time 最大/最小比值 > 2.0 → 建議調整佇列大小。"""
if len(stages) < 2:
return []
times = {
sid: data.get("avg_processing_time", 0.0)
for sid, data in stages.items()
}
max_time = max(times.values())
min_time = min(times.values())
if min_time <= 0 or max_time / min_time <= _TIME_RATIO_THRESHOLD:
return []
ratio = max_time / min_time
return [
OptimizationSuggestion(
suggestion_id=str(uuid.uuid4()),
type="adjust_queue",
description=(
f"各 Stage 的處理時間差異達 {ratio:.1f} 倍,"
"建議調整佇列大小以平衡各階段的吞吐量。"
),
estimated_improvement_pct=min((ratio - 2.0) * 10.0, 30.0),
confidence="low",
action_params={"max_stage": max(times, key=times.get), "ratio": ratio},
)
]
def _check_add_devices(
self, devices: Dict[str, Any]
) -> List[OptimizationSuggestion]:
"""規則 3所有 Dongle 使用率 > 85% → 建議增加更多 Dongle。"""
if not devices:
return []
utilizations = [
data.get("utilization_pct", 0.0) for data in devices.values()
]
if not all(u > _UTILIZATION_THRESHOLD for u in utilizations):
return []
avg_util = sum(utilizations) / len(utilizations)
return [
OptimizationSuggestion(
suggestion_id=str(uuid.uuid4()),
type="add_devices",
description=(
f"所有裝置的平均使用率已達 {avg_util:.1f}%"
"系統已接近飽和,建議增加更多 NPU 裝置。"
),
estimated_improvement_pct=min((avg_util - 85.0) * 2.0, 50.0),
confidence="high",
action_params={"current_avg_utilization": avg_util},
)
]