cluster4npu/core/device/device_manager.py
abin 55040733fe feat: implement Phase 1-4 performance visualization and device management
Phase 1 — Performance Benchmarking:
- PerformanceBenchmarker: sequential vs parallel benchmark with injectable runner
- PerformanceHistory: JSON-backed benchmark history with regression support
- PerformanceDashboard: real-time FPS/latency display widget
- BenchmarkDialog: one-click benchmark with 3-phase progress bar

Phase 2 — Device Management:
- DeviceManager: NPU dongle scan, assign/unassign, load balance recommendation
- DeviceManagementPanel: live device status cards with auto-refresh
- BottleneckAlert: dataclass for pipeline bottleneck detection

Phase 3 — Advanced Features:
- OptimizationEngine: 3 optimization rules (rebalance/adjust_queue/add_devices)
- TemplateManager: 3 built-in pipeline templates (YOLOv5, fire detection, dual-model)

Phase 4 — Report Export:
- ReportExporter: PDF (reportlab, optional) and CSV export
- ExportReportDialog: format selection + path picker UI

192 unit tests, all passing.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-06 19:32:05 +08:00

218 lines
7.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
core/device/device_manager.py
DeviceManager — manages NPU Dongle discovery, health, and assignment.
Design:
- scan_devices() calls the Kneron KP SDK but accepts an injectable kp_api
parameter so tests can supply a Mock without real hardware.
- DongleSeriesSpec constants are inlined here to avoid a circular import
from core.functions.Multidongle.
"""
from __future__ import annotations

import logging
from dataclasses import dataclass, field
from typing import Dict, List, Optional
# ---------------------------------------------------------------------------
# GOPS table (mirrors DongleSeriesSpec in Multidongle.py)
# ---------------------------------------------------------------------------
# One row per supported dongle series: (USB product ID, series name, GOPS).
# Both lookup tables below are derived from these rows so the two mappings
# cannot drift apart.
_SERIES_TABLE = (
    (0x100, "KL520", 2),
    (0x720, "KL720", 28),
    (0x630, "KL630", 400),
    (0x730, "KL730", 1600),
)

# USB product ID -> series name.
_PRODUCT_ID_TO_SERIES: Dict[int, str] = {
    product_id: series for product_id, series, _gops in _SERIES_TABLE
}

# Series name -> compute capacity in GOPS.
_SERIES_GOPS: Dict[str, int] = {
    series: gops for _product_id, series, gops in _SERIES_TABLE
}
# ---------------------------------------------------------------------------
# Data classes
# ---------------------------------------------------------------------------
@dataclass
class DeviceInfo:
    """State snapshot of one NPU Dongle.

    Attributes
    ----------
    device_id:
        Unique identifier, e.g. ``"usb-<port_id>"``.
    series:
        Series name such as ``"KL520"`` or ``"KL720"``.
    product_id:
        Raw USB product ID.
    status:
        One of ``"online"``, ``"offline"`` or ``"busy"``.
    gops:
        Compute capacity.
    assigned_stage:
        ID of the stage the device is currently assigned to, or ``None``.
    current_fps:
        Live inference throughput.
    utilization_pct:
        Utilization percentage, 0.0 to 100.0.
    """

    device_id: str
    series: str
    product_id: int
    status: str
    gops: int
    assigned_stage: Optional[str]
    current_fps: float
    utilization_pct: float
@dataclass
class DeviceHealth:
    """Health snapshot of one NPU Dongle.

    Attributes
    ----------
    device_id:
        Identifier of the device this snapshot describes.
    temperature_celsius:
        Device temperature, or ``None`` if the SDK does not support it.
    error_count:
        Number of errors recorded for the device.
    last_error:
        Most recent error message, or ``None``.
    uptime_seconds:
        Device uptime in seconds.
    """

    device_id: str
    temperature_celsius: Optional[float]
    error_count: int
    last_error: Optional[str]
    uptime_seconds: float
# ---------------------------------------------------------------------------
# DeviceManager
# ---------------------------------------------------------------------------
class DeviceManager:
    """Manage NPU Dongle discovery, health queries, and stage assignment.

    Parameters
    ----------
    kp_api:
        Kneron KP SDK module reference.  Pass ``None`` to import the real
        ``kp`` module at runtime, or inject a Mock in tests.
    """

    # Shared logger; scan errors are reported here instead of being
    # swallowed silently.
    _log = logging.getLogger(__name__)

    def __init__(self, kp_api=None) -> None:
        if kp_api is None:
            # Imported lazily so the module can be loaded (and tested with
            # an injected API) on machines without the SDK installed.
            import kp as _kp  # real SDK (requires hardware)

            self._kp = _kp
        else:
            self._kp = kp_api
        # Known devices keyed by device_id, refreshed by scan_devices().
        self._devices: Dict[str, DeviceInfo] = {}
        # Stage assignments: {device_id: stage_id}.  Kept separately from
        # _devices so an assignment is re-applied when a rescan rebuilds
        # the DeviceInfo objects.
        self._assignments: Dict[str, str] = {}

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def scan_devices(self) -> List[DeviceInfo]:
        """Scan for connected Kneron Dongles and update internal state.

        The known-device table is replaced by the scan result, so devices
        that are no longer reported (including the case where *no* device
        is reported) are dropped.  If the SDK call itself raises, the scan
        is inconclusive: the error is logged, the previously known devices
        are left untouched, and an empty list is returned.

        Returns
        -------
        List[DeviceInfo]
            All currently connected devices, each with status "online".
        """
        try:
            descriptors = self._kp.core.scan_devices()
        except Exception:
            # Best-effort boundary around the SDK: never crash the caller.
            self._log.warning("NPU dongle scan failed", exc_info=True)
            return []

        if not descriptors or descriptors.device_descriptor_number == 0:
            # Nothing is connected any more: forget stale entries so they
            # cannot be assigned or recommended.
            self._devices = {}
            return []

        found: Dict[str, DeviceInfo] = {}
        for desc in descriptors.device_descriptor_list:
            try:
                port_id = desc.usb_port_id
                product_id = desc.product_id
                device_id = f"usb-{port_id}"
                series = _PRODUCT_ID_TO_SERIES.get(product_id, "Unknown")
                gops = _SERIES_GOPS.get(series, 0)
                found[device_id] = DeviceInfo(
                    device_id=device_id,
                    series=series,
                    product_id=product_id,
                    status="online",
                    gops=gops,
                    # Carry over any assignment made before this rescan.
                    assigned_stage=self._assignments.get(device_id),
                    current_fps=0.0,
                    utilization_pct=0.0,
                )
            except Exception:
                # One malformed descriptor must not hide the other devices.
                self._log.warning(
                    "Skipping unreadable device descriptor: %r",
                    desc,
                    exc_info=True,
                )
                continue

        self._devices = found
        return list(found.values())

    def get_device_health(self, device_id: str) -> DeviceHealth:
        """Return a health snapshot for the given device.

        Temperature is returned as ``None`` because the current KP SDK
        version does not expose thermal sensors.  The remaining fields are
        fixed placeholder values; *device_id* is not checked against the
        known devices.
        """
        return DeviceHealth(
            device_id=device_id,
            temperature_celsius=None,
            error_count=0,
            last_error=None,
            uptime_seconds=0.0,
        )

    def assign_device(self, device_id: str, stage_id: str) -> bool:
        """Assign *device_id* to *stage_id*.

        Re-assigning a device to the stage it already holds succeeds.

        Returns
        -------
        bool
            ``False`` if the device is unknown, is offline, or is already
            assigned to a different stage; ``True`` on success.
        """
        device = self._devices.get(device_id)
        if device is None or device.status == "offline":
            return False

        existing_stage = self._assignments.get(device_id)
        if existing_stage is not None and existing_stage != stage_id:
            return False  # already assigned to a different stage

        self._assignments[device_id] = stage_id
        device.assigned_stage = stage_id
        return True

    def unassign_device(self, device_id: str) -> bool:
        """Release *device_id* from its current stage assignment.

        Returns
        -------
        bool
            ``False`` if the device is unknown; ``True`` on success
            (including when the device had no assignment).
        """
        device = self._devices.get(device_id)
        if device is None:
            return False

        self._assignments.pop(device_id, None)
        device.assigned_stage = None
        return True

    def get_load_balance_recommendation(
        self, stages: List[str]
    ) -> Dict[str, str]:
        """Recommend device-to-stage assignment by GOPS (descending).

        Higher-GOPS devices are assigned to earlier stages.  Offline
        devices are skipped because ``assign_device`` would reject them.
        Stages with no available device are mapped to an empty string.

        Parameters
        ----------
        stages:
            Ordered list of stage IDs (first stage has highest priority).

        Returns
        -------
        Dict[str, str]
            ``{stage_id: device_id}``; device_id is "" if unavailable.
        """
        # sorted() is stable, so devices with equal GOPS keep scan order.
        ranked_ids = [
            device.device_id
            for device in sorted(
                (d for d in self._devices.values() if d.status != "offline"),
                key=lambda d: d.gops,
                reverse=True,
            )
        ]
        return {
            stage_id: ranked_ids[i] if i < len(ranked_ids) else ""
            for i, stage_id in enumerate(stages)
        }

    def get_device_statistics(self) -> Dict[str, DeviceInfo]:
        """Return all known devices keyed by device_id.

        The returned dict is a shallow copy: adding or removing keys does
        not affect the manager, but the ``DeviceInfo`` values are the same
        objects the manager holds.
        """
        return dict(self._devices)