forked from masonhuang/cluster4npu
Phase 1 — Performance Benchmarking: - PerformanceBenchmarker: sequential vs parallel benchmark with injectable runner - PerformanceHistory: JSON-backed benchmark history with regression support - PerformanceDashboard: real-time FPS/latency display widget - BenchmarkDialog: one-click benchmark with 3-phase progress bar Phase 2 — Device Management: - DeviceManager: NPU dongle scan, assign/unassign, load balance recommendation - DeviceManagementPanel: live device status cards with auto-refresh - BottleneckAlert: dataclass for pipeline bottleneck detection Phase 3 — Advanced Features: - OptimizationEngine: 3 optimization rules (rebalance/adjust_queue/add_devices) - TemplateManager: 3 built-in pipeline templates (YOLOv5, fire detection, dual-model) Phase 4 — Report Export: - ReportExporter: PDF (reportlab, optional) and CSV export - ExportReportDialog: format selection + path picker UI 192 unit tests, all passing. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
218 lines
7.0 KiB
Python
218 lines
7.0 KiB
Python
"""
core/device/device_manager.py

DeviceManager — manages NPU Dongle discovery, health, and assignment.

Design:
  - scan_devices() calls the Kneron KP SDK through an injectable kp_api
    object (passed to the DeviceManager constructor) so tests can supply
    a Mock without real hardware.
  - DongleSeriesSpec constants are inlined here to avoid a circular import
    from core.functions.Multidongle.
"""
|
||
from __future__ import annotations
|
||
|
||
from dataclasses import dataclass, field
|
||
from typing import Dict, List, Optional
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# GOPS table (mirrors DongleSeriesSpec in Multidongle.py)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# USB product ID -> dongle series name.  IDs missing from this table are
# reported by the scanner under the series name "Unknown".
_PRODUCT_ID_TO_SERIES: Dict[int, str] = dict(
    (
        (0x100, "KL520"),
        (0x720, "KL720"),
        (0x630, "KL630"),
        (0x730, "KL730"),
    )
)
|
||
|
||
# Dongle series name -> compute capacity in GOPS.  Series missing from this
# table (including "Unknown") are treated by the scanner as 0 GOPS.
_SERIES_GOPS: Dict[str, int] = dict(
    KL520=2,
    KL720=28,
    KL630=400,
    KL730=1600,
)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Data classes
|
||
# ---------------------------------------------------------------------------
|
||
|
||
@dataclass
class DeviceInfo:
    """Snapshot of a single NPU Dongle's state.

    Attributes
    ----------
    device_id:
        Unique id, e.g. ``"usb-<port_id>"``.
    series:
        Dongle series name: ``"KL520"`` | ``"KL720"`` | ...
    product_id:
        Raw USB product ID.
    status:
        One of ``"online"`` | ``"offline"`` | ``"busy"``.
    gops:
        Compute capacity.
    assigned_stage:
        Currently assigned stage ID, or ``None``.
    current_fps:
        Live inference throughput.
    utilization_pct:
        Utilization in the range 0.0 – 100.0.
    """

    device_id: str
    series: str
    product_id: int
    status: str
    gops: int
    assigned_stage: Optional[str]
    current_fps: float
    utilization_pct: float
|
||
|
||
|
||
@dataclass
class DeviceHealth:
    """Health snapshot of a single NPU Dongle.

    Attributes
    ----------
    device_id:
        Id of the device this snapshot describes.
    temperature_celsius:
        Temperature reading, or ``None`` if the SDK does not support it.
    error_count:
        Number of errors recorded for the device.
    last_error:
        Most recent error message, or ``None``.
    uptime_seconds:
        Device uptime in seconds.
    """

    device_id: str
    temperature_celsius: Optional[float]
    error_count: int
    last_error: Optional[str]
    uptime_seconds: float
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# DeviceManager
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class DeviceManager:
    """Manages NPU Dongle discovery, health queries, and stage assignment.

    Parameters
    ----------
    kp_api:
        Kneron KP SDK module reference.  Pass ``None`` to import the real
        ``kp`` module at runtime, or inject a Mock in tests.
    """

    def __init__(self, kp_api=None) -> None:
        if kp_api is None:
            # Deferred import so the SDK is only required when no stand-in
            # has been injected.
            import kp as _kp  # real SDK (requires hardware)

            self._kp = _kp
        else:
            self._kp = kp_api

        # Known devices, populated by scan_devices()
        self._devices: Dict[str, DeviceInfo] = {}
        # stage assignments: {device_id: stage_id}
        self._assignments: Dict[str, str] = {}

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def scan_devices(self) -> List[DeviceInfo]:
        """Scan for connected Kneron Dongles and update internal state.

        Every successful scan replaces the set of known devices, including
        a scan that finds nothing (the known set then becomes empty).
        Stage assignments are kept separately and re-applied to any device
        that shows up again under the same id.  ``current_fps`` and
        ``utilization_pct`` of every returned device start at 0.0.

        If the SDK call itself raises, the scan is treated as inconclusive:
        an empty list is returned and the previously known devices are
        left untouched.

        Returns
        -------
        List[DeviceInfo]
            All currently connected devices, each with status "online".
        """
        try:
            descriptors = self._kp.core.scan_devices()
        except Exception:
            # Best-effort: a failing SDK call must not crash the caller.
            return []

        if not descriptors or descriptors.device_descriptor_number == 0:
            # The scan succeeded and found nothing: drop stale entries so
            # unplugged dongles are no longer reported or assignable.
            self._devices = {}
            return []

        found: Dict[str, DeviceInfo] = {}
        for desc in descriptors.device_descriptor_list:
            try:
                product_id = desc.product_id
                device_id = f"usb-{desc.usb_port_id}"
                series = _PRODUCT_ID_TO_SERIES.get(product_id, "Unknown")
                found[device_id] = DeviceInfo(
                    device_id=device_id,
                    series=series,
                    product_id=product_id,
                    status="online",
                    gops=_SERIES_GOPS.get(series, 0),
                    # Carry over an assignment made before this rescan.
                    assigned_stage=self._assignments.get(device_id),
                    current_fps=0.0,
                    utilization_pct=0.0,
                )
            except Exception:
                # Skip a descriptor that cannot be read; keep the others.
                continue

        self._devices = found
        return list(self._devices.values())

    def get_device_health(self, device_id: str) -> DeviceHealth:
        """Return a health snapshot for the given device.

        Temperature is returned as ``None`` because the current KP SDK
        version does not expose thermal sensors.  The remaining fields are
        fixed placeholder values (no errors, zero uptime); *device_id* is
        echoed back without checking that the device is known.
        """
        return DeviceHealth(
            device_id=device_id,
            temperature_celsius=None,
            error_count=0,
            last_error=None,
            uptime_seconds=0.0,
        )

    def assign_device(self, device_id: str, stage_id: str) -> bool:
        """Assign *device_id* to *stage_id*.

        Re-assigning a device to the stage it already holds succeeds.

        Returns
        -------
        bool
            ``False`` if the device is unknown, offline, or already
            assigned to a different stage; ``True`` on success.
        """
        device = self._devices.get(device_id)
        if device is None or device.status == "offline":
            return False

        existing_stage = self._assignments.get(device_id)
        if existing_stage is not None and existing_stage != stage_id:
            return False  # already assigned to a different stage

        self._assignments[device_id] = stage_id
        device.assigned_stage = stage_id
        return True

    def unassign_device(self, device_id: str) -> bool:
        """Release *device_id* from its current stage assignment.

        Succeeds even when the device currently has no assignment.

        Returns
        -------
        bool
            ``False`` if the device is unknown; ``True`` on success.
        """
        device = self._devices.get(device_id)
        if device is None:
            return False

        self._assignments.pop(device_id, None)
        device.assigned_stage = None
        return True

    def get_load_balance_recommendation(
        self, stages: List[str]
    ) -> Dict[str, str]:
        """Recommend device-to-stage assignment by GOPS (descending).

        Higher-GOPS devices are assigned to earlier stages.  Stages with
        no available device are mapped to an empty string.  Devices whose
        status is "offline" are never recommended, because
        ``assign_device`` would reject them.

        Parameters
        ----------
        stages:
            Ordered list of stage IDs (first stage has highest priority).

        Returns
        -------
        Dict[str, str]
            ``{stage_id: device_id}``; device_id is "" if unavailable.
        """
        # sorted() is stable, so devices with equal GOPS keep scan order.
        available = sorted(
            (d for d in self._devices.values() if d.status != "offline"),
            key=lambda d: d.gops,
            reverse=True,
        )
        recommendation: Dict[str, str] = {}
        for i, stage_id in enumerate(stages):
            if i < len(available):
                recommendation[stage_id] = available[i].device_id
            else:
                recommendation[stage_id] = ""
        return recommendation

    def get_device_statistics(self) -> Dict[str, DeviceInfo]:
        """Return a snapshot of all known devices keyed by device_id.

        The returned dict is a shallow copy: adding or removing keys does
        not affect the manager, but the ``DeviceInfo`` values are the same
        objects the manager holds.
        """
        return dict(self._devices)
|