""" core/device/device_manager.py DeviceManager — manages NPU Dongle discovery, health, and assignment. Design: - scan_devices() calls the Kneron KP SDK but accepts an injectable kp_api parameter so tests can supply a Mock without real hardware. - DongleSeriesSpec constants are inlined here to avoid a circular import from core.functions.Multidongle. """ from __future__ import annotations from dataclasses import dataclass, field from typing import Dict, List, Optional # --------------------------------------------------------------------------- # GOPS table (mirrors DongleSeriesSpec in Multidongle.py) # --------------------------------------------------------------------------- _PRODUCT_ID_TO_SERIES: Dict[int, str] = { 0x100: "KL520", 0x720: "KL720", 0x630: "KL630", 0x730: "KL730", } _SERIES_GOPS: Dict[str, int] = { "KL520": 2, "KL720": 28, "KL630": 400, "KL730": 1600, } # --------------------------------------------------------------------------- # Data classes # --------------------------------------------------------------------------- @dataclass class DeviceInfo: """Snapshot of a single NPU Dongle's state.""" device_id: str # unique id, e.g. "usb-" series: str # "KL520" | "KL720" | ... product_id: int # raw USB product ID status: str # "online" | "offline" | "busy" gops: int # compute capacity assigned_stage: Optional[str] # currently assigned stage ID, or None current_fps: float # live inference throughput utilization_pct: float # 0.0 – 100.0 @dataclass class DeviceHealth: """Health snapshot of a single NPU Dongle.""" device_id: str temperature_celsius: Optional[float] # None if SDK does not support it error_count: int last_error: Optional[str] uptime_seconds: float # --------------------------------------------------------------------------- # DeviceManager # --------------------------------------------------------------------------- class DeviceManager: """Manages NPU Dongle discovery, health queries, and stage assignment. Parameters ---------- kp_api: Kneron KP SDK module reference. Pass ``None`` to import the real ``kp`` module at runtime, or inject a Mock in tests. """ def __init__(self, kp_api=None) -> None: if kp_api is None: import kp as _kp # real SDK (requires hardware) self._kp = _kp else: self._kp = kp_api # Known devices, populated by scan_devices() self._devices: Dict[str, DeviceInfo] = {} # stage assignments: {device_id: stage_id} self._assignments: Dict[str, str] = {} # ------------------------------------------------------------------ # Public API # ------------------------------------------------------------------ def scan_devices(self) -> List[DeviceInfo]: """Scan for connected Kneron Dongles and update internal state. Returns ------- List[DeviceInfo] All currently connected devices, each with status "online". """ try: descriptors = self._kp.core.scan_devices() except Exception: return [] if not descriptors or descriptors.device_descriptor_number == 0: return [] found: Dict[str, DeviceInfo] = {} for desc in descriptors.device_descriptor_list: try: port_id = desc.usb_port_id product_id = desc.product_id device_id = f"usb-{port_id}" series = _PRODUCT_ID_TO_SERIES.get(product_id, "Unknown") gops = _SERIES_GOPS.get(series, 0) assigned = self._assignments.get(device_id) info = DeviceInfo( device_id=device_id, series=series, product_id=product_id, status="online", gops=gops, assigned_stage=assigned, current_fps=0.0, utilization_pct=0.0, ) found[device_id] = info except Exception: continue self._devices = found return list(self._devices.values()) def get_device_health(self, device_id: str) -> DeviceHealth: """Return a health snapshot for the given device. Temperature is returned as ``None`` because the current KP SDK version does not expose thermal sensors. """ return DeviceHealth( device_id=device_id, temperature_celsius=None, error_count=0, last_error=None, uptime_seconds=0.0, ) def assign_device(self, device_id: str, stage_id: str) -> bool: """Assign *device_id* to *stage_id*. Returns ------- bool ``False`` if the device is unknown or already assigned to a different stage; ``True`` on success. """ device = self._devices.get(device_id) if device is None or device.status == "offline": return False existing_stage = self._assignments.get(device_id) if existing_stage is not None and existing_stage != stage_id: return False # already assigned to a different stage self._assignments[device_id] = stage_id self._devices[device_id].assigned_stage = stage_id return True def unassign_device(self, device_id: str) -> bool: """Release *device_id* from its current stage assignment. Returns ------- bool ``False`` if the device is unknown; ``True`` on success. """ if device_id not in self._devices: return False self._assignments.pop(device_id, None) self._devices[device_id].assigned_stage = None return True def get_load_balance_recommendation( self, stages: List[str] ) -> Dict[str, str]: """Recommend device-to-stage assignment by GOPS (descending). Higher-GOPS devices are assigned to earlier stages. Stages with no available device are mapped to an empty string. Parameters ---------- stages: Ordered list of stage IDs (first stage has highest priority). Returns ------- Dict[str, str] ``{stage_id: device_id}``; device_id is "" if unavailable. """ available = sorted( self._devices.values(), key=lambda d: d.gops, reverse=True, ) recommendation: Dict[str, str] = {} for i, stage_id in enumerate(stages): if i < len(available): recommendation[stage_id] = available[i].device_id else: recommendation[stage_id] = "" return recommendation def get_device_statistics(self) -> Dict[str, DeviceInfo]: """Return a snapshot of all known devices keyed by device_id.""" return dict(self._devices)