# cluster4npu/core/device/device_manager.py

"""
core/device/device_manager.py

DeviceManager — manages NPU Dongle discovery, health, and assignment.

Design:
- scan_devices() calls the Kneron KP SDK but accepts an injectable kp_api
  parameter so tests can supply a Mock without real hardware.
- DongleSeriesSpec constants are inlined here to avoid a circular import
  from core.functions.Multidongle.
"""
from __future__ import annotations

from dataclasses import dataclass, field
from typing import Dict, List, Optional

# ---------------------------------------------------------------------------
# GOPS table (mirrors DongleSeriesSpec in Multidongle.py)
# ---------------------------------------------------------------------------

# USB product ID -> dongle series name.  scan_devices() looks descriptors up
# here and labels any product ID that is not listed as "Unknown".
_PRODUCT_ID_TO_SERIES: Dict[int, str] = {
    0x100: "KL520",
    0x720: "KL720",
    0x630: "KL630",
    0x730: "KL730",
}

# Series name -> GOPS figure stored in DeviceInfo.gops and used as the sort
# key for load-balance recommendations.  A series missing from this table
# (including "Unknown") is given 0 by scan_devices().
# NOTE(review): values are copied from DongleSeriesSpec in Multidongle.py and
# must be kept in sync by hand; they were not re-checked against datasheets.
_SERIES_GOPS: Dict[str, int] = {
    "KL520": 2,
    "KL720": 28,
    "KL630": 400,
    "KL730": 1600,
}


# ---------------------------------------------------------------------------
# Data classes
# ---------------------------------------------------------------------------

@dataclass
class DeviceInfo:
    """Snapshot of a single NPU Dongle's state.

    All fields are required and positional, in the order listed below.

    Attributes
    ----------
    device_id:
        Unique id, e.g. ``"usb-<port_id>"``.
    series:
        Series name: ``"KL520"`` | ``"KL720"`` | ...
    product_id:
        Raw USB product ID.
    status:
        ``"online"`` | ``"offline"`` | ``"busy"``.
    gops:
        Compute capacity.
    assigned_stage:
        Currently assigned stage ID, or ``None``.
    current_fps:
        Live inference throughput.
    utilization_pct:
        Utilization, 0.0 – 100.0.
    """

    device_id: str
    series: str
    product_id: int
    status: str
    gops: int
    assigned_stage: Optional[str]
    current_fps: float
    utilization_pct: float


@dataclass
class DeviceHealth:
    """Health snapshot of a single NPU Dongle.

    All fields are required and positional, in the order listed below.

    Attributes
    ----------
    device_id:
        Id of the device this snapshot describes.
    temperature_celsius:
        Temperature reading; ``None`` if the SDK does not support it.
    error_count:
        Number of errors recorded for the device.
    last_error:
        Most recent error message, or ``None``.
    uptime_seconds:
        Device uptime in seconds.
    """

    device_id: str
    temperature_celsius: Optional[float]
    error_count: int
    last_error: Optional[str]
    uptime_seconds: float


# ---------------------------------------------------------------------------
# DeviceManager
# ---------------------------------------------------------------------------

class DeviceManager:
    """Manages NPU Dongle discovery, health queries, and stage assignment.

    Parameters
    ----------
    kp_api:
        Kneron KP SDK module reference.  Pass ``None`` to import the real
        ``kp`` module at runtime, or inject a Mock in tests.

    Notes
    -----
    Stage assignments are stored separately from the device table, so an
    assignment survives a rescan and is re-applied when a descriptor with
    the same ``device_id`` is seen again.
    """

    def __init__(self, kp_api=None) -> None:
        if kp_api is None:
            # Imported lazily so this module can be loaded, and exercised
            # with an injected stand-in, without the SDK being importable.
            import kp as _kp  # real SDK (requires hardware)
            kp_api = _kp
        self._kp = kp_api

        # Known devices, populated by scan_devices()
        self._devices: Dict[str, DeviceInfo] = {}
        # stage assignments: {device_id: stage_id}
        self._assignments: Dict[str, str] = {}

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def scan_devices(self) -> List[DeviceInfo]:
        """Scan for connected Kneron Dongles and update internal state.

        A scan that completes replaces the internal device table with
        exactly what was found, including the case where nothing was
        found.  If the SDK call itself raises, the table is left untouched
        and an empty list is returned.

        Returns
        -------
        List[DeviceInfo]
            All currently connected devices, each with status "online".
            Descriptors that cannot be read are skipped.
        """
        try:
            descriptors = self._kp.core.scan_devices()
        except Exception:
            # Best effort: a failed scan says nothing about what is
            # plugged in, so previously known devices are kept as they are.
            return []

        if not descriptors or descriptors.device_descriptor_number == 0:
            # A successful scan that finds nothing must also replace the
            # table; otherwise unplugged dongles would stay listed.
            self._devices = {}
            return []

        found: Dict[str, DeviceInfo] = {}
        for desc in descriptors.device_descriptor_list:
            try:
                port_id = desc.usb_port_id
                product_id = desc.product_id
                device_id = f"usb-{port_id}"
                series = _PRODUCT_ID_TO_SERIES.get(product_id, "Unknown")
                gops = _SERIES_GOPS.get(series, 0)
                found[device_id] = DeviceInfo(
                    device_id=device_id,
                    series=series,
                    product_id=product_id,
                    status="online",
                    gops=gops,
                    # Carry over an assignment recorded before this scan.
                    assigned_stage=self._assignments.get(device_id),
                    current_fps=0.0,
                    utilization_pct=0.0,
                )
            except Exception:
                # One unreadable descriptor must not abort the whole scan.
                continue

        self._devices = found
        return list(self._devices.values())

    def get_device_health(self, device_id: str) -> DeviceHealth:
        """Return a health snapshot for the given device.

        Temperature is returned as ``None`` because the current KP SDK
        version does not expose thermal sensors.

        Every other field is currently a fixed placeholder (zero errors,
        no last error, zero uptime), and *device_id* is not checked
        against the known devices.
        """
        return DeviceHealth(
            device_id=device_id,
            temperature_celsius=None,
            error_count=0,
            last_error=None,
            uptime_seconds=0.0,
        )

    def assign_device(self, device_id: str, stage_id: str) -> bool:
        """Assign *device_id* to *stage_id*.

        Re-assigning a device to the stage it already holds succeeds.

        Returns
        -------
        bool
            ``False`` if the device is unknown, is offline, or is already
            assigned to a different stage; ``True`` on success.
        """
        device = self._devices.get(device_id)
        if device is None or device.status == "offline":
            return False
        existing_stage = self._assignments.get(device_id)
        if existing_stage is not None and existing_stage != stage_id:
            return False  # already assigned to a different stage
        self._assignments[device_id] = stage_id
        device.assigned_stage = stage_id
        return True

    def unassign_device(self, device_id: str) -> bool:
        """Release *device_id* from its current stage assignment.

        A recorded assignment is released even when the device is missing
        from the latest scan, so a dongle that was unplugged does not keep
        its stage assignment forever.

        Returns
        -------
        bool
            ``False`` if the device is unknown and holds no assignment;
            ``True`` otherwise.
        """
        had_assignment = device_id in self._assignments
        self._assignments.pop(device_id, None)

        device = self._devices.get(device_id)
        if device is None:
            return had_assignment
        device.assigned_stage = None
        return True

    def get_load_balance_recommendation(
        self, stages: List[str]
    ) -> Dict[str, str]:
        """Recommend device-to-stage assignment by GOPS (descending).

        Higher-GOPS devices are assigned to earlier stages.  Devices whose
        status is "offline" are never recommended, matching what
        ``assign_device`` accepts.  Devices with equal GOPS keep the order
        in which they were scanned.  Stages with no available device are
        mapped to an empty string.

        Parameters
        ----------
        stages:
            Ordered list of stage IDs (first stage has highest priority).

        Returns
        -------
        Dict[str, str]
            ``{stage_id: device_id}``; device_id is "" if unavailable.
        """
        candidates = sorted(
            (d for d in self._devices.values() if d.status != "offline"),
            key=lambda d: d.gops,
            reverse=True,
        )
        ranked_ids = [d.device_id for d in candidates]
        return {
            stage_id: ranked_ids[i] if i < len(ranked_ids) else ""
            for i, stage_id in enumerate(stages)
        }

    def get_device_statistics(self) -> Dict[str, DeviceInfo]:
        """Return a snapshot of all known devices keyed by device_id.

        The returned dict is a new (shallow) copy; the ``DeviceInfo``
        values are the same objects held by the manager.
        """
        return dict(self._devices)