jim800121chen b71ff4cd3c perf(local-tool): Windows KL520 cold-boot connect 106s → ~40s(跳過多餘 reset)
背景:
Windows 實測 KL520 首次 connect 耗時 106 秒,原因是 reset 流程內部重複
firmware load:
  1. 進來 Loader → load firmware (35s) → Comp/U
  2. reset 退回 Loader → bridge 重啟
  3. reconnect 進來又是 Loader → load firmware (30s) → Comp/U
  4. Loader reconnect 第一次常 fail(15s timeout)
總共 ~65s 花在「砍掉剛載好的 firmware、再載一次」的白工上。

根因:先前修的 needsReset 邏輯不管 firmware 新舊一律 reset。但 Error 15
只發生在「Comp/U 是上次 session 殘留」的情境;「本次 connect 內部剛載的
Comp/U」session 是乾淨的,不需要 reset。

修法(條件性 reset):
- server/scripts/kneron_bridge.py:connect handler 新增追蹤本次有無走
  firmware load flow,return 多帶 `fresh_firmware_loaded` bool
- server/internal/driver/kneron/kl720_driver.go:Connect 讀 flag,若為
  true 就 skipReset(firmware 剛載的,session 已乾淨)

驗證(2026-04-21):
- `/tmp/test_bridge.py` 拔插 USB 後跑 `connect (fw=Loader) →
  fresh_firmware_loaded=True → skip reset → load_model → inference`
  → 11 detections(person×8, tie×3, latency 332ms)
- Mac UI Comp/U 殘留路徑:reset → 11 bbox ✓
- Mac UI Loader cold-boot 路徑(拔插後):skip reset → 11 bbox ✓

預期效益:
- Windows cold-boot(常見):106s → ~40s(省 65s)
- Mac 跨 session(常見):~15-20s 不變
- 極少數(Windows device 未斷電但跨 server session):走完整 reset

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-21 11:09:25 +08:00

698 lines
20 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package kneron
import (
"bufio"
"encoding/base64"
"encoding/json"
"fmt"
"io"
"os"
"os/exec"
"path/filepath"
"runtime"
"strings"
"sync"
"time"
"visiona-local/server/internal/driver"
"visiona-local/server/pkg/logger"
)
// LogFunc is the signature of a logging callback that receives a severity
// level and a message. It is intended for functions that write a log line to
// both stderr and the WebSocket broadcaster; when nil, logs go only to stderr.
//
// NOTE(review): nothing in the visible part of this file references LogFunc —
// confirm it is still used elsewhere in the package.
type LogFunc func(level, msg string)
// KneronDriver implements driver.DeviceDriver for Kneron NPU devices
// (KL520, KL720, etc.). It delegates hardware operations to a Python
// subprocess (kneron_bridge.py) that communicates via JSON-RPC over
// stdin/stdout.
//
// The exported methods take mu before touching the fields below; the
// unexported helpers startPython, sendCommand and stopPython do not lock and
// are called with mu already held.
type KneronDriver struct {
// info is the device description returned by Info; its Status, FirmwareVer
// and FlashedModel fields are updated as the driver changes state.
info driver.DeviceInfo
// connected is set by Connect and cleared by Disconnect.
connected bool
// inferring is toggled by StartInference / StopInference.
inferring bool
// modelLoaded is the path of the model most recently loaded by Flash; it is
// cleared whenever the bridge is restarted.
modelLoaded string
chipType string // "KL520" or "KL720" — derived from info.Type
// mu guards the mutable state of the driver and serialises bridge I/O.
mu sync.Mutex
// scriptPath is the location of the kneron_bridge.py script.
scriptPath string
// pythonCmd is the running bridge subprocess (nil when stopped).
pythonCmd *exec.Cmd
// stdin is the write end of the bridge's stdin; one JSON command per line.
stdin io.WriteCloser
// stdout scans the bridge's stdout; one JSON response per line.
stdout *bufio.Scanner
// pythonReady is true once the bridge has sent its "ready" status line and
// until stopPython is called.
pythonReady bool
// logBroadcaster, when non-nil, receives driver messages and bridge stderr.
logBroadcaster *logger.Broadcaster
needsReset bool // true on first connect after server start to clear stale models
}
// NewKneronDriver builds a KneronDriver for the given device description and
// bridge script path. The chip variant defaults to "KL520" and becomes
// "KL720" when info.Type contains "kl720" (case-insensitive). A new driver
// always starts with needsReset set, so its first Connect may reset the device.
func NewKneronDriver(info driver.DeviceInfo, scriptPath string) *KneronDriver {
	drv := &KneronDriver{
		info:       info,
		scriptPath: scriptPath,
		chipType:   "KL520",
		needsReset: true,
	}
	if strings.Contains(strings.ToLower(info.Type), "kl720") {
		drv.chipType = "KL720"
	}
	return drv
}
// SetLogBroadcaster attaches a log broadcaster so that bridge stderr
// and driver messages are forwarded to the frontend.
//
// NOTE(review): the field is assigned without holding d.mu, while driverLog
// and the stderr-forwarding goroutine started by startPython read it —
// presumably this is only called once, before Connect; confirm.
func (d *KneronDriver) SetLogBroadcaster(b *logger.Broadcaster) {
d.logBroadcaster = b
}
// driverLog formats a message, prints it as one line on stderr and, when a
// broadcaster is attached, pushes it there with the given level.
func (d *KneronDriver) driverLog(level, format string, args ...interface{}) {
	line := fmt.Sprintf(format, args...)
	fmt.Fprintln(os.Stderr, line)
	if b := d.logBroadcaster; b != nil {
		b.Push(level, line)
	}
}
// NewKL720Driver is a backward-compatible alias for NewKneronDriver.
//
// Deprecated: Use NewKneronDriver instead.
func NewKL720Driver(info driver.DeviceInfo, scriptPath string) *KneronDriver {
return NewKneronDriver(info, scriptPath)
}
// KL720Driver is a backward-compatible type alias for KneronDriver.
//
// Deprecated: Use KneronDriver instead.
type KL720Driver = KneronDriver
// resolvePython returns the Python interpreter to launch the bridge with, as
// chosen by the package-level ResolvePython for this driver's script path.
func (d *KneronDriver) resolvePython() string {
	interpreter := ResolvePython(d.scriptPath)
	return interpreter
}
// startPython launches the Python bridge subprocess and waits for the
// "ready" signal on stdout.
//
// On success d.pythonCmd, d.stdin and d.stdout refer to the new process and
// d.pythonReady is true. On failure an error is returned and, if the process
// had already been started, it is torn down again via stopPython.
//
// It does not take d.mu itself; the callers in this file (Connect and
// restartBridge) invoke it with the mutex held.
func (d *KneronDriver) startPython() error {
pythonBin := d.resolvePython()
scriptDir := filepath.Dir(d.scriptPath)
cmd := exec.Command(pythonBin, d.scriptPath)
// On macOS with Apple Silicon, Kneron SDK requires x86_64 (Rosetta 2).
// The venv should already contain the correct architecture Python.
// PYTHONUNBUFFERED=1 is added so the bridge's line-based replies are not
// held back in Python's output buffers; the native library search path is
// added separately below.
cmd.Env = append(os.Environ(),
"PYTHONUNBUFFERED=1",
)
// Add library path for native kp module if lib directory exists
// (DYLD_LIBRARY_PATH on macOS so libkplus.dylib can be found,
// LD_LIBRARY_PATH elsewhere).
// NOTE(review): the entry is appended as a fresh KEY=value pair, so any
// DYLD_LIBRARY_PATH / LD_LIBRARY_PATH inherited from os.Environ() is
// overridden rather than extended — confirm that is intended.
libDir := filepath.Join(scriptDir, "lib")
if _, err := os.Stat(libDir); err == nil {
if runtime.GOOS == "darwin" {
cmd.Env = append(cmd.Env, "DYLD_LIBRARY_PATH="+libDir)
} else {
cmd.Env = append(cmd.Env, "LD_LIBRARY_PATH="+libDir)
}
}
// On Windows, ensure libusb-1.0.dll can be found by adding the install
// directory to PATH (installer places the DLL there). The install
// directory is taken to be the parent of the script directory, and both
// are prepended to the current PATH.
if runtime.GOOS == "windows" {
installDir := filepath.Dir(scriptDir)
cmd.Env = append(cmd.Env,
fmt.Sprintf("PATH=%s;%s;%s", installDir, scriptDir, os.Getenv("PATH")))
}
stdinPipe, err := cmd.StdinPipe()
if err != nil {
return fmt.Errorf("failed to create stdin pipe: %w", err)
}
stdoutPipe, err := cmd.StdoutPipe()
if err != nil {
stdinPipe.Close()
return fmt.Errorf("failed to create stdout pipe: %w", err)
}
// Capture stderr from the Python bridge: forward each line to both
// os.Stderr and the WebSocket broadcaster so it shows in the frontend.
stderrPipe, err := cmd.StderrPipe()
if err != nil {
stdinPipe.Close()
stdoutPipe.Close()
return fmt.Errorf("failed to create stderr pipe: %w", err)
}
// NOTE(review): on a failed Start only stdinPipe is closed explicitly —
// presumably os/exec releases the stdout/stderr pipes itself; confirm.
if err := cmd.Start(); err != nil {
stdinPipe.Close()
return fmt.Errorf("failed to start python bridge (%s): %w", pythonBin, err)
}
// Forward bridge stderr line-by-line to os.Stderr + broadcaster.
// The goroutine ends when the scanner stops, i.e. when the stderr pipe is
// closed or a scan error occurs.
// NOTE(review): the scanner uses bufio's default token limit, so an
// over-long stderr line would end the forwarding early — confirm the
// bridge never emits such lines.
go func() {
scanner := bufio.NewScanner(stderrPipe)
for scanner.Scan() {
line := scanner.Text()
fmt.Fprintln(os.Stderr, line)
if d.logBroadcaster != nil {
d.logBroadcaster.Push("DEBUG", line)
}
}
}()
d.pythonCmd = cmd
d.stdin = stdinPipe
d.stdout = bufio.NewScanner(stdoutPipe)
// Increase scanner buffer for large inference responses
// (64 KiB initial buffer, 1 MiB maximum line length).
d.stdout.Buffer(make([]byte, 0, 64*1024), 1024*1024)
// Wait for the ready signal from the Python process: the first stdout
// line must be a JSON object whose "status" field is "ready". This read
// blocks with no timeout until the bridge prints a line or closes stdout.
if d.stdout.Scan() {
var resp map[string]interface{}
if err := json.Unmarshal([]byte(d.stdout.Text()), &resp); err == nil {
if status, ok := resp["status"].(string); ok && status == "ready" {
d.pythonReady = true
return nil
}
}
}
// If we didn't get a ready signal, clean up and report failure.
d.stopPython()
return fmt.Errorf("python bridge did not send ready signal")
}
// sendCommand performs one request/response round trip with the Python
// bridge: cmd is marshalled to JSON and written to the bridge's stdin as a
// single line, then one line is read back from its stdout and decoded.
//
// It returns an error when the bridge is not running, when the write or read
// fails, when the reply is not valid JSON, or when the reply carries a string
// "error" field (reported as "python bridge error: ..."). It does not lock
// d.mu; callers in this file hold the mutex around the call.
func (d *KneronDriver) sendCommand(cmd map[string]interface{}) (map[string]interface{}, error) {
	if !d.pythonReady {
		return nil, fmt.Errorf("python bridge is not running")
	}

	payload, err := json.Marshal(cmd)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal command: %w", err)
	}

	// The protocol is newline-delimited: one JSON document per line.
	if _, err = fmt.Fprintf(d.stdin, "%s\n", payload); err != nil {
		return nil, fmt.Errorf("failed to write to python bridge: %w", err)
	}

	if gotLine := d.stdout.Scan(); !gotLine {
		scanErr := d.stdout.Err()
		if scanErr == nil {
			// Scan stopped without an error: stdout reached EOF.
			return nil, fmt.Errorf("python bridge closed unexpectedly")
		}
		return nil, fmt.Errorf("failed to read from python bridge: %w", scanErr)
	}

	var reply map[string]interface{}
	if err = json.Unmarshal(d.stdout.Bytes(), &reply); err != nil {
		return nil, fmt.Errorf("failed to parse python response: %w", err)
	}

	// A string "error" field marks a failure reported by the bridge itself.
	bridgeErr, failed := reply["error"].(string)
	if failed {
		return nil, fmt.Errorf("python bridge error: %s", bridgeErr)
	}
	return reply, nil
}
// stopPython tears down the bridge subprocess: it marks the bridge as not
// ready, closes its stdin, kills and reaps the process, and drops the stdout
// scanner. Errors from Close, Kill and Wait are deliberately ignored — this
// is best-effort cleanup. It does not lock d.mu.
func (d *KneronDriver) stopPython() {
	d.pythonReady = false

	if w := d.stdin; w != nil {
		_ = w.Close()
		d.stdin = nil
	}

	if c := d.pythonCmd; c != nil && c.Process != nil {
		_ = c.Process.Kill()
		_ = c.Wait()
		d.pythonCmd = nil
	}

	d.stdout = nil
}
// Info returns a copy of the driver's current device information, read under
// the driver mutex.
func (d *KneronDriver) Info() driver.DeviceInfo {
	d.mu.Lock()
	snapshot := d.info
	d.mu.Unlock()
	return snapshot
}
// Connect starts the Python bridge subprocess and connects to the Kneron device.
// It is a no-op when the driver is already connected.
//
// On the first connect after server start (needsReset), the device is reset
// via restartBridge to clear a stale session left over from a previous run —
// unless the bridge reports fresh_firmware_loaded=true, in which case the
// firmware was loaded during this very connect and the session is already
// clean.
//
// An error is returned when the bridge cannot be started, when the connect
// command fails, or when the first-connect reset fails and leaves the bridge
// stopped. In the last case the driver is rolled back to a disconnected state
// (with needsReset re-armed) so that a later Connect retries from scratch
// instead of reporting a connection that has no bridge behind it.
func (d *KneronDriver) Connect() error {
	d.mu.Lock()
	if d.connected {
		d.mu.Unlock()
		return nil
	}
	needsReset := d.needsReset
	d.info.Status = driver.StatusConnecting

	// Start the Python bridge process.
	if err := d.startPython(); err != nil {
		d.info.Status = driver.StatusError
		d.mu.Unlock()
		return fmt.Errorf("failed to start hardware bridge: %w", err)
	}

	// Send connect command to the bridge.
	resp, err := d.sendCommand(map[string]interface{}{
		"cmd":         "connect",
		"port":        d.info.Port,
		"index":       0,
		"device_type": d.info.Type,
	})
	if err != nil {
		d.stopPython()
		d.info.Status = driver.StatusError
		d.mu.Unlock()
		return fmt.Errorf("failed to connect to device: %w", err)
	}

	d.connected = true
	d.needsReset = false
	d.info.Status = driver.StatusConnected
	if fw, ok := resp["firmware"].(string); ok {
		d.info.FirmwareVer = fw
	}
	// Bridge reports whether firmware was freshly loaded during this connect.
	// Freshly loaded firmware = clean state → no reset needed.
	// Firmware already present (left over from a previous session) → must
	// reset to avoid Error 15 SEND_DATA_TOO_LARGE on first inference.
	// A missing or non-bool field is treated as false (i.e. reset).
	freshFirmware, _ := resp["fresh_firmware_loaded"].(bool)
	d.mu.Unlock()

	if !needsReset {
		return nil
	}

	// First connect after server start: reset device to clear stale session.
	//
	// Why reset is needed:
	//   - KL720: flash-based; firmware and model persist in flash, so a reset
	//     is what actually clears a stale model.
	//   - KL520: USB Boot / RAM-based. If the firmware is left over from an
	//     earlier session (not a Comp/U that was just loaded), load_model +
	//     inference fails with Error 15 every time. The device must go
	//     reset → Loader → reload firmware → Comp/U to get a clean session.
	//
	// Why we skip reset when freshFirmware is true:
	//   - This connect just performed a full firmware load, so Comp/U is
	//     fresh and clean. Resetting again would throw that away and reload
	//     it, wasting 30-60s for nothing.
	//   - This is the most common case on a Windows cold boot (first connect
	//     after the device lost power) and saves the ~65s cost of
	//     restartBridge.
	if freshFirmware {
		d.driverLog("INFO", "[kneron] %s: skipping reset — firmware just loaded, session already clean", d.chipType)
		return nil
	}

	d.driverLog("INFO", "[kneron] first connect — resetting %s to clear stale session (firmware was already present)...", d.chipType)
	resetErr := d.restartBridge()
	if resetErr == nil {
		d.driverLog("INFO", "[kneron] device reset complete — clean state ready")
		return nil
	}
	d.driverLog("WARN", "[kneron] reset on connect failed: %v", resetErr)

	// Every failure path of restartBridge leaves the bridge stopped. Keeping
	// connected=true here would make later Connect calls no-ops while every
	// Flash/RunInference fails with "bridge is not running", so undo the
	// connection and surface the error instead.
	d.mu.Lock()
	bridgeAlive := d.pythonReady
	if !bridgeAlive {
		d.connected = false
		d.needsReset = true
		d.info.Status = driver.StatusError
	}
	d.mu.Unlock()
	if !bridgeAlive {
		return fmt.Errorf("failed to reset device on first connect: %w", resetErr)
	}
	return nil
}
// Disconnect shuts the connection down: if the bridge is still running it is
// asked to disconnect (best effort, the reply is ignored), then the bridge
// process is stopped and the driver is marked disconnected and not
// inferring. Calling it on a driver that is not connected does nothing.
// It always returns nil.
func (d *KneronDriver) Disconnect() error {
	d.mu.Lock()
	defer d.mu.Unlock()

	if d.connected {
		if d.pythonReady {
			// The process is killed right after, so a failed command is harmless.
			_, _ = d.sendCommand(map[string]interface{}{"cmd": "disconnect"})
		}
		d.stopPython()
		d.connected = false
		d.inferring = false
		d.info.Status = driver.StatusDisconnected
	}
	return nil
}
// IsConnected reports the driver's connected flag, read under the driver
// mutex.
func (d *KneronDriver) IsConnected() bool {
	d.mu.Lock()
	state := d.connected
	d.mu.Unlock()
	return state
}
// restartBridge resets the Kneron device and brings up a fresh Python bridge.
//
// Background: in KL520 USB Boot mode only one model can be loaded per
// firmware session, so switching models requires a full cycle:
//  1. Send "reset" through the current bridge — this calls
//     kp.core.reset_device(), which forces the device back to Loader
//     (USB Boot) state and wipes firmware + model from RAM.
//  2. Kill the Python bridge process.
//  3. Wait for the device to re-enumerate on USB (~8 s).
//  4. Start a new Python bridge.
//  5. Send "connect", which reloads the firmware from scratch.
//
// On success the device is in a clean state, ready for load_model, and the
// driver's record of the loaded model has been cleared. On failure the bridge
// is left stopped and an error is returned.
//
// The mutex is acquired internally (and released during the wait), so the
// caller must NOT hold d.mu.
func (d *KneronDriver) restartBridge() error {
	// Steps 1 + 2, under the lock: reset the device and stop the old bridge.
	d.mu.Lock()
	devicePort := d.info.Port
	d.modelLoaded = ""
	if d.pythonReady {
		d.driverLog("INFO", "[kneron] sending reset command to device...")
		// The result is ignored on purpose: the device may already have
		// dropped off the bus by the time the command is sent.
		_, _ = d.sendCommand(map[string]interface{}{"cmd": "reset"})
	}
	d.stopPython()
	d.mu.Unlock()

	// Step 3, lock released: the reset makes the device disappear from USB
	// and come back as a Loader-mode device, which typically takes 5-8 s.
	d.driverLog("INFO", "[kneron] bridge stopped, waiting for USB re-enumerate after reset...")
	time.Sleep(8 * time.Second)

	// Steps 4 + 5, under the lock again until return.
	d.mu.Lock()
	defer d.mu.Unlock()

	d.driverLog("INFO", "[kneron] starting new bridge process...")
	if startErr := d.startPython(); startErr != nil {
		return fmt.Errorf("failed to restart bridge: %w", startErr)
	}

	d.driverLog("INFO", "[kneron] bridge started, reconnecting to device (port=%s)...", devicePort)
	connectReq := map[string]interface{}{
		"cmd":         "connect",
		"port":        devicePort,
		"index":       0,
		"device_type": d.info.Type,
	}
	if _, connErr := d.sendCommand(connectReq); connErr != nil {
		d.stopPython()
		return fmt.Errorf("failed to reconnect after bridge restart: %w", connErr)
	}

	d.driverLog("INFO", "[kneron] device reconnected after reset + bridge restart")
	return nil
}
// Flash loads the model at modelPath onto the Kneron device, reporting
// progress on progressCh. Sends on progressCh block, so the caller must keep
// receiving until Flash returns.
//
// If the requested model is already recorded as loaded, no command is sent
// and completion is reported immediately. Otherwise a load_model command is
// issued; errors whose text contains "Error code: 40", "SECOND_MODEL",
// "broken pipe" or "USB_TIMEOUT" are retried, with chip-specific handling:
//   - KL520 (USB Boot): only one model per session. The device is reset and
//     the bridge restarted (firmware reload) before a single retry.
//   - KL720 (flash-based): models can be freely reloaded, so error 40 is not
//     expected. A plain retry is attempted first; only if that fails is the
//     bridge restarted, followed by one more attempt.
//
// After a successful load the remaining progress stages are simulated with
// timed updates, the model is recorded on the driver and a final 100% "done"
// update is sent. On every exit path the status is set back to
// StatusConnected.
func (d *KneronDriver) Flash(modelPath string, progressCh chan<- driver.FlashProgress) error {
	// backToConnected returns the device status to its idle value.
	backToConnected := func() {
		d.mu.Lock()
		d.info.Status = driver.StatusConnected
		d.mu.Unlock()
	}
	// loadModel sends one load_model command under the lock. After a bridge
	// restart the status is re-asserted as flashing in the same critical
	// section (reassertFlashing).
	loadModel := func(reassertFlashing bool) error {
		d.mu.Lock()
		defer d.mu.Unlock()
		if reassertFlashing {
			d.info.Status = driver.StatusFlashing
		}
		_, loadErr := d.sendCommand(map[string]interface{}{
			"cmd":  "load_model",
			"path": modelPath,
		})
		return loadErr
	}
	// preparing reports the 5% "preparing" stage with the given message.
	preparing := func(msg string) {
		progressCh <- driver.FlashProgress{
			Percent: 5,
			Stage:   "preparing",
			Message: msg,
		}
	}

	d.mu.Lock()
	d.info.Status = driver.StatusFlashing
	bridgeUp := d.pythonReady
	loadedModel := d.modelLoaded
	chip := d.chipType
	d.mu.Unlock()

	if !bridgeUp {
		backToConnected()
		return fmt.Errorf("hardware bridge is not running — cannot flash model")
	}

	// Nothing to do when the very same model is already on the device.
	if loadedModel != "" && loadedModel == modelPath {
		d.driverLog("INFO", "[kneron] model already loaded (%s), skipping reload", modelPath)
		progressCh <- driver.FlashProgress{
			Percent: 50,
			Stage:   "transferring",
			Message: "model already loaded on device",
		}
		backToConnected()
		progressCh <- driver.FlashProgress{Percent: 100, Stage: "done", Message: "Flash complete (model already loaded)"}
		return nil
	}

	// First attempt.
	preparing("preparing... loading model to device")
	err := loadModel(false)

	if err != nil {
		errMsg := err.Error()
		d.driverLog("WARN", "[kneron] load_model failed: %s", errMsg)

		// Retryable failures: error 40 / second model, broken pipe, USB timeout.
		retryable := false
		for _, marker := range []string{"Error code: 40", "SECOND_MODEL", "broken pipe", "USB_TIMEOUT"} {
			if strings.Contains(errMsg, marker) {
				retryable = true
				break
			}
		}

		switch {
		case !retryable:
			// Fall through to the common failure handling below.

		case chip == "KL720":
			// KL720: error 40 should not occur, so try again on the same
			// bridge before resorting to a full restart.
			d.driverLog("WARN", "[kneron] KL720 unexpected retryable error, retrying without restart...")
			preparing("preparing... retrying model load")
			err = loadModel(false)
			if err != nil {
				// Last resort: reset the device and restart the bridge.
				d.driverLog("WARN", "[kneron] KL720 retry failed: %v, falling back to bridge restart...", err)
				if restartErr := d.restartBridge(); restartErr != nil {
					backToConnected()
					return fmt.Errorf("failed to reset device: %w", restartErr)
				}
				err = loadModel(true)
			}

		default:
			// KL520: error 40 means a model is already loaded in this USB
			// Boot session; the device must be reset and firmware reloaded.
			d.driverLog("WARN", "[kneron] KL520 retryable error, restarting bridge...")
			preparing("preparing... resetting device for new model")
			if restartErr := d.restartBridge(); restartErr != nil {
				d.driverLog("ERROR", "[kneron] restartBridge failed: %v", restartErr)
				backToConnected()
				return fmt.Errorf("failed to reset device: %w", restartErr)
			}
			d.driverLog("INFO", "[kneron] bridge restarted, retrying load_model...")
			err = loadModel(true)
		}
	}

	if err != nil {
		d.driverLog("ERROR", "[kneron] load_model ultimately failed: %v", err)
		backToConnected()
		return fmt.Errorf("failed to load model: %w", err)
	}
	d.driverLog("INFO", "[kneron] load_model succeeded: %s", modelPath)

	// The Kneron SDK gives no granular progress, so the remaining stages are
	// approximated with timed updates once the model has actually loaded.
	type stage struct {
		name     string
		duration time.Duration
		startPct int
		endPct   int
	}
	var stages []stage
	if chip == "KL720" {
		// KL720 is faster (USB 3.0, no firmware reload needed).
		stages = []stage{
			{"transferring", 1 * time.Second, 10, 80},
			{"verifying", 500 * time.Millisecond, 80, 95},
			{"finalizing", 200 * time.Millisecond, 95, 99},
		}
	} else {
		stages = []stage{
			{"transferring", 2 * time.Second, 10, 80},
			{"verifying", 1 * time.Second, 80, 95},
			{"finalizing", 500 * time.Millisecond, 95, 99},
		}
	}

	for _, st := range stages {
		span := st.endPct - st.startPct
		// Roughly one update per 5 percentage points, at least one step.
		steps := span / 5
		if steps < 1 {
			steps = 1
		}
		pause := st.duration / time.Duration(steps)
		for step := 0; step <= steps; step++ {
			pct := st.startPct + span*step/steps
			progressCh <- driver.FlashProgress{
				Percent: pct,
				Stage:   st.name,
				Message: fmt.Sprintf("%s... %d%%", st.name, pct),
			}
			time.Sleep(pause)
		}
	}

	// Record the loaded model and return to the idle status.
	d.mu.Lock()
	d.modelLoaded = modelPath
	d.info.FlashedModel = modelPath
	d.info.Status = driver.StatusConnected
	d.mu.Unlock()

	progressCh <- driver.FlashProgress{Percent: 100, Stage: "done", Message: "Flash complete"}
	return nil
}
// StartInference switches the driver into continuous inference mode by
// setting the inferring flag and the StatusInferencing status. It returns an
// error when the device is not connected. No command is sent to the bridge.
func (d *KneronDriver) StartInference() error {
	d.mu.Lock()
	defer d.mu.Unlock()

	if d.connected {
		d.inferring = true
		d.info.Status = driver.StatusInferencing
		return nil
	}
	return fmt.Errorf("device not connected")
}
// StopInference leaves continuous inference mode. The inferring flag is
// always cleared; the status is returned to StatusConnected only while the
// device is actually connected, so calling this after Disconnect (for
// example from cleanup code) no longer makes a disconnected device report
// itself as connected. It always returns nil.
func (d *KneronDriver) StopInference() error {
	d.mu.Lock()
	defer d.mu.Unlock()

	d.inferring = false
	if d.connected {
		d.info.Status = driver.StatusConnected
	}
	return nil
}
// ReadInference is shorthand for RunInference(nil): it requests an inference
// from the bridge without supplying image data.
func (d *KneronDriver) ReadInference() (*driver.InferenceResult, error) {
	result, err := d.RunInference(nil)
	return result, err
}
// RunInference sends an "inference" command to the bridge with imageData
// encoded as standard base64 and returns the parsed result. When imageData
// is nil (or empty) the image_base64 field is the empty string.
//
// It returns an error if the bridge is not running, if the command fails, or
// if the reply cannot be converted into an InferenceResult.
func (d *KneronDriver) RunInference(imageData []byte) (*driver.InferenceResult, error) {
	d.mu.Lock()
	bridgeUp := d.pythonReady
	d.mu.Unlock()
	if !bridgeUp {
		return nil, fmt.Errorf("hardware bridge is not running — device may not be connected")
	}

	// Build the request outside the lock. EncodeToString yields "" for a nil
	// slice, so no special case is needed for missing image data.
	request := map[string]interface{}{
		"cmd":          "inference",
		"image_base64": base64.StdEncoding.EncodeToString(imageData),
	}

	d.mu.Lock()
	reply, err := d.sendCommand(request)
	d.mu.Unlock()
	if err != nil {
		return nil, fmt.Errorf("inference failed: %w", err)
	}
	return parseInferenceResult(reply)
}
// parseInferenceResult turns a decoded bridge reply into an InferenceResult
// by round-tripping it through JSON, so the struct's own JSON field mapping
// decides which keys are picked up.
func parseInferenceResult(resp map[string]interface{}) (*driver.InferenceResult, error) {
	raw, err := json.Marshal(resp)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal response: %w", err)
	}

	parsed := new(driver.InferenceResult)
	if err := json.Unmarshal(raw, parsed); err != nil {
		return nil, fmt.Errorf("failed to parse inference result: %w", err)
	}
	return parsed, nil
}
// GetModelInfo describes the model currently recorded as loaded, using its
// path as both ID and Name. It returns an error when no model is loaded.
//
// Note that LoadedAt is the time of this call, not the time the model was
// actually loaded — the driver does not record the latter.
func (d *KneronDriver) GetModelInfo() (*driver.ModelInfo, error) {
	d.mu.Lock()
	model := d.modelLoaded
	d.mu.Unlock()

	if model == "" {
		return nil, fmt.Errorf("no model loaded")
	}
	modelInfo := &driver.ModelInfo{
		ID:       model,
		Name:     model,
		LoadedAt: time.Now(),
	}
	return modelInfo, nil
}