jim800121chen 3f0175f1a9 feat(local-agent): Phase 0.5 visionA Agent — Wails 桌面 + tunnel client + 配對 UI
從 local-tool 複製出獨立的「visionA Agent」桌面應用(A3 純橋樑:
tunnel client + 配對 UI + 設定,不開 HTTP port、不做本機裝置/推論 UI)。
Bundle ID 與 local-tool 不同(com.innovedus.visiona-agent vs visiona-local),
雙 app 可共存。fork 後不主動 sync,需要時手動 cherry-pick。

Backend / Wails Go(AB1-AB13):
- internal/tunnel:6 狀態機(Idle/Connecting/Connected/Reconnecting/Failed/Stopped)
  + Pair/Unpair/Reconnect/Disconnect binding + ClientHooks event
- internal/auth:encrypted file token store(AES-GCM + scrypt + machineID
  fallback salt + 13 tests)
- internal/config:YAML validation + atomic write + 11 tests
- internal/log:ring buffer + ExportLog 升級 zip
- visionA-backend /api/pairing/exchange:SessionTokenStore + 17 new tests
- 三平台 build 驗證(macOS DMG 160 MB / Windows EXE / Linux AppImage)
- end-to-end 5 milestone 全綠(pairing → tunnel → forward → reuse 防護
  → tunnel drop failover)

Frontend / Next.js(AF1-AF7,沿用 visionA-frontend 基礎):
- AppShell + Header + TabNav(StatusView / PairView / SettingsView 三 tab)
- ConnectionStatusBadge 5 種狀態
- TokenInput regex 驗證 + 7 種錯誤 + 0.5s auto-switch 到狀態頁
- 設定頁 4 區塊(含重新配對 AlertDialog)
- agent-api.ts 封裝 Wails bindings(mock/real 雙實作)+ 90 tests

Phase 0.7 review-driven fix(Round 2):
- A1 Session fixation 防護(RotateSessionID)
- A3 mock pairing 預設改 false(必須明確 opt-in)+ startup log
- A4 Pair 失敗後 state 清理矩陣(exchange/Save/Start fail 各自終態)
- A5 Pair/Unpair/Reconnect lifecycleMu + 50 goroutine race test
- F1 重新配對次按鈕 / F2 PairView Esc cancel / F3 Wails BrowserOpenURL
  / F4 Settings draft 持久 + 未儲存 badge

驗證:agent backend go test -race -count=3 ./... 4 packages 全綠 /
agent frontend pnpm test 119 tests 全綠

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 11:22:01 +08:00

698 lines
20 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package kneron
import (
"bufio"
"encoding/base64"
"encoding/json"
"fmt"
"io"
"os"
"os/exec"
"path/filepath"
"runtime"
"strings"
"sync"
"time"
"visiona-agent/server/internal/driver"
"visiona-agent/server/pkg/logger"
)
// LogFunc is a function that writes a log line to both stderr and
// the WebSocket broadcaster. When nil, logs go only to stderr.
type LogFunc func(level, msg string)
// KneronDriver implements driver.DeviceDriver for Kneron NPU devices
// (KL520, KL720, etc.). It delegates hardware operations to a Python
// subprocess (kneron_bridge.py) that communicates via JSON-RPC over
// stdin/stdout.
type KneronDriver struct {
info driver.DeviceInfo
connected bool
inferring bool
modelLoaded string
chipType string // "KL520" or "KL720" — derived from info.Type
mu sync.Mutex
scriptPath string
pythonCmd *exec.Cmd
stdin io.WriteCloser
stdout *bufio.Scanner
pythonReady bool
logBroadcaster *logger.Broadcaster
needsReset bool // true on first connect after server start to clear stale models
}
// NewKneronDriver creates a new KneronDriver with the given device info and
// path to the kneron_bridge.py script. Works for any Kneron chip variant.
func NewKneronDriver(info driver.DeviceInfo, scriptPath string) *KneronDriver {
chip := "KL520"
if strings.Contains(strings.ToLower(info.Type), "kl720") {
chip = "KL720"
}
return &KneronDriver{
info: info,
scriptPath: scriptPath,
chipType: chip,
needsReset: true,
}
}
// SetLogBroadcaster attaches a log broadcaster so that bridge stderr
// and driver messages are forwarded to the frontend.
func (d *KneronDriver) SetLogBroadcaster(b *logger.Broadcaster) {
d.logBroadcaster = b
}
// driverLog writes a log message to stderr and the broadcaster.
func (d *KneronDriver) driverLog(level, format string, args ...interface{}) {
msg := fmt.Sprintf(format, args...)
fmt.Fprintf(os.Stderr, "%s\n", msg)
if d.logBroadcaster != nil {
d.logBroadcaster.Push(level, msg)
}
}
// NewKL720Driver is a backward-compatible alias for NewKneronDriver.
// Deprecated: Use NewKneronDriver instead.
func NewKL720Driver(info driver.DeviceInfo, scriptPath string) *KneronDriver {
return NewKneronDriver(info, scriptPath)
}
// KL720Driver is a backward-compatible type alias for KneronDriver.
// Deprecated: Use KneronDriver instead.
type KL720Driver = KneronDriver
// resolvePython finds the best Python interpreter using the package-level resolver.
func (d *KneronDriver) resolvePython() string {
return ResolvePython(d.scriptPath)
}
// startPython launches the Python bridge subprocess and waits for the
// "ready" signal on stdout.
func (d *KneronDriver) startPython() error {
pythonBin := d.resolvePython()
scriptDir := filepath.Dir(d.scriptPath)
cmd := exec.Command(pythonBin, d.scriptPath)
// On macOS with Apple Silicon, Kneron SDK requires x86_64 (Rosetta 2).
// The venv should already contain the correct architecture Python.
// Set DYLD_LIBRARY_PATH so libkplus.dylib can be found.
cmd.Env = append(os.Environ(),
"PYTHONUNBUFFERED=1",
)
// Add library path for native kp module if lib directory exists.
libDir := filepath.Join(scriptDir, "lib")
if _, err := os.Stat(libDir); err == nil {
if runtime.GOOS == "darwin" {
cmd.Env = append(cmd.Env, "DYLD_LIBRARY_PATH="+libDir)
} else {
cmd.Env = append(cmd.Env, "LD_LIBRARY_PATH="+libDir)
}
}
// On Windows, ensure libusb-1.0.dll can be found by adding the install
// directory to PATH (installer places the DLL there).
if runtime.GOOS == "windows" {
installDir := filepath.Dir(scriptDir)
cmd.Env = append(cmd.Env,
fmt.Sprintf("PATH=%s;%s;%s", installDir, scriptDir, os.Getenv("PATH")))
}
stdinPipe, err := cmd.StdinPipe()
if err != nil {
return fmt.Errorf("failed to create stdin pipe: %w", err)
}
stdoutPipe, err := cmd.StdoutPipe()
if err != nil {
stdinPipe.Close()
return fmt.Errorf("failed to create stdout pipe: %w", err)
}
// Capture stderr from the Python bridge: forward each line to both
// os.Stderr and the WebSocket broadcaster so it shows in the frontend.
stderrPipe, err := cmd.StderrPipe()
if err != nil {
stdinPipe.Close()
stdoutPipe.Close()
return fmt.Errorf("failed to create stderr pipe: %w", err)
}
if err := cmd.Start(); err != nil {
stdinPipe.Close()
return fmt.Errorf("failed to start python bridge (%s): %w", pythonBin, err)
}
// Forward bridge stderr line-by-line to os.Stderr + broadcaster.
go func() {
scanner := bufio.NewScanner(stderrPipe)
for scanner.Scan() {
line := scanner.Text()
fmt.Fprintln(os.Stderr, line)
if d.logBroadcaster != nil {
d.logBroadcaster.Push("DEBUG", line)
}
}
}()
d.pythonCmd = cmd
d.stdin = stdinPipe
d.stdout = bufio.NewScanner(stdoutPipe)
// Increase scanner buffer for large inference responses.
d.stdout.Buffer(make([]byte, 0, 64*1024), 1024*1024)
// Wait for the ready signal from the Python process.
if d.stdout.Scan() {
var resp map[string]interface{}
if err := json.Unmarshal([]byte(d.stdout.Text()), &resp); err == nil {
if status, ok := resp["status"].(string); ok && status == "ready" {
d.pythonReady = true
return nil
}
}
}
// If we didn't get a ready signal, clean up and report failure.
d.stopPython()
return fmt.Errorf("python bridge did not send ready signal")
}
// sendCommand sends a JSON command to the Python subprocess and returns
// the parsed JSON response.
func (d *KneronDriver) sendCommand(cmd map[string]interface{}) (map[string]interface{}, error) {
if !d.pythonReady {
return nil, fmt.Errorf("python bridge is not running")
}
data, err := json.Marshal(cmd)
if err != nil {
return nil, fmt.Errorf("failed to marshal command: %w", err)
}
// Write the JSON command followed by a newline.
if _, err := fmt.Fprintf(d.stdin, "%s\n", data); err != nil {
return nil, fmt.Errorf("failed to write to python bridge: %w", err)
}
// Read the response line.
if !d.stdout.Scan() {
if err := d.stdout.Err(); err != nil {
return nil, fmt.Errorf("failed to read from python bridge: %w", err)
}
return nil, fmt.Errorf("python bridge closed unexpectedly")
}
var resp map[string]interface{}
if err := json.Unmarshal([]byte(d.stdout.Text()), &resp); err != nil {
return nil, fmt.Errorf("failed to parse python response: %w", err)
}
// Check for error responses from the bridge.
if errMsg, ok := resp["error"].(string); ok {
return nil, fmt.Errorf("python bridge error: %s", errMsg)
}
return resp, nil
}
// stopPython kills the Python subprocess and cleans up resources.
func (d *KneronDriver) stopPython() {
d.pythonReady = false
if d.stdin != nil {
d.stdin.Close()
d.stdin = nil
}
if d.pythonCmd != nil && d.pythonCmd.Process != nil {
d.pythonCmd.Process.Kill()
d.pythonCmd.Wait()
d.pythonCmd = nil
}
d.stdout = nil
}
// Info returns the current device information.
func (d *KneronDriver) Info() driver.DeviceInfo {
d.mu.Lock()
defer d.mu.Unlock()
return d.info
}
// Connect starts the Python bridge subprocess and connects to the Kneron device.
// On the first connect after server start, the device is reset to clear any
// stale model from a previous session.
func (d *KneronDriver) Connect() error {
d.mu.Lock()
if d.connected {
d.mu.Unlock()
return nil
}
needsReset := d.needsReset
d.info.Status = driver.StatusConnecting
// Start the Python bridge process.
if err := d.startPython(); err != nil {
d.info.Status = driver.StatusError
d.mu.Unlock()
return fmt.Errorf("failed to start hardware bridge: %w", err)
}
// Send connect command to the bridge.
resp, err := d.sendCommand(map[string]interface{}{
"cmd": "connect",
"port": d.info.Port,
"index": 0,
"device_type": d.info.Type,
})
if err != nil {
d.stopPython()
d.info.Status = driver.StatusError
d.mu.Unlock()
return fmt.Errorf("failed to connect to device: %w", err)
}
d.connected = true
d.needsReset = false
d.info.Status = driver.StatusConnected
if fw, ok := resp["firmware"].(string); ok {
d.info.FirmwareVer = fw
}
// Bridge reports whether firmware was freshly loaded during this connect.
// Freshly loaded firmware = clean state → no reset needed.
// Firmware already present (残留 from previous session) → must reset to
// avoid Error 15 SEND_DATA_TOO_LARGE on first inference.
freshFirmware, _ := resp["fresh_firmware_loaded"].(bool)
d.mu.Unlock()
// First connect after server start: reset device to clear stale session.
//
// Why reset is needed:
// - KL720: flash-basedfirmware 和 model 保留在 flashreset 清 stale
// model 才有意義。
// - KL520: USB Boot / RAM-based。若 session 間 firmware 殘留(不是剛載
// 的 Comp/U直接 load_model + inference 100% 炸 Error 15。必須
// reset → Loader → reload firmware → Comp/U 得到乾淨 session。
//
// Why we skip reset when freshFirmware=true:
// - 這次 connect 內部剛做過完整 firmware load → Comp/U 是新鮮乾淨的。
// 再做 reset 會再砍掉 reload 一次,浪費 30-60s 沒意義。
// - Windows cold boot 情境最常見device 斷電後第一次 connect
// 省下 restartBridge 的 ~65s 代價。
skipReset := freshFirmware
if needsReset && !skipReset {
d.driverLog("INFO", "[kneron] first connect — resetting %s to clear stale session (firmware was already present)...", d.chipType)
if err := d.restartBridge(); err != nil {
d.driverLog("WARN", "[kneron] reset on connect failed (non-fatal): %v", err)
} else {
d.driverLog("INFO", "[kneron] device reset complete — clean state ready")
}
} else if needsReset && skipReset {
d.driverLog("INFO", "[kneron] %s: skipping reset — firmware just loaded, session already clean", d.chipType)
}
return nil
}
// Disconnect stops the Python bridge and disconnects from the device.
func (d *KneronDriver) Disconnect() error {
d.mu.Lock()
defer d.mu.Unlock()
if !d.connected {
return nil
}
// Try to send disconnect command if Python is running.
if d.pythonReady {
d.sendCommand(map[string]interface{}{"cmd": "disconnect"})
}
d.stopPython()
d.connected = false
d.inferring = false
d.info.Status = driver.StatusDisconnected
return nil
}
// IsConnected returns whether the driver is currently connected.
func (d *KneronDriver) IsConnected() bool {
d.mu.Lock()
defer d.mu.Unlock()
return d.connected
}
// restartBridge resets the Kneron device and restarts the Python bridge.
//
// The KL520 USB Boot mode only allows loading one model per firmware
// session. To load a different model we must:
// 1. Send a "reset" command via the current bridge — this calls
// kp.core.reset_device() which forces the device back to Loader
// (USB Boot) state, wiping firmware + model from RAM.
// 2. Kill the Python bridge process.
// 3. Wait for the device to re-enumerate on USB (~8 s).
// 4. Start a fresh Python bridge.
// 5. Send "connect" which reloads firmware from scratch.
//
// After this the device is in a clean state ready for load_model.
//
// Caller must NOT hold d.mu.
func (d *KneronDriver) restartBridge() error {
d.mu.Lock()
port := d.info.Port
d.modelLoaded = ""
// Step 1: Ask the running bridge to reset the device.
if d.pythonReady {
d.driverLog("INFO", "[kneron] sending reset command to device...")
d.sendCommand(map[string]interface{}{"cmd": "reset"})
// Ignore errors — the device may have already disconnected.
}
// Step 2: Kill the bridge process.
d.stopPython()
d.mu.Unlock()
// Step 3: Wait for USB device to re-enumerate after hardware reset.
// The reset causes the device to drop off USB and reappear as a
// Loader-mode device. This typically takes 5-8 seconds.
d.driverLog("INFO", "[kneron] bridge stopped, waiting for USB re-enumerate after reset...")
time.Sleep(8 * time.Second)
d.mu.Lock()
defer d.mu.Unlock()
// Step 4: Start a fresh Python bridge.
d.driverLog("INFO", "[kneron] starting new bridge process...")
if err := d.startPython(); err != nil {
return fmt.Errorf("failed to restart bridge: %w", err)
}
// Step 5: Reconnect — firmware will be loaded fresh.
d.driverLog("INFO", "[kneron] bridge started, reconnecting to device (port=%s)...", port)
_, err := d.sendCommand(map[string]interface{}{
"cmd": "connect",
"port": port,
"index": 0,
"device_type": d.info.Type,
})
if err != nil {
d.stopPython()
return fmt.Errorf("failed to reconnect after bridge restart: %w", err)
}
d.driverLog("INFO", "[kneron] device reconnected after reset + bridge restart")
return nil
}
// Flash loads a model onto the Kneron device. Progress is reported through
// the provided channel.
//
// Behavior differs by chip:
// - KL520 (USB Boot): only one model per session. Error 40 triggers
// a full device reset + bridge restart + firmware reload.
// - KL720 (flash-based): models can be freely reloaded. Error 40
// should not occur; if it does, a simple retry is attempted first.
func (d *KneronDriver) Flash(modelPath string, progressCh chan<- driver.FlashProgress) error {
d.mu.Lock()
d.info.Status = driver.StatusFlashing
pythonReady := d.pythonReady
currentModel := d.modelLoaded
chip := d.chipType
d.mu.Unlock()
if !pythonReady {
d.mu.Lock()
d.info.Status = driver.StatusConnected
d.mu.Unlock()
return fmt.Errorf("hardware bridge is not running — cannot flash model")
}
// Same model already loaded — skip, report success
if currentModel != "" && currentModel == modelPath {
d.driverLog("INFO", "[kneron] model already loaded (%s), skipping reload", modelPath)
progressCh <- driver.FlashProgress{
Percent: 50,
Stage: "transferring",
Message: "model already loaded on device",
}
d.mu.Lock()
d.info.Status = driver.StatusConnected
d.mu.Unlock()
progressCh <- driver.FlashProgress{Percent: 100, Stage: "done", Message: "Flash complete (model already loaded)"}
return nil
}
// Try loading the model
progressCh <- driver.FlashProgress{
Percent: 5,
Stage: "preparing",
Message: "preparing... loading model to device",
}
d.mu.Lock()
_, err := d.sendCommand(map[string]interface{}{
"cmd": "load_model",
"path": modelPath,
})
d.mu.Unlock()
// Handle retryable errors (error 40, broken pipe).
if err != nil {
errMsg := err.Error()
d.driverLog("WARN", "[kneron] load_model failed: %s", errMsg)
isRetryable := strings.Contains(errMsg, "Error code: 40") ||
strings.Contains(errMsg, "SECOND_MODEL") ||
strings.Contains(errMsg, "broken pipe") ||
strings.Contains(errMsg, "USB_TIMEOUT")
if isRetryable {
if chip == "KL720" {
// KL720: error 40 should not occur. Try a simple retry
// without full bridge restart first.
d.driverLog("WARN", "[kneron] KL720 unexpected retryable error, retrying without restart...")
progressCh <- driver.FlashProgress{
Percent: 5,
Stage: "preparing",
Message: "preparing... retrying model load",
}
d.mu.Lock()
_, err = d.sendCommand(map[string]interface{}{
"cmd": "load_model",
"path": modelPath,
})
d.mu.Unlock()
// If still failing, fall back to bridge restart as last resort.
if err != nil {
d.driverLog("WARN", "[kneron] KL720 retry failed: %v, falling back to bridge restart...", err)
if restartErr := d.restartBridge(); restartErr != nil {
d.mu.Lock()
d.info.Status = driver.StatusConnected
d.mu.Unlock()
return fmt.Errorf("failed to reset device: %w", restartErr)
}
d.mu.Lock()
d.info.Status = driver.StatusFlashing
_, err = d.sendCommand(map[string]interface{}{
"cmd": "load_model",
"path": modelPath,
})
d.mu.Unlock()
}
} else {
// KL520: error 40 means a model is already loaded in this
// USB Boot session. Must reset device + reload firmware.
d.driverLog("WARN", "[kneron] KL520 retryable error, restarting bridge...")
progressCh <- driver.FlashProgress{
Percent: 5,
Stage: "preparing",
Message: "preparing... resetting device for new model",
}
if restartErr := d.restartBridge(); restartErr != nil {
d.driverLog("ERROR", "[kneron] restartBridge failed: %v", restartErr)
d.mu.Lock()
d.info.Status = driver.StatusConnected
d.mu.Unlock()
return fmt.Errorf("failed to reset device: %w", restartErr)
}
d.driverLog("INFO", "[kneron] bridge restarted, retrying load_model...")
d.mu.Lock()
d.info.Status = driver.StatusFlashing
_, err = d.sendCommand(map[string]interface{}{
"cmd": "load_model",
"path": modelPath,
})
d.mu.Unlock()
}
}
}
if err != nil {
d.driverLog("ERROR", "[kneron] load_model ultimately failed: %v", err)
d.mu.Lock()
d.info.Status = driver.StatusConnected
d.mu.Unlock()
return fmt.Errorf("failed to load model: %w", err)
}
d.driverLog("INFO", "[kneron] load_model succeeded: %s", modelPath)
// Simulate remaining flash progress stages (the Kneron SDK does not
// provide granular progress, so we approximate it after the model
// has been loaded successfully).
type stage struct {
name string
duration time.Duration
startPct int
endPct int
}
stages := []stage{
{"transferring", 2 * time.Second, 10, 80},
{"verifying", 1 * time.Second, 80, 95},
{"finalizing", 500 * time.Millisecond, 95, 99},
}
// KL720 is faster (USB 3.0, no firmware reload needed)
if chip == "KL720" {
stages = []stage{
{"transferring", 1 * time.Second, 10, 80},
{"verifying", 500 * time.Millisecond, 80, 95},
{"finalizing", 200 * time.Millisecond, 95, 99},
}
}
for _, s := range stages {
steps := (s.endPct - s.startPct) / 5
if steps < 1 {
steps = 1
}
interval := s.duration / time.Duration(steps)
for i := 0; i <= steps; i++ {
pct := s.startPct + (s.endPct-s.startPct)*i/steps
progressCh <- driver.FlashProgress{
Percent: pct,
Stage: s.name,
Message: fmt.Sprintf("%s... %d%%", s.name, pct),
}
time.Sleep(interval)
}
}
d.mu.Lock()
d.modelLoaded = modelPath
d.info.FlashedModel = modelPath
d.info.Status = driver.StatusConnected
d.mu.Unlock()
progressCh <- driver.FlashProgress{Percent: 100, Stage: "done", Message: "Flash complete"}
return nil
}
// StartInference begins continuous inference mode.
func (d *KneronDriver) StartInference() error {
d.mu.Lock()
defer d.mu.Unlock()
if !d.connected {
return fmt.Errorf("device not connected")
}
d.inferring = true
d.info.Status = driver.StatusInferencing
return nil
}
// StopInference stops continuous inference mode.
func (d *KneronDriver) StopInference() error {
d.mu.Lock()
defer d.mu.Unlock()
d.inferring = false
d.info.Status = driver.StatusConnected
return nil
}
// ReadInference reads the latest inference result. This is equivalent to
// calling RunInference with nil image data.
func (d *KneronDriver) ReadInference() (*driver.InferenceResult, error) {
return d.RunInference(nil)
}
// RunInference runs inference on the provided image data and returns
// the result. If imageData is nil, the bridge will run inference on
// a default/empty input.
func (d *KneronDriver) RunInference(imageData []byte) (*driver.InferenceResult, error) {
d.mu.Lock()
pythonReady := d.pythonReady
d.mu.Unlock()
if !pythonReady {
return nil, fmt.Errorf("hardware bridge is not running — device may not be connected")
}
// Encode image data as base64 for transmission to Python.
imageB64 := ""
if imageData != nil {
imageB64 = base64.StdEncoding.EncodeToString(imageData)
}
d.mu.Lock()
resp, err := d.sendCommand(map[string]interface{}{
"cmd": "inference",
"image_base64": imageB64,
})
d.mu.Unlock()
if err != nil {
return nil, fmt.Errorf("inference failed: %w", err)
}
return parseInferenceResult(resp)
}
// parseInferenceResult converts a JSON response map into an InferenceResult.
func parseInferenceResult(resp map[string]interface{}) (*driver.InferenceResult, error) {
// Re-marshal to JSON and unmarshal into the struct for clean conversion.
data, err := json.Marshal(resp)
if err != nil {
return nil, fmt.Errorf("failed to marshal response: %w", err)
}
var result driver.InferenceResult
if err := json.Unmarshal(data, &result); err != nil {
return nil, fmt.Errorf("failed to parse inference result: %w", err)
}
return &result, nil
}
// GetModelInfo returns information about the currently loaded model.
func (d *KneronDriver) GetModelInfo() (*driver.ModelInfo, error) {
d.mu.Lock()
defer d.mu.Unlock()
if d.modelLoaded == "" {
return nil, fmt.Errorf("no model loaded")
}
return &driver.ModelInfo{
ID: d.modelLoaded,
Name: d.modelLoaded,
LoadedAt: time.Now(),
}, nil
}