From b71ff4cd3c72e879435f773ae15b23bf8b70841e Mon Sep 17 00:00:00 2001 From: jim800121chen Date: Tue, 21 Apr 2026 11:09:25 +0800 Subject: [PATCH] =?UTF-8?q?perf(local-tool):=20Windows=20KL520=20cold-boot?= =?UTF-8?q?=20connect=20106s=20=E2=86=92=20~40s=EF=BC=88=E8=B7=B3=E9=81=8E?= =?UTF-8?q?=E5=A4=9A=E9=A4=98=20reset=EF=BC=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 背景: Windows 實測 KL520 首次 connect 耗時 106 秒,原因是 reset 流程內部重複 firmware load: 1. 進來 Loader → load firmware (35s) → Comp/U 2. reset 退回 Loader → bridge 重啟 3. reconnect 進來又是 Loader → load firmware (30s) → Comp/U 4. Loader reconnect 第一次常 fail(15s timeout) 總共 ~65s 花在「砍掉剛載好的 firmware、再載一次」的白工上。 根因:先前修的 needsReset 邏輯不管 firmware 新舊一律 reset。但 Error 15 只發生在「Comp/U 是上次 session 殘留」的情境;「本次 connect 內部剛載的 Comp/U」session 是乾淨的,不需要 reset。 修法(條件性 reset): - server/scripts/kneron_bridge.py:connect handler 新增追蹤本次有無走 firmware load flow,return 多帶 `fresh_firmware_loaded` bool - server/internal/driver/kneron/kl720_driver.go:Connect 讀 flag,若為 true 就 skipReset(firmware 剛載的,session 已乾淨) 驗證(2026-04-21): - `/tmp/test_bridge.py` 拔插 USB 後跑 `connect (fw=Loader) → fresh_firmware_loaded=True → skip reset → load_model → inference` → 11 detections(person×8, tie×3, latency 332ms) - Mac UI Comp/U 殘留路徑:reset → 11 bbox ✓ - Mac UI Loader cold-boot 路徑(拔插後):skip reset → 11 bbox ✓ 預期效益: - Windows cold-boot(常見):106s → ~40s(省 65s) - Mac 跨 session(常見):~15-20s 不變 - 極少數(Windows device 未斷電但跨 server session):走完整 reset Co-Authored-By: Claude Opus 4.7 (1M context) --- local-tool/.autoflow/progress.md | 32 +++++++++++++-- .../internal/driver/kneron/kl720_driver.go | 39 +++++++++++-------- local-tool/server/scripts/kneron_bridge.py | 12 +++++- 3 files changed, 61 insertions(+), 22 deletions(-) diff --git a/local-tool/.autoflow/progress.md b/local-tool/.autoflow/progress.md index a8e7a65..6aabac9 100644 --- a/local-tool/.autoflow/progress.md +++ b/local-tool/.autoflow/progress.md @@ -64,10 +64,36 @@ Mac 版 app 
上傳單張圖推論,畫面上完全沒有 bbox 標註。 三種尺寸(516×640 直式 / 1920×1080 横式 / 512×512 正方)全通過。 -### 待使用者驗證 -- [ ] Mac UI 端實測:上傳 `~/Downloads/000000000459.jpg` 應見 11 個 bbox 精準框住 person + tie -- [ ] Windows 實測首次 connect 耗時 + 是否還踩 HTTP timeout(現已放寬到 120s) +### 已驗證(2026-04-21) +- [x] Mac UI Comp/U 殘留路徑:reset 後推論 11 個 bbox 正確 +- [x] Mac UI Loader cold-boot 路徑(拔插 USB):skip reset 後推論 11 個 bbox 正確 +- [x] Windows 實測首次 connect:106s 成功(< 120s timeout),推論正確 + +### 後續優化:Windows connect 106s → 預期 ~40s(方案 C) + +Windows 實測發現即使 timeout 120s 夠用,使用者要等 106s 體感太久。拆解 +瓶頸發現走了兩次 firmware load(第一次 connect 進來 Loader → load fw +→ Comp/U ~35s / reset → 回 Loader / reconnect → load fw → Comp/U ~30s), +reset 流程中第二次 firmware load 是白做工。 + +**條件性 reset(方案 C)**: +- `kneron_bridge.py connect` 回報 `fresh_firmware_loaded` flag + - `True`:本次 connect 內部剛做過 firmware load(原本是 Loader) + - `False`:進來就是 Comp/U(上次 session 殘留,需要 reset 清乾淨) +- `kl720_driver.go` 判 flag 決定要不要做 restartBridge reset + +**驗證兩條路徑都 OK(2026-04-21)**: +- Loader cold-boot → skip reset → 推論 11 bbox ✓ +- Comp/U 殘留 → 做 reset → 推論 11 bbox ✓ + +**預期效益**: +- Windows cold-boot(最常見):106s → **~40s**(省 65s) +- Mac 跨 session(最常見):~15-20s 不變 +- 極少數情境(Windows 但 device 未斷電):維持走完整 reset 流程 + +### 待驗證 - [ ] Linux 實測 +- [ ] Windows 實測方案 C 效益(預期 cold-boot 降到 ~40s) ### 前端 debug log 去留 `camera-overlay.tsx` 的 `console.log('[bbox-debug] ...')` 驗證完成後**可清可留**。保留成本低,對未來 debug 有幫助。 diff --git a/local-tool/server/internal/driver/kneron/kl720_driver.go b/local-tool/server/internal/driver/kneron/kl720_driver.go index df13df6..28217c4 100644 --- a/local-tool/server/internal/driver/kneron/kl720_driver.go +++ b/local-tool/server/internal/driver/kneron/kl720_driver.go @@ -286,32 +286,37 @@ func (d *KneronDriver) Connect() error { if fw, ok := resp["firmware"].(string); ok { d.info.FirmwareVer = fw } + // Bridge reports whether firmware was freshly loaded during this connect. + // Freshly loaded firmware = clean state → no reset needed. 
+ // Firmware already present (left over from a previous session) → must reset to + // avoid Error 15 SEND_DATA_TOO_LARGE on first inference. + freshFirmware, _ := resp["fresh_firmware_loaded"].(bool) d.mu.Unlock() - // First connect after server start: reset device to clear stale models. + // First connect after server start: reset device to clear stale session. // - // BOTH KL520 and KL720 需要 reset: + // Why reset is needed: + // - KL720: flash-based,firmware 和 model 保留在 flash,reset 清 stale + // model 才有意義。 + // - KL520: USB Boot / RAM-based。若 session 間 firmware 殘留(不是剛載 + // 的 Comp/U),直接 load_model + inference 100% 炸 Error 15。必須 + // reset → Loader → reload firmware → Comp/U 得到乾淨 session。 // - // - KL720 是 flash-based 裝置,firmware 和 model 會保留在 flash,reset - // 清 stale model 才有意義。 - // - // - KL520 雖然是 USB Boot 裝置(RAM-based firmware,斷電即清),理論上 - // 每次 connect 是 clean state。但實測發現若 session 間 firmware 殘留 - // (fw=KDP2 Comp/U 而非 Loader),直接走 load_model + inference 會 - // 100% 炸 ApiKPException Error 15 (SEND_DATA_TOO_LARGE)。只有走 - // reset → reboot 到 Loader → 重新載 firmware 到 Comp/U 的完整流程, - // 才能得到能正常 inference 的 session。 - // - // 成本:KL520 reset + firmware load + reconnect ~15-20s(macOS 實測)。 - // Windows 上可能更久;若 HTTP connect timeout 60s 不夠,需調高或改 - // 非同步 connect pattern。 - if needsReset { - d.driverLog("INFO", "[kneron] first connect after server start — resetting %s to clear stale session...", d.chipType) + // Why we skip reset when freshFirmware=true: + // - 這次 connect 內部剛做過完整 firmware load → Comp/U 是新鮮乾淨的。 + // 再做 reset 會再砍掉 reload 一次,浪費 30-60s 沒意義。 + // - Windows cold boot 情境最常見(device 斷電後第一次 connect)— + // 省下 restartBridge 的 ~65s 代價。 + skipReset := freshFirmware + if needsReset && !skipReset { + d.driverLog("INFO", "[kneron] first connect — resetting %s to clear stale session (firmware was already present)...", d.chipType) if err := d.restartBridge(); err != nil { d.driverLog("WARN", "[kneron] reset on connect failed (non-fatal): %v", err) } else { d.driverLog("INFO", 
"[kneron] device reset complete — clean state ready") } + } else if needsReset && skipReset { + d.driverLog("INFO", "[kneron] %s: skipping reset — firmware just loaded, session already clean", d.chipType) } return nil diff --git a/local-tool/server/scripts/kneron_bridge.py b/local-tool/server/scripts/kneron_bridge.py index 62ab4a8..f567461 100644 --- a/local-tool/server/scripts/kneron_bridge.py +++ b/local-tool/server/scripts/kneron_bridge.py @@ -864,7 +864,11 @@ def handle_connect(params): kp.core.set_timeout(device_group=_device_group, milliseconds=_timeout_ms) _log(f"set_timeout succeeded") - # Firmware handling — chip-dependent + # Firmware handling — chip-dependent. + # fresh_firmware_loaded is used by Go driver to decide whether to + # skip the post-connect reset (freshly loaded firmware is already + # in a clean state — reset would just waste 30-60s reloading it). + fresh_firmware_loaded = False if "Loader" in fw_str: # Device is in USB Boot (Loader) mode and needs firmware if _device_chip == "KL720": @@ -904,11 +908,14 @@ def handle_connect(params): device_group=_device_group, milliseconds=_timeout_ms ) fw_str = str(target_dev.firmware) + fresh_firmware_loaded = True _log(f"Reconnected after firmware load, firmware: {fw_str}") else: _log(f"WARNING: {_device_chip} firmware files not found, skipping firmware load") else: - # Not in Loader mode — firmware already present + # Not in Loader mode — firmware already present from a previous + # session. This is the state that triggers Error 15 on inference + # without reset, per observed bug. _log(f"{_device_chip}: firmware already present (normal). fw={fw_str}") return { @@ -916,6 +923,7 @@ def handle_connect(params): "firmware": fw_str, "kn_number": f"0x{target_dev.kn_number:08X}", "chip": _device_chip, + "fresh_firmware_loaded": fresh_firmware_loaded, } except Exception as e: