From 30d0ff5695b3f3c3a0621dfd00e9d73ecb3dda3b Mon Sep 17 00:00:00 2001 From: jim800121chen Date: Tue, 21 Apr 2026 01:12:10 +0800 Subject: [PATCH] =?UTF-8?q?fix(local-tool):=20=E6=8E=A8=E8=AB=96=20bbox=20?= =?UTF-8?q?=E6=A8=99=E8=A8=BB=E4=B8=8D=E9=A1=AF=E7=A4=BA=20=E2=80=94=20?= =?UTF-8?q?=E5=89=8D=E7=AB=AF=20canvas=20=E5=B0=BA=E5=AF=B8=20+=20KL520=20?= =?UTF-8?q?reset=20+=20=E5=BB=B6=E9=95=B7=20timeout?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 症狀:Mac 版上傳單張圖推論,畫面完全沒有 bbox 標註。實測追根因後發現 兩層獨立問題疊加(前端 + 後端),擇一修復都無法解決。 ## Layer 1: 前端 canvas 尺寸對不上 img 顯示尺寸 - camera-inference-view.tsx renderedSize 初始值硬寫 {w:640, h:480} - ResizeObserver 理應在 load 後 fire,但實測沒 fire 或時機不對 - 結果 overlay canvas 永遠用 640×480 畫,bbox 嚴重偏位或跑出 canvas 修法(camera-feed.tsx + camera-inference-view.tsx): - 加 onLoad handler,decode 完立刻用 getBoundingClientRect 回報 - ResizeObserver effect 進來先檢查 img.complete && naturalWidth > 0, 是就立刻 report(cover HMR / cached image) - effect 依賴加 streamUrl / batchImageUrl,換圖會重觀察 - renderedSize 初始值改 null,overlay 改為拿到真實尺寸才 render - setState callback 用 prev 比對,同尺寸不觸發 render - camera-overlay.tsx 加 [bbox-debug] console.log 保留(debug 成本低, 對未來排查有幫助) ## Layer 2: KL520 推論炸 ApiKPException Error 15 - kp.inference.generic_image_inference_send 回 SEND_DATA_TOO_LARGE - 試過 image 尺寸(516×640 / 640×794 / 640×640 host pad)、numpy vs bytes、明確傳 width/height — 全部炸 - Python bridge 直接測試(/tmp/test_bridge.py)做完整 `connect → reset → reconnect → load_model → inference` 序列 → 11 個 detection 正常回傳 - Go driver 走 `connect → load_model → inference` 跳過 reset 根因:commit ddf0eb8(2026-04-16)「KL520 首次 connect 跳過 reset」當時 為解 Windows 60s HTTP timeout 的優化。但副作用:KL520 若 session 間 firmware 殘留(fw=KDP2 Comp/U),直接 load_model + inference 100% 炸 Error 15。必須走完整 reset → 退回 Loader → 重新載 firmware → Comp/U 流程才能得到能 inference 的 session。 修法(kl720_driver.go): - 移除「KL520 跳過 reset」特例,讓 KL520 和 KL720 都走 needsReset → restartBridge - 註解記錄 trade-off:KL520 connect 時間 ~2s → ~15-20s(macOS), Windows 可能 60s+ ## 
HTTP timeout 配套調整 - device_handler.go ConnectDevice timeout 60s → 120s - Windows worst-case(~65s:Loader reconnect 16s + firmware load 31s + reboot 8s + reconnect 5s)留 buffer,避免 504 CONNECT_TIMEOUT ## Bridge 清理 - kneron_bridge.py 清掉中途試驗遺留的 `_host_preproc` 死碼 (還原成原版 _correct_bbox_for_letterbox) - 加了 debug log(Inference: sending / parse done / EXCEPTION with traceback)保留,未來排查 inference 路徑很有用 ## 驗證(function 層) /tmp/test_bridge.py 三種尺寸全通過: - 516×640 直式 → 11 detections (person×8, tie×3) latency 308ms - 1920×1080 横式 → 0 detections(合成圖,正常) - 512×512 正方 → 0 detections ## 待使用者驗證 - Mac UI 實測:上傳 ~/Downloads/000000000459.jpg 應見 11 個 bbox 精準框住 - Windows 實測 connect 耗時 + timeout 是否足夠 - Linux 實測 Co-Authored-By: Claude Opus 4.7 (1M context) --- local-tool/.autoflow/progress.md | 106 +++++++++++++++++- .../src/components/camera/camera-feed.tsx | 32 ++++-- .../camera/camera-inference-view.tsx | 6 +- .../src/components/camera/camera-overlay.tsx | 10 ++ .../internal/api/handlers/device_handler.go | 9 +- .../internal/driver/kneron/kl720_driver.go | 28 ++--- local-tool/server/scripts/kneron_bridge.py | 10 +- 7 files changed, 169 insertions(+), 32 deletions(-) diff --git a/local-tool/.autoflow/progress.md b/local-tool/.autoflow/progress.md index f3d7860..a8e7a65 100644 --- a/local-tool/.autoflow/progress.md +++ b/local-tool/.autoflow/progress.md @@ -3,7 +3,111 @@ ## 目的:全新專案(從 edge-ai-platform 衍生的 local 版本) ## 當前階段:🔴 **第一階段回溯** — L 級重大方向變更(Wails 內嵌 → Wails 控制台 + 瀏覽器 Web UI) ## 當前狀態:✅ 使用者決策全部收齊(R5 第五輪決策),待三方產出正式 PRD v2 / Design Spec v2 / TDD v2 -## 最後更新:2026-04-14 +## 最後更新:2026-04-21 + +## 2026-04-21 推論 bbox 標註不顯示 + KL520 Error 15(S 級 bug fix) + +### 症狀 +Mac 版 app 上傳單張圖推論,畫面上完全沒有 bbox 標註。 + +### 根因(兩層獨立問題,疊加讓「bbox 完全不見」) + +**Layer 1(前端 canvas 尺寸)**: +- `camera-inference-view.tsx` `renderedSize` 初始值硬寫 `{w:640, h:480}` +- ResizeObserver 理應在 `<img>` load 後 fire 更新成實際顯示尺寸(例如 516×640 直式圖 → CSS 640×794),但實測沒 fire 或 fire 時機不對 +- 結果 overlay canvas 永遠用 640×480 畫,和 img 實際 DOM box 對不上 → 就算有 detection,bbox 
位置會嚴重偏位甚至跑出 canvas + +**Layer 2(後端推論 Error 15)**: +- `kp.inference.generic_image_inference_send` 回 `ApiKPException Error 15 SEND_DATA_TOO_LARGE` +- 試過:image 尺寸(516×640 / 640×794 / 640×640 pad)、傳 numpy vs bytes、明確傳 width/height — **全部都炸** +- Python bridge 直接測試(`/tmp/test_bridge.py`)做完整 `connect → reset → reconnect → load_model → inference` → **11 個 detection 正常回傳** +- 對比 Go driver 實際路徑:`connect → load_model → inference` **跳過了 reset** + +### 兇手:commit `ddf0eb8`(2026-04-16) +`KL520 首次 connect 跳過不必要的 device reset` — 當時為解 Windows 60s HTTP timeout(Loader mode connect 不穩定 + firmware load 總耗 64s)而加的優化,讓 KL520 首次 connect 不再 restartBridge。 + +副作用:KL520 雖然是 USB Boot / RAM-based 裝置,理論上每次 connect 是 clean state,但實測若 session 間 firmware 殘留(`fw=KDP2 Comp/U`),**直接 load_model + inference 100% 炸 Error 15**。只有走完整 `reset → 退回 Loader → 重新載 firmware 到 Comp/U` 流程,才能拿到能正常 inference 的 session。 + +### 修法 + +**前端(`camera-feed.tsx` + `camera-inference-view.tsx`)**: +- `<img>` 加 `onLoad` handler,圖片 decode 完立刻用 `getBoundingClientRect` 回報尺寸(最可靠時機) +- ResizeObserver effect 進來先檢查 `img.complete && naturalWidth > 0`,是就立刻 report(cover HMR / cached image) +- effect 依賴加 `streamUrl / batchImageUrl`,換圖會重觀察 +- `renderedSize` 初始值改 `null`,overlay 改為 `isStreaming && renderedSize` 才 render(避免首次用預設值畫錯) +- setState callback 用 prev 比對,同尺寸不觸發 render + +**後端(`server/internal/driver/kneron/kl720_driver.go`)**: +- 移除 `ddf0eb8` 的「KL520 跳過 reset」特例,讓 KL520 和 KL720 都走 `needsReset=true → restartBridge()` +- 註解記錄 trade-off:KL520 connect 時間從 ~2s 變 ~15-20s(macOS),Windows 可能 60s+ +- 同步調整 `server/internal/api/handlers/device_handler.go` connect timeout:`60s → 120s`,為 Windows worst-case(~65s)留 buffer + +**Python bridge(`server/scripts/kneron_bridge.py`)**: +- 無實質改動(試過 host-side letterbox、numpy→bytes、明確傳 w/h 全部無效 → 還原回原版,確認問題在 Go driver 的 reset 流程) +- 只加了 debug log(`Inference: sending...` / `Inference: parse done, detections=N` / `Inference EXCEPTION with traceback`),追 bug 時用,commit 前會保留(低成本、高價值) + +### 驗證(function 層) + 
+`/tmp/test_bridge.py` 直接測試 bridge JSON-RPC: + +``` +[5/5] inference (real 516x640) keys: ['taskType', 'timestamp', 'latencyMs', 'detections', 'classifications'] + ✅ inference OK — detections=11 classifications=0 latency=308.3ms + - person 0.705 bbox=(x=0.427, y=0.526, w=0.089, h=0.070) + - person 0.701 bbox=(x=0.360, y=0.438, w=0.227, h=0.246) + - tie 0.639 bbox=(x=0.351, y=0.573, w=0.011, h=0.107) + ... + ✅ 1920x1080 OK — detections=0 + ✅ 512x512 OK — detections=0 +=== ALL TESTS PASSED === +``` + +三種尺寸(516×640 直式 / 1920×1080 横式 / 512×512 正方)全通過。 + +### 待使用者驗證 +- [ ] Mac UI 端實測:上傳 `~/Downloads/000000000459.jpg` 應見 11 個 bbox 精準框住 person + tie +- [ ] Windows 實測首次 connect 耗時 + 是否還踩 HTTP timeout(現已放寬到 120s) +- [ ] Linux 實測 + +### 前端 debug log 去留 +`camera-overlay.tsx` 的 `console.log('[bbox-debug] ...')` 驗證完成後**可清可留**。保留成本低,對未來 debug 有幫助。 + +## 2026-04-20 macOS 掃不到 Kneron 裝置(S 級 bug fix) + +症狀:Mac 版 app 啟動後,前端顯示沒有裝置(實際 KL520 透過 USB 連上)。 + +根因(兩層): +1. **主要**:`PythonModeAuto` 預設「先 system 後 bundled」,系統 python3 通常沒裝 KneronPLUS wheel → `import kp` 失敗 → bridge 降級 pyusb → pyusb 找不到 libusb → scan 空。 +2. 
**次要(潛在)**:macOS hardened runtime 會剝掉 `DYLD_LIBRARY_PATH`;若未來 bundle 架構變動 dyld 找不到 libkplus 的相依 libusb,會再踩坑。 + +修法: +- `visiona-local/app.go` `PythonModeAuto` 語意翻轉 → **先 bundled(已預裝 kp wheel),失敗才 fallback system**。理由:local-tool 整包內嵌 Python + wheels,系統 python 不會裝 kp,不該優先。 +- `server/scripts/kneron_bridge.py` 在 `import kp` 前新增 `_preload_kneron_dylibs_macos()` — 用 `ctypes.CDLL` 絕對路徑預載 wheel 內 `kp/lib/libusb-1.0.0.dylib` + `libkplus.dylib`,避開 DYLD 被 hardened runtime 砍的風險。Windows/Linux 分支不動。 +- 同步 bridge 到 payload/{darwin,linux,windows}/scripts/ + build bundle。 + +驗證: +- `go build` 兩個 module 都通過 +- bridge script 直跑:`{"cmd":"scan"}` → 回傳 KL520 裝置 `kn_number 0xB906162C` +- 待 rebuild wails app 後實測(需要 `make wails-macos`) + +## 2026-04-20 macOS DMG 美化(S 級) + +需求:Mac 端也要有 installer(類比 Windows .exe)。走方案 C(create-dmg 美化 DMG + 背景圖 + Applications 捷徑)。 + +實作: +- 新增 `installer/macos/{make-dmg-background.py, background.png, background@2x.png, README.md}` + - 動態生成 640×400 深色背景(對齊 Wails 控制台 splash 配色 `#111827→#0B0F19` + `#38BDF8` accent) + - 含 1x + 2x Retina 版本 +- Makefile `dmg` 拆成三個 target: + - `dmg`:auto-detect,有 create-dmg 走 fancy,沒有 fallback plain(CI 無痛) + - `dmg-fancy`:強制美化版(需 `brew install create-dmg`) + - `dmg-plain`:原本的 hdiutil UDZO(保留為 fallback) +- Windows / Linux 流程零改動 + +驗證: +- `brew install create-dmg` 成功 +- `make dmg-fancy` 產出 157MB DMG,mount 後內容:app + Applications 捷徑 + .background/background.png + .DS_Store(視窗樣式) +- `hdiutil verify` 通過 ## 🔴 2026-04-14 使用者提出 L 級重大方向變更 diff --git a/local-tool/frontend/src/components/camera/camera-feed.tsx b/local-tool/frontend/src/components/camera/camera-feed.tsx index f4c8b9b..2242783 100644 --- a/local-tool/frontend/src/components/camera/camera-feed.tsx +++ b/local-tool/frontend/src/components/camera/camera-feed.tsx @@ -23,17 +23,30 @@ export function CameraFeed({ streamUrl, width = 640, height = 480, sourceType, b const img = imgRef.current; if (!img || !onDimensionsChange) return; - const observer = new ResizeObserver((entries) => { - 
for (const entry of entries) { - const { width: w, height: h } = entry.contentRect; - if (w > 0 && h > 0) { - onDimensionsChange(Math.round(w), Math.round(h)); - } - } - }); + const report = () => { + const rect = img.getBoundingClientRect(); + const w = Math.round(rect.width); + const h = Math.round(rect.height); + if (w > 0 && h > 0) onDimensionsChange(w, h); + }; + + // Initial read — covers the case where the image is already cached/decoded + // before the observer attaches (common for fast image swaps / HMR). + if (img.complete && img.naturalWidth > 0) report(); + + const observer = new ResizeObserver(report); observer.observe(img); return () => observer.disconnect(); - }, [onDimensionsChange]); + }, [onDimensionsChange, streamUrl, batchImageUrl]); + + const handleLoad = () => { + const img = imgRef.current; + if (!img || !onDimensionsChange) return; + const rect = img.getBoundingClientRect(); + const w = Math.round(rect.width); + const h = Math.round(rect.height); + if (w > 0 && h > 0) onDimensionsChange(w, h); + }; if (!streamUrl) { return ( @@ -64,6 +77,7 @@ export function CameraFeed({ streamUrl, width = 640, height = 480, sourceType, b ref={imgRef} src={displayUrl} alt={altText} + onLoad={handleLoad} style={{ width, height: 'auto' }} className="block" /> diff --git a/local-tool/frontend/src/components/camera/camera-inference-view.tsx b/local-tool/frontend/src/components/camera/camera-inference-view.tsx index ff46a21..8011ac0 100644 --- a/local-tool/frontend/src/components/camera/camera-inference-view.tsx +++ b/local-tool/frontend/src/components/camera/camera-inference-view.tsx @@ -18,11 +18,11 @@ export function CameraInferenceView({ deviceId }: CameraInferenceViewProps) { const { result, batchResults, confidenceThreshold } = useInferenceStore(); const displayWidth = 640; - const [renderedSize, setRenderedSize] = useState({ w: 640, h: 480 }); + const [renderedSize, setRenderedSize] = useState<{ w: number; h: number } | null>(null); const isBatchMode = 
sourceType === 'batch_image'; const handleDimensionsChange = useCallback((w: number, h: number) => { - setRenderedSize({ w, h }); + setRenderedSize((prev) => (prev && prev.w === w && prev.h === h ? prev : { w, h })); }, []); // In batch mode, show the selected image's detections @@ -47,7 +47,7 @@ export function CameraInferenceView({ deviceId }: CameraInferenceViewProps) { batchImageUrl={batchImageUrl} onDimensionsChange={handleDimensionsChange} overlay={ - isStreaming ? ( + isStreaming && renderedSize ? ( d.confidence >= confidenceThreshold); + if (typeof window !== 'undefined') { + // TEMP debug: 驗證 bbox coordinate space 對齊問題 + // eslint-disable-next-line no-console + console.log('[bbox-debug] canvas=%dx%d total=%d filtered=%d threshold=%s', width, height, detections.length, filtered.length, confidenceThreshold, filtered.map((d) => ({ + label: d.label, + bbox: d.bbox, + conf: d.confidence, + }))); + } + filtered.forEach((det, i) => { const color = COLORS[i % COLORS.length]; // Convert normalized coordinates (0-1) to pixel values diff --git a/local-tool/server/internal/api/handlers/device_handler.go b/local-tool/server/internal/api/handlers/device_handler.go index 424006a..0b4668b 100644 --- a/local-tool/server/internal/api/handlers/device_handler.go +++ b/local-tool/server/internal/api/handlers/device_handler.go @@ -82,9 +82,12 @@ func (h *DeviceHandler) GetDevice(c *gin.Context) { func (h *DeviceHandler) ConnectDevice(c *gin.Context) { id := c.Param("id") - // KL520 USB Boot flow can take ~40s: retry connect (3x2s) + firmware - // load + 5s reboot wait + reconnect retry (3x3s). Use 60s timeout. - ctx, cancel := context.WithTimeout(c.Request.Context(), 60*time.Second) + // KL520 USB Boot flow now includes mandatory reset + firmware reload on + // first connect (required for inference to work — see kl720_driver.go + // needsReset block). 
Worst-case path on Windows: Loader-mode reconnect + // retry (16s) + firmware load (~31s) + reboot wait + second reconnect + // (~13s) = ~60-65s. Use 120s to leave headroom and avoid spurious 504s. + ctx, cancel := context.WithTimeout(c.Request.Context(), 120*time.Second) defer cancel() errCh := make(chan error, 1) diff --git a/local-tool/server/internal/driver/kneron/kl720_driver.go b/local-tool/server/internal/driver/kneron/kl720_driver.go index 5f2ab30..df13df6 100644 --- a/local-tool/server/internal/driver/kneron/kl720_driver.go +++ b/local-tool/server/internal/driver/kneron/kl720_driver.go @@ -290,26 +290,28 @@ func (d *KneronDriver) Connect() error { // First connect after server start: reset device to clear stale models. // - // KL520 跳過 reset:KL520 是 USB Boot 裝置,reset 會退回 Loader 模式 - // (firmware 從 RAM 清掉),然後需要重新載 firmware(~30 秒)+ USB - // 重新枚舉(~8 秒)。這讓原本 2 秒的 connect 變成 60+ 秒,而且 - // Loader 模式的 connect_devices_without_check 不穩定(常需重試)。 + // BOTH KL520 and KL720 需要 reset: // - // KL520 每次 connect 本來就是 clean state(RAM-based firmware,斷電 - // 即清),不需要主動 reset 清 stale model。如果真的有殘留 model, - // 下次 load_model 前的 restartBridge 會處理。 + // - KL720 是 flash-based 裝置,firmware 和 model 會保留在 flash,reset + // 清 stale model 才有意義。 // - // KL720 是 flash-based 裝置,firmware 和 model 會保留在 flash,reset - // 清 stale model 才有意義。 - if needsReset && d.chipType == "KL720" { - d.driverLog("INFO", "[kneron] first connect after server start — resetting KL720 to clear stale model...") + // - KL520 雖然是 USB Boot 裝置(RAM-based firmware,斷電即清),理論上 + // 每次 connect 是 clean state。但實測發現若 session 間 firmware 殘留 + // (fw=KDP2 Comp/U 而非 Loader),直接走 load_model + inference 會 + // 100% 炸 ApiKPException Error 15 (SEND_DATA_TOO_LARGE)。只有走 + // reset → reboot 到 Loader → 重新載 firmware 到 Comp/U 的完整流程, + // 才能得到能正常 inference 的 session。 + // + // 成本:KL520 reset + firmware load + reconnect ~15-20s(macOS 實測)。 + // Windows 上可能更久;HTTP connect timeout 已由 60s 調高到 120s(見 device_handler.go), + // 若仍不夠,需再調高或改非同步 connect pattern。 + if needsReset { + 
d.driverLog("INFO", "[kneron] first connect after server start — resetting %s to clear stale session...", d.chipType) if err := d.restartBridge(); err != nil { d.driverLog("WARN", "[kneron] reset on connect failed (non-fatal): %v", err) } else { d.driverLog("INFO", "[kneron] device reset complete — clean state ready") } - } else if needsReset { - d.driverLog("INFO", "[kneron] %s: skipping reset on first connect (USB Boot device, clean state by default)", d.chipType) } return nil diff --git a/local-tool/server/scripts/kneron_bridge.py b/local-tool/server/scripts/kneron_bridge.py index bbcc85c..62ab4a8 100644 --- a/local-tool/server/scripts/kneron_bridge.py +++ b/local-tool/server/scripts/kneron_bridge.py @@ -1053,7 +1053,6 @@ def handle_inference(params): new_h = int(h * scale) else: new_w, new_h = w, h - # Ensure even dimensions (NPU requirement) new_w = (new_w + 1) & ~1 new_h = (new_h + 1) & ~1 img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_LINEAR) @@ -1061,12 +1060,11 @@ def handle_inference(params): # Convert BGR to BGR565 img_bgr565 = cv2.cvtColor(src=img, code=cv2.COLOR_BGR2BGR565) else: - # Fallback: try to use raw bytes (assume RGB565 format) img_bgr565 = np.frombuffer(img_bytes, dtype=np.uint8) else: return {"error": "no image data provided"} - # Create inference config + # Create inference config (original: pass numpy ndarray, SDK reads shape) inf_config = kp.GenericImageInferenceDescriptor( model_id=_model_id, inference_number=0, @@ -1079,8 +1077,10 @@ def handle_inference(params): ) # Send and receive + _log(f"Inference: sending to NPU (model_type={_model_type}, input_size={_model_input_size})") kp.inference.generic_image_inference_send(_device_group, inf_config) result = kp.inference.generic_image_inference_receive(_device_group) + _log(f"Inference: receive complete, parsing...") elapsed_ms = (time.time() - t0) * 1000 @@ -1110,6 +1110,8 @@ def handle_inference(params): input_size=_model_input_size, ) + _log(f"Inference: parse done, 
detections={len(detections)}, classifications={len(classifications)}, elapsed={elapsed_ms:.1f}ms") + return { "taskType": task_type, "timestamp": int(time.time() * 1000), @@ -1119,6 +1121,8 @@ def handle_inference(params): } except Exception as e: + import traceback + _log(f"Inference EXCEPTION: {type(e).__name__}: {e}\n{traceback.format_exc()}") return {"error": str(e)}