diff --git a/local_service_win/LocalAPI/main.py b/local_service_win/LocalAPI/main.py index a807d11..076410f 100644 --- a/local_service_win/LocalAPI/main.py +++ b/local_service_win/LocalAPI/main.py @@ -3,19 +3,26 @@ from __future__ import annotations import base64 import json import os +import tempfile import subprocess import sys import threading +import time from dataclasses import dataclass from pathlib import Path from typing import Any, Dict, List, Optional -from fastapi import FastAPI, HTTPException, Request -from fastapi.responses import JSONResponse +from fastapi import FastAPI, File, Form, HTTPException, Request, UploadFile +from fastapi.responses import FileResponse, JSONResponse, StreamingResponse from pydantic import BaseModel, Field import kp +try: + import cv2 # type: ignore +except Exception: + cv2 = None + SERVICE_VERSION = "0.1.0" PROJECT_ROOT = Path(__file__).resolve().parent.parent @@ -24,6 +31,7 @@ DFUT_BIN = DFUT_ROOT / "bin" DFUT_EXE = DFUT_BIN / "KneronDFUT.exe" KP121_DIST = PROJECT_ROOT / "third_party" / "kneron_plus_1_2_1" / "dist" KP121_RUNNER = Path(__file__).resolve().parent / "legacy_plus121_runner.py" +VIDEO_VIEWER_HTML = PROJECT_ROOT / "TestRes" / "Images" / "VideoInferenceWeb.html" @dataclass @@ -501,11 +509,181 @@ def _query_windows_driver_status() -> List[Dict[str, Any]]: return results +def _open_camera_capture(camera_id: int) -> Any: + if cv2 is None: + raise HTTPException( + status_code=500, + detail=_err("OPENCV_NOT_AVAILABLE", "opencv-python is not installed"), + ) + cap = cv2.VideoCapture(camera_id, cv2.CAP_DSHOW) + if not cap.isOpened(): + cap.release() + cap = cv2.VideoCapture(camera_id) + if not cap.isOpened(): + cap.release() + raise HTTPException( + status_code=404, + detail=_err("CAMERA_NOT_FOUND", f"Cannot open camera id={camera_id}"), + ) + return cap + + +def _mjpeg_stream_generator(cap: Any, jpeg_quality: int, frame_interval_sec: float): + try: + while True: + ok, frame = cap.read() + if not ok: + time.sleep(0.03) + 
continue + + ok, encoded = cv2.imencode(".jpg", frame, [int(cv2.IMWRITE_JPEG_QUALITY), jpeg_quality]) + if not ok: + continue + + jpg = encoded.tobytes() + header = ( + b"--frame\r\n" + b"Content-Type: image/jpeg\r\n" + b"Content-Length: " + str(len(jpg)).encode("ascii") + b"\r\n\r\n" + ) + yield header + jpg + b"\r\n" + + if frame_interval_sec > 0: + time.sleep(frame_interval_sec) + finally: + cap.release() + + +def _frame_to_input_bytes(frame_bgr: Any, image_format: str) -> bytes: + fmt = image_format.upper() + if fmt == "RGB565": + converted = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2BGR565) + return converted.tobytes() + if fmt == "RGBA8888": + converted = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGBA) + return converted.tobytes() + if fmt == "RAW8": + converted = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY) + return converted.tobytes() + raise HTTPException( + status_code=400, + detail=_err( + "UNSUPPORTED_STREAM_IMAGE_FORMAT", + "For /inference/run_video, supported image_format: RGB565, RGBA8888, RAW8", + ), + ) + + +def _run_inference_from_image_bytes( + image_bytes: bytes, + width: int, + height: int, + model_id: int, + image_format_text: str, + channels_ordering_text: str, + output_dtype_text: str, +) -> List[Dict[str, Any]]: + device_group = _require_device() + image_format = _image_format_from_str(image_format_text) + channels_ordering = _channels_ordering_from_str(channels_ordering_text) + if output_dtype_text.lower() != "float32": + raise HTTPException( + status_code=400, + detail=_err("INVALID_OUTPUT_DTYPE", "Only float32 output is supported in PoC"), + ) + + try: + if STATE.port_id is not None: + kp.core.get_model_info(device_group, STATE.port_id) + except kp.ApiKPException as exc: + if exc.api_return_code == kp.ApiReturnCode.KP_ERROR_MODEL_NOT_LOADED_35: + raise HTTPException( + status_code=500, + detail=_err( + "KP_ERROR_MODEL_NOT_LOADED_35", + str(kp.ApiReturnCode.KP_ERROR_MODEL_NOT_LOADED_35), + ), + ) + raise HTTPException( + status_code=500, + 
detail=_err(str(exc.api_return_code), str(exc)), + ) + + expected_size = _expected_image_size_bytes(image_format_text, width, height) + if expected_size is not None and len(image_bytes) != expected_size: + raise HTTPException( + status_code=400, + detail=_err( + "INVALID_IMAGE_SIZE", + ( + f"image bytes size mismatch: expected={expected_size}, actual={len(image_bytes)}. " + "Send raw pixel bytes for selected image_format (not BMP/JPEG/PNG file bytes)." + ), + ), + ) + + input_image = kp.GenericInputNodeImage( + image=image_bytes, + width=width, + height=height, + image_format=image_format, + ) + + input_desc = kp.GenericImageInferenceDescriptor( + model_id=model_id, + input_node_image_list=[input_image], + ) + + try: + kp.inference.generic_image_inference_send(device_group, input_desc) + result = kp.inference.generic_image_inference_receive(device_group) + except kp.ApiKPException as exc: + raise HTTPException( + status_code=500, + detail=_err(str(exc.api_return_code), str(exc)), + ) + + outputs = [] + for node_idx in range(result.header.num_output_node): + try: + node_output = kp.inference.generic_inference_retrieve_float_node( + node_idx, result, channels_ordering + ) + except kp.ApiKPException as exc: + raise HTTPException( + status_code=500, + detail=_err(str(exc.api_return_code), str(exc)), + ) + + data_bytes = node_output.ndarray.astype("float32").tobytes() + outputs.append( + { + "node_idx": node_idx, + "name": node_output.name, + "dtype": "float32", + "shape": node_output.shape, + "data_base64": base64.b64encode(data_bytes).decode("ascii"), + "channels_ordering": channels_ordering.name, + } + ) + return outputs + + @app.get("/health") def health() -> Dict[str, Any]: return _ok({"status": "up"}) +@app.get("/tools/video-inference") +def tools_video_inference() -> FileResponse: + if not VIDEO_VIEWER_HTML.is_file(): + raise HTTPException( + status_code=404, + detail=_err("TOOL_PAGE_NOT_FOUND", f"Tool page not found: {VIDEO_VIEWER_HTML}"), + ) + return 
FileResponse(str(VIDEO_VIEWER_HTML), media_type="text/html; charset=utf-8") + + @app.get("/version") def version() -> Dict[str, Any]: return _ok( @@ -516,6 +694,90 @@ def version() -> Dict[str, Any]: ) +@app.get("/camera/list") +def camera_list(max_probe: int = 5) -> Dict[str, Any]: + if max_probe < 1 or max_probe > 20: + raise HTTPException( + status_code=400, + detail=_err("INVALID_MAX_PROBE", "max_probe must be between 1 and 20"), + ) + if cv2 is None: + raise HTTPException( + status_code=500, + detail=_err("OPENCV_NOT_AVAILABLE", "opencv-python is not installed"), + ) + + cameras: List[Dict[str, Any]] = [] + for camera_id in range(max_probe): + cap = cv2.VideoCapture(camera_id, cv2.CAP_DSHOW) + opened = cap.isOpened() + if not opened: + cap.release() + cap = cv2.VideoCapture(camera_id) + opened = cap.isOpened() + if opened: + cameras.append( + { + "camera_id": camera_id, + "width": int(cap.get(cv2.CAP_PROP_FRAME_WIDTH) or 0), + "height": int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT) or 0), + "fps": float(cap.get(cv2.CAP_PROP_FPS) or 0.0), + } + ) + cap.release() + return _ok({"cameras": cameras}) + + +@app.get("/camera/stream") +def camera_stream( + camera_id: int = 0, + width: Optional[int] = None, + height: Optional[int] = None, + fps: Optional[float] = None, + jpeg_quality: int = 80, +) -> StreamingResponse: + if camera_id < 0: + raise HTTPException( + status_code=400, + detail=_err("INVALID_CAMERA_ID", "camera_id must be >= 0"), + ) + if width is not None and width <= 0: + raise HTTPException(status_code=400, detail=_err("INVALID_WIDTH", "width must be > 0")) + if height is not None and height <= 0: + raise HTTPException(status_code=400, detail=_err("INVALID_HEIGHT", "height must be > 0")) + if fps is not None and (fps <= 0 or fps > 60): + raise HTTPException(status_code=400, detail=_err("INVALID_FPS", "fps must be in range (0, 60]")) + if jpeg_quality < 1 or jpeg_quality > 100: + raise HTTPException( + status_code=400, + detail=_err("INVALID_JPEG_QUALITY", 
"jpeg_quality must be in range [1, 100]"), + ) + + cap = _open_camera_capture(camera_id) + if width is not None: + cap.set(cv2.CAP_PROP_FRAME_WIDTH, float(width)) + if height is not None: + cap.set(cv2.CAP_PROP_FRAME_HEIGHT, float(height)) + if fps is not None: + cap.set(cv2.CAP_PROP_FPS, float(fps)) + + frame_interval_sec = (1.0 / float(fps)) if fps else 0.0 + stream = _mjpeg_stream_generator(cap, jpeg_quality=jpeg_quality, frame_interval_sec=frame_interval_sec) + + headers = { + "Cache-Control": "no-cache, no-store, must-revalidate", + "Pragma": "no-cache", + "Expires": "0", + "Connection": "keep-alive", + "X-Accel-Buffering": "no", + } + return StreamingResponse( + stream, + media_type="multipart/x-mixed-replace; boundary=frame", + headers=headers, + ) + + @app.get("/devices") def devices() -> Dict[str, Any]: device_list = kp.core.scan_devices() @@ -888,32 +1150,6 @@ def models_reset() -> Dict[str, Any]: @app.post("/inference/run") def inference_run(req: InferenceRunRequest) -> Dict[str, Any]: - device_group = _require_device() - image_format = _image_format_from_str(req.image_format) - channels_ordering = _channels_ordering_from_str(req.channels_ordering) - if req.output_dtype.lower() != "float32": - raise HTTPException( - status_code=400, - detail=_err("INVALID_OUTPUT_DTYPE", "Only float32 output is supported in PoC"), - ) - - try: - if STATE.port_id is not None: - kp.core.get_model_info(device_group, STATE.port_id) - except kp.ApiKPException as exc: - if exc.api_return_code == kp.ApiReturnCode.KP_ERROR_MODEL_NOT_LOADED_35: - raise HTTPException( - status_code=500, - detail=_err( - "KP_ERROR_MODEL_NOT_LOADED_35", - str(kp.ApiReturnCode.KP_ERROR_MODEL_NOT_LOADED_35), - ), - ) - raise HTTPException( - status_code=500, - detail=_err(str(exc.api_return_code), str(exc)), - ) - b64_text = req.image_base64.strip() if b64_text.startswith("data:") and "," in b64_text: b64_text = b64_text.split(",", 1)[1] @@ -925,68 +1161,121 @@ def inference_run(req: 
InferenceRunRequest) -> Dict[str, Any]: status_code=400, detail=_err("INVALID_BASE64", "image_base64 is not valid base64 data"), ) - - expected_size = _expected_image_size_bytes(req.image_format, req.width, req.height) - if expected_size is not None and len(image_bytes) != expected_size: - raise HTTPException( - status_code=400, - detail=_err( - "INVALID_IMAGE_SIZE", - ( - f"image bytes size mismatch: expected={expected_size}, actual={len(image_bytes)}. " - "Send raw pixel bytes for selected image_format (not BMP/JPEG/PNG file bytes)." - ), - ), - ) - - input_image = kp.GenericInputNodeImage( - image=image_bytes, + outputs = _run_inference_from_image_bytes( + image_bytes=image_bytes, width=req.width, height=req.height, - image_format=image_format, - ) - - input_desc = kp.GenericImageInferenceDescriptor( model_id=req.model_id, - input_node_image_list=[input_image], + image_format_text=req.image_format, + channels_ordering_text=req.channels_ordering, + output_dtype_text=req.output_dtype, ) - try: - kp.inference.generic_image_inference_send(device_group, input_desc) - result = kp.inference.generic_image_inference_receive(device_group) - except kp.ApiKPException as exc: - raise HTTPException( - status_code=500, - detail=_err(str(exc.api_return_code), str(exc)), - ) - - outputs = [] - for node_idx in range(result.header.num_output_node): - try: - node_output = kp.inference.generic_inference_retrieve_float_node( - node_idx, result, channels_ordering - ) - except kp.ApiKPException as exc: - raise HTTPException( - status_code=500, - detail=_err(str(exc.api_return_code), str(exc)), - ) - - data_bytes = node_output.ndarray.astype("float32").tobytes() - outputs.append( - { - "node_idx": node_idx, - "name": node_output.name, - "dtype": "float32", - "shape": node_output.shape, - "data_base64": base64.b64encode(data_bytes).decode("ascii"), - "channels_ordering": channels_ordering.name, - } - ) - return _ok({"outputs": outputs}) +@app.post("/inference/run_video") +async def 
inference_run_video( + file: UploadFile = File(...), + model_id: int = Form(...), + image_format: str = Form(...), + channels_ordering: str = Form("DEFAULT"), + output_dtype: str = Form("float32"), + sample_every_n: int = Form(1), + max_frames: Optional[int] = Form(default=None), +) -> StreamingResponse: + if cv2 is None: + raise HTTPException( + status_code=500, + detail=_err("OPENCV_NOT_AVAILABLE", "opencv-python is not installed"), + ) + if sample_every_n <= 0: + raise HTTPException( + status_code=400, + detail=_err("INVALID_SAMPLE_EVERY_N", "sample_every_n must be >= 1"), + ) + if max_frames is not None and max_frames <= 0: + raise HTTPException( + status_code=400, + detail=_err("INVALID_MAX_FRAMES", "max_frames must be >= 1 when provided"), + ) + + suffix = Path(file.filename or "upload.mp4").suffix or ".mp4" + tmp_path = Path(tempfile.gettempdir()) / f"inference_upload_{int(time.time() * 1000)}{suffix}" + with tmp_path.open("wb") as f: + while True: + chunk = await file.read(1024 * 1024) + if not chunk: + break + f.write(chunk) + await file.close() + + def _iter_results(): + cap = cv2.VideoCapture(str(tmp_path)) + if not cap.isOpened(): + cap.release() + if tmp_path.exists(): + tmp_path.unlink() + error_line = json.dumps( + _err("VIDEO_OPEN_FAILED", f"Cannot open uploaded video: {tmp_path.name}"), + ensure_ascii=False, + ) + yield (error_line + "\n").encode("utf-8") + return + + sent_count = 0 + frame_index = -1 + try: + while True: + ok, frame = cap.read() + if not ok: + break + frame_index += 1 + if frame_index % sample_every_n != 0: + continue + + height, width = int(frame.shape[0]), int(frame.shape[1]) + image_bytes = _frame_to_input_bytes(frame, image_format) + outputs = _run_inference_from_image_bytes( + image_bytes=image_bytes, + width=width, + height=height, + model_id=model_id, + image_format_text=image_format, + channels_ordering_text=channels_ordering, + output_dtype_text=output_dtype, + ) + payload = _ok( + { + "frame_index": frame_index, + 
"width": width, + "height": height, + "outputs": outputs, + } + ) + yield (json.dumps(payload, ensure_ascii=False) + "\n").encode("utf-8") + + sent_count += 1 + if max_frames is not None and sent_count >= max_frames: + break + finally: + cap.release() + if tmp_path.exists(): + tmp_path.unlink() + + headers = { + "Cache-Control": "no-cache, no-store, must-revalidate", + "Pragma": "no-cache", + "Expires": "0", + "Connection": "keep-alive", + } + return StreamingResponse( + _iter_results(), + media_type="application/x-ndjson", + headers=headers, + ) + + if __name__ == "__main__": import uvicorn diff --git a/local_service_win/LocalAPI/postprocess_core.py b/local_service_win/LocalAPI/postprocess_core.py new file mode 100644 index 0000000..8021e75 --- /dev/null +++ b/local_service_win/LocalAPI/postprocess_core.py @@ -0,0 +1,293 @@ +from __future__ import annotations + +import base64 +import math +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Sequence, Tuple + +import numpy as np + + +YOLO_DEFAULT_ANCHORS: List[List[Tuple[float, float]]] = [ + [(10.0, 14.0), (23.0, 27.0), (37.0, 58.0)], + [(81.0, 82.0), (135.0, 169.0), (344.0, 319.0)], +] + + +@dataclass +class Box: + cls: int + score: float + x1: float + y1: float + x2: float + y2: float + + +def _sigmoid(v: np.ndarray | float) -> np.ndarray | float: + return 1.0 / (1.0 + np.exp(-v)) + + +def decode_outputs(raw_outputs: Sequence[Dict[str, Any]]) -> List[Dict[str, Any]]: + decoded: List[Dict[str, Any]] = [] + for idx, o in enumerate(raw_outputs): + shape = list(o.get("shape") or []) + data_b64 = str(o.get("data_base64") or "") + raw = base64.b64decode(data_b64) + arr = np.frombuffer(raw, dtype=" List[Dict[str, Any]]: + picked: List[Dict[str, Any]] = [] + for o in all_nodes: + shape = o["shape"] + if len(shape) != 4 or shape[0] != 1: + continue + ch = int(shape[1]) + if ch % (5 + num_classes) != 0: + continue + picked.append(o) + picked.sort(key=lambda n: int(n["shape"][2]), reverse=True) 
+ return picked + + +def decode_yolo_common( + all_nodes: Sequence[Dict[str, Any]], + mode: str, + num_classes: int, + input_w: int, + input_h: int, + conf_th: float, + use_sigmoid: bool = True, + use_xy_sigmoid: bool = True, + score_mode: str = "obj_cls", + anchors_by_level: Optional[List[List[Tuple[float, float]]]] = None, +) -> List[Box]: + nodes = _pick_yolo_nodes(all_nodes, num_classes) + if not nodes: + raise RuntimeError("No YOLO-like [1,C,H,W] output nodes found") + anchors_levels = anchors_by_level or YOLO_DEFAULT_ANCHORS + + boxes: List[Box] = [] + attrs = 5 + num_classes + + for lv, o in enumerate(nodes): + _, ch, gh, gw = o["shape"] + na = int(ch // attrs) + data: np.ndarray = o["data"] + anchors = anchors_levels[min(lv, len(anchors_levels) - 1)] + + def at(channel_idx: int, y: int, x: int) -> float: + return float(data[channel_idx * gh * gw + y * gw + x]) + + for a in range(na): + aw, ah = anchors[min(a, len(anchors) - 1)] + base = a * attrs + for y in range(gh): + for x in range(gw): + tx = at(base + 0, y, x) + ty = at(base + 1, y, x) + tw = at(base + 2, y, x) + th = at(base + 3, y, x) + to = at(base + 4, y, x) + + obj = float(_sigmoid(to) if use_sigmoid else to) + best_cls = -1 + best_prob = -1e9 + for k in range(num_classes): + p = at(base + 5 + k, y, x) + p = float(_sigmoid(p) if use_sigmoid else p) + if p > best_prob: + best_prob = p + best_cls = k + + if score_mode == "obj": + score = obj + elif score_mode == "cls": + score = best_prob + else: + score = obj * best_prob + if score < conf_th: + continue + + if mode == "yolov5": + sx = input_w / gw + sy = input_h / gh + txv = float(_sigmoid(tx) if use_xy_sigmoid else tx) + tyv = float(_sigmoid(ty) if use_xy_sigmoid else ty) + bx = (txv * 2.0 - 0.5 + x) * sx + by = (tyv * 2.0 - 0.5 + y) * sy + bw = (float(_sigmoid(tw)) * 2.0) ** 2 * aw + bh = (float(_sigmoid(th)) * 2.0) ** 2 * ah + else: + txv = float(_sigmoid(tx) if use_xy_sigmoid else tx) + tyv = float(_sigmoid(ty) if use_xy_sigmoid else ty) + bx = 
(txv + x) / gw * input_w + by = (tyv + y) / gh * input_h + bw = aw * math.exp(tw) + bh = ah * math.exp(th) + + boxes.append( + Box( + cls=best_cls, + score=score, + x1=bx - bw / 2.0, + y1=by - bh / 2.0, + x2=bx + bw / 2.0, + y2=by + bh / 2.0, + ) + ) + return boxes + + +def _auto_fcos_indices(all_nodes: Sequence[Dict[str, Any]], num_classes: int) -> List[Tuple[int, int, int, int]]: + valid = [o for o in all_nodes if len(o["shape"]) == 4 and o["shape"][0] == 1] + cls_nodes = [o for o in valid if int(o["shape"][1]) == num_classes] + reg_nodes = [o for o in valid if int(o["shape"][1]) == 4] + ctr_nodes = [o for o in valid if int(o["shape"][1]) == 1] + + by_hw: Dict[Tuple[int, int], Dict[str, Dict[str, Any]]] = {} + for n in cls_nodes: + by_hw.setdefault((int(n["shape"][2]), int(n["shape"][3])), {})["cls"] = n + for n in reg_nodes: + by_hw.setdefault((int(n["shape"][2]), int(n["shape"][3])), {})["reg"] = n + for n in ctr_nodes: + by_hw.setdefault((int(n["shape"][2]), int(n["shape"][3])), {})["ctr"] = n + + levels: List[Tuple[int, int, int, int]] = [] + for (h, _w), items in by_hw.items(): + if not {"cls", "reg", "ctr"}.issubset(items.keys()): + continue + levels.append( + ( + h, + int(items["cls"]["node_idx"]), + int(items["reg"]["node_idx"]), + int(items["ctr"]["node_idx"]), + ) + ) + levels.sort(key=lambda x: x[0], reverse=True) + strides = [8, 16, 32, 64, 128] + return [ + (cls_i, reg_i, ctr_i, strides[min(i, len(strides) - 1)]) + for i, (_h, cls_i, reg_i, ctr_i) in enumerate(levels) + ] + + +def decode_fcos( + all_nodes: Sequence[Dict[str, Any]], + num_classes: int, + input_w: int, + input_h: int, + conf_th: float, + use_sigmoid: bool = True, + score_mode: str = "obj_cls", +) -> List[Box]: + levels = _auto_fcos_indices(all_nodes, num_classes) + if not levels: + raise RuntimeError("Cannot auto match FCOS cls/reg/ctr nodes") + + boxes: List[Box] = [] + by_idx = {int(n["node_idx"]): n for n in all_nodes} + + for cls_idx, reg_idx, ctr_idx, stride in levels: + cls_node 
= by_idx.get(cls_idx) + reg_node = by_idx.get(reg_idx) + ctr_node = by_idx.get(ctr_idx) + if not cls_node or not reg_node or not ctr_node: + continue + + gh = int(cls_node["shape"][2]) + gw = int(cls_node["shape"][3]) + cls_data: np.ndarray = cls_node["data"] + reg_data: np.ndarray = reg_node["data"] + ctr_data: np.ndarray = ctr_node["data"] + + def at(node_data: np.ndarray, channel_idx: int, y: int, x: int) -> float: + return float(node_data[channel_idx * gh * gw + y * gw + x]) + + cls_channels = int(cls_node["shape"][1]) + for y in range(gh): + for x in range(gw): + ctr = at(ctr_data, 0, y, x) + ctr = float(_sigmoid(ctr) if use_sigmoid else ctr) + + best_cls = -1 + best_prob = -1e9 + for k in range(min(num_classes, cls_channels)): + p = at(cls_data, k, y, x) + p = float(_sigmoid(p) if use_sigmoid else p) + if p > best_prob: + best_prob = p + best_cls = k + + if score_mode == "obj": + score = ctr + elif score_mode == "cls": + score = best_prob + else: + score = math.sqrt(max(0.0, best_prob * ctr)) + if score < conf_th: + continue + + l = max(0.0, at(reg_data, 0, y, x)) + t = max(0.0, at(reg_data, 1, y, x)) + r = max(0.0, at(reg_data, 2, y, x)) + b = max(0.0, at(reg_data, 3, y, x)) + cx = (x + 0.5) * stride + cy = (y + 0.5) * stride + + x1 = max(0.0, min(input_w, cx - l)) + y1 = max(0.0, min(input_h, cy - t)) + x2 = max(0.0, min(input_w, cx + r)) + y2 = max(0.0, min(input_h, cy + b)) + if x2 <= x1 or y2 <= y1: + continue + boxes.append(Box(cls=best_cls, score=score, x1=x1, y1=y1, x2=x2, y2=y2)) + return boxes + + +def _iou(a: Box, b: Box) -> float: + xx1 = max(a.x1, b.x1) + yy1 = max(a.y1, b.y1) + xx2 = min(a.x2, b.x2) + yy2 = min(a.y2, b.y2) + w = max(0.0, xx2 - xx1) + h = max(0.0, yy2 - yy1) + inter = w * h + if inter <= 0: + return 0.0 + area_a = max(0.0, a.x2 - a.x1) * max(0.0, a.y2 - a.y1) + area_b = max(0.0, b.x2 - b.x1) * max(0.0, b.y2 - b.y1) + return inter / max(1e-9, area_a + area_b - inter) + + +def nms(boxes: Sequence[Box], iou_th: float, max_out: int) 
-> List[Box]: + by_cls: Dict[int, List[Box]] = {} + for b in boxes: + by_cls.setdefault(b.cls, []).append(b) + + kept: List[Box] = [] + for cls_boxes in by_cls.values(): + cls_boxes = sorted(cls_boxes, key=lambda b: b.score, reverse=True) + picked: List[Box] = [] + while cls_boxes: + cur = cls_boxes.pop(0) + picked.append(cur) + cls_boxes = [b for b in cls_boxes if _iou(cur, b) <= iou_th] + kept.extend(picked) + + kept.sort(key=lambda b: b.score, reverse=True) + return kept[:max_out] diff --git a/local_service_win/STRATEGY.md b/local_service_win/STRATEGY.md index 50c8bfa..43881a1 100644 --- a/local_service_win/STRATEGY.md +++ b/local_service_win/STRATEGY.md @@ -347,6 +347,46 @@ Response } ``` +### `POST /inference/run_video` +Notes +- Video file upload endpoint for continuous inference in PoC. +- Response is NDJSON stream (`application/x-ndjson`), one JSON object per processed frame. +- ByteTrack-specific tracking output is out of scope for current PoC; this endpoint returns raw model outputs per frame. +Request (`multipart/form-data`) +- `file`: video file (`.mp4/.avi/...`) +- `model_id`: integer +- `image_format`: `RGB565` | `RGBA8888` | `RAW8` +- `channels_ordering`: optional, default `DEFAULT` +- `output_dtype`: optional, default `float32` +- `sample_every_n`: optional, default `1` +- `max_frames`: optional + +Response line example (NDJSON) +```json +{ + "ok": true, + "data": { + "frame_index": 0, + "width": 640, + "height": 640, + "outputs": [ + { "node_idx": 0, "dtype": "float32", "shape": [1, 255, 80, 80], "data_base64": "..." } + ] + }, + "error": null +} +``` + +### `GET /tools/video-inference` +Notes +- Serves a single-page visual test tool from LocalAPI. +- Supports two input sources: + - Video file + - Webcam (browser `getUserMedia`) +- Frontend calls `POST /inference/run` frame-by-frame and draws decoded boxes on canvas. +- Purpose: PoC visual validation for YOLOv5/FCOS/TinyYOLO style models. 
+- ByteTrack visualization/tracking is intentionally excluded in current phase. + ### `WS /ws` (streaming inference) Notes - For camera/video stream, use WebSocket for low-latency send/receive. @@ -396,7 +436,7 @@ Message (server -> client) - MEMO: define production approach for privilege handling (installer-time elevation, helper process with UAC prompt, or enterprise pre-install policy) so end-user flow does not get blocked. ## API Test Progress (Windows PoC) -Updated: 2026-03-03 +Updated: 2026-03-04 ### Completed - `GET /health` @@ -417,6 +457,8 @@ Updated: 2026-03-03 - `POST /models/clear` - `POST /models/reset` - `POST /inference/run` +- `POST /inference/run_video` +- `GET /tools/video-inference` ### Pending - None (for currently implemented HTTP endpoints). @@ -429,6 +471,24 @@ Updated: 2026-03-03 - `POST /models/load` and `POST /inference/run` must be tested as a pair in the same flow. - Test pairs are defined in `local_service_win/TestRes/TEST_PAIRS.md`. +### Video/Webcam PoC Test Flow +1. Start LocalAPI service. +2. Connect device and load model: + - `POST /devices/connect` + - `POST /models/load` +3. Visual tool path: + - Open `http://127.0.0.1:4398/tools/video-inference` + - Select source (`Video File` or `Webcam`) + - Use default model presets (YOLOv5=20005, FCOS=20004, TinyYOLO=19), then click `Start` +4. API-only path: + - Use `POST /inference/run_video` with `multipart/form-data` + - Start with small values: `sample_every_n=3`, `max_frames=30` +5. Expected: + - Continuous frame-wise inference results are returned. + - Visual page overlays detection boxes on displayed frames. +6. Current scope note: + - ByteTrack tracking output (`track_id` continuity) is not covered in this PoC phase. + ### Model/Inference Test Pairs #### KL520 1. 
YOLOv5 (model zoo) diff --git a/local_service_win/TestRes/Images/MOT16-03_trim.mp4 b/local_service_win/TestRes/Images/MOT16-03_trim.mp4 new file mode 100644 index 0000000..6124631 Binary files /dev/null and b/local_service_win/TestRes/Images/MOT16-03_trim.mp4 differ diff --git a/local_service_win/TestRes/Images/VideoInferenceWeb.html b/local_service_win/TestRes/Images/VideoInferenceWeb.html new file mode 100644 index 0000000..b05a64d --- /dev/null +++ b/local_service_win/TestRes/Images/VideoInferenceWeb.html @@ -0,0 +1,627 @@ + + + + + + Video Inference Viewer + + + +
+
+

Video Inference (API)

+
+
+ + +
+
+ + +
+ +
+ + +
+
+ +
+
+ + +
+
+ + +
+
+ +
+
+ + +
+
+ + +
+
+ +
+
+ + +
+
+ + +
+
+ +
+
+ + +
+
+ + +
+
+ +
+
+ + +
+
+ + +
+
+ +
+ + +
+ +
Ready.
+
預設值可直接測 YOLOv5。先確認 LocalAPI 已啟動,並完成 connect + load model。
+
+ +
+
+ +
+ + +
+
+ + + + diff --git a/local_service_win/TestRes/video_inference_viewer.py b/local_service_win/TestRes/video_inference_viewer.py new file mode 100644 index 0000000..ae714ee --- /dev/null +++ b/local_service_win/TestRes/video_inference_viewer.py @@ -0,0 +1,514 @@ +from __future__ import annotations + +import argparse +import base64 +import json +import math +import sys +import time +import urllib.error +import urllib.request +from pathlib import Path +from typing import Any, Dict, List, Optional, Sequence, Tuple + +import cv2 +import numpy as np + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) +from LocalAPI import postprocess_core as core + + +YOLO_DEFAULT_ANCHORS: List[List[Tuple[float, float]]] = [ + [(10.0, 14.0), (23.0, 27.0), (37.0, 58.0)], + [(81.0, 82.0), (135.0, 169.0), (344.0, 319.0)], +] + + +def _sigmoid(v: np.ndarray | float) -> np.ndarray | float: + return 1.0 / (1.0 + np.exp(-v)) + + +def _encode_frame(frame_bgr: np.ndarray, image_format: str) -> bytes: + fmt = image_format.upper() + if fmt == "RGBA8888": + rgba = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGBA) + return rgba.tobytes() + if fmt == "RAW8": + gray = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY) + return gray.tobytes() + if fmt == "RGB565": + bgr565 = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2BGR565) + return bgr565.tobytes() + raise ValueError(f"Unsupported image_format: {image_format}") + + +def _call_inference_run( + base_url: str, + model_id: int, + image_format: str, + width: int, + height: int, + image_bytes: bytes, + channels_ordering: str = "DEFAULT", + output_dtype: str = "float32", + timeout_sec: float = 20.0, +) -> Dict[str, Any]: + body = { + "model_id": model_id, + "image_format": image_format, + "width": width, + "height": height, + "image_base64": base64.b64encode(image_bytes).decode("ascii"), + "channels_ordering": channels_ordering, + "output_dtype": output_dtype, + } + req = urllib.request.Request( + url=f"{base_url.rstrip('/')}/inference/run", + 
data=json.dumps(body).encode("utf-8"), + headers={"Content-Type": "application/json"}, + method="POST", + ) + try: + with urllib.request.urlopen(req, timeout=timeout_sec) as resp: + content = resp.read().decode("utf-8", errors="replace") + except urllib.error.HTTPError as exc: + msg = exc.read().decode("utf-8", errors="replace") + raise RuntimeError(f"HTTP {exc.code}: {msg}") from exc + except urllib.error.URLError as exc: + raise RuntimeError(f"Request failed: {exc}") from exc + + parsed = json.loads(content) + if not parsed.get("ok"): + raise RuntimeError(json.dumps(parsed.get("error"), ensure_ascii=False)) + return parsed["data"] + + +def _decode_outputs(raw_outputs: Sequence[Dict[str, Any]]) -> List[Dict[str, Any]]: + decoded: List[Dict[str, Any]] = [] + for idx, o in enumerate(raw_outputs): + shape = list(o.get("shape") or []) + data_b64 = str(o.get("data_base64") or "") + raw = base64.b64decode(data_b64) + arr = np.frombuffer(raw, dtype=" List[Dict[str, Any]]: + picked: List[Dict[str, Any]] = [] + for o in all_nodes: + shape = o["shape"] + if len(shape) != 4 or shape[0] != 1: + continue + ch = int(shape[1]) + if ch % (5 + num_classes) != 0: + continue + picked.append(o) + picked.sort(key=lambda n: int(n["shape"][2]), reverse=True) + return picked + + +def _decode_yolo_common( + all_nodes: Sequence[Dict[str, Any]], + mode: str, + num_classes: int, + input_w: int, + input_h: int, + conf_th: float, + use_sigmoid: bool = True, + use_xy_sigmoid: bool = True, + score_mode: str = "obj_cls", + anchors_by_level: Optional[List[List[Tuple[float, float]]]] = None, +) -> List[Box]: + nodes = _pick_yolo_nodes(all_nodes, num_classes) + if not nodes: + raise RuntimeError("No YOLO-like [1,C,H,W] output nodes found") + anchors_levels = anchors_by_level or YOLO_DEFAULT_ANCHORS + + boxes: List[Box] = [] + attrs = 5 + num_classes + + for lv, o in enumerate(nodes): + _, ch, gh, gw = o["shape"] + na = int(ch // attrs) + data: np.ndarray = o["data"] + anchors = 
anchors_levels[min(lv, len(anchors_levels) - 1)] + + def at(channel_idx: int, y: int, x: int) -> float: + return float(data[channel_idx * gh * gw + y * gw + x]) + + for a in range(na): + aw, ah = anchors[min(a, len(anchors) - 1)] + base = a * attrs + + for y in range(gh): + for x in range(gw): + tx = at(base + 0, y, x) + ty = at(base + 1, y, x) + tw = at(base + 2, y, x) + th = at(base + 3, y, x) + to = at(base + 4, y, x) + + obj = float(_sigmoid(to) if use_sigmoid else to) + best_cls = -1 + best_prob = -1e9 + for k in range(num_classes): + p = at(base + 5 + k, y, x) + p = float(_sigmoid(p) if use_sigmoid else p) + if p > best_prob: + best_prob = p + best_cls = k + + if score_mode == "obj": + score = obj + elif score_mode == "cls": + score = best_prob + else: + score = obj * best_prob + + if score < conf_th: + continue + + if mode == "yolov5": + sx = input_w / gw + sy = input_h / gh + txv = float(_sigmoid(tx) if use_xy_sigmoid else tx) + tyv = float(_sigmoid(ty) if use_xy_sigmoid else ty) + bx = (txv * 2.0 - 0.5 + x) * sx + by = (tyv * 2.0 - 0.5 + y) * sy + bw = (float(_sigmoid(tw)) * 2.0) ** 2 * aw + bh = (float(_sigmoid(th)) * 2.0) ** 2 * ah + else: + txv = float(_sigmoid(tx) if use_xy_sigmoid else tx) + tyv = float(_sigmoid(ty) if use_xy_sigmoid else ty) + bx = (txv + x) / gw * input_w + by = (tyv + y) / gh * input_h + bw = aw * math.exp(tw) + bh = ah * math.exp(th) + + boxes.append( + Box( + cls=best_cls, + score=score, + x1=bx - bw / 2.0, + y1=by - bh / 2.0, + x2=bx + bw / 2.0, + y2=by + bh / 2.0, + ) + ) + + return boxes + + +def _auto_fcos_indices(all_nodes: Sequence[Dict[str, Any]], num_classes: int) -> List[Tuple[int, int, int, int]]: + valid = [o for o in all_nodes if len(o["shape"]) == 4 and o["shape"][0] == 1] + cls_nodes = [o for o in valid if int(o["shape"][1]) == num_classes] + reg_nodes = [o for o in valid if int(o["shape"][1]) == 4] + ctr_nodes = [o for o in valid if int(o["shape"][1]) == 1] + + by_hw: Dict[Tuple[int, int], Dict[str, Dict[str, 
def _auto_fcos_indices(all_nodes: Sequence[Dict[str, Any]], num_classes: int) -> List[Tuple[int, int, int, int]]:
    """Match FCOS cls/reg/ctr heads by resolution and assign a stride to each.

    Only ``[1, C, H, W]`` nodes are considered. A node is a class head when
    ``C == num_classes``, a regression head when ``C == 4`` and a centerness
    head when ``C == 1``. Heads sharing the same ``(H, W)`` form one level;
    levels lacking any of the three heads are ignored.

    Args:
        all_nodes: Output nodes carrying ``"shape"`` and ``"node_idx"``.
        num_classes: Channel count that identifies a class head.

    Returns:
        ``(cls_idx, reg_idx, ctr_idx, stride)`` per level, ordered from the
        tallest feature map down. Strides are taken positionally from
        ``8, 16, 32, 64, 128`` (the last value repeats for extra levels).
    """
    # NOTE(review): when num_classes is 1 or 4 the channel counts collide and
    # the same node can be matched to two roles - confirm callers avoid this.
    valid = [n for n in all_nodes if len(n["shape"]) == 4 and n["shape"][0] == 1]

    roles_by_hw: Dict[Tuple[int, int], Dict[str, Dict[str, Any]]] = {}
    # Roles are filled in cls -> reg -> ctr order; a later node with the same
    # role and resolution replaces an earlier one.
    for role, channels in (("cls", num_classes), ("reg", 4), ("ctr", 1)):
        for node in valid:
            if int(node["shape"][1]) != channels:
                continue
            hw = (int(node["shape"][2]), int(node["shape"][3]))
            roles_by_hw.setdefault(hw, {})[role] = node

    required = {"cls", "reg", "ctr"}
    levels: List[Tuple[int, int, int, int]] = [
        (
            h,
            int(heads["cls"]["node_idx"]),
            int(heads["reg"]["node_idx"]),
            int(heads["ctr"]["node_idx"]),
        )
        for (h, _w), heads in roles_by_hw.items()
        if required.issubset(heads)
    ]
    levels.sort(key=lambda lvl: lvl[0], reverse=True)

    strides = (8, 16, 32, 64, 128)
    return [
        (cls_i, reg_i, ctr_i, strides[min(i, len(strides) - 1)])
        for i, (_h, cls_i, reg_i, ctr_i) in enumerate(levels)
    ]


def _decode_fcos(
    all_nodes: Sequence[Dict[str, Any]],
    num_classes: int,
    input_w: int,
    input_h: int,
    conf_th: float,
    use_sigmoid: bool = True,
    score_mode: str = "obj_cls",
) -> List["Box"]:
    """Decode FCOS head outputs into candidate boxes (no NMS).

    Args:
        all_nodes: Decoded output nodes with ``"shape"``, ``"data"`` and
            ``"node_idx"``; ``data`` is indexed as a flat ``[C, H, W]`` buffer.
        num_classes: Number of class channels expected on the class head.
        input_w: Model input width; x coordinates are clipped to ``[0, input_w]``.
        input_h: Model input height; y coordinates are clipped to ``[0, input_h]``.
        conf_th: Locations scoring below this value are dropped.
        use_sigmoid: Apply ``_sigmoid`` to class and centerness values.
        score_mode: ``"obj"`` scores by centerness, ``"cls"`` by the best class
            value, anything else by ``sqrt(best_class * centerness)``.

    Returns:
        Clipped, non-degenerate boxes for every location that passes ``conf_th``.

    Raises:
        RuntimeError: If no complete cls/reg/ctr level can be matched.
    """
    levels = _auto_fcos_indices(all_nodes, num_classes)
    if not levels:
        raise RuntimeError("Cannot auto match FCOS cls/reg/ctr nodes")

    boxes: List["Box"] = []
    by_idx = {int(n["node_idx"]): n for n in all_nodes}

    for cls_idx, reg_idx, ctr_idx, stride in levels:
        cls_node = by_idx.get(cls_idx)
        reg_node = by_idx.get(reg_idx)
        ctr_node = by_idx.get(ctr_idx)
        if not cls_node or not reg_node or not ctr_node:
            continue

        gh = int(cls_node["shape"][2])
        gw = int(cls_node["shape"][3])
        plane = gh * gw
        cls_data: np.ndarray = cls_node["data"]
        reg_data: np.ndarray = reg_node["data"]
        ctr_data: np.ndarray = ctr_node["data"]

        def at(node_data: np.ndarray, channel_idx: int, y: int, x: int) -> float:
            # Flat CHW lookup; all three heads of a level share (gh, gw).
            return float(node_data[channel_idx * plane + y * gw + x])

        # Never read past the channels the class head actually has.
        cls_count = min(num_classes, int(cls_node["shape"][1]))

        for y in range(gh):
            for x in range(gw):
                ctr = at(ctr_data, 0, y, x)
                ctr = float(_sigmoid(ctr) if use_sigmoid else ctr)

                best_cls = -1
                best_prob = -1e9
                for k in range(cls_count):
                    p = at(cls_data, k, y, x)
                    p = float(_sigmoid(p) if use_sigmoid else p)
                    if p > best_prob:
                        best_prob = p
                        best_cls = k

                if score_mode == "obj":
                    score = ctr
                elif score_mode == "cls":
                    score = best_prob
                else:
                    score = math.sqrt(max(0.0, best_prob * ctr))
                if score < conf_th:
                    continue

                # Distances from the location to the four box sides; negative
                # predictions are treated as zero.
                # NOTE(review): distances are used as-is, not multiplied by
                # stride - confirm this matches the exported model.
                l = max(0.0, at(reg_data, 0, y, x))
                t = max(0.0, at(reg_data, 1, y, x))
                r = max(0.0, at(reg_data, 2, y, x))
                b = max(0.0, at(reg_data, 3, y, x))
                cx = (x + 0.5) * stride
                cy = (y + 0.5) * stride

                x1 = max(0.0, min(input_w, cx - l))
                y1 = max(0.0, min(input_h, cy - t))
                x2 = max(0.0, min(input_w, cx + r))
                y2 = max(0.0, min(input_h, cy + b))
                if x2 <= x1 or y2 <= y1:
                    continue
                boxes.append(Box(cls=best_cls, score=score, x1=x1, y1=y1, x2=x2, y2=y2))

    return boxes


def _iou(a: "Box", b: "Box") -> float:
    """Return the intersection-over-union of two corner-format boxes.

    Returns ``0.0`` when the boxes do not overlap.
    """
    xx1 = max(a.x1, b.x1)
    yy1 = max(a.y1, b.y1)
    xx2 = min(a.x2, b.x2)
    yy2 = min(a.y2, b.y2)
    w = max(0.0, xx2 - xx1)
    h = max(0.0, yy2 - yy1)
    inter = w * h
    if inter <= 0:
        return 0.0
    area_a = max(0.0, a.x2 - a.x1) * max(0.0, a.y2 - a.y1)
    area_b = max(0.0, b.x2 - b.x1) * max(0.0, b.y2 - b.y1)
    # The floor on the denominator avoids division by zero.
    return inter / max(1e-9, area_a + area_b - inter)


def _nms(boxes: Sequence["Box"], iou_th: float, max_out: int) -> List["Box"]:
    """Run greedy per-class non-maximum suppression.

    Args:
        boxes: Candidate boxes exposing ``cls``, ``score`` and corner fields.
        iou_th: A box is dropped when its IoU with an already kept box of the
            same class exceeds this value.
        max_out: Maximum number of boxes to return; values ``<= 0`` yield an
            empty list.

    Returns:
        Surviving boxes of all classes, highest score first, at most
        ``max_out`` of them.
    """
    by_cls: Dict[int, List["Box"]] = {}
    for b in boxes:
        by_cls.setdefault(b.cls, []).append(b)

    kept: List["Box"] = []
    for cls_boxes in by_cls.values():
        picked: List["Box"] = []
        # Walk candidates best-first; keep one only if it does not overlap
        # any box already kept for this class.
        for cand in sorted(cls_boxes, key=lambda b: b.score, reverse=True):
            if all(_iou(p, cand) <= iou_th for p in picked):
                picked.append(cand)
        kept.extend(picked)

    kept.sort(key=lambda b: b.score, reverse=True)
    # Clamp so a negative limit cannot slice from the end of the list.
    return kept[: max(0, max_out)]
def _draw_boxes(frame: np.ndarray, boxes: Sequence["core.Box"], input_w: int, input_h: int) -> np.ndarray:
    """Return a copy of ``frame`` with the detection boxes drawn on it.

    Box coordinates are given in model-input space (``input_w`` x ``input_h``)
    and are rescaled to the frame size before drawing. Boxes that collapse to
    zero width or height after clamping are skipped.
    """
    out = frame.copy()
    h, w = out.shape[:2]
    sx = w / float(input_w)
    sy = h / float(input_h)

    for b in boxes:
        x1 = int(max(0, min(w - 1, round(b.x1 * sx))))
        y1 = int(max(0, min(h - 1, round(b.y1 * sy))))
        x2 = int(max(0, min(w - 1, round(b.x2 * sx))))
        y2 = int(max(0, min(h - 1, round(b.y2 * sy))))
        if x2 <= x1 or y2 <= y1:
            continue
        # Derive a per-class colour by stepping the hue and converting to BGR.
        color = tuple(int(c) for c in cv2.cvtColor(np.uint8([[[b.cls * 47 % 180, 255, 220]]]), cv2.COLOR_HSV2BGR)[0][0])
        cv2.rectangle(out, (x1, y1), (x2, y2), color, 2)
        text = f"{b.cls}:{b.score:.3f}"
        # Keep the label inside the frame when the box touches the top edge.
        cv2.putText(out, text, (x1, max(14, y1 - 4)), cv2.FONT_HERSHEY_SIMPLEX, 0.45, color, 2, cv2.LINE_AA)
    return out


def _pick_video_via_dialog() -> Optional[str]:
    """Ask the user for a video file through a Tk file dialog.

    Returns:
        The chosen path, or ``None`` when tkinter cannot be imported, no
        window can be created, or the dialog is cancelled.
    """
    try:
        import tkinter as tk
        from tkinter import filedialog
    except Exception:
        return None
    try:
        root = tk.Tk()
    except tk.TclError:
        # Tk cannot create a window (e.g. no display); treat as "no selection".
        return None
    try:
        root.withdraw()
        path = filedialog.askopenfilename(
            title="Select video file",
            filetypes=[("Video files", "*.mp4 *.avi *.mov *.mkv *.wmv"), ("All files", "*.*")],
        )
    finally:
        # Always tear the hidden root window down, even if the dialog raised.
        root.destroy()
    return path or None


def _defaults_for_model(model_type: str) -> Tuple[int, int]:
    """Return the default ``(width, height)`` input size for a model type.

    The lookup is case-insensitive; unknown types fall back to ``640 x 640``.
    """
    sizes = {"fcos": (512, 512), "tinyyolo": (224, 224)}
    return sizes.get(model_type.lower(), (640, 640))


def main() -> None:
    """Play a video, run sampled frames through ``/inference/run`` and show boxes.

    Every ``--sample-every-n``-th frame is resized, encoded and sent to the
    service. The most recent detections are drawn on every frame, shown in a
    window and optionally written to ``--save-output``. Press ``q`` to stop.

    Raises:
        SystemExit: If no video is selected, the video cannot be found or
            opened, or the output file cannot be opened for writing.
    """
    parser = argparse.ArgumentParser(description="Video -> /inference/run -> draw detection boxes")
    parser.add_argument("--base-url", default="http://127.0.0.1:4398")
    parser.add_argument("--video", default="")
    parser.add_argument("--model-id", type=int, required=True)
    parser.add_argument("--model-type", choices=["yolov5", "fcos", "tinyyolo"], default="yolov5")
    parser.add_argument("--input-width", type=int, default=0)
    parser.add_argument("--input-height", type=int, default=0)
    parser.add_argument("--image-format", default="RGBA8888")
    parser.add_argument("--num-classes", type=int, default=80)
    parser.add_argument("--score-th", type=float, default=0.25)
    parser.add_argument("--iou-th", type=float, default=0.45)
    parser.add_argument("--max-boxes", type=int, default=200)
    parser.add_argument("--sample-every-n", type=int, default=3)
    parser.add_argument("--save-output", default="")
    args = parser.parse_args()

    video_path = args.video.strip() or _pick_video_via_dialog()
    if not video_path:
        raise SystemExit("No video selected")
    if not Path(video_path).is_file():
        raise SystemExit(f"Video not found: {video_path}")

    # A width/height of 0 means "use the default for this model type".
    default_w, default_h = _defaults_for_model(args.model_type)
    in_w = int(args.input_width or default_w)
    in_h = int(args.input_height or default_h)
    sample_every = max(1, args.sample_every_n)

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise SystemExit(f"Cannot open video: {video_path}")

    writer: Optional[cv2.VideoWriter] = None
    try:
        # The writer is created inside the try so the capture is released
        # even when opening the output fails.
        if args.save_output:
            fourcc = cv2.VideoWriter_fourcc(*"mp4v")
            fps = float(cap.get(cv2.CAP_PROP_FPS) or 20.0)
            frame_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH) or in_w)
            frame_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT) or in_h)
            writer = cv2.VideoWriter(args.save_output, fourcc, fps, (frame_w, frame_h))
            if not writer.isOpened():
                # Fail before processing rather than silently writing nothing.
                raise SystemExit(f"Cannot open output for writing: {args.save_output}")

        print("Press 'q' to quit.")
        frame_idx = -1
        infer_count = 0
        last_boxes: List["core.Box"] = []
        t0 = time.time()

        while True:
            ok, frame = cap.read()
            if not ok:
                break
            frame_idx += 1

            if frame_idx % sample_every == 0:
                infer_count += 1
                resized = cv2.resize(frame, (in_w, in_h), interpolation=cv2.INTER_AREA)
                image_bytes = _encode_frame(resized, args.image_format)
                try:
                    result = _call_inference_run(
                        base_url=args.base_url,
                        model_id=args.model_id,
                        image_format=args.image_format,
                        width=in_w,
                        height=in_h,
                        image_bytes=image_bytes,
                    )
                    raw_outputs = result.get("outputs") or []
                    outputs = core.decode_outputs(raw_outputs)
                    if args.model_type == "fcos":
                        raw_boxes = core.decode_fcos(
                            outputs,
                            num_classes=args.num_classes,
                            input_w=in_w,
                            input_h=in_h,
                            conf_th=args.score_th,
                        )
                    else:
                        raw_boxes = core.decode_yolo_common(
                            outputs,
                            mode="yolov5" if args.model_type == "yolov5" else "tinyyolo",
                            num_classes=args.num_classes,
                            input_w=in_w,
                            input_h=in_h,
                            conf_th=args.score_th,
                        )
                    last_boxes = core.nms(raw_boxes, iou_th=args.iou_th, max_out=args.max_boxes)
                except Exception as exc:
                    # Best effort per frame: report the failure and keep
                    # showing the previous detections.
                    print(f"[frame {frame_idx}] inference failed: {exc}")

            vis = _draw_boxes(frame, last_boxes, in_w, in_h)
            elapsed = max(1e-6, time.time() - t0)
            api_fps = infer_count / elapsed
            cv2.putText(
                vis,
                f"frame={frame_idx} infer={infer_count} api_fps={api_fps:.2f} boxes={len(last_boxes)}",
                (10, 24),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.65,
                (0, 255, 0),
                2,
                cv2.LINE_AA,
            )

            cv2.imshow("Kneron Video Inference Viewer", vis)
            if writer is not None:
                writer.write(vis)

            key = cv2.waitKey(1) & 0xFF
            if key == ord("q"):
                break
    finally:
        cap.release()
        if writer is not None:
            writer.release()
        cv2.destroyAllWindows()


if __name__ == "__main__":
    main()