debug: Add comprehensive logging to diagnose pipeline hanging issue

- Add pipeline activity logging every 10 results to track processing
- Add queue size monitoring in the InferencePipeline coordinator
- Add camera frame capture logging every 100 frames
- Add MultiDongle send/receive thread logging every 100 operations
- Add error handling for repeated callback failures in camera source

This will help identify where the pipeline stops processing:

- Camera capture stopping
- MultiDongle threads blocking
- Pipeline coordinator hanging
- Queue capacity issues

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-07-24 19:49:00 +08:00 · 2025-07-24 19:49:00 +08:00 · cde1aac908
commit cde1aac908
parent 4b8fb7fead
3 changed files with 28 additions and 1 deletions
--- a/cluster4npu_ui/core/functions/InferencePipeline.py
+++ b/cluster4npu_ui/core/functions/InferencePipeline.py
@ -522,6 +522,11 @@ class InferencePipeline:
                        # Record output timestamp for FPS calculation
                        self._record_output_timestamp()
                        # Debug: Log pipeline activity every 10 results
                        if self.completed_counter % 10 == 0:
                            print(f"[{self.pipeline_name}] Processed {self.completed_counter} results")
                            print(f"[{self.pipeline_name}] Queue sizes - Input: {self.pipeline_input_queue.qsize()}, Output: {self.pipeline_output_queue.qsize()}")
                        # Call result callback
                        if self.result_callback:
                            self.result_callback(current_data)
--- a/cluster4npu_ui/core/functions/Multidongle.py
+++ b/cluster4npu_ui/core/functions/Multidongle.py
@ -409,6 +409,7 @@ class MultiDongle:
    def _send_thread_func(self):
        """Internal function run by the send thread, gets images from input queue."""
        print("Send thread started.")
        send_count = 0
        while not self._stop_event.is_set():
            if self.generic_inference_input_descriptor is None:
                # Wait for descriptor to be ready or stop
@ -434,6 +435,12 @@ class MultiDongle:
                # Configure and send the image
                self._inference_counter += 1  # Increment counter for each image
                send_count += 1
                # Debug: Log send activity every 100 images
                if send_count % 100 == 0:
                    print(f"[MultiDongle] Sent {send_count} images to inference")
                self.generic_inference_input_descriptor.inference_number = self._inference_counter
                self.generic_inference_input_descriptor.input_node_image_list = [kp.GenericInputNodeImage(
                    image=image_data,
@ -445,7 +452,6 @@ class MultiDongle:
                kp.inference.generic_image_inference_send(device_group=self.device_group,
                                                          generic_inference_input_descriptor=self.generic_inference_input_descriptor)
                # print("Image sent.") # Optional: add log
                # No need for sleep here usually, as queue.get is blocking
            except kp.ApiKPException as exception:
                print(f' - Error in send thread: inference send failed, error = {exception}')
@ -460,10 +466,16 @@ class MultiDongle:
    def _receive_thread_func(self):
        """Internal function run by the receive thread, puts results into output queue."""
        print("Receive thread started.")
        receive_count = 0
        while not self._stop_event.is_set():
            try:
                generic_inference_output_descriptor = kp.inference.generic_image_inference_receive(device_group=self.device_group)
                self._output_queue.put(generic_inference_output_descriptor)
                receive_count += 1
                # Debug: Log receive activity every 100 results
                if receive_count % 100 == 0:
                    print(f"[MultiDongle] Received {receive_count} inference results")
            except kp.ApiKPException as exception:
                if not self._stop_event.is_set(): # Avoid printing error if we are already stopping
                     print(f' - Error in receive thread: inference receive failed, error = {exception}')
--- a/cluster4npu_ui/core/functions/camera_source.py
+++ b/cluster4npu_ui/core/functions/camera_source.py
@ -95,6 +95,7 @@ class CameraSource:
        """
        The main loop for capturing frames from the camera.
        """
        frame_count = 0
        while self.running and not self._stop_event.is_set():
            ret, frame = self.cap.read()
            if not ret:
@ -104,12 +105,21 @@ class CameraSource:
                self.initialize()
                continue
            frame_count += 1
            # Debug: Log camera activity every 100 frames
            if frame_count % 100 == 0:
                print(f"[Camera] Captured {frame_count} frames")
            if self.data_callback:
                try:
                    # Assuming the callback is thread-safe or handles its own locking
                    self.data_callback(frame)
                except Exception as e:
                    print(f"Error in data_callback: {e}")
                    # If callback fails repeatedly, camera might need to stop
                    if frame_count > 10:  # Allow some initial failures
                        print("Too many callback failures, stopping camera")
                        break
            if self.frame_callback:
                try: