I am working with a custom YOLO-based instance segmentation model in ONNX format. The model takes an input image of shape (1,3,640,640) and outputs 10 tensors:
Bounding Boxes:
- bbox_0: (1, 4, 80, 80)
- bbox_1: (1, 4, 40, 40)
- bbox_2: (1, 4, 20, 20)

Class Probabilities:
- class_prob_0: (1, 10, 80, 80)
- class_prob_1: (1, 10, 40, 40)
- class_prob_2: (1, 10, 20, 20)

Mask Coefficients:
- mask_coeff_0: (1, 32, 80, 80)
- mask_coeff_1: (1, 32, 40, 40)
- mask_coeff_2: (1, 32, 20, 20)

Prototype Mask:
- mask: (1, 32, 160, 160)
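My understanding is that each per-scale head is laid out as (1, C, H, W), with one C-dimensional prediction per grid cell, so before filtering each head has to be flattened to (H*W, C), one row per cell. A minimal sketch of what I mean (flatten_head is my own helper name, not part of the model):

import numpy as np

def flatten_head(t):
    # (1, C, H, W) -> (H*W, C): channels last, one row per grid cell
    _, c, h, w = t.shape
    return t.reshape(c, h * w).T

# e.g. bbox_0 (1, 4, 80, 80) -> (6400, 4), class_prob_0 (1, 10, 80, 80) -> (6400, 10)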
I need to correctly process these outputs to:
- Extract valid object detections and filter redundant ones.
- Assign class labels to detections based on class probabilities.
- Decode mask coefficients to generate segmentation masks.
- Properly overlay instance masks on the original image.
How should I approach post-processing to achieve clean and accurate detections with instance masks?
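For the mask step specifically, my understanding of the YOLACT-style decoding (which YOLOv8-seg also uses) is: take a detection's 32 coefficients, form a linear combination of the 32 prototype masks, apply a sigmoid, then crop to the detection's box and threshold. A minimal sketch of the assembly step, assuming the 160x160 prototype grid covers the whole 640x640 input at 1/4 resolution:

import numpy as np

def assemble_mask(protos, coeffs):
    # protos: (32, 160, 160), coeffs: (32,) -> (160, 160) soft mask in [0, 1]
    m = np.tensordot(coeffs, protos, axes=1)  # linear combination of prototypes
    return 1.0 / (1.0 + np.exp(-m))           # sigmoid

With that in mind, here is my current attempt: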
import cv2
import numpy as np
import onnxruntime as ort
from ultralytics.utils.plotting import Annotator, colors
# Load ONNX model
session = ort.InferenceSession("models/model.onnx", providers=["CPUExecutionProvider"])
cap = cv2.VideoCapture("input/sample.mp4")
w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)
out = cv2.VideoWriter("instance-segmentation-object-tracking.avi", cv2.VideoWriter_fourcc(*"MJPG"), fps, (w, h))
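# Note: the preprocessing below stretches each frame to 640x640 rather than
# letterboxing it. That is self-consistent as long as boxes and masks are scaled
# back with the same per-axis factors (w / 640 and h / 640), as done further down.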
while True:
    ret, im0 = cap.read()
    if not ret:
        print("Video processing completed.")
        break

    # Preprocess frame for ONNX: resize, BGR -> RGB, normalize to [0, 1], NCHW
    im_resized = cv2.resize(im0, (640, 640), interpolation=cv2.INTER_LINEAR)
    im_rgb = cv2.cvtColor(im_resized, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0
    im_rgb = np.ascontiguousarray(np.transpose(im_rgb, (2, 0, 1))[None])  # (1, 3, 640, 640)

    input_name = session.get_inputs()[0].name
    outputs = session.run(None, {input_name: im_rgb})
    # Extract model outputs
    bbox_list = [outputs[i] for i in range(3)]       # bbox_0..bbox_2
    class_probs = [outputs[i] for i in range(3, 6)]  # class_prob_0..class_prob_2
    mask_coeffs = [outputs[i] for i in range(6, 9)]  # mask_coeff_0..mask_coeff_2
    mask_prototype = outputs[9][0]                   # (32, 160, 160)
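    # Assumption: the exporter preserved the output order listed above. If not,
    # map outputs by name instead, e.g.:
    # name_to_out = {o.name: v for o, v in zip(session.get_outputs(), outputs)}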
    # Process bounding boxes per scale
    all_boxes, all_scores, all_classes, all_masks = [], [], [], []
    strides = [8, 16, 32]  # feature-map strides: 640/80, 640/40, 640/20
    for i, stride in enumerate(strides):
        # (1, C, H, W) -> (H*W, C): one row per grid cell, channels last.
        # A plain reshape(-1, C) would interleave cells and channels incorrectly.
        boxes = bbox_list[i][0].reshape(4, -1).T
        probs = class_probs[i][0].reshape(10, -1).T
        masks = mask_coeffs[i][0].reshape(32, -1).T
        # Scale from the 640x640 input space to the original frame, per axis.
        # (This assumes the 4 bbox channels are x1, y1, x2, y2 in input pixels;
        # if they are distances from the cell center, they must first be decoded
        # using the cell grid and `stride`.)
        boxes[:, [0, 2]] *= w / 640.0
        boxes[:, [1, 3]] *= h / 640.0
        scores = probs.max(axis=1)      # max class confidence
        classes = probs.argmax(axis=1)  # class IDs
        valid = scores > 0.3            # confidence threshold
        all_boxes.append(boxes[valid])
        all_scores.append(scores[valid])
        all_classes.append(classes[valid])
        all_masks.append(masks[valid])
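    # The three scales contribute up to 80*80 + 40*40 + 20*20 = 8400 candidate
    # cells in total; most are removed by the confidence filter above.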
    # Flatten and stack results from all scales
    all_boxes = np.vstack(all_boxes)
    all_scores = np.hstack(all_scores)
    all_classes = np.hstack(all_classes)
    all_masks = np.vstack(all_masks)
    if len(all_boxes) == 0:  # no detections above the confidence threshold
        out.write(im0)
        continue
    # Apply Non-Maximum Suppression (NMS)
    boxes_xywh = all_boxes.copy()
    boxes_xywh[:, 2:] -= boxes_xywh[:, :2]  # cv2.dnn.NMSBoxes expects (x, y, w, h)
    indices = cv2.dnn.NMSBoxes(boxes_xywh.tolist(), all_scores.tolist(), 0.3, 0.5)
    if len(indices) == 0:
        out.write(im0)
        continue
    indices = np.array(indices).flatten()
    annotator = Annotator(im0, line_width=2)
    proto = mask_prototype.transpose(1, 2, 0)  # (160, 160, 32): channels last for np.dot
    for idx in indices:
        x1, y1, x2, y2 = map(int, all_boxes[idx])
        # Clamp to the frame and skip degenerate boxes
        x1, y1 = max(x1, 0), max(y1, 0)
        x2, y2 = min(x2, w), min(y2, h)
        if x2 <= x1 or y2 <= y1:
            print("Skipping invalid bbox:", x1, y1, x2, y2)
            continue
        cls, conf = int(all_classes[idx]), all_scores[idx]
        color = colors(cls, True)

        # Reconstruct the instance mask: linear combination of prototypes + sigmoid
        mask = np.dot(proto, all_masks[idx])  # (160, 160) logits over the whole frame
        mask = 1.0 / (1.0 + np.exp(-mask))    # sigmoid -> soft mask in [0, 1]
        # Crop the bbox region in prototype coordinates, then resize that crop to
        # the bbox pixel size (resizing the full-frame mask into the bbox distorts it)
        mx1, my1 = int(x1 * 160 / w), int(y1 * 160 / h)
        mx2 = min(int(np.ceil(x2 * 160 / w)), 160)
        my2 = min(int(np.ceil(y2 * 160 / h)), 160)
        crop = mask[my1:my2, mx1:mx2]
        crop = cv2.resize(crop, (x2 - x1, y2 - y1), interpolation=cv2.INTER_LINEAR)
        mask = (crop > 0.5).astype(np.uint8)  # threshold after resizing for smoother edges
        # Apply the mask only inside the bbox; shapes match by construction
        overlay = im0[y1:y2, x1:x2]
        colored_mask = np.zeros_like(overlay)
        colored_mask[mask.astype(bool)] = color  # paint the instance in its class color
        im0[y1:y2, x1:x2] = cv2.addWeighted(overlay, 1.0, colored_mask, 0.5, 0)
        # Draw bounding box and class label
        label = f"{cls}: {conf:.2f}"
        annotator.box_label([x1, y1, x2, y2], label, color=color)
    out.write(im0)
    cv2.imshow("instance-segmentation-object-tracking", im0)
    if cv2.waitKey(1) & 0xFF == ord("q"):
        break
out.release()
cap.release()
cv2.destroyAllWindows()