I am working with a custom YOLO-based instance segmentation model in ONNX format. The model takes an input image of shape (1,3,640,640) and outputs 10 tensors:
Bounding Boxes:
- bbox_0: (1, 4, 80, 80)
- bbox_1: (1, 4, 40, 40)
- bbox_2: (1, 4, 20, 20)

Class Probabilities:
- class_prob_0: (1, 10, 80, 80)
- class_prob_1: (1, 10, 40, 40)
- class_prob_2: (1, 10, 20, 20)

Mask Coefficients:
- mask_coeff_0: (1, 32, 80, 80)
- mask_coeff_1: (1, 32, 40, 40)
- mask_coeff_2: (1, 32, 20, 20)

Prototype Mask:
- mask: (1, 32, 160, 160)
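My understanding is that each per-scale head is laid out as (1, C, H, W), with one C-dimensional prediction per grid cell, so before filtering each head has to be flattened to (H*W, C), one row per cell. A minimal sketch of what I mean (flatten_head is my own helper name, not part of the model):

import numpy as np

def flatten_head(t):
    # (1, C, H, W) -> (H*W, C): channels last, one row per grid cell
    _, c, h, w = t.shape
    return t.reshape(c, h * w).T

# e.g. bbox_0 (1, 4, 80, 80) -> (6400, 4), class_prob_0 (1, 10, 80, 80) -> (6400, 10)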
I need to correctly process these outputs to:
- Extract valid object detections and filter redundant ones.
- Assign class labels to detections based on class probabilities.
- Decode mask coefficients to generate segmentation masks.
- Properly overlay instance masks on the original image.
How should I approach post-processing to achieve clean and accurate detections with instance masks?
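For the mask step specifically, my understanding of the YOLACT-style decoding (which YOLOv8-seg also uses) is: take a detection's 32 coefficients, form a linear combination of the 32 prototype masks, apply a sigmoid, then crop to the detection's box and threshold. A minimal sketch of the assembly step, assuming the 160x160 prototype grid covers the whole 640x640 input at 1/4 resolution:

import numpy as np

def assemble_mask(protos, coeffs):
    # protos: (32, 160, 160), coeffs: (32,) -> (160, 160) soft mask in [0, 1]
    m = np.tensordot(coeffs, protos, axes=1)  # linear combination of prototypes
    return 1.0 / (1.0 + np.exp(-m))           # sigmoid

With that in mind, here is my current attempt: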
import cv2
import numpy as np
import onnxruntime as ort
from ultralytics.utils.plotting import Annotator, colors
# Load ONNX model
session = ort.InferenceSession("models/model.onnx", providers=["CPUExecutionProvider"])
cap = cv2.VideoCapture("input/sample.mp4")
w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)
out = cv2.VideoWriter("instance-segmentation-object-tracking.avi", cv2.VideoWriter_fourcc(*"MJPG"), fps, (w, h))
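# Note: the preprocessing below stretches each frame to 640x640 rather than
# letterboxing it. That is self-consistent as long as boxes and masks are scaled
# back with the same per-axis factors (w / 640 and h / 640), as done further down.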
while True:
    ret, im0 = cap.read()
    if not ret:
        print("Video processing completed.")
        break

    # Preprocess frame for ONNX: resize, BGR -> RGB, normalize to [0, 1], NCHW
    im_resized = cv2.resize(im0, (640, 640), interpolation=cv2.INTER_LINEAR)
    im_rgb = cv2.cvtColor(im_resized, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0
    im_rgb = np.ascontiguousarray(np.transpose(im_rgb, (2, 0, 1))[None])  # (1, 3, 640, 640)

    input_name = session.get_inputs()[0].name
    outputs = session.run(None, {input_name: im_rgb})
    # Extract model outputs
    bbox_list = [outputs[i] for i in range(3)]       # bbox_0..bbox_2
    class_probs = [outputs[i] for i in range(3, 6)]  # class_prob_0..class_prob_2
    mask_coeffs = [outputs[i] for i in range(6, 9)]  # mask_coeff_0..mask_coeff_2
    mask_prototype = outputs[9][0]                   # (32, 160, 160)
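    # Assumption: the exporter preserved the output order listed above. If not,
    # map outputs by name instead, e.g.:
    # name_to_out = {o.name: v for o, v in zip(session.get_outputs(), outputs)}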
    # Process bounding boxes per scale
    all_boxes, all_scores, all_classes, all_masks = [], [], [], []
    strides = [8, 16, 32]  # feature-map strides: 640/80, 640/40, 640/20
    for i, stride in enumerate(strides):
        # (1, C, H, W) -> (H*W, C): one row per grid cell, channels last.
        # A plain reshape(-1, C) would interleave cells and channels incorrectly.
        boxes = bbox_list[i][0].reshape(4, -1).T
        probs = class_probs[i][0].reshape(10, -1).T
        masks = mask_coeffs[i][0].reshape(32, -1).T
        # Scale from the 640x640 input space to the original frame, per axis.
        # (This assumes the 4 bbox channels are x1, y1, x2, y2 in input pixels;
        # if they are distances from the cell center, they must first be decoded
        # using the cell grid and `stride`.)
        boxes[:, [0, 2]] *= w / 640.0
        boxes[:, [1, 3]] *= h / 640.0
        scores = probs.max(axis=1)      # max class confidence
        classes = probs.argmax(axis=1)  # class IDs
        valid = scores > 0.3            # confidence threshold
        all_boxes.append(boxes[valid])
        all_scores.append(scores[valid])
        all_classes.append(classes[valid])
        all_masks.append(masks[valid])
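    # The three scales contribute up to 80*80 + 40*40 + 20*20 = 8400 candidate
    # cells in total; most are removed by the confidence filter above.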
    # Flatten and stack results from all scales
    all_boxes = np.vstack(all_boxes)
    all_scores = np.hstack(all_scores)
    all_classes = np.hstack(all_classes)
    all_masks = np.vstack(all_masks)
    if len(all_boxes) == 0:  # no detections above the confidence threshold
        out.write(im0)
        continue
    # Apply Non-Maximum Suppression (NMS)
    boxes_xywh = all_boxes.copy()
    boxes_xywh[:, 2:] -= boxes_xywh[:, :2]  # cv2.dnn.NMSBoxes expects (x, y, w, h)
    indices = cv2.dnn.NMSBoxes(boxes_xywh.tolist(), all_scores.tolist(), 0.3, 0.5)
    if len(indices) == 0:
        out.write(im0)
        continue
    indices = np.array(indices).flatten()
    annotator = Annotator(im0, line_width=2)
    proto = mask_prototype.transpose(1, 2, 0)  # (160, 160, 32): channels last for np.dot
    for idx in indices:
        x1, y1, x2, y2 = map(int, all_boxes[idx])
        # Clamp to the frame and skip degenerate boxes
        x1, y1 = max(x1, 0), max(y1, 0)
        x2, y2 = min(x2, w), min(y2, h)
        if x2 <= x1 or y2 <= y1:
            print("Skipping invalid bbox:", x1, y1, x2, y2)
            continue
        cls, conf = int(all_classes[idx]), all_scores[idx]
        color = colors(cls, True)

        # Reconstruct the instance mask: linear combination of prototypes + sigmoid
        mask = np.dot(proto, all_masks[idx])  # (160, 160) logits over the whole frame
        mask = 1.0 / (1.0 + np.exp(-mask))    # sigmoid -> soft mask in [0, 1]
        # Crop the bbox region in prototype coordinates, then resize that crop to
        # the bbox pixel size (resizing the full-frame mask into the bbox distorts it)
        mx1, my1 = int(x1 * 160 / w), int(y1 * 160 / h)
        mx2 = min(int(np.ceil(x2 * 160 / w)), 160)
        my2 = min(int(np.ceil(y2 * 160 / h)), 160)
        crop = mask[my1:my2, mx1:mx2]
        crop = cv2.resize(crop, (x2 - x1, y2 - y1), interpolation=cv2.INTER_LINEAR)
        mask = (crop > 0.5).astype(np.uint8)  # threshold after resizing for smoother edges
        # Apply the mask only inside the bbox; shapes match by construction
        overlay = im0[y1:y2, x1:x2]
        colored_mask = np.zeros_like(overlay)
        colored_mask[mask.astype(bool)] = color  # paint the instance in its class color
        im0[y1:y2, x1:x2] = cv2.addWeighted(overlay, 1.0, colored_mask, 0.5, 0)
        # Draw bounding box and class label
        label = f"{cls}: {conf:.2f}"
        annotator.box_label([x1, y1, x2, y2], label, color=color)
    out.write(im0)
    cv2.imshow("instance-segmentation-object-tracking", im0)
    if cv2.waitKey(1) & 0xFF == ord("q"):
        break
out.release()
cap.release()
cv2.destroyAllWindows()