[[15, 18, 21], 1, Segment, [nc, 32, 256]] # Segment(P3, P4, P5)


# Ultralytics YOLO 🚀, AGPL-3.0 license
# YOLOv8-seg instance segmentation model. For Usage examples see https://docs.ultralytics.com/tasks/segment

# Parameters
nc: 80 # number of classes
scales: # model compound scaling constants, i.e. 'model=yolov8n-seg.yaml' will call yolov8-seg.yaml with scale 'n'
  # [depth, width, max_channels]
  n: [0.33, 0.25, 1024]
  s: [0.33, 0.50, 1024]
  m: [0.67, 0.75, 768]
  l: [1.00, 1.00, 512]
  x: [1.00, 1.25, 512]

# YOLOv8.0n backbone
backbone:
  # [from, repeats, module, args]
  - [-1, 1, Conv, [64, 3, 2]] # 0-P1/2
  - [-1, 1, Conv, [128, 3, 2]] # 1-P2/4
  - [-1, 3, C2f, [128, True]]
  - [-1, 1, Conv, [256, 3, 2]] # 3-P3/8
  - [-1, 6, C2f, [256, True]]
  - [-1, 1, Conv, [512, 3, 2]] # 5-P4/16
  - [-1, 6, C2f, [512, True]]
  - [-1, 1, Conv, [1024, 3, 2]] # 7-P5/32
  - [-1, 3, C2f, [1024, True]]
  - [-1, 1, SPPF, [1024, 5]] # 9

# YOLOv8.0n head
head:
  - [-1, 1, nn.Upsample, [None, 2, "nearest"]]
  - [[-1, 6], 1, Concat, [1]] # cat backbone P4
  - [-1, 3, C2f, [512]] # 12

  - [-1, 1, nn.Upsample, [None, 2, "nearest"]]
  - [[-1, 4], 1, Concat, [1]] # cat backbone P3
  - [-1, 3, C2f, [256]] # 15 (P3/8-small)

  - [-1, 1, Conv, [256, 3, 2]]
  - [[-1, 12], 1, Concat, [1]] # cat head P4
  - [-1, 3, C2f, [512]] # 18 (P4/16-medium)

  - [-1, 1, Conv, [512, 3, 2]]
  - [[-1, 9], 1, Concat, [1]] # cat head P5
  - [-1, 3, C2f, [1024]] # 21 (P5/32-large)

  - [[15, 18, 21], 1, Segment, [nc, 32, 256]] # Segment(P3, P4, P5)




class Detect(nn.Module):
    """YOLOv8 Detect head for detection models."""

    dynamic = False  # force grid reconstruction
    export = False  # export mode
    end2end = False  # end2end
    max_det = 300  # max_det
    shape = None
    anchors = torch.empty(0)  # init
    strides = torch.empty(0)  # init

    def __init__(self, nc=80, ch=()):
        """
        Initializes the YOLOv8 detection layer with specified number of classes and channels.

        Args:
            nc (int): Number of classes.
            ch (tuple): Input channel count of each feature level; one detection layer is built per entry.
        """
        super().__init__()
        self.nc = nc  # number of classes
        self.nl = len(ch)  # number of detection layers
        self.reg_max = 16  # DFL channels (ch[0] // 16 to scale 4/8/12/16/20 for n/s/m/l/x)
        self.no = nc + self.reg_max * 4  # number of outputs per anchor
        self.stride = torch.zeros(self.nl)  # strides computed during build
        c2, c3 = max((16, ch[0] // 4, self.reg_max * 4)), max(ch[0], min(self.nc, 100))  # channels
        # cv2: box-regression branch (4 * reg_max output channels per level)
        self.cv2 = nn.ModuleList(
            nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, 4 * self.reg_max, 1)) for x in ch
        )
        # cv3: classification branch (nc output channels per level)
        self.cv3 = nn.ModuleList(
            nn.Sequential(Conv(x, c3, 3), Conv(c3, c3, 3), nn.Conv2d(c3, self.nc, 1)) for x in ch
        )
        self.dfl = DFL(self.reg_max) if self.reg_max > 1 else nn.Identity()

        if self.end2end:
            self.one2one_cv2 = copy.deepcopy(self.cv2)
            self.one2one_cv3 = copy.deepcopy(self.cv3)

    def forward(self, x):
        """
        Concatenates and returns predicted bounding boxes and class probabilities.

        Note that the list `x` is modified in place: each entry is replaced by the
        concatenated box/class map of that level.
        """
        if self.end2end:
            return self.forward_end2end(x)

        for i in range(self.nl):
            x[i] = torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1)
        if self.training:  # Training path
            return x
        y = self._inference(x)
        return y if self.export else (y, x)

    def forward_end2end(self, x):
        """
        Forward pass used when `end2end` is enabled.

        NOTE(review): only the last three statements of this method survived in the
        excerpt (they were fused into `forward`); the head below is reconstructed from
        the `one2one_cv2` / `one2one_cv3` branches created in `__init__` and the
        dictionary keys of the return value -- confirm against the upstream source.
        `self.postprocess` is not part of this excerpt either.
        """
        x_detach = [xi.detach() for xi in x]
        one2one = [
            torch.cat((self.one2one_cv2[i](x_detach[i]), self.one2one_cv3[i](x_detach[i])), 1)
            for i in range(self.nl)
        ]
        for i in range(self.nl):
            x[i] = torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1)
        if self.training:  # Training path
            return {"one2many": x, "one2one": one2one}

        y = self._inference(one2one)
        y = self.postprocess(y.permute(0, 2, 1), self.max_det, self.nc)
        return y if self.export else (y, {"one2many": x, "one2one": one2one})

    def _inference(self, x):
        """Decode predicted bounding boxes and class probabilities based on multiple-level feature maps."""
        # Inference path
        shape = x[0].shape  # BCHW
        # Flatten the spatial dims of every level and join them along the anchor axis
        x_cat = torch.cat([xi.view(shape[0], self.no, -1) for xi in x], 2)
        if self.dynamic or self.shape != shape:
            # Anchors/strides are cached and only rebuilt when the input shape changes
            self.anchors, self.strides = (t.transpose(0, 1) for t in make_anchors(x, self.stride, 0.5))
            self.shape = shape

        # NOTE(review): `self.format` is not defined in this excerpt; it is only read when `export` is true.
        if self.export and self.format in {"saved_model", "pb", "tflite", "edgetpu", "tfjs"}:  # avoid TF FlexSplitV ops
            box = x_cat[:, : self.reg_max * 4]
            cls = x_cat[:, self.reg_max * 4 :]
        else:
            box, cls = x_cat.split((self.reg_max * 4, self.nc), 1)

        if self.export and self.format in {"tflite", "edgetpu"}:
            # Precompute normalization factor to increase numerical stability
            # See https://github.com/ultralytics/ultralytics/issues/7371
            grid_h = shape[2]
            grid_w = shape[3]
            grid_size = torch.tensor([grid_w, grid_h, grid_w, grid_h], device=box.device).reshape(1, 4, 1)
            norm = self.strides / (self.stride[0] * grid_size)
            dbox = self.decode_bboxes(self.dfl(box) * norm, self.anchors.unsqueeze(0) * norm[:, :2])
        else:
            dbox = self.decode_bboxes(self.dfl(box), self.anchors.unsqueeze(0)) * self.strides

        return torch.cat((dbox, cls.sigmoid()), 1)



class Segment(Detect):
    """YOLOv8 Segment head for segmentation models."""

    def __init__(self, nc=80, nm=32, npr=256, ch=()):
        """
        Initialize the YOLO model attributes such as the number of masks, prototypes, and the convolution layers.

        Args:
            nc (int): Number of classes.
            nm (int): Number of masks (mask coefficients predicted per anchor).
            npr (int): Number of protos (second argument passed to `Proto`).
            ch (tuple): Input channel count of each feature level.
        """
        super().__init__(nc, ch)
        self.nm = nm  # number of masks
        self.npr = npr  # number of protos
        self.proto = Proto(ch[0], self.npr, self.nm)  # protos, built from the first feature level

        c4 = max(ch[0] // 4, self.nm)
        # cv4: mask-coefficient branch (nm output channels per level)
        self.cv4 = nn.ModuleList(
            nn.Sequential(Conv(x, c4, 3), Conv(c4, c4, 3), nn.Conv2d(c4, self.nm, 1)) for x in ch
        )

    def forward(self, x):
        """
        Return model outputs and mask coefficients if training, otherwise return outputs and mask coefficients.

        Returns:
            Training: `(x, mc, p)`.
            Export: `(torch.cat([x, mc], 1), p)`.
            Otherwise: `(torch.cat([x[0], mc], 1), (x[1], mc, p))`.
        """
        p = self.proto(x[0])  # mask protos
        bs = p.shape[0]  # batch size

        # Must run before Detect.forward, which overwrites the entries of `x` in place.
        mc = torch.cat([self.cv4[i](x[i]).view(bs, self.nm, -1) for i in range(self.nl)], 2)  # mask coefficients
        x = Detect.forward(self, x)
        if self.training:
            return x, mc, p
        return (torch.cat([x, mc], 1), p) if self.export else (torch.cat([x[0], mc], 1), (x[1], mc, p))



上述第一个操作便是 Proto 操作,传入的是第一个尺度的输出特征图 x[0]。Proto 的功能是对 x[0] 进行卷积,并通过上采样(转置卷积)将原来 80x80 大小的特征图变为 160x160,得到的结果就是基础蒙版(prototype masks)。

p = self.proto(x[0])
Proto((cv1): Conv((conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(act): SiLU(inplace=True))(upsample): ConvTranspose2d(64, 64, kernel_size=(2, 2), stride=(2, 2))(cv2): Conv((conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(act): SiLU(inplace=True))(cv3): Conv((conv): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1))(act): SiLU(inplace=True))


ModuleList((0): Sequential((0): Conv((conv): Conv2d(64, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))   #CBS模块(act): SiLU(inplace=True))(1): Conv((conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))   #CBS模块(act): SiLU(inplace=True))(2): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1))     #用于通道维度转换,Conv2d)(1): Sequential((0): Conv((conv): Conv2d(128, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(act): SiLU(inplace=True))(1): Conv((conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(act): SiLU(inplace=True))(2): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1)))(2): Sequential((0): Conv((conv): Conv2d(256, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(act): SiLU(inplace=True))(1): Conv((conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(act): SiLU(inplace=True))(2): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1)))


for i in range(self.nl):x[i] = torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1)


ModuleList((0): Sequential((0): Conv((conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(act): SiLU(inplace=True))(1): Conv((conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(act): SiLU(inplace=True))(2): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1)))(1): Sequential((0): Conv((conv): Conv2d(128, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(act): SiLU(inplace=True))(1): Conv((conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(act): SiLU(inplace=True))(2): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1)))(2): Sequential((0): Conv((conv): Conv2d(256, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(act): SiLU(inplace=True))(1): Conv((conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(act): SiLU(inplace=True))(2): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1)))


ModuleList((0): Sequential((0): Conv((conv): Conv2d(64, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(act): SiLU(inplace=True))(1): Conv((conv): Conv2d(80, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(act): SiLU(inplace=True))(2): Conv2d(80, 80, kernel_size=(1, 1), stride=(1, 1)))(1): Sequential((0): Conv((conv): Conv2d(128, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(act): SiLU(inplace=True))(1): Conv((conv): Conv2d(80, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(act): SiLU(inplace=True))(2): Conv2d(80, 80, kernel_size=(1, 1), stride=(1, 1)))(2): Sequential((0): Conv((conv): Conv2d(256, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(act): SiLU(inplace=True))(1): Conv((conv): Conv2d(80, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(act): SiLU(inplace=True))(2): Conv2d(80, 80, kernel_size=(1, 1), stride=(1, 1)))



self.no = nc + self.reg_max * 4。代码中 reg_max 固定为 16(源码注释提到可以按 ch[0] // 16 将其缩放为 4/8/12/16/20,分别对应 n/s/m/l/x),因此此处 reg_max=16,no = 80 + 16 × 4 = 144。

def _inference(self, x):
    """
    Decode predicted bounding boxes and class probabilities based on multiple-level feature maps.

    The shapes quoted in the comments are the values reported by the article for a
    single-image run; they are illustrative, not guaranteed by the code.
    """
    # Inference path
    shape = x[0].shape  # BCHW, e.g. (1, 144, 80, 80)
    # e.g. (1, 144, 8400), where 8400 = 80*80 + 40*40 + 20*20
    x_cat = torch.cat([xi.view(shape[0], self.no, -1) for xi in x], 2)
    if self.dynamic or self.shape != shape:
        # Anchors/strides are cached and only rebuilt when the input shape changes
        self.anchors, self.strides = (t.transpose(0, 1) for t in make_anchors(x, self.stride, 0.5))
        self.shape = shape

    if self.export and self.format in {"saved_model", "pb", "tflite", "edgetpu", "tfjs"}:  # avoid TF FlexSplitV ops
        # Not executed in the article's (non-export) run
        box = x_cat[:, : self.reg_max * 4]
        cls = x_cat[:, self.reg_max * 4 :]
    else:
        box, cls = x_cat.split((self.reg_max * 4, self.nc), 1)

    if self.export and self.format in {"tflite", "edgetpu"}:
        # Not executed in the article's (non-export) run
        # Precompute normalization factor to increase numerical stability
        # See https://github.com/ultralytics/ultralytics/issues/7371
        grid_h = shape[2]
        grid_w = shape[3]
        grid_size = torch.tensor([grid_w, grid_h, grid_w, grid_h], device=box.device).reshape(1, 4, 1)
        norm = self.strides / (self.stride[0] * grid_size)
        dbox = self.decode_bboxes(self.dfl(box) * norm, self.anchors.unsqueeze(0) * norm[:, :2])
    else:
        dbox = self.decode_bboxes(self.dfl(box), self.anchors.unsqueeze(0)) * self.strides

    return torch.cat((dbox, cls.sigmoid()), 1)

self.anchors 的形状为 torch.Size([2, 8400]),self.strides 的形状为 torch.Size([1, 8400])。
shape 为 torch.Size([1, 144, 80, 80]),其中 144 = 64 + 80:64(= 4 × reg_max)是预测 box 的通道,最后还要经过 DFL 解码转换为 4 个坐标值;80 是类别数。


dbox = self.decode_bboxes(self.dfl(box), self.anchors.unsqueeze(0)) * self.strides


最后将 dbox(解码后的检测框)与 cls(类别)拼接后返回:

return torch.cat((dbox, cls.sigmoid()), 1)



return (torch.cat([x, mc], 1), p) if self.export else (torch.cat([x[0], mc], 1), (x[1], mc, p))








# Columns of `prediction` left over after 4 box values and nc class scores
nm = prediction.shape[1] - nc - 4  # number of masks
mi = 4 + nc  # presumably the column index where the mask coefficients start -- confirm against its use


# One empty (0, 6 + nm) tensor repeated bs times; list multiplication reuses the same tensor object
output = [torch.zeros((0, 6 + nm), device=prediction.device)] * bs


将 box、类别以及分割 mask 系数分开:

box, cls, mask = x.split((4, nc, nm), 1)



# Best class score per row and the index of that class (both keep a trailing dim of 1)
conf, j = cls.max(1, keepdim=True)
# Re-pack each row as [box, conf, class index, mask] and keep only rows whose conf exceeds conf_thres
x = torch.cat((box, conf, j.float(), mask), 1)[conf.view(-1) > conf_thres]

得到的 conf 为分值,j 为最大值所在的索引(代表类别编号),维度均为 (36,1)。将这些数据再次拼接到一起,得到 (36,38),其中 36 为目标个数,38 = 4+1+1+32,即 box + conf + cls_id + mask。

# Per-class offset: with agnostic=False each class is shifted by class_id * max_wh so that
# boxes of different classes do not suppress each other in a single NMS call.
# max_wh is defined earlier in the function (the article quotes 7860 -- TODO confirm the value).
c = x[:, 5:6] * (0 if agnostic else max_wh)  # classes
scores = x[:, 4]  # scores
if rotated:
    boxes = torch.cat((x[:, :2] + c, x[:, 2:4], x[:, -1:]), dim=-1)  # xywhr
    i = nms_rotated(boxes, scores, iou_thres)
else:
    # This branch runs in the article's example: NMS on the boxes via torchvision
    boxes = x[:, :4] + c  # boxes (offset by class)
    i = torchvision.ops.nms(boxes, scores, iou_thres)  # NMS
i = i[:max_det]  # keep at most max_det detections (300 in the article's run)
output[xi] = x[i]

返回的 i 为 tensor([20, 24, 3, 32, 34], device='cuda:0'),这里给出的 i 是 36 个候选框中经过筛选后保留的检测框编号。最终将 x 中对应的目标筛选出来存储到 output 中。可以看到 output 是一个列表,存放的是每个 batch 的结果;由于在预测时只输入一张图像,故里面只有一个数据,筛选出的结果为 (5,38),即有 5 个目标。




def postprocess(self, preds, img, orig_imgs):






p = ops.non_max_suppression(preds[0],self.args.conf,self.args.iou,agnostic=self.args.agnostic_nms,max_det=self.args.max_det,nc=len(self.model.names),classes=self.args.classes,)

得到的结果 p 是一个列表,每张图像对应一个元素;这里只有一张图像,该元素的形状即为 (5,38)。


proto = preds[1][-1] if isinstance(preds[1], tuple) else preds[1]


for i, (pred, orig_img, img_path) in enumerate(zip(p, orig_imgs, self.batch[0])):if not len(pred):  # save empty boxesmasks = Noneelif self.args.retina_masks:pred[:, :4] = ops.scale_boxes(img.shape[2:], pred[:, :4], orig_img.shape)masks = ops.process_mask_native(proto[i], pred[:, 6:], pred[:, :4], orig_img.shape[:2])  # HWCelse:#执行的是这个分支masks = ops.process_mask(proto[i], pred[:, 6:], pred[:, :4], img.shape[2:], upsample=True)  # HWCpred[:, :4] = ops.scale_boxes(img.shape[2:], pred[:, :4], orig_img.shape)results.append(Results(orig_img, path=img_path, names=self.model.names, boxes=pred[:, :6], masks=masks))return results


masks = ops.process_mask(proto[i], pred[:, 6:], pred[:, :4], img.shape[2:], upsample=True)


def process_mask(protos, masks_in, bboxes, shape, upsample=False):
    """
    Build per-object binary masks from prototype masks and per-object mask coefficients.

    Each object's mask is the linear combination of the prototypes weighted by its
    coefficients, cropped to the object's box and thresholded at 0.

    Args:
        protos (torch.Tensor): Prototype masks of shape (c, mh, mw); (32, 120, 160) in the article's run.
        masks_in (torch.Tensor): Mask coefficients with c values per object; 5 objects in the article's run.
        bboxes (torch.Tensor): Boxes as (x1, y1, x2, y2) columns, in the coordinates of `shape`.
        shape (tuple): Target size (ih, iw); (480, 640) in the article's run.
        upsample (bool): If True, bilinearly resize the masks to `shape` before thresholding.

    Returns:
        (torch.Tensor): Masks thresholded in place with `gt_(0.0)`; one (mh, mw) map per
            object, or one (ih, iw) map per object when `upsample` is True.
    """
    c, mh, mw = protos.shape  # CHW
    ih, iw = shape
    # (n, c) @ (c, mh * mw) -> (n, mh * mw), reshaped to one map per object
    masks = (masks_in @ protos.float().view(c, -1)).view(-1, mh, mw)  # CHW

    # Scale the boxes from `shape` coordinates down to prototype resolution (ratio 0.25 in the article's run)
    width_ratio = mw / iw
    height_ratio = mh / ih

    downsampled_bboxes = bboxes.clone()  # clone so the caller's boxes are left untouched
    downsampled_bboxes[:, 0] *= width_ratio
    downsampled_bboxes[:, 2] *= width_ratio
    downsampled_bboxes[:, 3] *= height_ratio
    downsampled_bboxes[:, 1] *= height_ratio

    masks = crop_mask(masks, downsampled_bboxes)  # CHW
    if upsample:
        masks = F.interpolate(masks[None], shape, mode="bilinear", align_corners=False)[0]  # CHW
    return masks.gt_(0.0)
bbox缩放前的值:
tensor([[2.3550e+02, 1.1798e+02, 3.6113e+02, 3.4263e+02],[2.7596e-01, 1.5049e+02, 1.8605e+02, 4.1289e+02],[5.9170e+02, 1.7436e+02, 6.3966e+02, 3.0410e+02],[9.4319e+00, 1.5594e+02, 4.7882e+02, 4.7825e+02],[3.0707e+01, 1.3653e+02, 4.7746e+02, 4.7805e+02]], device='cuda:0')
bbox缩小后的值:
tensor([[5.8876e+01, 2.9494e+01, 9.0283e+01, 8.5657e+01],[6.8989e-02, 3.7624e+01, 4.6512e+01, 1.0322e+02],[1.4793e+02, 4.3590e+01, 1.5992e+02, 7.6024e+01],[2.3580e+00, 3.8984e+01, 1.1971e+02, 1.1956e+02],[7.6767e+00, 3.4133e+01, 1.1936e+02, 1.1951e+02]], device='cuda:0')


masks = crop_mask(masks, downsampled_bboxes)
def crop_mask(masks, boxes):
    """
    Zero out every mask value that lies outside the mask's bounding box.

    Args:
        masks (torch.Tensor): Masks of shape (n, h, w).
        boxes (torch.Tensor): Boxes of shape (n, 4) as (x1, y1, x2, y2) in mask pixel coordinates.

    Returns:
        (torch.Tensor): Tensor of shape (n, h, w) equal to `masks` inside the box
            (x1 <= x < x2 and y1 <= y < y2) and 0 elsewhere.
    """
    _, h, w = masks.shape
    x1, y1, x2, y2 = torch.chunk(boxes[:, :, None], 4, 1)  # each of shape (n, 1, 1)
    # x coordinate of every column, shape (1, 1, w)
    col_idx = torch.arange(w, device=masks.device, dtype=x1.dtype)[None, None, :]
    # y coordinate of every row, shape (1, h, 1)
    row_idx = torch.arange(h, device=masks.device, dtype=x1.dtype)[None, :, None]

    # Broadcasting the four comparisons yields an (n, h, w) boolean "inside the box" mask
    return masks * ((col_idx >= x1) * (col_idx < x2) * (row_idx >= y1) * (row_idx < y2))



在检测头(分割头)中输出的32 维的向量可以看作是与每个检测框关联的分割 mask 的系数或权重。

针对分割头的输出 1x32x160x160,一个关键的概念是 prototype masks。它是一组固定数量(32 个)的基础 mask,每个 mask 的尺寸为 160×160。这些基础 mask 并不直接对应于任何特定的物体或类别,而是被设计为可以通过线性组合来表示任何可能的物体 mask。

简单来说,模型不直接预测每个物体的完整 mask,而是预测一组基本的 masks(称为 prototype masks)以及每个物体如何组合这些 masks(权重/系数)。这种方法的好处是,模型只需要预测一个较小的 mask张量,然后可以通过简单的矩阵乘法将这些小 mask 组合成完整的物体 masks

大家可以把它类比于线性代数中基向量的概念:空间中的任何一个向量都可以表示为一组基向量的线性组合。那么其中的 prototype masks(即 32x160x160 的 mask 张量)可以理解为一组基向量,而之前检测框中的 32 维向量可以理解为组合这一组基向量的权重,或者说系数。

当我们从检测头得到一个 32 维的向量,分割头得到 32 个基础 masks 时,这个 32 维的向量实际上表示了如何组合这些基础masks 来得到一个特定物体的 mask。具体来说,我们用这个 32 维向量对 32 个基础 masks进行线性组合,从而得到与检测框关联的最终 mask。简单来说,这就像你现在有 32 种不同的颜料,检测头给你一个配方(32 维向量),告诉你如何混合这些颜料来得到一个特定的颜色(最终的 mask)。

这样做的优点是我们不需要为每个检测框都预测一个完整的 mask,这个非常消耗内存和计算资源。相反,我们只需要预测一个相对较小的 32 维向量和一个固定数量的基础 masks,然后在后处理中进行组合即可。



import cv2
import numpy as np
def hsv2bgr(h, s, v):
    """
    Convert an HSV colour to an 8-bit BGR tuple.

    Args:
        h (float): Hue, expected in [0, 1]; h == 1.0 wraps around to the same colour as h == 0.0.
        s (float): Saturation in [0, 1].
        v (float): Value (brightness) in [0, 1].

    Returns:
        (tuple[int, int, int]): (blue, green, red), each in 0..255.
    """
    scaled = h * 6
    sector = int(scaled)
    f = scaled - sector  # position inside the current hue sector
    p = v * (1 - s)
    q = v * (1 - f * s)
    t = v * (1 - (1 - f) * s)

    rgb_by_sector = (
        (v, t, p),
        (q, v, p),
        (p, v, t),
        (p, q, v),
        (t, p, v),
        (v, p, q),
    )
    # Modulo keeps h == 1.0 (sector 6) on the colour wheel instead of falling through to black
    r, g, b = rgb_by_sector[sector % 6]
    return int(b * 255), int(g * 255), int(r * 255)


def random_color(id):
    """
    Return a deterministic pseudo-random BGR colour for an integer id.

    The same id always maps to the same colour, so every class keeps one colour
    across masks and boxes. (The parameter name shadows the builtin `id`; it is
    kept so existing callers are unaffected.)
    """
    h_plane = (((id << 2) ^ 0x937151) % 100) / 100.0
    s_plane = (((id << 3) ^ 0x315793) % 100) / 100.0
    return hsv2bgr(h_plane, s_plane, 1)


def main():
    """Run YOLOv8-seg on img.jpg, overlay masks and labelled boxes, and save predict-seg.jpg."""
    # Imported here so the colour helpers above can be imported without ultralytics installed
    from ultralytics import YOLO

    model = YOLO("yolov8n-seg.pt")

    img = cv2.imread("img.jpg")
    if img is None:  # cv2.imread returns None instead of raising on a missing/unreadable file
        raise FileNotFoundError("could not read image 'img.jpg'")

    result = model(img)[0]
    names = result.names
    boxes = result.boxes.data.tolist()
    masks = result.masks
    h, w = img.shape[:2]

    # result.masks is None when nothing was detected
    if masks is not None:
        for i, mask in enumerate(masks.data):
            mask = mask.cpu().numpy().astype(np.uint8)
            mask_resized = cv2.resize(mask, (w, h))

            label = int(boxes[i][5])
            color = np.array(random_color(label))

            # Blend 60% image with 40% class colour on the mask pixels only
            mask_indices = mask_resized == 1
            img[mask_indices] = (img[mask_indices] * 0.6 + color * 0.4).astype(np.uint8)

    for obj in boxes:
        left, top, right, bottom = int(obj[0]), int(obj[1]), int(obj[2]), int(obj[3])
        confidence = obj[4]
        label = int(obj[5])
        color = random_color(label)
        cv2.rectangle(img, (left, top), (right, bottom), color=color, thickness=2, lineType=cv2.LINE_AA)

        caption = f"{names[label]} {confidence:.2f}"
        # Separate names so the image size (h, w) is not overwritten by the text size
        text_w, _text_h = cv2.getTextSize(caption, 0, 1, 2)[0]
        cv2.rectangle(img, (left - 3, top - 33), (left + text_w + 10, top), color, -1)
        cv2.putText(img, caption, (left, top - 5), 0, 1, (0, 0, 0), 2, cv2.LINE_AA)

    cv2.imwrite("predict-seg.jpg", img)
    print("save done")


if __name__ == "__main__":
    main()


