SAM 3 / 3.1 是什么?
SAM 3 是 Meta Segment Anything Model 系列的新模型。
以前的 SAM 主要只是理解物体的形状。如果想切出指定对象,需要用 BBOX 或坐标来指定位置。
SAM 3 则可以像 VLM 一样用文本指定对象,并且单独完成分割。
SAM 3.1 是 SAM 3 的更新版,改进了视频中多个对象的追踪处理。
模型下载
- sam3.1_multiplex_fp16.safetensors (1.75 GB)
📂ComfyUI/
└── 📂models/
    └── 📂checkpoints/
        └── sam3.1_multiplex_fp16.safetensors
workflow
静态图像

{
"id": "0c99f612-ade6-434c-9aef-a452aeb77f92",
"revision": 0,
"last_node_id": 12,
"last_link_id": 11,
"nodes": [
{
"id": 3,
"type": "CheckpointLoaderSimple",
"pos": [
-1060.964981835579,
698.12753964874
],
"size": [
297.3094587159344,
98
],
"flags": {},
"order": 0,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "MODEL",
"type": "MODEL",
"links": [
1
]
},
{
"name": "CLIP",
"type": "CLIP",
"links": [
5
]
},
{
"name": "VAE",
"type": "VAE",
"links": null
}
],
"properties": {
"cnr_id": "comfy-core",
"ver": "0.19.3",
"Node name for S&R": "CheckpointLoaderSimple"
},
"widgets_values": [
"sam3.1_multiplex_fp16.safetensors"
],
"color": "#232",
"bgcolor": "#353"
},
{
"id": 12,
"type": "InvertMask",
"pos": [
-189.9197221538626,
698.12753964874
],
"size": [
140,
26
],
"flags": {},
"order": 4,
"mode": 0,
"inputs": [
{
"name": "mask",
"type": "MASK",
"link": 10
}
],
"outputs": [
{
"name": "MASK",
"type": "MASK",
"links": [
11
]
}
],
"properties": {
"cnr_id": "comfy-core",
"ver": "0.19.3",
"Node name for S&R": "InvertMask"
},
"widgets_values": []
},
{
"id": 8,
"type": "JoinImageWithAlpha",
"pos": [
-20.040925416533565,
952.4472834503921
],
"size": [
214.7865573347107,
46
],
"flags": {},
"order": 5,
"mode": 0,
"inputs": [
{
"name": "image",
"type": "IMAGE",
"link": 7
},
{
"name": "alpha",
"type": "MASK",
"link": 11
}
],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
9
]
}
],
"properties": {
"cnr_id": "comfy-core",
"ver": "0.19.3",
"Node name for S&R": "JoinImageWithAlpha"
},
"widgets_values": []
},
{
"id": 9,
"type": "LoadImage",
"pos": [
-813.0177595344426,
952.4472834503921
],
"size": [
335.8566793646694,
366.72727272727286
],
"flags": {},
"order": 1,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
6,
7
]
},
{
"name": "MASK",
"type": "MASK",
"links": null
}
],
"properties": {
"cnr_id": "comfy-core",
"ver": "0.19.3",
"Node name for S&R": "LoadImage"
},
"widgets_values": [
"ec87a3373da827a80d3d19bcf66df7cd.jpg",
"image"
]
},
{
"id": 5,
"type": "CLIPTextEncode",
"pos": [
-728.3297522774851,
771.8199768517986
],
"size": [
250.42005327504967,
109.84222389181082
],
"flags": {},
"order": 2,
"mode": 0,
"inputs": [
{
"name": "clip",
"type": "CLIP",
"link": 5
}
],
"outputs": [
{
"name": "CONDITIONING",
"type": "CONDITIONING",
"links": [
2
]
}
],
"properties": {
"cnr_id": "comfy-core",
"ver": "0.19.3",
"Node name for S&R": "CLIPTextEncode"
},
"widgets_values": [
"human:99"
],
"color": "#232",
"bgcolor": "#353"
},
{
"id": 10,
"type": "PreviewImage",
"pos": [
233.07005996484492,
952.4472834503921
],
"size": [
455.6999999999998,
354.89999999999986
],
"flags": {},
"order": 6,
"mode": 0,
"inputs": [
{
"name": "images",
"type": "IMAGE",
"link": 9
}
],
"outputs": [],
"properties": {
"cnr_id": "comfy-core",
"ver": "0.19.3",
"Node name for S&R": "PreviewImage"
},
"widgets_values": []
},
{
"id": 1,
"type": "SAM3_Detect",
"pos": [
-428.0779475923605,
698.12753964874
],
"size": [
210,
206
],
"flags": {},
"order": 3,
"mode": 0,
"inputs": [
{
"label": "model",
"name": "model",
"type": "MODEL",
"link": 1
},
{
"label": "image",
"name": "image",
"type": "IMAGE",
"link": 6
},
{
"label": "conditioning",
"name": "conditioning",
"shape": 7,
"type": "CONDITIONING",
"link": 2
},
{
"label": "bboxes",
"name": "bboxes",
"shape": 7,
"type": "BOUNDING_BOX",
"link": null
},
{
"label": "positive_coords",
"name": "positive_coords",
"shape": 7,
"type": "STRING",
"link": null
},
{
"label": "negative_coords",
"name": "negative_coords",
"shape": 7,
"type": "STRING",
"link": null
}
],
"outputs": [
{
"name": "masks",
"type": "MASK",
"links": [
10
]
},
{
"name": "bboxes",
"type": "BOUNDING_BOX",
"links": null
}
],
"properties": {
"cnr_id": "comfy-core",
"ver": "0.19.3",
"Node name for S&R": "SAM3_Detect"
},
"widgets_values": [
0.5,
2,
false
],
"color": "#232",
"bgcolor": "#353"
}
],
"links": [
[
1,
3,
0,
1,
0,
"MODEL"
],
[
2,
5,
0,
1,
2,
"CONDITIONING"
],
[
5,
3,
1,
5,
0,
"CLIP"
],
[
6,
9,
0,
1,
1,
"IMAGE"
],
[
7,
9,
0,
8,
0,
"IMAGE"
],
[
9,
8,
0,
10,
0,
"IMAGE"
],
[
10,
1,
0,
12,
0,
"MASK"
],
[
11,
12,
0,
8,
1,
"MASK"
]
],
"groups": [],
"config": {},
"extra": {
"ds": {
"scale": 0.9090909090909091,
"offset": [
1279.743745342885,
-482.20165823823663
]
},
"frontendVersion": "1.42.14",
"VHS_latentpreview": false,
"VHS_latentpreviewrate": 0,
"VHS_MetadataImage": true,
"VHS_KeepIntermediate": true
},
"version": 0.4
}
- 向 `SAM3 Detect` 节点输入图像、蒙版,以及想切出的对象信息(文本提示词、BBOX、坐标)。
- 这个规格有点绕。如果有多个对象符合提示词,只写 `car` 的话,只会检测其中最像的一个。
  - 如果想分割到第 N 个对象,需要写成 `car:N`。
  - 如果只是想检测画面中所有符合条件的对象,直接写成 `car:99` 也可以。
视频
{
"id": "336e23e1-fe7a-4d35-a30e-888a29b97ddb",
"revision": 0,
"last_node_id": 18,
"last_link_id": 19,
"nodes": [
{
"id": 5,
"type": "CLIPTextEncode",
"pos": [
-728.3297522774851,
771.8199768517986
],
"size": [
250.42005327504967,
109.84222389181082
],
"flags": {},
"order": 2,
"mode": 0,
"inputs": [
{
"name": "clip",
"type": "CLIP",
"link": 5
}
],
"outputs": [
{
"name": "CONDITIONING",
"type": "CONDITIONING",
"links": [
12
]
}
],
"properties": {
"cnr_id": "comfy-core",
"ver": "0.19.3",
"Node name for S&R": "CLIPTextEncode"
},
"widgets_values": [
"human:99"
],
"color": "#232",
"bgcolor": "#353"
},
{
"id": 3,
"type": "CheckpointLoaderSimple",
"pos": [
-1060.964981835579,
698.12753964874
],
"size": [
297.3094587159344,
98
],
"flags": {},
"order": 0,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "MODEL",
"type": "MODEL",
"links": [
14
]
},
{
"name": "CLIP",
"type": "CLIP",
"links": [
5
]
},
{
"name": "VAE",
"type": "VAE",
"links": null
}
],
"properties": {
"cnr_id": "comfy-core",
"ver": "0.19.3",
"Node name for S&R": "CheckpointLoaderSimple"
},
"widgets_values": [
"sam3.1_multiplex_fp16.safetensors"
],
"color": "#232",
"bgcolor": "#353"
},
{
"id": 15,
"type": "SAM3_TrackToMask",
"pos": [
-182.9072624896289,
678.7675396487398
],
"size": [
210,
58
],
"flags": {},
"order": 4,
"mode": 0,
"inputs": [
{
"label": "track_data",
"name": "track_data",
"type": "SAM3_TRACK_DATA",
"link": 16
}
],
"outputs": [
{
"name": "masks",
"type": "MASK",
"links": null
}
],
"properties": {
"cnr_id": "comfy-core",
"ver": "0.19.3",
"Node name for S&R": "SAM3_TrackToMask"
},
"widgets_values": [
""
],
"color": "#232",
"bgcolor": "#353"
},
{
"id": 13,
"type": "SAM3_VideoTrack",
"pos": [
-428.0779475923605,
678.7675396487398
],
"size": [
215.80859375,
166
],
"flags": {},
"order": 3,
"mode": 0,
"inputs": [
{
"label": "images",
"name": "images",
"type": "IMAGE",
"link": 17
},
{
"label": "model",
"name": "model",
"type": "MODEL",
"link": 14
},
{
"label": "initial_mask",
"name": "initial_mask",
"shape": 7,
"type": "MASK",
"link": null
},
{
"label": "conditioning",
"name": "conditioning",
"shape": 7,
"type": "CONDITIONING",
"link": 12
}
],
"outputs": [
{
"name": "track_data",
"type": "SAM3_TRACK_DATA",
"links": [
16,
18
]
}
],
"properties": {
"cnr_id": "comfy-core",
"ver": "0.19.3",
"Node name for S&R": "SAM3_VideoTrack"
},
"widgets_values": [
0.5,
0,
1
],
"color": "#232",
"bgcolor": "#353"
},
{
"id": 16,
"type": "VHS_LoadVideo",
"pos": [
-743.2570475684959,
948.9868102053013
],
"size": [
265.2896839488637,
453.15252746695273
],
"flags": {},
"order": 1,
"mode": 0,
"inputs": [
{
"name": "meta_batch",
"shape": 7,
"type": "VHS_BatchManager",
"link": null
},
{
"name": "vae",
"shape": 7,
"type": "VAE",
"link": null
}
],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
17,
19
]
},
{
"name": "frame_count",
"type": "INT",
"links": null
},
{
"name": "audio",
"type": "AUDIO",
"links": null
},
{
"name": "video_info",
"type": "VHS_VIDEOINFO",
"links": null
}
],
"properties": {
"cnr_id": "comfyui-videohelpersuite",
"ver": "2984ec4c4b93292421888f38db74a5e8802a8ff8",
"Node name for S&R": "VHS_LoadVideo"
},
"widgets_values": {
"video": "4927323-uhd_4096_2160_30fps.mp4",
"force_rate": 24,
"custom_width": 0,
"custom_height": 0,
"frame_load_cap": 121,
"skip_first_frames": 0,
"select_every_nth": 1,
"format": "None",
"choose video to upload": "image",
"videopreview": {
"hidden": false,
"paused": false,
"params": {
"filename": "4927323-uhd_4096_2160_30fps.mp4",
"type": "input",
"format": "video/mp4",
"force_rate": 24,
"custom_width": 0,
"custom_height": 0,
"frame_load_cap": 121,
"skip_first_frames": 0,
"select_every_nth": 1
}
}
}
},
{
"id": 17,
"type": "SAM3_TrackPreview",
"pos": [
-182.9072624896289,
948.9868102053013
],
"size": [
342,
291.64218749999986
],
"flags": {},
"order": 5,
"mode": 0,
"inputs": [
{
"label": "track_data",
"name": "track_data",
"type": "SAM3_TRACK_DATA",
"link": 18
},
{
"label": "images",
"name": "images",
"shape": 7,
"type": "IMAGE",
"link": 19
}
],
"outputs": [],
"properties": {
"cnr_id": "comfy-core",
"ver": "0.19.3",
"Node name for S&R": "SAM3_TrackPreview"
},
"widgets_values": [
0.5,
24
]
}
],
"links": [
[
5,
3,
1,
5,
0,
"CLIP"
],
[
12,
5,
0,
13,
3,
"CONDITIONING"
],
[
14,
3,
0,
13,
1,
"MODEL"
],
[
16,
13,
0,
15,
0,
"SAM3_TRACK_DATA"
],
[
17,
16,
0,
13,
0,
"IMAGE"
],
[
18,
13,
0,
17,
0,
"SAM3_TRACK_DATA"
],
[
19,
16,
0,
17,
1,
"IMAGE"
]
],
"groups": [],
"config": {},
"extra": {
"ds": {
"scale": 0.8264462809917359,
"offset": [
1379.2065697412722,
-443.11239441144596
]
},
"frontendVersion": "1.42.14",
"VHS_latentpreview": false,
"VHS_latentpreviewrate": 0,
"VHS_MetadataImage": true,
"VHS_KeepIntermediate": true
},
"version": 0.4
}
- 使用 `SAM3 Video Track` 节点。
- 将输出传给 `SAM3 Track to Mask` 节点,就可以作为蒙版使用。
- 向 `SAM3 Track Preview` 节点输入图像和 `track_data` 后,会给蒙版部分上色,方便查看。