SAM 3 / 3.1とは?
SAM 3 は、Meta の Segment Anything Model シリーズの新しいモデルです。
これまでの SAM は、あくまで「物の形が分かる」だけで、指定したオブジェクトを切り抜くには、BBOX や座標で位置を指定してあげる必要がありました。
SAM 3 では、VLM のようにテキストで対象を指定し、単独でセグメンテーションを完結させることができるようになっています。
SAM 3.1 は SAM 3 の更新版です。動画で複数オブジェクトを追跡する処理が改善されています。
モデルのダウンロード
- sam3.1_multiplex_fp16.safetensors (1.75 GB)
📂ComfyUI/
└── 📂models/
    └── 📂checkpoints/
        └── sam3.1_multiplex_fp16.safetensors
workflow
静止画

{
"id": "0c99f612-ade6-434c-9aef-a452aeb77f92",
"revision": 0,
"last_node_id": 12,
"last_link_id": 11,
"nodes": [
{
"id": 3,
"type": "CheckpointLoaderSimple",
"pos": [
-1060.964981835579,
698.12753964874
],
"size": [
297.3094587159344,
98
],
"flags": {},
"order": 0,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "MODEL",
"type": "MODEL",
"links": [
1
]
},
{
"name": "CLIP",
"type": "CLIP",
"links": [
5
]
},
{
"name": "VAE",
"type": "VAE",
"links": null
}
],
"properties": {
"cnr_id": "comfy-core",
"ver": "0.19.3",
"Node name for S&R": "CheckpointLoaderSimple"
},
"widgets_values": [
"sam3.1_multiplex_fp16.safetensors"
],
"color": "#232",
"bgcolor": "#353"
},
{
"id": 12,
"type": "InvertMask",
"pos": [
-189.9197221538626,
698.12753964874
],
"size": [
140,
26
],
"flags": {},
"order": 4,
"mode": 0,
"inputs": [
{
"name": "mask",
"type": "MASK",
"link": 10
}
],
"outputs": [
{
"name": "MASK",
"type": "MASK",
"links": [
11
]
}
],
"properties": {
"cnr_id": "comfy-core",
"ver": "0.19.3",
"Node name for S&R": "InvertMask"
},
"widgets_values": []
},
{
"id": 8,
"type": "JoinImageWithAlpha",
"pos": [
-20.040925416533565,
952.4472834503921
],
"size": [
214.7865573347107,
46
],
"flags": {},
"order": 5,
"mode": 0,
"inputs": [
{
"name": "image",
"type": "IMAGE",
"link": 7
},
{
"name": "alpha",
"type": "MASK",
"link": 11
}
],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
9
]
}
],
"properties": {
"cnr_id": "comfy-core",
"ver": "0.19.3",
"Node name for S&R": "JoinImageWithAlpha"
},
"widgets_values": []
},
{
"id": 9,
"type": "LoadImage",
"pos": [
-813.0177595344426,
952.4472834503921
],
"size": [
335.8566793646694,
366.72727272727286
],
"flags": {},
"order": 1,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
6,
7
]
},
{
"name": "MASK",
"type": "MASK",
"links": null
}
],
"properties": {
"cnr_id": "comfy-core",
"ver": "0.19.3",
"Node name for S&R": "LoadImage"
},
"widgets_values": [
"ec87a3373da827a80d3d19bcf66df7cd.jpg",
"image"
]
},
{
"id": 5,
"type": "CLIPTextEncode",
"pos": [
-728.3297522774851,
771.8199768517986
],
"size": [
250.42005327504967,
109.84222389181082
],
"flags": {},
"order": 2,
"mode": 0,
"inputs": [
{
"name": "clip",
"type": "CLIP",
"link": 5
}
],
"outputs": [
{
"name": "CONDITIONING",
"type": "CONDITIONING",
"links": [
2
]
}
],
"properties": {
"cnr_id": "comfy-core",
"ver": "0.19.3",
"Node name for S&R": "CLIPTextEncode"
},
"widgets_values": [
"human:99"
],
"color": "#232",
"bgcolor": "#353"
},
{
"id": 10,
"type": "PreviewImage",
"pos": [
233.07005996484492,
952.4472834503921
],
"size": [
455.6999999999998,
354.89999999999986
],
"flags": {},
"order": 6,
"mode": 0,
"inputs": [
{
"name": "images",
"type": "IMAGE",
"link": 9
}
],
"outputs": [],
"properties": {
"cnr_id": "comfy-core",
"ver": "0.19.3",
"Node name for S&R": "PreviewImage"
},
"widgets_values": []
},
{
"id": 1,
"type": "SAM3_Detect",
"pos": [
-428.0779475923605,
698.12753964874
],
"size": [
210,
206
],
"flags": {},
"order": 3,
"mode": 0,
"inputs": [
{
"label": "model",
"name": "model",
"type": "MODEL",
"link": 1
},
{
"label": "image",
"name": "image",
"type": "IMAGE",
"link": 6
},
{
"label": "conditioning",
"name": "conditioning",
"shape": 7,
"type": "CONDITIONING",
"link": 2
},
{
"label": "bboxes",
"name": "bboxes",
"shape": 7,
"type": "BOUNDING_BOX",
"link": null
},
{
"label": "positive_coords",
"name": "positive_coords",
"shape": 7,
"type": "STRING",
"link": null
},
{
"label": "negative_coords",
"name": "negative_coords",
"shape": 7,
"type": "STRING",
"link": null
}
],
"outputs": [
{
"name": "masks",
"type": "MASK",
"links": [
10
]
},
{
"name": "bboxes",
"type": "BOUNDING_BOX",
"links": null
}
],
"properties": {
"cnr_id": "comfy-core",
"ver": "0.19.3",
"Node name for S&R": "SAM3_Detect"
},
"widgets_values": [
0.5,
2,
false
],
"color": "#232",
"bgcolor": "#353"
}
],
"links": [
[
1,
3,
0,
1,
0,
"MODEL"
],
[
2,
5,
0,
1,
2,
"CONDITIONING"
],
[
5,
3,
1,
5,
0,
"CLIP"
],
[
6,
9,
0,
1,
1,
"IMAGE"
],
[
7,
9,
0,
8,
0,
"IMAGE"
],
[
9,
8,
0,
10,
0,
"IMAGE"
],
[
10,
1,
0,
12,
0,
"MASK"
],
[
11,
12,
0,
8,
1,
"MASK"
]
],
"groups": [],
"config": {},
"extra": {
"ds": {
"scale": 0.9090909090909091,
"offset": [
1279.743745342885,
-482.20165823823663
]
},
"frontendVersion": "1.42.14",
"VHS_latentpreview": false,
"VHS_latentpreviewrate": 0,
"VHS_MetadataImage": true,
"VHS_KeepIntermediate": true
},
"version": 0.4
}
- `SAM3 Detect` ノードに、画像・マスク、切り抜く対象の情報(テキストプロンプト、BBOX、座標)を入力します。
  - 少しややこしい仕様ですが、そのプロンプトに対応する対象が複数あった場合、単に `car` のように書くだけでは、そのうちの一番それらしいものしか検出しません。
    - N 番目までセグメンテーションしたい場合は、`car:N` のように書く必要があります。
    - 単に画面に映っている対象をすべて検出したい場合は、`car:99` のように書いてしまってもいいでしょう。
動画
{
"id": "336e23e1-fe7a-4d35-a30e-888a29b97ddb",
"revision": 0,
"last_node_id": 18,
"last_link_id": 19,
"nodes": [
{
"id": 5,
"type": "CLIPTextEncode",
"pos": [
-728.3297522774851,
771.8199768517986
],
"size": [
250.42005327504967,
109.84222389181082
],
"flags": {},
"order": 2,
"mode": 0,
"inputs": [
{
"name": "clip",
"type": "CLIP",
"link": 5
}
],
"outputs": [
{
"name": "CONDITIONING",
"type": "CONDITIONING",
"links": [
12
]
}
],
"properties": {
"cnr_id": "comfy-core",
"ver": "0.19.3",
"Node name for S&R": "CLIPTextEncode"
},
"widgets_values": [
"human:99"
],
"color": "#232",
"bgcolor": "#353"
},
{
"id": 3,
"type": "CheckpointLoaderSimple",
"pos": [
-1060.964981835579,
698.12753964874
],
"size": [
297.3094587159344,
98
],
"flags": {},
"order": 0,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "MODEL",
"type": "MODEL",
"links": [
14
]
},
{
"name": "CLIP",
"type": "CLIP",
"links": [
5
]
},
{
"name": "VAE",
"type": "VAE",
"links": null
}
],
"properties": {
"cnr_id": "comfy-core",
"ver": "0.19.3",
"Node name for S&R": "CheckpointLoaderSimple"
},
"widgets_values": [
"sam3.1_multiplex_fp16.safetensors"
],
"color": "#232",
"bgcolor": "#353"
},
{
"id": 15,
"type": "SAM3_TrackToMask",
"pos": [
-182.9072624896289,
678.7675396487398
],
"size": [
210,
58
],
"flags": {},
"order": 4,
"mode": 0,
"inputs": [
{
"label": "track_data",
"name": "track_data",
"type": "SAM3_TRACK_DATA",
"link": 16
}
],
"outputs": [
{
"name": "masks",
"type": "MASK",
"links": null
}
],
"properties": {
"cnr_id": "comfy-core",
"ver": "0.19.3",
"Node name for S&R": "SAM3_TrackToMask"
},
"widgets_values": [
""
],
"color": "#232",
"bgcolor": "#353"
},
{
"id": 13,
"type": "SAM3_VideoTrack",
"pos": [
-428.0779475923605,
678.7675396487398
],
"size": [
215.80859375,
166
],
"flags": {},
"order": 3,
"mode": 0,
"inputs": [
{
"label": "images",
"name": "images",
"type": "IMAGE",
"link": 17
},
{
"label": "model",
"name": "model",
"type": "MODEL",
"link": 14
},
{
"label": "initial_mask",
"name": "initial_mask",
"shape": 7,
"type": "MASK",
"link": null
},
{
"label": "conditioning",
"name": "conditioning",
"shape": 7,
"type": "CONDITIONING",
"link": 12
}
],
"outputs": [
{
"name": "track_data",
"type": "SAM3_TRACK_DATA",
"links": [
16,
18
]
}
],
"properties": {
"cnr_id": "comfy-core",
"ver": "0.19.3",
"Node name for S&R": "SAM3_VideoTrack"
},
"widgets_values": [
0.5,
0,
1
],
"color": "#232",
"bgcolor": "#353"
},
{
"id": 16,
"type": "VHS_LoadVideo",
"pos": [
-743.2570475684959,
948.9868102053013
],
"size": [
265.2896839488637,
453.15252746695273
],
"flags": {},
"order": 1,
"mode": 0,
"inputs": [
{
"name": "meta_batch",
"shape": 7,
"type": "VHS_BatchManager",
"link": null
},
{
"name": "vae",
"shape": 7,
"type": "VAE",
"link": null
}
],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
17,
19
]
},
{
"name": "frame_count",
"type": "INT",
"links": null
},
{
"name": "audio",
"type": "AUDIO",
"links": null
},
{
"name": "video_info",
"type": "VHS_VIDEOINFO",
"links": null
}
],
"properties": {
"cnr_id": "comfyui-videohelpersuite",
"ver": "2984ec4c4b93292421888f38db74a5e8802a8ff8",
"Node name for S&R": "VHS_LoadVideo"
},
"widgets_values": {
"video": "4927323-uhd_4096_2160_30fps.mp4",
"force_rate": 24,
"custom_width": 0,
"custom_height": 0,
"frame_load_cap": 121,
"skip_first_frames": 0,
"select_every_nth": 1,
"format": "None",
"choose video to upload": "image",
"videopreview": {
"hidden": false,
"paused": false,
"params": {
"filename": "4927323-uhd_4096_2160_30fps.mp4",
"type": "input",
"format": "video/mp4",
"force_rate": 24,
"custom_width": 0,
"custom_height": 0,
"frame_load_cap": 121,
"skip_first_frames": 0,
"select_every_nth": 1
}
}
}
},
{
"id": 17,
"type": "SAM3_TrackPreview",
"pos": [
-182.9072624896289,
948.9868102053013
],
"size": [
342,
291.64218749999986
],
"flags": {},
"order": 5,
"mode": 0,
"inputs": [
{
"label": "track_data",
"name": "track_data",
"type": "SAM3_TRACK_DATA",
"link": 18
},
{
"label": "images",
"name": "images",
"shape": 7,
"type": "IMAGE",
"link": 19
}
],
"outputs": [],
"properties": {
"cnr_id": "comfy-core",
"ver": "0.19.3",
"Node name for S&R": "SAM3_TrackPreview"
},
"widgets_values": [
0.5,
24
]
}
],
"links": [
[
5,
3,
1,
5,
0,
"CLIP"
],
[
12,
5,
0,
13,
3,
"CONDITIONING"
],
[
14,
3,
0,
13,
1,
"MODEL"
],
[
16,
13,
0,
15,
0,
"SAM3_TRACK_DATA"
],
[
17,
16,
0,
13,
0,
"IMAGE"
],
[
18,
13,
0,
17,
0,
"SAM3_TRACK_DATA"
],
[
19,
16,
0,
17,
1,
"IMAGE"
]
],
"groups": [],
"config": {},
"extra": {
"ds": {
"scale": 0.8264462809917359,
"offset": [
1379.2065697412722,
-443.11239441144596
]
},
"frontendVersion": "1.42.14",
"VHS_latentpreview": false,
"VHS_latentpreviewrate": 0,
"VHS_MetadataImage": true,
"VHS_KeepIntermediate": true
},
"version": 0.4
}
- `SAM3 Video Track` ノードを使用します。
  - 出力を `SAM3 Track to Mask` ノードに渡すことで、マスクとして使用できます。
- `SAM3 Track Preview` ノードは、画像と `track_data` を入力すると、マスク部分を色付けして見やすくしてくれます。