SAM 3 / 3.1

SAM 3 / 3.1とは？

SAM 3 は、Meta の Segment Anything Model シリーズの新しいモデルです。

これまでの SAM は、あくまで「物の形が分かる」だけで、指定したオブジェクトを切り抜くには、BBOX や座標で位置を指定してあげる必要がありました。

SAM 3 では、VLM のようにテキストで対象を指定し、単独でセグメンテーションを完結させることができるようになっています。

SAM 3.1 は SAM 3 の更新版です。動画で複数オブジェクトを追跡する処理が改善されています。

モデルのダウンロード

sam3.1_multiplex_fp16.safetensors (1.75 GB)

📂ComfyUI/
└── 📂models/
    └── 📂checkpoints/
        └── sam3.1_multiplex_fp16.safetensors

workflow

静止画

SAM3.1.json

{
  "id": "0c99f612-ade6-434c-9aef-a452aeb77f92",
  "revision": 0,
  "last_node_id": 12,
  "last_link_id": 11,
  "nodes": [
    {
      "id": 3,
      "type": "CheckpointLoaderSimple",
      "pos": [
        -1060.964981835579,
        698.12753964874
      ],
      "size": [
        297.3094587159344,
        98
      ],
      "flags": {},
      "order": 0,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "MODEL",
          "type": "MODEL",
          "links": [
            1
          ]
        },
        {
          "name": "CLIP",
          "type": "CLIP",
          "links": [
            5
          ]
        },
        {
          "name": "VAE",
          "type": "VAE",
          "links": null
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.19.3",
        "Node name for S&R": "CheckpointLoaderSimple"
      },
      "widgets_values": [
        "sam3.1_multiplex_fp16.safetensors"
      ],
      "color": "#232",
      "bgcolor": "#353"
    },
    {
      "id": 12,
      "type": "InvertMask",
      "pos": [
        -189.9197221538626,
        698.12753964874
      ],
      "size": [
        140,
        26
      ],
      "flags": {},
      "order": 4,
      "mode": 0,
      "inputs": [
        {
          "name": "mask",
          "type": "MASK",
          "link": 10
        }
      ],
      "outputs": [
        {
          "name": "MASK",
          "type": "MASK",
          "links": [
            11
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.19.3",
        "Node name for S&R": "InvertMask"
      },
      "widgets_values": []
    },
    {
      "id": 8,
      "type": "JoinImageWithAlpha",
      "pos": [
        -20.040925416533565,
        952.4472834503921
      ],
      "size": [
        214.7865573347107,
        46
      ],
      "flags": {},
      "order": 5,
      "mode": 0,
      "inputs": [
        {
          "name": "image",
          "type": "IMAGE",
          "link": 7
        },
        {
          "name": "alpha",
          "type": "MASK",
          "link": 11
        }
      ],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
            9
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.19.3",
        "Node name for S&R": "JoinImageWithAlpha"
      },
      "widgets_values": []
    },
    {
      "id": 9,
      "type": "LoadImage",
      "pos": [
        -813.0177595344426,
        952.4472834503921
      ],
      "size": [
        335.8566793646694,
        366.72727272727286
      ],
      "flags": {},
      "order": 1,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
            6,
            7
          ]
        },
        {
          "name": "MASK",
          "type": "MASK",
          "links": null
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.19.3",
        "Node name for S&R": "LoadImage"
      },
      "widgets_values": [
        "ec87a3373da827a80d3d19bcf66df7cd.jpg",
        "image"
      ]
    },
    {
      "id": 5,
      "type": "CLIPTextEncode",
      "pos": [
        -728.3297522774851,
        771.8199768517986
      ],
      "size": [
        250.42005327504967,
        109.84222389181082
      ],
      "flags": {},
      "order": 2,
      "mode": 0,
      "inputs": [
        {
          "name": "clip",
          "type": "CLIP",
          "link": 5
        }
      ],
      "outputs": [
        {
          "name": "CONDITIONING",
          "type": "CONDITIONING",
          "links": [
            2
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.19.3",
        "Node name for S&R": "CLIPTextEncode"
      },
      "widgets_values": [
        "human:99"
      ],
      "color": "#232",
      "bgcolor": "#353"
    },
    {
      "id": 10,
      "type": "PreviewImage",
      "pos": [
        233.07005996484492,
        952.4472834503921
      ],
      "size": [
        455.6999999999998,
        354.89999999999986
      ],
      "flags": {},
      "order": 6,
      "mode": 0,
      "inputs": [
        {
          "name": "images",
          "type": "IMAGE",
          "link": 9
        }
      ],
      "outputs": [],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.19.3",
        "Node name for S&R": "PreviewImage"
      },
      "widgets_values": []
    },
    {
      "id": 1,
      "type": "SAM3_Detect",
      "pos": [
        -428.0779475923605,
        698.12753964874
      ],
      "size": [
        210,
        206
      ],
      "flags": {},
      "order": 3,
      "mode": 0,
      "inputs": [
        {
          "label": "model",
          "name": "model",
          "type": "MODEL",
          "link": 1
        },
        {
          "label": "image",
          "name": "image",
          "type": "IMAGE",
          "link": 6
        },
        {
          "label": "conditioning",
          "name": "conditioning",
          "shape": 7,
          "type": "CONDITIONING",
          "link": 2
        },
        {
          "label": "bboxes",
          "name": "bboxes",
          "shape": 7,
          "type": "BOUNDING_BOX",
          "link": null
        },
        {
          "label": "positive_coords",
          "name": "positive_coords",
          "shape": 7,
          "type": "STRING",
          "link": null
        },
        {
          "label": "negative_coords",
          "name": "negative_coords",
          "shape": 7,
          "type": "STRING",
          "link": null
        }
      ],
      "outputs": [
        {
          "name": "masks",
          "type": "MASK",
          "links": [
            10
          ]
        },
        {
          "name": "bboxes",
          "type": "BOUNDING_BOX",
          "links": null
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.19.3",
        "Node name for S&R": "SAM3_Detect"
      },
      "widgets_values": [
        0.5,
        2,
        false
      ],
      "color": "#232",
      "bgcolor": "#353"
    }
  ],
  "links": [
    [
      1,
      3,
      0,
      1,
      0,
      "MODEL"
    ],
    [
      2,
      5,
      0,
      1,
      2,
      "CONDITIONING"
    ],
    [
      5,
      3,
      1,
      5,
      0,
      "CLIP"
    ],
    [
      6,
      9,
      0,
      1,
      1,
      "IMAGE"
    ],
    [
      7,
      9,
      0,
      8,
      0,
      "IMAGE"
    ],
    [
      9,
      8,
      0,
      10,
      0,
      "IMAGE"
    ],
    [
      10,
      1,
      0,
      12,
      0,
      "MASK"
    ],
    [
      11,
      12,
      0,
      8,
      1,
      "MASK"
    ]
  ],
  "groups": [],
  "config": {},
  "extra": {
    "ds": {
      "scale": 0.9090909090909091,
      "offset": [
        1279.743745342885,
        -482.20165823823663
      ]
    },
    "frontendVersion": "1.42.14",
    "VHS_latentpreview": false,
    "VHS_latentpreviewrate": 0,
    "VHS_MetadataImage": true,
    "VHS_KeepIntermediate": true
  },
  "version": 0.4
}

SAM3 Detect ノードに、モデルと画像、そして切り抜く対象の情報（テキストプロンプト、BBOX、座標）を入力します。
少しややこしい仕様ですが、そのプロンプトに対応する対象が複数あった場合、単に car のように書くだけでは、そのうちの一番それらしいものしか検出しません。
- それらしい順に上位 N 個までセグメンテーションしたい場合は、car:N のように書く必要があります。
- 単に画面に映っている対象をすべて検出したい場合は、car:99 のように書いてしまってもいいでしょう。

動画

SAM3.1_video.json

{
  "id": "336e23e1-fe7a-4d35-a30e-888a29b97ddb",
  "revision": 0,
  "last_node_id": 18,
  "last_link_id": 19,
  "nodes": [
    {
      "id": 5,
      "type": "CLIPTextEncode",
      "pos": [
        -728.3297522774851,
        771.8199768517986
      ],
      "size": [
        250.42005327504967,
        109.84222389181082
      ],
      "flags": {},
      "order": 2,
      "mode": 0,
      "inputs": [
        {
          "name": "clip",
          "type": "CLIP",
          "link": 5
        }
      ],
      "outputs": [
        {
          "name": "CONDITIONING",
          "type": "CONDITIONING",
          "links": [
            12
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.19.3",
        "Node name for S&R": "CLIPTextEncode"
      },
      "widgets_values": [
        "human:99"
      ],
      "color": "#232",
      "bgcolor": "#353"
    },
    {
      "id": 3,
      "type": "CheckpointLoaderSimple",
      "pos": [
        -1060.964981835579,
        698.12753964874
      ],
      "size": [
        297.3094587159344,
        98
      ],
      "flags": {},
      "order": 0,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "MODEL",
          "type": "MODEL",
          "links": [
            14
          ]
        },
        {
          "name": "CLIP",
          "type": "CLIP",
          "links": [
            5
          ]
        },
        {
          "name": "VAE",
          "type": "VAE",
          "links": null
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.19.3",
        "Node name for S&R": "CheckpointLoaderSimple"
      },
      "widgets_values": [
        "sam3.1_multiplex_fp16.safetensors"
      ],
      "color": "#232",
      "bgcolor": "#353"
    },
    {
      "id": 15,
      "type": "SAM3_TrackToMask",
      "pos": [
        -182.9072624896289,
        678.7675396487398
      ],
      "size": [
        210,
        58
      ],
      "flags": {},
      "order": 4,
      "mode": 0,
      "inputs": [
        {
          "label": "track_data",
          "name": "track_data",
          "type": "SAM3_TRACK_DATA",
          "link": 16
        }
      ],
      "outputs": [
        {
          "name": "masks",
          "type": "MASK",
          "links": null
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.19.3",
        "Node name for S&R": "SAM3_TrackToMask"
      },
      "widgets_values": [
        ""
      ],
      "color": "#232",
      "bgcolor": "#353"
    },
    {
      "id": 13,
      "type": "SAM3_VideoTrack",
      "pos": [
        -428.0779475923605,
        678.7675396487398
      ],
      "size": [
        215.80859375,
        166
      ],
      "flags": {},
      "order": 3,
      "mode": 0,
      "inputs": [
        {
          "label": "images",
          "name": "images",
          "type": "IMAGE",
          "link": 17
        },
        {
          "label": "model",
          "name": "model",
          "type": "MODEL",
          "link": 14
        },
        {
          "label": "initial_mask",
          "name": "initial_mask",
          "shape": 7,
          "type": "MASK",
          "link": null
        },
        {
          "label": "conditioning",
          "name": "conditioning",
          "shape": 7,
          "type": "CONDITIONING",
          "link": 12
        }
      ],
      "outputs": [
        {
          "name": "track_data",
          "type": "SAM3_TRACK_DATA",
          "links": [
            16,
            18
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.19.3",
        "Node name for S&R": "SAM3_VideoTrack"
      },
      "widgets_values": [
        0.5,
        0,
        1
      ],
      "color": "#232",
      "bgcolor": "#353"
    },
    {
      "id": 16,
      "type": "VHS_LoadVideo",
      "pos": [
        -743.2570475684959,
        948.9868102053013
      ],
      "size": [
        265.2896839488637,
        453.15252746695273
      ],
      "flags": {},
      "order": 1,
      "mode": 0,
      "inputs": [
        {
          "name": "meta_batch",
          "shape": 7,
          "type": "VHS_BatchManager",
          "link": null
        },
        {
          "name": "vae",
          "shape": 7,
          "type": "VAE",
          "link": null
        }
      ],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
            17,
            19
          ]
        },
        {
          "name": "frame_count",
          "type": "INT",
          "links": null
        },
        {
          "name": "audio",
          "type": "AUDIO",
          "links": null
        },
        {
          "name": "video_info",
          "type": "VHS_VIDEOINFO",
          "links": null
        }
      ],
      "properties": {
        "cnr_id": "comfyui-videohelpersuite",
        "ver": "2984ec4c4b93292421888f38db74a5e8802a8ff8",
        "Node name for S&R": "VHS_LoadVideo"
      },
      "widgets_values": {
        "video": "4927323-uhd_4096_2160_30fps.mp4",
        "force_rate": 24,
        "custom_width": 0,
        "custom_height": 0,
        "frame_load_cap": 121,
        "skip_first_frames": 0,
        "select_every_nth": 1,
        "format": "None",
        "choose video to upload": "image",
        "videopreview": {
          "hidden": false,
          "paused": false,
          "params": {
            "filename": "4927323-uhd_4096_2160_30fps.mp4",
            "type": "input",
            "format": "video/mp4",
            "force_rate": 24,
            "custom_width": 0,
            "custom_height": 0,
            "frame_load_cap": 121,
            "skip_first_frames": 0,
            "select_every_nth": 1
          }
        }
      }
    },
    {
      "id": 17,
      "type": "SAM3_TrackPreview",
      "pos": [
        -182.9072624896289,
        948.9868102053013
      ],
      "size": [
        342,
        291.64218749999986
      ],
      "flags": {},
      "order": 5,
      "mode": 0,
      "inputs": [
        {
          "label": "track_data",
          "name": "track_data",
          "type": "SAM3_TRACK_DATA",
          "link": 18
        },
        {
          "label": "images",
          "name": "images",
          "shape": 7,
          "type": "IMAGE",
          "link": 19
        }
      ],
      "outputs": [],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.19.3",
        "Node name for S&R": "SAM3_TrackPreview"
      },
      "widgets_values": [
        0.5,
        24
      ]
    }
  ],
  "links": [
    [
      5,
      3,
      1,
      5,
      0,
      "CLIP"
    ],
    [
      12,
      5,
      0,
      13,
      3,
      "CONDITIONING"
    ],
    [
      14,
      3,
      0,
      13,
      1,
      "MODEL"
    ],
    [
      16,
      13,
      0,
      15,
      0,
      "SAM3_TRACK_DATA"
    ],
    [
      17,
      16,
      0,
      13,
      0,
      "IMAGE"
    ],
    [
      18,
      13,
      0,
      17,
      0,
      "SAM3_TRACK_DATA"
    ],
    [
      19,
      16,
      0,
      17,
      1,
      "IMAGE"
    ]
  ],
  "groups": [],
  "config": {},
  "extra": {
    "ds": {
      "scale": 0.8264462809917359,
      "offset": [
        1379.2065697412722,
        -443.11239441144596
      ]
    },
    "frontendVersion": "1.42.14",
    "VHS_latentpreview": false,
    "VHS_latentpreviewrate": 0,
    "VHS_MetadataImage": true,
    "VHS_KeepIntermediate": true
  },
  "version": 0.4
}

SAM3 Video Track ノードを使用します。
出力の track_data を SAM3 Track to Mask ノードに渡すことで、マスクとして使用できます。
SAM3 Track Preview ノードは、画像と track_data を入力すると、マスク部分を色付けして見やすくしてくれます。

SAM 3 / 3.1

SAM 3 / 3.1とは？

モデルのダウンロード

workflow

静止画

動画

jsonコピーボタンとは？

修正・誤字報告

記事リクエスト

感想・その他

ありがとうございます