指示ベース画像編集

指示ベース画像編集とは？

画像とテキストの指示（必要なら参照画像も）を入力すると、その指示に従って画像を編集してくれるモデルを、このサイトでは 指示ベース画像編集モデル と呼びます。

以前は、

絵柄変換 → LoRA や IP-Adapter
オブジェクト除去・置き換え → inpainting
着せ替え → 専用モデル

といった具合に、タスクごとに別々の技術を使い分け、それぞれ専用の workflow を組む必要がありました。

指示ベース画像編集モデルは、こうしたタスクを 「全部テキスト指示でまとめて扱う」 方向に進化させたものです。
現在 SOTA である nano banana も、このカテゴリに入ります。

発展の歴史

指示ベース画像編集がどのように発展してきたか、大まかに流れを押さえておきましょう。

InstructPix2Pix ― 指示で編集する発想の出発点

2022 年 11 月に論文が公開され、CVPR 2023 で発表された InstructPix2Pix により、「指示ベース画像編集」という道が開かれました。

Turn the car red

InstructPix2Pix は、「編集前の画像」「編集指示テキスト」「編集後の画像」の組を学習データとして用い、ユーザーの書いた指示に従って画像を編集することを目指したモデルです。

DiT と In-Context 系

あとから発見されたことですが、Flux などの DiT 系モデルは、もともと 複数枚にわたって一貫性のある画像を作る能力 を持っていました。

この性質を編集に応用する枠組みが、IC-LoRA / ACE++ です。

ACE_Plus_portrait_face-swap.json

{
  "id": "659b4eae-4a1e-444c-a2e5-9da456f179bb",
  "revision": 0,
  "last_node_id": 97,
  "last_link_id": 215,
  "nodes": [
    {
      "id": 72,
      "type": "UnetLoaderGGUF",
      "pos": [
        550,
        -140
      ],
      "size": [
        315,
        58
      ],
      "flags": {},
      "order": 0,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "MODEL",
          "type": "MODEL",
          "links": [
            153
          ]
        }
      ],
      "properties": {
        "cnr_id": "ComfyUI-GGUF",
        "ver": "bc5223b0e37e053dbec2ea5e5f52c2fd4b8f712a",
        "Node name for S&R": "UnetLoaderGGUF"
      },
      "widgets_values": [
        "FLUX_gguf\\flux1-fill-dev-Q4_K_S.gguf"
      ],
      "color": "#232",
      "bgcolor": "#353"
    },
    {
      "id": 55,
      "type": "LoadImage",
      "pos": [
        200,
        420
      ],
      "size": [
        290,
        498.96368408203125
      ],
      "flags": {},
      "order": 1,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "slot_index": 0,
          "links": [
            163
          ]
        },
        {
          "name": "MASK",
          "type": "MASK",
          "links": null
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.27",
        "Node name for S&R": "LoadImage"
      },
      "widgets_values": [
        "pexels-photo-15169599.jpg",
        "image",
        ""
      ]
    },
    {
      "id": 32,
      "type": "VAELoader",
      "pos": [
        597.4476928710938,
        254.066162109375
      ],
      "size": [
        248.4499969482422,
        58
      ],
      "flags": {},
      "order": 2,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "VAE",
          "type": "VAE",
          "slot_index": 0,
          "links": [
            60,
            170
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.27",
        "Node name for S&R": "VAELoader"
      },
      "widgets_values": [
        "FLUXvae.safetensors"
      ]
    },
    {
      "id": 3,
      "type": "KSampler",
      "pos": [
        1250,
        40
      ],
      "size": [
        315,
        262
      ],
      "flags": {},
      "order": 12,
      "mode": 0,
      "inputs": [
        {
          "name": "model",
          "type": "MODEL",
          "link": 206
        },
        {
          "name": "positive",
          "type": "CONDITIONING",
          "link": 171
        },
        {
          "name": "negative",
          "type": "CONDITIONING",
          "link": 172
        },
        {
          "name": "latent_image",
          "type": "LATENT",
          "link": 173
        }
      ],
      "outputs": [
        {
          "name": "LATENT",
          "type": "LATENT",
          "slot_index": 0,
          "links": [
            7
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.27",
        "Node name for S&R": "KSampler"
      },
      "widgets_values": [
        1234,
        "fixed",
        30,
        1,
        "euler",
        "normal",
        1
      ]
    },
    {
      "id": 48,
      "type": "LoraLoaderModelOnly",
      "pos": [
        900,
        -140
      ],
      "size": [
        315,
        82
      ],
      "flags": {},
      "order": 5,
      "mode": 0,
      "inputs": [
        {
          "name": "model",
          "type": "MODEL",
          "link": 153
        }
      ],
      "outputs": [
        {
          "name": "MODEL",
          "type": "MODEL",
          "slot_index": 0,
          "links": [
            206
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.27",
        "Node name for S&R": "LoraLoaderModelOnly"
      },
      "widgets_values": [
        "ACE_Plus\\comfyui_portrait_lora64.safetensors",
        1
      ],
      "color": "#232",
      "bgcolor": "#353"
    },
    {
      "id": 56,
      "type": "LoadImage",
      "pos": [
        199.01502990722656,
        984.2403564453125
      ],
      "size": [
        290,
        510
      ],
      "flags": {},
      "order": 3,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "slot_index": 0,
          "links": [
            164
          ]
        },
        {
          "name": "MASK",
          "type": "MASK",
          "slot_index": 1,
          "links": [
            204
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.27",
        "Node name for S&R": "LoadImage"
      },
      "widgets_values": [
        "clipspace/clipspace-mask-37996723.69999981.png [input]",
        "image",
        ""
      ]
    },
    {
      "id": 8,
      "type": "VAEDecode",
      "pos": [
        1600,
        40
      ],
      "size": [
        190,
        46
      ],
      "flags": {},
      "order": 13,
      "mode": 0,
      "inputs": [
        {
          "name": "samples",
          "type": "LATENT",
          "link": 7
        },
        {
          "name": "vae",
          "type": "VAE",
          "link": 60
        }
      ],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "slot_index": 0,
          "links": [
            207
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.27",
        "Node name for S&R": "VAEDecode"
      },
      "widgets_values": []
    },
    {
      "id": 78,
      "type": "ACEPlusLoraProcessor",
      "pos": [
        565.2416381835938,
        415.5736999511719
      ],
      "size": [
        315,
        234
      ],
      "flags": {},
      "order": 6,
      "mode": 0,
      "inputs": [
        {
          "name": "reference_image",
          "shape": 7,
          "type": "IMAGE",
          "link": 163
        },
        {
          "name": "edit_image",
          "shape": 7,
          "type": "IMAGE",
          "link": 164
        },
        {
          "name": "edit_mask",
          "shape": 7,
          "type": "MASK",
          "link": 204
        }
      ],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
            166,
            205
          ]
        },
        {
          "name": "MASK",
          "type": "MASK",
          "links": [
            195
          ]
        },
        {
          "name": "OUT_H",
          "type": "INT",
          "links": [
            211
          ]
        },
        {
          "name": "OUT_W",
          "type": "INT",
          "links": [
            212
          ]
        },
        {
          "name": "SLICE_W",
          "type": "INT",
          "links": [
            213
          ]
        }
      ],
      "properties": {
        "Node name for S&R": "ACEPlusLoraProcessor"
      },
      "widgets_values": [
        true,
        1024,
        1024,
        "repainting",
        3072
      ],
      "color": "#323",
      "bgcolor": "#535"
    },
    {
      "id": 95,
      "type": "ImageCrop",
      "pos": [
        1840,
        40
      ],
      "size": [
        210,
        130
      ],
      "flags": {},
      "order": 14,
      "mode": 0,
      "inputs": [
        {
          "name": "image",
          "type": "IMAGE",
          "link": 207
        },
        {
          "name": "width",
          "type": "INT",
          "widget": {
            "name": "width"
          },
          "link": 212
        },
        {
          "name": "height",
          "type": "INT",
          "widget": {
            "name": "height"
          },
          "link": 211
        },
        {
          "name": "x",
          "type": "INT",
          "widget": {
            "name": "x"
          },
          "link": 213
        }
      ],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
            214
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.27",
        "Node name for S&R": "ImageCrop"
      },
      "widgets_values": [
        512,
        512,
        0,
        0
      ],
      "color": "#432",
      "bgcolor": "#653"
    },
    {
      "id": 96,
      "type": "PreviewImage",
      "pos": [
        2079.263427734375,
        41.27529525756836
      ],
      "size": [
        495.5356750488281,
        730.54541015625
      ],
      "flags": {},
      "order": 15,
      "mode": 0,
      "inputs": [
        {
          "name": "images",
          "type": "IMAGE",
          "link": 214
        }
      ],
      "outputs": [],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.27",
        "Node name for S&R": "PreviewImage"
      },
      "widgets_values": [
        ""
      ]
    },
    {
      "id": 63,
      "type": "PreviewImage",
      "pos": [
        926.4385986328125,
        412.4072265625
      ],
      "size": [
        435.3353271484375,
        324.3360290527344
      ],
      "flags": {},
      "order": 9,
      "mode": 0,
      "inputs": [
        {
          "name": "images",
          "type": "IMAGE",
          "link": 205
        }
      ],
      "outputs": [],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.27",
        "Node name for S&R": "PreviewImage"
      },
      "widgets_values": [
        ""
      ]
    },
    {
      "id": 23,
      "type": "CLIPTextEncode",
      "pos": [
        150,
        10
      ],
      "size": [
        397.89935302734375,
        120.82927703857422
      ],
      "flags": {},
      "order": 7,
      "mode": 0,
      "inputs": [
        {
          "name": "clip",
          "type": "CLIP",
          "link": 62
        }
      ],
      "outputs": [
        {
          "name": "CONDITIONING",
          "type": "CONDITIONING",
          "slot_index": 0,
          "links": [
            41
          ]
        }
      ],
      "title": "CLIP Text Encode (Positive Prompt)",
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.27",
        "Node name for S&R": "CLIPTextEncode"
      },
      "widgets_values": [
        "The girl looking at viewer"
      ]
    },
    {
      "id": 7,
      "type": "CLIPTextEncode",
      "pos": [
        160,
        200
      ],
      "size": [
        397.89935302734375,
        132.290771484375
      ],
      "flags": {
        "collapsed": true
      },
      "order": 8,
      "mode": 0,
      "inputs": [
        {
          "name": "clip",
          "type": "CLIP",
          "link": 63
        }
      ],
      "outputs": [
        {
          "name": "CONDITIONING",
          "type": "CONDITIONING",
          "slot_index": 0,
          "links": [
            169
          ]
        }
      ],
      "title": "CLIP Text Encode (Negative Prompt)",
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.27",
        "Node name for S&R": "CLIPTextEncode"
      },
      "widgets_values": [
        ""
      ]
    },
    {
      "id": 26,
      "type": "FluxGuidance",
      "pos": [
        600,
        10
      ],
      "size": [
        242.8545684814453,
        58
      ],
      "flags": {},
      "order": 10,
      "mode": 0,
      "inputs": [
        {
          "name": "conditioning",
          "type": "CONDITIONING",
          "link": 41
        }
      ],
      "outputs": [
        {
          "name": "CONDITIONING",
          "type": "CONDITIONING",
          "slot_index": 0,
          "links": [
            168
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.27",
        "Node name for S&R": "FluxGuidance"
      },
      "widgets_values": [
        30
      ]
    },
    {
      "id": 34,
      "type": "DualCLIPLoader",
      "pos": [
        -208.03005981445312,
        92.42076873779297
      ],
      "size": [
        315,
        130
      ],
      "flags": {},
      "order": 4,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "CLIP",
          "type": "CLIP",
          "links": [
            62,
            63
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.27",
        "Node name for S&R": "DualCLIPLoader"
      },
      "widgets_values": [
        "clip_l.safetensors",
        "t5xxl_fp8_e4m3fn.safetensors",
        "flux",
        "default"
      ]
    },
    {
      "id": 79,
      "type": "ACEPlusLoraConditioning",
      "pos": [
        903.8878173828125,
        59.738609313964844
      ],
      "size": [
        315,
        138
      ],
      "flags": {},
      "order": 11,
      "mode": 0,
      "inputs": [
        {
          "name": "positive",
          "type": "CONDITIONING",
          "link": 168
        },
        {
          "name": "negative",
          "type": "CONDITIONING",
          "link": 169
        },
        {
          "name": "vae",
          "type": "VAE",
          "link": 170
        },
        {
          "name": "pixels",
          "type": "IMAGE",
          "link": 166
        },
        {
          "name": "mask",
          "type": "MASK",
          "link": 195
        }
      ],
      "outputs": [
        {
          "name": "positive",
          "type": "CONDITIONING",
          "links": [
            171
          ]
        },
        {
          "name": "negative",
          "type": "CONDITIONING",
          "links": [
            172
          ]
        },
        {
          "name": "latent",
          "type": "LATENT",
          "links": [
            173
          ]
        }
      ],
      "properties": {
        "Node name for S&R": "ACEPlusLoraConditioning"
      },
      "widgets_values": [
        false
      ],
      "color": "#2a363b",
      "bgcolor": "#3f5159"
    }
  ],
  "links": [
    [
      7,
      3,
      0,
      8,
      0,
      "LATENT"
    ],
    [
      41,
      23,
      0,
      26,
      0,
      "CONDITIONING"
    ],
    [
      60,
      32,
      0,
      8,
      1,
      "VAE"
    ],
    [
      62,
      34,
      0,
      23,
      0,
      "CLIP"
    ],
    [
      63,
      34,
      0,
      7,
      0,
      "CLIP"
    ],
    [
      153,
      72,
      0,
      48,
      0,
      "MODEL"
    ],
    [
      163,
      55,
      0,
      78,
      0,
      "IMAGE"
    ],
    [
      164,
      56,
      0,
      78,
      1,
      "IMAGE"
    ],
    [
      166,
      78,
      0,
      79,
      3,
      "IMAGE"
    ],
    [
      168,
      26,
      0,
      79,
      0,
      "CONDITIONING"
    ],
    [
      169,
      7,
      0,
      79,
      1,
      "CONDITIONING"
    ],
    [
      170,
      32,
      0,
      79,
      2,
      "VAE"
    ],
    [
      171,
      79,
      0,
      3,
      1,
      "CONDITIONING"
    ],
    [
      172,
      79,
      1,
      3,
      2,
      "CONDITIONING"
    ],
    [
      173,
      79,
      2,
      3,
      3,
      "LATENT"
    ],
    [
      195,
      78,
      1,
      79,
      4,
      "MASK"
    ],
    [
      204,
      56,
      1,
      78,
      2,
      "MASK"
    ],
    [
      205,
      78,
      0,
      63,
      0,
      "IMAGE"
    ],
    [
      206,
      48,
      0,
      3,
      0,
      "MODEL"
    ],
    [
      207,
      8,
      0,
      95,
      0,
      "IMAGE"
    ],
    [
      211,
      78,
      2,
      95,
      2,
      "INT"
    ],
    [
      212,
      78,
      3,
      95,
      1,
      "INT"
    ],
    [
      213,
      78,
      4,
      95,
      3,
      "INT"
    ],
    [
      214,
      95,
      0,
      96,
      0,
      "IMAGE"
    ]
  ],
  "groups": [],
  "config": {},
  "extra": {
    "ds": {
      "scale": 0.2940834937055236,
      "offset": [
        -105.93120800654457,
        1432.1167803053202
      ]
    },
    "node_versions": {
      "comfy-core": "v0.3.10-44-g2ff3104",
      "Comfyui-In-Context-Lora-Utils": "6ef772d589928a380a139c6cd2cfc49b83c8e441"
    },
    "VHS_latentpreview": false,
    "VHS_latentpreviewrate": 0,
    "VHS_MetadataImage": true,
    "VHS_KeepIntermediate": true
  },
  "version": 0.4
}

画像キャンバスの左側に「参照画像」を置く
右側をマスクして生成させる
テキスト指示と組み合わせて「左側を見ながら右側を編集」させる

といった、いわゆる 横並べテクニック を使うことで、特別なアダプタを介さずとも「参照画像の特徴を保持したまま編集できる」ことが示されました。

画像編集モデルの登場

その後、text2image モデルの派生として、FLUX.1 Kontext、Qwen-Image-Edit、OmniGen といった「画像編集」専用のモデルが登場し、「画像編集」が text2image とは別の一カテゴリとして扱われるようになってきました。

Flux.1_Kontext.json

{
  "id": "18404b37-92b0-4d11-a39c-ae941838eb83",
  "revision": 0,
  "last_node_id": 75,
  "last_link_id": 129,
  "nodes": [
    {
      "id": 51,
      "type": "ReferenceLatent",
      "pos": [
        893.0234375,
        190
      ],
      "size": [
        197.712890625,
        46
      ],
      "flags": {
        "collapsed": false
      },
      "order": 9,
      "mode": 0,
      "inputs": [
        {
          "name": "conditioning",
          "type": "CONDITIONING",
          "link": 74
        },
        {
          "name": "latent",
          "shape": 7,
          "type": "LATENT",
          "link": 76
        }
      ],
      "outputs": [
        {
          "name": "CONDITIONING",
          "type": "CONDITIONING",
          "links": [
            114
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.41",
        "Node name for S&R": "ReferenceLatent"
      },
      "widgets_values": [],
      "color": "#232",
      "bgcolor": "#353"
    },
    {
      "id": 68,
      "type": "FluxGuidance",
      "pos": [
        1115.2528076171875,
        190
      ],
      "size": [
        211.3223114013672,
        58
      ],
      "flags": {},
      "order": 10,
      "mode": 0,
      "inputs": [
        {
          "name": "conditioning",
          "type": "CONDITIONING",
          "link": 114
        }
      ],
      "outputs": [
        {
          "name": "CONDITIONING",
          "type": "CONDITIONING",
          "links": [
            115
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.41",
        "Node name for S&R": "FluxGuidance"
      },
      "widgets_values": [
        3.5
      ]
    },
    {
      "id": 69,
      "type": "DualCLIPLoader",
      "pos": [
        174.92930603027344,
        261.95574951171875
      ],
      "size": [
        270,
        130
      ],
      "flags": {},
      "order": 0,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "CLIP",
          "type": "CLIP",
          "links": [
            117,
            118
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.41",
        "Node name for S&R": "DualCLIPLoader"
      },
      "widgets_values": [
        "clip_l.safetensors",
        "t5xxl_fp8_e4m3fn.safetensors",
        "flux",
        "default"
      ],
      "color": "#432",
      "bgcolor": "#653"
    },
    {
      "id": 33,
      "type": "CLIPTextEncode",
      "pos": [
        517.7193603515625,
        378
      ],
      "size": [
        336.888427734375,
        103.97698974609375
      ],
      "flags": {
        "collapsed": true
      },
      "order": 6,
      "mode": 0,
      "inputs": [
        {
          "name": "clip",
          "type": "CLIP",
          "link": 118
        }
      ],
      "outputs": [
        {
          "name": "CONDITIONING",
          "type": "CONDITIONING",
          "slot_index": 0,
          "links": [
            99
          ]
        }
      ],
      "title": "CLIP Text Encode (Negative Prompt)",
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.39",
        "Node name for S&R": "CLIPTextEncode"
      },
      "widgets_values": [
        ""
      ]
    },
    {
      "id": 52,
      "type": "VAEEncode",
      "pos": [
        719.3842163085938,
        468.98004150390625
      ],
      "size": [
        140,
        46
      ],
      "flags": {},
      "order": 8,
      "mode": 0,
      "inputs": [
        {
          "name": "pixels",
          "type": "IMAGE",
          "link": 127
        },
        {
          "name": "vae",
          "type": "VAE",
          "link": 77
        }
      ],
      "outputs": [
        {
          "name": "LATENT",
          "type": "LATENT",
          "links": [
            76,
            116
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.41",
        "Node name for S&R": "VAEEncode"
      },
      "widgets_values": [],
      "color": "#232",
      "bgcolor": "#353"
    },
    {
      "id": 43,
      "type": "VAELoader",
      "pos": [
        462.1297302246094,
        613.5346069335938
      ],
      "size": [
        234.05543518066406,
        58
      ],
      "flags": {},
      "order": 1,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "VAE",
          "type": "VAE",
          "links": [
            62,
            77
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.39",
        "Node name for S&R": "VAELoader"
      },
      "widgets_values": [
        "ae.safetensors"
      ],
      "color": "#322",
      "bgcolor": "#533"
    },
    {
      "id": 67,
      "type": "MarkdownNote",
      "pos": [
        91.0188980102539,
        -45.492122650146484
      ],
      "size": [
        353.2001037597656,
        245.5225372314453
      ],
      "flags": {},
      "order": 2,
      "mode": 0,
      "inputs": [],
      "outputs": [],
      "properties": {},
      "widgets_values": [
        "## models\n\n- [flux1-kontext-dev.gguf](https://huggingface.co/QuantStack/FLUX.1-Kontext-dev-GGUF/tree/main)\n- [clip_l.safetensors](https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/clip_l.safetensors)\n- [t5xxl_fp8_e4m3fn.safetensors](https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/t5xxl_fp8_e4m3fn.safetensors)\n- [ae.safetensors](https://huggingface.co/Comfy-Org/Omnigen2_ComfyUI_repackaged/tree/main/split_files/vae)\n\n```\n📂ComfyUI/\n└── 📂models/\n    ├── 📂clip/\n    │   ├── clip_l.safetensors\n    │   └── t5xxl_fp8_e4m3fn.safetensors\n    ├── 📂unet/\n    │   └── flux1-kontext-dev.gguf\n    └── 📂vae/\n         └── ae.safetensors\n```"
      ],
      "color": "#323",
      "bgcolor": "#535"
    },
    {
      "id": 73,
      "type": "FluxKontextImageScale",
      "pos": [
        481.14453125,
        468.98004150390625
      ],
      "size": [
        187.7544921875,
        26
      ],
      "flags": {},
      "order": 7,
      "mode": 0,
      "inputs": [
        {
          "name": "image",
          "type": "IMAGE",
          "link": 126
        }
      ],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
            127
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.43",
        "Node name for S&R": "FluxKontextImageScale"
      },
      "widgets_values": [],
      "color": "#232",
      "bgcolor": "#353"
    },
    {
      "id": 31,
      "type": "KSampler",
      "pos": [
        1355.8184814453125,
        194.12423706054688
      ],
      "size": [
        315,
        262
      ],
      "flags": {},
      "order": 11,
      "mode": 0,
      "inputs": [
        {
          "name": "model",
          "type": "MODEL",
          "link": 128
        },
        {
          "name": "positive",
          "type": "CONDITIONING",
          "link": 115
        },
        {
          "name": "negative",
          "type": "CONDITIONING",
          "link": 99
        },
        {
          "name": "latent_image",
          "type": "LATENT",
          "link": 116
        }
      ],
      "outputs": [
        {
          "name": "LATENT",
          "type": "LATENT",
          "slot_index": 0,
          "links": [
            52
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.39",
        "Node name for S&R": "KSampler"
      },
      "widgets_values": [
        1234,
        "fixed",
        20,
        1,
        "euler",
        "simple",
        1
      ]
    },
    {
      "id": 53,
      "type": "LoadImage",
      "pos": [
        153.1424102783203,
        468.98004150390625
      ],
      "size": [
        277.51690673828125,
        455.66180419921875
      ],
      "flags": {},
      "order": 3,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
            126
          ]
        },
        {
          "name": "MASK",
          "type": "MASK",
          "links": null
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.41",
        "Node name for S&R": "LoadImage"
      },
      "widgets_values": [
        "pexels-photo-32490940.jpg",
        "image"
      ],
      "color": "#232",
      "bgcolor": "#353"
    },
    {
      "id": 74,
      "type": "UNETLoader",
      "pos": [
        1053.054867669603,
        35.01709924294782
      ],
      "size": [
        273.52025134895166,
        82
      ],
      "flags": {},
      "order": 4,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "MODEL",
          "type": "MODEL",
          "links": [
            128
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.75",
        "Node name for S&R": "UNETLoader"
      },
      "widgets_values": [
        "Flux\\flux1-dev-kontext_fp8_scaled.safetensors",
        "default"
      ],
      "color": "#323",
      "bgcolor": "#535"
    },
    {
      "id": 6,
      "type": "CLIPTextEncode",
      "pos": [
        516.5379638671875,
        190
      ],
      "size": [
        339.84503173828125,
        123.01304626464844
      ],
      "flags": {},
      "order": 5,
      "mode": 0,
      "inputs": [
        {
          "name": "clip",
          "type": "CLIP",
          "link": 117
        }
      ],
      "outputs": [
        {
          "name": "CONDITIONING",
          "type": "CONDITIONING",
          "slot_index": 0,
          "links": [
            74
          ]
        }
      ],
      "title": "CLIP Text Encode (Positive Prompt)",
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.39",
        "Node name for S&R": "CLIPTextEncode"
      },
      "widgets_values": [
        "Change the clothes into a dark business suit with a white dress shirt and tie. Keep everything else (face, hairstyle, pose, background, lighting) unchanged."
      ]
    },
    {
      "id": 8,
      "type": "VAEDecode",
      "pos": [
        1706.111810633583,
        194.12423706054688
      ],
      "size": [
        140,
        46
      ],
      "flags": {},
      "order": 12,
      "mode": 0,
      "inputs": [
        {
          "name": "samples",
          "type": "LATENT",
          "link": 52
        },
        {
          "name": "vae",
          "type": "VAE",
          "link": 62
        }
      ],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "slot_index": 0,
          "links": [
            129
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.39",
        "Node name for S&R": "VAEDecode"
      },
      "widgets_values": []
    },
    {
      "id": 75,
      "type": "SaveImage",
      "pos": [
        1881.4051398218535,
        194.12423706054688
      ],
      "size": [
        334.00927000000047,
        519.62905
      ],
      "flags": {},
      "order": 13,
      "mode": 0,
      "inputs": [
        {
          "name": "images",
          "type": "IMAGE",
          "link": 129
        }
      ],
      "outputs": [],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.75"
      },
      "widgets_values": [
        "ComfyUI"
      ]
    }
  ],
  "links": [
    [
      52,
      31,
      0,
      8,
      0,
      "LATENT"
    ],
    [
      62,
      43,
      0,
      8,
      1,
      "VAE"
    ],
    [
      74,
      6,
      0,
      51,
      0,
      "CONDITIONING"
    ],
    [
      76,
      52,
      0,
      51,
      1,
      "LATENT"
    ],
    [
      77,
      43,
      0,
      52,
      1,
      "VAE"
    ],
    [
      99,
      33,
      0,
      31,
      2,
      "CONDITIONING"
    ],
    [
      114,
      51,
      0,
      68,
      0,
      "CONDITIONING"
    ],
    [
      115,
      68,
      0,
      31,
      1,
      "CONDITIONING"
    ],
    [
      116,
      52,
      0,
      31,
      3,
      "LATENT"
    ],
    [
      117,
      69,
      0,
      6,
      0,
      "CLIP"
    ],
    [
      118,
      69,
      0,
      33,
      0,
      "CLIP"
    ],
    [
      126,
      53,
      0,
      73,
      0,
      "IMAGE"
    ],
    [
      127,
      73,
      0,
      52,
      0,
      "IMAGE"
    ],
    [
      128,
      74,
      0,
      31,
      0,
      "MODEL"
    ],
    [
      129,
      8,
      0,
      75,
      0,
      "IMAGE"
    ]
  ],
  "groups": [],
  "config": {},
  "extra": {
    "ds": {
      "scale": 0.620921323059155,
      "offset": [
        8.981101989746094,
        145.49212265014648
      ]
    },
    "frontendVersion": "1.34.2",
    "VHS_latentpreview": false,
    "VHS_latentpreviewrate": 0,
    "VHS_MetadataImage": true,
    "VHS_KeepIntermediate": true
  },
  "version": 0.4
}

これらのモデルに共通しているのは、入力画像の内容を理解したうえで、テキスト指示に応じて「どこをどの程度変えるか」を判断するという編集タスクを、ひとつのモデルで（そこそこ）汎用的に扱おうとしている点です。

マルチリファレンス時代

初期の指示ベース画像編集は、「画像 1 枚」＋「テキスト指示」→ 編集結果という、入力画像が 1 枚であることを前提としたものでした。

Qwen-Image-Edit-2509 や FLUX.2 以降は、複数の参照画像を同時に扱う流れが強くなっています。

Qwen-Image-Edit-2509_object-swap.json

{
  "id": "d8034549-7e0a-40f1-8c2e-de3ffc6f1cae",
  "revision": 0,
  "last_node_id": 125,
  "last_link_id": 323,
  "nodes": [
    {
      "id": 54,
      "type": "ModelSamplingAuraFlow",
      "pos": [
        634.9767456054688,
        -1.8326886892318726
      ],
      "size": [
        230.33058166503906,
        58
      ],
      "flags": {},
      "order": 6,
      "mode": 0,
      "inputs": [
        {
          "name": "model",
          "type": "MODEL",
          "link": 282
        }
      ],
      "outputs": [
        {
          "name": "MODEL",
          "type": "MODEL",
          "links": [
            123
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.49",
        "Node name for S&R": "ModelSamplingAuraFlow"
      },
      "widgets_values": [
        3.1000000000000005
      ]
    },
    {
      "id": 63,
      "type": "VAEEncode",
      "pos": [
        714.6403198242188,
        673.7313842773438
      ],
      "size": [
        140,
        46
      ],
      "flags": {},
      "order": 9,
      "mode": 0,
      "inputs": [
        {
          "name": "pixels",
          "type": "IMAGE",
          "link": 239
        },
        {
          "name": "vae",
          "type": "VAE",
          "link": 115
        }
      ],
      "outputs": [
        {
          "name": "LATENT",
          "type": "LATENT",
          "links": [
            112
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.51",
        "Node name for S&R": "VAEEncode"
      },
      "widgets_values": []
    },
    {
      "id": 112,
      "type": "CLIPLoader",
      "pos": [
        75.53079223632812,
        277.016357421875
      ],
      "size": [
        270,
        106
      ],
      "flags": {},
      "order": 0,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "CLIP",
          "type": "CLIP",
          "links": [
            290,
            291
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.51",
        "Node name for S&R": "CLIPLoader"
      },
      "widgets_values": [
        "qwen_2.5_vl_7b_fp8_scaled.safetensors",
        "qwen_image",
        "default"
      ],
      "color": "#432",
      "bgcolor": "#653"
    },
    {
      "id": 39,
      "type": "VAELoader",
      "pos": [
        107.53079223632812,
        446.7167663574219
      ],
      "size": [
        238,
        58
      ],
      "flags": {},
      "order": 1,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "VAE",
          "type": "VAE",
          "slot_index": 0,
          "links": [
            76,
            115,
            292,
            293
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.33",
        "Node name for S&R": "VAELoader"
      },
      "widgets_values": [
        "qwen_image_vae.safetensors"
      ],
      "color": "#322",
      "bgcolor": "#533"
    },
    {
      "id": 114,
      "type": "TextEncodeQwenImageEditPlus",
      "pos": [
        454.6401672363281,
        419.63690185546875
      ],
      "size": [
        400,
        200
      ],
      "flags": {},
      "order": 11,
      "mode": 0,
      "inputs": [
        {
          "name": "clip",
          "type": "CLIP",
          "link": 291
        },
        {
          "name": "vae",
          "shape": 7,
          "type": "VAE",
          "link": 293
        },
        {
          "name": "image1",
          "shape": 7,
          "type": "IMAGE",
          "link": 295
        },
        {
          "name": "image2",
          "shape": 7,
          "type": "IMAGE",
          "link": 320
        },
        {
          "name": "image3",
          "shape": 7,
          "type": "IMAGE",
          "link": null
        }
      ],
      "outputs": [
        {
          "name": "CONDITIONING",
          "type": "CONDITIONING",
          "links": [
            315
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.59",
        "Node name for S&R": "TextEncodeQwenImageEditPlus"
      },
      "widgets_values": [
        ""
      ],
      "color": "#232",
      "bgcolor": "#353"
    },
    {
      "id": 111,
      "type": "UNETLoader",
      "pos": [
        330.1968994140625,
        -1.8326886892318726
      ],
      "size": [
        276.62274169921875,
        82
      ],
      "flags": {},
      "order": 2,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "MODEL",
          "type": "MODEL",
          "links": [
            282
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.51",
        "Node name for S&R": "UNETLoader"
      },
      "widgets_values": [
        "Qwen-Image\\qwen_image_edit_2509_fp8_e4m3fn.safetensors",
        "fp8_e4m3fn"
      ],
      "color": "#323",
      "bgcolor": "#535"
    },
    {
      "id": 82,
      "type": "ImageScaleToTotalPixels",
      "pos": [
        -224.63221740722656,
        668.4074096679688
      ],
      "size": [
        270,
        82
      ],
      "flags": {},
      "order": 7,
      "mode": 0,
      "inputs": [
        {
          "name": "image",
          "type": "IMAGE",
          "link": 275
        }
      ],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
            244
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.51",
        "Node name for S&R": "ImageScaleToTotalPixels"
      },
      "widgets_values": [
        "nearest-exact",
        1
      ]
    },
    {
      "id": 55,
      "type": "MarkdownNote",
      "pos": [
        -84.94583892822266,
        -171.1671905517578
      ],
      "size": [
        386.9856262207031,
        251.33447265625
      ],
      "flags": {},
      "order": 3,
      "mode": 0,
      "inputs": [],
      "outputs": [],
      "properties": {},
      "widgets_values": [
        "## models\n- [qwen_image_edit_2509_fp8_e4m3fn.safetensors](https://huggingface.co/Comfy-Org/Qwen-Image-Edit_ComfyUI/blob/main/split_files/diffusion_models/qwen_image_edit_2509_fp8_e4m3fn.safetensors)\n- [qwen_2.5_vl_7b_fp8_scaled.safetensors](https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/blob/main/split_files/text_encoders/qwen_2.5_vl_7b_fp8_scaled.safetensors)\n- [qwen_image_vae.safetensors](https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/tree/main/split_files/vae)\n\n\n```\n📂ComfyUI/\n└──📂models/\n    ├── 📂diffusion_models/\n    │   └── qwen_image_edit_2509_fp8_e4m3fn.safetensors\n    ├── 📂text_encoders/\n    │   └── qwen_2.5_vl_7b_fp8_scaled.safetensors\n    └── 📂vae/\n         └── qwen_image_vae.safetensors\n\n```"
      ],
      "color": "#323",
      "bgcolor": "#535"
    },
    {
      "id": 8,
      "type": "VAEDecode",
      "pos": [
        1293.939697265625,
        143.6978759765625
      ],
      "size": [
        157.56002807617188,
        46
      ],
      "flags": {},
      "order": 13,
      "mode": 0,
      "inputs": [
        {
          "name": "samples",
          "type": "LATENT",
          "link": 35
        },
        {
          "name": "vae",
          "type": "VAE",
          "link": 76
        }
      ],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "slot_index": 0,
          "links": [
            254
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.33",
        "Node name for S&R": "VAEDecode"
      },
      "widgets_values": []
    },
    {
      "id": 83,
      "type": "ImageResizeKJv2",
      "pos": [
        75.53079223632812,
        668.4074096679688
      ],
      "size": [
        270,
        336
      ],
      "flags": {},
      "order": 8,
      "mode": 0,
      "inputs": [
        {
          "name": "image",
          "type": "IMAGE",
          "link": 244
        },
        {
          "name": "mask",
          "shape": 7,
          "type": "MASK",
          "link": null
        }
      ],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
            239,
            294,
            295
          ]
        },
        {
          "name": "width",
          "type": "INT",
          "links": null
        },
        {
          "name": "height",
          "type": "INT",
          "links": null
        },
        {
          "name": "mask",
          "type": "MASK",
          "links": []
        }
      ],
      "properties": {
        "cnr_id": "comfyui-kjnodes",
        "ver": "e2ce0843d1183aea86ce6a1617426f492dcdc802",
        "Node name for S&R": "ImageResizeKJv2"
      },
      "widgets_values": [
        0,
        0,
        "nearest-exact",
        "crop",
        "0, 0, 0",
        "center",
        8,
        "cpu"
      ]
    },
    {
      "id": 3,
      "type": "KSampler",
      "pos": [
        933.5941772460938,
        143.6978759765625
      ],
      "size": [
        315,
        262
      ],
      "flags": {},
      "order": 12,
      "mode": 0,
      "inputs": [
        {
          "name": "model",
          "type": "MODEL",
          "link": 123
        },
        {
          "name": "positive",
          "type": "CONDITIONING",
          "link": 314
        },
        {
          "name": "negative",
          "type": "CONDITIONING",
          "link": 315
        },
        {
          "name": "latent_image",
          "type": "LATENT",
          "link": 112
        }
      ],
      "outputs": [
        {
          "name": "LATENT",
          "type": "LATENT",
          "slot_index": 0,
          "links": [
            35
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.33",
        "Node name for S&R": "KSampler"
      },
      "widgets_values": [
        1234,
        "fixed",
        20,
        2.5,
        "res_multistep",
        "simple",
        1
      ]
    },
    {
      "id": 99,
      "type": "LoadImage",
      "pos": [
        -522.9654541015625,
        668.4074096679688
      ],
      "size": [
        268.17022705078125,
        414.46728515625
      ],
      "flags": {},
      "order": 4,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
            275
          ]
        },
        {
          "name": "MASK",
          "type": "MASK",
          "links": null
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.51",
        "Node name for S&R": "LoadImage"
      },
      "widgets_values": [
        "viewfilename=ComfyUI_temp_mohpt_00009_.png",
        "image"
      ]
    },
    {
      "id": 124,
      "type": "LoadImage",
      "pos": [
        79.30519104003906,
        1079.8746337890625
      ],
      "size": [
        268.17022705078125,
        414.46728515625
      ],
      "flags": {},
      "order": 5,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
            320,
            321
          ]
        },
        {
          "name": "MASK",
          "type": "MASK",
          "links": null
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.51",
        "Node name for S&R": "LoadImage"
      },
      "widgets_values": [
        "7686enxu.png",
        "image"
      ]
    },
    {
      "id": 113,
      "type": "TextEncodeQwenImageEditPlus",
      "pos": [
        454.6401672363281,
        163.63690185546875
      ],
      "size": [
        400,
        200
      ],
      "flags": {},
      "order": 10,
      "mode": 0,
      "inputs": [
        {
          "name": "clip",
          "type": "CLIP",
          "link": 290
        },
        {
          "name": "vae",
          "shape": 7,
          "type": "VAE",
          "link": 292
        },
        {
          "name": "image1",
          "shape": 7,
          "type": "IMAGE",
          "link": 294
        },
        {
          "name": "image2",
          "shape": 7,
          "type": "IMAGE",
          "link": 321
        },
        {
          "name": "image3",
          "shape": 7,
          "type": "IMAGE",
          "link": null
        }
      ],
      "outputs": [
        {
          "name": "CONDITIONING",
          "type": "CONDITIONING",
          "links": [
            314
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.59",
        "Node name for S&R": "TextEncodeQwenImageEditPlus"
      },
      "widgets_values": [
        "Replace the woman in image1 with the woman from image2. Keep the background, lighting, camera framing, and overall composition the same."
      ],
      "color": "#232",
      "bgcolor": "#353"
    },
    {
      "id": 97,
      "type": "SaveImage",
      "pos": [
        1495.48046875,
        143.6978759765625
      ],
      "size": [
        472.93079977851585,
        500.33047162734374
      ],
      "flags": {},
      "order": 14,
      "mode": 0,
      "inputs": [
        {
          "name": "images",
          "type": "IMAGE",
          "link": 254
        }
      ],
      "outputs": [],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.51"
      },
      "widgets_values": [
        "ComfyUI"
      ]
    }
  ],
  "links": [
    [
      35,
      3,
      0,
      8,
      0,
      "LATENT"
    ],
    [
      76,
      39,
      0,
      8,
      1,
      "VAE"
    ],
    [
      112,
      63,
      0,
      3,
      3,
      "LATENT"
    ],
    [
      115,
      39,
      0,
      63,
      1,
      "VAE"
    ],
    [
      123,
      54,
      0,
      3,
      0,
      "MODEL"
    ],
    [
      239,
      83,
      0,
      63,
      0,
      "IMAGE"
    ],
    [
      244,
      82,
      0,
      83,
      0,
      "IMAGE"
    ],
    [
      254,
      8,
      0,
      97,
      0,
      "IMAGE"
    ],
    [
      275,
      99,
      0,
      82,
      0,
      "IMAGE"
    ],
    [
      282,
      111,
      0,
      54,
      0,
      "MODEL"
    ],
    [
      290,
      112,
      0,
      113,
      0,
      "CLIP"
    ],
    [
      291,
      112,
      0,
      114,
      0,
      "CLIP"
    ],
    [
      292,
      39,
      0,
      113,
      1,
      "VAE"
    ],
    [
      293,
      39,
      0,
      114,
      1,
      "VAE"
    ],
    [
      294,
      83,
      0,
      113,
      2,
      "IMAGE"
    ],
    [
      295,
      83,
      0,
      114,
      2,
      "IMAGE"
    ],
    [
      314,
      113,
      0,
      3,
      1,
      "CONDITIONING"
    ],
    [
      315,
      114,
      0,
      3,
      2,
      "CONDITIONING"
    ],
    [
      320,
      124,
      0,
      114,
      3,
      "IMAGE"
    ],
    [
      321,
      124,
      0,
      113,
      3,
      "IMAGE"
    ]
  ],
  "groups": [],
  "config": {},
  "extra": {
    "ds": {
      "scale": 0.5131581182307068,
      "offset": [
        622.9654541015625,
        271.1671905517578
      ]
    },
    "frontendVersion": "1.34.2",
    "VHS_latentpreview": false,
    "VHS_latentpreviewrate": 0,
    "VHS_MetadataImage": true,
    "VHS_KeepIntermediate": true
  },
  "version": 0.4
}