Talking Head

什么是 Talking Head？

Talking Head 是让一张图像或人脸照片看起来像“正在说话的人”那样动起来的技术。以输入的图像为基础，利用另外准备的参考视频的动作或音频作为线索，驱动嘴巴和表情。

它与对口型（Lip Sync）非常相似，但对口型主要是“让已有视频中的嘴型配合音频”。而 Talking Head 基本上是驱动单张图片，且很多工具主要是基于参考视频的动作而不是音频来驱动。

正如 Talking Head 之名，它是从驱动脸部开始的，但正在向驱动上半身乃至全身的方向进化。

变形基 Talking Head

Thin-Plate Spline Motion Model for Image Animation

输入一张图像和正在活动的人的视频，图像侧就会模仿那个动作进行变形。

与其说是借助 3D 模型来驱动，不如说更接近直接在 2D 图像上进行“扭曲”的感觉，就像 Photoshop 的操控变形一样。

LivePortrait

LivePortrait_i2v_ref.json

{
  "id": "dba15c18-c2e7-4547-8472-85361bc55454",
  "revision": 0,
  "last_node_id": 25,
  "last_link_id": 28,
  "nodes": [
    {
      "id": 8,
      "type": "LoadImage",
      "pos": [
        1053.8902782149364,
        -0.15546305848340092
      ],
      "size": [
        282.1988281250001,
        508.7
      ],
      "flags": {},
      "order": 0,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
            17
          ]
        },
        {
          "name": "MASK",
          "type": "MASK",
          "links": null
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.76",
        "Node name for S&R": "LoadImage"
      },
      "widgets_values": [
        "pasted/image (105).png",
        "image"
      ]
    },
    {
      "id": 21,
      "type": "VHS_VideoCombine",
      "pos": [
        2019.7634896752875,
        -1.365463058483401
      ],
      "size": [
        344.2576171874998,
        780.8260157993859
      ],
      "flags": {},
      "order": 3,
      "mode": 0,
      "inputs": [
        {
          "name": "images",
          "type": "IMAGE",
          "link": 19
        },
        {
          "name": "audio",
          "shape": 7,
          "type": "AUDIO",
          "link": null
        },
        {
          "name": "meta_batch",
          "shape": 7,
          "type": "VHS_BatchManager",
          "link": null
        },
        {
          "name": "vae",
          "shape": 7,
          "type": "VAE",
          "link": null
        }
      ],
      "outputs": [
        {
          "name": "Filenames",
          "type": "VHS_FILENAMES",
          "links": null
        }
      ],
      "properties": {
        "cnr_id": "comfyui-videohelpersuite",
        "ver": "8923bd836bdab8b7bbdf4ed104b7d045e70c66e2",
        "Node name for S&R": "VHS_VideoCombine"
      },
      "widgets_values": {
        "frame_rate": 16,
        "loop_count": 0,
        "filename_prefix": "LivePortrait",
        "format": "video/h264-mp4",
        "pix_fmt": "yuv420p",
        "crf": 19,
        "save_metadata": true,
        "trim_to_audio": false,
        "pingpong": false,
        "save_output": true,
        "videopreview": {
          "hidden": false,
          "paused": false,
          "params": {
            "filename": "LivePortrait_00002.mp4",
            "subfolder": "",
            "type": "output",
            "format": "video/h264-mp4",
            "frame_rate": 8,
            "workflow": "LivePortrait_00002.png",
            "fullpath": "D:\\AI\\ComfyUI_windows_portable\\ComfyUI\\output\\LivePortrait_00002.mp4"
          }
        }
      }
    },
    {
      "id": 20,
      "type": "AdvancedLivePortrait",
      "pos": [
        1665.2686130082168,
        -1.365463058483401
      ],
      "size": [
        311.2184264611833,
        272
      ],
      "flags": {},
      "order": 2,
      "mode": 0,
      "inputs": [
        {
          "name": "src_images",
          "shape": 7,
          "type": "IMAGE",
          "link": 17
        },
        {
          "name": "motion_link",
          "shape": 7,
          "type": "EDITOR_LINK",
          "link": null
        },
        {
          "name": "driving_images",
          "shape": 7,
          "type": "IMAGE",
          "link": 28
        }
      ],
      "outputs": [
        {
          "name": "images",
          "type": "IMAGE",
          "links": [
            19
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfyui-advancedliveportrait",
        "ver": "3bba732915e22f18af0d221b9c5c282990181f1b",
        "Node name for S&R": "AdvancedLivePortrait"
      },
      "widgets_values": [
        0,
        0,
        1.7,
        true,
        false,
        false,
        ""
      ],
      "color": "#432",
      "bgcolor": "#653"
    },
    {
      "id": 25,
      "type": "VHS_LoadVideo",
      "pos": [
        1371.2861499441556,
        114.0272015894756
      ],
      "size": [
        261.6533203125,
        460.08634187370603
      ],
      "flags": {},
      "order": 1,
      "mode": 0,
      "inputs": [
        {
          "name": "meta_batch",
          "shape": 7,
          "type": "VHS_BatchManager",
          "link": null
        },
        {
          "name": "vae",
          "shape": 7,
          "type": "VAE",
          "link": null
        }
      ],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
            28
          ]
        },
        {
          "name": "frame_count",
          "type": "INT",
          "links": null
        },
        {
          "name": "audio",
          "type": "AUDIO",
          "links": null
        },
        {
          "name": "video_info",
          "type": "VHS_VIDEOINFO",
          "links": null
        }
      ],
      "properties": {
        "cnr_id": "comfyui-videohelpersuite",
        "ver": "8923bd836bdab8b7bbdf4ed104b7d045e70c66e2",
        "Node name for S&R": "VHS_LoadVideo"
      },
      "widgets_values": {
        "video": "7327398-uhd_3840_2160_25fps.mp4",
        "force_rate": 16,
        "custom_width": 0,
        "custom_height": 0,
        "frame_load_cap": 48,
        "skip_first_frames": 0,
        "select_every_nth": 1,
        "format": "None",
        "choose video to upload": "image",
        "videopreview": {
          "hidden": false,
          "paused": false,
          "params": {
            "filename": "7327398-uhd_3840_2160_25fps.mp4",
            "type": "input",
            "format": "video/mp4",
            "force_rate": 16,
            "custom_width": 0,
            "custom_height": 0,
            "frame_load_cap": 48,
            "skip_first_frames": 0,
            "select_every_nth": 1
          }
        }
      }
    }
  ],
  "links": [
    [
      17,
      8,
      0,
      20,
      0,
      "IMAGE"
    ],
    [
      19,
      20,
      0,
      21,
      0,
      "IMAGE"
    ],
    [
      28,
      25,
      0,
      20,
      2,
      "IMAGE"
    ]
  ],
  "groups": [],
  "config": {},
  "extra": {
    "ds": {
      "scale": 0.9090909090909091,
      "offset": [
        -786.1067299591815,
        216.72498248265038
      ]
    },
    "frontendVersion": "1.35.0",
    "VHS_latentpreview": false,
    "VHS_latentpreviewrate": 0,
    "VHS_MetadataImage": true,
    "VHS_KeepIntermediate": true
  },
  "version": 0.4
}

LivePortrait 同样是输入一张图像和参考视频，但经过改良，能够更稳定地重现脸部各部位的动作、视线以及情感的细微差别等。

因为不是扩散模型，所以比较轻量，也适合接近实时的处理。此外，由于可以进行“让脸稍微朝下”或“让眼睛稍微睁大”等编辑，所以至今仍经常被使用。

扩散模型基 Talking Head

到了下一代，出现了使用扩散模型“重绘图像本身”这一方向的 Talking Head。X-Portrait 和 HelloMeme 就属于这一类。

HelloMeme_video.json

{
  "last_node_id": 26,
  "last_link_id": 40,
  "nodes": [
    {
      "id": 21,
      "type": "GetReferenceImageRT",
      "pos": [
        740,
        410
      ],
      "size": [
        241.79998779296875,
        46
      ],
      "flags": {},
      "order": 5,
      "mode": 0,
      "inputs": [
        {
          "name": "face_toolkits",
          "type": "FACE_TOOLKITS",
          "link": 32
        },
        {
          "name": "image",
          "type": "IMAGE",
          "link": 35
        }
      ],
      "outputs": [
        {
          "name": "REFRT",
          "type": "REFRT",
          "links": [
            31
          ]
        }
      ],
      "properties": {
        "Node name for S&R": "GetReferenceImageRT"
      },
      "widgets_values": [],
      "color": "#232",
      "bgcolor": "#353"
    },
    {
      "id": 24,
      "type": "PreviewImage",
      "pos": [
        744,
        551
      ],
      "size": [
        210,
        246
      ],
      "flags": {},
      "order": 6,
      "mode": 0,
      "inputs": [
        {
          "name": "images",
          "type": "IMAGE",
          "link": 37
        }
      ],
      "outputs": [],
      "properties": {
        "Node name for S&R": "PreviewImage"
      }
    },
    {
      "id": 19,
      "type": "GetVideoDriveParams",
      "pos": [
        1064,
        330
      ],
      "size": [
        270.3999938964844,
        98
      ],
      "flags": {},
      "order": 7,
      "mode": 0,
      "inputs": [
        {
          "name": "face_toolkits",
          "type": "FACE_TOOLKITS",
          "link": 33
        },
        {
          "name": "images",
          "type": "IMAGE",
          "link": 40
        },
        {
          "name": "ref_rt",
          "type": "REFRT",
          "link": 31
        }
      ],
      "outputs": [
        {
          "name": "drive_video_params",
          "type": "DRIVE_VIDEO_PARAMS",
          "links": [
            29
          ]
        }
      ],
      "properties": {
        "Node name for S&R": "GetVideoDriveParams"
      },
      "widgets_values": [
        0
      ],
      "color": "#323",
      "bgcolor": "#535"
    },
    {
      "id": 22,
      "type": "HMFaceToolkitsLoader",
      "pos": [
        458,
        330
      ],
      "size": [
        230.03500366210938,
        58
      ],
      "flags": {},
      "order": 0,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "FACE_TOOLKITS",
          "type": "FACE_TOOLKITS",
          "links": [
            32,
            33
          ],
          "slot_index": 0
        }
      ],
      "properties": {
        "Node name for S&R": "HMFaceToolkitsLoader"
      },
      "widgets_values": [
        0
      ],
      "color": "#232",
      "bgcolor": "#353"
    },
    {
      "id": 14,
      "type": "ImageResize",
      "pos": [
        380,
        470
      ],
      "size": [
        315,
        246
      ],
      "flags": {},
      "order": 4,
      "mode": 0,
      "inputs": [
        {
          "name": "pixels",
          "type": "IMAGE",
          "link": 19
        },
        {
          "name": "mask_optional",
          "type": "MASK",
          "link": null,
          "shape": 7
        }
      ],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
            35,
            36,
            37
          ],
          "slot_index": 0
        },
        {
          "name": "MASK",
          "type": "MASK",
          "links": null
        }
      ],
      "properties": {
        "Node name for S&R": "ImageResize"
      },
      "widgets_values": [
        "crop to ratio",
        0,
        0,
        0,
        "reduce size only",
        "1:1",
        0,
        20
      ],
      "color": "#432",
      "bgcolor": "#653"
    },
    {
      "id": 26,
      "type": "VHS_LoadVideo",
      "pos": [
        386,
        803
      ],
      "size": [
        305.44378662109375,
        436.5621337890625
      ],
      "flags": {},
      "order": 1,
      "mode": 0,
      "inputs": [
        {
          "name": "meta_batch",
          "type": "VHS_BatchManager",
          "link": null,
          "shape": 7
        },
        {
          "name": "vae",
          "type": "VAE",
          "link": null,
          "shape": 7
        }
      ],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
            40
          ],
          "slot_index": 0
        },
        {
          "name": "frame_count",
          "type": "INT",
          "links": null
        },
        {
          "name": "audio",
          "type": "AUDIO",
          "links": null
        },
        {
          "name": "video_info",
          "type": "VHS_VIDEOINFO",
          "links": null
        }
      ],
      "properties": {
        "Node name for S&R": "VHS_LoadVideo"
      },
      "widgets_values": {
        "video": "3762907-uhd_3840_2160_25fps.mp4",
        "force_rate": 16,
        "force_size": "512x?",
        "custom_width": 512,
        "custom_height": 512,
        "frame_load_cap": 48,
        "skip_first_frames": 0,
        "select_every_nth": 1,
        "choose video to upload": "image",
        "videopreview": {
          "hidden": false,
          "paused": false,
          "params": {
            "force_rate": 16,
            "frame_load_cap": 48,
            "skip_first_frames": 0,
            "select_every_nth": 1,
            "filename": "3762907-uhd_3840_2160_25fps.mp4",
            "type": "input",
            "format": "video/mp4"
          },
          "muted": false
        }
      }
    },
    {
      "id": 18,
      "type": "HMVideoPipelineLoader",
      "pos": [
        981,
        161
      ],
      "size": [
        352.79998779296875,
        106
      ],
      "flags": {},
      "order": 2,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "HMVIDEOPIPELINE",
          "type": "HMVIDEOPIPELINE",
          "links": [
            28
          ]
        }
      ],
      "properties": {
        "Node name for S&R": "HMVideoPipelineLoader"
      },
      "widgets_values": [
        "None",
        "None",
        0
      ],
      "color": "#323",
      "bgcolor": "#535"
    },
    {
      "id": 25,
      "type": "VHS_VideoCombine",
      "pos": [
        1762,
        445
      ],
      "size": [
        341.20306396484375,
        645.2030639648438
      ],
      "flags": {},
      "order": 9,
      "mode": 0,
      "inputs": [
        {
          "name": "images",
          "type": "IMAGE",
          "link": 39
        },
        {
          "name": "audio",
          "type": "AUDIO",
          "link": null,
          "shape": 7
        },
        {
          "name": "meta_batch",
          "type": "VHS_BatchManager",
          "link": null,
          "shape": 7
        },
        {
          "name": "vae",
          "type": "VAE",
          "link": null,
          "shape": 7
        }
      ],
      "outputs": [
        {
          "name": "Filenames",
          "type": "VHS_FILENAMES",
          "links": null
        }
      ],
      "properties": {
        "Node name for S&R": "VHS_VideoCombine"
      },
      "widgets_values": {
        "frame_rate": 16,
        "loop_count": 0,
        "filename_prefix": "AnimateDiff",
        "format": "video/h264-mp4",
        "pix_fmt": "yuv420p",
        "crf": 19,
        "save_metadata": true,
        "pingpong": false,
        "save_output": true,
        "videopreview": {
          "hidden": false,
          "paused": false,
          "params": {
            "filename": "AnimateDiff_00033.mp4",
            "subfolder": "",
            "type": "output",
            "format": "video/h264-mp4",
            "frame_rate": 16
          },
          "muted": false
        }
      }
    },
    {
      "id": 17,
      "type": "HMPipelineVideo",
      "pos": [
        1409,
        448
      ],
      "size": [
        315,
        218
      ],
      "flags": {},
      "order": 8,
      "mode": 0,
      "inputs": [
        {
          "name": "pipeline",
          "type": "HMVIDEOPIPELINE",
          "link": 28
        },
        {
          "name": "image",
          "type": "IMAGE",
          "link": 36
        },
        {
          "name": "drive_video_params",
          "type": "DRIVE_VIDEO_PARAMS",
          "link": 29
        }
      ],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
            39
          ],
          "slot_index": 0
        }
      ],
      "properties": {
        "Node name for S&R": "HMPipelineVideo"
      },
      "widgets_values": [
        "(best quality), highly detailed, ultra-detailed, headshot, person, well-placed five sense organs, looking at the viewer, centered composition, sharp focus, realistic skin texture",
        "",
        25,
        1234,
        "fixed",
        2
      ],
      "color": "#323",
      "bgcolor": "#535"
    },
    {
      "id": 4,
      "type": "LoadImage",
      "pos": [
        32,
        471
      ],
      "size": [
        315,
        314
      ],
      "flags": {},
      "order": 3,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
            19
          ],
          "slot_index": 0
        },
        {
          "name": "MASK",
          "type": "MASK",
          "links": null
        }
      ],
      "properties": {
        "Node name for S&R": "LoadImage"
      },
      "widgets_values": [
        "pexels-photo-28252721.jpg",
        "image"
      ]
    }
  ],
  "links": [
    [
      19,
      4,
      0,
      14,
      0,
      "IMAGE"
    ],
    [
      28,
      18,
      0,
      17,
      0,
      "HMVIDEOPIPELINE"
    ],
    [
      29,
      19,
      0,
      17,
      2,
      "DRIVE_VIDEO_PARAMS"
    ],
    [
      31,
      21,
      0,
      19,
      2,
      "REFRT"
    ],
    [
      32,
      22,
      0,
      21,
      0,
      "FACE_TOOLKITS"
    ],
    [
      33,
      22,
      0,
      19,
      0,
      "FACE_TOOLKITS"
    ],
    [
      35,
      14,
      0,
      21,
      1,
      "IMAGE"
    ],
    [
      36,
      14,
      0,
      17,
      1,
      "IMAGE"
    ],
    [
      37,
      14,
      0,
      24,
      0,
      "IMAGE"
    ],
    [
      39,
      17,
      0,
      25,
      0,
      "IMAGE"
    ],
    [
      40,
      26,
      0,
      19,
      1,
      "IMAGE"
    ]
  ],
  "groups": [],
  "config": {},
  "extra": {
    "ds": {
      "scale": 0.7513148009015778,
      "offset": [
        93.6229236924027,
        107.24457527189101
      ]
    }
  },
  "version": 0.4
}

它们从参考视频中提取出相当于“头的方向”或“表情变化”的信号，并将其作为条件传递给扩散模型。所做的事情接近于一边用 ControlNet 固定姿势和构图一边生成图像，就像指定“希望用这个动作重绘这个角色的脸”一样。

视频生成模型基 Talking Head

在更新的一代中，出现了以视频生成模型本身为基础的 Talking Head / Avatar 模型。OmniAvatar 和 Wan-Animate 就属于这一类。

Wan-Animate

Wan-Animate 是一种输入角色图像和“带有动作的参考视频”，让角色像临摹该动作一样动起来的模型。

走向 Human Motion Transfer

当 Talking Head 技术能够稳定处理脸部周围时，自然就会产生“也想驱动上半身乃至全身”的需求。

像 Thin-Plate Spline 这样的早期技术，原本就不只用于脸部，也能应用于全身，而且 Wan-Animate 也能完美地处理全身，所以感觉没有必要特意与 Talking Head 区分开来。不过，Human Motion Transfer 也有其独自的发展历程，所以还是稍微了解一下吧。

→ Human Motion Transfer

Talking Head

什么是 Talking Head？

变形基 Talking Head

Thin-Plate Spline Motion Model for Image Animation

LivePortrait

扩散模型基 Talking Head

视频生成模型基 Talking Head

走向 Human Motion Transfer

什么是 JSON 复制按钮？

这个页面有问题！

请补充讲解！

感想 / 其他

感谢！

Talking Head

什么是 Talking Head？

变形基 Talking Head

Thin-Plate Spline Motion Model for Image Animation

LivePortrait

扩散模型基 Talking Head

视频生成模型基 Talking Head

走向 Human Motion Transfer

相关工作流