Human Motion Transfer

什么是 Human Motion Transfer？

Human Motion Transfer 是将其他视频中的全身动作转移到一张人物图像（或角色图像）上的技术。

我想，最常见的用法是借用舞蹈视频或走路视频中的“动作”，让自己的角色来表演。

talking head 主要以“脸部到上半身”为对象，细致地匹配表情和嘴部动作；与之相对，Human Motion Transfer 主要处理全身的姿势。

Animate Anyone 之后的潮流

虽然 BDMM 等转移动作的研究以前就已存在，但在图像生成 AI 社区中并没有广泛传播；让这个任务广为人知的应该是 Animate Anyone。

它的概念是：以一张人物图像和另一个人物的舞蹈视频等作为输入，生成“该角色做同样动作的全身视频”。当时流传了许多演示视频。

不过 Animate Anyone 本身并没有开源，因此作为实际可以上手使用的模型，出现了以 Stable Video Diffusion 为基础、试图重现其效果的 MimicMotion 等模型。

MimicMotion.json

{
  "last_node_id": 41,
  "last_link_id": 57,
  "nodes": [
    {
      "id": 6,
      "type": "MimicMotionDecode",
      "pos": {
        "0": 1840,
        "1": 90
      },
      "size": {
        "0": 245.6194305419922,
        "1": 78
      },
      "flags": {},
      "order": 10,
      "mode": 0,
      "inputs": [
        {
          "name": "mimic_pipeline",
          "type": "MIMICPIPE",
          "link": 8
        },
        {
          "name": "samples",
          "type": "LATENT",
          "link": 6
        }
      ],
      "outputs": [
        {
          "name": "images",
          "type": "IMAGE",
          "links": [
            23
          ],
          "slot_index": 0,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "MimicMotionDecode"
      },
      "widgets_values": [
        4
      ],
      "color": "#232",
      "bgcolor": "#353"
    },
    {
      "id": 21,
      "type": "VHS_VideoCombine",
      "pos": {
        "0": 2110,
        "1": 90
      },
      "size": [
        420,
        895.4285714285714
      ],
      "flags": {},
      "order": 11,
      "mode": 0,
      "inputs": [
        {
          "name": "images",
          "type": "IMAGE",
          "link": 23
        },
        {
          "name": "audio",
          "type": "AUDIO",
          "link": null
        },
        {
          "name": "meta_batch",
          "type": "VHS_BatchManager",
          "link": null
        },
        {
          "name": "vae",
          "type": "VAE",
          "link": null
        }
      ],
      "outputs": [
        {
          "name": "Filenames",
          "type": "VHS_FILENAMES",
          "links": null,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "VHS_VideoCombine"
      },
      "widgets_values": {
        "frame_rate": 15,
        "loop_count": 0,
        "filename_prefix": "AnimateDiff",
        "format": "video/h264-mp4",
        "pix_fmt": "yuv420p",
        "crf": 19,
        "save_metadata": true,
        "pingpong": false,
        "save_output": false,
        "videopreview": {
          "hidden": false,
          "paused": false,
          "params": {
            "filename": "AnimateDiff_00014.mp4",
            "subfolder": "",
            "type": "temp",
            "format": "video/h264-mp4",
            "frame_rate": 15
          },
          "muted": false
        }
      }
    },
    {
      "id": 11,
      "type": "VHS_LoadVideo",
      "pos": {
        "0": 111,
        "1": 617
      },
      "size": [
        253.279296875,
        713.398681640625
      ],
      "flags": {},
      "order": 0,
      "mode": 0,
      "inputs": [
        {
          "name": "meta_batch",
          "type": "VHS_BatchManager",
          "link": null
        },
        {
          "name": "vae",
          "type": "VAE",
          "link": null
        }
      ],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
            20
          ],
          "slot_index": 0,
          "shape": 3
        },
        {
          "name": "frame_count",
          "type": "INT",
          "links": null,
          "shape": 3
        },
        {
          "name": "audio",
          "type": "AUDIO",
          "links": null,
          "shape": 3
        },
        {
          "name": "video_info",
          "type": "VHS_VIDEOINFO",
          "links": null,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "VHS_LoadVideo"
      },
      "widgets_values": {
        "video": "TikTok.mp4",
        "force_rate": 15,
        "force_size": "Disabled",
        "custom_width": 512,
        "custom_height": 512,
        "frame_load_cap": 72,
        "skip_first_frames": 0,
        "select_every_nth": 1,
        "choose video to upload": "image",
        "videopreview": {
          "hidden": false,
          "paused": false,
          "params": {
            "frame_load_cap": 72,
            "skip_first_frames": 0,
            "force_rate": 15,
            "filename": "TikTok.mp4",
            "type": "input",
            "format": "video/mp4",
            "select_every_nth": 1
          },
          "muted": true
        }
      }
    },
    {
      "id": 2,
      "type": "MimicMotionGetPoses",
      "pos": {
        "0": 1070,
        "1": 330
      },
      "size": {
        "0": 330,
        "1": 126
      },
      "flags": {},
      "order": 8,
      "mode": 0,
      "inputs": [
        {
          "name": "ref_image",
          "type": "IMAGE",
          "link": 45
        },
        {
          "name": "pose_images",
          "type": "IMAGE",
          "link": 18
        }
      ],
      "outputs": [
        {
          "name": "poses_with_ref",
          "type": "IMAGE",
          "links": [
            46
          ],
          "slot_index": 0,
          "shape": 3
        },
        {
          "name": "pose_images",
          "type": "IMAGE",
          "links": [],
          "slot_index": 1,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "MimicMotionGetPoses"
      },
      "widgets_values": [
        true,
        true,
        true
      ],
      "color": "#323",
      "bgcolor": "#535"
    },
    {
      "id": 25,
      "type": "ImageScale",
      "pos": {
        "0": 750,
        "1": 240
      },
      "size": {
        "0": 261.69830322265625,
        "1": 122
      },
      "flags": {},
      "order": 6,
      "mode": 0,
      "inputs": [
        {
          "name": "image",
          "type": "IMAGE",
          "link": 29
        },
        {
          "name": "width",
          "type": "INT",
          "link": 34,
          "widget": {
            "name": "width"
          }
        },
        {
          "name": "height",
          "type": "INT",
          "link": 35,
          "widget": {
            "name": "height"
          }
        }
      ],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
            45,
            47
          ],
          "slot_index": 0,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "ImageScale"
      },
      "widgets_values": [
        "nearest-exact",
        448,
        640,
        "center"
      ]
    },
    {
      "id": 18,
      "type": "ImageScale",
      "pos": {
        "0": 750,
        "1": 430
      },
      "size": {
        "0": 259.49639892578125,
        "1": 122
      },
      "flags": {},
      "order": 7,
      "mode": 0,
      "inputs": [
        {
          "name": "image",
          "type": "IMAGE",
          "link": 20
        },
        {
          "name": "width",
          "type": "INT",
          "link": 36,
          "widget": {
            "name": "width"
          }
        },
        {
          "name": "height",
          "type": "INT",
          "link": 37,
          "widget": {
            "name": "height"
          }
        }
      ],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
            18
          ],
          "slot_index": 0,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "ImageScale"
      },
      "widgets_values": [
        "nearest-exact",
        448,
        640,
        "center"
      ]
    },
    {
      "id": 4,
      "type": "DiffusersScheduler",
      "pos": {
        "0": 1072,
        "1": 518
      },
      "size": {
        "0": 330.98272705078125,
        "1": 130
      },
      "flags": {},
      "order": 1,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "scheduler",
          "type": "DIFFUSERS_SCHEDULER",
          "links": [
            4
          ],
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "DiffusersScheduler"
      },
      "widgets_values": [
        "EulerDiscreteScheduler",
        0.002,
        700,
        false
      ],
      "color": "#232",
      "bgcolor": "#353"
    },
    {
      "id": 9,
      "type": "LoadImage",
      "pos": {
        "0": 118,
        "1": 46
      },
      "size": {
        "0": 260.6280212402344,
        "1": 482.00115966796875
      },
      "flags": {},
      "order": 2,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
            29
          ],
          "slot_index": 0,
          "shape": 3
        },
        {
          "name": "MASK",
          "type": "MASK",
          "links": null,
          "slot_index": 1,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "LoadImage"
      },
      "widgets_values": [
        "pexels-photo-2989688.jpg",
        "image"
      ]
    },
    {
      "id": 26,
      "type": "PrimitiveNode",
      "pos": {
        "0": 450,
        "1": 290
      },
      "size": {
        "0": 210,
        "1": 82
      },
      "flags": {},
      "order": 3,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "INT",
          "type": "INT",
          "links": [
            34,
            36
          ],
          "widget": {
            "name": "width"
          }
        }
      ],
      "properties": {
        "Run widget replace on values": false
      },
      "widgets_values": [
        448,
        "fixed"
      ]
    },
    {
      "id": 29,
      "type": "PrimitiveNode",
      "pos": {
        "0": 450,
        "1": 420
      },
      "size": {
        "0": 212.04299926757812,
        "1": 82
      },
      "flags": {},
      "order": 4,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "INT",
          "type": "INT",
          "links": [
            35,
            37
          ],
          "widget": {
            "name": "height"
          }
        }
      ],
      "properties": {
        "Run widget replace on values": false
      },
      "widgets_values": [
        640,
        "fixed"
      ]
    },
    {
      "id": 3,
      "type": "DownloadAndLoadMimicMotionModel",
      "pos": {
        "0": 1090,
        "1": 90
      },
      "size": {
        "0": 299.5453796386719,
        "1": 82
      },
      "flags": {},
      "order": 5,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "mimic_pipeline",
          "type": "MIMICPIPE",
          "links": [
            3,
            8
          ],
          "slot_index": 0,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "DownloadAndLoadMimicMotionModel"
      },
      "widgets_values": [
        "MimicMotionMergedUnet_1-1-fp16.safetensors",
        "bf16"
      ],
      "color": "#232",
      "bgcolor": "#353"
    },
    {
      "id": 1,
      "type": "MimicMotionSampler",
      "pos": {
        "0": 1460,
        "1": 170
      },
      "size": {
        "0": 330,
        "1": 430
      },
      "flags": {},
      "order": 9,
      "mode": 0,
      "inputs": [
        {
          "name": "mimic_pipeline",
          "type": "MIMICPIPE",
          "link": 3
        },
        {
          "name": "ref_image",
          "type": "IMAGE",
          "link": 47
        },
        {
          "name": "pose_images",
          "type": "IMAGE",
          "link": 46
        },
        {
          "name": "optional_scheduler",
          "type": "DIFFUSERS_SCHEDULER",
          "link": 4
        }
      ],
      "outputs": [
        {
          "name": "samples",
          "type": "LATENT",
          "links": [
            6
          ],
          "slot_index": 0,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "MimicMotionSampler"
      },
      "widgets_values": [
        25,
        2,
        2,
        1234,
        "fixed",
        15,
        0,
        72,
        36,
        false,
        1,
        0,
        1,
        1
      ],
      "color": "#232",
      "bgcolor": "#353"
    }
  ],
  "links": [
    [
      3,
      3,
      0,
      1,
      0,
      "MIMICPIPE"
    ],
    [
      4,
      4,
      0,
      1,
      3,
      "DIFFUSERS_SCHEDULER"
    ],
    [
      6,
      1,
      0,
      6,
      1,
      "LATENT"
    ],
    [
      8,
      3,
      0,
      6,
      0,
      "MIMICPIPE"
    ],
    [
      18,
      18,
      0,
      2,
      1,
      "IMAGE"
    ],
    [
      20,
      11,
      0,
      18,
      0,
      "IMAGE"
    ],
    [
      23,
      6,
      0,
      21,
      0,
      "IMAGE"
    ],
    [
      29,
      9,
      0,
      25,
      0,
      "IMAGE"
    ],
    [
      34,
      26,
      0,
      25,
      1,
      "INT"
    ],
    [
      35,
      29,
      0,
      25,
      2,
      "INT"
    ],
    [
      36,
      26,
      0,
      18,
      1,
      "INT"
    ],
    [
      37,
      29,
      0,
      18,
      2,
      "INT"
    ],
    [
      45,
      25,
      0,
      2,
      0,
      "IMAGE"
    ],
    [
      46,
      2,
      0,
      1,
      2,
      "IMAGE"
    ],
    [
      47,
      25,
      0,
      1,
      1,
      "IMAGE"
    ]
  ],
  "groups": [],
  "config": {},
  "extra": {
    "ds": {
      "scale": 0.5644739300537777,
      "offset": [
        47.979942073719656,
        120.46737166341701
      ]
    }
  },
  "version": 0.4
}

DiT 世代和 Wan-Animate

随着基于 DiT 的视频生成模型的登场，Human Motion Transfer 也在顺理成章地进化。

Wan2.1 VACE

Wan2.1 中有一种被称为 VACE 的机制。

VACE 是一个可以在视频生成中统一处理 ControlNet、reference2video、inpainting 的框架。通过组合 ControlNet Pose 和 reference2video 式的操作，可以实现接近 Human Motion Transfer 的效果。

与其说是专用的 Human Motion Transfer 模型，不如说是作为“以 Wan2.1 为基础，使用姿势和参考视频控制动作的平台”来使用。

Wan-Animate

更专注于动作转移的模型是 Wan-Animate。

Wan-Animate

输入角色图像和带有动作的驱动视频，可以转移全身的动作。

不仅是全身的姿势，还可以使用“面部特写视频”作为驱动，因此其特点是可以同时覆盖 talking head 式的用法和 Human Motion Transfer 式的用法。

Human Motion Transfer

什么是 Human Motion Transfer？

Animate Anyone 之后的潮流

DiT 世代和 Wan-Animate

Wan2.1 VACE

Wan-Animate

什么是 JSON 复制按钮？

这个页面有问题！

请补充讲解！

感想 / 其他

感谢！

Human Motion Transfer

什么是 Human Motion Transfer？

Animate Anyone 之后的潮流

DiT 世代和 Wan-Animate

Wan2.1 VACE

Wan-Animate

相关工作流