Human Motion Transferとは?
Human Motion Transfer は、1枚の人物画像(またはキャラ画像)に、別の動画の全身モーションを移し替える技術です。
ダンス動画やウォーキング動画の「動きだけ」を借りてきて、自分のキャラクターに演じてもらう、といった使い方が多いと思います。
talking head が主に「顔〜上半身」を対象に、表情や口の動きを細かく合わせるのに対し、Human Motion Transfer は全身のポーズを中心に扱います。
Animate Anyone以降の流れ
BDMM など、以前からモーションを転送する研究は存在していましたが、画像生成AIコミュニティでこのタスクを広く知らしめたのは Animate Anyone でしょう。
1枚の人物画像と、別の人物のダンス動画などを入力にし、「そのキャラが同じ動きをするフルボディ動画」を生成するコンセプトで、多くのデモ動画が出回りました。
ただし Animate Anyone 自体はオープンソースではなかったため、実際に触れるモデルとしては、Stable Video Diffusion をベースに再現を試みた MimicMotion といったモデルが登場します。
以下は、MimicMotion を ComfyUI で動かすワークフローの例です。
{
"last_node_id": 41,
"last_link_id": 57,
"nodes": [
{
"id": 6,
"type": "MimicMotionDecode",
"pos": {
"0": 1840,
"1": 90
},
"size": {
"0": 245.6194305419922,
"1": 78
},
"flags": {},
"order": 10,
"mode": 0,
"inputs": [
{
"name": "mimic_pipeline",
"type": "MIMICPIPE",
"link": 8
},
{
"name": "samples",
"type": "LATENT",
"link": 6
}
],
"outputs": [
{
"name": "images",
"type": "IMAGE",
"links": [
23
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "MimicMotionDecode"
},
"widgets_values": [
4
],
"color": "#232",
"bgcolor": "#353"
},
{
"id": 21,
"type": "VHS_VideoCombine",
"pos": {
"0": 2110,
"1": 90
},
"size": [
420,
895.4285714285714
],
"flags": {},
"order": 11,
"mode": 0,
"inputs": [
{
"name": "images",
"type": "IMAGE",
"link": 23
},
{
"name": "audio",
"type": "AUDIO",
"link": null
},
{
"name": "meta_batch",
"type": "VHS_BatchManager",
"link": null
},
{
"name": "vae",
"type": "VAE",
"link": null
}
],
"outputs": [
{
"name": "Filenames",
"type": "VHS_FILENAMES",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "VHS_VideoCombine"
},
"widgets_values": {
"frame_rate": 15,
"loop_count": 0,
"filename_prefix": "AnimateDiff",
"format": "video/h264-mp4",
"pix_fmt": "yuv420p",
"crf": 19,
"save_metadata": true,
"pingpong": false,
"save_output": false,
"videopreview": {
"hidden": false,
"paused": false,
"params": {
"filename": "AnimateDiff_00014.mp4",
"subfolder": "",
"type": "temp",
"format": "video/h264-mp4",
"frame_rate": 15
},
"muted": false
}
}
},
{
"id": 11,
"type": "VHS_LoadVideo",
"pos": {
"0": 111,
"1": 617
},
"size": [
253.279296875,
713.398681640625
],
"flags": {},
"order": 0,
"mode": 0,
"inputs": [
{
"name": "meta_batch",
"type": "VHS_BatchManager",
"link": null
},
{
"name": "vae",
"type": "VAE",
"link": null
}
],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
20
],
"slot_index": 0,
"shape": 3
},
{
"name": "frame_count",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "audio",
"type": "AUDIO",
"links": null,
"shape": 3
},
{
"name": "video_info",
"type": "VHS_VIDEOINFO",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "VHS_LoadVideo"
},
"widgets_values": {
"video": "TikTok.mp4",
"force_rate": 15,
"force_size": "Disabled",
"custom_width": 512,
"custom_height": 512,
"frame_load_cap": 72,
"skip_first_frames": 0,
"select_every_nth": 1,
"choose video to upload": "image",
"videopreview": {
"hidden": false,
"paused": false,
"params": {
"frame_load_cap": 72,
"skip_first_frames": 0,
"force_rate": 15,
"filename": "TikTok.mp4",
"type": "input",
"format": "video/mp4",
"select_every_nth": 1
},
"muted": true
}
}
},
{
"id": 2,
"type": "MimicMotionGetPoses",
"pos": {
"0": 1070,
"1": 330
},
"size": {
"0": 330,
"1": 126
},
"flags": {},
"order": 8,
"mode": 0,
"inputs": [
{
"name": "ref_image",
"type": "IMAGE",
"link": 45
},
{
"name": "pose_images",
"type": "IMAGE",
"link": 18
}
],
"outputs": [
{
"name": "poses_with_ref",
"type": "IMAGE",
"links": [
46
],
"slot_index": 0,
"shape": 3
},
{
"name": "pose_images",
"type": "IMAGE",
"links": [],
"slot_index": 1,
"shape": 3
}
],
"properties": {
"Node name for S&R": "MimicMotionGetPoses"
},
"widgets_values": [
true,
true,
true
],
"color": "#323",
"bgcolor": "#535"
},
{
"id": 25,
"type": "ImageScale",
"pos": {
"0": 750,
"1": 240
},
"size": {
"0": 261.69830322265625,
"1": 122
},
"flags": {},
"order": 6,
"mode": 0,
"inputs": [
{
"name": "image",
"type": "IMAGE",
"link": 29
},
{
"name": "width",
"type": "INT",
"link": 34,
"widget": {
"name": "width"
}
},
{
"name": "height",
"type": "INT",
"link": 35,
"widget": {
"name": "height"
}
}
],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
45,
47
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "ImageScale"
},
"widgets_values": [
"nearest-exact",
448,
640,
"center"
]
},
{
"id": 18,
"type": "ImageScale",
"pos": {
"0": 750,
"1": 430
},
"size": {
"0": 259.49639892578125,
"1": 122
},
"flags": {},
"order": 7,
"mode": 0,
"inputs": [
{
"name": "image",
"type": "IMAGE",
"link": 20
},
{
"name": "width",
"type": "INT",
"link": 36,
"widget": {
"name": "width"
}
},
{
"name": "height",
"type": "INT",
"link": 37,
"widget": {
"name": "height"
}
}
],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
18
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "ImageScale"
},
"widgets_values": [
"nearest-exact",
448,
640,
"center"
]
},
{
"id": 4,
"type": "DiffusersScheduler",
"pos": {
"0": 1072,
"1": 518
},
"size": {
"0": 330.98272705078125,
"1": 130
},
"flags": {},
"order": 1,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "scheduler",
"type": "DIFFUSERS_SCHEDULER",
"links": [
4
],
"shape": 3
}
],
"properties": {
"Node name for S&R": "DiffusersScheduler"
},
"widgets_values": [
"EulerDiscreteScheduler",
0.002,
700,
false
],
"color": "#232",
"bgcolor": "#353"
},
{
"id": 9,
"type": "LoadImage",
"pos": {
"0": 118,
"1": 46
},
"size": {
"0": 260.6280212402344,
"1": 482.00115966796875
},
"flags": {},
"order": 2,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
29
],
"slot_index": 0,
"shape": 3
},
{
"name": "MASK",
"type": "MASK",
"links": null,
"slot_index": 1,
"shape": 3
}
],
"properties": {
"Node name for S&R": "LoadImage"
},
"widgets_values": [
"pexels-photo-2989688.jpg",
"image"
]
},
{
"id": 26,
"type": "PrimitiveNode",
"pos": {
"0": 450,
"1": 290
},
"size": {
"0": 210,
"1": 82
},
"flags": {},
"order": 3,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "INT",
"type": "INT",
"links": [
34,
36
],
"widget": {
"name": "width"
}
}
],
"properties": {
"Run widget replace on values": false
},
"widgets_values": [
448,
"fixed"
]
},
{
"id": 29,
"type": "PrimitiveNode",
"pos": {
"0": 450,
"1": 420
},
"size": {
"0": 212.04299926757812,
"1": 82
},
"flags": {},
"order": 4,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "INT",
"type": "INT",
"links": [
35,
37
],
"widget": {
"name": "height"
}
}
],
"properties": {
"Run widget replace on values": false
},
"widgets_values": [
640,
"fixed"
]
},
{
"id": 3,
"type": "DownloadAndLoadMimicMotionModel",
"pos": {
"0": 1090,
"1": 90
},
"size": {
"0": 299.5453796386719,
"1": 82
},
"flags": {},
"order": 5,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "mimic_pipeline",
"type": "MIMICPIPE",
"links": [
3,
8
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "DownloadAndLoadMimicMotionModel"
},
"widgets_values": [
"MimicMotionMergedUnet_1-1-fp16.safetensors",
"bf16"
],
"color": "#232",
"bgcolor": "#353"
},
{
"id": 1,
"type": "MimicMotionSampler",
"pos": {
"0": 1460,
"1": 170
},
"size": {
"0": 330,
"1": 430
},
"flags": {},
"order": 9,
"mode": 0,
"inputs": [
{
"name": "mimic_pipeline",
"type": "MIMICPIPE",
"link": 3
},
{
"name": "ref_image",
"type": "IMAGE",
"link": 47
},
{
"name": "pose_images",
"type": "IMAGE",
"link": 46
},
{
"name": "optional_scheduler",
"type": "DIFFUSERS_SCHEDULER",
"link": 4
}
],
"outputs": [
{
"name": "samples",
"type": "LATENT",
"links": [
6
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "MimicMotionSampler"
},
"widgets_values": [
25,
2,
2,
1234,
"fixed",
15,
0,
72,
36,
false,
1,
0,
1,
1
],
"color": "#232",
"bgcolor": "#353"
}
],
"links": [
[
3,
3,
0,
1,
0,
"MIMICPIPE"
],
[
4,
4,
0,
1,
3,
"DIFFUSERS_SCHEDULER"
],
[
6,
1,
0,
6,
1,
"LATENT"
],
[
8,
3,
0,
6,
0,
"MIMICPIPE"
],
[
18,
18,
0,
2,
1,
"IMAGE"
],
[
20,
11,
0,
18,
0,
"IMAGE"
],
[
23,
6,
0,
21,
0,
"IMAGE"
],
[
29,
9,
0,
25,
0,
"IMAGE"
],
[
34,
26,
0,
25,
1,
"INT"
],
[
35,
29,
0,
25,
2,
"INT"
],
[
36,
26,
0,
18,
1,
"INT"
],
[
37,
29,
0,
18,
2,
"INT"
],
[
45,
25,
0,
2,
0,
"IMAGE"
],
[
46,
2,
0,
1,
2,
"IMAGE"
],
[
47,
25,
0,
1,
1,
"IMAGE"
]
],
"groups": [],
"config": {},
"extra": {
"ds": {
"scale": 0.5644739300537777,
"offset": [
47.979942073719656,
120.46737166341701
]
}
},
"version": 0.4
}
DiT世代とWan-Animate
DiTベースの動画生成モデルの登場によって、Human Motion Transfer も順当に進化しています。
Wan2.1 VACE
Wan2.1 には、VACE と呼ばれる仕組みがあります。
VACE は、動画生成において ControlNet、reference2video、inpainting をまとめて扱えるフレームワークです。
ControlNet Pose と reference2video 的な操作を組み合わせることで、Human Motion Transfer に近いことができます。
専用の Human Motion Transfer モデルというよりは、「Wan2.1 をベースに、ポーズと参照動画を使って動きをコントロールするための土台」として使われます。
Wan-Animate
よりモーション転送に特化したモデルが、Wan-Animate です。

Wan-Animate
キャラクター画像と、動きを持ったドライバー動画を入力にして、フルボディのモーションを転送することができます。
全身のポーズだけでなく、「顔のアップ動画」をドライバーとして使うこともできるため、talking head 的な使い方と、Human Motion Transfer 的な使い方の両方をカバーできるのが特徴です。