Human Motion Transfer

Human Motion Transferとは？

Human Motion Transfer は、1枚の人物画像（またはキャラ画像）に、別の動画の全身モーションを移し替える技術です。

ダンス動画やウォーキング動画の「動きだけ」を借りてきて、自分のキャラクターに演じてもらう、といった使い方が多いと思います。

talking head が主に「顔〜上半身」を対象に、表情や口の動きを細かく合わせるのに対し、Human Motion Transfer は「全身のポーズ」を中心に扱います。

Animate Anyone以降の流れ

BDMM など、以前からモーションを転送する研究は存在していましたが、画像生成AIコミュニティでこのタスクを広く知らしめたのは Animate Anyone でしょう。

1枚の人物画像と、別の人物のダンス動画などを入力にし、「そのキャラが同じ動きをするフルボディ動画」を生成するコンセプトで、多くのデモ動画が出回りました。

ただし Animate Anyone 自体はオープンソースではなかったため、実際に手元で動かして試せるものとしては、Stable Video Diffusion をベースに再現を試みた MimicMotion などのモデルが登場しました。

MimicMotion.json

{
  "last_node_id": 41,
  "last_link_id": 57,
  "nodes": [
    {
      "id": 6,
      "type": "MimicMotionDecode",
      "pos": {
        "0": 1840,
        "1": 90
      },
      "size": {
        "0": 245.6194305419922,
        "1": 78
      },
      "flags": {},
      "order": 10,
      "mode": 0,
      "inputs": [
        {
          "name": "mimic_pipeline",
          "type": "MIMICPIPE",
          "link": 8
        },
        {
          "name": "samples",
          "type": "LATENT",
          "link": 6
        }
      ],
      "outputs": [
        {
          "name": "images",
          "type": "IMAGE",
          "links": [
            23
          ],
          "slot_index": 0,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "MimicMotionDecode"
      },
      "widgets_values": [
        4
      ],
      "color": "#232",
      "bgcolor": "#353"
    },
    {
      "id": 21,
      "type": "VHS_VideoCombine",
      "pos": {
        "0": 2110,
        "1": 90
      },
      "size": [
        420,
        895.4285714285714
      ],
      "flags": {},
      "order": 11,
      "mode": 0,
      "inputs": [
        {
          "name": "images",
          "type": "IMAGE",
          "link": 23
        },
        {
          "name": "audio",
          "type": "AUDIO",
          "link": null
        },
        {
          "name": "meta_batch",
          "type": "VHS_BatchManager",
          "link": null
        },
        {
          "name": "vae",
          "type": "VAE",
          "link": null
        }
      ],
      "outputs": [
        {
          "name": "Filenames",
          "type": "VHS_FILENAMES",
          "links": null,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "VHS_VideoCombine"
      },
      "widgets_values": {
        "frame_rate": 15,
        "loop_count": 0,
        "filename_prefix": "AnimateDiff",
        "format": "video/h264-mp4",
        "pix_fmt": "yuv420p",
        "crf": 19,
        "save_metadata": true,
        "pingpong": false,
        "save_output": false,
        "videopreview": {
          "hidden": false,
          "paused": false,
          "params": {
            "filename": "AnimateDiff_00014.mp4",
            "subfolder": "",
            "type": "temp",
            "format": "video/h264-mp4",
            "frame_rate": 15
          },
          "muted": false
        }
      }
    },
    {
      "id": 11,
      "type": "VHS_LoadVideo",
      "pos": {
        "0": 111,
        "1": 617
      },
      "size": [
        253.279296875,
        713.398681640625
      ],
      "flags": {},
      "order": 0,
      "mode": 0,
      "inputs": [
        {
          "name": "meta_batch",
          "type": "VHS_BatchManager",
          "link": null
        },
        {
          "name": "vae",
          "type": "VAE",
          "link": null
        }
      ],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
            20
          ],
          "slot_index": 0,
          "shape": 3
        },
        {
          "name": "frame_count",
          "type": "INT",
          "links": null,
          "shape": 3
        },
        {
          "name": "audio",
          "type": "AUDIO",
          "links": null,
          "shape": 3
        },
        {
          "name": "video_info",
          "type": "VHS_VIDEOINFO",
          "links": null,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "VHS_LoadVideo"
      },
      "widgets_values": {
        "video": "TikTok.mp4",
        "force_rate": 15,
        "force_size": "Disabled",
        "custom_width": 512,
        "custom_height": 512,
        "frame_load_cap": 72,
        "skip_first_frames": 0,
        "select_every_nth": 1,
        "choose video to upload": "image",
        "videopreview": {
          "hidden": false,
          "paused": false,
          "params": {
            "frame_load_cap": 72,
            "skip_first_frames": 0,
            "force_rate": 15,
            "filename": "TikTok.mp4",
            "type": "input",
            "format": "video/mp4",
            "select_every_nth": 1
          },
          "muted": true
        }
      }
    },
    {
      "id": 2,
      "type": "MimicMotionGetPoses",
      "pos": {
        "0": 1070,
        "1": 330
      },
      "size": {
        "0": 330,
        "1": 126
      },
      "flags": {},
      "order": 8,
      "mode": 0,
      "inputs": [
        {
          "name": "ref_image",
          "type": "IMAGE",
          "link": 45
        },
        {
          "name": "pose_images",
          "type": "IMAGE",
          "link": 18
        }
      ],
      "outputs": [
        {
          "name": "poses_with_ref",
          "type": "IMAGE",
          "links": [
            46
          ],
          "slot_index": 0,
          "shape": 3
        },
        {
          "name": "pose_images",
          "type": "IMAGE",
          "links": [],
          "slot_index": 1,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "MimicMotionGetPoses"
      },
      "widgets_values": [
        true,
        true,
        true
      ],
      "color": "#323",
      "bgcolor": "#535"
    },
    {
      "id": 25,
      "type": "ImageScale",
      "pos": {
        "0": 750,
        "1": 240
      },
      "size": {
        "0": 261.69830322265625,
        "1": 122
      },
      "flags": {},
      "order": 6,
      "mode": 0,
      "inputs": [
        {
          "name": "image",
          "type": "IMAGE",
          "link": 29
        },
        {
          "name": "width",
          "type": "INT",
          "link": 34,
          "widget": {
            "name": "width"
          }
        },
        {
          "name": "height",
          "type": "INT",
          "link": 35,
          "widget": {
            "name": "height"
          }
        }
      ],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
            45,
            47
          ],
          "slot_index": 0,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "ImageScale"
      },
      "widgets_values": [
        "nearest-exact",
        448,
        640,
        "center"
      ]
    },
    {
      "id": 18,
      "type": "ImageScale",
      "pos": {
        "0": 750,
        "1": 430
      },
      "size": {
        "0": 259.49639892578125,
        "1": 122
      },
      "flags": {},
      "order": 7,
      "mode": 0,
      "inputs": [
        {
          "name": "image",
          "type": "IMAGE",
          "link": 20
        },
        {
          "name": "width",
          "type": "INT",
          "link": 36,
          "widget": {
            "name": "width"
          }
        },
        {
          "name": "height",
          "type": "INT",
          "link": 37,
          "widget": {
            "name": "height"
          }
        }
      ],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
            18
          ],
          "slot_index": 0,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "ImageScale"
      },
      "widgets_values": [
        "nearest-exact",
        448,
        640,
        "center"
      ]
    },
    {
      "id": 4,
      "type": "DiffusersScheduler",
      "pos": {
        "0": 1072,
        "1": 518
      },
      "size": {
        "0": 330.98272705078125,
        "1": 130
      },
      "flags": {},
      "order": 1,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "scheduler",
          "type": "DIFFUSERS_SCHEDULER",
          "links": [
            4
          ],
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "DiffusersScheduler"
      },
      "widgets_values": [
        "EulerDiscreteScheduler",
        0.002,
        700,
        false
      ],
      "color": "#232",
      "bgcolor": "#353"
    },
    {
      "id": 9,
      "type": "LoadImage",
      "pos": {
        "0": 118,
        "1": 46
      },
      "size": {
        "0": 260.6280212402344,
        "1": 482.00115966796875
      },
      "flags": {},
      "order": 2,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
            29
          ],
          "slot_index": 0,
          "shape": 3
        },
        {
          "name": "MASK",
          "type": "MASK",
          "links": null,
          "slot_index": 1,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "LoadImage"
      },
      "widgets_values": [
        "pexels-photo-2989688.jpg",
        "image"
      ]
    },
    {
      "id": 26,
      "type": "PrimitiveNode",
      "pos": {
        "0": 450,
        "1": 290
      },
      "size": {
        "0": 210,
        "1": 82
      },
      "flags": {},
      "order": 3,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "INT",
          "type": "INT",
          "links": [
            34,
            36
          ],
          "widget": {
            "name": "width"
          }
        }
      ],
      "properties": {
        "Run widget replace on values": false
      },
      "widgets_values": [
        448,
        "fixed"
      ]
    },
    {
      "id": 29,
      "type": "PrimitiveNode",
      "pos": {
        "0": 450,
        "1": 420
      },
      "size": {
        "0": 212.04299926757812,
        "1": 82
      },
      "flags": {},
      "order": 4,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "INT",
          "type": "INT",
          "links": [
            35,
            37
          ],
          "widget": {
            "name": "height"
          }
        }
      ],
      "properties": {
        "Run widget replace on values": false
      },
      "widgets_values": [
        640,
        "fixed"
      ]
    },
    {
      "id": 3,
      "type": "DownloadAndLoadMimicMotionModel",
      "pos": {
        "0": 1090,
        "1": 90
      },
      "size": {
        "0": 299.5453796386719,
        "1": 82
      },
      "flags": {},
      "order": 5,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "mimic_pipeline",
          "type": "MIMICPIPE",
          "links": [
            3,
            8
          ],
          "slot_index": 0,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "DownloadAndLoadMimicMotionModel"
      },
      "widgets_values": [
        "MimicMotionMergedUnet_1-1-fp16.safetensors",
        "bf16"
      ],
      "color": "#232",
      "bgcolor": "#353"
    },
    {
      "id": 1,
      "type": "MimicMotionSampler",
      "pos": {
        "0": 1460,
        "1": 170
      },
      "size": {
        "0": 330,
        "1": 430
      },
      "flags": {},
      "order": 9,
      "mode": 0,
      "inputs": [
        {
          "name": "mimic_pipeline",
          "type": "MIMICPIPE",
          "link": 3
        },
        {
          "name": "ref_image",
          "type": "IMAGE",
          "link": 47
        },
        {
          "name": "pose_images",
          "type": "IMAGE",
          "link": 46
        },
        {
          "name": "optional_scheduler",
          "type": "DIFFUSERS_SCHEDULER",
          "link": 4
        }
      ],
      "outputs": [
        {
          "name": "samples",
          "type": "LATENT",
          "links": [
            6
          ],
          "slot_index": 0,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "MimicMotionSampler"
      },
      "widgets_values": [
        25,
        2,
        2,
        1234,
        "fixed",
        15,
        0,
        72,
        36,
        false,
        1,
        0,
        1,
        1
      ],
      "color": "#232",
      "bgcolor": "#353"
    }
  ],
  "links": [
    [
      3,
      3,
      0,
      1,
      0,
      "MIMICPIPE"
    ],
    [
      4,
      4,
      0,
      1,
      3,
      "DIFFUSERS_SCHEDULER"
    ],
    [
      6,
      1,
      0,
      6,
      1,
      "LATENT"
    ],
    [
      8,
      3,
      0,
      6,
      0,
      "MIMICPIPE"
    ],
    [
      18,
      18,
      0,
      2,
      1,
      "IMAGE"
    ],
    [
      20,
      11,
      0,
      18,
      0,
      "IMAGE"
    ],
    [
      23,
      6,
      0,
      21,
      0,
      "IMAGE"
    ],
    [
      29,
      9,
      0,
      25,
      0,
      "IMAGE"
    ],
    [
      34,
      26,
      0,
      25,
      1,
      "INT"
    ],
    [
      35,
      29,
      0,
      25,
      2,
      "INT"
    ],
    [
      36,
      26,
      0,
      18,
      1,
      "INT"
    ],
    [
      37,
      29,
      0,
      18,
      2,
      "INT"
    ],
    [
      45,
      25,
      0,
      2,
      0,
      "IMAGE"
    ],
    [
      46,
      2,
      0,
      1,
      2,
      "IMAGE"
    ],
    [
      47,
      25,
      0,
      1,
      1,
      "IMAGE"
    ]
  ],
  "groups": [],
  "config": {},
  "extra": {
    "ds": {
      "scale": 0.5644739300537777,
      "offset": [
        47.979942073719656,
        120.46737166341701
      ]
    }
  },
  "version": 0.4
}

DiT世代とWan-Animate

DiTベースの動画生成モデルの登場によって、Human Motion Transfer も順当に進化しています。

Wan2.1 VACE

Wan2.1 には、VACE と呼ばれる仕組みがあります。

VACE は、動画生成において ControlNet、reference2video、inpainting をまとめて扱えるフレームワークです。 ControlNet Pose と reference2video 的な操作を組み合わせることで、Human Motion Transfer に近いことができます。

専用の Human Motion Transfer モデルというよりは、「Wan2.1 をベースに、ポーズ（動きの指定）と参照画像（見た目の指定）を使って動きをコントロールするための汎用的な仕組み」として使われます。

Wan-Animate

よりモーション転送に特化したモデルが、Wan-Animate です。

Wan-Animate

キャラクター画像と、動きを持ったドライバー動画を入力にして、フルボディのモーションを転送することができます。

全身のポーズだけでなく、「顔のアップ動画」をドライバーとして使うこともできるため、talking head 的な使い方と、Human Motion Transfer 的な使い方の両方をカバーできるのが特徴です。

Human Motion Transfer