
Deploying EchoMimic, the Latest Lip-Sync Technology


EchoMimic is an AI-driven lip-sync project released by Ant Group. Given a portrait photo and an audio clip, it makes the person in the photo "mouth" the audio, producing a lifelike animated portrait video.

Its main technical contribution is a flexible animation-generation method: the image can be driven by audio alone or by facial landmarks alone, and the two signals can also be combined, so that audio and facial landmarks jointly drive a realistic "talking head" video.

In practice, EchoMimic can generate a portrait video from audio or facial landmarks on their own, and can also combine the audio with a portrait photo to achieve more natural, fluid lip-sync.

EchoMimic supports multiple languages, including Mandarin Chinese and English, and also handles scenarios such as singing.

GitHub project: https://github.com/BadToBest/EchoMimic

I. Environment Setup

1. Python environment

Python 3.10 or later is recommended.
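If you want an isolated environment, creating one with conda is a convenient option (conda itself is optional and only an example; any Python 3.10+ interpreter works):

conda create -n echomimic python=3.10

conda activate echomimic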

2. Installing pip packages
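The requirements.txt referenced below ships with the EchoMimic code repository (project page linked above), so clone it first and run all subsequent commands from the repository root:

git clone https://github.com/BadToBest/EchoMimic

cd EchoMimic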

pip install torch==2.0.1+cu118 torchvision==0.15.2+cu118 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu118

pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
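After the installation finishes, a quick sanity check (a minimal sketch, not part of the project) confirms that the CUDA build of PyTorch is the one that got installed:

import torch

print(torch.__version__)          # expect something like 2.0.1+cu118
print(torch.cuda.is_available())  # expect True on a machine with an NVIDIA GPU
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))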

3. Downloading the models

git lfs install

git clone https://huggingface.co/BadToBest/EchoMimic
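The clone contains the pretrained EchoMimic weights. The default configs typically look for them under a pretrained_weights directory in the repository root, so you may need to clone into (or rename the clone to) pretrained_weights. The layout below is an assumption based on the default configs/prompts/animation.yaml; adjust the paths to whatever your config actually references. A small check script:

import os

# Assumed layout -- verify against configs/prompts/animation.yaml before relying on it.
expected = [
    "pretrained_weights/denoising_unet.pth",
    "pretrained_weights/reference_unet.pth",
    "pretrained_weights/motion_module.pth",
    "pretrained_weights/face_locator.pth",
    "pretrained_weights/audio_processor/whisper_tiny.pt",
    "pretrained_weights/sd-vae-ft-mse",
    "pretrained_weights/sd-image-variations-diffusers",
]

missing = [p for p in expected if not os.path.exists(p)]
print("All expected weight files found." if not missing else f"Missing: {missing}")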

II. Functional Testing

1. Running the tests

(1) Python test script: audio2video

import argparse
import os
import random
import platform
import subprocess
from datetime import datetime
from pathlib import Path

import cv2
import numpy as np
import torch
from diffusers import AutoencoderKL, DDIMScheduler
from omegaconf import OmegaConf
from PIL import Image
from moviepy.editor import VideoFileClip, AudioFileClip
from facenet_pytorch import MTCNN

from src.models.unet_2d_condition import UNet2DConditionModel
from src.models.unet_3d_echo import EchoUNet3DConditionModel
from src.models.whisper.audio2feature import load_audio_model
from src.pipelines.pipeline_echo_mimic import Audio2VideoPipeline
from src.utils.util import save_videos_grid, crop_and_pad
from src.models.face_locator import FaceLocator

# Check and add FFmpeg path if necessary
ffmpeg_path = os.getenv('FFMPEG_PATH')
if ffmpeg_path is None and platform.system() in ['Linux', 'Darwin']:
    try:
        result = subprocess.run(['which', 'ffmpeg'], capture_output=True, text=True)
        if result.returncode == 0:
            ffmpeg_path = result.stdout.strip()
            print(f"FFmpeg is installed at: {ffmpeg_path}")
        else:
            print("FFmpeg is not installed. Please download ffmpeg-static and export to FFMPEG_PATH.")
            print("For example: export FFMPEG_PATH=/musetalk/ffmpeg-4.4-amd64-static")
    except Exception as e:
        print(f"Error finding ffmpeg: {e}")
else:
    if ffmpeg_path and ffmpeg_path not in os.getenv('PATH'):
        print("Adding FFMPEG_PATH to PATH")
        os.environ["PATH"] = f"{ffmpeg_path}:{os.environ['PATH']}"


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", type=str, default="./configs/prompts/animation.yaml")
    parser.add_argument("-W", type=int, default=512)
    parser.add_argument("-H", type=int, default=512)
    parser.add_argument("-L", type=int, default=1200)
    parser.add_argument("--seed", type=int, default=420)
    parser.add_argument("--facemusk_dilation_ratio", type=float, default=0.1)
    parser.add_argument("--facecrop_dilation_ratio", type=float, default=0.5)
    parser.add_argument("--context_frames", type=int, default=12)
    parser.add_argument("--context_overlap", type=int, default=3)
    parser.add_argument("--cfg", type=float, default=2.5)
    parser.add_argument("--steps", type=int, default=30)
    parser.add_argument("--sample_rate", type=int, default=16000)
    parser.add_argument("--fps", type=int, default=24)
    parser.add_argument("--device", type=str, default="cuda")
    return parser.parse_args()


def select_face(det_bboxes, probs):
    """Select the largest face with a detection probability above 0.8."""
    if det_bboxes is None or probs is None:
        return None
    filtered_bboxes = [det_bboxes[i] for i in range(len(det_bboxes)) if probs[i] > 0.8]
    if not filtered_bboxes:
        return None
    return max(filtered_bboxes, key=lambda x: (x[3] - x[1]) * (x[2] - x[0]))


def main():
    args = parse_args()
    config = OmegaConf.load(args.config)
    weight_dtype = torch.float16 if config.weight_dtype == "fp16" else torch.float32

    # Fall back to CPU if CUDA was requested but is not available
    device = args.device
    if "cuda" in device and not torch.cuda.is_available():
        device = "cpu"

    infer_config = OmegaConf.load(config.inference_config)

    ############# Initialize models #############
    vae = AutoencoderKL.from_pretrained(config.pretrained_vae_path).to(device, dtype=weight_dtype)

    reference_unet = UNet2DConditionModel.from_pretrained(
        config.pretrained_base_model_path, subfolder="unet"
    ).to(dtype=weight_dtype, device=device)
    reference_unet.load_state_dict(torch.load(config.reference_unet_path, map_location="cpu"))

    unet_kwargs = infer_config.unet_additional_kwargs or {}
    denoising_unet = EchoUNet3DConditionModel.from_pretrained_2d(
        config.pretrained_base_model_path,
        config.motion_module_path if os.path.exists(config.motion_module_path) else "",
        subfolder="unet",
        unet_additional_kwargs=unet_kwargs,
    ).to(dtype=weight_dtype, device=device)
    denoising_unet.load_state_dict(torch.load(config.denoising_unet_path, map_location="cpu"), strict=False)

    face_locator = FaceLocator(
        320, conditioning_channels=1, block_out_channels=(16, 32, 96, 256)
    ).to(dtype=weight_dtype, device=device)
    face_locator.load_state_dict(torch.load(config.face_locator_path))

    audio_processor = load_audio_model(model_path=config.audio_model_path, device=device)
    face_detector = MTCNN(
        image_size=320, margin=0, min_face_size=20,
        thresholds=[0.6, 0.7, 0.7], factor=0.709, post_process=True, device=device
    )

    ############# Initiate pipeline #############
    scheduler = DDIMScheduler(**OmegaConf.to_container(infer_config.noise_scheduler_kwargs))
    pipe = Audio2VideoPipeline(
        vae=vae,
        reference_unet=reference_unet,
        denoising_unet=denoising_unet,
        audio_guider=audio_processor,
        face_locator=face_locator,
        scheduler=scheduler,
    ).to(device, dtype=weight_dtype)

    date_str = datetime.now().strftime("%Y%m%d")
    time_str = datetime.now().strftime("%H%M")
    save_dir_name = f"{time_str}--seed_{args.seed}-{args.W}x{args.H}"
    save_dir = Path(f"output/{date_str}/{save_dir_name}")
    save_dir.mkdir(exist_ok=True, parents=True)

    for ref_image_path, audio_paths in config["test_cases"].items():
        for audio_path in audio_paths:
            seed = args.seed if args.seed is not None and args.seed > -1 else random.randint(100, 1000000)
            generator = torch.manual_seed(seed)
            ref_name = Path(ref_image_path).stem
            audio_name = Path(audio_path).stem
            final_fps = args.fps

            #### Prepare face mask
            face_img = cv2.imread(ref_image_path)
            face_mask = np.zeros((face_img.shape[0], face_img.shape[1]), dtype='uint8')
            det_bboxes, probs = face_detector.detect(face_img)
            select_bbox = select_face(det_bboxes, probs)
            if select_bbox is None:
                # No reliable face found: use the whole image as the mask
                face_mask[:, :] = 255
            else:
                xyxy = np.round(select_bbox[:4]).astype('int')
                rb, re, cb, ce = xyxy[1], xyxy[3], xyxy[0], xyxy[2]
                r_pad = int((re - rb) * args.facemusk_dilation_ratio)
                c_pad = int((ce - cb) * args.facemusk_dilation_ratio)
                face_mask[rb - r_pad : re + r_pad, cb - c_pad : ce + c_pad] = 255

                # Crop both the image and the mask around the detected face
                r_pad_crop = int((re - rb) * args.facecrop_dilation_ratio)
                c_pad_crop = int((ce - cb) * args.facecrop_dilation_ratio)
                crop_rect = [
                    max(0, cb - c_pad_crop), max(0, rb - r_pad_crop),
                    min(ce + c_pad_crop, face_img.shape[1]), min(re + r_pad_crop, face_img.shape[0]),
                ]
                face_img = crop_and_pad(face_img, crop_rect)
                face_mask = crop_and_pad(face_mask, crop_rect)

            face_img = cv2.resize(face_img, (args.W, args.H))
            face_mask = cv2.resize(face_mask, (args.W, args.H))
            ref_image_pil = Image.fromarray(face_img[:, :, [2, 1, 0]])  # BGR -> RGB
            face_mask_tensor = torch.Tensor(face_mask).to(dtype=weight_dtype, device=device).unsqueeze(0).unsqueeze(0).unsqueeze(0) / 255.0

            video = pipe(
                ref_image_pil,
                audio_path,
                face_mask_tensor,
                width=args.W,
                height=args.H,
                duration=args.L,
                num_inference_steps=args.steps,
                cfg_scale=args.cfg,
                generator=generator,
                audio_sample_rate=args.sample_rate,
                context_frames=args.context_frames,
                fps=final_fps,
                context_overlap=args.context_overlap,
            ).videos

            video_save_path = save_dir / f"{ref_name}_{audio_name}_{args.H}x{args.W}_{int(args.cfg)}_{time_str}.mp4"
            save_videos_grid(video, str(video_save_path), n_rows=1, fps=final_fps)

            # Add audio to generated video
            with_audio_path = save_dir / f"{ref_name}_{audio_name}_{args.H}x{args.W}_{int(args.cfg)}_{time_str}_withaudio.mp4"
            video_clip = VideoFileClip(str(video_save_path))
            audio_clip = AudioFileClip(audio_path)
            final_video = video_clip.set_audio(audio_clip)
            final_video.write_videofile(str(with_audio_path), codec="libx264", audio_codec="aac")
            print(f"Saved video with audio to {with_audio_path}")


if __name__ == "__main__":
    main()
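Save the script in the repository root (the filename infer_audio2video.py below is only an example) and run it with the default config; resolution, sampling steps, and CFG scale can be overridden on the command line:

python -u infer_audio2video.py --config ./configs/prompts/animation.yaml -W 512 -H 512 --steps 30 --cfg 2.5

The reference portraits and driving audio files come from the test_cases mapping inside the YAML config, so point that file at your own image/audio pairs before running. Each result is written to output/<date>/<time>--seed_<seed>-<W>x<H>/, first as a silent video and then muxed with the driving audio as a *_withaudio.mp4 file.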

(2) Python test script: audio2pose

To be continued...

For more details, follow: 杰哥新技术
