EchoMimic is an AI-driven lip-sync project released by Ant Group. Given a portrait image and an audio track, it animates the subject's face to "speak" along with the audio, producing realistic dynamic portrait videos.
Its main technical highlight is a novel animation approach: the image can be driven by audio alone or by facial landmarks alone, and the two signals can also be combined to generate convincing talking-head videos.
In other words, EchoMimic can produce a portrait video from audio or facial landmarks on their own, or pair the audio with the portrait photo for a more natural, fluid lip-sync result.
EchoMimic supports multiple languages, including Mandarin Chinese and English, and also handles scenarios such as singing.
GitHub project: https://github.com/BadToBest/EchoMimic
1. Python environment
Python 3.10 or later is recommended.
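For example, a dedicated environment can be created with conda (the environment name below is just an example):
conda create -n echomimic python=3.10
conda activate echomimic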
2. Installing the pip packages
pip install torch==2.0.1+cu118 torchvision==0.15.2+cu118 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu118
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
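After installation, a quick sanity check (a minimal snippet of our own, not part of the project) confirms that the CUDA build of PyTorch is active:

import torch

print(torch.__version__)           # expect something like 2.0.1+cu118
print(torch.cuda.is_available())   # should print True on a CUDA-capable machine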
3. Model download:
git lfs install
git clone https://huggingface.co/BadToBest/EchoMimic
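If git-lfs is unavailable, the same weights can also be fetched with the huggingface_hub library. The target directory below is just an example; the paths in the inference config must point to wherever the files actually end up:

# Alternative download via huggingface_hub; local_dir is a placeholder.
from huggingface_hub import snapshot_download

snapshot_download(repo_id="BadToBest/EchoMimic", local_dir="./pretrained_weights")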
1. Running a test:
(1) audio2video test, called from Python code
import argparse
import os
import random
import platform
import subprocess
from datetime import datetime
from pathlib import Path

import cv2
import numpy as np
import torch
from diffusers import AutoencoderKL, DDIMScheduler
from omegaconf import OmegaConf
from PIL import Image

from src.models.unet_2d_condition import UNet2DConditionModel
from src.models.unet_3d_echo import EchoUNet3DConditionModel
from src.models.whisper.audio2feature import load_audio_model
from src.pipelines.pipeline_echo_mimic import Audio2VideoPipeline
from src.utils.util import save_videos_grid, crop_and_pad
from src.models.face_locator import FaceLocator
from moviepy.editor import VideoFileClip, AudioFileClip
from facenet_pytorch import MTCNN

# Check and add FFmpeg path if necessary
ffmpeg_path = os.getenv('FFMPEG_PATH')
if ffmpeg_path is None and platform.system() in ['Linux', 'Darwin']:
    try:
        result = subprocess.run(['which', 'ffmpeg'], capture_output=True, text=True)
        if result.returncode == 0:
            ffmpeg_path = result.stdout.strip()
            print(f"FFmpeg is installed at: {ffmpeg_path}")
        else:
            print("FFmpeg is not installed. Please download ffmpeg-static and export to FFMPEG_PATH.")
            print("For example: export FFMPEG_PATH=/musetalk/ffmpeg-4.4-amd64-static")
    except Exception as e:
        print(f"Error finding ffmpeg: {e}")
else:
    if ffmpeg_path and ffmpeg_path not in os.getenv('PATH', ''):
        print("Adding FFMPEG_PATH to PATH")
        os.environ["PATH"] = f"{ffmpeg_path}:{os.environ['PATH']}"


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", type=str, default="./configs/prompts/animation.yaml")
    parser.add_argument("-W", type=int, default=512)
    parser.add_argument("-H", type=int, default=512)
    parser.add_argument("-L", type=int, default=1200)
    parser.add_argument("--seed", type=int, default=420)
    parser.add_argument("--facemusk_dilation_ratio", type=float, default=0.1)
    parser.add_argument("--facecrop_dilation_ratio", type=float, default=0.5)
    parser.add_argument("--context_frames", type=int, default=12)
    parser.add_argument("--context_overlap", type=int, default=3)
    parser.add_argument("--cfg", type=float, default=2.5)
    parser.add_argument("--steps", type=int, default=30)
    parser.add_argument("--sample_rate", type=int, default=16000)
    parser.add_argument("--fps", type=int, default=24)
    parser.add_argument("--device", type=str, default="cuda")
    return parser.parse_args()


def select_face(det_bboxes, probs):
    """
    Select the largest face with a detection probability above 0.8.
    """
    if det_bboxes is None or probs is None:
        return None
    filtered_bboxes = [det_bboxes[i] for i in range(len(det_bboxes)) if probs[i] > 0.8]
    if not filtered_bboxes:
        return None
    return max(filtered_bboxes, key=lambda x: (x[3] - x[1]) * (x[2] - x[0]))


def main():
    args = parse_args()

    config = OmegaConf.load(args.config)
    weight_dtype = torch.float16 if config.weight_dtype == "fp16" else torch.float32

    device = args.device
    if "cuda" in device and not torch.cuda.is_available():
        device = "cpu"

    infer_config = OmegaConf.load(config.inference_config)

    ############# Initialize models #############

    vae = AutoencoderKL.from_pretrained(config.pretrained_vae_path).to(device, dtype=weight_dtype)

    reference_unet = UNet2DConditionModel.from_pretrained(config.pretrained_base_model_path, subfolder="unet").to(dtype=weight_dtype, device=device)
    reference_unet.load_state_dict(torch.load(config.reference_unet_path, map_location="cpu"))

    unet_kwargs = infer_config.unet_additional_kwargs or {}
    denoising_unet = EchoUNet3DConditionModel.from_pretrained_2d(
        config.pretrained_base_model_path,
        config.motion_module_path if os.path.exists(config.motion_module_path) else "",
        subfolder="unet",
        unet_additional_kwargs=unet_kwargs
    ).to(dtype=weight_dtype, device=device)
    denoising_unet.load_state_dict(torch.load(config.denoising_unet_path, map_location="cpu"), strict=False)

    face_locator = FaceLocator(320, conditioning_channels=1, block_out_channels=(16, 32, 96, 256)).to(dtype=weight_dtype, device=device)
    face_locator.load_state_dict(torch.load(config.face_locator_path, map_location="cpu"))

    audio_processor = load_audio_model(model_path=config.audio_model_path, device=device)
    face_detector = MTCNN(image_size=320, margin=0, min_face_size=20, thresholds=[0.6, 0.7, 0.7], factor=0.709, post_process=True, device=device)

    ############# Initiate pipeline #############

    scheduler = DDIMScheduler(**OmegaConf.to_container(infer_config.noise_scheduler_kwargs))
    pipe = Audio2VideoPipeline(
        vae=vae,
        reference_unet=reference_unet,
        denoising_unet=denoising_unet,
        audio_guider=audio_processor,
        face_locator=face_locator,
        scheduler=scheduler,
    ).to(device, dtype=weight_dtype)

    date_str = datetime.now().strftime("%Y%m%d")
    time_str = datetime.now().strftime("%H%M")
    save_dir_name = f"{time_str}--seed_{args.seed}-{args.W}x{args.H}"
    save_dir = Path(f"output/{date_str}/{save_dir_name}")
    save_dir.mkdir(exist_ok=True, parents=True)

    # Each test case maps a reference portrait to one or more driving audio files.
    for ref_image_path, audio_paths in config["test_cases"].items():
        for audio_path in audio_paths:
            seed = args.seed if args.seed is not None and args.seed > -1 else random.randint(100, 1000000)
            generator = torch.manual_seed(seed)

            ref_name = Path(ref_image_path).stem
            audio_name = Path(audio_path).stem
            final_fps = args.fps

            #### Prepare face mask
            face_img = cv2.imread(ref_image_path)
            face_mask = np.zeros((face_img.shape[0], face_img.shape[1]), dtype='uint8')

            det_bboxes, probs = face_detector.detect(face_img)
            select_bbox = select_face(det_bboxes, probs)

            if select_bbox is None:
                # No reliable face found: fall back to masking the whole image.
                face_mask[:, :] = 255
            else:
                # Dilate the detected box for the mask, then crop around the face.
                xyxy = np.round(select_bbox[:4]).astype('int')
                rb, re, cb, ce = xyxy[1], xyxy[3], xyxy[0], xyxy[2]
                r_pad = int((re - rb) * args.facemusk_dilation_ratio)
                c_pad = int((ce - cb) * args.facemusk_dilation_ratio)
                face_mask[rb - r_pad : re + r_pad, cb - c_pad : ce + c_pad] = 255

                r_pad_crop = int((re - rb) * args.facecrop_dilation_ratio)
                c_pad_crop = int((ce - cb) * args.facecrop_dilation_ratio)
                crop_rect = [max(0, cb - c_pad_crop), max(0, rb - r_pad_crop), min(ce + c_pad_crop, face_img.shape[1]), min(re + r_pad_crop, face_img.shape[0])]
                face_img = crop_and_pad(face_img, crop_rect)
                face_mask = crop_and_pad(face_mask, crop_rect)
            face_img = cv2.resize(face_img, (args.W, args.H))
            face_mask = cv2.resize(face_mask, (args.W, args.H))

            # Convert BGR (OpenCV) to RGB for PIL and normalize the mask to [0, 1].
            ref_image_pil = Image.fromarray(face_img[:, :, [2, 1, 0]])
            face_mask_tensor = torch.Tensor(face_mask).to(dtype=weight_dtype, device=device).unsqueeze(0).unsqueeze(0).unsqueeze(0) / 255.0

            video = pipe(
                ref_image_pil,
                audio_path,
                face_mask_tensor,
                width=args.W,
                height=args.H,
                duration=args.L,
                num_inference_steps=args.steps,
                cfg_scale=args.cfg,
                generator=generator,
                audio_sample_rate=args.sample_rate,
                context_frames=args.context_frames,
                fps=final_fps,
                context_overlap=args.context_overlap
            ).videos

            video_save_path = save_dir / f"{ref_name}_{audio_name}_{args.H}x{args.W}_{int(args.cfg)}_{time_str}.mp4"
            save_videos_grid(video, str(video_save_path), n_rows=1, fps=final_fps)

            # Add audio to generated video
            with_audio_path = save_dir / f"{ref_name}_{audio_name}_{args.H}x{args.W}_{int(args.cfg)}_{time_str}_withaudio.mp4"
            video_clip = VideoFileClip(str(video_save_path))
            audio_clip = AudioFileClip(audio_path)
            final_video = video_clip.set_audio(audio_clip)
            final_video.write_videofile(str(with_audio_path), codec="libx264", audio_codec="aac")
            print(f"Saved video with audio to {with_audio_path}")


if __name__ == "__main__":
    main()
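All model paths and test cases come from the YAML file passed via --config. Judging from the fields the script reads (weight_dtype, pretrained_vae_path, reference_unet_path, test_cases, and so on), its structure can be sketched as below; every path is a placeholder for your local layout, not a value shipped with the repo:

# Rough sketch of the structure animation.yaml is expected to have, built with
# OmegaConf purely for illustration. All paths below are placeholders.
from omegaconf import OmegaConf

example_config = OmegaConf.create({
    "weight_dtype": "fp16",
    "pretrained_base_model_path": "./pretrained_weights/base_model",
    "pretrained_vae_path": "./pretrained_weights/vae",
    "reference_unet_path": "./pretrained_weights/reference_unet.pth",
    "denoising_unet_path": "./pretrained_weights/denoising_unet.pth",
    "motion_module_path": "./pretrained_weights/motion_module.pth",
    "face_locator_path": "./pretrained_weights/face_locator.pth",
    "audio_model_path": "./pretrained_weights/audio_processor/whisper_tiny.pt",
    "inference_config": "./configs/inference/inference.yaml",
    # Each reference portrait maps to a list of driving audio files.
    "test_cases": {
        "./assets/test_imgs/demo.png": ["./assets/test_audios/demo.wav"],
    },
})
print(OmegaConf.to_yaml(example_config))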

(2) audio2pose test, called from Python code
To be continued......
For more detail, follow: 杰哥新技术