Update generate_multitalk.py
generate_multitalk.py  CHANGED  +53 -84
@@ -207,14 +207,14 @@ def _parse_args():
         help="Norm threshold used in adaptive projected guidance (APG)."
     )
 
-
+
     args = parser.parse_args()
 
     _validate_args(args)
 
     return args
 
-def custom_init(device, wav2vec):
+def custom_init(device, wav2vec):
     audio_encoder = Wav2Vec2Model.from_pretrained(args.wav2vec_dir, attn_implementation="eager").to(device)
     audio_encoder.freeze_feature_extractor()
     wav2vec_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(wav2vec, local_files_only=True)
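custom_init pairs a Wav2Vec2FeatureExtractor with a Wav2Vec2Model whose feature extractor is frozen; later, get_embedding pushes raw 16 kHz waveforms through this pair to obtain the per-speaker audio embeddings that are saved as .pt files. As a hedged sketch of how such a pair is typically driven with Hugging Face transformers (this is not the repository's get_embedding, and the checkpoint path is a placeholder):

import numpy as np
import torch
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model

wav2vec_dir = "path/to/wav2vec2-checkpoint"  # placeholder path
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(wav2vec_dir)
encoder = Wav2Vec2Model.from_pretrained(wav2vec_dir).eval()

speech = np.zeros(16000, dtype=np.float32)  # one second of silence at 16 kHz
inputs = feature_extractor(speech, sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
    hidden = encoder(inputs.input_values).last_hidden_state  # (1, frames, hidden_dim)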
@@ -244,7 +244,7 @@ def audio_prepare_multi(left_path, right_path, audio_type, sample_rate=16000):
         new_human_speech1 = human_speech_array1
         new_human_speech2 = human_speech_array2
     elif audio_type=='add':
-        new_human_speech1 = np.concatenate([human_speech_array1[: human_speech_array1.shape[0]], np.zeros(human_speech_array2.shape[0])])
+        new_human_speech1 = np.concatenate([human_speech_array1[: human_speech_array1.shape[0]], np.zeros(human_speech_array2.shape[0])])
         new_human_speech2 = np.concatenate([np.zeros(human_speech_array1.shape[0]), human_speech_array2[:human_speech_array2.shape[0]]])
         sum_human_speechs = new_human_speech1 + new_human_speech2
     return new_human_speech1, new_human_speech2, sum_human_speechs
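For orientation, the 'add' branch above staggers the two speakers in time: each waveform is padded with zeros for the duration of the other, so the summed track plays person 1 first and person 2 afterwards. A minimal standalone sketch of that padding step, using made-up array lengths rather than the repository's loaders:

import numpy as np

# Hypothetical 16 kHz speech arrays: 2 s for speaker 1, 3 s for speaker 2.
speech1 = np.random.randn(2 * 16000)
speech2 = np.random.randn(3 * 16000)

# Speaker 1 talks first, then stays silent while speaker 2 talks.
padded1 = np.concatenate([speech1, np.zeros(speech2.shape[0])])
padded2 = np.concatenate([np.zeros(speech1.shape[0]), speech2])

# Both padded tracks now have equal length, so they can be summed into
# the single reference waveform that is written out as sum.wav.
mixed = padded1 + padded2
assert padded1.shape == padded2.shape == mixed.shape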
@@ -359,21 +359,6 @@ def generate(args):
         ulysses_degree=args.ulysses_size,
     )
 
-    # TODO: use prompt refine
-    # if args.use_prompt_extend:
-    #     if args.prompt_extend_method == "dashscope":
-    #         prompt_expander = DashScopePromptExpander(
-    #             model_name=args.prompt_extend_model,
-    #             is_vl="i2v" in args.task or "flf2v" in args.task)
-    #     elif args.prompt_extend_method == "local_qwen":
-    #         prompt_expander = QwenPromptExpander(
-    #             model_name=args.prompt_extend_model,
-    #             is_vl="i2v" in args.task,
-    #             device=rank)
-    #     else:
-    #         raise NotImplementedError(
-    #             f"Unsupport prompt_extend_method: {args.prompt_extend_method}")
-
     cfg = WAN_CONFIGS[args.task]
     if args.ulysses_size > 1:
         assert cfg.num_heads % args.ulysses_size == 0, f"`{cfg.num_heads=}` cannot be divided evenly by `{args.ulysses_size=}`."
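The assert above requires the Ulysses sequence-parallel degree to divide the model's attention head count exactly, since heads are sharded evenly across the participating ranks. A quick check with a hypothetical head count of 40 (not read from WAN_CONFIGS) shows which --ulysses_size values would pass:

num_heads = 40  # hypothetical value for illustration only
valid_degrees = [d for d in range(1, num_heads + 1) if num_heads % d == 0]
print(valid_degrees)  # [1, 2, 4, 5, 8, 10, 20, 40]
# e.g. --ulysses_size 8 satisfies the check, while --ulysses_size 3 trips the assert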
@@ -387,66 +372,50 @@ def generate(args):
     args.base_seed = base_seed[0]
 
     assert args.task == "multitalk-14B", 'You should choose multitalk in args.task.'
-
-
-    # TODO: add prompt refine
-    # img = Image.open(args.image).convert("RGB")
-    # if args.use_prompt_extend:
-    #     logging.info("Extending prompt ...")
-    #     if rank == 0:
-    #         prompt_output = prompt_expander(
-    #             args.prompt,
-    #             tar_lang=args.prompt_extend_target_lang,
-    #             image=img,
-    #             seed=args.base_seed)
-    #         if prompt_output.status == False:
-    #             logging.info(
-    #                 f"Extending prompt failed: {prompt_output.message}")
-    #             logging.info("Falling back to original prompt.")
-    #             input_prompt = args.prompt
-    #         else:
-    #             input_prompt = prompt_output.prompt
-    #         input_prompt = [input_prompt]
-    #     else:
-    #         input_prompt = [None]
-    #     if dist.is_initialized():
-    #         dist.broadcast_object_list(input_prompt, src=0)
-    #     args.prompt = input_prompt[0]
-    #     logging.info(f"Extended prompt: {args.prompt}")
-
-    # read input files
 
+    # Initialize a placeholder for all processes
+    input_data = None
 
+    # Let only the main process prepare the data
+    if rank == 0:
+        with open(args.input_json, 'r', encoding='utf-8') as f:
+            input_data = json.load(f)
+
+        wav2vec_feature_extractor, audio_encoder= custom_init('cpu', args.wav2vec_dir)
+        args.audio_save_dir = os.path.join(args.audio_save_dir, input_data['cond_image'].split('/')[-1].split('.')[0])
+        os.makedirs(args.audio_save_dir,exist_ok=True)
+
+        if len(input_data['cond_audio'])==2:
+            new_human_speech1, new_human_speech2, sum_human_speechs = audio_prepare_multi(input_data['cond_audio']['person1'], input_data['cond_audio']['person2'], input_data['audio_type'])
+            audio_embedding_1 = get_embedding(new_human_speech1, wav2vec_feature_extractor, audio_encoder)
+            audio_embedding_2 = get_embedding(new_human_speech2, wav2vec_feature_extractor, audio_encoder)
+            emb1_path = os.path.join(args.audio_save_dir, '1.pt')
+            emb2_path = os.path.join(args.audio_save_dir, '2.pt')
+            sum_audio = os.path.join(args.audio_save_dir, 'sum.wav')
+            sf.write(sum_audio, sum_human_speechs, 16000)
+            torch.save(audio_embedding_1, emb1_path)
+            torch.save(audio_embedding_2, emb2_path)
+            input_data['cond_audio']['person1'] = emb1_path
+            input_data['cond_audio']['person2'] = emb2_path
+            input_data['video_audio'] = sum_audio
+        elif len(input_data['cond_audio'])==1:
+            human_speech = audio_prepare_single(input_data['cond_audio']['person1'])
+            audio_embedding = get_embedding(human_speech, wav2vec_feature_extractor, audio_encoder)
+            emb_path = os.path.join(args.audio_save_dir, '1.pt')
+            sum_audio = os.path.join(args.audio_save_dir, 'sum.wav')
+            sf.write(sum_audio, human_speech, 16000)
+            torch.save(audio_embedding, emb_path)
+            input_data['cond_audio']['person1'] = emb_path
+            input_data['video_audio'] = sum_audio
+
+    # Broadcast the data from rank 0 to all other processes
+    if dist.is_initialized():
+        objects_to_broadcast = [input_data] if rank == 0 else [None]
+        dist.broadcast_object_list(objects_to_broadcast, src=0)
+        input_data = objects_to_broadcast[0]
 
-
-
-
-    wav2vec_feature_extractor, audio_encoder= custom_init('cpu', args.wav2vec_dir)
-    args.audio_save_dir = os.path.join(args.audio_save_dir, input_data['cond_image'].split('/')[-1].split('.')[0])
-    os.makedirs(args.audio_save_dir,exist_ok=True)
-
-    if len(input_data['cond_audio'])==2:
-        new_human_speech1, new_human_speech2, sum_human_speechs = audio_prepare_multi(input_data['cond_audio']['person1'], input_data['cond_audio']['person2'], input_data['audio_type'])
-        audio_embedding_1 = get_embedding(new_human_speech1, wav2vec_feature_extractor, audio_encoder)
-        audio_embedding_2 = get_embedding(new_human_speech2, wav2vec_feature_extractor, audio_encoder)
-        emb1_path = os.path.join(args.audio_save_dir, '1.pt')
-        emb2_path = os.path.join(args.audio_save_dir, '2.pt')
-        sum_audio = os.path.join(args.audio_save_dir, 'sum.wav')
-        sf.write(sum_audio, sum_human_speechs, 16000)
-        torch.save(audio_embedding_1, emb1_path)
-        torch.save(audio_embedding_2, emb2_path)
-        input_data['cond_audio']['person1'] = emb1_path
-        input_data['cond_audio']['person2'] = emb2_path
-        input_data['video_audio'] = sum_audio
-    elif len(input_data['cond_audio'])==1:
-        human_speech = audio_prepare_single(input_data['cond_audio']['person1'])
-        audio_embedding = get_embedding(human_speech, wav2vec_feature_extractor, audio_encoder)
-        emb_path = os.path.join(args.audio_save_dir, '1.pt')
-        sum_audio = os.path.join(args.audio_save_dir, 'sum.wav')
-        sf.write(sum_audio, human_speech, 16000)
-        torch.save(audio_embedding, emb_path)
-        input_data['cond_audio']['person1'] = emb_path
-        input_data['video_audio'] = sum_audio
+    # Wait for all file I/O to be complete before proceeding
+    dist.barrier()
 
     logging.info("Creating MultiTalk pipeline.")
     wan_i2v = wan.MultiTalkPipeline(
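This hunk concentrates all file I/O and audio preprocessing on rank 0 (reading the input JSON, computing Wav2Vec2 embeddings, writing the .pt and sum.wav files) and then shares the prepared dict with every other process via dist.broadcast_object_list, followed by a barrier so no rank proceeds before the files exist. A minimal, self-contained sketch of that pattern, with a hypothetical prepare() callable standing in for the JSON/audio work and a hypothetical load_shared_input helper; in this sketch the barrier stays inside the is_initialized() guard so a single-process run never touches the process group:

import torch.distributed as dist

def load_shared_input(rank, prepare):
    """Run prepare() on rank 0 only, then hand its result to every rank."""
    payload = None
    if rank == 0:
        payload = prepare()  # e.g. read JSON, write .pt/.wav files, build a dict

    if dist.is_initialized():
        # broadcast_object_list pickles each element on the source rank and
        # unpickles it in place on all other ranks; list lengths must match.
        box = [payload]
        dist.broadcast_object_list(box, src=0)
        payload = box[0]
        # Ensure rank-0 file writes are finished before other ranks read them.
        dist.barrier()
    return payload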
@@ -455,8 +424,8 @@ def generate(args):
         device_id=device,
         rank=rank,
         t5_fsdp=args.t5_fsdp,
-        dit_fsdp=args.dit_fsdp,
-        use_usp=(args.ulysses_size > 1 or args.ring_size > 1),
+        dit_fsdp=args.dit_fsdp,
+        use_usp=(args.ulysses_size > 1 or args.ring_size > 1),
         t5_cpu=args.t5_cpu
     )
 
@@ -465,7 +434,7 @@ def generate(args):
     wan_i2v.enable_vram_management(
         num_persistent_param_in_dit=args.num_persistent_param_in_dit
     )
-
+
     logging.info("Generating video ...")
     video = wan_i2v.generate(
         input_data,
@@ -481,19 +450,19 @@ def generate(args):
         max_frames_num=args.frame_num if args.mode == 'clip' else 1000,
         extra_args=args,
     )
-
+
 
     if rank == 0:
-
+
         if args.save_file is None:
-            formatted_time = datetime
+            formatted_time = datetime.now().strftime("%Y%m%d_%H%M%S")
             formatted_prompt = input_data['prompt'].replace(" ", "_").replace("/",
                                                                               "_")[:50]
             args.save_file = f"{args.task}_{args.size.replace('*','x') if sys.platform=='win32' else args.size}_{args.ulysses_size}_{args.ring_size}_{formatted_prompt}_{formatted_time}"
-
+
         logging.info(f"Saving generated video to {args.save_file}.mp4")
         save_video_ffmpeg(video, args.save_file, [input_data['video_audio']])
-
+
     logging.info("Finished.")
 
     if torch.cuda.is_available():
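The save-file name above concatenates the task, size, parallelism degrees, a sanitized prompt prefix, and the new timestamp. A rough sketch of the resulting name, using invented argument values and omitting the win32-only '*'-to-'x' substitution:

from datetime import datetime

# All values below are placeholders, not defaults read from the script.
task, size, ulysses_size, ring_size = "multitalk-14B", "multitalk-480", 1, 1
prompt = "Two people chat in a cafe"

formatted_time = datetime.now().strftime("%Y%m%d_%H%M%S")
formatted_prompt = prompt.replace(" ", "_").replace("/", "_")[:50]
save_file = f"{task}_{size}_{ulysses_size}_{ring_size}_{formatted_prompt}_{formatted_time}"
# e.g. multitalk-14B_multitalk-480_1_1_Two_people_chat_in_a_cafe_20250101_120000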
@@ -506,4 +475,4 @@ def generate(args):
 
 if __name__ == "__main__":
     args = _parse_args()
-    generate(args)
+    generate(args)