Update hugging_face/app.py #4
opened by assile

hugging_face/app.py CHANGED (+33 -37)
@@ -25,6 +25,9 @@ from matanyone_wrapper import matanyone
 from matanyone.utils.get_default_model import get_matanyone_model
 from matanyone.inference.inference_core import InferenceCore
 
+import warnings
+warnings.filterwarnings("ignore")
+
 def parse_augment():
     parser = argparse.ArgumentParser()
     parser.add_argument('--device', type=str, default=None)
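Note on the added filter: `warnings.filterwarnings("ignore")` at import time silences every warning in the process, including deprecation notices from PyTorch and Gradio that can flag real problems. A category-scoped filter is a narrower option; the sketch below is illustrative and not part of this PR (the chosen categories are assumptions):

    import warnings

    # Hypothetical narrower filter: silence only known-noisy categories
    # rather than every warning in the process.
    warnings.filterwarnings("ignore", category=UserWarning)
    warnings.filterwarnings("ignore", category=FutureWarning)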
@@ -121,7 +124,6 @@ def get_frames_from_video(video_input, video_state):
     except Exception as e:
         print(f"Audio extraction error: {str(e)}")
         audio_path = ""  # Set to "" if extraction fails
-    # print(f'audio_path: {audio_path}')
 
     # extract frames
     try:
@@ -140,15 +142,15 @@ def get_frames_from_video(video_input, video_state):
         print("read_frame_source:{} error. {}\n".format(video_path, str(e)))
     image_size = (frames[0].shape[0],frames[0].shape[1])
 
-    # resize if resolution too big
-    if image_size[0]>=1280 and image_size[0]>=1280:
-        scale = 1080 / min(image_size)
-        new_w = int(image_size[1] * scale)
-        new_h = int(image_size[0] * scale)
-        # update frames
-        frames = [cv2.resize(f, (new_w, new_h), interpolation=cv2.INTER_AREA) for f in frames]
-        # update image_size
-        image_size = (frames[0].shape[0],frames[0].shape[1])
+    # [remove for local demo] resize if resolution too big
+    # if image_size[0]>=1280 and image_size[0]>=1280:
+    #     scale = 1080 / min(image_size)
+    #     new_w = int(image_size[1] * scale)
+    #     new_h = int(image_size[0] * scale)
+    #     # update frames
+    #     frames = [cv2.resize(f, (new_w, new_h), interpolation=cv2.INTER_AREA) for f in frames]
+    #     # update image_size
+    #     image_size = (frames[0].shape[0],frames[0].shape[1])
 
     # initialize video_state
     video_state = {
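The disabled block downscaled oversized inputs so the short side lands near 1080 px before matting. Its guard, `if image_size[0]>=1280 and image_size[0]>=1280:`, tests the height twice; the second operand was presumably meant to be `image_size[1]` (the width). Should the resize ever be re-enabled, a corrected sketch (assuming `image_size` is `(height, width)` as assigned just above):

    import cv2

    # Hypothetical corrected guard: require both height AND width to be large.
    if image_size[0] >= 1280 and image_size[1] >= 1280:
        scale = 1080 / min(image_size)  # shrink so the short side is ~1080 px
        new_w = int(image_size[1] * scale)
        new_h = int(image_size[0] * scale)
        # cv2.resize expects (width, height); INTER_AREA suits downscaling
        frames = [cv2.resize(f, (new_w, new_h), interpolation=cv2.INTER_AREA) for f in frames]
        image_size = (frames[0].shape[0], frames[0].shape[1])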
@@ -165,8 +167,7 @@ def get_frames_from_video(video_input, video_state):
     video_info = "Video Name: {},\nFPS: {},\nTotal Frames: {},\nImage Size:{}".format(video_state["video_name"], round(video_state["fps"], 0), len(frames), image_size)
     model.samcontroler.sam_controler.reset_image()
     model.samcontroler.sam_controler.set_image(video_state["origin_images"][0])
-    return video_state, video_info, video_state["origin_images"][0], \
-        gr.update(visible=True, maximum=len(frames), value=1), gr.update(visible=False, maximum=len(frames), value=len(frames)), \
+    return video_state, video_info, video_state["origin_images"][0], gr.update(visible=True, maximum=len(frames), value=1), gr.update(visible=False, maximum=len(frames), value=len(frames)), \
         gr.update(visible=True), gr.update(visible=True), \
         gr.update(visible=True), gr.update(visible=True),\
         gr.update(visible=True), gr.update(visible=True), \
@@ -292,6 +293,7 @@ def image_matting(video_state, interactive_state, mask_dropdown, erode_kernel_si
     foreground, alpha = matanyone(matanyone_processor, following_frames, template_mask*255, r_erode=erode_kernel_size, r_dilate=dilate_kernel_size, n_warmup=refine_iter)
     foreground_output = Image.fromarray(foreground[-1])
     alpha_output = Image.fromarray(alpha[-1][:,:,0])
+
     return foreground_output, alpha_output
 
 # video matting
@@ -324,7 +326,7 @@ def video_matting(video_state, interactive_state, mask_dropdown, erode_kernel_si
 
     foreground_output = generate_video_from_frames(foreground, output_path="./results/{}_fg.mp4".format(video_state["video_name"]), fps=fps, audio_path=audio_path) # import video_input to name the output video
     alpha_output = generate_video_from_frames(alpha, output_path="./results/{}_alpha.mp4".format(video_state["video_name"]), fps=fps, gray2rgb=True, audio_path=audio_path) # import video_input to name the output video
-
+
     return foreground_output, alpha_output
 
 
@@ -409,38 +411,32 @@ sam_checkpoint_url_dict = {
     'vit_l': "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth",
     'vit_b': "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth"
 }
-checkpoint_folder = os.path.join('
+checkpoint_folder = os.path.join('..', 'pretrained_models')
 
 sam_checkpoint = load_file_from_url(sam_checkpoint_url_dict[args.sam_model_type], checkpoint_folder)
 # initialize sams
 model = MaskGenerator(sam_checkpoint, args)
 
 # initialize matanyone
-
-
-
-# matanyone_model = get_matanyone_model(ckpt_path, args.device)
-# load from Hugging Face
-from matanyone.model.matanyone import MatAnyone
-matanyone_model = MatAnyone.from_pretrained("PeiqingYang/MatAnyone")
-
+pretrain_model_url = "https://github.com/pq-yang/MatAnyone/releases/download/v1.0.0/matanyone.pth"
+ckpt_path = load_file_from_url(pretrain_model_url, checkpoint_folder)
+matanyone_model = get_matanyone_model(ckpt_path, args.device)
 matanyone_model = matanyone_model.to(args.device).eval()
-matanyone_processor = InferenceCore(matanyone_model, cfg=matanyone_model.cfg)
+# matanyone_processor = InferenceCore(matanyone_model, cfg=matanyone_model.cfg)
 
 # download test samples
-
-
-load_file_from_url(
-load_file_from_url(
-load_file_from_url(
-load_file_from_url(
-load_file_from_url(
-load_file_from_url(os.path.join(media_url, 'test-sample1.jpg'), test_sample_path)
+test_sample_path = os.path.join('.', "test_sample/")
+load_file_from_url('https://github.com/pq-yang/MatAnyone/releases/download/media/test-sample0-720p.mp4', test_sample_path)
+load_file_from_url('https://github.com/pq-yang/MatAnyone/releases/download/media/test-sample1-720p.mp4', test_sample_path)
+load_file_from_url('https://github.com/pq-yang/MatAnyone/releases/download/media/test-sample2-720p.mp4', test_sample_path)
+load_file_from_url('https://github.com/pq-yang/MatAnyone/releases/download/media/test-sample3-720p.mp4', test_sample_path)
+load_file_from_url('https://github.com/pq-yang/MatAnyone/releases/download/media/test-sample0.jpg', test_sample_path)
+load_file_from_url('https://github.com/pq-yang/MatAnyone/releases/download/media/test-sample1.jpg', test_sample_path)
 
 # download assets
-assets_path = os.path.join('
-load_file_from_url(
-load_file_from_url(
+assets_path = os.path.join('.', "assets/")
+load_file_from_url('https://github.com/pq-yang/MatAnyone/releases/download/media/tutorial_single_target.mp4', assets_path)
+load_file_from_url('https://github.com/pq-yang/MatAnyone/releases/download/media/tutorial_multi_targets.mp4', assets_path)
 
 # documents
 title = r"""<div class="multi-layer" align="center"><span>MatAnyone</span></div>
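This hunk moves model loading off the Hugging Face Hub (`MatAnyone.from_pretrained("PeiqingYang/MatAnyone")`) and back to the GitHub v1.0.0 release checkpoint via `get_matanyone_model`, matching the local demo. One caveat: `image_matting` and `video_matting` above still pass `matanyone_processor` into `matanyone(...)`, so commenting out the `InferenceCore` assignment will raise a NameError at matting time unless the wrapper constructs a processor elsewhere. For reference, a minimal sketch of the download-and-cache behavior the revert relies on, assuming `load_file_from_url` acts like the usual torch-hub helper (download once into the folder, return the local path):

    import os
    import torch

    def load_file_from_url_sketch(url: str, model_dir: str) -> str:
        # Hypothetical stand-in for the repo's load_file_from_url helper.
        os.makedirs(model_dir, exist_ok=True)
        local_path = os.path.join(model_dir, os.path.basename(url))
        if not os.path.exists(local_path):
            torch.hub.download_url_to_file(url, local_path)  # cached on later runs
        return local_path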
@@ -574,11 +570,11 @@ with gr.Blocks(theme=gr.themes.Monochrome(), css=my_custom_css) as demo:
     with gr.Row():
         with gr.Column():
             gr.Markdown("### Case 1: Single Target")
-            gr.Video(value="
+            gr.Video(value="./assets/tutorial_single_target.mp4", elem_classes="video")
 
         with gr.Column():
             gr.Markdown("### Case 2: Multiple Targets")
-            gr.Video(value="
+            gr.Video(value="./assets/tutorial_multi_targets.mp4", elem_classes="video")
 
     with gr.Tabs():
         with gr.TabItem("Video"):
@@ -978,4 +974,4 @@ with gr.Blocks(theme=gr.themes.Monochrome(), css=my_custom_css) as demo:
     gr.Markdown(article)
 
 demo.queue()
-demo.launch(debug=True)
+demo.launch(share=True, debug=True)
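`share=True` asks Gradio to open a temporary public gradio.live tunnel alongside the local server, which is useful for a local run; on a hosted Space the app is already publicly reachable, so the flag mainly matters off-platform. A sketch that requests a tunnel only outside a Space (the `SPACE_ID` environment check is an assumption about the hosting environment):

    import os

    # Hypothetical guard: request a share link only when not running on a Space.
    on_space = os.environ.get("SPACE_ID") is not None
    demo.queue()
    demo.launch(share=not on_space, debug=True)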