# Spaces: Paused
| import os | |
| import torch | |
| import gradio as gr | |
| from transformers import AutoModelForCausalLM, AutoProcessor | |
# Local checkout of the model repository; reused on later start-ups so the
# download only happens once.
model_path = "./VideoLLaMA3-7B"

if not os.path.exists(model_path):
    # Tooling install is best-effort: its exit status is intentionally left
    # unchecked, matching the original behaviour.
    # NOTE(review): presumably this lets hosts that already ship git (but do
    # not allow apt-get) continue to the clone step -- confirm.
    os.system("apt-get update && apt-get install -y git git-lfs && git lfs install")

    # The clone itself is mandatory: without it the model load further down
    # would fail with a much less obvious error, so surface the failure here.
    clone_status = os.system("git clone https://huggingface.co/DAMO-NLP-SG/VideoLLaMA3-7B")
    if clone_status != 0:
        raise RuntimeError(
            f"git clone of VideoLLaMA3-7B failed (os.system status {clone_status}); "
            f"cannot load the model from {model_path}"
        )
# Instantiate the model and its processor from the local checkout.
# No flash_attn: no attention implementation is requested in these options.
_model_load_kwargs = {
    "trust_remote_code": True,
    "device_map": "auto",
    "torch_dtype": torch.bfloat16,
}
model = AutoModelForCausalLM.from_pretrained(model_path, **_model_load_kwargs)

# The processor comes from the same checkout as the model.
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
def describe_video(video, question):
    """Answer a question about a video using the module-level model and processor.

    Args:
        video: Value from the video input; it is forwarded to the processor as
            ``video_path``. A falsy value (``None`` or ``""``) means nothing
            was uploaded.
        question: The prompt text sent alongside the video.

    Returns:
        The first decoded generation with surrounding whitespace stripped, or
        a short instruction message when no video was supplied.
    """
    # Guard before touching the model: without a video there is nothing to
    # pass as ``video_path``.
    if not video:
        return "Please upload a video first."

    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": [
                {"type": "video", "video": {"video_path": video, "fps": 1, "max_frames": 128}},
                {"type": "text", "text": question},
            ],
        },
    ]
    inputs = processor(conversation=conversation, return_tensors="pt")

    # Follow the model's own placement instead of hard-coding CUDA: the model
    # is loaded with device_map="auto", so it is not guaranteed to be on a GPU.
    target_device = model.device
    inputs = {
        key: value.to(target_device) if isinstance(value, torch.Tensor) else value
        for key, value in inputs.items()
    }

    # Cast the visual input to bfloat16, the dtype the model is loaded with.
    if "pixel_values" in inputs:
        inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)

    output_ids = model.generate(**inputs, max_new_tokens=128)
    return processor.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
# Gradio UI: a video upload and a question box feed describe_video, and the
# returned text is shown in a single response box.
video_input = gr.Video(label="Upload a video")
question_input = gr.Textbox(label="Question", value="Describe this video in detail.")
response_output = gr.Textbox(label="Response")

demo = gr.Interface(
    fn=describe_video,
    inputs=[video_input, question_input],
    outputs=response_output,
)

# Start the web app.
demo.launch()