Spaces:

cweigendev
/

videoanalyer2

Paused

videoanalyer2 / app.py

Update app.py

d47fcec verified 4 months ago

1.69 kB

	import os
	import torch
	import gradio as gr
	from transformers import AutoModelForCausalLM, AutoProcessor

	# Local checkout of the model repository; the same path is later handed to
	# from_pretrained, so it is defined once and reused for the existence check.
	model_path = "./VideoLLaMA3-7B"

	# Clone the model if not already downloaded.
	if not os.path.exists(model_path):
	    # Best effort: installing git/git-lfs can fail (for example without root
	    # privileges) even though a usable git is already present, so a non-zero
	    # status here only produces a warning.
	    if os.system("apt-get update && apt-get install -y git git-lfs && git lfs install") != 0:
	        print("Warning: could not install git/git-lfs; trying to clone anyway.")
	    # The checkout is required to load the model, so a failed clone stops the
	    # script with a clear error instead of being silently ignored.
	    if os.system("git clone https://huggingface.co/DAMO-NLP-SG/VideoLLaMA3-7B") != 0:
	        raise RuntimeError("git clone of DAMO-NLP-SG/VideoLLaMA3-7B failed")

	# Load the causal LM from the local checkout (flash_attn is not requested).
	# Weights are loaded as bfloat16 and placed according to device_map="auto";
	# trust_remote_code=True is passed for both the model and the processor.
	model = AutoModelForCausalLM.from_pretrained(
	    model_path,
	    torch_dtype=torch.bfloat16,
	    device_map="auto",
	    trust_remote_code=True,
	)

	# Processor that turns a conversation into model inputs and decodes outputs.
	processor = AutoProcessor.from_pretrained(
	    model_path,
	    trust_remote_code=True,
	)

	def describe_video(video, question):
	    """Answer a question about a video using the loaded model.

	    Args:
	        video: Value supplied by the ``gr.Video`` input; it is forwarded to the
	            processor as ``video_path``.
	        question: Text prompt to ask about the video.

	    Returns:
	        The first decoded generation, stripped of surrounding whitespace.

	    Raises:
	        gr.Error: If no video was provided.
	    """
	    # Fail early with a readable message instead of an obscure processor error.
	    if not video:
	        raise gr.Error("Please upload a video first.")

	    conversation = [
	        {"role": "system", "content": "You are a helpful assistant."},
	        {
	            "role": "user",
	            "content": [
	                {"type": "video", "video": {"video_path": video, "fps": 1, "max_frames": 128}},
	                {"type": "text", "text": question},
	            ],
	        },
	    ]
	    inputs = processor(conversation=conversation, return_tensors="pt")

	    # The model is loaded with device_map="auto", so follow the device it was
	    # actually placed on rather than assuming CUDA is available.
	    device = model.device
	    inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}

	    # Match the dtype the model weights were loaded with (bfloat16).
	    if "pixel_values" in inputs:
	        inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)

	    output_ids = model.generate(**inputs, max_new_tokens=128)
	    return processor.batch_decode(output_ids, skip_special_tokens=True)[0].strip()

	# Gradio interface: a video upload plus a question box feed describe_video,
	# and the returned text is shown in a single response box.
	video_input = gr.Video(label="Upload a video")
	question_input = gr.Textbox(label="Question", value="Describe this video in detail.")
	response_output = gr.Textbox(label="Response")

	demo = gr.Interface(
	    fn=describe_video,
	    inputs=[video_input, question_input],
	    outputs=response_output,
	)

	# Start serving the interface.
	demo.launch()