import spaces
import torch
import time
import gradio as gr
from PIL import Image
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from typing import List

MODEL_ID = "remyxai/SpaceQwen2.5-VL-3B-Instruct"

def load_model():
    print("Loading model and processor...")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
    ).to(device)
    processor = AutoProcessor.from_pretrained(MODEL_ID)
    return model, processor

model, processor = load_model()
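
# Rough sizing note (added, not from the original file): a 3B-parameter model holds roughly
# 3e9 * 2 bytes ≈ 6 GB of weights in bfloat16 on GPU, or about 12 GB in float32 on CPU,
# before activations and KV-cache overhead.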

def process_image(image_path_or_obj):
    """Loads, resizes, and preprocesses an image path or Pillow Image."""
    if isinstance(image_path_or_obj, str):
        # Path on disk or from history
        image = Image.open(image_path_or_obj).convert("RGB")
    elif isinstance(image_path_or_obj, Image.Image):
        image = image_path_or_obj.convert("RGB")
    else:
        raise ValueError("process_image expects a file path (str) or PIL.Image")

    max_width = 512
    if image.width > max_width:
        aspect_ratio = image.height / image.width
        new_height = int(max_width * aspect_ratio)
        image = image.resize((max_width, new_height), Image.Resampling.LANCZOS)
        print(f"Resized image to: {max_width}x{new_height}")
    return image
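
# Illustrative usage (sizes are examples only):
#   process_image("some_photo.jpg")             # a 1024x768 file -> 512x384 RGB PIL.Image
#   process_image(Image.new("RGB", (300, 200))) # already <= 512 px wide -> returned unresized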

def get_latest_image(history):
    """
    Look from the end to find the last user-uploaded image (stored as (file_path,)).
    Return None if not found.
    """
    for user_msg, _assistant_msg in reversed(history):
        if isinstance(user_msg, tuple) and len(user_msg) > 0:
            return user_msg[0]
    return None
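
# Illustrative example (history shape as produced by add_message below; file name is made up):
#   history = [[("room.jpg",), None], ["How wide is the table?", None]]
#   get_latest_image(history)         # -> "room.jpg"
#   get_latest_image([["hi", None]])  # -> None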

def only_assistant_text(full_text: str) -> str:
    """
    Return only the final assistant answer from the fully decoded output.
    The decoded text still contains the chat-template role sections
    ('system', 'user', 'assistant'), so we split on the first 'assistant'
    marker and keep everything after it.
    Adjust this parsing if your model's output format differs.
    """
    # Example decoded output:
    #   system
    #   ...
    #   user
    #   ...
    #   assistant
    #   The final answer
    if "assistant" in full_text:
        parts = full_text.split("assistant", 1)
        result = parts[-1].strip()
        # Remove any leading punctuation (like a colon)
        result = result.lstrip(":").strip()
        return result
    return full_text.strip()
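
# Note (assumption, not stated in the original listing): the unused `import spaces` above is
# only meaningful on a ZeroGPU Space, where the GPU-bound function is typically decorated,
# e.g.:
#   @spaces.GPU
#   def run_inference(image, prompt): ...
# No such decorator appears here, so the Space presumably runs on standard GPU/CPU hardware.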

def run_inference(image, prompt):
    """Runs Qwen2.5-VL inference on a single image and text prompt."""
    system_msg = (
        "You are a Vision Language Model specialized in interpreting visual data from images. "
        "Your task is to analyze the provided image and respond to queries with concise answers."
    )
    conversation = [
        {
            "role": "system",
            "content": [{"type": "text", "text": system_msg}],
        },
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": prompt},
            ],
        },
    ]
    text_input = processor.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(text=[text_input], images=[image], return_tensors="pt").to(model.device)
    generated_ids = model.generate(**inputs, max_new_tokens=1024)
    output_text = processor.batch_decode(
        generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    # Parse out only the final assistant text
    return only_assistant_text(output_text)
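
# Illustrative call (the reply text is made up):
#   reply = run_inference(process_image("./examples/warehouse_rgb.jpg"),
#                         "How far is the pallet from the forklift?")
#   # -> a short free-text answer such as "About 2 meters."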

def add_message(history, user_input):
    """
    Step 1 (triggered by the user's 'Submit' or 'Send'):
      - Save new text or images into `history`.
      - The Chatbot display uses pairs: [user_text_or_image, assistant_reply].
    """
    if not isinstance(history, list):
        history = []

    files = user_input.get("files", [])
    text = user_input.get("text", "")

    # Store images
    for f in files:
        # Each image is stored as `[(file_path,), None]`
        history.append([(f,), None])

    # Store text
    if text:
        history.append([text, None])

    return history, gr.MultimodalTextbox(value=None)
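
# Illustrative result (made-up inputs): one uploaded image plus one question becomes two
# history rows, each still awaiting an assistant reply:
#   add_message([], {"files": ["cat.jpg"], "text": "What breed is this?"})
#   # -> ([[("cat.jpg",), None], ["What breed is this?", None]], <cleared MultimodalTextbox>)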

def inference_interface(history):
    """
    Step 2: Use the most recent text + the most recent image to run Qwen2.5-VL.
    Instead of adding another entry, we fill the assistant's answer into
    the last user text entry.
    """
    if not history:
        return history, gr.MultimodalTextbox(value=None)

    # 1) Get the user's most recent text
    user_text = ""
    # We'll search from the end for the first str we find
    for idx in range(len(history) - 1, -1, -1):
        user_msg, assistant_msg = history[idx]
        if isinstance(user_msg, str):
            user_text = user_msg
            # We'll also keep track of this index so we can fill in the assistant reply
            user_idx = idx
            break
    else:
        # No user text found
        print("No user text found in history. Skipping inference.")
        return history, gr.MultimodalTextbox(value=None)

    # 2) Get the latest image from the entire conversation
    latest_image = get_latest_image(history)
    if not latest_image:
        # No image found => can't run the model
        print("No image found in history. Skipping inference.")
        return history, gr.MultimodalTextbox(value=None)

    # 3) Process the image
    pil_image = process_image(latest_image)

    # 4) Run inference
    assistant_reply = run_inference(pil_image, user_text)

    # 5) Fill that assistant reply back into the last user text entry
    history[user_idx][1] = assistant_reply
    return history, gr.MultimodalTextbox(value=None)
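
# Illustrative end-to-end flow (made-up question and answer):
#   history = [[("./examples/warehouse_rgb.jpg",), None], ["How tall are the shelves?", None]]
#   history, _ = inference_interface(history)
#   # history[-1] is now ["How tall are the shelves?", "<model's answer>"]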

def build_demo():
    with gr.Blocks() as demo:
        gr.Markdown("# SpaceQwen2.5-VL Image Prompt Chatbot")

        chatbot = gr.Chatbot([], line_breaks=True)
        chat_input = gr.MultimodalTextbox(
            interactive=True,
            file_types=["image"],
            placeholder="Enter text or upload an image (or both).",
            show_label=True
        )

        # When the user presses Enter in the MultimodalTextbox:
        submit_event = chat_input.submit(
            fn=add_message,          # Step 1: store user data
            inputs=[chatbot, chat_input],
            outputs=[chatbot, chat_input]
        )
        # After storing, run inference
        submit_event.then(
            fn=inference_interface,  # Step 2: run Qwen2.5-VL
            inputs=[chatbot],
            outputs=[chatbot, chat_input]
        )

        # Same logic for a "Send" button
        with gr.Row():
            send_button = gr.Button("Send")
            clear_button = gr.ClearButton([chatbot, chat_input])

        send_click = send_button.click(
            fn=add_message,
            inputs=[chatbot, chat_input],
            outputs=[chatbot, chat_input]
        )
        send_click.then(
            fn=inference_interface,
            inputs=[chatbot],
            outputs=[chatbot, chat_input]
        )

        # Example
        gr.Examples(
            examples=[
                {
                    "text": "Give me the height of the man in the red hat in feet.",
                    "files": ["./examples/warehouse_rgb.jpg"]
                }
            ],
            inputs=[chat_input],
        )
    return demo

if __name__ == "__main__":
    demo = build_demo()
    demo.launch(share=True)
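
# Note (assumption): share=True only matters when this file is run locally, where it requests a
# public gradio.live link; on a hosted Hugging Face Space the flag is ignored with a warning.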