Commit e02f821 · committed by kirill
Parent(s): ee2df8c
Added Image Captioning and Visual Q&A

Files changed:
- app.py +11 -68
- image_captioning.py +61 -0
- visual_qa.py +58 -0
- zero_shot_classification.py +68 -0
app.py
CHANGED
@@ -1,75 +1,18 @@
-from transformers import CLIPModel, CLIPProcessor
-from PIL import Image
-import time
 import gradio as gr
+from image_captioning import get_image_captioning_tab
+from visual_qa import get_visual_qa_tab
+from zero_shot_classification import get_zero_shot_classification_tab
-
-
-openai_model_name = "openai/clip-vit-large-patch14"
-openai_model = CLIPModel.from_pretrained(openai_model_name)
-openai_processor = CLIPProcessor.from_pretrained(openai_model_name)
-
-patrickjohncyh_model_name = "patrickjohncyh/fashion-clip"
-patrickjohncyh_model = CLIPModel.from_pretrained(patrickjohncyh_model_name)
-patrickjohncyh_processor = CLIPProcessor.from_pretrained(patrickjohncyh_model_name)
-
-model_map = {
-    openai_model_name: (openai_model, openai_processor),
-    patrickjohncyh_model_name: (patrickjohncyh_model, patrickjohncyh_processor)
-}
-
-
-def gradio_process(model_name, image, text):
-    (model, processor) = model_map[model_name]
-    labels = text.split(", ")
-    print (labels)
-    start = time.time()
-    inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
-    outputs = model(**inputs)
-    probs = outputs.logits_per_image.softmax(dim=1)[0]
-    end = time.time()
-    time_spent = end - start
-    probs = list(probs)
-    results = []
-    for i in range(len(labels)):
-        results.append(f"{labels[i]} - {probs[i].item():.4f}")
-    result = "\n".join(results)
-
-    return [result, time_spent]
-
-
-with gr.Blocks() as zero_shot_image_classification_tab:
-    gr.Markdown("# Zero-Shot Image Classification")
-
-    with gr.Row():
-        with gr.Column():
-            # Input components
-            input_image = gr.Image(label="Upload Image", type="pil")
-            input_text = gr.Textbox(label="Labels (comma separated)")
-            model_selector = gr.Dropdown([openai_model_name, patrickjohncyh_model_name],
-                                         label = "Select Model")
-
-            # Process button
-            process_btn = gr.Button("Classificate")
-
-        with gr.Column():
-            # Output components
-            elapsed_result = gr.Textbox(label="Seconds elapsed", lines=1)
-            output_text = gr.Textbox(label="Classification")
-
-    # Connect the input components to the processing function
-    process_btn.click(
-        fn=gradio_process,
-        inputs=[
-            model_selector,
-            input_image,
-            input_text
-        ],
-        outputs=[output_text, elapsed_result]
-    )
 
 
 with gr.Blocks() as app:
-
+    image_captioning_tab = get_image_captioning_tab()
+    visual_qa_tab = get_visual_qa_tab()
+    zero_shot_classification_tab = get_zero_shot_classification_tab()
+
+    gr.TabbedInterface(
+        [image_captioning_tab, visual_qa_tab, zero_shot_classification_tab],
+        ["Image Captioning", "Visual Q&A", "Zero-Shot Classification"]
+    )
 
 
 app.launch()
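Each task module now follows the same factory contract: load its models once, define a handler, build the UI inside a gr.Blocks, and return that Blocks for app.py to mount via gr.TabbedInterface. As a minimal sketch of what adding a fourth tab would look like (this translation module and its contents are hypothetical, not part of the commit):

# hypothetical translation.py following the same factory contract
import gradio as gr

def get_translation_tab():
    with gr.Blocks() as translation_tab:
        gr.Markdown("# Translation")
        # inputs, a process button, and outputs would be wired up
        # with process_btn.click(...) exactly as in the modules below
    return translation_tab

app.py would then import get_translation_tab and append the returned Blocks and a tab title to the two lists passed to gr.TabbedInterface.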
image_captioning.py
ADDED
@@ -0,0 +1,61 @@
+from transformers import BlipForConditionalGeneration, BlipProcessor
+import time
+import gradio as gr
+
+
+def get_image_captioning_tab():
+    salesforce_model_name = "Salesforce/blip-image-captioning-base"
+    salesforce_model = BlipForConditionalGeneration.from_pretrained(salesforce_model_name)
+    salesforce_processor = BlipProcessor.from_pretrained(salesforce_model_name)
+
+    noamrot_model_name = "noamrot/FuseCap_Image_Captioning"
+    noamrot_model = BlipForConditionalGeneration.from_pretrained(noamrot_model_name)
+    noamrot_processor = BlipProcessor.from_pretrained(noamrot_model_name)
+
+    model_map = {
+        salesforce_model_name: (salesforce_model, salesforce_processor),
+        noamrot_model_name: (noamrot_model, noamrot_processor)
+    }
+
+    def gradio_process(model_name, image, text):
+        (model, processor) = model_map[model_name]
+        start = time.time()
+        inputs = processor(image, text, return_tensors="pt")
+        out = model.generate(**inputs)
+        result = processor.decode(out[0], skip_special_tokens=True)
+        end = time.time()
+        time_spent = end - start
+
+        return [result, time_spent]
+
+    with gr.Blocks() as image_captioning_tab:
+        gr.Markdown("# Image Captioning")
+
+        with gr.Row():
+            with gr.Column():
+                # Input components
+                input_image = gr.Image(label="Upload Image", type="pil")
+                input_text = gr.Textbox(label="Caption")
+                model_selector = gr.Dropdown([salesforce_model_name, noamrot_model_name],
+                                             label="Select Model")
+
+                # Process button
+                process_btn = gr.Button("Generate caption")
+
+            with gr.Column():
+                # Output components
+                elapsed_result = gr.Textbox(label="Seconds elapsed", lines=1)
+                output_text = gr.Textbox(label="Generated caption")
+
+        # Connect the input components to the processing function
+        process_btn.click(
+            fn=gradio_process,
+            inputs=[
+                model_selector,
+                input_image,
+                input_text
+            ],
+            outputs=[output_text, elapsed_result]
+        )
+
+    return image_captioning_tab
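The handler above is plain BLIP conditional generation: the Caption textbox supplies a text prefix that the model continues. A minimal standalone sketch of the same calls (the image path and prefix text are placeholder assumptions):

from transformers import BlipForConditionalGeneration, BlipProcessor
from PIL import Image

model_name = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(model_name)
model = BlipForConditionalGeneration.from_pretrained(model_name)

image = Image.open("example.jpg")  # placeholder image path
# the text argument is an optional prefix the model continues
inputs = processor(image, "a photo of", return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=30)
print(processor.decode(out[0], skip_special_tokens=True))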
visual_qa.py
ADDED
@@ -0,0 +1,58 @@
+from transformers import pipeline
+import time
+import gradio as gr
+
+
+def get_visual_qa_tab():
+    salesforce_model_name = "Salesforce/blip-vqa-base"
+    salesforce_pipe = pipeline("visual-question-answering", model=salesforce_model_name)
+
+    dandelin_model_name = "dandelin/vilt-b32-finetuned-vqa"
+    dandelin_pipe = pipeline("visual-question-answering", model=dandelin_model_name)
+
+    pipe_map = {
+        salesforce_model_name: salesforce_pipe,
+        dandelin_model_name: dandelin_pipe
+    }
+
+    def gradio_process(model_name, image, text):
+        pipe = pipe_map[model_name]
+        start = time.time()
+        output = pipe(image, text)
+        end = time.time()
+        time_spent = end - start
+        result = output[0]['answer']
+
+        return [result, time_spent]
+
+    with gr.Blocks() as visual_qa_tab:
+        gr.Markdown("# Visual Question Answering")
+
+        with gr.Row():
+            with gr.Column():
+                # Input components
+                input_image = gr.Image(label="Upload Image", type="pil")
+                input_text = gr.Textbox(label="Question")
+                model_selector = gr.Dropdown([salesforce_model_name, dandelin_model_name],
+                                             label="Select Model")
+
+                # Process button
+                process_btn = gr.Button("Generate answer")
+
+            with gr.Column():
+                # Output components
+                elapsed_result = gr.Textbox(label="Seconds elapsed", lines=1)
+                output_text = gr.Textbox(label="Answer")
+
+        # Connect the input components to the processing function
+        process_btn.click(
+            fn=gradio_process,
+            inputs=[
+                model_selector,
+                input_image,
+                input_text
+            ],
+            outputs=[output_text, elapsed_result]
+        )
+
+    return visual_qa_tab
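Both models are driven through the transformers visual-question-answering pipeline, which takes an image and a question and returns a list of candidate answers ranked by score; the handler keeps only the top one. A standalone sketch of the same call (the image path and question are placeholders):

from transformers import pipeline

vqa = pipeline("visual-question-answering", model="dandelin/vilt-b32-finetuned-vqa")

# placeholder image path and question
output = vqa("example.jpg", "What color is the car?")
print(output[0]["answer"], output[0]["score"])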
zero_shot_classification.py
ADDED
@@ -0,0 +1,68 @@
+from transformers import CLIPModel, CLIPProcessor
+import time
+import gradio as gr
+
+
+def get_zero_shot_classification_tab():
+    openai_model_name = "openai/clip-vit-large-patch14"
+    openai_model = CLIPModel.from_pretrained(openai_model_name)
+    openai_processor = CLIPProcessor.from_pretrained(openai_model_name)
+
+    patrickjohncyh_model_name = "patrickjohncyh/fashion-clip"
+    patrickjohncyh_model = CLIPModel.from_pretrained(patrickjohncyh_model_name)
+    patrickjohncyh_processor = CLIPProcessor.from_pretrained(patrickjohncyh_model_name)
+
+    model_map = {
+        openai_model_name: (openai_model, openai_processor),
+        patrickjohncyh_model_name: (patrickjohncyh_model, patrickjohncyh_processor)
+    }
+
+    def gradio_process(model_name, image, text):
+        (model, processor) = model_map[model_name]
+        labels = text.split(", ")
+        print(labels)
+        start = time.time()
+        inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
+        outputs = model(**inputs)
+        probs = outputs.logits_per_image.softmax(dim=1)[0]
+        end = time.time()
+        time_spent = end - start
+        probs = list(probs)
+        results = []
+        for i in range(len(labels)):
+            results.append(f"{labels[i]} - {probs[i].item():.4f}")
+        result = "\n".join(results)
+
+        return [result, time_spent]
+
+    with gr.Blocks() as zero_shot_image_classification_tab:
+        gr.Markdown("# Zero-Shot Image Classification")
+
+        with gr.Row():
+            with gr.Column():
+                # Input components
+                input_image = gr.Image(label="Upload Image", type="pil")
+                input_text = gr.Textbox(label="Labels (comma separated)")
+                model_selector = gr.Dropdown([openai_model_name, patrickjohncyh_model_name],
+                                             label="Select Model")
+
+                # Process button
+                process_btn = gr.Button("Classify")
+
+            with gr.Column():
+                # Output components
+                elapsed_result = gr.Textbox(label="Seconds elapsed", lines=1)
+                output_text = gr.Textbox(label="Classification")
+
+        # Connect the input components to the processing function
+        process_btn.click(
+            fn=gradio_process,
+            inputs=[
+                model_selector,
+                input_image,
+                input_text
+            ],
+            outputs=[output_text, elapsed_result]
+        )
+
+    return zero_shot_image_classification_tab
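The CLIP forward pass scores every label prompt against the image: logits_per_image has shape (num_images, num_labels), so softmax(dim=1) on its single row yields one probability per label, which the loop above formats. The same computation standalone (the image path and labels are placeholders):

from transformers import CLIPModel, CLIPProcessor
from PIL import Image

model_name = "openai/clip-vit-large-patch14"
model = CLIPModel.from_pretrained(model_name)
processor = CLIPProcessor.from_pretrained(model_name)

image = Image.open("example.jpg")              # placeholder image path
labels = ["a dress", "a handbag", "sneakers"]  # placeholder labels
inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
probs = model(**inputs).logits_per_image.softmax(dim=1)[0]
for label, prob in zip(labels, probs):
    print(f"{label} - {prob.item():.4f}")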