pdf_gemini / app.py
Sebbe33's picture
Update app.py
7e4f227 verified
raw
history blame
4.68 kB
import os
import re
import io
import streamlit as st
from PIL import Image, ImageDraw
from google import genai
from google.genai import types
from pdf2image import convert_from_bytes
# Helper functions
def parse_list_boxes(text):
    """Extract all ``[ymin, xmin, ymax, xmax]`` bounding boxes from *text*.

    Scans the model's response for bracketed groups of four non-negative
    numbers and returns them as a list of 4-element float lists. Returns an
    empty list when no box-shaped group is present.
    """
    box_pattern = re.compile(
        r'\[([\d\.]+),\s*([\d\.]+),\s*([\d\.]+),\s*([\d\.]+)\]'
    )
    return [
        [float(coord) for coord in match.groups()]
        for match in box_pattern.finditer(text)
    ]
def draw_bounding_boxes(image, boxes):
    """Draw normalized bounding boxes onto *image* and return it.

    Parameters
    ----------
    image : PIL.Image.Image
        The image to annotate; it is modified in place.
    boxes : iterable of [ymin, xmin, ymax, xmax]
        Coordinates normalized to [0, 1] (fractions of the image size).
        Out-of-range values are clamped into [0, 1].

    Returns
    -------
    PIL.Image.Image
        The same image object, with a green rectangle per valid box.
    """
    draw = ImageDraw.Draw(image)
    width, height = image.size
    for box in boxes:
        # Clamp each coordinate into [0, 1] so off-range model output
        # cannot place a rectangle outside the canvas.
        ymin = max(0.0, min(1.0, box[0]))
        xmin = max(0.0, min(1.0, box[1]))
        ymax = max(0.0, min(1.0, box[2]))
        xmax = max(0.0, min(1.0, box[3]))
        # Skip degenerate boxes: Pillow's ImageDraw.rectangle raises
        # ValueError when x1 < x0 or y1 < y0, which would abort
        # annotation of the entire page for one bad model output.
        if xmax < xmin or ymax < ymin:
            continue
        draw.rectangle(
            [xmin * width, ymin * height, xmax * width, ymax * height],
            outline="#00FF00",
            width=3,
        )
    return image
# --- Streamlit UI -----------------------------------------------------------
st.title("PDF Themenerkennung mit Gemini")

col1, col2 = st.columns(2)

with col1:
    uploaded_file = st.file_uploader("PDF hochladen", type=["pdf"])
    topic_name = st.text_input(
        "Thema zur Erkennung",
        placeholder="z.B. 'Überschrift', 'Tabelle', 'Absatz'",
    )

    # The analyze button only appears once both inputs are provided.
    if uploaded_file and topic_name and st.button("Analysieren"):
        with st.spinner("Analysiere PDF..."):
            try:
                # Render every PDF page as a PIL image.
                pdf_bytes = uploaded_file.read()
                pages = convert_from_bytes(pdf_bytes)

                # API key is read from the KEY environment variable.
                client = genai.Client(api_key=os.getenv("KEY"))

                results = []
                for page_index, page_image in enumerate(pages):
                    # Serialize the page as PNG for the Gemini API.
                    png_buffer = io.BytesIO()
                    page_image.save(png_buffer, format='PNG')
                    image_part = types.Part.from_bytes(
                        data=png_buffer.getvalue(),
                        mime_type="image/png",
                    )

                    # First call: bounding boxes for the requested topic.
                    detection_prompt = (
                        f"Identifiziere alle {topic_name} Bereiche in diesem Dokument. "
                        "Gib Bounding Boxes im Format [ymin, xmin, ymax, xmax] "
                        "als reine Python-Liste ohne weiteren Text. "
                        "Beispiel: [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8]]"
                    )
                    box_response = client.models.generate_content(
                        model="gemini-2.0-flash-exp",
                        contents=[detection_prompt, image_part],
                    )

                    # Second call: free-form description of the page image.
                    desc_response = client.models.generate_content(
                        model="gemini-2.0-flash-exp",
                        contents=["Beschreibe diesen Dokumentenausschnitt detailliert.", image_part],
                    )

                    # Parse the boxes; fall back to "no boxes" on any error
                    # so one bad response does not kill the whole run.
                    try:
                        boxes = parse_list_boxes(box_response.text)
                    except Exception as e:
                        st.error(f"Fehler bei Seite {page_index+1}: {str(e)}")
                        boxes = []

                    # Annotate a copy so the rendered page stays untouched.
                    annotated = page_image.copy()
                    if boxes:
                        annotated = draw_bounding_boxes(annotated, boxes)

                    results.append({
                        "page": page_index + 1,
                        "image": annotated,
                        "description": desc_response.text,
                        "boxes": len(boxes),
                    })

                # Show the results in the right-hand column, one tab per page.
                with col2:
                    st.write(f"## Ergebnisse ({len(results)} Seiten)")
                    tabs = st.tabs([f"Seite {res['page']}" for res in results])
                    for tab, res in zip(tabs, results):
                        with tab:
                            st.image(
                                res["image"],
                                caption=f"Seite {res['page']} - {res['boxes']} {topic_name} erkannt",
                                use_container_width=True,
                            )
                            st.write("**Beschreibung:**", res["description"])
            except Exception as e:
                # Top-level boundary: surface any failure in the UI.
                st.error(f"Fehler: {str(e)}")