"""Streamlit app that highlights a user-named topic on every page of a PDF.

Each page of the uploaded PDF is rendered to an image and sent to Gemini
twice: once to obtain bounding boxes for the requested topic and once to
obtain a free-text description of the page. The boxes are drawn onto a copy
of the page image and shown, together with the description, in one tab per
page.
"""
import io
import os
import re

import streamlit as st
from google import genai
from google.genai import types
from PIL import Image, ImageDraw
from pdf2image import convert_from_bytes

# Model used for both the detection and the description request.
MODEL_NAME = "gemini-2.0-flash-exp"

# One decimal number: "12", "0.5", "1." or ".5", optionally negative.
# Unlike a bare character class such as [\d.]+, this never matches tokens
# like "1.2.3" or "." that float() would reject.
_NUMBER = r"(-?(?:\d+(?:\.\d*)?|\.\d+))"

# Four comma-separated numbers in square brackets, tolerating whitespace
# around the numbers so "[ 0.1 , 0.2, 0.3, 0.4 ]" is accepted as well.
_BOX_PATTERN = re.compile(
    r"\[\s*" + r"\s*,\s*".join([_NUMBER] * 4) + r"\s*\]"
)


# Helper functions
def parse_list_boxes(text):
    """Extract every four-number bracketed list from *text*.

    Args:
        text: Model response text expected to contain boxes written as
            ``[ymin, xmin, ymax, xmax]``.

    Returns:
        A list of ``[float, float, float, float]`` lists in order of
        appearance; empty when nothing matches. Bracket groups that are not
        exactly four well-formed numbers are skipped.

    Raises:
        TypeError: If *text* is not a string (for example ``None``).
    """
    return [
        [float(value) for value in match]
        for match in _BOX_PATTERN.findall(text)
    ]


def _clamp_unit(value):
    """Clamp *value* to the closed interval [0.0, 1.0]."""
    return max(0.0, min(1.0, value))


def draw_bounding_boxes(image, boxes):
    """Draw green rectangles for *boxes* onto *image* and return it.

    Args:
        image: PIL image to draw on; it is modified in place.
        boxes: Iterable of ``[ymin, xmin, ymax, xmax]`` sequences holding
            fractions of the image height/width.

    Returns:
        The same *image* object that was passed in.
    """
    draw = ImageDraw.Draw(image)
    width, height = image.size

    for box in boxes:
        # NOTE(review): values above 1 are clamped to the image edge. Gemini's
        # native box format is reportedly normalised to 0-1000 - confirm that
        # the prompt reliably yields 0-1 fractions.
        y_a = _clamp_unit(box[0])
        x_a = _clamp_unit(box[1])
        y_b = _clamp_unit(box[2])
        x_b = _clamp_unit(box[3])

        # Order the corners: recent Pillow versions raise ValueError when the
        # second corner lies above/left of the first, which would otherwise
        # abort the whole analysis on a single swapped box.
        top, bottom = sorted((y_a, y_b))
        left, right = sorted((x_a, x_b))

        draw.rectangle(
            [left * width, top * height, right * width, bottom * height],
            outline="#00FF00",
            width=3,
        )

    return image


def _image_to_part(image):
    """Encode a PIL image as PNG and wrap it in a Gemini content part."""
    buffer = io.BytesIO()
    image.save(buffer, format="PNG")
    return types.Part.from_bytes(data=buffer.getvalue(), mime_type="image/png")


# Streamlit UI
st.title("PDF Themenerkennung mit Gemini")
col1, col2 = st.columns(2)

with col1:
    uploaded_file = st.file_uploader("PDF hochladen", type=["pdf"])
    topic_name = st.text_input(
        "Thema zur Erkennung",
        placeholder="z.B. 'Überschrift', 'Tabelle', 'Absatz'",
    )

    # The button is only rendered once both inputs are present.
    if uploaded_file and topic_name and st.button("Analysieren"):
        with st.spinner("Analysiere PDF..."):
            # Top-level boundary: any failure (PDF conversion, API call,
            # rendering) is reported to the user instead of crashing the app.
            try:
                # Convert PDF to images. getvalue() returns the full upload
                # regardless of the buffer's current read position.
                pdf_bytes = uploaded_file.getvalue()
                images = convert_from_bytes(pdf_bytes)
                results = []

                # Initialize client
                client = genai.Client(api_key=os.getenv("KEY"))

                # The prompt does not depend on the page, so build it once.
                detection_prompt = (
                    f"Identifiziere alle {topic_name} Bereiche in diesem Dokument. "
                    "Gib Bounding Boxes im Format [ymin, xmin, ymax, xmax] "
                    "als reine Python-Liste ohne weiteren Text. \n"
                    "Beispiel: [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8]]"
                )

                for page_number, image in enumerate(images, start=1):
                    # Prepare image
                    image_part = _image_to_part(image)

                    # Get topic boxes
                    box_response = client.models.generate_content(
                        model=MODEL_NAME,
                        contents=[detection_prompt, image_part],
                    )

                    # Get description
                    desc_response = client.models.generate_content(
                        model=MODEL_NAME,
                        contents=[
                            "Beschreibe diesen Dokumentenausschnitt detailliert.",
                            image_part,
                        ],
                    )

                    # Process boxes; a page without usable response text is
                    # reported and shown without annotations.
                    try:
                        boxes = parse_list_boxes(box_response.text)
                    except (TypeError, ValueError) as e:
                        st.error(f"Fehler bei Seite {page_number}: {e}")
                        boxes = []

                    # Draw boxes on a copy so the rendered page stays untouched.
                    annotated_image = image.copy()
                    if boxes:
                        annotated_image = draw_bounding_boxes(annotated_image, boxes)

                    results.append({
                        "page": page_number,
                        "image": annotated_image,
                        "description": desc_response.text,
                        "boxes": len(boxes),
                    })

                # Display results
                with col2:
                    st.write(f"## Ergebnisse ({len(results)} Seiten)")
                    # st.tabs rejects an empty label list, so only build tabs
                    # when at least one page was processed.
                    if results:
                        tabs = st.tabs([f"Seite {res['page']}" for res in results])
                        for tab, res in zip(tabs, results):
                            with tab:
                                st.image(
                                    res["image"],
                                    caption=f"Seite {res['page']} - {res['boxes']} {topic_name} erkannt",
                                    use_container_width=True,
                                )
                                st.write("**Beschreibung:**", res["description"])
            except Exception as e:
                st.error(f"Fehler: {e}")