# pdf_gemini

# Running

# File size: 4,682 Bytes

import os
import re
import io
import streamlit as st
from PIL import Image, ImageDraw
from google import genai
from google.genai import types
from pdf2image import convert_from_bytes

# Helper functions
# One decimal number: optional sign, then either digits with an optional
# fractional part ("12", "12.", "12.5") or a bare fraction (".5").
# Malformed tokens such as "1.2.3" deliberately do not match.
_BOX_NUMBER = r'[-+]?(?:\d+(?:\.\d*)?|\.\d+)'

# Exactly four comma-separated numbers inside one pair of square brackets,
# tolerating whitespace around the numbers and commas.
_BOX_PATTERN = re.compile(
    r'\[\s*' + r'\s*,\s*'.join([f'({_BOX_NUMBER})'] * 4) + r'\s*\]'
)


def parse_list_boxes(text):
    """Extract bounding boxes from response text.

    Every bracketed group of exactly four numbers found anywhere in
    ``text`` is returned as one box, in order of appearance. Surrounding
    prose, code fences or an outer list bracket are ignored.

    Args:
        text: Raw response text; may be ``None`` or empty.

    Returns:
        A list of ``[a, b, c, d]`` lists of floats (empty when ``text`` is
        falsy or contains no well-formed box). Values are returned as
        written; no range checking is done here.
    """
    if not text:
        # No text at all simply means no boxes were reported.
        return []
    return [
        [float(value) for value in match]
        for match in _BOX_PATTERN.findall(text)
    ]

def draw_bounding_boxes(image, boxes):
    """Draw green rectangle outlines for normalized boxes onto ``image``.

    Args:
        image: PIL image to draw on. The drawing is applied to this object
            itself, so pass a copy if the original must stay untouched.
        boxes: Iterable of ``[ymin, xmin, ymax, xmax]`` sequences whose
            values are fractions of the image height/width. Values outside
            ``0.0..1.0`` are clamped; corners given in the wrong order are
            swapped.

    Returns:
        The same ``image`` object that was passed in.
    """
    draw = ImageDraw.Draw(image)
    width, height = image.size

    for box in boxes:
        # Clamp each of the first four values into the unit interval.
        y_a, x_a, y_b, x_b = (max(0.0, min(1.0, box[i])) for i in range(4))

        # Pillow documents rectangle() as requiring x1 >= x0 and y1 >= y0
        # (newer releases raise ValueError otherwise), and model output may
        # list the corners in either order, so normalise the order first.
        ymin, ymax = sorted((y_a, y_b))
        xmin, xmax = sorted((x_a, x_b))

        draw.rectangle(
            [
                xmin * width,
                ymin * height,
                xmax * width,
                ymax * height,
            ],
            outline="#00FF00",
            width=3,
        )
    return image

# Streamlit UI

# Model used for both the box-detection and the description request.
GEMINI_MODEL = "gemini-2.0-flash-exp"


def _analyze_page(client, image, topic_name, page_number):
    """Run detection and description for one page image.

    Sends the page (as PNG) to the model twice: once asking for bounding
    boxes of ``topic_name`` regions and once asking for a description.

    Args:
        client: ``genai.Client`` used for the requests.
        image: PIL image of the page; it is not modified (boxes are drawn
            on a copy).
        topic_name: User-supplied topic inserted into the detection prompt.
        page_number: 1-based page number, used in messages and the result.

    Returns:
        Dict with keys ``page``, ``image`` (annotated copy),
        ``description`` and ``boxes`` (number of boxes found).
    """
    # Encode the page as PNG bytes for the API.
    png_buffer = io.BytesIO()
    image.save(png_buffer, format='PNG')
    image_part = types.Part.from_bytes(
        data=png_buffer.getvalue(),
        mime_type="image/png",
    )

    detection_prompt = (
        f"Identifiziere alle {topic_name} Bereiche in diesem Dokument. "
        "Gib Bounding Boxes im Format [ymin, xmin, ymax, xmax] "
        "als reine Python-Liste ohne weiteren Text. "
        "Beispiel: [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8]]"
    )
    box_response = client.models.generate_content(
        model=GEMINI_MODEL,
        contents=[detection_prompt, image_part],
    )
    desc_response = client.models.generate_content(
        model=GEMINI_MODEL,
        contents=["Beschreibe diesen Dokumentenausschnitt detailliert.", image_part],
    )

    # A response may carry no text at all; treat that as "no boxes" rather
    # than letting the parser fail on None.
    try:
        boxes = parse_list_boxes(box_response.text or "")
    except (TypeError, ValueError) as e:
        # A malformed answer for one page should not abort the other pages.
        st.error(f"Fehler bei Seite {page_number}: {e}")
        boxes = []

    annotated_image = image.copy()
    if boxes:
        annotated_image = draw_bounding_boxes(annotated_image, boxes)

    return {
        "page": page_number,
        "image": annotated_image,
        # Avoid rendering the literal "None" when no description came back.
        "description": desc_response.text or "",
        "boxes": len(boxes),
    }


st.title("PDF Themenerkennung mit Gemini")
col1, col2 = st.columns(2)

with col1:
    uploaded_file = st.file_uploader("PDF hochladen", type=["pdf"])
    topic_name = st.text_input("Thema zur Erkennung", placeholder="z.B. 'Überschrift', 'Tabelle', 'Absatz'")

    # The button is only rendered once both inputs are present.
    if uploaded_file and topic_name and st.button("Analysieren"):
        with st.spinner("Analysiere PDF..."):
            try:
                # getvalue() returns the whole upload regardless of the
                # current read position, so repeated runs see the full file.
                pdf_bytes = uploaded_file.getvalue()
                images = convert_from_bytes(pdf_bytes)

                client = genai.Client(api_key=os.getenv("KEY"))

                results = [
                    _analyze_page(client, image, topic_name, page_number)
                    for page_number, image in enumerate(images, start=1)
                ]

                # Display results: one tab per page in the right-hand column.
                with col2:
                    st.write(f"## Ergebnisse ({len(results)} Seiten)")
                    tabs = st.tabs([f"Seite {res['page']}" for res in results])

                    for tab, res in zip(tabs, results):
                        with tab:
                            st.image(
                                res["image"],
                                caption=f"Seite {res['page']} - {res['boxes']} {topic_name} erkannt",
                                use_container_width=True,
                            )
                            st.write("**Beschreibung:**", res["description"])

            except Exception as e:
                # Top-level UI boundary: surface any failure to the user
                # instead of crashing the app.
                st.error(f"Fehler: {e}")