# Spaces: ND06-25 / Slash (Sleeping)
# File size: 5,972 Bytes

import os
import streamlit as st
from typing import Dict, Any

from api.pdf_processor import PDFProcessor
from api.summarizer import BookSummarizer

# Default summarization model id; can be overridden with the DEFAULT_MODEL
# environment variable.
DEFAULT_MODEL = os.getenv("DEFAULT_MODEL", "sshleifer/distilbart-cnn-12-6")
# Model catalogue for the sidebar, fetched once at import time from a
# BookSummarizer that is not kept (load_model() is not called on it here).
# Entries are read elsewhere in this file as dicts with "name" and
# "description" keys.
AVAILABLE_MODELS = BookSummarizer(DEFAULT_MODEL).get_available_models()


# NOTE(review): Streamlit has historically required set_page_config to be the
# first Streamlit command executed in a script — keep it above any other
# st.* call unless the minimum supported Streamlit version is confirmed.
st.set_page_config(
    page_title="Book Summarizer",
    page_icon="📚",
    layout="wide",
    initial_sidebar_state="expanded",
)


@st.cache_resource
def get_pdf_processor() -> PDFProcessor:
    """Create the ``PDFProcessor`` used by the app.

    The function is wrapped in ``st.cache_resource``, so the instance built
    here is handed to Streamlit's resource cache.
    """
    processor = PDFProcessor()
    return processor


@st.cache_resource
def get_summarizer(model_name: str) -> BookSummarizer:
    """Create a ``BookSummarizer`` for *model_name* and load its model.

    Args:
        model_name: Model identifier forwarded to ``BookSummarizer``.

    Returns:
        The summarizer after ``load_model()`` has been called on it. The
        function is wrapped in ``st.cache_resource``.
    """
    ready_summarizer = BookSummarizer(model_name=model_name)
    # Load before returning so callers always receive a usable instance.
    ready_summarizer.load_model()
    return ready_summarizer


def summarize_pdf(
    uploaded_file,
    model_name: str,
    max_length: int,
    min_length: int,
    chunk_size: int,
    overlap: int,
) -> Dict[str, Any]:
    """Validate an uploaded PDF, extract its text and summarize it.

    Args:
        uploaded_file: Object exposing ``getvalue()`` that returns the PDF bytes.
        model_name: Model identifier passed to ``get_summarizer``.
        max_length: Forwarded to ``summarize_book`` as ``max_length``.
        min_length: Forwarded to ``summarize_book`` as ``min_length``.
        chunk_size: Forwarded to ``summarize_book`` as ``chunk_size``.
        overlap: Forwarded to ``summarize_book`` as ``overlap``.

    Returns:
        A dict with the keys ``"metadata"``, ``"validation"``,
        ``"extraction"`` and ``"summary"`` holding the intermediate results.

    Raises:
        ValueError: If the processor reports the PDF as invalid.
        RuntimeError: If text extraction or summarization reports failure.
    """
    raw_pdf = uploaded_file.getvalue()
    pdf_tools = get_pdf_processor()

    # Stage 1: reject files the processor considers invalid.
    check = pdf_tools.validate_pdf(raw_pdf)
    if not check["valid"]:
        raise ValueError(check["message"])

    # Stage 2: metadata first, then the text itself.
    doc_metadata = pdf_tools.get_pdf_metadata(raw_pdf)
    extracted = pdf_tools.extract_text_from_pdf(raw_pdf)
    if not extracted["success"]:
        raise RuntimeError(extracted["message"])

    # Stage 3: summarize with the summarizer for the chosen model.
    engine = get_summarizer(model_name)
    summarize_options = {
        "text": extracted["text"],
        "chunk_size": chunk_size,
        "overlap": overlap,
        "max_length": max_length,
        "min_length": min_length,
    }
    outcome = engine.summarize_book(**summarize_options)

    if outcome["success"]:
        return {
            "metadata": doc_metadata,
            "validation": check,
            "extraction": extracted,
            "summary": outcome,
        }
    raise RuntimeError(outcome.get("error", "Summarization failed"))


def sidebar_controls():
    """Render the settings widgets and return the chosen values.

    Returns:
        A dict with the keys ``"model"``, ``"max_length"``, ``"min_length"``,
        ``"chunk_size"`` and ``"overlap"``.
    """
    st.header("Settings")

    names = [entry["name"] for entry in AVAILABLE_MODELS]
    descriptions = {}
    for entry in AVAILABLE_MODELS:
        descriptions[entry["name"]] = entry["description"]

    # Preselect the configured default; fall back to the first entry.
    try:
        default_index = names.index(DEFAULT_MODEL)
    except ValueError:
        default_index = 0

    chosen_model = st.selectbox(
        "Model",
        names,
        index=default_index,
        help="Free, locally run Hugging Face models. First run downloads weights.",
    )
    st.caption(descriptions.get(chosen_model, ""))

    longest = st.slider(
        "Maximum summary length (words)",
        min_value=50,
        max_value=250,
        value=140,
        step=10,
    )

    # The minimum-length slider is bounded by the maximum chosen above so the
    # two settings cannot cross.
    shortest_upper_bound = min(120, longest - 10)
    shortest_default = min(50, longest - 20)
    shortest = st.slider(
        "Minimum summary length (words)",
        min_value=20,
        max_value=shortest_upper_bound,
        value=shortest_default,
        step=5,
    )

    chars_per_chunk = st.slider(
        "Chunk size (characters)",
        min_value=600,
        max_value=2000,
        value=1200,
        step=50,
        help="Longer chunks preserve context but take longer.",
    )
    chars_overlap = st.slider(
        "Chunk overlap (characters)",
        min_value=50,
        max_value=300,
        value=120,
        step=10,
    )

    return dict(
        model=chosen_model,
        max_length=longest,
        min_length=shortest,
        chunk_size=chars_per_chunk,
        overlap=chars_overlap,
    )


def show_file_info(uploaded_file):
    """Show an info box with the uploaded file's name and its size in MB.

    Args:
        uploaded_file: Object exposing ``name`` and ``getvalue()``.
    """
    bytes_per_mb = 1024 * 1024
    megabytes = len(uploaded_file.getvalue()) / bytes_per_mb
    st.info(f"Selected: **{uploaded_file.name}** ({megabytes:.1f} MB)")


def show_results(result: Dict[str, Any]):
    """Render the summary, headline metrics, a download button and a preview.

    Args:
        result: Mapping with the keys ``"metadata"``, ``"validation"``,
            ``"extraction"`` and ``"summary"``, as returned by
            ``summarize_pdf``.
    """
    summary_text = result["summary"]["summary"]
    stats = result["summary"]["statistics"]
    original_stats = result["extraction"]["statistics"]

    st.success("Summary ready!")

    col1, col2, col3, col4 = st.columns(4)
    col1.metric("Pages", result["validation"]["pages"])
    col2.metric("Original words", f"{original_stats.get('total_words', 0):,}")
    col3.metric("Summary words", f"{stats.get('final_summary_length', 0):,}")
    compression = stats.get("overall_compression_ratio", 0)
    # A missing or zero ratio is shown as "N/A" rather than "0.0%".
    col4.metric("Compression", f"{compression:.1%}" if compression else "N/A")

    st.subheader("Summary")
    st.text_area("Generated summary", value=summary_text, height=400, label_visibility="collapsed")

    # ``dict.get`` with a default only covers a *missing* key. A title that is
    # present but None would crash on ``.replace``, and a blank one would give
    # a bare ".txt" name, so fall back to "summary" in both cases.
    raw_title = result["metadata"].get("title")
    title = "" if raw_title is None else str(raw_title)
    file_stem = title.replace(" ", "_") if title.strip() else "summary"

    st.download_button(
        label="Download summary",
        data=summary_text.encode("utf-8"),
        file_name=f"{file_stem}.txt",
        mime="text/plain",
    )

    st.subheader("Book snapshot")
    full_text = result["extraction"]["text"]
    preview = full_text[:1500]
    if len(full_text) > 1500:
        # Signal that the preview was truncated.
        preview += " ..."
    st.text_area("First 1500 characters", value=preview, height=220, label_visibility="collapsed")


def main():
    """Render the page: intro text, sidebar settings, uploader and results.

    The latest successful result is kept in ``st.session_state`` together with
    the identity of the file it was produced from. ``st.button`` is only true
    on the script run triggered by the click, so without this the summary
    would vanish on the next rerun (for example after pressing the download
    button or moving a sidebar slider).
    """
    st.title("📚 AI-Powered Book Summarizer")
    st.write(
        "Upload a PDF (under 50MB) to generate a concise summary locally with free, open models. "
        "No paid API keys required—first run will download model weights."
    )

    st.divider()

    with st.sidebar:
        controls = sidebar_controls()

    uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
    state_key = "summary_result"

    if not uploaded_file:
        # Nothing uploaded (or the upload was removed): drop any stale result.
        if state_key in st.session_state:
            del st.session_state[state_key]
        st.info("Upload a small/medium PDF to get started. Scans or image-only PDFs will not work well.")
        return

    show_file_info(uploaded_file)
    # Identify the upload so a result is never shown for a different file.
    file_key = (uploaded_file.name, uploaded_file.size)

    if st.button("Generate summary", type="primary"):
        with st.spinner("Extracting text and generating summary..."):
            try:
                result = summarize_pdf(
                    uploaded_file=uploaded_file,
                    model_name=controls["model"],
                    max_length=controls["max_length"],
                    min_length=controls["min_length"],
                    chunk_size=controls["chunk_size"],
                    overlap=controls["overlap"],
                )
            except Exception as exc:
                # Top-level UI boundary: report the failure instead of crashing
                # the app, and make sure no outdated summary stays on screen.
                if state_key in st.session_state:
                    del st.session_state[state_key]
                st.error(f"Could not summarize this PDF: {exc}")
            else:
                st.session_state[state_key] = (file_key, result)

    stored = st.session_state.get(state_key)
    if stored is not None and stored[0] == file_key:
        show_results(stored[1])


# Run the app when this file is executed as a script rather than imported.
if __name__ == "__main__":
    main()