File size: 5,972 Bytes
be5f84f
6880cd9
be5f84f
 
 
 
 
 
 
 
6880cd9
 
be5f84f
6880cd9
 
be5f84f
6880cd9
 
 
be5f84f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6880cd9
be5f84f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
267f1ae
be5f84f
 
 
267f1ae
be5f84f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6880cd9
be5f84f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6880cd9
be5f84f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6880cd9
 
be5f84f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
import os
import streamlit as st
from typing import Dict, Any

from api.pdf_processor import PDFProcessor
from api.summarizer import BookSummarizer

# Model pre-selected in the sidebar; can be overridden through the
# DEFAULT_MODEL environment variable.
DEFAULT_MODEL = os.getenv("DEFAULT_MODEL", "sshleifer/distilbart-cnn-12-6")
# Model catalogue offered in the sidebar. sidebar_controls() reads the
# "name" and "description" keys of every entry.
# NOTE(review): this builds a BookSummarizer at import time; presumably cheap
# because load_model() is a separate call — confirm against api.summarizer.
AVAILABLE_MODELS = BookSummarizer(DEFAULT_MODEL).get_available_models()


# Page-level Streamlit settings. Kept at module level so they are applied
# before any of the UI calls made from main().
st.set_page_config(
    page_title="Book Summarizer",
    page_icon="📚",
    layout="wide",
    initial_sidebar_state="expanded",
)


@st.cache_resource
def get_pdf_processor() -> PDFProcessor:
    """Create the PDF processor used by the app.

    Decorated with ``st.cache_resource`` so Streamlit can hand back the
    already-built instance instead of constructing a new one each time.
    """
    processor = PDFProcessor()
    return processor


@st.cache_resource
def get_summarizer(model_name: str) -> BookSummarizer:
    """Create a summarizer for ``model_name`` and load its model.

    Decorated with ``st.cache_resource``, which caches the returned object
    per distinct ``model_name`` argument.

    Args:
        model_name: Identifier of the model passed to ``BookSummarizer``.

    Returns:
        The summarizer instance, after ``load_model()`` has been called on it.
    """
    ready_summarizer = BookSummarizer(model_name=model_name)
    ready_summarizer.load_model()
    return ready_summarizer


def summarize_pdf(
    uploaded_file,
    model_name: str,
    max_length: int,
    min_length: int,
    chunk_size: int,
    overlap: int,
) -> Dict[str, Any]:
    """Validate an uploaded PDF, extract its text and summarize it.

    Args:
        uploaded_file: Upload object; its ``getvalue()`` supplies the PDF bytes.
        model_name: Model identifier forwarded to ``get_summarizer``.
        max_length: Upper summary length forwarded to ``summarize_book``.
        min_length: Lower summary length forwarded to ``summarize_book``.
        chunk_size: Chunk size forwarded to ``summarize_book``.
        overlap: Chunk overlap forwarded to ``summarize_book``.

    Returns:
        A dict with the keys ``"metadata"``, ``"validation"``,
        ``"extraction"`` and ``"summary"``, each holding the corresponding
        intermediate result.

    Raises:
        ValueError: If validation reports the PDF as not valid.
        RuntimeError: If text extraction or summarization reports failure.
    """
    raw_pdf = uploaded_file.getvalue()
    pdf_tool = get_pdf_processor()

    # Step 1: reject files the processor does not consider a valid PDF.
    check = pdf_tool.validate_pdf(raw_pdf)
    if not check["valid"]:
        raise ValueError(check["message"])

    # Step 2: read metadata, then pull out the text.
    pdf_info = pdf_tool.get_pdf_metadata(raw_pdf)
    extracted = pdf_tool.extract_text_from_pdf(raw_pdf)
    if not extracted["success"]:
        raise RuntimeError(extracted["message"])

    # Step 3: run the (cached) summarizer over the extracted text.
    summarize_options = {
        "chunk_size": chunk_size,
        "overlap": overlap,
        "max_length": max_length,
        "min_length": min_length,
    }
    outcome = get_summarizer(model_name).summarize_book(
        text=extracted["text"],
        **summarize_options,
    )
    if not outcome["success"]:
        raise RuntimeError(outcome.get("error", "Summarization failed"))

    return dict(
        metadata=pdf_info,
        validation=check,
        extraction=extracted,
        summary=outcome,
    )


def sidebar_controls():
    """Render the settings widgets and return the chosen values.

    Returns:
        A dict with the keys ``"model"``, ``"max_length"``, ``"min_length"``,
        ``"chunk_size"`` and ``"overlap"``.
    """
    st.header("Settings")

    names = [entry["name"] for entry in AVAILABLE_MODELS]
    descriptions = dict(
        (entry["name"], entry["description"]) for entry in AVAILABLE_MODELS
    )

    # Pre-select the configured default when it is in the catalogue,
    # otherwise fall back to the first entry.
    try:
        default_position = names.index(DEFAULT_MODEL)
    except ValueError:
        default_position = 0

    chosen_model = st.selectbox(
        "Model",
        names,
        index=default_position,
        help="Free, locally run Hugging Face models. First run downloads weights.",
    )
    st.caption(descriptions.get(chosen_model, ""))

    longest = st.slider(
        "Maximum summary length (words)",
        min_value=50,
        max_value=250,
        value=140,
        step=10,
    )

    # The minimum-length slider is bounded by the chosen maximum so the two
    # values cannot cross.
    shortest_cap = min(120, longest - 10)
    shortest_default = min(50, longest - 20)
    shortest = st.slider(
        "Minimum summary length (words)",
        min_value=20,
        max_value=shortest_cap,
        value=shortest_default,
        step=5,
    )

    chunk_chars = st.slider(
        "Chunk size (characters)",
        min_value=600,
        max_value=2000,
        value=1200,
        step=50,
        help="Longer chunks preserve context but take longer.",
    )
    overlap_chars = st.slider(
        "Chunk overlap (characters)",
        min_value=50,
        max_value=300,
        value=120,
        step=10,
    )

    return dict(
        model=chosen_model,
        max_length=longest,
        min_length=shortest,
        chunk_size=chunk_chars,
        overlap=overlap_chars,
    )


def show_file_info(uploaded_file):
    """Show an info box with the uploaded file's name and size in MB."""
    bytes_per_mb = 1024 * 1024
    byte_count = len(uploaded_file.getvalue())
    size_mb = byte_count / bytes_per_mb
    st.info(f"Selected: **{uploaded_file.name}** ({size_mb:.1f} MB)")


def show_results(result: Dict[str, Any]):
    """Render the summary, headline metrics, a download button and a preview.

    Args:
        result: Dict shaped like the return value of ``summarize_pdf``. This
            function reads ``result["summary"]["summary"]``,
            ``result["summary"]["statistics"]``,
            ``result["extraction"]["statistics"]``,
            ``result["extraction"]["text"]``, ``result["validation"]["pages"]``
            and the optional ``"title"`` entry of ``result["metadata"]``.
    """
    summary_text = result["summary"]["summary"]
    stats = result["summary"]["statistics"]
    original_stats = result["extraction"]["statistics"]

    st.success("Summary ready!")

    col1, col2, col3, col4 = st.columns(4)
    col1.metric("Pages", result["validation"]["pages"])
    col2.metric("Original words", f"{original_stats.get('total_words', 0):,}")
    col3.metric("Summary words", f"{stats.get('final_summary_length', 0):,}")
    compression = stats.get("overall_compression_ratio", 0)
    # A missing or zero ratio is shown as "N/A" rather than "0.0%".
    col4.metric("Compression", f"{compression:.1%}" if compression else "N/A")

    st.subheader("Summary")
    st.text_area("Generated summary", value=summary_text, height=400, label_visibility="collapsed")

    # Build the download name from the PDF title. A title that is absent,
    # None, or blank falls back to "summary": dict.get's default only covers
    # a missing key, so a present-but-None title would otherwise raise on
    # .replace(), and an empty one would yield a bare ".txt".
    raw_title = result["metadata"].get("title")
    title = str(raw_title).strip() if raw_title else ""
    file_stem = title.replace(" ", "_") or "summary"

    st.download_button(
        label="Download summary",
        data=summary_text.encode("utf-8"),
        file_name=f"{file_stem}.txt",
        mime="text/plain",
    )

    st.subheader("Book snapshot")
    full_text = result["extraction"]["text"]
    preview = full_text[:1500]
    if len(full_text) > 1500:
        # Signal that the preview was truncated.
        preview += " ..."
    st.text_area("First 1500 characters", value=preview, height=220, label_visibility="collapsed")


def main():
    """Render the page: intro text, sidebar settings, uploader and results.

    The latest successful summary is stored in ``st.session_state`` together
    with the name and size of the file it was generated from. Streamlit
    reruns the script on every widget interaction and ``st.button`` is only
    true on the rerun right after its click, so rendering results solely
    inside the button branch would make them disappear as soon as the user
    clicks "Download summary" or moves a slider. Re-rendering from session
    state keeps the summary visible until a different file is uploaded or a
    new summary is requested.
    """
    st.title("📚 AI-Powered Book Summarizer")
    st.write(
        "Upload a PDF (under 50MB) to generate a concise summary locally with free, open models. "
        "No paid API keys required—first run will download model weights."
    )

    st.divider()

    with st.sidebar:
        controls = sidebar_controls()

    uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])

    if not uploaded_file:
        st.info("Upload a small/medium PDF to get started. Scans or image-only PDFs will not work well.")
        return

    show_file_info(uploaded_file)

    state_key = "summary_result"
    # Identifies which upload a stored summary belongs to, so a summary of a
    # previous file is never shown for the current one.
    file_key = (uploaded_file.name, uploaded_file.size)

    try:
        if st.button("Generate summary", type="primary"):
            # Drop any previous result first so a failed run does not leave
            # an outdated summary on screen.
            if state_key in st.session_state:
                del st.session_state[state_key]
            with st.spinner("Extracting text and generating summary..."):
                result = summarize_pdf(
                    uploaded_file=uploaded_file,
                    model_name=controls["model"],
                    max_length=controls["max_length"],
                    min_length=controls["min_length"],
                    chunk_size=controls["chunk_size"],
                    overlap=controls["overlap"],
                )
            st.session_state[state_key] = (file_key, result)

        if state_key in st.session_state:
            stored_key, stored_result = st.session_state[state_key]
            if stored_key == file_key:
                show_results(stored_result)
    except Exception as exc:
        # UI boundary: surface any failure to the user instead of a traceback.
        st.error(f"Could not summarize this PDF: {exc}")


# Entry point: build the page when this file is executed as a script.
if __name__ == "__main__":
    main()