import gradio as gr
import PyPDF2
import io
from transformers import pipeline, AutoTokenizer
import torch
import re
from typing import List, Tuple
import warnings

warnings.filterwarnings("ignore")


class PDFSummarizer:
    def __init__(self):
        # Use a much faster, lighter model for summarization
        self.model_name = "sshleifer/distilbart-cnn-12-6"  # Much faster than BART-large
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")

        try:
            # Initialize the summarization pipeline with optimizations
            self.summarizer = pipeline(
                "summarization",
                model=self.model_name,
                device=0 if self.device == "cuda" else -1,
                framework="pt",
                model_kwargs={"torch_dtype": torch.float16 if self.device == "cuda" else torch.float32}
            )
            # Initialize tokenizer for length calculations
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            print("Model loaded successfully")
        except Exception as e:
            print(f"Error loading model: {e}")
            # Fall back to the standard BART-large-CNN model on CPU
            self.model_name = "facebook/bart-large-cnn"
            self.summarizer = pipeline("summarization", model=self.model_name, device=-1)
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            print("Fallback model loaded")

    def extract_text_from_pdf(self, pdf_file) -> str:
        """Extract text content from a PDF file"""
        try:
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
            text = ""
            for page_num, page in enumerate(pdf_reader.pages):
                page_text = page.extract_text()
                if page_text.strip():
                    text += f"\n--- Page {page_num + 1} ---\n"
                    text += page_text
            return text.strip()
        except Exception as e:
            raise Exception(f"Error extracting text from PDF: {str(e)}")

    def clean_text(self, text: str) -> str:
        """Clean and preprocess text"""
        # Collapse extra whitespace and newlines
        text = re.sub(r'\s+', ' ', text)
        # Remove special characters but keep punctuation
        text = re.sub(r'[^\w\s.,!?;:()\-"]', ' ', text)
        # Remove page markers
        text = re.sub(r'--- Page \d+ ---', '', text)
        return text.strip()

    def chunk_text(self, text: str, max_chunk_length: int = 512) -> List[str]:
        """Split text into smaller, more manageable chunks for faster processing"""
        sentences = text.split('. ')
        chunks = []
        current_chunk = ""

        for sentence in sentences:
            # Check if adding this sentence would exceed the limit
            potential_chunk = current_chunk + sentence + ". "
            # Use word count as a fast length estimate (instead of tokenizing)
            if len(potential_chunk.split()) <= max_chunk_length:
                current_chunk = potential_chunk
            else:
                if current_chunk:
                    chunks.append(current_chunk.strip())
                current_chunk = sentence + ". "
        if current_chunk:
            chunks.append(current_chunk.strip())

        # Limit the number of chunks for speed
        return chunks[:5]  # Process at most 5 chunks

    def summarize_chunk(self, chunk: str, max_length: int = 100, min_length: int = 30) -> str:
        """Summarize a single chunk of text with speed optimizations"""
        try:
            # Speed optimizations
            summary = self.summarizer(
                chunk,
                max_length=max_length,
                min_length=min_length,
                do_sample=False,
                truncation=True,
                early_stopping=True,
                num_beams=2  # Reduced from the default 4 for speed
            )
            return summary[0]['summary_text']
        except Exception as e:
            return f"Error summarizing chunk: {str(e)}"

    def process_pdf(self, pdf_file, summary_type: str) -> Tuple[str, str, str]:
        """Main function to process a PDF and generate a summary"""
        try:
            # Extract text from the PDF
            raw_text = self.extract_text_from_pdf(pdf_file)
            if not raw_text.strip():
                return "❌ Error: No text could be extracted from the PDF.", "", ""

            # Clean the text
            cleaned_text = self.clean_text(raw_text)

            # Calculate text statistics
            word_count = len(cleaned_text.split())
            char_count = len(cleaned_text)
            if word_count < 50:
                return "❌ Error: PDF contains too little text to summarize.", "", ""

            # Chunk the text for processing
            chunks = self.chunk_text(cleaned_text)

            # Determine summary parameters based on type (optimized for speed)
            if summary_type == "Brief (Quick)":
                max_len, min_len = 60, 20
            elif summary_type == "Detailed":
                max_len, min_len = 100, 40
            else:  # Comprehensive
                max_len, min_len = 150, 60

            # Summarize each chunk (with progress tracking)
            chunk_summaries = []
            for i, chunk in enumerate(chunks):
                print(f"Processing chunk {i+1}/{len(chunks)}")
                summary = self.summarize_chunk(chunk, max_len, min_len)
                chunk_summaries.append(summary)

            # Combine the chunk summaries
            combined_summary = " ".join(chunk_summaries)

            # Skip the final summarization pass for speed if there are few chunks
            if len(chunks) <= 2:
                final_summary = combined_summary
            else:
                # Quick final summary over the combined chunk summaries
                final_summary = self.summarize_chunk(
                    combined_summary,
                    max_length=min(200, int(max_len * 1.5)),  # generation expects an integer length
                    min_length=min_len
                )

            # Create statistics
            summary_stats = f"""
📊 **Document Statistics:**
- Original word count: {word_count:,}
- Original character count: {char_count:,}
- Text chunks processed: {len(chunks)}
- Summary word count: {len(final_summary.split()):,}
- Compression ratio: {word_count / len(final_summary.split()):.1f}:1
"""
            return final_summary, summary_stats, "✅ Summary generated successfully!"
        except Exception as e:
            return f"❌ Error processing PDF: {str(e)}", "", ""


# Initialize the summarizer
pdf_summarizer = PDFSummarizer()
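
# Illustrative usage sketch (not part of the original Space): the summarizer can
# also be called directly without the Gradio UI, e.g. from a script or notebook.
# "example.pdf" below is a placeholder path.
#
#     with open("example.pdf", "rb") as f:
#         summary, stats, status = pdf_summarizer.process_pdf(f.read(), "Brief (Quick)")
#     print(summary)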


def summarize_pdf_interface(pdf_file, summary_type):
    """Gradio interface function"""
    if pdf_file is None:
        return "❌ Please upload a PDF file.", "", ""
    try:
        # Read the uploaded file - with type="filepath", pdf_file is a path string
        with open(pdf_file, 'rb') as f:
            pdf_content = f.read()
        # Process the PDF
        summary, stats, status = pdf_summarizer.process_pdf(pdf_content, summary_type)
        return summary, stats, status
    except Exception as e:
        return f"❌ Error: {str(e)}", "", ""


# Create Gradio interface
def create_interface():
    with gr.Blocks(
        title="📄 AI PDF Summarizer",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container {
            max-width: 1200px !important;
        }
        .summary-box {
            border-left: 4px solid #2196F3;
            padding: 16px;
            background-color: #f8f9fa;
        }
        """
    ) as interface:
        gr.Markdown("""
        # 📄 AI-Powered PDF Summarizer

        Upload any PDF document and get an intelligent summary in seconds!
        Perfect for research papers, reports, articles, and books.

        **Features:**
        - ⚡ Fast processing with a distilled BART model
        - 📊 Document statistics
        - 🎯 Multiple summary lengths
        - 📚 Smart text chunking
        """)

        with gr.Row():
            with gr.Column(scale=1):
                pdf_input = gr.File(
                    label="📄 Upload PDF File",
                    file_types=[".pdf"],
                    type="filepath"
                )
                summary_type = gr.Radio(
                    choices=["Brief (Quick)", "Detailed", "Comprehensive"],
                    value="Detailed",
                    label="📏 Summary Length",
                    info="Choose how detailed you want the summary to be"
                )
                summarize_btn = gr.Button(
                    "🚀 Generate Summary",
                    variant="primary",
                    size="lg"
                )
                status_output = gr.Textbox(
                    label="📋 Status",
                    interactive=False,
                    max_lines=2
                )
            with gr.Column(scale=2):
                summary_output = gr.Textbox(
                    label="📝 Generated Summary",
                    lines=15,
                    max_lines=20,
                    interactive=False,
                    elem_classes=["summary-box"]
                )
                stats_output = gr.Markdown(
                    label="📊 Document Statistics",
                    value="Upload a PDF to see statistics"
                )

        # Tips and technical details
        gr.Markdown("""
        ## 💡 Tips for Best Results:
        - **File Quality**: Ensure your PDF has selectable text (not just images)
        - **Length**: Works best with documents between 500 and 10,000 words
        - **Language**: Optimized for English content
        - **Format**: Clean, well-formatted PDFs produce better summaries

        ## 🔧 Technical Details:
        - **Model**: DistilBART-CNN (distilled from Facebook's BART-Large-CNN)
        - **Processing**: Sentence-based text chunking
        - **Speed**: GPU-accelerated when available
        """)

        # Connect the button to the function
        summarize_btn.click(
            fn=summarize_pdf_interface,
            inputs=[pdf_input, summary_type],
            outputs=[summary_output, stats_output, status_output]
        )

        # Auto-process when a file is uploaded
        pdf_input.change(
            fn=summarize_pdf_interface,
            inputs=[pdf_input, summary_type],
            outputs=[summary_output, stats_output, status_output]
        )

    return interface


# Launch the application
if __name__ == "__main__":
    interface = create_interface()
    interface.launch()
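
# Assumed environment (not shown in this file): when deployed as a Hugging Face
# Space, the packages imported above (gradio, PyPDF2, transformers, torch) would
# typically be declared in a requirements.txt placed next to this app.py.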