import gradio as gr
import PyPDF2
import io
from transformers import pipeline, AutoTokenizer
import torch
import re
from typing import List, Tuple
import warnings

warnings.filterwarnings("ignore")


class PDFSummarizer:
    def __init__(self):
        # Use a much faster, lighter model for summarization
        self.model_name = "sshleifer/distilbart-cnn-12-6"  # Much faster than BART-large
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")

        try:
            # Initialize the summarization pipeline with optimizations
            self.summarizer = pipeline(
                "summarization",
                model=self.model_name,
                device=0 if self.device == "cuda" else -1,
                framework="pt",
                model_kwargs={"torch_dtype": torch.float16 if self.device == "cuda" else torch.float32}
            )
            # Initialize tokenizer for length calculations
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            print("Model loaded successfully")
        except Exception as e:
            print(f"Error loading model: {e}")
            # Fall back to the standard BART-large-CNN model on CPU
            self.model_name = "facebook/bart-large-cnn"
            self.summarizer = pipeline("summarization", model=self.model_name, device=-1)
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            print("Fallback model loaded")

    def extract_text_from_pdf(self, pdf_file) -> str:
        """Extract text content from a PDF file"""
        try:
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
            text = ""
            for page_num, page in enumerate(pdf_reader.pages):
                page_text = page.extract_text()
                if page_text.strip():
                    text += f"\n--- Page {page_num + 1} ---\n"
                    text += page_text
            return text.strip()
        except Exception as e:
            raise Exception(f"Error extracting text from PDF: {str(e)}")

    def clean_text(self, text: str) -> str:
        """Clean and preprocess text"""
        # Collapse extra whitespace and newlines
        text = re.sub(r'\s+', ' ', text)
        # Remove special characters but keep punctuation
        text = re.sub(r'[^\w\s.,!?;:()\-"]', ' ', text)
        # Remove page markers
        text = re.sub(r'--- Page \d+ ---', '', text)
        return text.strip()

    def chunk_text(self, text: str, max_chunk_length: int = 512) -> List[str]:
        """Split text into smaller, more manageable chunks for faster processing"""
        sentences = text.split('. ')
        chunks = []
        current_chunk = ""

        for sentence in sentences:
            # Check if adding this sentence would exceed the limit
            potential_chunk = current_chunk + sentence + ". "
            # Use word count as a fast length estimate (instead of tokenizing)
            if len(potential_chunk.split()) <= max_chunk_length:
                current_chunk = potential_chunk
            else:
                if current_chunk:
                    chunks.append(current_chunk.strip())
                current_chunk = sentence + ". "
        if current_chunk:
            chunks.append(current_chunk.strip())

        # Limit the number of chunks for speed
        return chunks[:5]  # Process at most 5 chunks

    def summarize_chunk(self, chunk: str, max_length: int = 100, min_length: int = 30) -> str:
        """Summarize a single chunk of text with speed optimizations"""
        try:
            # Speed optimizations
            summary = self.summarizer(
                chunk,
                max_length=max_length,
                min_length=min_length,
                do_sample=False,
                truncation=True,
                early_stopping=True,
                num_beams=2  # Reduced from the default 4 for speed
            )
            return summary[0]['summary_text']
        except Exception as e:
            return f"Error summarizing chunk: {str(e)}"

    def process_pdf(self, pdf_file, summary_type: str) -> Tuple[str, str, str]:
        """Main function to process a PDF and generate a summary"""
        try:
            # Extract text from the PDF
            raw_text = self.extract_text_from_pdf(pdf_file)
            if not raw_text.strip():
                return "❌ Error: No text could be extracted from the PDF.", "", ""

            # Clean the text
            cleaned_text = self.clean_text(raw_text)

            # Calculate text statistics
            word_count = len(cleaned_text.split())
            char_count = len(cleaned_text)
            if word_count < 50:
                return "❌ Error: PDF contains too little text to summarize.", "", ""

            # Chunk the text for processing
            chunks = self.chunk_text(cleaned_text)

            # Determine summary parameters based on type (optimized for speed)
            if summary_type == "Brief (Quick)":
                max_len, min_len = 60, 20
            elif summary_type == "Detailed":
                max_len, min_len = 100, 40
            else:  # Comprehensive
                max_len, min_len = 150, 60

            # Summarize each chunk (with progress tracking)
            chunk_summaries = []
            for i, chunk in enumerate(chunks):
                print(f"Processing chunk {i+1}/{len(chunks)}")
                summary = self.summarize_chunk(chunk, max_len, min_len)
                chunk_summaries.append(summary)

            # Combine the chunk summaries
            combined_summary = " ".join(chunk_summaries)

            # Skip the final summarization pass for speed if there are few chunks
            if len(chunks) <= 2:
                final_summary = combined_summary
            else:
                # Quick final summary over the combined chunk summaries
                final_summary = self.summarize_chunk(
                    combined_summary,
                    max_length=min(200, int(max_len * 1.5)),  # generation expects an integer length
                    min_length=min_len
                )

            # Create statistics
            summary_stats = f"""
📊 **Document Statistics:**
- Original word count: {word_count:,}
- Original character count: {char_count:,}
- Text chunks processed: {len(chunks)}
- Summary word count: {len(final_summary.split()):,}
- Compression ratio: {word_count / len(final_summary.split()):.1f}:1
"""
            return final_summary, summary_stats, "✅ Summary generated successfully!"
        except Exception as e:
            return f"❌ Error processing PDF: {str(e)}", "", ""


# Initialize the summarizer
pdf_summarizer = PDFSummarizer()
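
# Illustrative usage sketch (not part of the original Space): the summarizer can
# also be called directly without the Gradio UI, e.g. from a script or notebook.
# "example.pdf" below is a placeholder path.
#
#     with open("example.pdf", "rb") as f:
#         summary, stats, status = pdf_summarizer.process_pdf(f.read(), "Brief (Quick)")
#     print(summary)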


def summarize_pdf_interface(pdf_file, summary_type):
    """Gradio interface function"""
    if pdf_file is None:
        return "❌ Please upload a PDF file.", "", ""
    try:
        # Read the uploaded file - with type="filepath", pdf_file is a path string
        with open(pdf_file, 'rb') as f:
            pdf_content = f.read()
        # Process the PDF
        summary, stats, status = pdf_summarizer.process_pdf(pdf_content, summary_type)
        return summary, stats, status
    except Exception as e:
        return f"❌ Error: {str(e)}", "", ""


# Create Gradio interface
def create_interface():
    with gr.Blocks(
        title="📄 AI PDF Summarizer",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container {
            max-width: 1200px !important;
        }
        .summary-box {
            border-left: 4px solid #2196F3;
            padding: 16px;
            background-color: #f8f9fa;
        }
        """
    ) as interface:
        gr.Markdown("""
        # 📄 AI-Powered PDF Summarizer

        Upload any PDF document and get an intelligent summary in seconds!
        Perfect for research papers, reports, articles, and books.

        **Features:**
        - ⚡ Fast processing with a distilled BART model
        - 📊 Document statistics
        - 🎯 Multiple summary lengths
        - 📚 Smart text chunking
        """)

        with gr.Row():
            with gr.Column(scale=1):
                pdf_input = gr.File(
                    label="📄 Upload PDF File",
                    file_types=[".pdf"],
                    type="filepath"
                )
                summary_type = gr.Radio(
                    choices=["Brief (Quick)", "Detailed", "Comprehensive"],
                    value="Detailed",
                    label="📏 Summary Length",
                    info="Choose how detailed you want the summary to be"
                )
                summarize_btn = gr.Button(
                    "🚀 Generate Summary",
                    variant="primary",
                    size="lg"
                )
                status_output = gr.Textbox(
                    label="📋 Status",
                    interactive=False,
                    max_lines=2
                )
            with gr.Column(scale=2):
                summary_output = gr.Textbox(
                    label="📝 Generated Summary",
                    lines=15,
                    max_lines=20,
                    interactive=False,
                    elem_classes=["summary-box"]
                )
                stats_output = gr.Markdown(
                    label="📊 Document Statistics",
                    value="Upload a PDF to see statistics"
                )

        # Tips and technical details
        gr.Markdown("""
        ## 💡 Tips for Best Results:
        - **File Quality**: Ensure your PDF has selectable text (not just images)
        - **Length**: Works best with documents between 500 and 10,000 words
        - **Language**: Optimized for English content
        - **Format**: Clean, well-formatted PDFs produce better summaries

        ## 🔧 Technical Details:
        - **Model**: DistilBART-CNN (distilled from Facebook's BART-Large-CNN)
        - **Processing**: Sentence-based text chunking
        - **Speed**: GPU-accelerated when available
        """)

        # Connect the button to the function
        summarize_btn.click(
            fn=summarize_pdf_interface,
            inputs=[pdf_input, summary_type],
            outputs=[summary_output, stats_output, status_output]
        )

        # Auto-process when a file is uploaded
        pdf_input.change(
            fn=summarize_pdf_interface,
            inputs=[pdf_input, summary_type],
            outputs=[summary_output, stats_output, status_output]
        )

    return interface


# Launch the application
if __name__ == "__main__":
    interface = create_interface()
    interface.launch()
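
# Assumed environment (not shown in this file): when deployed as a Hugging Face
# Space, the packages imported above (gradio, PyPDF2, transformers, torch) would
# typically be declared in a requirements.txt placed next to this app.py.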