Spaces:

prat1003
/

project2

Sleeping

App Files Files Community

project2 / app.py

prat1003

Update app.py

ca5f6c8 verified 4 months ago

raw

history blame contribute delete

4.61 kB

	import gradio as gr
	import tempfile
	import shutil
	import os
	import json
	import numpy as np
	from pdf2image import convert_from_path
	import easyocr
	from PyPDF2 import PdfReader
	from transformers import pipeline
	import random

	# -----------------------------
	# Initialize OCR and Transformers
	# -----------------------------
	# OCR engine restricted to English; created once at import and reused by
	# extract_text_from_scanned_pdf.
	reader = easyocr.Reader(["en"])


	def _load_text2text(model_name):
	    """Return a text2text-generation pipeline for ``model_name`` with the t5-small tokenizer."""
	    return pipeline(
	        task="text2text-generation",
	        model=model_name,
	        tokenizer="t5-small",
	    )


	# Question generation model
	qg_pipeline = _load_text2text("valhalla/t5-small-qg-prepend")

	# Question-answer generation model
	qa_pipeline = _load_text2text("valhalla/t5-small-qa-qg-hl")

	# -----------------------------
	# Extract text from selectable PDFs
	# -----------------------------
	def extract_text_from_pdf(file_path):
	    """Return the embedded (selectable) text of the PDF at ``file_path``.

	    Every page whose ``extract_text()`` result is non-empty contributes one
	    line-separated chunk; pages that yield nothing (or that expose no
	    ``extract_text`` attribute) are skipped. The combined text is stripped
	    of leading and trailing whitespace, so the result is ``""`` when no
	    page produced any text.
	    """
	    document = PdfReader(file_path)
	    per_page = (
	        getattr(pdf_page, "extract_text", lambda: None)()
	        for pdf_page in document.pages
	    )
	    return "\n".join(chunk for chunk in per_page if chunk).strip()

	# -----------------------------
	# Extract text from scanned PDFs using EasyOCR
	# -----------------------------
	def extract_text_from_scanned_pdf(file_path):
	    """Return OCR text for the PDF at ``file_path``.

	    The PDF is rasterised at 150 dpi, each page image is passed to the
	    module-level EasyOCR ``reader``, and the recognised fragments of a page
	    are joined with single spaces. Pages are separated by newlines and the
	    final text is stripped. A page that raises during OCR is reported on
	    stdout and contributes nothing to the result.
	    """
	    page_images = convert_from_path(file_path, dpi=150)
	    page_texts = []
	    for image in page_images:
	        try:
	            # detail=0 makes readtext return only the recognised strings.
	            fragments = reader.readtext(np.array(image), detail=0)
	            page_texts.append(" ".join(fragments))
	        except Exception as e:
	            # Best effort: one unreadable page must not abort the whole document.
	            print("OCR error on page:", e)
	    return "\n".join(page_texts).strip()

	# -----------------------------
	# Generate dummy options
	# -----------------------------
	def generate_options(correct_answer):
	    """Return four shuffled answer options that include ``correct_answer``.

	    The three remaining slots are filled with distinct filler phrases drawn
	    at random from a fixed pool; a filler equal to an option already chosen
	    (including ``correct_answer`` itself) is rejected and redrawn. The list
	    is shuffled in place before being returned.
	    """
	    filler_pool = [
	        "None of the above",
	        "All of the above",
	        "Not mentioned",
	        "Cannot be determined",
	        "Irrelevant information",
	    ]
	    choices = [correct_answer]
	    while len(choices) != 4:
	        candidate = random.choice(filler_pool)
	        if candidate in choices:
	            # Already used (or identical to the correct answer): draw again.
	            continue
	        choices.append(candidate)
	    random.shuffle(choices)
	    return choices

	# -----------------------------
	# Main processing function
	# -----------------------------
	def _extract_pdf_text(source_path):
	    """Return the text of the PDF at ``source_path`` via a scratch copy, trying embedded text first and OCR second."""
	    # Reserve a scratch path and close the handle before copying, so the
	    # path can be reopened by shutil and the PDF libraries on every platform.
	    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
	        temp_pdf_path = temp_pdf.name
	    try:
	        shutil.copy(source_path, temp_pdf_path)

	        # Step 1: Try extracting text from PDF directly
	        extracted_text = extract_text_from_pdf(temp_pdf_path)

	        # Step 2: If empty, use OCR
	        if not extracted_text.strip():
	            extracted_text = extract_text_from_scanned_pdf(temp_pdf_path)
	        return extracted_text
	    finally:
	        # Always delete the scratch copy, even when extraction raises.
	        os.remove(temp_pdf_path)


	def _wrap_in_cdata(payload):
	    """Return ``payload`` wrapped in a <questiondata> CDATA section, splitting any embedded "]]>"."""
	    # "]]>" would close the CDATA section early; the standard workaround is
	    # to end the section after "]]" and reopen a new one before ">".
	    safe_payload = payload.replace("]]>", "]]]]><![CDATA[>")
	    return "<questiondata><![CDATA[" + safe_payload + "]]></questiondata>"


	def process_pdf(pdf_file):
	    """Generate a ``<questiondata>`` XML quiz from an uploaded PDF.

	    Args:
	        pdf_file: The uploaded file. Either an object exposing the file path
	            as ``.name``, a plain path (``str`` / ``os.PathLike``), or
	            ``None`` when nothing was uploaded.

	    Returns:
	        A string: the JSON quiz definition wrapped in
	        ``<questiondata><![CDATA[...]]></questiondata>``, or a message
	        starting with "❌" when no file was given or no text could be
	        extracted.

	    Only the first 1000 characters of the extracted text are fed to the
	    question and answer pipelines. Exceptions raised by the PDF/OCR helpers
	    or the pipelines propagate to the caller; the temporary copy of the
	    upload is removed in every case.
	    """
	    if pdf_file is None:
	        return "❌ No file uploaded. Please upload a PDF."

	    # Accept both a bare path and an object that carries the path in .name.
	    if isinstance(pdf_file, (str, os.PathLike)):
	        source_path = pdf_file
	    else:
	        source_path = pdf_file.name

	    extracted_text = _extract_pdf_text(source_path)
	    if not extracted_text.strip():
	        return "❌ Could not extract text. Make sure the PDF has readable content."

	    context = extracted_text[:1000]

	    # Step 3: Generate questions
	    questions_output = qg_pipeline(
	        "generate questions: " + context,
	        max_length=128,
	        num_beams=3,
	        num_return_sequences=3,
	    )

	    # Step 4: Generate answers
	    # NOTE(review): the answers are generated from the context alone and are
	    # paired with the questions purely by index, so answer i is not
	    # conditioned on question i. Confirm the prompt format these models
	    # expect before relying on the pairing.
	    answers_output = qa_pipeline(
	        "answer questions: " + context,
	        max_length=64,
	        num_beams=3,
	        num_return_sequences=3,
	    )

	    # Step 5: Build question list
	    question_list = []
	    for i, q in enumerate(questions_output):
	        if i < len(answers_output):
	            correct_answer = answers_output[i]["generated_text"]
	        else:
	            correct_answer = "N/A"

	        options = generate_options(correct_answer)
	        question_list.append({
	            "questiontext": q["generated_text"],
	            "questiontype": "single_select",
	            "marks": 10,
	            "options": [
	                {"optiontext": opt, "score": "10" if opt == correct_answer else "0"}
	                for opt in options
	            ],
	        })

	    # Step 6: Build <questiondata> structure
	    # NOTE(review): at most 3 questions worth 10 marks each are produced
	    # (30 marks), yet totalmarks is "50" and cutoff is "35". Confirm these
	    # values with whatever consumes this document.
	    data = {
	        "title": "Certification Title",
	        "totalmarks": "50",
	        "time": "20",
	        "cutoff": "35",
	        "failurl": "",
	        "passurl": "",
	        "sendpassemail": True,
	        # The question list is embedded as a JSON string inside the outer JSON.
	        "questions": json.dumps({"questions": question_list}),
	        "maxattempts": 3,
	    }

	    # Step 7: Wrap JSON in XML CDATA
	    return _wrap_in_cdata(json.dumps(data, indent=2))

	# -----------------------------
	# Gradio Interface
	# -----------------------------
	# Input widget: a single file upload whose value is handed to process_pdf.
	_pdf_upload = gr.File(label="📄 Upload your PDF")

	# The string returned by process_pdf is shown in a plain text output.
	iface = gr.Interface(
	    title="PDF → Question & Answer Generator (with OCR)",
	    description="Uploads a PDF, extracts text (or OCR for scanned PDFs), and generates XML with questions + answers.",
	    fn=process_pdf,
	    inputs=_pdf_upload,
	    outputs="text",
	)

	# Start the web UI as soon as the module is executed.
	iface.launch()