"""Gradio app: extract text from a PDF (with an OCR fallback) and build a quiz.

The uploaded PDF is read with PyPDF2; if that yields no text, the pages are
rasterised and read with EasyOCR. Questions and answers are produced by two
T5 pipelines, and the result is returned as JSON wrapped in an XML CDATA
section.
"""

import json
import os
import random
import shutil
import tempfile

import easyocr
import gradio as gr
import numpy as np
from pdf2image import convert_from_path
from PyPDF2 import PdfReader
from transformers import pipeline

# -----------------------------
# Initialize OCR and Transformers
# -----------------------------
reader = easyocr.Reader(['en'])

# Question generation model
qg_pipeline = pipeline(
    "text2text-generation",
    model="valhalla/t5-small-qg-prepend",
    tokenizer="t5-small",
)

# Question-answer generation model
qa_pipeline = pipeline(
    "text2text-generation",
    model="valhalla/t5-small-qa-qg-hl",
    tokenizer="t5-small",
)

# Only the first MAX_CONTEXT_CHARS characters of the document are fed to the
# models.
MAX_CONTEXT_CHARS = 1000

# Generic distractors used to pad every question to four options.
DUMMY_OPTIONS = (
    "None of the above",
    "All of the above",
    "Not mentioned",
    "Cannot be determined",
    "Irrelevant information",
)

# NOTE(review): the original wrapper markup was missing from the source; this
# element name is a placeholder -- confirm against the consumer of the XML.
XML_ROOT_TAG = "certification"


# -----------------------------
# Extract text from selectable PDFs
# -----------------------------
def extract_text_from_pdf(file_path):
    """Return the embedded text of every page in *file_path*.

    Pages without extractable text are skipped. The result is stripped, so an
    image-only PDF yields an empty string.
    """
    page_texts = []
    for page in PdfReader(file_path).pages:
        # Page objects lacking ``extract_text`` are treated as having no text
        # instead of raising.
        extract = getattr(page, "extract_text", None)
        page_text = extract() if callable(extract) else None
        if page_text:
            page_texts.append(page_text)
    return "\n".join(page_texts).strip()


# -----------------------------
# Extract text from scanned PDFs using EasyOCR
# -----------------------------
def extract_text_from_scanned_pdf(file_path):
    """Rasterise *file_path* at 150 dpi and OCR each page with EasyOCR.

    OCR is best-effort: a page that fails is reported on stdout and skipped so
    that the remaining pages are still processed.
    """
    page_texts = []
    for page in convert_from_path(file_path, dpi=150):
        try:
            fragments = reader.readtext(np.array(page), detail=0)
        except Exception as e:  # deliberate best-effort: keep the other pages
            print("OCR error on page:", e)
            continue
        page_texts.append(" ".join(fragments))
    return "\n".join(page_texts).strip()


# -----------------------------
# Generate dummy options
# -----------------------------
def generate_options(correct_answer):
    """Return four shuffled options: *correct_answer* plus three distractors.

    Distractors are drawn without repetition from ``DUMMY_OPTIONS`` and never
    duplicate the correct answer.
    """
    distractors = [opt for opt in DUMMY_OPTIONS if opt != correct_answer]
    random.shuffle(distractors)
    options = [correct_answer] + distractors[:3]
    random.shuffle(options)
    return options


def _answer_question(question, context):
    """Ask the QA model to answer *question* from *context*; "N/A" if empty."""
    # Prompt follows the "question: ...  context: ..." layout that the
    # valhalla QA/QG model card describes for its QA task.
    outputs = qa_pipeline(
        f"question: {question}  context: {context}",
        max_length=64,
        num_beams=3,
    )
    answer = outputs[0]["generated_text"].strip() if outputs else ""
    return answer or "N/A"


def _wrap_in_cdata(payload):
    """Wrap *payload* in a CDATA section inside the root element."""
    # A literal "]]>" would terminate the section early, so split it across
    # two adjacent CDATA sections.
    safe_payload = payload.replace("]]>", "]]]]><![CDATA[>")
    return f"<{XML_ROOT_TAG}><![CDATA[{safe_payload}]]></{XML_ROOT_TAG}>"


# -----------------------------
# Main processing function
# -----------------------------
def process_pdf(pdf_file):
    """Turn an uploaded PDF into an XML-wrapped JSON quiz definition.

    Args:
        pdf_file: The upload handed over by Gradio -- either an object with a
            ``name`` attribute holding the file path, or the path itself.
            ``None`` means nothing was uploaded.

    Returns:
        The quiz as a JSON document inside an XML CDATA section, or a
        human-readable error message when no file was given or no text could
        be extracted.
    """
    if pdf_file is None:
        return "❌ Please upload a PDF file."

    if isinstance(pdf_file, (str, os.PathLike)):
        source_path = pdf_file
    else:
        source_path = pdf_file.name

    # Work on a private copy; the ``finally`` guarantees it is removed even if
    # extraction raises.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
        temp_pdf_path = temp_pdf.name
    try:
        shutil.copy(source_path, temp_pdf_path)

        # Step 1: Try extracting text from PDF directly
        extracted_text = extract_text_from_pdf(temp_pdf_path)

        # Step 2: If empty, use OCR
        if not extracted_text:
            extracted_text = extract_text_from_scanned_pdf(temp_pdf_path)
    finally:
        os.remove(temp_pdf_path)

    if not extracted_text:
        return "❌ Could not extract text. Make sure the PDF has readable content."

    context = extracted_text[:MAX_CONTEXT_CHARS]

    # Step 3: Generate questions
    questions_output = qg_pipeline(
        "generate questions: " + context,
        max_length=128,
        num_beams=3,
        num_return_sequences=3,
    )

    # Steps 4-5: Answer each generated question against the same context and
    # build the question list. Answering per question keeps each answer tied
    # to its own question.
    question_list = []
    for generated in questions_output:
        question = generated["generated_text"]
        correct_answer = _answer_question(question, context)
        question_list.append({
            "questiontext": question,
            "questiontype": "single_select",
            "marks": 10,
            "options": [
                {"optiontext": opt, "score": "10" if opt == correct_answer else "0"}
                for opt in generate_options(correct_answer)
            ],
        })

    # Step 6: Build structure
    # NOTE(review): totalmarks/cutoff are fixed values that do not match
    # 3 questions x 10 marks -- confirm whether they should be derived.
    data = {
        "title": "Certification Title",
        "totalmarks": "50",
        "time": "20",
        "cutoff": "35",
        "failurl": "",
        "passurl": "",
        "sendpassemail": True,
        "questions": json.dumps({"questions": question_list}),
        "maxattempts": 3,
    }

    # Step 7: Wrap JSON in XML CDATA
    return _wrap_in_cdata(json.dumps(data))


# -----------------------------
# Gradio Interface
# -----------------------------
iface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="📄 Upload your PDF"),
    outputs="text",
    title="PDF → Question & Answer Generator (with OCR)",
    description="Uploads a PDF, extracts text (or OCR for scanned PDFs), and generates XML with questions + answers.",
)

if __name__ == "__main__":
    iface.launch()