Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import tempfile | |
| import shutil | |
| import os | |
| import json | |
| import numpy as np | |
| from pdf2image import convert_from_path | |
| import easyocr | |
| from PyPDF2 import PdfReader | |
| from transformers import pipeline | |
| import random | |
| # ----------------------------- | |
| # Initialize OCR and Transformers | |
| # ----------------------------- | |
# Module-level OCR engine, configured for English only; used by
# extract_text_from_scanned_pdf.
reader = easyocr.Reader(["en"])

# Text2text pipeline that process_pdf uses to generate questions.
qg_pipeline = pipeline(
    task="text2text-generation",
    tokenizer="t5-small",
    model="valhalla/t5-small-qg-prepend",
)

# Text2text pipeline that process_pdf uses to generate answers.
qa_pipeline = pipeline(
    task="text2text-generation",
    tokenizer="t5-small",
    model="valhalla/t5-small-qa-qg-hl",
)
| # ----------------------------- | |
| # Extract text from selectable PDFs | |
| # ----------------------------- | |
def extract_text_from_pdf(file_path):
    """Return the embedded (selectable) text of a PDF.

    Each page's text is extracted with PyPDF2; pages that yield no text are
    skipped. Page texts are separated by newlines and the combined result
    is stripped of leading/trailing whitespace, so a PDF with no text layer
    produces an empty string.
    """
    document = PdfReader(file_path)
    page_texts = []
    for pdf_page in document.pages:
        # Fall back to a no-op extractor if the page object has no
        # extract_text method.
        extract = getattr(pdf_page, "extract_text", lambda: None)
        content = extract()
        if content:
            page_texts.append(content)
    return "\n".join(page_texts).strip()
| # ----------------------------- | |
| # Extract text from scanned PDFs using EasyOCR | |
| # ----------------------------- | |
def extract_text_from_scanned_pdf(file_path):
    """Return the text of a scanned PDF by running OCR on every page.

    Pages are rendered to images at 150 DPI, converted to NumPy arrays and
    passed to the module-level EasyOCR ``reader``. The fragments recognised
    on a page are joined with spaces; pages are joined with newlines and
    the result is stripped. A page whose OCR raises is reported on stdout
    and contributes no text.
    """
    page_texts = []
    for image in convert_from_path(file_path, dpi=150):
        try:
            fragments = reader.readtext(np.array(image), detail=0)
            page_texts.append(" ".join(fragments))
        except Exception as e:
            # Best effort: one unreadable page must not abort the document.
            print("OCR error on page:", e)
    return "\n".join(page_texts).strip()
| # ----------------------------- | |
| # Generate dummy options | |
| # ----------------------------- | |
def generate_options(correct_answer):
    """Return four shuffled answer options, one of which is the correct one.

    Three distinct filler options are drawn at random from a fixed pool of
    generic distractors and shuffled together with ``correct_answer``.

    Args:
        correct_answer: The option that must appear in the result.

    Returns:
        A list of four unique options in random order.
    """
    distractor_pool = [
        "None of the above",
        "All of the above",
        "Not mentioned",
        "Cannot be determined",
        "Irrelevant information",
    ]
    # Exclude the correct answer up front so it can never appear twice, even
    # when it happens to match one of the filler phrases. At least four
    # candidates always remain, so sampling three cannot fail.
    candidates = [opt for opt in distractor_pool if opt != correct_answer]
    options = [correct_answer] + random.sample(candidates, 3)
    random.shuffle(options)
    return options
| # ----------------------------- | |
| # Main processing function | |
| # ----------------------------- | |
def _wrap_in_questiondata(payload):
    """Wrap *payload* in a ``<questiondata>`` CDATA section.

    A literal ``]]>`` inside the payload would terminate the CDATA section
    early, so every occurrence is split across two adjacent sections.
    """
    safe_payload = payload.replace("]]>", "]]]]><![CDATA[>")
    return "<questiondata><![CDATA[" + safe_payload + "]]></questiondata>"


def process_pdf(pdf_file):
    """Turn an uploaded PDF into a ``<questiondata>`` XML quiz definition.

    The PDF's text is extracted directly, falling back to OCR when no text
    layer is found. Questions are generated from the first 1000 characters
    of that text, and each question is answered by the QA pipeline against
    the same context so that answers correspond to their questions.

    Args:
        pdf_file: The upload from Gradio; either an object exposing the
            file path as ``.name`` or the path itself. ``None`` when no
            file was uploaded.

    Returns:
        The XML string on success, or a human-readable message when no file
        was provided or no text could be extracted.
    """
    if pdf_file is None:
        return "No PDF file was provided. Please upload a PDF."

    # Accept both a tempfile-like upload object and a plain path string.
    source_path = getattr(pdf_file, "name", pdf_file)

    # Work on a private copy. The descriptor is closed before copying so the
    # file is not held open while it is overwritten, and the copy is removed
    # even when extraction raises.
    fd, temp_pdf_path = tempfile.mkstemp(suffix=".pdf")
    os.close(fd)
    try:
        shutil.copy(source_path, temp_pdf_path)
        # Step 1: try the embedded text layer.
        extracted_text = extract_text_from_pdf(temp_pdf_path)
        # Step 2: fall back to OCR for scanned documents.
        if not extracted_text.strip():
            extracted_text = extract_text_from_scanned_pdf(temp_pdf_path)
    finally:
        os.remove(temp_pdf_path)

    if not extracted_text.strip():
        # NOTE(review): the leading "β" looks like a mis-encoded symbol —
        # confirm the intended character.
        return "β Could not extract text. Make sure the PDF has readable content."

    # Only the first 1000 characters are used as model context.
    context = extracted_text[:1000]

    # Step 3: generate three candidate questions.
    questions_output = qg_pipeline(
        "generate questions: " + context,
        max_length=128,
        num_beams=3,
        num_return_sequences=3,
    )

    # Steps 4-5: answer each generated question against the same context
    # and build the question list. The "question: ...  context: ..." prompt
    # is the QA input format of the valhalla qa-qg models.
    question_list = []
    for generated in questions_output:
        question = generated["generated_text"]
        answer_output = qa_pipeline(
            f"question: {question}  context: {context}",
            max_length=64,
            num_beams=3,
        )
        correct_answer = "N/A"
        if answer_output:
            correct_answer = answer_output[0]["generated_text"].strip() or "N/A"
        options = generate_options(correct_answer)
        question_list.append({
            "questiontext": question,
            "questiontype": "single_select",
            "marks": 10,
            "options": [
                {"optiontext": opt, "score": "10" if opt == correct_answer else "0"}
                for opt in options
            ],
        })

    # Step 6: build the <questiondata> payload. "questions" is itself a
    # JSON-encoded string nested inside the outer JSON document.
    # NOTE(review): totalmarks "50" and cutoff "35" are fixed, while the
    # three 10-mark questions above add up to 30 — confirm with the consumer.
    data = {
        "title": "Certification Title",
        "totalmarks": "50",
        "time": "20",
        "cutoff": "35",
        "failurl": "",
        "passurl": "",
        "sendpassemail": True,
        "questions": json.dumps({"questions": question_list}),
        "maxattempts": 3,
    }

    # Step 7: wrap the JSON in XML CDATA.
    return _wrap_in_questiondata(json.dumps(data, indent=2))
| # ----------------------------- | |
| # Gradio Interface | |
| # ----------------------------- | |
# Single-input, single-output UI: the uploaded PDF goes to process_pdf and
# the returned string is shown as plain text.
iface = gr.Interface(
    title="PDF β Question & Answer Generator (with OCR)",
    description="Uploads a PDF, extracts text (or OCR for scanned PDFs), and generates XML with questions + answers.",
    fn=process_pdf,
    inputs=gr.File(label="π Upload your PDF"),
    outputs="text",
)

# Start the web server as soon as the script runs.
iface.launch()