| """ | |
| Read papers from a PDF file and extract the title, abstract, figures and tables captions, and main content. These | |
| functions work best with ICLR / NeurIPS papers. | |
| """ | |
from io import StringIO

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage


def extract_text_from_pdf(path: str) -> str:
    """Extracts text from a PDF file.

    Args:
        path (str): A string specifying the path to the PDF file.

    Returns:
        A string containing the extracted text from the PDF.
    """
    with open(path, 'rb') as file_handle:
        # Initialize a PDF resource manager to store shared resources.
        resource_manager = PDFResourceManager()
        # Set up a StringIO instance to capture the extracted text.
        text_output = StringIO()
        # Create a TextConverter to convert PDF pages to text.
        converter = TextConverter(resource_manager, text_output, laparams=LAParams())
        # Initialize a PDF page interpreter.
        interpreter = PDFPageInterpreter(resource_manager, converter)
        # Process each page in the PDF.
        for page in PDFPage.get_pages(file_handle, caching=True, check_extractable=True):
            interpreter.process_page(page)
        # Retrieve the extracted text and close the StringIO instance.
        extracted_text = text_output.getvalue()
        text_output.close()
        # Finalize the converter.
        converter.close()
    # Replace form feed characters with newlines.
    extracted_text = extracted_text.replace('\x0c', '\n')
    return extracted_text
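
# Aside: the manual resource-manager / converter / interpreter plumbing above mirrors what the
# high-level pdfminer.six helper does internally. Assuming a reasonably recent pdfminer.six
# install (an assumption, not verified here), the same result can typically be obtained with the
# sketch below; it is kept as a comment so module behaviour is unchanged.
#
#     from pdfminer.high_level import extract_text
#     raw_text = extract_text("path/to/paper.pdf").replace('\x0c', '\n')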


def convert_text_into_dict(text: str) -> dict:
    """Converts the extracted text into a dictionary.

    Args:
        text (str): The extracted text from the PDF.

    Returns:
        A dict containing the extracted fields from the paper.
    """
    lines = text.split('\n')
    # Create a filtered list to store non-matching lines
    filtered_lines = [line for line in lines if not (line.startswith('Under review') or
                                                     line.startswith('Published as') or
                                                     line.startswith('Paper under double-blind review'))]
    # Remove the first few empty lines before the title
    while filtered_lines[0].strip() == "":
        filtered_lines.pop(0)
    # Get title
    title = ""
    while filtered_lines[0] != "":
        title += filtered_lines.pop(0) + ' '
    title = title.strip().capitalize()
    # Remove the author information between the title and the abstract
    while filtered_lines[0].lower() != "abstract":
        filtered_lines.pop(0)
    filtered_lines.pop(0)
    # Get abstract
    abstract = ""
    while filtered_lines[0].lower() != "introduction":
        abstract += filtered_lines.pop(0) + ' '
    main_content = ""
    figures_captions = []
    tables_captions = []
    while filtered_lines != [] and not filtered_lines[0].lower().startswith("references"):
        figure_caption = ""
        table_caption = ""
        if filtered_lines[0].lower().startswith("figure"):
            while not filtered_lines[0] == "":
                figure_caption += filtered_lines.pop(0) + ' '
        elif filtered_lines[0].lower().startswith("table"):
            while not filtered_lines[0] == "":
                table_caption += filtered_lines.pop(0) + ' '
        else:
            main_content += filtered_lines.pop(0) + ' '
        if figure_caption != "":
            figures_captions.append(figure_caption)
        if table_caption != "":
            tables_captions.append(table_caption)
    figures_captions = "\n".join(figures_captions) + "\n" + "\n".join(tables_captions)
    # We have now reached the "References" section. Skip it by advancing to the first
    # section title in the Appendix.
    # Example section title: "A ENVIRONMENT DETAILS"
    while filtered_lines != [] and not (filtered_lines[0].isupper() and filtered_lines[0][0] == "A"):
        filtered_lines.pop(0)
    # Get the appendix
    appendix = ""
    while filtered_lines != []:
        appendix += filtered_lines.pop(0) + ' '
    paper = {
        "Title": title.strip(),
        "Abstract": abstract.strip(),
        "Figures/Tables Captions": figures_captions.strip(),
        "Main Content": main_content.strip(),
        "Appendix": appendix.strip(),
    }
    return paper
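

# A small convenience wrapper, added here as a sketch rather than as part of the original module:
# it simply chains the two functions above so callers can go from a PDF path to the parsed fields
# in one call. The name parse_pdf is an assumption, not an established API.
def parse_pdf(path: str) -> dict:
    """Extracts text from the PDF at `path` and parses it into the paper fields."""
    return convert_text_into_dict(extract_text_from_pdf(path))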


if __name__ == "__main__":
    from agentreview.utility.authentication_utils import read_and_set_openai_key
    from agentreview.review import get_lm_review

    read_and_set_openai_key()

    path = "data/rejected/6359.pdf"
    text = extract_text_from_pdf(path)
    parsed_paper = convert_text_into_dict(text)

    review_generated = get_lm_review(parsed_paper)
    print(review_generated["review_generated"])