Spaces:
Running
Running
| import torch | |
| from PIL import Image | |
| import numpy as np | |
| import os | |
| import sys | |
# pytesseract is an optional dependency. TESSERACT_AVAILABLE ends up True only
# when the Python wrapper imports AND the version probe of the underlying
# tesseract installation succeeds.
try:
    import pytesseract
except ImportError:
    TESSERACT_AVAILABLE = False
else:
    try:
        # The return value is irrelevant; the call is used purely as a probe
        # that raises when tesseract cannot be invoked.
        pytesseract.get_tesseract_version()
    except Exception:
        TESSERACT_AVAILABLE = False
    else:
        TESSERACT_AVAILABLE = True
# Module-level cache for the LayoutLMv2 processor and model. Both stay None
# until get_document_ai_models() is first called.
processor = None
model = None


def get_document_ai_models():
    """Return the cached LayoutLMv2 ``(processor, model)`` pair, loading on demand.

    Each object is created with ``from_pretrained`` the first time it is found
    to be ``None`` and then stored in the module-level ``processor`` / ``model``
    globals, so later calls return the same instances. ``transformers`` is
    imported inside the function, so the import only happens when something
    actually needs to be loaded.

    Returns:
        Tuple ``(processor, model)``.
    """
    global processor, model

    checkpoint = "microsoft/layoutlmv2-base-uncased"

    if processor is None:
        from transformers import LayoutLMv2Processor as processor_cls
        processor = processor_cls.from_pretrained(checkpoint)

    if model is None:
        from transformers import LayoutLMv2ForSequenceClassification as model_cls
        model = model_cls.from_pretrained(checkpoint)

    return processor, model
def extract_text_with_tesseract(image):
    """Extract words and their bounding boxes from an image with Tesseract OCR.

    Args:
        image: A PIL image or a numpy array. Either is converted to an RGB
            PIL image before OCR.

    Returns:
        Tuple ``(words, word_boxes)``. ``words`` holds every non-blank text
        entry reported by ``pytesseract.image_to_data``; ``word_boxes`` is a
        parallel list of ``[left, top, left + width, top + height]`` built
        from the matching ``left`` / ``top`` / ``width`` / ``height`` entries.

    Raises:
        RuntimeError: If ``TESSERACT_AVAILABLE`` is false.
    """
    if not TESSERACT_AVAILABLE:
        raise RuntimeError("tesseract is not installed or it's not in your PATH. See README file for more information.")

    if isinstance(image, np.ndarray):
        pil_image = Image.fromarray(image).convert("RGB")
    else:
        pil_image = image.convert("RGB")

    # One image_to_data call provides both the text and its geometry. The
    # previous extra image_to_string call ran OCR a second time and its result
    # was never used, so it has been removed.
    data = pytesseract.image_to_data(pil_image, output_type=pytesseract.Output.DICT)

    words = []
    word_boxes = []
    columns = zip(data['text'], data['left'], data['top'], data['width'], data['height'])
    for text, left, top, width, height in columns:
        # Skip entries whose text is empty or whitespace-only.
        if not text.strip():
            continue
        words.append(text)
        word_boxes.append([left, top, left + width, top + height])

    return words, word_boxes
_TROCR_CHECKPOINT = "microsoft/trocr-base-printed"

# Holds the (processor, model) pair under the key "models" once loaded, so the
# checkpoint is instantiated at most once per process instead of on every call.
_trocr_cache = {}


def _get_trocr_models():
    """Return the cached TrOCR ``(processor, model)`` pair, loading it on first use."""
    if "models" not in _trocr_cache:
        from transformers import TrOCRProcessor, VisionEncoderDecoderModel

        _trocr_cache["models"] = (
            TrOCRProcessor.from_pretrained(_TROCR_CHECKPOINT),
            VisionEncoderDecoderModel.from_pretrained(_TROCR_CHECKPOINT),
        )
    return _trocr_cache["models"]


def extract_text_with_transformers(image):
    """Extract text with the TrOCR transformers model when Tesseract is not available.

    Args:
        image: A PIL image or a numpy array. Either is converted to an RGB
            PIL image before recognition.

    Returns:
        Tuple ``(words, word_boxes)``. ``words`` is the decoded text split on
        whitespace. This path produces no geometry, so ``word_boxes`` contains
        one ``[0, 0, 0, 0]`` placeholder per word.

        This function does not raise on failure. If anything goes wrong
        (including ``transformers`` being missing), it returns a two-element
        ``words`` list -- a fixed error prefix followed by the exception text --
        with two placeholder boxes.
    """
    try:
        # Prepare the image before touching the model so invalid input fails
        # fast, without first loading the checkpoint.
        if isinstance(image, np.ndarray):
            pil_image = Image.fromarray(image).convert("RGB")
        else:
            pil_image = image.convert("RGB")

        trocr_processor, trocr_model = _get_trocr_models()

        # NOTE(review): the whole image is passed to the model in a single
        # call -- confirm this checkpoint handles multi-line pages acceptably.
        pixel_values = trocr_processor(pil_image, return_tensors="pt").pixel_values
        generated_ids = trocr_model.generate(pixel_values)
        generated_text = trocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

        words = generated_text.split()
        # No bounding boxes are available here; emit one placeholder per word
        # so the two lists stay parallel.
        word_boxes = [[0, 0, 0, 0] for _ in words]
        return words, word_boxes
    except Exception as e:
        # Deliberate best-effort: report the failure through the return value
        # rather than propagating it.
        return ["Error extracting text with transformers OCR:", str(e)], [[0, 0, 0, 0], [0, 0, 0, 0]]
def extract_text_and_layout(image):
    """
    Extract text and layout information using OCR.

    Exactly one OCR backend is used per call: Tesseract when
    TESSERACT_AVAILABLE is true, otherwise the transformers-based extractor.
    There is no retry with the other backend if the chosen one raises.

    Args:
        image: PIL Image object, or a numpy array (converted to an RGB PIL
            image first)
    Returns:
        Dictionary with keys 'words', 'boxes' and 'success'. 'success' is
        False when the backend raised (then 'words' holds a single error
        message and 'boxes' a single zero box) or when no words were found
        (then both lists are empty).
    """
    if isinstance(image, np.ndarray):
        image = Image.fromarray(image).convert("RGB")

    try:
        # Select the backend inside the try block so every failure on this
        # path is reported through the returned dictionary.
        run_ocr = extract_text_with_tesseract if TESSERACT_AVAILABLE else extract_text_with_transformers
        words, boxes = run_ocr(image)
    except Exception as e:
        # The selected backend failed; surface the error text to the caller.
        return {
            'words': [f"Error extracting text: {str(e)}"],
            'boxes': [[0, 0, 0, 0]],
            'success': False
        }

    if words:
        return {
            'words': words,
            'boxes': boxes,
            'success': True
        }

    # Nothing was recognized: report an empty, unsuccessful result.
    return {
        'words': [],
        'boxes': [],
        'success': False
    }