import os
import shutil
from typing import List, Optional

from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS


class RAGEngine:
    """Small retrieval helper backed by a FAISS index persisted on disk.

    Documents are loaded, split into overlapping chunks, embedded with the
    ``all-MiniLM-L6-v2`` HuggingFace model and stored in a FAISS vector
    store that is saved under ``index_path`` after every ingest.
    """

    # Chunking parameters handed to RecursiveCharacterTextSplitter.
    CHUNK_SIZE = 500
    CHUNK_OVERLAP = 50

    def __init__(self, index_path: str = "faiss_index") -> None:
        """Create the engine and try to load an existing index.

        Args:
            index_path: Location where the FAISS index is saved to and
                loaded from.
        """
        self.index_path = index_path
        self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        # Stays None until an index is loaded from disk or a file is ingested.
        self.vector_store: Optional[FAISS] = None
        self._load_index()

    def _load_index(self) -> None:
        """Load the persisted index from ``index_path`` if one exists.

        A load failure is reported on stdout and leaves ``vector_store`` as
        ``None`` so the engine can still be used to build a fresh index.
        """
        if not os.path.exists(self.index_path):
            print("No existing FAISS index found.")
            return

        try:
            # SECURITY: allow_dangerous_deserialization=True lets load_local
            # unpickle data stored next to the index. Only point index_path
            # at locations this application wrote itself; never at an index
            # obtained from an untrusted source.
            self.vector_store = FAISS.load_local(
                self.index_path,
                self.embeddings,
                allow_dangerous_deserialization=True,
            )
        except Exception as e:
            # Deliberate best-effort: an unreadable index must not prevent
            # the engine from starting.
            print(f"Failed to load index: {e}")
            self.vector_store = None
        else:
            print("Loaded existing FAISS index.")

    def ingest_file(self, file_path: str) -> None:
        """Load a file, split it into chunks and add them to the index.

        Files whose name ends in ``.pdf`` (case-insensitive) are read with
        ``PyPDFLoader``; everything else is read with ``TextLoader``. The
        updated index is saved to ``index_path``.

        Args:
            file_path: Path of the document to ingest.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ValueError: If no text chunks could be produced from the file
                (for example an empty file).
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        # Compare case-insensitively so "REPORT.PDF" is not parsed as text.
        if file_path.lower().endswith(".pdf"):
            loader = PyPDFLoader(file_path)
        else:
            loader = TextLoader(file_path)
        documents = loader.load()

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.CHUNK_SIZE,
            chunk_overlap=self.CHUNK_OVERLAP,
        )
        texts = text_splitter.split_documents(documents)

        # An empty chunk list cannot seed or extend the vector store; fail
        # with a clear message before the index is touched instead of
        # surfacing an opaque error from inside the store.
        if not texts:
            raise ValueError(
                f"No text content could be extracted from: {file_path}"
            )

        if self.vector_store is None:
            self.vector_store = FAISS.from_documents(texts, self.embeddings)
        else:
            self.vector_store.add_documents(texts)

        self.vector_store.save_local(self.index_path)
        print(f"Ingested {file_path} and updated index.")

    def search(self, query: str, k: int = 3) -> List[str]:
        """Return the text of the chunks most similar to ``query``.

        Args:
            query: Query text to search for.
            k: Maximum number of chunks to return.

        Returns:
            The ``page_content`` of each matching chunk, in the order the
            vector store returns them. Empty when no index is available or
            ``k`` is not positive.
        """
        if self.vector_store is None or k <= 0:
            return []
        docs = self.vector_store.similarity_search(query, k=k)
        return [doc.page_content for doc in docs]

    def clear_index(self) -> None:
        """Delete the persisted index and drop the in-memory vector store."""
        if os.path.isdir(self.index_path) and not os.path.islink(self.index_path):
            shutil.rmtree(self.index_path)
        elif os.path.lexists(self.index_path):
            # rmtree refuses regular files and symlinks; remove those directly.
            os.remove(self.index_path)
        self.vector_store = None
        print("Index cleared.")