import os
import time

import numpy as np
import spaces
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

from funcs.helper_functions import GPU_SPACE_DURATION

# Uncomment to disable CUDA for testing purposes
# os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

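# On Hugging Face ZeroGPU Spaces, the @spaces.GPU decorator below requests a GPU
# for up to GPU_SPACE_DURATION seconds while the decorated function runs; outside
# ZeroGPU hardware the decorator is a no-op, so the function also runs locally.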
@spaces.GPU(duration=GPU_SPACE_DURATION)
def make_or_load_embeddings(docs: list, file_list: list, embeddings_out: np.ndarray, embeddings_super_compress: str, high_quality_mode_opt: str, embeddings_name: str = "mixedbread-ai/mxbai-embed-xsmall-v1", random_seed: int = 42) -> tuple:
    """
    Create or load embeddings for the given documents.

    Args:
        docs (list): List of documents to embed.
        file_list (list): List of file names to check for existing saved embeddings.
        embeddings_out (np.ndarray): Any pre-existing embeddings passed in (may be empty).
        embeddings_super_compress (str): Option to super compress embeddings ("Yes" or "No").
        high_quality_mode_opt (str): Option for high quality mode ("Yes" or "No").
        embeddings_name (str): Name of the sentence-transformers model to download if no local copy is found.
        random_seed (int): Random seed for vectorisation.

    Returns:
        tuple: The generated or loaded embeddings (np.ndarray) and the embedding model used to create them.
    """
    # Check for torch cuda
    # from torch import cuda, backends, version
    # print("Is CUDA enabled? ", cuda.is_available())
    # print("Is a CUDA device available on this computer?", backends.cudnn.enabled)
    # if cuda.is_available():
    #     torch_device = "gpu"
    #     print("Cuda version installed is: ", version.cuda)
    #     high_quality_mode = "Yes"
    #     os.system("nvidia-smi")
    # else:
    #     torch_device = "cpu"
    #     high_quality_mode = "No"
    if high_quality_mode_opt == "Yes":
        # Define a list of possible local locations to search for the model
        local_embeddings_locations = [
            "model/embed/",  # Potential local location
            "/model/embed/",  # Potential location in a Docker container
            "/home/user/app/model/embed/"  # Path inside a Docker container on Hugging Face
        ]

        # Attempt to load the model from each local location in turn
        for location in local_embeddings_locations:
            try:
                embedding_model = SentenceTransformer(location)  # , truncate_dim=512)
                print(f"Found local model installation at: {location}")
                break  # Exit the loop if the model is found
            except Exception as e:
                print(f"Failed to load model from {location}: {e}")
                continue
        else:
            # The 'else' of a for loop runs only if the loop completed without a
            # break, i.e. the model was not found in any local location
            print("Could not find local model installation. Downloading from Huggingface")
            embedding_model = SentenceTransformer(embeddings_name)  # , truncate_dim=512)
    else:
        embedding_model = make_pipeline(
            TfidfVectorizer(),
            TruncatedSVD(100, random_state=random_seed)
        )
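        # The pipeline above produces 100-dimensional dense document vectors via
        # latent semantic analysis (TF-IDF weighting followed by truncated SVD)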

    # Ensure embeddings_out is a numpy array (it may arrive from Gradio state as a
    # string, list or tensor); convert tensors, and replace anything else unusable
    # with an empty array so that new embeddings are generated below
    if hasattr(embeddings_out, 'cpu'):
        embeddings_out = embeddings_out.cpu().numpy()
    elif not isinstance(embeddings_out, np.ndarray):
        embeddings_out = np.array([])

    # If no embeddings were found, make them or load them in
    if embeddings_out.size == 0:
        print("Embeddings not found. Loading or generating new ones.")
        embeddings_file_names = [string for string in file_list if "embedding" in string.lower()]

        if embeddings_file_names:
            embeddings_file_name = embeddings_file_names[0]
            print("Loading embeddings from file.")
            embeddings_out = np.load(embeddings_file_name)['arr_0']

            # Files with 'compress' in the name were multiplied by 100 before
            # saving, so scale the loaded values back down
            if "compress" in embeddings_file_name:
                embeddings_out /= 100

        if not embeddings_file_names:
            tic = time.perf_counter()
            print("Starting to embed documents.")

            # On CPU, don't resort to transformer embedding models
            if high_quality_mode_opt == "No":
                print("Creating simplified 'sparse' embeddings based on TfIDF")
                # Fit the pipeline to the text data, then transform it to embeddings
                embedding_model.fit(docs)
                embeddings_out = embedding_model.transform(docs)
            elif high_quality_mode_opt == "Yes":
                print("Creating dense embeddings based on transformers model")
                # Convert model to half precision (fp16) to halve GPU memory use
                embedding_model.half()
                embeddings_out = embedding_model.encode(sentences=docs, show_progress_bar=True, batch_size=32)  # , precision="int8") # For large

            toc = time.perf_counter()
            time_out = f"The embedding took {toc - tic:0.1f} seconds"
            print(time_out)

            # If the user has chosen super compressed embedding files to save disk space
            if embeddings_super_compress == "Yes":
                embeddings_out = np.round(embeddings_out, 3)
                embeddings_out *= 100
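                # Worked example: 0.12345 -> rounded to 0.123 -> stored as 12.3,
                # which the loading branch above divides by 100 to recover 0.123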

        # Move the model to CPU before returning to avoid CUDA initialisation in the main process
        if high_quality_mode_opt == "Yes" and hasattr(embedding_model, 'to'):
            try:
                embedding_model = embedding_model.to('cpu')
            except Exception:
                pass  # If moving to CPU fails, continue anyway

        return embeddings_out, embedding_model
    else:
        print("Found pre-loaded embeddings.")

        # Defensive conversion: make sure pre-loaded embeddings end up as a CPU
        # numpy array, whatever object Gradio state handed over
        if hasattr(embeddings_out, 'cpu'):
            embeddings_out = embeddings_out.cpu().numpy()
        elif not isinstance(embeddings_out, np.ndarray):
            embeddings_out = np.array(embeddings_out)

        return embeddings_out, embedding_model
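

if __name__ == "__main__":
    # Minimal smoke test of the CPU fallback path. This block is an illustrative
    # sketch, not part of the app's real call flow; the generated sample documents
    # just need enough vocabulary for TruncatedSVD's 100 components to fit.
    sample_docs = [f"Document number {i} discusses topic {i % 7} in some detail." for i in range(200)]
    embeddings, model = make_or_load_embeddings(
        docs=sample_docs,
        file_list=[],                 # no saved embedding files to reuse
        embeddings_out=np.array([]),  # empty array forces fresh generation
        embeddings_super_compress="No",
        high_quality_mode_opt="No",   # TF-IDF + SVD fallback, no GPU required
    )
    print(f"Generated embeddings with shape {embeddings.shape}")  # expect (200, 100)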