Spaces:
Sleeping
Sleeping
| ## Upload Telegram 300K to hybrid-sparse | |
| from pinecone.grpc import PineconeGRPC as Pinecone | |
| import os | |
| import pandas as pd | |
| import numpy as np | |
| from pinecone import ServerlessSpec | |
| from pinecone_text.sparse import BM25Encoder | |
| import sys | |
| sys.path.append('src/python') | |
| import DataLoader | |
# Connect to Pinecone and (re)create the hybrid index from scratch.
#
# The API key is read from the environment so that no credential is stored in
# source control. Any key that was ever committed to this file should be
# treated as compromised and rotated in the Pinecone console.
api_key = os.environ.get("PINECONE_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the PINECONE_API_KEY environment variable before running this script."
    )
pc = Pinecone(api_key=api_key)

# Drop a previous copy of the index so the upload starts clean. The existence
# check keeps the first run (when there is nothing to delete yet) from failing.
if "oc-hybrid-300k-index" in pc.list_indexes().names():
    pc.delete_index("oc-hybrid-300k-index")

# NOTE(review): dimension=1024 presumably matches the dense embedding model
# used elsewhere in this script (multilingual-e5-large) -- confirm.
# NOTE(review): Pinecone's hybrid (sparse + dense) search is documented as
# requiring the dotproduct metric -- confirm before changing it.
pc.create_index(
    name="oc-hybrid-300k-index",
    dimension=1024,
    metric="dotproduct",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1",
    ),
)
## Upsert Indicator Test Data
df = pd.read_csv('data/random_300k.csv')
# For a quick smoke test, restrict the frame to a few rows here
# (e.g. df = df.head(3)) before building the embeddings.

# Build the sparse embeddings. create_sparse_embeds is a project helper; it
# returns an encoder object and a new DataFrame that has a 'sparse_values'
# column (used below).
bm25, newdf = DataLoader.create_sparse_embeds(pc, df)

# Attach the raw text of every row as the vector's metadata payload.
# NOTE(review): this assumes newdf has exactly one row per row of df, in the
# same order -- confirm against DataLoader.create_sparse_embeds.
metadata = df[['text']].to_dict(orient='records')
newdf.loc[:, 'metadata'] = metadata

## Take a look at rows where the sparse values are empty, and drop them.
# The mask is derived from newdf itself so it carries newdf's own index;
# a mask with a fresh 0..n-1 index would be aligned by label and could
# select the wrong rows (or raise) if newdf is not indexed 0..n-1.
# NOTE(review): if each 'sparse_values' entry is a dict with 'indices' and
# 'values' keys, len() is never 0 and this filter is a no-op -- confirm the
# shape produced by DataLoader.create_sparse_embeds.
has_sparse_terms = newdf['sparse_values'].map(len) != 0
newdf = newdf[has_sparse_terms]

# Convert the remaining rows into the structure index.upsert() accepts.
vecs = DataLoader.create_sparse_dense_dict(newdf)
index = pc.Index("oc-hybrid-300k-index")

# Batch upsert the vectors. Slicing clamps at the end of the sequence, so the
# final (possibly shorter) batch needs no special handling.
batch_size = 400
for start in range(0, len(vecs), batch_size):
    index.upsert(vecs[start:start + batch_size], namespace="telegram-300k")
################# Querying the index
# This section reloads the corpus and re-opens the index handle so that it can
# be run without re-running the upload steps.
df = pd.read_csv('data/random_300k.csv')
corpus = df['text'].tolist()

# DataLoader.encode_documents is a project helper; only its second return
# value (the encoder) is used below.
# NOTE(review): presumably this fits a BM25 encoder on the whole corpus,
# which is expensive for 300k documents -- consider persisting the encoder.
vector, bm25 = DataLoader.encode_documents(corpus)
index = pc.Index("oc-hybrid-300k-index")

querytext = "satanic"

# Dense half of the hybrid query.
queryembed = DataLoader.query_embed(pc, "multilingual-e5-large", querytext)

# Sparse half of the hybrid query. Query text must be encoded with
# encode_queries(), not encode_documents(): the BM25 encoder weights the two
# sides differently, and document-side weights on a query distort the ranking.
# NOTE(review): assumes bm25 is a pinecone_text BM25Encoder -- confirm.
query_sparse_vector = bm25.encode_queries(querytext)

query_response = index.query(
    top_k=5,
    namespace="telegram-300k",
    vector=queryembed,
    sparse_vector=query_sparse_vector,
    include_metadata=True,
)
# Bare expression: displays the response when run in a notebook/REPL cell.
query_response