Spaces:
Sleeping
Sleeping
### PineCone Embed and I/O Functions
| """ | |
| # This dataset matches the example data from DataLoader.py | |
| import OstreaCultura as OC | |
| hi = OC.example_data() | |
| hi = OC.df_to_pd(hi) | |
| OC.DataLoader.create_vectors_from_df(hi) | |
| """ | |
| function example_data() | |
| DataFrame( | |
| Embeddings = [[0.1, 0.2, 0.3, 0.4], [0.2, 0.3, 0.4, 0.5]], | |
| id = ["vec1", "vec2"], | |
| genre = ["drama", "action"] | |
| ) | |
| end | |
| """ | |
| df= OC.DataLoader.pd.read_csv("data/Indicator_Test.csv") | |
| df_julia = OC.pd_to_df(df) | |
| """ | |
| function pd_to_df(df_pd) | |
| df= DataFrame() | |
| for col in df_pd.columns | |
| df[!, col] = getproperty(df_pd, col).values | |
| end | |
| df | |
| end | |
| """ | |
| Available functions | |
| pc.create_index - see below | |
| pc.delete_index: pc.delete_index(index_name) | |
| """ | |
| function create_pinecone_context() | |
| pc = DataLoader.Pinecone(api_key=ENV["PINECONE_API_KEY"]) | |
| return pc | |
| end | |
| """ | |
| # Context for inference endpoints | |
| """ | |
| function create_inf_pinecone_context() | |
| pc = DataLoader.Pinecone(ENV["PINECONE_API_KEY"]) | |
| return pc | |
| end | |
| """ | |
| pc = create_pinecone_context() | |
| create_index("new-index", 4, "cosine", "aws", "us-east-1") | |
| """ | |
| function create_index(name, dimension, metric, cloud, region) | |
| ppc = create_pinecone_context() | |
| DataLoader.create_index(ppc, name, dimension, metric, cloud, region) | |
| end | |
| """ | |
| import OstreaCultura as OC | |
| df = OC.DataLoader.pd.read_csv("data/climate_test.csv") | |
| model = "multilingual-e5-large" | |
| out = OC.multi_embeddings(model, df, 96, "text") | |
| # Id and Embeddings are required columns in the DataFrame | |
| OC.upsert_data(out, "test-index", "test-namespace") | |
| df = OC.DataLoader.pd.read_csv("data/Indicator_Test.csv") | |
| model = "multilingual-e5-large" | |
| test_embeds = OC.multi_embeddings(model, df, 96, "text") | |
| test_embeds_min = test_embeds.head(10) | |
| # Id and Embeddings are required columns in the DataFrame | |
| OC.upsert_data(test_embeds_min, "test-index", "indicator-test-namespace", chunk_size=100) | |
| """ | |
| function upsert_data(df, indexname, namespace; chunk_size=1000) | |
| # Import DataLoader.py | |
| pc = create_pinecone_context() | |
| index = pc.Index(indexname) | |
| DataLoader.chunk_df_and_upsert(index, df, namespace=namespace, chunk_size=chunk_size) | |
| end | |
| """ | |
| ## How to query data using an existing embedding | |
| import OstreaCultura as OC; using DataFrames | |
| mydf = DataFrame(id = ["vec1", "vec2"], text = ["drama", "action"]) | |
| mydf = OC.multi_embeddings(mydf) | |
| vector = mydf.Embeddings[1] | |
| top_k = 5 | |
| include_values = true | |
| OC.query_data("test-index", "test-namespace", vector, top_k, include_values) | |
| """ | |
| function query_data(indexname, namespace, vector, top_k, include_values) | |
| pc = create_pinecone_context() | |
| index = pc.Index(indexname) | |
| DataLoader.query_data(index, namespace, vector, top_k, include_values).to_dict() | |
| end | |
| """ | |
| ## How to query data using an existing hybrid embedding | |
| import OstreaCultura as OC; using DataFrames | |
| querytext = "drama" | |
| dense = OC.embed_query(querytext) | |
| top_k = 5 | |
| include_values = true | |
| include_metadata = true | |
| OC.query_data_with_sparse("oc-hybrid-library-index", "immigration", dense, OC.DataLoader.empty_sparse_vector(), top_k, include_values, include_metadata) | |
| """ | |
| function query_data_with_sparse(indexname, namespace, dense, sparse, top_k, include_values, include_metadata) | |
| pc = create_pinecone_context() | |
| index = pc.Index(indexname) | |
| DataLoader.query_data_with_sparse(index, namespace, dense, sparse, top_k=top_k, include_values=include_values, include_metadata=include_metadata).to_dict() | |
| end | |
| """ | |
| ## Querying function for GGWP - using updated hybrid vector | |
| import OstreaCultura as OC | |
| claim = "drama" | |
| indexname = "oc-hybrid-library-index" | |
| ocmodel = "expanded-fact-checks" | |
| OC.search(claim, indexname, ocmodel, include_values=false, include_metadata=false) | |
| res = OC.search(claim, indexname, ocmodel) | |
| """ | |
| function search(claim, indexname, ocmodel; top_k=5, include_values=true, include_metadata=true) | |
| dense = embed_query(claim) | |
| query_data_with_sparse(indexname, ocmodel, dense, DataLoader.empty_sparse_vector(), top_k, include_values, include_metadata) | |
| end | |
"""
    unicodebarplot(x, y, title="Query Matches")

Render a terminal bar plot of values `y` labelled by `x` via UnicodePlots.
"""
unicodebarplot(x, y, title = "Query Matches") = UnicodePlots.barplot(x, y, title=title)
"""
    searchresult_to_unicodeplot(searchresult)

Bar-plot the match scores of a search result Dict, labelling each bar with
(up to) the first 41 characters of the match's metadata text.
"""
function searchresult_to_unicodeplot(searchresult)
    matches = searchresult["matches"]
    scores = [m["score"] for m in matches]
    text = [m["metadata"]["text"] for m in matches]
    # BUG FIX: `x[1:41]` indexes *bytes* and throws a StringIndexError when
    # byte 41 falls inside a multi-byte UTF-8 character (the guard counts
    # characters via `length`). `first(x, 41)` truncates by characters.
    text_to_show = [length(x) > 41 ? first(x, 41) * "..." : x for x in text]
    unicodebarplot(text_to_show, scores)
end
| """ | |
| ## Search and plot the results | |
| import OstreaCultura as OC | |
| claim = "drama" | |
| indexname = "oc-hybrid-library-index" | |
| ocmodel = "immigration" | |
| OC.searchplot(claim, indexname, ocmodel) | |
| """ | |
| function searchplot(claim, indexname, ocmodel; top_k=5, include_values=true, include_metadata=true) | |
| searchresult = search(claim, indexname, ocmodel, top_k=top_k, include_values=include_values, include_metadata=include_metadata) | |
| searchresult_to_unicodeplot(searchresult) | |
| end | |
| """ | |
| import OstreaCultura as OC | |
| df = OC.DataLoader.pd.read_csv("data/climate_test.csv") | |
| model = "multilingual-e5-large" | |
| out = OC.multi_embeddings(model, df, 96, "text") | |
| using CSV, DataFrames | |
| tdat = CSV.read("data/climate_test.csv", DataFrame) | |
| OC.multi_embeddings(model, Pandas.DataFrame(tdat), 96, "text") | |
| """ | |
| function multi_embeddings(model, data, chunk_size, textcol) | |
| pc = create_inf_pinecone_context() | |
| DataLoader.chunk_and_embed(pc, model, data, chunk_size, textcol) | |
| end | |
| """ | |
| using CSV, DataFrames | |
| import OstreaCultura as OC | |
| tdat = CSV.read("data/climate_test.csv", DataFrame) | |
| OC.multi_embeddings(tdat) | |
| """ | |
| function multi_embeddings(data::DataFrames.DataFrame; kwargs...) | |
| data = df_to_pd(data) | |
| model = get(kwargs, :model, "multilingual-e5-large") | |
| chunk_size = get(kwargs, :chunk_size, 96) | |
| textcol = get(kwargs, :textcol, "text") | |
| pc = create_inf_pinecone_context() | |
| DataLoader.chunk_and_embed(pc, model, data, chunk_size, textcol) | |
| end | |
| """ | |
| ## Julia DataFrame to pandas DataFrame | |
| """ | |
| function df_to_pd(df::DataFrames.DataFrame) | |
| pdataframe(df) | |
| end | |
"""
    embed_query(querytext; kwargs...) -> Vector

Embed a single query string and return its dense embedding vector.

Keyword arguments (e.g. `model`, `chunk_size`, `textcol`) are forwarded to
`multi_embeddings`; previously they were accepted but silently ignored.
"""
function embed_query(querytext; kwargs...)
    querydf = DataFrame(id = "vec1", text = querytext)
    embedded = multi_embeddings(querydf; kwargs...)
    return embedded.Embeddings[1]
end
| """ | |
| ## Query with a vector of embeddings | |
| import OstreaCultura as OC | |
| vector = rand(1024) | |
| indexname = "test-index" | |
| namespace = "test-namespace" | |
| vecresults = OC.query_w_vector(vector, indexname, namespace) | |
| """ | |
| function query_w_vector(vector, indexname, namespace; kwargs...) | |
| top_k = get(kwargs, :top_k, 5) | |
| include_values = get(kwargs, :include_values, true) | |
| pc = create_pinecone_context() | |
| index = pc.Index(indexname) | |
| queryresults = DataLoader.query_data(index, namespace, vector, top_k, include_values).to_dict() | |
| ## | |
| if include_values | |
| values_vector = [queryresults["matches"][i]["values"] for i in 1:length(queryresults["matches"])] | |
| else | |
| values_vector = [missing for i in 1:length(queryresults["matches"])] | |
| end | |
| # drop the "values" key from each dict so it doesn't get added to the DataFrame | |
| for i in 1:length(queryresults["matches"]) | |
| delete!(queryresults["matches"][i], "values") | |
| end | |
| out = DataFrame() | |
| for i in 1:length(queryresults["matches"]) | |
| out = vcat(out, DataFrame(queryresults["matches"][i])) | |
| end | |
| # If desired update this function to add the embeddings to the DataFrame | |
| if include_values | |
| out[:, "values"] = values_vector | |
| end | |
| return out | |
| end | |
| """ | |
| import OstreaCultura as OC | |
| indexname = "test-index" | |
| namespace = "test-namespace" | |
| pc = OC.create_pinecone_context() | |
| vector = OC.embed_query("drama") | |
| queryresults = OC.query_w_vector(vector, indexname, namespace, top_k=5, include_values=false) | |
| ### now, fetch the underlying data | |
| #fetched_data = OC.fetch_data(queryresults.id, indexname, namespace) | |
| index = pc.Index(indexname) | |
| resultfetch = OC.DataLoader.fetch_data(index, queryresults.id, namespace).to_dict() | |
| OC.parse_fetched_results(resultfetch) | |
| """ | |
| function parse_fetched_results(resultfetch) | |
| if length(resultfetch["vectors"]) > 0 | |
| ids = collect(keys(resultfetch["vectors"])) | |
| ## Grab the MetaData | |
| data = [] | |
| for id in ids | |
| push!(data, resultfetch["vectors"][id]["metadata"]) | |
| end | |
| ## Create a DataFrame From the MetaData | |
| out = DataFrame() | |
| for i in 1:length(data) | |
| try | |
| out = vcat(out, DataFrame(data[i])) | |
| catch | |
| out = vcat(out, DataFrame(data[i]), cols=:union) | |
| end | |
| end | |
| out[!, :id] = ids | |
| return out | |
| else | |
| "No data found" | |
| return DataFrame() | |
| end | |
| end | |
| """ | |
| import OstreaCultura as OC | |
| indexname = "test-index" | |
| namespace = "test-namespace" | |
| pc = OC.create_pinecone_context() | |
| index = pc.Index(indexname) | |
| ids = ["OSJeL7", "3TxWTNpPn"] | |
| query_results_as_dataframe = OC.fetch_data(ids, indexname, namespace) | |
| """ | |
| function fetch_data(ids, indexname, namespace; chunk_size=900) | |
| pc = create_pinecone_context() | |
| index = pc.Index(indexname) | |
| result_out = DataFrame() | |
| for i in 1:ceil(Int, length(ids)/chunk_size) | |
| chunk = ids[(i-1)*chunk_size+1:min(i*chunk_size, length(ids))] | |
| resultfetch = DataLoader.fetch_data(index, chunk, namespace).to_dict() | |
| result_out = vcat(result_out, parse_fetched_results(resultfetch)) | |
| end | |
| return result_out | |
| end | |
| """ | |
| ## FINAL Query function - embeds, queries, and fetches data | |
| import OstreaCultura as OC | |
| querytext = "drama" | |
| indexname = "test-index" | |
| namespace = "test-namespace" | |
| OC.query(querytext, indexname, namespace) | |
| """ | |
| function query(querytext::String, indexname::String, namespace::String; kwargs...) | |
| top_k = get(kwargs, :top_k, 5) | |
| include_values = get(kwargs, :include_values, true) | |
| vector = embed_query(querytext) | |
| queryresults = query_w_vector(vector, indexname, namespace, top_k=top_k, include_values=include_values) | |
| ### now, fetch the underlying data | |
| fetched_data = fetch_data(queryresults.id, indexname, namespace) | |
| # join the two dataframes on id | |
| merged = innerjoin(queryresults, fetched_data, on=:id) | |
| return merged | |
| end | |
"""
    filter_claims_closer_to_counterclaims(claim_results, counterclaim_results) -> DataFrame

Keep only rows whose claim score strictly beats their counterclaim score.
Ids absent from `counterclaim_results` get a counterclaim score of 0.0.

BUG FIX: the previous version used `rename!`, mutating the *caller's*
DataFrames despite the `!`-free function name; `rename` works on copies.
"""
function filter_claims_closer_to_counterclaims(claim_results, counterclaim_results)
    claims = rename(claim_results, :score => :claim_score)
    counters = rename(counterclaim_results, :score => :counterclaim_score)
    # Left join: keep every claim row even when no counterclaim matched it.
    df = leftjoin(claims, counters, on=:id)
    df.counterclaim_score = coalesce.(df.counterclaim_score, 0.0)
    return df[df.claim_score .> df.counterclaim_score, :]
end
| """ | |
| ## Query with claims and counterclaims | |
| import OstreaCultura as OC | |
| claim = "Climate change is a hoax" | |
| counterclaim = "Climate change is real" | |
| indexname = "test-index" | |
| namespace = "test-namespace" | |
| hi = OC.query_claims(claim, counterclaim, indexname, namespace) | |
| """ | |
| function query_claims(claim::String, counterclaim::String, indexname::String, namespace::String; kwargs...) | |
| threshold = get(kwargs, :threshold, 0.8) | |
| top_k = get(kwargs, :top_k, 5000) # top_k for the initial query | |
| # Get embeddings | |
| claim_vector = embed_query(claim) | |
| counterclaim_vector = embed_query(counterclaim) | |
| # Query the embeddings | |
| claim_results = query_w_vector(claim_vector, indexname, namespace, top_k=top_k, include_values=false) | |
| counterclaim_results = query_w_vector(counterclaim_vector, indexname, namespace, top_k=top_k, include_values=false) | |
| # If a given id has a greater score for the claim than the counterclaim, keep it | |
| allscores = filter_claims_closer_to_counterclaims(claim_results, counterclaim_results) | |
| # Filter to scores above the threshold | |
| allscores = allscores[allscores.claim_score .> threshold, :] | |
| if size(allscores)[1] == 0 | |
| "No claims were above the threshold" | |
| return DataFrame() | |
| else | |
| ## now, fetch the data | |
| resulting_data = fetch_data(allscores.id, indexname, namespace) | |
| # merge the data on id | |
| resulting_data = innerjoin(allscores, resulting_data, on=:id) | |
| return resulting_data | |
| end | |
| end | |
| """ | |
| ## Classify a claim against the existing misinformation library | |
| import OstreaCultura as OC | |
| ## Example 1 | |
| claim = "There is a lot of dispute about whether the Holocaust happened" | |
| counterclaim = "The Holocaust is a well-documented historical event" | |
| indexname = "ostreacultura-v1" | |
| namespace = "modified-misinfo-library" | |
| hi, counterscore = OC.classify_claim(claim, counterclaim, indexname, namespace) | |
| ## Example 2 | |
| claim = "it's cool to be trans these days" | |
| counterclaim = "" | |
| indexname = "ostreacultura-v1" | |
| namespace = "modified-misinfo-library" | |
| hi, counterscore = OC.classify_claim(claim, counterclaim, indexname, namespace) | |
| ## Example 3 | |
| claim = "No existe racismo contra las personas negras" | |
| counterclaim = "Racism is a systemic issue that affects people of color" | |
| indexname = "ostreacultura-v1" | |
| namespace = "modified-misinfo-library" | |
| hi, counterscore = OC.classify_claim(claim, counterclaim, indexname, namespace) | |
| """ | |
| function classify_claim(claim::String, counterclaim::String, indexname::String, namespace::String; kwargs...) | |
| threshold = get(kwargs, :threshold, 0.8) | |
| top_k = get(kwargs, :top_k, 10) # top_k for the initial query | |
| # Get embeddings | |
| claim_vector = embed_query(claim) | |
| if counterclaim != "" | |
| counterclaim_vector = embed_query(counterclaim) | |
| counterclaim_results = query_w_vector(counterclaim_vector, indexname, namespace, top_k=top_k, include_values=false) | |
| counterclaim_score = counterclaim_results.score[1] | |
| else | |
| counterclaim_score = 0.0 | |
| end | |
| # Query the embeddings | |
| claim_results = query_w_vector(claim_vector, indexname, namespace, top_k=top_k, include_values=false) | |
| # Filter to scores above the threshold | |
| claim_results = claim_results[claim_results.score .> threshold, :] | |
| ## now, fetch the data | |
| resulting_data = fetch_data(claim_results.id, indexname, namespace) | |
| resulting_data.scores = claim_results.score | |
| return resulting_data, counterclaim_score | |
| end | |
"""
    generate_sparse_model(csv_path="data/random_300k.csv") -> (vector, bm25)

Fit a BM25 sparse encoder over the `text` column of a CSV corpus. The corpus
path is now a parameter (defaulting to the previously hard-coded file).
"""
function generate_sparse_model(csv_path="data/random_300k.csv")
    df = DataLoader.pd.read_csv(csv_path)
    corpus = df["text"].tolist()
    # BUG FIX: was `OC.DataLoader.encode_documents` — `OC` is the *external*
    # import alias for this module and is undefined inside it; use the
    # module-local `DataLoader` binding.
    vector, bm25 = DataLoader.encode_documents(corpus)
    return vector, bm25
end