## Utility Functions
## Note: edit ~/.bigqueryrc to set global settings for the bq command-line tool

using CSV
using DataFrames
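## For reference, a minimal ~/.bigqueryrc might look like the sketch below
## (illustrative only; the project id is assumed from the table names used in this file):
##
##   --project_id=ostreacultura
##
##   [query]
##   --use_legacy_sql=false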
| """ | |
| ## ostreacultura_bq_auth() | |
| - Activate the service account using the credentials file | |
| """ | |
| function ostreacultura_bq_auth() | |
| if isfile("ostreacultura-credentials.json") | |
| run(`gcloud auth activate-service-account --key-file=ostreacultura-credentials.json`) | |
| else | |
| println("Credentials file not found") | |
| end | |
| end | |
| """ | |
| ## bq(query::String) | |
| - Run a BigQuery query and return the result as a DataFrame | |
| Example: bq("SELECT * FROM ostreacultura.climate_truth.training LIMIT 10") | |
| """ | |
| function bq(query::String) | |
| tname = tempname() | |
| run(pipeline(`bq query --use_legacy_sql=false --format=csv $query`, tname)) | |
| return CSV.read(tname, DataFrame) | |
| end | |
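## Illustrative only (not part of the original API): a sketch of the intended call
## order, authenticate once and then query. Assumes the credentials file and the
## ostreacultura.climate_truth.training table from the docstring example exist.
function bq_demo()
    ostreacultura_bq_auth()
    df = bq("SELECT * FROM ostreacultura.climate_truth.training LIMIT 10")
    println(first(df, 5))
    return df
end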
| """ | |
| ## bq_db(query::String, db::String) | |
| - Run a BigQuery query and save to a database | |
| Example: | |
| bq_db("SELECT * FROM ostreacultura.climate_truth.training LIMIT 10", "data/test.csv") | |
| """ | |
| function bq_db(query::String, db::String) | |
| run(pipeline(`bq query --use_legacy_sql=false --format=csv $query`, db)) | |
| end | |
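## Illustrative only (not part of the original API): export a query result to the
## CSV path from the docstring example, then read it back as a DataFrame.
function bq_db_demo()
    mkpath("data")   ## make sure the target directory exists before bq writes to it
    bq_db("SELECT * FROM ostreacultura.climate_truth.training LIMIT 10", "data/test.csv")
    return CSV.read("data/test.csv", DataFrame)
end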
| """ | |
| one token is roughly 3/4 of a word | |
| """ | |
| function token_estimate(allstrings::Vector{String}) | |
| ## Tokenize the strings | |
| tokens = [split(x) for x in allstrings] | |
| ## Estimate the number of tokens | |
| token_estimate = sum([length(x) for x in tokens]) | |
| return token_estimate * 4 / 3 | |
| end | |
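## Worked example (illustrative): 6 words across two strings gives an estimate of
## 6 * 4/3 = 8 tokens.
function token_estimate_demo()
    est = token_estimate(["hello world", "one two three four"])   ## 6 words total
    @assert est == 8.0
    return est
end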
"""
## chunk_by_tokens(allstrings::Vector{String}, max_tokens::Int=8191)
- Split a vector of strings into chunks whose estimated token count stays under max_tokens
- Uses the same 4/3 tokens-per-word estimate as token_estimate
"""
function chunk_by_tokens(allstrings::Vector{String}, max_tokens::Int=8191)
    ## Estimate each string's token count (4/3 tokens per word)
    string_tokens = [length(split(x)) * 4 / 3 for x in allstrings]
    ## Greedily fill chunks up to the token budget
    chunks = Vector{Vector{String}}()
    chunk = String[]
    chunk_tokens = 0.0
    for i in 1:length(allstrings)
        if chunk_tokens + string_tokens[i] < max_tokens
            push!(chunk, allstrings[i])
            chunk_tokens += string_tokens[i]
        else
            push!(chunks, chunk)
            chunk = [allstrings[i]]
            chunk_tokens = string_tokens[i]
        end
    end
    push!(chunks, chunk)
    return chunks
end
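## Illustrative only (not part of the original API): chunk a few short strings with
## a small token budget so the split is easy to see. Each element of the result is
## a vector of the original strings.
function chunk_by_tokens_demo()
    strings = ["alpha beta gamma", "delta epsilon", "zeta eta theta iota"]
    chunks = chunk_by_tokens(strings, 8)
    @assert length(chunks) == 2   ## first two strings fit in one chunk, the last starts a new one
    return chunks
end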