Commit fd342b4
Parent(s): 0d00a0e

Add compression and decompression functions for fact check data; update dependencies and remove obsolete files

- .gitattributes +1 -0
- Manifest.toml +1 -1
- Project.toml +1 -0
- data/fc_latest_maxi_compr +3 -0
- scripts/UpdateHuggingFaceAPI.jl +4 -0
- server.jl +6 -1
- src/OstreaCultura.jl +3 -0
- src/compress.jl +39 -0
.gitattributes CHANGED
@@ -37,3 +37,4 @@ data/filtered_fact_check_latest_embed.csv filter=lfs diff=lfs merge=lfs -text
 data/random_300k.csv filter=lfs diff=lfs merge=lfs -text
 *.csv filter=lfs diff=lfs merge=lfs -text
 *.json filter=lfs diff=lfs merge=lfs -text
+data/fc_latest_maxi_compr filter=lfs diff=lfs merge=lfs -text
Manifest.toml CHANGED
@@ -2,7 +2,7 @@
 
 julia_version = "1.10.4"
 manifest_format = "2.0"
-project_hash = "
+project_hash = "071291b10413261c56b71962d94f340814c6f62c"
 
 [[deps.AbstractTrees]]
 git-tree-sha1 = "2d9c9a55f9c93e8887ad391fbae72f8ef55e1177"
Project.toml CHANGED
@@ -5,6 +5,7 @@ version = "1.0.0-DEV"
 
 [deps]
 CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
+CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
 Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
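For context, CodecZlib is the package that provides the GzipCompressorStream/GzipDecompressorStream used by the new src/compress.jl. A dependency like this would normally be added from the Julia REPL; a minimal sketch (the exact command used is not part of this commit):

    using Pkg
    Pkg.add("CodecZlib")   # writes the UUID above into Project.toml and records the resolved version in Manifest.toml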
data/fc_latest_maxi_compr ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bfec2c2ec6ed5b0d4df6bc91838a72cd4a87db7d66c6f89245d6534557973e27
+size 341251717
scripts/UpdateHuggingFaceAPI.jl CHANGED
@@ -33,3 +33,7 @@ narrs = narrs[.!ismissing.(narrs.text), :]
 narratives_embed = OC.maxi_embed.(narrs.text) # seconds to run
 narrs.Embeddings = narratives_embed
 CSV.write("data/expansive_claims_library_expanded_embed_maxi.csv", narrs)
+# Compress the fact check data
+OC.compress_csv("data/fact_check_latest_embed_maxi.csv", "data/fc_latest_maxi_compr")
+# Delete the original
+rm("data/fact_check_latest_embed_maxi.csv")
server.jl CHANGED
@@ -3,7 +3,12 @@ using HTTP
 import OstreaCultura as OC
 
 # Load the fasttext embeddings and the fasttext model
-
+tmp_destination = tempname()
+# Decompress the fact check data
+OC.decompress_csv("data/fc_latest_maxi_compr", tmp_destination)
+
+#####
+const (fc_embed, fc) = OC.load_fasttext_embeddings(tmp_destination)
 const (nar_embed, nar) = OC.load_fasttext_embeddings("data/expansive_claims_library_expanded_embed_maxi.csv")
 
 @get "/greet" function(req::HTTP.Request)
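The server now inflates the LFS-tracked archive into a temporary file before loading the fact-check embeddings. A minimal sketch of the same startup pattern with the temporary file removed once loading has finished (the cleanup step is an assumption, not part of this commit):

    import OstreaCultura as OC

    tmp = tempname()                                     # OS-provided temporary path
    OC.decompress_csv("data/fc_latest_maxi_compr", tmp)  # inflate the gzip archive back to CSV
    fc_embed, fc = OC.load_fasttext_embeddings(tmp)      # load embeddings from the decompressed CSV
    rm(tmp; force=true)                                  # drop the ~340 MB temp file once the data is in memory (assumed safe here)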
src/OstreaCultura.jl CHANGED
@@ -5,6 +5,8 @@ module OstreaCultura
 
 using JSON3, Dates, Sqids, CSV, DataFrames, StatsBase, Distances, PyCall
 
+using CodecZlib
+
 import Pandas.DataFrame as pdataframe
 
 export MiniEncoder
@@ -20,6 +22,7 @@ export MiniEncoder
 include("py_init.jl")
 include("Embeddings.jl")
 include("PyPineCone.jl")
+include("compress.jl")
 #include("Models.jl")
 
 end
src/compress.jl ADDED
@@ -0,0 +1,39 @@
+#using CodecZlib
+#using CSV
+#using DataFrames
+
+function compress_csv(input_path::String, output_path::String=input_path * ".gz")
+    println("Compressing $input_path to $output_path...")
+
+    open(input_path, "r") do input_io
+        open(output_path, "w") do output_io
+            stream = GzipCompressorStream(output_io)
+            write(stream, read(input_io))
+            close(stream)
+        end
+    end
+
+    # Calculate compression ratio
+    original_size = filesize(input_path)
+    compressed_size = filesize(output_path)
+    ratio = (1 - compressed_size / original_size) * 100
+
+    println("Compression complete: $(round(original_size / 1024^2, digits=2)) MB → $(round(compressed_size / 1024^2, digits=2)) MB ($(round(ratio, digits=1))% reduction)")
+    return output_path
+end
+
+function decompress_csv(input_path::String, output_path::String)
+    println("Decompressing $input_path to $output_path...")
+
+    open(input_path, "r") do input_io
+        open(output_path, "w") do output_io
+            stream = GzipDecompressorStream(input_io)
+            write(output_io, read(stream))
+            close(stream)
+        end
+    end
+
+    println("Decompression complete!")
+    return output_path
+end
+
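For illustration, a round trip with the two new functions could look like the sketch below (hypothetical file names; gzip is lossless, so the restored bytes should match the original):

    import OstreaCultura as OC

    compressed = OC.compress_csv("data/example_embed.csv", "data/example_compr")  # hypothetical input/output paths
    restored   = OC.decompress_csv(compressed, tempname())

    @assert read("data/example_embed.csv") == read(restored)   # byte-for-byte identical after the round trip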