Update app.py
Browse files
app.py
CHANGED
|
@@ -30,6 +30,7 @@ import spaces
|
|
| 30 |
from datetime import datetime
|
| 31 |
from datasets import Dataset, load_dataset
|
| 32 |
from huggingface_hub import HfApi
|
|
|
|
| 33 |
|
| 34 |
|
| 35 |
# Model configuration
|
|
@@ -99,6 +100,9 @@ class HFDatasetLogger:
|
|
| 99 |
"""
|
| 100 |
Log a prediction to the HuggingFace dataset.
|
| 101 |
|
|
|
|
|
|
|
|
|
|
| 102 |
Args:
|
| 103 |
text: Input text
|
| 104 |
mode: Prediction mode
|
|
@@ -108,8 +112,8 @@ class HFDatasetLogger:
|
|
| 108 |
json_output: JSON output with scores
|
| 109 |
"""
|
| 110 |
try:
|
| 111 |
-
# Prepare data entry
|
| 112 |
-
|
| 113 |
"timestamp": datetime.utcnow().isoformat(),
|
| 114 |
"text": text,
|
| 115 |
"mode": mode,
|
|
@@ -117,44 +121,64 @@ class HFDatasetLogger:
|
|
| 117 |
"anonymize": bool(anonymize),
|
| 118 |
"predictions": predictions,
|
| 119 |
"json_output": json_output,
|
| 120 |
-
}
|
| 121 |
-
|
| 122 |
-
# Create dataset from single entry
|
| 123 |
-
new_data = Dataset.from_dict({k: [v] for k, v in data_entry.items()})
|
| 124 |
|
| 125 |
if self.dataset_exists:
|
| 126 |
# Append to existing dataset
|
| 127 |
try:
|
|
|
|
| 128 |
existing_dataset = load_dataset(
|
| 129 |
-
self.dataset_name,
|
|
|
|
|
|
|
|
|
|
| 130 |
)
|
| 131 |
-
|
|
|
|
|
|
|
|
|
|
| 132 |
|
| 133 |
-
|
|
|
|
| 134 |
combined_dataset.push_to_hub(
|
| 135 |
self.dataset_name,
|
| 136 |
token=self.hf_token,
|
| 137 |
private=self.private,
|
| 138 |
-
commit_message=f"Add prediction at {datetime.utcnow().isoformat()}"
|
| 139 |
)
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
|
|
|
|
|
|
|
|
|
| 144 |
self.dataset_name,
|
| 145 |
token=self.hf_token,
|
| 146 |
private=self.private,
|
| 147 |
)
|
| 148 |
self.dataset_exists = True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
else:
|
| 150 |
# Create new dataset
|
| 151 |
-
|
| 152 |
-
|
|
|
|
|
|
|
|
|
|
| 153 |
)
|
| 154 |
self.dataset_exists = True
|
|
|
|
| 155 |
|
| 156 |
except Exception as e:
|
| 157 |
print(f"⚠ Error logging to HuggingFace dataset: {e}")
|
|
|
|
|
|
|
| 158 |
|
| 159 |
|
| 160 |
def preprocess_text(text: str, anonymize_mentions: bool = True) -> str:
|
|
@@ -173,6 +197,7 @@ def preprocess_text(text: str, anonymize_mentions: bool = True) -> str:
|
|
| 173 |
return text
|
| 174 |
|
| 175 |
|
|
|
|
| 176 |
def load_model():
|
| 177 |
"""
|
| 178 |
Load the model, tokenizer, and calibration artifacts.
|
|
|
|
| 30 |
from datetime import datetime
|
| 31 |
from datasets import Dataset, load_dataset
|
| 32 |
from huggingface_hub import HfApi
|
| 33 |
+
import pandas as pd
|
| 34 |
|
| 35 |
|
| 36 |
# Model configuration
|
|
|
|
| 100 |
"""
|
| 101 |
Log a prediction to the HuggingFace dataset.
|
| 102 |
|
| 103 |
+
Uses pandas DataFrame as intermediate format to ensure proper
|
| 104 |
+
parquet compatibility when appending to existing datasets.
|
| 105 |
+
|
| 106 |
Args:
|
| 107 |
text: Input text
|
| 108 |
mode: Prediction mode
|
|
|
|
| 112 |
json_output: JSON output with scores
|
| 113 |
"""
|
| 114 |
try:
|
| 115 |
+
# Prepare new data entry as DataFrame
|
| 116 |
+
new_row = pd.DataFrame([{
|
| 117 |
"timestamp": datetime.utcnow().isoformat(),
|
| 118 |
"text": text,
|
| 119 |
"mode": mode,
|
|
|
|
| 121 |
"anonymize": bool(anonymize),
|
| 122 |
"predictions": predictions,
|
| 123 |
"json_output": json_output,
|
| 124 |
+
}])
|
|
|
|
|
|
|
|
|
|
| 125 |
|
| 126 |
if self.dataset_exists:
|
| 127 |
# Append to existing dataset
|
| 128 |
try:
|
| 129 |
+
# Download existing dataset and convert to pandas
|
| 130 |
existing_dataset = load_dataset(
|
| 131 |
+
self.dataset_name,
|
| 132 |
+
split="train",
|
| 133 |
+
token=self.hf_token,
|
| 134 |
+
download_mode="force_redownload",
|
| 135 |
)
|
| 136 |
+
existing_df = existing_dataset.to_pandas()
|
| 137 |
+
|
| 138 |
+
# Concatenate DataFrames
|
| 139 |
+
combined_df = pd.concat([existing_df, new_row], ignore_index=True)
|
| 140 |
|
| 141 |
+
# Convert back to Dataset and push
|
| 142 |
+
combined_dataset = Dataset.from_pandas(combined_df)
|
| 143 |
combined_dataset.push_to_hub(
|
| 144 |
self.dataset_name,
|
| 145 |
token=self.hf_token,
|
| 146 |
private=self.private,
|
| 147 |
+
commit_message=f"Add prediction at {datetime.utcnow().isoformat()}",
|
| 148 |
)
|
| 149 |
+
print(f"✓ Appended prediction (total rows: {len(combined_df)})")
|
| 150 |
+
|
| 151 |
+
except FileNotFoundError:
|
| 152 |
+
# Dataset doesn't exist yet despite our check - create it
|
| 153 |
+
print("⚠ Dataset not found, creating new dataset")
|
| 154 |
+
new_dataset = Dataset.from_pandas(new_row)
|
| 155 |
+
new_dataset.push_to_hub(
|
| 156 |
self.dataset_name,
|
| 157 |
token=self.hf_token,
|
| 158 |
private=self.private,
|
| 159 |
)
|
| 160 |
self.dataset_exists = True
|
| 161 |
+
except Exception as e:
|
| 162 |
+
# For any other error, DO NOT fall back to push_to_hub
|
| 163 |
+
# as that would REPLACE the entire dataset with just the new entry!
|
| 164 |
+
print(f"⚠ Error appending to dataset (data not saved): {e}")
|
| 165 |
+
import traceback
|
| 166 |
+
traceback.print_exc()
|
| 167 |
else:
|
| 168 |
# Create new dataset
|
| 169 |
+
new_dataset = Dataset.from_pandas(new_row)
|
| 170 |
+
new_dataset.push_to_hub(
|
| 171 |
+
self.dataset_name,
|
| 172 |
+
token=self.hf_token,
|
| 173 |
+
private=self.private,
|
| 174 |
)
|
| 175 |
self.dataset_exists = True
|
| 176 |
+
print("✓ Created new dataset with first prediction")
|
| 177 |
|
| 178 |
except Exception as e:
|
| 179 |
print(f"⚠ Error logging to HuggingFace dataset: {e}")
|
| 180 |
+
import traceback
|
| 181 |
+
traceback.print_exc()
|
| 182 |
|
| 183 |
|
| 184 |
def preprocess_text(text: str, anonymize_mentions: bool = True) -> str:
|
|
|
|
| 197 |
return text
|
| 198 |
|
| 199 |
|
| 200 |
+
@spaces.GPU
|
| 201 |
def load_model():
|
| 202 |
"""
|
| 203 |
Load the model, tokenizer, and calibration artifacts.
|