yazoniak committed on
Commit
061ca04
·
verified ·
1 Parent(s): 7aee990

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -16
app.py CHANGED
@@ -30,6 +30,7 @@ import spaces
30
  from datetime import datetime
31
  from datasets import Dataset, load_dataset
32
  from huggingface_hub import HfApi
 
33
 
34
 
35
  # Model configuration
@@ -99,6 +100,9 @@ class HFDatasetLogger:
99
  """
100
  Log a prediction to the HuggingFace dataset.
101
 
 
 
 
102
  Args:
103
  text: Input text
104
  mode: Prediction mode
@@ -108,8 +112,8 @@ class HFDatasetLogger:
108
  json_output: JSON output with scores
109
  """
110
  try:
111
- # Prepare data entry
112
- data_entry = {
113
  "timestamp": datetime.utcnow().isoformat(),
114
  "text": text,
115
  "mode": mode,
@@ -117,44 +121,64 @@ class HFDatasetLogger:
117
  "anonymize": bool(anonymize),
118
  "predictions": predictions,
119
  "json_output": json_output,
120
- }
121
-
122
- # Create dataset from single entry
123
- new_data = Dataset.from_dict({k: [v] for k, v in data_entry.items()})
124
 
125
  if self.dataset_exists:
126
  # Append to existing dataset
127
  try:
 
128
  existing_dataset = load_dataset(
129
- self.dataset_name, split="train", token=self.hf_token, download_mode="force_redownload"
 
 
 
130
  )
131
- from datasets import concatenate_datasets
 
 
 
132
 
133
- combined_dataset = concatenate_datasets([existing_dataset, new_data])
 
134
  combined_dataset.push_to_hub(
135
  self.dataset_name,
136
  token=self.hf_token,
137
  private=self.private,
138
- commit_message=f"Add prediction at {datetime.utcnow().isoformat()}"
139
  )
140
- except Exception as e:
141
- print(f"⚠ Error appending to dataset: {e}")
142
- # Fall back to creating new dataset if append fails
143
- new_data.push_to_hub(
 
 
 
144
  self.dataset_name,
145
  token=self.hf_token,
146
  private=self.private,
147
  )
148
  self.dataset_exists = True
 
 
 
 
 
 
149
  else:
150
  # Create new dataset
151
- new_data.push_to_hub(
152
- self.dataset_name, token=self.hf_token, private=self.private
 
 
 
153
  )
154
  self.dataset_exists = True
 
155
 
156
  except Exception as e:
157
  print(f"⚠ Error logging to HuggingFace dataset: {e}")
 
 
158
 
159
 
160
  def preprocess_text(text: str, anonymize_mentions: bool = True) -> str:
@@ -173,6 +197,7 @@ def preprocess_text(text: str, anonymize_mentions: bool = True) -> str:
173
  return text
174
 
175
 
 
176
  def load_model():
177
  """
178
  Load the model, tokenizer, and calibration artifacts.
 
30
  from datetime import datetime
31
  from datasets import Dataset, load_dataset
32
  from huggingface_hub import HfApi
33
+ import pandas as pd
34
 
35
 
36
  # Model configuration
 
100
  """
101
  Log a prediction to the HuggingFace dataset.
102
 
103
+ Uses pandas DataFrame as intermediate format to ensure proper
104
+ parquet compatibility when appending to existing datasets.
105
+
106
  Args:
107
  text: Input text
108
  mode: Prediction mode
 
112
  json_output: JSON output with scores
113
  """
114
  try:
115
+ # Prepare new data entry as DataFrame
116
+ new_row = pd.DataFrame([{
117
  "timestamp": datetime.utcnow().isoformat(),
118
  "text": text,
119
  "mode": mode,
 
121
  "anonymize": bool(anonymize),
122
  "predictions": predictions,
123
  "json_output": json_output,
124
+ }])
 
 
 
125
 
126
  if self.dataset_exists:
127
  # Append to existing dataset
128
  try:
129
+ # Download existing dataset and convert to pandas
130
  existing_dataset = load_dataset(
131
+ self.dataset_name,
132
+ split="train",
133
+ token=self.hf_token,
134
+ download_mode="force_redownload",
135
  )
136
+ existing_df = existing_dataset.to_pandas()
137
+
138
+ # Concatenate DataFrames
139
+ combined_df = pd.concat([existing_df, new_row], ignore_index=True)
140
 
141
+ # Convert back to Dataset and push
142
+ combined_dataset = Dataset.from_pandas(combined_df)
143
  combined_dataset.push_to_hub(
144
  self.dataset_name,
145
  token=self.hf_token,
146
  private=self.private,
147
+ commit_message=f"Add prediction at {datetime.utcnow().isoformat()}",
148
  )
149
+ print(f"✓ Appended prediction (total rows: {len(combined_df)})")
150
+
151
+ except FileNotFoundError:
152
+ # Dataset doesn't exist yet despite our check - create it
153
+ print("⚠ Dataset not found, creating new dataset")
154
+ new_dataset = Dataset.from_pandas(new_row)
155
+ new_dataset.push_to_hub(
156
  self.dataset_name,
157
  token=self.hf_token,
158
  private=self.private,
159
  )
160
  self.dataset_exists = True
161
+ except Exception as e:
162
+ # For any other error, DO NOT fall back to push_to_hub
163
+ # as that would REPLACE the entire dataset with just the new entry!
164
+ print(f"⚠ Error appending to dataset (data not saved): {e}")
165
+ import traceback
166
+ traceback.print_exc()
167
  else:
168
  # Create new dataset
169
+ new_dataset = Dataset.from_pandas(new_row)
170
+ new_dataset.push_to_hub(
171
+ self.dataset_name,
172
+ token=self.hf_token,
173
+ private=self.private,
174
  )
175
  self.dataset_exists = True
176
+ print("✓ Created new dataset with first prediction")
177
 
178
  except Exception as e:
179
  print(f"⚠ Error logging to HuggingFace dataset: {e}")
180
+ import traceback
181
+ traceback.print_exc()
182
 
183
 
184
  def preprocess_text(text: str, anonymize_mentions: bool = True) -> str:
 
197
  return text
198
 
199
 
200
+ @spaces.GPU
201
  def load_model():
202
  """
203
  Load the model, tokenizer, and calibration artifacts.