Spaces:

yazoniak
/

twitteremo-pl-classifier

Running

App Files Community

yazoniak committed on Dec 12, 2025

Commit

061ca04

verified ·

1 Parent(s): 7aee990

Update app.py

Browse files

Files changed (1) hide show

app.py +41 -16

app.py CHANGED Viewed

@@ -30,6 +30,7 @@ import spaces
 from datetime import datetime
 from datasets import Dataset, load_dataset
 from huggingface_hub import HfApi
 # Model configuration
@@ -99,6 +100,9 @@ class HFDatasetLogger:
         """
         Log a prediction to the HuggingFace dataset.
         Args:
             text: Input text
             mode: Prediction mode
@@ -108,8 +112,8 @@ class HFDatasetLogger:
             json_output: JSON output with scores
         """
         try:
-            # Prepare data entry
-            data_entry = {
                 "timestamp": datetime.utcnow().isoformat(),
                 "text": text,
                 "mode": mode,
@@ -117,44 +121,64 @@ class HFDatasetLogger:
                 "anonymize": bool(anonymize),
                 "predictions": predictions,
                 "json_output": json_output,
-            }
-            # Create dataset from single entry
-            new_data = Dataset.from_dict({k: [v] for k, v in data_entry.items()})
             if self.dataset_exists:
                 # Append to existing dataset
                 try:
                     existing_dataset = load_dataset(
-                        self.dataset_name, split="train", token=self.hf_token, download_mode="force_redownload"
                     )
-                    from datasets import concatenate_datasets
-                    combined_dataset = concatenate_datasets([existing_dataset, new_data])
                     combined_dataset.push_to_hub(
                         self.dataset_name,
                         token=self.hf_token,
                         private=self.private,
-                        commit_message=f"Add prediction at {datetime.utcnow().isoformat()}"
                     )
-                except Exception as e:
-                    print(f"⚠ Error appending to dataset: {e}")
-                    # Fall back to creating new dataset if append fails
-                    new_data.push_to_hub(
                         self.dataset_name,
                         token=self.hf_token,
                         private=self.private,
                     )
                     self.dataset_exists = True
             else:
                 # Create new dataset
-                new_data.push_to_hub(
-                    self.dataset_name, token=self.hf_token, private=self.private
                 )
                 self.dataset_exists = True
         except Exception as e:
             print(f"⚠ Error logging to HuggingFace dataset: {e}")
 def preprocess_text(text: str, anonymize_mentions: bool = True) -> str:
@@ -173,6 +197,7 @@ def preprocess_text(text: str, anonymize_mentions: bool = True) -> str:
     return text
 def load_model():
     """
     Load the model, tokenizer, and calibration artifacts.

 from datetime import datetime
 from datasets import Dataset, load_dataset
 from huggingface_hub import HfApi
+import pandas as pd
 # Model configuration
         """
         Log a prediction to the HuggingFace dataset.
+        Uses pandas DataFrame as intermediate format to ensure proper
+        parquet compatibility when appending to existing datasets.
         Args:
             text: Input text
             mode: Prediction mode
             json_output: JSON output with scores
         """
         try:
+            # Prepare new data entry as DataFrame
+            new_row = pd.DataFrame([{
                 "timestamp": datetime.utcnow().isoformat(),
                 "text": text,
                 "mode": mode,
                 "anonymize": bool(anonymize),
                 "predictions": predictions,
                 "json_output": json_output,
+            }])
             if self.dataset_exists:
                 # Append to existing dataset
                 try:
+                    # Download existing dataset and convert to pandas
                     existing_dataset = load_dataset(
+                        self.dataset_name,
+                        split="train",
+                        token=self.hf_token,
+                        download_mode="force_redownload",
                     )
+                    existing_df = existing_dataset.to_pandas()
+                    # Concatenate DataFrames
+                    combined_df = pd.concat([existing_df, new_row], ignore_index=True)
+                    # Convert back to Dataset and push
+                    combined_dataset = Dataset.from_pandas(combined_df)
                     combined_dataset.push_to_hub(
                         self.dataset_name,
                         token=self.hf_token,
                         private=self.private,
+                        commit_message=f"Add prediction at {datetime.utcnow().isoformat()}",
                     )
+                    print(f"✓ Appended prediction (total rows: {len(combined_df)})")
+                except FileNotFoundError:
+                    # Dataset doesn't exist yet despite our check - create it
+                    print("⚠ Dataset not found, creating new dataset")
+                    new_dataset = Dataset.from_pandas(new_row)
+                    new_dataset.push_to_hub(
                         self.dataset_name,
                         token=self.hf_token,
                         private=self.private,
                     )
                     self.dataset_exists = True
+                except Exception as e:
+                    # For any other error, DO NOT fall back to push_to_hub
+                    # as that would REPLACE the entire dataset with just the new entry!
+                    print(f"⚠ Error appending to dataset (data not saved): {e}")
+                    import traceback
+                    traceback.print_exc()
             else:
                 # Create new dataset
+                new_dataset = Dataset.from_pandas(new_row)
+                new_dataset.push_to_hub(
+                    self.dataset_name,
+                    token=self.hf_token,
+                    private=self.private,
                 )
                 self.dataset_exists = True
+                print("✓ Created new dataset with first prediction")
         except Exception as e:
             print(f"⚠ Error logging to HuggingFace dataset: {e}")
+            import traceback
+            traceback.print_exc()
 def preprocess_text(text: str, anonymize_mentions: bool = True) -> str:
     return text
+@spaces.GPU
 def load_model():
     """
     Load the model, tokenizer, and calibration artifacts.