Spaces:

PD03
/

RICA-AIRevenueIntelligenceAgent

Sleeping

App Files Files Community

PD03 committed on Aug 31, 2025

Commit

af504e3

verified ·

1 Parent(s): c829fa9

Create agent_tools/ml_tools.py

Browse files

Files changed (1) hide show

agent_tools/ml_tools.py +204 -0

agent_tools/ml_tools.py ADDED Viewed

	@@ -0,0 +1,204 @@

+"""
+ML Tools optimized for Hugging Face Spaces
+"""
+from smolagents import tool
+import joblib
+import pandas as pd
+import numpy as np
+import json
+from pathlib import Path
+from datetime import datetime
+import duckdb
+import streamlit as st
+# Global model cache for HF Spaces: maps a model name to the object returned by
+# joblib.load for that model, filled lazily by load_model_with_cache so a model
+# file that has been loaded once is not read from disk again.
+_model_cache = {}
+def load_model_with_cache(model_name: str = 'churn_model_v1'):
+    """Load model with HF Spaces caching"""
+    if model_name not in _model_cache:
+        model_path = Path(f'models/{model_name}.pkl')
+        if model_path.exists():
+            _model_cache[model_name] = joblib.load(model_path)
+        else:
+            return None
+    return _model_cache[model_name]
+@tool
+def predict_customer_churn_hf(customer_ids: str = None, risk_threshold: float = 0.6) -> str:
+    """
+    HF Spaces optimized churn prediction with performance constraints.
+    Args:
+        customer_ids: Comma-separated customer IDs (optional)
+        risk_threshold: Risk threshold for alerts (default 0.6)
+    Returns:
+        JSON with churn predictions optimized for HF Spaces
+    """
+    try:
+        # Load model
+        model_data = load_model_with_cache()
+        if model_data is None:
+            return json.dumps({"error": "Model not found. Please wait for training to complete."})
+        model = model_data['model']
+        label_encoders = model_data['label_encoders']
+        feature_columns = model_data['feature_columns']
+        # Load data with limits for HF Spaces performance
+        conn = duckdb.connect(':memory:')
+        conn.execute("""
+            CREATE TABLE customers AS
+            SELECT * FROM 'hf://datasets/SAP/SALT/I_Customer.parquet'
+            LIMIT 2000
+        """)  # Limit for performance
+        conn.execute("""
+            CREATE TABLE sales_docs AS
+            SELECT * FROM 'hf://datasets/SAP/SALT/I_SalesDocument.parquet'
+            LIMIT 5000
+        """)  # Limit for performance
+        # Filter customers if specified
+        if customer_ids:
+            customer_list = [f"'{cid.strip()}'" for cid in customer_ids.split(',')]
+            where_clause = f"WHERE c.Customer IN ({','.join(customer_list)})"
+        else:
+            where_clause = "LIMIT 500"  # Further limit for demo
+        # Get customer data
+        customer_data = conn.execute(f"""
+            SELECT
+                c.Customer,
+                c.CustomerName,
+                c.Country,
+                c.CustomerGroup,
+                COUNT(s.SalesDocument) as total_orders,
+                MAX(s.CreationDate) as last_order_date,
+                MIN(s.CreationDate) as first_order_date
+            FROM customers c
+            LEFT JOIN sales_docs s ON c.Customer = s.SoldToParty
+            {where_clause if not customer_ids else ""}
+            GROUP BY c.Customer, c.CustomerName, c.Country, c.CustomerGroup
+            {where_clause if customer_ids else ""}
+        """).df()
+        if len(customer_data) == 0:
+            return json.dumps({"error": "No customers found"})
+        # Feature engineering (same as training)
+        reference_date = pd.to_datetime('2024-12-31')
+        customer_data['last_order_date'] = pd.to_datetime(customer_data['last_order_date'])
+        customer_data['first_order_date'] = pd.to_datetime(customer_data['first_order_date'])
+        # RFM features
+        customer_data['Recency'] = (reference_date - customer_data['last_order_date']).dt.days
+        customer_data['Recency'] = customer_data['Recency'].fillna(365)
+        customer_data['Frequency'] = customer_data['total_orders'].fillna(0)
+        np.random.seed(42)
+        customer_data['Monetary'] = customer_data['Frequency'] * np.random.exponential(500, len(customer_data))
+        customer_data['Tenure'] = (reference_date - customer_data['first_order_date']).dt.days
+        customer_data['Tenure'] = customer_data['Tenure'].fillna(0)
+        customer_data['OrderVelocity'] = customer_data['Frequency'] / (customer_data['Tenure'] / 30 + 1)
+        # Encode categoricals
+        for col in ['Country', 'CustomerGroup']:
+            if col in label_encoders:
+                try:
+                    customer_data[f'{col}_encoded'] = label_encoders[col].transform(
+                        customer_data[col].fillna('Unknown')
+                    )
+                except:
+                    # Handle unseen categories
+                    customer_data[f'{col}_encoded'] = 0
+        # Make predictions
+        try:
+            X = customer_data[feature_columns].fillna(0)
+            predictions = model.predict(X)
+            probabilities = model.predict_proba(X)[:, 1]
+            # Results
+            results = customer_data.copy()
+            results['churn_probability'] = probabilities
+            results['risk_level'] = results['churn_probability'].apply(
+                lambda x: 'CRITICAL' if x > 0.8 else 'HIGH' if x > 0.6 else 'MEDIUM' if x > 0.4 else 'LOW'
+            )
+            # High risk customers
+            high_risk = results[results['churn_probability'] >= risk_threshold].sort_values(
+                'churn_probability', ascending=False
+            ).head(20)  # Limit results for HF Spaces
+            # Generate recommendations
+            recommendations = []
+            for _, customer in high_risk.iterrows():
+                recommendations.append({
+                    "customer_id": customer['Customer'],
+                    "customer_name": customer['CustomerName'],
+                    "churn_probability": round(float(customer['churn_probability']), 3),
+                    "risk_level": customer['risk_level'],
+                    "recommended_action": "Immediate contact" if customer['churn_probability'] > 0.8 else "Schedule follow-up",
+                    "days_since_order": int(customer['Recency']) if not pd.isna(customer['Recency']) else 0
+                })
+            return json.dumps({
+                "analysis_date": datetime.now().isoformat(),
+                "customers_analyzed": len(results),
+                "high_risk_count": len(high_risk),
+                "churn_rate_predicted": round(len(high_risk) / len(results) * 100, 2) if len(results) > 0 else 0,
+                "urgent_actions": recommendations,
+                "model_performance": f"Accuracy: {model_data.get('accuracy', 'N/A')}",
+                "hf_spaces_note": "Results limited for demo performance"
+            })
+        except Exception as e:
+            return json.dumps({"error": f"Prediction failed: {str(e)}"})
+    except Exception as e:
+        return json.dumps({
+            "error": f"Churn analysis failed: {str(e)}",
+            "suggestion": "Please ensure model is trained"
+        })
+@tool
+def get_model_status() -> str:
+    """
+    Get ML model status for HF Spaces.
+    Returns:
+        JSON with model information and health
+    """
+    try:
+        metadata_path = Path('models/model_metadata.json')
+        model_path = Path('models/churn_model_v1.pkl')
+        if metadata_path.exists() and model_path.exists():
+            with open(metadata_path, 'r') as f:
+                metadata = json.load(f)
+            return json.dumps({
+                "model_status": "Ready",
+                "model_info": metadata,
+                "files_present": {
+                    "model_file": model_path.exists(),
+                    "metadata_file": metadata_path.exists()
+                },
+                "recommendation": "Model is ready for predictions"
+            })
+        else:
+            return json.dumps({
+                "model_status": "Not Found",
+                "message": "Model will be trained automatically on first use",
+                "training_time": "Approximately 1-2 minutes"
+            })
+    except Exception as e:
+        return json.dumps({
+            "error": f"Status check failed: {str(e)}"
+        })