""""
ANOMALY DETECTION PIPELINE FOR FAKE NEWS DETECTION
===================================================
Better suited for data on a continuum (low clustering silhouette).
Uses multiple anomaly detection methods to find suspicious posts.

Outputs:
- anomaly_assignments.csv (same format as clustering)
- anomaly_analysis.png (visualizations)
- suspicious_users.csv (enhanced user profiling)
"""

import torch
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.covariance import EllipticEnvelope
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings

warnings.filterwarnings('ignore')

device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")

print("="*80)
print("ANOMALY DETECTION PIPELINE FOR FAKE NEWS DETECTION")
print("="*80)

# ============================================================================
# STEP 1: LOAD DATA
# ============================================================================

print("\n[STEP 1] Loading data...")

try:
    prepared_data = torch.load("prepared_clustering_data.pt", map_location=device, weights_only=False)
    z_out = prepared_data['z_out'].cpu().numpy()
    v_mismatch = prepared_data['v_mismatch'].cpu().numpy()
    user_ids = prepared_data['user_ids']
    timestamps = prepared_data['timestamps']
    post_ids = prepared_data['post_ids']
    
    print(f"✅ Loaded: {len(z_out)} posts")
except:
    print("⚠️ Could not load prepared data")
    exit(1)
    
# Load ground truth labels
print("Loading ground truth labels...")
df_labels = pd.read_pickle("Dataset/twitter/df_preprocessed_with_scores.pkl")
df_labels = df_labels.drop_duplicates(subset='post_id', keep='first').reset_index(drop=True)

# Align labels to post_ids in prepared_data
post_id_to_label = dict(zip(df_labels['post_id'], df_labels['label']))
labels_aligned = np.array([
    post_id_to_label.get(pid, 'unknown') for pid in post_ids
])

real_mask = labels_aligned == 'real'
print(f"✅ Real posts: {real_mask.sum()} | Fake posts: {(~real_mask).sum()}")

# ============================================================================
# STEP 2: FEATURE ENGINEERING
# ============================================================================

print("\n[STEP 2] Advanced feature engineering...")

# Mismatch features
mismatch_magnitude = np.linalg.norm(v_mismatch, axis=1, keepdims=True)

# Statistical features
z_abs = np.abs(z_out)
z_abs_normalized = z_abs / (z_abs.sum(axis=1, keepdims=True) + 1e-10)
z_entropy = -np.sum(z_abs_normalized * np.log(z_abs_normalized + 1e-10), axis=1, keepdims=True)
z_variance = np.var(z_out, axis=1, keepdims=True)
z_skewness = np.mean((z_out - z_out.mean(axis=1, keepdims=True))**3, axis=1, keepdims=True)
z_kurtosis = np.mean((z_out - z_out.mean(axis=1, keepdims=True))**4, axis=1, keepdims=True)

# Cross-modal features
z_v_dot = np.sum(z_out * v_mismatch, axis=1, keepdims=True)
z_v_cosine = z_v_dot / (np.linalg.norm(z_out, axis=1, keepdims=True) * 
                         np.linalg.norm(v_mismatch, axis=1, keepdims=True) + 1e-10)

# Concatenate
X_engineered = np.hstack([
    z_out,                    # 128-dim
    v_mismatch,               # 128-dim
    mismatch_magnitude,       # 1-dim
    z_entropy,                # 1-dim
    z_variance,               # 1-dim
    z_skewness,               # 1-dim
    z_kurtosis,               # 1-dim
    z_v_cosine                # 1-dim
])

print(f"✅ Feature vector: {X_engineered.shape}")

# ============================================================================
# STEP 3: PREPROCESSING
# ============================================================================

print("\n[STEP 3] Scaling & dimensionality reduction...")

# Robust scaling
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X_engineered)

# PCA (95% variance)
pca = PCA(n_components=0.95, random_state=42)
X_reduced = pca.fit_transform(X_scaled)

print(f"✅ Reduced: {X_scaled.shape[1]} → {X_reduced.shape[1]} dimensions")
print(f"   Explained variance: {pca.explained_variance_ratio_.sum():.1%}")

# ============================================================================
# STEP 4: MULTIPLE ANOMALY DETECTION METHODS
# ============================================================================

print("\n[STEP 4] Running multiple anomaly detection algorithms...")

# ============================================================================
# STEP 4: MULTIPLE ANOMALY DETECTION METHODS (fit on real posts only)
# ============================================================================

print("\n[STEP 4] Running multiple anomaly detection algorithms...")
print(f"   Fitting on {real_mask.sum()} real posts, scoring all {len(X_reduced)} posts...")

anomaly_scores = {}

# 4.1 Isolation Forest
print("   [4.1] Isolation Forest...")
iso_forest = IsolationForest(
    contamination=0.1,
    random_state=42,
    n_estimators=200,
    n_jobs=-1
)
iso_forest.fit(X_reduced[real_mask])
iso_scores = -iso_forest.score_samples(X_reduced)
iso_labels = iso_forest.predict(X_reduced)
anomaly_scores['isolation_forest'] = iso_scores
print(f"   ✅ Detected {(iso_labels == -1).sum()} anomalies ({(iso_labels == -1).sum()/len(X_reduced)*100:.1f}%)")

# 4.2 Local Outlier Factor
print("   [4.2] Local Outlier Factor...")
lof = LocalOutlierFactor(
    n_neighbors=20,
    contamination=0.1,
    novelty=True
)
lof.fit(X_reduced[real_mask])
lof_labels = lof.predict(X_reduced)
lof_scores = -lof.score_samples(X_reduced)
anomaly_scores['lof'] = lof_scores
print(f"   ✅ Detected {(lof_labels == -1).sum()} anomalies ({(lof_labels == -1).sum()/len(X_reduced)*100:.1f}%)")

# 4.3 One-Class SVM
print("   [4.3] One-Class SVM...")
ocsvm = OneClassSVM(kernel='rbf', gamma='auto', nu=0.1)
ocsvm.fit(X_reduced[real_mask])
ocsvm_labels = ocsvm.predict(X_reduced)
# OCSVM score_samples returns all-negative values where
# more negative = more anomalous (further outside boundary)
# So we negate to flip: high positive = more anomalous
# But since all values are negative, raw = -score_samples is also negative.
# Instead: anomaly = -score_samples, then shift to positive range
ocsvm_raw = -ocsvm.score_samples(X_reduced)  # all negative
ocsvm_shift = float(ocsvm_raw.min())         # save this offset
ocsvm_scores = ocsvm_raw - ocsvm_shift       # shift to [0, range]
anomaly_scores['ocsvm'] = ocsvm_scores
print(f"   ✅ Detected {(ocsvm_labels == -1).sum()} anomalies ({(ocsvm_labels == -1).sum()/len(X_reduced)*100:.1f}%)")

# 4.4 Elliptic Envelope
print("   [4.4] Elliptic Envelope (Robust Covariance)...")
elliptic = EllipticEnvelope(contamination=0.1, random_state=42)
elliptic.fit(X_reduced[real_mask])
elliptic_labels = elliptic.predict(X_reduced)
elliptic_scores = -elliptic.score_samples(X_reduced)
anomaly_scores['elliptic'] = elliptic_scores
print(f"   ✅ Detected {(elliptic_labels == -1).sum()} anomalies ({(elliptic_labels == -1).sum()/len(X_reduced)*100:.1f}%)")

# ============================================================================
# STEP 5: ENSEMBLE ANOMALY SCORING
# ============================================================================

print("\n[STEP 5] Creating ensemble anomaly scores...")

# Normalize all scores to [0, 1]
anomaly_scores_normalized = {}
for method, scores in anomaly_scores.items():
    min_score = scores.min()
    max_score = scores.max()
    normalized = (scores - min_score) / (max_score - min_score + 1e-10)
    anomaly_scores_normalized[method] = normalized

# Ensemble: weighted average
weights = {
    'isolation_forest': 0.35,  # Best for high-dimensional data
    'lof': 0.30,               # Good for local anomalies
    'ocsvm': 0.20,             # Global view
    'elliptic': 0.15           # Statistical baseline
}

ensemble_score = np.zeros(len(X_reduced))
for method, weight in weights.items():
    ensemble_score += weight * anomaly_scores_normalized[method]

# Define anomaly levels based on ensemble score
anomaly_percentiles = np.percentile(ensemble_score, [75, 90, 95, 99])

def get_anomaly_level(score):
    if score >= anomaly_percentiles[3]:
        return 'critical'
    elif score >= anomaly_percentiles[2]:
        return 'high'
    elif score >= anomaly_percentiles[1]:
        return 'medium'
    elif score >= anomaly_percentiles[0]:
        return 'low'
    else:
        return 'normal'

anomaly_levels = np.array([get_anomaly_level(s) for s in ensemble_score])

# Convert to numeric labels for compatibility with clustering output format
level_to_id = {'normal': 0, 'low': 1, 'medium': 2, 'high': 3, 'critical': 4}
anomaly_labels = np.array([level_to_id[level] for level in anomaly_levels])

print(f"✅ Ensemble scoring complete")
print(f"\n   Anomaly Distribution:")
for level in ['normal', 'low', 'medium', 'high', 'critical']:
    count = (anomaly_levels == level).sum()
    pct = count / len(anomaly_levels) * 100
    print(f"      {level:8s}: {count:5d} posts ({pct:5.1f}%)")


# ============================================================================
# STEP 7: USER-LEVEL BEHAVIORAL ANALYSIS
# ============================================================================

print("\n[STEP 7] Enhanced user behavior analysis...")

# Create results dataframe
results_df = pd.DataFrame({
    'post_id': post_ids,
    'user_id': user_ids,
    'timestamp': timestamps,
    'anomaly_score': ensemble_score,
    'anomaly_level': anomaly_levels,
    'anomaly_label': anomaly_labels,
    'iso_forest_score': anomaly_scores_normalized['isolation_forest'],
    'lof_score': anomaly_scores_normalized['lof'],
    'ocsvm_score': anomaly_scores_normalized['ocsvm'],
    'elliptic_score': anomaly_scores_normalized['elliptic']
})

user_behavior = {}

for user_id in tqdm(set(results_df['user_id']), desc="Analyzing users"):
    user_posts = results_df[results_df['user_id'] == user_id]
    n_posts = len(user_posts)
    
    # Anomaly statistics
    mean_anomaly_score = user_posts['anomaly_score'].mean()
    max_anomaly_score = user_posts['anomaly_score'].max()
    std_anomaly_score = user_posts['anomaly_score'].std()
    
    # Count by severity
    n_critical = (user_posts['anomaly_level'] == 'critical').sum()
    n_high = (user_posts['anomaly_level'] == 'high').sum()
    n_medium = (user_posts['anomaly_level'] == 'medium').sum()
    n_low = (user_posts['anomaly_level'] == 'low').sum()
    n_normal = (user_posts['anomaly_level'] == 'normal').sum()
    
    # Calculate risk score (0-100)
    risk_score = (
        n_critical * 10 +
        n_high * 5 +
        n_medium * 2 +
        n_low * 0.5
    ) / max(n_posts, 1) * 10 * min(n_posts/ 5, 1.0)
    
    risk_score = min(risk_score, 100)  # Cap at 100
    
    # Determine suspicion level
    is_suspicious = (
        (n_critical >= 2) or
        (n_high >= 3) or
        (mean_anomaly_score > anomaly_percentiles[2]) or
        (risk_score > 20)
    )
    
    # Risk category
    if risk_score >= 35:
        risk_category = 'critical'
    elif risk_score >= 20:
        risk_category = 'high'
    elif risk_score >= 10:
        risk_category = 'medium'
    else:
        risk_category = 'low'
    
    user_behavior[user_id] = {
        'n_posts': int(n_posts),
        'mean_anomaly_score': float(mean_anomaly_score),
        'max_anomaly_score': float(max_anomaly_score),
        'std_anomaly_score': float(std_anomaly_score),
        'n_critical': int(n_critical),
        'n_high': int(n_high),
        'n_medium': int(n_medium),
        'n_low': int(n_low),
        'n_normal': int(n_normal),
        'risk_score': float(risk_score),
        'risk_category': risk_category,
        'is_suspicious': bool(is_suspicious)
    }

suspicious_users = {
    uid: behavior for uid, behavior in user_behavior.items()
    if behavior['is_suspicious']
}

print(f"\n✅ User Analysis Complete:")
print(f"   Total users: {len(user_behavior)}")
print(f"   Suspicious users: {len(suspicious_users)} ({len(suspicious_users)/len(user_behavior)*100:.1f}%)")

if suspicious_users:
    print(f"\n   Top 10 Most Suspicious Users:")
    for user_id, behavior in sorted(suspicious_users.items(), 
                                    key=lambda x: x[1]['risk_score'], 
                                    reverse=True)[:10]:
        print(f"      {user_id}: risk={behavior['risk_score']:.1f}/100 | "
              f"posts={behavior['n_posts']} | "
              f"critical={behavior['n_critical']} high={behavior['n_high']} medium={behavior['n_medium']}")

# ============================================================================
# STEP 8: SAVE RESULTS
# ============================================================================

print("\n[STEP 8] Saving results...")

output_dir = Path("anomaly_detection_results")
output_dir.mkdir(exist_ok=True)

# Save main results (compatible with clustering format)
results_df.to_csv(output_dir / "anomaly_assignments.csv", index=False)

# Save user analysis
user_df = pd.DataFrame.from_dict(user_behavior, orient='index')
user_df.index.name = 'user_id'
user_df = user_df.sort_values('risk_score', ascending=False)
user_df.to_csv(output_dir / "user_risk_analysis.csv")

# Save suspicious users separately
if suspicious_users:
    suspicious_df = pd.DataFrame.from_dict(suspicious_users, orient='index')
    suspicious_df.index.name = 'user_id'
    suspicious_df = suspicious_df.sort_values('risk_score', ascending=False)
    suspicious_df.to_csv(output_dir / "suspicious_users.csv")

# Save per-method score distributions for correct inference normalization
method_score_distributions = {}
for name, (scores_raw, scores_norm) in [
    ('isolation_forest', (anomaly_scores['isolation_forest'], anomaly_scores_normalized['isolation_forest'])),
    ('lof',              (anomaly_scores['lof'],              anomaly_scores_normalized['lof'])),
    ('ocsvm',            (anomaly_scores['ocsvm'],            anomaly_scores_normalized['ocsvm'])),
    ('elliptic',         (anomaly_scores['elliptic'],         anomaly_scores_normalized['elliptic'])),
]:
    method_score_distributions[name] = {
        'min':  float(scores_raw.min()),
        'max':  float(scores_raw.max()),
        'mean': float(scores_raw.mean()),
        'std':  float(scores_raw.std()),
        'p25':  float(np.percentile(scores_raw, 25)),
        'p75':  float(np.percentile(scores_raw, 75)),
        'p95':  float(np.percentile(scores_raw, 95)),
    }
    print(f"  {name}: raw range=[{scores_raw.min():.4f}, {scores_raw.max():.4f}]")

ensemble_score_distribution = {
    'min':  float(ensemble_score.min()),
    'max':  float(ensemble_score.max()),
    'mean': float(ensemble_score.mean()),
    'std':  float(ensemble_score.std()),
    'p25':  float(np.percentile(ensemble_score, 25)),
    'p50':  float(np.percentile(ensemble_score, 50)),
    'p75':  float(np.percentile(ensemble_score, 75)),
    'p90':  float(np.percentile(ensemble_score, 90)),
    'p95':  float(np.percentile(ensemble_score, 95)),
    'p99':  float(np.percentile(ensemble_score, 99)),
}

torch.save({
    'scaler': scaler,
    'pca': pca,
    
    'models': {
        'isolation_forest': iso_forest,
        'lof': lof,
        'ocsvm': ocsvm,
        'elliptic': elliptic
    },
    'ensemble_weights': weights,
    'anomaly_percentiles': anomaly_percentiles,
    'method_score_distributions': method_score_distributions,
    'ensemble_score_distribution': ensemble_score_distribution,
    'X_reduced': X_reduced,
    'ocsvm_shift': ocsvm_shift
}, output_dir / "anomaly_models.pt")

print(f"\n✅ Saved with score distributions")
print(f"   Ensemble: p25={ensemble_score_distribution['p25']:.4f} "
      f"p50={ensemble_score_distribution['p50']:.4f} "
      f"p75={ensemble_score_distribution['p75']:.4f} "
      f"p95={ensemble_score_distribution['p95']:.4f}")

print(f"✅ Saved results to {output_dir}/")

# ============================================================================
# STEP 9: COMPREHENSIVE VISUALIZATION
# ============================================================================

print("\n[STEP 9] Creating visualizations...")

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle('Anomaly Detection Analysis', fontsize=16, fontweight='bold')

# 1. Anomaly score distribution
axes[0, 0].hist(ensemble_score, bins=50, color='coral', edgecolor='black', alpha=0.7)
for i, pct in enumerate([75, 90, 95, 99]):
    axes[0, 0].axvline(anomaly_percentiles[i], color='red', linestyle='--',
                       linewidth=1.5, alpha=0.7, label=f'{pct}th percentile')
axes[0, 0].set_title('Anomaly Score Distribution', fontweight='bold')
axes[0, 0].set_xlabel('Ensemble Anomaly Score')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].legend(fontsize=8)

# 2. Method comparison
method_counts = {
    'IsoForest': (iso_labels == -1).sum(),
    'LOF': (lof_labels == -1).sum(),
    'OCSVM': (ocsvm_labels == -1).sum(),
    'Elliptic': (elliptic_labels == -1).sum()
}
axes[0, 1].bar(method_counts.keys(), method_counts.values(),
               color=['#3498db', '#e74c3c', '#2ecc71', '#f39c12'], edgecolor='black')
axes[0, 1].set_title('Anomalies Detected by Method', fontweight='bold')
axes[0, 1].set_ylabel('Number of Anomalies')
axes[0, 1].tick_params(axis='x', rotation=45)

# 3. Severity distribution
level_counts = {level: (anomaly_levels == level).sum()
                for level in ['normal', 'low', 'medium', 'high', 'critical']}
colors_pie = ['#2ecc71', '#f1c40f', '#e67e22', '#e74c3c', '#8e44ad']
axes[1, 0].pie(level_counts.values(), labels=level_counts.keys(), autopct='%1.1f%%',
               colors=colors_pie, startangle=90)
axes[1, 0].set_title('Anomaly Severity Distribution', fontweight='bold')

# 4. User risk distribution
user_risk_counts = {}
for b in user_behavior.values():
    risk = b['risk_category']
    user_risk_counts[risk] = user_risk_counts.get(risk, 0) + 1

risk_order = ['low', 'medium', 'high', 'critical']
risk_colors = ['#2ecc71', '#f39c12', '#e67e22', '#e74c3c']
risk_values = [user_risk_counts.get(r, 0) for r in risk_order]
axes[1, 1].bar(risk_order, risk_values, color=risk_colors, edgecolor='black')
axes[1, 1].set_title('User Risk Categories', fontweight='bold')
axes[1, 1].set_ylabel('Number of Users')
axes[1, 1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig(output_dir / "anomaly_analysis.png", dpi=150, bbox_inches='tight')
print(f"✅ Saved visualization")