"""Generate two small synthetic CSV datasets used for pipeline testing.

Dataset 1 ("dirty" data) contains missing values, outliers and a redundant
column. Dataset 2 ("imbalanced" data) contains a 90/10 class split plus a
one-hot encoded categorical feature.
"""

from pathlib import Path

import numpy as np
import pandas as pd

# Default destination; identical to the location the script has always used.
DEFAULT_OUTPUT_DIR = Path(r'C:\Users\muham\Desktop\AutoDS-Hackathon')
ANOMALIES_FILENAME = 'complex_anomalies.csv'
IMBALANCED_FILENAME = 'imbalanced_mission.csv'


def build_anomaly_frame():
    """Return the 10-row "dirty" dataset.

    The frame is meant to exercise missing-value, outlier and redundancy
    detection:

    * ``Age`` has two NaNs and one outlier (150).
    * ``Savings`` is exactly ``Income / 10`` (perfectly correlated, redundant).
    * ``Credit_Score`` has one NaN and one low outlier (300).

    Returns:
        pandas.DataFrame with columns ``Age``, ``Income``, ``Savings``,
        ``Credit_Score`` and ``Target``.
    """
    return pd.DataFrame({
        'Age': [25, 30, 22, np.nan, 28, 150, 35, 40, np.nan, 32],
        'Income': [50000, 60000, 45000, 52000, 58000,
                   100000, 70000, 80000, 48000, 62000],
        'Savings': [5000, 6000, 4500, 5200, 5800,
                    10000, 7000, 8000, 4800, 6200],
        'Credit_Score': [700, 720, np.nan, 680, 710, 300, 750, 800, 690, 730],
        'Target': [1, 1, 0, 1, 0, 0, 1, 1, 0, 1],
    })


def build_imbalanced_frame(n_rows=200, seed=42, minority_fraction=0.1):
    """Return the class-imbalanced dataset with ``Feature_C`` one-hot encoded.

    Args:
        n_rows: Number of rows to generate.
        seed: Seed for the random feature values.
        minority_fraction: Share of rows labelled ``0``; the rest are ``1``.

    Returns:
        pandas.DataFrame with ``Feature_A``, ``Feature_B``, ``target`` and one
        dummy column per ``Feature_C`` category that occurs in the sample.

    Raises:
        ValueError: If ``n_rows`` is negative or ``minority_fraction`` is
            outside ``[0, 1]``.
    """
    if n_rows < 0:
        raise ValueError(f"n_rows must be non-negative, got {n_rows}")
    if not 0 <= minority_fraction <= 1:
        raise ValueError(
            f"minority_fraction must be within [0, 1], got {minority_fraction}"
        )

    # A private legacy generator yields the same stream as
    # np.random.seed(seed) followed by np.random.* calls, but leaves the
    # process-wide RNG state untouched.
    rng = np.random.RandomState(seed)

    # Derive the class counts from n_rows so the label column always matches
    # the feature columns in length.
    n_minority = int(round(n_rows * minority_fraction))
    n_majority = n_rows - n_minority

    # The draw order (A, B, C) is kept fixed so the values are reproducible.
    frame = pd.DataFrame({
        'Feature_A': rng.randn(n_rows),
        'Feature_B': rng.rand(n_rows) * 100,
        'Feature_C': rng.choice(['Alpha', 'Beta', 'Gamma'], n_rows),
        # NOTE: labels are laid out as a run of 1s followed by a run of 0s and
        # are generated independently of the feature values.
        'target': [1] * n_majority + [0] * n_minority,
    })

    # One-hot encode Feature_C for training.
    return pd.get_dummies(frame, columns=['Feature_C'])


def write_datasets(output_dir=DEFAULT_OUTPUT_DIR):
    """Write both datasets as CSV files into ``output_dir``.

    The directory (and any missing parents) is created first so the export
    does not fail when the folder is absent.

    Args:
        output_dir: Destination directory for the two CSV files.

    Returns:
        Tuple ``(anomalies_path, imbalanced_path)`` of the files written.
    """
    destination = Path(output_dir)
    destination.mkdir(parents=True, exist_ok=True)

    anomalies_path = destination / ANOMALIES_FILENAME
    imbalanced_path = destination / IMBALANCED_FILENAME

    build_anomaly_frame().to_csv(anomalies_path, index=False)
    build_imbalanced_frame().to_csv(imbalanced_path, index=False)
    return anomalies_path, imbalanced_path


def main():
    """Generate both datasets in the default location and report success."""
    write_datasets(DEFAULT_OUTPUT_DIR)
    print("Double 'Hard' Datasets created on Desktop/AutoDS-Hackathon/")


if __name__ == "__main__":
    main()