Spaces:

HEWJDEWJDBQWJDWEJ
/

nova-ai-backend

Running

nova-ai-backend / create_test_data.py

Upload folder using huggingface_hub

5979631 verified 12 days ago

1.47 kB

	import pandas as pd
	import numpy as np

	# Dataset 1 -- the "dirty" table (used for Intel Scan testing).
	# It deliberately mixes missing values, outliers and a redundant column.
	ages = [25, 30, 22, np.nan, 28, 150, 35, 40, np.nan, 32]  # two NaNs; 150 is the planted outlier
	incomes = [50000, 60000, 45000, 52000, 58000, 100000, 70000, 80000, 48000, 62000]
	savings = [5000, 6000, 4500, 5200, 5800, 10000, 7000, 8000, 4800, 6200]  # always Income / 10, so fully redundant
	credit_scores = [700, 720, np.nan, 680, 710, 300, 750, 800, 690, 730]  # one NaN; 300 is the low outlier
	labels = [1, 1, 0, 1, 0, 0, 1, 1, 0, 1]

	# Keyword order fixes the column order of the resulting CSV.
	data1 = dict(
	    Age=ages,
	    Income=incomes,
	    Savings=savings,
	    Credit_Score=credit_scores,
	    Target=labels,
	)
	df1 = pd.DataFrame(data1)

	# Written without the index so the CSV holds only the five data columns.
	anomalies_csv = r'C:\Users\muham\Desktop\AutoDS-Hackathon\complex_anomalies.csv'
	df1.to_csv(anomalies_csv, index=False)

	# Create Dataset 2: The "Imbalanced" Data (For Model Performance Testing)
	# Purpose: test handling of class imbalance (90% class 1, 10% class 0).
	# Note: labels are assigned purely by row position (all 1s first, then all
	# 0s) and are not derived from the feature columns.
	n_rows = 200
	# Derive the class sizes from n_rows so the label column always has the same
	# length as the feature columns. The split used to be hard-coded as 180/20,
	# which made pd.DataFrame raise a length-mismatch error for any other n_rows.
	n_majority = n_rows * 9 // 10
	n_minority = n_rows - n_majority
	np.random.seed(42)  # fixed seed keeps the generated CSV reproducible
	data2 = {
	    # The three random draws must stay in this order to reproduce the same values.
	    'Feature_A': np.random.randn(n_rows),  # standard-normal samples
	    'Feature_B': np.random.rand(n_rows) * 100,  # uniform samples scaled to [0, 100)
	    'Feature_C': np.random.choice(['Alpha', 'Beta', 'Gamma'], n_rows),  # categorical feature
	    'target': [1] * n_majority + [0] * n_minority,  # 90% Class 1, 10% Class 0 (Extreme Imbalance)
	}
	df2 = pd.DataFrame(data2)
	# One-hot encode Feature_C for training; the dummy columns are appended
	# after the remaining columns as Feature_C_<category>.
	df2 = pd.get_dummies(df2, columns=['Feature_C'])
	df2.to_csv(r'C:\Users\muham\Desktop\AutoDS-Hackathon\imbalanced_mission.csv', index=False)

	print("Double 'Hard' Datasets created on Desktop/AutoDS-Hackathon/")