Spaces:

richardyoung
/

IS-335-Demo

Sleeping

IS-335-Demo / decision_tree_tutorial.py

Ric

Add UNLV study location space app

23a9ae4 about 1 month ago

9.85 kB

	"""
	Decision Tree Tutorial for UNLV Undergrads
	===========================================
	A practical example using a Las Vegas student scenario

	This tutorial demonstrates how decision trees work by predicting
	whether a UNLV student should study at the library or outdoors.
	"""

	from pathlib import Path

	import matplotlib.pyplot as plt
	import numpy as np
	import pandas as pd
	import seaborn as sns
	from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
	from sklearn.model_selection import train_test_split
	from sklearn.tree import DecisionTreeClassifier, plot_tree

	# Plot defaults: seaborn's "whitegrid" theme and a 12x8 default figure size.
	sns.set_style("whitegrid")
	plt.rcParams.update({'figure.figsize': (12, 8)})

	# Title banner, then a short description of the scenario and its inputs.
	print("=" * 60, "DECISION TREE TUTORIAL FOR UNLV STUDENTS", "=" * 60, sep="\n")
	print(
	    "\nScenario: Should you study at the library or outdoors?",
	    "Factors: Temperature, Humidity, Wind, Time of Day\n",
	    sep="\n",
	)

	# ============================================================
	# STEP 1: CREATE A REALISTIC LAS VEGAS DATASET
	# ============================================================
	print("\n" + "="*60)
	print("STEP 1: Creating the Dataset")
	print("="*60)

	# Seed NumPy's global RNG so every run produces the same synthetic dataset.
	np.random.seed(42)

	n_samples = 200

	# Synthetic features, one array of n_samples values per column.
	# NOTE: np.random.randint excludes its upper bound, so the sampled values
	# are 60-114 °F, 10-39 %, 0-24 mph and hours 8-21.
	data = {
	    'temperature_f': np.random.randint(60, 115, n_samples),   # Las Vegas temps!
	    'humidity_percent': np.random.randint(10, 40, n_samples), # Vegas is dry
	    'wind_mph': np.random.randint(0, 25, n_samples),
	    'hour_of_day': np.random.randint(8, 22, n_samples),       # 8 AM up to the 9 PM hour
	    'is_weekend': np.random.choice([0, 1], n_samples),
	}

	# Create labels based on logical rules (this is our "ground truth").
	# Only temperature, wind and hour feed the rules; humidity and is_weekend
	# do not influence the label.
	labels = []
	for temp, wind, hour in zip(
	    data['temperature_f'], data['wind_mph'], data['hour_of_day']
	):
	    # Decision logic: go outdoors if conditions are nice. The branches are
	    # checked in order, so "nice" wins over every Library rule below.
	    if temp < 85 and wind < 15 and 8 <= hour <= 18:
	        labels.append('Outdoors')  # Nice conditions
	    elif temp > 105:
	        labels.append('Library')   # Too hot!
	    elif wind > 20:
	        labels.append('Library')   # Too windy!
	    elif hour > 19:
	        labels.append('Library')   # Evening - better indoor lighting
	    else:
	        # Borderline conditions: add some randomness for realistic data
	        # (60% Library / 40% Outdoors). This is the only branch that
	        # consumes random numbers inside the loop.
	        labels.append(np.random.choice(['Library', 'Outdoors'], p=[0.6, 0.4]))

	data['study_location'] = labels

	# Convert to DataFrame: five feature columns plus the 'study_location' label.
	df = pd.DataFrame(data)

	print(f"\nDataset created with {len(df)} student decisions")
	print(f"\nFirst few rows:")
	print(df.head(10))

	print(f"\n📊 Class Distribution:")
	print(df['study_location'].value_counts())

	# ============================================================
	# STEP 2: PREPARE DATA FOR MACHINE LEARNING
	# ============================================================
	print("\n" + "="*60, "STEP 2: Preparing Data", "="*60, sep="\n")

	# X holds the input columns the model may split on; y is the label column.
	feature_columns = ['temperature_f', 'humidity_percent', 'wind_mph', 'hour_of_day', 'is_weekend']
	X = df[feature_columns]
	y = df['study_location']

	print("\nFeatures (what the model uses to decide):", X.columns.tolist(), sep="\n")
	print("\nTarget (what we're predicting):", y.name)

	# Hold out 20% of the rows for testing; the fixed random_state makes the
	# shuffle (and therefore the split) repeatable.
	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

	print(f"\n✓ Training set: {len(X_train)} samples")
	print(f"✓ Testing set: {len(X_test)} samples")

	# ============================================================
	# STEP 3: BUILD THE DECISION TREE
	# ============================================================
	print("\n" + "="*60, "STEP 3: Building the Decision Tree", "="*60, sep="\n")

	# Hyperparameters for the classifier, gathered in one place.
	tree_params = {
	    'max_depth': 3,            # shallow tree: simple and easy to visualize
	    'min_samples_split': 10,   # a node needs at least 10 samples to be split
	    'random_state': 42,        # fixed seed for a repeatable fit
	}
	tree_model = DecisionTreeClassifier(**tree_params)

	# Fit on the training split only; the test split stays unseen until evaluation.
	print("\n🌳 Training the decision tree...")
	tree_model.fit(X_train, y_train)
	print("✓ Training complete!")

	# ============================================================
	# STEP 4: EVALUATE THE MODEL
	# ============================================================
	print("\n" + "="*60, "STEP 4: Evaluating Model Performance", "="*60, sep="\n")

	# Predict on the training split first, then on the held-out test split.
	y_pred_train, y_pred_test = (
	    tree_model.predict(part) for part in (X_train, X_test)
	)

	# Accuracy = fraction of rows whose predicted label matches the true label.
	train_accuracy = accuracy_score(y_true=y_train, y_pred=y_pred_train)
	test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_test)

	print(f"\n📈 Training Accuracy: {train_accuracy:.2%}")
	print(f"📈 Testing Accuracy: {test_accuracy:.2%}")

	# Per-class report, computed on the test split only.
	print("\n📋 Detailed Classification Report:", classification_report(y_test, y_pred_test), sep="\n")

	# ============================================================
	# STEP 5: VISUALIZE THE DECISION TREE
	# ============================================================
	print("\n" + "="*60)
	print("STEP 5: Visualizing the Decision Tree")
	print("="*60)

	plt.figure(figsize=(20, 10))
	plot_tree(
	    tree_model,
	    # Pass a plain list: some scikit-learn releases validate this argument
	    # strictly and reject a pandas Index.
	    feature_names=list(X.columns),
	    # Take the class names from the fitted model so they always match the
	    # order the tree uses internally, instead of hard-coding that order.
	    class_names=[str(label) for label in tree_model.classes_],
	    filled=True,
	    rounded=True,
	    fontsize=10,
	)
	plt.title("Decision Tree: Study Location Predictor\n(UNLV Student Example)",
	          fontsize=16, fontweight='bold')
	plt.tight_layout()

	# Create the output directory if needed; savefig fails when it is missing.
	tree_plot_path = Path('/mnt/user-data/outputs/decision_tree_visualization.png')
	tree_plot_path.parent.mkdir(parents=True, exist_ok=True)
	plt.savefig(tree_plot_path, dpi=300, bbox_inches='tight')
	print("\n✓ Decision tree visualization saved!")

	# ============================================================
	# STEP 6: FEATURE IMPORTANCE
	# ============================================================
	print("\n" + "="*60)
	print("STEP 6: Understanding Feature Importance")
	print("="*60)

	# Pair each feature name with the importance score from the fitted tree,
	# sorted so the most important feature comes first.
	feature_importance = pd.DataFrame({
	    'feature': X.columns,
	    'importance': tree_model.feature_importances_,
	}).sort_values('importance', ascending=False)

	print("\n🎯 Feature Importance (which factors matter most?):")
	print(feature_importance)

	# Visualize feature importance as a horizontal bar chart
	plt.figure(figsize=(10, 6))
	plt.barh(feature_importance['feature'], feature_importance['importance'])
	plt.xlabel('Importance Score', fontsize=12)
	plt.title('Feature Importance in Decision Making', fontsize=14, fontweight='bold')
	plt.tight_layout()

	# Create the output directory if needed; savefig fails when it is missing.
	importance_plot_path = Path('/mnt/user-data/outputs/feature_importance.png')
	importance_plot_path.parent.mkdir(parents=True, exist_ok=True)
	plt.savefig(importance_plot_path, dpi=300, bbox_inches='tight')
	print("\n✓ Feature importance plot saved!")

	# ============================================================
	# STEP 7: CONFUSION MATRIX
	# ============================================================
	print("\n" + "="*60)
	print("STEP 7: Confusion Matrix")
	print("="*60)

	# Use the fitted model's class list for both the matrix layout and the axis
	# tick labels. Passing labels= fixes the row/column order and keeps the
	# matrix square even if a class is missing from the test split, so the
	# tick labels can never drift out of sync with the cells.
	class_labels = [str(label) for label in tree_model.classes_]
	cm = confusion_matrix(y_test, y_pred_test, labels=class_labels)
	plt.figure(figsize=(8, 6))
	sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
	            xticklabels=class_labels,
	            yticklabels=class_labels)
	plt.title('Confusion Matrix: Actual vs Predicted', fontsize=14, fontweight='bold')
	plt.ylabel('Actual', fontsize=12)
	plt.xlabel('Predicted', fontsize=12)
	plt.tight_layout()

	# Create the output directory if needed; savefig fails when it is missing.
	confusion_plot_path = Path('/mnt/user-data/outputs/confusion_matrix.png')
	confusion_plot_path.parent.mkdir(parents=True, exist_ok=True)
	plt.savefig(confusion_plot_path, dpi=300, bbox_inches='tight')
	print("\n✓ Confusion matrix saved!")

	# ============================================================
	# STEP 8: TEST WITH NEW EXAMPLES
	# ============================================================
	print("\n" + "="*60)
	print("STEP 8: Making Predictions with New Data")
	print("="*60)

	# Hand-written scenarios, one per row, using the same five feature columns
	# (in the same order) that the model was trained on.
	test_scenarios = pd.DataFrame([
	    {'temperature_f': 75, 'humidity_percent': 15, 'wind_mph': 5, 'hour_of_day': 10, 'is_weekend': 1},
	    {'temperature_f': 108, 'humidity_percent': 20, 'wind_mph': 10, 'hour_of_day': 14, 'is_weekend': 0},
	    {'temperature_f': 65, 'humidity_percent': 25, 'wind_mph': 20, 'hour_of_day': 16, 'is_weekend': 1},
	    {'temperature_f': 90, 'humidity_percent': 18, 'wind_mph': 8, 'hour_of_day': 20, 'is_weekend': 0},
	])

	# One predicted label per scenario row, in row order.
	predictions = tree_model.predict(test_scenarios)

	print("\n🔮 Predictions for new scenarios:\n")
	# Walk the rows and their predictions in lockstep. The DataFrame index that
	# iterrows() yields is not needed, so it is discarded.
	for number, ((_, row), location) in enumerate(
	    zip(test_scenarios.iterrows(), predictions), start=1
	):
	    print(f"Scenario {number}:")
	    print(f" Temperature: {row['temperature_f']}°F")
	    print(f" Humidity: {row['humidity_percent']}%")
	    print(f" Wind: {row['wind_mph']} mph")
	    print(f" Time: {row['hour_of_day']}:00")
	    print(f" Weekend: {'Yes' if row['is_weekend'] else 'No'}")
	    print(f" → Recommended location: {location}")
	    print()

	# ============================================================
	# SUMMARY FOR STUDENTS
	# ============================================================
	print("\n" + "="*60, "KEY TAKEAWAYS FOR UNLV STUDENTS", "="*60, sep="\n")

	# Recap of the concepts covered above, printed verbatim as one text block.
	print("""
	1. What is a Decision Tree?
	- A flowchart-like model that makes decisions by asking questions
	- Easy to interpret and visualize
	- Works like playing "20 Questions"

	2. How Does It Work?
	- Starts at the root (top) with all data
	- Splits data based on features (temperature, wind, etc.)
	- Continues splitting until reaching a decision (leaf node)

	3. Key Concepts:
	- Training: Teaching the model using past examples
	- Testing: Checking how well it works on new data
	- Overfitting: When the tree memorizes training data (bad!)
	- Feature Importance: Which factors matter most

	4. Real-World Applications:
	- Medical diagnosis
	- Credit approval
	- Customer segmentation
	- Game AI
	- Weather prediction

	5. Advantages:
	✓ Easy to understand and explain
	✓ Works with both numbers and categories
	✓ Requires little data preparation

	6. Limitations:
	✗ Can overfit if too complex
	✗ Sensitive to small data changes
	✗ May not capture complex relationships
	""")

	# Closing banner, then the file names of the three plots written earlier.
	print("\n" + "="*60, "🎓 Tutorial Complete!", "="*60, sep="\n")
	print(
	    "\nFiles saved:",
	    " • decision_tree_visualization.png",
	    " • feature_importance.png",
	    " • confusion_matrix.png",
	    sep="\n",
	)