# IS-335-Demo / decision_tree_tutorial.py
# Ric
# Add UNLV study location space app
# 23a9ae4
"""
Decision Tree Tutorial for UNLV Undergrads
===========================================
A practical example using a Las Vegas student scenario
This tutorial demonstrates how decision trees work by predicting
whether a UNLV student should study at the library or outdoors.
"""
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
# Plot defaults: seaborn's "whitegrid" theme and a (12, 8) default figure size.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 8)})

# Opening banner, followed by a short description of the scenario.
for banner_line in (
    "=" * 60,
    "DECISION TREE TUTORIAL FOR UNLV STUDENTS",
    "=" * 60,
    "\nScenario: Should you study at the library or outdoors?",
    "Factors: Temperature, Humidity, Wind, Time of Day\n",
):
    print(banner_line)
# ============================================================
# STEP 1: CREATE A REALISTIC LAS VEGAS DATASET
# ============================================================
print("\n" + "="*60)
print("STEP 1: Creating the Dataset")
print("="*60)


def _choose_study_location(temp, wind, hour):
    """Return 'Outdoors' or 'Library' for one simulated observation.

    The rules are checked in order: mild, calm, daytime conditions give
    'Outdoors'; extreme heat, strong wind, or a late hour give 'Library';
    anything else is drawn at random (60% 'Library', 40% 'Outdoors') from
    NumPy's global random state.
    """
    if temp < 85 and wind < 15 and 8 <= hour <= 18:
        return 'Outdoors'  # Nice conditions
    if temp > 105:
        return 'Library'  # Too hot!
    if wind > 20:
        return 'Library'  # Too windy!
    if hour > 19:
        return 'Library'  # Evening - better indoor lighting
    # Add some randomness for realistic data. np.random.choice returns a
    # NumPy string scalar; cast it so every label is a plain Python str.
    return str(np.random.choice(['Library', 'Outdoors'], p=[0.6, 0.4]))


# Create sample data based on Las Vegas conditions.
# NOTE: np.random.randint excludes its upper bound, so e.g. hours run 8..21.
np.random.seed(42)  # For reproducibility
n_samples = 200
data = {
    'temperature_f': np.random.randint(60, 115, n_samples),  # Las Vegas temps! (60-114)
    'humidity_percent': np.random.randint(10, 40, n_samples),  # Vegas is dry (10-39)
    'wind_mph': np.random.randint(0, 25, n_samples),  # 0-24
    'hour_of_day': np.random.randint(8, 22, n_samples),  # 8 AM to 9 PM
    'is_weekend': np.random.choice([0, 1], n_samples),
}

# Create labels based on logical rules (this is our "ground truth").
# Rows are labelled in order so the random draws are reproducible under the seed.
labels = [
    _choose_study_location(temp, wind, hour)
    for temp, wind, hour in zip(
        data['temperature_f'], data['wind_mph'], data['hour_of_day']
    )
]
data['study_location'] = labels

# Convert to DataFrame
df = pd.DataFrame(data)
print(f"\nDataset created with {len(df)} student decisions")
print("\nFirst few rows:")
print(df.head(10))
print("\n📊 Class Distribution:")
print(df['study_location'].value_counts())
# ============================================================
# STEP 2: PREPARE DATA FOR MACHINE LEARNING
# ============================================================
print("\n" + "="*60)
print("STEP 2: Preparing Data")
print("="*60)

# Separate features (X) and target (y)
X = df[['temperature_f', 'humidity_percent', 'wind_mph', 'hour_of_day', 'is_weekend']]
y = df['study_location']
print("\nFeatures (what the model uses to decide):")
print(X.columns.tolist())
print("\nTarget (what we're predicting):", y.name)

# Split into training and testing sets (80% train, 20% test).
# A fixed random_state makes the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(f"\n✓ Training set: {len(X_train)} samples")
print(f"✓ Testing set: {len(X_test)} samples")
# ============================================================
# STEP 3: BUILD THE DECISION TREE
# ============================================================
print("\n" + "="*60)
print("STEP 3: Building the Decision Tree")
print("="*60)

# Create the decision tree classifier.
# max_depth=3 keeps it simple and easy to visualize.
tree_model = DecisionTreeClassifier(
    max_depth=3,           # Limit tree depth for interpretability
    min_samples_split=10,  # Need at least 10 samples to split a node
    random_state=42,       # Reproducible tree between runs
)

# Train the model on the training split only; the test split stays unseen.
print("\n🌳 Training the decision tree...")
tree_model.fit(X_train, y_train)
print("✓ Training complete!")
# ============================================================
# STEP 4: EVALUATE THE MODEL
# ============================================================
print("\n" + "="*60)
print("STEP 4: Evaluating Model Performance")
print("="*60)

# Make predictions on both splits so the two accuracies can be compared.
y_pred_train = tree_model.predict(X_train)
y_pred_test = tree_model.predict(X_test)

# Calculate accuracy (fraction of samples predicted correctly)
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)
print(f"\n📈 Training Accuracy: {train_accuracy:.2%}")
print(f"📈 Testing Accuracy: {test_accuracy:.2%}")

# Per-class breakdown, computed on the held-out test split.
print("\n📋 Detailed Classification Report:")
print(classification_report(y_test, y_pred_test))
# ============================================================
# STEP 5: VISUALIZE THE DECISION TREE
# ============================================================
print("\n" + "="*60)
print("STEP 5: Visualizing the Decision Tree")
print("="*60)

plt.figure(figsize=(20, 10))
plot_tree(
    tree_model,
    # Pass a plain list: some scikit-learn releases validate this parameter
    # strictly and do not accept a pandas Index.
    feature_names=list(X.columns),
    # Read the class names from the fitted model so they always match the
    # order the model uses internally, instead of hard-coding them.
    class_names=[str(label) for label in tree_model.classes_],
    filled=True,
    rounded=True,
    fontsize=10
)
plt.title("Decision Tree: Study Location Predictor\n(UNLV Student Example)",
          fontsize=16, fontweight='bold')
plt.tight_layout()

tree_plot_path = '/mnt/user-data/outputs/decision_tree_visualization.png'
# savefig does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(tree_plot_path), exist_ok=True)
plt.savefig(tree_plot_path, dpi=300, bbox_inches='tight')
print("\n✓ Decision tree visualization saved!")
# ============================================================
# STEP 6: FEATURE IMPORTANCE
# ============================================================
print("\n" + "="*60)
print("STEP 6: Understanding Feature Importance")
print("="*60)

# Get feature importance, most important feature first
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': tree_model.feature_importances_
}).sort_values('importance', ascending=False)
print("\n🎯 Feature Importance (which factors matter most?):")
print(feature_importance)

# Visualize feature importance
plt.figure(figsize=(10, 6))
plt.barh(feature_importance['feature'], feature_importance['importance'])
# barh draws the first row at the bottom; flip the axis so the chart reads
# top-to-bottom in the same order as the printed table.
plt.gca().invert_yaxis()
plt.xlabel('Importance Score', fontsize=12)
plt.title('Feature Importance in Decision Making', fontsize=14, fontweight='bold')
plt.tight_layout()

importance_plot_path = '/mnt/user-data/outputs/feature_importance.png'
# savefig does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(importance_plot_path), exist_ok=True)
plt.savefig(importance_plot_path, dpi=300, bbox_inches='tight')
print("\n✓ Feature importance plot saved!")
# ============================================================
# STEP 7: CONFUSION MATRIX
# ============================================================
print("\n" + "="*60)
print("STEP 7: Confusion Matrix")
print("="*60)

# Use the fitted model's class list for both the matrix and the tick labels,
# so rows/columns are always labelled correctly even if a class happens to be
# missing from the test split or from the predictions.
class_labels = list(tree_model.classes_)
cm = confusion_matrix(y_test, y_pred_test, labels=class_labels)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels,
            yticklabels=class_labels)
plt.title('Confusion Matrix: Actual vs Predicted', fontsize=14, fontweight='bold')
plt.ylabel('Actual', fontsize=12)      # rows are the true labels
plt.xlabel('Predicted', fontsize=12)   # columns are the model's predictions
plt.tight_layout()

confusion_plot_path = '/mnt/user-data/outputs/confusion_matrix.png'
# savefig does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(confusion_plot_path), exist_ok=True)
plt.savefig(confusion_plot_path, dpi=300, bbox_inches='tight')
print("\n✓ Confusion matrix saved!")
# ============================================================
# STEP 8: TEST WITH NEW EXAMPLES
# ============================================================
print("\n" + "="*60)
print("STEP 8: Making Predictions with New Data")
print("="*60)

# Create some test scenarios. The keys use the same names and order as the
# feature columns the model was trained on.
test_scenarios = pd.DataFrame([
    {'temperature_f': 75, 'humidity_percent': 15, 'wind_mph': 5, 'hour_of_day': 10, 'is_weekend': 1},
    {'temperature_f': 108, 'humidity_percent': 20, 'wind_mph': 10, 'hour_of_day': 14, 'is_weekend': 0},
    {'temperature_f': 65, 'humidity_percent': 25, 'wind_mph': 20, 'hour_of_day': 16, 'is_weekend': 1},
    {'temperature_f': 90, 'humidity_percent': 18, 'wind_mph': 8, 'hour_of_day': 20, 'is_weekend': 0},
])
predictions = tree_model.predict(test_scenarios)

print("\n🔮 Predictions for new scenarios:\n")
# Walk the scenarios and their predictions in lockstep; numbering starts at 1.
for number, (scenario, location) in enumerate(
    zip(test_scenarios.itertuples(index=False), predictions), start=1
):
    print(f"Scenario {number}:")
    print(f" Temperature: {scenario.temperature_f}°F")
    print(f" Humidity: {scenario.humidity_percent}%")
    print(f" Wind: {scenario.wind_mph} mph")
    print(f" Time: {scenario.hour_of_day}:00")
    print(f" Weekend: {'Yes' if scenario.is_weekend else 'No'}")
    print(f" → Recommended location: {location}")
    print()
# ============================================================
# SUMMARY FOR STUDENTS
# ============================================================
# Closing recap printed to the console, then the list of image files the
# earlier steps wrote.
print("\n" + "="*60)
print("KEY TAKEAWAYS FOR UNLV STUDENTS")
print("="*60)
print("""
1. **What is a Decision Tree?**
- A flowchart-like model that makes decisions by asking questions
- Easy to interpret and visualize
- Works like playing "20 Questions"
2. **How Does It Work?**
- Starts at the root (top) with all data
- Splits data based on features (temperature, wind, etc.)
- Continues splitting until reaching a decision (leaf node)
3. **Key Concepts:**
- Training: Teaching the model using past examples
- Testing: Checking how well it works on new data
- Overfitting: When the tree memorizes training data (bad!)
- Feature Importance: Which factors matter most
4. **Real-World Applications:**
- Medical diagnosis
- Credit approval
- Customer segmentation
- Game AI
- Weather prediction
5. **Advantages:**
✓ Easy to understand and explain
✓ Works with both numbers and categories
✓ Requires little data preparation
6. **Limitations:**
✗ Can overfit if too complex
✗ Sensitive to small data changes
✗ May not capture complex relationships
""")
print("\n" + "="*60)
print("🎓 Tutorial Complete!")
print("="*60)
print("\nFiles saved:")
print(" • decision_tree_visualization.png")
print(" • feature_importance.png")
print(" • confusion_matrix.png")