Spaces:

richardyoung
/

IS-335-Demo

Sleeping

File size: 9,852 Bytes

23a9ae4

"""
Decision Tree Tutorial for UNLV Undergrads
===========================================
A practical example using a Las Vegas student scenario

This tutorial demonstrates how decision trees work by predicting
whether a UNLV student should study at the library or outdoors.
"""

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Global plot appearance: seaborn's "whitegrid" theme, plus a 12x8 default
# size for any figure that is created without an explicit figsize.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 8)})

# Opening banner for the console walkthrough, printed line by line.
title_rule = "=" * 60
for banner_line in (
    title_rule,
    "DECISION TREE TUTORIAL FOR UNLV STUDENTS",
    title_rule,
    "\nScenario: Should you study at the library or outdoors?",
    "Factors: Temperature, Humidity, Wind, Time of Day\n",
):
    print(banner_line)

# ============================================================
# STEP 1: CREATE A REALISTIC LAS VEGAS DATASET
# ============================================================
print("\n" + "="*60)
print("STEP 1: Creating the Dataset")
print("="*60)

# Seed NumPy's global generator so every run draws the same synthetic data.
np.random.seed(42)

n_samples = 200

# Synthetic weather/time features. randint's upper bound is exclusive, so the
# actual ranges are 60-114 F, 10-39 % humidity, 0-24 mph wind and hours 8-21
# (8 AM through 9 PM). The order of these draws fixes the random stream, so
# it must not be rearranged.
data = dict(
    temperature_f=np.random.randint(60, 115, n_samples),
    humidity_percent=np.random.randint(10, 40, n_samples),
    wind_mph=np.random.randint(0, 25, n_samples),
    hour_of_day=np.random.randint(8, 22, n_samples),
    is_weekend=np.random.choice([0, 1], n_samples),
)

# Derive the "ground truth" label for each sample from simple rules.
labels = []
for temp, wind, hour in zip(data['temperature_f'], data['wind_mph'], data['hour_of_day']):
    # Mild, calm and during the day: studying outside is pleasant.
    nice_outside = temp < 85 and wind < 15 and 8 <= hour <= 18
    # Too hot, too windy, or late evening: stay in the library.
    forced_inside = temp > 105 or wind > 20 or hour > 19

    if nice_outside:
        labels.append('Outdoors')
    elif forced_inside:
        labels.append('Library')
    else:
        # Borderline conditions: a 60/40 coin flip adds realistic noise.
        labels.append(np.random.choice(['Library', 'Outdoors'], p=[0.6, 0.4]))

data['study_location'] = labels

# Assemble features and label into one table.
df = pd.DataFrame(data)

print(f"\nDataset created with {len(df)} student decisions")
print("\nFirst few rows:")
print(df.head(10))

print("\n📊 Class Distribution:")
print(df['study_location'].value_counts())

# ============================================================
# STEP 2: PREPARE DATA FOR MACHINE LEARNING
# ============================================================
print("\n" + "="*60)
print("STEP 2: Preparing Data")
print("="*60)

# X holds the input columns the model may look at; y is the column to predict.
feature_columns = ['temperature_f', 'humidity_percent', 'wind_mph', 'hour_of_day', 'is_weekend']
X = df[feature_columns]
y = df['study_location']

print("\nFeatures (what the model uses to decide):")
print(X.columns.tolist())
print("\nTarget (what we're predicting):", y.name)

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print(f"\n✓ Training set: {len(X_train)} samples")
print(f"✓ Testing set: {len(X_test)} samples")

# ============================================================
# STEP 3: BUILD THE DECISION TREE
# ============================================================
print("\n" + "="*60)
print("STEP 3: Building the Decision Tree")
print("="*60)

# Hyperparameters chosen to keep the tree small enough to read on one plot.
tree_settings = {
    'max_depth': 3,           # at most three levels of questions
    'min_samples_split': 10,  # a node needs 10+ samples before it may split
    'random_state': 42,       # reproducible tie-breaking
}
tree_model = DecisionTreeClassifier(**tree_settings)

# Fit the classifier on the training portion only.
print("\n🌳 Training the decision tree...")
tree_model.fit(X_train, y_train)
print("✓ Training complete!")

# ============================================================
# STEP 4: EVALUATE THE MODEL
# ============================================================
print("\n" + "="*60)
print("STEP 4: Evaluating Model Performance")
print("="*60)

# Score the model on the data it was trained on...
y_pred_train = tree_model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_pred_train)

# ...and on the held-out rows it has never seen. A large gap between the two
# numbers is a sign of overfitting.
y_pred_test = tree_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred_test)

print("\n📈 Training Accuracy: {:.2%}".format(train_accuracy))
print("📈 Testing Accuracy: {:.2%}".format(test_accuracy))

# Per-class precision / recall / F1 on the test split.
print("\n📋 Detailed Classification Report:")
print(classification_report(y_test, y_pred_test))

# ============================================================
# STEP 5: VISUALIZE THE DECISION TREE
# ============================================================
print("\n" + "="*60)
print("STEP 5: Visualizing the Decision Tree")
print("="*60)

# Destination of the rendered tree. The directory is created on demand so the
# script does not die with FileNotFoundError on a machine where it is missing.
tree_plot_path = '/mnt/user-data/outputs/decision_tree_visualization.png'
os.makedirs(os.path.dirname(tree_plot_path), exist_ok=True)

plt.figure(figsize=(20, 10))
plot_tree(
    tree_model,
    # Pass a plain list: some scikit-learn releases validate this parameter
    # strictly and reject a pandas Index.
    feature_names=list(X.columns),
    # Take the class order from the fitted model so the names shown in the
    # plot always line up with the tree's internal class indices.
    class_names=[str(label) for label in tree_model.classes_],
    filled=True,
    rounded=True,
    fontsize=10
)
plt.title("Decision Tree: Study Location Predictor\n(UNLV Student Example)",
          fontsize=16, fontweight='bold')
plt.tight_layout()
plt.savefig(tree_plot_path, dpi=300, bbox_inches='tight')
print("\n✓ Decision tree visualization saved!")

# ============================================================
# STEP 6: FEATURE IMPORTANCE
# ============================================================
print("\n" + "="*60)
print("STEP 6: Understanding Feature Importance")
print("="*60)

# Pair each input column with the importance score the fitted tree assigned
# to it, most important first.
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': tree_model.feature_importances_
}).sort_values('importance', ascending=False)

print("\n🎯 Feature Importance (which factors matter most?):")
print(feature_importance)

# Destination of the bar chart. The directory is created on demand so the
# script does not die with FileNotFoundError on a machine where it is missing.
importance_plot_path = '/mnt/user-data/outputs/feature_importance.png'
os.makedirs(os.path.dirname(importance_plot_path), exist_ok=True)

# Horizontal bar chart of the scores.
plt.figure(figsize=(10, 6))
plt.barh(feature_importance['feature'], feature_importance['importance'])
plt.xlabel('Importance Score', fontsize=12)
plt.title('Feature Importance in Decision Making', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig(importance_plot_path, dpi=300, bbox_inches='tight')
print("\n✓ Feature importance plot saved!")

# ============================================================
# STEP 7: CONFUSION MATRIX
# ============================================================
print("\n" + "="*60)
print("STEP 7: Confusion Matrix")
print("="*60)

# Use the classes the model was fitted on, in the model's own order, for both
# the matrix and the axis tick labels. Passing them explicitly keeps the
# matrix the same shape as the tick labels even if the test split happens to
# be missing one of the classes.
class_labels = [str(label) for label in tree_model.classes_]
cm = confusion_matrix(y_test, y_pred_test, labels=class_labels)

# Destination of the heatmap. The directory is created on demand so the
# script does not die with FileNotFoundError on a machine where it is missing.
confusion_plot_path = '/mnt/user-data/outputs/confusion_matrix.png'
os.makedirs(os.path.dirname(confusion_plot_path), exist_ok=True)

# Rows are the actual labels, columns the predicted ones.
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels,
            yticklabels=class_labels)
plt.title('Confusion Matrix: Actual vs Predicted', fontsize=14, fontweight='bold')
plt.ylabel('Actual', fontsize=12)
plt.xlabel('Predicted', fontsize=12)
plt.tight_layout()
plt.savefig(confusion_plot_path, dpi=300, bbox_inches='tight')
print("\n✓ Confusion matrix saved!")

# ============================================================
# STEP 8: TEST WITH NEW EXAMPLES
# ============================================================
print("\n" + "="*60)
print("STEP 8: Making Predictions with New Data")
print("="*60)

# Four hand-picked scenarios. The column order matches the feature order the
# model was trained on.
scenario_columns = ['temperature_f', 'humidity_percent', 'wind_mph', 'hour_of_day', 'is_weekend']
scenario_rows = [
    (75, 15, 5, 10, 1),
    (108, 20, 10, 14, 0),
    (65, 25, 20, 16, 1),
    (90, 18, 8, 20, 0),
]
test_scenarios = pd.DataFrame(scenario_rows, columns=scenario_columns)

predictions = tree_model.predict(test_scenarios)

# Print each scenario next to the location the model recommends for it.
print("\n🔮 Predictions for new scenarios:\n")
for number, ((_, row), location) in enumerate(
    zip(test_scenarios.iterrows(), predictions), start=1
):
    weekend_text = 'Yes' if row['is_weekend'] else 'No'
    print(f"Scenario {number}:")
    print(f"  Temperature: {row['temperature_f']}°F")
    print(f"  Humidity: {row['humidity_percent']}%")
    print(f"  Wind: {row['wind_mph']} mph")
    print(f"  Time: {row['hour_of_day']}:00")
    print(f"  Weekend: {weekend_text}")
    print(f"  → Recommended location: {location}")
    print()

# ============================================================
# SUMMARY FOR STUDENTS
# ============================================================
section_rule = "=" * 60

# Recap text shown at the end of the walkthrough.
key_takeaways = """
1. **What is a Decision Tree?**
   - A flowchart-like model that makes decisions by asking questions
   - Easy to interpret and visualize
   - Works like playing "20 Questions"

2. **How Does It Work?**
   - Starts at the root (top) with all data
   - Splits data based on features (temperature, wind, etc.)
   - Continues splitting until reaching a decision (leaf node)

3. **Key Concepts:**
   - Training: Teaching the model using past examples
   - Testing: Checking how well it works on new data
   - Overfitting: When the tree memorizes training data (bad!)
   - Feature Importance: Which factors matter most

4. **Real-World Applications:**
   - Medical diagnosis
   - Credit approval
   - Customer segmentation
   - Game AI
   - Weather prediction

5. **Advantages:**
   ✓ Easy to understand and explain
   ✓ Works with both numbers and categories
   ✓ Requires little data preparation

6. **Limitations:**
   ✗ Can overfit if too complex
   ✗ Sensitive to small data changes
   ✗ May not capture complex relationships
"""

print("\n" + section_rule)
print("KEY TAKEAWAYS FOR UNLV STUDENTS")
print(section_rule)
print(key_takeaways)

print("\n" + section_rule)
print("🎓 Tutorial Complete!")
print(section_rule)

# List the image files produced by the earlier steps.
print("\nFiles saved:")
for saved_name in (
    "decision_tree_visualization.png",
    "feature_importance.png",
    "confusion_matrix.png",
):
    print(f"  • {saved_name}")