""" Decision Tree Tutorial for UNLV Undergrads =========================================== A practical example using a Las Vegas student scenario This tutorial demonstrates how decision trees work by predicting whether a UNLV student should study at the library or outdoors. """ import pandas as pd import numpy as np from sklearn.tree import DecisionTreeClassifier, plot_tree from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score, classification_report, confusion_matrix import matplotlib.pyplot as plt import seaborn as sns # Set style for better-looking plots sns.set_style("whitegrid") plt.rcParams['figure.figsize'] = (12, 8) print("=" * 60) print("DECISION TREE TUTORIAL FOR UNLV STUDENTS") print("=" * 60) print("\nScenario: Should you study at the library or outdoors?") print("Factors: Temperature, Humidity, Wind, Time of Day\n") # ============================================================ # STEP 1: CREATE A REALISTIC LAS VEGAS DATASET # ============================================================ print("\n" + "="*60) print("STEP 1: Creating the Dataset") print("="*60) # Create sample data based on Las Vegas conditions np.random.seed(42) # For reproducibility n_samples = 200 data = { 'temperature_f': np.random.randint(60, 115, n_samples), # Las Vegas temps! 'humidity_percent': np.random.randint(10, 40, n_samples), # Vegas is dry 'wind_mph': np.random.randint(0, 25, n_samples), 'hour_of_day': np.random.randint(8, 22, n_samples), # 8 AM to 10 PM 'is_weekend': np.random.choice([0, 1], n_samples), } # Create labels based on logical rules (this is our "ground truth") labels = [] for i in range(n_samples): temp = data['temperature_f'][i] wind = data['wind_mph'][i] hour = data['hour_of_day'][i] # Decision logic: Go outdoors if conditions are nice if temp < 85 and wind < 15 and 8 <= hour <= 18: labels.append('Outdoors') # Nice conditions elif temp > 105: labels.append('Library') # Too hot! elif wind > 20: labels.append('Library') # Too windy! elif hour > 19: labels.append('Library') # Evening - better indoor lighting else: # Add some randomness for realistic data labels.append(np.random.choice(['Library', 'Outdoors'], p=[0.6, 0.4])) data['study_location'] = labels # Convert to DataFrame df = pd.DataFrame(data) print(f"\nDataset created with {len(df)} student decisions") print(f"\nFirst few rows:") print(df.head(10)) print(f"\nšŸ“Š Class Distribution:") print(df['study_location'].value_counts()) # ============================================================ # STEP 2: PREPARE DATA FOR MACHINE LEARNING # ============================================================ print("\n" + "="*60) print("STEP 2: Preparing Data") print("="*60) # Separate features (X) and target (y) X = df[['temperature_f', 'humidity_percent', 'wind_mph', 'hour_of_day', 'is_weekend']] y = df['study_location'] print("\nFeatures (what the model uses to decide):") print(X.columns.tolist()) print("\nTarget (what we're predicting):", y.name) # Split into training and testing sets (80% train, 20% test) X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42 ) print(f"\nāœ“ Training set: {len(X_train)} samples") print(f"āœ“ Testing set: {len(X_test)} samples") # ============================================================ # STEP 3: BUILD THE DECISION TREE # ============================================================ print("\n" + "="*60) print("STEP 3: Building the Decision Tree") print("="*60) # Create the decision tree classifier # max_depth=3 keeps it simple and easy to visualize tree_model = DecisionTreeClassifier( max_depth=3, # Limit tree depth for interpretability min_samples_split=10, # Need at least 10 samples to split a node random_state=42 ) # Train the model print("\n🌳 Training the decision tree...") tree_model.fit(X_train, y_train) print("āœ“ Training complete!") # ============================================================ # STEP 4: EVALUATE THE MODEL # ============================================================ print("\n" + "="*60) print("STEP 4: Evaluating Model Performance") print("="*60) # Make predictions y_pred_train = tree_model.predict(X_train) y_pred_test = tree_model.predict(X_test) # Calculate accuracy train_accuracy = accuracy_score(y_train, y_pred_train) test_accuracy = accuracy_score(y_test, y_pred_test) print(f"\nšŸ“ˆ Training Accuracy: {train_accuracy:.2%}") print(f"šŸ“ˆ Testing Accuracy: {test_accuracy:.2%}") print("\nšŸ“‹ Detailed Classification Report:") print(classification_report(y_test, y_pred_test)) # ============================================================ # STEP 5: VISUALIZE THE DECISION TREE # ============================================================ print("\n" + "="*60) print("STEP 5: Visualizing the Decision Tree") print("="*60) plt.figure(figsize=(20, 10)) plot_tree( tree_model, feature_names=X.columns, class_names=['Library', 'Outdoors'], filled=True, rounded=True, fontsize=10 ) plt.title("Decision Tree: Study Location Predictor\n(UNLV Student Example)", fontsize=16, fontweight='bold') plt.tight_layout() plt.savefig('/mnt/user-data/outputs/decision_tree_visualization.png', dpi=300, bbox_inches='tight') print("\nāœ“ Decision tree visualization saved!") # ============================================================ # STEP 6: FEATURE IMPORTANCE # ============================================================ print("\n" + "="*60) print("STEP 6: Understanding Feature Importance") print("="*60) # Get feature importance feature_importance = pd.DataFrame({ 'feature': X.columns, 'importance': tree_model.feature_importances_ }).sort_values('importance', ascending=False) print("\nšŸŽÆ Feature Importance (which factors matter most?):") print(feature_importance) # Visualize feature importance plt.figure(figsize=(10, 6)) plt.barh(feature_importance['feature'], feature_importance['importance']) plt.xlabel('Importance Score', fontsize=12) plt.title('Feature Importance in Decision Making', fontsize=14, fontweight='bold') plt.tight_layout() plt.savefig('/mnt/user-data/outputs/feature_importance.png', dpi=300, bbox_inches='tight') print("\nāœ“ Feature importance plot saved!") # ============================================================ # STEP 7: CONFUSION MATRIX # ============================================================ print("\n" + "="*60) print("STEP 7: Confusion Matrix") print("="*60) cm = confusion_matrix(y_test, y_pred_test) plt.figure(figsize=(8, 6)) sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Library', 'Outdoors'], yticklabels=['Library', 'Outdoors']) plt.title('Confusion Matrix: Actual vs Predicted', fontsize=14, fontweight='bold') plt.ylabel('Actual', fontsize=12) plt.xlabel('Predicted', fontsize=12) plt.tight_layout() plt.savefig('/mnt/user-data/outputs/confusion_matrix.png', dpi=300, bbox_inches='tight') print("\nāœ“ Confusion matrix saved!") # ============================================================ # STEP 8: TEST WITH NEW EXAMPLES # ============================================================ print("\n" + "="*60) print("STEP 8: Making Predictions with New Data") print("="*60) # Create some test scenarios test_scenarios = pd.DataFrame([ {'temperature_f': 75, 'humidity_percent': 15, 'wind_mph': 5, 'hour_of_day': 10, 'is_weekend': 1}, {'temperature_f': 108, 'humidity_percent': 20, 'wind_mph': 10, 'hour_of_day': 14, 'is_weekend': 0}, {'temperature_f': 65, 'humidity_percent': 25, 'wind_mph': 20, 'hour_of_day': 16, 'is_weekend': 1}, {'temperature_f': 90, 'humidity_percent': 18, 'wind_mph': 8, 'hour_of_day': 20, 'is_weekend': 0}, ]) predictions = tree_model.predict(test_scenarios) print("\nšŸ”® Predictions for new scenarios:\n") for i, (idx, row) in enumerate(test_scenarios.iterrows()): print(f"Scenario {i+1}:") print(f" Temperature: {row['temperature_f']}°F") print(f" Humidity: {row['humidity_percent']}%") print(f" Wind: {row['wind_mph']} mph") print(f" Time: {row['hour_of_day']}:00") print(f" Weekend: {'Yes' if row['is_weekend'] else 'No'}") print(f" → Recommended location: {predictions[i]}") print() # ============================================================ # SUMMARY FOR STUDENTS # ============================================================ print("\n" + "="*60) print("KEY TAKEAWAYS FOR UNLV STUDENTS") print("="*60) print(""" 1. **What is a Decision Tree?** - A flowchart-like model that makes decisions by asking questions - Easy to interpret and visualize - Works like playing "20 Questions" 2. **How Does It Work?** - Starts at the root (top) with all data - Splits data based on features (temperature, wind, etc.) - Continues splitting until reaching a decision (leaf node) 3. **Key Concepts:** - Training: Teaching the model using past examples - Testing: Checking how well it works on new data - Overfitting: When the tree memorizes training data (bad!) - Feature Importance: Which factors matter most 4. **Real-World Applications:** - Medical diagnosis - Credit approval - Customer segmentation - Game AI - Weather prediction 5. **Advantages:** āœ“ Easy to understand and explain āœ“ Works with both numbers and categories āœ“ Requires little data preparation 6. **Limitations:** āœ— Can overfit if too complex āœ— Sensitive to small data changes āœ— May not capture complex relationships """) print("\n" + "="*60) print("šŸŽ“ Tutorial Complete!") print("="*60) print("\nFiles saved:") print(" • decision_tree_visualization.png") print(" • feature_importance.png") print(" • confusion_matrix.png")