Spaces:
Sleeping
Sleeping
"""
Decision Tree Tutorial for UNLV Undergrads
===========================================
A practical example using a Las Vegas student scenario.

This tutorial demonstrates how decision trees work by predicting
whether a UNLV student should study at the library or outdoors.
"""
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
# Plot appearance: seaborn's "whitegrid" theme plus a 12x8 default figure size.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 8)

# Opening banner: title framed by two 60-character rules, then the scenario.
title_rule = "=" * 60
print(title_rule)
print("DECISION TREE TUTORIAL FOR UNLV STUDENTS")
print(title_rule)
print("\nScenario: Should you study at the library or outdoors?")
print("Factors: Temperature, Humidity, Wind, Time of Day\n")
# ============================================================
# STEP 1: CREATE A REALISTIC LAS VEGAS DATASET
# ============================================================
print("\n" + "="*60)
print("STEP 1: Creating the Dataset")
print("="*60)

# Synthetic observations drawn from NumPy's global RNG. The seed and the
# order of the draws are fixed so that every run produces the same rows.
np.random.seed(42)
n_samples = 200
data = {
    # randint's upper bound is exclusive, hence the ranges noted per column.
    'temperature_f': np.random.randint(60, 115, n_samples),    # 60-114 F
    'humidity_percent': np.random.randint(10, 40, n_samples),  # 10-39 %
    'wind_mph': np.random.randint(0, 25, n_samples),           # 0-24 mph
    'hour_of_day': np.random.randint(8, 22, n_samples),        # hours 8-21
    'is_weekend': np.random.choice([0, 1], n_samples),
}

# Label every observation with hand-written rules (the "ground truth" the
# tree is later trained on). Only temperature, wind and hour are consulted;
# humidity and weekend never influence the label.
labels = []
for temp, wind, hour in zip(
    data['temperature_f'], data['wind_mph'], data['hour_of_day']
):
    if temp < 85 and wind < 15 and 8 <= hour <= 18:
        labels.append('Outdoors')  # Nice conditions
    elif temp > 105:
        labels.append('Library')   # Too hot
    elif wind > 20:
        labels.append('Library')   # Too windy
    elif hour > 19:
        labels.append('Library')   # Evening
    else:
        # No rule applies: weighted coin flip (60% Library / 40% Outdoors).
        # This consumes one draw from the seeded RNG per such row.
        labels.append(np.random.choice(['Library', 'Outdoors'], p=[0.6, 0.4]))
data['study_location'] = labels

# Assemble the columns (features + label) into a single DataFrame.
df = pd.DataFrame(data)
print(f"\nDataset created with {len(df)} student decisions")
print("\nFirst few rows:")
print(df.head(10))
print("\n๐ Class Distribution:")
print(df['study_location'].value_counts())
# ============================================================
# STEP 2: PREPARE DATA FOR MACHINE LEARNING
# ============================================================
print("\n" + "="*60)
print("STEP 2: Preparing Data")
print("="*60)

# Model inputs are the five measured columns; the label column is the target.
feature_columns = [
    'temperature_f',
    'humidity_percent',
    'wind_mph',
    'hour_of_day',
    'is_weekend',
]
X = df[feature_columns]
y = df['study_location']
print("\nFeatures (what the model uses to decide):")
print(list(X.columns))
print("\nTarget (what we're predicting):", y.name)

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"\nโ Training set: {len(X_train)} samples")
print(f"โ Testing set: {len(X_test)} samples")
# ============================================================
# STEP 3: BUILD THE DECISION TREE
# ============================================================
print("\n" + "="*60)
print("STEP 3: Building the Decision Tree")
print("="*60)

# Hyperparameters are gathered in one place so they are easy to tweak.
tree_params = {
    'max_depth': 3,            # shallow tree: simple to read and to plot
    'min_samples_split': 10,   # a node needs at least 10 samples to be split
    'random_state': 42,        # repeatable fit
}
tree_model = DecisionTreeClassifier(**tree_params)

# Fit on the training portion only.
print("\n๐ณ Training the decision tree...")
tree_model.fit(X_train, y_train)
print("โ Training complete!")
# ============================================================
# STEP 4: EVALUATE THE MODEL
# ============================================================
print("\n" + "="*60)
print("STEP 4: Evaluating Model Performance")
print("="*60)

# Score the fitted tree on the rows it was trained on ...
y_pred_train = tree_model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_pred_train)

# ... and on the held-out rows it has not seen.
y_pred_test = tree_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred_test)

print(f"\n๐ Training Accuracy: {train_accuracy:.2%}")
print(f"๐ Testing Accuracy: {test_accuracy:.2%}")

# Per-class metrics on the held-out rows.
test_report = classification_report(y_test, y_pred_test)
print("\n๐ Detailed Classification Report:")
print(test_report)
# ============================================================
# STEP 5: VISUALIZE THE DECISION TREE
# ============================================================
print("\n" + "="*60)
print("STEP 5: Visualizing the Decision Tree")
print("="*60)

tree_plot_path = '/mnt/user-data/outputs/decision_tree_visualization.png'

plt.figure(figsize=(20, 10))
plot_tree(
    tree_model,
    # Passed as a plain list rather than a pandas Index, which some
    # scikit-learn releases reject for this parameter.
    feature_names=list(X.columns),
    # Taken from the fitted model so the names always follow the order the
    # classifier itself uses for its classes.
    class_names=[str(label) for label in tree_model.classes_],
    filled=True,
    rounded=True,
    fontsize=10,
)
plt.title("Decision Tree: Study Location Predictor\n(UNLV Student Example)",
          fontsize=16, fontweight='bold')
plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(tree_plot_path), exist_ok=True)
plt.savefig(tree_plot_path, dpi=300, bbox_inches='tight')
print("\nโ Decision tree visualization saved!")
# ============================================================
# STEP 6: FEATURE IMPORTANCE
# ============================================================
print("\n" + "="*60)
print("STEP 6: Understanding Feature Importance")
print("="*60)

# One row per input column with the fitted tree's importance score,
# ordered from most to least important.
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': tree_model.feature_importances_
}).sort_values('importance', ascending=False)
print("\n๐ฏ Feature Importance (which factors matter most?):")
print(feature_importance)

importance_plot_path = '/mnt/user-data/outputs/feature_importance.png'

# Horizontal bar chart of the scores.
plt.figure(figsize=(10, 6))
plt.barh(feature_importance['feature'], feature_importance['importance'])
plt.xlabel('Importance Score', fontsize=12)
plt.title('Feature Importance in Decision Making', fontsize=14, fontweight='bold')
plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(importance_plot_path), exist_ok=True)
plt.savefig(importance_plot_path, dpi=300, bbox_inches='tight')
print("\nโ Feature importance plot saved!")
# ============================================================
# STEP 7: CONFUSION MATRIX
# ============================================================
print("\n" + "="*60)
print("STEP 7: Confusion Matrix")
print("="*60)

# A single list drives both the matrix layout and the tick labels, so the
# two cannot drift apart. Passing it as `labels` also keeps the matrix 2x2
# if one class happens to be absent from the test split.
class_labels = ['Library', 'Outdoors']
cm = confusion_matrix(y_test, y_pred_test, labels=class_labels)

confusion_plot_path = '/mnt/user-data/outputs/confusion_matrix.png'

# Rows are actual classes, columns are predicted classes.
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels,
            yticklabels=class_labels)
plt.title('Confusion Matrix: Actual vs Predicted', fontsize=14, fontweight='bold')
plt.ylabel('Actual', fontsize=12)
plt.xlabel('Predicted', fontsize=12)
plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(confusion_plot_path), exist_ok=True)
plt.savefig(confusion_plot_path, dpi=300, bbox_inches='tight')
print("\nโ Confusion matrix saved!")
# ============================================================
# STEP 8: TEST WITH NEW EXAMPLES
# ============================================================
print("\n" + "="*60)
print("STEP 8: Making Predictions with New Data")
print("="*60)

# Four hand-picked scenarios, using the same columns the tree was trained on.
scenario_columns = ['temperature_f', 'humidity_percent', 'wind_mph', 'hour_of_day', 'is_weekend']
scenario_values = [
    (75, 15, 5, 10, 1),
    (108, 20, 10, 14, 0),
    (65, 25, 20, 16, 1),
    (90, 18, 8, 20, 0),
]
test_scenarios = pd.DataFrame(
    [dict(zip(scenario_columns, values)) for values in scenario_values]
)
predictions = tree_model.predict(test_scenarios)

# NOTE(review): several literals below (e.g. 'ยฐF') look like mis-decoded
# characters; they are reproduced unchanged - confirm against the original file.
print("\n๐ฎ Predictions for new scenarios:\n")
scenario_rows = test_scenarios.to_dict('records')
for number, (row, predicted) in enumerate(zip(scenario_rows, predictions), start=1):
    weekend_text = 'Yes' if row['is_weekend'] else 'No'
    print(f"Scenario {number}:")
    print(f" Temperature: {row['temperature_f']}ยฐF")
    print(f" Humidity: {row['humidity_percent']}%")
    print(f" Wind: {row['wind_mph']} mph")
    print(f" Time: {row['hour_of_day']}:00")
    print(f" Weekend: {weekend_text}")
    print(f" โ Recommended location: {predicted}")
    print()
# ============================================================
# SUMMARY FOR STUDENTS
# ============================================================
# Closing section: prints the key takeaways followed by the list of image
# files written by the plotting steps. Pure output, no computation.
print("\n" + "="*60)
print("KEY TAKEAWAYS FOR UNLV STUDENTS")
print("="*60)

takeaways = """
1. **What is a Decision Tree?**
- A flowchart-like model that makes decisions by asking questions
- Easy to interpret and visualize
- Works like playing "20 Questions"
2. **How Does It Work?**
- Starts at the root (top) with all data
- Splits data based on features (temperature, wind, etc.)
- Continues splitting until reaching a decision (leaf node)
3. **Key Concepts:**
- Training: Teaching the model using past examples
- Testing: Checking how well it works on new data
- Overfitting: When the tree memorizes training data (bad!)
- Feature Importance: Which factors matter most
4. **Real-World Applications:**
- Medical diagnosis
- Credit approval
- Customer segmentation
- Game AI
- Weather prediction
5. **Advantages:**
โ Easy to understand and explain
โ Works with both numbers and categories
โ Requires little data preparation
6. **Limitations:**
โ Can overfit if too complex
โ Sensitive to small data changes
โ May not capture complex relationships
"""
print(takeaways)

print("\n" + "="*60)
print("๐ Tutorial Complete!")
print("="*60)

print("\nFiles saved:")
for saved_line in (
    " โข decision_tree_visualization.png",
    " โข feature_importance.png",
    " โข confusion_matrix.png",
):
    print(saved_line)