Update app.py
app.py (CHANGED)
@@ -12,8 +12,10 @@ from typing import Optional, Tuple, Dict, Any
 import logging
 from datetime import datetime
 import re
-
-from
+import base64
+from io import BytesIO
+import weasyprint  # For PDF generation
+from jinja2 import Template  # For HTML templating

 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -24,6 +26,8 @@ class EnhancedDataAnalyzer:
         self.api_base_url = "https://llm.chutes.ai/v1/chat/completions"
         self.max_file_size = 50 * 1024 * 1024  # 50MB limit
         self.conversation_history = []
+        self.current_df = None
+        self.current_charts = None

     def validate_api_key(self, api_key: str) -> bool:
         """Validate API key format"""
@@ -129,7 +133,7 @@ Format your response with clear sections and bullet points for readability."""
             logger.error(f"API Error: {str(e)}")
             return f"β **Connection Error**: {str(e)}"

-    def process_file(self, file_path: str) -> Tuple[pd.DataFrame, str,
+    def process_file(self, file_path: str) -> Tuple[pd.DataFrame, str, str]:
         """Enhanced file processing with better error handling"""
         try:
             file_extension = os.path.splitext(file_path)[1].lower()
@@ -153,11 +157,14 @@ Format your response with clear sections and bullet points for readability."""
             # Clean column names
             df.columns = df.columns.str.strip().str.replace(r'\s+', ' ', regex=True)

+            # Store dataframe for visualizations
+            self.current_df = df
+
             # Generate enhanced summaries
             data_summary = self.generate_enhanced_summary(df)
-
+            charts_html = self.generate_visualizations(df)

-            return df, data_summary,
+            return df, data_summary, charts_html

         except Exception as e:
             raise Exception(f"Error processing file: {str(e)}")
@@ -223,28 +230,231 @@ Format your response with clear sections and bullet points for readability."""

         return "\n".join(summary)

-    def
-        """Generate
-
-
-        # Numerical distribution charts
-        numeric_cols = df.select_dtypes(include=[np.number]).columns
-        if len(numeric_cols) > 0:
-            for col in numeric_cols[:3]:  # First 3 numeric columns
-                fig = px.histogram(df, x=col, title=f"Distribution of {col}")
-                charts[f"hist_{col}"] = fig
+    def generate_visualizations(self, df: pd.DataFrame) -> str:
+        """Generate comprehensive visualizations for the dataset"""
+        charts_html = []

-
-
-
-
-
-
-
-
-
+        try:
+            # Chart 1: Data completeness analysis
+            missing_data = df.isnull().sum()
+            if missing_data.sum() > 0:
+                fig = px.bar(
+                    x=missing_data.index,
+                    y=missing_data.values,
+                    title="π Missing Data Analysis",
+                    labels={'x': 'Columns', 'y': 'Missing Values Count'},
+                    color=missing_data.values,
+                    color_continuous_scale='Reds'
+                )
+                fig.update_layout(
+                    height=400,
+                    showlegend=False,
+                    title_x=0.5,
+                    xaxis_tickangle=-45
+                )
+                charts_html.append(f"<h3>π Data Quality Overview</h3>")
+                charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id="missing_data_chart"))
+
+            # Chart 2: Numerical columns correlation heatmap
+            numeric_cols = df.select_dtypes(include=[np.number]).columns
+            if len(numeric_cols) > 1:
+                corr_matrix = df[numeric_cols].corr()
+                fig = px.imshow(
+                    corr_matrix,
+                    title="π Correlation Matrix - Numerical Variables",
+                    color_continuous_scale='RdBu_r',
+                    aspect="auto",
+                    text_auto=True
+                )
+                fig.update_layout(height=500, title_x=0.5)
+                charts_html.append(f"<h3>π Correlation Analysis</h3>")
+                charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id="correlation_chart"))
+
+            # Chart 3: Distribution plots for numerical columns
+            if len(numeric_cols) > 0:
+                for i, col in enumerate(numeric_cols[:3]):  # First 3 numeric columns
+                    fig = px.histogram(
+                        df,
+                        x=col,
+                        title=f"π Distribution: {col}",
+                        marginal="box",
+                        nbins=30
+                    )
+                    fig.update_layout(height=400, title_x=0.5)
+                    if i == 0:
+                        charts_html.append(f"<h3>π Data Distributions</h3>")
+                    charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id=f"dist_chart_{i}"))
+
+            # Chart 4: Categorical analysis
+            categorical_cols = df.select_dtypes(include=['object', 'category']).columns
+            if len(categorical_cols) > 0:
+                for i, col in enumerate(categorical_cols[:2]):  # First 2 categorical columns
+                    if df[col].nunique() <= 20:  # Only if reasonable number of categories
+                        value_counts = df[col].value_counts().head(10)
+                        fig = px.bar(
+                            x=value_counts.values,
+                            y=value_counts.index,
+                            orientation='h',
+                            title=f"π Top 10 Values: {col}",
+                            labels={'x': 'Count', 'y': col}
+                        )
+                        fig.update_layout(height=400, title_x=0.5)
+                        if i == 0:
+                            charts_html.append(f"<h3>π Categorical Data Analysis</h3>")
+                        charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id=f"cat_chart_{i}"))
+
+            # Chart 5: Data overview summary
+            summary_data = {
+                'Metric': ['Total Rows', 'Total Columns', 'Numeric Columns', 'Categorical Columns', 'Missing Values'],
+                'Count': [
+                    len(df),
+                    len(df.columns),
+                    len(numeric_cols),
+                    len(categorical_cols),
+                    df.isnull().sum().sum()
+                ]
+            }
+
+            fig = px.bar(
+                summary_data,
+                x='Metric',
+                y='Count',
+                title="π Dataset Overview",
+                color='Count',
+                color_continuous_scale='Blues'
+            )
+            fig.update_layout(height=400, title_x=0.5, showlegend=False)
+            charts_html.append(f"<h3>π Dataset Overview</h3>")
+            charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id="overview_chart"))
+
+            # Store charts for export
+            self.current_charts = charts_html
+
+            return "\n".join(charts_html) if charts_html else "<p>No charts could be generated for this dataset.</p>"

-
+        except Exception as e:
+            logger.error(f"Chart generation error: {str(e)}")
+            return f"<p>β Chart generation failed: {str(e)}</p>"
+
+    def generate_report_html(self, analysis_text: str, data_summary: str, file_name: str = "Unknown") -> str:
+        """Generate HTML report with embedded charts"""
+
+        html_template = """
+        <!DOCTYPE html>
+        <html>
+        <head>
+            <meta charset="UTF-8">
+            <title>Data Analysis Report</title>
+            <style>
+                body {
+                    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
+                    line-height: 1.6;
+                    color: #333;
+                    max-width: 1200px;
+                    margin: 0 auto;
+                    padding: 20px;
+                    background: #f8f9fa;
+                }
+                .header {
+                    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+                    color: white;
+                    padding: 30px;
+                    border-radius: 10px;
+                    margin-bottom: 30px;
+                    text-align: center;
+                }
+                .section {
+                    background: white;
+                    padding: 25px;
+                    margin-bottom: 20px;
+                    border-radius: 8px;
+                    box-shadow: 0 2px 10px rgba(0,0,0,0.1);
+                }
+                .chart-container {
+                    margin: 20px 0;
+                    padding: 15px;
+                    background: #f8f9ff;
+                    border-radius: 8px;
+                    border-left: 4px solid #667eea;
+                }
+                h1, h2, h3 { color: #2c3e50; }
+                .metadata {
+                    background: #e8f4f8;
+                    padding: 15px;
+                    border-radius: 5px;
+                    margin-bottom: 20px;
+                }
+                .footer {
+                    text-align: center;
+                    color: #666;
+                    margin-top: 40px;
+                    padding: 20px;
+                    background: #f1f1f1;
+                    border-radius: 5px;
+                }
+                pre {
+                    background: #f4f4f4;
+                    padding: 15px;
+                    border-radius: 5px;
+                    overflow-x: auto;
+                    white-space: pre-wrap;
+                }
+            </style>
+        </head>
+        <body>
+            <div class="header">
+                <h1>π Smart Data Analysis Report</h1>
+                <p>Comprehensive AI-Powered Data Insights</p>
+            </div>
+
+            <div class="metadata">
+                <strong>π File:</strong> {{ file_name }}<br>
+                <strong>π Generated:</strong> {{ timestamp }}<br>
+                <strong>π€ Model:</strong> OpenAI gpt-oss-20b via Chutes AI
+            </div>
+
+            <div class="section">
+                <h2>π― AI Analysis & Insights</h2>
+                <div>{{ ai_analysis }}</div>
+            </div>
+
+            <div class="section">
+                <h2>π Visualizations</h2>
+                <div class="chart-container">
+                    {{ charts_html }}
+                </div>
+            </div>
+
+            <div class="section">
+                <h2>π Technical Data Summary</h2>
+                <pre>{{ data_summary }}</pre>
+            </div>
+
+            <div class="footer">
+                <p>Report generated by Smart Data Analyzer Pro β’ Powered by AI</p>
+                <p>For questions or support, visit chutes.ai</p>
+            </div>
+        </body>
+        </html>
+        """
+
+        template = Template(html_template)
+
+        # Convert markdown to HTML for AI analysis
+        ai_analysis_html = analysis_text.replace('\n', '<br>')
+        ai_analysis_html = re.sub(r'\*\*(.*?)\*\*', r'<strong>\1</strong>', ai_analysis_html)
+        ai_analysis_html = re.sub(r'## (.*?)\n', r'<h3>\1</h3>', ai_analysis_html)
+        ai_analysis_html = re.sub(r'# (.*?)\n', r'<h2>\1</h2>', ai_analysis_html)
+
+        charts_content = "\n".join(self.current_charts) if self.current_charts else "<p>No visualizations available</p>"
+
+        return template.render(
+            file_name=file_name,
+            timestamp=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+            ai_analysis=ai_analysis_html,
+            charts_html=charts_content,
+            data_summary=data_summary
+        )

 # Initialize the analyzer
 analyzer = EnhancedDataAnalyzer()
@@ -252,31 +462,29 @@ analyzer = EnhancedDataAnalyzer()
 async def analyze_data(file, api_key, user_question="", progress=gr.Progress()):
     """Enhanced analysis function with progress tracking"""
     if not file:
-        return "β Please upload a CSV or Excel file.", "", "", None
-
+        return "β Please upload a CSV or Excel file.", "", "", "", None
+
     if not analyzer.validate_api_key(api_key):
-        return "β Please enter a valid Chutes API key (minimum 10 characters).", "", "", None
-
+        return "β Please enter a valid Chutes API key (minimum 10 characters).", "", "", "", None
+
     # Validate file
     is_valid, validation_msg = analyzer.validate_file(file)
     if not is_valid:
-        return f"β {validation_msg}", "", "", None
-
+        return f"β {validation_msg}", "", "", "", None
+
     progress(0.1, desc="π Reading file...")
-
+
     try:
         # Process the uploaded file
-        df, data_summary,
+        df, data_summary, charts_html = analyzer.process_file(file.name)
         progress(0.3, desc="π Processing data...")
-
-        # Generate visualizations
-        chart_html = create_basic_charts(df)
+
         progress(0.5, desc="π€ Generating AI insights...")
-
+
         # Get AI analysis
         ai_analysis = await analyzer.analyze_with_chutes(api_key, data_summary, user_question)
         progress(0.9, desc="β¨ Finalizing results...")
-
+
         # Format the complete response
         response = f"""# π― Analysis Complete!

@@ -286,57 +494,46 @@ async def analyze_data(file, api_key, user_question="", progress=gr.Progress()):
 *Analysis powered by OpenAI gpt-oss-20b via Chutes β’ Generated at {datetime.now().strftime('%H:%M:%S')}*
 """

+        # Generate data preview
+        data_preview_html = df.head(15).to_html(
+            classes="table table-striped table-hover",
+            table_id="data-preview-table",
+            escape=False
+        )
+
+        # Add some styling to the preview
+        styled_preview = f"""
+        <style>
+            #data-preview-table {{
+                width: 100%;
+                border-collapse: collapse;
+                margin: 20px 0;
+                font-size: 14px;
+            }}
+            #data-preview-table th {{
+                background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+                color: white;
+                padding: 12px 8px;
+                text-align: left;
+                font-weight: bold;
+            }}
+            #data-preview-table td {{
+                padding: 10px 8px;
+                border-bottom: 1px solid #ddd;
+            }}
+            #data-preview-table tr:hover {{
+                background-color: #f5f5f5;
+            }}
+        </style>
+        {data_preview_html}
+        """
+
         progress(1.0, desc="β Done!")
-        return response, data_summary,
-
-    except Exception as e:
-        logger.error(f"Analysis error: {str(e)}")
-        return f"β **Error**: {str(e)}", "", "", None
+        return response, data_summary, styled_preview, charts_html, file.name

-def create_basic_charts(df: pd.DataFrame) -> str:
-    """Create basic visualizations for the dataset"""
-    charts_html = []
-
-    try:
-        # Chart 1: Data completeness heatmap
-        missing_data = df.isnull().sum()
-        if missing_data.sum() > 0:
-            fig = px.bar(x=missing_data.index, y=missing_data.values,
-                         title="Missing Data by Column",
-                         labels={'x': 'Columns', 'y': 'Missing Count'})
-            fig.update_layout(height=400, showlegend=False)
-            charts_html.append(fig.to_html(include_plotlyjs='cdn'))
-
-        # Chart 2: Numerical columns correlation (if multiple numeric columns)
-        numeric_cols = df.select_dtypes(include=[np.number]).columns
-        if len(numeric_cols) > 1:
-            corr_matrix = df[numeric_cols].corr()
-            fig = px.imshow(corr_matrix,
-                            title="Correlation Matrix",
-                            color_continuous_scale='RdBu_r',
-                            aspect="auto")
-            fig.update_layout(height=500)
-            charts_html.append(fig.to_html(include_plotlyjs='cdn'))
-
-        # Chart 3: Distribution of first numeric column
-        if len(numeric_cols) > 0:
-            first_numeric = numeric_cols[0]
-            fig = px.histogram(df, x=first_numeric,
-                               title=f"Distribution: {first_numeric}",
-                               marginal="box")
-            fig.update_layout(height=400)
-            charts_html.append(fig.to_html(include_plotlyjs='cdn'))
-
-        # Additional charts from generate_chart_data
-        charts_data = analyzer.generate_chart_data(df)
-        for key, fig in charts_data.items():
-            charts_html.append(fig.to_html(include_plotlyjs='cdn'))
-
-        return "\n".join(charts_html) if charts_html else "<p>No charts generated for this dataset.</p>"
-
     except Exception as e:
-        logger.error(f"
-        return f"
+        logger.error(f"Analysis error: {str(e)}")
+        return f"β **Error**: {str(e)}", "", "", "", None

 def sync_analyze_data(file, api_key, user_question="", progress=gr.Progress()):
     """Synchronous wrapper for the async analyze function"""
@@ -344,15 +541,43 @@ def sync_analyze_data(file, api_key, user_question="", progress=gr.Progress()):

 def clear_all():
     """Clear all inputs and outputs"""
-
+    analyzer.current_df = None
+    analyzer.current_charts = None
+    return None, "", "", "", "", "", "", None

-def
-    """Generate downloadable
+def download_report(analysis_text, data_summary, file_name, format_choice):
+    """Generate downloadable report in PDF or HTML format"""
     if not analysis_text:
-        return None
+        return None, "β No analysis data available for download."
+
+    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+    file_base_name = os.path.splitext(file_name)[0] if file_name else "data_analysis"

-
+    try:
+        if format_choice == "HTML":
+            # Generate HTML report
+            html_content = analyzer.generate_report_html(analysis_text, data_summary, file_name)
+            filename = f"{file_base_name}_analysis_report_{timestamp}.html"
+
+            with open(filename, 'w', encoding='utf-8') as f:
+                f.write(html_content)
+
+            return filename, f"β HTML report generated successfully! File: {filename}"
+
+        elif format_choice == "PDF":
+            # Generate PDF report
+            html_content = analyzer.generate_report_html(analysis_text, data_summary, file_name)
+            filename = f"{file_base_name}_analysis_report_{timestamp}.pdf"
+
+            # Convert HTML to PDF using weasyprint
+            weasyprint.HTML(string=html_content).write_pdf(filename)
+
+            return filename, f"β PDF report generated successfully! File: {filename}"
+
+        else:  # Markdown fallback
+            report = f"""# Data Analysis Report
 Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
+File: {file_name}

 ## AI Analysis:
 {analysis_text}
@@ -360,35 +585,15 @@ Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
 ## Raw Data Summary:
 {data_summary}
 """
-
-    base_filename = f"data_analysis_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
-    filename = None
-
-    try:
-        if format_choice == "PDF":
-            # Convert MD to HTML first
-            report_html = markdown.markdown(report_md)
-            # Wrap in basic HTML structure for better PDF rendering
-            full_html = f"""
-            <html>
-            <head><style>body {{ font-family: Arial, sans-serif; }}</style></head>
-            <body>{report_html}</body>
-            </html>
-            """
-            filename = base_filename + ".pdf"
-            WeasyHTML(string=full_html).write_pdf(filename)
-
-        elif format_choice == "HTML":
-            report_html = markdown.markdown(report_md, output_format='html5')
-            filename = base_filename + ".html"
+            filename = f"{file_base_name}_analysis_report_{timestamp}.md"
             with open(filename, 'w', encoding='utf-8') as f:
-                f.write(
-
-
-
+                f.write(report)
+
+            return filename, f"β Markdown report generated successfully! File: {filename}"
+
     except Exception as e:
-        logger.error(f"
-        return None
+        logger.error(f"Report generation error: {str(e)}")
+        return None, f"β Error generating report: {str(e)}"

 # Create enhanced Gradio interface
 with gr.Blocks(
@@ -408,15 +613,25 @@ with gr.Blocks(
         text-align: center;
         background: #f8f9ff;
     }
+    .charts-container {
+        max-height: 800px;
+        overflow-y: auto;
+        padding: 10px;
+        background: #fafafa;
+        border-radius: 8px;
+    }
     """
 ) as app:

+    # Store file name for downloads
+    current_file_name = gr.State("")
+
     # Header
     gr.Markdown("""
     # π Smart Data Analyzer Pro
     ### AI-Powered Excel & CSV Analysis with OpenAI gpt-oss-20b

-    Upload your data files and get instant professional insights, visualizations, and
+    Upload your data files and get instant professional insights, visualizations, and downloadable reports!
     """)

     # Main interface
@@ -483,7 +698,8 @@ with gr.Blocks(
                 with gr.Tab("π Visualizations"):
                     charts_output = gr.HTML(
                         label="Auto-Generated Charts",
-                        value="<p
+                        value="<div class='charts-container'><p>π Interactive charts will appear here after analysis...</p></div>",
+                        elem_classes=["charts-container"]
                     )

                 with gr.Tab("π Raw Summary"):
@@ -494,15 +710,20 @@
                         show_copy_button=True
                     )

-                with gr.Tab("πΎ Export"):
-                    gr.Markdown("### Download Your Analysis Report")
-
-
-
-
-
-
-
+                with gr.Tab("πΎ Export Reports"):
+                    gr.Markdown("### π₯ Download Your Analysis Report")
+
+                    with gr.Row():
+                        format_choice = gr.Radio(
+                            choices=["HTML", "PDF", "Markdown"],
+                            value="HTML",
+                            label="π Report Format",
+                            info="Choose your preferred download format"
+                        )
+
+                    download_btn = gr.Button("π₯ Generate & Download Report", variant="primary", size="lg")
+                    download_status = gr.Textbox(label="Download Status", interactive=False)
+                    download_file = gr.File(label="π Download Link", visible=True)

     # Event handlers
     def update_file_stats(file):
@@ -516,19 +737,35 @@
         except:
             return "File information unavailable"

-
+    def handle_analysis(file, api_key, user_question="", progress=gr.Progress()):
+        """Handle main analysis and return all outputs including file name"""
+        result = sync_analyze_data(file, api_key, user_question, progress)
+        if len(result) == 5:  # Check if file name was returned
+            return result[0], result[1], result[2], result[3], result[4]  # analysis, summary, preview, charts, filename
+        else:
+            return result[0], result[1], result[2], result[3], ""  # fallback without filename
+
+    def handle_question_analysis(file, api_key, question, progress=gr.Progress()):
+        """Handle question-specific analysis"""
+        if not question.strip():
+            return "β Please enter a specific question about your data."
+
+        result = sync_analyze_data(file, api_key, question, progress)
+        return result[0]  # Return only the analysis output
+
+    # Main analysis event
     analyze_btn.click(
-        fn=
+        fn=handle_analysis,
         inputs=[file_input, api_key_input, gr.Textbox(value="", visible=False)],
-        outputs=[analysis_output, raw_summary, data_preview, charts_output],
+        outputs=[analysis_output, raw_summary, data_preview, charts_output, current_file_name],
         show_progress=True
     )

     # Follow-up questions
     ask_btn.click(
-        fn=
+        fn=handle_question_analysis,
         inputs=[file_input, api_key_input, question_input],
-        outputs=[question_output
+        outputs=[question_output],
         show_progress=True
     )

@@ -543,14 +780,14 @@
     clear_btn.click(
         fn=clear_all,
         outputs=[file_input, api_key_input, question_input, analysis_output,
-                 question_output, data_preview, charts_output]
+                 question_output, data_preview, charts_output, raw_summary]
     )

-    #
+    # Enhanced download functionality
     download_btn.click(
-        fn=
-        inputs=[analysis_output, raw_summary, format_choice],
-        outputs=[download_file]
+        fn=download_report,
+        inputs=[analysis_output, raw_summary, current_file_name, format_choice],
+        outputs=[download_file, download_status]
     )

     # Footer with usage tips
@@ -563,6 +800,18 @@
     - Use descriptive column names
     - Ask specific questions like "What drives the highest profits?" instead of "Analyze this data"

+    **π Visualizations Include:**
+    - Missing data analysis
+    - Correlation matrices for numerical data
+    - Distribution plots and histograms
+    - Top categories for categorical data
+    - Dataset overview metrics
+
+    **π₯ Export Options:**
+    - **HTML**: Interactive report with embedded charts
+    - **PDF**: Professional report for presentations
+    - **Markdown**: Simple text format for documentation
+
     **β‘ Speed Optimization:**
     - Files under 10MB process fastest
     - CSV files typically load faster than Excel
@@ -571,14 +820,13 @@
     **π§ Supported Formats:** CSV, XLSX, XLS | **π Max Size:** 50MB | **π Response Time:** ~3-5 seconds
     """)

+def sync_analyze_data(file, api_key, user_question="", progress=gr.Progress()):
+    """Synchronous wrapper for the async analyze function"""
+    return asyncio.run(analyze_data(file, api_key, user_question, progress))
+
 # Launch configuration
 if __name__ == "__main__":
     app.queue(max_size=10)  # Handle multiple users
     app.launch(
-        share=True
-        server_name="0.0.0.0",
-        server_port=7860,
-        show_error=True,
-        favicon_path=None,
-        ssl_verify=False
+        share=True
     )
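The report-export path this commit introduces renders a Jinja2 template to HTML and, for the PDF option, converts that HTML with WeasyPrint. The following is a minimal standalone sketch of that flow, not code from app.py: the function name `build_report`, the template contents, and the file naming are illustrative placeholders, and it assumes `jinja2` and `weasyprint` (including WeasyPrint's native dependencies) are installed.

```python
from datetime import datetime

import weasyprint
from jinja2 import Template

# Illustrative template; the real one in app.py is far more elaborate.
REPORT_TEMPLATE = Template("""
<html>
  <head><meta charset="utf-8"><title>{{ title }}</title></head>
  <body>
    <h1>{{ title }}</h1>
    <p>Generated: {{ timestamp }}</p>
    <pre>{{ data_summary }}</pre>
  </body>
</html>
""")


def build_report(title: str, data_summary: str, as_pdf: bool = False) -> str:
    """Render the template and write it to disk; return the output filename."""
    html = REPORT_TEMPLATE.render(
        title=title,
        timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        data_summary=data_summary,
    )
    stem = f"report_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    if as_pdf:
        filename = stem + ".pdf"
        # HTML-to-PDF conversion, mirroring the commit's PDF branch.
        weasyprint.HTML(string=html).write_pdf(filename)
    else:
        filename = stem + ".html"
        with open(filename, "w", encoding="utf-8") as f:
            f.write(html)
    return filename
```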
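The new `sync_analyze_data` wrapper at the bottom of the file drives the async `analyze_data` coroutine with `asyncio.run`. A minimal sketch of that sync-over-async pattern, using a stand-in coroutine rather than the app's real function, looks like this; note that `asyncio.run` raises if an event loop is already running in the calling thread, so the wrapper assumes it is invoked from plain synchronous code.

```python
import asyncio


async def analyze(question: str) -> str:
    """Stand-in for an awaited LLM/API call."""
    await asyncio.sleep(0)  # placeholder for real async I/O
    return f"answer to: {question}"


def analyze_sync(question: str) -> str:
    """Synchronous wrapper, e.g. for a UI callback that cannot await."""
    return asyncio.run(analyze(question))


if __name__ == "__main__":
    print(analyze_sync("What drives the highest profits?"))
```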