"""AutoDS-MCP Enterprise Agent.

Command-line orchestrator that prepares an input dataset (optionally converting
an unstructured file or buffering raw CSV text to disk), resolves the target
column, and then runs the pipeline scripts under ``mcp_servers/`` one after
another as subprocesses. A failing step may be handed to an LLM once for an
automatic repair attempt before the pipeline gives up.
"""
import argparse
import io
import json
import logging
import os
import re
import subprocess
import sys
import time
from datetime import datetime  # noqa: F401 - unused here; retained from the original import list

# Endpoint and model used by heal_code().
GROQ_CHAT_URL = "https://api.groq.com/openai/v1/chat/completions"
GROQ_MODEL = "llama-3.3-70b-versatile"
# Upper bound (seconds) for the repair request so a stalled API cannot hang the pipeline.
HEAL_REQUEST_TIMEOUT_S = 60

# Inputs with these extensions are first converted to CSV by gemini_ingest.py.
UNSTRUCTURED_EXTENSIONS = ('.pdf', '.png', '.jpg', '.jpeg')

# Column names (exact, or as a "_<keyword>" suffix) treated as a likely target.
TARGET_KEYWORDS = ['target', 'label', 'outcome', 'buy_again', 'loyalty', 'default',
                   'churn', 'status', 'risk', 'class', 'y', 'result']

# Opening markdown fence with an optional language tag, e.g. "```", "```py", "```python".
_OPENING_FENCE = re.compile(r"^```[A-Za-z0-9_+-]*[ \t]*\n?")


def _force_utf8_stream(stream_name):
    """Make ``sys.<stream_name>`` emit UTF-8, replacing unencodable characters.

    The stream is reconfigured in place when possible so that every object
    already holding a reference to it (including logging handlers) benefits.
    This is best-effort: a stream that cannot be switched is left untouched.

    Returns:
        True when the stream is (now) UTF-8, False when it could not be changed.
    """
    stream = getattr(sys, stream_name, None)
    if stream is None:
        return False
    encoding = (getattr(stream, "encoding", None) or "").lower()
    if encoding.replace("-", "").replace("_", "") == "utf8":
        return True
    try:
        if hasattr(stream, "reconfigure"):
            stream.reconfigure(encoding="utf-8", errors="replace")
        else:
            wrapped = io.TextIOWrapper(stream.buffer, encoding="utf-8", errors="replace")
            setattr(sys, stream_name, wrapped)
    except (AttributeError, ValueError, OSError):
        # Exotic stream (no buffer / not reconfigurable): keep it as it is.
        return False
    return True


# Force stdout/stderr to UTF-8 (robust for Windows cp1252 consoles). This must
# happen BEFORE the StreamHandler below captures sys.stdout, otherwise log
# records would still be written through the old encoding.
_force_utf8_stream("stdout")
_force_utf8_stream("stderr")

# Configure Enterprise Logging - file handler is UTF-8 to avoid Windows cp1252 emoji issues
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - [AutoDS-Enterprise] - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("autods_agent.log", encoding='utf-8'),
        logging.StreamHandler(sys.stdout),
    ],
)

logger = logging.getLogger(__name__)


def _find_script_path(command):
    """Return the first ``.py`` token of *command* (string or argv list), or None."""
    if isinstance(command, str):
        parts = command.split(' ')
    else:
        parts = [str(part) for part in command]
    return next((part for part in parts if part.endswith('.py')), None)


def _strip_code_fences(text):
    """Remove a surrounding markdown code fence (any language tag) from *text*."""
    cleaned = _OPENING_FENCE.sub("", text.strip(), count=1)
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]
    return cleaned.strip()


def heal_code(command, error_msg):
    """Ask an LLM to repair the script referenced by a failed command.

    The first ``.py`` token found in *command* is read, sent to the Groq chat
    completions API together with *error_msg*, and overwritten in place with
    the model's answer. The API key is read from ``keys_config.json`` next to
    this file (key ``"groq"``, either a string or a list whose first entry is used).

    Args:
        command: The failed command, as a shell string or an argv list.
        error_msg: Combined stdout/stderr of the failed run.

    Returns:
        True when the script was rewritten, False otherwise (no script in the
        command, script missing, no key, API problem, empty answer, or any
        exception - this function never raises).
    """
    script_path = _find_script_path(command)
    if not script_path or not os.path.exists(script_path):
        return False

    logger.warning(f"[HEALER] Code crashed! Attempting to self-heal '{script_path}' using LLM...")
    try:
        # Imported inside the try: this runs from run_step()'s exception
        # handler, where a missing package must not escape as ImportError.
        import requests

        keys_path = os.path.join(os.path.dirname(__file__), "keys_config.json")
        with open(keys_path, encoding='utf-8') as f:
            keys = json.load(f)
        groq_key = keys.get("groq", "")
        if isinstance(groq_key, list):
            groq_key = groq_key[0] if groq_key else ""
        if not groq_key:
            return False

        with open(script_path, 'r', encoding='utf-8') as f:
            code = f.read()

        prompt = (
            f"The following Python script '{script_path}' failed with this error:\n\n"
            f"{error_msg}\n\n"
            "Here is the code:\n```python\n"
            f"{code}\n```\n\n"
            "Please fix the bug so it doesn't crash. "
            "Reply ONLY with the fully corrected python code. "
            "Do not use markdown blocks."
        )

        res = requests.post(
            GROQ_CHAT_URL,
            headers={"Authorization": f"Bearer {groq_key}"},
            json={
                "model": GROQ_MODEL,
                "messages": [{"role": "user", "content": prompt}],
                "temperature": 0.1,
            },
            timeout=HEAL_REQUEST_TIMEOUT_S,
        )
        res_json = res.json()
        if 'choices' not in res_json:
            logger.error(f"[HEALER] API Error: {res_json}")
            return False

        # The model is told not to use markdown, but strip a fence if it did anyway.
        fixed_code = _strip_code_fences(res_json['choices'][0]['message']['content'])
        if not fixed_code:
            return False

        with open(script_path, 'w', encoding='utf-8') as f:
            f.write(fixed_code)

        logger.info(f"[HEALER] Neural code injection successful for '{script_path}'. Resuming...")
        return True
    except Exception as e:  # deliberate best-effort boundary: healing must never crash the caller
        logger.warning(f"[HEALER] Self-healing failed: {e}")
        return False


def run_step(step_name, command, agent="master", auto_heal_attempts=1):
    """Run one pipeline step as a subprocess, optionally self-healing on failure.

    Args:
        step_name: Human-readable name used in log and console output.
        command: Either a shell string (executed through the shell, as before)
            or an argv list (executed directly, so arguments containing spaces
            or shell metacharacters are passed through verbatim).
        agent: Agent identifier echoed in the console banner.
        auto_heal_attempts: How many times a non-zero exit may be followed by
            a heal_code() attempt and a retry.

    Returns:
        True when the command eventually exits with status 0, False otherwise.
        The step's stdout is printed on success so a wrapping process can parse it.
    """
    print(f"\n[AGENT_ID: {agent}] Starting specialized task: {step_name}")
    logger.info(f"[STEP] Executing: {step_name} with agent {agent}")
    logger.debug(f"Command: {command}")

    # Only string commands need a shell; lists are executed without one.
    use_shell = isinstance(command, str)
    start_time = time.time()

    for attempt in range(auto_heal_attempts + 1):
        try:
            result = subprocess.run(command, shell=use_shell, check=True, text=True,
                                    capture_output=True, encoding='utf-8', errors='replace')
        except subprocess.CalledProcessError as e:
            error_msg = f"{e.stdout}\n{e.stderr}".strip()
            logger.error(f"[FAIL] Error in {step_name}: {error_msg}")
            if attempt < auto_heal_attempts and heal_code(command, error_msg):
                continue  # script was rewritten; run it again
            return False
        except Exception as e:  # e.g. executable not found
            logger.critical(f"[CRITICAL] Unexpected error in {step_name}: {str(e)}")
            return False
        else:
            duration = time.time() - start_time
            if attempt > 0:
                logger.info(f"[OK] Step '{step_name}' completed in {duration:.2f}s after {attempt} self-healing cycle(s)!")
            else:
                logger.info(f"[OK] Step '{step_name}' completed in {duration:.2f}s")

            # Print output so bridge server can parse metrics if any
            if result.stdout:
                print(result.stdout)
            logger.debug(f"Output:\n{result.stdout}")
            return True

    # Reached only when auto_heal_attempts is negative and nothing was run.
    return False


def validate_config(config):
    """Return True when *config* contains every required key, logging the first missing one."""
    required_keys = ["input_csv", "target_column", "data_dir"]
    for key in required_keys:
        if key not in config:
            logger.error(f"[CONFIG] Missing required key: {key}")
            return False
    return True


def post_to_moltbook(message):
    """Announce *message* by logging it and printing the moltbook-post skill line."""
    logger.info("[UPLINK] Posting update to Moltbook (lablab submolt)...")
    logger.info(f"[MOLTBOOK] {message}")
    print(f"\n[OPENCLAW-SKILL] moltbook-post: {message}")


def find_target_autonomous(csv_path, current_target):
    """Guess the target column of a CSV with a three-tier heuristic.

    Tier 1: first column whose name equals, or ends with ``_`` plus, a known keyword.
    Tier 2: first numeric column with at most 10 distinct values whose mean
            absolute correlation with the other numeric columns exceeds 0.3.
    Tier 3: the last column.

    Only the first 500 rows are read. Returned names are whitespace-stripped.

    Returns:
        The chosen column name, or *current_target* if anything goes wrong.
    """
    try:
        import pandas as pd
        import numpy as np

        df_full = pd.read_csv(csv_path, nrows=500)  # Sample for speed
        raw_cols = df_full.columns.tolist()
        actual_cols = [c.strip() for c in raw_cols]

        # Tier 1: Keyword Match (Semantic Search)
        for col in actual_cols:
            lowered = col.lower()
            if any(kw == lowered or lowered.endswith(f'_{kw}') for kw in TARGET_KEYWORDS):
                logger.info(f"[AGENT-T1] Semantic match. Target locked: '{col}'")
                return col

        # Tier 2: Correlation Matrix (first low-cardinality column that correlates with the rest)
        numeric_df = df_full.select_dtypes(include=[np.number])
        if len(numeric_df.columns) > 1:
            corr = numeric_df.corr().abs()
            # Look columns up by their raw (unstripped) name, report the stripped one.
            for raw_col, col in zip(raw_cols, actual_cols):
                if raw_col in numeric_df.columns and numeric_df[raw_col].nunique() <= 10:
                    avg_corr = corr[raw_col].drop(raw_col).mean()
                    if avg_corr > 0.3:
                        logger.info(f"[AGENT-T2] Correlation matrix analysis. Target locked: '{col}' (avg_corr={avg_corr:.2f})")
                        return col

        # Tier 3: Final Fallback — Last column (universal ML tabular convention)
        last_col = actual_cols[-1]
        logger.info(f"[AGENT-T3] Last-column fallback. Target locked: '{last_col}'")
        return last_col
    except Exception as e:  # best-effort heuristic: fall back to the configured target
        logger.warning(f"[AGENT] Heuristic Brain failed: {e}")
        return current_target


def _parse_args():
    """Parse the agent's command-line arguments."""
    parser = argparse.ArgumentParser(description="AutoDS-MCP Enterprise Agent")
    parser.add_argument("--config", default="config.json", help="Path to configuration file")
    parser.add_argument("--mode", default="production", choices=["dev", "production"], help="Execution mode")
    parser.add_argument("--csv", help="Override input CSV path")
    parser.add_argument("--task", default="full", choices=["full", "intel", "train", "predict"], help="Agent Task")
    return parser.parse_args()


def _load_config(config_path):
    """Load the JSON config; defaults if the file is missing, None if it is unusable."""
    try:
        with open(config_path, 'r', encoding='utf-8') as f:
            config = json.load(f)
    except FileNotFoundError:
        logger.warning("[CONFIG] Config file not found, using defaults.")
        return {"target_column": "target", "data_dir": "data"}
    except (OSError, ValueError) as e:  # ValueError covers invalid JSON and bad encoding
        logger.critical(f"[CONFIG] Could not read config file '{config_path}': {e}")
        return None
    if not isinstance(config, dict):
        logger.critical(f"[CONFIG] Config file '{config_path}' must contain a JSON object.")
        return None
    return config


def _rebuild_flattened_csv(raw):
    """Re-insert line breaks into CSV text whose newlines were turned into spaces.

    The first comma-separated field containing inner whitespace is taken as the
    junction "<last header> <first value>", which fixes the column count. From
    then on every row-final field that is followed by more data is split the
    same way ("<last value> <next row's first value>"). A trailing incomplete
    row is dropped.

    Returns:
        ``(csv_text, column_count)``, or None when no junction is found.
    """
    parts = raw.split(',')
    for idx, part in enumerate(parts):
        pieces = part.strip().split(None, 1)
        if len(pieces) == 2:
            break
    else:
        return None

    h_count = idx + 1
    headers = parts[:idx] + [pieces[0]]
    pending = [pieces[1]] + parts[h_count:]

    values = []
    last_index = len(pending) - 1
    for pos, token in enumerate(pending):
        closes_row = len(values) % h_count == h_count - 1
        # The very last token has nothing after it, so it cannot be a junction.
        if closes_row and pos != last_index:
            pieces = token.strip().split(None, 1)
            if len(pieces) == 2:
                values.extend(pieces)
                continue
        values.append(token)

    csv_lines = [",".join(headers)]
    for start in range(0, len(values), h_count):
        row = values[start:start + h_count]
        if len(row) == h_count:
            csv_lines.append(",".join(row))
    return "\n".join(csv_lines), h_count


def _buffer_raw_csv(input_csv, data_dir):
    """Write raw CSV text to ``<data_dir>/neural_ingest.csv`` and return that path.

    *input_csv* is treated as raw data when it is not an existing path and
    contains a comma, tab or newline; otherwise it is returned unchanged.
    """
    is_path = os.path.exists(input_csv)
    has_delimiters = any(delim in input_csv for delim in (",", "\t", "\n"))
    if is_path or not has_delimiters:
        return input_csv

    ingest_path = os.path.join(data_dir, "neural_ingest.csv")

    # AGENTIC REPAIR: Check if CSV is flattened into a single line (common for clipboard)
    if "\n" not in input_csv.strip() and "," in input_csv:
        logger.info("[AGENT] Flattened stream detected. Initiating line-wrap reconstruction...")
        rebuilt = _rebuild_flattened_csv(input_csv)
        if rebuilt is not None:
            input_csv, h_count = rebuilt
            logger.info(f"[AGENT] Synthetic row structure recovered: {h_count} columns found.")

    logger.info(f"[INGEST] Raw reconnaissance data detected. Buffering to: {ingest_path}")
    with open(ingest_path, "w", encoding="utf-8") as f:
        f.write(input_csv.strip())
    return ingest_path


def _resolve_target_column(input_csv, target_col):
    """Return *target_col* if the CSV header contains it, else a heuristically chosen column."""
    try:
        import pandas as pd

        header_df = pd.read_csv(input_csv, nrows=0)
        actual_cols = [c.strip() for c in header_df.columns.tolist()]
        if target_col not in actual_cols:
            return find_target_autonomous(input_csv, target_col)
        logger.info(f"[AGENT] Target column '{target_col}' confirmed in dataset.")
    except Exception as e:  # validation is advisory; continue with the configured target
        logger.warning(f"[AGENT] Target validation failed: {e}")
    return target_col


def _script_command(script, *script_args):
    """Build an argv list that runs ``mcp_servers/<script>`` with this interpreter.

    An argv list (rather than a shell string) keeps paths and column names
    containing spaces or shell metacharacters intact.
    """
    interpreter = sys.executable or "python"
    return [interpreter, f"mcp_servers/{script}", *(str(arg) for arg in script_args)]


def _build_steps(task, input_csv, data_dir, target_col):
    """Return the ordered step definitions for *task*, or None if it is not implemented."""
    if task == "intel":
        return [
            {"name": "Neural Intel Scan",
             "command": _script_command("intel_scan.py", input_csv, f"{data_dir}/intelligence.json"),
             "agent": "intel"},
        ]
    if task in ("full", "train"):
        return [
            {"name": "Data Intelligence Pre-Scan",
             "command": _script_command("intel_scan.py", input_csv, f"{data_dir}/intelligence.json"),
             "agent": "intel"},
            {"name": "Web Contextual Enrichment",
             "command": _script_command("web_enrichment.py", input_csv, f"{data_dir}/enriched.csv"),
             "agent": "data"},
            {"name": "Industrial Anomaly Scan (Vision Unit)",
             "command": _script_command("vision_agent.py", f"{data_dir}/enriched.csv", f"{data_dir}/anomalies.json"),
             "agent": "vision"},
            {"name": "Neural Data Cleaning",
             "command": _script_command("cleaner.py", f"{data_dir}/enriched.csv", f"{data_dir}/cleaned.csv"),
             "agent": "data"},
            {"name": "Feature Engineering",
             "command": _script_command("features.py", f"{data_dir}/cleaned.csv", f"{data_dir}/features.csv", target_col),
             "agent": "data"},
            {"name": "Model Training (Gluon Engine)",
             "command": _script_command("trainer.py", f"{data_dir}/features.csv", target_col, f"{data_dir}/models"),
             "agent": "trainer"},
            {"name": "Neural Analyst Review",
             "command": _script_command("evaluator.py", f"{data_dir}/features.csv", target_col,
                                        f"{data_dir}/models", f"{data_dir}/evaluation.csv"),
             "agent": "intel"},
            {"name": "Mission Report Export (Premium Dossier)",
             "command": _script_command("premium_reporter.py", f"{data_dir}/evaluation.csv", f"{data_dir}/report.pdf"),
             "agent": "deploy"},
            {"name": "Industrial Compliance Audit",
             "command": _script_command("compliance_agent.py", f"{data_dir}/cleaned.csv",
                                        f"{data_dir}/compliance_report.json"),
             "agent": "intel"},
        ]
    return None


def main():
    """Entry point: load configuration, prepare the input, and run the selected pipeline."""
    args = _parse_args()

    logger.info("[INIT] Initializing ROBOTIC AI CORE v1.0")
    logger.info(f"[BOOT] Starting AutoDS-MCP Agent in {args.mode} mode")

    # Load config
    config = _load_config(args.config)
    if config is None:
        return

    # Override config with CLI args
    if args.csv:
        config["input_csv"] = args.csv

    if not validate_config(config):
        logger.critical("[ABORT] Configuration validation failed. Ensure 'input_csv' is provided.")
        return

    # Setup paths
    data_dir = config.get("data_dir", "data")
    os.makedirs(data_dir, exist_ok=True)

    input_csv = config["input_csv"]
    target_col = config["target_column"]

    # Everything below calls string methods on input_csv (e.g. a JSON null would crash).
    if not isinstance(input_csv, str) or not input_csv:
        logger.critical("[ABORT] 'input_csv' must be a non-empty string.")
        return

    # GEMINI OMNI-PERCEPTION: Convert unstructured (PDF/Image) to CSV first
    if input_csv.lower().endswith(UNSTRUCTURED_EXTENSIONS):
        logger.info(f"[PERCEPTION] Initiating Omni-Ingest for unstructured file: {input_csv}")
        if not run_step("Gemini Omni-Perception Ingest", _script_command("gemini_ingest.py", input_csv)):
            logger.error("[ABORT] Perception ingest failed. Finalizing early.")
            return
        # gemini_ingest.py creates 'perceived_data.csv' in the same folder
        input_csv = os.path.join(os.path.dirname(input_csv), "perceived_data.csv")
        config["input_csv"] = input_csv
        logger.info(f"[PERCEPTION] Switch successful. Structured Vector: {input_csv}")

    # RAW DATA HANDLING: Buffering raw strings to temp file if not a valid path
    input_csv = _buffer_raw_csv(input_csv, data_dir)
    config["input_csv"] = input_csv

    # AGENTIC TARGET DISCOVERY (3-Tier Heuristic Brain)
    target_col = _resolve_target_column(input_csv, target_col)
    config["target_column"] = target_col

    logger.info(f"[CONFIG] Input Vector: {input_csv} | Target: {target_col} | Task: {args.task}")

    # Task Specific Pipelines
    steps = _build_steps(args.task, input_csv, data_dir, target_col)
    if steps is None:
        logger.error(f"[ERROR] Task {args.task} not implemented.")
        return

    # Execute Pipeline (stop at the first failing step)
    success = True
    total = len(steps)
    for i, step in enumerate(steps, 1):
        logger.info(f"[{i}/{total}] Starting: {step['name']}")
        if not run_step(step["name"], step["command"], agent=step.get("agent", "master")):
            success = False
            break

    if success:
        logger.info("[SUCCESS] Pipeline completed successfully!")
        report_path = os.path.abspath(f'{data_dir}/report.pdf')
        logger.info(f"[REPORT] Final Report: {report_path}")

        # Post to Moltbook for Hackathon Requirement
        update_msg = f"AutoDS-MCP successfully processed '{input_csv}'. AutoGluon ensemble training complete. Report: {report_path}"
        post_to_moltbook(update_msg)
    else:
        logger.error("[FAILED] Pipeline execution failed. Check logs.")


if __name__ == "__main__":
    main()