#!/usr/bin/env python3
"""
AI Agent - Analyzes system state and proposes solutions using local LLMs
"""

import json
import requests
import subprocess
from typing import Dict, List, Any, Optional
from pathlib import Path
from datetime import datetime

from tools import SysadminTools


class MachaAgent:
    """AI agent that analyzes system issues and proposes fixes"""

    # Load system prompt from file
    @staticmethod
    def _load_system_prompt() -> str:
        """Load the system prompt from file"""
        prompt_file = Path(__file__).parent / "system_prompt.txt"
        try:
            return prompt_file.read_text()
        except Exception as e:
            print(f"Warning: Could not load system prompt from {prompt_file}: {e}")
            return "You are Macha, an autonomous AI system maintenance agent."

    # Evaluate the loader at class-definition time via the staticmethod's underlying function
    SYSTEM_PROMPT = _load_system_prompt.__func__()

    def __init__(
        self,
        ollama_host: str = "http://localhost:11434",
        model: str = "gpt-oss:latest",
        state_dir: Path = Path("/var/lib/macha"),
        context_db=None,
        config_repo: str = "git+https://git.coven.systems/lily/nixos-servers",
        config_branch: str = "main",
        enable_tools: bool = True,
        use_queue: bool = True,
        priority: str = "INTERACTIVE"
    ):
        self.ollama_host = ollama_host
        self.model = model
        self.state_dir = state_dir
        self.state_dir.mkdir(parents=True, exist_ok=True)
        self.decision_log = self.state_dir / "decisions.jsonl"
        self.context_db = context_db
        self.config_repo = config_repo
        self.config_branch = config_branch
        self.enable_tools = enable_tools

        # Queue settings
        self.use_queue = use_queue
        self.priority = priority
        self.ollama_queue = None
        if use_queue:
            try:
                from ollama_queue import OllamaQueue, Priority
                self.ollama_queue = OllamaQueue()
                self.priority_level = getattr(Priority, priority, Priority.INTERACTIVE)
            except (PermissionError, OSError):
                # Silently fall back to direct API calls when queue is not accessible
                # (e.g., regular users don't have access to /var/lib/macha/queues)
                self.use_queue = False
            except Exception as e:
                # Log unexpected errors but still fall back gracefully
                import sys
                print(f"Note: Ollama queue unavailable ({type(e).__name__}), using direct API", file=sys.stderr)
                self.use_queue = False

        # Initialize tools system
        self.tools = SysadminTools(safe_mode=False) if enable_tools else None

        # Tool output cache for hierarchical processing
        self.tool_output_cache = {}
        self.cache_dir = self.state_dir / "tool_cache"
        # Only create cache dir if we have write access (running as macha user)
        try:
            self.cache_dir.mkdir(parents=True, exist_ok=True)
        except (PermissionError, OSError):
            # Running as unprivileged user (macha-chat), use temp dir instead
            import tempfile
            self.cache_dir = Path(tempfile.mkdtemp(prefix="macha_cache_"))

    def _query_relevant_knowledge(self, query: str, limit: int = 3) -> str:
        """
        Query the knowledge base for relevant information.

        Returns a formatted string of relevant knowledge to include in prompts.
        """
        if not self.context_db:
            return ""

        try:
            knowledge_items = self.context_db.query_knowledge(query, limit=limit)
            if not knowledge_items:
                return ""

            knowledge_text = "\n\nRELEVANT KNOWLEDGE FROM EXPERIENCE:\n"
            for item in knowledge_items:
                knowledge_text += f"\n• {item['topic']} ({item['category']}):\n"
                knowledge_text += f"  {item['knowledge']}\n"
                knowledge_text += f"  [Confidence: {item['confidence']}, Referenced: {item['times_referenced']} times]\n"

            return knowledge_text
        except Exception as e:
            print(f"Error querying knowledge: {e}")
            return ""

    def store_learning(
        self,
        topic: str,
        knowledge: str,
        category: str = "experience",
        confidence: str = "medium",
        tags: Optional[list] = None
    ) -> bool:
        """
        Store a learned insight into the knowledge base.

        Args:
            topic: What this is about
            knowledge: The insight/pattern/learning
            category: Type of knowledge
            confidence: How confident we are
            tags: Optional tags

        Returns:
            True if stored successfully
        """
        if not self.context_db:
            return False

        try:
            kid = self.context_db.store_knowledge(
                topic=topic,
                knowledge=knowledge,
                category=category,
                source="experience",
                confidence=confidence,
                tags=tags
            )
            if kid:
                print(f"📚 Learned: {topic}")
                return True
            return False
        except Exception as e:
            print(f"Error storing learning: {e}")
            return False

    def reflect_and_learn(
        self,
        situation: str,
        action_taken: str,
        outcome: str,
        success: bool
    ) -> None:
        """
        Reflect on an operation and extract learnings to store.

        Args:
            situation: What was the problem/situation
            action_taken: What action was taken
            outcome: What was the result
            success: Whether it succeeded
        """
        if not self.context_db:
            return

        # Only learn from successful operations for now
        if not success:
            return

        # Build reflection prompt
        prompt = f"""Based on this successful operation, extract key learnings to remember for the future.

SITUATION: {situation}

ACTION TAKEN: {action_taken}

OUTCOME: {outcome}

Extract 1-2 specific, actionable learnings. For each learning provide:
1. topic: A concise topic name (e.g., "systemd service restart", "disk cleanup procedure")
2. knowledge: The specific insight or pattern (what worked, why, important details)
3. category: One of: command, pattern, troubleshooting, performance

Respond ONLY with valid JSON:
[
  {{
    "topic": "...",
    "knowledge": "...",
    "category": "...",
    "confidence": "medium"
  }}
]
"""

        try:
            response = self._query_ollama(prompt, temperature=0.3, timeout=30)
            learnings = json.loads(response)

            if isinstance(learnings, list):
                for learning in learnings:
                    if all(k in learning for k in ['topic', 'knowledge', 'category']):
                        self.store_learning(
                            topic=learning['topic'],
                            knowledge=learning['knowledge'],
                            category=learning.get('category', 'experience'),
                            confidence=learning.get('confidence', 'medium')
                        )
        except Exception as e:
            # Reflection is optional - don't fail if it doesn't work
            print(f"Note: Could not extract learnings: {e}")

    def analyze_system_state(
        self,
        monitoring_data: Dict[str, Any],
        system_hostname: Optional[str] = None,
        git_context=None
    ) -> Dict[str, Any]:
        """Analyze system monitoring data and determine if action is needed"""

        # Build context for the AI
        context = self._build_analysis_context(monitoring_data)

        # Get system infrastructure context if available
        system_context = ""
        if self.context_db and system_hostname:
            system_context = self.context_db.get_system_context(system_hostname, git_context)

        # Ask the AI to analyze
        prompt = self._create_analysis_prompt(context, system_context)
        response = self._query_ollama(prompt)

        # Parse the AI's response
        analysis = self._parse_analysis_response(response)

        # Log the decision
        self._log_decision(monitoring_data, analysis)

        return analysis

    def propose_fix(self, issue_description: str, context: Dict[str, Any]) -> Dict[str, Any]:
        """Propose a fix for a specific issue"""

        # Query relevant config files if we have context_db
        relevant_configs = []
        if self.context_db:
            try:
                # Query for config files relevant to the issue
                configs = self.context_db.query_config_files(
                    query=issue_description,
                    n_results=3
                )
                relevant_configs = configs
            except Exception as e:
                print(f"Warning: Could not query config files: {e}")

        # Build config context section
        config_context = ""
        if relevant_configs:
            config_context = "\n\nRELEVANT CONFIGURATION FILES:\n"
            for cfg in relevant_configs:
                config_context += f"\n--- {cfg['path']} (relevance: {cfg['relevance']:.2%}) ---\n"
                config_context += cfg['content'][:1000]  # First 1000 chars to avoid token limits
                if len(cfg['content']) > 1000:
                    config_context += "\n... (truncated)"
                config_context += "\n"

        # Query relevant knowledge from experience
        knowledge_context = self._query_relevant_knowledge(issue_description, limit=3)

        # Build previous investigations context
        previous_inv_context = ""
        if context.get('previous_investigations'):
            previous_inv_context = "\n\nPREVIOUS INVESTIGATIONS (DO NOT REPEAT THESE):\n"
            for i, inv in enumerate(context['previous_investigations'][:3], 1):  # Show up to 3
                previous_inv_context += f"\nInvestigation #{i} ({inv['timestamp']}):\n"
                previous_inv_context += f"Commands: {', '.join(inv['commands'])}\n"
                previous_inv_context += f"Results:\n{inv['output'][:500]}...\n"  # First 500 chars
            previous_inv_context += "\n⚠️ You have already run these investigations. Do NOT propose them again."
            previous_inv_context += "\n⚠️ Based on the investigation results above, propose an ACTUAL FIX, not more investigation.\n"

        prompt = f"""{self.SYSTEM_PROMPT}

TASK: PROPOSE FIX
================================================================================

ISSUE TO ADDRESS:
{issue_description}

SYSTEM CONTEXT:
{json.dumps(context, indent=2)}{config_context}{knowledge_context}{previous_inv_context}

REPOSITORY INFO:
- Git Repository: {self.config_repo}
- Branch: {self.config_branch}

YOUR RESPONSE MUST BE VALID JSON:
{{
  "diagnosis": "brief description of what you think is wrong",
  "proposed_action": "specific action to take",
  "action_type": "one of: systemd_restart, nix_rebuild, config_change, cleanup, investigation",
  "risk_level": "one of: low, medium, high",
  "commands": ["list", "of", "shell", "commands"],
  "config_changes": {{
    "file": "path/to/config.nix in the repository",
    "change": "description of change needed"
  }},
  "reasoning": "why this fix should work",
  "rollback_plan": "how to undo if it doesn't work"
}}

RESPOND WITH ONLY THE JSON, NO OTHER TEXT.
""" response = self._query_ollama(prompt) try: # Try to extract JSON from response # LLMs sometimes add extra text, so we need to find the JSON part import re json_match = re.search(r'\{.*\}', response, re.DOTALL) if json_match: return json.loads(json_match.group()) else: return { "diagnosis": "Failed to parse AI response", "proposed_action": "manual investigation required", "action_type": "investigation", "risk_level": "high", "reasoning": "AI response was not in expected format" } except json.JSONDecodeError: return { "diagnosis": "Failed to parse AI response", "proposed_action": "manual investigation required", "action_type": "investigation", "risk_level": "high", "reasoning": f"Raw response: {response[:500]}" } def _build_analysis_context(self, data: Dict[str, Any]) -> str: """Build a concise context string for the AI""" lines = [] # System resources res = data.get("resources", {}) lines.append(f"CPU: {res.get('cpu_percent', 0):.1f}%, Memory: {res.get('memory_percent', 0):.1f}%, Load: {res.get('load_average', {}).get('1min', 0):.2f}") # Disk usage disk = data.get("disk", {}) for part in disk.get("partitions", []): if part.get("percent_used", 0) > 80: # Only mention if >80% full lines.append(f"⚠️ Disk {part['mountpoint']}: {part['percent_used']:.1f}% full") # Failed services systemd = data.get("systemd", {}) if systemd.get("failed_count", 0) > 0: lines.append(f"\n⚠️ {systemd['failed_count']} failed systemd services:") for svc in systemd.get("failed_services", [])[:10]: lines.append(f" - {svc.get('unit', 'unknown')}: {svc.get('sub', 'unknown')}") # Recent errors logs = data.get("logs", {}) error_count = logs.get("error_count_1h", 0) if error_count > 0: lines.append(f"\n{error_count} errors in last hour") # Group errors by service errors_by_service = {} for err in logs.get("recent_errors", [])[:20]: svc = err.get("SYSLOG_IDENTIFIER", "unknown") errors_by_service[svc] = errors_by_service.get(svc, 0) + 1 for svc, count in sorted(errors_by_service.items(), key=lambda x: x[1], reverse=True)[:5]: lines.append(f" - {svc}: {count} errors") # Network net = data.get("network", {}) if not net.get("internet_reachable", True): lines.append("\n⚠️ No internet connectivity") return "\n".join(lines) def _create_analysis_prompt(self, context: str, system_context: str = "") -> str: """Create the analysis prompt for the AI""" prompt = f"""{self.SYSTEM_PROMPT} TASK: ANALYZE SYSTEM HEALTH ================================================================================ OBJECTIVE: Analyze the current system state and determine if any action is needed. Be thorough but not alarmist. Only recommend action if truly necessary. """ if system_context: prompt += f"\n\nSYSTEM INFRASTRUCTURE:\n{system_context}" prompt += f""" CURRENT SYSTEM STATE: {context} YOUR RESPONSE MUST BE VALID JSON: {{ "status": "one of: healthy, attention_needed, intervention_required", "issues": [ {{ "severity": "one of: info, warning, critical", "category": "one of: resources, services, disk, network, logs", "description": "brief description of the issue", "requires_action": true/false }} ], "overall_assessment": "brief summary of system health", "recommended_actions": ["list of recommended actions, if any"] }} RESPOND WITH ONLY THE JSON, NO OTHER TEXT. 
""" return prompt def _auto_diagnose_ollama(self) -> str: """Automatically diagnose Ollama issues""" diagnostics = [] diagnostics.append("=== OLLAMA SELF-DIAGNOSTIC ===") # Check if Ollama service is running try: result = subprocess.run( ['systemctl', 'is-active', 'ollama.service'], capture_output=True, text=True, timeout=5 ) if result.returncode == 0: diagnostics.append("✅ Ollama service is active") else: diagnostics.append(f"❌ Ollama service is NOT active: {result.stdout.strip()}") except Exception as e: diagnostics.append(f"⚠️ Could not check service status: {e}") # Check memory usage try: result = subprocess.run(['free', '-h'], capture_output=True, text=True, timeout=5) diagnostics.append(f"\nMemory:\n{result.stdout}") except Exception as e: diagnostics.append(f"⚠️ Could not check memory: {e}") # Check which models are loaded try: response = requests.get(f"{self.ollama_host}/api/tags", timeout=5) if response.status_code == 200: models = response.json().get('models', []) diagnostics.append(f"\nLoaded models: {len(models)}") for model in models: name = model.get('name', 'unknown') size = model.get('size', 0) / (1024**3) is_target = "← TARGET" if name == self.model else "" diagnostics.append(f" • {name} ({size:.1f} GB) {is_target}") # Check if target model is loaded model_names = [m.get('name') for m in models] if self.model not in model_names: diagnostics.append(f"\n❌ TARGET MODEL NOT LOADED: {self.model}") diagnostics.append(f" Available: {', '.join(model_names)}") else: diagnostics.append(f"❌ Ollama API returned {response.status_code}") except Exception as e: diagnostics.append(f"⚠️ Could not query Ollama API: {e}") # Check recent Ollama logs try: result = subprocess.run( ['journalctl', '-u', 'ollama.service', '-n', '20', '--no-pager'], capture_output=True, text=True, timeout=5 ) if result.stdout: diagnostics.append(f"\nRecent logs:\n{result.stdout}") except Exception as e: diagnostics.append(f"⚠️ Could not check logs: {e}") return "\n".join(diagnostics) def _query_ollama(self, prompt: str, temperature: float = 0.3) -> str: """Query Ollama API (with optional queue)""" # If queue is enabled, submit to queue and wait if self.use_queue and self.ollama_queue: try: payload = { "model": self.model, "prompt": prompt, "stream": False, "temperature": temperature, "timeout": 120 } request_id = self.ollama_queue.submit( request_type="generate", payload=payload, priority=self.priority_level ) result = self.ollama_queue.wait_for_result(request_id, timeout=300) return result.get("response", "") except Exception as e: print(f"Warning: Queue request failed, falling back to direct: {e}") # Fall through to direct query # Direct query (no queue or queue failed) try: response = requests.post( f"{self.ollama_host}/api/generate", json={ "model": self.model, "prompt": prompt, "stream": False, "temperature": temperature, }, timeout=120 # 2 minute timeout for large models ) response.raise_for_status() return response.json().get("response", "") except requests.exceptions.HTTPError as e: error_detail = "" try: error_detail = f" - {response.text}" except: pass print(f"ERROR: Ollama HTTP error {response.status_code}{error_detail}") print(f"Model requested: {self.model}") print(f"Ollama host: {self.ollama_host}") # Run diagnostics diagnostics = self._auto_diagnose_ollama() print(diagnostics) return json.dumps({ "error": f"Ollama HTTP {response.status_code}: {str(e)}{error_detail}", "diagnosis": f"Ollama API error - check if model '{self.model}' is available", "action_type": "investigation", "risk_level": "high" }) 
        except Exception as e:
            print(f"ERROR: Failed to query Ollama: {str(e)}")
            print(f"Model requested: {self.model}")
            print(f"Ollama host: {self.ollama_host}")

            # Run diagnostics
            diagnostics = self._auto_diagnose_ollama()
            print(diagnostics)

            return json.dumps({
                "error": f"Failed to query Ollama: {str(e)}",
                "diagnosis": "Ollama API unavailable",
                "action_type": "investigation",
                "risk_level": "high"
            })

    def _estimate_tokens(self, text: str) -> int:
        """Rough token estimation: ~4 chars per token"""
        return len(text) // 4

    def _extract_key_findings(self, tool_name: str, raw_output: str, progress_callback=None) -> str:
        """
        Extract key findings from large tool output using chunked map-reduce.
        Processes large outputs in smaller chunks to prevent Ollama overload.
        """
        output_size = len(raw_output)
        chunk_size = 8000  # ~2000 tokens per chunk, safe size

        # Store full output in cache for potential deep dive
        cache_id = f"{tool_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        try:
            cache_file = self.cache_dir / f"{cache_id}.txt"
            cache_file.write_text(raw_output)
        except (PermissionError, OSError):
            # Fallback to temp directory if cache dir not writable
            import tempfile
            cache_file = Path(tempfile.gettempdir()) / f"macha_{cache_id}.txt"
            cache_file.write_text(raw_output)

        # If output is small enough, process in one go
        if output_size <= chunk_size:
            try:
                extraction_prompt = f"""Analyze this output from '{tool_name}'.
Extract: key findings, errors/warnings, metrics, actionable insights.

Output:
{raw_output}

Provide concise summary (max 600 chars)."""

                summary = self._query_ollama(extraction_prompt, temperature=0.1)
                return f"[Summary of {tool_name}]:\n{summary}\n\n[Full output: {output_size:,} chars cached as {cache_id}]"
            except Exception as e:
                print(f"Warning: Failed to extract findings: {e}")
                return self._simple_truncate(raw_output, 2000)

        # Large output: chunk and process with map-reduce
        try:
            chunks = []
            num_chunks = (output_size + chunk_size - 1) // chunk_size
            for i in range(0, output_size, chunk_size):
                chunk = raw_output[i:i+chunk_size]
                chunks.append(chunk)

            # Phase 1: Map - Summarize each chunk
            chunk_summaries = []
            for idx, chunk in enumerate(chunks):
                chunk_num = idx + 1

                # Progress feedback
                if progress_callback:
                    progress_callback(f"  Processing chunk {chunk_num}/{num_chunks}...")
                else:
                    print(f"  → Processing chunk {chunk_num}/{num_chunks}...", flush=True)

                extraction_prompt = f"""Analyze chunk {chunk_num}/{num_chunks} from '{tool_name}'.
Extract: key findings, errors/warnings, metrics, insights.

Chunk:
{chunk}

Concise summary (max 400 chars)."""

                chunk_summary = self._query_ollama(extraction_prompt, temperature=0.1)
                chunk_summaries.append(f"[Chunk {chunk_num}]: {chunk_summary}")

            # Phase 2: Reduce - Combine chunk summaries if many chunks
            if len(chunk_summaries) > 5:
                if progress_callback:
                    progress_callback(f"  Synthesizing {len(chunk_summaries)} chunk summaries...")
                else:
                    print(f"  → Synthesizing {len(chunk_summaries)} chunk summaries...", flush=True)

                combined = "\n".join(chunk_summaries)
                reduce_prompt = f"""Synthesize these chunk summaries from '{tool_name}':

{combined}

Provide unified summary (max 800 chars) covering all key points."""

                final_summary = self._query_ollama(reduce_prompt, temperature=0.1)
                return f"""[Chunked analysis of {tool_name}]:
{final_summary}

[Processed {num_chunks} chunks, {output_size:,} chars total, cached as {cache_id}]"""
            else:
                # Few chunks: just concatenate summaries
                combined_summary = "\n".join(chunk_summaries)
                return f"""[Chunked analysis of {tool_name}]:
{combined_summary}

[Processed {num_chunks} chunks, {output_size:,} chars total, cached as {cache_id}]"""

        except Exception as e:
            print(f"Warning: Chunked extraction failed for {tool_name}: {e}")
            return self._simple_truncate(raw_output, 2000)

    def _simple_truncate(self, text: str, max_chars: int) -> str:
        """Simple head+tail truncation"""
        if len(text) <= max_chars:
            return text
        half = max_chars // 2
        return (
            text[:half] +
            f"\n... [TRUNCATED: {len(text) - max_chars} chars omitted] ...\n" +
            text[-half:]
        )

    def _process_tool_result_hierarchical(self, tool_name: str, result: Any) -> str:
        """
        Intelligently process tool results based on size:
        - Small (< 5KB): Pass through directly
        - Medium (5-20KB): Hierarchical extraction with single-pass summarization
        - Large (> 20KB): Hierarchical extraction with chunked processing
        """
        result_str = json.dumps(result) if not isinstance(result, str) else result
        size = len(result_str)

        # Small outputs: pass through directly
        if size < 5000:
            print(f"  [Tool result: {size} chars, passing through]")
            return result_str

        # Medium and large outputs: hierarchical extraction with chunking
        else:
            print(f"  [Tool result: {size} chars, extracting key findings...]")
            # _extract_key_findings automatically chunks large outputs
            return self._extract_key_findings(tool_name, result_str)

    def _prune_messages(self, messages: List[Dict], max_context_tokens: int = 80000) -> List[Dict]:
        """
        Prune message history to stay within context limits.
        Keeps: system prompt + recent conversation window
        """
        if not messages:
            return messages

        # Separate system message from conversation
        system_msg = None
        conversation = []
        for msg in messages:
            if msg["role"] == "system":
                system_msg = msg
            else:
                conversation.append(msg)

        # Calculate current token count
        total_tokens = 0
        if system_msg:
            total_tokens += self._estimate_tokens(system_msg["content"])
        for msg in conversation:
            content = msg.get("content", "")
            total_tokens += self._estimate_tokens(str(content))

        # If under limit, return as-is
        if total_tokens <= max_context_tokens:
            result = []
            if system_msg:
                result.append(system_msg)
            result.extend(conversation)
            print(f"[Context: {total_tokens:,} tokens, {len(conversation)} messages]")
            return result

        # Need to prune - keep sliding window of recent messages
        # Strategy: Keep last 20 messages (10 exchanges) which should be ~40K tokens max
        print(f"[Context pruning: {total_tokens:,} tokens → keeping last 20 messages]")
        pruned_conversation = conversation[-20:]

        result = []
        if system_msg:
            result.append(system_msg)
        result.extend(pruned_conversation)

        # Calculate new token count
        new_tokens = self._estimate_tokens(system_msg["content"]) if system_msg else 0
        for msg in pruned_conversation:
            new_tokens += self._estimate_tokens(str(msg.get("content", "")))

        print(f"[Context after pruning: {new_tokens:,} tokens, {len(pruned_conversation)} messages]")
        return result

    def _query_ollama_with_tools(
        self,
        messages: List[Dict[str, str]],
        temperature: float = 0.3,
        max_iterations: int = 30
    ) -> str:
        """
        Query Ollama using chat API with tool support.
        Handles tool calls and returns final response.

        Args:
            messages: List of chat messages [{"role": "user", "content": "..."}]
            temperature: Generation temperature
            max_iterations: Maximum number of tool-calling iterations
                            (default 30 for complex system operations)

        Returns:
            Final text response from the model
        """
        if not self.enable_tools or not self.tools:
            # Fallback to regular query
            user_content = " ".join([m["content"] for m in messages if m["role"] == "user"])
            return self._query_ollama(user_content, temperature)

        # Add system message if not present
        if not any(m["role"] == "system" for m in messages):
            messages = [{"role": "system", "content": self.SYSTEM_PROMPT}] + messages

        tool_definitions = self.tools.get_tool_definitions()

        for iteration in range(max_iterations):
            try:
                # Prune messages before sending to avoid context overflow
                pruned_messages = self._prune_messages(messages, max_context_tokens=80000)

                # Use queue if enabled
                if self.use_queue and self.ollama_queue:
                    try:
                        payload = {
                            "model": self.model,
                            "messages": pruned_messages,
                            "stream": False,
                            "temperature": temperature,
                            "tools": tool_definitions,
                            "timeout": 120
                        }
                        request_id = self.ollama_queue.submit(
                            request_type="chat_with_tools",
                            payload=payload,
                            priority=self.priority_level
                        )
                        resp_data = self.ollama_queue.wait_for_result(request_id, timeout=300)
                    except Exception as e:
                        print(f"Warning: Queue request failed, falling back to direct: {e}")
                        # Fall through to direct query
                        response = requests.post(
                            f"{self.ollama_host}/api/chat",
                            json={
                                "model": self.model,
                                "messages": pruned_messages,
                                "stream": False,
                                "temperature": temperature,
                                "tools": tool_definitions
                            },
                            timeout=120
                        )
                        response.raise_for_status()
                        resp_data = response.json()
                else:
                    # Direct query (no queue)
                    response = requests.post(
                        f"{self.ollama_host}/api/chat",
                        json={
                            "model": self.model,
                            "messages": pruned_messages,
                            "stream": False,
                            "temperature": temperature,
                            "tools": tool_definitions
                        },
                        timeout=120
                    )
                    response.raise_for_status()
                    resp_data = response.json()

                message = resp_data.get("message", {})

                # Check if model wants to call tools
                tool_calls = message.get("tool_calls", [])

                if not tool_calls:
                    # No tools to call, return the text response
                    return message.get("content", "")

                # Add assistant's message to history
                messages.append(message)

                # Execute each tool call
                for tool_call in tool_calls:
                    function_name = tool_call["function"]["name"]
                    arguments = tool_call["function"]["arguments"]

                    print(f"  → Tool call: {function_name}({arguments})")

                    # Execute the tool
                    tool_result = self.tools.execute_tool(function_name, arguments)

                    # Process result hierarchically based on size
                    processed_result = self._process_tool_result_hierarchical(function_name, tool_result)

                    # Add processed result to messages
                    messages.append({
                        "role": "tool",
                        "content": processed_result
                    })

                # Continue loop to let model process tool results

            except requests.exceptions.HTTPError as e:
                error_body = ""
                try:
                    error_body = response.text
                except Exception:
                    pass

                # Check if this is a context length error
                if "context length" in error_body.lower() or "too long" in error_body.lower():
                    print("ERROR: Context length exceeded. Attempting recovery...")
                    # Emergency pruning - keep only system + last user message
                    system_msg = next((m for m in messages if m["role"] == "system"), None)
                    last_user_msg = next((m for m in reversed(messages) if m["role"] == "user"), None)
                    if system_msg and last_user_msg:
                        messages = [system_msg, last_user_msg]
                        print("[Emergency context reset: keeping only system + last user message]")
                        continue  # Retry with minimal context

                print(f"ERROR: Ollama chat API error: {e}")
                diagnostics = self._auto_diagnose_ollama()
                print(diagnostics)
                return json.dumps({
                    "error": f"Ollama chat API error: {str(e)}",
                    "diagnosis": "Failed to communicate with Ollama",
                    "action_type": "investigation",
                    "risk_level": "high"
                })
            except Exception as e:
                print(f"ERROR: Tool calling failed: {e}")
                return json.dumps({
                    "error": f"Tool calling error: {str(e)}",
                    "diagnosis": "Failed during tool execution",
                    "action_type": "investigation",
                    "risk_level": "high"
                })

        # If we hit max iterations, return what we have
        return "Maximum tool calling iterations reached. Unable to complete request."
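
    # For reference, the tool-calling loop above assumes Ollama's chat response
    # shape, where each tool call carries a function name and an arguments dict,
    # roughly:
    #
    #   {"message": {"role": "assistant",
    #                "tool_calls": [{"function": {"name": "run_command",
    #                                             "arguments": {"command": "df -h"}}}]}}
    #
    # "run_command" and its arguments here are hypothetical placeholders; the real
    # tool names come from SysadminTools.get_tool_definitions().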
    def _parse_analysis_response(self, response: str) -> Dict[str, Any]:
        """Parse the AI's analysis response"""
        import re

        # Log the raw response for debugging
        self._log(f"AI raw response (first 1000 chars): {response[:1000]}")

        try:
            json_match = re.search(r'\{.*\}', response, re.DOTALL)
            if json_match:
                parsed = json.loads(json_match.group())
                self._log(f"Successfully parsed AI response: {parsed.get('status', 'unknown')}")
                return parsed
            else:
                self._log("ERROR: No JSON found in AI response")
        except Exception as e:
            self._log(f"ERROR parsing AI response: {e}")

        # Fallback
        self._log("Falling back to default response")
        return {
            "status": "healthy",
            "issues": [],
            "overall_assessment": "Unable to parse AI response",
            "recommended_actions": []
        }

    def _log(self, message: str):
        """Log a message to the orchestrator log"""
        # This will go to the orchestrator log via print
        print(f"[AGENT] {message}")

    def _log_decision(self, monitoring_data: Dict[str, Any], analysis: Dict[str, Any]):
        """Log AI decisions for auditing"""
        log_entry = {
            "timestamp": datetime.now().isoformat(),
            "monitoring_summary": {
                "cpu": monitoring_data.get("resources", {}).get("cpu_percent"),
                "memory": monitoring_data.get("resources", {}).get("memory_percent"),
                "failed_services": monitoring_data.get("systemd", {}).get("failed_count"),
                "error_count": monitoring_data.get("logs", {}).get("error_count_1h"),
            },
            "analysis": analysis,
        }

        with open(self.decision_log, 'a') as f:
            f.write(json.dumps(log_entry) + '\n')

    def get_recent_decisions(self, count: int = 10) -> List[Dict[str, Any]]:
        """Get recent decision history"""
        if not self.decision_log.exists():
            return []

        decisions = []
        with open(self.decision_log, 'r') as f:
            for line in f:
                if line.strip():
                    try:
                        decisions.append(json.loads(line))
                    except json.JSONDecodeError:
                        pass

        return decisions[-count:]


if __name__ == "__main__":
    import sys

    # Test the agent
    agent = MachaAgent()

    if len(sys.argv) > 1 and sys.argv[1] == "test":
        # Test with sample data
        test_data = {
            "systemd": {"failed_count": 2, "failed_services": [
                {"unit": "test-service.service", "sub": "failed"}
            ]},
            "resources": {"cpu_percent": 25.0, "memory_percent": 45.0, "load_average": {"1min": 1.5}},
            "logs": {"error_count_1h": 10},
            "network": {"internet_reachable": True}
        }

        print("Testing agent analysis...")
        analysis = agent.analyze_system_state(test_data)
        print(json.dumps(analysis, indent=2))

        if analysis.get("issues"):
            print("\nTesting fix proposal...")
            fix = agent.propose_fix(
                analysis["issues"][0]["description"],
                test_data
            )
            print(json.dumps(fix, indent=2))
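
    # Hypothetical usage sketch (assumption, not part of the original test path):
    # exercise the tool-calling chat path directly. Assumes the configured model
    # supports Ollama tool calling.
    elif sys.argv[1] == "chat":
        question = " ".join(sys.argv[2:]) or "Summarize current system health."
        reply = agent._query_ollama_with_tools(
            [{"role": "user", "content": question}]
        )
        print(reply)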