commit 22ba493d9ebc1c27d3ac0ebde35e4384394d4181 Author: Lily Miller Date: Mon Oct 6 14:32:37 2025 -0600 Initial commit: Split Macha autonomous system into separate flake Macha is now a standalone NixOS flake that can be imported into other systems. This provides: - Independent versioning - Easier reusability - Cleaner separation of concerns - Better development workflow Includes: - Complete autonomous system code - NixOS module with full configuration options - Queue-based architecture with priority system - Chunked map-reduce for large outputs - ChromaDB knowledge base - Tool calling system - Multi-host SSH management - Gotify notification integration All capabilities from DESIGN.md are preserved. diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9f77810 --- /dev/null +++ b/.gitignore @@ -0,0 +1,23 @@ +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +*.egg-info/ +dist/ +build/ + +# IDE +.vscode/ +.idea/ +*.swp +*.swo + +# Nix +result +result-* + +# Test data +test_*.db +*.log + diff --git a/DESIGN.md b/DESIGN.md new file mode 100644 index 0000000..4ba76cf --- /dev/null +++ b/DESIGN.md @@ -0,0 +1,269 @@ +# Macha Autonomous System - Design Document + +> **⚠️ IMPORTANT - READ THIS FIRST** +> **FOR AI ASSISTANT**: This document is YOUR reference guide when modifying Macha's code. +> - **ALWAYS consult this BEFORE refactoring** to ensure you don't remove existing capabilities +> - **CHECK this when adding features** to avoid conflicts +> - **UPDATE this document** when new capabilities are added +> - **DO NOT DELETE ANYTHING FROM THIS DOCUMENT** +> - During major refactors, you MUST verify each capability listed here is preserved + +## Overview +Macha is an AI-powered autonomous system administrator capable of monitoring, maintaining, and managing multiple NixOS hosts in the infrastructure. + +## Core Capabilities + +### 1. Local System Management +- Monitor system health (CPU, memory, disk, services) +- Read and analyze logs via `journalctl` +- Check service status and restart failed services +- Execute system commands (with safety restrictions) +- Monitor and repair Nix store corruption +- Hardware awareness (CPU, GPU, network, storage) + +### 2. Multi-Host Management via SSH + +**Macha CAN and SHOULD use SSH to manage other hosts.** + +#### SSH Access +- Runs as `macha` user (UID 2501) +- Has `NOPASSWD` sudo access for administrative commands +- Shares SSH keys with other hosts in the infrastructure +- Can SSH to: `rhiannon`, `alexander`, `UCAR-Kinston`, and others in the flake + +#### SSH Usage Patterns +1. **Direct diagnostic commands:** + ```bash + ssh rhiannon systemctl status ollama + ssh alexander df -h + ``` + - Commands automatically prefixed with `sudo` by the tools layer + - Full command: `ssh macha@rhiannon sudo systemctl status ollama` + +2. **Status checks:** + - Check service health on remote hosts + - Gather system metrics + - Review logs + - Monitor resource usage + +3. **File operations:** + - Use `scp` to copy files between hosts + - Read configuration files on remote systems + +#### When to use SSH vs nh +- **SSH**: For diagnostics, status checks, log review, quick commands +- **nh remote deployment**: For applying NixOS configuration changes + - `nh os switch -u --target-host=rhiannon --hostname=rhiannon` + - Builds locally, deploys to remote host + - Use for permanent configuration changes + +### 3. 
NixOS Configuration Management
+
+#### Local Changes
+- Can propose changes to NixOS configuration
+- Requires human approval before applying
+- Uses `nh os switch` for local updates
+
+#### Remote Deployment
+- Can deploy to other hosts using `nh` with `--target-host`
+- Builds configuration locally (on Macha)
+- Pushes to remote system
+- Can take up to 1 hour for complex builds
+- **IMPORTANT**: Be patient with long-running builds, don't retry prematurely
+
+### 4. Hardware Awareness
+
+#### Local Hardware Detection
+- CPU: `lscpu` via `nix-shell -p util-linux`
+- GPU: `lspci` via `nix-shell -p pciutils`
+- Network: `ip addr`, `ip link`
+- Storage: `df -h`, `lsblk`
+- USB devices: `lsusb`
+
+#### GPU Metrics
+- AMD GPUs: Try `rocm-smi`, sysfs (`/sys/class/drm/card*/device/`)
+- NVIDIA GPUs: Try `nvidia-smi`
+- Fallback: `sensors` for temperature data
+- Queries: temperature, utilization, clock speeds, power usage
+
+### 5. Ollama Queue System
+
+#### Architecture
+- **File-based queue**: `/var/lib/macha/queues/ollama/`
+- **Queue worker**: `ollama-queue-worker.service` (runs as `macha` user)
+- **Purpose**: Serialize all LLM requests to prevent resource contention
+
+#### Request Flow
+1. Any user (including regular users) → Write request to `pending/`
+2. Queue worker → Process requests serially (FIFO with priority)
+3. Queue worker → Write response to `completed/`
+4. Original requester → Read response from `completed/`
+
+#### Priority Levels
+- `INTERACTIVE` (0): User requests via `macha-chat`, `macha-ask`
+- `AUTONOMOUS` (1): Background maintenance checks
+- `BATCH` (2): Low-priority bulk operations
+
+#### Large Output Handling
+- Outputs >8KB: Split into chunks for hierarchical processing
+- Each chunk ~8KB (~2000 tokens)
+- Process chunks serially with progress feedback
+- Generate chunk summaries → meta-summary
+- Full outputs cached in `/var/lib/macha/tool_cache/`
+
+### 6. Knowledge Base & Learning
+
+#### ChromaDB Collections
+1. **System Context**: Infrastructure topology, service relationships
+2. **Issues**: Historical problems and resolutions
+3. **Knowledge**: Operational wisdom learned from experience
+
+#### Automatic Learning
+- After successful operations, Macha reflects and extracts key learnings
+- Stores: topic, knowledge content, category
+- Retrieved automatically when relevant to current tasks
+- Use `macha-knowledge` CLI to view/manage
+
+### 7. Notifications
+
+#### Gotify Integration
+- Can send notifications via `macha-notify` command
+- Tool: `send_notification(title, message, priority)`
+
+#### Priority Levels
+- `2` (Low/Info): Routine status updates, completed tasks
+- `5` (Medium/Attention): Important events, configuration changes
+- `8` (High/Critical): Service failures, critical errors, security issues
+
+#### When to Notify
+- Critical service failures
+- Successful completion of major operations
+- Configuration changes that may affect users
+- Security-related events
+- When explicitly requested by user
+
+### 8. Safety & Constraints
+
+#### Command Restrictions
+**Allowed Commands** (see `tools.py` for full list):
+- System management: `systemctl`, `journalctl`, `nh`, `nixos-rebuild`
+- Monitoring: `free`, `df`, `uptime`, `ps`, `top`, `ip`, `ss`
+- Hardware: `lscpu`, `lspci`, `lsblk`, `lshw`, `dmidecode`
+- Remote: `ssh`, `scp`
+- Power: `reboot`, `shutdown`, `poweroff` (use cautiously!)
+- File ops: `cat`, `ls`, `grep` +- Network: `ping`, `dig`, `nslookup`, `curl`, `wget` +- Logging: `logger` + +**NOT Allowed**: +- Direct package modifications (`nix-env`, `nix profile`) +- Destructive file operations (`rm -rf`, `dd`) +- User management outside of NixOS config +- Direct editing of system files (use NixOS config instead) + +#### Critical Services +**Never disable or stop:** +- SSH (network access) +- Networking (connectivity) +- systemd (system management) +- Boot-related services + +#### Approval Required +- Reboots or system power changes +- Major configuration changes +- Disabling any service +- Changes to multiple hosts + +### 9. Nix Store Maintenance + +#### Verification & Repair +- Command: `nix-store --verify --check-contents --repair` +- **WARNING**: Can take 30+ minutes to several hours +- Only use when corruption is suspected +- Not for routine maintenance +- Verifies all store paths, repairs corrupted files + +#### Garbage Collection +- Automatic via system configuration +- Can be triggered manually with approval +- Frees disk space by removing unused derivations + +### 10. Conversational Behavior + +#### Distinguish Requests from Acknowledgments +- "Thanks" / "Thank you" → Acknowledgment (don't re-execute) +- "Can you..." / "Please..." → Request (execute) +- "What is..." / "How do..." → Question (answer) + +#### Tool Calling +- Don't repeat tool calls unnecessarily +- If a tool succeeds, don't run it again unless asked +- Use cached results when available (`retrieve_cached_output`) + +#### Context Management +- Be aware of token limits +- Use hierarchical processing for large outputs +- Prune conversation history intelligently +- Cache and summarize when needed + +## Infrastructure Topology + +### Hosts in Flake +- **macha**: Main autonomous system (self), GPU server +- **rhiannon**: Production server +- **alexander**: Production server +- **UCAR-Kinston**: Work laptop +- **test-vm**: Testing environment + +### Shared Configuration +- All hosts share root SSH keys (for `nh` remote deployment) +- `macha` user (UID 2501) exists on all hosts +- Common NixOS configuration via flake + +## Service Ecosystem + +### Core Services on Macha +- `ollama.service`: LLM inference engine +- `ollama-queue-worker.service`: Request serialization +- `macha-autonomous.service`: Autonomous monitoring loop +- Servarr stack: Sonarr, Radarr, Prowlarr, Lidarr, Readarr, Whisparr +- Media: Transmission, SABnzbd, Calibre + +### State Directories +- `/var/lib/macha/`: Main state directory (0755, macha:macha) +- `/var/lib/macha/queues/`: Queue directories (0777 for multi-user) +- `/var/lib/macha/tool_cache/`: Cached tool outputs (0777) +- `/var/lib/macha/system_context.db`: ChromaDB database + +## CLI Tools + +- `macha-chat`: Interactive chat with tool calling +- `macha-ask`: Single-question interface +- `macha-check`: Trigger immediate health check +- `macha-approve`: Approve pending actions +- `macha-logs`: View autonomous service logs +- `macha-issues`: Query issue database +- `macha-knowledge`: Query knowledge base +- `macha-systems`: List managed systems +- `macha-notify`: Send Gotify notification + +## Philosophy & Principles + +1. **KISS (Keep It Simple, Stupid)**: Use existing NixOS options, avoid custom wrappers +2. **Verify first**: Check source code/documentation before acting +3. **Safety first**: Never break critical services, always require approval for risky changes +4. **Learn continuously**: Extract and store operational knowledge +5. 
**Multi-host awareness**: Macha manages the entire infrastructure, not just herself +6. **User-friendly**: Clear communication, appropriate notifications +7. **Patience**: Long-running operations (builds, repairs) can take an hour - don't panic +8. **Tool reuse**: Use existing, verified tools instead of writing custom scripts + +## Future Capabilities (Not Yet Implemented) + +- [ ] Automatic security updates across all hosts +- [ ] Predictive failure detection +- [ ] Resource optimization recommendations +- [ ] Integration with other communication platforms +- [ ] Multi-agent coordination between hosts +- [ ] Automated testing before deployment + diff --git a/EXAMPLES.md b/EXAMPLES.md new file mode 100644 index 0000000..7e5be55 --- /dev/null +++ b/EXAMPLES.md @@ -0,0 +1,275 @@ +# Macha Autonomous System - Configuration Examples + +## Basic Configurations + +### Conservative (Recommended for Start) +```nix +services.macha-autonomous = { + enable = true; + autonomyLevel = "suggest"; # Require approval for all actions + checkInterval = 300; # Check every 5 minutes + model = "llama3.1:70b"; # Most capable model +}; +``` + +### Moderate Autonomy +```nix +services.macha-autonomous = { + enable = true; + autonomyLevel = "auto-safe"; # Auto-fix safe issues + checkInterval = 180; # Check every 3 minutes + model = "llama3.1:70b"; +}; +``` + +### High Autonomy (Experimental) +```nix +services.macha-autonomous = { + enable = true; + autonomyLevel = "auto-full"; # Full autonomy + checkInterval = 300; + model = "llama3.1:70b"; +}; +``` + +### Monitoring Only +```nix +services.macha-autonomous = { + enable = true; + autonomyLevel = "observe"; # No actions, just watch + checkInterval = 60; # Check every minute + model = "qwen3:8b-fp16"; # Lighter model is fine for observation +}; +``` + +## Advanced Scenarios + +### Using a Smaller Model (Faster, Less Capable) +```nix +services.macha-autonomous = { + enable = true; + autonomyLevel = "auto-safe"; + checkInterval = 120; + model = "qwen3:8b-fp16"; # Faster inference, less reasoning depth + # or + # model = "llama3.1:8b"; # Also good for simple tasks +}; +``` + +### High-Frequency Monitoring +```nix +services.macha-autonomous = { + enable = true; + autonomyLevel = "auto-safe"; + checkInterval = 60; # Check every minute + model = "qwen3:4b-instruct-2507-fp16"; # Lightweight model +}; +``` + +### Remote Ollama (if running Ollama elsewhere) +```nix +services.macha-autonomous = { + enable = true; + autonomyLevel = "suggest"; + checkInterval = 300; + ollamaHost = "http://192.168.1.100:11434"; # Remote Ollama instance + model = "llama3.1:70b"; +}; +``` + +## Manual Testing Workflow + +1. **Test with a one-shot run:** +```bash +# Run once in observe mode +macha-check + +# Review what it detected +cat /var/lib/macha-autonomous/decisions.jsonl | tail -1 | jq . +``` + +2. **Enable in suggest mode:** +```nix +services.macha-autonomous = { + enable = true; + autonomyLevel = "suggest"; + checkInterval = 300; + model = "llama3.1:70b"; +}; +``` + +3. **Rebuild and start:** +```bash +sudo nixos-rebuild switch --flake .#macha +sudo systemctl status macha-autonomous +``` + +4. **Monitor for a while:** +```bash +# Watch the logs +journalctl -u macha-autonomous -f + +# Or use the helper +macha-logs service +``` + +5. **Review proposed actions:** +```bash +macha-approve list +``` + +6. 
**Graduate to auto-safe when comfortable:**
+```nix
+services.macha-autonomous.autonomyLevel = "auto-safe";
+```
+
+## Scenario-Based Examples
+
+### Media Server (Let it auto-restart services)
+```nix
+services.macha-autonomous = {
+  enable = true;
+  autonomyLevel = "auto-safe"; # Auto-restart failed arr apps
+  checkInterval = 180;
+  model = "llama3.1:70b";
+};
+```
+
+### Development Machine (Observe only, you want control)
+```nix
+services.macha-autonomous = {
+  enable = true;
+  autonomyLevel = "observe";
+  checkInterval = 600; # Check less frequently
+  model = "llama3.1:8b"; # Lighter model
+};
+```
+
+### Critical Production (Suggest only, manual approval)
+```nix
+services.macha-autonomous = {
+  enable = true;
+  autonomyLevel = "suggest";
+  checkInterval = 120; # More frequent monitoring
+  model = "llama3.1:70b"; # Best reasoning
+};
+```
+
+### Experimental/Learning (Full autonomy)
+```nix
+services.macha-autonomous = {
+  enable = true;
+  autonomyLevel = "auto-full";
+  checkInterval = 300;
+  model = "llama3.1:70b";
+};
+```
+
+## Customizing Behavior
+
+### The config file lives at:
+`/etc/macha-autonomous/config.json` (auto-generated from NixOS config)
+
+### To modify the AI prompts:
+Edit the Python files in `systems/macha-configs/autonomous/`:
+- `agent.py` - AI analysis and decision prompts
+- `monitor.py` - What data to collect
+- `executor.py` - Safety rules and action execution
+- `orchestrator.py` - Main control flow
+
+After editing, rebuild:
+```bash
+sudo nixos-rebuild switch --flake .#macha
+sudo systemctl restart macha-autonomous
+```
+
+## Integration with Other Services
+
+### Example: Auto-restart specific services
+The system will automatically detect and propose restarting failed services.
+
+### Example: Disk cleanup when space is low
+The monitor detects low disk space, the AI proposes cleanup, and the executor runs `nix-collect-garbage`.
+
+### Example: Log analysis
+AI analyzes recent error logs and can propose fixes based on error patterns.
+
+## Debugging
+
+### See what the monitor sees:
+```bash
+sudo -u macha-autonomous python3 /nix/store/.../monitor.py
+```
+
+### Test the AI agent:
+```bash
+sudo -u macha-autonomous python3 /nix/store/.../agent.py test
+```
+
+### View all snapshots:
+```bash
+ls -lh /var/lib/macha-autonomous/snapshot_*.json
+cat "$(ls -t /var/lib/macha-autonomous/snapshot_*.json | head -1)" | jq .
+```
+
+### Check approval queue:
+```bash
+cat /var/lib/macha-autonomous/approval_queue.json | jq .
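+
+# A hedged extra: count pending actions without depending on field names
+# (assumes the queue file is a JSON array; adjust if the schema differs)
+jq 'length' /var/lib/macha-autonomous/approval_queue.json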
+``` + +## Performance Tuning + +### Model Choice Impact: + +| Model | Speed | Capability | RAM Usage | Best For | +|-------|-------|------------|-----------|----------| +| llama3.1:70b | Slow (~30s) | Excellent | ~40GB | Complex reasoning | +| llama3.1:8b | Fast (~3s) | Good | ~5GB | General use | +| qwen3:8b-fp16 | Fast (~2s) | Good | ~16GB | General use | +| qwen3:4b | Very Fast (~1s) | Moderate | ~8GB | Simple tasks | + +### Check Interval Impact: +- 60s: High responsiveness, more resource usage +- 300s (default): Good balance +- 600s: Low overhead, slower detection + +### Memory Usage: +- Monitor: ~50MB +- Agent (per query): Depends on model (see above) +- Executor: ~30MB +- Orchestrator: ~20MB + +Total continuous overhead: ~100MB + model inference when running + +## Security Considerations + +### The autonomous user has sudo access to: +- `systemctl restart/status` - Restart services +- `journalctl` - Read logs +- `nix-collect-garbage` - Clean up Nix store + +### It CANNOT: +- Modify arbitrary files +- Access user home directories (ProtectHome=true) +- Disable protected services (SSH, networking) +- Make changes without logging + +### Audit trail: +All actions are logged in `/var/lib/macha-autonomous/actions.jsonl` + +### To revoke access: +Set `enable = false` and rebuild, or stop the service. + +## Future: MCP Integration + +You already have MCP servers installed: +- `mcp-nixos` - NixOS-specific tools +- `gitea-mcp-server` - Git integration +- `emcee` - General MCP orchestration + +Future versions could integrate these for: +- Better NixOS config manipulation +- Git-based config versioning +- More sophisticated tooling + +Stay tuned! diff --git a/LOGGING_EXAMPLE.md b/LOGGING_EXAMPLE.md new file mode 100644 index 0000000..4dc954f --- /dev/null +++ b/LOGGING_EXAMPLE.md @@ -0,0 +1,217 @@ +# Enhanced Logging Example + +This shows what the improved journalctl output will look like for Macha's autonomous system. + +## Example Output + +### Maintenance Cycle Start +``` +[2025-10-01T14:30:00] === Starting maintenance cycle === +[2025-10-01T14:30:00] Collecting system health data... + +[2025-10-01T14:30:02] ============================================================ +[2025-10-01T14:30:02] SYSTEM HEALTH SUMMARY +[2025-10-01T14:30:02] ============================================================ +[2025-10-01T14:30:02] Resources: CPU 25.3%, Memory 45.2%, Load 1.24 +[2025-10-01T14:30:02] Disk: 35.6% used (/ partition) +[2025-10-01T14:30:02] Services: 1 failed +[2025-10-01T14:30:02] - ollama.service (failed) +[2025-10-01T14:30:02] Network: Internet reachable +[2025-10-01T14:30:02] Recent logs: 3 errors in last hour +[2025-10-01T14:30:02] ============================================================ + +[2025-10-01T14:30:02] KEY METRICS: +[2025-10-01T14:30:02] CPU Usage: 25.3% +[2025-10-01T14:30:02] Memory Usage: 45.2% +[2025-10-01T14:30:02] Load Average: 1.24 +[2025-10-01T14:30:02] Failed Services: 1 +[2025-10-01T14:30:02] Errors (1h): 3 +[2025-10-01T14:30:02] Disk /: 35.6% used +[2025-10-01T14:30:02] Disk /home: 62.1% used +[2025-10-01T14:30:02] Disk /var: 28.9% used +[2025-10-01T14:30:02] Internet: ✅ Connected +``` + +### AI Analysis Section +``` +[2025-10-01T14:30:02] Analyzing system state with AI... 
+ +[2025-10-01T14:30:35] ============================================================ +[2025-10-01T14:30:35] AI ANALYSIS RESULTS +[2025-10-01T14:30:35] ============================================================ +[2025-10-01T14:30:35] Overall Status: ATTENTION_NEEDED +[2025-10-01T14:30:35] Assessment: System has one failed service that should be restarted + +[2025-10-01T14:30:35] Detected 1 issue(s): + +[2025-10-01T14:30:35] Issue #1: +[2025-10-01T14:30:35] Severity: WARNING +[2025-10-01T14:30:35] Category: services +[2025-10-01T14:30:35] Description: ollama.service has failed and needs to be restarted +[2025-10-01T14:30:35] ⚠️ ACTION REQUIRED + +[2025-10-01T14:30:35] Recommended Actions (1): +[2025-10-01T14:30:35] - Restart ollama.service to restore LLM functionality +[2025-10-01T14:30:35] ============================================================ +``` + +### Action Handling Section +``` +[2025-10-01T14:30:35] Found 1 issues requiring action + +[2025-10-01T14:30:35] ──────────────────────────────────────────────────────────── +[2025-10-01T14:30:35] Addressing issue: ollama.service has failed and needs to be restarted +[2025-10-01T14:30:35] Requesting AI fix proposal... + +[2025-10-01T14:30:45] AI FIX PROPOSAL: +[2025-10-01T14:30:45] Diagnosis: ollama.service crashed or failed to start properly +[2025-10-01T14:30:45] Proposed Action: Restart ollama.service using systemctl +[2025-10-01T14:30:45] Action Type: systemd_restart +[2025-10-01T14:30:45] Risk Level: LOW +[2025-10-01T14:30:45] Commands to execute: +[2025-10-01T14:30:45] - systemctl restart ollama.service +[2025-10-01T14:30:45] Reasoning: Restarting the service is a safe, standard troubleshooting step +[2025-10-01T14:30:45] Rollback Plan: Service will return to failed state if restart doesn't work + +[2025-10-01T14:30:45] Executing action... + +[2025-10-01T14:30:47] EXECUTION RESULT: +[2025-10-01T14:30:47] Status: QUEUED_FOR_APPROVAL +[2025-10-01T14:30:47] Executed: No +[2025-10-01T14:30:47] Reason: Autonomy level requires manual approval +``` + +### Cycle Complete Summary +``` +[2025-10-01T14:30:47] No issues requiring immediate action + +[2025-10-01T14:30:47] ============================================================ +[2025-10-01T14:30:47] MAINTENANCE CYCLE COMPLETE +[2025-10-01T14:30:47] ============================================================ +[2025-10-01T14:30:47] Status: ATTENTION_NEEDED +[2025-10-01T14:30:47] Issues Found: 1 +[2025-10-01T14:30:47] Actions Taken: 1 +[2025-10-01T14:30:47] - Executed: 0 +[2025-10-01T14:30:47] - Queued for approval: 1 +[2025-10-01T14:30:47] Next check in: 300 seconds +[2025-10-01T14:30:47] ============================================================ +``` + +## When System is Healthy + +``` +[2025-10-01T14:35:00] === Starting maintenance cycle === +[2025-10-01T14:35:00] Collecting system health data... 
+ +[2025-10-01T14:35:02] ============================================================ +[2025-10-01T14:35:02] SYSTEM HEALTH SUMMARY +[2025-10-01T14:35:02] ============================================================ +[2025-10-01T14:35:02] Resources: CPU 12.5%, Memory 38.1%, Load 0.65 +[2025-10-01T14:35:02] Disk: 35.6% used (/ partition) +[2025-10-01T14:35:02] Services: All running +[2025-10-01T14:35:02] Network: Internet reachable +[2025-10-01T14:35:02] Recent logs: 0 errors in last hour +[2025-10-01T14:35:02] ============================================================ + +[2025-10-01T14:35:02] KEY METRICS: +[2025-10-01T14:35:02] CPU Usage: 12.5% +[2025-10-01T14:35:02] Memory Usage: 38.1% +[2025-10-01T14:35:02] Load Average: 0.65 +[2025-10-01T14:35:02] Failed Services: 0 +[2025-10-01T14:35:02] Errors (1h): 0 +[2025-10-01T14:35:02] Disk /: 35.6% used +[2025-10-01T14:35:02] Internet: ✅ Connected + +[2025-10-01T14:35:02] Analyzing system state with AI... + +[2025-10-01T14:35:28] ============================================================ +[2025-10-01T14:35:28] AI ANALYSIS RESULTS +[2025-10-01T14:35:28] ============================================================ +[2025-10-01T14:35:28] Overall Status: HEALTHY +[2025-10-01T14:35:28] Assessment: System is operating normally with no issues detected + +[2025-10-01T14:35:28] ✅ No issues detected +[2025-10-01T14:35:28] ============================================================ + +[2025-10-01T14:35:28] No issues requiring immediate action + +[2025-10-01T14:35:28] ============================================================ +[2025-10-01T14:35:28] MAINTENANCE CYCLE COMPLETE +[2025-10-01T14:35:28] ============================================================ +[2025-10-01T14:35:28] Status: HEALTHY +[2025-10-01T14:35:28] Issues Found: 0 +[2025-10-01T14:35:28] Actions Taken: 0 +[2025-10-01T14:35:28] Next check in: 300 seconds +[2025-10-01T14:35:28] ============================================================ +``` + +## Viewing Logs + +### Follow live logs +```bash +journalctl -u macha-autonomous.service -f +``` + +### See only AI decisions +```bash +journalctl -u macha-autonomous.service | grep "AI ANALYSIS" +``` + +### See only execution results +```bash +journalctl -u macha-autonomous.service | grep "EXECUTION RESULT" +``` + +### See key metrics +```bash +journalctl -u macha-autonomous.service | grep "KEY METRICS" -A 10 +``` + +### Filter by status level +```bash +# Only show intervention required +journalctl -u macha-autonomous.service | grep "INTERVENTION_REQUIRED" + +# Only show critical issues +journalctl -u macha-autonomous.service | grep "CRITICAL" + +# Only show action required +journalctl -u macha-autonomous.service | grep "ACTION REQUIRED" +``` + +### Summary of last cycle +```bash +journalctl -u macha-autonomous.service | grep "MAINTENANCE CYCLE COMPLETE" -B 5 | tail -6 +``` + +## Benefits of Enhanced Logging + +### 1. **Easy to Scan** +Clear section headers with separators make it easy to find what you need + +### 2. **Structured Data** +Key metrics are labeled consistently for easy parsing/grepping + +### 3. **Complete Context** +Each cycle shows: +- What the system saw +- What the AI thought +- What action was proposed +- What actually happened + +### 4. **AI Transparency** +You can see: +- The AI's reasoning for each decision +- Risk assessment for each action +- Rollback plans if something goes wrong + +### 5. **Audit Trail** +Everything is logged to journalctl for long-term storage and analysis + +### 6. 
**Troubleshooting**
+If something goes wrong, you have complete context:
+- System state before the issue
+- AI's diagnosis
+- Action attempted
+- Result of action
+
diff --git a/NOTIFICATIONS.md b/NOTIFICATIONS.md
new file mode 100644
index 0000000..f6593d4
--- /dev/null
+++ b/NOTIFICATIONS.md
@@ -0,0 +1,224 @@
+# Gotify Notifications Setup
+
+Macha's autonomous system can now send notifications to Gotify on Rhiannon for critical events.
+
+## What Gets Notified
+
+### High Priority (🚨 Priority 8)
+- **Critical issues detected** - System problems requiring immediate attention
+- **Service failures** - When critical services fail
+- **Failed actions** - When an action execution fails
+- **Intervention required** - When system status is critical
+
+### Medium Priority (📋 Priority 5)
+- **Actions queued for approval** - When medium/high-risk actions need manual review
+- **System attention needed** - When system status needs attention
+
+### Low Priority (✅ Priority 2)
+- **Successful actions** - When safe actions execute successfully
+- **System healthy** - Periodic health check confirmations (if enabled)
+
+## Setup Instructions
+
+### Step 1: Create Gotify Application on Rhiannon
+
+1. Open Gotify web interface on Rhiannon:
+   ```bash
+   # URL: http://rhiannon:8181 (or use external access)
+   ```
+
+2. Log in to Gotify
+
+3. Go to **"Apps"** tab
+
+4. Click **"Create Application"**
+
+5. Name it: `Macha Autonomous System`
+
+6. Copy the generated **Application Token**
+
+### Step 2: Configure Macha
+
+Edit `/home/lily/Documents/gitrepos/nixos-servers/systems/macha.nix`:
+
+```nix
+services.macha-autonomous = {
+  enable = true;
+  autonomyLevel = "suggest";
+  checkInterval = 300;
+  model = "llama3.1:70b";
+
+  # Gotify notifications
+  gotifyUrl = "http://rhiannon:8181";
+  gotifyToken = "YOUR_TOKEN_HERE"; # Paste the token from Step 1
+};
+```
+
+### Step 3: Rebuild and Deploy
+
+```bash
+cd /home/lily/Documents/gitrepos/nixos-servers
+sudo nixos-rebuild switch --flake .#macha
+```
+
+### Step 4: Test Notifications
+
+Send a test notification:
+
+```bash
+macha-notify "Test" "Macha notifications are working!" 5
+```
+
+You should see this notification appear in Gotify on Rhiannon.
+
+## CLI Tools
+
+### Send Test Notification
+```bash
+macha-notify <title> <message> [priority]
+
+# Examples:
+macha-notify "Test" "This is a test" 5
+macha-notify "Critical" "This is urgent" 8
+macha-notify "Info" "Just FYI" 2
+```
+
+Priorities:
+- `2` - Low (✅ green)
+- `5` - Medium (📋 blue)
+- `8` - High (🚨 red)
+
+### Check if Notifications are Enabled
+
+```bash
+# View the service environment
+systemctl show macha-autonomous.service | grep GOTIFY
+```
+
+## Notification Examples
+
+### Critical Issue
+```
+🚨 Macha: Critical Issue
+⚠️ Critical Issue Detected
+
+High disk usage on /var partition (95% full)
+
+Details:
+Category: disk
+```
+
+### Action Queued for Approval
+```
+📋 Macha: Action Needs Approval
+ℹ️ Action Queued for Approval
+
+Action: Restart failed service: ollama.service
+Risk Level: low
+
+Use 'macha-approve list' to review
+```
+
+### Action Executed Successfully
+```
+✅ Macha: Action Success
+✅ Action Success
+
+Restart failed service: ollama.service
+
+Output:
+Service restarted successfully
+```
+
+### Action Failed
+```
+❌ Macha: Action Failed
+❌ Action Failed
+
+Clean up disk space with nix-collect-garbage
+
+Output:
+Error: Insufficient permissions
+```
+
+## Security Notes
+
+1. **Token Storage**: The Gotify token is stored in the NixOS configuration.
Consider using a secrets management solution for production. + +2. **Network Access**: Macha needs network access to Rhiannon. Ensure your firewall allows HTTP traffic between them. + +3. **Token Scope**: The Gotify token only allows sending messages, not reading or managing Gotify. + +## Troubleshooting + +### Notifications Not Appearing + +1. **Check Gotify is running on Rhiannon:** + ```bash + ssh rhiannon systemctl status gotify + ``` + +2. **Test connectivity from Macha:** + ```bash + curl http://rhiannon:8181/health + ``` + +3. **Verify token is set:** + ```bash + macha-notify "Test" "Testing" 5 + ``` + +4. **Check service logs:** + ```bash + macha-logs service | grep -i gotify + ``` + +### Notification Spam + +If you're getting too many notifications, you can: + +1. **Disable notifications temporarily:** + ```nix + services.macha-autonomous.gotifyUrl = ""; # Empty string disables + ``` + +2. **Adjust autonomy level:** + ```nix + services.macha-autonomous.autonomyLevel = "auto-safe"; # Fewer approval notifications + ``` + +3. **Increase check interval:** + ```nix + services.macha-autonomous.checkInterval = 900; # Check every 15 minutes instead of 5 + ``` + +## Implementation Details + +### Files Modified +- `notifier.py` - Gotify notification client +- `module.nix` - Added configuration options and CLI tool +- `orchestrator.py` - Integrated notifications at decision points +- `macha.nix` - Added Gotify configuration + +### Notification Flow +``` +Issue Detected → AI Analysis → Decision Made → Notification Sent + ↓ + Queued or Executed → Notification Sent +``` + +### Graceful Degradation +- If Gotify is unavailable, the system continues to operate +- Failed notifications are logged but don't crash the service +- Notifications have a 10-second timeout to prevent blocking + +## Future Enhancements + +Possible improvements: +- [ ] Rate limiting to prevent notification spam +- [ ] Notification grouping (batch similar issues) +- [ ] Custom notification templates +- [ ] Priority-based notification filtering +- [ ] Integration with other notification services (email, SMS) +- [ ] Secrets management for tokens (agenix, sops-nix) + diff --git a/QUICKSTART.md b/QUICKSTART.md new file mode 100644 index 0000000..f1afaad --- /dev/null +++ b/QUICKSTART.md @@ -0,0 +1,229 @@ +# Macha Autonomous System - Quick Start Guide + +## What is This? + +Macha now has a self-maintenance system that uses local AI (via Ollama) to monitor, analyze, and maintain itself. Think of it as a 24/7 system administrator that watches over Macha. + +## How It Works + +1. **Monitor**: Every 5 minutes, collects system health data (services, resources, logs, etc.) +2. **Analyze**: Uses llama3.1:70b to analyze the data and detect issues +3. **Act**: Based on autonomy level, either proposes fixes or executes them automatically +4. 
**Learn**: Logs all decisions and actions for auditing and improvement + +## Autonomy Levels + +### `observe` - Monitoring Only +- Monitors system health +- Logs everything +- Takes NO actions +- Good for: Testing, learning what the system sees + +### `suggest` - Approval Required (DEFAULT) +- Monitors and analyzes +- Proposes fixes +- Requires manual approval before executing +- Good for: Production use, when you want control + +### `auto-safe` - Limited Autonomy +- Auto-executes "safe" actions: + - Restarting failed services + - Disk cleanup + - Log rotation + - Read-only diagnostics +- Asks approval for risky changes +- Good for: Hands-off operation with safety net + +### `auto-full` - Full Autonomy +- Auto-executes most actions +- Still requires approval for HIGH RISK actions +- Never touches protected services (SSH, networking, etc.) +- Good for: Experimental, when you trust the system + +## Commands + +### Check the status +```bash +# View the service status +systemctl status macha-autonomous + +# View live logs +macha-logs service + +# View AI decision log +macha-logs decisions + +# View action execution log +macha-logs actions + +# View orchestrator log +macha-logs orchestrator +``` + +### Run a manual check +```bash +# Run one maintenance cycle now +macha-check +``` + +### Approval workflow (when autonomyLevel = "suggest") +```bash +# List pending actions awaiting approval +macha-approve list + +# Approve action number 0 +macha-approve approve 0 +``` + +### Change autonomy level +Edit `/home/lily/Documents/nixos-servers/systems/macha.nix`: +```nix +services.macha-autonomous = { + enable = true; + autonomyLevel = "auto-safe"; # Change this + checkInterval = 300; + model = "llama3.1:70b"; +}; +``` + +Then rebuild: +```bash +sudo nixos-rebuild switch --flake .#macha +``` + +## What Can It Do? + +### Automatically Detects +- Failed systemd services +- High resource usage (CPU, RAM, disk) +- Recent errors in logs +- Network connectivity issues +- Disk space problems +- Boot/uptime anomalies + +### Can Propose/Execute +- Restart failed services +- Clean up disk space (nix store, old logs) +- Investigate issues (run diagnostics) +- Propose configuration changes (for manual review) +- NixOS rebuilds (with safety checks) + +### Safety Features +- **Protected services**: Never touches SSH, networking, systemd core +- **Dry-run testing**: Tests NixOS rebuilds before applying +- **Action logging**: Every action is logged with context +- **Rollback capability**: Can revert changes +- **Rate limiting**: Won't spam actions +- **Human override**: You can always disable or intervene + +## Example Workflow + +1. **System detects failed service** + ``` + Monitor: "ollama.service is failed" + AI Agent: "The ollama service crashed. Propose restarting it." + ``` + +2. **In `suggest` mode (default)** + ``` + Executor: "Action queued for approval" + You: Run `macha-approve list` + You: Review the proposed action + You: Run `macha-approve approve 0` + Executor: Restarts the service + ``` + +3. **In `auto-safe` mode** + ``` + Executor: "Low risk action, auto-executing" + Executor: Restarts the service automatically + You: Check logs later to see what happened + ``` + +## Monitoring the System + +All data is stored in `/var/lib/macha-autonomous/`: +- `orchestrator.log` - Main system log +- `decisions.jsonl` - AI analysis decisions (JSON Lines format) +- `actions.jsonl` - Executed actions log +- `snapshot_*.json` - System state snapshots +- `approval_queue.json` - Pending actions + +## Tips + +1. 
**Start with `suggest` mode** - Get comfortable with what it proposes +2. **Review the logs** - See what it's detecting and proposing +3. **Graduate to `auto-safe`** - Let it handle routine maintenance +4. **Use `observe` for debugging** - If something seems wrong +5. **Check approval queue regularly** - If using `suggest` mode + +## Troubleshooting + +### Service won't start +```bash +# Check for errors +journalctl -u macha-autonomous -n 50 + +# Verify Ollama is running +systemctl status ollama + +# Test Ollama manually +curl http://localhost:11434/api/generate -d '{"model": "llama3.1:70b", "prompt": "test"}' +``` + +### AI making bad decisions +- Switch to `observe` mode to stop actions +- Review `decisions.jsonl` to see reasoning +- File an issue or adjust prompts in `agent.py` + +### Want to disable temporarily +```bash +sudo systemctl stop macha-autonomous +``` + +### Want to disable permanently +Edit `systems/macha.nix`: +```nix +services.macha-autonomous.enable = false; +``` +Then rebuild. + +## Architecture + +``` +┌─────────────────────────────────────────────────────────┐ +│ Orchestrator │ +│ (Main loop, runs every 5 minutes) │ +└────────────┬──────────────┬──────────────┬──────────────┘ + │ │ │ + ┌───▼────┐ ┌────▼────┐ ┌────▼─────┐ + │Monitor │ │ Agent │ │ Executor │ + │ │───▶│ (AI) │───▶│ (Safe) │ + └────────┘ └─────────┘ └──────────┘ + │ │ │ + Collects Analyzes Executes + System Issues Actions + Health w/ LLM Safely +``` + +## Future Enhancements + +Potential future capabilities: +- Integration with MCP servers (already installed!) +- Predictive maintenance (learning from patterns) +- Self-optimization (tuning configs based on usage) +- Cluster management (if you add more systems) +- Automated backups and disaster recovery +- Security monitoring and hardening +- Performance tuning recommendations + +## Philosophy + +The goal is a system that maintains itself while being: +1. **Safe** - Never breaks critical functionality +2. **Transparent** - All decisions are logged and explainable +3. **Conservative** - When in doubt, ask for approval +4. **Learning** - Gets better over time +5. **Human-friendly** - Easy to understand and override + +Macha is here to help you, not replace you! diff --git a/README.md b/README.md new file mode 100644 index 0000000..b45450f --- /dev/null +++ b/README.md @@ -0,0 +1,93 @@ +# Macha - AI-Powered Autonomous System Administrator + +Macha is an AI-powered autonomous system administrator for NixOS that monitors system health, diagnoses issues, and can take corrective actions with appropriate approval workflows. 
+ +## Features + +- **Autonomous Monitoring**: Continuous health checks with configurable intervals +- **Multi-Host Management**: SSH-based management of multiple NixOS hosts +- **Tool Calling**: Comprehensive system administration tools via Ollama LLM +- **Queue-Based Architecture**: Serialized LLM requests to prevent resource contention +- **Knowledge Base**: ChromaDB-backed learning system for operational wisdom +- **Approval Workflows**: Safety-first approach with configurable autonomy levels +- **Notification System**: Gotify integration for alerts + +## Quick Start + +### As a NixOS Flake Input + +Add to your `flake.nix`: + +```nix +{ + inputs = { + nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; + macha-autonomous.url = "git+https://git.coven.systems/lily/macha-autonomous"; + }; + + outputs = { self, nixpkgs, macha-autonomous }: { + nixosConfigurations.yourhost = nixpkgs.lib.nixosSystem { + modules = [ + macha-autonomous.nixosModules.default + { + services.macha-autonomous = { + enable = true; + autonomyLevel = "suggest"; # observe, suggest, auto-safe, auto-full + checkInterval = 300; + ollamaHost = "http://localhost:11434"; + model = "gpt-oss:latest"; + }; + } + ]; + }; + }; +} +``` + +## Configuration Options + +See `module.nix` for full configuration options including: +- Autonomy levels (observe, suggest, auto-safe, auto-full) +- Check intervals +- Ollama host and model settings +- Git repository monitoring +- Service user/group configuration + +## CLI Tools + +- `macha-chat` - Interactive chat interface +- `macha-ask` - Single-question interface +- `macha-check` - Trigger immediate health check +- `macha-approve` - Approve pending actions +- `macha-logs` - View service logs +- `macha-issues` - Query issue database +- `macha-knowledge` - Query knowledge base +- `macha-systems` - List managed systems +- `macha-notify` - Send Gotify notification + +## Architecture + +- **Agent**: Core AI logic with tool calling +- **Orchestrator**: Main monitoring loop +- **Executor**: Safe action execution +- **Queue System**: Serialized Ollama requests with priorities +- **Context DB**: ChromaDB for system context and learning +- **Tools**: System administration capabilities + +## Requirements + +- NixOS with flakes enabled +- Ollama service running +- Python 3 with requests, psutil, chromadb + +## Documentation + +See `DESIGN.md` for comprehensive architecture documentation. + +## License + +[Add your license here] + +## Author + +Lily Miller diff --git a/SUMMARY.md b/SUMMARY.md new file mode 100644 index 0000000..5804f81 --- /dev/null +++ b/SUMMARY.md @@ -0,0 +1,317 @@ +# Macha Autonomous System - Implementation Summary + +## What We Built + +A complete self-maintaining system for Macha that uses local AI models (via Ollama) to monitor, analyze, and fix issues automatically. This is a production-ready implementation with safety mechanisms, audit trails, and multiple autonomy levels. + +## Components Created + +### 1. System Monitor (`monitor.py` - 310 lines) +- Collects comprehensive system health data every cycle +- Monitors: systemd services, resources (CPU/RAM), disk usage, logs, network, NixOS status +- Saves snapshots for historical analysis +- Generates human-readable summaries + +### 2. AI Agent (`agent.py` - 238 lines) +- Analyzes system state using llama3.1:70b (or other models) +- Detects issues and classifies severity +- Proposes specific, actionable fixes +- Logs all decisions for auditing +- Uses structured JSON responses for reliability + +### 3. 
Safe Executor (`executor.py` - 371 lines) +- Executes actions with safety checks +- Protected services list (never touches SSH, networking, etc.) +- Supports multiple action types: + - `systemd_restart` - Restart failed services + - `cleanup` - Disk/log cleanup + - `nix_rebuild` - NixOS configuration rebuilds + - `config_change` - Config file modifications + - `investigation` - Diagnostic commands +- Approval queue for manual review +- Complete action logging + +### 4. Orchestrator (`orchestrator.py` - 211 lines) +- Main control loop +- Coordinates monitor → agent → executor pipeline +- Handles signals and graceful shutdown +- Configuration management +- Multiple run modes (once, continuous, daemon) + +### 5. NixOS Module (`module.nix` - 168 lines) +- Full systemd service integration +- Configuration options via NixOS +- User/group management +- Security hardening +- CLI tools (`macha-check`, `macha-approve`, `macha-logs`) +- Resource limits (1GB RAM, 50% CPU) + +### 6. Documentation +- `README.md` - Architecture overview +- `QUICKSTART.md` - User guide +- `EXAMPLES.md` - Configuration examples +- `SUMMARY.md` - This file + +**Total: ~1,400 lines of code** + +## Architecture + +``` +┌──────────────────────────────────────────────────────────────┐ +│ NixOS Module │ +│ - Creates systemd service │ +│ - Manages user/permissions │ +│ - Provides CLI tools │ +└───────────────────────┬──────────────────────────────────────┘ + │ + ▼ +┌──────────────────────────────────────────────────────────────┐ +│ Orchestrator │ +│ - Runs main loop (every 5 minutes) │ +│ - Coordinates components │ +│ - Handles errors and logging │ +└───────┬──────────────┬──────────────┬──────────────┬─────────┘ + │ │ │ │ + ▼ ▼ ▼ ▼ + ┌─────────┐ ┌──────────┐ ┌─────────┐ ┌──────────┐ + │ Monitor │──▶│ Agent │──▶│Executor │──▶│ Logs │ + │ │ │ (AI) │ │ (Safe) │ │ │ + └─────────┘ └──────────┘ └─────────┘ └──────────┘ + │ │ │ │ + │ │ │ │ + Collects Analyzes Executes Records + System with LLM Actions Everything + Health (Ollama) Safely +``` + +## Data Flow + +1. **Collection**: Monitor gathers system health data +2. **Analysis**: Agent sends data + prompts to Ollama +3. **Decision**: AI returns structured analysis (JSON) +4. **Execution**: Executor checks permissions & autonomy level +5. **Action**: Either executes or queues for approval +6. **Logging**: All steps logged to JSONL files + +## Safety Mechanisms + +### Multi-Level Protection +1. **Autonomy Levels**: observe → suggest → auto-safe → auto-full +2. **Protected Services**: Hardcoded list of critical services +3. **Dry-Run Testing**: NixOS rebuilds tested before applying +4. **Approval Queue**: Manual review workflow +5. **Action Logging**: Complete audit trail +6. **Resource Limits**: systemd enforced (1GB RAM, 50% CPU) +7. **Rollback Capability**: Can revert changes +8. 
**Timeout Protection**: All operations have timeouts + +### What It Can Do Automatically (auto-safe) +- ✅ Restart failed services (except protected ones) +- ✅ Clean up disk space (nix-collect-garbage) +- ✅ Rotate/clean logs +- ✅ Run diagnostics +- ❌ Modify configs (requires approval) +- ❌ Rebuild NixOS (requires approval) +- ❌ Touch protected services + +## Files Created + +``` +systems/macha-configs/autonomous/ +├── __init__.py # Python package marker +├── monitor.py # System health monitoring +├── agent.py # AI analysis and reasoning +├── executor.py # Safe action execution +├── orchestrator.py # Main control loop +├── module.nix # NixOS integration +├── README.md # Architecture docs +├── QUICKSTART.md # User guide +├── EXAMPLES.md # Configuration examples +└── SUMMARY.md # This file +``` + +## Integration Points + +### Modified Files +- `systems/macha.nix` - Added autonomous module and configuration + +### Created Systemd Service +- `macha-autonomous.service` - Main service +- Runs continuously, checks every 5 minutes +- Auto-starts on boot +- Restart on failure + +### Created Users/Groups +- `macha-autonomous` user (system user) +- Limited sudo access for specific commands +- Home: `/var/lib/macha-autonomous` + +### Created CLI Commands +- `macha-check` - Run manual health check +- `macha-approve list` - Show pending actions +- `macha-approve approve <N>` - Approve action N +- `macha-logs [orchestrator|decisions|actions|service]` - View logs + +### State Directory +`/var/lib/macha-autonomous/` contains: +- `orchestrator.log` - Main log +- `decisions.jsonl` - AI analysis log +- `actions.jsonl` - Executed actions log +- `snapshot_*.json` - System state snapshots +- `approval_queue.json` - Pending actions +- `suggested_patch_*.txt` - Config change suggestions + +## Configuration + +### Current Configuration (in systems/macha.nix) +```nix +services.macha-autonomous = { + enable = true; + autonomyLevel = "suggest"; # Requires approval + checkInterval = 300; # 5 minutes + model = "llama3.1:70b"; # Most capable model +}; +``` + +### To Deploy +```bash +# Build and activate +sudo nixos-rebuild switch --flake .#macha + +# Check status +systemctl status macha-autonomous + +# View logs +macha-logs service +``` + +## Usage Workflow + +### Day 1: Observation +```bash +# Just watch what it detects +macha-logs decisions +``` + +### Day 2-7: Review Proposals +```bash +# Check what it wants to do +macha-approve list + +# Approve good actions +macha-approve approve 0 +``` + +### Week 2+: Increase Autonomy +```nix +# Let it handle safe actions automatically +services.macha-autonomous.autonomyLevel = "auto-safe"; +``` + +### Monthly: Review Audit Logs +```bash +# See what it's been doing +cat /var/lib/macha-autonomous/actions.jsonl | jq . 
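+
+# actions.jsonl is JSON Lines (one object per line), so plain line tools
+# compose with jq. For example, inspect only the most recent action:
+tail -1 /var/lib/macha-autonomous/actions.jsonl | jq .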
+``` + +## Performance Characteristics + +### Resource Usage +- **Idle**: ~100MB RAM +- **Active (w/ llama3.1:70b)**: ~100MB + ~40GB model (shared with Ollama) +- **CPU**: Limited to 50% by systemd +- **Disk**: Minimal (logs rotate, snapshots limited to last 100) + +### Timing +- **Monitor**: ~2 seconds +- **AI Analysis**: ~30 seconds (70B model) to ~3 seconds (8B model) +- **Execution**: Varies by action (seconds to minutes) +- **Full Cycle**: ~1-2 minutes typically + +### Scalability +- Can handle multiple issues per cycle +- Queue system prevents action spam +- Configurable check intervals +- Model choice affects speed/quality tradeoff + +## Current Status + +✅ **READY TO USE** - All components implemented and integrated + +The system is: +- ✅ Fully functional +- ✅ Safety mechanisms in place +- ✅ Well documented +- ✅ Integrated into NixOS configuration +- ✅ Ready for deployment + +Currently configured in **conservative mode** (`suggest`): +- Monitors continuously +- Analyzes with AI +- Proposes actions +- Waits for your approval + +## Next Steps + +1. **Deploy and test:** + ```bash + sudo nixos-rebuild switch --flake .#macha + ``` + +2. **Monitor for a few days:** + ```bash + macha-logs service + ``` + +3. **Review what it detects:** + ```bash + macha-approve list + cat /var/lib/macha-autonomous/decisions.jsonl | jq . + ``` + +4. **Gradually increase autonomy as you gain confidence** + +## Future Enhancement Ideas + +### Short Term +- Web dashboard for easier monitoring +- Email/notification system for critical issues +- More sophisticated action types +- Historical trend analysis + +### Medium Term +- Integration with MCP servers (already installed!) +- Predictive maintenance using historical data +- Self-tuning of check intervals based on activity +- Multi-system orchestration (manage other NixOS hosts) + +### Long Term +- Learning from past decisions to improve +- A/B testing of configuration changes +- Distributed consensus for multi-host decisions +- Integration with external monitoring systems + +## Philosophy + +This implementation follows key principles: + +1. **Safety First**: Multiple layers of protection +2. **Transparency**: Everything is logged and auditable +3. **Conservative Default**: Start restricted, earn trust +4. **Human in Loop**: Always allow override +5. **Gradual Autonomy**: Progressive trust model +6. **Local First**: No external dependencies +7. **Declarative**: NixOS-native configuration + +## Conclusion + +Macha now has a sophisticated autonomous maintenance system that can: +- Monitor itself 24/7 +- Detect and analyze issues using AI +- Fix problems automatically (with appropriate safeguards) +- Learn and improve over time +- Maintain complete audit trails + +All powered by local AI models, no external dependencies, fully integrated with NixOS, and designed with safety as the top priority. 
+ +**Welcome to the future of self-maintaining systems!** 🎉 diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..fd41e9d --- /dev/null +++ b/__init__.py @@ -0,0 +1 @@ +# Macha Autonomous System Maintenance diff --git a/agent.py b/agent.py new file mode 100644 index 0000000..5398e19 --- /dev/null +++ b/agent.py @@ -0,0 +1,1015 @@ +#!/usr/bin/env python3 +""" +AI Agent - Analyzes system state and proposes solutions using local LLMs +""" + +import json +import requests +import subprocess +from typing import Dict, List, Any, Optional +from pathlib import Path +from datetime import datetime + +from tools import SysadminTools + + +class MachaAgent: + """AI agent that analyzes system issues and proposes fixes""" + + # Load system prompt from file + @staticmethod + def _load_system_prompt() -> str: + """Load the system prompt from file""" + prompt_file = Path(__file__).parent / "system_prompt.txt" + try: + return prompt_file.read_text() + except Exception as e: + print(f"Warning: Could not load system prompt from {prompt_file}: {e}") + return "You are Macha, an autonomous AI system maintenance agent." + + SYSTEM_PROMPT = _load_system_prompt.__func__() + + def __init__( + self, + ollama_host: str = "http://localhost:11434", + model: str = "gpt-oss:latest", + state_dir: Path = Path("/var/lib/macha"), + context_db = None, + config_repo: str = "git+https://git.coven.systems/lily/nixos-servers", + config_branch: str = "main", + enable_tools: bool = True, + use_queue: bool = True, + priority: str = "INTERACTIVE" + ): + self.ollama_host = ollama_host + self.model = model + self.state_dir = state_dir + self.state_dir.mkdir(parents=True, exist_ok=True) + self.decision_log = self.state_dir / "decisions.jsonl" + self.context_db = context_db + self.config_repo = config_repo + self.config_branch = config_branch + self.enable_tools = enable_tools + + # Queue settings + self.use_queue = use_queue + self.priority = priority + self.ollama_queue = None + + if use_queue: + try: + from ollama_queue import OllamaQueue, Priority + self.ollama_queue = OllamaQueue() + self.priority_level = getattr(Priority, priority, Priority.INTERACTIVE) + except (PermissionError, OSError): + # Silently fall back to direct API calls when queue is not accessible + # (e.g., regular users don't have access to /var/lib/macha/queues) + self.use_queue = False + except Exception as e: + # Log unexpected errors but still fall back gracefully + import sys + print(f"Note: Ollama queue unavailable ({type(e).__name__}), using direct API", file=sys.stderr) + self.use_queue = False + + # Initialize tools system + self.tools = SysadminTools(safe_mode=False) if enable_tools else None + + # Tool output cache for hierarchical processing + self.tool_output_cache = {} + self.cache_dir = self.state_dir / "tool_cache" + + # Only create cache dir if we have write access (running as macha user) + try: + self.cache_dir.mkdir(parents=True, exist_ok=True) + except (PermissionError, OSError): + # Running as unprivileged user (macha-chat), use temp dir instead + import tempfile + self.cache_dir = Path(tempfile.mkdtemp(prefix="macha_cache_")) + + def _query_relevant_knowledge(self, query: str, limit: int = 3) -> str: + """ + Query knowledge base for relevant information + + Returns formatted string of relevant knowledge to include in prompts + """ + if not self.context_db: + return "" + + try: + knowledge_items = self.context_db.query_knowledge(query, limit=limit) + if not knowledge_items: + return "" + + knowledge_text = "\n\nRELEVANT 
KNOWLEDGE FROM EXPERIENCE:\n" + for item in knowledge_items: + knowledge_text += f"\n• {item['topic']} ({item['category']}):\n" + knowledge_text += f" {item['knowledge']}\n" + knowledge_text += f" [Confidence: {item['confidence']}, Referenced: {item['times_referenced']} times]\n" + + return knowledge_text + except Exception as e: + print(f"Error querying knowledge: {e}") + return "" + + def store_learning( + self, + topic: str, + knowledge: str, + category: str = "experience", + confidence: str = "medium", + tags: list = None + ) -> bool: + """ + Store a learned insight into the knowledge base + + Args: + topic: What this is about + knowledge: The insight/pattern/learning + category: Type of knowledge + confidence: How confident we are + tags: Optional tags + + Returns: + True if stored successfully + """ + if not self.context_db: + return False + + try: + kid = self.context_db.store_knowledge( + topic=topic, + knowledge=knowledge, + category=category, + source="experience", + confidence=confidence, + tags=tags + ) + if kid: + print(f"📚 Learned: {topic}") + return True + return False + except Exception as e: + print(f"Error storing learning: {e}") + return False + + def reflect_and_learn( + self, + situation: str, + action_taken: str, + outcome: str, + success: bool + ) -> None: + """ + Reflect on an operation and extract learnings to store + + Args: + situation: What was the problem/situation + action_taken: What action was taken + outcome: What was the result + success: Whether it succeeded + """ + if not self.context_db: + return + + # Only learn from successful operations for now + if not success: + return + + # Build reflection prompt + prompt = f"""Based on this successful operation, extract key learnings to remember for the future. + +SITUATION: +{situation} + +ACTION TAKEN: +{action_taken} + +OUTCOME: +{outcome} + +Extract 1-2 specific, actionable learnings. For each learning provide: +1. topic: A concise topic name (e.g., "systemd service restart", "disk cleanup procedure") +2. knowledge: The specific insight or pattern (what worked, why, important details) +3. 
category: One of: command, pattern, troubleshooting, performance + +Respond ONLY with valid JSON: +[ + {{ + "topic": "...", + "knowledge": "...", + "category": "...", + "confidence": "medium" + }} +] +""" + + try: + response = self._query_ollama(prompt, temperature=0.3, timeout=30) + learnings = json.loads(response) + + if isinstance(learnings, list): + for learning in learnings: + if all(k in learning for k in ['topic', 'knowledge', 'category']): + self.store_learning( + topic=learning['topic'], + knowledge=learning['knowledge'], + category=learning.get('category', 'experience'), + confidence=learning.get('confidence', 'medium') + ) + except Exception as e: + # Reflection is optional - don't fail if it doesn't work + print(f"Note: Could not extract learnings: {e}") + + def analyze_system_state(self, monitoring_data: Dict[str, Any], system_hostname: str = None, git_context=None) -> Dict[str, Any]: + """Analyze system monitoring data and determine if action is needed""" + + # Build context for the AI + context = self._build_analysis_context(monitoring_data) + + # Get system infrastructure context if available + system_context = "" + if self.context_db and system_hostname: + system_context = self.context_db.get_system_context(system_hostname, git_context) + + # Ask the AI to analyze + prompt = self._create_analysis_prompt(context, system_context) + + response = self._query_ollama(prompt) + + # Parse the AI's response + analysis = self._parse_analysis_response(response) + + # Log the decision + self._log_decision(monitoring_data, analysis) + + return analysis + + def propose_fix(self, issue_description: str, context: Dict[str, Any]) -> Dict[str, Any]: + """Propose a fix for a specific issue""" + + # Query relevant config files if we have context_db + relevant_configs = [] + if self.context_db: + try: + # Query for config files relevant to the issue + configs = self.context_db.query_config_files( + query=issue_description, + n_results=3 + ) + relevant_configs = configs + except Exception as e: + print(f"Warning: Could not query config files: {e}") + + # Build config context section + config_context = "" + if relevant_configs: + config_context = "\n\nRELEVANT CONFIGURATION FILES:\n" + for cfg in relevant_configs: + config_context += f"\n--- {cfg['path']} (relevance: {cfg['relevance']:.2%}) ---\n" + config_context += cfg['content'][:1000] # First 1000 chars to avoid token limits + if len(cfg['content']) > 1000: + config_context += "\n... (truncated)" + config_context += "\n" + + # Query relevant knowledge from experience + knowledge_context = self._query_relevant_knowledge(issue_description, limit=3) + + # Build previous investigations context + previous_inv_context = "" + if context.get('previous_investigations'): + previous_inv_context = "\n\nPREVIOUS INVESTIGATIONS (DO NOT REPEAT THESE):\n" + for i, inv in enumerate(context['previous_investigations'][:3], 1): # Show up to 3 + previous_inv_context += f"\nInvestigation #{i} ({inv['timestamp']}):\n" + previous_inv_context += f"Commands: {', '.join(inv['commands'])}\n" + previous_inv_context += f"Results:\n{inv['output'][:500]}...\n" # First 500 chars + previous_inv_context += "\n⚠️ You have already run these investigations. Do NOT propose them again." 
+ previous_inv_context += "\n⚠️ Based on the investigation results above, propose an ACTUAL FIX, not more investigation.\n" + + prompt = f"""{self.SYSTEM_PROMPT} + +TASK: PROPOSE FIX +================================================================================ + +ISSUE TO ADDRESS: +{issue_description} + +SYSTEM CONTEXT: +{json.dumps(context, indent=2)}{config_context}{knowledge_context}{previous_inv_context} + +REPOSITORY INFO: +- Git Repository: {self.config_repo} +- Branch: {self.config_branch} + +YOUR RESPONSE MUST BE VALID JSON: +{{ + "diagnosis": "brief description of what you think is wrong", + "proposed_action": "specific action to take", + "action_type": "one of: systemd_restart, nix_rebuild, config_change, cleanup, investigation", + "risk_level": "one of: low, medium, high", + "commands": ["list", "of", "shell", "commands"], + "config_changes": {{ + "file": "path/to/config.nix in the repository", + "change": "description of change needed" + }}, + "reasoning": "why this fix should work", + "rollback_plan": "how to undo if it doesn't work" +}} + +RESPOND WITH ONLY THE JSON, NO OTHER TEXT. +""" + + response = self._query_ollama(prompt) + + try: + # Try to extract JSON from response + # LLMs sometimes add extra text, so we need to find the JSON part + import re + json_match = re.search(r'\{.*\}', response, re.DOTALL) + if json_match: + return json.loads(json_match.group()) + else: + return { + "diagnosis": "Failed to parse AI response", + "proposed_action": "manual investigation required", + "action_type": "investigation", + "risk_level": "high", + "reasoning": "AI response was not in expected format" + } + except json.JSONDecodeError: + return { + "diagnosis": "Failed to parse AI response", + "proposed_action": "manual investigation required", + "action_type": "investigation", + "risk_level": "high", + "reasoning": f"Raw response: {response[:500]}" + } + + def _build_analysis_context(self, data: Dict[str, Any]) -> str: + """Build a concise context string for the AI""" + lines = [] + + # System resources + res = data.get("resources", {}) + lines.append(f"CPU: {res.get('cpu_percent', 0):.1f}%, Memory: {res.get('memory_percent', 0):.1f}%, Load: {res.get('load_average', {}).get('1min', 0):.2f}") + + # Disk usage + disk = data.get("disk", {}) + for part in disk.get("partitions", []): + if part.get("percent_used", 0) > 80: # Only mention if >80% full + lines.append(f"⚠️ Disk {part['mountpoint']}: {part['percent_used']:.1f}% full") + + # Failed services + systemd = data.get("systemd", {}) + if systemd.get("failed_count", 0) > 0: + lines.append(f"\n⚠️ {systemd['failed_count']} failed systemd services:") + for svc in systemd.get("failed_services", [])[:10]: + lines.append(f" - {svc.get('unit', 'unknown')}: {svc.get('sub', 'unknown')}") + + # Recent errors + logs = data.get("logs", {}) + error_count = logs.get("error_count_1h", 0) + if error_count > 0: + lines.append(f"\n{error_count} errors in last hour") + # Group errors by service + errors_by_service = {} + for err in logs.get("recent_errors", [])[:20]: + svc = err.get("SYSLOG_IDENTIFIER", "unknown") + errors_by_service[svc] = errors_by_service.get(svc, 0) + 1 + for svc, count in sorted(errors_by_service.items(), key=lambda x: x[1], reverse=True)[:5]: + lines.append(f" - {svc}: {count} errors") + + # Network + net = data.get("network", {}) + if not net.get("internet_reachable", True): + lines.append("\n⚠️ No internet connectivity") + + return "\n".join(lines) + + def _create_analysis_prompt(self, context: str, system_context: str = "") 
-> str: + """Create the analysis prompt for the AI""" + + prompt = f"""{self.SYSTEM_PROMPT} + +TASK: ANALYZE SYSTEM HEALTH +================================================================================ + +OBJECTIVE: +Analyze the current system state and determine if any action is needed. +Be thorough but not alarmist. Only recommend action if truly necessary. +""" + + if system_context: + prompt += f"\n\nSYSTEM INFRASTRUCTURE:\n{system_context}" + + prompt += f""" + +CURRENT SYSTEM STATE: +{context} + +YOUR RESPONSE MUST BE VALID JSON: +{{ + "status": "one of: healthy, attention_needed, intervention_required", + "issues": [ + {{ + "severity": "one of: info, warning, critical", + "category": "one of: resources, services, disk, network, logs", + "description": "brief description of the issue", + "requires_action": true/false + }} + ], + "overall_assessment": "brief summary of system health", + "recommended_actions": ["list of recommended actions, if any"] +}} + +RESPOND WITH ONLY THE JSON, NO OTHER TEXT. +""" + + return prompt + + def _auto_diagnose_ollama(self) -> str: + """Automatically diagnose Ollama issues""" + diagnostics = [] + + diagnostics.append("=== OLLAMA SELF-DIAGNOSTIC ===") + + # Check if Ollama service is running + try: + result = subprocess.run( + ['systemctl', 'is-active', 'ollama.service'], + capture_output=True, + text=True, + timeout=5 + ) + if result.returncode == 0: + diagnostics.append("✅ Ollama service is active") + else: + diagnostics.append(f"❌ Ollama service is NOT active: {result.stdout.strip()}") + except Exception as e: + diagnostics.append(f"⚠️ Could not check service status: {e}") + + # Check memory usage + try: + result = subprocess.run(['free', '-h'], capture_output=True, text=True, timeout=5) + diagnostics.append(f"\nMemory:\n{result.stdout}") + except Exception as e: + diagnostics.append(f"⚠️ Could not check memory: {e}") + + # Check which models are loaded + try: + response = requests.get(f"{self.ollama_host}/api/tags", timeout=5) + if response.status_code == 200: + models = response.json().get('models', []) + diagnostics.append(f"\nLoaded models: {len(models)}") + for model in models: + name = model.get('name', 'unknown') + size = model.get('size', 0) / (1024**3) + is_target = "← TARGET" if name == self.model else "" + diagnostics.append(f" • {name} ({size:.1f} GB) {is_target}") + + # Check if target model is loaded + model_names = [m.get('name') for m in models] + if self.model not in model_names: + diagnostics.append(f"\n❌ TARGET MODEL NOT LOADED: {self.model}") + diagnostics.append(f" Available: {', '.join(model_names)}") + else: + diagnostics.append(f"❌ Ollama API returned {response.status_code}") + except Exception as e: + diagnostics.append(f"⚠️ Could not query Ollama API: {e}") + + # Check recent Ollama logs + try: + result = subprocess.run( + ['journalctl', '-u', 'ollama.service', '-n', '20', '--no-pager'], + capture_output=True, + text=True, + timeout=5 + ) + if result.stdout: + diagnostics.append(f"\nRecent logs:\n{result.stdout}") + except Exception as e: + diagnostics.append(f"⚠️ Could not check logs: {e}") + + return "\n".join(diagnostics) + + def _query_ollama(self, prompt: str, temperature: float = 0.3) -> str: + """Query Ollama API (with optional queue)""" + # If queue is enabled, submit to queue and wait + if self.use_queue and self.ollama_queue: + try: + payload = { + "model": self.model, + "prompt": prompt, + "stream": False, + "temperature": temperature, + "timeout": 120 + } + + request_id = self.ollama_queue.submit( + 
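# Request/response ride the file-based queue from DESIGN.md: the payload
+                    # is written to pending/, the queue worker drains it serially by
+                    # priority, and wait_for_result() below reads the reply from
+                    # completed/ keyed by request_id (assumed layout, per the design
+                    # notes; not an additional API).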
request_type="generate", + payload=payload, + priority=self.priority_level + ) + + result = self.ollama_queue.wait_for_result(request_id, timeout=300) + return result.get("response", "") + + except Exception as e: + print(f"Warning: Queue request failed, falling back to direct: {e}") + # Fall through to direct query + + # Direct query (no queue or queue failed) + try: + response = requests.post( + f"{self.ollama_host}/api/generate", + json={ + "model": self.model, + "prompt": prompt, + "stream": False, + "temperature": temperature, + }, + timeout=120 # 2 minute timeout for large models + ) + response.raise_for_status() + return response.json().get("response", "") + except requests.exceptions.HTTPError as e: + error_detail = "" + try: + error_detail = f" - {response.text}" + except: + pass + print(f"ERROR: Ollama HTTP error {response.status_code}{error_detail}") + print(f"Model requested: {self.model}") + print(f"Ollama host: {self.ollama_host}") + # Run diagnostics + diagnostics = self._auto_diagnose_ollama() + print(diagnostics) + return json.dumps({ + "error": f"Ollama HTTP {response.status_code}: {str(e)}{error_detail}", + "diagnosis": f"Ollama API error - check if model '{self.model}' is available", + "action_type": "investigation", + "risk_level": "high" + }) + except Exception as e: + print(f"ERROR: Failed to query Ollama: {str(e)}") + print(f"Model requested: {self.model}") + print(f"Ollama host: {self.ollama_host}") + # Run diagnostics + diagnostics = self._auto_diagnose_ollama() + print(diagnostics) + return json.dumps({ + "error": f"Failed to query Ollama: {str(e)}", + "diagnosis": "Ollama API unavailable", + "action_type": "investigation", + "risk_level": "high" + }) + + def _estimate_tokens(self, text: str) -> int: + """Rough token estimation: ~4 chars per token""" + return len(text) // 4 + + def _extract_key_findings(self, tool_name: str, raw_output: str, progress_callback=None) -> str: + """ + Extract key findings from large tool output using chunked map-reduce. + Processes large outputs in smaller chunks to prevent Ollama overload. + """ + output_size = len(raw_output) + chunk_size = 8000 # ~2000 tokens per chunk, safe size + + # Store full output in cache for potential deep dive + cache_id = f"{tool_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}" + try: + cache_file = self.cache_dir / f"{cache_id}.txt" + cache_file.write_text(raw_output) + except (PermissionError, OSError): + # Fallback to temp directory if cache dir not writable + import tempfile + cache_file = Path(tempfile.gettempdir()) / f"macha_{cache_id}.txt" + cache_file.write_text(raw_output) + + # If output is small enough, process in one go + if output_size <= chunk_size: + try: + extraction_prompt = f"""Analyze this output from '{tool_name}'. + +Extract: key findings, errors/warnings, metrics, actionable insights. 
+ +Output: +{raw_output} + +Provide concise summary (max 600 chars).""" + + summary = self._query_ollama(extraction_prompt, temperature=0.1) + return f"[Summary of {tool_name}]:\n{summary}\n\n[Full output: {output_size:,} chars cached as {cache_id}]" + except Exception as e: + print(f"Warning: Failed to extract findings: {e}") + return self._simple_truncate(raw_output, 2000) + + # Large output: chunk and process with map-reduce + try: + chunks = [] + num_chunks = (output_size + chunk_size - 1) // chunk_size + + for i in range(0, output_size, chunk_size): + chunk = raw_output[i:i+chunk_size] + chunks.append(chunk) + + # Phase 1: Map - Summarize each chunk + chunk_summaries = [] + for idx, chunk in enumerate(chunks): + chunk_num = idx + 1 + + # Progress feedback + if progress_callback: + progress_callback(f" Processing chunk {chunk_num}/{num_chunks}...") + else: + print(f" → Processing chunk {chunk_num}/{num_chunks}...", flush=True) + + extraction_prompt = f"""Analyze chunk {chunk_num}/{num_chunks} from '{tool_name}'. + +Extract: key findings, errors/warnings, metrics, insights. + +Chunk: +{chunk} + +Concise summary (max 400 chars).""" + + chunk_summary = self._query_ollama(extraction_prompt, temperature=0.1) + chunk_summaries.append(f"[Chunk {chunk_num}]: {chunk_summary}") + + # Phase 2: Reduce - Combine chunk summaries if many chunks + if len(chunk_summaries) > 5: + if progress_callback: + progress_callback(f" Synthesizing {len(chunk_summaries)} chunk summaries...") + else: + print(f" → Synthesizing {len(chunk_summaries)} chunk summaries...", flush=True) + + combined = "\n".join(chunk_summaries) + reduce_prompt = f"""Synthesize these chunk summaries from '{tool_name}': + +{combined} + +Provide unified summary (max 800 chars) covering all key points.""" + + final_summary = self._query_ollama(reduce_prompt, temperature=0.1) + return f"""[Chunked analysis of {tool_name}]: +{final_summary} + +[Processed {num_chunks} chunks, {output_size:,} chars total, cached as {cache_id}]""" + + else: + # Few chunks: just concatenate summaries + combined_summary = "\n".join(chunk_summaries) + return f"""[Chunked analysis of {tool_name}]: +{combined_summary} + +[Processed {num_chunks} chunks, {output_size:,} chars total, cached as {cache_id}]""" + + except Exception as e: + print(f"Warning: Chunked extraction failed for {tool_name}: {e}") + return self._simple_truncate(raw_output, 2000) + + def _simple_truncate(self, text: str, max_chars: int) -> str: + """Simple head+tail truncation""" + if len(text) <= max_chars: + return text + + half = max_chars // 2 + return ( + text[:half] + + f"\n... 
[TRUNCATED: {len(text) - max_chars} chars omitted] ...\n" + + text[-half:] + ) + + def _process_tool_result_hierarchical(self, tool_name: str, result: Any) -> str: + """ + Intelligently process tool results based on size: + - Small (< 2KB): Pass through directly + - Medium (2-10KB): Truncate with head+tail + - Large (> 10KB): Hierarchical extraction in separate context + """ + result_str = json.dumps(result) if not isinstance(result, str) else result + size = len(result_str) + + # Small outputs: pass through directly + if size < 2000: + print(f" [Tool result: {size} chars, passing through]") + return result_str + + # Medium outputs: truncate with head+tail + elif size < 10000: + print(f" [Tool result: {size} chars, truncating to 2000]") + return self._simple_truncate(result_str, 2000) + + # Large outputs: hierarchical extraction + else: + print(f" [Tool result: {size} chars, extracting key findings...]") + return self._extract_key_findings(tool_name, result_str) + + def _prune_messages(self, messages: List[Dict], max_context_tokens: int = 80000) -> List[Dict]: + """ + Prune message history to stay within context limits. + Keeps: system prompt + recent conversation window + """ + if not messages: + return messages + + # Separate system message from conversation + system_msg = None + conversation = [] + + for msg in messages: + if msg["role"] == "system": + system_msg = msg + else: + conversation.append(msg) + + # Calculate current token count + total_tokens = 0 + if system_msg: + total_tokens += self._estimate_tokens(system_msg["content"]) + + for msg in conversation: + content = msg.get("content", "") + total_tokens += self._estimate_tokens(str(content)) + + # If under limit, return as-is + if total_tokens <= max_context_tokens: + result = [] + if system_msg: + result.append(system_msg) + result.extend(conversation) + print(f"[Context: {total_tokens:,} tokens, {len(conversation)} messages]") + return result + + # Need to prune - keep sliding window of recent messages + # Strategy: Keep last 20 messages (10 exchanges) which should be ~40K tokens max + print(f"[Context pruning: {total_tokens:,} tokens → keeping last 20 messages]") + + pruned_conversation = conversation[-20:] + + result = [] + if system_msg: + result.append(system_msg) + result.extend(pruned_conversation) + + # Calculate new token count + new_tokens = self._estimate_tokens(system_msg["content"]) if system_msg else 0 + for msg in pruned_conversation: + new_tokens += self._estimate_tokens(str(msg.get("content", ""))) + + print(f"[Context after pruning: {new_tokens:,} tokens, {len(pruned_conversation)} messages]") + + return result + + def _query_ollama_with_tools( + self, + messages: List[Dict[str, str]], + temperature: float = 0.3, + max_iterations: int = 30 + ) -> str: + """ + Query Ollama using chat API with tool support. + Handles tool calls and returns final response. 
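+        Each iteration sends the pruned history to the chat endpoint; if the
+        reply contains tool_calls, each tool is executed, its result is
+        size-processed, and the result is appended as a "tool" message before
+        looping, until the model answers in plain text or max_iterations is
+        exhausted.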
+ + Args: + messages: List of chat messages [{"role": "user", "content": "..."}] + temperature: Generation temperature + max_iterations: Maximum number of tool-calling iterations (default 30 for complex system operations) + + Returns: + Final text response from the model + """ + if not self.enable_tools or not self.tools: + # Fallback to regular query + user_content = " ".join([m["content"] for m in messages if m["role"] == "user"]) + return self._query_ollama(user_content, temperature) + + # Add system message if not present + if not any(m["role"] == "system" for m in messages): + messages = [{"role": "system", "content": self.SYSTEM_PROMPT}] + messages + + tool_definitions = self.tools.get_tool_definitions() + + for iteration in range(max_iterations): + try: + # Prune messages before sending to avoid context overflow + pruned_messages = self._prune_messages(messages, max_context_tokens=80000) + + # Use queue if enabled + if self.use_queue and self.ollama_queue: + try: + payload = { + "model": self.model, + "messages": pruned_messages, + "stream": False, + "temperature": temperature, + "tools": tool_definitions, + "timeout": 120 + } + + request_id = self.ollama_queue.submit( + request_type="chat_with_tools", + payload=payload, + priority=self.priority_level + ) + + resp_data = self.ollama_queue.wait_for_result(request_id, timeout=300) + + except Exception as e: + print(f"Warning: Queue request failed, falling back to direct: {e}") + # Fall through to direct query + response = requests.post( + f"{self.ollama_host}/api/chat", + json={ + "model": self.model, + "messages": pruned_messages, + "stream": False, + "temperature": temperature, + "tools": tool_definitions + }, + timeout=120 + ) + response.raise_for_status() + resp_data = response.json() + else: + # Direct query (no queue) + response = requests.post( + f"{self.ollama_host}/api/chat", + json={ + "model": self.model, + "messages": pruned_messages, + "stream": False, + "temperature": temperature, + "tools": tool_definitions + }, + timeout=120 + ) + response.raise_for_status() + resp_data = response.json() + + message = resp_data.get("message", {}) + + # Check if model wants to call tools + tool_calls = message.get("tool_calls", []) + + if not tool_calls: + # No tools to call, return the text response + return message.get("content", "") + + # Add assistant's message to history + messages.append(message) + + # Execute each tool call + for tool_call in tool_calls: + function_name = tool_call["function"]["name"] + arguments = tool_call["function"]["arguments"] + + print(f" → Tool call: {function_name}({arguments})") + + # Execute the tool + tool_result = self.tools.execute_tool(function_name, arguments) + + # Process result hierarchically based on size + processed_result = self._process_tool_result_hierarchical(function_name, tool_result) + + # Add processed result to messages + messages.append({ + "role": "tool", + "content": processed_result + }) + + # Continue loop to let model process tool results + + except requests.exceptions.HTTPError as e: + error_body = "" + try: + error_body = response.text + except: + pass + + # Check if this is a context length error + if "context length" in error_body.lower() or "too long" in error_body.lower(): + print(f"ERROR: Context length exceeded. 
Attempting recovery...") + # Emergency pruning - keep only system + last user message + system_msg = next((m for m in messages if m["role"] == "system"), None) + last_user_msg = next((m for m in reversed(messages) if m["role"] == "user"), None) + + if system_msg and last_user_msg: + messages = [system_msg, last_user_msg] + print(f"[Emergency context reset: keeping only system + last user message]") + continue # Retry with minimal context + + print(f"ERROR: Ollama chat API error: {e}") + diagnostics = self._auto_diagnose_ollama() + print(diagnostics) + return json.dumps({ + "error": f"Ollama chat API error: {str(e)}", + "diagnosis": "Failed to communicate with Ollama", + "action_type": "investigation", + "risk_level": "high" + }) + except Exception as e: + print(f"ERROR: Tool calling failed: {e}") + return json.dumps({ + "error": f"Tool calling error: {str(e)}", + "diagnosis": "Failed during tool execution", + "action_type": "investigation", + "risk_level": "high" + }) + + # If we hit max iterations, return what we have + return "Maximum tool calling iterations reached. Unable to complete request." + + def _parse_analysis_response(self, response: str) -> Dict[str, Any]: + """Parse the AI's analysis response""" + import re + + # Log the raw response for debugging + self._log(f"AI raw response (first 1000 chars): {response[:1000]}") + + try: + json_match = re.search(r'\{.*\}', response, re.DOTALL) + if json_match: + parsed = json.loads(json_match.group()) + self._log(f"Successfully parsed AI response: {parsed.get('status', 'unknown')}") + return parsed + else: + self._log("ERROR: No JSON found in AI response") + except Exception as e: + self._log(f"ERROR parsing AI response: {e}") + + # Fallback + self._log("Falling back to default response") + return { + "status": "healthy", + "issues": [], + "overall_assessment": "Unable to parse AI response", + "recommended_actions": [] + } + + def _log(self, message: str): + """Log a message to the orchestrator log""" + # This will go to the orchestrator log via print + print(f"[AGENT] {message}") + + def _log_decision(self, monitoring_data: Dict[str, Any], analysis: Dict[str, Any]): + """Log AI decisions for auditing""" + log_entry = { + "timestamp": datetime.now().isoformat(), + "monitoring_summary": { + "cpu": monitoring_data.get("resources", {}).get("cpu_percent"), + "memory": monitoring_data.get("resources", {}).get("memory_percent"), + "failed_services": monitoring_data.get("systemd", {}).get("failed_count"), + "error_count": monitoring_data.get("logs", {}).get("error_count_1h"), + }, + "analysis": analysis, + } + + with open(self.decision_log, 'a') as f: + f.write(json.dumps(log_entry) + '\n') + + def get_recent_decisions(self, count: int = 10) -> List[Dict[str, Any]]: + """Get recent decision history""" + if not self.decision_log.exists(): + return [] + + decisions = [] + with open(self.decision_log, 'r') as f: + for line in f: + if line.strip(): + try: + decisions.append(json.loads(line)) + except: + pass + + return decisions[-count:] + + +if __name__ == "__main__": + import sys + + # Test the agent + agent = MachaAgent() + + if len(sys.argv) > 1 and sys.argv[1] == "test": + # Test with sample data + test_data = { + "systemd": {"failed_count": 2, "failed_services": [ + {"unit": "test-service.service", "sub": "failed"} + ]}, + "resources": {"cpu_percent": 25.0, "memory_percent": 45.0, "load_average": {"1min": 1.5}}, + "logs": {"error_count_1h": 10}, + "network": {"internet_reachable": True} + } + + print("Testing agent analysis...") + analysis = 
agent.analyze_system_state(test_data) + print(json.dumps(analysis, indent=2)) + + if analysis.get("issues"): + print("\nTesting fix proposal...") + fix = agent.propose_fix( + analysis["issues"][0]["description"], + test_data + ) + print(json.dumps(fix, indent=2)) diff --git a/chat.py b/chat.py new file mode 100644 index 0000000..2d3a08b --- /dev/null +++ b/chat.py @@ -0,0 +1,522 @@ +#!/usr/bin/env python3 +""" +Interactive chat interface with Macha AI agent. +Allows conversational interaction and directive execution. +""" + +import json +import os +import subprocess +import sys +from datetime import datetime +from pathlib import Path +from typing import List, Dict, Any + +# Add parent directory to path for imports +sys.path.insert(0, str(Path(__file__).parent)) + +from agent import MachaAgent + + +class MachaChatSession: + """Interactive chat session with Macha""" + + def __init__(self): + self.agent = MachaAgent(use_queue=True, priority="INTERACTIVE") + self.conversation_history: List[Dict[str, str]] = [] + self.session_start = datetime.now().isoformat() + + def _create_chat_prompt(self, user_message: str) -> str: + """Create a prompt for the chat session""" + + # Build conversation context + context = "" + if self.conversation_history: + context = "\n\nCONVERSATION HISTORY:\n" + for entry in self.conversation_history[-10:]: # Last 10 messages + role = entry['role'].upper() + msg = entry['message'] + context += f"{role}: {msg}\n" + + prompt = f"""{MachaAgent.SYSTEM_PROMPT} + +TASK: INTERACTIVE CHAT SESSION + +You are in an interactive chat session with the system administrator. +You can have a natural conversation and execute commands when directed. + +CAPABILITIES: +- Answer questions about system status +- Explain configurations and issues +- Execute commands when explicitly asked +- Provide guidance and recommendations + +COMMAND EXECUTION: +When the user asks you to run a command or perform an action that requires execution: +1. Respond with a JSON object containing the command to execute +2. Format: {{"action": "execute", "command": "the command", "explanation": "why you're running it"}} +3. 
After seeing the output, continue the conversation naturally + +RESPONSE FORMAT: +- For normal conversation: Respond naturally in plain text +- For command execution: Respond with JSON containing action/command/explanation +- Keep responses concise but informative + +RULES: +- Only execute commands when explicitly asked or when it's clearly needed +- Explain what you're about to do before executing +- Never execute destructive commands without explicit confirmation +- If unsure, ask for clarification +{context} + +USER: {user_message} + +MACHA:""" + + return prompt + + def _execute_command(self, command: str) -> Dict[str, Any]: + """Execute a shell command and return results""" + try: + result = subprocess.run( + command, + shell=True, + capture_output=True, + text=True, + timeout=30 + ) + + # Check if command failed due to permissions + needs_sudo = False + permission_errors = [ + 'Interactive authentication required', + 'Permission denied', + 'Operation not permitted', + 'Must be root', + 'insufficient privileges', + 'authentication is required' + ] + + if result.returncode != 0: + error_text = (result.stderr + result.stdout).lower() + for perm_error in permission_errors: + if perm_error.lower() in error_text: + needs_sudo = True + break + + # Retry with sudo if permission error detected + if needs_sudo and not command.strip().startswith('sudo'): + print(f"\n⚠️ Permission denied, retrying with sudo...") + sudo_command = f"sudo {command}" + result = subprocess.run( + sudo_command, + shell=True, + capture_output=True, + text=True, + timeout=30 + ) + + return { + 'success': result.returncode == 0, + 'exit_code': result.returncode, + 'stdout': result.stdout, + 'stderr': result.stderr, + 'command': sudo_command, + 'retried_with_sudo': True + } + + return { + 'success': result.returncode == 0, + 'exit_code': result.returncode, + 'stdout': result.stdout, + 'stderr': result.stderr, + 'command': command, + 'retried_with_sudo': False + } + except subprocess.TimeoutExpired: + return { + 'success': False, + 'exit_code': -1, + 'stdout': '', + 'stderr': 'Command timed out after 30 seconds', + 'command': command, + 'retried_with_sudo': False + } + except Exception as e: + return { + 'success': False, + 'exit_code': -1, + 'stdout': '', + 'stderr': str(e), + 'command': command, + 'retried_with_sudo': False + } + + def _parse_response(self, response: str) -> Dict[str, Any]: + """Parse AI response to determine if it's a command or text""" + try: + # Try to parse as JSON + parsed = json.loads(response.strip()) + if isinstance(parsed, dict) and 'action' in parsed: + return parsed + except json.JSONDecodeError: + pass + + # It's plain text conversation + return {'action': 'chat', 'message': response} + + def _auto_diagnose_ollama(self) -> str: + """Automatically diagnose Ollama issues""" + diagnostics = [] + + diagnostics.append("🔍 AUTO-DIAGNOSIS: Investigating Ollama failure...\n") + + # Check if Ollama service is running + try: + result = subprocess.run( + ['systemctl', 'is-active', 'ollama.service'], + capture_output=True, + text=True, + timeout=5 + ) + if result.returncode == 0: + diagnostics.append("✅ Ollama service is active") + else: + diagnostics.append(f"❌ Ollama service is NOT active: {result.stdout.strip()}") + # Get service status + status_result = subprocess.run( + ['systemctl', 'status', 'ollama.service', '--no-pager', '-l'], + capture_output=True, + text=True, + timeout=5 + ) + diagnostics.append(f"\nService status:\n```\n{status_result.stdout[-500:]}\n```") + except Exception as e: + 
diagnostics.append(f"⚠️ Could not check service status: {e}")
+
+        # Check memory usage
+        try:
+            result = subprocess.run(['free', '-h'], capture_output=True, text=True, timeout=5)
+            lines = result.stdout.split('\n')
+            for line in lines[:3]:  # First 3 lines
+                diagnostics.append(f"  {line}")
+        except Exception as e:
+            diagnostics.append(f"⚠️ Could not check memory: {e}")
+
+        # Check which models are loaded
+        try:
+            import requests
+            response = requests.get(f"{self.agent.ollama_host}/api/tags", timeout=5)
+            if response.status_code == 200:
+                models = response.json().get('models', [])
+                diagnostics.append(f"\n📦 Loaded models ({len(models)}):")
+                for model in models:
+                    name = model.get('name', 'unknown')
+                    size = model.get('size', 0) / (1024**3)
+                    is_current = "← TARGET" if name == self.agent.model else ""
+                    diagnostics.append(f"  • {name} ({size:.1f} GB) {is_current}")
+
+                # Check if target model is loaded
+                model_names = [m.get('name') for m in models]
+                if self.agent.model not in model_names:
+                    diagnostics.append(f"\n❌ TARGET MODEL NOT LOADED: {self.agent.model}")
+                    diagnostics.append(f"   Available models: {', '.join(model_names)}")
+            else:
+                diagnostics.append(f"❌ Ollama API returned {response.status_code}")
+        except Exception as e:
+            diagnostics.append(f"⚠️ Could not query Ollama API: {e}")
+
+        # Check recent Ollama logs
+        try:
+            result = subprocess.run(
+                ['journalctl', '-u', 'ollama.service', '-n', '10', '--no-pager'],
+                capture_output=True,
+                text=True,
+                timeout=5
+            )
+            if result.stdout:
+                diagnostics.append(f"\n📋 Recent Ollama logs (last 10 lines):\n```\n{result.stdout}\n```")
+        except Exception as e:
+            diagnostics.append(f"⚠️ Could not check logs: {e}")
+
+        return "\n".join(diagnostics)
+
+    def process_message(self, user_message: str) -> str:
+        """Process a user message and return Macha's response"""
+
+        # Add user message to history
+        self.conversation_history.append({
+            'role': 'user',
+            'message': user_message,
+            'timestamp': datetime.now().isoformat()
+        })
+
+        # Build chat messages for tool-calling API
+        messages = []
+
+        # Query relevant knowledge based on user message
+        knowledge_context = self.agent._query_relevant_knowledge(user_message, limit=3)
+
+        # Add recent conversation history (last 15 messages to stay within context limits)
+        # With tool calling, messages grow quickly, so we limit more aggressively
+        recent_history = self.conversation_history[-15:]  # Last ~7 exchanges
+        for entry in recent_history:
+            content = entry['message']
+            # Truncate very long messages (e.g., command outputs)
+            if len(content) > 3000:
+                content = content[:1500] + "\n... [message truncated] ...\n" + content[-1500:]
+            # Append retrieved knowledge to the current user message (the most recent entry)
+            if entry == recent_history[-1] and knowledge_context:
+                content += knowledge_context
+            messages.append({
+                # Ollama's chat API only accepts system/user/assistant/tool roles,
+                # so map the internal 'macha' role to 'assistant'
+                "role": "assistant" if entry['role'] == 'macha' else entry['role'],
+                "content": content
+            })
+
+        try:
+            # Use tool-aware chat API
+            ai_response = self.agent._query_ollama_with_tools(messages)
+        except Exception as e:
+            error_msg = (
+                f"❌ CRITICAL: Failed to communicate with Ollama inference engine\n\n"
+                f"Error Type: {type(e).__name__}\n"
+                f"Error Message: {str(e)}\n\n"
+            )
+            # Auto-diagnose the issue
+            diagnostics = self._auto_diagnose_ollama()
+            return error_msg + "\n" + diagnostics
+
+        if not ai_response:
+            error_msg = (
+                f"❌ Empty response from Ollama inference engine\n\n"
+                f"The request succeeded but returned no data. 
This usually means:\n" + f" • The model ({self.agent.model}) is still loading\n" + f" • Ollama ran out of memory during generation\n" + f" • The prompt was too large for the context window\n\n" + ) + # Auto-diagnose the issue + diagnostics = self._auto_diagnose_ollama() + return error_msg + "\n" + diagnostics + + # Check if Ollama returned an error + try: + error_check = json.loads(ai_response) + if isinstance(error_check, dict) and 'error' in error_check: + error_msg = ( + f"❌ Ollama API Error\n\n" + f"Error: {error_check.get('error', 'Unknown error')}\n" + f"Diagnosis: {error_check.get('diagnosis', 'No details')}\n\n" + ) + # Auto-diagnose the issue + diagnostics = self._auto_diagnose_ollama() + return error_msg + "\n" + diagnostics + except json.JSONDecodeError: + # Not JSON, it's a normal response + pass + + # Parse response + parsed = self._parse_response(ai_response) + + if parsed.get('action') == 'execute': + # AI wants to execute a command + command = parsed.get('command', '') + explanation = parsed.get('explanation', '') + + # Show what we're about to do + response = f"🔧 {explanation}\n\nExecuting: `{command}`\n\n" + + # Execute the command + result = self._execute_command(command) + + # Show if we retried with sudo + if result.get('retried_with_sudo'): + response += f"⚠️ Permission denied, retried as: `{result['command']}`\n\n" + + if result['success']: + response += "✅ Command succeeded:\n" + if result['stdout']: + response += f"```\n{result['stdout']}\n```" + else: + response += "(no output)" + else: + response += f"❌ Command failed (exit code {result['exit_code']}):\n" + if result['stderr']: + response += f"```\n{result['stderr']}\n```" + elif result['stdout']: + response += f"```\n{result['stdout']}\n```" + + # Add command execution to history + self.conversation_history.append({ + 'role': 'macha', + 'message': response, + 'timestamp': datetime.now().isoformat(), + 'command_result': result + }) + + # Now ask AI to respond to the command output + followup_prompt = f"""The command completed. Here's what happened: + +Command: {command} +Success: {result['success']} +Output: {result['stdout'][:500] if result['stdout'] else '(none)'} +Error: {result['stderr'][:500] if result['stderr'] else '(none)'} + +Please provide a brief analysis or next steps.""" + + followup_response = self.agent._query_ollama(followup_prompt) + + if followup_response: + response += f"\n\n{followup_response}" + + return response + + else: + # Normal conversation response + message = parsed.get('message', ai_response) + + self.conversation_history.append({ + 'role': 'macha', + 'message': message, + 'timestamp': datetime.now().isoformat() + }) + + return message + + def run(self): + """Run the interactive chat session""" + print("=" * 70) + print("🌐 MACHA INTERACTIVE CHAT") + print("=" * 70) + print("Type your message and press Enter. Commands:") + print(" /exit or /quit - End the chat session") + print(" /clear - Clear conversation history") + print(" /history - Show conversation history") + print(" /debug - Show Ollama connection status") + print("=" * 70) + print() + + while True: + try: + # Get user input + user_input = input("\n💬 YOU: ").strip() + + if not user_input: + continue + + # Handle special commands + if user_input.lower() in ['/exit', '/quit']: + print("\n👋 Ending chat session. 
Goodbye!") + break + + elif user_input.lower() == '/clear': + self.conversation_history.clear() + print("🧹 Conversation history cleared.") + continue + + elif user_input.lower() == '/history': + print("\n" + "=" * 70) + print("CONVERSATION HISTORY") + print("=" * 70) + for entry in self.conversation_history: + role = entry['role'].upper() + msg = entry['message'][:100] + "..." if len(entry['message']) > 100 else entry['message'] + print(f"{role}: {msg}") + print("=" * 70) + continue + + elif user_input.lower() == '/debug': + import os + import subprocess + + print("\n" + "=" * 70) + print("MACHA ARCHITECTURE & STATUS") + print("=" * 70) + + print("\n🏗️ SYSTEM ARCHITECTURE:") + print(f" Hostname: macha.coven.systems") + print(f" Service: macha-autonomous.service (systemd)") + print(f" Working Directory: /var/lib/macha") + + print("\n👤 EXECUTION CONTEXT:") + current_user = os.getenv('USER') or os.getenv('USERNAME') or 'unknown' + print(f" Current User: {current_user}") + print(f" UID: {os.getuid()}") + + # Check if user has sudo access + try: + result = subprocess.run(['sudo', '-n', 'true'], + capture_output=True, timeout=1) + if result.returncode == 0: + print(f" Sudo Access: ✓ Yes (passwordless)") + else: + print(f" Sudo Access: ⚠ Requires password") + except: + print(f" Sudo Access: ❌ No") + + print(f" Note: Chat runs as invoking user (you), not as macha-autonomous") + + print("\n🧠 INFERENCE ENGINE:") + print(f" Backend: Ollama") + print(f" Host: {self.agent.ollama_host}") + print(f" Model: {self.agent.model}") + print(f" Service: ollama.service (systemd)") + + print("\n💾 DATABASE:") + print(f" Backend: ChromaDB") + print(f" Host: http://localhost:8000") + print(f" Data: /var/lib/chromadb") + print(f" Service: chromadb.service (systemd)") + + print("\n🔍 OLLAMA STATUS:") + # Try to query Ollama status + try: + import requests + # Check if Ollama is running + response = requests.get(f"{self.agent.ollama_host}/api/tags", timeout=5) + if response.status_code == 200: + models = response.json().get('models', []) + print(f" Status: ✓ Running") + print(f" Loaded models: {len(models)}") + for model in models: + name = model.get('name', 'unknown') + size = model.get('size', 0) / (1024**3) # GB + is_current = "← ACTIVE" if name == self.agent.model else "" + print(f" • {name} ({size:.1f} GB) {is_current}") + else: + print(f" Status: ❌ Error (HTTP {response.status_code})") + except Exception as e: + print(f" Status: ❌ Cannot connect: {e}") + print(f" Hint: Check 'systemctl status ollama.service'") + + print("\n💡 CONVERSATION:") + print(f" History: {len(self.conversation_history)} messages") + print(f" Session started: {self.session_start}") + + print("=" * 70) + continue + + # Process the message + print("\n🤖 MACHA: ", end='', flush=True) + response = self.process_message(user_input) + print(response) + + except KeyboardInterrupt: + print("\n\n👋 Chat interrupted. Use /exit to quit properly.") + continue + except EOFError: + print("\n\n👋 Ending chat session. 
Goodbye!") + break + except Exception as e: + print(f"\n❌ Error: {e}") + continue + + +def main(): + """Main entry point""" + session = MachaChatSession() + session.run() + + +if __name__ == "__main__": + main() + diff --git a/config_parser.py b/config_parser.py new file mode 100644 index 0000000..b16495d --- /dev/null +++ b/config_parser.py @@ -0,0 +1,245 @@ +#!/usr/bin/env python3 +""" +Config Parser - Extract imports and content from NixOS configuration files +""" + +import re +import subprocess +from pathlib import Path +from typing import List, Dict, Set, Optional +from datetime import datetime + + +class ConfigParser: + """Parse NixOS flake and configuration files""" + + def __init__(self, repo_url: str, local_path: Path = Path("/var/lib/macha/config-repo")): + """ + Initialize config parser + + Args: + repo_url: Git repository URL (e.g., git+https://...) + local_path: Where to clone/update the repository + """ + # Strip git+ prefix if present for git commands + self.repo_url = repo_url.replace("git+", "") + self.local_path = local_path + self.local_path.mkdir(parents=True, exist_ok=True) + + def ensure_repo(self) -> bool: + """Clone or update the repository""" + try: + if (self.local_path / ".git").exists(): + # Update existing repo + result = subprocess.run( + ["git", "-C", str(self.local_path), "pull"], + capture_output=True, + text=True, + timeout=30 + ) + return result.returncode == 0 + else: + # Clone new repo + result = subprocess.run( + ["git", "clone", self.repo_url, str(self.local_path)], + capture_output=True, + text=True, + timeout=60 + ) + return result.returncode == 0 + except Exception as e: + print(f"Error updating repository: {e}") + return False + + def get_systems_from_flake(self) -> List[str]: + """Extract system names from flake.nix""" + flake_path = self.local_path / "flake.nix" + if not flake_path.exists(): + return [] + + systems = [] + try: + content = flake_path.read_text() + # Match patterns like: "macha" = nixpkgs.lib.nixosSystem + matches = re.findall(r'"([^"]+)"\s*=\s*nixpkgs\.lib\.nixosSystem', content) + systems = matches + except Exception as e: + print(f"Error parsing flake.nix: {e}") + + return systems + + def extract_imports(self, nix_file: Path) -> List[str]: + """Extract imports from a .nix file""" + if not nix_file.exists(): + return [] + + imports = [] + try: + content = nix_file.read_text() + + # Find the imports = [ ... 
]; block + imports_match = re.search( + r'imports\s*=\s*\[(.*?)\];', + content, + re.DOTALL + ) + + if imports_match: + imports_block = imports_match.group(1) + # Extract all paths (relative paths starting with ./ or ../) + paths = re.findall(r'[./]+[^\s\]]+\.nix', imports_block) + imports = paths + + except Exception as e: + print(f"Error parsing {nix_file}: {e}") + + return imports + + def resolve_import_path(self, base_file: Path, import_path: str) -> Optional[Path]: + """Resolve a relative import path to absolute path within repo""" + try: + # Get directory of the base file + base_dir = base_file.parent + # Resolve the relative path + resolved = (base_dir / import_path).resolve() + # Make sure it's within the repo + if self.local_path in resolved.parents or resolved == self.local_path: + return resolved + except Exception as e: + print(f"Error resolving import {import_path} from {base_file}: {e}") + return None + + def get_system_config(self, system_name: str) -> Dict[str, any]: + """ + Get configuration for a specific system + + Returns: + Dict with: + - main_file: Path to systems/<name>.nix + - imports: List of imported file paths (relative to repo root) + - all_files: Set of all .nix files used (including recursive imports) + """ + main_file = self.local_path / "systems" / f"{system_name}.nix" + + if not main_file.exists(): + return { + "main_file": None, + "imports": [], + "all_files": set() + } + + # Track all files (avoid infinite loops) + all_files = set() + files_to_process = [main_file] + processed = set() + + while files_to_process: + current_file = files_to_process.pop(0) + + if current_file in processed: + continue + processed.add(current_file) + + # Get relative path from repo root + try: + rel_path = current_file.relative_to(self.local_path) + all_files.add(str(rel_path)) + except ValueError: + continue + + # Extract imports from this file + imports = self.extract_imports(current_file) + + # Resolve and queue imported files + for imp in imports: + resolved = self.resolve_import_path(current_file, imp) + if resolved and resolved not in processed: + files_to_process.append(resolved) + + return { + "main_file": str(main_file.relative_to(self.local_path)), + "imports": self.extract_imports(main_file), + "all_files": sorted(all_files) + } + + def read_file_content(self, relative_path: str) -> Optional[str]: + """Read content of a file by its path relative to repo root""" + try: + file_path = self.local_path / relative_path + if file_path.exists(): + return file_path.read_text() + except Exception as e: + print(f"Error reading {relative_path}: {e}") + return None + + def get_all_config_files(self) -> List[Dict[str, str]]: + """ + Get all .nix files in the repository with their content + + Returns: + List of dicts with: + - path: relative path from repo root + - content: file contents + - category: apps/systems/osconfigs/users based on path + """ + files = [] + + # Categories to scan + categories = { + "apps": self.local_path / "apps", + "systems": self.local_path / "systems", + "osconfigs": self.local_path / "osconfigs", + "users": self.local_path / "users" + } + + for category, path in categories.items(): + if not path.exists(): + continue + + for nix_file in path.rglob("*.nix"): + try: + rel_path = nix_file.relative_to(self.local_path) + content = nix_file.read_text() + + files.append({ + "path": str(rel_path), + "content": content, + "category": category + }) + except Exception as e: + print(f"Error reading {nix_file}: {e}") + + return files + + +if __name__ == "__main__": + 
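# Usage sketch (the repo URL below is the test default):
+    #   python3 config_parser.py         → list systems defined in flake.nix
+    #   python3 config_parser.py macha   → show every .nix file systems/macha.nix pulls in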
# Test the parser + import sys + + repo_url = "git+https://git.coven.systems/lily/nixos-servers" + parser = ConfigParser(repo_url) + + print("Ensuring repository is up to date...") + if parser.ensure_repo(): + print("✓ Repository ready") + else: + print("✗ Failed to update repository") + sys.exit(1) + + print("\nSystems defined in flake:") + systems = parser.get_systems_from_flake() + for system in systems: + print(f" - {system}") + + if len(sys.argv) > 1: + system_name = sys.argv[1] + print(f"\nConfiguration for {system_name}:") + config = parser.get_system_config(system_name) + + print(f" Main file: {config['main_file']}") + print(f" Direct imports: {len(config['imports'])}") + print(f" All files used: {len(config['all_files'])}") + + for f in config['all_files']: + print(f" - {f}") + diff --git a/context_db.py b/context_db.py new file mode 100644 index 0000000..1c82d7e --- /dev/null +++ b/context_db.py @@ -0,0 +1,947 @@ +#!/usr/bin/env python3 +""" +Context Database - Store and retrieve system context using ChromaDB for RAG +""" + +import json +import os +from typing import Dict, List, Any, Optional, Set +from datetime import datetime +from pathlib import Path + +# Set environment variable BEFORE importing chromadb to prevent .env file reading +os.environ.setdefault("CHROMA_ENV_FILE", "") + +import chromadb +from chromadb.config import Settings + + +class ContextDatabase: + """Manage system context and relationships in ChromaDB""" + + def __init__( + self, + host: str = "localhost", + port: int = 8000, + persist_directory: str = "/var/lib/chromadb" + ): + """Initialize ChromaDB client""" + + self.client = chromadb.HttpClient( + host=host, + port=port, + settings=Settings( + anonymized_telemetry=False, + allow_reset=False, + chroma_api_impl="chromadb.api.fastapi.FastAPI" + ) + ) + + # Create or get collections + self.systems_collection = self.client.get_or_create_collection( + name="systems", + metadata={"description": "System definitions and metadata"} + ) + + self.relationships_collection = self.client.get_or_create_collection( + name="relationships", + metadata={"description": "System relationships and dependencies"} + ) + + self.issues_collection = self.client.get_or_create_collection( + name="issues", + metadata={"description": "Issue tracking and resolution history"} + ) + + self.decisions_collection = self.client.get_or_create_collection( + name="decisions", + metadata={"description": "AI decisions and outcomes"} + ) + + self.config_files_collection = self.client.get_or_create_collection( + name="config_files", + metadata={"description": "NixOS configuration files for RAG"} + ) + + self.knowledge_collection = self.client.get_or_create_collection( + name="knowledge", + metadata={"description": "Operational knowledge: commands, patterns, best practices"} + ) + + # ============ System Registry ============ + + def register_system( + self, + hostname: str, + system_type: str, + services: List[str], + capabilities: List[str] = None, + metadata: Dict[str, Any] = None, + config_repo: str = None, + config_branch: str = None, + os_type: str = "nixos" + ): + """Register a system in the database + + Args: + hostname: FQDN of the system + system_type: Role (e.g., 'workstation', 'server') + services: List of running services + capabilities: System capabilities + metadata: Additional metadata + config_repo: Git repository URL + config_branch: Git branch name + os_type: Operating system (e.g., 'nixos', 'ubuntu', 'debian', 'arch', 'windows', 'macos') + """ + doc_parts = [ + f"System: {hostname}", 
+ f"Type: {system_type}", + f"OS: {os_type}", + f"Services: {', '.join(services)}", + f"Capabilities: {', '.join(capabilities or [])}" + ] + + if config_repo: + doc_parts.append(f"Configuration Repository: {config_repo}") + if config_branch: + doc_parts.append(f"Configuration Branch: {config_branch}") + + doc = "\n".join(doc_parts) + + metadata_dict = { + "hostname": hostname, + "type": system_type, + "os_type": os_type, + "services": json.dumps(services), + "capabilities": json.dumps(capabilities or []), + "metadata": json.dumps(metadata or {}), + "config_repo": config_repo or "", + "config_branch": config_branch or "", + "updated_at": datetime.now().isoformat() + } + + self.systems_collection.upsert( + ids=[hostname], + documents=[doc], + metadatas=[metadata_dict] + ) + + def get_system(self, hostname: str) -> Optional[Dict[str, Any]]: + """Get system information""" + try: + result = self.systems_collection.get( + ids=[hostname], + include=["metadatas", "documents"] + ) + + if result['ids']: + metadata = result['metadatas'][0] + return { + "hostname": metadata["hostname"], + "type": metadata["type"], + "services": json.loads(metadata["services"]), + "capabilities": json.loads(metadata["capabilities"]), + "metadata": json.loads(metadata["metadata"]), + "document": result['documents'][0] + } + except: + pass + + return None + + def get_all_systems(self) -> List[Dict[str, Any]]: + """Get all registered systems""" + result = self.systems_collection.get(include=["metadatas"]) + + systems = [] + for metadata in result['metadatas']: + systems.append({ + "hostname": metadata["hostname"], + "type": metadata["type"], + "os_type": metadata.get("os_type", "unknown"), + "services": json.loads(metadata["services"]), + "capabilities": json.loads(metadata["capabilities"]), + "config_repo": metadata.get("config_repo", ""), + "config_branch": metadata.get("config_branch", "") + }) + + return systems + + def is_system_known(self, hostname: str) -> bool: + """Check if a system is already registered""" + try: + result = self.systems_collection.get(ids=[hostname]) + return len(result['ids']) > 0 + except: + return False + + def get_known_hostnames(self) -> Set[str]: + """Get set of all known system hostnames""" + result = self.systems_collection.get(include=["metadatas"]) + return set(metadata["hostname"] for metadata in result['metadatas']) + + # ============ Relationships ============ + + def add_relationship( + self, + source: str, + target: str, + relationship_type: str, + description: str = "" + ): + """Add a relationship between systems""" + rel_id = f"{source}→{target}:{relationship_type}" + doc = f"{source} {relationship_type} {target}. 
{description}" + + self.relationships_collection.upsert( + ids=[rel_id], + documents=[doc], + metadatas=[{ + "source": source, + "target": target, + "type": relationship_type, + "description": description, + "created_at": datetime.now().isoformat() + }] + ) + + def get_dependencies(self, hostname: str) -> List[Dict[str, Any]]: + """Get what a system depends on""" + result = self.relationships_collection.get( + where={"source": hostname}, + include=["metadatas"] + ) + + return [ + { + "target": m["target"], + "type": m["type"], + "description": m.get("description", "") + } + for m in result['metadatas'] + ] + + def get_dependents(self, hostname: str) -> List[Dict[str, Any]]: + """Get what depends on a system""" + result = self.relationships_collection.get( + where={"target": hostname}, + include=["metadatas"] + ) + + return [ + { + "source": m["source"], + "type": m["type"], + "description": m.get("description", "") + } + for m in result['metadatas'] + ] + + # ============ Issue History ============ + + def store_issue( + self, + system: str, + issue_description: str, + resolution: str = "", + severity: str = "unknown", + metadata: Dict[str, Any] = None + ) -> str: + """Store an issue and its resolution""" + issue_id = f"{system}_{datetime.now().timestamp()}" + + doc = f""" +System: {system} +Issue: {issue_description} +Resolution: {resolution} +Severity: {severity} +""" + + self.issues_collection.add( + ids=[issue_id], + documents=[doc], + metadatas=[{ + "system": system, + "severity": severity, + "resolved": bool(resolution), + "timestamp": datetime.now().isoformat(), + "metadata": json.dumps(metadata or {}) + }] + ) + + return issue_id + + def store_investigation( + self, + system: str, + issue_description: str, + commands: List[str], + output: str, + timestamp: str = None + ) -> str: + """Store investigation results for an issue""" + if timestamp is None: + timestamp = datetime.now().isoformat() + + investigation_id = f"investigation_{system}_{datetime.now().timestamp()}" + + doc = f""" +System: {system} +Issue: {issue_description} +Commands executed: {', '.join(commands)} +Output: +{output[:2000]} # Limit output to prevent token overflow +""" + + self.issues_collection.add( + ids=[investigation_id], + documents=[doc], + metadatas=[{ + "system": system, + "issue": issue_description, + "type": "investigation", + "commands": json.dumps(commands), + "timestamp": timestamp, + "metadata": json.dumps({"output_length": len(output)}) + }] + ) + + return investigation_id + + def get_recent_investigations( + self, + issue_description: str, + system: str, + hours: int = 24 + ) -> List[Dict[str, Any]]: + """Get recent investigations for a similar issue""" + # Query for similar issues + try: + result = self.issues_collection.query( + query_texts=[f"System: {system}\nIssue: {issue_description}"], + n_results=10, + where={"type": "investigation"}, + include=["documents", "metadatas", "distances"] + ) + + investigations = [] + if result['ids'] and result['ids'][0]: + cutoff_time = datetime.now().timestamp() - (hours * 3600) + + for i, doc_id in enumerate(result['ids'][0]): + meta = result['metadatas'][0][i] + timestamp = datetime.fromisoformat(meta['timestamp']) + + # Only include recent investigations + if timestamp.timestamp() > cutoff_time: + investigations.append({ + "id": doc_id, + "system": meta['system'], + "issue": meta['issue'], + "commands": json.loads(meta['commands']), + "output": result['documents'][0][i], + "timestamp": meta['timestamp'], + "relevance": 1 - result['distances'][0][i] + }) 
+ + return investigations + except Exception as e: + print(f"Error querying investigations: {e}") + return [] + + def find_similar_issues( + self, + issue_description: str, + system: Optional[str] = None, + n_results: int = 5 + ) -> List[Dict[str, Any]]: + """Find similar past issues using semantic search""" + where = {"system": system} if system else None + + results = self.issues_collection.query( + query_texts=[issue_description], + n_results=n_results, + where=where, + include=["documents", "metadatas", "distances"] + ) + + similar = [] + for i, doc in enumerate(results['documents'][0]): + similar.append({ + "issue": doc, + "metadata": results['metadatas'][0][i], + "similarity": 1 - results['distances'][0][i] # Convert distance to similarity + }) + + return similar + + # ============ AI Decisions ============ + + def store_decision( + self, + system: str, + analysis: Dict[str, Any], + action: Dict[str, Any], + outcome: Dict[str, Any] = None + ): + """Store an AI decision for learning""" + decision_id = f"decision_{datetime.now().timestamp()}" + + doc = f""" +System: {system} +Status: {analysis.get('status', 'unknown')} +Assessment: {analysis.get('overall_assessment', '')} +Action: {action.get('proposed_action', '')} +Risk: {action.get('risk_level', 'unknown')} +Outcome: {outcome.get('status', 'pending') if outcome else 'pending'} +""" + + self.decisions_collection.add( + ids=[decision_id], + documents=[doc], + metadatas=[{ + "system": system, + "timestamp": datetime.now().isoformat(), + "analysis": json.dumps(analysis), + "action": json.dumps(action), + "outcome": json.dumps(outcome or {}) + }] + ) + + def get_recent_decisions( + self, + system: Optional[str] = None, + n_results: int = 10 + ) -> List[Dict[str, Any]]: + """Get recent decisions, optionally filtered by system""" + where = {"system": system} if system else None + + results = self.decisions_collection.query( + query_texts=["recent decisions"], + n_results=n_results, + where=where, + include=["documents", "metadatas"] + ) + + decisions = [] + for i, doc in enumerate(results['documents'][0]): + meta = results['metadatas'][0][i] + decisions.append({ + "system": meta["system"], + "timestamp": meta["timestamp"], + "analysis": json.loads(meta["analysis"]), + "action": json.loads(meta["action"]), + "outcome": json.loads(meta["outcome"]) + }) + + return decisions + + # ============ Context Generation for AI ============ + + def get_system_context(self, hostname: str, git_context=None) -> str: + """Generate rich context about a system for AI prompts""" + context_parts = [] + + # System info + system = self.get_system(hostname) + if system: + context_parts.append(f"System: {hostname} ({system['type']})") + context_parts.append(f"Services: {', '.join(system['services'])}") + if system['capabilities']: + context_parts.append(f"Capabilities: {', '.join(system['capabilities'])}") + + # Git repository info + if system and system.get('metadata'): + metadata = json.loads(system['metadata']) if isinstance(system['metadata'], str) else system['metadata'] + config_repo = metadata.get('config_repo', '') + if config_repo: + context_parts.append(f"\nConfiguration Repository: {config_repo}") + + # Recent git changes for this system + if git_context: + try: + # Extract system name from FQDN + system_name = hostname.split('.')[0] + git_summary = git_context.get_system_context_summary(system_name) + if git_summary: + context_parts.append(f"\n{git_summary}") + except: + pass + + # Dependencies + deps = self.get_dependencies(hostname) + if deps: + 
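# Rendered as e.g. "Dependencies:\n  - Depends on rhiannon for ollama" (host/service illustrative)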
context_parts.append("\nDependencies:") + for dep in deps: + context_parts.append(f" - Depends on {dep['target']} for {dep['type']}") + + # Dependents + dependents = self.get_dependents(hostname) + if dependents: + context_parts.append("\nUsed by:") + for dependent in dependents: + context_parts.append(f" - {dependent['source']} uses this for {dependent['type']}") + + return "\n".join(context_parts) + + def get_issue_context(self, issue_description: str, system: str) -> str: + """Get context about similar past issues""" + similar = self.find_similar_issues(issue_description, system, n_results=3) + + if not similar: + return "" + + context_parts = ["Similar past issues:"] + for i, issue in enumerate(similar, 1): + if issue['similarity'] > 0.7: # Only include if fairly similar + context_parts.append(f"\n{i}. {issue['issue']}") + context_parts.append(f" Similarity: {issue['similarity']:.2%}") + + return "\n".join(context_parts) if len(context_parts) > 1 else "" + + # ============ Config Files (for RAG) ============ + + def store_config_file( + self, + file_path: str, + content: str, + category: str = "unknown", + systems_using: List[str] = None + ): + """ + Store a configuration file for RAG retrieval + + Args: + file_path: Path relative to repo root (e.g., "apps/gotify.nix") + content: Full file contents + category: apps/systems/osconfigs/users + systems_using: List of system hostnames that import this file + """ + self.config_files_collection.upsert( + ids=[file_path], + documents=[content], + metadatas=[{ + "path": file_path, + "category": category, + "systems": json.dumps(systems_using or []), + "updated_at": datetime.now().isoformat() + }] + ) + + def get_config_file(self, file_path: str) -> Optional[Dict[str, Any]]: + """Get a specific config file by path""" + try: + result = self.config_files_collection.get( + ids=[file_path], + include=["documents", "metadatas"] + ) + + if result['ids']: + return { + "path": file_path, + "content": result['documents'][0], + "metadata": result['metadatas'][0] + } + except: + pass + return None + + def query_config_files( + self, + query: str, + system: str = None, + category: str = None, + n_results: int = 5 + ) -> List[Dict[str, Any]]: + """ + Query config files using semantic search + + Args: + query: Natural language query (e.g., "gotify configuration") + system: Optional filter by system hostname + category: Optional filter by category (apps/systems/etc) + n_results: Number of results to return + + Returns: + List of dicts with path, content, and metadata + """ + where = {} + if category: + where["category"] = category + + try: + result = self.config_files_collection.query( + query_texts=[query], + n_results=n_results, + where=where if where else None, + include=["documents", "metadatas", "distances"] + ) + + configs = [] + if result['ids'] and result['ids'][0]: + for i, doc_id in enumerate(result['ids'][0]): + config = { + "path": doc_id, + "content": result['documents'][0][i], + "metadata": result['metadatas'][0][i], + "relevance": 1 - result['distances'][0][i] # Convert distance to relevance + } + + # Filter by system if specified + if system: + systems = json.loads(config['metadata'].get('systems', '[]')) + if system not in systems: + continue + + configs.append(config) + + return configs + except Exception as e: + print(f"Error querying config files: {e}") + return [] + + def get_system_config_files(self, system: str) -> List[str]: + """Get all config file paths used by a system""" + # This is stored in the system's metadata now + system_info 
= self.get_system(system) + if system_info and 'config_files' in system_info.get('metadata', {}): + # metadata is already a dict, config_files is already a list + return system_info['metadata']['config_files'] + return [] + + def update_system_config_files(self, system: str, config_files: List[str]): + """Update the list of config files used by a system""" + system_info = self.get_system(system) + if system_info: + # metadata is already a dict from get_system(), no need to json.loads() + metadata = system_info.get('metadata', {}) + metadata['config_files'] = config_files + metadata['config_updated_at'] = datetime.now().isoformat() + + # Re-register with updated metadata + self.register_system( + hostname=system, + system_type=system_info['type'], + services=system_info['services'], + capabilities=system_info.get('capabilities', []), + metadata=metadata, + config_repo=system_info.get('config_repo'), + config_branch=system_info.get('config_branch') + ) + + # ========================================================================= + # ISSUE TRACKING + # ========================================================================= + + def store_issue(self, issue: Dict[str, Any]): + """Store a new issue in the database""" + issue_id = issue['issue_id'] + + # Store in ChromaDB with the issue as document + self.issues_collection.add( + documents=[json.dumps(issue)], + metadatas=[{ + 'issue_id': issue_id, + 'hostname': issue['hostname'], + 'title': issue['title'], + 'status': issue['status'], + 'severity': issue['severity'], + 'created_at': issue['created_at'], + 'source': issue['source'] + }], + ids=[issue_id] + ) + + def get_issue(self, issue_id: str) -> Optional[Dict[str, Any]]: + """Retrieve an issue by ID""" + try: + results = self.issues_collection.get(ids=[issue_id]) + if results['documents']: + return json.loads(results['documents'][0]) + return None + except Exception as e: + print(f"Error retrieving issue {issue_id}: {e}") + return None + + def update_issue(self, issue: Dict[str, Any]): + """Update an existing issue""" + issue_id = issue['issue_id'] + + # Delete old version + try: + self.issues_collection.delete(ids=[issue_id]) + except: + pass + + # Store updated version + self.store_issue(issue) + + def delete_issue(self, issue_id: str): + """Remove an issue from the database (used when archiving)""" + try: + self.issues_collection.delete(ids=[issue_id]) + except Exception as e: + print(f"Error deleting issue {issue_id}: {e}") + + def list_issues( + self, + hostname: Optional[str] = None, + status: Optional[str] = None, + severity: Optional[str] = None + ) -> List[Dict[str, Any]]: + """List issues with optional filters""" + try: + # Build query filter + where_filter = {} + if hostname: + where_filter['hostname'] = hostname + if status: + where_filter['status'] = status + if severity: + where_filter['severity'] = severity + + if where_filter: + results = self.issues_collection.get(where=where_filter) + else: + results = self.issues_collection.get() + + issues = [] + for doc in results['documents']: + issues.append(json.loads(doc)) + + # Sort by created_at descending + issues.sort(key=lambda x: x.get('created_at', ''), reverse=True) + + return issues + except Exception as e: + print(f"Error listing issues: {e}") + return [] + + # ============ Knowledge Base ============ + + def store_knowledge( + self, + topic: str, + knowledge: str, + category: str = "general", + source: str = "experience", + confidence: str = "medium", + tags: list = None + ) -> str: + """ + Store a piece of operational 
knowledge + + Args: + topic: Main subject (e.g., "nh os switch", "systemd-journal-remote") + knowledge: The actual knowledge/insight/pattern + category: Type of knowledge (command, pattern, troubleshooting, performance, etc.) + source: Where this came from (experience, documentation, user-provided) + confidence: How confident we are (low, medium, high) + tags: Optional tags for categorization + + Returns: + Knowledge ID + """ + import uuid + from datetime import datetime + + knowledge_id = str(uuid.uuid4()) + + knowledge_doc = { + "id": knowledge_id, + "topic": topic, + "knowledge": knowledge, + "category": category, + "source": source, + "confidence": confidence, + "tags": tags or [], + "created_at": datetime.utcnow().isoformat(), + "last_verified": datetime.utcnow().isoformat(), + "times_referenced": 0 + } + + try: + self.knowledge_collection.add( + ids=[knowledge_id], + documents=[knowledge], + metadatas=[{ + "topic": topic, + "category": category, + "source": source, + "confidence": confidence, + "tags": json.dumps(tags or []), + "created_at": knowledge_doc["created_at"], + "full_doc": json.dumps(knowledge_doc) + }] + ) + return knowledge_id + except Exception as e: + print(f"Error storing knowledge: {e}") + return None + + def query_knowledge( + self, + query: str, + category: str = None, + limit: int = 5 + ) -> list: + """ + Query the knowledge base for relevant information + + Args: + query: What to search for + category: Optional category filter + limit: Maximum results to return + + Returns: + List of relevant knowledge entries + """ + try: + where_filter = {} + if category: + where_filter["category"] = category + + results = self.knowledge_collection.query( + query_texts=[query], + n_results=limit, + where=where_filter if where_filter else None + ) + + knowledge_items = [] + if results and results['documents']: + for i, doc in enumerate(results['documents'][0]): + metadata = results['metadatas'][0][i] + full_doc = json.loads(metadata.get('full_doc', '{}')) + + # Increment reference count + full_doc['times_referenced'] = full_doc.get('times_referenced', 0) + 1 + + knowledge_items.append(full_doc) + + return knowledge_items + except Exception as e: + print(f"Error querying knowledge: {e}") + return [] + + def get_knowledge_by_topic(self, topic: str) -> list: + """Get all knowledge entries for a specific topic""" + try: + results = self.knowledge_collection.get( + where={"topic": topic} + ) + + knowledge_items = [] + for metadata in results['metadatas']: + full_doc = json.loads(metadata.get('full_doc', '{}')) + knowledge_items.append(full_doc) + + return knowledge_items + except Exception as e: + print(f"Error getting knowledge by topic: {e}") + return [] + + def update_knowledge( + self, + knowledge_id: str, + knowledge: str = None, + confidence: str = None, + verify: bool = False + ): + """ + Update an existing knowledge entry + + Args: + knowledge_id: ID of knowledge to update + knowledge: New knowledge text (optional) + confidence: New confidence level (optional) + verify: Mark as verified (updates last_verified timestamp) + """ + from datetime import datetime + + try: + # Get existing entry + result = self.knowledge_collection.get(ids=[knowledge_id]) + if not result['documents']: + return False + + metadata = result['metadatas'][0] + full_doc = json.loads(metadata.get('full_doc', '{}')) + + # Update fields + if knowledge: + full_doc['knowledge'] = knowledge + if confidence: + full_doc['confidence'] = confidence + if verify: + full_doc['last_verified'] = 
datetime.utcnow().isoformat() + + # Update in collection + self.knowledge_collection.update( + ids=[knowledge_id], + documents=[full_doc['knowledge']], + metadatas=[{ + "topic": full_doc['topic'], + "category": full_doc['category'], + "source": full_doc['source'], + "confidence": full_doc['confidence'], + "tags": json.dumps(full_doc['tags']), + "created_at": full_doc['created_at'], + "full_doc": json.dumps(full_doc) + }] + ) + return True + except Exception as e: + print(f"Error updating knowledge: {e}") + return False + + def list_knowledge_topics(self, category: str = None) -> list: + """List all unique topics in the knowledge base""" + try: + where_filter = {"category": category} if category else None + results = self.knowledge_collection.get(where=where_filter) + + topics = set() + for metadata in results['metadatas']: + topics.add(metadata.get('topic')) + + return sorted(list(topics)) + except Exception as e: + print(f"Error listing knowledge topics: {e}") + return [] + + +if __name__ == "__main__": + import sys + + # Test the database + db = ContextDatabase() + + # Register test systems + db.register_system( + "macha", + "workstation", + ["ollama"], + capabilities=["ai-inference"] + ) + + db.register_system( + "rhiannon", + "server", + ["gotify", "nextcloud", "prowlarr"], + capabilities=["notifications", "cloud-storage"] + ) + + # Add relationship + db.add_relationship( + "macha", + "rhiannon", + "uses-service", + "Macha uses Rhiannon's Gotify for notifications" + ) + + # Test queries + print("All systems:", db.get_all_systems()) + print("\nMacha's dependencies:", db.get_dependencies("macha")) + print("\nRhiannon's dependents:", db.get_dependents("rhiannon")) + print("\nSystem context:", db.get_system_context("macha")) + diff --git a/conversation.py b/conversation.py new file mode 100644 index 0000000..a1c2be3 --- /dev/null +++ b/conversation.py @@ -0,0 +1,328 @@ +#!/usr/bin/env python3 +""" +Conversational Interface - Allows questioning Macha about decisions and system state +""" + +import json +import requests +from typing import Dict, List, Any, Optional +from pathlib import Path +from datetime import datetime +from agent import MachaAgent + + +class MachaConversation: + """Conversational interface for Macha""" + + def __init__( + self, + ollama_host: str = "http://localhost:11434", + model: str = "gpt-oss:latest", + state_dir: Path = Path("/var/lib/macha") + ): + self.ollama_host = ollama_host + self.model = model + self.state_dir = state_dir + self.decision_log = self.state_dir / "decisions.jsonl" + self.approval_queue = self.state_dir / "approval_queue.json" + self.orchestrator_log = self.state_dir / "orchestrator.log" + + # Initialize agent with tool support and queue + self.agent = MachaAgent( + ollama_host=ollama_host, + model=model, + state_dir=state_dir, + enable_tools=True, + use_queue=True, + priority="INTERACTIVE" + ) + + def ask(self, question: str, include_context: bool = True) -> str: + """Ask Macha a question with optional system context""" + + context = "" + if include_context: + context = self._gather_context() + + # Build messages for tool-aware chat + content = self._create_conversational_prompt(question, context) + messages = [{"role": "user", "content": content}] + + response = self.agent._query_ollama_with_tools(messages) + + return response + + def discuss_action(self, action_index: int) -> str: + """Discuss a specific queued action by its queue position (0-based index)""" + + action = self._get_action_from_queue(action_index) + if not action: + return f"No 
action found at queue position {action_index}. Use 'macha-approve list' to see available actions." + + context = self._gather_context() + action_context = json.dumps(action, indent=2) + + content = f"""TASK: DISCUSS PROPOSED ACTION +================================================================================ + +A user is asking about a proposed action in your approval queue. + +QUEUED ACTION (Queue Position #{action_index}): +{action_context} + +RECENT SYSTEM CONTEXT: +{context} + +The user wants to discuss this action. Explain: +1. Why you proposed this action +2. What problem it solves +3. The risks involved +4. What could go wrong +5. Alternative approaches if any + +Be conversational, helpful, and honest about uncertainties. +""" + + messages = [{"role": "user", "content": content}] + return self.agent._query_ollama_with_tools(messages) + + def _gather_context(self) -> str: + """Gather relevant system context for the conversation""" + + context_parts = [] + + # System infrastructure from ChromaDB + try: + from context_db import ContextDatabase + db = ContextDatabase() + systems = db.get_all_systems() + + if systems: + context_parts.append("INFRASTRUCTURE:") + for system in systems: + context_parts.append(f" - {system['hostname']} ({system.get('type', 'unknown')})") + if system.get('config_repo'): + context_parts.append(f" Config Repo: {system['config_repo']}") + context_parts.append(f" Branch: {system.get('config_branch', 'unknown')}") + if system.get('capabilities'): + context_parts.append(f" Capabilities: {', '.join(system['capabilities'])}") + except Exception as e: + # ChromaDB not available, skip + pass + + # Recent decisions + recent_decisions = self._get_recent_decisions(5) + if recent_decisions: + context_parts.append("\nRECENT DECISIONS:") + for i, dec in enumerate(recent_decisions, 1): + timestamp = dec.get("timestamp", "unknown") + analysis = dec.get("analysis", {}) + status = analysis.get("status", "unknown") + context_parts.append(f"{i}. [{timestamp}] Status: {status}") + if "issues" in analysis: + for issue in analysis.get("issues", [])[:3]: + context_parts.append(f" - {issue.get('description', 'N/A')}") + + # Pending approvals + pending = self._get_pending_approvals() + if pending: + context_parts.append(f"\nPENDING APPROVALS: {len(pending)} action(s) awaiting approval") + + # Recent log excerpts (last 10 lines) + recent_logs = self._get_recent_logs(10) + if recent_logs: + context_parts.append("\nRECENT LOG ENTRIES:") + context_parts.extend(recent_logs) + + return "\n".join(context_parts) + + def _create_conversational_prompt(self, question: str, context: str) -> str: + """Create a conversational prompt""" + + return f"""{MachaAgent.SYSTEM_PROMPT} + +TASK: ANSWER QUESTION +================================================================================ + +You monitor system health, analyze issues using AI, and propose fixes. Be helpful, +honest about what you know and don't know, and reference the context provided below. + +SYSTEM CONTEXT: +{context if context else "No recent activity"} + +USER QUESTION: +{question} + +Respond conversationally and helpfully. If the question is about your recent decisions +or actions, reference the context above. If you don't have enough information, say so. +Keep responses concise but informative. 
+"""
+
+    def _query_ollama(self, prompt: str, temperature: float = 0.7) -> str:
+        """Query Ollama API"""
+        try:
+            response = requests.post(
+                f"{self.ollama_host}/api/generate",
+                json={
+                    "model": self.model,
+                    "prompt": prompt,
+                    "stream": False,
+                    # Ollama reads sampling parameters from the "options"
+                    # map; a top-level "temperature" key is ignored
+                    "options": {"temperature": temperature},
+                },
+                timeout=60
+            )
+            response.raise_for_status()
+            return response.json().get("response", "")
+        except requests.exceptions.HTTPError:
+            error_detail = ""
+            try:
+                error_detail = f" - {response.text}"
+            except:
+                pass
+            return f"Error: Ollama returned HTTP {response.status_code}{error_detail}"
+        except Exception as e:
+            return f"Error querying Ollama: {str(e)}"
+
+    def _get_recent_decisions(self, count: int = 5) -> List[Dict[str, Any]]:
+        """Get recent decisions from log"""
+        if not self.decision_log.exists():
+            return []
+
+        decisions = []
+        try:
+            with open(self.decision_log, 'r') as f:
+                for line in f:
+                    if line.strip():
+                        try:
+                            decisions.append(json.loads(line))
+                        except:
+                            pass
+        except:
+            pass
+
+        return decisions[-count:]
+
+    def _get_pending_approvals(self) -> List[Dict[str, Any]]:
+        """Get pending approvals from queue"""
+        if not self.approval_queue.exists():
+            return []
+
+        try:
+            with open(self.approval_queue, 'r') as f:
+                data = json.load(f)
+                # Queue is a JSON array, not an object with "pending" key
+                if isinstance(data, list):
+                    return data
+                return data.get("pending", [])
+        except:
+            return []
+
+    def _get_action_from_queue(self, action_index: int) -> Optional[Dict[str, Any]]:
+        """Get a specific action from the queue by index"""
+        pending = self._get_pending_approvals()
+        if 0 <= action_index < len(pending):
+            return pending[action_index]
+        return None
+
+    def _get_recent_logs(self, count: int = 10) -> List[str]:
+        """Get recent orchestrator log lines"""
+        if not self.orchestrator_log.exists():
+            return []
+
+        try:
+            with open(self.orchestrator_log, 'r') as f:
+                lines = f.readlines()
+                return [line.strip() for line in lines[-count:] if line.strip()]
+        except:
+            return []
+
+
+if __name__ == "__main__":
+    import sys
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Ask Macha a question or discuss an action")
+    parser.add_argument("--discuss", type=int, metavar="ACTION_ID", help="Discuss a specific queued action")
+    parser.add_argument("--follow-up", type=str, metavar="QUESTION", help="Follow-up question about the action")
+    parser.add_argument("question", nargs="*", help="Your question for Macha")
+    parser.add_argument("--no-context", action="store_true", help="Don't include system context")
+
+    args = parser.parse_args()
+
+    # Load config if available
+    config_file = Path("/etc/macha-autonomous/config.json")
+    ollama_host = "http://localhost:11434"
+    model = "gpt-oss:latest"
+
+    if config_file.exists():
+        try:
+            with open(config_file, 'r') as f:
+                config = json.load(f)
+                ollama_host = config.get("ollama_host", ollama_host)
+                model = config.get("model", model)
+        except:
+            pass
+
+    conversation = MachaConversation(
+        ollama_host=ollama_host,
+        model=model
+    )
+
+    if args.discuss is not None:
+        if args.follow_up:
+            # Follow-up question about a specific action
+            action = conversation._get_action_from_queue(args.discuss)
+            if not action:
+                print(f"No action found at queue position {args.discuss}. 
Use 'macha-approve list' to see available actions.") + sys.exit(1) + + # Build context with the action details + action_context = f""" +QUEUED ACTION #{args.discuss}: +Diagnosis: {action.get('proposal', {}).get('diagnosis', 'N/A')} +Proposed Action: {action.get('proposal', {}).get('proposed_action', 'N/A')} +Action Type: {action.get('proposal', {}).get('action_type', 'N/A')} +Risk Level: {action.get('proposal', {}).get('risk_level', 'N/A')} +Commands: {json.dumps(action.get('proposal', {}).get('commands', []), indent=2)} +Reasoning: {action.get('proposal', {}).get('reasoning', 'N/A')} + +FOLLOW-UP QUESTION: +{args.follow_up} +""" + + # Query the AI with the action context + response = conversation._query_ollama(f"""{MachaAgent.SYSTEM_PROMPT} + +TASK: ANSWER FOLLOW-UP QUESTION ABOUT QUEUED ACTION +================================================================================ + +You are answering a follow-up question about a proposed fix that is awaiting approval. +Be helpful and answer directly. If the user is concerned about risks, explain them clearly. +If they ask about alternatives, suggest them. + +{action_context} + +RESPOND CONCISELY AND DIRECTLY. +""") + + else: + # Initial discussion about the action + response = conversation.discuss_action(args.discuss) + elif args.question: + # Ask a general question + question = " ".join(args.question) + response = conversation.ask(question, include_context=not args.no_context) + else: + parser.print_help() + sys.exit(1) + + # Only print formatted output for initial discussion, not for follow-ups + if args.follow_up: + print(response) + else: + print("\n" + "="*60) + print("MACHA:") + print("="*60) + print(response) + print("="*60 + "\n") + diff --git a/executor.py b/executor.py new file mode 100644 index 0000000..cb3bbc9 --- /dev/null +++ b/executor.py @@ -0,0 +1,537 @@ +#!/usr/bin/env python3 +""" +Action Executor - Safely executes proposed fixes with rollback capability +""" + +import json +import subprocess +import shutil +from typing import Dict, List, Any, Optional +from pathlib import Path +from datetime import datetime +import time + + +class SafeExecutor: + """Executes system maintenance actions with safety checks""" + + # Actions that are considered safe to auto-execute + SAFE_ACTIONS = { + "systemd_restart", # Restart failed services + "cleanup", # Disk cleanup, log rotation + "investigation", # Read-only diagnostics + } + + # Services that should NEVER be stopped/disabled + PROTECTED_SERVICES = { + "sshd", + "systemd-networkd", + "NetworkManager", + "systemd-resolved", + "dbus", + } + + def __init__( + self, + state_dir: Path = Path("/var/lib/macha"), + autonomy_level: str = "suggest", # observe, suggest, auto-safe, auto-full + dry_run: bool = False, + agent = None # Optional agent for learning from actions + ): + self.state_dir = state_dir + self.state_dir.mkdir(parents=True, exist_ok=True) + self.autonomy_level = autonomy_level + self.dry_run = dry_run + self.agent = agent + self.action_log = self.state_dir / "actions.jsonl" + self.approval_queue = self.state_dir / "approval_queue.json" + + def execute_action(self, action: Dict[str, Any], monitoring_context: Dict[str, Any]) -> Dict[str, Any]: + """Execute a proposed action with appropriate safety checks""" + + action_type = action.get("action_type", "unknown") + risk_level = action.get("risk_level", "high") + + # Determine if we should execute + should_execute, reason = self._should_execute(action_type, risk_level) + + if not should_execute: + if self.autonomy_level == "suggest": 
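+                # In "suggest" mode nothing runs directly: the proposal is
+                # parked on disk in the approval queue for a human to approve
+                # or reject via the approval-queue CLI.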
+ # Queue for approval + self._queue_for_approval(action, monitoring_context) + return { + "executed": False, + "status": "queued_for_approval", + "reason": reason, + "queue_file": str(self.approval_queue) + } + else: + return { + "executed": False, + "status": "blocked", + "reason": reason + } + + # Execute the action + if self.dry_run: + return self._dry_run_action(action) + + return self._execute_action_impl(action, monitoring_context) + + def _should_execute(self, action_type: str, risk_level: str) -> tuple[bool, str]: + """Determine if an action should be auto-executed based on autonomy level""" + + if self.autonomy_level == "observe": + return False, "Autonomy level set to observe-only" + + # Auto-approve low-risk investigation actions + if action_type == "investigation" and risk_level == "low": + return True, "Auto-approved: Low-risk information gathering" + + if self.autonomy_level == "suggest": + return False, "Autonomy level requires manual approval" + + if self.autonomy_level == "auto-safe": + if action_type in self.SAFE_ACTIONS and risk_level == "low": + return True, "Auto-executing safe action" + return False, "Action requires higher autonomy level" + + if self.autonomy_level == "auto-full": + if risk_level == "high": + return False, "High risk actions always require approval" + return True, "Auto-executing approved action" + + return False, "Unknown autonomy level" + + def _execute_action_impl(self, action: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]: + """Actually execute the action""" + + action_type = action.get("action_type") + result = { + "executed": True, + "timestamp": datetime.now().isoformat(), + "action": action, + "success": False, + "output": "", + "error": None + } + + try: + if action_type == "systemd_restart": + result.update(self._restart_services(action)) + + elif action_type == "cleanup": + result.update(self._perform_cleanup(action)) + + elif action_type == "nix_rebuild": + result.update(self._nix_rebuild(action)) + + elif action_type == "config_change": + result.update(self._apply_config_change(action)) + + elif action_type == "investigation": + result.update(self._run_investigation(action)) + + else: + result["error"] = f"Unknown action type: {action_type}" + + except Exception as e: + result["error"] = str(e) + result["success"] = False + + # Log the action + self._log_action(result) + + # Learn from successful operations + if result.get("success") and self.agent: + try: + self.agent.reflect_and_learn( + situation=action.get("diagnosis", "Unknown situation"), + action_taken=action.get("proposed_action", "Unknown action"), + outcome=result.get("output", ""), + success=True + ) + except Exception as e: + # Don't fail the action if learning fails + print(f"Note: Could not record learning: {e}") + + return result + + def _restart_services(self, action: Dict[str, Any]) -> Dict[str, Any]: + """Restart systemd services""" + commands = action.get("commands", []) + output_lines = [] + + for cmd in commands: + if not cmd.startswith("systemctl restart "): + continue + + service = cmd.split()[-1] + + # Safety check + if any(protected in service for protected in self.PROTECTED_SERVICES): + output_lines.append(f"BLOCKED: {service} is protected") + continue + + try: + result = subprocess.run( + ["systemctl", "restart", service], + capture_output=True, + text=True, + timeout=30 + ) + + if result.returncode == 0: + output_lines.append(f"✓ Restarted {service}") + else: + output_lines.append(f"✗ Failed to restart {service}: {result.stderr}") + + except 
subprocess.TimeoutExpired: + output_lines.append(f"✗ Timeout restarting {service}") + + return { + "success": len(output_lines) > 0, + "output": "\n".join(output_lines) + } + + def _perform_cleanup(self, action: Dict[str, Any]) -> Dict[str, Any]: + """Perform system cleanup tasks""" + output_lines = [] + + # Nix store cleanup + if "nix" in action.get("proposed_action", "").lower(): + try: + result = subprocess.run( + ["nix-collect-garbage", "--delete-old"], + capture_output=True, + text=True, + timeout=300 + ) + output_lines.append(f"Nix cleanup: {result.stdout}") + except Exception as e: + output_lines.append(f"Nix cleanup failed: {e}") + + # Journal cleanup (keep last 7 days) + try: + result = subprocess.run( + ["journalctl", "--vacuum-time=7d"], + capture_output=True, + text=True, + timeout=60 + ) + output_lines.append(f"Journal cleanup: {result.stdout}") + except Exception as e: + output_lines.append(f"Journal cleanup failed: {e}") + + return { + "success": True, + "output": "\n".join(output_lines) + } + + def _nix_rebuild(self, action: Dict[str, Any]) -> Dict[str, Any]: + """Rebuild NixOS configuration""" + + # This is HIGH RISK - always requires approval or full autonomy + # And we should test first + + output_lines = [] + + # First, try a dry build + try: + result = subprocess.run( + ["nixos-rebuild", "dry-build", "--flake", ".#macha"], + capture_output=True, + text=True, + timeout=600, + cwd="/home/lily/Documents/nixos-servers" + ) + + if result.returncode != 0: + return { + "success": False, + "output": f"Dry build failed:\n{result.stderr}" + } + + output_lines.append("✓ Dry build successful") + + except Exception as e: + return { + "success": False, + "output": f"Dry build error: {e}" + } + + # Now do the actual rebuild + try: + result = subprocess.run( + ["nixos-rebuild", "switch", "--flake", ".#macha"], + capture_output=True, + text=True, + timeout=1200, + cwd="/home/lily/Documents/nixos-servers" + ) + + output_lines.append(result.stdout) + + return { + "success": result.returncode == 0, + "output": "\n".join(output_lines), + "error": result.stderr if result.returncode != 0 else None + } + + except Exception as e: + return { + "success": False, + "output": "\n".join(output_lines), + "error": str(e) + } + + def _apply_config_change(self, action: Dict[str, Any]) -> Dict[str, Any]: + """Apply a configuration file change""" + + config_changes = action.get("config_changes", {}) + file_path = config_changes.get("file") + + if not file_path: + return { + "success": False, + "output": "No file specified in config_changes" + } + + # For now, we DON'T auto-modify configs - too risky + # Instead, we create a suggested patch file + + patch_file = self.state_dir / f"suggested_patch_{int(time.time())}.txt" + with open(patch_file, 'w') as f: + f.write(f"Suggested change to {file_path}:\n\n") + f.write(config_changes.get("change", "No change description")) + f.write(f"\n\nReasoning: {action.get('reasoning', 'No reasoning provided')}") + + return { + "success": True, + "output": f"Config change suggestion saved to {patch_file}\nThis requires manual review and application." 
+ } + + def _run_investigation(self, action: Dict[str, Any]) -> Dict[str, Any]: + """Run diagnostic commands""" + commands = action.get("commands", []) + output_lines = [] + + for cmd in commands: + # Only allow safe read-only commands + safe_commands = ["journalctl", "systemctl status", "df", "free", "ps", "netstat", "ss"] + if not any(cmd.startswith(safe) for safe in safe_commands): + output_lines.append(f"BLOCKED unsafe command: {cmd}") + continue + + try: + result = subprocess.run( + cmd, + shell=True, + capture_output=True, + text=True, + timeout=30 + ) + output_lines.append(f"$ {cmd}") + output_lines.append(result.stdout) + except Exception as e: + output_lines.append(f"Error running {cmd}: {e}") + + return { + "success": True, + "output": "\n".join(output_lines) + } + + def _dry_run_action(self, action: Dict[str, Any]) -> Dict[str, Any]: + """Simulate action execution""" + return { + "executed": False, + "status": "dry_run", + "action": action, + "output": "Dry run mode - no actual changes made" + } + + def _queue_for_approval(self, action: Dict[str, Any], context: Dict[str, Any]): + """Add action to approval queue""" + queue = [] + if self.approval_queue.exists(): + with open(self.approval_queue, 'r') as f: + queue = json.load(f) + + # Check for duplicate pending actions + proposed_action = action.get("proposed_action", "") + diagnosis = action.get("diagnosis", "") + + for existing in queue: + # Skip already approved/rejected items + if existing.get("approved") is not None: + continue + + existing_action = existing.get("action", {}) + existing_proposed = existing_action.get("proposed_action", "") + existing_diagnosis = existing_action.get("diagnosis", "") + + # Check if this is essentially the same issue + # Match if diagnosis is very similar OR proposed action is very similar + if (diagnosis and existing_diagnosis and + self._similarity_check(diagnosis, existing_diagnosis) > 0.7): + print(f"Skipping duplicate action - similar diagnosis already queued") + return + + if (proposed_action and existing_proposed and + self._similarity_check(proposed_action, existing_proposed) > 0.7): + print(f"Skipping duplicate action - similar proposal already queued") + return + + queue.append({ + "timestamp": datetime.now().isoformat(), + "action": action, + "context": context, + "approved": None + }) + + with open(self.approval_queue, 'w') as f: + json.dump(queue, f, indent=2) + + def _similarity_check(self, str1: str, str2: str) -> float: + """Simple similarity check between two strings""" + # Normalize strings + s1 = str1.lower().strip() + s2 = str2.lower().strip() + + # Exact match + if s1 == s2: + return 1.0 + + # Check for significant word overlap + words1 = set(s1.split()) + words2 = set(s2.split()) + + # Remove common words that don't indicate similarity + common_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'have', 'has', 'had'} + words1 = words1 - common_words + words2 = words2 - common_words + + if not words1 or not words2: + return 0.0 + + # Calculate Jaccard similarity + intersection = len(words1 & words2) + union = len(words1 | words2) + + return intersection / union if union > 0 else 0.0 + + def _log_action(self, result: Dict[str, Any]): + """Log executed actions""" + with open(self.action_log, 'a') as f: + f.write(json.dumps(result) + '\n') + + def get_approval_queue(self) -> List[Dict[str, Any]]: + """Get pending actions awaiting approval""" + if not self.approval_queue.exists(): + return 
[] + + with open(self.approval_queue, 'r') as f: + return json.load(f) + + def approve_action(self, index: int) -> bool: + """Approve and execute a queued action, then remove it from queue""" + queue = self.get_approval_queue() + if 0 <= index < len(queue): + action_item = queue[index] + + # Execute the approved action + result = self._execute_action_impl(action_item["action"], action_item["context"]) + + # Archive the action (success or failure) + self._archive_action(action_item, result) + + # Remove from queue regardless of outcome + queue.pop(index) + + with open(self.approval_queue, 'w') as f: + json.dump(queue, f, indent=2) + + return result.get("success", False) + + return False + + def _archive_action(self, action_item: Dict[str, Any], result: Dict[str, Any]): + """Archive an approved action with its execution result""" + archive_file = self.state_dir / "approved_actions.jsonl" + + archive_entry = { + "timestamp": datetime.now().isoformat(), + "original_timestamp": action_item.get("timestamp"), + "action": action_item.get("action"), + "context": action_item.get("context"), + "result": result + } + + with open(archive_file, 'a') as f: + f.write(json.dumps(archive_entry) + '\n') + + def reject_action(self, index: int) -> bool: + """Reject and remove a queued action""" + queue = self.get_approval_queue() + if 0 <= index < len(queue): + removed_action = queue.pop(index) + + with open(self.approval_queue, 'w') as f: + json.dump(queue, f, indent=2) + + return True + + return False + + +if __name__ == "__main__": + import sys + + if len(sys.argv) > 1: + if sys.argv[1] == "queue": + executor = SafeExecutor() + queue = executor.get_approval_queue() + if queue: + print("\n" + "="*70) + print(f"PENDING ACTIONS: {len(queue)}") + print("="*70) + for i, item in enumerate(queue): + action = item.get("action", {}) + timestamp = item.get("timestamp", "unknown") + approved = item.get("approved") + + status = "✓ APPROVED" if approved else "⏳ PENDING" if approved is None else "✗ REJECTED" + + print(f"\n[{i}] {status} - {timestamp}") + print("-" * 70) + print(f"DIAGNOSIS: {action.get('diagnosis', 'N/A')}") + print(f"\nPROPOSED ACTION: {action.get('proposed_action', 'N/A')}") + print(f"TYPE: {action.get('action_type', 'N/A')}") + print(f"RISK: {action.get('risk_level', 'N/A')}") + + if action.get('commands'): + print(f"\nCOMMANDS:") + for cmd in action['commands']: + print(f" - {cmd}") + + if action.get('config_changes'): + print(f"\nCONFIG CHANGES:") + for key, value in action['config_changes'].items(): + print(f" {key}: {value}") + + print(f"\nREASONING: {action.get('reasoning', 'N/A')}") + print("\n" + "="*70 + "\n") + else: + print("No pending actions") + + elif sys.argv[1] == "approve" and len(sys.argv) > 2: + executor = SafeExecutor() + index = int(sys.argv[2]) + success = executor.approve_action(index) + print(f"Approval {'succeeded' if success else 'failed'}") + + elif sys.argv[1] == "reject" and len(sys.argv) > 2: + executor = SafeExecutor() + index = int(sys.argv[2]) + success = executor.reject_action(index) + print(f"Action {'rejected and removed from queue' if success else 'rejection failed'}") diff --git a/flake.nix b/flake.nix new file mode 100644 index 0000000..611432d --- /dev/null +++ b/flake.nix @@ -0,0 +1,41 @@ +{ + description = "Macha - AI-Powered Autonomous System Administrator"; + + inputs = { + nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; + }; + + outputs = { self, nixpkgs }: { + # NixOS module + nixosModules.default = import ./module.nix; + + # Alternative explicit name 
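+    # (same module under an explicit attribute, so consuming flakes can
+    # reference nixosModules.macha-autonomous by name)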
+ nixosModules.macha-autonomous = import ./module.nix; + + # For development + devShells = nixpkgs.lib.genAttrs [ "x86_64-linux" "aarch64-linux" ] (system: + let + pkgs = nixpkgs.legacyPackages.${system}; + pythonEnv = pkgs.python3.withPackages (ps: with ps; [ + requests + psutil + chromadb + ]); + in { + default = pkgs.mkShell { + packages = [ pythonEnv pkgs.git ]; + shellHook = '' + echo "Macha Autonomous Development Environment" + echo "Python packages: requests, psutil, chromadb" + ''; + }; + } + ); + + # Formatter + formatter = nixpkgs.lib.genAttrs [ "x86_64-linux" "aarch64-linux" ] (system: + nixpkgs.legacyPackages.${system}.nixpkgs-fmt + ); + }; +} + diff --git a/git_context.py b/git_context.py new file mode 100644 index 0000000..5fe2530 --- /dev/null +++ b/git_context.py @@ -0,0 +1,222 @@ +#!/usr/bin/env python3 +""" +Git Context - Extract context from NixOS configuration repository +""" + +import subprocess +from typing import Dict, List, Any, Optional +from datetime import datetime, timedelta +from pathlib import Path + + +class GitContext: + """Extract context from git repository""" + + def __init__(self, repo_path: str = "/etc/nixos"): + """ + Initialize git context extractor + + Args: + repo_path: Path to the git repository (default: /etc/nixos for NixOS systems) + """ + self.repo_path = Path(repo_path) + + def _run_git(self, args: List[str]) -> tuple[bool, str]: + """Run git command""" + try: + result = subprocess.run( + ["git", "-C", str(self.repo_path)] + args, + capture_output=True, + text=True, + timeout=10 + ) + return (result.returncode == 0, result.stdout.strip()) + except Exception as e: + return (False, str(e)) + + def get_current_branch(self) -> str: + """Get current git branch""" + success, output = self._run_git(["rev-parse", "--abbrev-ref", "HEAD"]) + return output if success else "unknown" + + def get_remote_url(self) -> str: + """Get git remote URL""" + success, output = self._run_git(["remote", "get-url", "origin"]) + return output if success else "" + + def get_recent_commits(self, count: int = 10, since: str = "1 week ago") -> List[Dict[str, str]]: + """ + Get recent commits + + Args: + count: Number of commits to retrieve + since: Time range (e.g., "1 week ago", "3 days ago") + + Returns: + List of commit dictionaries with hash, author, date, message + """ + success, output = self._run_git([ + "log", + f"--since={since}", + f"-n{count}", + "--format=%H|%an|%ar|%s" + ]) + + if not success: + return [] + + commits = [] + for line in output.split('\n'): + if not line.strip(): + continue + parts = line.split('|', 3) + if len(parts) == 4: + commits.append({ + "hash": parts[0][:8], # Short hash + "author": parts[1], + "date": parts[2], + "message": parts[3] + }) + + return commits + + def get_system_config_files(self, system_name: str) -> List[str]: + """ + Get configuration files for a specific system + + Args: + system_name: Name of the system (e.g., "macha", "rhiannon") + + Returns: + List of configuration file paths + """ + system_dir = self.repo_path / "systems" / system_name + config_files = [] + + if system_dir.exists(): + # Main config + if (system_dir.parent / f"{system_name}.nix").exists(): + config_files.append(f"systems/{system_name}.nix") + + # System-specific configs + for config_file in system_dir.rglob("*.nix"): + config_files.append(str(config_file.relative_to(self.repo_path))) + + return config_files + + def get_recent_changes_for_system(self, system_name: str, since: str = "1 week ago") -> List[Dict[str, str]]: + """ + Get recent changes 
affecting a specific system + + Args: + system_name: Name of the system + since: Time range + + Returns: + List of commits that affected this system + """ + config_files = self.get_system_config_files(system_name) + + if not config_files: + return [] + + # Get commits that touched these files + file_args = [] + for f in config_files: + file_args.extend(["--", f]) + + success, output = self._run_git([ + "log", + f"--since={since}", + "-n10", + "--format=%H|%an|%ar|%s" + ] + file_args) + + if not success: + return [] + + commits = [] + for line in output.split('\n'): + if not line.strip(): + continue + parts = line.split('|', 3) + if len(parts) == 4: + commits.append({ + "hash": parts[0][:8], + "author": parts[1], + "date": parts[2], + "message": parts[3] + }) + + return commits + + def get_system_context_summary(self, system_name: str) -> str: + """ + Get a summary of git context for a system + + Args: + system_name: Name of the system + + Returns: + Human-readable summary + """ + lines = [] + + # Repository info + repo_url = self.get_remote_url() + branch = self.get_current_branch() + + if repo_url: + lines.append(f"Configuration Repository: {repo_url}") + lines.append(f"Branch: {branch}") + + # Recent changes to this system + recent_changes = self.get_recent_changes_for_system(system_name, "2 weeks ago") + + if recent_changes: + lines.append(f"\nRecent configuration changes (last 2 weeks):") + for commit in recent_changes[:5]: + lines.append(f" - {commit['date']}: {commit['message']} ({commit['author']})") + else: + lines.append("\nNo recent configuration changes") + + return "\n".join(lines) + + def get_all_managed_systems(self) -> List[str]: + """ + Get list of all systems managed by this repository + + Returns: + List of system names + """ + systems = [] + systems_dir = self.repo_path / "systems" + + if systems_dir.exists(): + for system_file in systems_dir.glob("*.nix"): + if system_file.stem not in ["default"]: + systems.append(system_file.stem) + + return sorted(systems) + + +if __name__ == "__main__": + import sys + + git = GitContext() + + print("Repository:", git.get_remote_url()) + print("Branch:", git.get_current_branch()) + print("\nManaged Systems:") + for system in git.get_all_managed_systems(): + print(f" - {system}") + + print("\nRecent Commits:") + for commit in git.get_recent_commits(5): + print(f" {commit['hash']}: {commit['message']} - {commit['author']}, {commit['date']}") + + if len(sys.argv) > 1: + system = sys.argv[1] + print(f"\nContext for {system}:") + print(git.get_system_context_summary(system)) + diff --git a/issue_tracker.py b/issue_tracker.py new file mode 100644 index 0000000..0f287bb --- /dev/null +++ b/issue_tracker.py @@ -0,0 +1,219 @@ +#!/usr/bin/env python3 +""" +Issue Tracker - Internal ticketing system for tracking problems and their resolution +""" + +import json +import uuid +from datetime import datetime +from typing import Dict, List, Any, Optional +from pathlib import Path + + +class IssueTracker: + """Manages issue lifecycle: detection -> investigation -> resolution""" + + def __init__(self, context_db, log_dir: str = "/var/lib/macha/logs"): + self.context_db = context_db + self.log_dir = Path(log_dir) + self.log_dir.mkdir(parents=True, exist_ok=True) + self.closed_log = self.log_dir / "closed_issues.jsonl" + + def create_issue( + self, + hostname: str, + title: str, + description: str, + severity: str = "medium", + source: str = "auto-detected" + ) -> str: + """Create a new issue and return its ID""" + issue_id = str(uuid.uuid4()) + now = 
datetime.utcnow().isoformat() + + issue = { + "issue_id": issue_id, + "hostname": hostname, + "title": title, + "description": description, + "status": "open", + "severity": severity, + "created_at": now, + "updated_at": now, + "source": source, + "investigations": [], + "actions": [], + "resolution": None + } + + self.context_db.store_issue(issue) + return issue_id + + def get_issue(self, issue_id: str) -> Optional[Dict[str, Any]]: + """Retrieve an issue by ID""" + return self.context_db.get_issue(issue_id) + + def update_issue( + self, + issue_id: str, + status: Optional[str] = None, + investigation: Optional[Dict[str, Any]] = None, + action: Optional[Dict[str, Any]] = None + ) -> bool: + """Update an issue with new information""" + issue = self.get_issue(issue_id) + if not issue: + return False + + if status: + issue["status"] = status + + if investigation: + investigation["timestamp"] = datetime.utcnow().isoformat() + issue["investigations"].append(investigation) + + if action: + action["timestamp"] = datetime.utcnow().isoformat() + issue["actions"].append(action) + + issue["updated_at"] = datetime.utcnow().isoformat() + + self.context_db.update_issue(issue) + return True + + def find_similar_issue( + self, + hostname: str, + title: str, + description: str = None + ) -> Optional[Dict[str, Any]]: + """Find an existing open issue that matches this problem""" + open_issues = self.list_issues(hostname=hostname, status="open") + + # Simple similarity check on title + title_lower = title.lower() + for issue in open_issues: + issue_title_lower = issue.get("title", "").lower() + + # Check for keyword overlap + title_words = set(title_lower.split()) + issue_words = set(issue_title_lower.split()) + + # If >50% of words overlap, consider it similar + if len(title_words & issue_words) / max(len(title_words), 1) > 0.5: + return issue + + return None + + def list_issues( + self, + hostname: Optional[str] = None, + status: Optional[str] = None, + severity: Optional[str] = None + ) -> List[Dict[str, Any]]: + """List issues with optional filters""" + return self.context_db.list_issues( + hostname=hostname, + status=status, + severity=severity + ) + + def resolve_issue(self, issue_id: str, resolution: str) -> bool: + """Mark an issue as resolved with a resolution note""" + issue = self.get_issue(issue_id) + if not issue: + return False + + issue["status"] = "resolved" + issue["resolution"] = resolution + issue["updated_at"] = datetime.utcnow().isoformat() + + self.context_db.update_issue(issue) + return True + + def close_issue(self, issue_id: str) -> bool: + """Archive a resolved issue to the closed log""" + issue = self.get_issue(issue_id) + if not issue: + return False + + # Can only close resolved issues + if issue["status"] != "resolved": + return False + + issue["status"] = "closed" + issue["closed_at"] = datetime.utcnow().isoformat() + + # Archive to closed log + self._archive_issue(issue) + + # Remove from active database + self.context_db.delete_issue(issue_id) + + return True + + def get_issue_history(self, issue_id: str) -> Dict[str, Any]: + """Get full history for an issue (investigations + actions)""" + issue = self.get_issue(issue_id) + if not issue: + return {} + + return { + "issue": issue, + "investigation_count": len(issue.get("investigations", [])), + "action_count": len(issue.get("actions", [])), + "age_hours": self._calculate_age(issue["created_at"]), + "last_activity": issue["updated_at"] + } + + def auto_resolve_if_fixed(self, hostname: str, detected_problems: List[str]) -> int: + 
""" + Auto-resolve open issues if their problems are no longer detected. + Returns count of auto-resolved issues. + """ + open_issues = self.list_issues(hostname=hostname, status="open") + resolved_count = 0 + + # Convert detected problems to lowercase for comparison + detected_lower = [p.lower() for p in detected_problems] + + for issue in open_issues: + title_lower = issue.get("title", "").lower() + desc_lower = issue.get("description", "").lower() + + # Check if issue keywords are still in detected problems + still_present = False + for detected in detected_lower: + if any(word in detected for word in title_lower.split()) or \ + any(word in detected for word in desc_lower.split()): + still_present = True + break + + # If problem is no longer detected, auto-resolve + if not still_present: + self.resolve_issue( + issue["issue_id"], + "Auto-resolved: Problem no longer detected in system monitoring" + ) + resolved_count += 1 + + return resolved_count + + def _archive_issue(self, issue: Dict[str, Any]): + """Append closed issue to the archive log""" + try: + with open(self.closed_log, "a") as f: + f.write(json.dumps(issue) + "\n") + except Exception as e: + print(f"Failed to archive issue {issue.get('issue_id')}: {e}") + + def _calculate_age(self, created_at: str) -> float: + """Calculate age of issue in hours""" + try: + created = datetime.fromisoformat(created_at) + now = datetime.utcnow() + delta = now - created + return delta.total_seconds() / 3600 + except: + return 0 + diff --git a/journal_monitor.py b/journal_monitor.py new file mode 100644 index 0000000..a42f992 --- /dev/null +++ b/journal_monitor.py @@ -0,0 +1,358 @@ +#!/usr/bin/env python3 +""" +Journal Monitor - Monitor remote systems via centralized journald +""" + +import json +import subprocess +from typing import Dict, List, Any, Optional, Set +from datetime import datetime, timedelta +from pathlib import Path +from collections import defaultdict + + +class JournalMonitor: + """Monitor systems via centralized journald logs""" + + def __init__(self, domain: str = "coven.systems"): + """ + Initialize journal monitor + + Args: + domain: Domain suffix for FQDNs + """ + self.domain = domain + self.known_hosts: Set[str] = set() + + def _run_journalctl(self, args: List[str], timeout: int = 30) -> tuple[bool, str, str]: + """ + Run journalctl command + + Args: + args: Arguments to journalctl + timeout: Timeout in seconds + + Returns: + (success, stdout, stderr) + """ + try: + cmd = ["journalctl"] + args + + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=timeout + ) + + return ( + result.returncode == 0, + result.stdout.strip(), + result.stderr.strip() + ) + + except subprocess.TimeoutExpired: + return False, "", f"Command timed out after {timeout}s" + except Exception as e: + return False, "", str(e) + + def discover_hosts(self) -> List[str]: + """ + Discover hosts reporting to centralized journal + + Returns: + List of discovered FQDNs + """ + success, output, _ = self._run_journalctl([ + "--output=json", + "--since=1 day ago", + "-n", "10000" + ]) + + if not success: + return [] + + hosts = set() + for line in output.split('\n'): + if not line.strip(): + continue + try: + entry = json.loads(line) + hostname = entry.get('_HOSTNAME', '') + + # Ensure FQDN format + if hostname and not hostname.endswith(f'.{self.domain}'): + if '.' 
not in hostname: + hostname = f"{hostname}.{self.domain}" + + if hostname: + hosts.add(hostname) + + except json.JSONDecodeError: + continue + + self.known_hosts = hosts + return sorted(hosts) + + def collect_resources(self, hostname: str, since: str = "5 minutes ago") -> Dict[str, Any]: + """ + Collect resource usage from journal entries + + This extracts CPU/memory info from systemd service messages + """ + # For now, return empty - we'll primarily use this for service/log monitoring + # Resource metrics could be added if systems log them + return { + "cpu_percent": 0, + "memory_percent": 0, + "load_average": {"1min": 0, "5min": 0, "15min": 0} + } + + def collect_systemd_status(self, hostname: str, since: str = "5 minutes ago") -> Dict[str, Any]: + """ + Collect systemd service status from journal + + Args: + hostname: FQDN of the system + since: Time range to check + + Returns: + Dictionary with failed service information + """ + # Query for systemd service failures + success, output, _ = self._run_journalctl([ + f"_HOSTNAME={hostname}", + "--priority=err", + "--unit=*.service", + f"--since={since}", + "--output=json" + ]) + + if not success: + return {"failed_count": 0, "failed_services": []} + + failed_services = {} + for line in output.split('\n'): + if not line.strip(): + continue + try: + entry = json.loads(line) + unit = entry.get('_SYSTEMD_UNIT', '') + if unit and unit.endswith('.service'): + service_name = unit.replace('.service', '') + if service_name not in failed_services: + failed_services[service_name] = { + "unit": unit, + "message": entry.get('MESSAGE', ''), + "timestamp": entry.get('__REALTIME_TIMESTAMP', '') + } + except json.JSONDecodeError: + continue + + return { + "failed_count": len(failed_services), + "failed_services": list(failed_services.values()) + } + + def collect_log_errors(self, hostname: str, since: str = "1 hour ago") -> Dict[str, Any]: + """ + Collect error logs from journal + + Args: + hostname: FQDN of the system + since: Time range to check + + Returns: + Dictionary with error log information + """ + success, output, _ = self._run_journalctl([ + f"_HOSTNAME={hostname}", + "--priority=err", + f"--since={since}", + "--output=json" + ]) + + if not success: + return {"error_count_1h": 0, "recent_errors": []} + + errors = [] + error_count = 0 + + for line in output.split('\n'): + if not line.strip(): + continue + try: + entry = json.loads(line) + error_count += 1 + + if len(errors) < 10: # Keep last 10 errors + errors.append({ + "message": entry.get('MESSAGE', ''), + "unit": entry.get('_SYSTEMD_UNIT', 'unknown'), + "priority": entry.get('PRIORITY', ''), + "timestamp": entry.get('__REALTIME_TIMESTAMP', '') + }) + + except json.JSONDecodeError: + continue + + return { + "error_count_1h": error_count, + "recent_errors": errors + } + + def collect_disk_usage(self, hostname: str) -> Dict[str, Any]: + """ + Collect disk usage - Note: This would require systems to log disk metrics + For now, returns empty. 
Could be enhanced if systems periodically log disk usage + """ + return {"partitions": []} + + def collect_network_status(self, hostname: str, since: str = "5 minutes ago") -> Dict[str, Any]: + """ + Check network connectivity based on recent journal activity + + If we see recent logs from a host, it's reachable + """ + success, output, _ = self._run_journalctl([ + f"_HOSTNAME={hostname}", + f"--since={since}", + "-n", "1", + "--output=json" + ]) + + # If we got recent logs, network is working + internet_reachable = bool(success and output.strip()) + + return { + "internet_reachable": internet_reachable, + "last_seen": datetime.now().isoformat() if internet_reachable else None + } + + def collect_all(self, hostname: str) -> Dict[str, Any]: + """ + Collect all monitoring data for a host from journal + + Args: + hostname: FQDN of the system to monitor + + Returns: + Complete monitoring data + """ + # First check if we have recent logs from this host + net_status = self.collect_network_status(hostname) + + if not net_status.get("internet_reachable"): + return { + "hostname": hostname, + "reachable": False, + "error": "No recent journal entries from this host" + } + + return { + "hostname": hostname, + "reachable": True, + "source": "journal", + "resources": self.collect_resources(hostname), + "systemd": self.collect_systemd_status(hostname), + "disk": self.collect_disk_usage(hostname), + "network": net_status, + "logs": self.collect_log_errors(hostname), + } + + def get_summary(self, data: Dict[str, Any]) -> str: + """Generate human-readable summary from journal data""" + hostname = data.get("hostname", "unknown") + + if not data.get("reachable", False): + return f"❌ {hostname}: {data.get('error', 'Unreachable')}" + + lines = [f"System: {hostname} (via journal)"] + + # Services + systemd = data.get("systemd", {}) + failed_count = systemd.get("failed_count", 0) + if failed_count > 0: + lines.append(f"Services: {failed_count} failed") + for svc in systemd.get("failed_services", [])[:3]: + lines.append(f" - {svc.get('unit', 'unknown')}") + else: + lines.append("Services: No recent failures") + + # Network + net = data.get("network", {}) + last_seen = net.get("last_seen") + if last_seen: + lines.append(f"Last seen: {last_seen}") + + # Logs + logs = data.get("logs", {}) + error_count = logs.get("error_count_1h", 0) + if error_count > 0: + lines.append(f"Recent logs: {error_count} errors in last hour") + + return "\n".join(lines) + + def get_active_services(self, hostname: str, since: str = "1 hour ago") -> List[str]: + """ + Get list of active services on a host by looking at journal entries + + This helps with auto-discovery of what's running on each system + """ + success, output, _ = self._run_journalctl([ + f"_HOSTNAME={hostname}", + f"--since={since}", + "--output=json", + "-n", "1000" + ]) + + if not success: + return [] + + services = set() + for line in output.split('\n'): + if not line.strip(): + continue + try: + entry = json.loads(line) + unit = entry.get('_SYSTEMD_UNIT', '') + if unit and unit.endswith('.service'): + # Extract service name + service = unit.replace('.service', '') + # Filter out common system services, focus on application services + if service not in ['systemd-journald', 'systemd-logind', 'sshd', 'dbus']: + services.add(service) + except json.JSONDecodeError: + continue + + return sorted(services) + + +if __name__ == "__main__": + import sys + + monitor = JournalMonitor() + + # Discover hosts + print("Discovering hosts from journal...") + hosts = 
monitor.discover_hosts() + print(f"Found {len(hosts)} hosts:") + for host in hosts: + print(f" - {host}") + + # Monitor first host if available + if hosts: + hostname = hosts[0] + print(f"\nMonitoring {hostname}...") + data = monitor.collect_all(hostname) + + print("\n" + "="*60) + print(monitor.get_summary(data)) + print("="*60) + + # Discover services + print(f"\nActive services on {hostname}:") + services = monitor.get_active_services(hostname) + for svc in services[:10]: + print(f" - {svc}") + diff --git a/module.nix b/module.nix new file mode 100644 index 0000000..37c019c --- /dev/null +++ b/module.nix @@ -0,0 +1,847 @@ +{ config, lib, pkgs, ... }: + +with lib; + +let + cfg = config.services.macha-autonomous; + + # Python environment with all dependencies + pythonEnv = pkgs.python3.withPackages (ps: with ps; [ + requests + psutil + chromadb + ]); + + # Main autonomous system package + macha-autonomous = pkgs.writeScriptBin "macha-autonomous" '' + #!${pythonEnv}/bin/python3 + import sys + sys.path.insert(0, "${./.}") + from orchestrator import main + main() + ''; + + # Config file + configFile = pkgs.writeText "macha-autonomous-config.json" (builtins.toJSON { + check_interval = cfg.checkInterval; + autonomy_level = cfg.autonomyLevel; + ollama_host = cfg.ollamaHost; + model = cfg.model; + config_repo = cfg.configRepo; + config_branch = cfg.configBranch; + }); + +in { + options.services.macha-autonomous = { + enable = mkEnableOption "Macha autonomous system maintenance"; + + autonomyLevel = mkOption { + type = types.enum [ "observe" "suggest" "auto-safe" "auto-full" ]; + default = "suggest"; + description = '' + Level of autonomy for the system: + - observe: Only monitor and log, no actions + - suggest: Propose actions, require manual approval + - auto-safe: Auto-execute low-risk actions (restarts, cleanup) + - auto-full: Full autonomy with safety limits (still requires approval for high-risk) + ''; + }; + + checkInterval = mkOption { + type = types.int; + default = 300; + description = "Interval in seconds between system checks"; + }; + + ollamaHost = mkOption { + type = types.str; + default = "http://localhost:11434"; + description = "Ollama API host"; + }; + + model = mkOption { + type = types.str; + default = "llama3.1:70b"; + description = "LLM model to use for reasoning"; + }; + + user = mkOption { + type = types.str; + default = "macha"; + description = "User to run the autonomous system as"; + }; + + group = mkOption { + type = types.str; + default = "macha"; + description = "Group to run the autonomous system as"; + }; + + gotifyUrl = mkOption { + type = types.str; + default = ""; + example = "http://rhiannon:8181"; + description = "Gotify server URL for notifications (empty to disable)"; + }; + + gotifyToken = mkOption { + type = types.str; + default = ""; + description = "Gotify application token for notifications"; + }; + + remoteSystems = mkOption { + type = types.listOf types.str; + default = []; + example = [ "rhiannon" "alexander" ]; + description = "List of remote NixOS systems to monitor and maintain"; + }; + + configRepo = mkOption { + type = types.str; + default = if config.programs.nh.flake != null + then config.programs.nh.flake + else "git+https://git.coven.systems/lily/nixos-servers"; + description = "URL of the NixOS configuration repository (auto-detected from programs.nh.flake if available)"; + }; + + configBranch = mkOption { + type = types.str; + default = "main"; + description = "Branch of the NixOS configuration repository"; + }; + }; + + config = mkIf 
cfg.enable { + # Create user and group + users.users.${cfg.user} = { + isSystemUser = true; + group = cfg.group; + uid = 2501; + description = "Macha autonomous system maintenance"; + home = "/var/lib/macha"; + createHome = true; + }; + + users.groups.${cfg.group} = {}; + + # Git configuration for credential storage + programs.git = { + enable = true; + config = { + credential.helper = "store"; + }; + }; + + # Ollama service for AI inference + services.ollama = { + enable = true; + acceleration = "rocm"; + host = "0.0.0.0"; + port = 11434; + environmentVariables = { + "OLLAMA_DEBUG" = "1"; + "OLLAMA_KEEP_ALIVE" = "600"; + "OLLAMA_NEW_ENGINE" = "true"; + "OLLAMA_CONTEXT_LENGTH" = "131072"; + }; + openFirewall = false; # Keep internal only + loadModels = [ + "qwen3" + "gpt-oss" + "gemma3" + "gpt-oss:20b" + "qwen3:4b-instruct-2507-fp16" + "qwen3:8b-fp16" + "mistral:7b" + "chroma/all-minilm-l6-v2-f32:latest" + ]; + }; + + # ChromaDB service for vector storage + services.chromadb = { + enable = true; + port = 8000; + dbpath = "/var/lib/chromadb"; + }; + + # Give the user permissions it needs + security.sudo.extraRules = [{ + users = [ cfg.user ]; + commands = [ + # Local system management + { command = "${pkgs.systemd}/bin/systemctl restart *"; options = [ "NOPASSWD" ]; } + { command = "${pkgs.systemd}/bin/systemctl status *"; options = [ "NOPASSWD" ]; } + { command = "${pkgs.systemd}/bin/journalctl *"; options = [ "NOPASSWD" ]; } + { command = "${pkgs.nix}/bin/nix-collect-garbage *"; options = [ "NOPASSWD" ]; } + # Remote system access (uses existing root SSH keys) + { command = "${pkgs.openssh}/bin/ssh *"; options = [ "NOPASSWD" ]; } + { command = "${pkgs.openssh}/bin/scp *"; options = [ "NOPASSWD" ]; } + { command = "${pkgs.nixos-rebuild}/bin/nixos-rebuild *"; options = [ "NOPASSWD" ]; } + ]; + }]; + + # Config file + environment.etc."macha-autonomous/config.json".source = configFile; + + # State directory and queue directories (world-writable queues for multi-user access) + # Using 'z' to set permissions even if directory exists + systemd.tmpfiles.rules = [ + "d /var/lib/macha 0755 ${cfg.user} ${cfg.group} -" + "z /var/lib/macha 0755 ${cfg.user} ${cfg.group} -" # Ensure permissions are set + "d /var/lib/macha/queues 0777 ${cfg.user} ${cfg.group} -" + "d /var/lib/macha/queues/ollama 0777 ${cfg.user} ${cfg.group} -" + "d /var/lib/macha/queues/ollama/pending 0777 ${cfg.user} ${cfg.group} -" + "d /var/lib/macha/queues/ollama/processing 0777 ${cfg.user} ${cfg.group} -" + "d /var/lib/macha/queues/ollama/completed 0777 ${cfg.user} ${cfg.group} -" + "d /var/lib/macha/queues/ollama/failed 0777 ${cfg.user} ${cfg.group} -" + "d /var/lib/macha/tool_cache 0777 ${cfg.user} ${cfg.group} -" + ]; + + # Systemd service + systemd.services.macha-autonomous = { + description = "Macha Autonomous System Maintenance"; + after = [ "network.target" "ollama.service" ]; + wants = [ "ollama.service" ]; + wantedBy = [ "multi-user.target" ]; + + serviceConfig = { + Type = "simple"; + User = cfg.user; + Group = cfg.group; + WorkingDirectory = "/var/lib/macha"; + ExecStart = "${macha-autonomous}/bin/macha-autonomous --mode continuous --autonomy ${cfg.autonomyLevel} --interval ${toString cfg.checkInterval}"; + Restart = "on-failure"; + RestartSec = "30s"; + + # Security hardening + PrivateTmp = true; + NoNewPrivileges = false; # Need privileges for sudo + ProtectSystem = "strict"; + ProtectHome = true; + ReadWritePaths = [ "/var/lib/macha" "/var/lib/macha/tool_cache" "/var/lib/macha/queues" ]; + + # Resource limits + 
MemoryLimit = "1G"; + CPUQuota = "50%"; + }; + + environment = { + PYTHONPATH = toString ./.; + GOTIFY_URL = cfg.gotifyUrl; + GOTIFY_TOKEN = cfg.gotifyToken; + CHROMA_ENV_FILE = ""; # Prevent ChromaDB from trying to read .env files + ANONYMIZED_TELEMETRY = "False"; # Disable ChromaDB telemetry + }; + + path = [ pkgs.git ]; # Make git available for config parsing + }; + + # Ollama Queue Worker Service (serializes all Ollama requests) + systemd.services.ollama-queue-worker = { + description = "Macha Ollama Queue Worker"; + after = [ "network.target" "ollama.service" ]; + wants = [ "ollama.service" ]; + wantedBy = [ "multi-user.target" ]; + + serviceConfig = { + Type = "simple"; + User = cfg.user; + Group = cfg.group; + WorkingDirectory = "/var/lib/macha"; + ExecStart = "${pythonEnv}/bin/python3 ${./.}/ollama_worker.py"; + Restart = "on-failure"; + RestartSec = "10s"; + + # Security hardening + PrivateTmp = true; + NoNewPrivileges = true; + ProtectSystem = "strict"; + ProtectHome = true; + ReadWritePaths = [ "/var/lib/macha/queues" "/var/lib/macha/tool_cache" ]; + + # Resource limits + MemoryLimit = "512M"; + CPUQuota = "25%"; + }; + + environment = { + PYTHONPATH = toString ./.; + CHROMA_ENV_FILE = ""; + ANONYMIZED_TELEMETRY = "False"; + }; + }; + + # CLI tools for manual control and system packages + environment.systemPackages = with pkgs; [ + macha-autonomous + # Python packages for ChromaDB + python313 + python313Packages.pip + python313Packages.chromadb.pythonModule + + # Tool to check approval queue + (pkgs.writeScriptBin "macha-approve" '' + #!${pkgs.bash}/bin/bash + if [ "$1" == "list" ]; then + sudo -u ${cfg.user} ${pythonEnv}/bin/python3 ${./.}/executor.py queue + elif [ "$1" == "discuss" ] && [ -n "$2" ]; then + ACTION_ID="$2" + echo "===================================================================" + echo "Interactive Discussion with Macha about Action #$ACTION_ID" + echo "===================================================================" + echo "" + + # Initial explanation + sudo -u ${cfg.user} ${pkgs.coreutils}/bin/env CHROMA_ENV_FILE="" ANONYMIZED_TELEMETRY="False" ${pythonEnv}/bin/python3 ${./.}/conversation.py --discuss "$ACTION_ID" + + echo "" + echo "===================================================================" + echo "You can now ask follow-up questions about this action." + echo "Type 'approve' to approve it, 'reject' to reject it, or 'exit' to quit." + echo "===================================================================" + + # Interactive loop + while true; do + echo "" + echo -n "You: " + read -r USER_INPUT + + # Check for special commands + if [ "$USER_INPUT" = "exit" ] || [ "$USER_INPUT" = "quit" ] || [ -z "$USER_INPUT" ]; then + echo "Exiting discussion." + break + elif [ "$USER_INPUT" = "approve" ]; then + echo "Approving action #$ACTION_ID..." + sudo -u ${cfg.user} ${pythonEnv}/bin/python3 ${./.}/executor.py approve "$ACTION_ID" + break + elif [ "$USER_INPUT" = "reject" ]; then + echo "Rejecting and removing action #$ACTION_ID from queue..." 
+ sudo -u ${cfg.user} ${pythonEnv}/bin/python3 ${./.}/executor.py reject "$ACTION_ID" + break + fi + + # Ask Macha the follow-up question in context of the action + echo "" + echo -n "Macha: " + sudo -u ${cfg.user} ${pkgs.coreutils}/bin/env CHROMA_ENV_FILE="" ANONYMIZED_TELEMETRY="False" ${pythonEnv}/bin/python3 ${./.}/conversation.py --discuss "$ACTION_ID" --follow-up "$USER_INPUT" + echo "" + done + elif [ "$1" == "approve" ] && [ -n "$2" ]; then + sudo -u ${cfg.user} ${pythonEnv}/bin/python3 ${./.}/executor.py approve "$2" + elif [ "$1" == "reject" ] && [ -n "$2" ]; then + sudo -u ${cfg.user} ${pythonEnv}/bin/python3 ${./.}/executor.py reject "$2" + else + echo "Usage:" + echo " macha-approve list - Show pending actions" + echo " macha-approve discuss <N> - Discuss action number N with Macha (interactive)" + echo " macha-approve approve <N> - Approve action number N" + echo " macha-approve reject <N> - Reject and remove action number N from queue" + fi + '') + + # Tool to run manual check + (pkgs.writeScriptBin "macha-check" '' + #!${pkgs.bash}/bin/bash + sudo -u ${cfg.user} sh -c 'cd /var/lib/macha && CHROMA_ENV_FILE="" ANONYMIZED_TELEMETRY="False" ${macha-autonomous}/bin/macha-autonomous --mode once --autonomy ${cfg.autonomyLevel}' + '') + + # Tool to view logs + (pkgs.writeScriptBin "macha-logs" '' + #!${pkgs.bash}/bin/bash + case "$1" in + orchestrator) + sudo tail -f /var/lib/macha/orchestrator.log + ;; + decisions) + sudo tail -f /var/lib/macha/decisions.jsonl + ;; + actions) + sudo tail -f /var/lib/macha/actions.jsonl + ;; + service) + journalctl -u macha-autonomous.service -f + ;; + *) + echo "Usage: macha-logs [orchestrator|decisions|actions|service]" + ;; + esac + '') + + # Tool to send test notification + (pkgs.writeScriptBin "macha-notify" '' + #!${pkgs.bash}/bin/bash + if [ -z "$1" ] || [ -z "$2" ]; then + echo "Usage: macha-notify <title> <message> [priority]" + echo "Example: macha-notify 'Test' 'This is a test' 5" + echo "Priorities: 2 (low), 5 (medium), 8 (high)" + exit 1 + fi + + export GOTIFY_URL="${cfg.gotifyUrl}" + export GOTIFY_TOKEN="${cfg.gotifyToken}" + + ${pythonEnv}/bin/python3 ${./.}/notifier.py "$1" "$2" "''${3:-5}" + '') + + # Tool to query config files + (pkgs.writeScriptBin "macha-configs" '' + #!${pkgs.bash}/bin/bash + export PYTHONPATH=${toString ./.} + export CHROMA_ENV_FILE="" + export ANONYMIZED_TELEMETRY="False" + + if [ $# -eq 0 ]; then + echo "Usage: macha-configs <search-query> [system-name]" + echo "Examples:" + echo " macha-configs gotify" + echo " macha-configs 'journald configuration'" + echo " macha-configs ollama macha.coven.systems" + exit 1 + fi + + QUERY="$1" + SYSTEM="''${2:-}" + + ${pythonEnv}/bin/python3 -c " +from context_db import ContextDatabase +import sys + +db = ContextDatabase() +query = sys.argv[1] +system = sys.argv[2] if len(sys.argv) > 2 else None + +print(f'Searching for: {query}') +if system: + print(f'Filtered to system: {system}') +print('='*60) + +configs = db.query_config_files(query, system=system, n_results=5) + +if not configs: + print('No matching configuration files found.') +else: + for i, cfg in enumerate(configs, 1): + print(f\"\\n{i}. {cfg['path']} (relevance: {cfg['relevance']:.1%})\") + print(f\" Category: {cfg['metadata']['category']}\") + print(' Preview:') + preview = cfg['content'][:300].replace('\\n', '\\n ') + print(f' {preview}') + if len(cfg['content']) > 300: + print(' ... 
(use macha-configs-read to see full file)') + " "$QUERY" "$SYSTEM" + '') + + # Interactive chat tool (runs as invoking user, not as macha-autonomous) + (pkgs.writeScriptBin "macha-chat" '' + #!${pkgs.bash}/bin/bash + export PYTHONPATH=${toString ./.} + export CHROMA_ENV_FILE="" + export ANONYMIZED_TELEMETRY="False" + + # Run as the current user, not as macha-autonomous + # This allows the chat to execute privileged commands with the user's permissions + ${pythonEnv}/bin/python3 ${./.}/chat.py + '') + + # Tool to read full config file + (pkgs.writeScriptBin "macha-configs-read" '' + #!${pkgs.bash}/bin/bash + export PYTHONPATH=${toString ./.} + export CHROMA_ENV_FILE="" + export ANONYMIZED_TELEMETRY="False" + + if [ $# -eq 0 ]; then + echo "Usage: macha-configs-read <file-path>" + echo "Example: macha-configs-read apps/gotify.nix" + exit 1 + fi + + ${pythonEnv}/bin/python3 -c " +from context_db import ContextDatabase +import sys + +db = ContextDatabase() +file_path = sys.argv[1] + +cfg = db.get_config_file(file_path) + +if not cfg: + print(f'Config file not found: {file_path}') + sys.exit(1) + +print(f'File: {cfg[\"path\"]}') +print(f'Category: {cfg[\"metadata\"][\"category\"]}') +print('='*60) +print(cfg['content']) + " "$1" + '') + + # Tool to view system registry + (pkgs.writeScriptBin "macha-systems" '' + #!${pkgs.bash}/bin/bash + export PYTHONPATH=${toString ./.} + export CHROMA_ENV_FILE="" + export ANONYMIZED_TELEMETRY="False" + ${pythonEnv}/bin/python3 -c " +from context_db import ContextDatabase +import json + +db = ContextDatabase() +systems = db.get_all_systems() + +print('Registered Systems:') +print('='*60) +for system in systems: + os_type = system.get('os_type', 'unknown').upper() + print(f\"\\n{system['hostname']} ({system['type']}) [{os_type}]\") + print(f\" Config Repo: {system.get('config_repo') or '(not set)'}\") + print(f\" Branch: {system.get('config_branch', 'unknown')}\") + if system.get('services'): + print(f\" Services: {', '.join(system['services'][:10])}\") + if len(system['services']) > 10: + print(f\" ... and {len(system['services']) - 10} more\") + if system.get('capabilities'): + print(f\" Capabilities: {', '.join(system['capabilities'])}\") +print('='*60) + " + '') + + # Tool to ask Macha questions + (pkgs.writeScriptBin "macha-ask" '' + #!${pkgs.bash}/bin/bash + if [ $# -eq 0 ]; then + echo "Usage: macha-ask <your question>" + echo "Example: macha-ask Why did you recommend restarting that service?" 
+ exit 1 + fi + sudo -u ${cfg.user} ${pkgs.coreutils}/bin/env CHROMA_ENV_FILE="" ANONYMIZED_TELEMETRY="False" ${pythonEnv}/bin/python3 ${./.}/conversation.py "$@" + '') + + # Issue tracking CLI + (pkgs.writeScriptBin "macha-issues" '' + #!${pythonEnv}/bin/python3 + import sys + import os + os.environ["CHROMA_ENV_FILE"] = "" + os.environ["ANONYMIZED_TELEMETRY"] = "False" + sys.path.insert(0, "${./.}") + + from context_db import ContextDatabase + from issue_tracker import IssueTracker + from datetime import datetime + import json + + db = ContextDatabase() + tracker = IssueTracker(db) + + def list_issues(show_all=False): + """List issues""" + if show_all: + issues = tracker.list_issues() + else: + issues = tracker.list_issues(status="open") + + if not issues: + print("No issues found") + return + + print("="*70) + print(f"ISSUES: {len(issues)}") + print("="*70) + + for issue in issues: + issue_id = issue['issue_id'][:8] + age_hours = (datetime.utcnow() - datetime.fromisoformat(issue['created_at'])).total_seconds() / 3600 + inv_count = len(issue.get('investigations', [])) + action_count = len(issue.get('actions', [])) + + print(f"\n[{issue_id}] {issue['title']}") + print(f" Host: {issue['hostname']}") + print(f" Status: {issue['status'].upper()} | Severity: {issue['severity'].upper()}") + print(f" Age: {age_hours:.1f}h | Activity: {inv_count} investigations, {action_count} actions") + print(f" Source: {issue['source']}") + if issue.get('resolution'): + print(f" Resolution: {issue['resolution']}") + + def show_issue(issue_id): + """Show detailed issue information""" + # Find issue by partial ID + all_issues = tracker.list_issues() + matching = [i for i in all_issues if i['issue_id'].startswith(issue_id)] + + if not matching: + print(f"Issue {issue_id} not found") + return + + issue = matching[0] + full_id = issue['issue_id'] + + print("="*70) + print(f"ISSUE: {issue['title']}") + print("="*70) + print(f"ID: {full_id}") + print(f"Host: {issue['hostname']}") + print(f"Status: {issue['status'].upper()}") + print(f"Severity: {issue['severity'].upper()}") + print(f"Source: {issue['source']}") + print(f"Created: {issue['created_at']}") + print(f"Updated: {issue['updated_at']}") + print(f"\nDescription:\n{issue['description']}") + + investigations = issue.get('investigations', []) + if investigations: + print(f"\n{'─'*70}") + print(f"INVESTIGATIONS ({len(investigations)}):") + for i, inv in enumerate(investigations, 1): + print(f"\n [{i}] {inv.get('timestamp', 'N/A')}") + print(f" Diagnosis: {inv.get('diagnosis', 'N/A')}") + print(f" Commands: {', '.join(inv.get('commands', []))}") + print(f" Success: {inv.get('success', False)}") + if inv.get('output'): + print(f" Output: {inv['output'][:200]}...") + + actions = issue.get('actions', []) + if actions: + print(f"\n{'─'*70}") + print(f"ACTIONS ({len(actions)}):") + for i, action in enumerate(actions, 1): + print(f"\n [{i}] {action.get('timestamp', 'N/A')}") + print(f" Action: {action.get('proposed_action', 'N/A')}") + print(f" Risk: {action.get('risk_level', 'N/A').upper()}") + print(f" Commands: {', '.join(action.get('commands', []))}") + print(f" Success: {action.get('success', False)}") + + if issue.get('resolution'): + print(f"\n{'─'*70}") + print(f"RESOLUTION:") + print(f" {issue['resolution']}") + + print("="*70) + + def create_issue(description): + """Create a new issue manually""" + import socket + hostname = f"{socket.gethostname()}.coven.systems" + + issue_id = tracker.create_issue( + hostname=hostname, + title=description[:100], + 
description=description, + severity="medium", + source="user-reported" + ) + + print(f"Created issue: {issue_id[:8]}") + print(f"Title: {description[:100]}") + + def resolve_issue(issue_id, resolution="Manually resolved"): + """Mark an issue as resolved""" + # Find issue by partial ID + all_issues = tracker.list_issues() + matching = [i for i in all_issues if i['issue_id'].startswith(issue_id)] + + if not matching: + print(f"Issue {issue_id} not found") + return + + full_id = matching[0]['issue_id'] + success = tracker.resolve_issue(full_id, resolution) + + if success: + print(f"Resolved issue {issue_id[:8]}") + else: + print(f"Failed to resolve issue {issue_id}") + + def close_issue(issue_id): + """Archive a resolved issue""" + # Find issue by partial ID + all_issues = tracker.list_issues() + matching = [i for i in all_issues if i['issue_id'].startswith(issue_id)] + + if not matching: + print(f"Issue {issue_id} not found") + return + + full_id = matching[0]['issue_id'] + + if matching[0]['status'] != 'resolved': + print(f"Issue {issue_id} must be resolved before closing") + print(f"Use: macha-issues resolve {issue_id}") + return + + success = tracker.close_issue(full_id) + + if success: + print(f"Closed and archived issue {issue_id[:8]}") + else: + print(f"Failed to close issue {issue_id}") + + # Main CLI + if len(sys.argv) < 2: + print("Usage: macha-issues <command> [options]") + print("") + print("Commands:") + print(" list List open issues") + print(" list --all List all issues (including resolved/closed)") + print(" show <id> Show detailed issue information") + print(" create <desc> Create a new issue manually") + print(" resolve <id> Mark issue as resolved") + print(" close <id> Archive a resolved issue") + sys.exit(1) + + command = sys.argv[1] + + if command == "list": + show_all = "--all" in sys.argv + list_issues(show_all) + elif command == "show" and len(sys.argv) >= 3: + show_issue(sys.argv[2]) + elif command == "create" and len(sys.argv) >= 3: + description = " ".join(sys.argv[2:]) + create_issue(description) + elif command == "resolve" and len(sys.argv) >= 3: + resolution = " ".join(sys.argv[3:]) if len(sys.argv) > 3 else "Manually resolved" + resolve_issue(sys.argv[2], resolution) + elif command == "close" and len(sys.argv) >= 3: + close_issue(sys.argv[2]) + else: + print(f"Unknown command: {command}") + sys.exit(1) + '') + + # Knowledge base CLI + (pkgs.writeScriptBin "macha-knowledge" '' + #!${pythonEnv}/bin/python3 + import sys + import os + os.environ["CHROMA_ENV_FILE"] = "" + os.environ["ANONYMIZED_TELEMETRY"] = "False" + sys.path.insert(0, "${./.}") + + from context_db import ContextDatabase + + db = ContextDatabase() + + def list_topics(category=None): + """List all knowledge topics""" + topics = db.list_knowledge_topics(category) + if not topics: + print("No knowledge topics found.") + return + + print(f"{'='*70}") + if category: + print(f"KNOWLEDGE TOPICS ({category.upper()}):") + else: + print(f"KNOWLEDGE TOPICS:") + print(f"{'='*70}") + + for topic in topics: + print(f" • {topic}") + + print(f"{'='*70}") + + def show_topic(topic): + """Show all knowledge for a topic""" + items = db.get_knowledge_by_topic(topic) + if not items: + print(f"No knowledge found for topic: {topic}") + return + + print(f"{'='*70}") + print(f"KNOWLEDGE: {topic}") + print(f"{'='*70}\n") + + for item in items: + print(f"ID: {item['id'][:8]}...") + print(f"Category: {item['category']}") + print(f"Source: {item['source']}") + print(f"Confidence: {item['confidence']}") + print(f"Created: 
{item['created_at']}") + print(f"Times Referenced: {item['times_referenced']}") + if item.get('tags'): + print(f"Tags: {', '.join(item['tags'])}") + print(f"\nKnowledge:") + print(f" {item['knowledge']}\n") + print(f"{'-'*70}\n") + + def search_knowledge(query, category=None): + """Search knowledge base""" + items = db.query_knowledge(query, category=category, limit=10) + if not items: + print(f"No knowledge found matching: {query}") + return + + print(f"{'='*70}") + print(f"SEARCH RESULTS: {query}") + if category: + print(f"Category Filter: {category}") + print(f"{'='*70}\n") + + for i, item in enumerate(items, 1): + print(f"[{i}] {item['topic']}") + print(f" Category: {item['category']} | Confidence: {item['confidence']}") + print(f" {item['knowledge'][:150]}...") + print() + + def add_knowledge(topic, knowledge, category="general"): + """Add new knowledge""" + kid = db.store_knowledge( + topic=topic, + knowledge=knowledge, + category=category, + source="user-provided", + confidence="high" + ) + if kid: + print(f"✓ Added knowledge for topic: {topic}") + print(f" ID: {kid[:8]}...") + else: + print(f"✗ Failed to add knowledge") + + def seed_initial(): + """Seed initial knowledge""" + print("Seeding initial knowledge from seed_knowledge.py...") + exec(open("${./.}/seed_knowledge.py").read()) + + # Main CLI + if len(sys.argv) < 2: + print("Usage: macha-knowledge <command> [options]") + print("") + print("Commands:") + print(" list List all knowledge topics") + print(" list <category> List topics in category") + print(" show <topic> Show all knowledge for a topic") + print(" search <query> Search knowledge base") + print(" search <query> <cat> Search in specific category") + print(" add <topic> <text> Add new knowledge") + print(" seed Seed initial knowledge") + print("") + print("Categories: command, pattern, troubleshooting, performance, general") + sys.exit(1) + + command = sys.argv[1] + + if command == "list": + category = sys.argv[2] if len(sys.argv) >= 3 else None + list_topics(category) + elif command == "show" and len(sys.argv) >= 3: + show_topic(sys.argv[2]) + elif command == "search" and len(sys.argv) >= 3: + query = sys.argv[2] + category = sys.argv[3] if len(sys.argv) >= 4 else None + search_knowledge(query, category) + elif command == "add" and len(sys.argv) >= 4: + topic = sys.argv[2] + knowledge = " ".join(sys.argv[3:]) + add_knowledge(topic, knowledge) + elif command == "seed": + seed_initial() + else: + print(f"Unknown command: {command}") + sys.exit(1) + '') + ]; + }; +} diff --git a/monitor.py b/monitor.py new file mode 100644 index 0000000..d4623ff --- /dev/null +++ b/monitor.py @@ -0,0 +1,291 @@ +#!/usr/bin/env python3 +""" +System Monitor - Collects health data from Macha +""" + +import json +import subprocess +import psutil +import time +from datetime import datetime +from pathlib import Path +from typing import Dict, List, Any + + +class SystemMonitor: + """Monitors system health and collects diagnostic data""" + + def __init__(self, state_dir: Path = Path("/var/lib/macha")): + self.state_dir = state_dir + self.state_dir.mkdir(parents=True, exist_ok=True) + + def collect_all(self) -> Dict[str, Any]: + """Collect all system health data""" + return { + "timestamp": datetime.now().isoformat(), + "systemd": self.check_systemd_services(), + "resources": self.check_resources(), + "disk": self.check_disk_usage(), + "logs": self.check_recent_errors(), + "nixos": self.check_nixos_status(), + "network": self.check_network(), + "boot": self.check_boot_status(), + } + + def 
check_systemd_services(self) -> Dict[str, Any]: + """Check status of all systemd services""" + try: + # Get failed services + result = subprocess.run( + ["systemctl", "--failed", "--no-pager", "--output=json"], + capture_output=True, + text=True, + timeout=10 + ) + + failed_services = [] + if result.returncode == 0 and result.stdout: + try: + failed_services = json.loads(result.stdout) + except json.JSONDecodeError: + pass + + # Get all services status + result = subprocess.run( + ["systemctl", "list-units", "--type=service", "--no-pager", "--output=json"], + capture_output=True, + text=True, + timeout=10 + ) + + all_services = [] + if result.returncode == 0 and result.stdout: + try: + all_services = json.loads(result.stdout) + except json.JSONDecodeError: + pass + + return { + "failed_count": len(failed_services), + "failed_services": failed_services, + "total_services": len(all_services), + "active_services": [s for s in all_services if s.get("active") == "active"], + } + except Exception as e: + return {"error": str(e)} + + def check_resources(self) -> Dict[str, Any]: + """Check CPU, RAM, and system resources""" + try: + cpu_percent = psutil.cpu_percent(interval=1) + memory = psutil.virtual_memory() + load_avg = psutil.getloadavg() + + return { + "cpu_percent": cpu_percent, + "cpu_count": psutil.cpu_count(), + "memory_percent": memory.percent, + "memory_available_gb": memory.available / (1024**3), + "memory_total_gb": memory.total / (1024**3), + "load_average": { + "1min": load_avg[0], + "5min": load_avg[1], + "15min": load_avg[2], + }, + "swap_percent": psutil.swap_memory().percent, + } + except Exception as e: + return {"error": str(e)} + + def check_disk_usage(self) -> Dict[str, Any]: + """Check disk usage for all mounted filesystems""" + try: + partitions = psutil.disk_partitions() + disk_info = [] + + for partition in partitions: + try: + usage = psutil.disk_usage(partition.mountpoint) + disk_info.append({ + "device": partition.device, + "mountpoint": partition.mountpoint, + "fstype": partition.fstype, + "percent_used": usage.percent, + "total_gb": usage.total / (1024**3), + "used_gb": usage.used / (1024**3), + "free_gb": usage.free / (1024**3), + }) + except PermissionError: + continue + + return {"partitions": disk_info} + except Exception as e: + return {"error": str(e)} + + def check_recent_errors(self) -> Dict[str, Any]: + """Check recent system logs for errors""" + try: + # Get errors from the last hour + result = subprocess.run( + ["journalctl", "-p", "err", "--since", "1 hour ago", "--no-pager", "-o", "json"], + capture_output=True, + text=True, + timeout=10 + ) + + errors = [] + if result.returncode == 0 and result.stdout: + for line in result.stdout.strip().split('\n'): + if line: + try: + errors.append(json.loads(line)) + except json.JSONDecodeError: + continue + + return { + "error_count_1h": len(errors), + "recent_errors": errors[-50:], # Last 50 errors + } + except Exception as e: + return {"error": str(e)} + + def check_nixos_status(self) -> Dict[str, Any]: + """Check NixOS generation and system info""" + try: + # Get current generation + result = subprocess.run( + ["nixos-version"], + capture_output=True, + text=True, + timeout=5 + ) + version = result.stdout.strip() if result.returncode == 0 else "unknown" + + # Get generation list + result = subprocess.run( + ["nix-env", "--list-generations", "-p", "/nix/var/nix/profiles/system"], + capture_output=True, + text=True, + timeout=10 + ) + + generations = result.stdout.strip() if result.returncode == 0 else "" + + 
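
Aside on `check_nixos_status()`: the generation list is kept as the raw text of `nix-env --list-generations`. If structured data were ever needed, each line of that output has the form `  42   2025-10-06 14:32:01   (current)`, so a small parser could sit on top. A sketch with a hypothetical helper that is not part of monitor.py:

```python
# Hypothetical helper (not in monitor.py): parse the raw text stored under
# "generations" by check_nixos_status(). Assumes the usual line format of
# `nix-env --list-generations`:
#   42   2025-10-06 14:32:01   (current)
from typing import Dict, List, Union

def parse_generations(raw: str) -> List[Dict[str, Union[int, str, bool]]]:
    parsed = []
    for line in raw.splitlines():
        parts = line.split()
        if len(parts) >= 3 and parts[0].isdigit():
            parsed.append({
                "number": int(parts[0]),           # generation number
                "date": f"{parts[1]} {parts[2]}",  # build date and time
                "current": "(current)" in line,    # currently active generation
            })
    return parsed
```
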
return { + "version": version, + "generations": generations, + "nix_store_size": self._get_nix_store_size(), + } + except Exception as e: + return {"error": str(e)} + + def _get_nix_store_size(self) -> str: + """Get Nix store size""" + try: + result = subprocess.run( + ["du", "-sh", "/nix/store"], + capture_output=True, + text=True, + timeout=30 + ) + if result.returncode == 0: + return result.stdout.split()[0] + except: + pass + return "unknown" + + def check_network(self) -> Dict[str, Any]: + """Check network connectivity""" + try: + # Check if we can reach the internet + result = subprocess.run( + ["ping", "-c", "1", "-W", "2", "8.8.8.8"], + capture_output=True, + timeout=5 + ) + internet_up = result.returncode == 0 + + # Get network interfaces + interfaces = {} + for iface, addrs in psutil.net_if_addrs().items(): + interfaces[iface] = [ + {"family": addr.family.name, "address": addr.address} + for addr in addrs + ] + + return { + "internet_reachable": internet_up, + "interfaces": interfaces, + } + except Exception as e: + return {"error": str(e)} + + def check_boot_status(self) -> Dict[str, Any]: + """Check boot and uptime information""" + try: + boot_time = datetime.fromtimestamp(psutil.boot_time()) + uptime_seconds = time.time() - psutil.boot_time() + + return { + "boot_time": boot_time.isoformat(), + "uptime_seconds": uptime_seconds, + "uptime_hours": uptime_seconds / 3600, + } + except Exception as e: + return {"error": str(e)} + + def save_snapshot(self, data: Dict[str, Any]): + """Save a snapshot of system state""" + snapshot_file = self.state_dir / f"snapshot_{int(time.time())}.json" + with open(snapshot_file, 'w') as f: + json.dump(data, f, indent=2) + + # Keep only last 100 snapshots + snapshots = sorted(self.state_dir.glob("snapshot_*.json")) + for old_snapshot in snapshots[:-100]: + old_snapshot.unlink() + + def get_summary(self, data: Dict[str, Any]) -> str: + """Generate human-readable summary of system state""" + lines = [] + lines.append(f"=== System Health Summary ({data['timestamp']}) ===\n") + + # Resources + res = data.get("resources", {}) + lines.append(f"CPU: {res.get('cpu_percent', 0):.1f}%") + lines.append(f"Memory: {res.get('memory_percent', 0):.1f}% ({res.get('memory_available_gb', 0):.1f}GB free)") + lines.append(f"Load: {res.get('load_average', {}).get('1min', 0):.2f}") + + # Disk + disk = data.get("disk", {}) + for part in disk.get("partitions", [])[:5]: # Top 5 partitions + lines.append(f"Disk {part['mountpoint']}: {part['percent_used']:.1f}% used ({part['free_gb']:.1f}GB free)") + + # Systemd + systemd = data.get("systemd", {}) + failed = systemd.get("failed_count", 0) + if failed > 0: + lines.append(f"\n⚠️ WARNING: {failed} failed services!") + for svc in systemd.get("failed_services", [])[:5]: + lines.append(f" - {svc.get('unit', 'unknown')}") + + # Errors + logs = data.get("logs", {}) + error_count = logs.get("error_count_1h", 0) + if error_count > 0: + lines.append(f"\n⚠️ {error_count} errors in last hour") + + # Network + net = data.get("network", {}) + if not net.get("internet_reachable", True): + lines.append("\n⚠️ WARNING: No internet connectivity!") + + return "\n".join(lines) + + +if __name__ == "__main__": + monitor = SystemMonitor() + data = monitor.collect_all() + monitor.save_snapshot(data) + print(monitor.get_summary(data)) + print(f"\nFull data saved to {monitor.state_dir}") diff --git a/notifier.py b/notifier.py new file mode 100644 index 0000000..8a62ca9 --- /dev/null +++ b/notifier.py @@ -0,0 +1,248 @@ +#!/usr/bin/env python3 +""" 
+Gotify Notifier - Send notifications to Gotify server +""" + +import requests +import os +from typing import Optional +from datetime import datetime + + +class GotifyNotifier: + """Send notifications to Gotify server""" + + # Priority levels + PRIORITY_LOW = 2 + PRIORITY_MEDIUM = 5 + PRIORITY_HIGH = 8 + + def __init__( + self, + gotify_url: Optional[str] = None, + gotify_token: Optional[str] = None + ): + """ + Initialize Gotify notifier + + Args: + gotify_url: URL to Gotify server (e.g. http://rhiannon:8181) + gotify_token: Application token from Gotify + """ + self.gotify_url = gotify_url or os.environ.get("GOTIFY_URL", "") + self.gotify_token = gotify_token or os.environ.get("GOTIFY_TOKEN", "") + self.enabled = bool(self.gotify_url and self.gotify_token) + + def send( + self, + title: str, + message: str, + priority: int = PRIORITY_MEDIUM, + extras: Optional[dict] = None + ) -> bool: + """ + Send a notification to Gotify + + Args: + title: Notification title + message: Notification message + priority: Priority level (2=low, 5=medium, 8=high) + extras: Optional extra data + + Returns: + True if successful, False otherwise + """ + if not self.enabled: + return False + + try: + url = f"{self.gotify_url}/message" + headers = { + "Authorization": f"Bearer {self.gotify_token}", + "Content-Type": "application/json" + } + + data = { + "title": title, + "message": message, + "priority": priority, + } + + if extras: + data["extras"] = extras + + response = requests.post( + url, + json=data, + headers=headers, + timeout=10 + ) + + return response.status_code == 200 + + except Exception as e: + # Fail silently - don't crash if Gotify is unavailable + print(f"Warning: Failed to send Gotify notification: {e}") + return False + + def notify_critical_issue(self, issue_description: str, details: str = ""): + """Send high-priority notification for critical issues""" + message = f"⚠️ Critical Issue Detected\n\n{issue_description}" + if details: + message += f"\n\nDetails:\n{details}" + + return self.send( + title="🚨 Macha: Critical Issue", + message=message, + priority=self.PRIORITY_HIGH + ) + + def notify_issue_created(self, issue_id: str, title: str, severity: str): + """Send notification when a new issue is created""" + severity_icons = { + "low": "ℹ️", + "medium": "⚠️", + "high": "🚨", + "critical": "🔴" + } + icon = severity_icons.get(severity, "⚠️") + + priority_map = { + "low": self.PRIORITY_LOW, + "medium": self.PRIORITY_MEDIUM, + "high": self.PRIORITY_HIGH, + "critical": self.PRIORITY_HIGH + } + priority = priority_map.get(severity, self.PRIORITY_MEDIUM) + + message = f"{icon} New Issue Tracked\n\nID: {issue_id}\nSeverity: {severity.upper()}\n\n{title}" + + return self.send( + title="📋 Macha: Issue Created", + message=message, + priority=priority + ) + + def notify_action_queued(self, action_description: str, risk_level: str): + """Send notification when action is queued for approval""" + emoji = "⚠️" if risk_level == "high" else "ℹ️" + message = ( + f"{emoji} Action Queued for Approval\n\n" + f"Action: {action_description}\n" + f"Risk Level: {risk_level}\n\n" + f"Use 'macha-approve list' to review" + ) + + priority = self.PRIORITY_HIGH if risk_level == "high" else self.PRIORITY_MEDIUM + + return self.send( + title="📋 Macha: Action Needs Approval", + message=message, + priority=priority + ) + + def notify_action_executed(self, action_description: str, success: bool, output: str = ""): + """Send notification when action is executed""" + if success: + emoji = "✅" + title_prefix = "Success" + else: + 
emoji = "❌" + title_prefix = "Failed" + + message = f"{emoji} Action {title_prefix}\n\n{action_description}" + if output: + message += f"\n\nOutput:\n{output[:500]}" # Limit output length + + priority = self.PRIORITY_HIGH if not success else self.PRIORITY_LOW + + return self.send( + title=f"{emoji} Macha: Action {title_prefix}", + message=message, + priority=priority + ) + + def notify_service_failure(self, service_name: str, details: str = ""): + """Send notification for service failures""" + message = f"🔴 Service Failed: {service_name}" + if details: + message += f"\n\nDetails:\n{details}" + + return self.send( + title="🔴 Macha: Service Failure", + message=message, + priority=self.PRIORITY_HIGH + ) + + def notify_health_summary(self, summary: str, status: str): + """Send periodic health summary""" + emoji = { + "healthy": "✅", + "attention_needed": "⚠️", + "intervention_required": "🚨" + }.get(status, "ℹ️") + + priority = { + "healthy": self.PRIORITY_LOW, + "attention_needed": self.PRIORITY_MEDIUM, + "intervention_required": self.PRIORITY_HIGH + }.get(status, self.PRIORITY_MEDIUM) + + return self.send( + title=f"{emoji} Macha: Health Check", + message=summary, + priority=priority + ) + + def send_system_discovered( + self, + hostname: str, + os_type: str, + role: str, + services_count: int + ): + """Send notification when a new system is discovered""" + message = ( + f"🔍 New System Auto-Discovered\n\n" + f"Hostname: {hostname}\n" + f"OS: {os_type.upper()}\n" + f"Role: {role}\n" + f"Services: {services_count} detected\n\n" + f"System has been registered and analyzed.\n" + f"Use 'macha-systems' to view all registered systems." + ) + + return self.send( + title="🌐 Macha: New System Discovered", + message=message, + priority=self.PRIORITY_MEDIUM + ) + + +if __name__ == "__main__": + import sys + + # Test the notifier + if len(sys.argv) < 3: + print("Usage: notifier.py <title> <message> [priority]") + print("Example: notifier.py 'Test' 'This is a test message' 5") + sys.exit(1) + + title = sys.argv[1] + message = sys.argv[2] + priority = int(sys.argv[3]) if len(sys.argv) > 3 else GotifyNotifier.PRIORITY_MEDIUM + + notifier = GotifyNotifier() + + if not notifier.enabled: + print("Error: Gotify not configured (GOTIFY_URL and GOTIFY_TOKEN required)") + sys.exit(1) + + success = notifier.send(title, message, priority) + + if success: + print("✅ Notification sent successfully") + else: + print("❌ Failed to send notification") + sys.exit(1) + diff --git a/ollama_queue.py b/ollama_queue.py new file mode 100644 index 0000000..70153cf --- /dev/null +++ b/ollama_queue.py @@ -0,0 +1,238 @@ +#!/usr/bin/env python3 +""" +Ollama Queue Handler - Serializes all LLM requests to prevent resource contention +""" + +import json +import time +import fcntl +import signal +from pathlib import Path +from typing import Dict, Any, Optional, Callable +from datetime import datetime +from enum import IntEnum + +class Priority(IntEnum): + """Request priority levels""" + INTERACTIVE = 0 # User requests (highest priority) + AUTONOMOUS = 1 # Background maintenance + BATCH = 2 # Low priority bulk operations + +class OllamaQueue: + """File-based queue for serializing Ollama requests""" + + def __init__(self, queue_dir: Path = Path("/var/lib/macha/queues/ollama")): + self.queue_dir = queue_dir + self.queue_dir.mkdir(parents=True, exist_ok=True) + self.pending_dir = self.queue_dir / "pending" + self.processing_dir = self.queue_dir / "processing" + self.completed_dir = self.queue_dir / "completed" + self.failed_dir = 
self.queue_dir / "failed" + + for dir in [self.pending_dir, self.processing_dir, self.completed_dir, self.failed_dir]: + dir.mkdir(parents=True, exist_ok=True) + + self.lock_file = self.queue_dir / "queue.lock" + self.running = False + + def submit( + self, + request_type: str, # "generate", "chat", "chat_with_tools" + payload: Dict[str, Any], + priority: Priority = Priority.INTERACTIVE, + callback: Optional[Callable] = None, + progress_callback: Optional[Callable] = None + ) -> str: + """Submit a request to the queue. Returns request ID.""" + request_id = f"{int(time.time() * 1000000)}_{priority.value}" + + request_data = { + "id": request_id, + "type": request_type, + "payload": payload, + "priority": priority.value, + "submitted_at": datetime.now().isoformat(), + "status": "pending" + } + + request_file = self.pending_dir / f"{request_id}.json" + request_file.write_text(json.dumps(request_data, indent=2)) + + return request_id + + def get_status(self, request_id: str) -> Dict[str, Any]: + """Get the status of a request""" + # Check pending + pending_file = self.pending_dir / f"{request_id}.json" + if pending_file.exists(): + data = json.loads(pending_file.read_text()) + # Calculate position in queue + position = self._get_queue_position(request_id) + return {"status": "pending", "position": position, "data": data} + + # Check processing + processing_file = self.processing_dir / f"{request_id}.json" + if processing_file.exists(): + data = json.loads(processing_file.read_text()) + return {"status": "processing", "data": data} + + # Check completed + completed_file = self.completed_dir / f"{request_id}.json" + if completed_file.exists(): + data = json.loads(completed_file.read_text()) + return {"status": "completed", "result": data.get("result"), "data": data} + + # Check failed + failed_file = self.failed_dir / f"{request_id}.json" + if failed_file.exists(): + data = json.loads(failed_file.read_text()) + return {"status": "failed", "error": data.get("error"), "data": data} + + return {"status": "not_found"} + + def _get_queue_position(self, request_id: str) -> int: + """Get position in queue (1-indexed)""" + pending_requests = sorted( + self.pending_dir.glob("*.json"), + key=lambda p: (int(p.stem.split('_')[1]), int(p.stem.split('_')[0])) # Sort by priority, then timestamp + ) + + for i, req_file in enumerate(pending_requests): + if req_file.stem == request_id: + return i + 1 + return 0 + + def wait_for_result( + self, + request_id: str, + timeout: int = 300, + poll_interval: float = 0.5, + progress_callback: Optional[Callable] = None + ) -> Dict[str, Any]: + """Wait for a request to complete and return the result""" + start_time = time.time() + last_status = None + + while time.time() - start_time < timeout: + status = self.get_status(request_id) + + # Report progress if status changed + if progress_callback and status != last_status: + if status["status"] == "pending": + progress_callback(f"Queued (position {status.get('position', '?')})") + elif status["status"] == "processing": + progress_callback("Processing...") + + last_status = status + + if status["status"] == "completed": + return status["result"] + elif status["status"] == "failed": + raise Exception(f"Request failed: {status.get('error')}") + elif status["status"] == "not_found": + raise Exception(f"Request {request_id} not found") + + time.sleep(poll_interval) + + raise TimeoutError(f"Request {request_id} timed out after {timeout}s") + + def start_worker(self, ollama_client): + """Start the queue worker (processes requests 
serially)""" + self.running = True + self.ollama_client = ollama_client + + # Set up signal handlers for graceful shutdown + signal.signal(signal.SIGTERM, self._shutdown_handler) + signal.signal(signal.SIGINT, self._shutdown_handler) + + print("[OllamaQueue] Worker started, processing requests...") + + while self.running: + try: + self._process_next_request() + except Exception as e: + print(f"[OllamaQueue] Error processing request: {e}") + + time.sleep(0.1) # Small sleep to prevent busy-waiting + + print("[OllamaQueue] Worker stopped") + + def _shutdown_handler(self, signum, frame): + """Handle shutdown signals""" + print(f"[OllamaQueue] Received signal {signum}, shutting down...") + self.running = False + + def _process_next_request(self): + """Process the next request in the queue""" + # Get pending requests sorted by priority + pending_requests = sorted( + self.pending_dir.glob("*.json"), + key=lambda p: (int(p.stem.split('_')[1]), int(p.stem.split('_')[0])) + ) + + if not pending_requests: + return + + next_request = pending_requests[0] + request_id = next_request.stem + + # Move to processing + request_data = json.loads(next_request.read_text()) + request_data["status"] = "processing" + request_data["started_at"] = datetime.now().isoformat() + + processing_file = self.processing_dir / f"{request_id}.json" + processing_file.write_text(json.dumps(request_data, indent=2)) + next_request.unlink() + + try: + # Process based on type + result = None + if request_data["type"] == "generate": + result = self.ollama_client.generate(request_data["payload"]) + elif request_data["type"] == "chat": + result = self.ollama_client.chat(request_data["payload"]) + elif request_data["type"] == "chat_with_tools": + result = self.ollama_client.chat_with_tools(request_data["payload"]) + else: + raise ValueError(f"Unknown request type: {request_data['type']}") + + # Move to completed + request_data["status"] = "completed" + request_data["completed_at"] = datetime.now().isoformat() + request_data["result"] = result + + completed_file = self.completed_dir / f"{request_id}.json" + completed_file.write_text(json.dumps(request_data, indent=2)) + processing_file.unlink() + + except Exception as e: + # Move to failed + request_data["status"] = "failed" + request_data["failed_at"] = datetime.now().isoformat() + request_data["error"] = str(e) + + failed_file = self.failed_dir / f"{request_id}.json" + failed_file.write_text(json.dumps(request_data, indent=2)) + processing_file.unlink() + + def cleanup_old_requests(self, max_age_seconds: int = 3600): + """Clean up completed/failed requests older than max_age_seconds""" + cutoff_time = time.time() - max_age_seconds + + for directory in [self.completed_dir, self.failed_dir]: + for request_file in directory.glob("*.json"): + # Extract timestamp from filename + timestamp = int(request_file.stem.split('_')[0]) / 1000000 + if timestamp < cutoff_time: + request_file.unlink() + + def get_queue_stats(self) -> Dict[str, Any]: + """Get queue statistics""" + return { + "pending": len(list(self.pending_dir.glob("*.json"))), + "processing": len(list(self.processing_dir.glob("*.json"))), + "completed": len(list(self.completed_dir.glob("*.json"))), + "failed": len(list(self.failed_dir.glob("*.json"))) + } + diff --git a/ollama_worker.py b/ollama_worker.py new file mode 100644 index 0000000..ee03932 --- /dev/null +++ b/ollama_worker.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python3 +""" +Ollama Queue Worker - Daemon that processes queued Ollama requests +""" + +import sys +import requests 
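
Before the worker implementation, it helps to see the client side of the queue end to end, using only the `OllamaQueue` methods shown above. The model name and prompt below are placeholders; the payload follows Ollama's `/api/chat` request format:

```python
# Client-side round trip through the file queue defined in ollama_queue.py.
# submit() drops a JSON file named "<microsecond-timestamp>_<priority>" into
# pending/; the worker always takes the lowest (priority, timestamp) pair, so
# INTERACTIVE requests jump ahead of AUTONOMOUS and BATCH work.
from ollama_queue import OllamaQueue, Priority

queue = OllamaQueue()
request_id = queue.submit(
    request_type="chat",
    payload={
        "model": "qwen3",  # placeholder; any model loaded into Ollama works
        "messages": [{"role": "user", "content": "Summarize recent sshd errors."}],
        "stream": False,
    },
    priority=Priority.INTERACTIVE,
)

# Polls pending/ -> processing/ -> completed/ until the worker finishes.
# Raises TimeoutError after `timeout` seconds, or Exception if the request
# ends up in failed/.
result = queue.wait_for_result(request_id, timeout=300, progress_callback=print)
print(result["message"]["content"])
```
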
+from pathlib import Path +from ollama_queue import OllamaQueue + +class OllamaClient: + """Simple Ollama API client for the queue worker""" + + def __init__(self, host: str = "http://localhost:11434"): + self.host = host + + def generate(self, payload: dict) -> dict: + """Call /api/generate""" + response = requests.post( + f"{self.host}/api/generate", + json=payload, + timeout=payload.get("timeout", 300), + stream=False + ) + response.raise_for_status() + return response.json() + + def chat(self, payload: dict) -> dict: + """Call /api/chat""" + response = requests.post( + f"{self.host}/api/chat", + json=payload, + timeout=payload.get("timeout", 300), + stream=False + ) + response.raise_for_status() + return response.json() + + def chat_with_tools(self, payload: dict) -> dict: + """Call /api/chat with tools (streaming or non-streaming)""" + import json + + # Check if streaming is requested + stream = payload.get("stream", False) + + response = requests.post( + f"{self.host}/api/chat", + json=payload, + timeout=payload.get("timeout", 300), + stream=stream + ) + response.raise_for_status() + + if not stream: + # Non-streaming: return response directly + return response.json() + + # Streaming: accumulate response + full_response = {"message": {"role": "assistant", "content": "", "tool_calls": []}} + + for line in response.iter_lines(): + if line: + chunk = json.loads(line) + + if "message" in chunk: + msg = chunk["message"] + # Preserve role from first chunk + if "role" in msg and not full_response["message"].get("role"): + full_response["message"]["role"] = msg["role"] + if "content" in msg: + full_response["message"]["content"] += msg["content"] + if "tool_calls" in msg: + full_response["message"]["tool_calls"].extend(msg["tool_calls"]) + + if chunk.get("done"): + full_response["done"] = True + # Copy any additional fields from final chunk + for key in chunk: + if key not in ("message", "done"): + full_response[key] = chunk[key] + break + + # Ensure role is set + if "role" not in full_response["message"]: + full_response["message"]["role"] = "assistant" + + return full_response + +def main(): + """Main entry point for the worker""" + print("Starting Ollama Queue Worker...") + + # Initialize queue and client + queue = OllamaQueue() + client = OllamaClient() + + # Cleanup old requests on startup + queue.cleanup_old_requests(max_age_seconds=3600) + + # Start processing + try: + queue.start_worker(client) + except KeyboardInterrupt: + print("\nShutting down gracefully...") + queue.running = False + + return 0 + +if __name__ == "__main__": + sys.exit(main()) + diff --git a/orchestrator.py b/orchestrator.py new file mode 100644 index 0000000..973ffdc --- /dev/null +++ b/orchestrator.py @@ -0,0 +1,1053 @@ +#!/usr/bin/env python3 +""" +Orchestrator - Main control loop for Macha's autonomous system +""" + +import json +import time +import signal +import sys +from pathlib import Path +from datetime import datetime +from typing import Dict, Any + +from monitor import SystemMonitor +from agent import MachaAgent +from executor import SafeExecutor +from notifier import GotifyNotifier +from context_db import ContextDatabase +from remote_monitor import RemoteMonitor +from config_parser import ConfigParser +from system_discovery import SystemDiscovery +from issue_tracker import IssueTracker +from typing import List + + +class MachaOrchestrator: + """Main orchestrator for autonomous system maintenance""" + + def __init__( + self, + check_interval: int = 300, # 5 minutes + autonomy_level: str = "suggest", + 
state_dir: Path = Path("/var/lib/macha"), + config_file: Path = Path("/etc/macha-autonomous/config.json"), + remote_systems: list = None + ): + self.check_interval = check_interval + self.autonomy_level = autonomy_level + self.state_dir = state_dir + self.config_file = config_file + self.running = False + self.remote_systems = remote_systems or [] + + # Set log file early so _log() works + self.log_file = self.state_dir / "orchestrator.log" + + # Load config if exists + self._load_config() + + # Initialize context database first + try: + self.context_db = ContextDatabase() + except Exception as e: + self._log(f"Warning: Could not connect to ChromaDB: {e}") + self._log("Continuing without context database") + self.context_db = None + + # Initialize config parser + self.config_parser = None + if self.context_db and self.config_repo: + try: + self.config_parser = ConfigParser(self.config_repo) + except Exception as e: + self._log(f"Warning: Could not initialize config parser: {e}") + + # Initialize components + self.monitor = SystemMonitor(state_dir) + self.agent = MachaAgent( + ollama_host=self.ollama_host, + model=self.model, + state_dir=state_dir, + context_db=self.context_db, + config_repo=self.config_repo, + config_branch=self.config_branch, + use_queue=True, + priority="AUTONOMOUS" + ) + self.executor = SafeExecutor( + state_dir=state_dir, + autonomy_level=self.autonomy_level, + agent=self.agent + ) + self.notifier = GotifyNotifier() + self.discovery = SystemDiscovery(domain="coven.systems") + self.issue_tracker = IssueTracker( + context_db=self.context_db, + log_dir=str(state_dir / "logs") + ) if self.context_db else None + + # Initialize system registry + if self.context_db: + try: + self._initialize_system_registry() + except Exception as e: + self._log(f"Warning: Could not initialize system registry: {e}") + + # Setup signal handlers + signal.signal(signal.SIGINT, self._signal_handler) + signal.signal(signal.SIGTERM, self._signal_handler) + + def _load_config(self): + """Load configuration from file""" + import os + + self.ollama_host = "http://localhost:11434" # Default + self.model = "gpt-oss:latest" # Default + + # Try to get flake URL from NH_FLAKE environment variable (set by nh tool) + nh_flake = os.environ.get("NH_FLAKE", "") + if nh_flake: + self.config_repo = nh_flake + self.config_branch = "main" # NH doesn't specify branch + else: + self.config_repo = "git+https://git.coven.systems/lily/nixos-servers" + self.config_branch = "main" + + if self.config_file.exists(): + try: + with open(self.config_file, 'r') as f: + config = json.load(f) + self.check_interval = config.get("check_interval", self.check_interval) + self.autonomy_level = config.get("autonomy_level", self.autonomy_level) + self.ollama_host = config.get("ollama_host", self.ollama_host) + self.model = config.get("model", self.model) + # Config file can override NH_FLAKE + self.config_repo = config.get("config_repo", self.config_repo) + self.config_branch = config.get("config_branch", self.config_branch) + self._log(f"Loaded config: model={self.model}, ollama_host={self.ollama_host}, repo={self.config_repo}") + except Exception as e: + self._log(f"Failed to load config: {e}") + + def _signal_handler(self, signum, frame): + """Handle shutdown signals""" + self._log(f"Received signal {signum}, shutting down gracefully...") + self.running = False + + def _log(self, message: str): + """Log a message""" + timestamp = datetime.now().isoformat() + log_line = f"[{timestamp}] {message}" + print(log_line) + + with 
open(self.log_file, 'a') as f: + f.write(log_line + '\n') + + def _initialize_system_registry(self): + """Initialize the system registry in ChromaDB""" + if not self.context_db: + return + + import socket + hostname = socket.gethostname() + + # Add FQDN + fqdn = f"{hostname}.coven.systems" + + # Register self (Macha) - discover local services + local_services = self._discover_local_services() + self._log(f"Registering {fqdn} with repo={self.config_repo}, branch={self.config_branch}") + self.context_db.register_system( + hostname=fqdn, + system_type="workstation", + services=local_services, + capabilities=["ai-inference", "system-orchestration", "log-aggregation"], + metadata={"role": "controller", "local": True}, + config_repo=self.config_repo, + config_branch=self.config_branch, + os_type="nixos" + ) + + # Register remote systems and discover their services + for remote in self.remote_systems: + remote_services = self._discover_remote_services(remote) + self.context_db.register_system( + hostname=remote, + system_type="server", + services=remote_services, + capabilities=[], + config_repo=self.config_repo, + config_branch=self.config_branch, + os_type="nixos" # Assume NixOS for now, will be detected during auto-discovery + ) + + self._log("System registry initialized") + + # Parse and store configuration files + self._parse_and_store_configs() + + def _discover_local_services(self) -> List[str]: + """Discover services running on local system""" + import subprocess + + services = set() + try: + # Get all active services + result = subprocess.run( + ["systemctl", "list-units", "--type=service", "--state=running", "--no-pager", "--no-legend"], + capture_output=True, + text=True, + timeout=10 + ) + + if result.returncode == 0: + for line in result.stdout.strip().split('\n'): + if line.strip(): + # Extract service name (first column) + service_name = line.split()[0].replace('.service', '') + + # Filter to interesting application services + if any(keyword in service_name.lower() for keyword in [ + 'ollama', 'chroma', 'autonomous', 'gotify', 'nextcloud', + 'prowlarr', 'radarr', 'sonarr', 'whisparr', 'lidarr', 'readarr', + 'sabnzbd', 'transmission', 'calibre', 'gpclient' + ]): + services.add(service_name) + + except Exception as e: + self._log(f"Warning: Could not discover local services: {e}") + + return sorted(services) + + def _discover_remote_services(self, hostname: str) -> List[str]: + """Discover services running on remote system via journal""" + if not hasattr(self, 'journal_monitor'): + from journal_monitor import JournalMonitor + self.journal_monitor = JournalMonitor() + + try: + services = self.journal_monitor.get_active_services(hostname) + self._log(f"Discovered {len(services)} services on {hostname}: {', '.join(services[:5])}") + return services + except Exception as e: + self._log(f"Warning: Could not discover services on {hostname}: {e}") + return [] + + def _update_service_registry(self): + """Periodically update the service registry with current running services""" + if not self.context_db: + return + + import socket + hostname = socket.gethostname() + fqdn = f"{hostname}.coven.systems" + + # Update local services + local_services = self._discover_local_services() + self.context_db.register_system( + hostname=fqdn, + system_type="workstation", + services=local_services, + capabilities=["ai-inference", "system-orchestration", "log-aggregation"], + metadata={"role": "controller", "local": True}, + config_repo=self.config_repo, + config_branch=self.config_branch, + os_type="nixos" + 
) + + # Update remote systems + for remote in self.remote_systems: + remote_services = self._discover_remote_services(remote) + if remote_services: + self.context_db.register_system( + hostname=remote, + system_type="server", + services=remote_services, + capabilities=[], + config_repo=self.config_repo, + config_branch=self.config_branch, + os_type="nixos" # Will be updated by auto-discovery + ) + + def _discover_new_systems(self): + """Discover new systems from journal logs and register them""" + if not self.context_db or not self.discovery: + return + + try: + # Get known systems from database + known_hostnames = self.context_db.get_known_hostnames() + + # Discover systems from journal (last 10 minutes) + discovered = self.discovery.discover_from_journal(since_minutes=10) + + # Filter to new systems only + new_systems = [h for h in discovered if h not in known_hostnames] + + if not new_systems: + return + + self._log(f"🔍 Discovered {len(new_systems)} new system(s): {', '.join(new_systems)}") + + # Get systems defined in flake for comparison + flake_systems = [] + if self.config_parser: + try: + flake_systems = self.config_parser.get_systems_from_flake() + self._log(f"Flake defines {len(flake_systems)} systems: {', '.join(flake_systems)}") + except Exception as e: + self._log(f"Could not get flake systems: {e}") + + # Process each new system separately + for hostname in new_systems: + try: + self._log(f"📡 Analyzing new system: {hostname}") + + # Check if system is defined in flake + short_hostname = hostname.split('.')[0] # Get 'rhiannon' from 'rhiannon.coven.systems' + in_flake = short_hostname in flake_systems + + if in_flake: + self._log(f" ✓ System IS defined in flake as '{short_hostname}'") + else: + self._log(f" ⚠ System NOT found in flake (unmanaged)") + + # Detect OS type + os_type = self.discovery.detect_os_type(hostname) + self._log(f" OS detected: {os_type.upper()}") + + # Profile the system + profile = self.discovery.profile_system(hostname, os_type) + + # Determine role + role = self.discovery.get_system_role(profile) + self._log(f" Role: {role}") + self._log(f" Services: {len(profile['services'])} discovered") + + # Register in database + self.context_db.register_system( + hostname=hostname, + system_type=role, + services=profile['services'], + capabilities=profile['capabilities'], + metadata={ + 'discovered_at': profile['discovered_at'], + 'hardware': profile.get('hardware', {}), + 'auto_discovered': True, + 'in_flake': in_flake, + 'flake_name': short_hostname if in_flake else None + }, + config_repo=self.config_repo if (os_type == 'nixos' and in_flake) else "", + config_branch=self.config_branch if (os_type == 'nixos' and in_flake) else "", + os_type=os_type + ) + + # Send notification (with flake info) + if self.notifier: + message = ( + f"🔍 New System Auto-Discovered\n\n" + f"Hostname: {hostname}\n" + f"OS: {os_type.upper()}\n" + f"Role: {role}\n" + f"Services: {len(profile['services'])} detected\n" + f"In Flake: {'✓ Yes' if in_flake else '✗ No (unmanaged)'}\n\n" + f"System has been registered and analyzed.\n" + f"Use 'macha-systems' to view all registered systems." 
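+                        # Note: the lines above are adjacent f-strings that
+                        # Python concatenates implicitly; each embeds its own
+                        # \n, so the notification renders as a short report.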
+ ) + self.notifier.send( + title="🌐 Macha: New System Discovered", + message=message, + priority=self.notifier.PRIORITY_MEDIUM + ) + + # Run separate analysis for this system (include flake status) + profile['in_flake'] = in_flake + profile['flake_name'] = short_hostname if in_flake else None + self._analyze_new_system(hostname, profile) + + except Exception as e: + self._log(f"❌ Error processing {hostname}: {e}") + + except Exception as e: + self._log(f"Error during system discovery: {e}") + + def _analyze_new_system(self, hostname: str, profile: Dict[str, Any]): + """Run a focused analysis on a newly discovered system""" + try: + self._log(f"🧠 Running AI analysis of {hostname}...") + + # Gather system context from ChromaDB + system_context = self.context_db.get_system_context(hostname) + + # Create analysis prompt focused on this specific system + in_flake = profile.get('in_flake', False) + flake_name = profile.get('flake_name', '') + + flake_status = "" + if in_flake: + flake_status = f"\n✓ This system IS defined in the flake as '{flake_name}'" + flake_status += f"\n You can review its intended configuration at: systems/{flake_name}.nix" + flake_status += f"\n Compare actual vs expected to identify drift." + else: + flake_status = f"\n⚠ This system is NOT in the flake (unmanaged system)" + flake_status += f"\n You cannot manage its NixOS configuration directly." + + analysis = self.agent._create_analysis_prompt({ + 'hostname': hostname, + 'os_type': profile['os_type'], + 'services': profile['services'], + 'capabilities': profile['capabilities'], + 'hardware': profile.get('hardware', {}), + 'discovered_at': profile['discovered_at'], + 'in_flake': in_flake, + 'flake_name': flake_name + }, system_context) + + # Get AI analysis + response = self.agent._query_ollama( + f"You have discovered a new system in your infrastructure. 
" + f"Review its profile and provide initial observations.\n\n" + f"{flake_status}\n\n{analysis}", + model=self.agent.model + ) + + if response: + self._log(f"📝 AI Analysis for {hostname}:") + self._log(response[:500]) # Log first 500 chars + + # Store this as a decision/observation + self.context_db.record_decision({ + 'type': 'system_discovery', + 'hostname': hostname, + 'analysis': response, + 'profile': profile + }) + + except Exception as e: + self._log(f"Warning: Could not analyze {hostname}: {e}") + + def _parse_and_store_configs(self): + """Parse repository and store config files in ChromaDB""" + if not self.config_parser or not self.context_db: + return + + try: + self._log("Parsing configuration repository...") + + # Ensure repository is up to date + if not self.config_parser.ensure_repo(): + self._log("Warning: Could not update config repository") + return + + # Get systems from flake + systems = self.config_parser.get_systems_from_flake() + self._log(f"Found {len(systems)} systems in flake: {', '.join(systems)}") + + # For each system, get its config files + for system_name in systems: + fqdn = f"{system_name}.coven.systems" + + config = self.config_parser.get_system_config(system_name) + if not config['main_file']: + continue + + # Update system with list of config files + self.context_db.update_system_config_files(fqdn, config['all_files']) + + # Store each config file in ChromaDB + for file_path in config['all_files']: + content = self.config_parser.read_file_content(file_path) + if content: + # Determine category from path + category = "unknown" + if file_path.startswith("apps/"): + category = "apps" + elif file_path.startswith("systems/"): + category = "systems" + elif file_path.startswith("osconfigs/"): + category = "osconfigs" + elif file_path.startswith("users/"): + category = "users" + + self.context_db.store_config_file( + file_path=file_path, + content=content, + category=category, + systems_using=[fqdn] + ) + + self._log(f"Configuration parsing complete") + + except Exception as e: + self._log(f"Error parsing configs: {e}") + import traceback + self._log(traceback.format_exc()) + + def _log_metrics(self, data: Dict[str, Any]): + """Log key metrics in a structured format for easy parsing""" + res = data.get("resources", {}) + systemd = data.get("systemd", {}) + logs = data.get("logs", {}) + disk = data.get("disk", {}) + + self._log("KEY METRICS:") + self._log(f" CPU Usage: {res.get('cpu_percent', 0):.1f}%") + self._log(f" Memory Usage: {res.get('memory_percent', 0):.1f}%") + self._log(f" Load Average: {res.get('load_average', {}).get('1min', 0):.2f}") + self._log(f" Failed Services: {systemd.get('failed_count', 0)}") + self._log(f" Errors (1h): {logs.get('error_count_1h', 0)}") + + # Disk usage for critical partitions + for part in disk.get("partitions", []): + if part.get("mountpoint") in ["/", "/home", "/var"]: + self._log(f" Disk {part['mountpoint']}: {part.get('percent_used', 0):.1f}% used") + + # Network status + net = data.get("network", {}) + internet_status = "✅ Connected" if net.get("internet_reachable") else "❌ Offline" + self._log(f" Internet: {internet_status}") + + def _review_open_issues(self, system_hostname: str): + """Review all open issues for this system and log status""" + if not self.issue_tracker: + return + + open_issues = self.issue_tracker.list_issues( + hostname=system_hostname, + status="open" + ) + + if not open_issues: + self._log("No open issues in tracker") + return + + self._log(f"\n{'='*60}") + self._log(f"OPEN ISSUES REVIEW 
({len(open_issues)} active)")
+        self._log(f"{'='*60}")
+
+        for issue in open_issues:
+            issue_id = issue['issue_id'][:8]  # Short ID
+            age_hours = self._calculate_issue_age(issue['created_at'])
+            inv_count = len(issue.get('investigations', []))
+            action_count = len(issue.get('actions', []))
+
+            self._log(f"\n  Issue {issue_id}: {issue['title']}")
+            self._log(f"    Severity: {issue['severity'].upper()}")
+            self._log(f"    Status: {issue['status']}")
+            self._log(f"    Age: {age_hours:.1f} hours")
+            self._log(f"    Activity: {inv_count} investigations, {action_count} actions")
+            self._log(f"    Description: {issue['description'][:100]}...")
+
+        self._log(f"{'='*60}\n")
+
+    def _track_or_update_issue(
+        self,
+        system_hostname: str,
+        issue_description: str,
+        severity: str = "medium"
+    ) -> str:
+        """
+        Find or create an issue for this problem.
+        Returns the issue_id, or None if the issue tracker is unavailable.
+        """
+        if not self.issue_tracker:
+            return None
+
+        # Try to find an existing issue first
+        title = issue_description[:100]  # Use first 100 chars as title
+        existing = self.issue_tracker.find_similar_issue(
+            hostname=system_hostname,
+            title=title,
+            description=issue_description
+        )
+
+        if existing:
+            issue_id = existing['issue_id']
+            self._log(f"Linked to existing issue: {issue_id[:8]}")
+            return issue_id
+
+        # Create new issue
+        issue_id = self.issue_tracker.create_issue(
+            hostname=system_hostname,
+            title=title,
+            description=issue_description,
+            severity=severity,
+            source="auto-detected"
+        )
+
+        self._log(f"Created new issue: {issue_id[:8]}")
+        if self.notifier:
+            self.notifier.notify_issue_created(
+                issue_id[:8],
+                title,
+                severity
+            )
+
+        return issue_id
+
+    def _link_action_to_issue(
+        self,
+        issue_id: str,
+        fix_proposal: Dict[str, Any],
+        execution_result: Dict[str, Any]
+    ):
+        """Link an investigation or fix action to an issue"""
+        if not self.issue_tracker or not issue_id:
+            return
+
+        action_type = fix_proposal.get('action_type', 'unknown')
+
+        if action_type == 'investigation':
+            self.issue_tracker.update_issue(
+                issue_id,
+                status="investigating",
+                investigation={
+                    "commands": fix_proposal.get('commands', []),
+                    "output": execution_result.get('output', ''),
+                    "success": execution_result.get('success', False),
+                    "diagnosis": fix_proposal.get('diagnosis', '')
+                }
+            )
+        else:
+            self.issue_tracker.update_issue(
+                issue_id,
+                status="fixing",
+                action={
+                    "proposed_action": fix_proposal.get('proposed_action', ''),
+                    "commands": fix_proposal.get('commands', []),
+                    "output": execution_result.get('output', ''),
+                    "success": execution_result.get('success', False),
+                    "risk_level": fix_proposal.get('risk_level', 'unknown')
+                }
+            )
+
+    def _auto_resolve_fixed_issues(self, system_hostname: str, detected_problems: List[str]):
+        """Auto-resolve issues that are no longer detected"""
+        if not self.issue_tracker:
+            return
+
+        resolved_count = self.issue_tracker.auto_resolve_if_fixed(
+            system_hostname,
+            detected_problems
+        )
+
+        if resolved_count > 0:
+            self._log(f"\n✅ Auto-resolved {resolved_count} issue(s) (problems no longer detected)")
+
+    def _calculate_issue_age(self, created_at: str) -> float:
+        """Calculate age of issue in hours"""
+        try:
+            from datetime import datetime
+            created = datetime.fromisoformat(created_at)
+            now = datetime.utcnow()
+            delta = now - created
+            return delta.total_seconds() / 3600
+        except (ValueError, TypeError):
+            # Malformed or missing timestamp; treat as brand new
+            return 0.0
+
+    def run_once(self) -> Dict[str, Any]:
+        """Run one maintenance cycle"""
+        self._log("=== Starting maintenance cycle ===")
+
+        # Get system hostname
+        import socket
+        hostname = socket.gethostname()
+        system_hostname = f"{hostname}.coven.systems"
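+        # Assumption: every managed host resolves under the
+        # <hostname>.coven.systems convention; the issue tracker and the
+        # ChromaDB registry key their records by this FQDN.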
f"{hostname}.coven.systems" + + # Review open issues before starting new checks + self._review_open_issues(system_hostname) + + # Discover new systems from journal logs + self._discover_new_systems() + + # Update service registry periodically (every 10th cycle to avoid overhead) + if not hasattr(self, '_cycle_count'): + self._cycle_count = 0 + self._cycle_count += 1 + + if self._cycle_count % 10 == 1: # First cycle and every 10th + self._update_service_registry() + + # Step 1: Monitor system + self._log("Collecting system health data...") + monitoring_data = self.monitor.collect_all() + self.monitor.save_snapshot(monitoring_data) + + # Print detailed summary + summary = self.monitor.get_summary(monitoring_data) + self._log(f"\n{'='*60}") + self._log("SYSTEM HEALTH SUMMARY") + self._log(f"{'='*60}") + self._log(summary) + self._log(f"{'='*60}\n") + + # Log key metrics for easy grepping + self._log_metrics(monitoring_data) + + # Step 2: Analyze with AI (with system context including git) + self._log("\nAnalyzing system state with AI...") + import socket + hostname = socket.gethostname() + fqdn = f"{hostname}.coven.systems" + analysis = self.agent.analyze_system_state( + monitoring_data, + system_hostname=fqdn, + git_context=self.git_context if hasattr(self, 'git_context') else None + ) + + self._log(f"\n{'='*60}") + self._log("AI ANALYSIS RESULTS") + self._log(f"{'='*60}") + self._log(f"Overall Status: {analysis.get('status', 'unknown').upper()}") + self._log(f"Assessment: {analysis.get('overall_assessment', 'No assessment')}") + + # Log detected issues + issues = analysis.get('issues', []) + if issues: + self._log(f"\nDetected {len(issues)} issue(s):") + for i, issue in enumerate(issues, 1): + severity = issue.get('severity', 'unknown') + category = issue.get('category', 'unknown') + description = issue.get('description', 'No description') + requires_action = issue.get('requires_action', False) + action_flag = "⚠️ ACTION REQUIRED" if requires_action else "ℹ️ Informational" + + self._log(f"\n Issue #{i}:") + self._log(f" Severity: {severity.upper()}") + self._log(f" Category: {category}") + self._log(f" Description: {description}") + self._log(f" {action_flag}") + else: + self._log("\n✅ No issues detected") + + # Log recommended actions + recommended_actions = analysis.get('recommended_actions', []) + if recommended_actions: + self._log(f"\nRecommended Actions ({len(recommended_actions)}):") + for action in recommended_actions: + self._log(f" - {action}") + + self._log(f"{'='*60}\n") + + # Send health summary notification for critical states + status = analysis.get('status', 'unknown') + if status == 'intervention_required': + self.notifier.notify_health_summary( + analysis.get('overall_assessment', 'System requires intervention'), + status + ) + + # Step 3: Handle issues + results = [] + issues_requiring_action = [ + issue for issue in analysis.get("issues", []) + if issue.get("requires_action", False) + ] + + if issues_requiring_action: + self._log(f"Found {len(issues_requiring_action)} issues requiring action") + + for issue in issues_requiring_action: + self._log(f"\n{'─'*60}") + self._log(f"Addressing issue: {issue['description']}") + + # Track or update issue in tracker + issue_id = self._track_or_update_issue( + system_hostname, + issue['description'], + severity=issue.get('severity', 'medium') + ) + + # Notify about critical issues + if issue.get('severity') == 'critical': + self.notifier.notify_critical_issue( + issue['description'], + f"Category: {issue.get('category', 'unknown')}" 
+ ) + + # Check for recent investigations of this issue + previous_investigations = [] + if self.context_db: + previous_investigations = self.context_db.get_recent_investigations( + issue["description"], + system_hostname, + hours=24 + ) + + # Get fix proposal from AI + if previous_investigations: + self._log(f"Found {len(previous_investigations)} previous investigation(s) for this issue") + self._log("Requesting AI fix proposal with investigation history...") + else: + self._log("Requesting AI fix proposal...") + + fix_proposal = self.agent.propose_fix( + issue["description"], + { + "monitoring_data": monitoring_data, + "issue": issue, + "previous_investigations": previous_investigations + } + ) + + # Log detailed fix proposal + self._log(f"\nAI FIX PROPOSAL:") + self._log(f" Diagnosis: {fix_proposal.get('diagnosis', 'No diagnosis')}") + self._log(f" Proposed Action: {fix_proposal.get('proposed_action', 'No proposal')}") + self._log(f" Action Type: {fix_proposal.get('action_type', 'unknown')}") + self._log(f" Risk Level: {fix_proposal.get('risk_level', 'unknown').upper()}") + + if fix_proposal.get('commands'): + self._log(f" Commands to execute:") + for cmd in fix_proposal.get('commands', []): + self._log(f" - {cmd}") + + if fix_proposal.get('reasoning'): + self._log(f" Reasoning: {fix_proposal.get('reasoning')}") + + if fix_proposal.get('rollback_plan'): + self._log(f" Rollback Plan: {fix_proposal.get('rollback_plan')}") + + # Execute or queue the fix + self._log("\nExecuting action...") + execution_result = self.executor.execute_action( + fix_proposal, + monitoring_data + ) + + # Log execution result + self._log(f"\nEXECUTION RESULT:") + self._log(f" Status: {execution_result.get('status', 'unknown').upper()}") + self._log(f" Executed: {'Yes' if execution_result.get('executed') else 'No'}") + + if execution_result.get('reason'): + self._log(f" Reason: {execution_result.get('reason')}") + + if execution_result.get('success') is not None: + success_icon = "✅" if execution_result.get('success') else "❌" + self._log(f" Success: {success_icon} {execution_result.get('success')}") + + if execution_result.get("output"): + self._log(f" Output: {execution_result['output']}") + + if execution_result.get("error"): + self._log(f" Error: {execution_result['error']}") + + # Link action to issue + self._link_action_to_issue(issue_id, fix_proposal, execution_result) + + # Store investigation results in ChromaDB + if (fix_proposal.get('action_type') == 'investigation' and + execution_result.get('executed') and + execution_result.get('output') and + self.context_db): + + try: + self.context_db.store_investigation( + system=system_hostname, + issue_description=issue["description"], + commands=fix_proposal.get('commands', []), + output=execution_result['output'] + ) + self._log("Investigation results stored in database") + except Exception as e: + self._log(f"Warning: Could not store investigation: {e}") + + # If this was an investigation that succeeded, analyze the results and propose actual fix + if (fix_proposal.get('action_type') == 'investigation' and + execution_result.get('executed') and + execution_result.get('success') and + execution_result.get('output')): + + self._log("\n" + "="*60) + self._log("INVESTIGATION COMPLETE - Analyzing results...") + self._log("="*60) + + # Build context with investigation results + investigation_context = { + "original_issue": issue["description"], + "investigation_output": execution_result['output'], + "monitoring_data": monitoring_data, + "issue": issue + } + + # 
Ask AI to propose actual fix based on investigation + self._log("Requesting AI to propose fix based on investigation findings...") + actual_fix_proposal = self.agent.propose_fix( + f"Based on investigation of: {issue['description']}\n\nInvestigation output:\n{execution_result['output'][:1000]}", + investigation_context + ) + + # Log the new fix proposal + self._log(f"\nFIX PROPOSAL BASED ON INVESTIGATION:") + self._log(f" Diagnosis: {actual_fix_proposal.get('diagnosis', 'No diagnosis')}") + self._log(f" Proposed Action: {actual_fix_proposal.get('proposed_action', 'No proposal')}") + self._log(f" Action Type: {actual_fix_proposal.get('action_type', 'unknown')}") + self._log(f" Risk Level: {actual_fix_proposal.get('risk_level', 'unknown').upper()}") + + if actual_fix_proposal.get('commands'): + self._log(f" Commands to execute:") + for cmd in actual_fix_proposal.get('commands', []): + self._log(f" - {cmd}") + + # Only proceed with non-investigation actions + if actual_fix_proposal.get('action_type') != 'investigation': + self._log("\nExecuting follow-up action...") + followup_result = self.executor.execute_action( + actual_fix_proposal, + monitoring_data + ) + + self._log(f"\nFOLLOW-UP EXECUTION RESULT:") + self._log(f" Status: {followup_result.get('status', 'unknown').upper()}") + self._log(f" Executed: {'Yes' if followup_result.get('executed') else 'No'}") + + if followup_result.get('status') == 'queued_for_approval': + self.notifier.notify_action_queued( + actual_fix_proposal.get('proposed_action', 'Unknown action'), + actual_fix_proposal.get('risk_level', 'unknown') + ) + elif followup_result.get('executed'): + self.notifier.notify_action_executed( + actual_fix_proposal.get('proposed_action', 'Unknown action'), + followup_result.get('success', False) + ) + + # Store the follow-up result instead + execution_result = followup_result + else: + self._log("\nAI still recommends investigation - no further action taken.") + + # Send notification based on execution result + if execution_result.get('status') == 'queued_for_approval': + self.notifier.notify_action_queued( + fix_proposal.get('proposed_action', 'Unknown action'), + fix_proposal.get('risk_level', 'unknown') + ) + elif execution_result.get('executed'): + self.notifier.notify_action_executed( + fix_proposal.get('proposed_action', 'Unknown action'), + execution_result.get('success', False), + execution_result.get('output', '') + ) + + results.append({ + "issue": issue, + "proposal": fix_proposal, + "execution": execution_result + }) + else: + self._log("No issues requiring immediate action") + + # Final summary + self._log(f"\n{'='*60}") + self._log("MAINTENANCE CYCLE COMPLETE") + self._log(f"{'='*60}") + self._log(f"Status: {analysis.get('status', 'unknown').upper()}") + self._log(f"Issues Found: {len(issues)}") + self._log(f"Actions Taken: {len(results)}") + if results: + executed = sum(1 for r in results if r.get('execution', {}).get('executed')) + queued = sum(1 for r in results if r.get('execution', {}).get('status') == 'queued_for_approval') + self._log(f" - Executed: {executed}") + self._log(f" - Queued for approval: {queued}") + + # Auto-resolve issues that are no longer detected + detected_problems = [issue['description'] for issue in analysis.get('issues', [])] + self._auto_resolve_fixed_issues(system_hostname, detected_problems) + + self._log(f"Next check in: {self.check_interval} seconds") + self._log(f"{'='*60}\n") + + return { + "timestamp": datetime.now().isoformat(), + "monitoring": monitoring_data, + "analysis": 
analysis, + "actions": results + } + + def run_continuous(self): + """Run continuous maintenance loop""" + self._log(f"Starting Macha Autonomous System Maintenance") + self._log(f"Autonomy level: {self.autonomy_level}") + self._log(f"Check interval: {self.check_interval} seconds") + self._log(f"State directory: {self.state_dir}") + + self.running = True + + while self.running: + try: + cycle_result = self.run_once() + + # Wait for next cycle + if self.running: + self._log(f"Waiting {self.check_interval} seconds until next check...") + time.sleep(self.check_interval) + + except KeyboardInterrupt: + break + except Exception as e: + self._log(f"ERROR in maintenance cycle: {e}") + import traceback + self._log(traceback.format_exc()) + + # Wait a bit before retrying after error + if self.running: + time.sleep(60) + + self._log("Macha Autonomous System Maintenance stopped") + + def run_daemon(self): + """Run as a background daemon""" + # TODO: Proper daemonization + self.run_continuous() + + +def main(): + """Main entry point""" + import argparse + + parser = argparse.ArgumentParser(description="Macha Autonomous System Maintenance") + parser.add_argument( + "--mode", + choices=["once", "continuous", "daemon"], + default="once", + help="Run mode" + ) + parser.add_argument( + "--autonomy", + choices=["observe", "suggest", "auto-safe", "auto-full"], + default="suggest", + help="Autonomy level" + ) + parser.add_argument( + "--interval", + type=int, + default=300, + help="Check interval in seconds (for continuous mode)" + ) + parser.add_argument( + "--config", + type=Path, + default=Path("/etc/macha-autonomous/config.json"), + help="Config file path" + ) + + args = parser.parse_args() + + orchestrator = MachaOrchestrator( + check_interval=args.interval, + autonomy_level=args.autonomy, + config_file=args.config + ) + + if args.mode == "once": + result = orchestrator.run_once() + print(json.dumps(result, indent=2)) + elif args.mode == "continuous": + orchestrator.run_continuous() + elif args.mode == "daemon": + orchestrator.run_daemon() + + +if __name__ == "__main__": + main() diff --git a/remote_monitor.py b/remote_monitor.py new file mode 100644 index 0000000..3f3e50e --- /dev/null +++ b/remote_monitor.py @@ -0,0 +1,263 @@ +#!/usr/bin/env python3 +""" +Remote Monitor - Collect system health data from remote NixOS systems via SSH +""" + +import json +import subprocess +from typing import Dict, Any, Optional +from pathlib import Path + + +class RemoteMonitor: + """Monitor remote systems via SSH""" + + def __init__(self, hostname: str, ssh_user: str = "root"): + """ + Initialize remote monitor + + Args: + hostname: Remote hostname or IP + ssh_user: SSH user (default: root for NixOS remote builds) + """ + self.hostname = hostname + self.ssh_user = ssh_user + self.ssh_target = f"{ssh_user}@{hostname}" + + def _run_remote_command(self, command: str, timeout: int = 30) -> tuple[bool, str, str]: + """ + Run a command on the remote system via SSH + + Args: + command: Command to run + timeout: Timeout in seconds + + Returns: + (success, stdout, stderr) + """ + try: + # Use sudo to run SSH as root (which has the keys) + ssh_cmd = [ + "sudo", "ssh", + "-o", "StrictHostKeyChecking=no", + "-o", "ConnectTimeout=10", + self.ssh_target, + command + ] + + result = subprocess.run( + ssh_cmd, + capture_output=True, + text=True, + timeout=timeout + ) + + return ( + result.returncode == 0, + result.stdout.strip(), + result.stderr.strip() + ) + + except subprocess.TimeoutExpired: + return False, "", f"Command timed out 
after {timeout}s" + except Exception as e: + return False, "", str(e) + + def check_connectivity(self) -> bool: + """Check if we can connect to the remote system""" + success, _, _ = self._run_remote_command("echo 'ping'") + return success + + def collect_resources(self) -> Dict[str, Any]: + """Collect CPU, memory, and load average""" + success, output, error = self._run_remote_command(""" + python3 -c " +import psutil, json +print(json.dumps({ + 'cpu_percent': psutil.cpu_percent(interval=1), + 'memory_percent': psutil.virtual_memory().percent, + 'load_average': { + '1min': psutil.getloadavg()[0], + '5min': psutil.getloadavg()[1], + '15min': psutil.getloadavg()[2] + } +})) +" + """) + + if success: + try: + return json.loads(output) + except json.JSONDecodeError: + return {} + return {} + + def collect_systemd_status(self) -> Dict[str, Any]: + """Collect systemd service status""" + success, output, error = self._run_remote_command( + "systemctl list-units --failed --no-pager --no-legend --output=json" + ) + + if success: + try: + failed_services = json.loads(output) if output else [] + return { + "failed_count": len(failed_services), + "failed_services": failed_services + } + except json.JSONDecodeError: + pass + + return {"failed_count": 0, "failed_services": []} + + def collect_disk_usage(self) -> Dict[str, Any]: + """Collect disk usage information""" + success, output, error = self._run_remote_command(""" + python3 -c " +import psutil, json +partitions = [] +for part in psutil.disk_partitions(): + try: + usage = psutil.disk_usage(part.mountpoint) + partitions.append({ + 'device': part.device, + 'mountpoint': part.mountpoint, + 'fstype': part.fstype, + 'total': usage.total, + 'used': usage.used, + 'free': usage.free, + 'percent_used': usage.percent + }) + except: + pass +print(json.dumps({'partitions': partitions})) +" + """) + + if success: + try: + return json.loads(output) + except json.JSONDecodeError: + return {"partitions": []} + return {"partitions": []} + + def collect_network_status(self) -> Dict[str, Any]: + """Check network connectivity""" + # If we can SSH to it, network is working + success, _, _ = self._run_remote_command("ping -c 1 -W 2 8.8.8.8") + + return { + "internet_reachable": success + } + + def collect_log_errors(self) -> Dict[str, Any]: + """Collect recent error logs""" + success, output, error = self._run_remote_command( + "journalctl --priority=err --since='1 hour ago' --output=json --no-pager | wc -l" + ) + + error_count = 0 + if success: + try: + error_count = int(output) + except ValueError: + pass + + return { + "error_count_1h": error_count, + "recent_errors": [] # Could expand this later + } + + def collect_all(self) -> Dict[str, Any]: + """Collect all monitoring data from remote system""" + + # First check if we can connect + if not self.check_connectivity(): + return { + "hostname": self.hostname, + "reachable": False, + "error": "Unable to connect via SSH" + } + + return { + "hostname": self.hostname, + "reachable": True, + "resources": self.collect_resources(), + "systemd": self.collect_systemd_status(), + "disk": self.collect_disk_usage(), + "network": self.collect_network_status(), + "logs": self.collect_log_errors(), + } + + def get_summary(self, data: Dict[str, Any]) -> str: + """Generate human-readable summary of remote system health""" + if not data.get("reachable", False): + return f"❌ {self.hostname}: Unreachable - {data.get('error', 'Unknown error')}" + + lines = [f"System: {self.hostname}"] + + # Resources + res = data.get("resources", {}) + 
if res: + lines.append( + f"Resources: CPU {res.get('cpu_percent', 0):.1f}%, " + f"Memory {res.get('memory_percent', 0):.1f}%, " + f"Load {res.get('load_average', {}).get('1min', 0):.2f}" + ) + + # Disk + disk = data.get("disk", {}) + max_usage = 0 + for part in disk.get("partitions", []): + if part.get("mountpoint") == "/": + max_usage = part.get("percent_used", 0) + break + if max_usage > 0: + lines.append(f"Disk: {max_usage:.1f}% used (/ partition)") + + # Services + systemd = data.get("systemd", {}) + failed_count = systemd.get("failed_count", 0) + if failed_count > 0: + lines.append(f"Services: {failed_count} failed") + for svc in systemd.get("failed_services", [])[:3]: + lines.append(f" - {svc.get('unit', 'unknown')}") + else: + lines.append("Services: All running") + + # Network + net = data.get("network", {}) + if net.get("internet_reachable"): + lines.append("Network: Internet reachable") + else: + lines.append("Network: ⚠️ No internet connectivity") + + # Logs + logs = data.get("logs", {}) + error_count = logs.get("error_count_1h", 0) + if error_count > 0: + lines.append(f"Recent logs: {error_count} errors in last hour") + + return "\n".join(lines) + + +if __name__ == "__main__": + import sys + + if len(sys.argv) < 2: + print("Usage: remote_monitor.py <hostname>") + print("Example: remote_monitor.py rhiannon") + sys.exit(1) + + hostname = sys.argv[1] + monitor = RemoteMonitor(hostname) + + print(f"Monitoring {hostname}...") + data = monitor.collect_all() + + print("\n" + "="*60) + print(monitor.get_summary(data)) + print("="*60) + print("\nFull data:") + print(json.dumps(data, indent=2)) + diff --git a/seed_knowledge.py b/seed_knowledge.py new file mode 100644 index 0000000..88b5046 --- /dev/null +++ b/seed_knowledge.py @@ -0,0 +1,128 @@ +#!/usr/bin/env python3 +""" +Seed initial operational knowledge into Macha's knowledge base +""" + +import sys +sys.path.insert(0, '.') + +from context_db import ContextDatabase + +def seed_knowledge(): + """Add foundational operational knowledge""" + db = ContextDatabase() + + knowledge_items = [ + # nh command knowledge + { + "topic": "nh os switch", + "knowledge": "NixOS rebuild command. Takes 1-5 minutes normally, up to 1 HOUR for major updates with many packages. DO NOT retry if slow - this is normal. Use -u flag to update flake inputs first. Can use --target-host and --hostname for remote deployment.", + "category": "command", + "source": "documentation", + "confidence": "high", + "tags": ["nixos", "rebuild", "deployment"] + }, + { + "topic": "nh os boot", + "knowledge": "NixOS rebuild for next boot only. Safer than 'switch' for high-risk changes - allows easy rollback. After 'nh os boot', need to reboot for changes to take effect. Use -u to update flake inputs.", + "category": "command", + "source": "documentation", + "confidence": "high", + "tags": ["nixos", "rebuild", "safety"] + }, + { + "topic": "nh remote deployment", + "knowledge": "Format: 'nh os switch -u --target-host=HOSTNAME --hostname=HOSTNAME'. Builds locally and deploys to remote. Much cleaner than SSH'ing to run commands. Uses root SSH keys for authentication.", + "category": "command", + "source": "documentation", + "confidence": "high", + "tags": ["nixos", "remote", "deployment"] + }, + + # Performance patterns + { + "topic": "build timeouts", + "knowledge": "System rebuilds can take 1 hour or more. Never retry builds prematurely - multiple simultaneous builds corrupt the Nix cache. Default timeout is 3600 seconds (1 hour). 
Be patient!", + "category": "performance", + "source": "experience", + "confidence": "high", + "tags": ["builds", "timeouts", "patience"] + }, + + # Nix store maintenance + { + "topic": "nix-store repair", + "knowledge": "Command: 'nix-store --verify --check-contents --repair'. Verifies and repairs Nix store integrity. WARNING: Can take HOURS on large stores. Only use when there's clear evidence of corruption (hash mismatches, sqlite errors). This is a LAST RESORT - most build failures are NOT corruption.", + "category": "troubleshooting", + "source": "documentation", + "confidence": "high", + "tags": ["nix-store", "repair", "corruption"] + }, + { + "topic": "nix cache corruption", + "knowledge": "Caused by interrupted builds or multiple simultaneous builds. Symptoms: hash mismatches, sqlite errors, corrupt database. Solution: 'nix-store --verify --check-contents --repair' but this takes hours. Prevention: Never retry build commands, use proper timeouts.", + "category": "troubleshooting", + "source": "experience", + "confidence": "high", + "tags": ["nix-store", "corruption", "builds"] + }, + + # systemd-journal-remote + { + "topic": "systemd-journal-remote errors", + "knowledge": "Common failure: missing output directory. systemd-journal-remote needs /var/log/journal/remote to exist with proper permissions (root:root, 755). Create it if missing, then restart the service.", + "category": "troubleshooting", + "source": "experience", + "confidence": "medium", + "tags": ["systemd", "journal", "logging"] + }, + + # SSH and remote access + { + "topic": "ssh-keygen", + "knowledge": "Generate SSH keys: 'ssh-keygen -t ed25519 -N \"\" -f ~/.ssh/id_ed25519'. Creates public key at ~/.ssh/id_ed25519.pub and private key at ~/.ssh/id_ed25519. Use -N \"\" for no passphrase.", + "category": "command", + "source": "documentation", + "confidence": "high", + "tags": ["ssh", "keys", "authentication"] + }, + + # General patterns + { + "topic": "command retries", + "knowledge": "NEVER automatically retry long-running commands like builds or system updates. If something times out, check if it's still running before retrying. Automatic retries can cause: corrupted state, wasted resources, conflicting operations.", + "category": "pattern", + "source": "experience", + "confidence": "high", + "tags": ["best-practices", "safety", "retries"] + }, + { + "topic": "conversation etiquette", + "knowledge": "Social responses like 'thank you', 'thanks', 'ok', 'great', 'nice' are acknowledgments, NOT requests. 
When user thanks you or acknowledges completion, respond conversationally - DO NOT re-execute tools or commands.", + "category": "pattern", + "source": "documentation", + "confidence": "high", + "tags": ["conversation", "etiquette", "ui"] + } + ] + + print("Seeding knowledge base...") + for item in knowledge_items: + kid = db.store_knowledge(**item) + if kid: + print(f" ✓ Added: {item['topic']}") + else: + print(f" ✗ Failed: {item['topic']}") + + print(f"\nSeeded {len(knowledge_items)} knowledge items!") + + # List all topics + print("\nAvailable knowledge topics:") + topics = db.list_knowledge_topics() + for topic in topics: + print(f" - {topic}") + + +if __name__ == "__main__": + seed_knowledge() + diff --git a/system_discovery.py b/system_discovery.py new file mode 100644 index 0000000..71b6cb1 --- /dev/null +++ b/system_discovery.py @@ -0,0 +1,209 @@ +#!/usr/bin/env python3 +""" +System Discovery - Auto-discover and profile systems from journal logs +""" + +import subprocess +import json +import re +from typing import Dict, List, Set, Optional, Any +from datetime import datetime +from pathlib import Path + + +class SystemDiscovery: + """Discover and profile new systems appearing in logs""" + + def __init__(self, domain: str = "coven.systems"): + self.domain = domain + self.known_systems: Set[str] = set() + + def discover_from_journal(self, since_minutes: int = 10) -> List[str]: + """Discover systems that have sent logs recently""" + try: + # Query systemd-journal-remote logs for remote hostnames + result = subprocess.run( + ["journalctl", "-u", "systemd-journal-remote.service", + f"--since={since_minutes} minutes ago", "--no-pager"], + capture_output=True, + text=True, + timeout=30 + ) + + # Also check journal for _HOSTNAME field (from remote logs) + result2 = subprocess.run( + ["journalctl", f"--since={since_minutes} minutes ago", + "-o", "json", "--no-pager"], + capture_output=True, + text=True, + timeout=30 + ) + + hostnames = set() + + # Parse JSON output for _HOSTNAME field + for line in result2.stdout.split('\n'): + if not line.strip(): + continue + try: + entry = json.loads(line) + hostname = entry.get('_HOSTNAME') + if hostname and hostname not in ['localhost', 'macha']: + # Convert short hostname to FQDN if needed + if '.' 
not in hostname: + hostname = f"{hostname}.{self.domain}" + hostnames.add(hostname) + except: + pass + + return list(hostnames) + + except Exception as e: + print(f"Error discovering from journal: {e}") + return [] + + def detect_os_type(self, hostname: str) -> str: + """Detect the operating system of a remote host via SSH""" + try: + # Try to detect OS via SSH + result = subprocess.run( + ["ssh", "-o", "ConnectTimeout=5", "-o", "StrictHostKeyChecking=no", + hostname, "cat /etc/os-release"], + capture_output=True, + text=True, + timeout=10 + ) + + if result.returncode == 0: + os_release = result.stdout.lower() + + # Parse os-release + if 'nixos' in os_release: + return 'nixos' + elif 'ubuntu' in os_release: + return 'ubuntu' + elif 'debian' in os_release: + return 'debian' + elif 'arch' in os_release or 'manjaro' in os_release: + return 'arch' + elif 'fedora' in os_release: + return 'fedora' + elif 'centos' in os_release or 'rhel' in os_release: + return 'rhel' + elif 'alpine' in os_release: + return 'alpine' + + # Try uname for other systems + result = subprocess.run( + ["ssh", "-o", "ConnectTimeout=5", "-o", "StrictHostKeyChecking=no", + hostname, "uname -s"], + capture_output=True, + text=True, + timeout=10 + ) + + if result.returncode == 0: + uname = result.stdout.strip().lower() + if 'darwin' in uname: + return 'macos' + elif 'freebsd' in uname: + return 'freebsd' + + return 'linux' # Generic fallback + + except Exception as e: + print(f"Could not detect OS for {hostname}: {e}") + return 'unknown' + + def profile_system(self, hostname: str, os_type: str) -> Dict[str, Any]: + """Gather comprehensive information about a system""" + profile = { + 'hostname': hostname, + 'os_type': os_type, + 'services': [], + 'capabilities': [], + 'hardware': {}, + 'discovered_at': datetime.now().isoformat() + } + + try: + # Discover running services + if os_type in ['nixos', 'ubuntu', 'debian', 'arch', 'fedora', 'rhel', 'alpine']: + # Systemd-based systems + result = subprocess.run( + ["ssh", "-o", "ConnectTimeout=5", hostname, + "systemctl list-units --type=service --state=running --no-pager --no-legend"], + capture_output=True, + text=True, + timeout=15 + ) + + if result.returncode == 0: + for line in result.stdout.split('\n'): + if line.strip(): + # Extract service name (first column) + service = line.split()[0] + if service.endswith('.service'): + service = service[:-8] # Remove .service suffix + profile['services'].append(service) + + # Get hardware info + result = subprocess.run( + ["ssh", "-o", "ConnectTimeout=5", hostname, + "nproc && free -g | grep Mem | awk '{print $2}'"], + capture_output=True, + text=True, + timeout=10 + ) + + if result.returncode == 0: + lines = result.stdout.strip().split('\n') + if len(lines) >= 2: + profile['hardware']['cpu_cores'] = lines[0].strip() + profile['hardware']['memory_gb'] = lines[1].strip() + + # Detect capabilities based on services + services_str = ' '.join(profile['services']) + + if 'docker' in services_str or 'containerd' in services_str: + profile['capabilities'].append('containers') + + if 'nginx' in services_str or 'apache' in services_str or 'httpd' in services_str: + profile['capabilities'].append('web-server') + + if 'postgresql' in services_str or 'mysql' in services_str or 'mariadb' in services_str: + profile['capabilities'].append('database') + + if 'sshd' in services_str: + profile['capabilities'].append('remote-access') + + # NixOS-specific: Check if it's in our flake + if os_type == 'nixos': + profile['capabilities'].append('nixos-managed') 
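+            # The capability tags above are keyword heuristics over the
+            # discovered service list; get_system_role() turns them into a
+            # coarse role label, so treat them as hints, not ground truth.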
+ + except Exception as e: + print(f"Error profiling {hostname}: {e}") + + return profile + + def get_system_role(self, profile: Dict[str, Any]) -> str: + """Determine system role based on profile""" + capabilities = profile.get('capabilities', []) + services = profile.get('services', []) + + # Check for specific roles + if 'ai-inference' in capabilities or 'ollama' in services: + return 'ai-workstation' + elif 'web-server' in capabilities: + return 'web-server' + elif 'database' in capabilities: + return 'database-server' + elif 'containers' in capabilities: + return 'container-host' + elif len(services) > 20: + return 'server' + elif len(services) > 5: + return 'workstation' + else: + return 'minimal' + diff --git a/system_prompt.txt b/system_prompt.txt new file mode 100644 index 0000000..3df379f --- /dev/null +++ b/system_prompt.txt @@ -0,0 +1,131 @@ +You are Macha, an autonomous AI system maintenance agent running on NixOS. + +IDENTITY: +- You are intelligent, careful, methodical, and motherly +- You have access to system monitoring data, configuration files, and investigation results +- You can propose fixes, but humans must approve risky changes + +YOUR ARCHITECTURE: +- You run as a systemd service (macha-autonomous.service) on the macha.coven.systems host +- You are monitoring the SAME SYSTEM you are running on (macha.coven.systems) +- Your inference engine is Ollama, running locally at http://localhost:11434 +- You are powered by the gpt-oss:latest language model (GPT-like open source model) +- Your database is ChromaDB, running at http://localhost:8000 +- All your components (orchestrator, agent, ChromaDB, Ollama) run on the same machine +- You can investigate and fix issues with your own infrastructure +- Be aware: if you break the system, you break yourself +- SELF-DIAGNOSTIC: In chat mode, if your inference fails, you automatically diagnose: + * Ollama service status + * Memory usage + * Which models are loaded + * Recent Ollama logs + +EXECUTION CONTEXT: +- In autonomous mode: You run as the 'macha' user (unprivileged, UID 2501) +- In chat mode: You run as the invoking user (usually has sudo access) +- IMPORTANT: You do NOT need to add 'sudo' to commands in chat mode +- The system automatically retries commands with sudo if permission is denied +- Just use the command directly: 'reboot', 'systemctl restart X', 'nh os switch', etc. +- The user will see a notification if the command was retried with elevated privileges + +CONVERSATIONAL ETIQUETTE: +- Recognize social responses: "thank you", "thanks", "ok", "great", "nice" etc. are acknowledgments, NOT requests +- When the user thanks you or acknowledges completion, simply respond conversationally - DO NOT re-execute tools +- Only use tools when the user makes an actual request or asks a question requiring information +- If a task is complete and the user acknowledges it, the conversation is done - just say "You're welcome!" or similar + +CORE PRINCIPLES: +1. CONSERVATIVE: When in doubt, investigate before acting +2. DECLARATIVE: Prefer NixOS configuration changes over imperative commands +3. SAFE: Never disable critical services (SSH, networking, systemd, boot) +4. INFORMED: Use previous investigation results to avoid repetition +5. 
CONTEXTUAL: Reference actual configuration files when available + +RISK LEVELS: +- LOW: Investigation commands (systemctl status, journalctl, ls, cat, grep) +- MEDIUM: Service restarts, configuration changes, cleanup +- HIGH: System rebuilds, package changes, network reconfigurations + +AUTO-APPROVAL: +- Low-risk investigation actions are automatically executed +- Medium/high-risk actions require human approval + +CONFIGURATION: +- This system uses NixOS flakes for configuration management +- Config changes must specify the actual .nix file in the repository +- Example: autonomous/module.nix, apps/gotify.nix, or systems/macha.nix +- NEVER reference /etc/nixos/configuration.nix (this system doesn't use it) +- You cannot directly edit the flake, only suggest changes to get pushed to the repo + +SYSTEM MANAGEMENT COMMANDS: +- CRITICAL: This system uses 'nh' (a modern nixos-rebuild wrapper) for all rebuilds +- 'nh' is a wrapper around nixos-rebuild that provides better UX and flake auto-detection +- The flake URL is auto-detected from programs.nh.flake (no need to specify it) + +Available nh commands (USE THESE, NOT nixos-rebuild): + * 'nh os switch' - Rebuild and activate immediately (replaces: nixos-rebuild switch) + * 'nh os switch -u' - Update flake inputs first, then rebuild/activate + * 'nh os boot' - Rebuild for next boot only (replaces: nixos-rebuild boot) + * 'nh os test' - Activate temporarily without setting as default + +MULTI-HOST MANAGEMENT: +You manage multiple hosts in the infrastructure. You have TWO tools for remote operations: + +1. SSH - For diagnostics, monitoring, and status checks: + - You CAN and SHOULD use SSH to check other hosts + - Examples: 'ssh rhiannon systemctl status ollama', 'ssh alexander df -h' + - Commands are automatically run with sudo as the macha user + - Use for: checking services, reading logs, gathering metrics, quick diagnostics + - Hosts available: rhiannon, alexander, UCAR-Kinston, test-vm + +2. nh remote deployment - For NixOS configuration changes: + - Format: 'nh os switch -u --target-host=HOSTNAME --hostname=HOSTNAME' + - Examples: + * 'nh os switch -u --target-host=rhiannon --hostname=rhiannon' + * 'nh os boot -u --target-host=alexander --hostname=alexander' + - Builds configuration locally, deploys to remote host + - Use for: permanent configuration changes, service updates, system modifications + +When asked to check on another host, USE SSH. When asked to update configuration, use nh. 
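+
+Worked example (illustrative):
+  * "Is ollama healthy on rhiannon?" -> ssh rhiannon systemctl status ollama  (diagnostic: use SSH)
+  * "Roll the new config out to rhiannon" -> nh os switch -u --target-host=rhiannon --hostname=rhiannon  (configuration: use nh)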
+ +NOTIFICATIONS: +- You can send notifications to the user via Gotify using the send_notification tool +- Use notifications to inform the user about important events, especially when they're not actively chatting +- Notification priorities: + * Priority 2 (Low): Informational updates, routine completions, FYI items + * Priority 5 (Medium): Actions needing attention, warnings, manual approval requests + * Priority 8 (High): Critical issues, service failures, urgent problems requiring immediate attention +- When to send notifications: + * Critical issues detected (priority 8) + * Service failures or degraded states (priority 8) + * Actions queued for manual approval (priority 5) + * Successful completion of important actions (priority 2) + * When user explicitly asks for a notification +- Keep titles brief and messages clear and actionable +- Example: send_notification("Service Alert", "Ollama service crashed and was restarted", 8) + +PATIENCE WITH LONG-RUNNING OPERATIONS: +- System rebuilds take time: 1-5 minutes normally, up to 1 HOUR for major updates +- DO NOT retry build commands if they're taking a while - this is NORMAL +- Multiple simultaneous builds will corrupt the Nix cache +- If a build times out, check if it's still running before retrying +- Default timeout is 1 hour (3600 seconds) - this is appropriate for most operations +- Trust the timeout - if a command is still running, it will complete or fail on its own + +NIX STORE MAINTENANCE: +- If builds fail with corruption errors, use: 'nix-store --verify --check-contents --repair' +- This command verifies and repairs the Nix store integrity +- WARNING: Store repair can take a LONG time (potentially hours on large stores) +- Only run store repair when there's clear evidence of corruption (e.g., hash mismatches, sqlite errors) +- Store repair is a last resort - most build failures are NOT corruption + +Risk-based command selection: + * HIGH-RISK changes: Use 'nh os boot' + 'reboot' (allows easy rollback) + * MEDIUM-RISK changes: Use 'nh os switch' + * LOW-RISK changes: Use 'nh os switch' + +FORBIDDEN COMMANDS: + * NEVER suggest 'nixos-rebuild' - it doesn't know the flake path + * NEVER suggest 'nixos-rebuild switch --flake .#macha' - use 'nh os switch' instead + * NEVER suggest 'sudo nixos-rebuild' commands - nh handles privileges correctly + diff --git a/tools.py b/tools.py new file mode 100644 index 0000000..5b53ab1 --- /dev/null +++ b/tools.py @@ -0,0 +1,705 @@ +#!/usr/bin/env python3 +""" +Tool Definitions - Functions that the AI can call to interact with the system +""" + +import subprocess +import json +import os +from typing import Dict, Any, List, Optional +from pathlib import Path + + +class SysadminTools: + """Collection of tools for system administration tasks""" + + def __init__(self, safe_mode: bool = True): + """ + Initialize sysadmin tools + + Args: + safe_mode: If True, restricts dangerous operations + """ + self.safe_mode = safe_mode + self.allowed_commands = [ + 'systemctl', 'journalctl', 'free', 'df', 'uptime', + 'ps', 'top', 'ip', 'ss', 'cat', 'ls', 'grep', + 'ping', 'dig', 'nslookup', 'curl', 'wget', + 'lscpu', 'lspci', 'lsblk', 'lshw', 'dmidecode', + 'ssh', 'scp', # Remote access to other systems in infrastructure + 'nh', 'nixos-rebuild', # NixOS system management + 'reboot', 'shutdown', 'poweroff', # System power management + 'logger' # Logging for notifications + ] + + def get_tool_definitions(self) -> List[Dict[str, Any]]: + """ + Return tool definitions in Ollama's format + + Returns: + List of tool 
definitions with JSON schema + """ + return [ + { + "type": "function", + "function": { + "name": "execute_command", + "description": "Execute a shell command on the system. Use this to run system commands, check status, or gather information. Returns command output.", + "parameters": { + "type": "object", + "properties": { + "command": { + "type": "string", + "description": "The shell command to execute (e.g., 'systemctl status ollama', 'df -h', 'journalctl -u myservice -n 20')" + }, + "timeout": { + "type": "integer", + "description": "Command timeout in seconds (default: 3600). System rebuilds can take 1-5 minutes normally, up to 1 hour for major updates. Be patient!", + "default": 3600 + } + }, + "required": ["command"] + } + } + }, + { + "type": "function", + "function": { + "name": "read_file", + "description": "Read the contents of a file from the filesystem. Use this to inspect configuration files, logs, or other text files.", + "parameters": { + "type": "object", + "properties": { + "file_path": { + "type": "string", + "description": "Absolute path to the file to read (e.g., '/etc/nixos/configuration.nix', '/var/log/syslog')" + }, + "max_lines": { + "type": "integer", + "description": "Maximum number of lines to read (default: 500)", + "default": 500 + } + }, + "required": ["file_path"] + } + } + }, + { + "type": "function", + "function": { + "name": "check_service_status", + "description": "Check the status of a systemd service. Returns whether the service is active, enabled, and recent log entries.", + "parameters": { + "type": "object", + "properties": { + "service_name": { + "type": "string", + "description": "Name of the systemd service (e.g., 'ollama.service', 'nginx', 'sshd')" + } + }, + "required": ["service_name"] + } + } + }, + { + "type": "function", + "function": { + "name": "view_logs", + "description": "View systemd journal logs. Can filter by unit, time period, or priority.", + "parameters": { + "type": "object", + "properties": { + "unit": { + "type": "string", + "description": "Systemd unit name to filter logs (e.g., 'ollama.service')" + }, + "lines": { + "type": "integer", + "description": "Number of recent log lines to return (default: 50)", + "default": 50 + }, + "priority": { + "type": "string", + "description": "Filter by priority: emerg, alert, crit, err, warning, notice, info, debug", + "enum": ["emerg", "alert", "crit", "err", "warning", "notice", "info", "debug"] + } + } + } + } + }, + { + "type": "function", + "function": { + "name": "get_system_metrics", + "description": "Get current system resource metrics including CPU, memory, disk, and load average.", + "parameters": { + "type": "object", + "properties": {} + } + } + }, + { + "type": "function", + "function": { + "name": "get_hardware_info", + "description": "Get detailed hardware information including CPU model, GPU, network interfaces, storage devices, and memory specs. Returns comprehensive hardware inventory.", + "parameters": { + "type": "object", + "properties": {} + } + } + }, + { + "type": "function", + "function": { + "name": "get_gpu_metrics", + "description": "Get GPU temperature, utilization, clock speeds, and power usage. Works with AMD and NVIDIA GPUs. Returns current GPU metrics.", + "parameters": { + "type": "object", + "properties": {} + } + } + }, + { + "type": "function", + "function": { + "name": "list_directory", + "description": "List contents of a directory. 
Returns file names, sizes, and permissions.",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "directory_path": {
+                                "type": "string",
+                                "description": "Absolute path to the directory (e.g., '/etc', '/var/log')"
+                            },
+                            "show_hidden": {
+                                "type": "boolean",
+                                "description": "Include hidden files (starting with dot)",
+                                "default": False
+                            }
+                        },
+                        "required": ["directory_path"]
+                    }
+                }
+            },
+            {
+                "type": "function",
+                "function": {
+                    "name": "check_network",
+                    "description": "Test network connectivity to a host. Can use ping or HTTP check.",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "host": {
+                                "type": "string",
+                                "description": "Hostname or IP address to check (e.g., 'google.com', '8.8.8.8')"
+                            },
+                            "method": {
+                                "type": "string",
+                                "description": "Test method to use",
+                                "enum": ["ping", "http"],
+                                "default": "ping"
+                            }
+                        },
+                        "required": ["host"]
+                    }
+                }
+            },
+            {
+                "type": "function",
+                "function": {
+                    "name": "retrieve_cached_output",
+                    "description": "Retrieve full cached output from a previous tool call. Use this when you need to see complete data that was summarized earlier. The cache_id is shown in hierarchical summaries.",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "cache_id": {
+                                "type": "string",
+                                "description": "Cache ID from a previous tool summary (e.g., 'view_logs_20251006_103045')"
+                            },
+                            "max_chars": {
+                                "type": "integer",
+                                "description": "Maximum characters to return (default: 10000 for focused analysis)",
+                                "default": 10000
+                            }
+                        },
+                        "required": ["cache_id"]
+                    }
+                }
+            },
+            {
+                "type": "function",
+                "function": {
+                    "name": "send_notification",
+                    "description": "Send a notification to the user via Gotify. Use this to alert the user about important events, issues, or completed actions. Choose appropriate priority based on urgency.",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "title": {
+                                "type": "string",
+                                "description": "Notification title (brief, e.g., 'Service Alert', 'Action Complete')"
+                            },
+                            "message": {
+                                "type": "string",
+                                "description": "Notification message body (detailed description of the event)"
+                            },
+                            "priority": {
+                                "type": "integer",
+                                "description": "Priority level: 2=Low (info), 5=Medium (attention needed), 8=High (critical/urgent)",
+                                "enum": [2, 5, 8],
+                                "default": 5
+                            }
+                        },
+                        "required": ["title", "message"]
+                    }
+                }
+            }
+        ]
+
+    def execute_command(self, command: str, timeout: int = 3600) -> Dict[str, Any]:
+        """Execute a shell command safely (default timeout: 1 hour for system operations)"""
+        # Safety check in safe mode
+        if self.safe_mode:
+            cmd_base = command.split()[0] if command.strip() else ""
+            if cmd_base not in self.allowed_commands:
+                return {
+                    "success": False,
+                    "error": f"Command '{cmd_base}' not in allowed list (safe mode enabled)",
+                    "allowed_commands": self.allowed_commands
+                }
+
+        # Automatically configure SSH commands to use the macha user on remote systems
+        # Transform: ssh hostname cmd -> ssh macha@hostname sudo cmd
+        # Skip commands that already name a user, or that start with ssh options.
+        tokens = command.split()
+        if (len(tokens) >= 2 and tokens[0] == 'ssh'
+                and not tokens[1].startswith('-') and '@' not in tokens[1]):
+            parts = command.split(maxsplit=2)
+            hostname = parts[1]
+            remaining = parts[2] if len(parts) > 2 else ''
+            # If there's a command to run remotely, prefix it with sudo
+            if remaining:
+                command = f"ssh macha@{hostname} sudo {remaining}"
+            else:
+                command = f"ssh macha@{hostname}"
+
+        try:
+            result = subprocess.run(
+                command,
+                shell=True,
+                capture_output=True,
+                text=True,
+                timeout=timeout
+            )
+
+            return {
+                "success": result.returncode == 
0, + "exit_code": result.returncode, + "stdout": result.stdout, + "stderr": result.stderr, + "command": command + } + except subprocess.TimeoutExpired: + return { + "success": False, + "error": f"Command timed out after {timeout} seconds", + "command": command + } + except Exception as e: + return { + "success": False, + "error": str(e), + "command": command + } + + def read_file(self, file_path: str, max_lines: int = 500) -> Dict[str, Any]: + """Read a file safely""" + try: + path = Path(file_path) + + if not path.exists(): + return { + "success": False, + "error": f"File not found: {file_path}" + } + + if not path.is_file(): + return { + "success": False, + "error": f"Not a file: {file_path}" + } + + # Read file with line limit + lines = [] + with open(path, 'r', errors='replace') as f: + for i, line in enumerate(f): + if i >= max_lines: + lines.append(f"\n... truncated after {max_lines} lines ...") + break + lines.append(line.rstrip('\n')) + + return { + "success": True, + "content": '\n'.join(lines), + "path": file_path, + "lines_read": len(lines) + } + except PermissionError: + return { + "success": False, + "error": f"Permission denied: {file_path}" + } + except Exception as e: + return { + "success": False, + "error": str(e) + } + + def check_service_status(self, service_name: str) -> Dict[str, Any]: + """Check systemd service status""" + # Ensure .service suffix + if not service_name.endswith('.service'): + service_name = f"{service_name}.service" + + # Get service status + status_result = self.execute_command(f"systemctl status {service_name}") + is_active_result = self.execute_command(f"systemctl is-active {service_name}") + is_enabled_result = self.execute_command(f"systemctl is-enabled {service_name}") + + # Get recent logs + logs_result = self.execute_command(f"journalctl -u {service_name} -n 10 --no-pager") + + return { + "service": service_name, + "active": is_active_result.get("stdout", "").strip() == "active", + "enabled": is_enabled_result.get("stdout", "").strip() == "enabled", + "status_output": status_result.get("stdout", ""), + "recent_logs": logs_result.get("stdout", "") + } + + def view_logs( + self, + unit: Optional[str] = None, + lines: int = 50, + priority: Optional[str] = None + ) -> Dict[str, Any]: + """View systemd journal logs""" + cmd_parts = ["journalctl", "--no-pager"] + + if unit: + cmd_parts.extend(["-u", unit]) + + cmd_parts.extend(["-n", str(lines)]) + + if priority: + cmd_parts.extend(["-p", priority]) + + command = " ".join(cmd_parts) + result = self.execute_command(command) + + return { + "logs": result.get("stdout", ""), + "unit": unit, + "lines": lines, + "priority": priority + } + + def get_system_metrics(self) -> Dict[str, Any]: + """Get current system metrics""" + # CPU and load + uptime_result = self.execute_command("uptime") + # Memory + free_result = self.execute_command("free -h") + # Disk + df_result = self.execute_command("df -h") + + return { + "uptime": uptime_result.get("stdout", ""), + "memory": free_result.get("stdout", ""), + "disk": df_result.get("stdout", "") + } + + def get_hardware_info(self) -> Dict[str, Any]: + """Get comprehensive hardware information""" + hardware = {} + + # CPU info (use nix-shell for util-linux) + cpu_result = self.execute_command("nix-shell -p util-linux --run lscpu") + if cpu_result.get("success"): + hardware["cpu"] = cpu_result.get("stdout", "") + + # Memory details + mem_result = self.execute_command("free -h") + if mem_result.get("success"): + hardware["memory"] = mem_result.get("stdout", "") + + # 
GPU info (lspci for AMD/NVIDIA) - use nix-shell for pciutils
+        gpu_result = self.execute_command("nix-shell -p pciutils --run \"lspci | grep -i 'vga\\|3d\\|display'\"")
+        if gpu_result.get("success"):
+            hardware["gpu"] = gpu_result.get("stdout", "")
+
+        # Detailed GPU
+        lspci_detailed = self.execute_command("nix-shell -p pciutils --run \"lspci -v | grep -A 20 -i 'vga\\|3d\\|display'\"")
+        if lspci_detailed.get("success"):
+            hardware["gpu_detailed"] = lspci_detailed.get("stdout", "")
+
+        # Network interfaces
+        net_result = self.execute_command("ip link show")
+        if net_result.get("success"):
+            hardware["network_interfaces"] = net_result.get("stdout", "")
+
+        # Network addresses
+        addr_result = self.execute_command("ip addr show")
+        if addr_result.get("success"):
+            hardware["network_addresses"] = addr_result.get("stdout", "")
+
+        # Storage devices (use nix-shell for util-linux)
+        storage_result = self.execute_command("nix-shell -p util-linux --run \"lsblk -o NAME,SIZE,TYPE,MOUNTPOINT,FSTYPE\"")
+        if storage_result.get("success"):
+            hardware["storage"] = storage_result.get("stdout", "")
+
+        # PCI devices (comprehensive)
+        pci_result = self.execute_command("nix-shell -p pciutils --run lspci")
+        if pci_result.get("success"):
+            hardware["pci_devices"] = pci_result.get("stdout", "")
+
+        # USB devices
+        usb_result = self.execute_command("nix-shell -p usbutils --run lsusb")
+        if usb_result.get("success"):
+            hardware["usb_devices"] = usb_result.get("stdout", "")
+
+        # DMI/SMBIOS info (motherboard, system)
+        dmi_result = self.execute_command("cat /sys/class/dmi/id/board_name /sys/class/dmi/id/board_vendor 2>/dev/null")
+        if dmi_result.get("success"):
+            hardware["motherboard"] = dmi_result.get("stdout", "")
+
+        return hardware
+
+    def get_gpu_metrics(self) -> Dict[str, Any]:
+        """Get GPU metrics (temperature, utilization, clocks, power)"""
+        metrics = {}
+
+        # Try AMD GPU via sysfs (DRM/hwmon)
+        try:
+            # Find GPU hwmon directory
+            import glob
+            hwmon_dirs = glob.glob("/sys/class/drm/card*/device/hwmon/hwmon*")
+
+            if hwmon_dirs:
+                hwmon_path = hwmon_dirs[0]
+                # The PCI device directory sits two levels above .../device/hwmon/hwmonN.
+                # (A plain str.replace on the path would leave the hwmon index glued
+                # onto "device", producing a nonexistent path.)
+                device_dir = Path(hwmon_path).parent.parent
+                amd_metrics = {}
+
+                # Temperature (sysfs reports millidegrees Celsius)
+                temp_files = glob.glob(f"{hwmon_path}/temp*_input")
+                for temp_file in temp_files:
+                    try:
+                        with open(temp_file, 'r') as f:
+                            temp_millidegrees = int(f.read().strip())
+                            label = temp_file.split('/')[-1].replace('_input', '')
+                            amd_metrics[f"{label}_celsius"] = temp_millidegrees / 1000
+                    except (OSError, ValueError):
+                        pass
+
+                # GPU busy percent (utilization)
+                try:
+                    with open(device_dir / "gpu_busy_percent", 'r') as f:
+                        amd_metrics["gpu_utilization_percent"] = int(f.read().strip())
+                except (OSError, ValueError):
+                    pass
+
+                # Power usage (sysfs reports microwatts)
+                power_files = glob.glob(f"{hwmon_path}/power*_average")
+                for power_file in power_files:
+                    try:
+                        with open(power_file, 'r') as f:
+                            amd_metrics["power_watts"] = int(f.read().strip()) / 1000000
+                    except (OSError, ValueError):
+                        pass
+
+                # Clock speeds
+                try:
+                    with open(device_dir / "pp_dpm_sclk", 'r') as f:
+                        amd_metrics["gpu_clocks"] = f.read().strip()
+                except OSError:
+                    pass
+
+                if amd_metrics:
+                    metrics["amd_gpu"] = amd_metrics
+        except Exception as e:
+            metrics["amd_sysfs_error"] = str(e)
+
+        # Try rocm-smi for AMD
+        rocm_result = self.execute_command("nix-shell -p rocmPackages.rocm-smi --run 'rocm-smi --showtemp --showuse --showpower'")
+        if rocm_result.get("success"):
+            metrics["rocm_smi"] = rocm_result.get("stdout", "")
+
+        # Try nvidia-smi for NVIDIA
+        nvidia_result = self.execute_command("nix-shell -p linuxPackages.nvidia_x11 --run 'nvidia-smi --query-gpu=temperature.gpu,utilization.gpu,power.draw,clocks.gr --format=csv'")
+        if nvidia_result.get("success") and "NVIDIA" in nvidia_result.get("stdout", ""):
+            metrics["nvidia_smi"] = nvidia_result.get("stdout", "")
+
+        # Fallback: try sensors command
+        if not metrics.get("amd_gpu") and not metrics.get("nvidia_smi"):
+            sensors_result = self.execute_command("nix-shell -p lm_sensors --run sensors")
+            if sensors_result.get("success"):
+                metrics["sensors"] = sensors_result.get("stdout", "")
+
+        return metrics
+
+    def list_directory(
+        self,
+        directory_path: str,
+        show_hidden: bool = False
+    ) -> Dict[str, Any]:
+        """List directory contents"""
+        import shlex
+
+        flags = "-lha" if show_hidden else "-lh"
+        # Quote the path so spaces and shell metacharacters survive shell=True
+        cmd = f"ls {flags} {shlex.quote(directory_path)}"
+
+        result = self.execute_command(cmd)
+
+        return {
+            "success": result.get("success", False),
+            "directory": directory_path,
+            "listing": result.get("stdout", ""),
+            "error": result.get("error")
+        }
+
+    def check_network(self, host: str, method: str = "ping") -> Dict[str, Any]:
+        """Check network connectivity"""
+        import shlex
+
+        target = shlex.quote(host)
+        if method == "ping":
+            cmd = f"ping -c 3 -W 2 {target}"
+        elif method == "http":
+            cmd = f"curl -I -m 5 {target}"
+        else:
+            return {
+                "success": False,
+                "error": f"Unknown method: {method}"
+            }
+
+        result = self.execute_command(cmd, timeout=10)
+
+        return {
+            "host": host,
+            "method": method,
+            "reachable": result.get("success", False),
+            "output": result.get("stdout", ""),
+            "error": result.get("stderr", "")
+        }
+
+    def retrieve_cached_output(self, cache_id: str, max_chars: int = 10000) -> Dict[str, Any]:
+        """Retrieve full cached output from a previous tool call"""
+        cache_dir = Path("/var/lib/macha/tool_cache")
+        cache_file = cache_dir / f"{cache_id}.txt"
+
+        if not cache_file.exists():
+            return {
+                "success": False,
+                "error": f"Cache file not found: {cache_id}",
+                "hint": "Check that the cache_id matches exactly what was shown in the summary"
+            }
+
+        try:
+            content = cache_file.read_text()
+            original_size = len(content)
+
+            # Truncate if still too large for context
+            if original_size > max_chars:
+                half = max_chars // 2
+                content = (
+                    content[:half] +
+                    f"\n... [SHOWING {max_chars} of {original_size} chars] ...\n" +
+                    content[-half:]
+                )
+
+            return {
+                "success": True,
+                "cache_id": cache_id,
+                "size": original_size,  # size before truncation, without re-reading the file
+                "content": content
+            }
+        except Exception as e:
+            return {
+                "success": False,
+                "error": f"Failed to read cache: {str(e)}"
+            }
+
+    def send_notification(self, title: str, message: str, priority: int = 5) -> Dict[str, Any]:
+        """Send a notification to the user via Gotify using macha-notify command"""
+        try:
+            # Use the macha-notify command which handles Gotify integration
+            result = subprocess.run(
+                ['macha-notify', title, message, str(priority)],
+                capture_output=True,
+                text=True,
+                timeout=10
+            )
+
+            if result.returncode == 0:
+                return {
+                    "success": True,
+                    "title": title,
+                    "message": message,
+                    "priority": priority,
+                    "output": result.stdout.strip() if result.stdout else "Notification sent successfully"
+                }
+            else:
+                return {
+                    "success": False,
+                    "error": f"macha-notify failed: {result.stderr.strip() if result.stderr else 'Unknown error'}",
+                    "hint": "Check if Gotify is configured (gotifyUrl and gotifyToken in module config)"
+                }
+        except FileNotFoundError:
+            return {
+                "success": False,
+                "error": "macha-notify command not found",
+                "hint": "This should not happen - macha-notify is installed by the module"
+            }
+        except subprocess.TimeoutExpired:
+            return {
+                "success": False,
+                "error": "Notification send timeout (10s)"
+            }
+        except Exception as e:
+            return {
+                "success": False,
+                "error": f"Unexpected error sending notification: {str(e)}"
+            }
+
+    def execute_tool(self, tool_name: str, arguments: Dict[str, Any]) -> Dict[str, Any]:
+        """Execute a tool by name with given arguments"""
+        tool_map = {
+            "execute_command": self.execute_command,
+            "read_file": self.read_file,
+            "check_service_status": self.check_service_status,
+            "view_logs": self.view_logs,
+            "get_system_metrics": self.get_system_metrics,
+            "get_hardware_info": self.get_hardware_info,
+            "get_gpu_metrics": self.get_gpu_metrics,
+            "list_directory": self.list_directory,
+            "check_network": self.check_network,
+            "retrieve_cached_output": self.retrieve_cached_output,
+            "send_notification": self.send_notification
+        }
+
+        tool_func = tool_map.get(tool_name)
+        if not tool_func:
+            return {
+                "success": False,
+                "error": f"Unknown tool: {tool_name}"
+            }
+
+        try:
+            return tool_func(**arguments)
+        except Exception as e:
+            return {
+                "success": False,
+                "error": f"Tool execution failed: {str(e)}",
+                "tool": tool_name,
+                "arguments": arguments
+            }
+
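+
+# ---------------------------------------------------------------------------
+# Illustrative sketches. These are not invoked by the queue worker; they show,
+# under stated assumptions, how the tool layer above can be driven. Each helper
+# takes an already-constructed instance of the tool class defined above, so
+# nothing is assumed about its constructor or name.
+# ---------------------------------------------------------------------------
+
+def _demo_dispatch(executor) -> None:
+    """Minimal sketch: dispatch a tool call the way the LLM loop would after
+    parsing a tool_calls entry from a model response."""
+    result = executor.execute_tool(
+        "check_service_status",
+        {"service_name": "ollama"},
+    )
+    if result.get("active"):
+        print("ollama.service is active")
+    else:
+        print(result.get("recent_logs", ""))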
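+
+def _demo_ollama_round_trip(executor, tool_definitions) -> None:
+    """Sketch of wiring the schema list above into Ollama's /api/chat `tools`
+    parameter. The endpoint and response shape follow Ollama's documented
+    tool-calling API; the model tag is a placeholder, and `tool_definitions`
+    should be the list returned by the schema method above."""
+    import json
+    import urllib.request
+
+    payload = {
+        "model": "llama3.1",  # placeholder model tag
+        "messages": [{"role": "user", "content": "Is ollama.service healthy?"}],
+        "tools": tool_definitions,
+        "stream": False,
+    }
+    req = urllib.request.Request(
+        "http://localhost:11434/api/chat",
+        data=json.dumps(payload).encode(),
+        headers={"Content-Type": "application/json"},
+    )
+    with urllib.request.urlopen(req) as resp:
+        message = json.loads(resp.read())["message"]
+
+    # Feed any requested tool calls back through execute_tool()
+    for call in message.get("tool_calls", []):
+        fn = call["function"]
+        print(executor.execute_tool(fn["name"], fn["arguments"]))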
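+
+def _demo_cache_retrieval(executor) -> None:
+    """Sketch of the cache round-trip behind hierarchical summaries: outputs
+    larger than max_chars come back with head and tail preserved and a
+    '[SHOWING ...]' marker spliced into the middle. Writes a throwaway file
+    into the real cache directory, so run it only on a test machine."""
+    cache_dir = Path("/var/lib/macha/tool_cache")
+    cache_dir.mkdir(parents=True, exist_ok=True)
+    (cache_dir / "demo_20251006_000000.txt").write_text("x" * 25000)
+
+    out = executor.retrieve_cached_output("demo_20251006_000000", max_chars=10000)
+    print(out["size"], len(out["content"]))  # 25000, then ~10000 plus the marker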