Initial commit: Split Macha autonomous system into separate flake
Macha is now a standalone NixOS flake that can be imported into other systems. This provides: independent versioning; easier reusability; cleaner separation of concerns; and a better development workflow. It includes: the complete autonomous system code; a NixOS module with full configuration options; a queue-based architecture with a priority system; chunked map-reduce for large outputs; a ChromaDB knowledge base; a tool-calling system; multi-host SSH management; and Gotify notification integration. All capabilities from DESIGN.md are preserved.
This commit is contained in:
358
journal_monitor.py
Normal file
358
journal_monitor.py
Normal file
@@ -0,0 +1,358 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Journal Monitor - Monitor remote systems via centralized journald
|
||||
"""
|
||||
|
||||
import json
import subprocess
from collections import defaultdict, deque
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Any, Optional, Set, Iterator
|
||||
|
||||
|
||||
class JournalMonitor:
    """Monitor systems via centralized journald logs.

    Every collector shells out to ``journalctl`` (through
    ``_run_journalctl``) and parses its ``--output=json`` stream, which is
    one JSON object per line.

    Hosts are addressed by FQDN. ``discover_hosts`` appends ``self.domain``
    to dot-less ``_HOSTNAME`` values, so the journal may hold entries under
    the short name only; the collectors therefore match both the FQDN and
    its short alias (see ``_host_matches``).
    """

    # Units that get_active_services leaves out so the result focuses on
    # application services rather than common system plumbing.
    _IGNORED_SERVICES = frozenset(
        {'systemd-journald', 'systemd-logind', 'sshd', 'dbus'}
    )
    _SERVICE_SUFFIX = '.service'
    # Number of most recent error entries returned by collect_log_errors.
    _MAX_RECENT_ERRORS = 10

    def __init__(self, domain: str = "coven.systems"):
        """
        Initialize journal monitor

        Args:
            domain: Domain suffix for FQDNs
        """
        self.domain = domain
        # Replaced by each successful discover_hosts() call.
        self.known_hosts: Set[str] = set()

    def _run_journalctl(self, args: List[str], timeout: int = 30) -> tuple[bool, str, str]:
        """
        Run journalctl command

        Args:
            args: Arguments to journalctl
            timeout: Timeout in seconds

        Returns:
            (success, stdout, stderr); success is True only for exit code 0.
            Failures to run the command are reported through the tuple
            (success False, message in stderr) rather than raised.
        """
        cmd = ["journalctl", *args]
        try:
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=timeout,
            )
        except subprocess.TimeoutExpired:
            return False, "", f"Command timed out after {timeout}s"
        except Exception as e:
            # Deliberate best-effort boundary: a missing binary or any other
            # launch problem is reported to the caller, never raised.
            return False, "", str(e)

        return (
            result.returncode == 0,
            result.stdout.strip(),
            result.stderr.strip(),
        )

    @staticmethod
    def _iter_entries(output: str) -> Iterator[Dict[str, Any]]:
        """Yield each journal entry (a dict) found in ``--output=json`` text.

        Blank lines, lines that are not valid JSON and JSON values that are
        not objects are skipped, so banner text or truncated output never
        aborts a collection run.
        """
        # Split on "\n" only: str.splitlines() would also break on
        # characters that may legally appear unescaped inside a JSON string.
        for line in output.split('\n'):
            line = line.strip()
            if not line:
                continue
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                continue
            if isinstance(entry, dict):
                yield entry

    def _host_matches(self, hostname: str) -> List[str]:
        """Build the ``_HOSTNAME=`` match arguments for *hostname*.

        Always matches *hostname* itself. When it is ``<short>.<domain>``
        with a dot-less ``<short>``, the short name is matched as well,
        because discover_hosts() synthesises exactly such FQDNs from short
        journal hostnames. journalctl treats several matches on the same
        field as alternatives.
        """
        matches = [f"_HOSTNAME={hostname}"]
        suffix = f".{self.domain}"
        if self.domain and hostname.endswith(suffix):
            short = hostname[:-len(suffix)]
            if short and '.' not in short:
                matches.append(f"_HOSTNAME={short}")
        return matches

    @staticmethod
    def _entry_time(entry: Dict[str, Any]) -> datetime:
        """Return the entry's wallclock time as a naive local datetime.

        ``__REALTIME_TIMESTAMP`` is read as microseconds since the epoch;
        if it is missing or unusable the current time is returned instead.
        """
        try:
            micros = int(entry.get('__REALTIME_TIMESTAMP'))
            return datetime.fromtimestamp(micros / 1_000_000)
        except (TypeError, ValueError, OverflowError, OSError):
            return datetime.now()

    def discover_hosts(self) -> List[str]:
        """
        Discover hosts reporting to centralized journal

        Looks at up to 10000 entries from the last day. Hostnames without a
        dot get ``self.domain`` appended; names that already contain a dot
        are kept unchanged. On success ``self.known_hosts`` is replaced with
        the discovered set; on failure it is left untouched.

        Returns:
            Sorted list of discovered FQDNs (empty if journalctl failed)
        """
        success, output, _ = self._run_journalctl([
            "--output=json",
            "--since=1 day ago",
            "-n", "10000",
        ])
        if not success:
            return []

        hosts: Set[str] = set()
        for entry in self._iter_entries(output):
            hostname = entry.get('_HOSTNAME', '')
            # Skip missing values and non-string values, which cannot be
            # used as a hostname.
            if not isinstance(hostname, str) or not hostname:
                continue
            # Ensure FQDN format
            if '.' not in hostname:
                hostname = f"{hostname}.{self.domain}"
            hosts.add(hostname)

        self.known_hosts = hosts
        return sorted(hosts)

    def collect_resources(self, hostname: str, since: str = "5 minutes ago") -> Dict[str, Any]:
        """
        Collect resource usage from journal entries

        Placeholder: no journal query is made and both arguments are
        currently unused. Resource metrics could be added if systems log
        them.

        Returns:
            Dictionary with zeroed cpu_percent, memory_percent and
            load_average values
        """
        return {
            "cpu_percent": 0,
            "memory_percent": 0,
            "load_average": {"1min": 0, "5min": 0, "15min": 0},
        }

    def collect_systemd_status(self, hostname: str, since: str = "5 minutes ago") -> Dict[str, Any]:
        """
        Collect systemd service status from journal

        A service counts as failed when it logged at least one entry at
        priority "err" or worse in the time range.

        Args:
            hostname: FQDN of the system
            since: Time range to check

        Returns:
            Dictionary with "failed_count" and "failed_services"; each
            service appears once, described by its first matching entry.
            Both are empty/zero when journalctl fails.
        """
        success, output, _ = self._run_journalctl([
            *self._host_matches(hostname),
            "--priority=err",
            "--unit=*.service",
            f"--since={since}",
            "--output=json",
        ])
        if not success:
            return {"failed_count": 0, "failed_services": []}

        # Keyed by full unit name; insertion order keeps first-seen order.
        failed_services: Dict[str, Dict[str, Any]] = {}
        for entry in self._iter_entries(output):
            unit = entry.get('_SYSTEMD_UNIT', '')
            if not isinstance(unit, str) or not unit.endswith(self._SERVICE_SUFFIX):
                continue
            if unit not in failed_services:
                failed_services[unit] = {
                    "unit": unit,
                    "message": entry.get('MESSAGE', ''),
                    "timestamp": entry.get('__REALTIME_TIMESTAMP', ''),
                }

        return {
            "failed_count": len(failed_services),
            "failed_services": list(failed_services.values()),
        }

    def collect_log_errors(self, hostname: str, since: str = "1 hour ago") -> Dict[str, Any]:
        """
        Collect error logs from journal

        Args:
            hostname: FQDN of the system
            since: Time range to check

        Returns:
            Dictionary with "error_count_1h" (number of entries at priority
            "err" or worse in the range; the key name is kept for
            compatibility even when ``since`` is not one hour) and
            "recent_errors" (the last entries in output order, at most 10).
        """
        success, output, _ = self._run_journalctl([
            *self._host_matches(hostname),
            "--priority=err",
            f"--since={since}",
            "--output=json",
        ])
        if not success:
            return {"error_count_1h": 0, "recent_errors": []}

        # A bounded deque drops older entries as newer ones arrive, so what
        # remains is the tail of the output.
        recent = deque(maxlen=self._MAX_RECENT_ERRORS)
        error_count = 0
        for entry in self._iter_entries(output):
            error_count += 1
            recent.append({
                "message": entry.get('MESSAGE', ''),
                "unit": entry.get('_SYSTEMD_UNIT', 'unknown'),
                "priority": entry.get('PRIORITY', ''),
                "timestamp": entry.get('__REALTIME_TIMESTAMP', ''),
            })

        return {
            "error_count_1h": error_count,
            "recent_errors": list(recent),
        }

    def collect_disk_usage(self, hostname: str) -> Dict[str, Any]:
        """
        Collect disk usage - Note: This would require systems to log disk metrics

        Placeholder: no journal query is made and ``hostname`` is unused.
        Could be enhanced if systems periodically log disk usage.

        Returns:
            Dictionary with an empty "partitions" list
        """
        return {"partitions": []}

    def collect_network_status(self, hostname: str, since: str = "5 minutes ago") -> Dict[str, Any]:
        """
        Check network connectivity based on recent journal activity

        If we see recent logs from a host, it's reachable.

        Args:
            hostname: FQDN of the system
            since: How far back an entry still counts as recent

        Returns:
            Dictionary with "internet_reachable" (True when at least one
            journal entry was parsed) and "last_seen" (ISO timestamp of that
            entry, or None when unreachable).
        """
        success, output, _ = self._run_journalctl([
            *self._host_matches(hostname),
            f"--since={since}",
            "-n", "1",
            "--output=json",
        ])

        # Require a parsed entry rather than just non-empty stdout, so that
        # non-JSON banner text is never mistaken for activity.
        latest = None
        if success:
            for entry in self._iter_entries(output):
                latest = entry

        if latest is None:
            return {"internet_reachable": False, "last_seen": None}

        return {
            "internet_reachable": True,
            "last_seen": self._entry_time(latest).isoformat(),
        }

    def collect_all(self, hostname: str) -> Dict[str, Any]:
        """
        Collect all monitoring data for a host from journal

        Args:
            hostname: FQDN of the system to monitor

        Returns:
            Complete monitoring data, or a short record with
            "reachable": False and an "error" message when the host has no
            recent journal entries (no further queries are made then).
        """
        # First check if we have recent logs from this host
        net_status = self.collect_network_status(hostname)
        if not net_status.get("internet_reachable"):
            return {
                "hostname": hostname,
                "reachable": False,
                "error": "No recent journal entries from this host",
            }

        return {
            "hostname": hostname,
            "reachable": True,
            "source": "journal",
            "resources": self.collect_resources(hostname),
            "systemd": self.collect_systemd_status(hostname),
            "disk": self.collect_disk_usage(hostname),
            "network": net_status,
            "logs": self.collect_log_errors(hostname),
        }

    def get_summary(self, data: Dict[str, Any]) -> str:
        """Generate human-readable summary from journal data

        Args:
            data: A record shaped like the return value of collect_all()

        Returns:
            Multi-line text; a single line when the host is unreachable.
        """
        hostname = data.get("hostname", "unknown")

        if not data.get("reachable", False):
            return f"❌ {hostname}: {data.get('error', 'Unreachable')}"

        lines = [f"System: {hostname} (via journal)"]

        # Services: list at most the first three failed units
        systemd = data.get("systemd", {})
        failed_count = systemd.get("failed_count", 0)
        if failed_count > 0:
            lines.append(f"Services: {failed_count} failed")
            for svc in systemd.get("failed_services", [])[:3]:
                lines.append(f" - {svc.get('unit', 'unknown')}")
        else:
            lines.append("Services: No recent failures")

        # Network
        last_seen = data.get("network", {}).get("last_seen")
        if last_seen:
            lines.append(f"Last seen: {last_seen}")

        # Logs
        error_count = data.get("logs", {}).get("error_count_1h", 0)
        if error_count > 0:
            lines.append(f"Recent logs: {error_count} errors in last hour")

        return "\n".join(lines)

    def get_active_services(self, hostname: str, since: str = "1 hour ago") -> List[str]:
        """
        Get list of active services on a host by looking at journal entries

        This helps with auto-discovery of what's running on each system.
        A service is considered active when it logged anything within the
        last 1000 entries of the time range.

        Args:
            hostname: FQDN of the system
            since: Time range to check

        Returns:
            Sorted service names without the ".service" suffix, excluding
            the common system services in ``_IGNORED_SERVICES``; empty when
            journalctl fails.
        """
        success, output, _ = self._run_journalctl([
            *self._host_matches(hostname),
            f"--since={since}",
            "--output=json",
            "-n", "1000",
        ])
        if not success:
            return []

        services: Set[str] = set()
        for entry in self._iter_entries(output):
            unit = entry.get('_SYSTEMD_UNIT', '')
            if not isinstance(unit, str) or not unit.endswith(self._SERVICE_SUFFIX):
                continue
            # Strip only the trailing suffix; ".service" may also occur
            # inside the unit name itself.
            service = unit.removesuffix(self._SERVICE_SUFFIX)
            if service not in self._IGNORED_SERVICES:
                services.add(service)

        return sorted(services)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test: discover hosts, then report on the first one found.
    import sys

    monitor = JournalMonitor()

    # Step 1: list every host that reports to the journal
    print("Discovering hosts from journal...")
    hosts = monitor.discover_hosts()
    print(f"Found {len(hosts)} hosts:")
    for discovered in hosts:
        print(f" - {discovered}")

    # Step 2: full collection run against the first host, if any
    if hosts:
        hostname = hosts[0]
        rule = "=" * 60

        print(f"\nMonitoring {hostname}...")
        data = monitor.collect_all(hostname)

        print("\n" + rule)
        print(monitor.get_summary(data))
        print(rule)

        # Step 3: show up to ten services seen logging on that host
        print(f"\nActive services on {hostname}:")
        services = monitor.get_active_services(hostname)
        for service_name in services[:10]:
            print(f" - {service_name}")
|
||||
|
||||
Reference in New Issue
Block a user