#!/usr/bin/env python3 """ Journal Monitor - Monitor remote systems via centralized journald """ import json import subprocess from typing import Dict, List, Any, Optional, Set from datetime import datetime, timedelta from pathlib import Path from collections import defaultdict class JournalMonitor: """Monitor systems via centralized journald logs""" def __init__(self, domain: str = "coven.systems"): """ Initialize journal monitor Args: domain: Domain suffix for FQDNs """ self.domain = domain self.known_hosts: Set[str] = set() def _run_journalctl(self, args: List[str], timeout: int = 30) -> tuple[bool, str, str]: """ Run journalctl command Args: args: Arguments to journalctl timeout: Timeout in seconds Returns: (success, stdout, stderr) """ try: cmd = ["journalctl"] + args result = subprocess.run( cmd, capture_output=True, text=True, timeout=timeout ) return ( result.returncode == 0, result.stdout.strip(), result.stderr.strip() ) except subprocess.TimeoutExpired: return False, "", f"Command timed out after {timeout}s" except Exception as e: return False, "", str(e) def discover_hosts(self) -> List[str]: """ Discover hosts reporting to centralized journal Returns: List of discovered FQDNs """ success, output, _ = self._run_journalctl([ "--output=json", "--since=1 day ago", "-n", "10000" ]) if not success: return [] hosts = set() for line in output.split('\n'): if not line.strip(): continue try: entry = json.loads(line) hostname = entry.get('_HOSTNAME', '') # Ensure FQDN format if hostname and not hostname.endswith(f'.{self.domain}'): if '.' not in hostname: hostname = f"{hostname}.{self.domain}" if hostname: hosts.add(hostname) except json.JSONDecodeError: continue self.known_hosts = hosts return sorted(hosts) def collect_resources(self, hostname: str, since: str = "5 minutes ago") -> Dict[str, Any]: """ Collect resource usage from journal entries This extracts CPU/memory info from systemd service messages """ # For now, return empty - we'll primarily use this for service/log monitoring # Resource metrics could be added if systems log them return { "cpu_percent": 0, "memory_percent": 0, "load_average": {"1min": 0, "5min": 0, "15min": 0} } def collect_systemd_status(self, hostname: str, since: str = "5 minutes ago") -> Dict[str, Any]: """ Collect systemd service status from journal Args: hostname: FQDN of the system since: Time range to check Returns: Dictionary with failed service information """ # Query for systemd service failures success, output, _ = self._run_journalctl([ f"_HOSTNAME={hostname}", "--priority=err", "--unit=*.service", f"--since={since}", "--output=json" ]) if not success: return {"failed_count": 0, "failed_services": []} failed_services = {} for line in output.split('\n'): if not line.strip(): continue try: entry = json.loads(line) unit = entry.get('_SYSTEMD_UNIT', '') if unit and unit.endswith('.service'): service_name = unit.replace('.service', '') if service_name not in failed_services: failed_services[service_name] = { "unit": unit, "message": entry.get('MESSAGE', ''), "timestamp": entry.get('__REALTIME_TIMESTAMP', '') } except json.JSONDecodeError: continue return { "failed_count": len(failed_services), "failed_services": list(failed_services.values()) } def collect_log_errors(self, hostname: str, since: str = "1 hour ago") -> Dict[str, Any]: """ Collect error logs from journal Args: hostname: FQDN of the system since: Time range to check Returns: Dictionary with error log information """ success, output, _ = self._run_journalctl([ f"_HOSTNAME={hostname}", "--priority=err", f"--since={since}", "--output=json" ]) if not success: return {"error_count_1h": 0, "recent_errors": []} errors = [] error_count = 0 for line in output.split('\n'): if not line.strip(): continue try: entry = json.loads(line) error_count += 1 if len(errors) < 10: # Keep last 10 errors errors.append({ "message": entry.get('MESSAGE', ''), "unit": entry.get('_SYSTEMD_UNIT', 'unknown'), "priority": entry.get('PRIORITY', ''), "timestamp": entry.get('__REALTIME_TIMESTAMP', '') }) except json.JSONDecodeError: continue return { "error_count_1h": error_count, "recent_errors": errors } def collect_disk_usage(self, hostname: str) -> Dict[str, Any]: """ Collect disk usage - Note: This would require systems to log disk metrics For now, returns empty. Could be enhanced if systems periodically log disk usage """ return {"partitions": []} def collect_network_status(self, hostname: str, since: str = "5 minutes ago") -> Dict[str, Any]: """ Check network connectivity based on recent journal activity If we see recent logs from a host, it's reachable """ success, output, _ = self._run_journalctl([ f"_HOSTNAME={hostname}", f"--since={since}", "-n", "1", "--output=json" ]) # If we got recent logs, network is working internet_reachable = bool(success and output.strip()) return { "internet_reachable": internet_reachable, "last_seen": datetime.now().isoformat() if internet_reachable else None } def collect_all(self, hostname: str) -> Dict[str, Any]: """ Collect all monitoring data for a host from journal Args: hostname: FQDN of the system to monitor Returns: Complete monitoring data """ # First check if we have recent logs from this host net_status = self.collect_network_status(hostname) if not net_status.get("internet_reachable"): return { "hostname": hostname, "reachable": False, "error": "No recent journal entries from this host" } return { "hostname": hostname, "reachable": True, "source": "journal", "resources": self.collect_resources(hostname), "systemd": self.collect_systemd_status(hostname), "disk": self.collect_disk_usage(hostname), "network": net_status, "logs": self.collect_log_errors(hostname), } def get_summary(self, data: Dict[str, Any]) -> str: """Generate human-readable summary from journal data""" hostname = data.get("hostname", "unknown") if not data.get("reachable", False): return f"❌ {hostname}: {data.get('error', 'Unreachable')}" lines = [f"System: {hostname} (via journal)"] # Services systemd = data.get("systemd", {}) failed_count = systemd.get("failed_count", 0) if failed_count > 0: lines.append(f"Services: {failed_count} failed") for svc in systemd.get("failed_services", [])[:3]: lines.append(f" - {svc.get('unit', 'unknown')}") else: lines.append("Services: No recent failures") # Network net = data.get("network", {}) last_seen = net.get("last_seen") if last_seen: lines.append(f"Last seen: {last_seen}") # Logs logs = data.get("logs", {}) error_count = logs.get("error_count_1h", 0) if error_count > 0: lines.append(f"Recent logs: {error_count} errors in last hour") return "\n".join(lines) def get_active_services(self, hostname: str, since: str = "1 hour ago") -> List[str]: """ Get list of active services on a host by looking at journal entries This helps with auto-discovery of what's running on each system """ success, output, _ = self._run_journalctl([ f"_HOSTNAME={hostname}", f"--since={since}", "--output=json", "-n", "1000" ]) if not success: return [] services = set() for line in output.split('\n'): if not line.strip(): continue try: entry = json.loads(line) unit = entry.get('_SYSTEMD_UNIT', '') if unit and unit.endswith('.service'): # Extract service name service = unit.replace('.service', '') # Filter out common system services, focus on application services if service not in ['systemd-journald', 'systemd-logind', 'sshd', 'dbus']: services.add(service) except json.JSONDecodeError: continue return sorted(services) if __name__ == "__main__": import sys monitor = JournalMonitor() # Discover hosts print("Discovering hosts from journal...") hosts = monitor.discover_hosts() print(f"Found {len(hosts)} hosts:") for host in hosts: print(f" - {host}") # Monitor first host if available if hosts: hostname = hosts[0] print(f"\nMonitoring {hostname}...") data = monitor.collect_all(hostname) print("\n" + "="*60) print(monitor.get_summary(data)) print("="*60) # Discover services print(f"\nActive services on {hostname}:") services = monitor.get_active_services(hostname) for svc in services[:10]: print(f" - {svc}")