Macha is now a standalone NixOS flake that can be imported into other systems.

This provides:
- Independent versioning
- Easier reusability
- Cleaner separation of concerns
- Better development workflow

Includes:
- Complete autonomous system code
- NixOS module with full configuration options
- Queue-based architecture with priority system (sketched below)
- Chunked map-reduce for large outputs
- ChromaDB knowledge base
- Tool calling system
- Multi-host SSH management
- Gotify notification integration

All capabilities from DESIGN.md are preserved.
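As a rough illustration of the queue-based flow, a caller might submit a prioritized request roughly like the sketch below. The enqueue() name, its arguments, and the "lower number = higher priority" convention are assumptions for illustration only; the worker file below only shows cleanup_old_requests, start_worker, and the running flag of OllamaQueue.

# Hypothetical sketch of submitting a prioritized request to the queue.
# enqueue(), its keyword arguments, and the priority semantics are
# illustrative assumptions, not the actual OllamaQueue API.
from ollama_queue import OllamaQueue

queue = OllamaQueue()
request_id = queue.enqueue(
    endpoint="chat",
    payload={
        "model": "llama3.1",
        "messages": [{"role": "user", "content": "Summarize the last system report."}],
    },
    priority=1,
)
print(f"queued request {request_id}")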
112 lines
3.4 KiB
Python
#!/usr/bin/env python3
"""
Ollama Queue Worker - Daemon that processes queued Ollama requests
"""

import sys
import requests
from pathlib import Path
from ollama_queue import OllamaQueue


class OllamaClient:
    """Simple Ollama API client for the queue worker"""

    def __init__(self, host: str = "http://localhost:11434"):
        self.host = host

    def generate(self, payload: dict) -> dict:
        """Call /api/generate"""
        response = requests.post(
            f"{self.host}/api/generate",
            json=payload,
            timeout=payload.get("timeout", 300),
            stream=False
        )
        response.raise_for_status()
        return response.json()

    def chat(self, payload: dict) -> dict:
        """Call /api/chat"""
        response = requests.post(
            f"{self.host}/api/chat",
            json=payload,
            timeout=payload.get("timeout", 300),
            stream=False
        )
        response.raise_for_status()
        return response.json()

    def chat_with_tools(self, payload: dict) -> dict:
        """Call /api/chat with tools (streaming or non-streaming)"""
        import json

        # Check if streaming is requested
        stream = payload.get("stream", False)

        response = requests.post(
            f"{self.host}/api/chat",
            json=payload,
            timeout=payload.get("timeout", 300),
            stream=stream
        )
        response.raise_for_status()

        if not stream:
            # Non-streaming: return response directly
            return response.json()

        # Streaming: accumulate response
        full_response = {"message": {"role": "assistant", "content": "", "tool_calls": []}}

        for line in response.iter_lines():
            if line:
                chunk = json.loads(line)

                if "message" in chunk:
                    msg = chunk["message"]
                    # Preserve role from first chunk
                    if "role" in msg and not full_response["message"].get("role"):
                        full_response["message"]["role"] = msg["role"]
                    if "content" in msg:
                        full_response["message"]["content"] += msg["content"]
                    if "tool_calls" in msg:
                        full_response["message"]["tool_calls"].extend(msg["tool_calls"])

                if chunk.get("done"):
                    full_response["done"] = True
                    # Copy any additional fields from final chunk
                    for key in chunk:
                        if key not in ("message", "done"):
                            full_response[key] = chunk[key]
                    break

        # Ensure role is set
        if "role" not in full_response["message"]:
            full_response["message"]["role"] = "assistant"

        return full_response


def main():
    """Main entry point for the worker"""
    print("Starting Ollama Queue Worker...")

    # Initialize queue and client
    queue = OllamaQueue()
    client = OllamaClient()

    # Cleanup old requests on startup
    queue.cleanup_old_requests(max_age_seconds=3600)

    # Start processing
    try:
        queue.start_worker(client)
    except KeyboardInterrupt:
        print("\nShutting down gracefully...")
        queue.running = False

    return 0


if __name__ == "__main__":
    sys.exit(main())
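For reference, a direct call to chat_with_tools outside the queue could look like the minimal sketch below. The model name and the tool definition are placeholders; the tools schema follows the Ollama /api/chat format, and streaming support for tool calls depends on the Ollama version in use.

# Illustrative use of OllamaClient.chat_with_tools; model name and tool
# definition are placeholders, tool schema follows Ollama's /api/chat format.
client = OllamaClient()
result = client.chat_with_tools({
    "model": "llama3.1",
    "messages": [{"role": "user", "content": "What's the weather in Berlin?"}],
    "tools": [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a city",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }],
    "stream": True,
})
print(result["message"].get("tool_calls", []))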