Macha is now a standalone NixOS flake that can be imported into other systems.

This provides:
- Independent versioning
- Easier reusability
- Cleaner separation of concerns
- Better development workflow

Includes:
- Complete autonomous system code
- NixOS module with full configuration options
- Queue-based architecture with priority system (sketched below)
- Chunked map-reduce for large outputs
- ChromaDB knowledge base
- Tool calling system
- Multi-host SSH management
- Gotify notification integration

All capabilities from DESIGN.md are preserved.
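As a rough illustration of the queue-based flow, a caller might submit a prioritized request roughly like the sketch below. The enqueue() name, its arguments, and the "lower number = higher priority" convention are assumptions for illustration only; the worker file below only shows cleanup_old_requests, start_worker, and the running flag of OllamaQueue.

# Hypothetical sketch of submitting a prioritized request to the queue.
# enqueue(), its keyword arguments, and the priority semantics are
# illustrative assumptions, not the actual OllamaQueue API.
from ollama_queue import OllamaQueue

queue = OllamaQueue()
request_id = queue.enqueue(
    endpoint="chat",
    payload={
        "model": "llama3.1",
        "messages": [{"role": "user", "content": "Summarize the last system report."}],
    },
    priority=1,
)
print(f"queued request {request_id}")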
112 lines
3.4 KiB
Python
#!/usr/bin/env python3
"""
Ollama Queue Worker - Daemon that processes queued Ollama requests
"""

import sys
import requests
from pathlib import Path
from ollama_queue import OllamaQueue


class OllamaClient:
    """Simple Ollama API client for the queue worker"""

    def __init__(self, host: str = "http://localhost:11434"):
        self.host = host

    def generate(self, payload: dict) -> dict:
        """Call /api/generate"""
        response = requests.post(
            f"{self.host}/api/generate",
            json=payload,
            timeout=payload.get("timeout", 300),
            stream=False
        )
        response.raise_for_status()
        return response.json()

    def chat(self, payload: dict) -> dict:
        """Call /api/chat"""
        response = requests.post(
            f"{self.host}/api/chat",
            json=payload,
            timeout=payload.get("timeout", 300),
            stream=False
        )
        response.raise_for_status()
        return response.json()

    def chat_with_tools(self, payload: dict) -> dict:
        """Call /api/chat with tools (streaming or non-streaming)"""
        import json

        # Check if streaming is requested
        stream = payload.get("stream", False)

        response = requests.post(
            f"{self.host}/api/chat",
            json=payload,
            timeout=payload.get("timeout", 300),
            stream=stream
        )
        response.raise_for_status()

        if not stream:
            # Non-streaming: return response directly
            return response.json()

        # Streaming: accumulate response
        full_response = {"message": {"role": "assistant", "content": "", "tool_calls": []}}

        for line in response.iter_lines():
            if line:
                chunk = json.loads(line)

                if "message" in chunk:
                    msg = chunk["message"]
                    # Preserve role from first chunk
                    if "role" in msg and not full_response["message"].get("role"):
                        full_response["message"]["role"] = msg["role"]
                    if "content" in msg:
                        full_response["message"]["content"] += msg["content"]
                    if "tool_calls" in msg:
                        full_response["message"]["tool_calls"].extend(msg["tool_calls"])

                if chunk.get("done"):
                    full_response["done"] = True
                    # Copy any additional fields from final chunk
                    for key in chunk:
                        if key not in ("message", "done"):
                            full_response[key] = chunk[key]
                    break

        # Ensure role is set
        if "role" not in full_response["message"]:
            full_response["message"]["role"] = "assistant"

        return full_response


def main():
    """Main entry point for the worker"""
    print("Starting Ollama Queue Worker...")

    # Initialize queue and client
    queue = OllamaQueue()
    client = OllamaClient()

    # Cleanup old requests on startup
    queue.cleanup_old_requests(max_age_seconds=3600)

    # Start processing
    try:
        queue.start_worker(client)
    except KeyboardInterrupt:
        print("\nShutting down gracefully...")
        queue.running = False

    return 0


if __name__ == "__main__":
    sys.exit(main())
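For reference, a direct call to chat_with_tools outside the queue could look like the minimal sketch below. The model name and the tool definition are placeholders; the tools schema follows the Ollama /api/chat format, and streaming support for tool calls depends on the Ollama version in use.

# Illustrative use of OllamaClient.chat_with_tools; model name and tool
# definition are placeholders, tool schema follows Ollama's /api/chat format.
client = OllamaClient()
result = client.chat_with_tools({
    "model": "llama3.1",
    "messages": [{"role": "user", "content": "What's the weather in Berlin?"}],
    "tools": [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a city",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }],
    "stream": True,
})
print(result["message"].get("tool_calls", []))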