# JoyCaption GGUF Vision Model Server
# Uses llama.cpp's llama-server with CUDA support for multimodal inference
#
# The base image has llama-server as ENTRYPOINT, so we just pass arguments via CMD

# Base image reference, exposed as a build argument so a build can be pinned
# to an immutable tag or digest (or pointed at a registry mirror) without
# editing this file. The default keeps the original floating tag, so a plain
# `docker build .` behaves exactly as before. Example of pinning:
#   docker build \
#     --build-arg BASE_IMAGE=ghcr.io/ggml-org/llama.cpp:server-cuda@sha256:<digest> .
# Any override must still provide llama-server as its ENTRYPOINT and ship curl,
# because the CMD and HEALTHCHECK below rely on both.
ARG BASE_IMAGE=ghcr.io/ggml-org/llama.cpp:server-cuda
FROM ${BASE_IMAGE}

# Document the TCP port the server listens on (matches --port in CMD).
# EXPOSE is informational only; the port still has to be published at run
# time, e.g. `docker run -p 8080:8080 ...`.
EXPOSE 8080/tcp

# Container health probe: HTTP GET /health on the server port.
#   -f   make curl exit non-zero on an HTTP error status
#   -sS  hide curl's progress meter but still print real errors, so the
#        health log recorded by Docker stays readable
# `|| exit 1` maps any curl failure code to 1, the "unhealthy" exit status
# Docker expects from a health check command.
# The 120s start period is presumably there to cover model loading; raise it
# if the model takes longer to load on the target hardware.
HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=5 \
  CMD curl -fsS http://localhost:8080/health || exit 1

# Default arguments appended to the base image's llama-server ENTRYPOINT
# (exec form, so no shell expansion happens on these values).
# Both GGUF files are expected under /models, which must be supplied by a
# volume mount at run time; they are not baked into the image.
# Layout: one flag/value pair per row. Override everything at once by passing
# a different argument list to `docker run`.
CMD [ \
    "--model",        "/models/llama-joycaption-beta-one-hf-llava.Q2_K.gguf", \
    "--mmproj",       "/models/llama-joycaption-beta-one-llava-mmproj-model-f16.gguf", \
    "--host",         "0.0.0.0", \
    "--port",         "8080", \
    "--ctx-size",     "4096", \
    "--n-gpu-layers", "999", \
    "--parallel",     "1", \
    "--cont-batching" \
]
