# JoyCaption GGUF Vision Model Server
# Uses llama.cpp's llama-server with CUDA support for multimodal inference
#
# The base image has llama-server as ENTRYPOINT, so we just pass arguments via CMD
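# Because these are CMD arguments (not baked into the ENTRYPOINT), they can be
# overridden at `docker run` time, e.g. to raise --ctx-size without rebuilding.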
FROM ghcr.io/ggml-org/llama.cpp:server-cuda
# Expose the server port
EXPOSE 8080
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=5 \
    CMD curl -f http://localhost:8080/health || exit 1
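# Note: llama-server's /health endpoint returns 503 while the model is still
# loading and 200 once it is ready, hence the generous 120s start period.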
# Pass arguments to the llama-server ENTRYPOINT
# The model and mmproj GGUF files must be mounted into /models at run time
CMD ["--model", "/models/llama-joycaption-beta-one-hf-llava.Q2_K.gguf", \
"--mmproj", "/models/llama-joycaption-beta-one-llava-mmproj-model-f16.gguf", \
"--host", "0.0.0.0", \
"--port", "8080", \
"--ctx-size", "4096", \
"--n-gpu-layers", "999", \
"--parallel", "1", \
"--cont-batching"]