# syntax=docker/dockerfile:1
# JoyCaption GGUF Vision Model Server
# Uses llama.cpp's llama-server with CUDA support for multimodal inference.
#
# The base image has llama-server as ENTRYPOINT, so we only pass arguments via CMD.
# NOTE(review): `server-cuda` is a moving tag — pin by digest
# (ghcr.io/ggml-org/llama.cpp:server-cuda@sha256:…) for reproducible builds.
FROM ghcr.io/ggml-org/llama.cpp:server-cuda

# Document the server port (EXPOSE does not publish; use -p/--publish at run time).
EXPOSE 8080

# Probe llama-server's built-in /health endpoint. start-period is long (120s)
# because loading the GGUF model into GPU memory can take a while.
# NOTE(review): assumes curl is present in the base image — confirm.
HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=5 \
  CMD curl -f http://localhost:8080/health || exit 1

# Default arguments appended to the base image's llama-server ENTRYPOINT
# (exec-form CMD so each flag is a separate argv entry and can be overridden
# at `docker run`). Model and mmproj GGUF files are mounted via volumes at /models.
CMD ["--model", "/models/llama-joycaption-beta-one-hf-llava.Q2_K.gguf", \
     "--mmproj", "/models/llama-joycaption-beta-one-llava-mmproj-model-f16.gguf", \
     "--host", "0.0.0.0", \
     "--port", "8080", \
     "--ctx-size", "4096", \
     "--n-gpu-layers", "999", \
     "--parallel", "1", \
     "--cont-batching"]