# JoyCaption GGUF Vision Model Server
# Uses llama.cpp's llama-server with CUDA support for multimodal inference
#
# The base image has llama-server as ENTRYPOINT, so we just pass arguments via CMD

FROM ghcr.io/ggml-org/llama.cpp:server-cuda

# Expose the server port
EXPOSE 8080

# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=5 \
    CMD curl -f http://localhost:8080/health || exit 1

# Pass arguments to the llama-server ENTRYPOINT
# Model and mmproj paths are mounted via volumes
CMD ["--model", "/models/llama-joycaption-beta-one-hf-llava.Q2_K.gguf", \
     "--mmproj", "/models/llama-joycaption-beta-one-llava-mmproj-model-f16.gguf", \
     "--host", "0.0.0.0", \
     "--port", "8080", \
     "--ctx-size", "4096", \
     "--n-gpu-layers", "999", \
     "--parallel", "1", \
     "--cont-batching"]
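
# Example usage (a sketch; the image tag "joycaption-server" and the host
# directory "./models" are assumptions, not defined by this Dockerfile --
# mount whatever directory holds the two GGUF files named in CMD above):
#
#   docker build -t joycaption-server .
#   docker run --rm --gpus all -p 8080:8080 \
#     -v "$(pwd)/models:/models" \
#     joycaption-server
#
# Once the healthcheck passes, llama-server answers OpenAI-compatible
# requests, e.g.:
#
#   curl http://localhost:8080/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"messages": [{"role": "user", "content": "Describe this image."}]}'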