- Use a macOS or x86/NVIDIA-based machine with enough GPU memory to run the models.
- Download and install Ollama and Docker
- Pull the model for Ollama to serve (the example below pulls deepseek-r1; llama3.2:1b or another supported model also works)
curl http://localhost:11434/api/pull -d '{
  "model": "deepseek-r1"
}'
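If the Ollama CLI is installed, the same model can be pulled directly from the terminal instead of calling the REST API:
# pull the model via the Ollama CLI
ollama pull deepseek-r1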
- Verify that Ollama is serving the model
curl http://localhost:11434/api/chat -d '{
  "model": "deepseek-r1",
  "messages": [
    {
      "role": "user",
      "content": "why is the sky blue?"
    }
  ],
  "stream": false
}'
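To double-check which models Ollama has available locally, query the tags endpoint:
# list locally available models
curl http://localhost:11434/api/tags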
- Launch the application with the sample database (PostgreSQL)
docker compose -f docker-compose.local.yml build
docker compose -f docker-compose.local.yml up -d
# Check logs
docker compose -f docker-compose.local.yml logs -f
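To confirm all services started, list the containers for this compose project:
docker compose -f docker-compose.local.yml ps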
- Access the application at
http://localhost:8501
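As an optional sanity check from the command line, probe the UI port (an HTTP 200 response indicates the Streamlit server is up):
curl -I http://localhost:8501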
- Set up the DB configuration as shown below

- Set up the model configuration as shown below (use your computer's IP address for LLM_ENDPOINT; commands to look it up follow)

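To find the IP address to use for LLM_ENDPOINT (interface names such as en0 vary by machine; adjust accordingly):
# macOS: IP of the primary interface (often en0 for Wi-Fi)
ipconfig getifaddr en0
# Linux: all assigned addresses
hostname -I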
- Run the application and model on NVIDIA GPUs (A100, H100)
export HF_TOKEN=<YOUR TOKEN>
docker compose -f docker-compose.tgi.yml build
docker compose -f docker-compose.tgi.yml up -d
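Once the TGI container is up, its health endpoint can be polled to confirm the model has finished loading; this assumes the compose file publishes TGI on port 8000, as in the docker run example further below:
# returns HTTP 200 once the model is loaded and ready to serve
curl http://localhost:8000/health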
- defog/llama-3-sqlcoder-8b
- meta-llama/Llama-3.2-1B-Instruct
- microsoft/Phi-3.5-mini-instruct
- google/gemma-2-2b-it
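Pick one of the model IDs listed above and export it so the docker run command below can use it (the SQL-focused model is chosen here purely as an example):
# choose any of the supported model IDs
export MODEL=defog/llama-3-sqlcoder-8b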
# Deploy with Docker on Linux:
docker run --gpus all \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  -e HF_TOKEN=$HF_TOKEN \
  -p 8000:80 \
  ghcr.io/huggingface/text-generation-inference:latest \
  --model-id $MODEL
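After the container starts, a small request against TGI's generate endpoint verifies end-to-end inference (the prompt and parameters below are only an example):
curl http://localhost:8000/generate \
  -X POST \
  -H 'Content-Type: application/json' \
  -d '{"inputs": "List three SQL aggregate functions.", "parameters": {"max_new_tokens": 50}}'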