5 changes: 3 additions & 2 deletions CLAUDE.md
@@ -1,7 +1,7 @@
# CLAUDE.md

## Overview
Olla is a high-performance proxy and load balancer for LLM infrastructure, written in Go. It intelligently routes requests across local and remote inference nodes (Ollama, LM Studio, OpenAI-compatible endpoints).
Olla is a high-performance proxy and load balancer for LLM infrastructure, written in Go. It intelligently routes requests across local and remote inference nodes (Ollama, LM Studio, LiteLLM, vLLM, OpenAI-compatible endpoints).

The project provides two proxy engines: Sherpa (simple, maintainable) and Olla (high-performance with advanced features).

@@ -22,6 +22,7 @@ olla/
│ ├── profiles/ # Provider-specific profiles
│ │ ├── ollama.yaml # Ollama configuration
│ │ ├── lmstudio.yaml # LM Studio configuration
│ │ ├── litellm.yaml # LiteLLM gateway configuration
│ │ ├── openai.yaml # OpenAI-compatible configuration
│ │ └── vllm.yaml # vLLM configuration
│ └── models.yaml # Model configurations
@@ -87,7 +88,7 @@ olla/
## Response Headers
- `X-Olla-Endpoint`: Backend name
- `X-Olla-Model`: Model used
- `X-Olla-Backend-Type`: ollama/openai/lmstudio/vllm
- `X-Olla-Backend-Type`: ollama/openai/lmstudio/vllm/litellm
- `X-Olla-Request-ID`: Request ID
- `X-Olla-Response-Time`: Total processing time
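
As a quick client-side sanity check of these headers (illustrative only; the listen address and route below are placeholders, not values confirmed by this PR), a minimal Go snippet:

```go
package main

import (
	"fmt"
	"net/http"
)

func main() {
	// Placeholder address/route: point this at your Olla instance and a proxied route.
	resp, err := http.Get("http://localhost:8080/v1/models")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// The headers documented above.
	for _, h := range []string{
		"X-Olla-Endpoint",
		"X-Olla-Model",
		"X-Olla-Backend-Type",
		"X-Olla-Request-ID",
		"X-Olla-Response-Time",
	} {
		fmt.Printf("%s: %s\n", h, resp.Header.Get(h))
	}
}
```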

4 changes: 2 additions & 2 deletions config/config.yaml
@@ -78,15 +78,15 @@ discovery:
        health_check_url: "/"
        check_interval: 2s    # How often to check when healthy
        check_timeout: 1s
      - url: "http://localhost:11234"
      - url: "http://localhost:1234"
        name: "local-lm-studio"
        type: "lm-studio"
        priority: 100
        model_url: "/v1/models"
        health_check_url: "/"
        check_interval: 2s
        check_timeout: 1s
      - url: "http://192.168.0.1:8000"
      - url: "http://localhost:8000"
        name: "local-vllm"
        type: "vllm"
        priority: 100
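
The `check_interval`/`check_timeout` pair above drives endpoint health checking. As a rough sketch of what that polling amounts to (illustrative only, not Olla's actual health checker), using the `local-lm-studio` values:

```go
package main

import (
	"context"
	"fmt"
	"net/http"
	"time"
)

// pollHealth polls an endpoint's health_check_url at check_interval,
// giving each probe check_timeout to respond. Illustrative only.
func pollHealth(baseURL, healthPath string, interval, timeout time.Duration) {
	for range time.Tick(interval) {
		ctx, cancel := context.WithTimeout(context.Background(), timeout)
		req, _ := http.NewRequestWithContext(ctx, http.MethodGet, baseURL+healthPath, nil)
		resp, err := http.DefaultClient.Do(req)
		healthy := err == nil && resp.StatusCode < 500
		if resp != nil {
			resp.Body.Close()
		}
		cancel()
		fmt.Println("healthy:", healthy)
	}
}

func main() {
	// Values taken from the local-lm-studio entry above.
	pollHealth("http://localhost:1234", "/", 2*time.Second, 1*time.Second)
}
```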
291 changes: 291 additions & 0 deletions config/profiles/litellm.yaml
@@ -0,0 +1,291 @@
# LiteLLM unified gateway profile
# Docs: https://litellm.ai/
# Revised 17-08-2025
# 17-08-2025 [TF]: Updated to match latest LiteLLM features and OpenAI compatibility
name: litellm
version: "1.0"
display_name: "LiteLLM"
description: "Unified gateway to 100+ LLM providers with automatic fallbacks and load balancing"

# Routing configuration
routing:
  prefixes:
    - litellm
    #- lite

# API compatibility
api:
  openai_compatible: true
  paths:
    # Core OpenAI-compatible endpoints (always available)
    - /                       # 0: root health check
    - /health                 # 1: health check
    - /v1/chat/completions    # 2: chat completions (primary)
    - /v1/completions         # 3: text completions
    - /v1/embeddings          # 4: embeddings
    - /v1/models              # 5: list models
    # Alternative paths (some deployments)
    - /chat/completions       # 6: chat completions (alt)
    - /completions            # 7: text completions (alt)
    - /embeddings             # 8: embeddings (alt)
    - /models                 # 9: list models (alt)
    # Health probes (Kubernetes deployments)
    # Unsure if these are needed in proxy mode, but included for completeness
    - /health/readiness       # 10: readiness probe
    - /health/liveness        # 11: liveness probe
    # Note: Management endpoints (/key/*, /user/*, /team/*, /spend/*)
    # require database backend and are not available in basic proxy mode

  model_discovery_path: /v1/models
  health_check_path: /health

# Platform characteristics
characteristics:
  timeout: 5m                     # Remote providers can be slow
  max_concurrent_requests: 100    # LiteLLM handles high concurrency well
  default_priority: 95            # High priority as a unified gateway
  streaming_support: true

# Detection hints for auto-discovery
detection:
  user_agent_patterns:
    - "litellm/"
  headers:
    - "X-LiteLLM-Version"
    - "X-LiteLLM-Provider"
  path_indicators:
    - "/v1/models"
    - "/health"
    - "/model/info"
    - "/key/generate"
  default_ports:
    - 4000    # LiteLLM proxy default
    - 8000    # Common alternative
    - 8080    # Another common port
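
As an illustration of how these hints might be applied during auto-discovery (the real detector lives in Olla's discovery code and may weigh things differently), a sketch that probes a default port and looks for the LiteLLM headers:

```go
package main

import (
	"fmt"
	"net/http"
)

// looksLikeLiteLLM checks a probe response against the header hints above.
// The scoring here is simplistic and purely illustrative.
func looksLikeLiteLLM(resp *http.Response) bool {
	for _, h := range []string{"X-LiteLLM-Version", "X-LiteLLM-Provider"} {
		if resp.Header.Get(h) != "" {
			return true
		}
	}
	return false
}

func main() {
	// 4000 is the first default_port listed in the profile.
	resp, err := http.Get("http://localhost:4000/health")
	if err != nil {
		fmt.Println("nothing answering on :4000")
		return
	}
	defer resp.Body.Close()
	fmt.Println("LiteLLM detected:", looksLikeLiteLLM(resp))
}
```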

# Model handling
models:
  name_format: "{{.Name}}"
  # LiteLLM uses provider-prefixed model names
  provider_prefixes:
    - "openai/"
    - "azure/"
    - "bedrock/"
    - "anthropic/"
    - "cohere/"
    - "together_ai/"
    - "replicate/"
    - "huggingface/"
    - "vertex_ai/"
    - "palm/"
    - "gemini/"
    - "groq/"
    - "mistral/"
    - "deepinfra/"
    - "perplexity/"
    - "anyscale/"
    - "cloudflare/"
    - "voyage/"
    - "databricks/"
    - "ai21/"
    - "nlp_cloud/"
    - "aleph_alpha/"
    - "baseten/"
    - "openrouter/"
    - "custom/"    # Custom endpoints
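
To show what these prefixed names look like in practice (not Olla's internal handling, just an illustration), splitting a LiteLLM-style model ID into provider and model:

```go
package main

import (
	"fmt"
	"strings"
)

// splitProviderModel separates a LiteLLM-style "provider/model" name.
// Names without a prefix are returned with an empty provider.
func splitProviderModel(name string) (provider, model string) {
	if p, m, ok := strings.Cut(name, "/"); ok {
		return p, m
	}
	return "", name
}

func main() {
	for _, n := range []string{"openai/gpt-4o", "anthropic/claude-3-opus", "gpt-3.5-turbo"} {
		p, m := splitProviderModel(n)
		fmt.Printf("provider=%q model=%q\n", p, m)
	}
}
```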

  capability_patterns:
    chat:
      # OpenAI models
      - "gpt-5*"
      - "gpt-4*"
      - "gpt-3.5*"
      - "chatgpt*"
      # Anthropic models
      - "claude-*"
      - "anthropic/*"
      # Google models
      - "gemini*"
      - "palm*"
      - "chat-bison*"
      # Open models
      - "llama*"
      - "mistral*"
      - "mixtral*"
      - "vicuna*"
      - "alpaca*"
      - "wizardlm*"
      - "mpt*"
      - "falcon*"
      - "starchat*"
      # Cohere models
      - "command*"
      # Provider-prefixed
      - "*/gpt-*"
      - "*/claude-*"
      - "*/llama*"
      - "*/mistral*"

    embeddings:
      - "*embedding*"
      - "voyage-*"
      - "embed-*"
      - "text-embedding-*"
      - "*/embedding*"
      - "cohere/embed-*"
      - "openai/text-embedding-*"
      - "bedrock/amazon.titan-embed*"

    vision:
      - "gpt-5-vision*"
      - "gpt-5-turbo*"
      - "gpt-4-vision*"
      - "gpt-4-turbo*"
      - "claude-3-*"
      - "claude-4-*"
      - "gemini-*vision*"
      - "gemini-*pro*"
      - "llava*"
      - "bakllava*"
      - "*/vision*"
      - "anthropic/claude-3-*"
      - "anthropic/claude-4-*"

    code:
      - "*code*"
      - "codellama*"
      - "deepseek-coder*"
      - "starcoder*"
      - "codegen*"
      - "replit*"
      - "wizardcoder*"
      - "phind*"
      - "*/code*"

    function_calling:
      - "gpt-5*"
      - "gpt-4*"
      - "gpt-3.5-turbo*"
      - "claude-3-*"
      - "claude-4-*"
      - "mistral-large*"
      - "mixtral*"
      - "gemini*"
      - "*/function*"

  # Context window detection patterns (LiteLLM handles many model variants)
  context_patterns:
    - pattern: "*-128k*"
      context: 131072
    - pattern: "*-100k*"
      context: 102400
    - pattern: "*-64k*"
      context: 65536
    - pattern: "*-32k*"
      context: 32768
    - pattern: "*-16k*"
      context: 16384
    - pattern: "*-8k*"
      context: 8192
    - pattern: "*-4k*"
      context: 4096
    - pattern: "gpt-4-turbo*"
      context: 128000
    - pattern: "gpt-4-32k*"
      context: 32768
    - pattern: "gpt-4*"
      context: 8192
    - pattern: "claude-3-opus*"
      context: 200000
    - pattern: "claude-3-sonnet*"
      context: 200000
    - pattern: "claude-3-haiku*"
      context: 200000
    - pattern: "claude-2*"
      context: 100000
    - pattern: "gemini-1.5-pro*"
      context: 1048576    # 1M context
    - pattern: "gemini-1.5-flash*"
      context: 1048576    # 1M context
    - pattern: "mistral-large*"
      context: 32768
    - pattern: "mixtral*"
      context: 32768
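
These patterns (and the capability_patterns above) read as simple wildcard globs matched against the model name, with the first match presumably winning; that ordering is an assumption, as is the matcher below, which is not Olla's implementation:

```go
package main

import (
	"fmt"
	"regexp"
	"strings"
)

type contextPattern struct {
	pattern string
	context int
}

// globMatch treats '*' as "any run of characters" and everything else literally,
// which is how the profile's patterns read; Olla's matcher may differ in detail.
func globMatch(glob, name string) bool {
	parts := strings.Split(glob, "*")
	for i, p := range parts {
		parts[i] = regexp.QuoteMeta(p)
	}
	re := regexp.MustCompile("(?i)^" + strings.Join(parts, ".*") + "$")
	return re.MatchString(name)
}

// contextWindow returns the first matching pattern's context size.
func contextWindow(model string, patterns []contextPattern) (int, bool) {
	for _, cp := range patterns {
		if globMatch(cp.pattern, model) {
			return cp.context, true
		}
	}
	return 0, false
}

func main() {
	patterns := []contextPattern{
		{"*-128k*", 131072},
		{"gpt-4-turbo*", 128000},
		{"claude-3-opus*", 200000},
		{"gemini-1.5-pro*", 1048576},
	}
	for _, m := range []string{"gpt-4-turbo-2024-04-09", "claude-3-opus-20240229", "mymodel-128k-instruct"} {
		if ctx, ok := contextWindow(m, patterns); ok {
			fmt.Printf("%s -> %d\n", m, ctx)
		}
	}
}
```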

# Request/response handling
request:
  model_field_paths:
    - "model"
  response_format: "openai"    # LiteLLM uses OpenAI-compatible format
  parsing_rules:
    chat_completions_path: "/v1/chat/completions"
    completions_path: "/v1/completions"
    embeddings_path: "/v1/embeddings"
    model_field_name: "model"
    supports_streaming: true

# Path indices for specific functions
path_indices:
  health: 1
  chat_completions: 2
  completions: 3
  embeddings: 4
  models: 5
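
These indices appear to point into the api.paths list above (the numbered comments there), letting well-known operations be resolved without string matching. A tiny sketch under that assumption:

```go
package main

import "fmt"

func main() {
	// Mirror of api.paths from this profile (positions match the numbered comments there).
	paths := []string{
		"/", "/health", "/v1/chat/completions", "/v1/completions",
		"/v1/embeddings", "/v1/models", "/chat/completions", "/completions",
		"/embeddings", "/models", "/health/readiness", "/health/liveness",
	}
	pathIndices := map[string]int{
		"health": 1, "chat_completions": 2, "completions": 3,
		"embeddings": 4, "models": 5,
	}
	fmt.Println("chat endpoint:", paths[pathIndices["chat_completions"]])
}
```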

# Resource management
resources:
  # LiteLLM proxy itself is lightweight - actual models run elsewhere
  defaults:
    min_memory_gb: 0.5
    recommended_memory_gb: 1
    requires_gpu: false
    estimated_load_time_ms: 100

  # Concurrency is handled by remote providers
  concurrency_limits:
    - min_memory_gb: 0
      max_concurrent: 1000    # LiteLLM can handle many concurrent requests

  # Basic timeout configuration
  timeout_scaling:
    base_timeout_seconds: 60
    load_time_buffer: false    # No model loading for proxy

# Metrics extraction for LiteLLM responses
# 18-08-2025 [TF]: These are based on standard OpenAI response formats
metrics:
  extraction:
    enabled: true
    source: "response_body"
    format: "json"

    # LiteLLM returns standard OpenAI format JSON responses
    paths:
      # Basic response fields
      request_id: "$.id"
      model: "$.model"
      created: "$.created"
      object_type: "$.object"

      # Completion status - finish_reason can be: stop, length, function_call, content_filter, null (streaming)
      finish_reason: "$.choices[0].finish_reason"

      # Token usage (always present in non-streaming responses)
      input_tokens: "$.usage.prompt_tokens"
      output_tokens: "$.usage.completion_tokens"
      total_tokens: "$.usage.total_tokens"

      # Cache tokens (present when caching is enabled)
      cache_read_tokens: "$.usage.cache_read_input_tokens"
      cache_creation_tokens: "$.usage.cache_creation_input_tokens"

    calculations:
      # Response is complete when finish_reason is present and not null
      # Valid completion reasons: stop (normal), length (max tokens), function_call, content_filter
      is_complete: 'finish_reason != null && finish_reason != ""'

      # Check if response was from cache (when cache tokens are present and > 0)
      is_cached: 'cache_read_tokens != null && cache_read_tokens > 0'

      # Calculate actual new tokens (total minus cached)
      new_tokens: 'cache_read_tokens != null ? total_tokens - cache_read_tokens : total_tokens'
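
To make the extraction and calculation rules concrete, here is a standalone sketch of what they pull out of an OpenAI-format response body. It uses plain struct decoding rather than Olla's actual JSONPath evaluator, and the sample payload is invented:

```go
package main

import (
	"encoding/json"
	"fmt"
)

type usage struct {
	PromptTokens         int  `json:"prompt_tokens"`
	CompletionTokens     int  `json:"completion_tokens"`
	TotalTokens          int  `json:"total_tokens"`
	CacheReadInputTokens *int `json:"cache_read_input_tokens"`
}

type chatResponse struct {
	ID      string `json:"id"`
	Model   string `json:"model"`
	Choices []struct {
		FinishReason *string `json:"finish_reason"`
	} `json:"choices"`
	Usage usage `json:"usage"`
}

func main() {
	body := []byte(`{"id":"chatcmpl-123","model":"openai/gpt-4o","object":"chat.completion",
		"choices":[{"finish_reason":"stop"}],
		"usage":{"prompt_tokens":25,"completion_tokens":50,"total_tokens":75}}`)

	var r chatResponse
	if err := json.Unmarshal(body, &r); err != nil {
		panic(err)
	}

	// is_complete: finish_reason present and non-empty.
	isComplete := len(r.Choices) > 0 && r.Choices[0].FinishReason != nil && *r.Choices[0].FinishReason != ""

	// is_cached / new_tokens: only meaningful when cache tokens are reported.
	isCached := r.Usage.CacheReadInputTokens != nil && *r.Usage.CacheReadInputTokens > 0
	newTokens := r.Usage.TotalTokens
	if r.Usage.CacheReadInputTokens != nil {
		newTokens -= *r.Usage.CacheReadInputTokens
	}

	fmt.Printf("model=%s complete=%v cached=%v new_tokens=%d\n", r.Model, isComplete, isCached, newTokens)
}
```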