5 changes: 3 additions & 2 deletions CLAUDE.md
@@ -1,7 +1,7 @@
# CLAUDE.md

## Overview
Olla is a high-performance proxy and load balancer for LLM infrastructure, written in Go. It intelligently routes requests across local and remote inference nodes (Ollama, LM Studio, OpenAI-compatible endpoints).
Olla is a high-performance proxy and load balancer for LLM infrastructure, written in Go. It intelligently routes requests across local and remote inference nodes (Ollama, LM Studio, LiteLLM, vLLM, OpenAI-compatible endpoints).

The project provides two proxy engines: Sherpa (simple, maintainable) and Olla (high-performance with advanced features).

@@ -22,6 +22,7 @@ olla/
│ ├── profiles/ # Provider-specific profiles
│ │ ├── ollama.yaml # Ollama configuration
│ │ ├── lmstudio.yaml # LM Studio configuration
│ │ ├── litellm.yaml # LiteLLM gateway configuration
│ │ ├── openai.yaml # OpenAI-compatible configuration
│ │ └── vllm.yaml # vLLM configuration
│ └── models.yaml # Model configurations
@@ -87,7 +88,7 @@ olla/
## Response Headers
- `X-Olla-Endpoint`: Backend name
- `X-Olla-Model`: Model used
- `X-Olla-Backend-Type`: ollama/openai/lmstudio/vllm
- `X-Olla-Backend-Type`: ollama/openai/lmstudio/vllm/litellm
- `X-Olla-Request-ID`: Request ID
- `X-Olla-Response-Time`: Total processing time
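
As a quick client-side sanity check of these headers (illustrative only; the listen address and route below are placeholders, not values confirmed by this PR), a minimal Go snippet:

```go
package main

import (
	"fmt"
	"net/http"
)

func main() {
	// Placeholder address/route: point this at your Olla instance and a proxied route.
	resp, err := http.Get("http://localhost:8080/v1/models")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// The headers documented above.
	for _, h := range []string{
		"X-Olla-Endpoint",
		"X-Olla-Model",
		"X-Olla-Backend-Type",
		"X-Olla-Request-ID",
		"X-Olla-Response-Time",
	} {
		fmt.Printf("%s: %s\n", h, resp.Header.Get(h))
	}
}
```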

4 changes: 2 additions & 2 deletions config/config.yaml
@@ -78,15 +78,15 @@ discovery:
        health_check_url: "/"
        check_interval: 2s    # How often to check when healthy
        check_timeout: 1s
      - url: "http://localhost:11234"
      - url: "http://localhost:1234"
        name: "local-lm-studio"
        type: "lm-studio"
        priority: 100
        model_url: "/v1/models"
        health_check_url: "/"
        check_interval: 2s
        check_timeout: 1s
      - url: "http://192.168.0.1:8000"
      - url: "http://localhost:8000"
        name: "local-vllm"
        type: "vllm"
        priority: 100
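
The `check_interval`/`check_timeout` pair above drives endpoint health checking. As a rough sketch of what that polling amounts to (illustrative only, not Olla's actual health checker), using the `local-lm-studio` values:

```go
package main

import (
	"context"
	"fmt"
	"net/http"
	"time"
)

// pollHealth polls an endpoint's health_check_url at check_interval,
// giving each probe check_timeout to respond. Illustrative only.
func pollHealth(baseURL, healthPath string, interval, timeout time.Duration) {
	for range time.Tick(interval) {
		ctx, cancel := context.WithTimeout(context.Background(), timeout)
		req, _ := http.NewRequestWithContext(ctx, http.MethodGet, baseURL+healthPath, nil)
		resp, err := http.DefaultClient.Do(req)
		healthy := err == nil && resp.StatusCode < 500
		if resp != nil {
			resp.Body.Close()
		}
		cancel()
		fmt.Println("healthy:", healthy)
	}
}

func main() {
	// Values taken from the local-lm-studio entry above.
	pollHealth("http://localhost:1234", "/", 2*time.Second, 1*time.Second)
}
```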
291 changes: 291 additions & 0 deletions config/profiles/litellm.yaml
@@ -0,0 +1,291 @@
# LiteLLM unified gateway profile
# Docs: https://litellm.ai/
# Revised 17-08-2025
# 17-08-2025 [TF]: Updated to match latest LiteLLM features and OpenAI compatibility
name: litellm
version: "1.0"
display_name: "LiteLLM"
description: "Unified gateway to 100+ LLM providers with automatic fallbacks and load balancing"

# Routing configuration
routing:
  prefixes:
    - litellm
    #- lite

# API compatibility
api:
  openai_compatible: true
  paths:
    # Core OpenAI-compatible endpoints (always available)
    - /                       # 0: root health check
    - /health                 # 1: health check
    - /v1/chat/completions    # 2: chat completions (primary)
    - /v1/completions         # 3: text completions
    - /v1/embeddings          # 4: embeddings
    - /v1/models              # 5: list models
    # Alternative paths (some deployments)
    - /chat/completions       # 6: chat completions (alt)
    - /completions            # 7: text completions (alt)
    - /embeddings             # 8: embeddings (alt)
    - /models                 # 9: list models (alt)
    # Health probes (Kubernetes deployments)
    # Unsure if these are needed in proxy mode, but included for completeness
    - /health/readiness       # 10: readiness probe
    - /health/liveness        # 11: liveness probe
    # Note: Management endpoints (/key/*, /user/*, /team/*, /spend/*)
    # require database backend and are not available in basic proxy mode

  model_discovery_path: /v1/models
  health_check_path: /health

# Platform characteristics
characteristics:
  timeout: 5m                     # Remote providers can be slow
  max_concurrent_requests: 100    # LiteLLM handles high concurrency well
  default_priority: 95            # High priority as a unified gateway
  streaming_support: true

# Detection hints for auto-discovery
detection:
  user_agent_patterns:
    - "litellm/"
  headers:
    - "X-LiteLLM-Version"
    - "X-LiteLLM-Provider"
  path_indicators:
    - "/v1/models"
    - "/health"
    - "/model/info"
    - "/key/generate"
  default_ports:
    - 4000    # LiteLLM proxy default
    - 8000    # Common alternative
    - 8080    # Another common port
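
As an illustration of how these hints might be applied during auto-discovery (the real detector lives in Olla's discovery code and may weigh things differently), a sketch that probes a default port and looks for the LiteLLM headers:

```go
package main

import (
	"fmt"
	"net/http"
)

// looksLikeLiteLLM checks a probe response against the header hints above.
// The scoring here is simplistic and purely illustrative.
func looksLikeLiteLLM(resp *http.Response) bool {
	for _, h := range []string{"X-LiteLLM-Version", "X-LiteLLM-Provider"} {
		if resp.Header.Get(h) != "" {
			return true
		}
	}
	return false
}

func main() {
	// 4000 is the first default_port listed in the profile.
	resp, err := http.Get("http://localhost:4000/health")
	if err != nil {
		fmt.Println("nothing answering on :4000")
		return
	}
	defer resp.Body.Close()
	fmt.Println("LiteLLM detected:", looksLikeLiteLLM(resp))
}
```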

# Model handling
models:
  name_format: "{{.Name}}"
  # LiteLLM uses provider-prefixed model names
  provider_prefixes:
    - "openai/"
    - "azure/"
    - "bedrock/"
    - "anthropic/"
    - "cohere/"
    - "together_ai/"
    - "replicate/"
    - "huggingface/"
    - "vertex_ai/"
    - "palm/"
    - "gemini/"
    - "groq/"
    - "mistral/"
    - "deepinfra/"
    - "perplexity/"
    - "anyscale/"
    - "cloudflare/"
    - "voyage/"
    - "databricks/"
    - "ai21/"
    - "nlp_cloud/"
    - "aleph_alpha/"
    - "baseten/"
    - "openrouter/"
    - "custom/"    # Custom endpoints
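
To show what these prefixed names look like in practice (not Olla's internal handling, just an illustration), splitting a LiteLLM-style model ID into provider and model:

```go
package main

import (
	"fmt"
	"strings"
)

// splitProviderModel separates a LiteLLM-style "provider/model" name.
// Names without a prefix are returned with an empty provider.
func splitProviderModel(name string) (provider, model string) {
	if p, m, ok := strings.Cut(name, "/"); ok {
		return p, m
	}
	return "", name
}

func main() {
	for _, n := range []string{"openai/gpt-4o", "anthropic/claude-3-opus", "gpt-3.5-turbo"} {
		p, m := splitProviderModel(n)
		fmt.Printf("provider=%q model=%q\n", p, m)
	}
}
```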

  capability_patterns:
    chat:
      # OpenAI models
      - "gpt-5*"
      - "gpt-4*"
      - "gpt-3.5*"
      - "chatgpt*"
      # Anthropic models
      - "claude-*"
      - "anthropic/*"
      # Google models
      - "gemini*"
      - "palm*"
      - "chat-bison*"
      # Open models
      - "llama*"
      - "mistral*"
      - "mixtral*"
      - "vicuna*"
      - "alpaca*"
      - "wizardlm*"
      - "mpt*"
      - "falcon*"
      - "starchat*"
      # Cohere models
      - "command*"
      # Provider-prefixed
      - "*/gpt-*"
      - "*/claude-*"
      - "*/llama*"
      - "*/mistral*"

    embeddings:
      - "*embedding*"
      - "voyage-*"
      - "embed-*"
      - "text-embedding-*"
      - "*/embedding*"
      - "cohere/embed-*"
      - "openai/text-embedding-*"
      - "bedrock/amazon.titan-embed*"

    vision:
      - "gpt-5-vision*"
      - "gpt-5-turbo*"
      - "gpt-4-vision*"
      - "gpt-4-turbo*"
      - "claude-3-*"
      - "claude-4-*"
      - "gemini-*vision*"
      - "gemini-*pro*"
      - "llava*"
      - "bakllava*"
      - "*/vision*"
      - "anthropic/claude-3-*"
      - "anthropic/claude-4-*"

    code:
      - "*code*"
      - "codellama*"
      - "deepseek-coder*"
      - "starcoder*"
      - "codegen*"
      - "replit*"
      - "wizardcoder*"
      - "phind*"
      - "*/code*"

    function_calling:
      - "gpt-5*"
      - "gpt-4*"
      - "gpt-3.5-turbo*"
      - "claude-3-*"
      - "claude-4-*"
      - "mistral-large*"
      - "mixtral*"
      - "gemini*"
      - "*/function*"

  # Context window detection patterns (LiteLLM handles many model variants)
  context_patterns:
    - pattern: "*-128k*"
      context: 131072
    - pattern: "*-100k*"
      context: 102400
    - pattern: "*-64k*"
      context: 65536
    - pattern: "*-32k*"
      context: 32768
    - pattern: "*-16k*"
      context: 16384
    - pattern: "*-8k*"
      context: 8192
    - pattern: "*-4k*"
      context: 4096
    - pattern: "gpt-4-turbo*"
      context: 128000
    - pattern: "gpt-4-32k*"
      context: 32768
    - pattern: "gpt-4*"
      context: 8192
    - pattern: "claude-3-opus*"
      context: 200000
    - pattern: "claude-3-sonnet*"
      context: 200000
    - pattern: "claude-3-haiku*"
      context: 200000
    - pattern: "claude-2*"
      context: 100000
    - pattern: "gemini-1.5-pro*"
      context: 1048576    # 1M context
    - pattern: "gemini-1.5-flash*"
      context: 1048576    # 1M context
    - pattern: "mistral-large*"
      context: 32768
    - pattern: "mixtral*"
      context: 32768
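
These patterns (and the capability_patterns above) read as simple wildcard globs matched against the model name, with the first match presumably winning; that ordering is an assumption, as is the matcher below, which is not Olla's implementation:

```go
package main

import (
	"fmt"
	"regexp"
	"strings"
)

type contextPattern struct {
	pattern string
	context int
}

// globMatch treats '*' as "any run of characters" and everything else literally,
// which is how the profile's patterns read; Olla's matcher may differ in detail.
func globMatch(glob, name string) bool {
	parts := strings.Split(glob, "*")
	for i, p := range parts {
		parts[i] = regexp.QuoteMeta(p)
	}
	re := regexp.MustCompile("(?i)^" + strings.Join(parts, ".*") + "$")
	return re.MatchString(name)
}

// contextWindow returns the first matching pattern's context size.
func contextWindow(model string, patterns []contextPattern) (int, bool) {
	for _, cp := range patterns {
		if globMatch(cp.pattern, model) {
			return cp.context, true
		}
	}
	return 0, false
}

func main() {
	patterns := []contextPattern{
		{"*-128k*", 131072},
		{"gpt-4-turbo*", 128000},
		{"claude-3-opus*", 200000},
		{"gemini-1.5-pro*", 1048576},
	}
	for _, m := range []string{"gpt-4-turbo-2024-04-09", "claude-3-opus-20240229", "mymodel-128k-instruct"} {
		if ctx, ok := contextWindow(m, patterns); ok {
			fmt.Printf("%s -> %d\n", m, ctx)
		}
	}
}
```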

# Request/response handling
request:
  model_field_paths:
    - "model"
  response_format: "openai"    # LiteLLM uses OpenAI-compatible format
  parsing_rules:
    chat_completions_path: "/v1/chat/completions"
    completions_path: "/v1/completions"
    embeddings_path: "/v1/embeddings"
    model_field_name: "model"
    supports_streaming: true

# Path indices for specific functions
path_indices:
  health: 1
  chat_completions: 2
  completions: 3
  embeddings: 4
  models: 5
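
These indices appear to point into the api.paths list above (the numbered comments there), letting well-known operations be resolved without string matching. A tiny sketch under that assumption:

```go
package main

import "fmt"

func main() {
	// Mirror of api.paths from this profile (positions match the numbered comments there).
	paths := []string{
		"/", "/health", "/v1/chat/completions", "/v1/completions",
		"/v1/embeddings", "/v1/models", "/chat/completions", "/completions",
		"/embeddings", "/models", "/health/readiness", "/health/liveness",
	}
	pathIndices := map[string]int{
		"health": 1, "chat_completions": 2, "completions": 3,
		"embeddings": 4, "models": 5,
	}
	fmt.Println("chat endpoint:", paths[pathIndices["chat_completions"]])
}
```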

# Resource management
resources:
  # LiteLLM proxy itself is lightweight - actual models run elsewhere
  defaults:
    min_memory_gb: 0.5
    recommended_memory_gb: 1
    requires_gpu: false
    estimated_load_time_ms: 100

  # Concurrency is handled by remote providers
  concurrency_limits:
    - min_memory_gb: 0
      max_concurrent: 1000    # LiteLLM can handle many concurrent requests

  # Basic timeout configuration
  timeout_scaling:
    base_timeout_seconds: 60
    load_time_buffer: false    # No model loading for proxy

# Metrics extraction for LiteLLM responses
# 18-08-2025 [TF]: These are based on standard OpenAI response formats
metrics:
  extraction:
    enabled: true
    source: "response_body"
    format: "json"

    # LiteLLM returns standard OpenAI format JSON responses
    paths:
      # Basic response fields
      request_id: "$.id"
      model: "$.model"
      created: "$.created"
      object_type: "$.object"

      # Completion status - finish_reason can be: stop, length, function_call, content_filter, null (streaming)
      finish_reason: "$.choices[0].finish_reason"

      # Token usage (always present in non-streaming responses)
      input_tokens: "$.usage.prompt_tokens"
      output_tokens: "$.usage.completion_tokens"
      total_tokens: "$.usage.total_tokens"

      # Cache tokens (present when caching is enabled)
      cache_read_tokens: "$.usage.cache_read_input_tokens"
      cache_creation_tokens: "$.usage.cache_creation_input_tokens"

    calculations:
      # Response is complete when finish_reason is present and not null
      # Valid completion reasons: stop (normal), length (max tokens), function_call, content_filter
      is_complete: 'finish_reason != null && finish_reason != ""'

      # Check if response was from cache (when cache tokens are present and > 0)
      is_cached: 'cache_read_tokens != null && cache_read_tokens > 0'

      # Calculate actual new tokens (total minus cached)
      new_tokens: 'cache_read_tokens != null ? total_tokens - cache_read_tokens : total_tokens'
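
To make the extraction and calculation rules concrete, here is a standalone sketch of what they pull out of an OpenAI-format response body. It uses plain struct decoding rather than Olla's actual JSONPath evaluator, and the sample payload is invented:

```go
package main

import (
	"encoding/json"
	"fmt"
)

type usage struct {
	PromptTokens         int  `json:"prompt_tokens"`
	CompletionTokens     int  `json:"completion_tokens"`
	TotalTokens          int  `json:"total_tokens"`
	CacheReadInputTokens *int `json:"cache_read_input_tokens"`
}

type chatResponse struct {
	ID      string `json:"id"`
	Model   string `json:"model"`
	Choices []struct {
		FinishReason *string `json:"finish_reason"`
	} `json:"choices"`
	Usage usage `json:"usage"`
}

func main() {
	body := []byte(`{"id":"chatcmpl-123","model":"openai/gpt-4o","object":"chat.completion",
		"choices":[{"finish_reason":"stop"}],
		"usage":{"prompt_tokens":25,"completion_tokens":50,"total_tokens":75}}`)

	var r chatResponse
	if err := json.Unmarshal(body, &r); err != nil {
		panic(err)
	}

	// is_complete: finish_reason present and non-empty.
	isComplete := len(r.Choices) > 0 && r.Choices[0].FinishReason != nil && *r.Choices[0].FinishReason != ""

	// is_cached / new_tokens: only meaningful when cache tokens are reported.
	isCached := r.Usage.CacheReadInputTokens != nil && *r.Usage.CacheReadInputTokens > 0
	newTokens := r.Usage.TotalTokens
	if r.Usage.CacheReadInputTokens != nil {
		newTokens -= *r.Usage.CacheReadInputTokens
	}

	fmt.Printf("model=%s complete=%v cached=%v new_tokens=%d\n", r.Model, isComplete, isCached, newTokens)
}
```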