# config.yaml

# DeepSeek V3 (671B), loaded from the deepseek-ai/DeepSeek-V3 checkpoint.
"deepseek-v3-671b":
  metadata:
    provider: DeepSeek
    description: DeepSeek V3 671B
  service_config:
    name: bentovllm-deepseek-v3-671b-service
    resources:
      # GPU count matches engine_config.tensor_parallel_size.
      gpu_type: nvidia-a100-80gb
      gpu: 16
    traffic:
      timeout: 300
    envs:
      # Declared by name only; no value is set in this file.
      - name: HF_TOKEN
  engine_config:
    model: deepseek-ai/DeepSeek-V3
    tensor_parallel_size: 16
    max_model_len: 8192
# DeepSeek R1 (671B), flagged as a reasoning model.
"deepseek-r1-671b":
  metadata:
    provider: DeepSeek
    description: DeepSeek R1 671B
  reasoning: true
  service_config:
    name: bentovllm-deepseek-r1-671b-service
    resources:
      # GPU count matches engine_config.tensor_parallel_size.
      gpu_type: nvidia-a100-80gb
      gpu: 16
    traffic:
      timeout: 300
    envs:
      # Declared by name only; no value is set in this file.
      - name: HF_TOKEN
  engine_config:
    model: deepseek-ai/DeepSeek-R1
    trust_remote_code: true
    max_model_len: 8192
    tensor_parallel_size: 16
  server_config:
    reasoning_parser: 'deepseek_r1'
    # Only this entry sets enable_reasoning explicitly.
    enable_reasoning: true
# DeepSeek R1 distilled into Llama 3.3 70B (unquantized), split over 2 GPUs.
"deepseek-r1-distill-llama3.3-70b":
  metadata:
    provider: DeepSeek
    description: DeepSeek R1 Distill Llama 3.3 70B
  reasoning: true
  service_config:
    name: bentovllm-r1-llama3.3-70b-service
    resources:
      gpu_type: nvidia-a100-80gb
      gpu: 2
    traffic:
      timeout: 300
    envs:
      # Declared by name only; no value is set in this file.
      - name: HF_TOKEN
  engine_config:
    model: deepseek-ai/DeepSeek-R1-Distill-Llama-70B
    max_model_len: 8192
    tensor_parallel_size: 2
  server_config:
    tool_call_parser: 'llama3_json'
    reasoning_parser: 'deepseek_r1'
# NeuralMagic w8a8-quantized build of the R1 Llama 70B distill, on one GPU.
"deepseek-r1-distill-llama3.3-70b-w8a8":
  metadata:
    provider: NeuralMagic
    description: DeepSeek R1 Distill Llama 3.3 70B w8a8 GPTQ
  reasoning: true
  service_config:
    name: bentovllm-r1-llama3.3-70b-w8a8-service
    resources:
      gpu_type: nvidia-a100-80gb
      gpu: 1
    traffic:
      timeout: 300
  engine_config:
    model: neuralmagic/DeepSeek-R1-Distill-Llama-70B-quantized.w8a8
    max_model_len: 4096
  server_config:
    tool_call_parser: 'llama3_json'
    reasoning_parser: 'deepseek_r1'
# NeuralMagic w4a16-quantized build of the R1 Llama 70B distill, on one GPU.
"deepseek-r1-distill-llama3.3-70b-w4a16":
  metadata:
    provider: NeuralMagic
    description: DeepSeek R1 Distill Llama 3.3 70B w4a16 GPTQ
  reasoning: true
  service_config:
    name: bentovllm-r1-llama3.3-70b-w4a16-service
    resources:
      gpu_type: nvidia-a100-80gb
      gpu: 1
    traffic:
      timeout: 300
  engine_config:
    model: neuralmagic/DeepSeek-R1-Distill-Llama-70B-quantized.w4a16
    max_model_len: 8192
  server_config:
    # NOTE(review): the unquantized and w8a8 entries for this model also set
    # tool_call_parser: "llama3_json"; confirm its absence here is intended.
    reasoning_parser: 'deepseek_r1'
# DeepSeek R1 distilled into Qwen 2.5 32B (unquantized), on one GPU.
"deepseek-r1-distill-qwen2.5-32b":
  metadata:
    provider: DeepSeek
    description: DeepSeek R1 Distill Qwen 2.5 32B
  reasoning: true
  service_config:
    name: bentovllm-r1-qwen2.5-32b-service
    traffic:
      timeout: 300
    resources:
      gpu_type: nvidia-a100-80gb
      gpu: 1
    envs:
      # Declared by name only; no value is set in this file.
      - name: HF_TOKEN
  engine_config:
    model: deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
    max_model_len: 8192
  server_config:
    # NOTE(review): the other Qwen-based distill entries set no
    # tool_call_parser; confirm "llama3_json" is intended for this model.
    tool_call_parser: 'llama3_json'
    reasoning_parser: 'deepseek_r1'
# NeuralMagic w8a8-quantized build of the R1 Qwen 32B distill.
"deepseek-r1-distill-qwen2.5-32b-w8a8":
  metadata:
    provider: NeuralMagic
    description: DeepSeek R1 Distill Qwen 2.5 32B w8a8 GPTQ
  reasoning: true
  service_config:
    name: bentovllm-r1-qwen2.5-32b-w8a8-service
    resources:
      gpu_type: nvidia-a100-80gb
      gpu: 1
    traffic:
      timeout: 300
  engine_config:
    max_model_len: 4096
    model: neuralmagic/DeepSeek-R1-Distill-Qwen-32B-quantized.w8a8
  server_config:
    reasoning_parser: 'deepseek_r1'
# NeuralMagic w4a16-quantized build of the R1 Qwen 32B distill.
"deepseek-r1-distill-qwen2.5-32b-w4a16":
  metadata:
    provider: NeuralMagic
    description: DeepSeek R1 Distill Qwen 2.5 32B w4a16 GPTQ
  reasoning: true
  service_config:
    name: bentovllm-r1-qwen2.5-32b-w4a16-service
    resources:
      gpu_type: nvidia-a100-80gb
      gpu: 1
    traffic:
      timeout: 300
  engine_config:
    max_model_len: 16384
    model: neuralmagic/DeepSeek-R1-Distill-Qwen-32B-quantized.w4a16
  server_config:
    reasoning_parser: 'deepseek_r1'
# DeepSeek R1 distilled into Qwen 2.5 14B (unquantized).
"deepseek-r1-distill-qwen2.5-14b":
  metadata:
    provider: DeepSeek
    description: DeepSeek R1 Distill Qwen 2.5 14B
  reasoning: true
  service_config:
    name: bentovllm-r1-qwen2.5-14b-service
    resources:
      gpu_type: nvidia-a100-80gb
      gpu: 1
    traffic:
      timeout: 300
    envs:
      # Declared by name only; no value is set in this file.
      - name: HF_TOKEN
  engine_config:
    max_model_len: 8192
    model: deepseek-ai/DeepSeek-R1-Distill-Qwen-14B
  server_config:
    reasoning_parser: 'deepseek_r1'
# NeuralMagic w8a8-quantized build of the R1 Qwen 14B distill, on an L4.
"deepseek-r1-distill-qwen2.5-14b-w8a8":
  metadata:
    provider: NeuralMagic
    description: DeepSeek R1 Distill Qwen 2.5 14B w8a8 GPTQ
  reasoning: true
  service_config:
    name: bentovllm-r1-qwen2.5-14b-w8a8-service
    resources:
      gpu_type: nvidia-l4
      gpu: 1
    traffic:
      timeout: 300
  engine_config:
    max_model_len: 4096
    model: neuralmagic/DeepSeek-R1-Distill-Qwen-14B-quantized.w8a8
  server_config:
    reasoning_parser: 'deepseek_r1'
# NeuralMagic w4a16-quantized build of the R1 Qwen 14B distill, on an L4.
"deepseek-r1-distill-qwen2.5-14b-w4a16":
  metadata:
    provider: NeuralMagic
    description: DeepSeek R1 Distill Qwen 2.5 14B w4a16 GPTQ
  reasoning: true
  service_config:
    name: bentovllm-r1-qwen2.5-14b-w4a16-service
    resources:
      gpu_type: nvidia-l4
      gpu: 1
    traffic:
      timeout: 300
  engine_config:
    max_model_len: 4096
    model: neuralmagic/DeepSeek-R1-Distill-Qwen-14B-quantized.w4a16
  server_config:
    reasoning_parser: 'deepseek_r1'
# DeepSeek R1 distilled 7B model (deepseek-ai/DeepSeek-R1-Distill-Qwen-7B).
"deepseek-r1-distill-qwen2.5-7b-math":
  metadata:
    provider: DeepSeek
    description: DeepSeek R1 Distill Qwen 2.5 Math 7B
  reasoning: true
  service_config:
    name: bentovllm-r1-qwen2.5-7b-math-service
    resources:
      gpu_type: nvidia-l4
      gpu: 1
    traffic:
      timeout: 300
    envs:
      # Declared by name only; no value is set in this file.
      - name: HF_TOKEN
  engine_config:
    max_model_len: 8192
    model: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
  server_config:
    reasoning_parser: 'deepseek_r1'
# DeepSeek R1 distilled into Llama 3.1 8B, on a single L4.
"deepseek-r1-distill-llama3.1-8b":
  metadata:
    provider: DeepSeek
    description: DeepSeek R1 Distill Llama 3.1 8B
  reasoning: true
  service_config:
    name: bentovllm-r1-llama3.1-8b-service
    resources:
      gpu_type: nvidia-l4
      gpu: 1
    traffic:
      timeout: 300
    envs:
      # Declared by name only; no value is set in this file.
      - name: HF_TOKEN
  engine_config:
    max_model_len: 4096
    model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
  server_config:
    tool_call_parser: 'llama3_json'
    reasoning_parser: 'deepseek_r1'
# Same checkpoint as "deepseek-r1-distill-llama3.1-8b", with the tool-calling
# server flags switched on explicitly.
"deepseek-r1-distill-llama3.1-8b-tool-calling":
  metadata:
    provider: DeepSeek
    description: DeepSeek R1 Distill Llama 3.1 8B Tool Calling
  reasoning: true
  service_config:
    name: bentovllm-r1-llama3.1-8b-tool-calling-service
    resources:
      gpu_type: nvidia-l4
      gpu: 1
    traffic:
      timeout: 300
    envs:
      # Declared by name only; no value is set in this file.
      - name: HF_TOKEN
  engine_config:
    max_model_len: 4096
    model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
  server_config:
    reasoning_parser: 'deepseek_r1'
    tool_call_parser: 'llama3_json'
    enable_tool_call_parser: true
    enable_auto_tool_choice: true
# Gemma 2 2B Instruct (google/gemma-2-2b-it) on a single L4.
"gemma2-2b-instruct":
  metadata:
    description: Gemma 2 2B Instruct
    provider: Google
  service_config:
    name: bentovllm-gemma2-2b-instruct-service
    traffic:
      timeout: 300
    resources:
      gpu: 1
      gpu_type: nvidia-l4
    envs:
      # Declared by name only; no value is set in this file.
      - name: HF_TOKEN
  engine_config:
    model: google/gemma-2-2b-it
    max_model_len: 2048
    dtype: half
    enable_prefix_caching: false
  server_config:
    # Jinja chat template. It prepends a leading system message to the first
    # turn's content, raises if roles do not alternate user/assistant, renames
    # the "assistant" role to "model", and wraps each turn in
    # <start_of_turn>/<end_of_turn>. When the last message is from the user and
    # add_generation_prompt is set, it opens a "model" turn.
    chat_template: "{% if messages[0]['role'] == 'system' %}\n    {% set loop_messages = messages[1:] %}\n    {% set system_message = messages[0]['content'].strip() + '\\n\\n' %}\n{% else %}\n    {% set loop_messages = messages %}\n    {% set system_message = '' %}\n{% endif %}\n\n{% for message in loop_messages %}\n    {% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n        {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}\n    {% endif %}\n\n    {% if loop.index0 == 0 %}\n        {% set content = system_message + message['content'] %}\n    {% else %}\n        {% set content = message['content'] %}\n    {% endif %}\n\n    {% if (message['role'] == 'assistant') %}\n        {% set role = 'model' %}\n    {% else %}\n        {% set role = message['role'] %}\n    {% endif %}\n\n    {{ '<start_of_turn>' + role + '\\n' + content.strip() + '<end_of_turn>\\n' }}\n\n    {% if loop.last and message['role'] == 'user' and add_generation_prompt %}\n        {{'<start_of_turn>model\\n'}}\n    {% endif %}\n{% endfor %}\n"
# Gemma 2 9B Instruct (google/gemma-2-9b-it) on a single L4.
"gemma2-9b-instruct":
  metadata:
    description: Gemma 2 9B Instruct
    provider: Google
  service_config:
    name: bentovllm-gemma2-9b-instruct-service
    traffic:
      timeout: 300
    resources:
      gpu: 1
      gpu_type: nvidia-l4
    envs:
      # Declared by name only; no value is set in this file.
      - name: HF_TOKEN
  engine_config:
    model: google/gemma-2-9b-it
    max_model_len: 2048
    dtype: half
    enable_prefix_caching: false
    # Only this Gemma entry overrides gpu_memory_utilization.
    # NOTE(review): presumably needed to fit the 9B weights on an L4 — confirm.
    gpu_memory_utilization: 1.0
  server_config:
    # Jinja chat template. It prepends a leading system message to the first
    # turn's content, raises if roles do not alternate user/assistant, renames
    # the "assistant" role to "model", and wraps each turn in
    # <start_of_turn>/<end_of_turn>. When the last message is from the user and
    # add_generation_prompt is set, it opens a "model" turn.
    chat_template: "{% if messages[0]['role'] == 'system' %}\n    {% set loop_messages = messages[1:] %}\n    {% set system_message = messages[0]['content'].strip() + '\\n\\n' %}\n{% else %}\n    {% set loop_messages = messages %}\n    {% set system_message = '' %}\n{% endif %}\n\n{% for message in loop_messages %}\n    {% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n        {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}\n    {% endif %}\n\n    {% if loop.index0 == 0 %}\n        {% set content = system_message + message['content'] %}\n    {% else %}\n        {% set content = message['content'] %}\n    {% endif %}\n\n    {% if (message['role'] == 'assistant') %}\n        {% set role = 'model' %}\n    {% else %}\n        {% set role = message['role'] %}\n    {% endif %}\n\n    {{ '<start_of_turn>' + role + '\\n' + content.strip() + '<end_of_turn>\\n' }}\n\n    {% if loop.last and message['role'] == 'user' and add_generation_prompt %}\n        {{'<start_of_turn>model\\n'}}\n    {% endif %}\n{% endfor %}\n"
# Gemma 2 27B Instruct (google/gemma-2-27b-it) on a single A100 80GB.
"gemma2-27b-instruct":
  metadata:
    description: Gemma 2 27B Instruct
    provider: Google
  service_config:
    name: bentovllm-gemma2-27b-instruct-service
    traffic:
      timeout: 300
    resources:
      gpu: 1
      gpu_type: nvidia-a100-80gb
    envs:
      # Declared by name only; no value is set in this file.
      - name: HF_TOKEN
  engine_config:
    model: google/gemma-2-27b-it
    max_model_len: 2048
    dtype: half
    enable_prefix_caching: false
  server_config:
    # Jinja chat template. It prepends a leading system message to the first
    # turn's content, raises if roles do not alternate user/assistant, renames
    # the "assistant" role to "model", and wraps each turn in
    # <start_of_turn>/<end_of_turn>. When the last message is from the user and
    # add_generation_prompt is set, it opens a "model" turn.
    chat_template: "{% if messages[0]['role'] == 'system' %}\n    {% set loop_messages = messages[1:] %}\n    {% set system_message = messages[0]['content'].strip() + '\\n\\n' %}\n{% else %}\n    {% set loop_messages = messages %}\n    {% set system_message = '' %}\n{% endif %}\n\n{% for message in loop_messages %}\n    {% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n        {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}\n    {% endif %}\n\n    {% if loop.index0 == 0 %}\n        {% set content = system_message + message['content'] %}\n    {% else %}\n        {% set content = message['content'] %}\n    {% endif %}\n\n    {% if (message['role'] == 'assistant') %}\n        {% set role = 'model' %}\n    {% else %}\n        {% set role = message['role'] %}\n    {% endif %}\n\n    {{ '<start_of_turn>' + role + '\\n' + content.strip() + '<end_of_turn>\\n' }}\n\n    {% if loop.last and message['role'] == 'user' and add_generation_prompt %}\n        {{'<start_of_turn>model\\n'}}\n    {% endif %}\n{% endfor %}\n"
# Jamba 1.5 Mini (ai21labs/AI21-Jamba-1.5-Mini), split over 2 A100 80GB GPUs.
"jamba1.5-mini":
  metadata:
    description: Jamba 1.5 Mini
    provider: AI21 Lab
  service_config:
    name: bentovllm-jamba1.5-mini-service
    traffic:
      timeout: 300
    resources:
      # GPU count matches engine_config.tensor_parallel_size.
      gpu: 2
      gpu_type: nvidia-a100-80gb
    envs:
      # Declared by name only; no value is set in this file.
      - name: HF_TOKEN
      # Environment variable values are strings; quoted so YAML does not parse
      # the value as the integer 1.
      - name: UV_NO_BUILD_ISOLATION
        value: "1"
  engine_config:
    model: ai21labs/AI21-Jamba-1.5-Mini
    max_model_len: 204800
    tensor_parallel_size: 2
    enable_prefix_caching: false
  server_config:
    tool_call_parser: "jamba"
  build:
    system_packages:
      - curl
      - git
    # Post-build steps: install torch, then download and install the prebuilt
    # causal_conv1d 1.5.0.post8 and mamba_ssm 2.2.4 wheels. The wheel filenames
    # pin cu12 / torch2.5 / cp311 / linux_x86_64.
    post:
      - uv pip install --compile-bytecode torch
      - curl -L -o ./causal_conv1d-1.5.0.post8+cu12torch2.5cxx11abiFALSE-cp311-cp311-linux_x86_64.whl https://github.com/Dao-AILab/causal-conv1d/releases/download/v1.5.0.post8/causal_conv1d-1.5.0.post8+cu12torch2.5cxx11abiFALSE-cp311-cp311-linux_x86_64.whl
      - uv pip install --compile-bytecode ./causal_conv1d-1.5.0.post8+cu12torch2.5cxx11abiFALSE-cp311-cp311-linux_x86_64.whl
      - curl -L -o ./mamba_ssm-2.2.4+cu12torch2.5cxx11abiFALSE-cp311-cp311-linux_x86_64.whl https://github.com/state-spaces/mamba/releases/download/v2.2.4/mamba_ssm-2.2.4+cu12torch2.5cxx11abiFALSE-cp311-cp311-linux_x86_64.whl
      - uv pip install --compile-bytecode ./mamba_ssm-2.2.4+cu12torch2.5cxx11abiFALSE-cp311-cp311-linux_x86_64.whl
  generate_config:
    max_tokens: 4096
# Jamba 1.5 Large (ai21labs/AI21-Jamba-1.5-Large), split over 8 A100 80GB GPUs
# with experts_int8 quantization.
"jamba1.5-large":
  metadata:
    description: Jamba 1.5 Large
    provider: AI21 Lab
  service_config:
    name: bentovllm-jamba1.5-large-service
    traffic:
      timeout: 300
    resources:
      # GPU count matches engine_config.tensor_parallel_size.
      gpu: 8
      gpu_type: nvidia-a100-80gb
    envs:
      # Declared by name only; no value is set in this file.
      - name: HF_TOKEN
      # Environment variable values are strings; quoted so YAML does not parse
      # the value as the integer 1.
      - name: UV_NO_BUILD_ISOLATION
        value: "1"
  engine_config:
    model: ai21labs/AI21-Jamba-1.5-Large
    max_model_len: 225280
    tensor_parallel_size: 8
    quantization: "experts_int8"
    enable_prefix_caching: false
  build:
    system_packages:
      - curl
      - git
    # Post-build steps: install torch, then download and install the prebuilt
    # causal_conv1d 1.5.0.post8 and mamba_ssm 2.2.4 wheels. The wheel filenames
    # pin cu12 / torch2.5 / cp311 / linux_x86_64.
    post:
      - uv pip install --compile-bytecode torch
      - curl -L -o ./causal_conv1d-1.5.0.post8+cu12torch2.5cxx11abiFALSE-cp311-cp311-linux_x86_64.whl https://github.com/Dao-AILab/causal-conv1d/releases/download/v1.5.0.post8/causal_conv1d-1.5.0.post8+cu12torch2.5cxx11abiFALSE-cp311-cp311-linux_x86_64.whl
      - uv pip install --compile-bytecode ./causal_conv1d-1.5.0.post8+cu12torch2.5cxx11abiFALSE-cp311-cp311-linux_x86_64.whl
      - curl -L -o ./mamba_ssm-2.2.4+cu12torch2.5cxx11abiFALSE-cp311-cp311-linux_x86_64.whl https://github.com/state-spaces/mamba/releases/download/v2.2.4/mamba_ssm-2.2.4+cu12torch2.5cxx11abiFALSE-cp311-cp311-linux_x86_64.whl
      - uv pip install --compile-bytecode ./mamba_ssm-2.2.4+cu12torch2.5cxx11abiFALSE-cp311-cp311-linux_x86_64.whl
  server_config:
    tool_call_parser: "jamba"
  generate_config:
    max_tokens: 4096
# Llama 3.1 8B Instruct in half precision on a single L4.
"llama3.1-8b-instruct":
  metadata:
    provider: Meta
    description: Llama 3.1 8B Instruct
  service_config:
    name: bentovllm-llama3.1-8b-instruct-service
    resources:
      gpu_type: nvidia-l4
      gpu: 1
    traffic:
      timeout: 300
    envs:
      # Declared by name only; no value is set in this file.
      - name: HF_TOKEN
  engine_config:
    model: meta-llama/Meta-Llama-3.1-8B-Instruct
    dtype: half
    max_model_len: 2048
  server_config:
    tool_call_parser: 'llama3_json'
# Tulu 3.1 8B (allenai/Llama-3.1-Tulu-3.1-8B) on a single L4; no envs declared.
"llama3.1-tulu-3.1-8b":
  metadata:
    provider: allenai
    description: Llama 3.1 Tulu-3.1 8B
  service_config:
    name: bentovllm-llama3.1-tulu-3.1-8b-service
    resources:
      gpu_type: nvidia-l4
      gpu: 1
    traffic:
      timeout: 300
  engine_config:
    max_model_len: 4096
    model: allenai/Llama-3.1-Tulu-3.1-8B
# Llama 3.2 1B Instruct on a single L4.
"llama3.2-1b-instruct":
  metadata:
    provider: Meta
    description: Llama 3.2 1B Instruct
  service_config:
    name: bentovllm-llama3.2-1b-instruct-service
    resources:
      gpu_type: nvidia-l4
      gpu: 1
    traffic:
      timeout: 300
    envs:
      # Declared by name only; no value is set in this file.
      - name: HF_TOKEN
  engine_config:
    max_model_len: 8192
    model: meta-llama/Llama-3.2-1B-Instruct
  server_config:
    tool_call_parser: 'pythonic'
# Llama 3.2 3B Instruct on a single L4.
"llama3.2-3b-instruct":
  metadata:
    # Matches the entry key and the model id below (Llama 3.2, not 3.1).
    description: Llama 3.2 3B Instruct
    provider: Meta
  service_config:
    name: bentovllm-llama3.2-3b-instruct-service
    traffic:
      timeout: 300
    resources:
      gpu: 1
      gpu_type: nvidia-l4
    envs:
      # Declared by name only; no value is set in this file.
      - name: HF_TOKEN
  engine_config:
    model: meta-llama/Llama-3.2-3B-Instruct
    max_model_len: 8192
  server_config:
    tool_call_parser: "pythonic"
# Llama 3.2 11B Vision Instruct, flagged as a vision model; one image per prompt.
"llama3.2-11b-vision-instruct":
  vision: true
  metadata:
    provider: Meta
    description: Llama 3.2 11B Vision Instruct
  service_config:
    name: bentovllm-llama3.2-11b-vision-instruct-service
    resources:
      gpu_type: nvidia-a100-80gb
      gpu: 1
    traffic:
      timeout: 300
    envs:
      # Declared by name only; no value is set in this file.
      - name: HF_TOKEN
  engine_config:
    model: meta-llama/Llama-3.2-11B-Vision-Instruct
    max_model_len: 16384
    max_num_seqs: 16
    limit_mm_per_prompt:
      image: 1
    enforce_eager: true
  server_config:
    tool_call_parser: 'pythonic'
  build:
    exclude:
      - 'original'
# Llama 3.2 90B Vision Instruct, flagged as a vision model; one image per prompt.
"llama3.2-90b-vision-instruct":
  metadata:
    description: Llama 3.2 90B Vision Instruct
    provider: Meta
  vision: true
  service_config:
    name: bentovllm-llama3.2-90b-vision-instruct-service
    traffic:
      timeout: 300
    resources:
      # NOTE(review): confirm 2 x 80GB is enough for the 90B checkpoint at
      # max_model_len 16384.
      gpu: 2
      gpu_type: nvidia-a100-80gb
    envs:
      # Declared by name only; no value is set in this file.
      - name: HF_TOKEN
  engine_config:
    model: meta-llama/Llama-3.2-90B-Vision-Instruct
    enforce_eager: true
    limit_mm_per_prompt:
      image: 1
    max_model_len: 16384
    max_num_seqs: 16
    # Shard across both requested GPUs, as every other multi-GPU entry in this
    # file does (tensor_parallel_size equals resources.gpu).
    tensor_parallel_size: 2
  server_config:
    tool_call_parser: "pythonic"
  build:
    exclude:
      - "original"
# Llama 3.3 70B Instruct, split over 2 A100 80GB GPUs.
"llama3.3-70b-instruct":
  metadata:
    provider: Meta
    description: Llama 3.3 70B Instruct
  service_config:
    name: bentovllm-llama3.3-70b-instruct-service
    resources:
      # GPU count matches engine_config.tensor_parallel_size.
      gpu_type: nvidia-a100-80gb
      gpu: 2
    traffic:
      timeout: 300
    envs:
      # Declared by name only; no value is set in this file.
      - name: HF_TOKEN
  engine_config:
    model: meta-llama/Llama-3.3-70B-Instruct
    tensor_parallel_size: 2
    max_model_len: 2048
  server_config:
    tool_call_parser: 'pythonic'
  build:
    exclude:
      - 'original'
      - 'consolidated*'
# Hermes 3 on Llama 3.1 405B (FP8 checkpoint from NousResearch).
"hermes-3-llama3.1-405b":
  metadata:
    description: Hermes 3 Llama 3.1 405B FP8
    provider: NousResearch
  service_config:
    name: bentovllm-hermes-3-llama3.1-405b-service
    traffic:
      timeout: 300
    resources:
      # Kept equal to engine_config.tensor_parallel_size.
      gpu: 8
      gpu_type: nvidia-a100-80gb
  engine_config:
    model: NousResearch/Hermes-3-Llama-3.1-405B-FP8
    max_model_len: 2048
    # vLLM requires the attention-head count to be divisible by the
    # tensor-parallel size. Llama 3.1 405B has 128 heads, so the previous
    # value of 6 is rejected at startup; 8 is the next valid size.
    tensor_parallel_size: 8
  server_config:
    # NOTE(review): vLLM documents a dedicated "hermes" tool-call parser for
    # Hermes models — confirm "pythonic" is intended here.
    tool_call_parser: "pythonic"
# DeepHermes 3 Llama 3 8B (preview checkpoint), flagged as a reasoning model.
"deephermes-3-llama3-8b":
  metadata:
    provider: NousResearch
    description: DeepHermes 3 Llama 3 8B
  reasoning: true
  service_config:
    name: bentovllm-deephermes-3-llama3-8b-service
    resources:
      gpu_type: nvidia-a100-80gb
      gpu: 1
    traffic:
      timeout: 300
  engine_config:
    max_model_len: 8192
    model: NousResearch/DeepHermes-3-Llama-3-8B-Preview
  server_config:
    # NOTE(review): this is the only "reasoning: true" entry without a
    # reasoning_parser — confirm that is intended.
    # NOTE(review): confirm "pythonic" is the right parser for a Hermes model.
    tool_call_parser: 'pythonic'
# Pixtral 12B (2409), flagged as a vision model; up to 5 images per prompt.
"pixtral-12b-2409":
  vision: true
  metadata:
    provider: Mistral AI
    description: Pixtral 12B 2409
  service_config:
    name: bentovllm-pixtral-12b-2409-service
    resources:
      gpu_type: nvidia-a100-80gb
      gpu: 1
    traffic:
      timeout: 300
    envs:
      # Declared by name only; no value is set in this file.
      - name: HF_TOKEN
  engine_config:
    model: mistralai/Pixtral-12B-2409
    tokenizer_mode: mistral
    max_model_len: 32768
    limit_mm_per_prompt:
      image: 5
    enable_chunked_prefill: false
    enable_prefix_caching: false
  # Extra Python requirement listed for this entry.
  requirements:
    - 'mistral_common[opencv]'
# Pixtral Large Instruct (2411), flagged as a vision model; split over 4 GPUs.
"pixtral-large-2411":
  vision: true
  metadata:
    provider: Mistral AI
    description: Pixtral Large Instruct 2411
  service_config:
    name: bentovllm-pixtral-large-instruct-2411-service
    resources:
      # GPU count matches engine_config.tensor_parallel_size.
      gpu_type: nvidia-a100-80gb
      gpu: 4
    traffic:
      timeout: 300
    envs:
      # Declared by name only; no value is set in this file.
      - name: HF_TOKEN
  engine_config:
    model: mistralai/Pixtral-Large-Instruct-2411
    tokenizer_mode: mistral
    dtype: half
    tensor_parallel_size: 4
    max_model_len: 16384
    limit_mm_per_prompt:
      image: 5
    enable_prefix_caching: false
    enable_chunked_prefill: false
  # Extra Python requirement listed for this entry.
  requirements:
    - 'mistral_common[opencv]'
# Ministral 8B Instruct (2410) in half precision on a single L4.
"ministral-8b-instruct-2410":
  metadata:
    provider: Mistral AI
    description: Ministral 8B Instruct 2410
  service_config:
    name: bentovllm-ministral-8b-instruct-2410-service
    resources:
      gpu_type: nvidia-l4
      gpu: 1
    traffic:
      timeout: 300
    envs:
      # Declared by name only; no value is set in this file.
      - name: HF_TOKEN
  engine_config:
    model: mistralai/Ministral-8B-Instruct-2410
    tokenizer_mode: mistral
    dtype: half
    enable_prefix_caching: false
    max_model_len: 4096
  build:
    exclude:
      - 'consolidated*'
# Mistral Small 24B Instruct (2501) on a single A100 80GB.
"mistral-small-24b-instruct-2501":
  metadata:
    provider: Mistral AI
    description: Mistral Small 24B Instruct 2501
  service_config:
    name: bentovllm-mistral-small-24b-instruct-2501-service
    resources:
      gpu_type: nvidia-a100-80gb
      gpu: 1
    traffic:
      timeout: 300
    envs:
      # Declared by name only; no value is set in this file.
      - name: HF_TOKEN
  engine_config:
    model: mistralai/Mistral-Small-24B-Instruct-2501
    enable_prefix_caching: false
    max_model_len: 4096
    tokenizer_mode: mistral
  build:
    exclude:
      - 'consolidated*'
# Mistral Large 123B Instruct (2411) in half precision, split over 4 GPUs.
"mistral-large-123b-instruct-2411":
  metadata:
    provider: Mistral AI
    description: Mistral Large 123B Instruct 2411
  service_config:
    name: bentovllm-mistral-large-123b-instruct-2411-service
    resources:
      # GPU count matches engine_config.tensor_parallel_size.
      gpu_type: nvidia-a100-80gb
      gpu: 4
    traffic:
      timeout: 300
    envs:
      # Declared by name only; no value is set in this file.
      - name: HF_TOKEN
  engine_config:
    model: mistralai/Mistral-Large-Instruct-2411
    tokenizer_mode: mistral
    dtype: half
    tensor_parallel_size: 4
    enable_prefix_caching: false
    max_model_len: 4096
  build:
    exclude:
      - 'consolidated*'
# Phi 4 14B (microsoft/phi-4) on a single A100 80GB; no envs declared.
"phi4-14b":
  metadata:
    description: Phi 4 14B
    provider: Microsoft
  service_config:
    name: bentovllm-phi4-14b-service
    traffic:
      timeout: 300
    resources:
      gpu: 1
      gpu_type: nvidia-a100-80gb
  engine_config:
    model: microsoft/phi-4
    max_model_len: 8192
  server_config:
    # Jinja chat template. It shifts the alternation check by one when the
    # first message is a system message, raises if roles do not alternate, and
    # renders each message as "<|role|>\n" + stripped content + "<|end|>\n".
    # When the last message is from the user and add_generation_prompt is set,
    # it appends "<|assistant|>\n".
    chat_template: "{% if messages[0]['role'] == 'system' %}\n    {% set offset = 1 %}\n{% else %}\n    {% set offset = 0 %}\n{% endif %}\n\n{% for message in messages %}\n    {% if (message['role'] == 'user') != (loop.index0 % 2 == offset) %}\n        {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}\n    {% endif %}\n\n    {{ '<|' + message['role'] + '|>\\n' + message['content'].strip() + '<|end|>' + '\\n' }}\n\n    {% if loop.last and message['role'] == 'user' and add_generation_prompt %}\n        {{ '<|assistant|>\\n' }}\n    {% endif %}\n{% endfor %}\n"
# Qwen 2.5 7B Instruct on a single L4; no envs declared.
"qwen2.5-7b-instruct":
  metadata:
    provider: Alibaba
    description: Qwen 2.5 7B Instruct
  service_config:
    name: bentovllm-qwen2.5-7b-instruct-service
    resources:
      gpu_type: nvidia-l4
      gpu: 1
    traffic:
      timeout: 300
  engine_config:
    max_model_len: 2048
    model: Qwen/Qwen2.5-7B-Instruct
# Qwen 2.5 14B Instruct on a single A100 80GB; no envs declared.
"qwen2.5-14b-instruct":
  metadata:
    provider: Alibaba
    description: Qwen 2.5 14B Instruct
  service_config:
    name: bentovllm-qwen2.5-14b-instruct-service
    resources:
      gpu_type: nvidia-a100-80gb
      gpu: 1
    traffic:
      timeout: 300
  engine_config:
    max_model_len: 2048
    model: Qwen/Qwen2.5-14B-Instruct
# Qwen 2.5 32B Instruct on a single A100 80GB; no envs declared.
"qwen2.5-32b-instruct":
  metadata:
    provider: Alibaba
    description: Qwen 2.5 32B Instruct
  service_config:
    name: bentovllm-qwen2.5-32b-instruct-service
    resources:
      gpu_type: nvidia-a100-80gb
      gpu: 1
    traffic:
      timeout: 300
  engine_config:
    max_model_len: 2048
    model: Qwen/Qwen2.5-32B-Instruct
# Qwen 2.5 72B Instruct, split over 2 A100 80GB GPUs; no envs declared.
"qwen2.5-72b-instruct":
  metadata:
    provider: Alibaba
    description: Qwen 2.5 72B Instruct
  service_config:
    name: bentovllm-qwen2.5-72b-instruct-service
    resources:
      # GPU count matches engine_config.tensor_parallel_size.
      gpu_type: nvidia-a100-80gb
      gpu: 2
    traffic:
      timeout: 300
  engine_config:
    model: Qwen/Qwen2.5-72B-Instruct
    tensor_parallel_size: 2
    max_model_len: 2048
# Qwen 2.5 Coder 7B Instruct on a single L4, with the tool-calling server
# flags switched on explicitly.
"qwen2.5-coder-7b-instruct":
  metadata:
    provider: Alibaba
    description: Qwen 2.5 Coder 7B Instruct
  service_config:
    name: bentovllm-qwen2.5-coder-7b-instruct-service
    traffic:
      timeout: 300
    resources:
      gpu_type: nvidia-l4
      gpu: 1
  engine_config:
    max_model_len: 8192
    model: Qwen/Qwen2.5-Coder-7B-Instruct
  server_config:
    tool_call_parser: 'llama3_json'
    enable_tool_call_parser: true
    enable_auto_tool_choice: true
# Qwen 2.5 Coder 32B Instruct on a single A100 80GB, with the tool-calling
# server flags switched on explicitly.
"qwen2.5-coder-32b-instruct":
  metadata:
    provider: Alibaba
    description: Qwen 2.5 Coder 32B Instruct
  service_config:
    name: bentovllm-qwen2.5-coder-32b-instruct-service
    traffic:
      timeout: 300
    resources:
      gpu_type: nvidia-a100-80gb
      gpu: 1
  engine_config:
    max_model_len: 8192
    model: Qwen/Qwen2.5-Coder-32B-Instruct
  server_config:
    tool_call_parser: 'llama3_json'
    enable_tool_call_parser: true
    enable_auto_tool_choice: true