3:["$","div",null,{"className":"max-w-[1480px] mx-auto px-4 sm:px-6 flex gap-6","children":[["$","$L17",null,{"recipesByOrg":[["arcee-ai",[{"meta":{"title":"Trinity-Large-Thinking","slug":"trinity-large-thinking","provider":"Arcee AI","description":"Arcee AI's reasoning-focused sparse MoE (AfmoeForCausalLM) with structured traces and agentic tool use","date_updated":"2026-04-17","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:75:meta:tasks","related_recipes":[]},"model":{"model_id":"arcee-ai/Trinity-Large-Thinking","min_vllm_version":"0.11.1","architecture":"moe","parameter_count":"398B","active_parameters":"13B","context_length":262144,"base_args":["--dtype","bfloat16"],"base_env":{}},"features":{"tool_calling":{"description":"Qwen3 Coder tool-call parser with automatic tool choice","args":["--enable-auto-tool-choice","--tool-call-parser","qwen3_coder"]},"reasoning":{"description":"DeepSeek-R1 reasoning parser extracts ... into message.reasoning","args":["--reasoning-parser","deepseek_r1"]}},"opt_in_features":[],"variants":{"default":{"precision":"bf16","vram_minimum_gb":955,"description":"Full precision BF16 on multi-GPU (sparse MoE — multi-GPU recommended)"},"nvfp4":{"model_id":"arcee-ai/Trinity-Large-Thinking-NVFP4","precision":"nvfp4","vram_minimum_gb":239,"description":"NVIDIA NVFP4 quantized weights for Blackwell GPUs","extra_args":["--kv-cache-dtype","fp8"],"extra_env":{"VLLM_USE_FLASHINFER_MOE_FP4":"1"}}},"compatible_strategies":["single_node_tp","multi_node_tp","multi_node_tp_pp","multi_node_tep","multi_node_dep"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$18","hf_org":"arcee-ai","hf_repo":"Trinity-Large-Thinking","hf_id":"arcee-ai/Trinity-Large-Thinking","hf_released":"2026-04-01T03:35:06.000Z","engines":{"vllm":{"min_version":"0.11.1"},"sglang":{"engine":"sglang","model_id":"arcee-ai/Trinity-Large-Thinking","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":8,"b200":8},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","qwen3_coder"]},"reasoning":{"args":["--reasoning-parser","deepseek-r1"]}},"guide":"$19"}},"default_engine":"vllm"}]],["baidu",[{"meta":{"title":"ERNIE-4.5","slug":"ernie-4.5","provider":"Ernie (Baidu)","description":"Baidu ERNIE 4.5 MoE text models (21B-A3B, 300B-A47B) with BF16 and FP8 support plus ERNIE-MTP speculative decoding","date_updated":"2026-04-17","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:76:meta:tasks","related_recipes":["baidu/ERNIE-4.5-VL-28B-A3B-PT"]},"model":{"model_id":"baidu/ERNIE-4.5-21B-A3B-PT","min_vllm_version":"0.10.1","architecture":"moe","parameter_count":"21B","active_parameters":"3B","context_length":131072,"base_args":[],"base_env":{}},"features":{"spec_decoding":{"description":"ERNIE-MTP (multi-token prediction) speculative decoding","args":["--speculative-config","{\"method\":\"ernie_mtp\",\"model\":\"baidu/ERNIE-4.5-21B-A3B-PT\",\"num_speculative_tokens\":1}"]}},"opt_in_features":["spec_decoding"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":106,"description":"BF16 weights; fits on 1x80GB GPU (21B variant)"},"300b":{"model_id":"baidu/ERNIE-4.5-300B-A47B-PT","precision":"bf16","vram_minimum_gb":640,"description":"300B total / 47B active; 8x80GB with FP8 online, 16x80GB for BF16","extra_args":["--tensor-parallel-size","8","--gpu-memory-utilization","0.95","--quantization","fp8"]}},"compatible_strategies":["single_node_tp","multi_node_tp","multi_node_tep","multi_node_dep"],"hardware_overrides":{"amd":{"extra_args":["--gpu-memory-utilization","0.9"],"extra_env":{"VLLM_ROCM_USE_AITER":"1","SAFETENSORS_FAST_GPU":"1"}}},"strategy_overrides":{},"guide":"$1a","hf_org":"baidu","hf_repo":"ERNIE-4.5-21B-A3B-PT","hf_id":"baidu/ERNIE-4.5-21B-A3B-PT","hf_released":"2025-06-28T06:13:30.000Z","engines":{"vllm":{"min_version":"0.10.1"},"sglang":{"engine":"sglang","model_id":"baidu/ERNIE-4.5-21B-A3B-PT","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{},"guide":"$1b"}},"default_engine":"vllm"},{"meta":{"title":"ERNIE-4.5-VL","slug":"ernie-4.5-vl","provider":"Ernie (Baidu)","description":"Baidu ERNIE 4.5 VL MoE vision-language models (28B-A3B, 424B-A47B) with heterogeneous text/vision experts","date_updated":"2026-04-17","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:77:meta:tasks","related_recipes":["baidu/ERNIE-4.5-21B-A3B-PT"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:77:meta:hardware"},"model":{"model_id":"baidu/ERNIE-4.5-VL-28B-A3B-PT","min_vllm_version":"0.11.0","architecture":"moe","parameter_count":"28B","active_parameters":"3B","context_length":131072,"base_args":["--trust-remote-code"],"base_env":{}},"features":{"text_only":{"description":"Skip loading the vision encoder for text-only workloads — frees VRAM for KV cache. Mutually exclusive with encoder_parallel.","args":["--language-model-only"]},"encoder_parallel":{"description":"Run the vision encoder in data-parallel mode — avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.","args":["--mm-encoder-tp-mode","data"]}},"opt_in_features":["text_only"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":67,"description":"BF16 weights; fits on 1x80GB GPU (28B VL variant)"},"424b":{"model_id":"baidu/ERNIE-4.5-VL-424B-A47B-PT","precision":"bf16","vram_minimum_gb":1120,"description":"424B total / 47B active; 8x140GB BF16 or 16x80GB BF16","extra_args":["--trust-remote-code","--tensor-parallel-size","8"]},"424b_fp8":{"model_id":"baidu/ERNIE-4.5-VL-424B-A47B-PT","precision":"fp8","vram_minimum_gb":640,"description":"424B with FP8 online quantization + CPU offload for 8x80GB testing","extra_args":["--trust-remote-code","--tensor-parallel-size","8","--quantization","fp8","--cpu-offload-gb","50"]}},"compatible_strategies":["single_node_tp","multi_node_tp","multi_node_tep","multi_node_dep"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$1c","hf_org":"baidu","hf_repo":"ERNIE-4.5-VL-28B-A3B-PT","hf_id":"baidu/ERNIE-4.5-VL-28B-A3B-PT","hf_released":"2025-06-28T05:50:33.000Z","engines":{"vllm":{"min_version":"0.11.0"},"sglang":{"engine":"sglang","model_id":"baidu/ERNIE-4.5-VL-28B-A3B-PT","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"mi300x":1,"mi325x":1,"mi355x":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{},"guide":"$1d"}},"default_engine":"vllm"}]],["ByteDance-Seed",[{"meta":{"title":"Seed-OSS-36B","slug":"seed-oss-36b","provider":"Seed (ByteDance)","description":"ByteDance Seed-OSS 36B dense model with unique 'thinking budget' control and 512K context support","date_updated":"2026-04-17","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:62:meta:tasks","related_recipes":[]},"model":{"model_id":"ByteDance-Seed/Seed-OSS-36B-Instruct","min_vllm_version":"0.11.0","architecture":"dense","parameter_count":"36B","active_parameters":"36B","context_length":524288,"base_args":[],"base_env":{}},"dependencies":[{"note":"Pinned transformers commit required for Seed-OSS tokenizer compatibility","command":"uv pip install git+https://github.com/huggingface/transformers.git@56d68c6706ee052b445e1e476056ed92ac5eb383"}],"features":{"tool_calling":{"description":"Seed-OSS tool-call parser with automatic tool choice","args":["--enable-auto-tool-choice","--tool-call-parser","seed_oss"]}},"opt_in_features":[],"variants":{"default":{"precision":"bf16","vram_minimum_gb":86,"description":"Native BF16 weights on 8x GPU (TP=8)","extra_args":["--tensor-parallel-size","8"]}},"compatible_strategies":["single_node_tp","multi_node_tp"],"hardware_overrides":{"amd":{"extra_args":[],"extra_env":{"VLLM_ROCM_USE_AITER":"1"}}},"strategy_overrides":{},"guide":"$1e","hf_org":"ByteDance-Seed","hf_repo":"Seed-OSS-36B-Instruct","hf_id":"ByteDance-Seed/Seed-OSS-36B-Instruct","hf_released":"2025-08-20T15:03:26.000Z","engines":{"vllm":{"min_version":"0.11.0"},"sglang":{"engine":"sglang","model_id":"ByteDance-Seed/Seed-OSS-36B-Instruct","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{},"guide":"$1f"}},"default_engine":"vllm"}]],["deepseek-ai",[{"meta":{"title":"DeepSeek-V4-Pro","slug":"deepseek-v4-pro","provider":"DeepSeek","description":"DeepSeek V4 flagship MoE (1.6T total / 49B active) with hybrid CSA+HCA attention, manifold-constrained hyper-connections, Muon-trained on 32T+ tokens, and three-tier reasoning.","date_updated":"2026-06-09","difficulty":"hard","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:1:meta:tasks","performance_headline":"Frontier 1.6T/49B reasoning MoE with native FP4+FP8 weights, MTP speculative decoding, and 1M-token context","related_recipes":[],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:1:meta:hardware"},"model":{"model_id":"deepseek-ai/DeepSeek-V4-Pro","min_vllm_version":"0.20.0","docker_image":{"amd":"vllm/vllm-openai-rocm:nightly"},"architecture":"moe","parameter_count":"1600B","active_parameters":"49B","context_length":1048576,"flashinfer_autotune":true,"base_args":["--trust-remote-code","--kv-cache-dtype","fp8","--block-size","256"]},"dependencies":[{"note":"DeepGEMM FP8 kernels — install via vLLM tools/install_deepgemm.sh","command":"bash <(curl -fsSL https://raw.githubusercontent.com/vllm-project/vllm/main/tools/install_deepgemm.sh)"},{"command":"pip install vllm-ascend","brand":"Huawei","note":"Huawei Ascend NPU support is provided by the vllm-ascend plugin (requires CANN 9.0 + torch-npu; use the matching quay.io/ascend/vllm-ascend image). FP8 weights are served via W8A8 on Ascend. See https://docs.vllm.ai/projects/ascend"},{"command":"git clone https://github.com/Cambricon/vllm-mlu && pip install -e vllm-mlu","brand":"Cambricon","note":"Cambricon MLU (MLU370+/MLU590) runs vLLM via the out-of-tree vllm-mlu plugin. Requires Cambricon Neuware SDK 25.08 (gated — request from ecosystem@cambricon.com) and MLU-patched Ray 2.51.1 for multi-card TP. Day-0 DeepSeek-V3.2/V4 support. See https://github.com/Cambricon/vllm-mlu"}],"features":{"tool_calling":{"description":"Enable tool calling with DeepSeek V4 chat template support.","args":["--tokenizer-mode","deepseek_v4","--tool-call-parser","deepseek_v4","--enable-auto-tool-choice"]},"reasoning":{"description":"Enable reasoning/thinking mode with the DeepSeek V4 reasoning parser.","args":["--reasoning-parser","deepseek_v4"]},"spec_decoding":{"description":"Multi-Token Prediction speculative decoding with 2 speculative tokens.","args":["--speculative_config","{\"method\":\"mtp\",\"num_speculative_tokens\":2}"]}},"opt_in_features":["spec_decoding"],"variants":{"default":{"precision":"fp8","vram_minimum_gb":960,"description":"Native FP4+FP8 mixed checkpoint (MoE experts FP4, remaining params FP8)"}},"compatible_strategies":["single_node_tp","single_node_tep","single_node_dep","multi_node_tep","multi_node_dep","pd_cluster"],"default_strategy":"single_node_tep","hardware_overrides":{"hopper":{"extra_args":["--max-model-len","800000","--gpu-memory-utilization","0.95","--max-num-seqs","512","--max-num-batched-tokens","512","--no-enable-flashinfer-autotune","--compilation-config","{\"mode\": 0, \"cudagraph_mode\": \"FULL_DECODE_ONLY\"}"],"extra_env":{"VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS":"0"}},"blackwell":{"extra_args":["--attention_config.use_fp4_indexer_cache=True","--moe-backend","deep_gemm_mega_moe"]},"amd":{"extra_args":["--distributed-executor-backend","mp","--gpu-memory-utilization","0.9","--max-num-seqs","512","--max-num-batched-tokens","8192","--compilation-config","{\"mode\": 3, \"cudagraph_mode\": \"FULL_DECODE_ONLY\"}"],"extra_env":{"VLLM_ROCM_USE_AITER":"1"}}},"strategy_overrides":{"single_node_tp":{"hardware_overrides":{"hopper":{"extra_args":["--no-enable-flashinfer-autotune"]},"blackwell":{"extra_args":["--attention_config.use_fp4_indexer_cache=True","--no-enable-flashinfer-autotune"]}}},"single_node_tep":{"extra_args":["--compilation-config","{\"cudagraph_mode\":\"FULL_AND_PIECEWISE\", \"custom_ops\":[\"all\"]}"]},"multi_node_tep":{"extra_args":["--compilation-config","{\"cudagraph_mode\":\"FULL_AND_PIECEWISE\", \"custom_ops\":[\"all\"]}"]},"single_node_dep":{"extra_args":["--compilation-config","{\"cudagraph_mode\":\"FULL_AND_PIECEWISE\", \"custom_ops\":[\"all\"]}"]},"multi_node_dep":{"extra_args":["--compilation-config","{\"cudagraph_mode\":\"FULL_AND_PIECEWISE\", \"custom_ops\":[\"all\"]}"]},"pd_cluster":{"env":{"VLLM_USE_NCCL_SYMM_MEM":"1","NCCL_CUMEM_ENABLE":"1","NCCL_MNNVL_ENABLE":"1","NCCL_NVLS_ENABLE":"1"},"prefill":{"nodes":{"default":2,"gb300":1},"parallelism":"dep","vllm_args":["--enforce-eager","--max-num-seqs","2","--max-num-batched-tokens","16384","--no-disable-hybrid-kv-cache-manager","--enable-sleep-mode"],"env":{}},"decode":{"nodes":{"default":2,"gb300":1},"parallelism":"dep","vllm_args":["--max-num-seqs","1024","--max-num-batched-tokens","1024","--max-cudagraph-capture-size","1024","--compilation-config","{\"cudagraph_mode\":\"FULL_DECODE_ONLY\"}","--no-disable-hybrid-kv-cache-manager","--enable-sleep-mode"],"env":{}}}},"guide":"$20","hf_org":"deepseek-ai","hf_repo":"DeepSeek-V4-Pro","hf_id":"deepseek-ai/DeepSeek-V4-Pro","hf_released":"2026-04-22T06:04:45.000Z","engines":{"vllm":{"min_version":"0.20.0"},"sglang":{"engine":"sglang","model_id":"deepseek-ai/DeepSeek-V4-Pro","min_version":"v0.5.10","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":8,"b200":8},"variants":{"default":{"precision":"fp8"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","deepseekv4"]},"reasoning":{"args":["--reasoning-parser","deepseek-v4"]}},"guide":"$21"}},"default_engine":"vllm"},{"meta":{"title":"DeepSeek-V4-Flash","slug":"deepseek-v4-flash","provider":"DeepSeek","description":"DeepSeek V4 MoE model with hybrid CSA+HCA attention, manifold-constrained hyper-connections, and three-tier reasoning (Non-think / Think High / Think Max).","date_updated":"2026-05-01","difficulty":"hard","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:43:meta:tasks","performance_headline":"Compact 284B/13B V4 sibling — single-node 1M-context serving with FP4+FP8 weights and MTP","related_recipes":[],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:43:meta:hardware"},"model":{"model_id":"deepseek-ai/DeepSeek-V4-Flash","min_vllm_version":"0.20.0","docker_image":{"amd":"vllm/vllm-openai-rocm:nightly"},"architecture":"moe","parameter_count":"284B","active_parameters":"13B","context_length":1048576,"flashinfer_autotune":true,"base_args":["--trust-remote-code","--kv-cache-dtype","fp8","--block-size","256"]},"dependencies":[{"note":"DeepGEMM FP8 kernels — install via vLLM tools/install_deepgemm.sh","command":"bash <(curl -fsSL https://raw.githubusercontent.com/vllm-project/vllm/main/tools/install_deepgemm.sh)"},{"command":"pip install vllm-ascend","brand":"Huawei","note":"Huawei Ascend NPU support is provided by the vllm-ascend plugin (requires CANN 9.0 + torch-npu; use the matching quay.io/ascend/vllm-ascend image). FP8 weights are served via W8A8 on Ascend. See https://docs.vllm.ai/projects/ascend"},{"command":"git clone https://github.com/Cambricon/vllm-mlu && pip install -e vllm-mlu","brand":"Cambricon","note":"Cambricon MLU (MLU370+/MLU590) runs vLLM via the out-of-tree vllm-mlu plugin. Requires Cambricon Neuware SDK 25.08 (gated — request from ecosystem@cambricon.com) and MLU-patched Ray 2.51.1 for multi-card TP. Day-0 DeepSeek-V3.2/V4 support. See https://github.com/Cambricon/vllm-mlu"}],"features":{"tool_calling":{"description":"Enable tool calling with DeepSeek V4 chat template support.","args":["--tokenizer-mode","deepseek_v4","--tool-call-parser","deepseek_v4","--enable-auto-tool-choice"]},"reasoning":{"description":"Enable reasoning/thinking mode with the DeepSeek V4 reasoning parser.","args":["--reasoning-parser","deepseek_v4"]},"spec_decoding":{"description":"Multi-Token Prediction speculative decoding with 2 speculative tokens (1 on Hopper).","args":["--speculative_config","{\"method\":\"mtp\",\"num_speculative_tokens\":2}"],"hardware_overrides":{"hopper":{"args":["--speculative_config","{\"method\":\"mtp\",\"num_speculative_tokens\":1}"]}}}},"opt_in_features":["spec_decoding"],"variants":{"default":{"precision":"fp8","vram_minimum_gb":170,"description":"Native FP4+FP8 mixed checkpoint (MoE experts FP4, remaining params FP8)"}},"compatible_strategies":["single_node_tp","single_node_tep","single_node_dep","multi_node_dep","pd_cluster"],"default_strategy":"single_node_tep","hardware_overrides":{"blackwell":{"extra_args":["--attention_config.use_fp4_indexer_cache=True","--moe-backend","deep_gemm_mega_moe"]},"amd":{"extra_args":["--distributed-executor-backend","mp","--gpu-memory-utilization","0.9","--max-num-seqs","512","--max-num-batched-tokens","8192","--compilation-config","{\"mode\": 3, \"cudagraph_mode\": \"FULL_DECODE_ONLY\"}"],"extra_env":{"VLLM_ROCM_USE_AITER":"1"}}},"strategy_overrides":{"single_node_tp":{"tp":8,"hardware_overrides":{"hopper":{"extra_args":["--no-enable-flashinfer-autotune"]},"blackwell":{"extra_args":["--attention_config.use_fp4_indexer_cache=True","--no-enable-flashinfer-autotune"]}}},"single_node_dep":{"extra_args":["--data-parallel-size","4","--compilation-config","{\"cudagraph_mode\":\"FULL_AND_PIECEWISE\", \"custom_ops\":[\"all\"]}"]},"multi_node_dep":{"extra_args":["--compilation-config","{\"cudagraph_mode\":\"FULL_AND_PIECEWISE\", \"custom_ops\":[\"all\"]}"]},"pd_cluster":{"env":{"VLLM_USE_NCCL_SYMM_MEM":"1","NCCL_CUMEM_ENABLE":"1","NCCL_MNNVL_ENABLE":"1","NCCL_NVLS_ENABLE":"1"},"prefill":{"nodes":1,"parallelism":"dep","vllm_args":["--enforce-eager","--max-num-seqs","8","--max-num-batched-tokens","65536","--no-disable-hybrid-kv-cache-manager","--enable-sleep-mode"],"env":{}},"decode":{"nodes":1,"parallelism":"dep","vllm_args":["--max-num-seqs","1536","--max-num-batched-tokens","1536","--max-cudagraph-capture-size","1536","--compilation-config","{\"cudagraph_mode\":\"FULL_DECODE_ONLY\"}","--no-disable-hybrid-kv-cache-manager","--enable-sleep-mode"],"env":{}}}},"guide":"$22","hf_org":"deepseek-ai","hf_repo":"DeepSeek-V4-Flash","hf_id":"deepseek-ai/DeepSeek-V4-Flash","hf_released":"2026-04-22T06:04:20.000Z","engines":{"vllm":{"min_version":"0.20.0"},"sglang":{"engine":"sglang","model_id":"deepseek-ai/DeepSeek-V4-Flash","min_version":"v0.5.10","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":4,"b200":4},"variants":{"default":{"precision":"fp8"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","deepseekv4"]},"reasoning":{"args":["--reasoning-parser","deepseek-v4"]}},"guide":"$23"}},"default_engine":"vllm"},{"meta":{"title":"DeepSeek-OCR-2","slug":"deepseek-ocr-2","provider":"DeepSeek","description":"Next-generation DeepSeek OCR model with improved document-to-markdown grounding and optical context compression.","date_updated":"2026-04-17","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:78:meta:tasks","performance_headline":"Improved grounding and markdown conversion over DeepSeek-OCR","related_recipes":["deepseek-ai/DeepSeek-OCR"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:78:meta:hardware"},"model":{"model_id":"deepseek-ai/DeepSeek-OCR-2","min_vllm_version":"0.12.0","architecture":"dense","parameter_count":"3B","active_parameters":"3B","context_length":8192,"base_args":["--trust-remote-code","--logits_processors","vllm.model_executor.models.deepseek_ocr:NGramPerReqLogitsProcessor","--no-enable-prefix-caching","--mm-processor-cache-gb","0"],"base_env":{}},"dependencies":[{"command":"pip install vllm-ascend","brand":"Huawei","note":"Huawei Ascend NPU support is provided by the vllm-ascend plugin (requires CANN 9.0 + torch-npu; use the matching quay.io/ascend/vllm-ascend image). FP8 weights are served via W8A8 on Ascend. See https://docs.vllm.ai/projects/ascend"}],"features":{"text_only":{"description":"Skip loading the vision encoder for text-only workloads — frees VRAM for KV cache. Mutually exclusive with encoder_parallel.","args":["--language-model-only"]},"encoder_parallel":{"description":"Run the vision encoder in data-parallel mode — avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.","args":["--mm-encoder-tp-mode","data"]}},"opt_in_features":["text_only","encoder_parallel"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":7,"description":"Full precision BF16 (~3.4B params)"}},"compatible_strategies":["single_node_tp","multi_node_tp"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$24","hf_org":"deepseek-ai","hf_repo":"DeepSeek-OCR-2","hf_id":"deepseek-ai/DeepSeek-OCR-2","hf_released":"2026-01-27T02:56:54.000Z","engines":{"vllm":{"min_version":"0.12.0"},"sglang":{"engine":"sglang","model_id":"deepseek-ai/DeepSeek-OCR-2","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":1,"b200":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{},"guide":"$25"}},"default_engine":"vllm"},{"meta":{"title":"DeepSeek-V3.2","slug":"deepseek-v3.2","provider":"DeepSeek","description":"DeepSeek V3.2 MoE model with MLA attention, sparse attention, and scalable RL for strong reasoning and agent capabilities.","date_updated":"2026-04-01","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:115:meta:tasks","performance_headline":"GPT-5-level reasoning with efficient MoE inference","related_recipes":[],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:115:meta:hardware"},"model":{"model_id":"deepseek-ai/DeepSeek-V3.2","min_vllm_version":"0.18.0","architecture":"moe","parameter_count":"671B","active_parameters":"37B","context_length":163840,"supports_dcp":true,"base_args":["--trust-remote-code","--kernel-config.enable_flashinfer_autotune=False"],"base_env":{}},"dependencies":[{"note":"DeepGEMM pinned build for MQA logits (FP8 MoE kernels)","command":"uv pip install git+https://github.com/deepseek-ai/DeepGEMM.git@v2.1.1.post3 --no-build-isolation"},{"note":"Set VLLM_USE_DEEP_GEMM=0 to skip DeepGEMM for the MoE path (recommended on H20)","command":"export VLLM_USE_DEEP_GEMM=0","optional":true},{"command":"git clone https://github.com/Cambricon/vllm-mlu && pip install -e vllm-mlu","brand":"Cambricon","note":"Cambricon MLU (MLU370+/MLU590) runs vLLM via the out-of-tree vllm-mlu plugin. Requires Cambricon Neuware SDK 25.08 (gated — request from ecosystem@cambricon.com) and MLU-patched Ray 2.51.1 for multi-card TP. Day-0 DeepSeek-V3.2/V4 support. See https://github.com/Cambricon/vllm-mlu"}],"features":{"tool_calling":{"description":"Enable tool calling with DeepSeek V3.2 chat template support.","args":["--tokenizer-mode","deepseek_v32","--tool-call-parser","deepseek_v32","--enable-auto-tool-choice"]},"reasoning":{"description":"Enable reasoning/thinking mode with the DeepSeek V3 reasoning parser.","args":["--reasoning-parser","deepseek_v3"]},"spec_decoding":{"description":"Multi-Token Prediction speculative decoding with 3 speculative tokens.","args":["--speculative_config","{\"method\":\"mtp\",\"num_speculative_tokens\":3}"]}},"opt_in_features":["spec_decoding"],"variants":{"default":{"precision":"fp8","vram_minimum_gb":805,"description":"Native FP8 checkpoint (F8_E4M3)"},"nvfp4":{"model_id":"nvidia/DeepSeek-V3.2-NVFP4","precision":"nvfp4","vram_minimum_gb":403,"description":"NVIDIA FP4 quantized variant with FP8 KV cache for reduced VRAM usage.","extra_args":["--kv-cache-dtype","fp8"],"extra_env":{"VLLM_USE_FLASHINFER_MOE_FP4":"1"}}},"compatible_strategies":["single_node_tp","single_node_tep","single_node_dep","multi_node_tp","multi_node_tp_pp","multi_node_dep","multi_node_tep","pd_cluster"],"hardware_overrides":{"blackwell":{"extra_args":["--attention-backend","FLASHINFER_MLA"],"extra_env":{}},"amd":{"extra_args":[],"extra_env":{}}},"strategy_overrides":{"pd_cluster":{"prefill":{"extra_args":[],"extra_env":{}},"decode":{"extra_args":[],"extra_env":{}}},"single_node_dep":{"extra_args":[],"extra_env":{}}},"guide":"$26","hf_org":"deepseek-ai","hf_repo":"DeepSeek-V3.2","hf_id":"deepseek-ai/DeepSeek-V3.2","hf_released":"2025-12-01T02:34:49.000Z","engines":{"vllm":{"min_version":"0.18.0"},"sglang":{"engine":"sglang","model_id":"deepseek-ai/DeepSeek-V3.2","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":8,"b200":8},"variants":{"default":{"precision":"fp8"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","deepseekv32"]},"reasoning":{"args":["--reasoning-parser","deepseek-v3"]}},"guide":"$27"}},"default_engine":"vllm"},{"meta":{"title":"DeepSeek-OCR","slug":"deepseek-ocr","provider":"DeepSeek","description":"Frontier OCR model exploring optical context compression for LLMs, optimized for document parsing and markdown generation.","date_updated":"2026-04-17","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:79:meta:tasks","performance_headline":"Optical context compression for efficient OCR and document understanding","related_recipes":["deepseek-ai/DeepSeek-OCR-2"]},"model":{"model_id":"deepseek-ai/DeepSeek-OCR","min_vllm_version":"0.12.0","architecture":"dense","parameter_count":"3B","active_parameters":"3B","context_length":8192,"base_args":["--trust-remote-code","--logits_processors","vllm.model_executor.models.deepseek_ocr:NGramPerReqLogitsProcessor","--no-enable-prefix-caching","--mm-processor-cache-gb","0"],"base_env":{}},"features":{"text_only":{"description":"Skip loading the vision encoder for text-only workloads — frees VRAM for KV cache. Mutually exclusive with encoder_parallel.","args":["--language-model-only"]},"encoder_parallel":{"description":"Run the vision encoder in data-parallel mode — avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.","args":["--mm-encoder-tp-mode","data"]}},"opt_in_features":["text_only","encoder_parallel"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":7,"description":"Full precision BF16 (~3.3B params)"}},"compatible_strategies":["single_node_tp","multi_node_tp"],"hardware_overrides":{"amd":{"extra_args":[],"extra_env":{"VLLM_ROCM_USE_AITER":"1","SAFETENSORS_FAST_GPU":"1"}}},"strategy_overrides":{},"guide":"$28","hf_org":"deepseek-ai","hf_repo":"DeepSeek-OCR","hf_id":"deepseek-ai/DeepSeek-OCR","hf_released":"2025-10-17T06:22:05.000Z","engines":{"vllm":{"min_version":"0.12.0"},"sglang":{"engine":"sglang","model_id":"deepseek-ai/DeepSeek-OCR","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":1,"b200":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{},"guide":"$29"}},"default_engine":"vllm"},{"meta":{"title":"DeepSeek-V3.2-Exp","slug":"deepseek-v3.2-exp","provider":"DeepSeek","description":"Experimental DeepSeek-V3.2 preview with sparse attention (MQA-like logits) and FP8 KV cache; architecture matches DeepSeek-V3.1 except for the sparse attention mechanism.","date_updated":"2026-04-17","difficulty":"advanced","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:82:meta:tasks","performance_headline":"Sparse attention MoE with FP8 KV cache and strong GSM8K score (~0.96)","related_recipes":["deepseek-ai/DeepSeek-V3.1","deepseek-ai/DeepSeek-V3.2"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:82:meta:hardware"},"model":{"model_id":"deepseek-ai/DeepSeek-V3.2-Exp","min_vllm_version":"0.12.0","architecture":"moe","parameter_count":"671B","active_parameters":"37B","context_length":163840,"supports_dcp":true,"base_args":["--trust-remote-code"],"base_env":{}},"dependencies":[{"note":"DeepGEMM pinned build for MQA logits (FP8 MoE kernels)","command":"uv pip install git+https://github.com/deepseek-ai/DeepGEMM.git@v2.1.1.post3 --no-build-isolation"}],"features":{"tool_calling":{"description":"Enable tool calling with DeepSeek V3.2 chat template support.","args":["--tokenizer-mode","deepseek_v32","--tool-call-parser","deepseek_v32","--enable-auto-tool-choice"]},"reasoning":{"description":"Dynamic thinking mode via chat_template_kwargs, same as DeepSeek-V3.1.","args":["--reasoning-parser","deepseek_v3"]}},"opt_in_features":[],"variants":{"default":{"precision":"fp8","vram_minimum_gb":805,"description":"Native FP8 weights on 8xH200 (or H20, or 8xB200) with FP8 KV cache default"}},"compatible_strategies":["single_node_tp","single_node_tep","single_node_dep","multi_node_tp","multi_node_tp_pp","multi_node_dep","multi_node_tep"],"hardware_overrides":{"amd":{"extra_args":["--block-size","1","--kv-cache-dtype","bfloat16","--no-enable-prefix-caching","--max-num-batched-tokens","32768"],"extra_env":{"VLLM_ROCM_USE_AITER":"1","VLLM_ROCM_USE_AITER_MOE":"1","SAFETENSORS_FAST_GPU":"1","VLLM_RPC_TIMEOUT":"18000000"}}},"strategy_overrides":{},"guide":"$2a","hf_org":"deepseek-ai","hf_repo":"DeepSeek-V3.2-Exp","hf_id":"deepseek-ai/DeepSeek-V3.2-Exp","hf_released":"2025-09-29T06:07:26.000Z","engines":{"vllm":{"min_version":"0.12.0"},"sglang":{"engine":"sglang","model_id":"deepseek-ai/DeepSeek-V3.2-Exp","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":8,"b200":8},"variants":{"default":{"precision":"fp8"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","deepseekv31"]},"reasoning":{"args":["--reasoning-parser","deepseek-v3"]}},"guide":"$2b"}},"default_engine":"vllm"},{"meta":{"title":"DeepSeek-V3.1","slug":"deepseek-v3.1","provider":"DeepSeek","description":"DeepSeek-V3.1 is a hybrid MoE model that supports dynamic switching between thinking and non-thinking modes, with tool calling and function execution.","date_updated":"2026-04-17","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:81:meta:tasks","performance_headline":"Hybrid thinking / non-thinking MoE with native FP8 and tool calling","related_recipes":["deepseek-ai/DeepSeek-V3","deepseek-ai/DeepSeek-V3.2-Exp"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:81:meta:hardware"},"model":{"model_id":"deepseek-ai/DeepSeek-V3.1","min_vllm_version":"0.12.0","architecture":"moe","parameter_count":"671B","active_parameters":"37B","context_length":163840,"supports_dcp":true,"base_args":["--trust-remote-code","--enable-expert-parallel"],"base_env":{}},"dependencies":[{"command":"pip install vllm-ascend","brand":"Huawei","note":"Huawei Ascend NPU support is provided by the vllm-ascend plugin (requires CANN 9.0 + torch-npu; use the matching quay.io/ascend/vllm-ascend image). FP8 weights are served via W8A8 on Ascend. See https://docs.vllm.ai/projects/ascend"}],"features":{"tool_calling":{"description":"Enable DeepSeek-V3.1 tool calling with the deepseek_v31 tool-call parser.","args":["--enable-auto-tool-choice","--tool-call-parser","deepseek_v31","--chat-template","examples/tool_chat_template_deepseekv31.jinja"]},"reasoning":{"description":"Dynamic thinking mode via chat_template_kwargs={'thinking': true|false}. No separate parser flag is required; the chat template emits ... content.","args":["--reasoning-parser","deepseek_v3"]}},"opt_in_features":[],"variants":{"default":{"precision":"fp8","vram_minimum_gb":805,"description":"Native FP8 weights on 8xH200 (or H20) with 141GB per GPU"},"nvfp4":{"model_id":"nvidia/DeepSeek-V3.1-NVFP4","precision":"nvfp4","vram_minimum_gb":403,"description":"NVIDIA NVFP4 quantized weights for Blackwell GPUs","extra_args":["--kv-cache-dtype","fp8"],"extra_env":{"VLLM_USE_FLASHINFER_MOE_FP4":"1"}}},"compatible_strategies":["single_node_tp","single_node_tep","single_node_dep","multi_node_tp","multi_node_tp_pp","multi_node_dep","multi_node_tep","pd_cluster"],"hardware_overrides":{"amd":{"extra_args":[],"extra_env":{"VLLM_ROCM_USE_AITER":"1","VLLM_ROCM_USE_AITER_MOE":"1","SAFETENSORS_FAST_GPU":"1"}}},"strategy_overrides":{},"guide":"$2c","hf_org":"deepseek-ai","hf_repo":"DeepSeek-V3.1","hf_id":"deepseek-ai/DeepSeek-V3.1","hf_released":"2025-08-21T02:37:52.000Z","engines":{"vllm":{"min_version":"0.12.0"},"sglang":{"engine":"sglang","model_id":"deepseek-ai/DeepSeek-V3.1","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":8},"variants":{"default":{"precision":"fp8"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","deepseekv31"]},"reasoning":{"args":["--reasoning-parser","deepseek-v3"]}},"guide":"$2d"}},"default_engine":"vllm"},{"meta":{"title":"DeepSeek-R1","slug":"deepseek-r1","provider":"DeepSeek","description":"DeepSeek-R1 is a 671B-parameter MoE reasoning model built on the DeepSeek-V3 architecture, trained with large-scale reinforcement learning for strong chain-of-thought capabilities.","date_updated":"2026-04-17","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:80:meta:tasks","performance_headline":"Open-weights RL-trained reasoning model with native FP8 / FP4 variants","related_recipes":["deepseek-ai/DeepSeek-V3","deepseek-ai/DeepSeek-V3.1"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:80:meta:hardware"},"model":{"model_id":"deepseek-ai/DeepSeek-R1","min_vllm_version":"0.12.0","architecture":"moe","parameter_count":"671B","active_parameters":"37B","context_length":163840,"supports_dcp":true,"base_args":["--trust-remote-code","--enable-expert-parallel"],"base_env":{}},"dependencies":[{"command":"pip install vllm-ascend","brand":"Huawei","note":"Huawei Ascend NPU support is provided by the vllm-ascend plugin (requires CANN 9.0 + torch-npu; use the matching quay.io/ascend/vllm-ascend image). The FP8 weights are served via W8A8 on Ascend. See https://docs.vllm.ai/projects/ascend"},{"command":"docker pull image.sourcefind.cn:5000/dcu/admin/base/custom:vllm0.6.2-ubuntu22.04-dtk25.04-rc7-das1.5-py3.10","brand":"Hygon","note":"Hygon DCU K100-AI runs vLLM via the Sourcefind DTK image (ROCm/HIP). Set HSA_OVERRIDE_GFX_VERSION=9.2.8; device flags --device=/dev/kfd --device=/dev/dri -v /opt/hyhal:/opt/hyhal:ro; serve with --enforce-eager --dtype float16 (CUDA-graph capture unreliable; K100-AI is fp16/bf16 only — FP8 checkpoints run as fp16). See https://docs.gpustack.ai/0.5/tutorials/running-inference-with-hygon-dcus/"},{"command":"# deploy via GPUStack 2.1+ (Alibaba T-Head PPU backend)","brand":"T-Head","note":"Alibaba T-Head 真武 PPU 810E (96 GB, ~H20 class) serves vLLM/SGLang via GPUStack 2.1+, which orchestrates the vendor container images and a pluggable backend — raw vllm serve flags are abstracted. bf16/fp8. See https://github.com/gpustack/gpustack"},{"command":"docker pull wjie520/vllm_kunlun:uv_base","brand":"Kunlunxin","note":"Kunlunxin (Baidu) P800 runs vLLM via the out-of-tree vLLM-Kunlun plugin (P800 only). In the image: pip install vllm==0.15.1 --no-deps, build the plugin (v0.15.1-dev) + kunlun op wheels (kunlun_ops/xspeedgate_ops). TP across 8×P800; int8/awq/gptq/fp8. See https://github.com/baidu/vLLM-Kunlun"},{"command":"git clone https://github.com/Cambricon/vllm-mlu && pip install -e vllm-mlu","brand":"Cambricon","note":"Cambricon MLU (MLU370+/MLU590) runs vLLM via the out-of-tree vllm-mlu plugin. Requires Cambricon Neuware SDK 25.08 (gated — request from ecosystem@cambricon.com) and MLU-patched Ray 2.51.1 for multi-card TP. Day-0 DeepSeek-V3.2/V4 support. See https://github.com/Cambricon/vllm-mlu"}],"features":{"tool_calling":{"description":"Enable DeepSeek-R1 tool calling with the deepseek_v3 tool-call parser.","args":["--enable-auto-tool-choice","--tool-call-parser","deepseek_v3","--chat-template","examples/tool_chat_template_deepseekr1.jinja"]},"reasoning":{"description":"Enable reasoning/thinking mode with the DeepSeek R1 reasoning parser.","args":["--reasoning-parser","deepseek_r1"]}},"opt_in_features":[],"variants":{"default":{"precision":"fp8","vram_minimum_gb":805,"description":"Native FP8 weights on 8xH200 (recommended)"},"r1_0528":{"model_id":"deepseek-ai/DeepSeek-R1-0528","precision":"fp8","vram_minimum_gb":805,"description":"May 2025 DeepSeek-R1 refresh (DeepSeek-R1-0528)"},"nvfp4":{"model_id":"nvidia/DeepSeek-R1-0528-NVFP4-v2","precision":"nvfp4","vram_minimum_gb":403,"description":"NVIDIA NVFP4 quantized weights for Blackwell GPUs","extra_args":["--kv-cache-dtype","fp8"],"extra_env":{"VLLM_USE_FLASHINFER_MOE_FP4":"1"}}},"compatible_strategies":["single_node_tp","single_node_tep","single_node_dep","multi_node_tp","multi_node_tp_pp","multi_node_dep","multi_node_tep","pd_cluster"],"hardware_overrides":{"hopper":{"extra_args":[],"extra_env":{"VLLM_USE_FLASHINFER_MOE_FP8":"1"}},"blackwell":{"extra_args":[],"extra_env":{"VLLM_USE_FLASHINFER_MOE_FP4":"1"}},"amd":{"extra_args":[],"extra_env":{"VLLM_ROCM_USE_AITER":"1","VLLM_ROCM_USE_AITER_MOE":"1","SAFETENSORS_FAST_GPU":"1"}}},"strategy_overrides":{},"guide":"$2e","hf_org":"deepseek-ai","hf_repo":"DeepSeek-R1","hf_id":"deepseek-ai/DeepSeek-R1","hf_released":"2025-01-20T03:46:07.000Z","engines":{"vllm":{"min_version":"0.12.0"},"sglang":{"engine":"sglang","model_id":"deepseek-ai/DeepSeek-R1","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":8,"b200":8},"variants":{"default":{"precision":"fp8"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","deepseekv3"]},"reasoning":{"args":["--reasoning-parser","deepseek-r1"]}},"guide":"$2f"}},"default_engine":"vllm"},{"meta":{"title":"DeepSeek-V3","slug":"deepseek-v3","provider":"DeepSeek","description":"DeepSeek-V3 is a 671B-parameter Mixture-of-Experts model with native FP8 weights and strong reasoning, coding, and math capabilities.","date_updated":"2026-04-17","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:83:meta:tasks","performance_headline":"Frontier open-weights MoE with native FP8 and FP4 variants","related_recipes":["deepseek-ai/DeepSeek-R1","deepseek-ai/DeepSeek-V3.1"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:83:meta:hardware"},"model":{"model_id":"deepseek-ai/DeepSeek-V3","min_vllm_version":"0.12.0","architecture":"moe","parameter_count":"671B","active_parameters":"37B","context_length":163840,"supports_dcp":true,"base_args":["--trust-remote-code","--enable-expert-parallel"],"base_env":{}},"dependencies":[{"command":"pip install vllm-ascend","brand":"Huawei","note":"Huawei Ascend NPU support is provided by the vllm-ascend plugin (requires CANN 9.0 + torch-npu; use the matching quay.io/ascend/vllm-ascend image). FP8 weights are served via W8A8 on Ascend. See https://docs.vllm.ai/projects/ascend"},{"command":"docker pull image.sourcefind.cn:5000/dcu/admin/base/custom:vllm0.6.2-ubuntu22.04-dtk25.04-rc7-das1.5-py3.10","brand":"Hygon","note":"Hygon DCU K100-AI runs vLLM via the Sourcefind DTK image (ROCm/HIP). Set HSA_OVERRIDE_GFX_VERSION=9.2.8; device flags --device=/dev/kfd --device=/dev/dri -v /opt/hyhal:/opt/hyhal:ro; serve with --enforce-eager --dtype float16 (CUDA-graph capture unreliable; K100-AI is fp16/bf16 only — FP8 checkpoints run as fp16). See https://docs.gpustack.ai/0.5/tutorials/running-inference-with-hygon-dcus/"},{"command":"# deploy via GPUStack 2.1+ (Alibaba T-Head PPU backend)","brand":"T-Head","note":"Alibaba T-Head 真武 PPU 810E (96 GB, ~H20 class) serves vLLM/SGLang via GPUStack 2.1+, which orchestrates the vendor container images and a pluggable backend — raw vllm serve flags are abstracted. bf16/fp8. See https://github.com/gpustack/gpustack"},{"command":"docker pull wjie520/vllm_kunlun:uv_base","brand":"Kunlunxin","note":"Kunlunxin (Baidu) P800 runs vLLM via the out-of-tree vLLM-Kunlun plugin (P800 only). In the image: pip install vllm==0.15.1 --no-deps, build the plugin (v0.15.1-dev) + kunlun op wheels (kunlun_ops/xspeedgate_ops). TP across 8×P800; int8/awq/gptq/fp8. See https://github.com/baidu/vLLM-Kunlun"},{"command":"git clone https://github.com/Cambricon/vllm-mlu && pip install -e vllm-mlu","brand":"Cambricon","note":"Cambricon MLU (MLU370+/MLU590) runs vLLM via the out-of-tree vllm-mlu plugin. Requires Cambricon Neuware SDK 25.08 (gated — request from ecosystem@cambricon.com) and MLU-patched Ray 2.51.1 for multi-card TP. Day-0 DeepSeek-V3.2/V4 support. See https://github.com/Cambricon/vllm-mlu"}],"features":{"tool_calling":{"description":"Enable DeepSeek-V3 tool calling with the deepseek_v3 tool-call parser.","args":["--enable-auto-tool-choice","--tool-call-parser","deepseek_v3","--chat-template","examples/tool_chat_template_deepseekv3.jinja"]},"reasoning":{"description":"Enable reasoning/thinking mode with the DeepSeek R1 reasoning parser.","args":["--reasoning-parser","deepseek_v3"]}},"opt_in_features":[],"variants":{"default":{"precision":"fp8","vram_minimum_gb":805,"description":"Native FP8 weights on 8xH200 (recommended)"},"fp4":{"model_id":"nvidia/DeepSeek-V3-FP4","precision":"fp4","vram_minimum_gb":403,"description":"NVIDIA FP4 quantized weights for Blackwell (e.g. 4xB200)","extra_env":{"VLLM_USE_FLASHINFER_MOE_FP4":"1"}}},"compatible_strategies":["single_node_tp","single_node_tep","single_node_dep","multi_node_tp","multi_node_tp_pp","multi_node_dep","multi_node_tep","pd_cluster"],"hardware_overrides":{"hopper":{"extra_args":[],"extra_env":{"VLLM_USE_FLASHINFER_MOE_FP8":"1"}},"blackwell":{"extra_args":[],"extra_env":{"VLLM_USE_FLASHINFER_MOE_FP4":"1"}},"amd":{"extra_args":[],"extra_env":{"VLLM_ROCM_USE_AITER":"1","VLLM_ROCM_USE_AITER_MOE":"1","SAFETENSORS_FAST_GPU":"1"}}},"strategy_overrides":{},"guide":"$30","hf_org":"deepseek-ai","hf_repo":"DeepSeek-V3","hf_id":"deepseek-ai/DeepSeek-V3","hf_released":"2024-12-25T12:52:23.000Z","engines":{"vllm":{"min_version":"0.12.0"},"sglang":{"engine":"sglang","model_id":"deepseek-ai/DeepSeek-V3","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":8,"b200":8},"variants":{"default":{"precision":"fp8"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","deepseekv3"]}},"guide":"$31"}},"default_engine":"vllm"}]],["google",[{"meta":{"title":"DiffusionGemma 26B-A4B IT","slug":"diffusiongemma-26b-a4b-it","provider":"Google","description":"Google's DiffusionGemma — a block-diffusion language model built on Gemma 4's MoE backbone (26B total / 4B active). Generates tokens via iterative denoising over a fixed-length canvas rather than left-to-right autoregressive decoding, enabling higher throughput with parallel block generation.","date_updated":"2026-06-10","difficulty":"advanced","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:0:meta:tasks","performance_headline":"Block-diffusion MoE — 26B total / 4B active, canvas-based parallel generation with ~1.9x throughput vs autoregressive baseline","related_recipes":["google/gemma-4-26B-A4B-it","google/gemma-4-31B-it"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:0:meta:hardware"},"model":{"model_id":"google/diffusiongemma-26B-A4B-it","min_vllm_version":"day0-docker","docker_image":"vllm/vllm-openai:gemma","install":{"docker":null,"pip":false},"architecture":"moe","parameter_count":"26B","active_parameters":"4B","context_length":262144,"base_args":["--max-num-seqs","4"],"base_env":{}},"dependencies":[],"features":{"tool_calling":{"description":"Enable automatic tool choice with Gemma 4 parser and chat template","args":["--enable-auto-tool-choice","--tool-call-parser","gemma4","--chat-template","examples/tool_chat_template_gemma4.jinja"]},"reasoning":{"description":"Enable structured thinking/reasoning output","args":["--reasoning-parser","gemma4"]}},"opt_in_features":[],"variants":{"default":{"precision":"bf16","vram_minimum_gb":64,"description":"Full BF16 — single 80 GB NVIDIA GPU (H100/H200). Requires --max-num-seqs 4 to avoid OOM from diffusion state buffers."},"fp8":{"model_id":"RedHatAI/diffusiongemma-26B-A4B-it-FP8-dynamic","precision":"fp8","vram_minimum_gb":32,"description":"FP8 (E4M3) weights with dynamic per-token activation quantization"},"nvfp4":{"model_id":"RedHatAI/diffusiongemma-26B-A4B-it-NVFP4","precision":"nvfp4","vram_minimum_gb":24,"description":"NVFP4 (4-bit) quantized weights"}},"compatible_strategies":["single_node_tp"],"hardware_overrides":{},"strategy_overrides":{"single_node_tp":{"tp":1}},"guide":"$32","hf_org":"google","hf_repo":"diffusiongemma-26B-A4B-it","hf_id":"google/diffusiongemma-26B-A4B-it","hf_released":"2026-06-09T12:40:12.000Z","engines":{"vllm":{"min_version":"day0-docker"},"sglang":{"engine":"sglang","model_id":"google/diffusiongemma-26B-A4B-it","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code","--dllm-algorithm","Gemma4Renoise"],"tp_by_hardware":{"h200":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{},"guide":"$33"}},"default_engine":"vllm"},{"meta":{"title":"Gemma 4 12B IT","slug":"gemma-4-12b-it","provider":"Google","description":"Google's encoder-free unified Gemma 4 dense model (12B) with native text, image, and audio, plus thinking mode and tool-use protocol.","date_updated":"2026-06-04","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:6:meta:tasks","performance_headline":"Encoder-free unified multimodal model with audio, structured thinking, and function calling — runs on a single 40 GB+ GPU","related_recipes":["google/gemma-4-E4B-it","google/gemma-4-26B-A4B-it","google/gemma-4-31B-it"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:6:meta:hardware"},"model":{"model_id":"google/gemma-4-12B-it","min_vllm_version":"nightly","docker_image":"vllm/vllm-openai:gemma4-unified","nightly_required":true,"install":{"docker":{},"pip":{}},"architecture":"dense","parameter_count":"12B","active_parameters":"12B","context_length":131072,"base_args":[],"base_env":{}},"dependencies":[{"note":"Audio extras — only needed when serving the audio modality","command":"uv pip install \"vllm[audio]\"","optional":true}],"features":{"tool_calling":{"description":"Enable automatic tool choice with Gemma 4 parser and chat template","args":["--enable-auto-tool-choice","--tool-call-parser","gemma4","--chat-template","examples/tool_chat_template_gemma4.jinja"]},"reasoning":{"description":"Enable structured thinking/reasoning output","args":["--reasoning-parser","gemma4"]},"spec_decoding":{"description":"MTP speculative decoding for accelerated inference","args":["--speculative-config","{\"model\":\"google/gemma-4-12B-it-assistant\",\"num_speculative_tokens\":4}"]}},"opt_in_features":["spec_decoding"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":29,"description":"Full BF16 — single 40 GB+ NVIDIA GPU"},"w4a16":{"model_id":"google/gemma-4-12B-it-qat-w4a16-ct","precision":"int4","vram_minimum_gb":9,"description":"W4A16 QAT (4-bit weights, 16-bit activations, group_size=32) via compressed-tensors — runs on any GPU"}},"compatible_strategies":["single_node_tp","multi_node_tp"],"hardware_overrides":{},"strategy_overrides":{"single_node_tp":{"tp":1}},"guide":"$34","hf_org":"google","hf_repo":"gemma-4-12B-it","hf_id":"google/gemma-4-12B-it","hf_released":"2026-05-23T01:17:15.000Z","engines":{"vllm":{"min_version":"nightly"},"sglang":{"engine":"sglang","model_id":"google/gemma-4-12B-it","min_version":"v0.5.10","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h100":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","gemma4"]},"reasoning":{"args":["--reasoning-parser","gemma4"]}},"guide":"$35"}},"default_engine":"vllm"},{"meta":{"title":"Gemma 4 26B-A4B IT","slug":"gemma-4-26b-a4b-it","provider":"Google","description":"Google's Gemma 4 MoE multimodal model (26B total / 4B active) with 128 fine-grained experts, top-8 routing, thinking mode, and tool-use protocol.","date_updated":"2026-05-11","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:35:meta:tasks","performance_headline":"MoE multimodal model — 26B total / 4B active, 128 experts with top-8 routing","related_recipes":["google/gemma-4-E2B-it","google/gemma-4-E4B-it","google/gemma-4-31B-it"],"platforms":[{"id":"modal","blurb":"Serverless deploy via the Gemma 4 Modal script bundled with this recipe.","script":"https://github.com/weetime/recipes/blob/main/Google/gemma4-modal.py","install":"curl -O https://raw.githubusercontent.com/weetime/recipes/main/Google/gemma4-modal.py\npip install modal\nmodal setup\nmodal deploy gemma4-modal.py\nmodal run gemma4-modal.py\n"}],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:35:meta:hardware"},"model":{"model_id":"google/gemma-4-26B-A4B-it","min_vllm_version":"0.19.1","architecture":"moe","parameter_count":"26B","active_parameters":"4B","context_length":131072,"base_args":[],"base_env":{}},"dependencies":[],"features":{"tool_calling":{"description":"Enable automatic tool choice with Gemma 4 parser and chat template","args":["--enable-auto-tool-choice","--tool-call-parser","gemma4","--chat-template","examples/tool_chat_template_gemma4.jinja"]},"reasoning":{"description":"Enable structured thinking/reasoning output","args":["--reasoning-parser","gemma4"]},"text_only":{"description":"Skip loading the vision encoder for text-only workloads — frees VRAM for KV cache. Mutually exclusive with encoder_parallel.","args":["--language-model-only"]},"spec_decoding":{"description":"MTP speculative decoding for accelerated inference","args":["--speculative-config","{\"model\":\"google/gemma-4-26B-A4B-it-assistant\",\"num_speculative_tokens\":4}"]}},"opt_in_features":["text_only","spec_decoding"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":64,"description":"Full BF16 — single 80 GB NVIDIA GPU or 1x MI300X/MI325X/MI350X/MI355X or 2x Xeon6/Xeon5 NUMA nodes"},"fp8":{"model_id":"RedHatAI/gemma-4-26B-A4B-it-FP8-dynamic","precision":"fp8","vram_minimum_gb":32,"description":"FP8 (E4M3) weights with dynamic per-token activation quantization — Hopper or Blackwell","extra_args":["--kv-cache-dtype","fp8"]},"nvfp4":{"model_id":"RedHatAI/gemma-4-26B-A4B-it-NVFP4","precision":"nvfp4","vram_minimum_gb":16,"description":"NVFP4 (4-bit) quantized weights — requires Blackwell (B200/B300)","extra_args":["--kv-cache-dtype","fp8"]}},"compatible_strategies":["single_node_tp","single_node_tep","single_node_dep","multi_node_tp","multi_node_tep","multi_node_dep"],"hardware_overrides":{"cpu":{"extra_env":{"VLLM_CPU_KVCACHE_SPACE":"40","VLLM_CPU_ATTN_SPLIT_KV":"0"}}},"strategy_overrides":{"single_node_tp":{"tp":1}},"guide":"$36","hf_org":"google","hf_repo":"gemma-4-26B-A4B-it","hf_id":"google/gemma-4-26B-A4B-it","hf_released":"2026-03-11T21:25:57.000Z","engines":{"vllm":{"min_version":"0.19.1"},"sglang":{"engine":"sglang","model_id":"google/gemma-4-26B-A4B-it","min_version":"v0.5.10","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":1,"b200":1,"mi300x":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","gemma4"]},"reasoning":{"args":["--reasoning-parser","gemma4"]}},"guide":"$37"}},"default_engine":"vllm"},{"meta":{"title":"Gemma 4 31B IT","slug":"gemma-4-31b-it","provider":"Google","description":"Google's unified multimodal Gemma 4 dense model (31B) with native text, image, and audio, plus thinking mode and tool-use protocol.","date_updated":"2026-05-11","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:36:meta:tasks","performance_headline":"Unified multimodal model with structured thinking, function calling, dynamic vision resolution","related_recipes":[],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:36:meta:hardware"},"model":{"model_id":"google/gemma-4-31B-it","min_vllm_version":"0.19.1","architecture":"dense","parameter_count":"31B","active_parameters":"31B","context_length":262144,"base_args":[],"base_env":{}},"dependencies":[{"note":"Audio extras — only needed when serving the audio modality","command":"uv pip install \"vllm[audio]\"","optional":true}],"features":{"tool_calling":{"description":"Enable automatic tool choice with Gemma 4 parser and chat template","args":["--enable-auto-tool-choice","--tool-call-parser","gemma4","--chat-template","examples/tool_chat_template_gemma4.jinja"]},"reasoning":{"description":"Enable structured thinking/reasoning output","args":["--reasoning-parser","gemma4"]},"text_only":{"description":"Skip loading the vision encoder for text-only workloads — frees VRAM for KV cache. Mutually exclusive with encoder_parallel.","args":["--language-model-only"]},"spec_decoding":{"description":"MTP speculative decoding for accelerated inference","args":["--speculative-config","{\"model\":\"google/gemma-4-31B-it-assistant\",\"num_speculative_tokens\":4}"]}},"opt_in_features":["text_only","spec_decoding"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":75,"description":"Full BF16 — single 80 GB NVIDIA GPU or 1x MI300X/MI325X/MI350X/MI355X"},"fp8":{"model_id":"RedHatAI/gemma-4-31B-it-FP8-dynamic","precision":"fp8","vram_minimum_gb":38,"description":"FP8 (E4M3) linear weights with dynamic per-token activation quantization (vision tower stays BF16) — Hopper or Blackwell","extra_args":["--kv-cache-dtype","fp8"]},"nvfp4":{"model_id":"nvidia/gemma-4-31B-it-NVFP4","precision":"nvfp4","vram_minimum_gb":19,"description":"NVIDIA NVFP4 quantized weights for Blackwell GPUs","extra_args":["--kv-cache-dtype","fp8"]},"w4a16":{"model_id":"google/gemma-4-31B-it-qat-w4a16-ct","precision":"int4","vram_minimum_gb":20,"description":"W4A16 QAT (4-bit weights, 16-bit activations, group_size=32) via compressed-tensors — runs on any GPU"}},"compatible_strategies":["single_node_tp","multi_node_tp"],"hardware_overrides":{},"strategy_overrides":{"single_node_tp":{"tp":1}},"guide":"$38","hf_org":"google","hf_repo":"gemma-4-31B-it","hf_id":"google/gemma-4-31B-it","hf_released":"2026-03-11T18:22:36.000Z","engines":{"vllm":{"min_version":"0.19.1"},"sglang":{"engine":"sglang","model_id":"google/gemma-4-31B-it","min_version":"v0.5.10","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":2,"b200":1,"mi300x":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","gemma4"]},"reasoning":{"args":["--reasoning-parser","gemma4"]}},"guide":"$39"}},"default_engine":"vllm"},{"meta":{"title":"Gemma 4 E2B IT","slug":"gemma-4-e2b-it","provider":"Google","description":"Google's compact Gemma 4 multimodal model (effective 2B) with native text, image, and audio, plus thinking mode and tool-use protocol.","date_updated":"2026-05-11","difficulty":"beginner","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:37:meta:tasks","performance_headline":"Compact unified multimodal model with audio, thinking, and function calling — runs on a single 24 GB+ GPU","related_recipes":["google/gemma-4-E4B-it","google/gemma-4-31B-it","google/gemma-4-26B-A4B-it"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:37:meta:hardware"},"model":{"model_id":"google/gemma-4-E2B-it","min_vllm_version":"0.19.1","architecture":"dense","parameter_count":"5B","active_parameters":"5B","context_length":131072,"base_args":[],"base_env":{}},"dependencies":[{"note":"Audio extras — needed to serve the audio modality","command":"uv pip install \"vllm[audio]\"","optional":true}],"features":{"tool_calling":{"description":"Enable automatic tool choice with Gemma 4 parser and chat template","args":["--enable-auto-tool-choice","--tool-call-parser","gemma4","--chat-template","examples/tool_chat_template_gemma4.jinja"]},"reasoning":{"description":"Enable structured thinking/reasoning output","args":["--reasoning-parser","gemma4"]},"text_only":{"description":"Skip loading the vision and audio encoders for text-only workloads — frees VRAM for KV cache. Mutually exclusive with encoder_parallel.","args":["--language-model-only"]},"spec_decoding":{"description":"MTP speculative decoding with centroids masking for accelerated inference","args":["--speculative-config","{\"model\":\"google/gemma-4-E2B-it-assistant\",\"num_speculative_tokens\":2}"]}},"opt_in_features":["text_only","spec_decoding"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":13,"description":"Full BF16 — single 24 GB+ NVIDIA GPU or 1x MI300X/MI325X/MI350X/MI355X or 1x Xeon6/Xeon5 NUMA node"},"fp8":{"model_id":"RedHatAI/gemma-4-E2B-it-FP8-dynamic","precision":"fp8","vram_minimum_gb":6,"description":"FP8 (E4M3) linear weights with dynamic per-token activation quantization (vision/audio encoders stay BF16) — Hopper or Blackwell","extra_args":["--kv-cache-dtype","fp8"]},"w4a16":{"model_id":"google/gemma-4-E2B-it-qat-w4a16-ct","precision":"int4","vram_minimum_gb":8,"description":"W4A16 QAT (4-bit weights, 16-bit activations, group_size=32) via compressed-tensors — runs on any GPU"},"mobile":{"model_id":"google/gemma-4-E2B-it-qat-mobile-ct","precision":"int2/4/8","vram_minimum_gb":4,"description":"Mobile QAT — mixed int2/4/8 compressed-tensors (int2 embeddings + lm_head + deep MLP, int4 attention + early MLP, int8 vision + gates). 3.6× weight compression vs BF16, up to 1.25× faster decode at low batch"}},"compatible_strategies":["single_node_tp","multi_node_tp"],"hardware_overrides":{"cpu":{"extra_env":{"VLLM_CPU_KVCACHE_SPACE":"40","VLLM_CPU_ATTN_SPLIT_KV":"0"}}},"strategy_overrides":{"single_node_tp":{"tp":1}},"guide":"$3a","hf_org":"google","hf_repo":"gemma-4-E2B-it","hf_id":"google/gemma-4-E2B-it","hf_released":"2026-03-02T19:58:09.000Z","engines":{"vllm":{"min_version":"0.19.1"},"sglang":{"engine":"sglang","model_id":"google/gemma-4-E2B-it","min_version":"v0.5.10","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":1,"b200":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","gemma4"]},"reasoning":{"args":["--reasoning-parser","gemma4"]}},"guide":"$3b"}},"default_engine":"vllm"},{"meta":{"title":"Gemma 4 E4B IT","slug":"gemma-4-e4b-it","provider":"Google","description":"Google's compact Gemma 4 multimodal model (effective 4B) with native text, image, and audio, plus thinking mode and tool-use protocol.","date_updated":"2026-05-11","difficulty":"beginner","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:38:meta:tasks","performance_headline":"Effective-4B unified multimodal model with audio, thinking, and function calling — runs on a single 24 GB+ GPU","related_recipes":["google/gemma-4-E2B-it","google/gemma-4-31B-it","google/gemma-4-26B-A4B-it"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:38:meta:hardware"},"model":{"model_id":"google/gemma-4-E4B-it","min_vllm_version":"0.19.1","architecture":"dense","parameter_count":"8B","active_parameters":"8B","context_length":131072,"base_args":[],"base_env":{}},"dependencies":[{"note":"Audio extras — needed to serve the audio modality","command":"uv pip install \"vllm[audio]\"","optional":true}],"features":{"tool_calling":{"description":"Enable automatic tool choice with Gemma 4 parser and chat template","args":["--enable-auto-tool-choice","--tool-call-parser","gemma4","--chat-template","examples/tool_chat_template_gemma4.jinja"]},"reasoning":{"description":"Enable structured thinking/reasoning output","args":["--reasoning-parser","gemma4"]},"text_only":{"description":"Skip loading the vision and audio encoders for text-only workloads — frees VRAM for KV cache. Mutually exclusive with encoder_parallel.","args":["--language-model-only"]},"spec_decoding":{"description":"MTP speculative decoding with centroids masking for accelerated inference","args":["--speculative-config","{\"model\":\"google/gemma-4-E4B-it-assistant\",\"num_speculative_tokens\":4}"]}},"opt_in_features":["text_only","spec_decoding"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":20,"description":"Full BF16 — single 24 GB+ NVIDIA GPU or 1x MI300X/MI325X/MI350X/MI355Xi or 2x Xeon6/Xeon5 NUMA nodes"},"fp8":{"model_id":"RedHatAI/gemma-4-E4B-it-FP8-dynamic","precision":"fp8","vram_minimum_gb":10,"description":"FP8 (E4M3) linear weights with dynamic per-token activation quantization (vision/audio encoders stay BF16) — Hopper or Blackwell","extra_args":["--kv-cache-dtype","fp8"]},"w4a16":{"model_id":"google/gemma-4-E4B-it-qat-w4a16-ct","precision":"int4","vram_minimum_gb":10,"description":"W4A16 QAT (4-bit weights, 16-bit activations, group_size=32) via compressed-tensors — runs on any GPU"},"mobile":{"model_id":"google/gemma-4-E4B-it-qat-mobile-ct","precision":"int2/4/8","vram_minimum_gb":5,"description":"Mobile QAT — mixed int2/4/8 compressed-tensors (int2 embeddings + lm_head + audio, int4 LLM attention + MLP, int8 vision + gates). 4.1× weight compression vs BF16, up to 1.5× faster decode at low batch"}},"compatible_strategies":["single_node_tp","multi_node_tp"],"hardware_overrides":{"cpu":{"extra_env":{"VLLM_CPU_KVCACHE_SPACE":"40","VLLM_CPU_ATTN_SPLIT_KV":"0"}}},"strategy_overrides":{"single_node_tp":{"tp":1}},"guide":"$3c","hf_org":"google","hf_repo":"gemma-4-E4B-it","hf_id":"google/gemma-4-E4B-it","hf_released":"2026-03-02T19:57:40.000Z","engines":{"vllm":{"min_version":"0.19.1"},"sglang":{"engine":"sglang","model_id":"google/gemma-4-E4B-it","min_version":"v0.5.10","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":1,"b200":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","gemma4"]},"reasoning":{"args":["--reasoning-parser","gemma4"]}},"guide":"$3d"}},"default_engine":"vllm"},{"meta":{"title":"TranslateGemma 27B IT","slug":"translategemma-27b-it","provider":"Google","description":"Lightweight open translation model from Google (based on Gemma 3) supporting 55 languages. Served via the vLLM-optimized Infomaniak-AI checkpoint.","date_updated":"2026-04-17","difficulty":"beginner","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:84:meta:tasks","performance_headline":"Deployable on laptops/desktops and cloud GPUs; vLLM-optimized checkpoint removes custom JSON inputs","related_recipes":[]},"model":{"model_id":"google/translategemma-27b-it","min_vllm_version":"0.14.1","architecture":"dense","parameter_count":"27B","active_parameters":"27B","context_length":131072,"base_args":[],"base_env":{}},"features":{},"opt_in_features":[],"variants":{"default":{"precision":"bf16","vram_minimum_gb":65,"description":"Original Google checkpoint in BF16 (has vLLM compatibility issues — prefer vllm-optimized variant)"},"vllm_optimized":{"model_id":"Infomaniak-AI/vllm-translategemma-27b-it","precision":"bf16","vram_minimum_gb":65,"description":"Infomaniak-AI vLLM-optimized checkpoint — recommended. Fixes RoPE config, EOS token, and replaces custom JSON inputs with string delimiters."},"small_4b":{"model_id":"google/translategemma-4b-it","precision":"bf16","vram_minimum_gb":10,"description":"4B variant for lower-resource deployments (prefer Infomaniak-AI/vllm-translategemma-4b-it)"}},"compatible_strategies":["single_node_tp","multi_node_tp"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$3e","hf_org":"google","hf_repo":"translategemma-27b-it","hf_id":"google/translategemma-27b-it","hf_released":"2026-01-12T16:12:41.000Z","engines":{"vllm":{"min_version":"0.14.1"},"sglang":{"engine":"sglang","model_id":"google/translategemma-27b-it","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{},"guide":"$3f"}},"default_engine":"vllm"}]],["inclusionAI",[{"meta":{"title":"Ring-2.6-1T","slug":"ring-2.6-1t","provider":"inclusionAI","description":"Ring-2.6-1T (BailingMoeV2_5) FP8 thinking model with 1T total / 50B active params, hybrid linear + MLA attention, 128K context","date_updated":"2026-05-15","difficulty":"advanced","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:25:meta:tasks","related_recipes":["inclusionAI/Ling-2.6-1T","inclusionAI/Ring-1T-FP8"]},"model":{"model_id":"inclusionAI/Ring-2.6-1T","min_vllm_version":"0.20.2","architecture":"moe","parameter_count":"1T","active_parameters":"50B","context_length":131072,"base_args":["--trust-remote-code","--tensor-parallel-size","8"],"base_env":{}},"features":{},"opt_in_features":[],"variants":{"default":{"precision":"fp8","vram_minimum_gb":1200,"description":"FP8 weights with TP=8 on B300 or 8x MI300X/MI355X-class nodes"}},"compatible_strategies":["single_node_tp","multi_node_tp","multi_node_tp_pp","multi_node_tep","multi_node_dep","pd_cluster"],"hardware_overrides":{"amd":{"extra_env":{"VLLM_ROCM_USE_AITER":"1"}}},"strategy_overrides":{"single_node_tp":{"tp":8}},"guide":"$40","hf_org":"inclusionAI","hf_repo":"Ring-2.6-1T","hf_id":"inclusionAI/Ring-2.6-1T","hf_released":"2026-05-14T08:10:02.000Z","engines":{"vllm":{"min_version":"0.20.2"},"sglang":{"engine":"sglang","model_id":"inclusionAI/Ring-2.6-1T","min_version":"v0.5.10.post1","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"b300":8,"mi300x":8},"variants":{"default":{"precision":"fp8"}},"strategies":{"single_node_tp":{}},"features":{},"guide":"$41"}},"default_engine":"vllm"},{"meta":{"title":"Ling-2.6-1T","slug":"ling-2.6-1t","provider":"inclusionAI","description":"Ling-2.6-1T (BailingMoeV2_5) FP8 instruct model with 1T total / 50B active params, hybrid linear + MLA attention, 262K context","date_updated":"2026-05-13","difficulty":"advanced","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:29:meta:tasks","related_recipes":["inclusionAI/Ling-2.6-flash","inclusionAI/Ring-1T-FP8"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:29:meta:hardware"},"model":{"model_id":"inclusionAI/Ling-2.6-1T","min_vllm_version":"0.20.2","architecture":"moe","parameter_count":"1T","active_parameters":"50B","context_length":262144,"base_args":["--trust-remote-code","--tensor-parallel-size","8"],"base_env":{}},"features":{},"opt_in_features":[],"variants":{"default":{"precision":"fp8","vram_minimum_gb":1200,"description":"FP8 weights with TP=8 on B300 or 8x MI300X/MI355X-class nodes"}},"compatible_strategies":["single_node_tp","multi_node_tp","multi_node_tp_pp","multi_node_tep","multi_node_dep"],"hardware_overrides":{"amd":{"extra_env":{"VLLM_ROCM_USE_AITER":"1"}}},"strategy_overrides":{"single_node_tp":{"tp":8}},"guide":"$42","hf_org":"inclusionAI","hf_repo":"Ling-2.6-1T","hf_id":"inclusionAI/Ling-2.6-1T","hf_released":"2026-04-29T03:19:36.000Z","engines":{"vllm":{"min_version":"0.20.2"},"sglang":{"engine":"sglang","model_id":"inclusionAI/Ling-2.6-1T","min_version":"v0.5.10.post1","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"b300":8,"mi300x":8},"variants":{"default":{"precision":"fp8"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","qwen25"]}},"guide":"$43"}},"default_engine":"vllm"},{"meta":{"title":"Ling-2.6-flash","slug":"ling-2.6-flash","provider":"inclusionAI","description":"Ling-2.6-flash (BailingMoeV2_5) instruct model with 104B total / 7.4B active params, hybrid linear + MLA attention, 128K context, optimized for agent workloads","date_updated":"2026-05-13","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:30:meta:tasks","related_recipes":["inclusionAI/Ring-1T-FP8"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:30:meta:hardware"},"model":{"model_id":"inclusionAI/Ling-2.6-flash","min_vllm_version":"0.20.2","architecture":"moe","parameter_count":"104B","active_parameters":"7.4B","context_length":131072,"base_args":["--trust-remote-code","--tensor-parallel-size","4"],"base_env":{}},"features":{},"opt_in_features":[],"variants":{"default":{"precision":"bf16","vram_minimum_gb":250,"description":"BF16 weights with base TP=4; the guide shows a tested 2-GPU AMD command"}},"compatible_strategies":["single_node_tp","multi_node_tp","multi_node_tp_pp","multi_node_tep","multi_node_dep"],"hardware_overrides":{"amd":{"extra_env":{"VLLM_ROCM_USE_AITER":"1"}}},"strategy_overrides":{"single_node_tp":{"tp":4}},"guide":"$44","hf_org":"inclusionAI","hf_repo":"Ling-2.6-flash","hf_id":"inclusionAI/Ling-2.6-flash","hf_released":"2026-04-28T03:27:56.000Z","engines":{"vllm":{"min_version":"0.20.2"},"sglang":{"engine":"sglang","model_id":"inclusionAI/Ling-2.6-flash","min_version":"v0.5.10.post1","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":2,"mi300x":2},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","qwen25"]}},"guide":"$45"}},"default_engine":"vllm"},{"meta":{"title":"Ring-1T-FP8","slug":"ring-1t-fp8","provider":"inclusionAI","description":"Ring-1T (BailingMoeV2) FP8 model (~1T total params) for 8xH200 or 8xMI300X deployment","date_updated":"2026-04-17","difficulty":"advanced","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:85:meta:tasks","related_recipes":[],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:85:meta:hardware"},"model":{"model_id":"inclusionAI/Ring-1T-FP8","min_vllm_version":"0.11.0","architecture":"moe","parameter_count":"1T","active_parameters":"50B","context_length":65536,"base_args":["--trust-remote-code","--tensor-parallel-size","8","--max_num_seqs","32","--kv-cache-dtype","fp8","--served-model-name","Ring-1T-FP8"],"base_env":{}},"features":{},"opt_in_features":[],"variants":{"default":{"precision":"fp8","vram_minimum_gb":1200,"description":"FP8 weights on 8x H200 (80 GB) with FP8 KV cache","extra_args":["--gpu-memory-utilization","0.97","--compilation-config","{\"use_inductor\": false}"]}},"compatible_strategies":["single_node_tp","multi_node_tp","multi_node_tp_pp","multi_node_tep","multi_node_dep","pd_cluster"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$46","hf_org":"inclusionAI","hf_repo":"Ring-1T-FP8","hf_id":"inclusionAI/Ring-1T-FP8","hf_released":"2025-10-11T20:02:44.000Z","engines":{"vllm":{"min_version":"0.11.0"},"sglang":{"engine":"sglang","model_id":"inclusionAI/Ring-1T-FP8","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":8,"mi300x":8,"mi325x":8},"variants":{"default":{"precision":"fp8"}},"strategies":{"single_node_tp":{}},"features":{},"guide":"$47"}},"default_engine":"vllm"}]],["internlm",[{"meta":{"title":"Intern-S2-Preview","slug":"intern-s2-preview","provider":"InternLM","description":"Scientific multimodal MoE (36B total / 3B active) continued pre-trained from Qwen3.5 — hybrid linear/full attention, 262K context, MTP-accelerated reasoning. BF16 and FP8 checkpoints.","date_updated":"2026-05-15","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:26:meta:tasks","performance_headline":"35B-A3B scientific multimodal foundation model — single-node BF16 with MTP","related_recipes":["internlm/Intern-S1","Qwen/Qwen3.5-35B-A3B"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:26:meta:hardware"},"model":{"model_id":"internlm/Intern-S2-Preview","min_vllm_version":"nightly","nightly_required":true,"architecture":"moe","parameter_count":"36B","active_parameters":"3B","context_length":262144,"base_args":["--trust-remote-code"],"base_env":{}},"features":{"tool_calling":{"description":"Enable automatic tool choice with the Qwen3 Coder parser (per official deployment guide)","args":["--enable-auto-tool-choice","--tool-call-parser","qwen3_coder"]},"reasoning":{"description":"Extract ... chain-of-thought via the Qwen3 reasoning parser","args":["--reasoning-parser","qwen3"]},"spec_decoding":{"description":"Shared-weight MTP speculative decoding — 4 draft tokens (recommended in the deployment guide)","args":["--speculative-config","{\"method\":\"mtp\",\"num_speculative_tokens\":4}"]},"text_only":{"description":"Skip loading the vision encoder for text-only workloads — frees VRAM for KV cache. Mutually exclusive with encoder_parallel.","args":["--language-model-only"]},"encoder_parallel":{"description":"Run the vision encoder in data-parallel mode — avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.","args":["--mm-encoder-tp-mode","data"]}},"opt_in_features":["spec_decoding","text_only"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":87,"description":"BF16 on 1x H200 or 2x H100/H800 (deployment guide uses TP=2)"},"fp8":{"model_id":"internlm/Intern-S2-Preview-FP8","precision":"fp8","vram_minimum_gb":44,"description":"Official DeepSeek-style block FP8 (128x128, ue8m0 scales) — fits on a single H100/H200"}},"compatible_strategies":["single_node_tp","single_node_tep","single_node_dep","multi_node_tp","multi_node_dep","multi_node_tep"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$48","hf_org":"internlm","hf_repo":"Intern-S2-Preview","hf_id":"internlm/Intern-S2-Preview","hf_released":"2026-05-15T03:49:25.000Z","engines":{"vllm":{"min_version":"nightly"},"sglang":{"engine":"sglang","model_id":"internlm/Intern-S2-Preview","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":1,"gb200":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{},"guide":"$49"}},"default_engine":"vllm"},{"meta":{"title":"Intern-S1","slug":"intern-s1","provider":"InternLM","description":"Intern-S1 vision-language model from Shanghai AI Lab with BF16/FP8 variants and thinking/non-thinking modes","date_updated":"2026-04-17","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:86:meta:tasks","related_recipes":["OpenGVLab/InternVL3_5-8B"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:86:meta:hardware"},"model":{"model_id":"internlm/Intern-S1","min_vllm_version":"0.10.0","architecture":"moe","parameter_count":"241B","active_parameters":"28B","context_length":65536,"base_args":["--trust-remote-code"],"base_env":{}},"features":{"tool_calling":{"description":"InternLM tool-call parser with automatic tool choice","args":["--enable-auto-tool-choice","--tool-call-parser","internlm"]},"reasoning":{"description":"DeepSeek-R1 reasoning parser extracts ...","args":["--reasoning-parser","deepseek_r1"]},"text_only":{"description":"Skip loading the vision encoder for text-only workloads — frees VRAM for KV cache. Mutually exclusive with encoder_parallel.","args":["--language-model-only"]},"encoder_parallel":{"description":"Run the vision encoder in data-parallel mode — avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.","args":["--mm-encoder-tp-mode","data"]}},"opt_in_features":["text_only"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":578,"tp":8,"description":"BF16 on 8x H800 (80GB each)"},"fp8":{"model_id":"internlm/Intern-S1-FP8","precision":"fp8","vram_minimum_gb":289,"tp":4,"description":"FP8 on 4x H800 (80GB each)"}},"compatible_strategies":["single_node_tp","multi_node_tp","multi_node_tp_pp","multi_node_tep","multi_node_dep"],"hardware_overrides":{"amd":{"extra_args":[],"extra_env":{"VLLM_ROCM_USE_AITER":"1","VLLM_ROCM_USE_AITER_MOE":"0"}}},"strategy_overrides":{},"guide":"$4a","hf_org":"internlm","hf_repo":"Intern-S1","hf_id":"internlm/Intern-S1","hf_released":"2025-07-24T06:05:13.000Z","engines":{"vllm":{"min_version":"0.10.0"},"sglang":{"engine":"sglang","model_id":"internlm/Intern-S1","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h100":8,"h200":8,"b200":8},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","interns1"]},"reasoning":{"args":["--reasoning-parser","interns1"]}},"guide":"$4b"}},"default_engine":"vllm"}]],["JetBrains",[{"meta":{"title":"Mellum2-12B-A2.5B-Thinking","slug":"mellum2-12b-a2.5b-thinking","provider":"JetBrains","description":"JetBrains' reasoning-augmented code MoE (12B total / 2.5B active) that emits explicit chains for debugging, planning, and agentic coding","date_updated":"2026-06-02","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:10:meta:tasks","performance_headline":"69.9 LiveCodeBench v6, 58.4 AIME — fits on a single GPU","related_recipes":["JetBrains/Mellum2-12B-A2.5B-Instruct"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:10:meta:hardware"},"model":{"model_id":"JetBrains/Mellum2-12B-A2.5B-Thinking","min_vllm_version":"nightly","nightly_required":true,"architecture":"moe","parameter_count":"12B","active_parameters":"2.5B","context_length":131072,"base_args":["--max-model-len","131072"],"base_env":{}},"features":{"reasoning":{"description":"Parse the model's ... reasoning blocks (Qwen3-style parser)","args":["--reasoning-parser","qwen3"]},"tool_calling":{"description":"Hermes tool-call parser for function calling","args":["--enable-auto-tool-choice","--tool-call-parser","hermes"]}},"opt_in_features":[],"variants":{"default":{"precision":"bf16","vram_minimum_gb":29,"description":"Native bfloat16 weights; fits comfortably on a single H200/H100/A100"}},"compatible_strategies":["single_node_tp","single_node_tep","single_node_dep"],"strategy_overrides":{"single_node_tp":{"tp":1}},"guide":"$4c","hf_org":"JetBrains","hf_repo":"Mellum2-12B-A2.5B-Thinking","hf_id":"JetBrains/Mellum2-12B-A2.5B-Thinking","hf_released":"2026-05-26T09:12:25.000Z","engines":{"vllm":{"min_version":"nightly"},"sglang":{"engine":"sglang","model_id":"JetBrains/Mellum2-12B-A2.5B-Thinking","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{},"guide":"$4d"}},"default_engine":"vllm"},{"meta":{"title":"Mellum2-12B-A2.5B-Instruct","slug":"mellum2-12b-a2.5b-instruct","provider":"JetBrains","description":"JetBrains' instruction-tuned code MoE (12B total / 2.5B active) that answers directly without an externalized chain of thought — low-latency coding and tool use","date_updated":"2026-06-02","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:9:meta:tasks","performance_headline":"78.4 EvalPlus, 67.1 MultiPL-E — direct answers, fits on a single GPU","related_recipes":["JetBrains/Mellum2-12B-A2.5B-Thinking"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:9:meta:hardware"},"model":{"model_id":"JetBrains/Mellum2-12B-A2.5B-Instruct","min_vllm_version":"nightly","nightly_required":true,"architecture":"moe","parameter_count":"12B","active_parameters":"2.5B","context_length":131072,"base_args":["--max-model-len","131072"],"base_env":{}},"features":{"tool_calling":{"description":"Hermes tool-call parser for function calling","args":["--enable-auto-tool-choice","--tool-call-parser","hermes"]}},"opt_in_features":[],"variants":{"default":{"precision":"bf16","vram_minimum_gb":29,"description":"Native bfloat16 weights; fits comfortably on a single H200/H100/A100"}},"compatible_strategies":["single_node_tp","single_node_tep","single_node_dep"],"strategy_overrides":{"single_node_tp":{"tp":1}},"guide":"$4e","hf_org":"JetBrains","hf_repo":"Mellum2-12B-A2.5B-Instruct","hf_id":"JetBrains/Mellum2-12B-A2.5B-Instruct","hf_released":"2026-05-26T09:09:18.000Z","engines":{"vllm":{"min_version":"nightly"},"sglang":{"engine":"sglang","model_id":"JetBrains/Mellum2-12B-A2.5B-Instruct","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{},"guide":"$4f"}},"default_engine":"vllm"}]],["jinaai",[{"meta":{"title":"Jina Embeddings v5 Text Small","slug":"jina-embeddings-v5-text-small","provider":"Jina AI","description":"Jina AI's fifth-gen multilingual text embedding model (677M, Qwen3-0.6B-Base) with task-specific LoRA adapters for retrieval, text-matching, classification, and clustering.","date_updated":"2026-05-09","difficulty":"beginner","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:40:meta:tasks","performance_headline":"71.7 MTEB English v2 / 67.7 MMTEB at <1B params, 119+ languages, 32K context","related_recipes":["jinaai/jina-reranker-m0"]},"model":{"model_id":"jinaai/jina-embeddings-v5-text-small","min_vllm_version":"0.20.0","architecture":"dense","parameter_count":"0.7B","active_parameters":"0.7B","context_length":32768,"base_args":["--trust-remote-code","--runner","pooling"],"base_env":{}},"features":{},"opt_in_features":[],"variants":{"default":{"model_id":"jinaai/jina-embeddings-v5-text-small-retrieval","label":"retrieval","precision":"bf16","vram_minimum_gb":2,"description":"Retrieval task: query/document encoding for RAG and search. Adapter pre-merged into the base weights."},"text_matching":{"model_id":"jinaai/jina-embeddings-v5-text-small-text-matching","label":"text-matching","precision":"bf16","vram_minimum_gb":2,"description":"Text-matching task: semantic similarity, dedup, paraphrase detection. Adapter pre-merged into the base weights."},"classification":{"model_id":"jinaai/jina-embeddings-v5-text-small-classification","label":"classification","precision":"bf16","vram_minimum_gb":2,"description":"Classification task: linear probing, intent detection. Adapter pre-merged into the base weights."},"clustering":{"model_id":"jinaai/jina-embeddings-v5-text-small-clustering","label":"clustering","precision":"bf16","vram_minimum_gb":2,"description":"Clustering task: k-means, topic discovery. Adapter pre-merged into the base weights."}},"compatible_strategies":["single_node_tp"],"hardware_overrides":{},"strategy_overrides":{"single_node_tp":{"tp":1}},"guide":"$50","hf_org":"jinaai","hf_repo":"jina-embeddings-v5-text-small","hf_id":"jinaai/jina-embeddings-v5-text-small","hf_released":"2026-01-22T09:42:41.000Z"},{"meta":{"title":"Jina Reranker m0","slug":"jina-reranker-m0","provider":"Jina AI","description":"Multilingual, multimodal reranker for text and visual documents across 29+ languages via Qwen2-VL backbone","date_updated":"2026-04-17","difficulty":"beginner","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:87:meta:tasks","related_recipes":[]},"model":{"model_id":"jinaai/jina-reranker-m0","min_vllm_version":"0.8.0","architecture":"dense","parameter_count":"2.4B","active_parameters":"2.4B","context_length":32768,"base_args":["--gpu-memory-utilization","0.75","--max-num-seqs","32"],"base_env":{}},"features":{},"opt_in_features":[],"variants":{"default":{"precision":"bf16","vram_minimum_gb":6,"description":"BF16 weights; 2x T4 or 2x L4 GPUs"}},"compatible_strategies":["single_node_tp","multi_node_tp"],"hardware_overrides":{"amd":{"extra_args":[],"extra_env":{"VLLM_ROCM_USE_AITER":"1"}}},"strategy_overrides":{},"guide":"$51","hf_org":"jinaai","hf_repo":"jina-reranker-m0","hf_id":"jinaai/jina-reranker-m0","hf_released":"2025-03-27T11:08:51.000Z"}]],["meituan-longcat",[{"meta":{"title":"LongCat-Image-Edit","slug":"longcat-image-edit","provider":"LongCat (Meituan)","description":"Bilingual (Chinese-English) image editing model from Meituan LongCat, served via vLLM-Omni","date_updated":"2026-04-17","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:88:meta:tasks","related_recipes":[]},"model":{"model_id":"meituan-longcat/LongCat-Image-Edit","min_vllm_version":"0.12.0","architecture":"dense","parameter_count":"6B","active_parameters":"6B","context_length":0,"base_args":[],"base_env":{}},"omni":{"tasks":["i2i"]},"dependencies":[{"note":"vLLM-Omni must be installed from source with vllm==0.12.0 for LongCat-Image-Edit","command":"git clone https://github.com/vllm-project/vllm-omni.git && cd vllm-omni && uv pip install -e . vllm==0.12.0"},{"note":"xformers CUDA 12.8 build required for the diffusion attention kernels","command":"uv pip install -U xformers --index-url https://download.pytorch.org/whl/cu128"},{"note":"diffusers from source (needed by the image-edit pipeline)","command":"git clone https://github.com/huggingface/diffusers.git && cd diffusers && uv pip install -e ."}],"features":{},"opt_in_features":[],"variants":{"default":{"precision":"bf16","vram_minimum_gb":36,"description":"BF16 weights; served via vLLM-Omni"}},"compatible_strategies":[],"hardware_overrides":{},"strategy_overrides":{},"guide":"$52","hf_org":"meituan-longcat","hf_repo":"LongCat-Image-Edit","hf_id":"meituan-longcat/LongCat-Image-Edit","hf_released":"2025-12-05T07:34:53.000Z"}]],["meta-llama",[{"meta":{"title":"Llama-4-Scout","slug":"llama-4-scout","provider":"Meta","description":"Llama 4 Scout 17B-16E MoE model with NVIDIA FP8/FP4 variants, fits on a single GPU with quantization","date_updated":"2026-04-16","difficulty":"beginner","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:114:meta:tasks","performance_headline":"","related_recipes":["meta-llama/Llama-3.3-70B-Instruct"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:114:meta:hardware"},"model":{"model_id":"meta-llama/Llama-4-Scout-17B-16E-Instruct","min_vllm_version":"0.12.0","architecture":"moe","parameter_count":"109B","active_parameters":"17B","context_length":10485760,"base_args":[],"base_env":{}},"features":{},"opt_in_features":[],"variants":{"default":{"precision":"bf16","vram_minimum_gb":262,"description":"Full precision BF16"},"fp8":{"model_id":"nvidia/Llama-4-Scout-17B-16E-Instruct-FP8","precision":"fp8","vram_minimum_gb":131,"description":"NVIDIA FP8 quantization for Hopper and Blackwell, fits on 1x H100","extra_args":["--kv-cache-dtype","fp8"],"extra_env":{}},"nvfp4":{"model_id":"nvidia/Llama-4-Scout-17B-16E-Instruct-NVFP4","precision":"nvfp4","vram_minimum_gb":65,"description":"NVIDIA NVFP4 quantized weights for Blackwell GPUs","extra_args":["--kv-cache-dtype","fp8"],"extra_env":{"VLLM_USE_FLASHINFER_MOE_FP4":"1"}}},"compatible_strategies":["single_node_tp","single_node_tep","single_node_dep","multi_node_tp","multi_node_tp_pp","multi_node_tep","multi_node_dep"],"hardware_overrides":{"hopper":{"extra_args":["--async-scheduling","--no-enable-prefix-caching","--max-num-batched-tokens","8192"],"extra_env":{}},"blackwell":{"extra_args":["--async-scheduling","--no-enable-prefix-caching","--max-num-batched-tokens","8192","--compilation-config","{\"pass_config\":{\"fuse_allreduce_rms\":true,\"eliminate_noops\":true}}"],"extra_env":{"VLLM_USE_FLASHINFER_MOE_FP8":"1"}},"amd":{"extra_args":["--no-enable-prefix-caching","--max-num-batched-tokens","16384","--max-num-seqs","64","--max-model-len","32000"],"extra_env":{"VLLM_ROCM_USE_AITER":"1"}}},"strategy_overrides":{},"guide":"$53","hf_org":"meta-llama","hf_repo":"Llama-4-Scout-17B-16E-Instruct","hf_id":"meta-llama/Llama-4-Scout-17B-16E-Instruct","hf_released":"2025-04-02T13:34:17.000Z","engines":{"vllm":{"min_version":"0.12.0"},"sglang":{"engine":"sglang","model_id":"meta-llama/Llama-4-Scout-17B-16E-Instruct","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h100":8,"h200":8,"b200":8},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","pythonic"]}},"guide":"$54"}},"default_engine":"vllm"},{"meta":{"title":"Llama-3.3-70B","slug":"llama3.3-70b","provider":"Meta","description":"Llama 3.3 70B dense model with NVIDIA FP8/FP4 quantized variants for Hopper and Blackwell GPUs","date_updated":"2026-04-16","difficulty":"beginner","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:113:meta:tasks","performance_headline":"","related_recipes":["meta-llama/Llama-4-Scout-17B-16E-Instruct"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:113:meta:hardware"},"model":{"model_id":"meta-llama/Llama-3.3-70B-Instruct","min_vllm_version":"0.12.0","architecture":"dense","parameter_count":"70B","active_parameters":"70B","context_length":131072,"base_args":[],"base_env":{}},"dependencies":[{"command":"pip install vllm-ascend","brand":"Huawei","note":"Huawei Ascend NPU support is provided by the vllm-ascend plugin (requires CANN 9.0 + torch-npu; use the matching quay.io/ascend/vllm-ascend image). FP8 weights are served via W8A8 on Ascend. See https://docs.vllm.ai/projects/ascend"},{"command":"docker pull image.sourcefind.cn:5000/dcu/admin/base/custom:vllm0.6.2-ubuntu22.04-dtk25.04-rc7-das1.5-py3.10","brand":"Hygon","note":"Hygon DCU K100-AI runs vLLM via the Sourcefind DTK image (ROCm/HIP). Set HSA_OVERRIDE_GFX_VERSION=9.2.8; device flags --device=/dev/kfd --device=/dev/dri -v /opt/hyhal:/opt/hyhal:ro; serve with --enforce-eager --dtype float16 (CUDA-graph capture unreliable; fp16/bf16 only). See https://docs.gpustack.ai/0.5/tutorials/running-inference-with-hygon-dcus/"},{"command":"docker run --runtime iluvatar -e IX_VISIBLE_DEVICES=all ","brand":"Iluvatar","note":"Iluvatar CoreX BI-V150 runs a vendor-forked vLLM. Get the CoreX Docker installer (corex-docker-installer-4.3.0+-llm-py3.10-x86_64.run) from the Iluvatar Resource Center (login required), install the iluvatar container runtime, then vllm serve as usual. bf16/fp16/int8. See https://github.com/Deep-Spark/DeepSparkInference"},{"command":"docker pull wjie520/vllm_kunlun:uv_base","brand":"Kunlunxin","note":"Kunlunxin (Baidu) P800 runs vLLM via the out-of-tree vLLM-Kunlun plugin (P800 only). In the image: pip install vllm==0.15.1 --no-deps, build the plugin (v0.15.1-dev) + kunlun op wheels (kunlun_ops/xspeedgate_ops). TP across 8×P800; int8/awq/gptq/fp8. See https://github.com/baidu/vLLM-Kunlun"}],"features":{},"opt_in_features":[],"variants":{"default":{"precision":"bf16","vram_minimum_gb":170,"description":"Full precision BF16"},"fp8":{"model_id":"nvidia/Llama-3.3-70B-Instruct-FP8","precision":"fp8","vram_minimum_gb":84,"description":"NVIDIA FP8 quantization for Hopper and Blackwell","extra_args":["--kv-cache-dtype","fp8"],"extra_env":{}},"nvfp4":{"model_id":"nvidia/Llama-3.3-70B-Instruct-NVFP4","precision":"nvfp4","vram_minimum_gb":42,"description":"NVIDIA NVFP4 quantized weights for Blackwell GPUs","extra_args":["--kv-cache-dtype","fp8"]}},"compatible_strategies":["single_node_tp","multi_node_tp","multi_node_tp_pp"],"hardware_overrides":{"hopper":{"extra_args":["--async-scheduling","--no-enable-prefix-caching","--max-num-batched-tokens","8192"],"extra_env":{}},"blackwell":{"extra_args":["--async-scheduling","--no-enable-prefix-caching","--max-num-batched-tokens","8192","--compilation-config","{\"pass_config\":{\"fuse_allreduce_rms\":true,\"fuse_attn_quant\":true,\"eliminate_noops\":true}}"],"extra_env":{}},"amd":{"extra_args":[],"extra_env":{"VLLM_ROCM_USE_AITER":"1"}}},"strategy_overrides":{},"guide":"$55","hf_org":"meta-llama","hf_repo":"Llama-3.3-70B-Instruct","hf_id":"meta-llama/Llama-3.3-70B-Instruct","hf_released":"2024-11-26T16:08:47.000Z","engines":{"vllm":{"min_version":"0.12.0"},"sglang":{"engine":"sglang","model_id":"meta-llama/Llama-3.3-70B-Instruct","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":[],"tp_by_hardware":{"h100":2,"h200":2,"b200":1,"gb200":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","llama3_json"]}},"guide":"$56"}},"default_engine":"vllm"},{"meta":{"title":"Llama-3.1-8B-Instruct","slug":"llama-3.1-8b-instruct","provider":"Meta","description":"Meta's Llama 3.1 8B dense instruction-tuned language model with 128K context","date_updated":"2026-04-17","difficulty":"beginner","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:89:meta:tasks","related_recipes":["meta-llama/Llama-3.3-70B-Instruct"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:89:meta:hardware","engine_hardware":{"sglang":{"ascend_910b":"verified","ascend_910b4":"verified","ascend_910c":"verified"}}},"model":{"model_id":"meta-llama/Llama-3.1-8B-Instruct","min_vllm_version":"0.6.0","architecture":"dense","parameter_count":"8B","active_parameters":"8B","context_length":131072,"base_args":[],"base_env":{}},"dependencies":[{"command":"pip install vllm-ascend","brand":"Huawei","note":"Huawei Ascend NPU support is provided by the vllm-ascend plugin (requires CANN 9.0 + torch-npu; use the matching quay.io/ascend/vllm-ascend image). FP8 weights are served via W8A8 on Ascend. See https://docs.vllm.ai/projects/ascend"},{"command":"docker pull image.sourcefind.cn:5000/dcu/admin/base/custom:vllm0.6.2-ubuntu22.04-dtk25.04-rc7-das1.5-py3.10","brand":"Hygon","note":"Hygon DCU K100-AI runs vLLM via the Sourcefind DTK image (ROCm/HIP). Set HSA_OVERRIDE_GFX_VERSION=9.2.8; device flags --device=/dev/kfd --device=/dev/dri -v /opt/hyhal:/opt/hyhal:ro; serve with --enforce-eager --dtype float16 (CUDA-graph capture unreliable; fp16/bf16 only). See https://docs.gpustack.ai/0.5/tutorials/running-inference-with-hygon-dcus/"},{"command":"docker run --runtime iluvatar -e IX_VISIBLE_DEVICES=all ","brand":"Iluvatar","note":"Iluvatar CoreX BI-V150 runs a vendor-forked vLLM. Get the CoreX Docker installer (corex-docker-installer-4.3.0+-llm-py3.10-x86_64.run) from the Iluvatar Resource Center (login required), install the iluvatar container runtime, then vllm serve as usual. bf16/fp16/int8. See https://github.com/Deep-Spark/DeepSparkInference"},{"command":"docker pull wjie520/vllm_kunlun:uv_base","brand":"Kunlunxin","note":"Kunlunxin (Baidu) P800 runs vLLM via the out-of-tree vLLM-Kunlun plugin (P800 only). In the image: pip install vllm==0.15.1 --no-deps, build the plugin (v0.15.1-dev) + kunlun op wheels (kunlun_ops/xspeedgate_ops). TP across 8×P800; int8/awq/gptq/fp8. See https://github.com/baidu/vLLM-Kunlun"}],"features":{"spec_decoding":{"description":"EAGLE3 speculative decoding (requires vLLM >= 0.9.0)","args":["--speculative-config","{\"model\":\"RedHatAI/Llama-3.1-8B-Instruct-speculator.eagle3\",\"method\":\"eagle3\",\"num_speculative_tokens\":3}"]}},"opt_in_features":["spec_decoding"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":20,"description":"Full precision BF16"},"nvfp4":{"model_id":"nvidia/Llama-3.1-8B-Instruct-NVFP4","precision":"nvfp4","vram_minimum_gb":5,"description":"NVIDIA NVFP4 quantized weights for Blackwell GPUs","extra_args":["--kv-cache-dtype","fp8"]},"nvidia_fp8":{"model_id":"nvidia/Llama-3.1-8B-Instruct-FP8","precision":"fp8","vram_minimum_gb":10,"description":"FP8 quantized weights for Hopper/Blackwell GPUs","extra_args":["--kv-cache-dtype","fp8"]}},"compatible_strategies":["single_node_tp","multi_node_tp"],"hardware_overrides":{"amd":{"extra_args":[],"extra_env":{"VLLM_ROCM_USE_AITER":"1"}}},"strategy_overrides":{},"guide":"$57","hf_org":"meta-llama","hf_repo":"Llama-3.1-8B-Instruct","hf_id":"meta-llama/Llama-3.1-8B-Instruct","hf_released":"2024-07-18T08:56:00.000Z","engines":{"vllm":{"min_version":"0.6.0"},"sglang":{"engine":"sglang","model_id":"meta-llama/Llama-3.1-8B-Instruct","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":[],"tp_by_hardware":{"h100":1,"h200":1,"ascend_910b":1,"ascend_910b4":1,"ascend_910c":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","llama3_json"]}},"guide":"$58"}},"default_engine":"vllm"}]],["microsoft",[{"meta":{"title":"Phi-4","slug":"phi-4","provider":"Microsoft","description":"Microsoft's Phi-4 family of lightweight dense models (mini-instruct, reasoning, multimodal) with 128K context","date_updated":"2026-04-17","difficulty":"beginner","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:90:meta:tasks","related_recipes":[],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:90:meta:hardware"},"model":{"model_id":"microsoft/Phi-4-mini-instruct","min_vllm_version":"0.7.0","architecture":"dense","parameter_count":"4B","active_parameters":"4B","context_length":131072,"base_args":[],"base_env":{}},"features":{"text_only":{"description":"Skip loading the vision encoder for text-only workloads — frees VRAM for KV cache. Mutually exclusive with encoder_parallel.","args":["--language-model-only"]},"encoder_parallel":{"description":"Run the vision encoder in data-parallel mode — avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.","args":["--mm-encoder-tp-mode","data"]}},"opt_in_features":["text_only","encoder_parallel"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":10,"description":"Phi-4-mini-instruct, conversational instruction-tuned"},"mini_reasoning":{"model_id":"microsoft/Phi-4-mini-reasoning","precision":"bf16","vram_minimum_gb":10,"description":"Optimized for reasoning tasks"},"reasoning":{"model_id":"microsoft/Phi-4-reasoning","precision":"bf16","vram_minimum_gb":30,"description":"Advanced reasoning capabilities (14B)"},"multimodal":{"model_id":"microsoft/Phi-4-multimodal-instruct","precision":"bf16","vram_minimum_gb":16,"description":"Multimodal instruction-following (text + image)","extra_args":["--trust-remote-code"]}},"compatible_strategies":["single_node_tp","multi_node_tp"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$59","hf_org":"microsoft","hf_repo":"Phi-4-mini-instruct","hf_id":"microsoft/Phi-4-mini-instruct","hf_released":"2025-02-19T01:00:58.000Z","engines":{"vllm":{"min_version":"0.7.0"},"sglang":{"engine":"sglang","model_id":"microsoft/Phi-4-mini-instruct","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h100":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{},"guide":"$5a"}},"default_engine":"vllm"}]],["MiniMaxAI",[{"meta":{"title":"MiniMax-M2.7","slug":"minimax-m2.7","provider":"MiniMax","description":"MiniMax M2.7 MoE language model (230B total / 10B active) — latest M2 release for coding, agent toolchains, and long-context reasoning with native FP8","date_updated":"2026-04-17","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:64:meta:tasks","performance_headline":"Latest M2 series release; verified accuracy on AIME25, GPQA-D, GSM8K; 196K context","related_recipes":[],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:64:meta:hardware"},"model":{"model_id":"MiniMaxAI/MiniMax-M2.7","min_vllm_version":"0.20.0","architecture":"moe","parameter_count":"230B","active_parameters":"10B","context_length":196608,"base_args":["--trust-remote-code"],"base_env":{}},"features":{"tool_calling":{"description":"MiniMax M2 tool call parser with automatic tool choice","args":["--tool-call-parser","minimax_m2","--enable-auto-tool-choice"]},"reasoning":{"description":"MiniMax M2 reasoning parser for chain-of-thought extraction","args":["--reasoning-parser","minimax_m2"]}},"opt_in_features":[],"variants":{"default":{"precision":"fp8","vram_minimum_gb":276,"description":"Native FP8 checkpoint — 4x H200/H20/H100 or 4x A100/A800 for weights, plus KV cache headroom"},"nvfp4":{"model_id":"nvidia/MiniMax-M2.7-NVFP4","precision":"nvfp4","vram_minimum_gb":138,"description":"NVIDIA NVFP4 quantized weights for Blackwell GPUs","extra_args":["--kv-cache-dtype","fp8"],"extra_env":{"VLLM_USE_FLASHINFER_MOE_FP4":"1"}}},"compatible_strategies":["single_node_tp","single_node_tep","single_node_dep","multi_node_tp","multi_node_tp_pp","multi_node_tep","multi_node_dep","pd_cluster"],"hardware_overrides":{"amd":{"extra_args":["--tensor-parallel-size","2","--attention-backend","ROCM_AITER_FA"],"extra_env":{"VLLM_ROCM_USE_AITER":"1","VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT":"1"}}},"strategy_overrides":{"single_node_tp":{"tp":4}},"guide":"$5b","hf_org":"MiniMaxAI","hf_repo":"MiniMax-M2.7","hf_id":"MiniMaxAI/MiniMax-M2.7","hf_released":"2026-04-09T03:37:12.000Z","engines":{"vllm":{"min_version":"0.20.0"},"sglang":{"engine":"sglang","model_id":"MiniMaxAI/MiniMax-M2.7","min_version":"v0.5.10","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h100":4,"h200":8,"b200":8,"b300":8,"gb200":4,"gb300":4},"variants":{"default":{"precision":"fp8"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","minimax-m2"]},"reasoning":{"args":["--reasoning-parser","minimax-append-think"]}},"guide":"$5c"}},"default_engine":"vllm"},{"meta":{"title":"MiniMax-M2.5","slug":"minimax-m2.5","provider":"MiniMax","description":"MiniMax M2.5 MoE language model (230B total / 10B active) for coding, agent toolchains, and long-context reasoning — native FP8 checkpoint","date_updated":"2026-05-18","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:24:meta:tasks","performance_headline":"Refreshed M2 series MoE with strong SWE-Bench and Terminal-Bench performance, 196K context","related_recipes":[],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:24:meta:hardware"},"model":{"model_id":"MiniMaxAI/MiniMax-M2.5","min_vllm_version":"0.20.2","docker_image":"vllm/vllm-openai:v0.20.2","architecture":"moe","parameter_count":"230B","active_parameters":"10B","context_length":196608,"base_args":["--trust-remote-code"],"base_env":{}},"dependencies":[{"command":"pip install vllm-ascend","brand":"Huawei","note":"Huawei Ascend NPU support is provided by the vllm-ascend plugin (requires CANN 9.0 + torch-npu; use the matching quay.io/ascend/vllm-ascend image). FP8 weights are served via W8A8 on Ascend. See https://docs.vllm.ai/projects/ascend"}],"features":{"tool_calling":{"description":"MiniMax M2 tool call parser with automatic tool choice","args":["--tool-call-parser","minimax_m2","--enable-auto-tool-choice"]},"reasoning":{"description":"MiniMax M2 reasoning parser for chain-of-thought extraction","args":["--reasoning-parser","minimax_m2"]}},"opt_in_features":[],"variants":{"default":{"precision":"fp8","vram_minimum_gb":276,"description":"Native FP8 checkpoint — 4x H200/H20/H100 or 4x A100/A800 for weights, plus KV cache headroom"},"nvfp4":{"model_id":"nvidia/MiniMax-M2.5-NVFP4","precision":"nvfp4","vram_minimum_gb":138,"description":"NVIDIA NVFP4 quantized weights for Blackwell GPUs","extra_args":["--kv-cache-dtype","fp8"],"extra_env":{"VLLM_USE_FLASHINFER_MOE_FP4":"1"}}},"compatible_strategies":["single_node_tp","single_node_tep","single_node_dep","multi_node_tp","multi_node_tp_pp","multi_node_tep","multi_node_dep","pd_cluster"],"hardware_overrides":{"nvidia":{"extra_args":["--compilation-config","{\"cudagraph_mode\":\"PIECEWISE\"}"]},"hopper":{"extra_args":["--compilation-config","{\"cudagraph_mode\":\"PIECEWISE\"}","--max-num-seqs","512","--max-num-batched-tokens","32768","--kv-cache-dtype","fp8","--moe-backend","triton","--attention-backend","FLASHINFER","--enable-flashinfer-autotune"],"extra_env":{"PYTHONNOUSERSITE":"1","VLLM_USE_DEEP_GEMM":"0","VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER":"0","VLLM_FLOAT32_MATMUL_PRECISION":"high"}},"amd":{"extra_args":["--tensor-parallel-size","2","--attention-backend","ROCM_AITER_FA"],"extra_env":{"VLLM_ROCM_USE_AITER":"1","VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT":"1"}}},"strategy_overrides":{"single_node_tp":{"tp":4}},"guide":"$5d","hf_org":"MiniMaxAI","hf_repo":"MiniMax-M2.5","hf_id":"MiniMaxAI/MiniMax-M2.5","hf_released":"2026-02-12T06:05:24.000Z","engines":{"vllm":{"min_version":"0.20.2"},"sglang":{"engine":"sglang","model_id":"MiniMaxAI/MiniMax-M2.5","min_version":"v0.5.4.post1","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h100":4,"h200":2,"mi300x":2,"mi325x":2},"variants":{"default":{"precision":"fp8"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","minimax-m2"]},"reasoning":{"args":["--reasoning-parser","minimax-append-think"]}},"guide":"$5e"}},"default_engine":"vllm"},{"meta":{"title":"MiniMax-M2.1","slug":"minimax-m2.1","provider":"MiniMax","description":"MiniMax M2.1 MoE language model (230B total / 10B active) for coding, agent toolchains, and long-context reasoning — native FP8 checkpoint","date_updated":"2026-04-17","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:63:meta:tasks","performance_headline":"Updated M2 series MoE with strong SWE-Bench and Terminal-Bench performance, 196K context","related_recipes":[],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:63:meta:hardware"},"model":{"model_id":"MiniMaxAI/MiniMax-M2.1","min_vllm_version":"0.11.0","architecture":"moe","parameter_count":"230B","active_parameters":"10B","context_length":196608,"base_args":["--trust-remote-code"],"base_env":{}},"dependencies":[{"command":"pip install vllm-ascend","brand":"Huawei","note":"Huawei Ascend NPU support is provided by the vllm-ascend plugin (requires CANN 9.0 + torch-npu; use the matching quay.io/ascend/vllm-ascend image). FP8 weights are served via W8A8 on Ascend. See https://docs.vllm.ai/projects/ascend"}],"features":{"tool_calling":{"description":"MiniMax M2 tool call parser with automatic tool choice","args":["--tool-call-parser","minimax_m2","--enable-auto-tool-choice"]},"reasoning":{"description":"MiniMax M2 reasoning parser for chain-of-thought extraction","args":["--reasoning-parser","minimax_m2"]}},"opt_in_features":[],"variants":{"default":{"precision":"fp8","vram_minimum_gb":276,"description":"Native FP8 checkpoint — 4x H200/H20/H100 or 4x A100/A800 for weights, plus KV cache headroom"}},"compatible_strategies":["single_node_tp","single_node_tep","single_node_dep","multi_node_tp","multi_node_tp_pp","multi_node_tep","multi_node_dep","pd_cluster"],"hardware_overrides":{"amd":{"extra_args":["--tensor-parallel-size","2","--attention-backend","ROCM_AITER_FA"],"extra_env":{"VLLM_ROCM_USE_AITER":"1","VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT":"1"}}},"strategy_overrides":{"single_node_tp":{"tp":4}},"guide":"$5f","hf_org":"MiniMaxAI","hf_repo":"MiniMax-M2.1","hf_id":"MiniMaxAI/MiniMax-M2.1","hf_released":"2025-12-20T05:45:05.000Z","engines":{"vllm":{"min_version":"0.11.0"},"sglang":{"engine":"sglang","model_id":"MiniMaxAI/MiniMax-M2.1","min_version":"v0.5.4.post1","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h100":4,"mi300x":2,"mi325x":2},"variants":{"default":{"precision":"fp8"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","minimax-m2"]},"reasoning":{"args":["--reasoning-parser","minimax-append-think"]}},"guide":"$60"}},"default_engine":"vllm"},{"meta":{"title":"MiniMax-M2","slug":"minimax-m2","provider":"MiniMax","description":"MiniMax M2 MoE language model (230B total / 10B active) for coding, agent toolchains, and long-context reasoning — native FP8 checkpoint, with an NVFP4 variant for Blackwell","date_updated":"2026-05-11","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:33:meta:tasks","performance_headline":"Open-source MoE with strong SWE-Bench and Terminal-Bench performance, 196K context","related_recipes":[],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:33:meta:hardware"},"model":{"model_id":"MiniMaxAI/MiniMax-M2","min_vllm_version":"0.11.0","architecture":"moe","parameter_count":"230B","active_parameters":"10B","context_length":196608,"base_args":["--trust-remote-code"],"base_env":{}},"dependencies":[{"note":"Optional: DeepGEMM FP8 MoE kernels for throughput (skip on B200 — known FlashInfer FP8 MoE error)","command":"export VLLM_USE_DEEP_GEMM=1","optional":true},{"command":"pip install vllm-ascend","brand":"Huawei","note":"Huawei Ascend NPU support is provided by the vllm-ascend plugin (requires CANN 9.0 + torch-npu; use the matching quay.io/ascend/vllm-ascend image). FP8 weights are served via W8A8 on Ascend. See https://docs.vllm.ai/projects/ascend"}],"features":{"tool_calling":{"description":"MiniMax M2 tool call parser with automatic tool choice","args":["--tool-call-parser","minimax_m2","--enable-auto-tool-choice"]},"reasoning":{"description":"MiniMax M2 reasoning parser for chain-of-thought extraction","args":["--reasoning-parser","minimax_m2"]}},"opt_in_features":[],"variants":{"default":{"precision":"fp8","vram_minimum_gb":276,"description":"Native FP8 checkpoint — 4x H200/H20/H100 or 4x A100/A800 for weights, plus KV cache headroom"},"nvfp4":{"model_id":"RedHatAI/MiniMax-M2-NVFP4","precision":"nvfp4","vram_minimum_gb":138,"description":"NVFP4 (4-bit) quantized weights — requires Blackwell (B200/B300); roughly half the VRAM of the FP8 checkpoint","extra_args":["--kv-cache-dtype","fp8"],"extra_env":{"VLLM_USE_FLASHINFER_MOE_FP4":"1"}}},"compatible_strategies":["single_node_tp","single_node_tep","single_node_dep","multi_node_tp","multi_node_tp_pp","multi_node_tep","multi_node_dep","pd_cluster"],"hardware_overrides":{"amd":{"extra_args":["--tensor-parallel-size","2","--attention-backend","ROCM_AITER_FA"],"extra_env":{"VLLM_ROCM_USE_AITER":"1","VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT":"1"}}},"strategy_overrides":{"single_node_tp":{"tp":4}},"guide":"$61","hf_org":"MiniMaxAI","hf_repo":"MiniMax-M2","hf_id":"MiniMaxAI/MiniMax-M2","hf_released":"2025-10-22T13:45:10.000Z","engines":{"vllm":{"min_version":"0.11.0"},"sglang":{"engine":"sglang","model_id":"MiniMaxAI/MiniMax-M2","min_version":"v0.5.4.post3","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h100":4,"h200":2,"mi300x":2,"mi325x":2},"variants":{"default":{"precision":"fp8"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","minimax-m2"]},"reasoning":{"args":["--reasoning-parser","minimax-append-think"]}},"guide":"$62"}},"default_engine":"vllm"}]],["mistralai",[{"meta":{"title":"Mistral-Medium-3.5","slug":"mistral-medium-3.5","provider":"Mistral AI","description":"Mistral Medium 3.5 (128B) dense vision-language model with native FP8 weights and 256K context","date_updated":"2026-05-26","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:21:meta:tasks","related_recipes":["mistralai/Mistral-Large-3-675B-Instruct-2512","mistralai/Ministral-3-8B-Reasoning-2512"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:21:meta:hardware"},"model":{"model_id":"mistralai/Mistral-Medium-3.5-128B","min_vllm_version":"nightly","nightly_required":true,"architecture":"dense","parameter_count":"128B","active_parameters":"128B","context_length":262144,"base_args":["--tokenizer_mode","mistral","--config_format","mistral","--load_format","mistral"],"base_env":{}},"dependencies":[{"note":"Mistral tokenizer / chat-template runtime — Mistral 3.5 needs >= 1.11.1 (auto-installed by vLLM nightly, pin if you hit an older cached wheel)","command":"uv pip install -U \"mistral_common>=1.11.1\""}],"features":{"tool_calling":{"description":"Mistral tool-call parser with automatic tool choice","args":["--enable-auto-tool-choice","--tool-call-parser","mistral"]},"reasoning":{"description":"Mistral reasoning parser extracts [THINK]...[/THINK] into message.reasoning (emitted when reasoning_effort='high')","args":["--reasoning-parser","mistral"]},"text_only":{"description":"Skip loading the vision encoder for text-only workloads — frees VRAM for KV cache. Mutually exclusive with encoder_parallel.","args":["--language-model-only"]},"encoder_parallel":{"description":"Run the vision encoder in data-parallel mode — avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.","args":["--mm-encoder-tp-mode","data"]},"spec_decoding":{"description":"EAGLE speculative decoding via mistralai/Mistral-Medium-3.5-128B-EAGLE draft head","args":["--speculative-config","{\"model\":\"mistralai/Mistral-Medium-3.5-128B-EAGLE\",\"num_speculative_tokens\":3,\"method\":\"eagle\",\"max_model_len\":\"65536\"}"]}},"opt_in_features":["text_only","encoder_parallel","spec_decoding"],"variants":{"default":{"precision":"fp8","vram_minimum_gb":154,"description":"Native FP8 weights (vision tower / projector / lm_head kept in BF16); recommended on 8xH200, 8xMI300X, or 4xB200"}},"compatible_strategies":["single_node_tp","multi_node_tp"],"hardware_overrides":{"amd":{"extra_args":["--no-enable-prefix-caching"],"extra_env":{"VLLM_ROCM_USE_AITER":"1","SAFETENSORS_FAST_GPU":"1"}}},"strategy_overrides":{},"guide":"$63","hf_org":"mistralai","hf_repo":"Mistral-Medium-3.5-128B","hf_id":"mistralai/Mistral-Medium-3.5-128B","hf_released":"2026-03-31T09:50:20.000Z","engines":{"vllm":{"min_version":"nightly"},"sglang":{"engine":"sglang","model_id":"mistralai/Mistral-Medium-3.5-128B","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":2},"variants":{"default":{"precision":"fp8"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","mistral"]},"reasoning":{"args":["--reasoning-parser","mistral"]}},"guide":"$64"}},"default_engine":"vllm"},{"meta":{"title":"Mistral-Small-4-119B","slug":"mistral-small-4-119b","provider":"Mistral AI","description":"Mistral Small 4 (119B MoE, 6.5B active) — multimodal hybrid instruct + reasoning model with native FP8 weights and 256K context","date_updated":"2026-05-13","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:31:meta:tasks","related_recipes":["mistralai/Mistral-Medium-3.5-128B","mistralai/Mistral-Large-3-675B-Instruct-2512"]},"model":{"model_id":"mistralai/Mistral-Small-4-119B-2603","min_vllm_version":"0.20.0","architecture":"moe","parameter_count":"119B","active_parameters":"6.5B","context_length":262144,"base_args":["--max-model-len","262144","--attention-backend","FLASH_ATTN_MLA"],"base_env":{}},"dependencies":[{"note":"Mistral tokenizer / chat-template runtime — Mistral Small 4 needs >= 1.11.0 (vLLM 0.20.1+ bundles it, pin explicitly if you hit an older cached wheel)","command":"uv pip install -U \"mistral_common>=1.11.0\""},{"note":"Transformers v5 silences YaRN warnings and is required for the latest Mistral 4 chat template","command":"uv pip install -U transformers"}],"features":{"tool_calling":{"description":"Mistral tool-call parser with automatic tool choice — emits [TOOL_CALLS] / [ARGS] from the chat template","args":["--enable-auto-tool-choice","--tool-call-parser","mistral"]},"reasoning":{"description":"Mistral reasoning parser extracts [THINK]...[/THINK] into message.reasoning_content (emitted when reasoning_effort='high')","args":["--reasoning-parser","mistral"]},"spec_decoding":{"description":"EAGLE speculative decoding via the mistralai/Mistral-Small-4-119B-2603-eagle 2-layer draft head","args":["--speculative-config","{\"model\":\"mistralai/Mistral-Small-4-119B-2603-eagle\",\"num_speculative_tokens\":3,\"method\":\"eagle\",\"max_model_len\":\"65536\"}"]}},"opt_in_features":["spec_decoding"],"variants":{"default":{"precision":"fp8","vram_minimum_gb":143,"description":"Native FP8 E4M3 weights (vision tower / projector / lm_head kept in BF16); recommended on 2xB200/H200 or MI300X with FLASH_ATTN_MLA"},"nvfp4":{"model_id":"mistralai/Mistral-Small-4-119B-2603-NVFP4","precision":"nvfp4","vram_minimum_gb":72,"description":"NVFP4 4-bit weights for B200-class GPUs (Marlin fallback on Hopper); overrides the attention backend to TRITON_MLA","extra_args":["--attention-backend","TRITON_MLA"]}},"compatible_strategies":["single_node_tp","multi_node_tp","pd_cluster"],"hardware_overrides":{"amd":{"extra_args":["--no-enable-prefix-caching"],"extra_env":{"VLLM_ROCM_USE_AITER":"1","SAFETENSORS_FAST_GPU":"1"}}},"strategy_overrides":{"single_node_tp":{"tp":2}},"guide":"$65","hf_org":"mistralai","hf_repo":"Mistral-Small-4-119B-2603","hf_id":"mistralai/Mistral-Small-4-119B-2603","hf_released":"2026-01-23T13:14:14.000Z","engines":{"vllm":{"min_version":"0.20.0"},"sglang":{"engine":"sglang","model_id":"mistralai/Mistral-Small-4-119B-2603","min_version":"v0.5.8","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h100":2,"h200":2,"b200":1,"b300":1},"variants":{"default":{"precision":"fp8"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","mistral"]},"reasoning":{"args":["--reasoning-parser","mistral"]}},"guide":"$66"}},"default_engine":"vllm"},{"meta":{"title":"Voxtral-Mini-4B-Realtime-2602","slug":"voxtral-mini-4b-realtime-2602","provider":"Mistral AI","description":"Multilingual realtime speech transcription (13 languages) with a natively streaming causal audio encoder; configurable 80ms–2.4s transcription delay served via vLLM's Realtime API","date_updated":"2026-05-13","difficulty":"beginner","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:32:meta:tasks","performance_headline":"Matches offline open-source ASR accuracy at 480ms delay; >12.5 tok/s on a single 16GB GPU","related_recipes":[]},"model":{"model_id":"mistralai/Voxtral-Mini-4B-Realtime-2602","min_vllm_version":"0.20.0","architecture":"dense","parameter_count":"4.4B","active_parameters":"4.4B","context_length":131072,"base_args":["--tokenizer-mode","mistral","--compilation_config","{\"cudagraph_mode\": \"PIECEWISE\"}"],"base_env":{"VLLM_DISABLE_COMPILE_CACHE":"1"}},"dependencies":[{"note":"mistral-common's audio extras bundle soxr/librosa/soundfile plus the Voxtral Realtime tokenizer (>= 1.9.0)","command":"uv pip install -U \"mistral-common[audio]>=1.9.0\""},{"note":"Transformers v5 silences a barrage of warnings emitted when serving Voxtral on v4 (see vllm-project/vllm#34642)","command":"uv pip install -U transformers"}],"features":{},"opt_in_features":[],"variants":{"default":{"precision":"bf16","vram_minimum_gb":16,"description":"BF16 weights — single 16GB+ GPU, full 131072 (~3h audio) context"}},"compatible_strategies":["single_node_tp","multi_node_tp"],"hardware_overrides":{},"strategy_overrides":{"single_node_tp":{"tp":1}},"guide":"$67","hf_org":"mistralai","hf_repo":"Voxtral-Mini-4B-Realtime-2602","hf_id":"mistralai/Voxtral-Mini-4B-Realtime-2602","hf_released":"2026-01-21T17:22:02.000Z"},{"meta":{"title":"Mistral-Large-3-675B-Instruct","slug":"mistral-large-3-675b-instruct","provider":"Mistral AI","description":"Mistral Large 3 (675B) with FP8 and NVFP4 weights for 8xH200 / 4xB200 deployments","date_updated":"2026-04-17","difficulty":"advanced","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:91:meta:tasks","related_recipes":["mistralai/Ministral-3-14B-Instruct-2512"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:91:meta:hardware"},"model":{"model_id":"mistralai/Mistral-Large-3-675B-Instruct-2512","min_vllm_version":"0.11.0","architecture":"moe","parameter_count":"675B","active_parameters":"22B","context_length":294912,"base_args":["--tokenizer_mode","mistral","--config_format","mistral","--load_format","mistral"],"base_env":{}},"features":{"tool_calling":{"description":"Mistral tool-call parser with automatic tool choice","args":["--enable-auto-tool-choice","--tool-call-parser","mistral"]},"text_only":{"description":"Skip loading the vision encoder for text-only workloads — frees VRAM for KV cache. Mutually exclusive with encoder_parallel.","args":["--language-model-only"]},"encoder_parallel":{"description":"Run the vision encoder in data-parallel mode — avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.","args":["--mm-encoder-tp-mode","data"]}},"opt_in_features":["text_only"],"variants":{"default":{"precision":"fp8","vram_minimum_gb":810,"description":"FP8 weights on 8xH200 (recommended for fine-tuning; up to 256K context)","extra_args":["--tensor-parallel-size","8"]},"nvfp4":{"model_id":"mistralai/Mistral-Large-3-675B-Instruct-2512-NVFP4","precision":"nvfp4","vram_minimum_gb":405,"description":"NVFP4 weights on 4xB200 (use for <64K context; B200-native, Marlin fallback on A100/H100)","extra_args":["--tensor-parallel-size","4"]},"fp8":{"model_id":"mistralai/Mistral-Large-3-675B-Instruct-2512-FP8","precision":"fp8","vram_minimum_gb":810,"description":"FP8 quantized weights for Hopper/Blackwell GPUs","extra_args":["--kv-cache-dtype","fp8"]}},"compatible_strategies":["single_node_tp","multi_node_tp","multi_node_tp_pp","multi_node_tep","multi_node_dep","pd_cluster"],"hardware_overrides":{"amd":{"extra_args":["--no-enable-prefix-caching"],"extra_env":{"VLLM_ROCM_USE_AITER":"1","SAFETENSORS_FAST_GPU":"1"}}},"strategy_overrides":{},"guide":"$68","hf_org":"mistralai","hf_repo":"Mistral-Large-3-675B-Instruct-2512","hf_id":"mistralai/Mistral-Large-3-675B-Instruct-2512","hf_released":"2025-11-28T18:05:12.000Z","engines":{"vllm":{"min_version":"0.11.0"},"sglang":{"engine":"sglang","model_id":"mistralai/Mistral-Large-3-675B-Instruct-2512","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"b200":8},"variants":{"default":{"precision":"fp8"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","mistral"]},"reasoning":{"args":["--reasoning-parser","mistral"]}},"guide":"$69"}},"default_engine":"vllm"},{"meta":{"title":"Ministral-3-Instruct","slug":"ministral-3-instruct","provider":"Mistral AI","description":"Ministral 3 Instruct family (3B/8B/14B) with FP8 weights, vision support, and 256K context","date_updated":"2026-05-25","difficulty":"beginner","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:22:meta:tasks","related_recipes":["mistralai/Ministral-3-8B-Reasoning-2512","mistralai/Mistral-Large-3-675B-Instruct-2512"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:22:meta:hardware"},"model":{"model_id":"mistralai/Ministral-3-14B-Instruct-2512","min_vllm_version":"0.11.0","architecture":"dense","parameter_count":"14B","active_parameters":"14B","context_length":262144,"base_args":["--tokenizer_mode","mistral","--config_format","mistral","--load_format","mistral"],"base_env":{}},"features":{"tool_calling":{"description":"Mistral tool-call parser with automatic tool choice","args":["--enable-auto-tool-choice","--tool-call-parser","mistral"]},"text_only":{"description":"Skip loading the vision encoder for text-only workloads — frees VRAM for KV cache. Mutually exclusive with encoder_parallel.","args":["--language-model-only"]},"encoder_parallel":{"description":"Run the vision encoder in data-parallel mode — avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.","args":["--mm-encoder-tp-mode","data"]}},"opt_in_features":["text_only","encoder_parallel"],"variants":{"default":{"precision":"fp8","vram_minimum_gb":17,"description":"Native FP8 weights (14B), fits on 1x H200"},"8b":{"model_id":"mistralai/Ministral-3-8B-Instruct-2512","precision":"fp8","vram_minimum_gb":12,"description":"Smaller 8B variant with independent embedding/output layers"},"3b":{"model_id":"mistralai/Ministral-3-3B-Instruct-2512","precision":"fp8","vram_minimum_gb":6,"description":"Smallest 3B variant with tied embeddings"},"fp8":{"model_id":"mistralai/Ministral-3-14B-Instruct-2512-FP8","precision":"fp8","vram_minimum_gb":17,"description":"FP8 quantized weights for Hopper/Blackwell GPUs","extra_args":["--kv-cache-dtype","fp8"]}},"compatible_strategies":["single_node_tp","multi_node_tp"],"hardware_overrides":{"amd":{"extra_args":["--config_format","mistral","--load_format","mistral"],"extra_env":{"VLLM_ROCM_USE_AITER":"1"}}},"strategy_overrides":{},"guide":"$6a","hf_org":"mistralai","hf_repo":"Ministral-3-14B-Instruct-2512","hf_id":"mistralai/Ministral-3-14B-Instruct-2512","hf_released":"2025-10-31T08:43:24.000Z","engines":{"vllm":{"min_version":"0.11.0"},"sglang":{"engine":"sglang","model_id":"mistralai/Ministral-3-14B-Instruct-2512","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":1,"mi300x":1},"variants":{"default":{"precision":"fp8"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","mistral"]}},"guide":"$6b"}},"default_engine":"vllm"},{"meta":{"title":"Ministral-3-Reasoning","slug":"ministral-3-reasoning","provider":"Mistral AI","description":"Ministral 3 Reasoning family (3B/8B/14B) with BF16 weights, vision support, and 256K context","date_updated":"2026-05-06","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:42:meta:tasks","related_recipes":["mistralai/Ministral-3-14B-Instruct-2512","mistralai/Mistral-Large-3-675B-Instruct-2512"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:42:meta:hardware"},"model":{"model_id":"mistralai/Ministral-3-8B-Reasoning-2512","min_vllm_version":"0.11.0","architecture":"dense","parameter_count":"8B","active_parameters":"8B","context_length":262144,"base_args":["--tokenizer_mode","mistral","--config_format","mistral","--load_format","mistral"],"base_env":{}},"features":{"tool_calling":{"description":"Mistral tool-call parser with automatic tool choice","args":["--enable-auto-tool-choice","--tool-call-parser","mistral"]},"reasoning":{"description":"Mistral reasoning parser extracts ... into message.reasoning","args":["--reasoning-parser","mistral"]},"text_only":{"description":"Skip loading the vision encoder for text-only workloads — frees VRAM for KV cache. Mutually exclusive with encoder_parallel.","args":["--language-model-only"]},"encoder_parallel":{"description":"Run the vision encoder in data-parallel mode — avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.","args":["--mm-encoder-tp-mode","data"]}},"opt_in_features":["text_only","encoder_parallel"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":22,"description":"Native BF16 weights (8B)"},"3b":{"model_id":"mistralai/Ministral-3-3B-Reasoning-2512","precision":"bf16","vram_minimum_gb":8,"description":"Smallest 3B variant with tied embeddings"},"14b":{"model_id":"mistralai/Ministral-3-14B-Reasoning-2512","precision":"bf16","vram_minimum_gb":32,"description":"Largest 14B variant; 2xH200 recommended for full context","extra_args":["--tensor-parallel-size","2"]}},"compatible_strategies":["single_node_tp","multi_node_tp"],"hardware_overrides":{"amd":{"extra_args":["--no-enable-prefix-caching"],"extra_env":{"VLLM_ROCM_USE_AITER":"1","SAFETENSORS_FAST_GPU":"1"}}},"strategy_overrides":{},"guide":"$6c","hf_org":"mistralai","hf_repo":"Ministral-3-8B-Reasoning-2512","hf_id":"mistralai/Ministral-3-8B-Reasoning-2512","hf_released":"2025-10-31T08:41:36.000Z","engines":{"vllm":{"min_version":"0.11.0"},"sglang":{"engine":"sglang","model_id":"mistralai/Ministral-3-8B-Reasoning-2512","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":1,"gb200":1,"mi300x":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","mistral"]},"reasoning":{"args":["--reasoning-parser","mistral"]}},"guide":"$6d"}},"default_engine":"vllm"}]],["moonshotai",[{"meta":{"title":"Kimi-K2.6","slug":"kimi-k2.6","provider":"Moonshot AI","description":"Open-source native multimodal agentic MoE model with vision-language understanding, tool calling, and thinking modes","date_updated":"2026-05-14","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:28:meta:tasks","performance_headline":"Multimodal agentic MoE model with DeepSeek-V3 backbone and MLA attention","related_recipes":[],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:28:meta:hardware"},"model":{"model_id":"moonshotai/Kimi-K2.6","min_vllm_version":"0.19.1","architecture":"moe","parameter_count":"1T","active_parameters":"32B","context_length":262144,"supports_dcp":true,"base_args":["--trust-remote-code"]},"features":{"tool_calling":{"description":"Kimi K2 tool-call parser with automatic tool choice","args":["--tool-call-parser","kimi_k2","--enable-auto-tool-choice"]},"reasoning":{"description":"Kimi K2 reasoning parser for extracting chain-of-thought content","args":["--reasoning-parser","kimi_k2"]},"spec_decoding":{"description":"Eagle3 speculative decoding for accelerated inference","args":["--speculative-config","{\"model\":\"lightseekorg/kimi-k2.6-eagle3-mla\",\"method\":\"eagle3\",\"num_speculative_tokens\":3}"]},"text_only":{"description":"Skip loading the vision encoder for text-only workloads — frees VRAM for KV cache. Mutually exclusive with encoder_parallel.","args":["--language-model-only"]},"encoder_parallel":{"description":"Run the vision encoder in data-parallel mode — avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.","args":["--mm-encoder-tp-mode","data"]}},"opt_in_features":["text_only"],"hardware_opt_in_features":{"gb200":["encoder_parallel"]},"variants":{"default":{"precision":"int4","vram_minimum_gb":714,"description":"Packed INT4 via compressed-tensors (~595 GB on disk); fits 8×H200"},"nvfp4":{"model_id":"nvidia/Kimi-K2.6-NVFP4","precision":"nvfp4","vram_minimum_gb":600,"description":"NVIDIA NVFP4 quantized weights for Blackwell GPUs (e.g. GB200)","extra_args":["--kv-cache-dtype","fp8"],"extra_env":{"VLLM_USE_FLASHINFER_MOE_FP4":"1"}}},"compatible_strategies":["single_node_tp","single_node_tep","single_node_dep","multi_node_tp","multi_node_tp_pp","multi_node_dep","multi_node_tep","pd_cluster"],"hardware_overrides":{"blackwell":{"extra_args":["--attention-config.use_trtllm_ragged_deepseek_prefill=True"]},"amd":{"extra_args":["--block-size=1"],"extra_env":{"VLLM_ROCM_USE_AITER":"1","VLLM_ROCM_QUICK_REDUCE_QUANTIZATION":"INT4","VLLM_ROCM_USE_AITER_RMSNORM":"0"}}},"strategy_overrides":{"single_node_dep":{"extra_args":[],"extra_env":{}},"single_node_tep":{"extra_args":[],"extra_env":{}},"pd_cluster":{"env":{"VLLM_USE_NCCL_SYMM_MEM":"1","NCCL_CUMEM_ENABLE":"1","NCCL_MNNVL_ENABLE":"1","NCCL_NVLS_ENABLE":"1"},"prefill":{"nodes":1,"parallelism":"dep","vllm_args":["--enforce-eager","--max-num-batched-tokens","16384","--block-size","64"],"env":{}},"decode":{"nodes":1,"parallelism":"dep","vllm_args":["--compilation-config","{\"cudagraph_mode\":\"FULL_DECODE_ONLY\"}","--block-size","64","--all2all-backend","flashinfer_nvlink_one_sided"],"env":{}}}},"guide":"$6e","hf_org":"moonshotai","hf_repo":"Kimi-K2.6","hf_id":"moonshotai/Kimi-K2.6","hf_released":"2026-04-14T04:23:36.000Z","engines":{"vllm":{"min_version":"0.19.1"},"sglang":{"engine":"sglang","model_id":"moonshotai/Kimi-K2.6","min_version":"v0.5.10","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":8,"b300":8,"mi300x":4,"mi325x":4,"mi355x":4},"variants":{"default":{"precision":"int4"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","kimi_k2"]},"reasoning":{"args":["--reasoning-parser","kimi_k2"]}},"guide":"$6f"}},"default_engine":"vllm"},{"meta":{"title":"Kimi-K2.5","slug":"kimi-k2.5","provider":"Moonshot AI","description":"Open-source native multimodal agentic MoE model with vision-language understanding, tool calling, and thinking modes","date_updated":"2026-05-14","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:27:meta:tasks","performance_headline":"Multimodal agentic MoE model with DeepSeek-V3 backbone and MLA attention","related_recipes":[],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:27:meta:hardware"},"model":{"model_id":"moonshotai/Kimi-K2.5","min_vllm_version":"0.19.1","architecture":"moe","parameter_count":"1T","active_parameters":"32B","context_length":262144,"supports_dcp":true,"base_args":["--trust-remote-code"]},"features":{"tool_calling":{"description":"Kimi K2 tool-call parser with automatic tool choice","args":["--tool-call-parser","kimi_k2","--enable-auto-tool-choice"]},"reasoning":{"description":"Kimi K2 reasoning parser for extracting chain-of-thought content","args":["--reasoning-parser","kimi_k2"]},"spec_decoding":{"description":"Eagle3 speculative decoding for accelerated inference (requires vLLM >= 0.18.0)","args":["--speculative-config","{\"model\":\"lightseekorg/kimi-k2.5-eagle3-mla\",\"method\":\"eagle3\",\"num_speculative_tokens\":3}"]},"text_only":{"description":"Skip loading the vision encoder for text-only workloads — frees VRAM for KV cache. Mutually exclusive with encoder_parallel.","args":["--language-model-only"]},"encoder_parallel":{"description":"Run the vision encoder in data-parallel mode — avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.","args":["--mm-encoder-tp-mode","data"]}},"opt_in_features":["text_only"],"hardware_opt_in_features":{"gb200":["encoder_parallel"]},"variants":{"default":{"precision":"int4","vram_minimum_gb":714,"description":"Packed INT4 via compressed-tensors (~595 GB on disk); fits 8×H200"},"nvfp4":{"model_id":"nvidia/Kimi-K2.5-NVFP4","precision":"nvfp4","vram_minimum_gb":600,"description":"NVIDIA NVFP4 quantized weights for Blackwell GPUs (e.g. GB200)","extra_args":["--kv-cache-dtype","fp8"],"extra_env":{"VLLM_USE_FLASHINFER_MOE_FP4":"1"}}},"compatible_strategies":["single_node_tp","single_node_tep","single_node_dep","multi_node_tp","multi_node_tp_pp","multi_node_dep","multi_node_tep","pd_cluster"],"hardware_overrides":{"blackwell":{"extra_args":["--attention-config.use_trtllm_ragged_deepseek_prefill=True"]},"amd":{"extra_args":["--block-size=1"],"extra_env":{"VLLM_ROCM_USE_AITER":"1","VLLM_ROCM_QUICK_REDUCE_QUANTIZATION":"INT4","VLLM_ROCM_USE_AITER_RMSNORM":"0"}}},"strategy_overrides":{"single_node_dep":{"extra_args":[],"extra_env":{}},"single_node_tep":{"extra_args":[],"extra_env":{}},"pd_cluster":{"env":{"VLLM_USE_NCCL_SYMM_MEM":"1","NCCL_CUMEM_ENABLE":"1","NCCL_MNNVL_ENABLE":"1","NCCL_NVLS_ENABLE":"1"},"prefill":{"nodes":1,"parallelism":"dep","vllm_args":["--enforce-eager","--max-num-batched-tokens","16384","--block-size","64"],"env":{}},"decode":{"nodes":1,"parallelism":"dep","vllm_args":["--compilation-config","{\"cudagraph_mode\":\"FULL_DECODE_ONLY\"}","--block-size","64","--all2all-backend","flashinfer_nvlink_one_sided"],"env":{}}}},"guide":"$70","hf_org":"moonshotai","hf_repo":"Kimi-K2.5","hf_id":"moonshotai/Kimi-K2.5","hf_released":"2026-01-01T06:06:03.000Z","engines":{"vllm":{"min_version":"0.19.1"},"sglang":{"engine":"sglang","model_id":"moonshotai/Kimi-K2.5","min_version":"v0.5.8","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":8,"b300":8,"mi300x":4,"mi325x":4,"mi355x":4},"variants":{"default":{"precision":"int4"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","kimi_k2"]},"reasoning":{"args":["--reasoning-parser","kimi_k2"]}},"guide":"$71"}},"default_engine":"vllm"},{"meta":{"title":"Kimi-K2-Thinking","slug":"kimi-k2-thinking","provider":"Moonshot AI","description":"Kimi-K2-Thinking is an advanced reasoning MoE model with native INT4 QAT weights, designed for long-horizon agent workflows interleaving chain-of-thought reasoning with tool calls.","date_updated":"2026-04-17","difficulty":"advanced","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:93:meta:tasks","performance_headline":"1T MoE thinking model with native INT4 QAT for 2x low-latency speed-up","related_recipes":["moonshotai/Kimi-K2-Instruct","moonshotai/Kimi-K2.5"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:93:meta:hardware"},"model":{"model_id":"moonshotai/Kimi-K2-Thinking","min_vllm_version":"0.12.0","architecture":"moe","parameter_count":"1T","active_parameters":"32B","context_length":262144,"supports_dcp":true,"base_args":["--trust-remote-code"],"base_env":{}},"features":{"tool_calling":{"description":"Enable Kimi K2 tool calling with the kimi_k2 tool-call parser.","args":["--enable-auto-tool-choice","--tool-call-parser","kimi_k2"]},"reasoning":{"description":"Kimi K2 reasoning parser for extracting chain-of-thought content.","args":["--reasoning-parser","kimi_k2"]}},"opt_in_features":[],"variants":{"default":{"precision":"int4","vram_minimum_gb":600,"description":"Native INT4 (QAT) weights on 8xH200 / 8xH20; 2x low-latency speed-up vs FP8"},"nvfp4":{"model_id":"nvidia/Kimi-K2-Thinking-NVFP4","precision":"nvfp4","vram_minimum_gb":600,"description":"NVIDIA NVFP4 quantized weights for Blackwell GPUs","extra_args":["--kv-cache-dtype","fp8"],"extra_env":{"VLLM_USE_FLASHINFER_MOE_FP4":"1"}}},"compatible_strategies":["single_node_tp","single_node_tep","single_node_dep","multi_node_tp","multi_node_tp_pp","multi_node_dep","multi_node_tep","pd_cluster"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$72","hf_org":"moonshotai","hf_repo":"Kimi-K2-Thinking","hf_id":"moonshotai/Kimi-K2-Thinking","hf_released":"2025-11-04T08:25:31.000Z","engines":{"vllm":{"min_version":"0.12.0"},"sglang":{"engine":"sglang","model_id":"moonshotai/Kimi-K2-Thinking","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":8,"b200":8},"variants":{"default":{"precision":"int4"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","kimi_k2"]},"reasoning":{"args":["--reasoning-parser","kimi_k2"]}},"guide":"$73"}},"default_engine":"vllm"},{"meta":{"title":"Kimi-Linear-48B-A3B-Instruct","slug":"kimi-linear-48b-a3b-instruct","provider":"Moonshot AI","description":"Kimi-Linear is a 48B-parameter instruction-tuned MoE model (~3B activated) with a linear-attention variant supporting very long context (1M tokens).","date_updated":"2026-04-17","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:94:meta:tasks","performance_headline":"Linear-attention MoE with 1M-token context on a single node","related_recipes":[]},"model":{"model_id":"moonshotai/Kimi-Linear-48B-A3B-Instruct","min_vllm_version":"0.11.2","architecture":"moe","parameter_count":"48B","active_parameters":"3B","context_length":1048576,"base_args":["--trust-remote-code"],"base_env":{}},"dependencies":[{"note":"Pin vllm==0.11.2 — 0.12.0 has a known Kimi-Linear regression","command":"uv pip install vllm==0.11.2 --torch-backend auto"}],"features":{},"opt_in_features":[],"variants":{"default":{"precision":"bf16","vram_minimum_gb":115,"description":"Full precision BF16 on 4 or 8 GPUs (single node)"}},"compatible_strategies":["single_node_tp","multi_node_tp","multi_node_tp_pp","multi_node_tep","multi_node_dep"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$74","hf_org":"moonshotai","hf_repo":"Kimi-Linear-48B-A3B-Instruct","hf_id":"moonshotai/Kimi-Linear-48B-A3B-Instruct","hf_released":"2025-10-30T12:37:31.000Z","engines":{"vllm":{"min_version":"0.11.2"},"sglang":{"engine":"sglang","model_id":"moonshotai/Kimi-Linear-48B-A3B-Instruct","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{},"guide":"$75"}},"default_engine":"vllm"},{"meta":{"title":"Kimi-K2-Instruct","slug":"kimi-k2-instruct","provider":"Moonshot AI","description":"Moonshot AI's Kimi-K2 is a trillion-parameter MoE instruction model (~32B active) with native FP8 weights and strong tool-calling capabilities.","date_updated":"2026-04-17","difficulty":"advanced","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:92:meta:tasks","performance_headline":"Open-weights 1T-parameter MoE with native FP8 and Kimi K2 tool calling","related_recipes":["moonshotai/Kimi-K2-Thinking","moonshotai/Kimi-K2.5"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:92:meta:hardware"},"model":{"model_id":"moonshotai/Kimi-K2-Instruct","min_vllm_version":"0.12.0","architecture":"moe","parameter_count":"1T","active_parameters":"32B","context_length":131072,"supports_dcp":true,"base_args":["--trust-remote-code","--tokenizer-mode","auto"],"base_env":{}},"dependencies":[{"note":"Optional: DeepEP + DeepGEMM for the DP+EP deployment path on H800/H200","command":"uv pip install git+https://github.com/deepseek-ai/DeepGEMM.git@v2.1.1.post3 --no-build-isolation","optional":true}],"features":{"tool_calling":{"description":"Enable Kimi K2 tool calling with the kimi_k2 tool-call parser.","args":["--enable-auto-tool-choice","--tool-call-parser","kimi_k2"]}},"opt_in_features":[],"variants":{"default":{"precision":"fp8","vram_minimum_gb":1200,"description":"Native FP8 weights on 16xH800 / 16xH200 (smallest deployment for 128k seqlen)"}},"compatible_strategies":["single_node_tp","single_node_tep","single_node_dep","multi_node_tp","multi_node_tp_pp","multi_node_dep","multi_node_tep","pd_cluster"],"hardware_overrides":{},"strategy_overrides":{"multi_node_tp_pp":{"vllm_args":["--dtype","bfloat16","--quantization","fp8","--kv-cache-dtype","fp8","--decode-context-parallel-size","8","--enable-chunked-prefill","--max-model-len","65536","--max-num-batched-tokens","1024","--max-num-seqs","1","--disable-log-requests"]}},"guide":"$76","hf_org":"moonshotai","hf_repo":"Kimi-K2-Instruct","hf_id":"moonshotai/Kimi-K2-Instruct","hf_released":"2025-07-11T00:55:12.000Z","engines":{"vllm":{"min_version":"0.12.0"},"sglang":{"engine":"sglang","model_id":"moonshotai/Kimi-K2-Instruct","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":8,"b200":8},"variants":{"default":{"precision":"fp8"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","kimi_k2"]},"reasoning":{"args":["--reasoning-parser","kimi_k2"]}},"guide":"$77"}},"default_engine":"vllm"}]],["nvidia",[{"meta":{"title":"NVIDIA Nemotron-3-Ultra-550B-A55B","slug":"nemotron-3-ultra-550b-a55b","provider":"NVIDIA","description":"NVIDIA Nemotron 3 Ultra hybrid Transformer-Mamba MoE model for long-context agentic reasoning, coding, and tool use.","date_updated":"2026-06-03","difficulty":"advanced","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:8:meta:tasks","performance_headline":"550B total / 55B active parameters with BF16 and NVFP4 serving paths","related_recipes":["nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:8:meta:hardware"},"model":{"model_id":"nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16","min_vllm_version":"0.22.0","docker_image":"vllm/vllm-openai:v0.22.0","architecture":"moe","parameter_count":"550B","active_parameters":"55B","context_length":262144,"base_args":["--served-model-name","nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B","--trust-remote-code","--kv-cache-dtype","fp8","--max-num-seqs","16","--max-model-len","262144","--gpu-memory-utilization","0.90","--max-num-batched-tokens","32768","--enable-flashinfer-autotune","--async-scheduling","--mamba-backend","triton","--mamba-ssm-cache-dtype","float32"],"base_env":{"VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS":"1"}},"features":{"tool_calling":{"description":"Qwen3 Coder tool-call parser with automatic tool choice","args":["--enable-auto-tool-choice","--tool-call-parser","qwen3_coder"]},"reasoning":{"description":"Nemotron v3 reasoning parser","args":["--reasoning-parser","nemotron_v3"]},"spec_decoding":{"description":"Multi-Token Prediction with 5 speculative tokens","args":["--speculative_config.method","mtp","--speculative_config.num_speculative_tokens","5"]}},"opt_in_features":[],"variants":{"default":{"precision":"bf16","vram_minimum_gb":1320,"description":"BF16 weights"},"fp4":{"model_id":"nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-NVFP4","precision":"nvfp4","vram_minimum_gb":330,"description":"NVFP4 weights"}},"compatible_strategies":["single_node_tp","multi_node_tp"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$78","hf_org":"nvidia","hf_repo":"NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16","hf_id":"nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16","hf_released":"2026-06-03T14:50:04.000Z","engines":{"vllm":{"min_version":"0.22.0"},"sglang":{"engine":"sglang","model_id":"nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16","min_version":"v0.5.10","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"b200":8},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","qwen3_coder"]},"reasoning":{"args":["--reasoning-parser","nemotron_3"]}},"guide":"$79"}},"default_engine":"vllm"},{"meta":{"title":"Cosmos3-Super-Text2Image","slug":"cosmos3-super-text2image","provider":"NVIDIA","description":"64B Cosmos3-Super specialization for high-fidelity text-to-image generation","date_updated":"2026-06-02","difficulty":"advanced","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:13:meta:tasks","performance_headline":"High-fidelity text-to-image — 8×H100 with CFG-parallel + ulysses + HSDP","related_recipes":["nvidia/Cosmos3-Super","nvidia/Cosmos3-Nano","nvidia/Cosmos3-Super-Image2Video"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:13:meta:hardware"},"model":{"model_id":"nvidia/Cosmos3-Super-Text2Image","min_vllm_version":"0.21.0","docker_image":"vllm/vllm-omni:cosmos3","install":{"docker":{"note":"Release-tested vLLM-Omni image bundling the Cosmos3 omni handlers."},"pip":false},"architecture":"dense","parameter_count":"64B","active_parameters":"64B","context_length":262144,"base_args":["--host","0.0.0.0","--port","8000","--cfg-parallel-size","2","--ulysses-degree","4","--tensor-parallel-size","1","--use-hsdp","--hsdp-shard-size","8","--init-timeout","1800"],"base_env":{}},"omni":{"tasks":[{"id":"t2i","description":"Text → high-fidelity image"}]},"features":{},"opt_in_features":[],"variants":{"default":{"precision":"bf16","vram_minimum_gb":154,"description":"BF16 weights — 8×H100 node recommended"}},"compatible_strategies":[],"hardware_overrides":{},"strategy_overrides":{},"guide":"$7a","hf_org":"nvidia","hf_repo":"Cosmos3-Super-Text2Image","hf_id":"nvidia/Cosmos3-Super-Text2Image","hf_released":"2026-05-28T03:49:21.000Z"},{"meta":{"title":"Cosmos3-Super-Image2Video","slug":"cosmos3-super-image2video","provider":"NVIDIA","description":"64B Cosmos3-Super specialization for temporally coherent image-to-video generation","date_updated":"2026-06-02","difficulty":"advanced","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:12:meta:tasks","performance_headline":"Temporally coherent image-to-video — ~55s/video on 8×H200","related_recipes":["nvidia/Cosmos3-Super","nvidia/Cosmos3-Nano","nvidia/Cosmos3-Super-Text2Image"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:12:meta:hardware"},"model":{"model_id":"nvidia/Cosmos3-Super-Image2Video","min_vllm_version":"0.21.0","docker_image":"vllm/vllm-omni:cosmos3","install":{"docker":{"note":"Release-tested vLLM-Omni image bundling the Cosmos3 omni handlers."},"pip":false},"architecture":"dense","parameter_count":"64B","active_parameters":"64B","context_length":262144,"base_args":["--host","0.0.0.0","--port","8000","--cfg-parallel-size","2","--ulysses-degree","4","--use-hsdp","--hsdp-shard-size","8","--init-timeout","1800"],"base_env":{}},"omni":{"tasks":[{"id":"i2v","description":"Image → temporally coherent video","curl":"curl -X POST http://localhost:8000/v1/videos/sync \\\n -H \"Accept: video/mp4\" \\\n -F \"input_reference=@assets/example_first_frame.png\" \\\n -F \"prompt=$(cat assets/example_prompt.json)\" \\\n -F \"size=1280x720\" \\\n -F \"num_frames=189\" \\\n -F \"fps=24\" \\\n -F \"num_inference_steps=35\" \\\n -F \"guidance_scale=6.0\" \\\n -F \"max_sequence_length=4096\" \\\n -F \"flow_shift=10.0\" \\\n -F \"seed=1111\" \\\n --output output.mp4\n"}]},"features":{},"opt_in_features":[],"variants":{"default":{"precision":"bf16","vram_minimum_gb":154,"description":"BF16 weights — 8×H200/H100/A100 node recommended"}},"compatible_strategies":[],"hardware_overrides":{},"strategy_overrides":{},"guide":"$7b","hf_org":"nvidia","hf_repo":"Cosmos3-Super-Image2Video","hf_id":"nvidia/Cosmos3-Super-Image2Video","hf_released":"2026-05-21T19:21:28.000Z"},{"meta":{"title":"Nemotron-3-Nano-Omni-30B-A3B-Reasoning","slug":"nemotron-3-nano-omni-30b-a3b-reasoning","provider":"NVIDIA","description":"Mamba2-Transformer hybrid MoE omnimodal model (31B total / 3B active) with unified video, audio, image, and text understanding; reasoning + tool calling; BF16, FP8, and NVFP4 variants","date_updated":"2026-04-29","difficulty":"advanced","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:45:meta:tasks","related_recipes":["nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16","nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:45:meta:hardware"},"model":{"model_id":"nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-BF16","min_vllm_version":"0.20.0","install":{"pip":{"command":"uv pip install \"vllm[audio]==0.20.0\"","note":"Pinned to 0.20.0 with the audio extra (required for audio + use_audio_in_video)."}},"architecture":"moe","parameter_count":"31B","active_parameters":"3B","context_length":262144,"base_args":["--trust-remote-code","--max-model-len","131072","--media-io-kwargs","{\"video\": {\"num_frames\": 512, \"fps\": 1}}","--video-pruning-rate","0.5"],"base_env":{}},"features":{"tool_calling":{"description":"Qwen3 Coder tool-call parser with automatic tool choice","args":["--enable-auto-tool-choice","--tool-call-parser","qwen3_coder"]},"reasoning":{"description":"Nemotron v3 reasoning parser (chain-of-thought with tags)","args":["--reasoning-parser","nemotron_v3"]}},"opt_in_features":[],"variants":{"default":{"precision":"bf16","vram_minimum_gb":75,"description":"BF16 weights — full-precision reference"},"fp8":{"model_id":"nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-FP8","precision":"fp8","vram_minimum_gb":38,"description":"ModelOpt FP8 weights + FP8 KV cache","extra_args":["--kv-cache-dtype","fp8"]},"nvfp4":{"model_id":"nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4","precision":"nvfp4","vram_minimum_gb":28,"description":"ModelOpt NVFP4 weights — Blackwell-only; smallest footprint","extra_args":["--kv-cache-dtype","fp8"]}},"compatible_strategies":["single_node_tp","multi_node_tp"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$7c","hf_org":"nvidia","hf_repo":"Nemotron-3-Nano-Omni-30B-A3B-Reasoning-BF16","hf_id":"nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-BF16","hf_released":"2026-04-20T04:40:42.000Z","engines":{"vllm":{"min_version":"0.20.0"},"sglang":{"engine":"sglang","model_id":"nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-BF16","min_version":"v0.5.10","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":1,"b200":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"reasoning":{"args":["--reasoning-parser","nemotron_3"]}},"guide":"$7d"}},"default_engine":"vllm"},{"meta":{"title":"Cosmos3-Nano","slug":"cosmos3-nano","provider":"NVIDIA","description":"Compact 16B omnimodal world model (Mixture-of-Transformers) for multimodal understanding, world simulation, future prediction, action reasoning, and Physical AI","date_updated":"2026-06-02","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:11:meta:tasks","performance_headline":"16B omnimodal world model — single-GPU H200 video/audio generation","related_recipes":["nvidia/Cosmos3-Super","nvidia/Cosmos3-Super-Text2Image","nvidia/Cosmos3-Super-Image2Video"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:11:meta:hardware"},"model":{"model_id":"nvidia/Cosmos3-Nano","min_vllm_version":"0.21.0","docker_image":"vllm/vllm-omni:cosmos3","install":{"docker":{"note":"Release-tested vLLM-Omni image bundling the Cosmos3 omni handlers."},"pip":false},"architecture":"dense","parameter_count":"16B","active_parameters":"16B","context_length":262144,"base_args":["--host","0.0.0.0","--port","8000","--init-timeout","1800"],"base_env":{}},"omni":{"tasks":[{"id":"t2v","description":"Text → video (audio muxed into the MP4)","curl":"curl -X POST http://localhost:8000/v1/videos/sync \\\n -H \"Accept: video/mp4\" \\\n -F \"prompt=$(cat assets/example_t2v_prompt.json)\" \\\n -F \"negative_prompt=$(cat assets/negative_prompt.json)\" \\\n -F \"size=1280x720\" \\\n -F \"num_frames=189\" \\\n -F \"fps=24\" \\\n -F \"num_inference_steps=35\" \\\n -F \"guidance_scale=6.0\" \\\n -F \"max_sequence_length=4096\" \\\n -F \"flow_shift=10.0\" \\\n -F \"seed=123\" \\\n --output output.mp4\n"},{"id":"i2v","description":"Image → video (temporally coherent, audio muxed in)","curl":"curl -X POST http://localhost:8000/v1/videos/sync \\\n -H \"Accept: video/mp4\" \\\n -F \"input_reference=@assets/example_i2v_input.jpg\" \\\n -F \"prompt=$(cat assets/example_i2v_prompt.json)\" \\\n -F \"negative_prompt=$(cat assets/negative_prompt.json)\" \\\n -F \"size=1280x720\" \\\n -F \"num_frames=189\" \\\n -F \"fps=24\" \\\n -F \"num_inference_steps=35\" \\\n -F \"guidance_scale=6.0\" \\\n -F \"max_sequence_length=4096\" \\\n -F \"flow_shift=10.0\" \\\n -F \"seed=1111\" \\\n --output output.mp4\n"}]},"features":{},"opt_in_features":[],"variants":{"default":{"precision":"bf16","vram_minimum_gb":39,"description":"BF16 weights — single-GPU H200 omni generation"}},"compatible_strategies":[],"hardware_overrides":{},"strategy_overrides":{},"guide":"$7e","hf_org":"nvidia","hf_repo":"Cosmos3-Nano","hf_id":"nvidia/Cosmos3-Nano","hf_released":"2026-03-10T23:01:40.000Z"},{"meta":{"title":"Cosmos3-Super","slug":"cosmos3-super","provider":"NVIDIA","description":"Frontier-scale 64B omnimodal world model (Mixture-of-Transformers) for advanced multimodal understanding, world simulation, future prediction, action reasoning, and Physical AI","date_updated":"2026-06-02","difficulty":"advanced","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:14:meta:tasks","performance_headline":"64B omnimodal world model — ~55s/video on 8×H200 with HSDP + ulysses","related_recipes":["nvidia/Cosmos3-Nano","nvidia/Cosmos3-Super-Text2Image","nvidia/Cosmos3-Super-Image2Video"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:14:meta:hardware"},"model":{"model_id":"nvidia/Cosmos3-Super","min_vllm_version":"0.21.0","docker_image":"vllm/vllm-omni:cosmos3","install":{"docker":{"note":"Release-tested vLLM-Omni image bundling the Cosmos3 omni handlers."},"pip":false},"architecture":"dense","parameter_count":"64B","active_parameters":"64B","context_length":262144,"base_args":["--host","0.0.0.0","--port","8000","--cfg-parallel-size","2","--ulysses-degree","4","--use-hsdp","--hsdp-shard-size","8","--init-timeout","1800"],"base_env":{}},"omni":{"tasks":[{"id":"t2v","description":"Text → video (audio muxed into the MP4)","curl":"curl -X POST http://localhost:8000/v1/videos/sync \\\n -H \"Accept: video/mp4\" \\\n -F \"prompt=$(cat assets/example_t2v_prompt.json)\" \\\n -F \"negative_prompt=$(cat assets/negative_prompt.json)\" \\\n -F \"size=1280x720\" \\\n -F \"num_frames=189\" \\\n -F \"fps=24\" \\\n -F \"num_inference_steps=35\" \\\n -F \"guidance_scale=6.0\" \\\n -F \"max_sequence_length=4096\" \\\n -F \"flow_shift=10.0\" \\\n -F \"seed=123\" \\\n --output output.mp4\n"},{"id":"i2v","description":"Image → video (temporally coherent, audio muxed in)","curl":"curl -X POST http://localhost:8000/v1/videos/sync \\\n -H \"Accept: video/mp4\" \\\n -F \"input_reference=@assets/example_i2v_input.jpg\" \\\n -F \"prompt=$(cat assets/example_i2v_prompt.json)\" \\\n -F \"negative_prompt=$(cat assets/negative_prompt.json)\" \\\n -F \"size=1280x720\" \\\n -F \"num_frames=189\" \\\n -F \"fps=24\" \\\n -F \"num_inference_steps=35\" \\\n -F \"guidance_scale=6.0\" \\\n -F \"max_sequence_length=4096\" \\\n -F \"flow_shift=10.0\" \\\n -F \"seed=1111\" \\\n --output output.mp4\n"}]},"features":{},"opt_in_features":[],"variants":{"default":{"precision":"bf16","vram_minimum_gb":154,"description":"BF16 weights — frontier-scale; 8×H200/H100/A100 node recommended"}},"compatible_strategies":[],"hardware_overrides":{},"strategy_overrides":{},"guide":"$7f","hf_org":"nvidia","hf_repo":"Cosmos3-Super","hf_id":"nvidia/Cosmos3-Super","hf_released":"2026-03-10T22:59:21.000Z"},{"meta":{"title":"NVIDIA Nemotron-3-Super-120B-A12B","slug":"nemotron-3-super-120b-a12b","provider":"NVIDIA","description":"NVIDIA Nemotron-3-Super Mamba-hybrid latent-MoE (~120B total / ~12B active) with BF16, FP8, and NVFP4 variants","date_updated":"2026-04-28","difficulty":"advanced","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:48:meta:tasks","related_recipes":["nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:48:meta:hardware"},"model":{"model_id":"nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16","min_vllm_version":"0.17.1","architecture":"moe","parameter_count":"120B","active_parameters":"12B","context_length":262144,"base_args":["--trust-remote-code"],"base_env":{}},"features":{"tool_calling":{"description":"Qwen3 Coder tool-call parser with automatic tool choice","args":["--enable-auto-tool-choice","--tool-call-parser","qwen3_coder"]},"reasoning":{"description":"Built-in nemotron_v3 reasoning parser (vLLM >= 0.17.1)","args":["--reasoning-parser","nemotron_v3"]}},"opt_in_features":[],"variants":{"default":{"precision":"bf16","vram_minimum_gb":298,"description":"BF16 weights (FP8 KV cache recommended)","extra_args":["--kv-cache-dtype","fp8"]},"fp8":{"model_id":"nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8","precision":"fp8","vram_minimum_gb":149,"description":"FP8 weights + FP8 KV cache","extra_args":["--kv-cache-dtype","fp8"]},"nvfp4":{"model_id":"nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4","precision":"nvfp4","vram_minimum_gb":75,"description":"NVFP4 weights for Blackwell"},"base_bf16":{"model_id":"nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-Base-BF16","precision":"bf16","vram_minimum_gb":298,"description":"Pre-RL base checkpoint (BF16)"}},"compatible_strategies":["single_node_tp","multi_node_tp","multi_node_tp_pp","multi_node_tep","multi_node_dep"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$80","hf_org":"nvidia","hf_repo":"NVIDIA-Nemotron-3-Super-120B-A12B-BF16","hf_id":"nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16","hf_released":"2026-03-10T18:32:14.000Z","engines":{"vllm":{"min_version":"0.17.1"},"sglang":{"engine":"sglang","model_id":"nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16","min_version":"v0.5.8","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":4,"b200":4},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","qwen3_coder"]},"reasoning":{"args":["--reasoning-parser","nemotron_3"]}},"guide":"$81"}},"default_engine":"vllm"},{"meta":{"title":"NVIDIA Nemotron-3-Nano-4B","slug":"nemotron-3-nano-4b","provider":"NVIDIA","description":"NVIDIA Nemotron-3-Nano 4B (Mamba-hybrid dense) — compact reasoning + tool-use model with BF16 and FP8 variants","date_updated":"2026-04-28","difficulty":"beginner","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:47:meta:tasks","related_recipes":["nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16","nvidia/NVIDIA-Nemotron-Nano-9B-v2"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:47:meta:hardware"},"model":{"model_id":"nvidia/NVIDIA-Nemotron-3-Nano-4B-BF16","min_vllm_version":"0.11.2","architecture":"dense","parameter_count":"4B","active_parameters":"4B","context_length":262144,"base_args":["--trust-remote-code","--async-scheduling"],"base_env":{}},"features":{"tool_calling":{"description":"Qwen3 Coder tool-call parser with automatic tool choice","args":["--enable-auto-tool-choice","--tool-call-parser","qwen3_coder"]},"reasoning":{"description":"Custom Nano v3 reasoning parser (download plugin: nano_v3_reasoning_parser.py)","args":["--reasoning-parser-plugin","nano_v3_reasoning_parser.py","--reasoning-parser","nano_v3"]}},"opt_in_features":[],"variants":{"default":{"precision":"bf16","vram_minimum_gb":10,"description":"BF16 weights","extra_args":["--kv-cache-dtype","auto"]},"fp8":{"model_id":"nvidia/NVIDIA-Nemotron-3-Nano-4B-FP8","precision":"fp8","vram_minimum_gb":5,"description":"FP8 weights + FP8 KV cache","extra_args":["--kv-cache-dtype","fp8"]}},"compatible_strategies":["single_node_tp","multi_node_tp","multi_node_tp_pp"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$82","hf_org":"nvidia","hf_repo":"NVIDIA-Nemotron-3-Nano-4B-BF16","hf_id":"nvidia/NVIDIA-Nemotron-3-Nano-4B-BF16","hf_released":"2026-03-07T01:23:35.000Z","engines":{"vllm":{"min_version":"0.11.2"},"sglang":{"engine":"sglang","model_id":"nvidia/NVIDIA-Nemotron-3-Nano-4B-BF16","min_version":"v0.5.10","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":1,"b200":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"reasoning":{"args":["--reasoning-parser","nemotron_3"]}},"guide":"$83"}},"default_engine":"vllm"},{"meta":{"title":"NVIDIA Nemotron-3-Nano-30B-A3B","slug":"nemotron-3-nano-30b-a3b","provider":"NVIDIA","description":"NVIDIA Nemotron-3-Nano Mamba-hybrid MoE (30B total / ~3B active) with BF16 and FP8 variants","date_updated":"2026-04-17","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:95:meta:tasks","related_recipes":["nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:95:meta:hardware"},"model":{"model_id":"nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16","min_vllm_version":"0.11.2","architecture":"moe","parameter_count":"30B","active_parameters":"3B","context_length":262144,"base_args":["--trust-remote-code","--async-scheduling"],"base_env":{}},"features":{"tool_calling":{"description":"Qwen3 Coder tool-call parser with automatic tool choice","args":["--enable-auto-tool-choice","--tool-call-parser","qwen3_coder"]},"reasoning":{"description":"Custom Nano v3 reasoning parser (download plugin: nano_v3_reasoning_parser.py)","args":["--reasoning-parser-plugin","nano_v3_reasoning_parser.py","--reasoning-parser","nano_v3"]}},"opt_in_features":[],"variants":{"default":{"precision":"bf16","vram_minimum_gb":72,"description":"BF16 weights","extra_args":["--kv-cache-dtype","auto"]},"fp8":{"model_id":"nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8","precision":"fp8","vram_minimum_gb":35,"description":"FP8 weights + FP8 KV cache","extra_args":["--kv-cache-dtype","fp8"],"extra_env":{"VLLM_USE_FLASHINFER_MOE_FP8":"1","VLLM_FLASHINFER_MOE_BACKEND":"throughput"}}},"compatible_strategies":["single_node_tp","multi_node_tp","multi_node_tp_pp","multi_node_tep","multi_node_dep"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$84","hf_org":"nvidia","hf_repo":"NVIDIA-Nemotron-3-Nano-30B-A3B-BF16","hf_id":"nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16","hf_released":"2025-12-04T03:37:11.000Z","engines":{"vllm":{"min_version":"0.11.2"},"sglang":{"engine":"sglang","model_id":"nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":1,"b200":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","qwen3_coder"]},"reasoning":{"args":["--reasoning-parser","nano_v3"]}},"guide":"$85"}},"default_engine":"vllm"},{"meta":{"title":"NVIDIA Nemotron-Nano-12B-v2-VL","slug":"nemotron-nano-12b-v2-vl","provider":"NVIDIA","description":"NVIDIA Nemotron-Nano 12B vision-language model with video support and Efficient Video Sampling (EVS)","date_updated":"2026-04-17","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:96:meta:tasks","related_recipes":["nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:96:meta:hardware"},"model":{"model_id":"nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16","min_vllm_version":"0.11.1","architecture":"dense","parameter_count":"12B","active_parameters":"12B","context_length":131072,"base_args":["--trust-remote-code"],"base_env":{"VLLM_VIDEO_LOADER_BACKEND":"opencv"}},"features":{"video_compression":{"description":"Efficient Video Sampling (EVS) prunes video tokens; 0.75 means 75% pruning","args":["--video-pruning-rate","0.75"]},"text_only":{"description":"Skip loading the vision encoder for text-only workloads — frees VRAM for KV cache. Mutually exclusive with encoder_parallel.","args":["--language-model-only"]},"encoder_parallel":{"description":"Run the vision encoder in data-parallel mode — avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.","args":["--mm-encoder-tp-mode","data"]}},"opt_in_features":["text_only","encoder_parallel"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":29,"description":"BF16 weights on 1 GPU"},"fp8":{"model_id":"nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-FP8","precision":"fp8","vram_minimum_gb":14,"description":"FP8 weights on 1 GPU"},"nvfp4":{"model_id":"nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-NVFP4-QAD","precision":"nvfp4","vram_minimum_gb":8,"description":"NVFP4 (QAD) weights for Blackwell"}},"compatible_strategies":["single_node_tp","multi_node_tp","multi_node_tp_pp"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$86","hf_org":"nvidia","hf_repo":"NVIDIA-Nemotron-Nano-12B-v2-VL-BF16","hf_id":"nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16","hf_released":"2025-10-21T18:11:05.000Z","engines":{"vllm":{"min_version":"0.11.1"},"sglang":{"engine":"sglang","model_id":"nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16","min_version":"v0.5.10","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"b200":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{},"guide":"$87"}},"default_engine":"vllm"},{"meta":{"title":"NVIDIA Nemotron-Nano-9B-v2","slug":"nemotron-nano-9b-v2","provider":"NVIDIA","description":"NVIDIA Nemotron-Nano 9B (Mamba-hybrid dense) reasoning + tool-use model with FP8 / NVFP4 / Japanese variants","date_updated":"2026-04-28","difficulty":"beginner","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:49:meta:tasks","related_recipes":["nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16","nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:49:meta:hardware"},"model":{"model_id":"nvidia/NVIDIA-Nemotron-Nano-9B-v2","min_vllm_version":"0.10.1","architecture":"dense","parameter_count":"9B","active_parameters":"9B","context_length":131072,"base_args":["--trust-remote-code"],"base_env":{}},"features":{"tool_calling":{"description":"Custom Nemotron tool-call parser plugin (download: nemotron_toolcall_parser_no_streaming.py)","args":["--enable-auto-tool-choice","--tool-call-parser-plugin","nemotron_toolcall_parser_no_streaming.py","--tool-call-parser","nemotron_json"]}},"opt_in_features":[],"variants":{"default":{"precision":"bf16","vram_minimum_gb":22,"description":"BF16 weights on 1 GPU"},"fp8":{"model_id":"nvidia/NVIDIA-Nemotron-Nano-9B-v2-FP8","precision":"fp8","vram_minimum_gb":11,"description":"FP8 weights"},"nvfp4":{"model_id":"nvidia/NVIDIA-Nemotron-Nano-9B-v2-NVFP4","precision":"nvfp4","vram_minimum_gb":6,"description":"NVFP4 weights for Blackwell"},"base":{"model_id":"nvidia/NVIDIA-Nemotron-Nano-9B-v2-Base","precision":"bf16","vram_minimum_gb":22,"description":"Pre-RL base checkpoint (BF16)"},"japanese":{"model_id":"nvidia/NVIDIA-Nemotron-Nano-9B-v2-Japanese","precision":"bf16","vram_minimum_gb":22,"description":"Japanese-specialized fine-tune (BF16)"}},"compatible_strategies":["single_node_tp","multi_node_tp","multi_node_tp_pp"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$88","hf_org":"nvidia","hf_repo":"NVIDIA-Nemotron-Nano-9B-v2","hf_id":"nvidia/NVIDIA-Nemotron-Nano-9B-v2","hf_released":"2025-08-12T22:43:32.000Z","engines":{"vllm":{"min_version":"0.10.1"},"sglang":{"engine":"sglang","model_id":"nvidia/NVIDIA-Nemotron-Nano-9B-v2","min_version":"v0.5.10","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":1,"b200":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"reasoning":{"args":["--reasoning-parser","nemotron_3"]}},"guide":"$89"}},"default_engine":"vllm"}]],["openai",[{"meta":{"title":"GPT-OSS 20B","slug":"gpt-oss-20b","provider":"OpenAI","description":"OpenAI's gpt-oss-20b — 21B-total / 3.6B-active MoE reasoning model with native MXFP4 quant; fits in 16GB VRAM","date_updated":"2026-05-08","difficulty":"beginner","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:41:meta:tasks","performance_headline":"21B/3.6B-A MoE reasoning model with native MXFP4 — runs on 16GB","related_recipes":["openai/gpt-oss-120b"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:41:meta:hardware"},"model":{"model_id":"openai/gpt-oss-20b","min_vllm_version":"0.10.0","architecture":"moe","parameter_count":"21B","active_parameters":"3.6B","context_length":131072,"base_args":[],"base_env":{}},"features":{"tool_calling":{"description":"OpenAI harmony tool-call parser with automatic tool choice","args":["--tool-call-parser","openai","--enable-auto-tool-choice"]}},"opt_in_features":[],"variants":{"default":{"precision":"mxfp4","vram_minimum_gb":16,"description":"MXFP4 MoE weights — fits in 16GB VRAM on a single consumer or datacenter GPU"}},"compatible_strategies":["single_node_tp"],"hardware_overrides":{"blackwell":{"extra_args":["--kv-cache-dtype","fp8","--no-enable-prefix-caching","--max-cudagraph-capture-size","2048","--max-num-batched-tokens","8192","--stream-interval","20"],"extra_env":{"VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8":"1"}},"hopper":{"extra_args":["--no-enable-prefix-caching","--max-cudagraph-capture-size","2048","--max-num-batched-tokens","8192","--stream-interval","20"],"extra_env":{}},"amd":{"extra_args":["--attention-backend","ROCM_AITER_UNIFIED_ATTN","-cc.pass_config.fuse_rope_kvcache=True","-cc.use_inductor_graph_partition=True","--gpu-memory-utilization","0.95","--block-size=64"],"extra_env":{"VLLM_ROCM_USE_AITER":"1","VLLM_ROCM_QUICK_REDUCE_QUANTIZATION":"INT4","HSA_NO_SCRATCH_RECLAIM":"1","AMDGCN_USE_BUFFER_OPS":"0"}}},"strategy_overrides":{"single_node_tp":{"tp":1}},"guide":"$8a","hf_org":"openai","hf_repo":"gpt-oss-20b","hf_id":"openai/gpt-oss-20b","hf_released":"2025-08-04T22:33:29.000Z","engines":{"vllm":{"min_version":"0.10.0"},"sglang":{"engine":"sglang","model_id":"openai/gpt-oss-20b","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h100":1,"h200":1,"b200":1},"variants":{"default":{"precision":"mxfp4"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","gpt-oss"]},"reasoning":{"args":["--reasoning-parser","gpt-oss"]}},"guide":"$8b"}},"default_engine":"vllm"},{"meta":{"title":"GPT-OSS","slug":"gpt-oss","provider":"OpenAI","description":"OpenAI's gpt-oss family (20B / 120B) with MXFP4 MoE, attention-sinks, built-in tools via Responses API","date_updated":"2026-05-10","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:39:meta:tasks","related_recipes":[],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:39:meta:hardware"},"model":{"model_id":"openai/gpt-oss-120b","min_vllm_version":"0.10.0","architecture":"moe","parameter_count":"120B","active_parameters":"5.1B","context_length":131072,"base_args":[],"base_env":{}},"features":{"tool_calling":{"description":"OpenAI harmony tool-call parser with automatic tool choice","args":["--tool-call-parser","openai","--enable-auto-tool-choice"]},"spec_decoding":{"description":"EAGLE3 speculative decoding for accelerated inference","args":["--speculative-config","{\"model\":\"nvidia/gpt-oss-120b-Eagle3-v3\",\"num_speculative_tokens\":7,\"method\":\"eagle3\",\"draft_tensor_parallel_size\":1}"]}},"opt_in_features":["spec_decoding"],"variants":{"default":{"precision":"mxfp4","vram_minimum_gb":96,"description":"gpt-oss-120b with MXFP4 MoE; fits on 1xA100 80GB, scales to TP 2/4/8"},"amd_fp8":{"model_id":"amd/gpt-oss-120b-w-mxfp4-a-fp8","precision":"mxfp4","vram_minimum_gb":80,"description":"Quark-quantized MXFP4 weights with FP8 activations for MI355X (gfx950)"}},"compatible_strategies":["single_node_tp","multi_node_tp","multi_node_tp_pp","multi_node_tep","multi_node_dep"],"hardware_overrides":{"blackwell":{"extra_args":["--kv-cache-dtype","fp8","--no-enable-prefix-caching","--max-cudagraph-capture-size","2048","--max-num-batched-tokens","8192","--stream-interval","20"],"extra_env":{"VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8":"1"}},"hopper":{"extra_args":["--no-enable-prefix-caching","--max-cudagraph-capture-size","2048","--max-num-batched-tokens","8192","--stream-interval","20"],"extra_env":{}},"amd":{"extra_args":["--attention-backend","ROCM_AITER_UNIFIED_ATTN","-cc.pass_config.fuse_rope_kvcache=True","-cc.use_inductor_graph_partition=True","--gpu-memory-utilization","0.95","--block-size=64"],"extra_env":{"VLLM_ROCM_USE_AITER":"1","VLLM_ROCM_QUICK_REDUCE_QUANTIZATION":"INT4","HSA_NO_SCRATCH_RECLAIM":"1","AMDGCN_USE_BUFFER_OPS":"0"}}},"strategy_overrides":{},"guide":"$8c","hf_org":"openai","hf_repo":"gpt-oss-120b","hf_id":"openai/gpt-oss-120b","hf_released":"2025-08-04T22:33:06.000Z","engines":{"vllm":{"min_version":"0.10.0"},"sglang":{"engine":"sglang","model_id":"openai/gpt-oss-120b","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h100":8,"h200":8,"b200":8},"variants":{"default":{"precision":"mxfp4"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","gpt-oss"]},"reasoning":{"args":["--reasoning-parser","gpt-oss"]}},"guide":"$8d"}},"default_engine":"vllm"}]],["openbmb",[{"meta":{"title":"MiniCPM5-1B","slug":"minicpm5-1b","provider":"MiniCPM (OpenBMB)","description":"MiniCPM5-1B — dense 1B on-device LLM with hybrid Think/No-Think reasoning, native 128K context, and strong agentic tool use, built on the standard Llama architecture","date_updated":"2026-06-02","difficulty":"beginner","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:16:meta:tasks","performance_headline":"1B-class open-source SOTA on tool use, code, and reasoning","related_recipes":["openbmb/MiniCPM-V-4.6"]},"model":{"model_id":"openbmb/MiniCPM5-1B","min_vllm_version":"0.21.0","architecture":"dense","parameter_count":"1.1B","active_parameters":"1.1B","context_length":131072,"base_args":[],"base_env":{}},"features":{"reasoning":{"description":"Enable hybrid deep-thinking mode — the chat template wraps a block before the answer. Recommended sampling shifts to temperature=0.9, top_p=0.95. Leave off for fast No-Think responses (temperature=0.7).","args":["--default-chat-template-kwargs","{\"enable_thinking\": true}"]}},"opt_in_features":["reasoning"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":3,"description":"BF16 weights (~1.1B params) — tiny footprint, runs on a single consumer GPU"}},"compatible_strategies":["single_node_tp","multi_node_tp"],"strategy_overrides":{"single_node_tp":{"tp":1}},"guide":"$8e","hf_org":"openbmb","hf_repo":"MiniCPM5-1B","hf_id":"openbmb/MiniCPM5-1B","hf_released":"2026-05-21T07:27:59.000Z","engines":{"vllm":{"min_version":"0.21.0"},"sglang":{"engine":"sglang","model_id":"openbmb/MiniCPM5-1B","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","minicpm5"]}},"guide":"$8f"}},"default_engine":"vllm"},{"meta":{"title":"MiniCPM-V 4.6","slug":"minicpm-v-4-6","provider":"MiniCPM (OpenBMB)","description":"MiniCPM-V 4.6 (1.3B) — pocket-sized multimodal LLM for ultra-efficient single-image, multi-image, and video understanding, built on SigLIP2-400M + a Qwen3.5-0.8B hybrid-attention backbone","date_updated":"2026-06-02","difficulty":"beginner","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:15:meta:tasks","performance_headline":"~1.5× token throughput vs Qwen3.5-0.8B with mixed 4×/16× visual token compression","related_recipes":["openbmb/MiniCPM5-1B"]},"model":{"model_id":"openbmb/MiniCPM-V-4.6","min_vllm_version":"0.22.0","architecture":"dense","parameter_count":"1.3B","active_parameters":"1.3B","context_length":262144,"base_args":["--trust-remote-code"],"base_env":{}},"dependencies":[{"note":"MiniCPM-V 4.6 ships as a standalone architecture (MiniCPMV4_6ForConditionalGeneration) only in transformers >= 5.7.0","command":"uv pip install -U \"transformers>=5.7.0\""},{"note":"Video understanding support (decoders for the video input path)","command":"uv pip install -U \"vllm[video]\"","optional":true}],"features":{"tool_calling":{"description":"Function/tool calling via the qwen3_coder parser (the v4.6 chat template emits Qwen3-Coder-style blocks)","args":["--enable-auto-tool-choice","--tool-call-parser","qwen3_coder"]}},"opt_in_features":[],"variants":{"default":{"precision":"bf16","vram_minimum_gb":4,"description":"BF16 weights (~1.3B params) — tiny footprint, runs comfortably on a single consumer GPU"}},"compatible_strategies":["single_node_tp","multi_node_tp"],"strategy_overrides":{"single_node_tp":{"tp":1}},"guide":"$90","hf_org":"openbmb","hf_repo":"MiniCPM-V-4.6","hf_id":"openbmb/MiniCPM-V-4.6","hf_released":"2026-04-13T09:12:47.000Z","engines":{"vllm":{"min_version":"0.22.0"},"sglang":{"engine":"sglang","model_id":"openbmb/MiniCPM-V-4.6","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","qwen3_coder"]},"reasoning":{"args":["--reasoning-parser","qwen3"]}},"guide":"$91"}},"default_engine":"vllm"}]],["OpenGVLab",[{"meta":{"title":"InternVL3.5","slug":"internvl3.5","provider":"InternVL (OpenGVLab)","description":"InternVL 3.5 vision-language models from Shanghai AI Lab with thinking-mode prompting","date_updated":"2026-04-17","difficulty":"beginner","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:65:meta:tasks","related_recipes":["internlm/Intern-S1"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:65:meta:hardware"},"model":{"model_id":"OpenGVLab/InternVL3_5-8B","min_vllm_version":"0.10.0","architecture":"dense","parameter_count":"8B","active_parameters":"8B","context_length":40960,"base_args":["--trust-remote-code"],"base_env":{}},"features":{"text_only":{"description":"Skip loading the vision encoder for text-only workloads — frees VRAM for KV cache. Mutually exclusive with encoder_parallel.","args":["--language-model-only"]},"encoder_parallel":{"description":"Run the vision encoder in data-parallel mode — avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.","args":["--mm-encoder-tp-mode","data"]}},"opt_in_features":["text_only","encoder_parallel"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":19,"description":"BF16 weights for the 8B variant"}},"compatible_strategies":["single_node_tp","multi_node_tp"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$92","hf_org":"OpenGVLab","hf_repo":"InternVL3_5-8B","hf_id":"OpenGVLab/InternVL3_5-8B","hf_released":"2025-08-25T16:38:47.000Z","engines":{"vllm":{"min_version":"0.10.0"},"sglang":{"engine":"sglang","model_id":"OpenGVLab/InternVL3_5-8B","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h100":1,"mi300x":1,"mi325x":1,"mi355x":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{},"guide":"$93"}},"default_engine":"vllm"}]],["PaddlePaddle",[{"meta":{"title":"PaddleOCR-VL-1.5","slug":"paddleocr-vl-1.5","provider":"PaddlePaddle","description":"PaddleOCR-VL-1.5 (0.9B) — next-gen compact VLM for document parsing; adds text spotting, seal recognition, and Tibetan/Bengali","date_updated":"2026-05-11","difficulty":"beginner","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:34:meta:tasks","related_recipes":["PaddlePaddle/PaddleOCR-VL"]},"model":{"model_id":"PaddlePaddle/PaddleOCR-VL-1.5","min_vllm_version":"0.11.1","architecture":"dense","parameter_count":"0.9B","active_parameters":"0.9B","context_length":131072,"base_args":["--trust-remote-code","--max-num-batched-tokens","16384","--no-enable-prefix-caching","--mm-processor-cache-gb","0"],"base_env":{}},"dependencies":[{"note":"PaddlePaddle runtime (install in a separate venv from vllm to avoid conflicts)","command":"uv pip install paddlepaddle-gpu==3.2.1 --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/"},{"note":"PaddleOCR document-parsing helpers (1.5 ships under the same paddleocr[doc-parser] extra)","command":"uv pip install -U \"paddleocr[doc-parser]\""},{"note":"Safetensors loader used by the doc-parser path","command":"uv pip install safetensors"}],"features":{"text_only":{"description":"Skip loading the vision encoder for text-only workloads — frees VRAM for KV cache. Mutually exclusive with encoder_parallel.","args":["--language-model-only"]},"encoder_parallel":{"description":"Run the vision encoder in data-parallel mode — avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.","args":["--mm-encoder-tp-mode","data"]}},"opt_in_features":["text_only","encoder_parallel"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":2,"description":"BF16 weights — small footprint, runs on most GPUs"}},"compatible_strategies":["single_node_tp","multi_node_tp"],"hardware_overrides":{"amd":{"extra_args":[],"extra_env":{"SAFETENSORS_FAST_GPU":"1"}}},"strategy_overrides":{},"guide":"$94","hf_org":"PaddlePaddle","hf_repo":"PaddleOCR-VL-1.5","hf_id":"PaddlePaddle/PaddleOCR-VL-1.5","hf_released":"2026-01-28T12:43:01.000Z","engines":{"vllm":{"min_version":"0.11.1"},"sglang":{"engine":"sglang","model_id":"PaddlePaddle/PaddleOCR-VL-1.5","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{},"guide":"$95"}},"default_engine":"vllm"},{"meta":{"title":"PaddleOCR-VL","slug":"paddleocr-vl","provider":"PaddlePaddle","description":"PaddleOCR-VL (0.9B) — compact vision-language model for document parsing, OCR, tables, formulas, charts","date_updated":"2026-04-17","difficulty":"beginner","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:66:meta:tasks","related_recipes":[]},"model":{"model_id":"PaddlePaddle/PaddleOCR-VL","min_vllm_version":"0.11.1","architecture":"dense","parameter_count":"0.9B","active_parameters":"0.9B","context_length":131072,"base_args":["--trust-remote-code","--max-num-batched-tokens","16384","--no-enable-prefix-caching","--mm-processor-cache-gb","0"],"base_env":{}},"dependencies":[{"note":"PaddlePaddle runtime (install in a separate venv from vllm to avoid conflicts)","command":"uv pip install paddlepaddle-gpu==3.2.1 --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/"},{"note":"PaddleOCR document-parsing helpers","command":"uv pip install -U \"paddleocr[doc-parser]\""},{"note":"Safetensors loader used by the doc-parser path","command":"uv pip install safetensors"}],"features":{"text_only":{"description":"Skip loading the vision encoder for text-only workloads — frees VRAM for KV cache. Mutually exclusive with encoder_parallel.","args":["--language-model-only"]},"encoder_parallel":{"description":"Run the vision encoder in data-parallel mode — avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.","args":["--mm-encoder-tp-mode","data"]}},"opt_in_features":["text_only","encoder_parallel"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":2,"description":"BF16 weights — small footprint, runs on most GPUs"}},"compatible_strategies":["single_node_tp","multi_node_tp"],"hardware_overrides":{"amd":{"extra_args":[],"extra_env":{"SAFETENSORS_FAST_GPU":"1"}}},"strategy_overrides":{},"guide":"$96","hf_org":"PaddlePaddle","hf_repo":"PaddleOCR-VL","hf_id":"PaddlePaddle/PaddleOCR-VL","hf_released":"2025-10-16T10:14:45.000Z","engines":{"vllm":{"min_version":"0.11.1"},"sglang":{"engine":"sglang","model_id":"PaddlePaddle/PaddleOCR-VL","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{},"guide":"$97"}},"default_engine":"vllm"}]],["pfnet",[{"meta":{"title":"PLaMo 3 NICT 31B Base","slug":"plamo-3-nict-31b-base","provider":"Preferred Networks","description":"Largest PLaMo 3 NICT Japanese/English base model with interleaved sliding-window and full attention.","date_updated":"2026-06-02","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:18:meta:tasks","performance_headline":"31B-class PLaMo 3 base checkpoint for high-quality bilingual generation","related_recipes":["pfnet/plamo-2-translate"]},"model":{"model_id":"pfnet/plamo-3-nict-31b-base","min_vllm_version":"0.12.0","architecture":"dense","parameter_count":"32B","active_parameters":"32B","context_length":4096,"base_args":["--trust-remote-code","--max-model-len","4096"],"base_env":{}},"features":{},"opt_in_features":[],"variants":{"default":{"precision":"bf16","vram_minimum_gb":77,"description":"BF16 base checkpoint. Fits on H100/H200-class GPUs for 4K context; use TP=2 if extra cache headroom is needed."}},"compatible_strategies":["single_node_tp","multi_node_tp","multi_node_tp_pp"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$98","hf_org":"pfnet","hf_repo":"plamo-3-nict-31b-base","hf_id":"pfnet/plamo-3-nict-31b-base","hf_released":"2025-10-24T07:30:37.000Z"},{"meta":{"title":"PLaMo 2 Translate","slug":"plamo-2-translate","provider":"Preferred Networks","description":"Post-trained PLaMo 2 translation model specialized for English/Japanese translation tasks.","date_updated":"2026-06-02","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:17:meta:tasks","performance_headline":"PLaMo 2 post-trained for English/Japanese translation","related_recipes":["pfnet/plamo-3-nict-31b-base"]},"model":{"model_id":"pfnet/plamo-2-translate","min_vllm_version":"0.8.5","architecture":"dense","parameter_count":"9.5B","active_parameters":"9.5B","context_length":8192,"base_args":["--trust-remote-code","--max-model-len","8192"],"base_env":{}},"features":{},"opt_in_features":[],"variants":{"default":{"precision":"bf16","vram_minimum_gb":23,"description":"BF16 post-trained translation checkpoint for English/Japanese translation."}},"compatible_strategies":["single_node_tp","multi_node_tp","multi_node_tp_pp"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$99","hf_org":"pfnet","hf_repo":"plamo-2-translate","hf_id":"pfnet/plamo-2-translate","hf_released":"2025-05-27T05:08:28.000Z"}]],["poolside",[{"meta":{"title":"Laguna XS.2","slug":"laguna-xs.2","provider":"Poolside","description":"Poolside's 33B total / 3B activated MoE coding model with mixed sliding-window + global attention, native interleaved reasoning, and 128K context — designed for agentic coding.","date_updated":"2026-04-29","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:46:meta:tasks","performance_headline":"33B/3B-A MoE for agentic coding with interleaved thinking and tool use","related_recipes":[],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:46:meta:hardware"},"model":{"model_id":"poolside/Laguna-XS.2","min_vllm_version":"nightly","nightly_required":true,"architecture":"moe","parameter_count":"33B","active_parameters":"3B","context_length":131072,"base_args":["--trust-remote-code"],"base_env":{}},"features":{"tool_calling":{"description":"Enable automatic tool choice with Poolside's tool-call parser","args":["--enable-auto-tool-choice","--tool-call-parser","poolside_v1"]},"reasoning":{"description":"Enable interleaved thinking with Poolside's reasoning parser","args":["--reasoning-parser","poolside_v1"]},"spec_decoding":{"description":"DFlash speculative decoding with the Laguna-XS.2 draft model (7 tokens, greedy)","args":["--speculative-config","{\"model\":\"poolside/Laguna-XS.2-speculator.dflash\",\"num_speculative_tokens\":7,\"method\":\"dflash\"}"]}},"opt_in_features":["spec_decoding"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":80,"description":"BF16 weights — fits on a single 80GB+ GPU (H100/H200/B200)"}},"compatible_strategies":["single_node_tp","single_node_tep","single_node_dep"],"hardware_overrides":{},"strategy_overrides":{"single_node_tp":{"tp":1}},"guide":"$9a","hf_org":"poolside","hf_repo":"Laguna-XS.2","hf_id":"poolside/Laguna-XS.2","hf_released":"2026-04-23T20:50:45.000Z","engines":{"vllm":{"min_version":"nightly"},"sglang":{"engine":"sglang","model_id":"poolside/Laguna-XS.2","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","poolside_v1"]},"reasoning":{"args":["--reasoning-parser","poolside_v1"]}},"guide":"$9b"}},"default_engine":"vllm"}]],["Qwen",[{"meta":{"title":"Qwen3.6-27B","slug":"Qwen3.6-27b","provider":"Qwen","description":"Qwen3.6 dense multimodal model (27B) with gated delta networks hybrid attention, MTP, and 262K context","date_updated":"2026-04-22","difficulty":"beginner","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:61:meta:tasks","performance_headline":"Qwen3.6 flagship dense — single-GPU FP8 or 2x GPU BF16","related_recipes":["Qwen/Qwen3.6-35B-A3B"]},"model":{"model_id":"Qwen/Qwen3.6-27B","min_vllm_version":"0.17.0","architecture":"dense","parameter_count":"27B","active_parameters":"27B","context_length":262144,"base_args":["--trust-remote-code"],"base_env":{}},"features":{"tool_calling":{"description":"Enable automatic tool choice with Qwen3 Coder parser","args":["--enable-auto-tool-choice","--tool-call-parser","qwen3_coder"]},"reasoning":{"description":"Enable chain-of-thought reasoning with Qwen3 parser","args":["--reasoning-parser","qwen3"]},"spec_decoding":{"description":"Multi-token prediction speculative decoding for lower latency","args":["--speculative-config","{\"method\":\"mtp\",\"num_speculative_tokens\":1}"]},"text_only":{"description":"Skip loading the vision encoder for text-only workloads — frees VRAM for KV cache. Mutually exclusive with encoder_parallel.","args":["--language-model-only"]},"encoder_parallel":{"description":"Run the vision encoder in data-parallel mode — avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.","args":["--mm-encoder-tp-mode","data"]}},"opt_in_features":["spec_decoding","text_only"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":65,"description":"Full precision BF16 — fits on 1x H200 or 2x H100"},"fp8":{"model_id":"Qwen/Qwen3.6-27B-FP8","precision":"fp8","vram_minimum_gb":33,"description":"Qwen official FP8 checkpoint — single 40 GB GPU"}},"compatible_strategies":["single_node_tp","multi_node_tp"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$9c","hf_org":"Qwen","hf_repo":"Qwen3.6-27B","hf_id":"Qwen/Qwen3.6-27B","hf_released":"2026-04-21T07:50:43.000Z","engines":{"vllm":{"min_version":"0.17.0"},"sglang":{"engine":"sglang","model_id":"Qwen/Qwen3.6-27B","min_version":"v0.5.10","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":1,"b200":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","qwen3_coder"]},"reasoning":{"args":["--reasoning-parser","qwen3"]}},"guide":"$9d"}},"default_engine":"vllm"},{"meta":{"title":"Qwen3.6-35B-A3B","slug":"qwen3.6-35b-a3b","provider":"Qwen","description":"Smaller Qwen3.6 multimodal MoE model (35B total / 3B active) with BF16, FP8, and NVIDIA NVFP4 variants","date_updated":"2026-06-03","difficulty":"beginner","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:7:meta:tasks","performance_headline":"Compact Qwen3.6 MoE with 3B active parameters — single-GPU FP8 or 2-4 GPU BF16 serving","related_recipes":["Qwen/Qwen3.5-397B-A17B"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:7:meta:hardware"},"model":{"model_id":"Qwen/Qwen3.6-35B-A3B","min_vllm_version":"0.17.0","architecture":"moe","parameter_count":"35B","active_parameters":"3B","context_length":262144,"base_args":["--trust-remote-code"],"base_env":{}},"features":{"tool_calling":{"description":"Enable automatic tool choice with Qwen3 Coder parser","args":["--enable-auto-tool-choice","--tool-call-parser","qwen3_coder"]},"reasoning":{"description":"Enable chain-of-thought reasoning with Qwen3 parser","args":["--reasoning-parser","qwen3"]},"spec_decoding":{"description":"Multi-token prediction speculative decoding for lower latency","args":["--speculative-config","{\"method\":\"mtp\",\"num_speculative_tokens\":2}"]},"text_only":{"description":"Skip loading the vision encoder for text-only workloads — frees VRAM for KV cache. Mutually exclusive with encoder_parallel.","args":["--language-model-only"]},"encoder_parallel":{"description":"Run the vision encoder in data-parallel mode — avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.","args":["--mm-encoder-tp-mode","data"]}},"opt_in_features":["spec_decoding","text_only"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":84,"description":"Full precision BF16 — fits on 1x H200 or 2x H100"},"fp8":{"model_id":"Qwen/Qwen3.6-35B-A3B-FP8","precision":"fp8","vram_minimum_gb":42,"description":"Qwen official FP8 checkpoint — single-GPU serving"},"nvfp4":{"model_id":"nvidia/Qwen3.6-35B-A3B-NVFP4","precision":"nvfp4","vram_minimum_gb":21,"tp":1,"description":"NVIDIA ModelOpt NVFP4 checkpoint for Blackwell GPUs, including DGX Spark","extra_args":["--quantization","modelopt","--kv-cache-dtype","fp8","--attention-backend","flashinfer","--moe-backend","marlin","--gpu-memory-utilization","0.85","--max-model-len","65536","--max-num-seqs","4","--max-num-batched-tokens","8192","--speculative-config","{\"method\":\"mtp\",\"num_speculative_tokens\":3,\"moe_backend\":\"triton\"}"],"extra_env":{"VLLM_FP8_MOE_BACKEND":"flashinfer_cutlass","FLASHINFER_DISABLE_VERSION_CHECK":"1"}}},"compatible_strategies":["single_node_tp","single_node_tep","single_node_dep","multi_node_tp","multi_node_tp_pp","multi_node_dep","multi_node_tep"],"hardware_overrides":{"amd":{"extra_args":[],"extra_env":{"VLLM_ROCM_USE_AITER":"1"}}},"strategy_overrides":{},"guide":"$9e","hf_org":"Qwen","hf_repo":"Qwen3.6-35B-A3B","hf_id":"Qwen/Qwen3.6-35B-A3B","hf_released":"2026-04-15T05:59:19.000Z","engines":{"vllm":{"min_version":"0.17.0"},"sglang":{"engine":"sglang","model_id":"Qwen/Qwen3.6-35B-A3B","min_version":"v0.5.10","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h100":1,"h200":1,"b200":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","qwen3_coder"]},"reasoning":{"args":["--reasoning-parser","qwen3"]}},"guide":"$9f"}},"default_engine":"vllm"},{"meta":{"title":"Qwen3.5-0.8B","slug":"qwen3.5-0.8b","provider":"Qwen","description":"Qwen3.5 tiny dense multimodal model (0.8B) — ultra-low-VRAM / edge serving with 262K context","date_updated":"2026-04-22","difficulty":"beginner","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:54:meta:tasks","performance_headline":"Tiny Qwen3.5 dense for edge / draft-model use","related_recipes":["Qwen/Qwen3.5-2B"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:54:meta:hardware"},"model":{"model_id":"Qwen/Qwen3.5-0.8B","min_vllm_version":"0.17.0","architecture":"dense","parameter_count":"0.8B","active_parameters":"0.8B","context_length":262144,"base_args":["--trust-remote-code"],"base_env":{}},"dependencies":[{"command":"pip install vllm-ascend","brand":"Huawei","note":"Huawei Ascend NPU support is provided by the vllm-ascend plugin (requires CANN 9.0 + torch-npu; use the matching quay.io/ascend/vllm-ascend image). FP8 weights are served via W8A8 on Ascend. See https://docs.vllm.ai/projects/ascend"}],"features":{"tool_calling":{"description":"Enable automatic tool choice with Qwen3 Coder parser","args":["--enable-auto-tool-choice","--tool-call-parser","qwen3_coder"]},"reasoning":{"description":"Enable chain-of-thought reasoning with Qwen3 parser","args":["--reasoning-parser","qwen3"]},"text_only":{"description":"Skip loading the vision encoder for text-only workloads.","args":["--language-model-only"]}},"opt_in_features":["text_only"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":2,"description":"Full precision BF16 — runs on any modern GPU"}},"compatible_strategies":["single_node_tp"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$a0","hf_org":"Qwen","hf_repo":"Qwen3.5-0.8B","hf_id":"Qwen/Qwen3.5-0.8B","hf_released":"2026-02-28T23:57:01.000Z","engines":{"vllm":{"min_version":"0.17.0"},"sglang":{"engine":"sglang","model_id":"Qwen/Qwen3.5-0.8B","min_version":"v0.5.11","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":1,"b200":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","qwen3_coder"]},"reasoning":{"args":["--reasoning-parser","qwen3"]}},"guide":"$a1"}},"default_engine":"vllm"},{"meta":{"title":"Qwen3.5-2B","slug":"qwen3.5-2b","provider":"Qwen","description":"Qwen3.5 mini dense multimodal model (2B) — edge / low-VRAM serving with 262K context","date_updated":"2026-04-22","difficulty":"beginner","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:57:meta:tasks","performance_headline":"Edge-scale Qwen3.5 dense — fits on 8 GB GPUs","related_recipes":["Qwen/Qwen3.5-4B","Qwen/Qwen3.5-0.8B"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:57:meta:hardware"},"model":{"model_id":"Qwen/Qwen3.5-2B","min_vllm_version":"0.17.0","architecture":"dense","parameter_count":"2B","active_parameters":"2B","context_length":262144,"base_args":["--trust-remote-code"],"base_env":{}},"dependencies":[{"command":"pip install vllm-ascend","brand":"Huawei","note":"Huawei Ascend NPU support is provided by the vllm-ascend plugin (requires CANN 9.0 + torch-npu; use the matching quay.io/ascend/vllm-ascend image). FP8 weights are served via W8A8 on Ascend. See https://docs.vllm.ai/projects/ascend"}],"features":{"tool_calling":{"description":"Enable automatic tool choice with Qwen3 Coder parser","args":["--enable-auto-tool-choice","--tool-call-parser","qwen3_coder"]},"reasoning":{"description":"Enable chain-of-thought reasoning with Qwen3 parser","args":["--reasoning-parser","qwen3"]},"spec_decoding":{"description":"Multi-token prediction speculative decoding for lower latency","args":["--speculative-config","{\"method\":\"mtp\",\"num_speculative_tokens\":1}"]},"text_only":{"description":"Skip loading the vision encoder for text-only workloads.","args":["--language-model-only"]}},"opt_in_features":["spec_decoding","text_only"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":5,"description":"Full precision BF16 — fits on an 8 GB GPU"}},"compatible_strategies":["single_node_tp"],"hardware_overrides":{},"strategy_overrides":{},"guide":"## Overview\n\n[Qwen3.5-2B](https://huggingface.co/Qwen/Qwen3.5-2B) is a miniature dense\nQwen3.5 model — the full gated delta networks architecture, vision encoder,\nand 262K context, in a form small enough for 8 GB consumer GPUs or edge\ninference.\n\n## Prerequisites\n\n- **vLLM version:** >= 0.17.0\n- **Hardware:** single 8 GB GPU\n\n### Install vLLM\n\n```bash\nuv venv\nsource .venv/bin/activate\nuv pip install -U vllm --torch-backend=auto\n```\n\n## Launching the Server\n\n```bash\nvllm serve Qwen/Qwen3.5-2B \\\n --max-model-len 262144 \\\n --reasoning-parser qwen3\n```\n\n## Client Usage\n\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(api_key=\"EMPTY\", base_url=\"http://localhost:8000/v1\")\nresp = client.chat.completions.create(\n model=\"Qwen/Qwen3.5-2B\",\n messages=[{\"role\": \"user\", \"content\": \"Hi!\"}],\n max_tokens=64,\n)\nprint(resp.choices[0].message.content)\n```\n\n## References\n\n- [Model card](https://huggingface.co/Qwen/Qwen3.5-2B)\n- [Base checkpoint](https://huggingface.co/Qwen/Qwen3.5-2B-Base)\n","hf_org":"Qwen","hf_repo":"Qwen3.5-2B","hf_id":"Qwen/Qwen3.5-2B","hf_released":"2026-02-28T23:56:16.000Z","engines":{"vllm":{"min_version":"0.17.0"},"sglang":{"engine":"sglang","model_id":"Qwen/Qwen3.5-2B","min_version":"v0.5.11","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":1,"b200":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","qwen3_coder"]},"reasoning":{"args":["--reasoning-parser","qwen3"]}},"guide":"$a2"}},"default_engine":"vllm"},{"meta":{"title":"Qwen3.5-4B","slug":"qwen3.5-4b","provider":"Qwen","description":"Qwen3.5 compact dense multimodal model (4B) — fits on 16 GB consumer GPUs with full 262K context","date_updated":"2026-04-22","difficulty":"beginner","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:59:meta:tasks","performance_headline":"Consumer-GPU-friendly Qwen3.5 dense with MTP support","related_recipes":["Qwen/Qwen3.5-9B","Qwen/Qwen3.5-2B"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:59:meta:hardware"},"model":{"model_id":"Qwen/Qwen3.5-4B","min_vllm_version":"0.17.0","architecture":"dense","parameter_count":"4B","active_parameters":"4B","context_length":262144,"base_args":["--trust-remote-code"],"base_env":{}},"dependencies":[{"command":"pip install vllm-ascend","brand":"Huawei","note":"Huawei Ascend NPU support is provided by the vllm-ascend plugin (requires CANN 9.0 + torch-npu; use the matching quay.io/ascend/vllm-ascend image). FP8 weights are served via W8A8 on Ascend. See https://docs.vllm.ai/projects/ascend"}],"features":{"tool_calling":{"description":"Enable automatic tool choice with Qwen3 Coder parser","args":["--enable-auto-tool-choice","--tool-call-parser","qwen3_coder"]},"reasoning":{"description":"Enable chain-of-thought reasoning with Qwen3 parser","args":["--reasoning-parser","qwen3"]},"spec_decoding":{"description":"Multi-token prediction speculative decoding for lower latency","args":["--speculative-config","{\"method\":\"mtp\",\"num_speculative_tokens\":1}"]},"text_only":{"description":"Skip loading the vision encoder for text-only workloads.","args":["--language-model-only"]}},"opt_in_features":["spec_decoding","text_only"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":10,"description":"Full precision BF16 — fits on a single 16 GB GPU"}},"compatible_strategies":["single_node_tp"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$a3","hf_org":"Qwen","hf_repo":"Qwen3.5-4B","hf_id":"Qwen/Qwen3.5-4B","hf_released":"2026-02-27T14:45:03.000Z","engines":{"vllm":{"min_version":"0.17.0"},"sglang":{"engine":"sglang","model_id":"Qwen/Qwen3.5-4B","min_version":"v0.5.11","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":1,"b200":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","qwen3_coder"]},"reasoning":{"args":["--reasoning-parser","qwen3"]}},"guide":"$a4"}},"default_engine":"vllm"},{"meta":{"title":"Qwen3.5-9B","slug":"qwen3.5-9b","provider":"Qwen","description":"Qwen3.5 dense multimodal model (9B) with gated delta networks hybrid attention, MTP, and 262K context","date_updated":"2026-04-22","difficulty":"beginner","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:60:meta:tasks","performance_headline":"Single-GPU Qwen3.5 dense with MTP-accelerated decoding","related_recipes":["Qwen/Qwen3.5-27B","Qwen/Qwen3.5-4B"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:60:meta:hardware"},"model":{"model_id":"Qwen/Qwen3.5-9B","min_vllm_version":"0.17.0","architecture":"dense","parameter_count":"9B","active_parameters":"9B","context_length":262144,"base_args":["--trust-remote-code"],"base_env":{}},"dependencies":[{"command":"pip install vllm-ascend","brand":"Huawei","note":"Huawei Ascend NPU support is provided by the vllm-ascend plugin (requires CANN 9.0 + torch-npu; use the matching quay.io/ascend/vllm-ascend image). FP8 weights are served via W8A8 on Ascend. See https://docs.vllm.ai/projects/ascend"}],"features":{"tool_calling":{"description":"Enable automatic tool choice with Qwen3 Coder parser","args":["--enable-auto-tool-choice","--tool-call-parser","qwen3_coder"]},"reasoning":{"description":"Enable chain-of-thought reasoning with Qwen3 parser","args":["--reasoning-parser","qwen3"]},"spec_decoding":{"description":"Multi-token prediction speculative decoding for lower latency","args":["--speculative-config","{\"method\":\"mtp\",\"num_speculative_tokens\":1}"]},"text_only":{"description":"Skip loading the vision encoder for text-only workloads — frees VRAM for KV cache.","args":["--language-model-only"]}},"opt_in_features":["spec_decoding","text_only"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":22,"description":"Full precision BF16 — single 24 GB GPU"}},"compatible_strategies":["single_node_tp","multi_node_tp"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$a5","hf_org":"Qwen","hf_repo":"Qwen3.5-9B","hf_id":"Qwen/Qwen3.5-9B","hf_released":"2026-02-27T12:58:26.000Z","engines":{"vllm":{"min_version":"0.17.0"},"sglang":{"engine":"sglang","model_id":"Qwen/Qwen3.5-9B","min_version":"v0.5.11","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":1,"b200":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","qwen3_coder"]},"reasoning":{"args":["--reasoning-parser","qwen3"]}},"guide":"$a6"}},"default_engine":"vllm"},{"meta":{"title":"Qwen3.5-122B-A10B","slug":"qwen3.5-122b-a10b","provider":"Qwen","description":"Mid-size Qwen3.5 multimodal MoE (122B total / 10B active) with gated delta networks, 256 experts, and 262K context","date_updated":"2026-04-22","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:55:meta:tasks","performance_headline":"Qwen3.5 mid-tier MoE — fits on 4x H200 BF16 or 2x H200 FP8","related_recipes":["Qwen/Qwen3.5-397B-A17B","Qwen/Qwen3.5-35B-A3B"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:55:meta:hardware"},"model":{"model_id":"Qwen/Qwen3.5-122B-A10B","min_vllm_version":"0.17.0","architecture":"moe","parameter_count":"122B","active_parameters":"10B","context_length":262144,"base_args":["--trust-remote-code"],"base_env":{}},"dependencies":[{"command":"pip install vllm-ascend","brand":"Huawei","note":"Huawei Ascend NPU support is provided by the vllm-ascend plugin (requires CANN 9.0 + torch-npu; use the matching quay.io/ascend/vllm-ascend image). FP8 weights are served via W8A8 on Ascend. See https://docs.vllm.ai/projects/ascend"}],"features":{"tool_calling":{"description":"Enable automatic tool choice with Qwen3 Coder parser","args":["--enable-auto-tool-choice","--tool-call-parser","qwen3_coder"]},"reasoning":{"description":"Enable chain-of-thought reasoning with Qwen3 parser","args":["--reasoning-parser","qwen3"]},"spec_decoding":{"description":"Multi-token prediction speculative decoding for lower latency","args":["--speculative-config","{\"method\":\"mtp\",\"num_speculative_tokens\":1}"]},"text_only":{"description":"Skip loading the vision encoder for text-only workloads — frees VRAM for KV cache. Mutually exclusive with encoder_parallel.","args":["--language-model-only"]},"encoder_parallel":{"description":"Run the vision encoder in data-parallel mode — avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.","args":["--mm-encoder-tp-mode","data"]}},"opt_in_features":["spec_decoding","text_only"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":293,"description":"Full precision BF16 — requires 4x H200 or equivalent"},"fp8":{"model_id":"Qwen/Qwen3.5-122B-A10B-FP8","precision":"fp8","vram_minimum_gb":147,"description":"Qwen official FP8 checkpoint — fits on 2x H200"},"gptq_int4":{"model_id":"Qwen/Qwen3.5-122B-A10B-GPTQ-Int4","precision":"int4","vram_minimum_gb":74,"description":"GPTQ Int4 checkpoint — single-GPU serving on 80GB hardware"}},"compatible_strategies":["single_node_tp","single_node_tep","single_node_dep","multi_node_tp","multi_node_dep","multi_node_tep","pd_cluster"],"hardware_overrides":{},"strategy_overrides":{"pd_cluster":{"prefill":{"env":{"VLLM_SSM_CONV_STATE_LAYOUT":"DS"}},"decode":{"env":{"VLLM_SSM_CONV_STATE_LAYOUT":"DS"}}}},"guide":"$a7","hf_org":"Qwen","hf_repo":"Qwen3.5-122B-A10B","hf_id":"Qwen/Qwen3.5-122B-A10B","hf_released":"2026-02-24T09:43:37.000Z","engines":{"vllm":{"min_version":"0.17.0"},"sglang":{"engine":"sglang","model_id":"Qwen/Qwen3.5-122B-A10B","min_version":"v0.5.11","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":2,"b200":2},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","qwen3_coder"]},"reasoning":{"args":["--reasoning-parser","qwen3"]}},"guide":"$a8"}},"default_engine":"vllm"},{"meta":{"title":"Qwen3.5-27B","slug":"qwen3.5-27b","provider":"Qwen","description":"Qwen3.5 dense multimodal model (27B) with gated delta networks hybrid attention, MTP, and 262K context","date_updated":"2026-04-22","difficulty":"beginner","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:56:meta:tasks","performance_headline":"Qwen3.5 flagship dense — single-GPU FP8 or 2x GPU BF16","related_recipes":["Qwen/Qwen3.5-397B-A17B","Qwen/Qwen3.5-35B-A3B","Qwen/Qwen3.5-9B"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:56:meta:hardware"},"model":{"model_id":"Qwen/Qwen3.5-27B","min_vllm_version":"0.17.0","architecture":"dense","parameter_count":"27B","active_parameters":"27B","context_length":262144,"base_args":["--trust-remote-code"],"base_env":{}},"dependencies":[{"command":"pip install vllm-ascend","brand":"Huawei","note":"Huawei Ascend NPU support is provided by the vllm-ascend plugin (requires CANN 9.0 + torch-npu; use the matching quay.io/ascend/vllm-ascend image). FP8 weights are served via W8A8 on Ascend. See https://docs.vllm.ai/projects/ascend"}],"features":{"tool_calling":{"description":"Enable automatic tool choice with Qwen3 Coder parser","args":["--enable-auto-tool-choice","--tool-call-parser","qwen3_coder"]},"reasoning":{"description":"Enable chain-of-thought reasoning with Qwen3 parser","args":["--reasoning-parser","qwen3"]},"spec_decoding":{"description":"Multi-token prediction speculative decoding for lower latency","args":["--speculative-config","{\"method\":\"mtp\",\"num_speculative_tokens\":1}"]},"text_only":{"description":"Skip loading the vision encoder for text-only workloads — frees VRAM for KV cache. Mutually exclusive with encoder_parallel.","args":["--language-model-only"]},"encoder_parallel":{"description":"Run the vision encoder in data-parallel mode — avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.","args":["--mm-encoder-tp-mode","data"]}},"opt_in_features":["spec_decoding","text_only"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":65,"description":"Full precision BF16 — fits on 1x H200 or 2x H100"},"fp8":{"model_id":"Qwen/Qwen3.5-27B-FP8","precision":"fp8","vram_minimum_gb":33,"description":"Qwen official FP8 checkpoint — single 40 GB GPU"},"gptq_int4":{"model_id":"Qwen/Qwen3.5-27B-GPTQ-Int4","precision":"int4","vram_minimum_gb":17,"description":"GPTQ Int4 checkpoint — fits on a single 24 GB GPU"}},"compatible_strategies":["single_node_tp","multi_node_tp"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$a9","hf_org":"Qwen","hf_repo":"Qwen3.5-27B","hf_id":"Qwen/Qwen3.5-27B","hf_released":"2026-02-24T09:41:56.000Z","engines":{"vllm":{"min_version":"0.17.0"},"sglang":{"engine":"sglang","model_id":"Qwen/Qwen3.5-27B","min_version":"v0.5.11","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":1,"b200":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","qwen3_coder"]},"reasoning":{"args":["--reasoning-parser","qwen3"]}},"guide":"$aa"}},"default_engine":"vllm"},{"meta":{"title":"Qwen3.5-35B-A3B","slug":"qwen3.5-35b-a3b","provider":"Qwen","description":"Compact Qwen3.5 multimodal MoE (35B total / 3B active) with gated delta networks, 256 experts, and 262K context","date_updated":"2026-04-22","difficulty":"beginner","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:58:meta:tasks","performance_headline":"Compact Qwen3.5 MoE — single-GPU FP8 or 2x GPU BF16 serving","related_recipes":["Qwen/Qwen3.5-397B-A17B","Qwen/Qwen3.5-122B-A10B"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:58:meta:hardware"},"model":{"model_id":"Qwen/Qwen3.5-35B-A3B","min_vllm_version":"0.17.0","architecture":"moe","parameter_count":"35B","active_parameters":"3B","context_length":262144,"base_args":["--trust-remote-code"],"base_env":{}},"dependencies":[{"command":"pip install vllm-ascend","brand":"Huawei","note":"Huawei Ascend NPU support is provided by the vllm-ascend plugin (requires CANN 9.0 + torch-npu; use the matching quay.io/ascend/vllm-ascend image). FP8 weights are served via W8A8 on Ascend. See https://docs.vllm.ai/projects/ascend"}],"features":{"tool_calling":{"description":"Enable automatic tool choice with Qwen3 Coder parser","args":["--enable-auto-tool-choice","--tool-call-parser","qwen3_coder"]},"reasoning":{"description":"Enable chain-of-thought reasoning with Qwen3 parser","args":["--reasoning-parser","qwen3"]},"spec_decoding":{"description":"Multi-token prediction speculative decoding for lower latency","args":["--speculative-config","{\"method\":\"mtp\",\"num_speculative_tokens\":1}"]},"text_only":{"description":"Skip loading the vision encoder for text-only workloads — frees VRAM for KV cache. Mutually exclusive with encoder_parallel.","args":["--language-model-only"]},"encoder_parallel":{"description":"Run the vision encoder in data-parallel mode — avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.","args":["--mm-encoder-tp-mode","data"]}},"opt_in_features":["spec_decoding","text_only"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":84,"description":"Full precision BF16 — fits on 1x H200 or 2x H100"},"fp8":{"model_id":"Qwen/Qwen3.5-35B-A3B-FP8","precision":"fp8","vram_minimum_gb":42,"description":"Qwen official FP8 checkpoint — single-GPU serving"},"gptq_int4":{"model_id":"Qwen/Qwen3.5-35B-A3B-GPTQ-Int4","precision":"int4","vram_minimum_gb":21,"description":"GPTQ Int4 checkpoint — fits on a single 24GB GPU"}},"compatible_strategies":["single_node_tp","single_node_tep","single_node_dep","multi_node_tp","multi_node_dep","multi_node_tep"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$ab","hf_org":"Qwen","hf_repo":"Qwen3.5-35B-A3B","hf_id":"Qwen/Qwen3.5-35B-A3B","hf_released":"2026-02-24T09:39:25.000Z","engines":{"vllm":{"min_version":"0.17.0"},"sglang":{"engine":"sglang","model_id":"Qwen/Qwen3.5-35B-A3B","min_version":"v0.5.11","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":1,"b200":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","qwen3_coder"]},"reasoning":{"args":["--reasoning-parser","qwen3"]}},"guide":"$ac"}},"default_engine":"vllm"},{"meta":{"title":"Qwen3.5-397B","slug":"qwen3.5-397b","provider":"Qwen","description":"Multimodal MoE model with gated delta networks architecture, 397B total / 17B active parameters, up to 262K context","date_updated":"2026-04-16","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:112:meta:tasks","performance_headline":"Verified on 8x H200, 8x MI300X/MI355X, and GB200 nodes","related_recipes":["Qwen/Qwen3.6-35B-A3B"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:112:meta:hardware"},"model":{"model_id":"Qwen/Qwen3.5-397B-A17B","min_vllm_version":"0.17.0","architecture":"moe","parameter_count":"397B","active_parameters":"17B","context_length":262144,"base_args":["--trust-remote-code"],"base_env":{"VLLM_DEEP_GEMM_WARMUP":"skip","VLLM_USE_DEEP_GEMM":"0","VLLM_FLASHINFER_MOE_BACKEND":"latency"}},"dependencies":[{"command":"pip install vllm-ascend","brand":"Huawei","note":"Huawei Ascend NPU support is provided by the vllm-ascend plugin (requires CANN 9.0 + torch-npu; use the matching quay.io/ascend/vllm-ascend image). FP8 weights are served via W8A8 on Ascend. See https://docs.vllm.ai/projects/ascend"}],"features":{"tool_calling":{"description":"Enable automatic tool choice with Qwen3 Coder parser","args":["--enable-auto-tool-choice","--tool-call-parser","qwen3_coder"]},"reasoning":{"description":"Enable chain-of-thought reasoning with Qwen3 parser","args":["--reasoning-parser","qwen3"]},"spec_decoding":{"description":"Multi-token prediction speculative decoding for lower latency","args":["--speculative_config","{\"method\":\"mtp\",\"num_speculative_tokens\":3}"]},"text_only":{"description":"Skip loading the vision encoder for text-only workloads — frees VRAM for KV cache. Mutually exclusive with encoder_parallel.","args":["--language-model-only"]},"encoder_parallel":{"description":"Run the vision encoder in data-parallel mode — avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.","args":["--mm-encoder-tp-mode","data"]}},"opt_in_features":["spec_decoding","text_only"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":953,"description":"Full precision BF16 — requires 8x H200 or equivalent"},"nvfp4":{"model_id":"nvidia/Qwen3.5-397B-A17B-NVFP4","precision":"nvfp4","vram_minimum_gb":238,"extra_args":["--kv-cache-dtype","fp8"],"extra_env":{"VLLM_USE_FLASHINFER_MOE_FP4":"1"}},"gptq_int4":{"model_id":"Qwen/Qwen3.5-397B-A17B-GPTQ-Int4","precision":"int4","vram_minimum_gb":239,"description":"GPTQ Int4 checkpoint — halves VRAM vs FP8"}},"compatible_strategies":["single_node_tp","single_node_tep","single_node_dep","multi_node_tp","multi_node_tp_pp","multi_node_dep","multi_node_tep","pd_cluster"],"hardware_overrides":{},"strategy_overrides":{"pd_cluster":{"prefill":{"env":{"VLLM_SSM_CONV_STATE_LAYOUT":"DS"}},"decode":{"env":{"VLLM_SSM_CONV_STATE_LAYOUT":"DS"}}}},"guide":"$ad","hf_org":"Qwen","hf_repo":"Qwen3.5-397B-A17B","hf_id":"Qwen/Qwen3.5-397B-A17B","hf_released":"2026-02-16T04:55:12.000Z","engines":{"vllm":{"min_version":"0.17.0"},"sglang":{"engine":"sglang","model_id":"Qwen/Qwen3.5-397B-A17B","min_version":"v0.5.8","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h100":16,"h200":8,"b200":8,"mi300x":8,"mi325x":4,"mi355x":4},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","qwen3_coder"]},"reasoning":{"args":["--reasoning-parser","qwen3"]}},"guide":"$ae"}},"default_engine":"vllm"},{"meta":{"title":"Qwen3-ASR-1.7B","slug":"qwen3-asr-1.7b","provider":"Qwen","description":"Speech-to-text model supporting 11 languages, multiple accents, and singing voice with customizable text-context prompting.","date_updated":"2026-04-17","difficulty":"beginner","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:70:meta:tasks","performance_headline":"Accurate multilingual ASR, including singing voice; single-GPU serving","related_recipes":[]},"model":{"model_id":"Qwen/Qwen3-ASR-1.7B","min_vllm_version":"0.12.0","architecture":"dense","parameter_count":"2.3B","active_parameters":"2.3B","context_length":65536,"base_args":[],"base_env":{}},"dependencies":[{"note":"Audio extras — required for ASR input pre-processing (librosa, soundfile)","command":"uv pip install -U \"vllm[audio]\""}],"features":{},"opt_in_features":[],"variants":{"default":{"precision":"bf16","vram_minimum_gb":4,"description":"Full precision BF16 — fits on a single mid-range GPU"}},"compatible_strategies":["single_node_tp","multi_node_tp"],"hardware_overrides":{"amd":{"extra_args":[],"extra_env":{"VLLM_ROCM_USE_AITER":"1","SAFETENSORS_FAST_GPU":"1"}}},"strategy_overrides":{},"guide":"$af","hf_org":"Qwen","hf_repo":"Qwen3-ASR-1.7B","hf_id":"Qwen/Qwen3-ASR-1.7B","hf_released":"2026-01-28T03:22:40.000Z","engines":{"vllm":{"min_version":"0.12.0"},"sglang":{"engine":"sglang","model_id":"Qwen/Qwen3-ASR-1.7B","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":1,"b200":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{},"guide":"$b0"}},"default_engine":"vllm"},{"meta":{"title":"Qwen3Guard-Gen-8B","slug":"qwen3guard-gen-8b","provider":"Qwen","description":"Lightweight text-only guardrail/safety classifier model in the Qwen3Guard family.","date_updated":"2026-04-30","difficulty":"beginner","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:44:meta:tasks","performance_headline":"Runs on a single GPU; serves safety classifications over OpenAI-compatible API","related_recipes":[],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:44:meta:hardware"},"model":{"model_id":"Qwen/Qwen3Guard-Gen-8B","min_vllm_version":"0.10.0","architecture":"dense","parameter_count":"8B","active_parameters":"8B","context_length":32768,"base_args":[],"base_env":{}},"features":{},"opt_in_features":[],"variants":{"default":{"precision":"bf16","vram_minimum_gb":19,"description":"Full precision BF16 — single GPU with >=20 GB VRAM"},"small_4b":{"model_id":"Qwen/Qwen3Guard-Gen-4B","precision":"bf16","vram_minimum_gb":10,"description":"4B variant for more constrained deployments"},"tiny_0_6b":{"model_id":"Qwen/Qwen3Guard-Gen-0.6B","precision":"bf16","vram_minimum_gb":4,"description":"0.6B variant for edge / ultra-low-cost serving"}},"compatible_strategies":["single_node_tp","multi_node_tp"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$b1","hf_org":"Qwen","hf_repo":"Qwen3Guard-Gen-8B","hf_id":"Qwen/Qwen3Guard-Gen-8B","hf_released":"2025-09-23T11:40:09.000Z","engines":{"vllm":{"min_version":"0.10.0"},"sglang":{"engine":"sglang","model_id":"Qwen/Qwen3Guard-Gen-8B","min_version":"v0.4.6.post1","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":1,"b200":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{},"guide":"$b2"}},"default_engine":"vllm"},{"meta":{"title":"Qwen3-VL-235B-A22B-Instruct","slug":"qwen3-vl-235b-a22b-instruct","provider":"Qwen","description":"Qwen3-VL flagship MoE vision-language model with 235B total / 22B active parameters, supporting images, video, and long context.","date_updated":"2026-04-17","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:73:meta:tasks","performance_headline":"Strong on images, video, and text — #1 open model on text on lmarena.ai at release","related_recipes":[],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:73:meta:hardware"},"model":{"model_id":"Qwen/Qwen3-VL-235B-A22B-Instruct","min_vllm_version":"0.11.0","architecture":"moe","parameter_count":"235B","active_parameters":"22B","context_length":262144,"base_args":[],"base_env":{}},"dependencies":[{"note":"Recommended for offline multimodal inference (image/video pre-processing helpers)","command":"uv pip install qwen-vl-utils==0.0.14","optional":true},{"command":"pip install vllm-ascend","brand":"Huawei","note":"Huawei Ascend NPU support is provided by the vllm-ascend plugin (requires CANN 9.0 + torch-npu; use the matching quay.io/ascend/vllm-ascend image). FP8 weights are served via W8A8 on Ascend. See https://docs.vllm.ai/projects/ascend"}],"features":{"text_only":{"description":"Skip loading the vision encoder for text-only workloads — frees VRAM for KV cache. Mutually exclusive with encoder_parallel.","args":["--language-model-only"]},"encoder_parallel":{"description":"Run the vision encoder in data-parallel mode — avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.","args":["--mm-encoder-tp-mode","data"]}},"opt_in_features":["text_only"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":564,"description":"Full BF16 — ideal on H200/B200 with 8 GPUs"},"fp8":{"model_id":"Qwen/Qwen3-VL-235B-A22B-Instruct-FP8","precision":"fp8","vram_minimum_gb":282,"tp":4,"description":"Qwen official FP8 checkpoint for optimal H100 memory efficiency"},"nvfp4":{"model_id":"nvidia/Qwen3-VL-235B-A22B-Instruct-NVFP4","precision":"nvfp4","vram_minimum_gb":141,"description":"NVIDIA NVFP4 quantized weights for Blackwell GPUs","extra_args":["--kv-cache-dtype","fp8"],"extra_env":{"VLLM_USE_FLASHINFER_MOE_FP4":"1"}}},"compatible_strategies":["single_node_tp","single_node_tep","single_node_dep","multi_node_tp","multi_node_tp_pp","multi_node_dep","multi_node_tep","pd_cluster"],"hardware_overrides":{"hopper":{"extra_args":["--mm-encoder-tp-mode","data","--async-scheduling"],"extra_env":{}},"blackwell":{"extra_args":["--mm-encoder-tp-mode","data","--async-scheduling"],"extra_env":{}}},"strategy_overrides":{},"guide":"$b3","hf_org":"Qwen","hf_repo":"Qwen3-VL-235B-A22B-Instruct","hf_id":"Qwen/Qwen3-VL-235B-A22B-Instruct","hf_released":"2025-09-22T03:54:32.000Z","engines":{"vllm":{"min_version":"0.11.0"},"sglang":{"engine":"sglang","model_id":"Qwen/Qwen3-VL-235B-A22B-Instruct","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h100":8,"h200":8,"b200":8},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","qwen"]}},"guide":"$b4"}},"default_engine":"vllm"},{"meta":{"title":"Qwen3-Next-80B-A3B-Instruct","slug":"qwen3-next-80b-a3b-instruct","provider":"Qwen","description":"Advanced Qwen3-Next MoE model (80B total / 3B active) with hybrid attention, highly sparse experts, and multi-token prediction.","date_updated":"2026-04-17","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:72:meta:tasks","performance_headline":"Highly sparse MoE with MTP-accelerated decoding, runs on 4x H200/H20/A100/A800","related_recipes":[],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:72:meta:hardware"},"model":{"model_id":"Qwen/Qwen3-Next-80B-A3B-Instruct","min_vllm_version":"0.10.0","architecture":"moe","parameter_count":"80B","active_parameters":"3B","context_length":262144,"base_args":[],"base_env":{}},"features":{"tool_calling":{"description":"Enable automatic tool choice with Hermes parser","args":["--enable-auto-tool-choice","--tool-call-parser","hermes"]},"spec_decoding":{"description":"Multi-token prediction speculative decoding for lower latency","args":["--speculative-config","{\"method\":\"qwen3_next_mtp\",\"num_speculative_tokens\":2}","--no-enable-chunked-prefill"]}},"opt_in_features":["spec_decoding"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":192,"description":"Full precision BF16 — fits on 4x H200/H20/A100/A800"},"fp8":{"model_id":"Qwen/Qwen3-Next-80B-A3B-Instruct-FP8","precision":"fp8","vram_minimum_gb":96,"description":"Qwen official FP8 checkpoint — recommended on SM90/SM100"},"nvfp4":{"model_id":"nvidia/Qwen3-Next-80B-A3B-Instruct-NVFP4","precision":"nvfp4","vram_minimum_gb":48,"description":"NVIDIA NVFP4 quantized weights for Blackwell GPUs","extra_args":["--kv-cache-dtype","fp8"],"extra_env":{"VLLM_USE_FLASHINFER_MOE_FP4":"1"}}},"compatible_strategies":["single_node_tp","single_node_tep","single_node_dep","multi_node_tp","multi_node_tp_pp","multi_node_dep","multi_node_tep"],"hardware_overrides":{"blackwell":{"extra_args":[],"extra_env":{"VLLM_USE_FLASHINFER_MOE_FP8":"1","VLLM_FLASHINFER_MOE_BACKEND":"latency","VLLM_USE_DEEP_GEMM":"0","VLLM_USE_TRTLLM_ATTENTION":"0","VLLM_ATTENTION_BACKEND":"FLASH_ATTN"}}},"strategy_overrides":{},"guide":"$b5","hf_org":"Qwen","hf_repo":"Qwen3-Next-80B-A3B-Instruct","hf_id":"Qwen/Qwen3-Next-80B-A3B-Instruct","hf_released":"2025-09-09T15:40:56.000Z","engines":{"vllm":{"min_version":"0.10.0"},"sglang":{"engine":"sglang","model_id":"Qwen/Qwen3-Next-80B-A3B-Instruct","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h100":4,"h200":2,"b200":2},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","qwen"]}},"guide":"$b6"}},"default_engine":"vllm"},{"meta":{"title":"Qwen-Image","slug":"qwen-image","provider":"Qwen","description":"Text-to-image diffusion model (20B parameters) from the Qwen-Image family, served via vLLM-Omni.","date_updated":"2026-04-17","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:67:meta:tasks","performance_headline":"Shared DiT core across T2I, image editing, and layered-image variants; accelerated via Cache-DiT, TeaCache, and sequence parallelism","related_recipes":[]},"model":{"model_id":"Qwen/Qwen-Image","min_vllm_version":"0.18.0","architecture":"dense","parameter_count":"20B","active_parameters":"20B","context_length":0,"base_args":[],"base_env":{}},"omni":{"tasks":["t2i"]},"dependencies":[{"note":"vLLM-Omni must be installed from source and pins vllm==0.18.0 for diffusion support","command":"git clone https://github.com/vllm-project/vllm-omni.git && cd vllm-omni && uv pip install -e . vllm==0.18.0"}],"features":{},"opt_in_features":[],"variants":{"default":{"precision":"bf16","vram_minimum_gb":48,"description":"Full precision BF16 — use CPU offload / layerwise offload for lower VRAM"},"fp8":{"precision":"fp8","vram_minimum_gb":24,"description":"FP8 quantization with `img_mlp` kept at full precision for quality.","extra_args":["--quantization","fp8","--ignored-layers","img_mlp"]},"int8":{"precision":"int8","vram_minimum_gb":24,"description":"INT8 quantization (`--quantization int8`).","extra_args":["--quantization","int8"]}},"compatible_strategies":["single_node_tp"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$b7","hf_org":"Qwen","hf_repo":"Qwen-Image","hf_id":"Qwen/Qwen-Image","hf_released":"2025-08-02T04:58:07.000Z"},{"meta":{"title":"Qwen3-Coder-480B-A35B-Instruct","slug":"qwen3-coder-480b-a35b-instruct","provider":"Qwen","description":"Large coder MoE with 480B total / 35B active parameters, strong tool-use and code generation capabilities.","date_updated":"2026-04-17","difficulty":"advanced","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:71:meta:tasks","performance_headline":"HumanEval 0.939, MBPP 0.918 (FP8). Recommended FP8 on 8x H200/H20 via DP=8","related_recipes":[],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:71:meta:hardware"},"model":{"model_id":"Qwen/Qwen3-Coder-480B-A35B-Instruct","min_vllm_version":"0.10.0","architecture":"moe","parameter_count":"480B","active_parameters":"35B","context_length":262144,"base_args":[],"base_env":{}},"dependencies":[{"note":"Optional: opt into DeepGEMM FP8 MoE kernels for extra throughput","command":"export VLLM_USE_DEEP_GEMM=1","optional":true},{"command":"pip install vllm-ascend","brand":"Huawei","note":"Huawei Ascend NPU support is provided by the vllm-ascend plugin (requires CANN 9.0 + torch-npu; use the matching quay.io/ascend/vllm-ascend image). FP8 weights are served via W8A8 on Ascend. See https://docs.vllm.ai/projects/ascend"}],"features":{"tool_calling":{"description":"Enable automatic tool choice with Qwen3 Coder parser","args":["--enable-auto-tool-choice","--tool-call-parser","qwen3_coder"]}},"opt_in_features":[],"variants":{"default":{"precision":"bf16","vram_minimum_gb":1152,"description":"Full BF16 — 8x H200/H20 (141GB × 8) recommended"},"fp8":{"model_id":"Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8","precision":"fp8","vram_minimum_gb":576,"tp":4,"description":"Qwen official FP8 checkpoint — required for DP=8 serving"},"nvfp4":{"model_id":"nvidia/Qwen3-Coder-480B-A35B-Instruct-NVFP4","precision":"nvfp4","vram_minimum_gb":288,"description":"NVIDIA NVFP4 quantized weights for Blackwell GPUs","extra_args":["--kv-cache-dtype","fp8"],"extra_env":{"VLLM_USE_FLASHINFER_MOE_FP4":"1"}}},"compatible_strategies":["single_node_tp","single_node_tep","single_node_dep","multi_node_tp","multi_node_tp_pp","multi_node_dep","multi_node_tep","pd_cluster"],"hardware_overrides":{"hopper":{"extra_args":[],"extra_env":{"VLLM_USE_DEEP_GEMM":"1"}},"amd":{"extra_args":[],"extra_env":{"VLLM_ROCM_USE_AITER":"1"}}},"strategy_overrides":{},"guide":"$b8","hf_org":"Qwen","hf_repo":"Qwen3-Coder-480B-A35B-Instruct","hf_id":"Qwen/Qwen3-Coder-480B-A35B-Instruct","hf_released":"2025-07-22T14:52:38.000Z","engines":{"vllm":{"min_version":"0.10.0"},"sglang":{"engine":"sglang","model_id":"Qwen/Qwen3-Coder-480B-A35B-Instruct","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":8,"b200":8},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","qwen3_coder"]}},"guide":"$b9"}},"default_engine":"vllm"},{"meta":{"title":"Qwen3-235B-A22B-Instruct","slug":"qwen3-235b-a22b-instruct-2507","provider":"Qwen","description":"Flagship Qwen3 MoE instruct model with 235B total and 22B active parameters, tuned for high-quality text generation.","date_updated":"2026-04-17","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:69:meta:tasks","performance_headline":"Verified on 4x/8x H200, MI300X/MI325X/MI355X nodes (BF16 and FP8)","related_recipes":[],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:69:meta:hardware"},"model":{"model_id":"Qwen/Qwen3-235B-A22B-Instruct-2507","min_vllm_version":"0.10.0","architecture":"moe","parameter_count":"235B","active_parameters":"22B","context_length":262144,"base_args":["--trust-remote-code"],"base_env":{}},"dependencies":[{"command":"pip install vllm-ascend","brand":"Huawei","note":"Huawei Ascend NPU support is provided by the vllm-ascend plugin (requires CANN 9.0 + torch-npu; use the matching quay.io/ascend/vllm-ascend image). FP8 weights are served via W8A8 on Ascend. See https://docs.vllm.ai/projects/ascend"},{"command":"# deploy via GPUStack 2.1+ (Alibaba T-Head PPU backend)","brand":"T-Head","note":"Alibaba T-Head 真武 PPU 810E (96 GB, ~H20 class) serves vLLM/SGLang via GPUStack 2.1+, which orchestrates vendor container images and a pluggable backend — raw vllm serve flags are abstracted. bf16/fp8. See https://github.com/gpustack/gpustack"},{"command":"docker pull wjie520/vllm_kunlun:uv_base","brand":"Kunlunxin","note":"Kunlunxin (Baidu) P800 runs vLLM via the out-of-tree vLLM-Kunlun plugin (P800 only). In the image: pip install vllm==0.15.1 --no-deps, build the plugin (v0.15.1-dev) + kunlun op wheels (kunlun_ops/xspeedgate_ops). TP across 8×P800; int8/awq/gptq/fp8. See https://github.com/baidu/vLLM-Kunlun"}],"features":{"tool_calling":{"description":"Enable automatic tool choice with Hermes-compatible parser","args":["--enable-auto-tool-choice","--tool-call-parser","hermes"]}},"opt_in_features":[],"variants":{"default":{"precision":"bf16","vram_minimum_gb":564,"description":"Full precision BF16 — requires 4x H200 or 8x MI300X/MI325X/MI355X"},"fp8":{"model_id":"Qwen/Qwen3-235B-A22B-FP8","precision":"fp8","vram_minimum_gb":240,"tp":4,"description":"Qwen official FP8 checkpoint for improved efficiency on SM90+"},"nvfp4":{"model_id":"nvidia/Qwen3-235B-A22B-Instruct-2507-NVFP4","precision":"nvfp4","vram_minimum_gb":141,"description":"NVIDIA NVFP4 quantized weights for Blackwell GPUs","extra_args":["--kv-cache-dtype","fp8"],"extra_env":{"VLLM_USE_FLASHINFER_MOE_FP4":"1"}}},"compatible_strategies":["single_node_tp","single_node_tep","single_node_dep","multi_node_tp","multi_node_tp_pp","multi_node_dep","multi_node_tep","pd_cluster"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$ba","hf_org":"Qwen","hf_repo":"Qwen3-235B-A22B-Instruct-2507","hf_id":"Qwen/Qwen3-235B-A22B-Instruct-2507","hf_released":"2025-07-21T06:46:56.000Z","engines":{"vllm":{"min_version":"0.10.0"},"sglang":{"engine":"sglang","model_id":"Qwen/Qwen3-235B-A22B-Instruct-2507","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h100":8,"h200":8,"b200":8},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","qwen"]}},"guide":"$bb"}},"default_engine":"vllm"},{"meta":{"title":"Qwen3-32B","slug":"qwen3-32b","provider":"Qwen","description":"Qwen3 32B dense model with hybrid thinking/non-thinking modes — verified on TPU v6e (Trillium).","date_updated":"2026-06-04","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:4:meta:tasks","performance_headline":"Verified on TPU v6e (Trillium) and v7 (Ironwood) with BF16","related_recipes":["Qwen/Qwen3-4B"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:4:meta:hardware"},"model":{"model_id":"Qwen/Qwen3-32B","min_vllm_version":"0.8.5","architecture":"dense","parameter_count":"32B","active_parameters":"32B","context_length":40960,"base_args":[],"base_env":{}},"dependencies":[{"command":"pip install vllm-ascend","brand":"Huawei","note":"Huawei Ascend NPU support is provided by the vllm-ascend plugin (requires CANN 9.0 + torch-npu; use the matching quay.io/ascend/vllm-ascend image). FP8 weights are served via W8A8 on Ascend. See https://docs.vllm.ai/projects/ascend"},{"command":"docker pull image.sourcefind.cn:5000/dcu/admin/base/custom:vllm0.6.2-ubuntu22.04-dtk25.04-rc7-das1.5-py3.10","brand":"Hygon","note":"Hygon DCU K100-AI runs vLLM via the Sourcefind DTK image (ROCm/HIP). Set HSA_OVERRIDE_GFX_VERSION=9.2.8; device flags --device=/dev/kfd --device=/dev/dri -v /opt/hyhal:/opt/hyhal:ro; serve with --enforce-eager --dtype float16 (CUDA-graph capture unreliable; fp16/bf16 only). See https://docs.gpustack.ai/0.5/tutorials/running-inference-with-hygon-dcus/"},{"command":"docker run --runtime iluvatar -e IX_VISIBLE_DEVICES=all ","brand":"Iluvatar","note":"Iluvatar CoreX BI-V150 runs a vendor-forked vLLM. Get the CoreX Docker installer (corex-docker-installer-4.3.0+-llm-py3.10-x86_64.run) from the Iluvatar Resource Center (login required), install the iluvatar container runtime, then vllm serve as usual. bf16/fp16/int8. See https://github.com/Deep-Spark/DeepSparkInference"},{"command":"# deploy via GPUStack 2.1+ (Alibaba T-Head PPU backend)","brand":"T-Head","note":"Alibaba T-Head 真武 PPU 810E (96 GB, ~H20 class) serves vLLM/SGLang via GPUStack 2.1+, which orchestrates vendor container images and a pluggable backend — raw vllm serve flags are abstracted. bf16/fp8. See https://github.com/gpustack/gpustack"},{"command":"docker pull wjie520/vllm_kunlun:uv_base","brand":"Kunlunxin","note":"Kunlunxin (Baidu) P800 runs vLLM via the out-of-tree vLLM-Kunlun plugin (P800 only). In the image: pip install vllm==0.15.1 --no-deps, build the plugin (v0.15.1-dev) + kunlun op wheels (kunlun_ops/xspeedgate_ops). TP across 8×P800; int8/awq/gptq/fp8. See https://github.com/baidu/vLLM-Kunlun"},{"command":"git clone https://github.com/Cambricon/vllm-mlu && pip install -e vllm-mlu","brand":"Cambricon","note":"Cambricon MLU (MLU370+/MLU590) runs vLLM via the out-of-tree vllm-mlu plugin. Requires Cambricon Neuware SDK 25.08 (gated — request from ecosystem@cambricon.com) and MLU-patched Ray 2.51.1 for multi-card TP. Day-0 DeepSeek-V3.2/V4 support. See https://github.com/Cambricon/vllm-mlu"}],"features":{"tool_calling":{"description":"Enable automatic tool choice with the Hermes parser","args":["--enable-auto-tool-choice","--tool-call-parser","hermes"]},"reasoning":{"description":"Enable Qwen3 thinking mode (chain-of-thought) with the Qwen3 reasoning parser","args":["--reasoning-parser","qwen3"]}},"opt_in_features":[],"variants":{"default":{"precision":"bf16","vram_minimum_gb":77,"description":"Full precision BF16 — TPU v6e 2x2 (4 chips, TP=4) or a single 80GB+ GPU"},"fp8":{"model_id":"Qwen/Qwen3-32B-FP8","precision":"fp8","vram_minimum_gb":39,"description":"Qwen official FP8 checkpoint — single 40 GB GPU"},"awq":{"model_id":"Qwen/Qwen3-32B-AWQ","precision":"int4","vram_minimum_gb":20,"description":"AWQ 4-bit quantized weights — fits on a single 24 GB GPU","extra_args":["--quantization","awq"]}},"compatible_strategies":["single_node_tp","multi_node_tp"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$bc","hf_org":"Qwen","hf_repo":"Qwen3-32B","hf_id":"Qwen/Qwen3-32B","hf_released":"2025-04-27T03:52:59.000Z","engines":{"vllm":{"min_version":"0.8.5"},"sglang":{"engine":"sglang","model_id":"Qwen/Qwen3-32B","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":1,"b200":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","qwen3_coder"]},"reasoning":{"args":["--reasoning-parser","qwen3"]}},"guide":"$bd"}},"default_engine":"vllm"},{"meta":{"title":"Qwen3-4B","slug":"qwen3-4b","provider":"Qwen","description":"Qwen3 4B dense model with hybrid thinking/non-thinking modes — fits on a single TPU v6e chip or one GPU.","date_updated":"2026-06-04","difficulty":"beginner","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:5:meta:tasks","performance_headline":"Verified on TPU v6e (Trillium) with BF16 on a single chip","related_recipes":["Qwen/Qwen3-32B"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:5:meta:hardware"},"model":{"model_id":"Qwen/Qwen3-4B","min_vllm_version":"0.8.5","architecture":"dense","parameter_count":"4B","active_parameters":"4B","context_length":40960,"base_args":[],"base_env":{}},"dependencies":[{"command":"pip install vllm-ascend","brand":"Huawei","note":"Huawei Ascend NPU support is provided by the vllm-ascend plugin (requires CANN 9.0 + torch-npu; use the matching quay.io/ascend/vllm-ascend image). FP8 weights are served via W8A8 on Ascend. See https://docs.vllm.ai/projects/ascend"},{"command":"docker run --runtime iluvatar -e IX_VISIBLE_DEVICES=all ","brand":"Iluvatar","note":"Iluvatar CoreX BI-V150 runs a vendor-forked vLLM. Get the CoreX Docker installer (corex-docker-installer-4.3.0+-llm-py3.10-x86_64.run) from the Iluvatar Resource Center (login required), install the iluvatar container runtime, then vllm serve as usual. bf16/fp16/int8. See https://github.com/Deep-Spark/DeepSparkInference"}],"features":{"tool_calling":{"description":"Enable automatic tool choice with the Hermes parser","args":["--enable-auto-tool-choice","--tool-call-parser","hermes"]},"reasoning":{"description":"Enable Qwen3 thinking mode (chain-of-thought) with the Qwen3 reasoning parser","args":["--reasoning-parser","qwen3"]}},"opt_in_features":[],"variants":{"default":{"precision":"bf16","vram_minimum_gb":10,"description":"Full precision BF16 — fits on a single TPU v6e chip or one 16GB+ GPU"},"fp8":{"model_id":"Qwen/Qwen3-4B-FP8","precision":"fp8","vram_minimum_gb":5,"description":"Qwen official FP8 checkpoint"}},"compatible_strategies":["single_node_tp","multi_node_tp"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$be","hf_org":"Qwen","hf_repo":"Qwen3-4B","hf_id":"Qwen/Qwen3-4B","hf_released":"2025-04-27T03:41:29.000Z","engines":{"vllm":{"min_version":"0.8.5"},"sglang":{"engine":"sglang","model_id":"Qwen/Qwen3-4B","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":1,"b200":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","qwen3_coder"]},"reasoning":{"args":["--reasoning-parser","qwen3"]}},"guide":"$bf"}},"default_engine":"vllm"},{"meta":{"title":"Qwen2.5-VL-72B-Instruct","slug":"qwen2.5-vl-72b-instruct","provider":"Qwen","description":"Qwen2.5-VL dense vision-language model (72B) for high-quality image and video understanding.","date_updated":"2026-04-17","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:68:meta:tasks","performance_headline":"Verified on 4x A100 and 4x MI300X/MI325X/MI355X with BF16","related_recipes":["Qwen/Qwen2.5-VL-7B-Instruct"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:68:meta:hardware"},"model":{"model_id":"Qwen/Qwen2.5-VL-72B-Instruct","min_vllm_version":"0.7.0","architecture":"dense","parameter_count":"72B","active_parameters":"72B","context_length":128000,"base_args":[],"base_env":{}},"dependencies":[{"command":"pip install vllm-ascend","brand":"Huawei","note":"Huawei Ascend NPU support is provided by the vllm-ascend plugin (requires CANN 9.0 + torch-npu; use the matching quay.io/ascend/vllm-ascend image). FP8 weights are served via W8A8 on Ascend. See https://docs.vllm.ai/projects/ascend"}],"features":{"text_only":{"description":"Skip loading the vision encoder for text-only workloads — frees VRAM for KV cache. Mutually exclusive with encoder_parallel.","args":["--language-model-only"]},"encoder_parallel":{"description":"Run the vision encoder in data-parallel mode — avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.","args":["--mm-encoder-tp-mode","data"]}},"opt_in_features":["text_only"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":173,"description":"Full precision BF16 — 4x A100 80GB or 4x MI300X/MI325X/MI355X"},"awq":{"model_id":"Qwen/Qwen2.5-VL-72B-Instruct-AWQ","precision":"int4","vram_minimum_gb":43,"description":"AWQ 4-bit quantized weights","extra_args":["--quantization","awq"]}},"compatible_strategies":["single_node_tp","multi_node_tp","multi_node_tp_pp"],"hardware_overrides":{"hopper":{"extra_args":["--mm-encoder-tp-mode","data"],"extra_env":{}},"blackwell":{"extra_args":["--mm-encoder-tp-mode","data"],"extra_env":{}},"amd":{"extra_args":["--mm-encoder-tp-mode","data","--limit-mm-per-prompt","{\"image\":2,\"video\":0}"],"extra_env":{"VLLM_ROCM_USE_AITER":"1"}}},"strategy_overrides":{},"guide":"$c0","hf_org":"Qwen","hf_repo":"Qwen2.5-VL-72B-Instruct","hf_id":"Qwen/Qwen2.5-VL-72B-Instruct","hf_released":"2025-01-27T04:12:04.000Z","engines":{"vllm":{"min_version":"0.7.0"},"sglang":{"engine":"sglang","model_id":"Qwen/Qwen2.5-VL-72B-Instruct","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":2,"b200":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","qwen"]}},"guide":"$c1"}},"default_engine":"vllm"},{"meta":{"title":"Qwen2.5-VL-7B-Instruct","slug":"qwen2.5-vl-7b-instruct","provider":"Qwen","description":"Qwen2.5-VL dense vision-language model (7B) for image and video understanding — fits on a single TPU v6e chip or one GPU.","date_updated":"2026-06-04","difficulty":"beginner","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:3:meta:tasks","performance_headline":"Verified on TPU v6e (Trillium) with BF16 on a single chip","related_recipes":["Qwen/Qwen2.5-VL-72B-Instruct"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:3:meta:hardware"},"model":{"model_id":"Qwen/Qwen2.5-VL-7B-Instruct","min_vllm_version":"0.7.0","architecture":"dense","parameter_count":"7B","active_parameters":"7B","context_length":128000,"base_args":[],"base_env":{}},"dependencies":[{"command":"pip install vllm-ascend","brand":"Huawei","note":"Huawei Ascend NPU support is provided by the vllm-ascend plugin (requires CANN 9.0 + torch-npu; use the matching quay.io/ascend/vllm-ascend image). FP8 weights are served via W8A8 on Ascend. See https://docs.vllm.ai/projects/ascend"}],"features":{"text_only":{"description":"Skip loading the vision encoder for text-only workloads — frees VRAM for KV cache. Mutually exclusive with encoder_parallel.","args":["--language-model-only"]},"encoder_parallel":{"description":"Run the vision encoder in data-parallel mode — avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.","args":["--mm-encoder-tp-mode","data"]}},"opt_in_features":["text_only"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":17,"description":"Full precision BF16 — fits on a single TPU v6e chip or one 24GB+ GPU"},"awq":{"model_id":"Qwen/Qwen2.5-VL-7B-Instruct-AWQ","precision":"int4","vram_minimum_gb":5,"description":"AWQ 4-bit quantized weights","extra_args":["--quantization","awq"]}},"compatible_strategies":["single_node_tp","multi_node_tp"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$c2","hf_org":"Qwen","hf_repo":"Qwen2.5-VL-7B-Instruct","hf_id":"Qwen/Qwen2.5-VL-7B-Instruct","hf_released":"2025-01-26T09:26:37.000Z","engines":{"vllm":{"min_version":"0.7.0"},"sglang":{"engine":"sglang","model_id":"Qwen/Qwen2.5-VL-7B-Instruct","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":1,"b200":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","qwen"]}},"guide":"$c3"}},"default_engine":"vllm"},{"meta":{"title":"Qwen2.5-32B","slug":"qwen2.5-32b","provider":"Qwen","description":"Qwen2.5 32B dense base (pretrained) language model for text completion — verified on TPU v6e (Trillium).","date_updated":"2026-06-04","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:2:meta:tasks","performance_headline":"Verified on TPU v6e (Trillium) with BF16, TP=4 on a 2x2 slice","related_recipes":[],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:2:meta:hardware"},"model":{"model_id":"Qwen/Qwen2.5-32B","min_vllm_version":"0.6.2","architecture":"dense","parameter_count":"32B","active_parameters":"32B","context_length":131072,"base_args":[],"base_env":{}},"dependencies":[{"command":"pip install vllm-ascend","brand":"Huawei","note":"Huawei Ascend NPU support is provided by the vllm-ascend plugin (requires CANN 9.0 + torch-npu; use the matching quay.io/ascend/vllm-ascend image). FP8 weights are served via W8A8 on Ascend. See https://docs.vllm.ai/projects/ascend"},{"command":"docker pull image.sourcefind.cn:5000/dcu/admin/base/custom:vllm0.6.2-ubuntu22.04-dtk25.04-rc7-das1.5-py3.10","brand":"Hygon","note":"Hygon DCU K100-AI runs vLLM via the Sourcefind DTK image (ROCm/HIP). Set HSA_OVERRIDE_GFX_VERSION=9.2.8; device flags --device=/dev/kfd --device=/dev/dri -v /opt/hyhal:/opt/hyhal:ro; serve with --enforce-eager --dtype float16 (CUDA-graph capture unreliable; fp16/bf16 only). See https://docs.gpustack.ai/0.5/tutorials/running-inference-with-hygon-dcus/"},{"command":"docker run --runtime iluvatar -e IX_VISIBLE_DEVICES=all ","brand":"Iluvatar","note":"Iluvatar CoreX BI-V150 runs a vendor-forked vLLM. Get the CoreX Docker installer (corex-docker-installer-4.3.0+-llm-py3.10-x86_64.run) from the Iluvatar Resource Center (login required), install the iluvatar container runtime, then vllm serve as usual. bf16/fp16/int8. See https://github.com/Deep-Spark/DeepSparkInference"},{"command":"# deploy via GPUStack 2.1+ (Alibaba T-Head PPU backend)","brand":"T-Head","note":"Alibaba T-Head 真武 PPU 810E (96 GB, ~H20 class) serves vLLM/SGLang via GPUStack 2.1+, which orchestrates vendor container images and a pluggable backend — raw vllm serve flags are abstracted. bf16/fp8. See https://github.com/gpustack/gpustack"},{"command":"docker pull wjie520/vllm_kunlun:uv_base","brand":"Kunlunxin","note":"Kunlunxin (Baidu) P800 runs vLLM via the out-of-tree vLLM-Kunlun plugin (P800 only). In the image: pip install vllm==0.15.1 --no-deps, build the plugin (v0.15.1-dev) + kunlun op wheels (kunlun_ops/xspeedgate_ops). TP across 8×P800; int8/awq/gptq/fp8. See https://github.com/baidu/vLLM-Kunlun"}],"features":{},"opt_in_features":[],"variants":{"default":{"precision":"bf16","vram_minimum_gb":77,"description":"Full precision BF16 — TPU v6e 2x2 (4 chips, TP=4) or a single 80GB+ GPU"}},"compatible_strategies":["single_node_tp","multi_node_tp"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$c4","hf_org":"Qwen","hf_repo":"Qwen2.5-32B","hf_id":"Qwen/Qwen2.5-32B","hf_released":"2024-09-15T12:18:33.000Z","engines":{"vllm":{"min_version":"0.6.2"},"sglang":{"engine":"sglang","model_id":"Qwen/Qwen2.5-32B","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":1,"b200":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{},"guide":"$c5"}},"default_engine":"vllm"}]],["stabilityai",[{"meta":{"title":"Stable Diffusion 3.5","slug":"stable-diffusion-3.5","provider":"Stability AI","description":"Stability AI's Stable Diffusion 3.5 text-to-image family (medium 2.5B, large 8.1B, large-turbo) via vLLM-Omni with Cache-DiT acceleration","date_updated":"2026-04-17","difficulty":"beginner","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:98:meta:tasks","related_recipes":["stabilityai/stable-audio-open-1.0"]},"model":{"model_id":"stabilityai/stable-diffusion-3.5-medium","min_vllm_version":"0.12.0","architecture":"dense","parameter_count":"2.5B","active_parameters":"2.5B","context_length":0,"base_args":[],"base_env":{}},"omni":{"tasks":["t2i"]},"dependencies":[{"note":"Pin vllm==0.12.0 for Stable Diffusion 3.5","command":"uv pip install vllm==0.12.0"},{"note":"vllm-omni provides the image generation backend","command":"uv pip install git+https://github.com/vllm-project/vllm-omni.git"}],"features":{},"opt_in_features":[],"variants":{"default":{"label":"Medium","precision":"bf16","vram_minimum_gb":44,"description":"Stable Diffusion 3.5 medium (2.5B)"},"large":{"label":"Large","model_id":"stabilityai/stable-diffusion-3.5-large","precision":"bf16","vram_minimum_gb":24,"description":"Stable Diffusion 3.5 large (8.1B)"},"large_turbo":{"label":"Large Turbo","model_id":"stabilityai/stable-diffusion-3.5-large-turbo","precision":"bf16","vram_minimum_gb":24,"description":"Stable Diffusion 3.5 large-turbo (8.1B, timestep-distilled for few-step inference)"}},"compatible_strategies":[],"hardware_overrides":{},"strategy_overrides":{},"guide":"$c6","hf_org":"stabilityai","hf_repo":"stable-diffusion-3.5-medium","hf_id":"stabilityai/stable-diffusion-3.5-medium","hf_released":"2024-10-29T10:27:32.000Z"},{"meta":{"title":"Stable Audio Open","slug":"stable-audio-open","provider":"Stability AI","description":"Text-to-audio generation model (1.2B params) producing up to ~47 s stereo audio at 44.1 kHz, served via vLLM-Omni","date_updated":"2026-04-17","difficulty":"beginner","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:97:meta:tasks","related_recipes":["stabilityai/stable-diffusion-3.5-medium"]},"model":{"model_id":"stabilityai/stable-audio-open-1.0","min_vllm_version":"0.14.1","architecture":"dense","parameter_count":"1.2B","active_parameters":"1.2B","context_length":0,"base_args":["--trust-remote-code","--enforce-eager","--gpu-memory-utilization","0.9"],"base_env":{}},"omni":{"serve_binary":"vllm-omni serve","tasks":["t2a"]},"dependencies":[{"note":"Pin vllm==0.14.1 for Stable Audio Open","command":"uv pip install vllm==0.14.1"},{"note":"vllm-omni provides the audio generation backend","command":"uv pip install git+https://github.com/vllm-project/vllm-omni.git"},{"note":"soundfile (recommended) or scipy for WAV output","command":"uv pip install soundfile"}],"features":{},"opt_in_features":[],"variants":{"default":{"precision":"bf16","vram_minimum_gb":19,"description":"BF16 weights for text-to-audio generation (via vLLM-Omni)"}},"compatible_strategies":[],"hardware_overrides":{},"strategy_overrides":{},"guide":"$c7","hf_org":"stabilityai","hf_repo":"stable-audio-open-1.0","hf_id":"stabilityai/stable-audio-open-1.0","hf_released":"2024-05-24T05:11:20.000Z"}]],["stepfun-ai",[{"meta":{"title":"Step-3.7-Flash","slug":"step-3.7-flash","provider":"StepFun","description":"Production-grade vision-language MoE (~198B total / 11B active parameters) combining a 196B sparse language backbone with a 1.8B perception encoder, hybrid SWA/Global attention, and 3-way Multi-Token Prediction","date_updated":"2026-05-30","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:19:meta:tasks","performance_headline":"Sparse MoE VLM with hybrid attention and 3-layer MTP speculative decoding","related_recipes":["stepfun-ai/Step-3.5-Flash"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:19:meta:hardware"},"model":{"model_id":"stepfun-ai/Step-3.7-Flash","min_vllm_version":"nightly","nightly_required":true,"docker_image":"vllm/vllm-openai:stepfun37","install":{"docker":{"note":"Dedicated Step-3.7 image — preferred over the nightly pip wheel until support lands in vllm:latest."}},"architecture":"moe","parameter_count":"198B","active_parameters":"11B","context_length":262144,"base_args":["--trust-remote-code","--enable-expert-parallel","--disable-cascade-attn"],"base_env":{}},"features":{"tool_calling":{"description":"Step-3.5 tool call parser with automatic tool choice","args":["--tool-call-parser","step3p5","--enable-auto-tool-choice"]},"reasoning":{"description":"Step-3.5 reasoning parser for chain-of-thought extraction","args":["--reasoning-parser","step3p5"]},"spec_decoding":{"description":"3-layer Multi-Token Prediction speculative decoding (MTP-3)","args":["--speculative-config","{\"method\": \"mtp\", \"num_speculative_tokens\": 3}"]}},"opt_in_features":["spec_decoding"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":475,"description":"Full precision BF16 — recommended on 8xH200/B200 with TP8+EP"},"fp8":{"model_id":"stepfun-ai/Step-3.7-Flash-FP8","precision":"fp8","vram_minimum_gb":238,"description":"Native FP8 checkpoint — runs on 8xH200/B200 with TP8+EP"},"nvfp4":{"model_id":"stepfun-ai/Step-3.7-Flash-NVFP4","precision":"nvfp4","vram_minimum_gb":119,"tp":4,"extra_args":["--quantization","modelopt","--kv-cache-dtype","fp8","--gpu-memory-utilization","0.9","--async-scheduling"],"description":"NVFP4 quantized — Blackwell only; TP4+EP with FP8 KV cache"}},"compatible_strategies":["single_node_tp","single_node_tep","single_node_dep","multi_node_tp","multi_node_tep","multi_node_dep","pd_cluster"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$c8","hf_org":"stepfun-ai","hf_repo":"Step-3.7-Flash","hf_id":"stepfun-ai/Step-3.7-Flash","hf_released":"2026-05-23T02:13:46.000Z","engines":{"vllm":{"min_version":"nightly"},"sglang":{"engine":"sglang","model_id":"stepfun-ai/Step-3.7-Flash","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":4,"b200":4},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{},"guide":"$c9"}},"default_engine":"vllm"},{"meta":{"title":"Step-3.5-Flash","slug":"step-3.5-flash","provider":"StepFun","description":"Production-grade reasoning MoE (~196B total / 11B active parameters) with hybrid attention schedules, SWA compensation, and multi-token prediction for low-latency long-context inference","date_updated":"2026-04-17","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:99:meta:tasks","performance_headline":"Sparse MoE reasoning model with hybrid attention and step3p5 MTP speculative decoding","related_recipes":[],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:99:meta:hardware"},"model":{"model_id":"stepfun-ai/Step-3.5-Flash","min_vllm_version":"0.11.0","architecture":"moe","parameter_count":"196B","active_parameters":"11B","context_length":262144,"base_args":["--trust-remote-code"],"base_env":{}},"features":{"tool_calling":{"description":"Step-3.5 tool call parser with automatic tool choice","args":["--tool-call-parser","step3p5","--enable-auto-tool-choice"]},"reasoning":{"description":"Step-3.5 reasoning parser for chain-of-thought extraction","args":["--reasoning-parser","step3p5"]},"spec_decoding":{"description":"Multi-Token Prediction speculative decoding with the step3p5_mtp method","args":["--hf-overrides","{\"num_nextn_predict_layers\": 1}","--speculative-config","{\"method\": \"step3p5_mtp\", \"num_speculative_tokens\": 1}"]}},"opt_in_features":["spec_decoding"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":470,"description":"Full precision BF16 — runs on 4xH200/H20/B200"},"fp8":{"model_id":"stepfun-ai/Step-3.5-Flash-FP8","precision":"fp8","vram_minimum_gb":235,"tp":2,"description":"Native FP8 checkpoint (TP not supported beyond 2 — use DP4)"},"int4":{"model_id":"stepfun-ai/Step-3.5-Flash-INT4","precision":"int4","vram_minimum_gb":118,"description":"INT4 quantized weights"},"int8":{"model_id":"stepfun-ai/Step-3.5-Flash-INT8","precision":"int8","vram_minimum_gb":235,"description":"INT8 quantized weights"}},"compatible_strategies":["single_node_tp","single_node_tep","single_node_dep","multi_node_tp","multi_node_tp_pp","multi_node_tep","multi_node_dep","pd_cluster"],"hardware_overrides":{"blackwell":{"extra_args":[],"extra_env":{"VLLM_USE_FLASHINFER_MOE_FP8":"0"}},"amd":{"extra_args":[],"extra_env":{"VLLM_ROCM_USE_AITER":"1","VLLM_ROCM_USE_AITER_MOE":"0"}}},"strategy_overrides":{},"guide":"$ca","hf_org":"stepfun-ai","hf_repo":"Step-3.5-Flash","hf_id":"stepfun-ai/Step-3.5-Flash","hf_released":"2026-02-01T08:03:45.000Z","engines":{"vllm":{"min_version":"0.11.0"},"sglang":{"engine":"sglang","model_id":"stepfun-ai/Step-3.5-Flash","min_version":"v0.5.8","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":4,"mi300x":4,"mi325x":4,"mi355x":4},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","step3p5"]},"reasoning":{"args":["--reasoning-parser","step3p5"]}},"guide":"$cb"}},"default_engine":"vllm"}]],["tencent",[{"meta":{"title":"Hy3-preview","slug":"hy3-preview","provider":"Hunyuan (Tencent)","description":"Tencent Hunyuan Hy3-preview — scaled-up MoE language model (295B total / 21B active) with a 3.8B MTP layer for speculative decoding, 256K context, and hy_v3 tool/reasoning parsers","date_updated":"2026-04-23","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:53:meta:tasks","performance_headline":"Hunyuan Hy3-preview MoE — 295B/21B on 8×H200, 8×H20-3e(141GB), or 8×AMD MI300X/MI355X with MTP","related_recipes":[],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:53:meta:hardware"},"model":{"model_id":"tencent/Hy3-preview","min_vllm_version":"0.20.0","install":{"docker":{"note":"Use the dedicated hy3-preview image until changes land in vllm:latest."}},"architecture":"moe","parameter_count":"295B","active_parameters":"21B","context_length":262144,"base_args":["--trust-remote-code"],"base_env":{}},"features":{"tool_calling":{"description":"Hunyuan v3 tool call parser with automatic tool choice","args":["--tool-call-parser","hy_v3","--enable-auto-tool-choice"]},"reasoning":{"description":"Hunyuan v3 reasoning parser for thinking-mode chain-of-thought extraction","args":["--reasoning-parser","hy_v3"]},"spec_decoding":{"description":"Multi-Token Prediction speculative decoding using the model's built-in MTP layer","args":["--speculative-config","{\"method\":\"mtp\",\"num_speculative_tokens\":1}"]}},"opt_in_features":["spec_decoding"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":708,"description":"Full precision BF16 — 8×H200 or 8×H20-3e(141GB) minimum for weights + KV cache"}},"compatible_strategies":["single_node_tp","single_node_tep","single_node_dep","multi_node_tp","multi_node_tep","multi_node_dep"],"hardware_overrides":{"amd":{"install_note":"Hy3-preview model code is being added in PR #40681. Until it merges, build\nvLLM editable from the PR branch in rocm/vllm-dev:nightly:\n\n docker run -it --device=/dev/kfd --device=/dev/dri --network=host \\\n --ipc=host --shm-size=128g --group-add video --cap-add SYS_PTRACE \\\n --security-opt seccomp=unconfined -v ~/work:/work -w /work \\\n -e PYTHONPATH=/work/vllm rocm/vllm-dev:nightly bash\n git clone -b feature/support_hy_v3 \\\n https://github.com/stevenkuang-tencent/vllm.git\n cd vllm && pip uninstall -y vllm\n SETUPTOOLS_SCM_PRETEND_VERSION=0.20.0.dev0 VLLM_TARGET_DEVICE=rocm \\\n pip install --editable . --no-build-isolation\n\nSetting PYTHONPATH avoids a known editable-install conflict with the\nempty /app/vllm namespace directory shipped in the base image.\n","extra_args":[],"extra_env":{"VLLM_ROCM_USE_AITER":"1","VLLM_ROCM_USE_AITER_MOE":"1","VLLM_ROCM_USE_AITER_MHA":"1","VLLM_ROCM_USE_AITER_RMSNORM":"1","VLLM_ROCM_USE_AITER_LINEAR":"1"}}},"strategy_overrides":{},"guide":"$cc","hf_org":"tencent","hf_repo":"Hy3-preview","hf_id":"tencent/Hy3-preview","hf_released":"2026-04-13T06:07:57.000Z","engines":{"vllm":{"min_version":"0.20.0"},"sglang":{"engine":"sglang","model_id":"tencent/Hy3-preview","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":8,"mi300x":8,"mi325x":8,"mi350x":8,"mi355x":8},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{},"guide":"$cd"}},"default_engine":"vllm"},{"meta":{"title":"HunyuanOCR","slug":"hunyuan-ocr","provider":"Hunyuan (Tencent)","description":"Tencent Hunyuan end-to-end OCR expert VLM (~1B) for online OCR serving with an OpenAI-compatible API","date_updated":"2026-04-17","difficulty":"beginner","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:101:meta:tasks","performance_headline":"Compact 1B end-to-end OCR VLM from the Hunyuan native multimodal family","related_recipes":[]},"model":{"model_id":"tencent/HunyuanOCR","min_vllm_version":"0.11.0","architecture":"dense","parameter_count":"1B","active_parameters":"1B","context_length":32768,"base_args":["--no-enable-prefix-caching","--mm-processor-cache-gb","0"],"base_env":{}},"features":{"text_only":{"description":"Skip loading the vision encoder for text-only workloads — frees VRAM for KV cache. Mutually exclusive with encoder_parallel.","args":["--language-model-only"]},"encoder_parallel":{"description":"Run the vision encoder in data-parallel mode — avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.","args":["--mm-encoder-tp-mode","data"]}},"opt_in_features":["text_only","encoder_parallel"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":2,"description":"Full precision BF16 — single-GPU deployment"}},"compatible_strategies":["single_node_tp","multi_node_tp"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$ce","hf_org":"tencent","hf_repo":"HunyuanOCR","hf_id":"tencent/HunyuanOCR","hf_released":"2025-11-18T04:08:56.000Z"},{"meta":{"title":"Hunyuan-A13B-Instruct","slug":"hunyuan-a13b-instruct","provider":"Hunyuan (Tencent)","description":"Tencent Hunyuan A13B instruct-tuned MoE language model with AITER-accelerated AMD ROCm deployment","date_updated":"2026-04-17","difficulty":"beginner","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:100:meta:tasks","performance_headline":"Hunyuan-A13B MoE with AITER acceleration on AMD MI300X/MI325X/MI355X","related_recipes":[],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:100:meta:hardware"},"model":{"model_id":"tencent/Hunyuan-A13B-Instruct","min_vllm_version":"0.11.0","architecture":"moe","parameter_count":"80B","active_parameters":"13B","context_length":32768,"base_args":["--trust-remote-code"],"base_env":{}},"features":{},"opt_in_features":[],"variants":{"default":{"precision":"bf16","vram_minimum_gb":580,"description":"Full precision BF16 — 2x GPU (TP=2) on AMD MI300X/MI325X/MI355X"},"fp8":{"model_id":"tencent/Hunyuan-A13B-Instruct-FP8","precision":"fp8","vram_minimum_gb":96,"description":"FP8 quantized weights for Hopper/Blackwell GPUs","extra_args":["--kv-cache-dtype","fp8"]}},"compatible_strategies":["single_node_tp","single_node_tep","multi_node_tp","multi_node_tep","multi_node_dep"],"hardware_overrides":{"amd":{"extra_args":[],"extra_env":{"VLLM_ROCM_USE_AITER":"1"}}},"strategy_overrides":{},"guide":"$cf","hf_org":"tencent","hf_repo":"Hunyuan-A13B-Instruct","hf_id":"tencent/Hunyuan-A13B-Instruct","hf_released":"2025-06-25T12:39:52.000Z","engines":{"vllm":{"min_version":"0.11.0"},"sglang":{"engine":"sglang","model_id":"tencent/Hunyuan-A13B-Instruct","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"mi300x":2,"mi325x":2,"mi355x":2},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","hunyuan_a13b"]},"reasoning":{"args":["--reasoning-parser","hunyuan_a13b"]}},"guide":"$d0"}},"default_engine":"vllm"}]],["Wan-AI",[{"meta":{"title":"Wan2.2","slug":"wan2.2","provider":"Wan (Alibaba)","description":"Wan2.2 video generation models — T2V/I2V MoE (14B active) and unified TI2V (5B dense), served via vLLM-Omni","date_updated":"2026-04-27","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:50:meta:tasks","related_recipes":[]},"model":{"model_id":"Wan-AI/Wan2.2-T2V-A14B-Diffusers","min_vllm_version":"0.12.0","architecture":"moe","parameter_count":"28B","active_parameters":"14B","context_length":0,"base_args":[],"base_env":{}},"omni":{"tasks":[{"id":"t2v","vram_minimum_gb":152,"description":"T2V MoE — 14B active parameters"},{"id":"i2v","model_id":"Wan-AI/Wan2.2-I2V-A14B-Diffusers","vram_minimum_gb":40,"description":"I2V MoE — 14B active parameters"},{"id":"ti2v","model_id":"Wan-AI/Wan2.2-TI2V-5B-Diffusers","vram_minimum_gb":20,"description":"Unified Text+Image-to-Video — dense 5B"}]},"dependencies":[{"note":"Pin vllm==0.12.0 for Wan2.2","command":"uv pip install vllm==0.12.0"},{"note":"vllm-omni pinned to a specific commit that includes Wan2.2 text-to-video support","command":"uv pip install git+https://github.com/vllm-project/vllm-omni.git@ef01223c42be10ee260b9f6e5ec31894cd09d86e"}],"features":{},"opt_in_features":[],"variants":{"default":{"precision":"bf16","vram_minimum_gb":152,"description":"BF16 — variants moved to omni.tasks (T2V / I2V / TI2V each pick a different checkpoint)"}},"compatible_strategies":[],"hardware_overrides":{"amd":{"extra_env":{"VLLM_ROCM_USE_AITER":"1","SAFETENSORS_FAST_GPU":"1"}}},"strategy_overrides":{},"guide":"$d1","hf_org":"Wan-AI","hf_repo":"Wan2.2-T2V-A14B-Diffusers","hf_id":"Wan-AI/Wan2.2-T2V-A14B-Diffusers","hf_released":"2025-07-28T09:04:28.000Z","engines":{"vllm":{"min_version":"0.12.0"},"sglang":{"engine":"sglang","model_id":"Wan-AI/Wan2.2-T2V-A14B-Diffusers","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"b200":1,"h200":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{}}},"default_engine":"vllm"}]],["XiaomiMiMo",[{"meta":{"title":"MiMo-V2.5","slug":"mimo-v2-5","provider":"MiMo (Xiaomi)","description":"MiMo-V2.5 is a native omnimodal model with strong agentic capabilities, supporting text, image, video, and audio understanding within a unified architecture","date_updated":"2026-04-27","difficulty":"advanced","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:52:meta:tasks","related_recipes":["XiaomiMiMo/MiMo-V2-Flash"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:52:meta:hardware"},"model":{"model_id":"XiaomiMiMo/MiMo-V2.5","min_vllm_version":"0.21.0","architecture":"moe","parameter_count":"311B","active_parameters":"15B","context_length":1048576,"base_args":["--trust-remote-code","--generation-config","vllm"],"base_env":{}},"features":{"tool_calling":{"description":"MiMo tool-call parser","args":["--tool-call-parser","mimo","--enable-auto-tool-choice"]},"reasoning":{"description":"MiMo reasoning parser","args":["--reasoning-parser","mimo"]},"spec_decoding":{"description":"Multi-Token Prediction","args":["--speculative-config","{\"method\":\"mtp\",\"num_speculative_tokens\":1}"]}},"opt_in_features":["spec_decoding"],"variants":{"default":{"precision":"fp8","vram_minimum_gb":374,"tp":4,"description":"Native FP8 weights (block-wise e4m3 128x128); 4x H200 with TP4","extra_args":["--gpu-memory-utilization","0.95","--max-model-len","auto"]}},"compatible_strategies":["single_node_tp","single_node_tep","multi_node_tp","multi_node_tep","multi_node_dep","pd_cluster"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$d2","hf_org":"XiaomiMiMo","hf_repo":"MiMo-V2.5","hf_id":"XiaomiMiMo/MiMo-V2.5","hf_released":"2026-04-27T13:37:38.000Z","engines":{"vllm":{"min_version":"0.21.0"},"sglang":{"engine":"sglang","model_id":"XiaomiMiMo/MiMo-V2.5","min_version":"v0.5.10","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":4},"variants":{"default":{"precision":"fp8"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","mimo"]},"reasoning":{"args":["--reasoning-parser","qwen3"]}},"guide":"$d3"}},"default_engine":"vllm"},{"meta":{"title":"MiMo-V2.5-Pro","slug":"mimo-v2-5-pro","provider":"MiMo (Xiaomi)","description":"Xiaomi's flagship MoE reasoning model (1.02T total / 42B active) with hybrid attention, native FP8 weights, and Multi-Token Prediction","date_updated":"2026-04-27","difficulty":"advanced","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:51:meta:tasks","related_recipes":["XiaomiMiMo/MiMo-V2.5"],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:51:meta:hardware"},"model":{"model_id":"XiaomiMiMo/MiMo-V2.5-Pro","min_vllm_version":"0.21.0","architecture":"moe","parameter_count":"1T","active_parameters":"42B","context_length":1048576,"base_args":["--trust-remote-code","--max-model-len auto","--generation-config","vllm"],"base_env":{}},"features":{"tool_calling":{"description":"MiMo tool-call parser","args":["--tool-call-parser","mimo","--enable-auto-tool-choice"]},"reasoning":{"description":"MiMo reasoning parser","args":["--reasoning-parser","mimo"]},"spec_decoding":{"description":"Multi-Token Prediction","args":["--speculative-config","{\"method\":\"mtp\",\"num_speculative_tokens\":1}"]}},"opt_in_features":["spec_decoding"],"variants":{"default":{"precision":"fp8","vram_minimum_gb":1224,"description":"Native FP8 weights (block-wise e4m3 128x128); 8x H200 with TP8","extra_args":["--tensor-parallel-size","8","--gpu-memory-utilization","0.95","--max-model-len","auto"]}},"compatible_strategies":["single_node_tp","single_node_tep","pd_cluster"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$d4","hf_org":"XiaomiMiMo","hf_repo":"MiMo-V2.5-Pro","hf_id":"XiaomiMiMo/MiMo-V2.5-Pro","hf_released":"2026-04-27T12:52:53.000Z","engines":{"vllm":{"min_version":"0.21.0"},"sglang":{"engine":"sglang","model_id":"XiaomiMiMo/MiMo-V2.5-Pro","min_version":"v0.5.10","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":8},"variants":{"default":{"precision":"fp8"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","mimo"]},"reasoning":{"args":["--reasoning-parser","mimo"]}},"guide":"$d5"}},"default_engine":"vllm"},{"meta":{"title":"MiMo-V2-Flash","slug":"mimo-v2-flash","provider":"MiMo (Xiaomi)","description":"Xiaomi's MoE reasoning model (309B total / 15B active) with hybrid attention and MTP for fast agentic workflows","date_updated":"2026-04-17","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:74:meta:tasks","related_recipes":[],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:74:meta:hardware"},"model":{"model_id":"XiaomiMiMo/MiMo-V2-Flash","min_vllm_version":"0.11.0","architecture":"moe","parameter_count":"309B","active_parameters":"15B","context_length":262144,"base_args":["--trust-remote-code","--generation-config","vllm"],"base_env":{}},"features":{"tool_calling":{"description":"Qwen3 XML tool-call parser","args":["--tool-call-parser","qwen3_xml"]},"reasoning":{"description":"Qwen3 reasoning parser","args":["--reasoning-parser","qwen3"]}},"opt_in_features":[],"variants":{"default":{"precision":"fp8","vram_minimum_gb":371,"description":"Native FP8 weights; 4x H200 recommended with TP4","extra_args":["--tensor-parallel-size","4","--gpu-memory-utilization","0.9"]}},"compatible_strategies":["single_node_tp","single_node_tep","multi_node_tp","multi_node_tp_pp","multi_node_tep","multi_node_dep","pd_cluster"],"hardware_overrides":{"amd":{"extra_args":[],"extra_env":{"VLLM_ROCM_USE_AITER":"0"}}},"strategy_overrides":{},"guide":"$d6","hf_org":"XiaomiMiMo","hf_repo":"MiMo-V2-Flash","hf_id":"XiaomiMiMo/MiMo-V2-Flash","hf_released":"2025-12-16T08:47:02.000Z","engines":{"vllm":{"min_version":"0.11.0"},"sglang":{"engine":"sglang","model_id":"XiaomiMiMo/MiMo-V2-Flash","min_version":"v0.5.10","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":4,"mi300x":2,"mi325x":2},"variants":{"default":{"precision":"fp8"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","mimo"]},"reasoning":{"args":["--reasoning-parser","qwen3"]}},"guide":"$d7"}},"default_engine":"vllm"}]],["zai-org",[{"meta":{"title":"GLM-5.1","slug":"glm-5.1","provider":"GLM (Z-AI)","description":"GLM-5.1 refreshed version of GLM-5 — frontier-scale MoE language model (~744B total parameters) with MTP speculative decoding and thinking mode","date_updated":"2026-05-21","difficulty":"advanced","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:23:meta:tasks","performance_headline":"Refreshed GLM-5 series MoE with improved reasoning, coding, and agentic performance","related_recipes":[],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:23:meta:hardware"},"model":{"model_id":"zai-org/GLM-5.1","min_vllm_version":"0.19.1","architecture":"moe","parameter_count":"744B","active_parameters":"40B","context_length":202752,"base_args":["--trust-remote-code","--chat-template-content-format=string"],"base_env":{}},"features":{"tool_calling":{"description":"GLM-4.7 tool call parser with automatic tool choice","args":["--tool-call-parser","glm47","--enable-auto-tool-choice"]},"reasoning":{"description":"GLM-4.5 reasoning parser — thinking mode enabled by default on requests","args":["--reasoning-parser","glm45"]},"spec_decoding":{"description":"Multi-Token Prediction speculative decoding (3 draft tokens)","args":["--speculative-config.method","mtp","--speculative-config.num_speculative_tokens","3"]}},"opt_in_features":["spec_decoding"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":1786,"description":"Full precision BF16 — requires multi-node deployment"},"fp8":{"model_id":"zai-org/GLM-5.1-FP8","precision":"fp8","vram_minimum_gb":893,"description":"Native FP8 checkpoint — 8xH200/H20 (141GB × 8) single-node serving"},"nvfp4":{"model_id":"nvidia/GLM-5.1-NVFP4","precision":"nvfp4","vram_minimum_gb":446,"description":"NVIDIA NVFP4 quantized weights for Blackwell GPUs","extra_args":["--kv-cache-dtype","fp8"],"extra_env":{"VLLM_USE_FLASHINFER_MOE_FP4":"1"}}},"compatible_strategies":["single_node_tp","single_node_tep","multi_node_tp","multi_node_tp_pp","multi_node_tep","multi_node_dep","pd_cluster"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$d8","hf_org":"zai-org","hf_repo":"GLM-5.1","hf_id":"zai-org/GLM-5.1","hf_released":"2026-04-03T09:28:47.000Z","engines":{"vllm":{"min_version":"0.19.1"},"sglang":{"engine":"sglang","model_id":"zai-org/GLM-5.1","min_version":"v0.5.10","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h100":32,"h200":16,"b200":16},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","glm47"]},"reasoning":{"args":["--reasoning-parser","glm45"]}},"guide":"$d9"}},"default_engine":"vllm"},{"meta":{"title":"GLM-5","slug":"glm-5","provider":"GLM (Z-AI)","description":"GLM-5 frontier-scale MoE language model (~744B total parameters, 28.5T training tokens) with asynchronous RL infrastructure for reasoning, coding, and agentic tasks","date_updated":"2026-04-17","difficulty":"advanced","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:107:meta:tasks","performance_headline":"Frontier-scale MoE with 744B parameters, best-in-class open-source performance on reasoning/coding/agents","related_recipes":[],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:107:meta:hardware"},"model":{"model_id":"zai-org/GLM-5","min_vllm_version":"0.16.0","architecture":"moe","parameter_count":"744B","active_parameters":"40B","context_length":202752,"base_args":["--trust-remote-code","--chat-template-content-format=string"],"base_env":{}},"dependencies":[{"note":"Pin vllm==0.19.0 (avoid nightly)","command":"uv pip install \"vllm==0.19.0\" --torch-backend=auto"},{"note":"GLM-5 requires transformers >= 5.4.0","command":"uv pip install \"transformers>=5.4.0\""},{"note":"Optional: DeepGEMM for FP8 MoE kernels (FP8 variant only)","command":"bash install_deepgemm.sh","optional":true}],"features":{"tool_calling":{"description":"GLM-4.7 tool call parser with automatic tool choice","args":["--tool-call-parser","glm47","--enable-auto-tool-choice"]},"reasoning":{"description":"GLM-4.5 reasoning parser — thinking mode enabled by default on requests","args":["--reasoning-parser","glm45"]},"spec_decoding":{"description":"Multi-Token Prediction speculative decoding (3 draft tokens)","args":["--speculative-config.method","mtp","--speculative-config.num_speculative_tokens","3"]}},"opt_in_features":["spec_decoding"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":1786,"description":"Full precision BF16 — requires multi-node deployment"},"fp8":{"model_id":"zai-org/GLM-5-FP8","precision":"fp8","vram_minimum_gb":893,"description":"Native FP8 checkpoint — 8xH200/H20 (141GB x 8) single-node serving"},"nvfp4":{"model_id":"nvidia/GLM-5-NVFP4","precision":"nvfp4","vram_minimum_gb":446,"description":"NVIDIA NVFP4 quantized weights for Blackwell GPUs","extra_args":["--kv-cache-dtype","fp8"],"extra_env":{"VLLM_USE_FLASHINFER_MOE_FP4":"1"}}},"compatible_strategies":["single_node_tp","single_node_tep","multi_node_tp","multi_node_tp_pp","multi_node_tep","multi_node_dep","pd_cluster"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$da","hf_org":"zai-org","hf_repo":"GLM-5","hf_id":"zai-org/GLM-5","hf_released":"2026-02-11T04:55:46.000Z","engines":{"vllm":{"min_version":"0.16.0"},"sglang":{"engine":"sglang","model_id":"zai-org/GLM-5","min_version":"v0.5.8","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h100":32,"h200":16,"b200":16},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","glm47"]},"reasoning":{"args":["--reasoning-parser","glm45"]}},"guide":"$db"}},"default_engine":"vllm"},{"meta":{"title":"GLM-OCR","slug":"glm-ocr","provider":"GLM (Z-AI)","description":"GLM-OCR image-to-text model with built-in MTP speculative decoding for high-throughput OCR serving","date_updated":"2026-04-17","difficulty":"beginner","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:110:meta:tasks","performance_headline":"Multilingual end-to-end OCR VLM with MTP-accelerated decoding","related_recipes":[]},"model":{"model_id":"zai-org/GLM-OCR","min_vllm_version":"0.12.0","architecture":"dense","parameter_count":"0.9B","active_parameters":"0.9B","context_length":131072,"base_args":[],"base_env":{}},"dependencies":[{"note":"GLM-OCR requires the nightly vllm wheel","command":"uv pip install -U vllm --pre --extra-index-url https://wheels.vllm.ai/nightly"},{"note":"transformers from source for GLM-OCR tokenizer support","command":"uv pip install git+https://github.com/huggingface/transformers.git"}],"features":{"spec_decoding":{"description":"Multi-Token Prediction speculative decoding using the model's built-in MTP layers","args":["--speculative-config.method","mtp","--speculative-config.num_speculative_tokens","1"]},"text_only":{"description":"Skip loading the vision encoder for text-only workloads — frees VRAM for KV cache. Mutually exclusive with encoder_parallel.","args":["--language-model-only"]},"encoder_parallel":{"description":"Run the vision encoder in data-parallel mode — avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.","args":["--mm-encoder-tp-mode","data"]}},"opt_in_features":["text_only","encoder_parallel"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":2,"description":"Full precision BF16 — single-GPU deployment"}},"compatible_strategies":["single_node_tp","multi_node_tp"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$dc","hf_org":"zai-org","hf_repo":"GLM-OCR","hf_id":"zai-org/GLM-OCR","hf_released":"2026-01-30T04:24:21.000Z","engines":{"vllm":{"min_version":"0.12.0"},"sglang":{"engine":"sglang","model_id":"zai-org/GLM-OCR","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{},"guide":"$dd"}},"default_engine":"vllm"},{"meta":{"title":"GLM-Image","slug":"glm-image","provider":"GLM (Z-AI)","description":"Hybrid autoregressive + diffusion image generation model — text-to-image and image-to-image with strong text rendering and knowledge-intensive generation","date_updated":"2026-04-17","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:109:meta:tasks","performance_headline":"9B AR generator + 7B DiT decoder, state-of-the-art text rendering in generated images","related_recipes":[],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:109:meta:hardware"},"model":{"model_id":"zai-org/GLM-Image","min_vllm_version":"0.11.0","architecture":"dense","parameter_count":"16B","active_parameters":"16B","context_length":4096,"base_args":["--trust-remote-code"],"base_env":{}},"omni":{"tasks":["t2i","i2i"]},"dependencies":[{"note":"vllm-omni provides the diffusion decoder path","command":"uv pip install vllm-omni"},{"note":"transformers from source (GLM-Image tokenizer)","command":"uv pip install git+https://github.com/huggingface/transformers.git"},{"note":"diffusers from source — required for the DiT decoder","command":"uv pip install git+https://github.com/huggingface/diffusers.git"}],"features":{},"opt_in_features":[],"variants":{"default":{"precision":"bf16","vram_minimum_gb":38,"description":"Single-GPU deployment (~33 GB for model weights, plus activation headroom)"}},"compatible_strategies":["single_node_tp","multi_node_tp"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$de","hf_org":"zai-org","hf_repo":"GLM-Image","hf_id":"zai-org/GLM-Image","hf_released":"2026-01-08T09:39:56.000Z"},{"meta":{"title":"GLM-4.7","slug":"glm-4.7","provider":"GLM (Z-AI)","description":"GLM-4.7 MoE language model (~358B total parameters) with MTP speculative decoding, updated tool call parser, and reasoning support","date_updated":"2026-04-17","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:106:meta:tasks","performance_headline":"Latest GLM-4.X release with updated glm47 tool call parser and MTP speculative decoding","related_recipes":[],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:106:meta:hardware"},"model":{"model_id":"zai-org/GLM-4.7","min_vllm_version":"0.11.0","architecture":"moe","parameter_count":"358B","active_parameters":"32B","context_length":202752,"base_args":["--trust-remote-code"],"base_env":{}},"dependencies":[{"note":"GLM-4.7 requires the nightly vllm wheel","command":"uv pip install -U vllm --pre --extra-index-url https://wheels.vllm.ai/nightly"},{"note":"transformers from source — GLM-4.7 tokenizer is newer than any release","command":"uv pip install git+https://github.com/huggingface/transformers.git"}],"features":{"tool_calling":{"description":"GLM-4.7 tool call parser with automatic tool choice","args":["--tool-call-parser","glm47","--enable-auto-tool-choice"]},"reasoning":{"description":"GLM-4.5 reasoning parser for chain-of-thought extraction","args":["--reasoning-parser","glm45"]},"spec_decoding":{"description":"Multi-Token Prediction speculative decoding using the model's built-in MTP layers","args":["--speculative-config.method","mtp","--speculative-config.num_speculative_tokens","1"]}},"opt_in_features":["spec_decoding"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":859,"description":"Full precision BF16 on 8xH200 or equivalent"},"fp8":{"model_id":"zai-org/GLM-4.7-FP8","precision":"fp8","vram_minimum_gb":430,"description":"Native FP8 checkpoint with minimal accuracy loss"},"nvfp4":{"model_id":"nvidia/GLM-4.7-NVFP4","precision":"nvfp4","vram_minimum_gb":215,"description":"NVIDIA NVFP4 quantized weights for Blackwell GPUs","extra_args":["--kv-cache-dtype","fp8"],"extra_env":{"VLLM_USE_FLASHINFER_MOE_FP4":"1"}}},"compatible_strategies":["single_node_tp","single_node_tep","single_node_dep","multi_node_tp","multi_node_tp_pp","multi_node_dep","multi_node_tep","pd_cluster"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$df","hf_org":"zai-org","hf_repo":"GLM-4.7","hf_id":"zai-org/GLM-4.7","hf_released":"2025-12-22T07:45:52.000Z","engines":{"vllm":{"min_version":"0.11.0"},"sglang":{"engine":"sglang","model_id":"zai-org/GLM-4.7","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":8,"mi300x":8,"mi325x":8,"mi355x":8},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","glm47"]},"reasoning":{"args":["--reasoning-parser","glm45"]}},"guide":"$e0"}},"default_engine":"vllm"},{"meta":{"title":"GLM-ASR-Nano-2512","slug":"glm-asr-nano-2512","provider":"GLM (Z-AI)","description":"Open-source speech recognition model (~2B) with strong dialect support (Cantonese and others) and robust low-volume speech transcription","date_updated":"2026-04-17","difficulty":"beginner","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:108:meta:tasks","performance_headline":"Outperforms Whisper V3 on multiple benchmarks at compact 1.5B active / 2B total size","related_recipes":[]},"model":{"model_id":"zai-org/GLM-ASR-Nano-2512","min_vllm_version":"0.14.1","architecture":"dense","parameter_count":"2.3B","active_parameters":"1.5B","context_length":8192,"base_args":[],"base_env":{}},"dependencies":[{"note":"Audio extras required for ASR (requires vllm>=0.14.1)","command":"uv pip install -U \"vllm[audio]\" --torch-backend auto"},{"note":"Install transformers from source for GLM-ASR tokenizer support","command":"uv pip install git+https://github.com/huggingface/transformers.git"}],"features":{},"opt_in_features":[],"variants":{"default":{"precision":"bf16","vram_minimum_gb":11,"description":"Full precision BF16 — single-GPU deployment"}},"compatible_strategies":["single_node_tp","multi_node_tp"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$e1","hf_org":"zai-org","hf_repo":"GLM-ASR-Nano-2512","hf_id":"zai-org/GLM-ASR-Nano-2512","hf_released":"2025-12-09T09:07:41.000Z","engines":{"vllm":{"min_version":"0.14.1"},"sglang":{"engine":"sglang","model_id":"zai-org/GLM-ASR-Nano-2512","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{},"guide":"$e2"}},"default_engine":"vllm"},{"meta":{"title":"GLM-4.6V","slug":"glm-4.6v","provider":"GLM (Z-AI)","description":"GLM-4.6 vision-language MoE model — image-text-to-text with 128K context, native FP8 checkpoint, and expert parallelism","date_updated":"2026-04-17","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:105:meta:tasks","performance_headline":"Updated GLM-V series with 128K context length and native FP8","related_recipes":[],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:105:meta:hardware"},"model":{"model_id":"zai-org/GLM-4.6V","min_vllm_version":"0.12.0","architecture":"moe","parameter_count":"107B","active_parameters":"12B","context_length":131072,"base_args":["--trust-remote-code","--enable-expert-parallel","--allowed-local-media-path","/","--mm-encoder-tp-mode","data","--mm-processor-cache-type","shm"],"base_env":{}},"features":{"tool_calling":{"description":"GLM-4.5 tool call parser with automatic tool choice","args":["--tool-call-parser","glm45","--enable-auto-tool-choice"]},"reasoning":{"description":"GLM-4.5 reasoning parser","args":["--reasoning-parser","glm45"]},"text_only":{"description":"Skip loading the vision encoder for text-only workloads — frees VRAM for KV cache. Mutually exclusive with encoder_parallel.","args":["--language-model-only"]},"encoder_parallel":{"description":"Run the vision encoder in data-parallel mode — avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.","args":["--mm-encoder-tp-mode","data"]}},"opt_in_features":["text_only"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":257,"description":"Full precision BF16 — runs on 4xH100/H200"},"fp8":{"model_id":"zai-org/GLM-4.6V-FP8","precision":"fp8","vram_minimum_gb":128,"description":"Native FP8 checkpoint with minimal accuracy loss"}},"compatible_strategies":["single_node_tp","single_node_tep","single_node_dep","multi_node_tp","multi_node_tp_pp","multi_node_tep","multi_node_dep"],"hardware_overrides":{"amd":{"extra_args":["--mm-encoder-tp-mode","data","--allowed-local-media-path","/"],"extra_env":{"VLLM_ROCM_USE_AITER":"1","SAFETENSORS_FAST_GPU":"1"}}},"strategy_overrides":{},"guide":"$e3","hf_org":"zai-org","hf_repo":"GLM-4.6V","hf_id":"zai-org/GLM-4.6V","hf_released":"2025-12-07T07:20:45.000Z","engines":{"vllm":{"min_version":"0.12.0"},"sglang":{"engine":"sglang","model_id":"zai-org/GLM-4.6V","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h100":8,"h200":8,"b200":8},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","glm45"]},"reasoning":{"args":["--reasoning-parser","glm45"]}},"guide":"$e4"}},"default_engine":"vllm"},{"meta":{"title":"Glyph","slug":"glyph","provider":"GLM (Z-AI)","description":"Visual-text compression framework that renders long text into images and processes them with a reasoning VLM, scaling effective context length","date_updated":"2026-04-17","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:111:meta:tasks","performance_headline":"Reasoning multimodal model for visual-text compression, single-GPU deployable","related_recipes":[],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:111:meta:hardware"},"model":{"model_id":"zai-org/Glyph","min_vllm_version":"0.11.0","architecture":"dense","parameter_count":"10B","active_parameters":"10B","context_length":131072,"base_args":["--no-enable-prefix-caching","--mm-processor-cache-gb","0","--limit-mm-per-prompt.video","0"],"base_env":{}},"features":{"reasoning":{"description":"GLM-4.5 reasoning parser for extracting reasoning traces from Glyph outputs","args":["--reasoning-parser","glm45"]},"text_only":{"description":"Skip loading the vision encoder for text-only workloads — frees VRAM for KV cache. Mutually exclusive with encoder_parallel.","args":["--language-model-only"]},"encoder_parallel":{"description":"Run the vision encoder in data-parallel mode — avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.","args":["--mm-encoder-tp-mode","data"]}},"opt_in_features":["text_only","encoder_parallel"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":24,"description":"Full precision BF16 — single-GPU deployment on 1xH100"}},"compatible_strategies":["single_node_tp","multi_node_tp"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$e5","hf_org":"zai-org","hf_repo":"Glyph","hf_id":"zai-org/Glyph","hf_released":"2025-10-25T06:19:07.000Z","engines":{"vllm":{"min_version":"0.11.0"},"sglang":{"engine":"sglang","model_id":"zai-org/Glyph","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h100":1,"mi300x":1,"mi325x":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"reasoning":{"args":["--reasoning-parser","glm45"]}},"guide":"$e6"}},"default_engine":"vllm"},{"meta":{"title":"GLM-4.6","slug":"glm-4.6","provider":"GLM (Z-AI)","description":"GLM-4.6 MoE language model (~357B total parameters, BF16) with MTP speculative decoding, native tool calling and reasoning","date_updated":"2026-04-17","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:104:meta:tasks","performance_headline":"Updated GLM-4.X series MoE model with native FP8 and BF16, MTP speculative decoding","related_recipes":[],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:104:meta:hardware"},"model":{"model_id":"zai-org/GLM-4.6","min_vllm_version":"0.11.0","architecture":"moe","parameter_count":"357B","active_parameters":"32B","context_length":202752,"base_args":["--trust-remote-code"],"base_env":{}},"dependencies":[{"command":"docker pull image.sourcefind.cn:5000/dcu/admin/base/custom:vllm0.6.2-ubuntu22.04-dtk25.04-rc7-das1.5-py3.10","brand":"Hygon","note":"Hygon DCU K100-AI runs vLLM via the Sourcefind DTK image (ROCm/HIP). Set HSA_OVERRIDE_GFX_VERSION=9.2.8; device flags --device=/dev/kfd --device=/dev/dri -v /opt/hyhal:/opt/hyhal:ro; serve with --enforce-eager --dtype float16 (CUDA-graph capture unreliable; K100-AI is fp16/bf16 only — FP8 checkpoints run as fp16). See https://docs.gpustack.ai/0.5/tutorials/running-inference-with-hygon-dcus/"}],"features":{"tool_calling":{"description":"GLM-4.5 tool call parser with automatic tool choice","args":["--tool-call-parser","glm45","--enable-auto-tool-choice"]},"reasoning":{"description":"GLM-4.5 reasoning parser for chain-of-thought extraction","args":["--reasoning-parser","glm45"]},"spec_decoding":{"description":"Multi-Token Prediction speculative decoding using the model's built-in MTP layers","args":["--speculative-config.method","mtp","--speculative-config.num_speculative_tokens","1"]}},"opt_in_features":["spec_decoding"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":857,"description":"Full precision BF16 on 8xH200 or equivalent"},"fp8":{"model_id":"zai-org/GLM-4.6-FP8","precision":"fp8","vram_minimum_gb":428,"description":"Native FP8 checkpoint with minimal accuracy loss — recommended for cost-efficient serving"}},"compatible_strategies":["single_node_tp","single_node_tep","single_node_dep","multi_node_tp","multi_node_tp_pp","multi_node_dep","multi_node_tep","pd_cluster"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$e7","hf_org":"zai-org","hf_repo":"GLM-4.6","hf_id":"zai-org/GLM-4.6","hf_released":"2025-09-29T18:22:51.000Z","engines":{"vllm":{"min_version":"0.11.0"},"sglang":{"engine":"sglang","model_id":"zai-org/GLM-4.6","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":8,"b200":8},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","glm45"]},"reasoning":{"args":["--reasoning-parser","glm45"]}},"guide":"$e8"}},"default_engine":"vllm"},{"meta":{"title":"GLM-4.5V","slug":"glm-4.5v","provider":"GLM (Z-AI)","description":"GLM-4.5 vision-language MoE model (~107B parameters, BF16) with image-text-to-text capability, 64K context, expert parallelism, and native FP8","date_updated":"2026-04-17","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:103:meta:tasks","performance_headline":"Multimodal GLM-4.5V with native FP8 and expert parallelism, deploys on 4xH100","related_recipes":[],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:103:meta:hardware"},"model":{"model_id":"zai-org/GLM-4.5V","min_vllm_version":"0.12.0","architecture":"moe","parameter_count":"107B","active_parameters":"12B","context_length":65536,"base_args":["--trust-remote-code","--enable-expert-parallel","--allowed-local-media-path","/","--mm-encoder-tp-mode","data","--mm-processor-cache-type","shm"],"base_env":{}},"features":{"tool_calling":{"description":"GLM-4.5 tool call parser with automatic tool choice","args":["--tool-call-parser","glm45","--enable-auto-tool-choice"]},"reasoning":{"description":"GLM-4.5 reasoning parser","args":["--reasoning-parser","glm45"]},"text_only":{"description":"Skip loading the vision encoder for text-only workloads — frees VRAM for KV cache. Mutually exclusive with encoder_parallel.","args":["--language-model-only"]},"encoder_parallel":{"description":"Run the vision encoder in data-parallel mode — avoids TP comm overhead on the small encoder. Mutually exclusive with text_only.","args":["--mm-encoder-tp-mode","data"]}},"opt_in_features":["text_only"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":257,"description":"Full precision BF16 — runs on 4xH100/H200"},"fp8":{"model_id":"zai-org/GLM-4.5V-FP8","precision":"fp8","vram_minimum_gb":128,"description":"Native FP8 checkpoint with minimal accuracy loss — recommended for cost-efficient serving"}},"compatible_strategies":["single_node_tp","single_node_tep","single_node_dep","multi_node_tp","multi_node_tp_pp","multi_node_tep","multi_node_dep"],"hardware_overrides":{"amd":{"extra_args":["--mm-encoder-tp-mode","data","--allowed-local-media-path","/"],"extra_env":{"VLLM_ROCM_USE_AITER":"1","SAFETENSORS_FAST_GPU":"1"}}},"strategy_overrides":{},"guide":"$e9","hf_org":"zai-org","hf_repo":"GLM-4.5V","hf_id":"zai-org/GLM-4.5V","hf_released":"2025-08-10T13:55:30.000Z","engines":{"vllm":{"min_version":"0.12.0"},"sglang":{"engine":"sglang","model_id":"zai-org/GLM-4.5V","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h100":8,"h200":8},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","glm45"]},"reasoning":{"args":["--reasoning-parser","glm45"]}},"guide":"$ea"}},"default_engine":"vllm"},{"meta":{"title":"GLM-4.5","slug":"glm-4.5","provider":"GLM (Z-AI)","description":"GLM-4.5 MoE language model (~358B total parameters, BF16) with built-in MTP layers for speculative decoding and native tool calling","date_updated":"2026-04-17","difficulty":"intermediate","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:102:meta:tasks","performance_headline":"GLM-4.X series MoE model with native FP8 and BF16 support and MTP speculative decoding","related_recipes":[],"hardware":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:102:meta:hardware"},"model":{"model_id":"zai-org/GLM-4.5","min_vllm_version":"0.11.0","architecture":"moe","parameter_count":"358B","active_parameters":"32B","context_length":131072,"base_args":["--trust-remote-code"],"base_env":{}},"dependencies":[{"command":"docker pull image.sourcefind.cn:5000/dcu/admin/base/custom:vllm0.6.2-ubuntu22.04-dtk25.04-rc7-das1.5-py3.10","brand":"Hygon","note":"Hygon DCU K100-AI runs vLLM via the Sourcefind DTK image (ROCm/HIP). Set HSA_OVERRIDE_GFX_VERSION=9.2.8; device flags --device=/dev/kfd --device=/dev/dri -v /opt/hyhal:/opt/hyhal:ro; serve with --enforce-eager --dtype float16 (CUDA-graph capture unreliable; K100-AI is fp16/bf16 only — FP8 checkpoints run as fp16). See https://docs.gpustack.ai/0.5/tutorials/running-inference-with-hygon-dcus/"},{"command":"docker pull wjie520/vllm_kunlun:uv_base","brand":"Kunlunxin","note":"Kunlunxin (Baidu) P800 runs vLLM via the out-of-tree vLLM-Kunlun plugin (P800 only). In the image: pip install vllm==0.15.1 --no-deps, build the plugin (v0.15.1-dev) + kunlun op wheels (kunlun_ops/xspeedgate_ops). TP across 8×P800; int8/awq/gptq/fp8. See https://github.com/baidu/vLLM-Kunlun"}],"features":{"tool_calling":{"description":"GLM-4.5 tool call parser with automatic tool choice","args":["--tool-call-parser","glm45","--enable-auto-tool-choice"]},"reasoning":{"description":"GLM-4.5 reasoning parser for chain-of-thought extraction","args":["--reasoning-parser","glm45"]},"spec_decoding":{"description":"Multi-Token Prediction speculative decoding using the model's built-in MTP layers","args":["--speculative-config.method","mtp","--speculative-config.num_speculative_tokens","1"]}},"opt_in_features":["spec_decoding"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":859,"description":"Full precision BF16 on 8xH200 or equivalent"},"fp8":{"model_id":"zai-org/GLM-4.5-FP8","precision":"fp8","vram_minimum_gb":430,"description":"Native FP8 checkpoint with minimal accuracy loss — recommended for cost-efficient serving"}},"compatible_strategies":["single_node_tp","single_node_tep","single_node_dep","multi_node_tp","multi_node_tp_pp","multi_node_dep","multi_node_tep","pd_cluster"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$eb","hf_org":"zai-org","hf_repo":"GLM-4.5","hf_id":"zai-org/GLM-4.5","hf_released":"2025-07-20T03:25:36.000Z","engines":{"vllm":{"min_version":"0.11.0"},"sglang":{"engine":"sglang","model_id":"zai-org/GLM-4.5","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":8,"mi300x":8,"mi325x":8,"mi355x":8},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","glm45"]},"reasoning":{"args":["--reasoning-parser","glm45"]}},"guide":"$ec"}},"default_engine":"vllm"},{"meta":{"title":"GLM-GA","slug":"glm-ga","provider":"GLM (Z-AI)","description":"GLM-GA dense vision-language model (~10B) — image and video understanding with 128K context and dedicated Glmga video processor (fps=2, up to 640 frames)","date_updated":"2026-05-27","difficulty":"beginner","tasks":"$2:props:children:props:children:0:props:children:props:children:1:props:children:props:recipes:20:meta:tasks","performance_headline":"Dense VLM based on GLM-4.6V-Flash with dedicated video processor supporting long videos up to 640 frames","related_recipes":["zai-org/GLM-4.6V"]},"model":{"model_id":"zai-org/GLM-GA","min_vllm_version":"0.21.0","architecture":"dense","parameter_count":"10B","context_length":131072,"base_args":["--trust-remote-code","--allowed-local-media-path","/","--mm-processor-cache-type","shm","--reasoning-parser","glm45"],"base_env":{"VLLM_VIDEO_LOADER_BACKEND":"glm4_6v"}},"dependencies":[{"command":"uv pip install git+https://github.com/huggingface/transformers.git","note":"Latest transformers from main branch for GlmgaImageProcessor / GlmgaVideoProcessor support"}],"features":{"text_only":{"description":"Skip loading the vision encoder for text-only workloads — frees VRAM for KV cache.","args":["--language-model-only"]}},"opt_in_features":["text_only"],"variants":{"default":{"precision":"bf16","vram_minimum_gb":29,"description":"Full precision BF16 — runs on a single H100/H200"}},"compatible_strategies":["single_node_tp","multi_node_tp"],"hardware_overrides":{},"strategy_overrides":{},"guide":"$ed","hf_org":"zai-org","hf_repo":"GLM-GA","hf_id":"zai-org/GLM-GA","hf_released":null,"engines":{"vllm":{"min_version":"0.21.0"},"sglang":{"engine":"sglang","model_id":"zai-org/GLM-GA","min_version":"v0.5.6","serve_binary":"python3 -m sglang.launch_server","base_args":["--trust-remote-code"],"tp_by_hardware":{"h200":1,"b200":1},"variants":{"default":{"precision":"bf16"}},"strategies":{"single_node_tp":{}},"features":{"tool_calling":{"args":["--tool-call-parser","glm45"]},"reasoning":{"args":["--reasoning-parser","glm45"]}},"guide":"$ee"}},"default_engine":"vllm"}]]]}],"$Lef"]}]