
ServerArgs

The ServerArgs class contains all configuration options for launching an SGLang server or engine. These arguments control model loading, memory management, parallelism, kernel backends, and optimization settings.

Usage

from sglang import Engine
from sglang.srt.server_args import ServerArgs

# Option 1: Pass arguments directly to Engine
engine = Engine(
    model_path="meta-llama/Llama-3.1-8B-Instruct",
    tp_size=2,
    mem_fraction_static=0.85
)

# Option 2: Create ServerArgs first
server_args = ServerArgs(
    model_path="meta-llama/Llama-3.1-8B-Instruct",
    tp_size=2,
    mem_fraction_static=0.85
)
engine = Engine(server_args=server_args)

Model and Tokenizer

model_path
str
required
Path to the model on Hugging Face Hub or local filesystem.
model_path="meta-llama/Llama-3.1-8B-Instruct"
# or
model_path="/path/to/local/model"
tokenizer_path
Optional[str]
default:"None"
Path to tokenizer. Defaults to model_path if not specified.
tokenizer_mode
str
default:"auto"
Tokenizer mode. Options: "auto", "slow", "fast".
skip_tokenizer_init
bool
default:"False"
Skip tokenizer initialization. Useful when passing pre-tokenized input_ids.
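When tokenizer initialization is skipped, you tokenize externally and pass token IDs directly. A minimal sketch (the token IDs below are placeholders):

engine = Engine(
    model_path="meta-llama/Llama-3.1-8B-Instruct",
    skip_tokenizer_init=True
)
# Tokenize with your own tokenizer, then pass IDs instead of text.
output = engine.generate(
    input_ids=[128000, 9906, 1917],  # placeholder token IDs
    sampling_params={"max_new_tokens": 32}
)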
load_format
str
default:"auto"
Model weight loading format. Options: "auto", "pt", "safetensors", "npcache", "dummy", "gguf", "bitsandbytes".
trust_remote_code
bool
default:"False"
Trust remote code when loading models from Hugging Face.
context_length
Optional[int]
default:"None"
Maximum context length. Auto-detected from model config if not specified.
revision
Optional[str]
default:"None"
Model revision (branch, tag, or commit) to use from Hugging Face.
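For example, loading a model that ships custom code, pinned to a specific revision (the model name and tag are illustrative):

engine = Engine(
    model_path="some-org/custom-model",  # hypothetical repo with remote code
    trust_remote_code=True,
    revision="v1.0"
)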

HTTP Server

host
str
default:"127.0.0.1"
Server host address.
port
int
default:"30000"
Server port number.
api_key
Optional[str]
default:"None"
API key for authentication.
served_model_name
Optional[str]
default:"None"
Model name to report in API responses. Defaults to model_path.
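These settings take effect when running the standalone HTTP server (e.g. via python -m sglang.launch_server). A sketch of the corresponding ServerArgs (values are examples):

server_args = ServerArgs(
    model_path="meta-llama/Llama-3.1-8B-Instruct",
    host="0.0.0.0",                    # listen on all interfaces
    port=30000,
    api_key="my-secret-key",           # clients must send this key
    served_model_name="llama-3.1-8b"   # name reported in API responses
)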

Data Type and Quantization

dtype
str
default:"auto"
Data type for model weights and computation. Options: "auto", "float16", "bfloat16", "float32".
quantization
Optional[str]
default:"None"
Quantization method. Options: "awq", "fp8", "gptq", "marlin", "bitsandbytes", "gguf", and more.
quantization="awq"  # For AWQ quantized models
quantization="fp8"  # For FP8 quantization
kv_cache_dtype
str
default:"auto"
Data type for the KV cache. Options: "auto", "fp8_e4m3", "fp8_e5m2", "bfloat16", "float16". Using FP8 for the KV cache can significantly reduce memory usage.
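For example, pairing 16-bit weights with an FP8 KV cache roughly halves cache memory:
dtype="bfloat16"
kv_cache_dtype="fp8_e5m2"  # FP8 KV cache alongside bf16 weights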

Memory and Scheduling

mem_fraction_static
Optional[float]
default:"None"
Fraction of GPU memory to use for model weights and the KV cache. Auto-calculated based on GPU memory capacity if not specified.
mem_fraction_static=0.85  # Use 85% of GPU memory
max_total_tokens
Optional[int]
default:"None"
Maximum total tokens in the KV cache pool. This is the maximum number of tokens that can be cached across all requests.
max_running_requests
Optional[int]
default:"None"
Maximum number of requests to process simultaneously.
max_queued_requests
Optional[int]
default:"None"
Maximum number of requests to queue when busy.
chunked_prefill_size
Optional[int]
default:"None"
Chunk size for chunked prefill. Auto-calculated based on GPU memory capacity if not specified:
  • Small GPUs (<20GB): 2048
  • Medium GPUs (20-60GB): 4096
  • Large GPUs (>60GB): 8192+
max_prefill_tokens
int
default:"16384"
Maximum tokens for prefill phase.
schedule_policy
str
default:"fcfs"
Scheduling policy. Options: "fcfs" (first-come-first-served), "lpm" (longest-prefix-match).
enable_priority_scheduling
bool
default:"False"
Enable priority-based request scheduling.
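A sketch combining these knobs for a memory-constrained deployment (values are illustrative, not tuned recommendations):

engine = Engine(
    model_path="meta-llama/Llama-3.1-8B-Instruct",
    mem_fraction_static=0.80,
    max_total_tokens=200_000,   # cap the KV cache pool
    max_running_requests=256,
    chunked_prefill_size=4096,
    schedule_policy="lpm"       # favor requests with cached prefixes
)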

Parallelism

tp_size
int
default:"1"
Tensor parallelism size (number of GPUs for model parallelism).
tp_size=4  # Use 4 GPUs with tensor parallelism
dp_size
int
default:"1"
Data parallelism size (number of independent model replicas).
dp_size=2  # Run 2 independent replicas
pp_size
int
default:"1"
Pipeline parallelism size (number of pipeline stages).
nnodes
int
default:"1"
Number of nodes in a multi-node setup.
node_rank
int
default:"0"
Current node rank (0 to nnodes-1).
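These settings compose: tensor and pipeline parallelism together use tp_size * pp_size GPUs per replica. An illustrative sketch:

engine = Engine(
    model_path="meta-llama/Llama-3.1-70B-Instruct",
    tp_size=4,   # shard each layer across 4 GPUs
    pp_size=2    # split the layers into 2 pipeline stages
)                # 4 * 2 = 8 GPUs total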

Kernel Backends

attention_backend
Optional[str]
default:"None"
Attention kernel backend. Options: "flashinfer", "triton", "torch_native", "fa3" (FlashAttention-3). Auto-selected based on hardware if not specified.
sampling_backend
Optional[str]
default:"None"
Sampling backend. Options: "flashinfer", "pytorch"
grammar_backend
Optional[str]
default:"None"
Structured generation backend. Options: "xgrammar", "outlines", "llguidance", "none".
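Backends can be pinned explicitly when auto-selection does not suit your hardware; availability depends on which kernel libraries are installed. A sketch:

engine = Engine(
    model_path="meta-llama/Llama-3.1-8B-Instruct",
    attention_backend="flashinfer",
    sampling_backend="flashinfer",
    grammar_backend="xgrammar"
)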

CUDA Graph Optimization

disable_cuda_graph
bool
default:"False"
Disable CUDA graph optimization.
cuda_graph_max_bs
Optional[int]
default:"None"
Maximum batch size for CUDA graph capture. Auto-calculated based on GPU memory:
  • Small GPUs: 8-24
  • Medium GPUs: 32-160
  • Large GPUs: 256-512
disable_cuda_graph_padding
bool
default:"False"
Disable padding in CUDA graph batch sizes.
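For example, to bound capture memory on a small GPU, or to rule out CUDA graphs while debugging (batch size value is illustrative):

engine = Engine(
    model_path="meta-llama/Llama-3.1-8B-Instruct",
    cuda_graph_max_bs=16   # capture graphs only up to batch size 16
)
# Or disable capture entirely when debugging kernel issues:
# engine = Engine(model_path=..., disable_cuda_graph=True)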

Speculative Decoding

speculative_algorithm
Optional[str]
default:"None"
Speculative decoding algorithm.Options: "EAGLE", "STANDALONE", "NGRAM"
speculative_draft_model_path
Optional[str]
default:"None"
Path to draft model for speculative decoding.
speculative_num_steps
Optional[int]
default:"None"
Number of speculative decoding steps.
speculative_num_draft_tokens
Optional[int]
default:"None"
Number of draft tokens to generate per step.

LoRA

enable_lora
Optional[bool]
default:"None"
Enable LoRA adapter support.
max_lora_rank
Optional[int]
default:"None"
Maximum LoRA rank to support.
lora_paths
Optional[Union[List[str], List[dict]]]
default:"None"
Paths to LoRA adapters to pre-load.
lora_paths=["path/to/lora1", "path/to/lora2"]
max_loaded_loras
Optional[int]
default:"None"
Maximum number of LoRA adapters to keep loaded.
lora_backend
str
default:"csgmv"
LoRA kernel backend. Options: "triton", "csgmv", "torch_native"

Expert Parallelism (MoE)

ep_size
int
default:"1"
Expert parallelism size for Mixture-of-Experts models.
moe_runner_backend
str
default:"auto"
MoE kernel backend. Options: "auto", "triton", "flashinfer_cutlass", "deep_gemm".
moe_a2a_backend
str
default:"none"
All-to-all communication backend for MoE. Options: "none", "deepep", "mooncake".
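A sketch for an MoE deployment; the model and backend pairing are illustrative, and "deepep" assumes the DeepEP library is installed:

engine = Engine(
    model_path="deepseek-ai/DeepSeek-V3",
    tp_size=8,
    ep_size=8,                 # distribute experts across 8 GPUs
    moe_a2a_backend="deepep"   # DeepEP all-to-all for expert dispatch
)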

Logging and Monitoring

log_level
str
default:"info"
Logging level. Options: "debug", "info", "warning", "error"
log_requests
bool
default:"False"
Log all requests and responses.
show_time_cost
bool
default:"False"
Show time cost for each request.
enable_metrics
bool
default:"False"
Enable Prometheus metrics.
enable_trace
bool
default:"False"
Enable OpenTelemetry tracing.
otlp_traces_endpoint
str
default:"localhost:4317"
OpenTelemetry collector endpoint.
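When running the HTTP server, enable_metrics exposes a Prometheus-compatible metrics endpoint, and tracing exports spans to the configured OTLP collector. A sketch (the collector address is an example):

engine = Engine(
    model_path="meta-llama/Llama-3.1-8B-Instruct",
    enable_metrics=True,
    enable_trace=True,
    otlp_traces_endpoint="otel-collector:4317"
)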

Advanced Options

disable_radix_cache
bool
default:"False"
Disable radix cache (prefix caching) optimization.
random_seed
Optional[int]
default:"None"
Random seed for reproducibility. Auto-generated if not specified.
stream_interval
int
default:"1"
Token interval for streaming responses.
download_dir
Optional[str]
default:"None"
Directory for downloading models from Hugging Face.
enable_torch_compile
bool
default:"False"
Enable PyTorch compilation for model optimization.
device
Optional[str]
default:"None"
Device to use. Options: "cuda", "cpu", "npu". Auto-detected if not specified.
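For example, a reproducible, compiled setup (values are illustrative):

engine = Engine(
    model_path="meta-llama/Llama-3.1-8B-Instruct",
    enable_torch_compile=True,   # longer startup, faster decoding
    random_seed=42,              # reproducible sampling
    stream_interval=4,           # stream output every 4 tokens
    download_dir="/data/models"  # cache downloaded weights here
)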

Configuration Examples

Basic Configuration

from sglang import Engine

engine = Engine(
    model_path="meta-llama/Llama-3.1-8B-Instruct",
    tp_size=1,
    log_level="info"
)

Production Configuration

from sglang import Engine

engine = Engine(
    model_path="meta-llama/Llama-3.1-70B-Instruct",
    tp_size=4,
    mem_fraction_static=0.90,
    max_running_requests=1000,
    chunked_prefill_size=8192,
    enable_metrics=True,
    enable_torch_compile=True,
    log_level="warning"
)

Quantized Model

from sglang import Engine

engine = Engine(
    model_path="TheBloke/Llama-2-70B-AWQ",
    quantization="awq",
    tp_size=2,
    kv_cache_dtype="fp8_e4m3"
)

Multi-LoRA Configuration

from sglang import Engine

engine = Engine(
    model_path="meta-llama/Llama-3.1-8B-Instruct",
    enable_lora=True,
    max_lora_rank=64,
    lora_paths=[
        "adapter1",
        "adapter2",
        "adapter3"
    ],
    max_loaded_loras=10
)

Data Parallelism

from sglang import Engine

engine = Engine(
    model_path="meta-llama/Llama-3.1-8B-Instruct",
    dp_size=4,  # 4 replicas
    load_balance_method="round_robin"
)

Speculative Decoding

from sglang import Engine

engine = Engine(
    model_path="meta-llama/Llama-3.1-70B-Instruct",
    tp_size=4,
    speculative_algorithm="EAGLE",
    speculative_draft_model_path="meta-llama/Llama-3.1-8B-Instruct",
    speculative_num_steps=5
)

Multi-Node Configuration

# Node 0
from sglang import Engine

engine = Engine(
    model_path="meta-llama/Llama-3.1-70B-Instruct",
    tp_size=8,
    nnodes=2,
    node_rank=0,
    dist_init_addr="node0:12345"
)

# Node 1
engine = Engine(
    model_path="meta-llama/Llama-3.1-70B-Instruct",
    tp_size=8,
    nnodes=2,
    node_rank=1,
    dist_init_addr="node0:12345"
)

See Also