vllm.utils ¶

Modules:

Name	Description
`argparse_utils`	Argument parsing utilities for vLLM.
`async_utils`	Contains helpers related to asynchronous code.
`cache`
`collection_utils`	Contains helpers that are applied to collections.
`deep_gemm`	Compatibility wrapper for DeepGEMM API changes.
`flashinfer`	Compatibility wrapper for FlashInfer API changes.
`func_utils`	Contains helpers that are applied to functions.
`gc_utils`
`hashing`
`import_utils`	Contains helpers related to importing modules.
`jsontree`	Helper functions to work with nested JSON structures.
`math_utils`	Math utility functions for vLLM.
`mem_constants`
`mem_utils`
`nccl`
`network_utils`
`platform_utils`
`profiling`
`serial_utils`
`system_utils`
`tensor_schema`
`torch_utils`

DEFAULT_MAX_NUM_BATCHED_TOKENS `module-attribute` ¶

DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048

MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS `module-attribute` ¶

MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS = 5120

POOLING_MODEL_MAX_NUM_BATCHED_TOKENS `module-attribute` ¶

POOLING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768

STR_BACKEND_ENV_VAR `module-attribute` ¶

STR_BACKEND_ENV_VAR: str = 'VLLM_ATTENTION_BACKEND'

STR_FLASHINFER_ATTN_VAL `module-attribute` ¶

STR_FLASHINFER_ATTN_VAL: str = 'FLASHINFER'

STR_FLASH_ATTN_VAL `module-attribute` ¶

STR_FLASH_ATTN_VAL: str = 'FLASH_ATTN'

STR_INVALID_VAL `module-attribute` ¶

STR_INVALID_VAL: str = 'INVALID'

STR_TORCH_SDPA_ATTN_VAL `module-attribute` ¶

STR_TORCH_SDPA_ATTN_VAL: str = 'TORCH_SDPA'

STR_XFORMERS_ATTN_VAL `module-attribute` ¶

STR_XFORMERS_ATTN_VAL: str = 'XFORMERS'

T `module-attribute` ¶

T = TypeVar('T')

_DEPRECATED_MAPPINGS `module-attribute` ¶

# Deprecated top-level names mapped to the ``vllm.utils`` submodule that now
# provides them; consulted by the module-level ``__getattr__`` and ``__dir__``.
_DEPRECATED_MAPPINGS = {
    "cprofile": "profiling",
    "cprofile_context": "profiling",
    "get_open_port": "network_utils",
}

logger `module-attribute` ¶

logger = init_logger(__name__)

AtomicCounter ¶

An atomic, thread-safe counter

Source code in vllm/utils/__init__.py

class AtomicCounter:
    """Counter whose increments and decrements run while holding a lock."""

    def __init__(self, initial=0):
        """Create the counter, seeding it with ``initial``."""
        self._lock = threading.Lock()
        self._value = initial

    def inc(self, num=1):
        """Add ``num`` to the counter under the lock and return the result."""
        with self._lock:
            self._value += num
            updated = self._value
        return updated

    def dec(self, num=1):
        """Subtract ``num`` from the counter under the lock and return the result."""
        with self._lock:
            self._value -= num
            updated = self._value
        return updated

    @property
    def value(self):
        """Current value; read directly, without acquiring the lock."""
        return self._value

_lock `instance-attribute` ¶

_lock = Lock()

_value `instance-attribute` ¶

_value = initial

value `property` ¶

value

__init__ ¶

__init__(initial=0)

Initialize a new atomic counter to given initial value

Source code in vllm/utils/__init__.py

def __init__(self, initial=0):
    """Create the counter's lock and seed its value with ``initial``."""
    self._lock = threading.Lock()
    self._value = initial

dec ¶

dec(num=1)

Atomically decrement the counter by num and return the new value

Source code in vllm/utils/__init__.py

def dec(self, num=1):
    """Subtract ``num`` from the counter under the lock and return the result."""
    with self._lock:
        self._value -= num
        updated = self._value
    return updated

inc ¶

inc(num=1)

Atomically increment the counter by num and return the new value

Source code in vllm/utils/__init__.py

def inc(self, num=1):
    """Add ``num`` to the counter under the lock and return the result."""
    with self._lock:
        self._value += num
        updated = self._value
    return updated

Counter ¶

Source code in vllm/utils/__init__.py

class Counter:
    """Integer sequence advanced one step at a time through ``next()``."""

    def __init__(self, start: int = 0) -> None:
        """Begin the sequence at ``start``."""
        self.counter = start

    def __next__(self) -> int:
        """Return the current number, then advance the stored value by one."""
        current = self.counter
        self.counter = current + 1
        return current

    def reset(self) -> None:
        """Rewind the stored value to 0 (not to the initial ``start``)."""
        self.counter = 0

counter `instance-attribute` ¶

counter = start

__init__ ¶

__init__(start: int = 0) -> None

Source code in vllm/utils/__init__.py

def __init__(self, start: int = 0) -> None:
    """Store ``start`` as the initial value of ``self.counter``."""
    self.counter = start

__next__ ¶

__next__() -> int

Source code in vllm/utils/__init__.py

def __next__(self) -> int:
    """Return the current number, then advance the stored value by one."""
    current = self.counter
    self.counter = current + 1
    return current

reset ¶

reset() -> None

Source code in vllm/utils/__init__.py

def reset(self) -> None:
    """Set ``self.counter`` to 0, regardless of the value it started from."""
    self.counter = 0

Device ¶

Bases: Enum

Source code in vllm/utils/__init__.py

class Device(enum.Enum):
    """Enumeration with two members, ``GPU`` and ``CPU``."""

    # enum.auto() numbers members in definition order: GPU is 1, CPU is 2.
    GPU = enum.auto()
    CPU = enum.auto()

CPU `class-attribute` `instance-attribute` ¶

CPU = auto()

GPU `class-attribute` `instance-attribute` ¶

GPU = auto()

LayerBlockType ¶

Bases: Enum

Source code in vllm/utils/__init__.py

class LayerBlockType(enum.Enum):
    """Enumeration of layer block kinds; each member's value equals its name."""

    attention = "attention"
    mamba = "mamba"

attention `class-attribute` `instance-attribute` ¶

attention = 'attention'

mamba `class-attribute` `instance-attribute` ¶

mamba = 'mamba'

__dir__ ¶

__dir__() -> list[str]

Source code in vllm/utils/__init__.py

def __dir__() -> list[str]:
    """Return the module's global names plus the deprecated aliases, sorted."""
    # expose deprecated names in dir() for better UX/tab-completion
    names = [*globals(), *_DEPRECATED_MAPPINGS]
    return sorted(names)

__getattr__ ¶

__getattr__(name: str) -> Any

Module-level getattr to handle deprecated utilities.

Source code in vllm/utils/__init__.py

def __getattr__(name: str) -> Any:  # noqa: D401 - short deprecation docstring
    """Module-level getattr that forwards deprecated names to their submodule.

    Names listed in ``_DEPRECATED_MAPPINGS`` emit a ``DeprecationWarning`` and
    are then fetched from the mapped ``vllm.utils`` submodule; any other name
    raises ``AttributeError``.
    """
    if name not in _DEPRECATED_MAPPINGS:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    submodule_name = _DEPRECATED_MAPPINGS[name]
    message = (
        f"vllm.utils.{name} is deprecated and will be removed in a future version. "
        f"Use vllm.utils.{submodule_name}.{name} instead."
    )
    # stacklevel=2 attributes the warning to the code that accessed the name.
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    target = __import__(f"vllm.utils.{submodule_name}", fromlist=[submodule_name])
    return getattr(target, name)

length_from_prompt_token_ids_or_embeds ¶

length_from_prompt_token_ids_or_embeds(
    prompt_token_ids: list[int] | None,
    prompt_embeds: Tensor | None,
) -> int

Calculate the request length (in number of tokens) given either prompt_token_ids or prompt_embeds.

Source code in vllm/utils/__init__.py

def length_from_prompt_token_ids_or_embeds(
    prompt_token_ids: list[int] | None,
    prompt_embeds: torch.Tensor | None,
) -> int:
    """Calculate the request length (in number of tokens) give either
    prompt_token_ids or prompt_embeds.
    """
    prompt_token_len = None if prompt_token_ids is None else len(prompt_token_ids)
    prompt_embeds_len = None if prompt_embeds is None else len(prompt_embeds)

    if prompt_token_len is None:
        if prompt_embeds_len is None:
            raise ValueError("Neither prompt_token_ids nor prompt_embeds were defined.")
        return prompt_embeds_len
    else:
        if prompt_embeds_len is not None and prompt_embeds_len != prompt_token_len:
            raise ValueError(
                "Prompt token ids and prompt embeds had different lengths"
                f" prompt_token_ids={prompt_token_len}"
                f" prompt_embeds={prompt_embeds_len}"
            )
        return prompt_token_len

random_uuid ¶

random_uuid() -> str

Source code in vllm/utils/__init__.py

def random_uuid() -> str:
    """Return a freshly generated UUID4 as a 32-character hex string."""
    # UUID.hex is already a str, so no further conversion is needed.
    return uuid.uuid4().hex

warn_for_unimplemented_methods ¶

warn_for_unimplemented_methods(cls: type[T]) -> type[T]

A replacement for abc.ABC. When we use abc.ABC, subclasses will fail to instantiate if they do not implement all abstract methods. Here, we only require raise NotImplementedError in the base class, and log a debug-level message if the method is not implemented in the subclass.

Source code in vllm/utils/__init__.py

def warn_for_unimplemented_methods(cls: type[T]) -> type[T]:
    """
    A replacement for `abc.ABC`.
    When we use `abc.ABC`, subclasses will fail to instantiate
    if they do not implement all abstract methods.
    Here, we only require `raise NotImplementedError` in the
    base class, and log a debug-level message if the method is
    not implemented in the subclass.

    The class's `__init__` is wrapped in place so that, after the original
    initializer runs, every public bound method whose source text contains
    `NotImplementedError` is reported in a single log message.

    Args:
        cls: The class to decorate.

    Returns:
        The same class object, with `__init__` replaced by the wrapper.
    """

    original_init = cls.__init__

    def find_unimplemented_methods(self: object):
        unimplemented_methods = []
        for attr_name in dir(self):
            # bypass inner method
            if attr_name.startswith("_"):
                continue

            try:
                attr = getattr(self, attr_name)
            except AttributeError:
                continue

            # Only bound methods expose `__func__`. Plain data attributes and
            # other callables are skipped so that they are never inspected
            # (or mistaken for the previously inspected method).
            attr_func = getattr(attr, "__func__", None)
            if not callable(attr) or attr_func is None:
                continue

            try:
                src = inspect.getsource(attr_func)
            except (OSError, TypeError):
                # Source is unavailable (e.g. built-in or dynamically created
                # function); this check is best-effort and must not break
                # object construction.
                continue
            if "NotImplementedError" in src:
                unimplemented_methods.append(attr_name)
        if unimplemented_methods:
            method_names = ",".join(unimplemented_methods)
            msg = f"Methods {method_names} not implemented in {self}"
            logger.debug(msg)

    @wraps(original_init)
    def wrapped_init(self, *args, **kwargs) -> None:
        original_init(self, *args, **kwargs)
        find_unimplemented_methods(self)

    # Assign through `type.__setattr__` so the wrapper is installed on the
    # class itself.
    type.__setattr__(cls, "__init__", wrapped_init)
    return cls

vllm.utils ¶

DEFAULT_MAX_NUM_BATCHED_TOKENS module-attribute ¶

MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS module-attribute ¶

POOLING_MODEL_MAX_NUM_BATCHED_TOKENS module-attribute ¶

STR_BACKEND_ENV_VAR module-attribute ¶

STR_FLASHINFER_ATTN_VAL module-attribute ¶

STR_FLASH_ATTN_VAL module-attribute ¶

STR_INVALID_VAL module-attribute ¶

STR_TORCH_SDPA_ATTN_VAL module-attribute ¶

STR_XFORMERS_ATTN_VAL module-attribute ¶

T module-attribute ¶

_DEPRECATED_MAPPINGS module-attribute ¶

logger module-attribute ¶

AtomicCounter ¶

_lock instance-attribute ¶

_value instance-attribute ¶

value property ¶

__init__ ¶

dec ¶

inc ¶

Counter ¶

counter instance-attribute ¶

__init__ ¶

__next__ ¶

reset ¶

Device ¶

CPU class-attribute instance-attribute ¶

GPU class-attribute instance-attribute ¶

LayerBlockType ¶

attention class-attribute instance-attribute ¶

mamba class-attribute instance-attribute ¶

__dir__ ¶

__getattr__ ¶

length_from_prompt_token_ids_or_embeds ¶

random_uuid ¶

warn_for_unimplemented_methods ¶

DEFAULT_MAX_NUM_BATCHED_TOKENS `module-attribute` ¶

MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS `module-attribute` ¶

POOLING_MODEL_MAX_NUM_BATCHED_TOKENS `module-attribute` ¶

STR_BACKEND_ENV_VAR `module-attribute` ¶

STR_FLASHINFER_ATTN_VAL `module-attribute` ¶

STR_FLASH_ATTN_VAL `module-attribute` ¶

STR_INVALID_VAL `module-attribute` ¶

STR_TORCH_SDPA_ATTN_VAL `module-attribute` ¶

STR_XFORMERS_ATTN_VAL `module-attribute` ¶

T `module-attribute` ¶

_DEPRECATED_MAPPINGS `module-attribute` ¶

logger `module-attribute` ¶

_lock `instance-attribute` ¶

_value `instance-attribute` ¶

value `property` ¶

__init__ ¶

counter `instance-attribute` ¶

__init__ ¶

__next__ ¶

CPU `class-attribute` `instance-attribute` ¶

GPU `class-attribute` `instance-attribute` ¶

attention `class-attribute` `instance-attribute` ¶

mamba `class-attribute` `instance-attribute` ¶

__dir__ ¶

__getattr__ ¶