Skip to content

nvidia_gpu_memory_utilization

Memory utilization measurement for gpu processes.

This relies on the pynvml package for status. We dynamically import it, so that we can return useful results when it isn't available.

We are using the 'nvidia-ml-py' version of the library. Thus: 'pip install nvidia-ml-py'

API DOCS:

NVIDIA Management Library (NVML) - https://developer.nvidia.com/management-library-nvml

Links at the bottom of that page: API Docs - https://docs.nvidia.com/deploy/nvml-api/index.html ; Python Binding Docs - https://pypi.org/project/nvidia-ml-py/

Example for getting memory utilization (nvmlDeviceGetMemoryInfo):

https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g2dfeb1db82aa1de91aa6edf941c85ca8

NvidiaGPUMemoryStatistics

Bases: CommonStatistics

Source code in mlte/measurement/memory/nvidia_gpu_memory_utilization.py
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
class NvidiaGPUMemoryStatistics(CommonStatistics):
    """
    The NvidiaGPUMemoryStatistics class encapsulates data
    and functionality for tracking and updating memory
    utilization statistics for an NVIDIA GPU.
    """

    # NOTE: the class docstring must be the first statement in the class body;
    # placed after an assignment it is not stored as the class docstring.

    # Nvidia-smi cli uses MiB so we use that for consistency.
    DEFAULT_UNIT: Unit = Units.mebibyte

    def __init__(
        self, avg: float, min: float, max: float, unit: Unit = DEFAULT_UNIT
    ):
        """
        Initialize a NvidiaGPUMemoryStatistics instance.

        :param avg: The average memory utilization
        :param min: The minimum memory utilization
        :param max: The maximum memory utilization
        :param unit: The unit the values come in, as a value from Units; defaults to DEFAULT_UNIT
        """
        # All values are handed to the parent initializer unchanged.
        super().__init__(avg, min, max, unit)

DEFAULT_UNIT = Units.mebibyte class-attribute instance-attribute

The NvidiaGPUMemoryStatistics class encapsulates data and functionality for tracking and updating memory utilization statistics for an NVIDIA GPU.

__init__(avg, min, max, unit=DEFAULT_UNIT)

Initialize a NvidiaGPUMemoryStatistics instance.

Parameters:

Name Type Description Default
avg float

The average memory utilization

required
min float

The minimum memory utilization

required
max float

The maximum memory utilization

required
unit Unit

The unit the values come in, as a value from Units; defaults to DEFAULT_UNIT

DEFAULT_UNIT
Source code in mlte/measurement/memory/nvidia_gpu_memory_utilization.py
50
51
52
53
54
55
56
57
58
59
60
61
def __init__(
    self, avg: float, min: float, max: float, unit: Unit = DEFAULT_UNIT
):
    """
    Initialize a NvidiaGPUMemoryStatistics instance.

    :param avg: The average memory utilization
    :param min: The minimum memory utilization
    :param max: The maximum memory utilization
    :param unit: The unit the values come in, as a value from Units; defaults to DEFAULT_UNIT
    """
    # All values are handed to the parent initializer unchanged.
    super().__init__(avg, min, max, unit)

NvidiaGPUMemoryUtilization

Bases: ProcessMeasurement

Measure memory utilization for a specific gpu.

Source code in mlte/measurement/memory/nvidia_gpu_memory_utilization.py
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
class NvidiaGPUMemoryUtilization(ProcessMeasurement):
    """Measure memory utilization for one or more specific gpus."""

    def __init__(
        self,
        identifier: Optional[str] = None,
        group: Optional[str] = None,
        gpu_ids: Union[int, list[int]] = 0,
    ):
        """
        Initialize a NvidiaGPUMemoryUtilization instance.

        :param identifier: A unique identifier for the measurement
        :param group: An optional group id, if we want to group this measurement with others.
        :param gpu_ids: A single gpu id, or a list of 1 or more gpu ids, to use.
        :raises ValueError: If an empty list of gpu ids is given.
        """
        super().__init__(identifier, group)

        # Normalize to a list. A list argument is copied so that later changes
        # to the caller's list do not silently change which gpus are measured.
        self.gpu_ids: list[int] = (
            [gpu_ids] if isinstance(gpu_ids, int) else list(gpu_ids)
        )
        # Explicit check instead of `assert`: assertions are skipped when
        # Python runs with -O, which would let an empty list through.
        if not self.gpu_ids:
            raise ValueError("At least one gpu id must be provided.")

    # Overridden.
    def __call__(
        self,
        pid: int,
        unit: Unit = NvidiaGPUMemoryStatistics.DEFAULT_UNIT,
        poll_interval: int = 1,
    ) -> NvidiaGPUMemoryStatistics:
        """
        Monitor memory usage on the configured gpus.

        :param pid: The process identifier
        :param unit: The unit to return the memory size in, defaults to statistics default unit.
        :param poll_interval: The poll interval, in seconds
        :return: The captured statistics
        """

        # Keep collecting stats until the controlling process goes away.
        # It might actually take the controlling process a while to start up the memory utilization
        # so just collect the entire time whether we have utilization or not.

        # NOTE(review): the helper's result is unpacked as (min, max, avg) —
        # confirm this matches its return order.
        minimum, maximum, average = (
            pynvml_utils.aggregate_measurements_from_process(
                pid,
                poll_interval,
                gpu_ids=self.gpu_ids,
                fn=_get_nvml_memory_usage_bytes,
            )
        )

        # Coerce to the desired target units
        return NvidiaGPUMemoryStatistics(
            Quantity(average, Units.bytes).to(unit).magnitude,
            Quantity(minimum, Units.bytes).to(unit).magnitude,
            Quantity(maximum, Units.bytes).to(unit).magnitude,
            unit=unit,
        )

    # Overridden.
    @classmethod
    def get_output_type(cls) -> type[NvidiaGPUMemoryStatistics]:
        """Return the statistics class that calling this measurement produces."""
        return NvidiaGPUMemoryStatistics

__call__(pid, unit=NvidiaGPUMemoryStatistics.DEFAULT_UNIT, poll_interval=1)

Monitor memory usage on a specific gpu.

Parameters:

Name Type Description Default
pid int

The process identifier

required
unit Unit

The unit to return the memory size in, defaults to statistics default unit.

DEFAULT_UNIT
poll_interval int

The poll interval, in seconds

1

Returns:

Type Description
NvidiaGPUMemoryStatistics

The captured statistics

Source code in mlte/measurement/memory/nvidia_gpu_memory_utilization.py
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
def __call__(
    self,
    pid: int,
    unit: Unit = NvidiaGPUMemoryStatistics.DEFAULT_UNIT,
    poll_interval: int = 1,
) -> NvidiaGPUMemoryStatistics:
    """
    Sample gpu memory usage for the configured gpus and summarize it.

    :param pid: The process identifier
    :param unit: The unit to report the memory sizes in; defaults to the statistics default unit.
    :param poll_interval: The poll interval, in seconds
    :return: The captured statistics, expressed in ``unit``
    """

    def _from_bytes(raw_value):
        # Aggregated values are treated as byte counts and converted to ``unit``.
        return Quantity(raw_value, Units.bytes).to(unit).magnitude

    # Sampling runs for as long as the controlling process exists. The process
    # may need some time before it starts using gpu memory, so samples are
    # gathered throughout, whether or not any utilization is seen yet.
    lowest, highest, mean = pynvml_utils.aggregate_measurements_from_process(
        pid,
        poll_interval,
        gpu_ids=self.gpu_ids,
        fn=_get_nvml_memory_usage_bytes,
    )

    # The statistics constructor takes average first, then minimum, then maximum.
    return NvidiaGPUMemoryStatistics(
        _from_bytes(mean),
        _from_bytes(lowest),
        _from_bytes(highest),
        unit=unit,
    )

__init__(identifier=None, group=None, gpu_ids=0)

Initialize a NvidiaGPUMemoryUtilization instance.

Parameters:

Name Type Description Default
identifier Optional[str]

A unique identifier for the measurement

None
group Optional[str]

An optional group id, if we want to group this measurement with others.

None
gpu_ids Union[int, list[int]]

A single gpu id, or a list of 1 or more gpu ids, to use.

0
Source code in mlte/measurement/memory/nvidia_gpu_memory_utilization.py
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
def __init__(
    self,
    identifier: Optional[str] = None,
    group: Optional[str] = None,
    gpu_ids: Union[int, list[int]] = 0,
):
    """
    Initialize a NvidiaGPUMemoryUtilization instance.

    :param identifier: A unique identifier for the measurement
    :param group: An optional group id, if we want to group this measurement with others.
    :param gpu_ids: A single gpu id, or a list of 1 or more gpu ids, to use.
    :raises ValueError: If an empty list of gpu ids is given.
    """
    super().__init__(identifier, group)

    # Normalize to a list. A list argument is copied so that later changes
    # to the caller's list do not silently change which gpus are measured.
    self.gpu_ids: list[int] = (
        [gpu_ids] if isinstance(gpu_ids, int) else list(gpu_ids)
    )
    # Explicit check instead of `assert`: assertions are skipped when
    # Python runs with -O, which would let an empty list through.
    if not self.gpu_ids:
        raise ValueError("At least one gpu id must be provided.")