Skip to content

nvidia_gpu_power_utilization

Power utilization measurement in watts for NVIDIA gpu processes.

This relies on the pynvml package for status. We dynamically import it, so that we can return useful results when it isn't available.

We are using the 'nvidia-ml-py' version of the library. Thus: 'pip install nvidia-ml-py'

API DOCS:

NVIDIA Management Library (NVML) - https://developer.nvidia.com/management-library-nvml

Links at bottom: API Docs - https://docs.nvidia.com/deploy/nvml-api/index.html Python Binding Docs - https://pypi.org/project/nvidia-ml-py/

Example for getting power usage (nvmlDeviceGetPowerUsage):

https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g7ef7dff0ff14238d08a19ad7fb23fc87

This call returns milliwatts used.

NvidiaGPUPowerStatistics

Bases: CommonStatistics

Source code in mlte/measurement/power/nvidia_gpu_power_utilization.py
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
class NvidiaGPUPowerStatistics(CommonStatistics):
    """
    The NvidiaGPUPowerStatistics class encapsulates the average,
    minimum and maximum power usage statistics for an NVIDIA GPU.
    """

    # Nvidia-smi cli uses watts so we use that for consistency.
    DEFAULT_UNIT = Units.watt

    def __init__(
        self, avg: float, min: float, max: float, unit: Unit = DEFAULT_UNIT
    ):
        """
        Initialize a NvidiaGPUPowerStatistics instance.

        :param avg: The average power utilization
        :param min: The minimum power utilization
        :param max: The maximum power utilization
        :param unit: the unit the values come in, as a value from Units; defaults to DEFAULT_UNIT
        """
        # All values are passed straight through to CommonStatistics.
        super().__init__(avg, min, max, unit)
DEFAULT_UNIT = Units.watt class-attribute instance-attribute

The NvidiaGPUPowerStatistics class encapsulates data and functionality for tracking and updating power usage statistics for an NVIDIA GPU.

__init__(avg, min, max, unit=DEFAULT_UNIT)

Initialize a NvidiaGPUPowerStatistics instance.

Parameters:

Name Type Description Default
avg float

The average power utilization

required
min float

The minimum power utilization

required
max float

The maximum power utilization

required
unit Unit

The unit the values come in, as a value from Units; defaults to DEFAULT_UNIT

DEFAULT_UNIT
Source code in mlte/measurement/power/nvidia_gpu_power_utilization.py
52
53
54
55
56
57
58
59
60
61
62
63
def __init__(
    self, avg: float, min: float, max: float, unit: Unit = DEFAULT_UNIT
):
    """
    Initialize a NvidiaGPUPowerStatistics instance.

    The arguments are forwarded unchanged to the parent class initializer.

    :param avg: The average power utilization
    :param min: The minimum power utilization
    :param max: The maximum power utilization
    :param unit: the unit the values come in, as a value from Units; defaults to DEFAULT_UNIT
    """
    super().__init__(avg, min, max, unit)

NvidiaGPUPowerUtilization

Bases: ProcessMeasurement

Measure power utilization for a specific gpu.

Source code in mlte/measurement/power/nvidia_gpu_power_utilization.py
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
class NvidiaGPUPowerUtilization(ProcessMeasurement):
    """Measure power utilization for one or more NVIDIA GPUs."""

    def __init__(
        self,
        identifier: Optional[str] = None,
        group: Optional[str] = None,
        gpu_ids: Union[int, list[int]] = 0,
    ):
        """
        Initialize a NvidiaGPUPowerUtilization instance.

        :param identifier: A unique identifier for the measurement
        :param group: An optional group id, if we want to group this measurement with others.
        :param gpu_ids: A single gpu id, or a list of 1 or more gpu ids to use.
        :raises ValueError: If gpu_ids is an empty list.
        """
        super().__init__(identifier, group)

        # A lone id is wrapped so that self.gpu_ids is always a list.
        self.gpu_ids: list[int] = (
            [gpu_ids] if isinstance(gpu_ids, int) else gpu_ids
        )
        # Raise explicitly instead of asserting: assert statements are
        # stripped when Python runs with -O, which would let an empty
        # list through unnoticed.
        if len(self.gpu_ids) == 0:
            raise ValueError("gpu_ids must contain at least one gpu id.")

    # Overridden.
    def __call__(
        self,
        pid: int,
        unit: Unit = NvidiaGPUPowerStatistics.DEFAULT_UNIT,
        poll_interval: int = 1,
    ) -> NvidiaGPUPowerStatistics:
        """
        Monitor power usage on the configured gpus.

        :param pid: The process identifier
        :param unit: The unit to return the power values in, defaults to statistics default unit.
        :param poll_interval: The poll interval, in seconds
        :return: The captured statistics
        """
        minimum, maximum, average = (
            pynvml_utils.aggregate_measurements_from_process(
                pid,
                poll_interval,
                gpu_ids=self.gpu_ids,
                fn=_get_nvml_power_usage_watts,
            )
        )

        # The aggregated readings are in watts; coerce them to the desired
        # target units.
        return NvidiaGPUPowerStatistics(
            Quantity(average, Units.watt).to(unit).magnitude,
            Quantity(minimum, Units.watt).to(unit).magnitude,
            Quantity(maximum, Units.watt).to(unit).magnitude,
            unit=unit,
        )

    # Overridden.
    @classmethod
    def get_output_type(cls) -> type[NvidiaGPUPowerStatistics]:
        """Return the statistics class produced by this measurement."""
        return NvidiaGPUPowerStatistics

__call__(pid, unit=NvidiaGPUPowerStatistics.DEFAULT_UNIT, poll_interval=1)

Monitor power usage on a specific gpu.

Parameters:

Name Type Description Default
pid int

The process identifier

required
poll_interval int

The poll interval, in seconds

1
unit Unit

The unit to return the power values in, defaults to statistics default unit.

DEFAULT_UNIT

Returns:

Type Description
NvidiaGPUPowerStatistics

The captured statistics

Source code in mlte/measurement/power/nvidia_gpu_power_utilization.py
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
def __call__(
    self,
    pid: int,
    unit: Unit = NvidiaGPUPowerStatistics.DEFAULT_UNIT,
    poll_interval: int = 1,
) -> NvidiaGPUPowerStatistics:
    """
    Aggregate gpu power readings for a process into statistics.

    :param pid: The process identifier
    :param unit: The unit to report the power values in, defaults to statistics default unit.
    :param poll_interval: The poll interval, in seconds
    :return: The captured statistics, expressed in the requested unit
    """
    lowest, highest, mean = pynvml_utils.aggregate_measurements_from_process(
        pid,
        poll_interval,
        gpu_ids=self.gpu_ids,
        fn=_get_nvml_power_usage_watts,
    )

    # Readings are in watts; convert them to the requested unit in the
    # order the statistics constructor expects (average, minimum, maximum).
    mean_out, lowest_out, highest_out = (
        Quantity(reading, Units.watt).to(unit).magnitude
        for reading in (mean, lowest, highest)
    )

    return NvidiaGPUPowerStatistics(
        mean_out, lowest_out, highest_out, unit=unit
    )

__init__(identifier=None, group=None, gpu_ids=0)

Initialize a NvidiaGPUPowerUtilization instance.

Parameters:

Name Type Description Default
identifier Optional[str]

A unique identifier for the measurement

None
group Optional[str]

An optional group id, if we want to group this measurement with others.

None
gpu_ids Union[int, list[int]]

A list of 1 or more gpu ids to use.

0
Source code in mlte/measurement/power/nvidia_gpu_power_utilization.py
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
def __init__(
    self,
    identifier: Optional[str] = None,
    group: Optional[str] = None,
    gpu_ids: Union[int, list[int]] = 0,
):
    """
    Initialize a NvidiaGPUPowerUtilization instance.

    :param identifier: A unique identifier for the measurement
    :param group: An optional group id, if we want to group this measurement with others.
    :param gpu_ids: A single gpu id, or a list of 1 or more gpu ids to use.
    :raises ValueError: If gpu_ids is an empty list.
    """
    super().__init__(identifier, group)

    # A lone id is wrapped so that self.gpu_ids is always a list.
    self.gpu_ids: list[int] = (
        [gpu_ids] if isinstance(gpu_ids, int) else gpu_ids
    )
    # Raise explicitly instead of asserting: assert statements are stripped
    # when Python runs with -O, which would let an empty list through.
    if len(self.gpu_ids) == 0:
        raise ValueError("gpu_ids must contain at least one gpu id.")