Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improved logging of RAPL permission issues #95

Merged
merged 2 commits into from
Feb 6, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 14 additions & 10 deletions carbontracker/components/component.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
)
from carbontracker.components.handler import Handler
from typing import Iterable, List, Union, Type, Sized
from carbontracker.loggerutil import Logger
import os

COMPONENTS = [
{
Expand Down Expand Up @@ -43,7 +45,7 @@ def handlers_by_name(name) -> List[Type[Handler]]:


class Component:
def __init__(self, name: str, pids: Iterable[int], devices_by_pid: bool):
def __init__(self, name: str, pids: Iterable[int], devices_by_pid: bool, logger: Logger):
self.name = name
if name not in component_names():
raise exceptions.ComponentNameError(
Expand All @@ -54,6 +56,7 @@ def __init__(self, name: str, pids: Iterable[int], devices_by_pid: bool):
)
self.power_usages: List[List[float]] = []
self.cur_epoch: int = -1 # Sentry
self.logger = logger

@property
def handler(self) -> Handler:
Expand Down Expand Up @@ -97,18 +100,19 @@ def collect_power_usage(self, epoch: int):
self.power_usages.append([])
try:
self.power_usages[-1] += self.handler.power_usage()
except exceptions.IntelRaplPermissionError:
except exceptions.IntelRaplPermissionError as e:
energy_paths = " and ".join(e.file_names)
commands = ["sudo chmod +r " + energy_path for energy_path in e.file_names]
# Only raise error if no measurements have been collected.
if not self.power_usages[-1]:
print(
"No sudo access to read Intel's RAPL measurements from the energy_uj file."
"\nSee issue: https://github.com/lfwa/carbontracker/issues/40"
)
self.logger.err_critical(
r"Could not read CPU/DRAM energy consumption due to lack of read-permissions.\n\tPlease run the following command(s): \n\t\t" + r"\n\t\t".join(commands)
)
# Append zero measurement to avoid further errors.
self.power_usages.append([0])
except exceptions.GPUPowerUsageRetrievalError:
if not self.power_usages[-1]:
print(
self.logger.err_critical(
"GPU model does not support retrieval of power usages in NVML."
"\nSee issue: https://github.com/lfwa/carbontracker/issues/36"
)
Expand Down Expand Up @@ -154,16 +158,16 @@ def shutdown(self):


def create_components(
    components: str, pids: Iterable[int], devices_by_pid: bool, logger: Logger
) -> List[Component]:
    """Build one Component per requested component name.

    Args:
        components: Comma-separated component names, or "all" to use every
            name returned by component_names(). Spaces and letter case are
            ignored.
        pids: Forwarded unchanged to every Component.
        devices_by_pid: Forwarded unchanged to every Component.
        logger: Forwarded unchanged to every Component.

    Returns:
        The constructed components, in the order the names were resolved.

    Raises:
        exceptions.ComponentNameError: Raised by Component when a requested
            name is not one of component_names().
    """
    # Normalise into a new local rather than rebinding the parameter.
    requested = components.strip().replace(" ", "").lower()
    # Both modes differ only in where the names come from, so resolve the
    # names first and construct the components in a single place.
    names = component_names() if requested == "all" else requested.split(",")
    return [
        Component(name=comp_name, pids=pids, devices_by_pid=devices_by_pid, logger=logger)
        for comp_name in names
    ]
6 changes: 4 additions & 2 deletions carbontracker/components/cpu/intel.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,14 +56,15 @@ def _read_energy(self, path: str) -> int:

def _get_measurements(self):
measurements = []
permission_errors = []
for package in self._rapl_devices:
try:
power_usage = self._read_energy(os.path.join(RAPL_DIR, package))
measurements.append(power_usage)
# If there is no sudo access, we cannot read the energy_uj file.
# Permission denied error is raised.
except PermissionError:
raise exceptions.IntelRaplPermissionError()
permission_errors += [os.path.join(RAPL_DIR, package, "energy_uj")]

except FileNotFoundError:
# check cpu/gpu/dram
Expand All @@ -79,7 +80,8 @@ def _get_measurements(self):
)

measurements.append(total_power_usage)

if permission_errors:
raise exceptions.IntelRaplPermissionError(permission_errors)
return measurements

def _convert_rapl_name(self, package, name, pattern) -> Union[None, str]:
Expand Down
5 changes: 4 additions & 1 deletion carbontracker/exceptions.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from typing import List

class NoComponentsAvailableError(Exception):
def __init__(
self,
Expand All @@ -23,7 +25,8 @@ def __init__(self, expected_unit, received_unit, message):
class IntelRaplPermissionError(Exception):
    """Raised when an Intel RAPL permission error occurs.

    Attributes:
        file_names: Paths of the files that could not be read.
    """

    def __init__(self, file_names: List[str]):
        # Forward the paths to Exception so that ``args`` is populated even
        # when the caller uses the keyword form; copy/pickle rebuild the
        # exception from ``args`` and would otherwise call ``__init__`` with
        # no arguments.
        super().__init__(file_names)
        self.file_names = file_names

    def __str__(self) -> str:
        # Give tracebacks and log lines a readable message instead of the
        # bare list repr.
        return "No read permission for Intel RAPL file(s): " + ", ".join(
            map(str, self.file_names)
        )


class GPUPowerUsageRetrievalError(Exception):
Expand Down
2 changes: 1 addition & 1 deletion carbontracker/tracker.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,7 +340,7 @@ def __init__(
self.tracker = CarbonTrackerThread(
delete=self._delete,
components=component.create_components(
components=components, pids=pids, devices_by_pid=devices_by_pid
components=components, pids=pids, devices_by_pid=devices_by_pid, logger=self.logger
),
logger=self.logger,
ignore_errors=ignore_errors,
Expand Down
4 changes: 2 additions & 2 deletions tests/components/test_intel.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def test_available(self, mock_listdir, mock_exists):
mock_exists.return_value = True
mock_listdir.return_value = ["some_directory"]

component = Component(name='cpu', pids=[], devices_by_pid={})
component = Component(name='cpu', pids=[], devices_by_pid={}, logger=None)
self.assertTrue(component.available())

@patch("os.path.exists")
Expand All @@ -35,7 +35,7 @@ def test_available_false(self, mock_available, mock_listdir, mock_exists):
mock_exists.return_value = False
mock_listdir.return_value = []

cpu = Component(name='cpu', pids=[], devices_by_pid={})
cpu = Component(name='cpu', pids=[], devices_by_pid={}, logger=None)
self.assertFalse(cpu.available())

@patch("time.sleep")
Expand Down
44 changes: 22 additions & 22 deletions tests/test_component.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,23 +24,23 @@ class TestComponent(unittest.TestCase):
def test_init_valid_component(
self, mock_handlers_by_name, mock_error_by_name, mock_component_names
):
component = Component(name="gpu", pids=[], devices_by_pid=False)
component = Component(name="gpu", pids=[], devices_by_pid=False, logger=None)
self.assertEqual(component.name, "gpu")
self.assertEqual(component._handler, mock_handlers_by_name()[0]())

def test_init_invalid_component(self):
with self.assertRaises(exceptions.ComponentNameError):
Component(name="unknown", pids=[], devices_by_pid=False)
Component(name="unknown", pids=[], devices_by_pid=False, logger=None)

def test_devices(self):
handler_mock = MagicMock(devices=MagicMock(return_value=["Test GPU"]))
component = Component(name="gpu", pids=[], devices_by_pid=False)
component = Component(name="gpu", pids=[], devices_by_pid=False, logger=None)
component._handler = handler_mock
self.assertEqual(component.devices(), ["Test GPU"])

def test_available_true(self):
handler_mock = MagicMock(available=MagicMock(return_value=True))
component = Component(name="gpu", pids=[], devices_by_pid=False)
component = Component(name="gpu", pids=[], devices_by_pid=False, logger=None)
component._handler = handler_mock
self.assertTrue(component.available())

Expand All @@ -52,33 +52,33 @@ def test_available_true(self):
return_value=False,
)
def test_available_false(self, mock_apple_gpu_available, mock_nvidia_gpu_available):
component = Component(name="gpu", pids=[], devices_by_pid=False)
component = Component(name="gpu", pids=[], devices_by_pid=False, logger=None)
self.assertFalse(component.available())

def test_collect_power_usage_no_measurement(self):
    """A RAPL permission failure is logged and recorded as a zero reading."""
    logger_mock = MagicMock()
    handler_mock = MagicMock(
        power_usage=MagicMock(
            side_effect=exceptions.IntelRaplPermissionError(
                file_names=["file1", "file2"]
            )
        )
    )
    component = Component(
        name="cpu", pids=[], devices_by_pid=False, logger=logger_mock
    )
    component._handler = handler_mock
    component.collect_power_usage(epoch=1)
    self.assertEqual(component.power_usages, [[], [0]])
    # The failure must reach the injected logger rather than being dropped,
    # and the message must name a fix-up command for every unreadable file.
    logger_mock.err_critical.assert_called_once()
    message = logger_mock.err_critical.call_args[0][0]
    self.assertIn("sudo chmod +r file1", message)
    self.assertIn("sudo chmod +r file2", message)

def test_collect_power_usage_with_measurement(self):
handler_mock = MagicMock(power_usage=MagicMock(return_value=[1000]))
component = Component(name="cpu", pids=[], devices_by_pid=False)
component = Component(name="cpu", pids=[], devices_by_pid=False, logger=None)
component._handler = handler_mock
component.collect_power_usage(epoch=1)
self.assertEqual(component.power_usages, [[1000]])

def test_collect_power_usage_with_measurement_but_no_epoch(self):
power_collector = Component(name="cpu", pids=[], devices_by_pid=False)
power_collector = Component(name="cpu", pids=[], devices_by_pid=False, logger=None)
power_collector._handler = MagicMock(power_usage=MagicMock(return_value=[1000]))
power_collector.collect_power_usage(epoch=0)
assert len(power_collector.power_usages) == 0

def test_collect_power_usage_with_previous_measurement(self):
power_collector = Component(name="cpu", pids=[], devices_by_pid=False)
power_collector = Component(name="cpu", pids=[], devices_by_pid=False, logger=None)
power_collector._handler = MagicMock(power_usage=MagicMock(return_value=[1000]))
power_collector.collect_power_usage(epoch=1)
power_collector.collect_power_usage(epoch=3)
Expand All @@ -88,13 +88,13 @@ def test_collect_power_usage_GPUPowerUsageRetrievalError(self):
handler_mock = MagicMock(
power_usage=MagicMock(side_effect=exceptions.GPUPowerUsageRetrievalError)
)
component = Component(name="gpu", pids=[], devices_by_pid=False)
component = Component(name="gpu", pids=[], devices_by_pid=False, logger=MagicMock(err_critical=MagicMock()))
component._handler = handler_mock
component.collect_power_usage(epoch=1)
self.assertEqual(component.power_usages, [[], [0]])

def test_energy_usage(self):
component = Component(name="cpu", pids=[], devices_by_pid=False)
component = Component(name="cpu", pids=[], devices_by_pid=False, logger=None)
component.power_usages = [[1000], [2000], [3000]]
epoch_times = [1, 2, 3]
energy_usages = component.energy_usage(epoch_times)
Expand All @@ -104,14 +104,14 @@ def test_energy_usage(self):
self.assertTrue(np.all(np.array(energy_usages) > 0))

def test_energy_usage_no_measurements(self):
component = Component(name="cpu", pids=[], devices_by_pid=False)
component = Component(name="cpu", pids=[], devices_by_pid=False, logger=None)
component.power_usages = [[]]
epoch_times = [1]
energy_usages = component.energy_usage(epoch_times)
self.assertEqual(energy_usages, [0])

def test_energy_usage_with_power_from_later_epoch(self):
component = Component(name="cpu", pids=[], devices_by_pid=False)
component = Component(name="cpu", pids=[], devices_by_pid=False, logger=None)
component.power_usages = [[1000], [2000], [3000]]
epoch_times = [1, 2, 3, 4]
energy_usages = component.energy_usage(epoch_times)
Expand All @@ -121,7 +121,7 @@ def test_energy_usage_with_power_from_later_epoch(self):
)

def test_energy_usage_no_power(self):
component = Component(name="cpu", pids=[], devices_by_pid=False)
component = Component(name="cpu", pids=[], devices_by_pid=False, logger=None)
component.power_usages = [[], [], [], [], []]
epoch_times = [1, 2, 3, 4, 5]
energy_usages = component.energy_usage(epoch_times)
Expand All @@ -132,7 +132,7 @@ def test_energy_usage_no_power(self):

def test_init(self):
handler_mock = MagicMock()
component = Component(name="gpu", pids=[], devices_by_pid=False)
component = Component(name="gpu", pids=[], devices_by_pid=False, logger=None)
component._handler = handler_mock
component.init()
handler_mock.init.assert_called_once()
Expand All @@ -144,15 +144,15 @@ def test_init(self):

def test_shutdown(self):
handler_mock = MagicMock()
component = Component(name="gpu", pids=[], devices_by_pid=False)
component = Component(name="gpu", pids=[], devices_by_pid=False, logger=None)
component._handler = handler_mock
component.shutdown()
handler_mock.shutdown.assert_called_once()

def test_create_components(self):
gpu = create_components("gpu", pids=[], devices_by_pid=False)
cpu = create_components("cpu", pids=[], devices_by_pid=False)
all_components = create_components("all", pids=[], devices_by_pid=False)
gpu = create_components("gpu", pids=[], devices_by_pid=False, logger=None)
cpu = create_components("cpu", pids=[], devices_by_pid=False, logger=None)
all_components = create_components("all", pids=[], devices_by_pid=False, logger=None)
self.assertEqual(len(gpu), 1)
self.assertEqual(len(cpu), 1)
self.assertEqual(len(all_components), 2)
Expand All @@ -166,12 +166,12 @@ def test_error_by_name(self):
)

def test_handler_property_with_handler_set(self):
component = Component(name="gpu", pids=[], devices_by_pid=False)
component = Component(name="gpu", pids=[], devices_by_pid=False, logger=None)
component._handler = "test"
self.assertEqual(component.handler, "test")

def test_handler_property_without_handler(self):
component = Component(name="gpu", pids=[], devices_by_pid=False)
component = Component(name="gpu", pids=[], devices_by_pid=False, logger=None)
component._handler = None
with self.assertRaises(exceptions.GPUError):
component.handler()
Expand Down
2 changes: 1 addition & 1 deletion tests/test_exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def test_unit_error(self):

def test_intel_rapl_permission_error(self):
    """The exception can be raised and keeps the file names it was given."""
    with self.assertRaises(exceptions.IntelRaplPermissionError) as ctx:
        raise exceptions.IntelRaplPermissionError(file_names=["file1", "file2"])
    # Handlers read ``file_names`` to build their message, so pin it here.
    self.assertEqual(ctx.exception.file_names, ["file1", "file2"])

def test_gpu_power_usage_retrieval_error(self):
with self.assertRaises(exceptions.GPUPowerUsageRetrievalError):
Expand Down