diff --git a/carbonserver/carbonserver/api/errors.py b/carbonserver/carbonserver/api/errors.py index bc7169052..004eb66ab 100644 --- a/carbonserver/carbonserver/api/errors.py +++ b/carbonserver/carbonserver/api/errors.py @@ -69,7 +69,9 @@ def get_http_exception(exception) -> HTTPException: take an internal exception and return a HTTPException """ if isinstance(exception, UserException): - if isinstance(error := exception.error, NotAllowedError): + if isinstance(error := exception.error, UserError): + return HTTPException(status_code=401, detail=error.message) + elif isinstance(error := exception.error, NotAllowedError): return HTTPException(status_code=403, detail=error.message) elif isinstance(error := exception.error, NotFoundError): return HTTPException(status_code=404, detail=error.message) diff --git a/carbonserver/carbonserver/api/routers/authenticate.py b/carbonserver/carbonserver/api/routers/authenticate.py index bb96f85d3..25cf68aa8 100644 --- a/carbonserver/carbonserver/api/routers/authenticate.py +++ b/carbonserver/carbonserver/api/routers/authenticate.py @@ -34,9 +34,12 @@ def check_login( sign_up_service: SignUpService = Depends(Provide[ServerContainer.sign_up_service]), ): """ - return user data or redirect to login screen - null value if not logged in + Return user data or null if not logged in. + Returns 401 if token is present but invalid/expired. """ + if auth_user.auth_user is None: + return {"user": None} + sign_up_service.check_jwt_user(auth_user.auth_user, create=True) return {"user": auth_user.auth_user} diff --git a/carbonserver/carbonserver/api/services/auth_service.py b/carbonserver/carbonserver/api/services/auth_service.py index 163d3cdfd..c15847fe2 100644 --- a/carbonserver/carbonserver/api/services/auth_service.py +++ b/carbonserver/carbonserver/api/services/auth_service.py @@ -55,53 +55,69 @@ async def __call__( ): self.user_service = user_service if cookie_token is not None: - self.auth_user = jwt.decode( + self.auth_user = self._decode_cookie_token(cookie_token) + elif bearer_token is not None: + await self._validate_bearer_token(bearer_token, auth_provider) + self.auth_user = self._decode_bearer_token(bearer_token) + else: + self.auth_user = None + if self.error_if_not_found: + raise HTTPException( + status_code=401, + detail="No token provided, please log in", + ) + self.db_user = self._get_db_user(user_service) + return self + + def _decode_cookie_token(self, cookie_token: str): + try: + return jwt.decode( cookie_token, options={"verify_signature": False}, algorithms=["HS256", "RS256"], ) - elif bearer_token is not None: - if settings.environment != "develop" and auth_provider is not None: - LOGGER.debug( - f"Validating token with auth provider. Token: {bearer_token}" - ) - try: - await auth_provider.validate_access_token(bearer_token.credentials) - except Exception: - raise HTTPException(status_code=401, detail="Invalid token") - # cli user using auth provider token - self.auth_user = jwt.decode( + except Exception: + raise HTTPException(401, "Session expired, please log in") + + async def _validate_bearer_token(self, bearer_token, auth_provider): + if settings.environment != "develop" and auth_provider is not None: + try: + await auth_provider.validate_access_token(bearer_token.credentials) + except Exception: + raise HTTPException(status_code=401, detail="Invalid token") + + def _decode_bearer_token(self, bearer_token) -> dict: + try: + auth_user = jwt.decode( bearer_token.credentials, options={"verify_signature": False}, - algorithms=[ - "HS256", - "RS256", - ], + algorithms=["HS256", "RS256"], ) - if settings.environment == "develop": - try: - # test user - self.auth_user = jwt.decode( - bearer_token.credentials, - settings.jwt_key, - algorithms=[ - "HS256", - "RS256", - ], - ) - except Exception: - ... - else: - self.auth_user = None - if self.error_if_not_found: - raise HTTPException(status_code=401, detail="Unauthorized") + except Exception: + LOGGER.warning("Failed to decode bearer token") + raise HTTPException( + status_code=401, + detail="Invalid or expired token, please log in again", + ) + if settings.environment == "develop": + try: + auth_user = jwt.decode( + bearer_token.credentials, + settings.jwt_key, + algorithms=["HS256", "RS256"], + ) + except Exception: + pass + return auth_user + def _get_db_user(self, user_service) -> Optional[dict]: + if self.auth_user is None: + return None try: - self.db_user = user_service.get_user_by_id(self.auth_user["sub"]) + return user_service.get_user_by_id(self.auth_user["sub"]) except Exception: - self.db_user = None - return self + return None OptionalUserWithAuthDependency = UserWithAuthDependency(error_if_not_found=False) diff --git a/carbonserver/pyproject.toml b/carbonserver/pyproject.toml index 41979a10f..0349b4d2d 100644 --- a/carbonserver/pyproject.toml +++ b/carbonserver/pyproject.toml @@ -38,6 +38,7 @@ dependencies = [ "fastapi-oidc>=0.0.9", "authlib>=1.6.6", "itsdangerous>=2.2.0", + "pytest-asyncio>=1.3.0", ] [project.optional-dependencies] diff --git a/carbonserver/tests/__init__.py b/carbonserver/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/carbonserver/tests/api/routers/test_auth_fix.py b/carbonserver/tests/api/routers/test_auth_fix.py new file mode 100644 index 000000000..45bd0ec5a --- /dev/null +++ b/carbonserver/tests/api/routers/test_auth_fix.py @@ -0,0 +1,44 @@ +""" +Tests for issue #692 - missing/invalid tokens returning generic 500 error +instead of proper 401. +""" + + +class TestGetHttpException: + """Tests for errors.py get_http_exception() - core fix for issue #692.""" + + def test_user_error_api_key_unknown_returns_401(self): + from carbonserver.carbonserver.api.errors import ( + UserError, + UserErrorEnum, + UserException, + get_http_exception, + ) + + error = UserError( + code=UserErrorEnum.API_KEY_UNKNOWN, message="API key not found" + ) + result = get_http_exception(UserException(error)) + assert result.status_code == 401 + assert result.detail == "API key not found" + + def test_user_error_api_key_disabled_returns_401(self): + from carbonserver.carbonserver.api.errors import ( + UserError, + UserErrorEnum, + UserException, + get_http_exception, + ) + + error = UserError( + code=UserErrorEnum.API_KEY_DISABLE, message="API key disabled" + ) + result = get_http_exception(UserException(error)) + assert result.status_code == 401 + + def test_generic_exception_returns_500_with_detail(self): + from carbonserver.carbonserver.api.errors import get_http_exception + + result = get_http_exception(Exception("unexpected")) + assert result.status_code == 500 + assert result.detail == "Internal Server Error" diff --git a/carbonserver/uv.lock b/carbonserver/uv.lock index fff468702..04f2f8e70 100644 --- a/carbonserver/uv.lock +++ b/carbonserver/uv.lock @@ -49,14 +49,15 @@ wheels = [ [[package]] name = "authlib" -version = "1.6.9" +version = "1.7.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "cryptography" }, + { name = "joserfc" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/af/98/00d3dd826d46959ad8e32af2dbb2398868fd9fd0683c26e56d0789bd0e68/authlib-1.6.9.tar.gz", hash = "sha256:d8f2421e7e5980cc1ddb4e32d3f5fa659cfaf60d8eaf3281ebed192e4ab74f04", size = 165134, upload-time = "2026-03-02T07:44:01.998Z" } +sdist = { url = "https://files.pythonhosted.org/packages/3c/f2/e05664d5275ce811fd4e9df0a2b3f0086ee19a8a80358d95499fa82fd50c/authlib-1.7.1.tar.gz", hash = "sha256:8c09b0f9d080c823e594b52316af70f79a1fa4eed64d0363a076233c04ef063a", size = 175884, upload-time = "2026-05-04T08:11:25.033Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/53/23/b65f568ed0c22f1efacb744d2db1a33c8068f384b8c9b482b52ebdbc3ef6/authlib-1.6.9-py2.py3-none-any.whl", hash = "sha256:f08b4c14e08f0861dc18a32357b33fbcfd2ea86cfe3fe149484b4d764c4a0ac3", size = 244197, upload-time = "2026-03-02T07:44:00.307Z" }, + { url = "https://files.pythonhosted.org/packages/e0/82/730650ee5e5b598b7bfdc291b784bc2f6fe02a5671695485403365101088/authlib-1.7.1-py2.py3-none-any.whl", hash = "sha256:8470f4aa6b5590ac41bd81d6e6ee12448ce36a0da0af19bbed69fb53fb4e8ad9", size = 258826, upload-time = "2026-05-04T08:11:23.208Z" }, ] [[package]] @@ -125,6 +126,7 @@ dependencies = [ { name = "pydantic", extra = ["email"] }, { name = "pydantic-settings" }, { name = "pyjwt" }, + { name = "pytest-asyncio" }, { name = "python-dateutil" }, { name = "rapidfuzz" }, { name = "requests" }, @@ -144,8 +146,8 @@ dev = [ [package.metadata] requires-dist = [ { name = "alembic", specifier = "<2.0.0" }, - { name = "authlib", specifier = ">=1.2.1" }, { name = "authlib", specifier = ">=1.6.6" }, + { name = "authlib", specifier = ">=1.7.0" }, { name = "bcrypt", specifier = "<5.0.0" }, { name = "dependency-injector", specifier = "<5.0.0" }, { name = "fastapi", specifier = "<1.0.0" }, @@ -163,6 +165,7 @@ requires-dist = [ { name = "pydantic-settings", specifier = ">=2.0.0,<3.0.0" }, { name = "pyjwt" }, { name = "pytest", marker = "extra == 'dev'" }, + { name = "pytest-asyncio", specifier = ">=1.3.0" }, { name = "pytest-asyncio", marker = "extra == 'dev'" }, { name = "python-dateutil", specifier = "<3.0.0" }, { name = "rapidfuzz" }, @@ -587,6 +590,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" }, ] +[[package]] +name = "joserfc" +version = "1.6.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cryptography" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/de/c6/de8fdbdfa75c8ca04fead38a82d573df8a82906e984c349d58665f459558/joserfc-1.6.4.tar.gz", hash = "sha256:34ce5f499bfcc5e9ad4cc75077f9278ab3227b71da9aaf28f9ab705f8a560d3c", size = 231866, upload-time = "2026-04-13T13:15:40.632Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b6/f7/210b27752e972edb36d239315b08d3eb6b14824cc4a590da2337d195260b/joserfc-1.6.4-py3-none-any.whl", hash = "sha256:3e4a22b509b41908989237a045e25c8308d5fd47ab96bdae2dd8057c6451003a", size = 70464, upload-time = "2026-04-13T13:15:39.259Z" }, +] + [[package]] name = "jwcrypto" version = "1.5.7" diff --git a/codecarbon/core/neuron.py b/codecarbon/core/neuron.py new file mode 100644 index 000000000..80bfe8741 --- /dev/null +++ b/codecarbon/core/neuron.py @@ -0,0 +1,284 @@ +""" +Implements tracking for AWS Inferentia and Inferentia2 AI accelerator chips +via the Neuron sysfs interface. + +Sysfs power file location: +/sys/devices/virtual/neuron_device/neuron{i}/stats/power/utilization + +Sysfs power file format: +,,,, + +Where power values are percentages (0.00-100.00) of max TDP. +Updated every 60 seconds by the Neuron driver. + +IMPORTANT - Sampling frequency limitation: +The Neuron sysfs power file updates every 60 seconds. +codecarbon reads it every 15 seconds by default, meaning +the same value may be read up to 4 times between updates. + +Impact: +- Steady workloads: minimal impact, power is relatively constant +- Bursty workloads: may miss power spikes between updates +- Runs < 60 seconds: energy estimate may be based on a single sample +- Long runs: averages out over time, impact diminishes + +NOTE: Power is reported at device level, not per-process. +Accurate for exclusive instances, approximate for shared Neuron cores. +""" + +import glob +import os +from typing import Dict, List, Optional, Tuple + +from codecarbon.external.logger import logger + +# Maximum TDP per device type in watts. +# Only Inferentia (inf1) and Inferentia2 (inf2) are currently supported. +# Add other devices when their power specs are properly researched. +# TDP values are approximate and used to estimate watts from utilization%. +NEURON_DEVICE_TDP_WATTS = { + # long format from device_name sysfs file + "inferentia": 75, + "inferentia2": 100, + # shorthand format from instance_type sysfs file + "inf1": 75, + "inf2": 100, +} + + +def is_neuron_system() -> bool: + """ + Check if AWS Inferentia/Inferentia2 Neuron device is available + by checking if the Neuron sysfs directory exists. + Returns True if Neuron devices are present, False otherwise. + """ + return os.path.exists("/sys/devices/virtual/neuron_device") + + +class NeuronDevice: + """ + Represents a single AWS Inferentia/Inferentia2 Neuron device. + + Reads power utilization from Neuron sysfs at: + /sys/devices/virtual/neuron_device/neuron{i}/stats/power/utilization + + Power is reported as a percentage of max TDP, updated every 60 seconds. + Watts are estimated by multiplying utilization% by the device TDP. + + Accuracy limitations: + - Power derived from utilization% x TDP, not directly measured + - sysfs updates every 60 seconds, codecarbon reads every 15 seconds + - Device-level power only, not per-process attribution + - TDP values are approximate, not officially confirmed by AWS + for power tracking purposes + """ + + def __init__(self, device_path: str, device_index: int): + self._device_path = device_path + self._device_index = device_index + self._max_power_watts = self._get_max_power_watts() + + def _get_max_power_watts(self) -> float: + """ + Look up device TDP by reading device_name, instance_type, + or arch_type from the sysfs info directory. + Tries each file in order, returns first match. + Returns 0.0 if device is not supported or file cannot be read. + """ + try: + for filename in ["device_name", "instance_type", "arch_type"]: + path = os.path.join(self._device_path, "info", "architecture", filename) + if not os.path.exists(path): + continue + with open(path, "r") as f: + name = f.read().strip().lower() + tdp = NEURON_DEVICE_TDP_WATTS.get(name, 0.0) + if tdp > 0: + logger.debug( + f"NeuronDevice {self._device_index}: " + f"{filename}='{name}', TDP={tdp}W" + ) + return tdp + else: + logger.warning( + f"NeuronDevice {self._device_index}: " + f"device '{name}' is not currently supported. " + "Only Inferentia (inf1) and Inferentia2 (inf2) " + "are supported. Power will be reported as 0.0W." + ) + return 0.0 + logger.warning( + f"NeuronDevice {self._device_index}: " + "could not determine device type from sysfs info directory." + ) + return 0.0 + except Exception as e: + logger.debug( + f"NeuronDevice {self._device_index}: " + f"could not read device info: {e}" + ) + return 0.0 + + def _read_power_file(self) -> Optional[Tuple[str, float, float, float]]: + """ + Read and parse the Neuron sysfs power utilization file. + + Format: ,,,, + + Returns (status, min_pct, max_pct, avg_pct) or None on error. + """ + try: + power_file = os.path.join( + self._device_path, "stats", "power", "utilization" + ) + if not os.path.exists(power_file): + logger.debug( + f"NeuronDevice {self._device_index}: " + f"power file not found at {power_file}" + ) + return None + + with open(power_file, "r") as f: + content = f.read().strip() + + parts = content.split(",") + if len(parts) != 5: + logger.debug( + f"NeuronDevice {self._device_index}: " + f"unexpected power file format: {content}" + ) + return None + + status, _, min_pct, max_pct, avg_pct = parts + return status, float(min_pct), float(max_pct), float(avg_pct) + + except Exception as e: + logger.debug( + f"NeuronDevice {self._device_index}: " f"could not read power file: {e}" + ) + return None + + def get_utilization_pct(self) -> float: + """ + Returns the raw average power utilization percentage (0.00-100.00) + as reported directly by the Neuron sysfs interface. + This is the direct measured value with no estimation involved. + Returns 0.0 if status is not POWER_STATUS_VALID or on error. + """ + result = self._read_power_file() + if result is None: + return 0.0 + + status, _, _, avg_pct = result + + if status != "POWER_STATUS_VALID": + logger.debug( + f"NeuronDevice {self._device_index}: " + f"power status: {status}, returning 0.0%" + ) + return 0.0 + + logger.debug( + f"NeuronDevice {self._device_index}: " f"utilization={avg_pct:.2f}%" + ) + return avg_pct + + def get_power_watts(self) -> float: + """ + Returns estimated power in watts by multiplying utilization% + by the device TDP. + + NOTE: This is an estimation. For the raw measured value + use get_utilization_pct() instead. + Returns 0.0 if TDP is unknown or status is not POWER_STATUS_VALID. + """ + if self._max_power_watts == 0.0: + logger.debug( + f"NeuronDevice {self._device_index}: " + "TDP unknown, cannot estimate watts" + ) + return 0.0 + + result = self._read_power_file() + if result is None: + return 0.0 + + status, _, _, avg_pct = result + + if status != "POWER_STATUS_VALID": + logger.debug( + f"NeuronDevice {self._device_index}: " + f"power status: {status}, returning 0.0W" + ) + return 0.0 + + watts = (avg_pct / 100.0) * self._max_power_watts + logger.debug( + f"NeuronDevice {self._device_index}: " + f"avg={avg_pct:.2f}%, TDP={self._max_power_watts}W " + f"=> {watts:.2f}W" + ) + return watts + + def get_device_index(self) -> int: + return self._device_index + + +class AllNeuronDevices: + """ + Discovers and manages all AWS Inferentia/Inferentia2 Neuron devices + on the system by scanning the Neuron sysfs directory. + """ + + def __init__(self): + self._devices: List[NeuronDevice] = self._discover_devices() + logger.info(f"Found {len(self._devices)} Neuron device(s)") + + def _discover_devices(self) -> List[NeuronDevice]: + """ + Scan sysfs for Neuron devices and return a sorted list + of NeuronDevice objects. + Uses neuron[0-9]* glob to avoid matching neuron_core directories. + """ + base_path = "/sys/devices/virtual/neuron_device" + device_paths = sorted(glob.glob(os.path.join(base_path, "neuron[0-9]*"))) + devices = [] + for i, path in enumerate(device_paths): + if os.path.isdir(path): + devices.append(NeuronDevice(path, i)) + logger.info(f"Neuron device {i} found at {path}") + return devices + + @property + def device_count(self) -> int: + return len(self._devices) + + def get_total_power_watts(self) -> float: + """ + Sum estimated power in watts across all Neuron devices. + See NeuronDevice.get_power_watts() for accuracy limitations. + """ + return sum(d.get_power_watts() for d in self._devices) + + def get_total_utilization_pct(self) -> float: + """ + Average raw utilization percentage across all Neuron devices. + This is the direct measured value with no estimation involved. + Returns 0.0 if no devices are present. + """ + if not self._devices: + return 0.0 + return sum(d.get_utilization_pct() for d in self._devices) / len(self._devices) + + def get_device_details(self) -> List[Dict]: + """ + Return a list of dicts with per-device power and utilization. + """ + return [ + { + "device_index": d.get_device_index(), + "power_watts": d.get_power_watts(), + "utilization_pct": d.get_utilization_pct(), + } + for d in self._devices + ] diff --git a/codecarbon/core/resource_tracker.py b/codecarbon/core/resource_tracker.py index 67786189d..568ee0ec7 100644 --- a/codecarbon/core/resource_tracker.py +++ b/codecarbon/core/resource_tracker.py @@ -3,6 +3,7 @@ from codecarbon.core import cpu, gpu, powermetrics from codecarbon.core.config import normalize_gpu_ids +from codecarbon.core.neuron import is_neuron_system from codecarbon.core.util import ( detect_cpu_model, is_linux_os, @@ -10,13 +11,19 @@ is_mac_os, is_windows_os, ) -from codecarbon.external.hardware import CPU, GPU, MODE_CPU_LOAD, AppleSiliconChip +from codecarbon.external.hardware import ( + CPU, + GPU, + MODE_CPU_LOAD, + AppleSiliconChip, + NeuronChip, +) from codecarbon.external.logger import logger from codecarbon.external.ram import RAM class ResourceTracker: - cpu_tracker = gpu_tracker = ram_tracker = "Unspecified" + cpu_tracker = gpu_tracker = ram_tracker = neuron_tracker = "Unspecified" def __init__(self, tracker): self.tracker = tracker @@ -250,6 +257,21 @@ def set_GPU_tracking(self): self.tracker._conf.setdefault("gpu_count", 0) self.tracker._conf.setdefault("gpu_model", "") + def set_Neuron_tracking(self): + logger.info("[setup] Neuron Tracking...") + if is_neuron_system(): + logger.info("Tracking AWS Inferentia/Inferentia2 via Neuron sysfs") + neuron = NeuronChip() + self.tracker._hardware.append(neuron) + self.tracker._conf["neuron_count"] = neuron._devices.device_count + self.tracker._conf["neuron_model"] = neuron._model + self.neuron_tracker = "Neuron sysfs" + else: + logger.info("No Neuron device found.") + self.tracker._conf.setdefault("neuron_count", 0) + self.tracker._conf.setdefault("neuron_model", "") + self.neuron_tracker = "Unspecified" + def set_CPU_GPU_ram_tracking(self): """ Set up CPU, GPU and RAM tracking based on the user's configuration. @@ -258,11 +280,13 @@ def set_CPU_GPU_ram_tracking(self): self.set_RAM_tracking() self.set_CPU_tracking() self.set_GPU_tracking() + self.set_Neuron_tracking() logger.info( f"""The below tracking methods have been set up: RAM Tracking Method: {self.ram_tracker} CPU Tracking Method: {self.cpu_tracker} GPU Tracking Method: {self.gpu_tracker} + Neuron Tracking Method: {self.neuron_tracker} """ ) diff --git a/codecarbon/emissions_tracker.py b/codecarbon/emissions_tracker.py index 862eba2b4..bad2628ce 100644 --- a/codecarbon/emissions_tracker.py +++ b/codecarbon/emissions_tracker.py @@ -23,7 +23,7 @@ from codecarbon.core.units import Energy, Power, Time, Water from codecarbon.core.util import count_cpus, count_physical_cpus, suppress from codecarbon.external.geography import CloudMetadata, GeoMetadata -from codecarbon.external.hardware import CPU, GPU, AppleSiliconChip +from codecarbon.external.hardware import CPU, GPU, AppleSiliconChip, NeuronChip from codecarbon.external.logger import logger, set_logger_format, set_logger_level from codecarbon.external.ram import RAM from codecarbon.external.scheduler import PeriodicScheduler @@ -368,12 +368,18 @@ def __init__( self._gpu_utilization_history: List[float] = [] self._ram_utilization_history: List[float] = [] self._ram_used_history: List[float] = [] + self._cpu_temperature_history: List[float] = [] + self._gpu_temperature_history: List[float] = [] self._total_cpu_energy: Energy = Energy.from_energy(kWh=0) self._total_gpu_energy: Energy = Energy.from_energy(kWh=0) self._total_ram_energy: Energy = Energy.from_energy(kWh=0) self._cpu_power: Power = Power.from_watts(watts=0) self._gpu_power: Power = Power.from_watts(watts=0) self._ram_power: Power = Power.from_watts(watts=0) + self._total_neuron_energy: Energy = Energy.from_energy(kWh=0) + self._neuron_power: Power = Power.from_watts(watts=0) + self._neuron_power_sum: float = 0.0 + self._neuron_utilization_history: List[float] = [] # Running average tracking for power self._cpu_power_sum: float = 0.0 self._gpu_power_sum: float = 0.0 @@ -548,6 +554,9 @@ def start(self) -> None: self._ram_utilization_history.clear() self._ram_used_history.clear() self._gpu_utilization_history.clear() + self._cpu_temperature_history.clear() + self._gpu_temperature_history.clear() + self._neuron_utilization_history.clear() # Read initial energy for hardware for hardware in self._hardware: @@ -598,6 +607,9 @@ def start_task(self, task_name=None) -> None: self._ram_utilization_history.clear() self._ram_used_history.clear() self._gpu_utilization_history.clear() + self._cpu_temperature_history.clear() + self._gpu_temperature_history.clear() + self._neuron_utilization_history.clear() # Read initial energy for hardware for hardware in self._hardware: @@ -922,6 +934,28 @@ def _prepare_emissions_data(self) -> EmissionsData: tracking_mode=self._conf.get("tracking_mode"), pue=self._pue, wue=self._wue, + cpu_temperature=( + sum(self._cpu_temperature_history) / len(self._cpu_temperature_history) + if self._cpu_temperature_history + else 0.0 + ), + gpu_temperature=( + sum(self._gpu_temperature_history) / len(self._gpu_temperature_history) + if self._gpu_temperature_history + else 0.0 + ), + neuron_power=( + self._neuron_power_sum / self._power_measurement_count + if self._power_measurement_count > 0 + else self._neuron_power.W + ), + neuron_energy=self._total_neuron_energy.kWh, + neuron_utilization_pct=( + sum(self._neuron_utilization_history) + / len(self._neuron_utilization_history) + if self._neuron_utilization_history + else 0.0 + ), ) logger.debug(total_emissions) return total_emissions @@ -973,6 +1007,10 @@ def _monitor_power(self) -> None: self._ram_utilization_history.append(psutil.virtual_memory().percent) self._ram_used_history.append(psutil.virtual_memory().used / (1024**3)) + for hardware in self._hardware: + if isinstance(hardware, CPU): + self._cpu_temperature_history.append(hardware.get_cpu_temperature()) + # Collect GPU utilization metrics for hardware in self._hardware: if isinstance(hardware, GPU): @@ -980,13 +1018,23 @@ def _monitor_power(self) -> None: gpu_details = hardware.devices.get_gpu_details() for gpu_index, gpu_detail in enumerate(gpu_details): resolved_gpu_index = gpu_detail.get("gpu_index", gpu_index) - if ( - resolved_gpu_index in gpu_ids_to_monitor - and "gpu_utilization" in gpu_detail - ): - self._gpu_utilization_history.append( - gpu_detail["gpu_utilization"] - ) + if resolved_gpu_index in gpu_ids_to_monitor: + + if "gpu_utilization" in gpu_detail: + self._gpu_utilization_history.append( + gpu_detail["gpu_utilization"] + ) + + if "temperature" in gpu_detail: + self._gpu_temperature_history.append( + gpu_detail["temperature"] + ) + + for hardware in self._hardware: + if isinstance(hardware, NeuronChip): + self._neuron_utilization_history.append( + hardware._devices.get_total_utilization_pct() + ) def _do_measurements(self) -> None: for hardware in self._hardware: @@ -1052,6 +1100,14 @@ def _do_measurements(self) -> None: f"Energy consumed for all AppleSilicon GPUs : {self._total_gpu_energy.kWh:.6f} kWh" + f". Total GPU Power : {self._gpu_power.W} W" ) + elif isinstance(hardware, NeuronChip): + self._total_neuron_energy += energy + self._neuron_power = power + self._neuron_power_sum += power.W + logger.info( + f"Energy consumed for Neuron : {self._total_neuron_energy.kWh:.6f} kWh" + + f". Neuron Power : {self._neuron_power.W} W" + ) else: logger.error(f"Unknown hardware type: {hardware} ({type(hardware)})") h_time = time.perf_counter() - h_time diff --git a/codecarbon/external/hardware.py b/codecarbon/external/hardware.py index 8ac4de8f8..10b194222 100644 --- a/codecarbon/external/hardware.py +++ b/codecarbon/external/hardware.py @@ -13,6 +13,7 @@ from codecarbon.core.cpu import IntelPowerGadget, IntelRAPL from codecarbon.core.gpu import AllGPUDevices +from codecarbon.core.neuron import AllNeuronDevices from codecarbon.core.powermetrics import ApplePowermetrics from codecarbon.core.units import Energy, Power, Time from codecarbon.core.util import count_cpus, detect_cpu_model @@ -409,6 +410,40 @@ def monitor_power(self): cpu_power = self._get_power_from_cpus() self._power_history.append(cpu_power) + def get_cpu_temperature(self) -> float: + """ + Get average CPU temperature in Celsius. + Supported on Linux (Intel + AMD) and Windows Intel via Power Gadget. + Returns 0.0 if temperature cannot be read on the current platform. + """ + try: + if self._mode == "intel_power_gadget": + all_cpu_details = self._intel_interface.get_cpu_details() + for metric, value in all_cpu_details.items(): + if re.match(r"^CPU Temperature", metric): + return float(value) + return 0.0 + + elif self._mode in ["intel_rapl", MODE_CPU_LOAD, "constant"]: + temps = psutil.sensors_temperatures() + if not temps: + logger.debug( + "get_cpu_temperature: psutil.sensors_temperatures() " + "returned no data on this platform" + ) + return 0.0 + for key in ["coretemp", "k10temp", "cpu_thermal"]: + if key in temps: + readings = temps[key] + avg = sum(r.current for r in readings) / len(readings) + logger.debug(f"get_cpu_temperature: {key} avg = {avg:.1f}°C") + return avg + return 0.0 + + except Exception as e: + logger.debug(f"get_cpu_temperature: Could not read CPU temperature: {e}") + return 0.0 + def get_model(self): return self._model @@ -522,3 +557,47 @@ def from_utils( logger.warning("Could not read AppleSiliconChip model.") return cls(output_dir=output_dir, model=model, chip_part=chip_part) + + +@dataclass +class NeuronChip(BaseHardware): + """ + Tracks AWS Inferentia/Inferentia2 power consumption + via the Neuron sysfs interface. + + Power is estimated from utilization% x TDP. + Utilization% is the raw measured value from sysfs. + + Sampling limitation: Neuron sysfs updates every 60 seconds. + codecarbon reads every 15 seconds so the same value may be + read up to 4 times between updates. Energy estimates are most + accurate for steady workloads and runs longer than 60 seconds. + + NOTE: Neuron sysfs reports device-level power, not per-process. + Accurate for exclusive instances, approximate for shared Neuron cores. + """ + + def __init__(self): + self._devices = AllNeuronDevices() + self._model = "AWS Inferentia/Inferentia2" + logger.warning( + "Neuron power sysfs updates every 60 seconds. " + "codecarbon reads every 15 seconds so power readings " + "may be stale between updates. Energy estimates are most " + "accurate for runs longer than 60 seconds with steady workloads." + ) + + def __repr__(self) -> str: + return f"NeuronChip({self._model}, " f"{self._devices.device_count} device(s))" + + def total_power(self) -> Power: + """ + Returns total estimated power across all Neuron devices in watts. + Called every 15 seconds by _do_measurements() in tracker.py. + Power is estimated from utilization% x TDP. + """ + watts = self._devices.get_total_power_watts() + return Power.from_watts(watts) + + def description(self) -> str: + return repr(self) diff --git a/codecarbon/output_methods/emissions_data.py b/codecarbon/output_methods/emissions_data.py index 17544aa51..38e2a92b0 100644 --- a/codecarbon/output_methods/emissions_data.py +++ b/codecarbon/output_methods/emissions_data.py @@ -47,6 +47,11 @@ class EmissionsData: on_cloud: str = "N" pue: float = 1 wue: float = 0 + cpu_temperature: float = 0.0 # ADD + gpu_temperature: float = 0.0 # ADD + neuron_power: float = 0.0 + neuron_energy: float = 0.0 + neuron_utilization_pct: float = 0.0 @property def values(self) -> OrderedDict: @@ -110,6 +115,11 @@ class TaskEmissionsData: ram_utilization_percent: float = 0.0 ram_used_gb: float = 0.0 on_cloud: str = "N" + cpu_temperature: float = 0.0 + gpu_temperature: float = 0.0 + neuron_power: float = 0.0 + neuron_energy: float = 0.0 + neuron_utilization_pct: float = 0.0 @property def values(self) -> OrderedDict: diff --git a/codecarbon/output_methods/file.py b/codecarbon/output_methods/file.py index e13f45e90..b11b8a7ad 100644 --- a/codecarbon/output_methods/file.py +++ b/codecarbon/output_methods/file.py @@ -147,3 +147,6 @@ def task_out(self, data: List[TaskEmissionsData], experiment_name: str): new_df = new_df.dropna(axis=1, how="all") df = new_df df.to_csv(save_task_file_path, index=False) + + def live_out(self, total: EmissionsData, delta: EmissionsData): + self.out(total, delta) diff --git a/docker-compose.yml b/docker-compose.yml index 777f5c47f..ad4c965c9 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -74,8 +74,8 @@ services: POSTGRES_PASSWORD: ${DATABASE_PASS:-supersecret} POSTGRES_USER: ${DATABASE_USER:-codecarbon-user} image: postgres:13 - # ports: - # - 5480:5432 + ports: + - 5480:5432 restart: unless-stopped volumes: - postgres_codecarbon_data:/var/lib/postgresql/data:rw diff --git a/docs/Contributions/cputemp.md b/docs/Contributions/cputemp.md new file mode 100644 index 000000000..c656cb258 --- /dev/null +++ b/docs/Contributions/cputemp.md @@ -0,0 +1,107 @@ +# Contributions + +Added a function in Hardware.py that tracks cpu temps live in Celsius, this covers issue 1008 + +### Added code + +``` python +def get_cpu_temperature(self) -> float: + """ + Get average CPU temperature in Celsius. + Supported on Linux (Intel + AMD) and Windows Intel via Power Gadget. + Returns 0.0 if temperature cannot be read on the current platform. + """ + try: + if self._mode == "intel_power_gadget": + all_cpu_details = self._intel_interface.get_cpu_details() + for metric, value in all_cpu_details.items(): + if re.match(r"^CPU Temperature", metric): + return float(value) + return 0.0 + + elif self._mode in ["intel_rapl", MODE_CPU_LOAD, "constant"]: + temps = psutil.sensors_temperatures() + if not temps: + logger.debug( + "get_cpu_temperature: psutil.sensors_temperatures() " + "returned no data on this platform" + ) + return 0.0 + for key in ["coretemp", "k10temp", "cpu_thermal"]: + if key in temps: + readings = temps[key] + avg = sum(r.current for r in readings) / len(readings) + logger.debug(f"get_cpu_temperature: {key} avg = {avg:.1f}°C") + return avg + return 0.0 + + except Exception as e: + logger.debug(f"get_cpu_temperature: Could not read CPU temperature: {e}") + return 0.0 +``` +### Added Workflow + +```python +name: Test Temperature Tracking + +on: + push: + branches: [ main ] + workflow_dispatch: + +jobs: + test-temperature: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + + - name: Install dependencies + run: | + pip install -e . + pip install pandas + + - name: Check sensors available + run: | + sudo apt-get install -y lm-sensors + python3 -c "import psutil; print('Sensors:', psutil.sensors_temperatures())" + + - name: Run temperature test + run: | + python3 -c " + import time + from codecarbon import EmissionsTracker + + tracker = EmissionsTracker( + project_name='temperature_test', + measure_power_secs=15, + save_to_file=True, + output_file='emissions_temp_test.csv', + log_level='debug' + ) + + tracker.start() + total = sum(range(10_000_000)) + time.sleep(30) + emissions = tracker.stop() + + print(f'Emissions: {emissions:.6f} kg CO2') + print(f'CPU temperature: {tracker.final_emissions_data.cpu_temperature:.1f}C') + print(f'GPU temperature: {tracker.final_emissions_data.gpu_temperature:.1f}C') + + import pandas as pd + df = pd.read_csv('emissions_temp_test.csv') + print('CSV columns:', df.columns.tolist()) + print('Temperature values:') + print(df[['cpu_temperature', 'gpu_temperature']]) + " +``` + +Allowed for CodeCarbon to track it and input it in to the CSV data set, shown in terminal below +![](../images/CpuTemp.png){.align-center width="700px" height="400px"} + diff --git a/docs/images/CpuTemp.png b/docs/images/CpuTemp.png new file mode 100644 index 000000000..463dc24f7 Binary files /dev/null and b/docs/images/CpuTemp.png differ diff --git a/requirements/requirements-api.txt b/requirements/requirements-api.txt index 42abf97f9..bf22ed371 100644 --- a/requirements/requirements-api.txt +++ b/requirements/requirements-api.txt @@ -32,6 +32,10 @@ click==8.3.1 # rich-toolkit # typer # uvicorn +colorama==0.4.6 + # via + # click + # uvicorn cryptography==46.0.7 # via # authlib @@ -65,6 +69,8 @@ fastar==0.9.0 # via fastapi-cloud-cli fief-client==0.20.0 # via carbonserver (carbonserver/pyproject.toml) +greenlet==3.3.2 + # via sqlalchemy h11==0.16.0 # via # httpcore @@ -208,8 +214,6 @@ uvicorn==0.38.0 # fastapi # fastapi-cli # fastapi-cloud-cli -uvloop==0.22.1 - # via uvicorn watchfiles==1.1.1 # via uvicorn websockets==15.0.1 diff --git a/tests/test_cpu.py b/tests/test_cpu.py index 1e1308812..c18bcae2a 100644 --- a/tests/test_cpu.py +++ b/tests/test_cpu.py @@ -5,6 +5,7 @@ import unittest from unittest import mock +import psutil import pytest from codecarbon.core.config import normalize_gpu_ids @@ -66,6 +67,12 @@ def test_is_psutil_available_without_nice(self, mock_cpu_times): def test_is_psutil_not_available_on_exception(self, mock_cpu_times): self.assertFalse(is_psutil_available()) + @mock.patch("psutil.sensors_temperatures") + def psutil_returns_expected_temperature(self, mock_cpu_times): + mock_temp = mock.Mock() + mock_temp.return_value = {"coretemp": 50, "k10temp": 50, "cpu_thermal": 50} + self.assertEqual(psutil.sensors_temperatures(), 50) + class TestRAPLHelperFunctions(unittest.TestCase): def test_get_candidate_bases_for_custom_dir(self): diff --git a/tests/test_data/emissions_valid_headers.csv b/tests/test_data/emissions_valid_headers.csv index b7493c902..aaf9ad182 100644 --- a/tests/test_data/emissions_valid_headers.csv +++ b/tests/test_data/emissions_valid_headers.csv @@ -1,2 +1,2 @@ -timestamp,project_name,run_id,experiment_id,duration,emissions,emissions_rate,cpu_power,gpu_power,ram_power,cpu_energy,gpu_energy,ram_energy,energy_consumed,water_consumed,country_name,country_iso_code,region,cloud_provider,cloud_region,os,python_version,codecarbon_version,cpu_count,cpu_model,gpu_count,gpu_model,longitude,latitude,ram_total_size,tracking_mode,cpu_utilization_percent,gpu_utilization_percent,ram_utilization_percent,ram_used_gb,on_cloud,pue,wue +timestamp,project_name,run_id,experiment_id,duration,emissions,emissions_rate,cpu_power,gpu_power,ram_power,cpu_energy,gpu_energy,ram_energy,energy_consumed,water_consumed,country_name,country_iso_code,region,cloud_provider,cloud_region,os,python_version,codecarbon_version,cpu_count,cpu_model,gpu_count,gpu_model,longitude,latitude,ram_total_size,tracking_mode,cpu_utilization_percent,gpu_utilization_percent,ram_utilization_percent,ram_used_gb,on_cloud,pue,wue,cpu_temperature,gpu_temperature 2021-09-23T15:04:51,codecarbon,0a578547-1d6b-4e2f-be0c-7ad10f2f7c97,test,161.20380687713623,0.0004490989249167,0.0027859076880178,0.269999999999999,0.0,12.884901888000002,0.0,0,0.00057442898176,0.00057442898176,0.1,Morocco,MAR,casablanca-settat,,,macOS-10.15.7-x86_64-i386-64bit,3.8.0,2.1.3,12,Intel(R) Core(TM) i7-8850H CPU @ 2.60GHz,,,-7.9084,33.5932,,machine,0.0,0.0,0.0,0.0,N,1.0,0.0 diff --git a/tests/test_live_out.py b/tests/test_live_out.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_neuron.py b/tests/test_neuron.py new file mode 100644 index 000000000..e69de29bb diff --git a/webapp/package-lock.json b/webapp/package-lock.json index f7e2bc14b..eaa3f78f8 100644 --- a/webapp/package-lock.json +++ b/webapp/package-lock.json @@ -2259,7 +2259,6 @@ "integrity": "sha512-UaicktuQI+9UKyA4njtDOGBD/67t8YEBt2xdfqu8+gP9hqPUPsiXlNPcpS2gVdjmis5GKPG3fCxbQLVgxsQZ8w==", "devOptional": true, "license": "MIT", - "peer": true, "dependencies": { "csstype": "^3.0.2" } @@ -2270,7 +2269,6 @@ "integrity": "sha512-jFf/woGTVTjUJsl2O7hcopJ1r0upqoq/vIOoCj0yLh3RIXxWcljlpuZ+vEBRXsymD1jhfeJrlyTy/S1UW+4y1w==", "devOptional": true, "license": "MIT", - "peer": true, "peerDependencies": { "@types/react": "^19.0.0" } @@ -2320,7 +2318,6 @@ "integrity": "sha512-BtE0k6cjwjLZoZixN0t5AKP0kSzlGu7FctRXYuPAm//aaiZhmfq1JwdYpYr1brzEspYyFeF+8XF5j2VK6oalrA==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@typescript-eslint/scope-manager": "8.54.0", "@typescript-eslint/types": "8.54.0", @@ -2807,7 +2804,6 @@ "integrity": "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==", "dev": true, "license": "MIT", - "peer": true, "bin": { "acorn": "bin/acorn" }, @@ -3970,7 +3966,6 @@ "integrity": "sha512-LEyamqS7W5HB3ujJyvi0HQK/dtVINZvd5mAAp9eT5S/ujByGjiZLCzPcHVzuXbpJDJF/cxwHlfceVUDZ2lnSTw==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@eslint-community/eslint-utils": "^4.8.0", "@eslint-community/regexpp": "^4.12.1", @@ -4157,7 +4152,6 @@ "integrity": "sha512-whOE1HFo/qJDyX4SnXzP4N6zOWn79WhnCUY/iDR0mPfQZO8wcYE4JClzI2oZrhBnnMUCBCHZhO6VQyoBU95mZA==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@rtsao/scc": "^1.1.0", "array-includes": "^3.1.9", @@ -5434,7 +5428,6 @@ "resolved": "https://registry.npmjs.org/jiti/-/jiti-1.21.7.tgz", "integrity": "sha512-/imKNG4EbWNrVjoNC/1H5/9GFy+tqjGBHCaSsN+P2RnPqjsLmv6UD3Ej+Kj8nBWaRAwyk7kK5ZUc+OEatnTR3A==", "license": "MIT", - "peer": true, "bin": { "jiti": "bin/jiti.js" } @@ -6175,7 +6168,6 @@ } ], "license": "MIT", - "peer": true, "dependencies": { "nanoid": "^3.3.11", "picocolors": "^1.1.1", @@ -6385,7 +6377,6 @@ "resolved": "https://registry.npmjs.org/react/-/react-19.1.0.tgz", "integrity": "sha512-FS+XFBNvn3GTAWq26joslQgWNoFu08F4kl0J4CgdNKADkdSGXQyTCnKteIAJy96Br6YbpEU1LSzV5dYtjMkMDg==", "license": "MIT", - "peer": true, "engines": { "node": ">=0.10.0" } @@ -6426,7 +6417,6 @@ "resolved": "https://registry.npmjs.org/react-dom/-/react-dom-19.1.0.tgz", "integrity": "sha512-Xs1hdnE+DyKgeHJeJznQmYMIBG3TKIHJJT95Q58nHLSrElKlGQqDTR2HQ9fx5CN/Gk6Vh/kupBTDLU11/nDk/g==", "license": "MIT", - "peer": true, "dependencies": { "scheduler": "^0.26.0" }, @@ -7497,7 +7487,6 @@ "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.3.tgz", "integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==", "license": "MIT", - "peer": true, "engines": { "node": ">=12" }, @@ -7658,7 +7647,6 @@ "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", "dev": true, "license": "Apache-2.0", - "peer": true, "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" diff --git a/webapp/src/helpers/api-client.ts b/webapp/src/helpers/api-client.ts index bbae063e0..1377470a9 100644 --- a/webapp/src/helpers/api-client.ts +++ b/webapp/src/helpers/api-client.ts @@ -30,6 +30,10 @@ export async function fetchApiClient( }); if (!response.ok) { + if (response.status == 401) { + window.location.href = `${process.env.NEXT_PUBLIC_API_URL}/auth/login?redirect=${process.env.NEXT_PUBLIC_BASE_URL}/home?auth=true`; + return null; + } let errorMessage = `API error: ${response.status} ${response.statusText}`; try { const errorData = await response.json(); diff --git a/webapp/src/helpers/api-server.ts b/webapp/src/helpers/api-server.ts index d40e43851..9b432a80a 100644 --- a/webapp/src/helpers/api-server.ts +++ b/webapp/src/helpers/api-server.ts @@ -1,6 +1,7 @@ "use server"; import { cookies } from "next/headers"; +import { redirect } from "next/navigation"; import { SESSION_COOKIE_NAME } from "./auth"; const API_BASE = process.env.NEXT_PUBLIC_API_URL; @@ -19,7 +20,9 @@ export async function fetchApiServer( const sessionCookie = cookieStore.get(SESSION_COOKIE_NAME); if (!sessionCookie?.value) { - throw new Error("No authentication session found"); + redirect( + `${process.env.NEXT_PUBLIC_API_URL}/auth/login?redirect=${process.env.NEXT_PUBLIC_BASE_URL}/home?auth=true`, + ); } const response = await fetch(`${API_BASE}${endpoint}`, { @@ -32,6 +35,13 @@ export async function fetchApiServer( }); if (!response.ok) { + + if ((response.status === 401)) { + redirect( + `${process.env.NEXT_PUBLIC_API_URL}/auth/login?redirect=${process.env.NEXT_PUBLIC_BASE_URL}/home?auth=true`, + ); + } + let errorMessage = `API error: ${response.status} ${response.statusText}`; try { const errorData = await response.json(); @@ -59,6 +69,15 @@ export async function fetchApiServer( return null; } } catch (error) { + + if( + error instanceof Error && + (error.message == "NEXT_REDIRECT" || + (error as any).digest?.startsWith("NEXT_REDIRECT")) + ) { + throw error; + } + // Log server-side error with more details console.error("API server request failed:", { endpoint, diff --git a/webapp/src/helpers/swr.tsx b/webapp/src/helpers/swr.tsx index 0a16fb956..0bbde4903 100644 --- a/webapp/src/helpers/swr.tsx +++ b/webapp/src/helpers/swr.tsx @@ -5,10 +5,18 @@ export const SWRProvider = ({ children }: { children: React.ReactNode }) => { }; export const fetcher = async (url: string) => { - const res = await fetch(`${process.env.NEXT_PUBLIC_API_URL}${url}`); + const res = await fetch(`${process.env.NEXT_PUBLIC_API_URL}${url}`, { + credentials: "include", + }); + if (!res.ok) { + if (res.status === 401) { + window.location.href = `${process.env.NEXT_PUBLIC_API_URL}/auth/login?redirect=${process.env.NEXT_PUBLIC_BASE_URL}/home?auth=true`; + return null; + } console.error("Failed to fetch data", res.statusText); throw new Error("Failed to fetch data"); } + return res.json(); };