diff --git a/.github/workflows/test-using-pytest.yml b/.github/workflows/test-using-pytest.yml index cbb02e322..7abcbb904 100644 --- a/.github/workflows/test-using-pytest.yml +++ b/.github/workflows/test-using-pytest.yml @@ -25,7 +25,7 @@ jobs: run: | sudo apt-get update sudo apt-get -y upgrade - sudo apt-get install -y python3 python3-pip python3-aiohttp python3-msgpack python3-aiodns python3-alembic python3-sqlalchemy python3-setproctitle redis python3-aioredis python3-psutil sudo acl curl systemd-container squashfs-tools debootstrap python3-packaging python3-cpuinfo python3-nftables python3-jsonschema nftables libsystemd-dev cmake libdbus-1-dev libglib2.0-dev + sudo apt-get install -y python3 python3-pip python3-aiohttp python3-msgpack python3-aiodns python3-alembic python3-sqlalchemy python3-setproctitle redis python3-aioredis python3-psutil sudo acl curl systemd-container squashfs-tools debootstrap python3-packaging python3-cpuinfo python3-nftables python3-jsonschema nftables libsystemd-dev cmake libdbus-1-dev libglib2.0-dev lshw python3-jwcrypto pip install --upgrade typing-extensions types-PyYAML - name: Install required Python packages diff --git a/docker/vm_supervisor-dev.dockerfile b/docker/vm_supervisor-dev.dockerfile index e78a02ec1..2d9e74eed 100644 --- a/docker/vm_supervisor-dev.dockerfile +++ b/docker/vm_supervisor-dev.dockerfile @@ -5,7 +5,7 @@ FROM debian:bookworm RUN apt-get update && apt-get -y upgrade && apt-get install -y \ sudo acl curl squashfs-tools git \ python3 python3-aiohttp python3-alembic python3-msgpack python3-pip python3-aiodns python3-aioredis\ - python3-nftables python3-psutil python3-setproctitle python3-sqlalchemy python3-packaging python3-cpuinfo ndppd nftables \ + python3-nftables python3-psutil python3-setproctitle python3-sqlalchemy python3-packaging ndppd nftables \ && rm -rf /var/lib/apt/lists/* RUN useradd jailman diff --git a/packaging/aleph-vm/DEBIAN/control b/packaging/aleph-vm/DEBIAN/control index 6b42eea41..e2e73f8f0 
# Matches the DDR generation inside an lshw bank description.
_DDR_PATTERN = re.compile(r"(DDR[2-6])")


async def get_hardware_info():
    """Return the CPU and memory nodes reported by ``lshw``.

    Runs ``lshw -sanitize -json`` and scans the children of the first child of
    the root node for the node whose id is ``"cpu"`` and for the node of class
    ``"memory"`` whose id is ``"memory"``.

    Returns:
        A dict with the keys ``"cpu"`` and ``"memory"``. Each value is the raw
        lshw node (a dict), or ``None`` when lshw did not report such a node.

    Raises:
        FileNotFoundError: If the ``lshw`` executable is not in ``PATH``.
    """
    lshw_path = shutil.which("lshw")
    if not lshw_path:
        # Explicit raise instead of ``assert``: asserts are stripped under ``python -O``.
        raise FileNotFoundError("lshw not found in PATH. apt install lshw.")

    lshw_output = await run_in_subprocess([lshw_path, "-sanitize", "-json"])
    data = json.loads(lshw_output)
    # NOTE(review): some lshw builds appear to wrap the root node in a JSON
    # array - confirm against the lshw versions we package for.
    if isinstance(data, list):
        data = data[0] if data else {}

    hw_info = {"cpu": None, "memory": None}

    # Tolerate a missing or empty "children" list instead of crashing on it.
    top_level = data.get("children") or [{}]
    for hw in top_level[0].get("children", []):
        hw_id = hw.get("id")
        if hw_id == "cpu":
            hw_info["cpu"] = hw
        elif hw.get("class") == "memory" and hw_id == "memory":
            hw_info["memory"] = hw

    return hw_info


def get_cpu_info(hw):
    """Summarize the CPU node of a ``get_hardware_info()`` result.

    Args:
        hw: Dict with a ``"cpu"`` entry holding the lshw CPU node (or ``None``).

    Returns:
        A dict with the keys ``architecture`` (``"x86_64"``, ``"arm64"`` or
        ``None``), ``vendor``, ``model`` (lshw ``product``), ``frequency``
        (lshw ``capacity``) and ``count`` (``psutil.cpu_count()``). Values
        that lshw did not report are ``None``.
    """
    cpu_info = hw.get("cpu") or {}
    capabilities = cpu_info.get("capabilities") or {}

    if "x86_64" in capabilities or "x86-64" in capabilities:
        architecture = "x86_64"
    elif "arm64" in capabilities or "arm-64" in capabilities:
        architecture = "arm64"
    else:
        architecture = None

    vendor = cpu_info.get("vendor")
    # lshw vendor implementation => https://github.com/lyonel/lshw/blob/15e4ca64647ad119b69be63274e5de2696d3934f/src/core/cpuinfo.cc#L308
    # Map the lshw display names to the vendor ids used by the rest of the API.
    if vendor:
        if "Intel Corp" in vendor:
            vendor = "GenuineIntel"
        elif "Advanced Micro Devices [AMD]" in vendor:
            vendor = "AuthenticAMD"

    return {
        "architecture": architecture,
        "vendor": vendor,
        "model": cpu_info.get("product"),
        "frequency": cpu_info.get("capacity"),
        "count": psutil.cpu_count(),
    }


def get_memory_info(hw):
    """Summarize the memory node of a ``get_hardware_info()`` result.

    The memory type and clock are read from the memory banks (the node's
    ``children``). The first bank whose description contains a DDR generation
    decides the type; its clock is used when present, otherwise the first
    clock seen on any earlier bank is kept.

    Args:
        hw: Dict with a ``"memory"`` entry holding the lshw memory node (or ``None``).

    Returns:
        A dict with the keys ``size``, ``units``, ``type`` (``""`` when
        unknown), ``clock`` (``None`` when unknown) and ``clock_units``
        (``"Hz"`` when a clock is known, ``""`` otherwise).
    """
    mem_info = hw.get("memory") or {}

    memory_type = ""
    # ``None`` (not ``""``) marks "unknown" so that clock_units below stays consistent.
    memory_clock = None
    for bank in mem_info.get("children") or []:
        if memory_clock is None:
            # Remember the first reported clock; later banks without one must not erase it.
            memory_clock = bank.get("clock")
        matched = _DDR_PATTERN.search(bank.get("description") or "")
        if matched:
            memory_type = matched.group(0)
            memory_clock = bank.get("clock", memory_clock)
            break

    return {
        "size": mem_info.get("size"),
        "units": mem_info.get("units"),
        "type": memory_type,
        "clock": memory_clock,
        "clock_units": "Hz" if memory_clock is not None else "",
    }
class ExtendedCpuProperties(CpuProperties):
    """CPU properties extended with the model, frequency and CPU count.

    Filled by ``get_machine_capability`` from the dict returned by ``get_cpu_info``.
    """

    model: str | None = Field(default=None, description="CPU model")
    # Taken from the lshw "capacity" value of the CPU node; presumably Hz - TODO confirm.
    frequency: int | None = Field(default=None, description="CPU frequency")
    # Value of psutil.cpu_count() at the time the capability is computed.
    count: int | None = Field(default=None, description="CPU count")


class MemoryProperties(BaseModel):
    """Memory properties, as summarized by ``get_memory_info`` from the lshw memory node."""

    size: int | None = Field(default=None, description="Memory size")
    units: str | None = Field(default=None, description="Memory size units")
    # DDR generation found in a memory bank description, "" when none was found.
    type: str | None = Field(default=None, description="Memory type")
    clock: int | None = Field(default=None, description="Memory clock")
    clock_units: str | None = Field(default=None, description="Memory clock units")


class MachineCapability(BaseModel):
    """Hardware capability of the machine, as exposed by ``/about/capability``."""

    cpu: ExtendedCpuProperties
    memory: MemoryProperties


@async_cache
async def get_machine_capability() -> MachineCapability:
    """Build the CPU and memory capability of this machine.

    The hardware description comes from ``get_hardware_info()``; the CPU
    features list contains the AMD SEV variants reported as supported by the
    ``check_amd_sev*_supported`` helpers. The result is memoized by
    ``async_cache``, so the hardware is inspected once per process.

    Returns:
        The ``MachineCapability`` describing the CPU and the memory.
    """
    hw = await get_hardware_info()
    cpu_info = get_cpu_info(hw)
    mem_info = get_memory_info(hw)

    # Resolved at call time on purpose so tests can patch the check helpers.
    sev_checks = (
        ("sev", check_amd_sev_supported),
        ("sev_es", check_amd_sev_es_supported),
        ("sev_snp", check_amd_sev_snp_supported),
    )
    features = [name for name, is_supported in sev_checks if is_supported()]

    return MachineCapability(
        cpu=ExtendedCpuProperties(
            architecture=cpu_info["architecture"],
            vendor=cpu_info["vendor"],
            model=cpu_info["model"],
            frequency=cpu_info["frequency"],
            count=cpu_info["count"],
            features=features,
        ),
        memory=MemoryProperties(
            size=mem_info["size"],
            units=mem_info["units"],
            type=mem_info["type"],
            clock=mem_info["clock"],
            # Previously dropped: report the units computed by get_memory_info,
            # keeping None (not "") when no clock is known.
            clock_units=mem_info.get("clock_units") or None,
        ),
    )
web.json_response(text=capability.json(exclude_none=False)) + + class Allocation(BaseModel): """An allocation is the set of resources that are currently allocated on this orchestrator. It contains the item_hashes of all persistent VMs, instances, on-demand VMs and jobs. diff --git a/src/aleph/vm/orchestrator/supervisor.py b/src/aleph/vm/orchestrator/supervisor.py index 36bd42dad..087352301 100644 --- a/src/aleph/vm/orchestrator/supervisor.py +++ b/src/aleph/vm/orchestrator/supervisor.py @@ -20,7 +20,7 @@ from aleph.vm.sevclient import SevClient from aleph.vm.version import __version__ -from .resources import about_certificates, about_system_usage +from .resources import about_capability, about_certificates, about_system_usage from .tasks import ( start_payment_monitoring_task, start_watch_for_messages_task, @@ -129,6 +129,7 @@ def setup_webapp(pool: VmPool | None): web.get("/about/executions/records", about_execution_records), web.get("/about/usage/system", about_system_usage), web.get("/about/certificates", about_certificates), + web.get("/about/capability", about_capability), web.get("/about/config", about_config), # /control APIs are used to control the VMs and access their logs web.post("/control/allocation/notify", notify_allocation), diff --git a/src/aleph/vm/pool.py b/src/aleph/vm/pool.py index 251cb2a04..17a8d386c 100644 --- a/src/aleph/vm/pool.py +++ b/src/aleph/vm/pool.py @@ -405,7 +405,7 @@ async def reserve_resources(self, message: ExecutableContent, user): return expiration_date def find_resources_available_for_user(self, message: ExecutableContent, user) -> set[GpuDevice]: - """Find required resource to run ExecutableContent from reserved resources by user or free resources. + """Find the required resource to run ExecutableContent from reserved resources by user or free resources. 
def async_cache(fn):
    """Memoize the result of a coroutine function for the life of the process.

    Results are keyed on the positional and keyword arguments, which must be
    hashable. Concurrent first calls with the same arguments share a single
    execution of ``fn``: the others wait for it and reuse its result (the
    decorated functions may be slow, e.g. spawn a subprocess). Exceptions are
    not cached; the next call retries.

    The returned wrapper exposes ``cache_clear()`` to drop every cached
    result, mirroring ``functools.lru_cache``.

    Args:
        fn: The coroutine function to memoize.

    Returns:
        A coroutine function with the same signature as ``fn``.
    """
    cache = {}
    # One lock per key currently being computed, so different keys never block each other.
    locks = {}

    @functools.wraps(fn)
    async def wrapper(*args, **kwargs):
        key = (args, frozenset(kwargs.items()))
        if key in cache:
            return cache[key]

        lock = locks.setdefault(key, asyncio.Lock())
        try:
            async with lock:
                # Re-check: another caller may have filled the cache while we waited.
                if key not in cache:
                    cache[key] = await fn(*args, **kwargs)
                result = cache[key]
        finally:
            # Drop the lock once nobody holds it so it does not outlive the call.
            if locks.get(key) is lock and not lock.locked():
                del locks[key]
        return result

    def cache_clear():
        """Forget every cached result."""
        cache.clear()
        locks.clear()

    wrapper.cache_clear = cache_clear
    return wrapper
the usage system endpoints responds. No auth needed""" + mocker.patch("aleph.vm.orchestrator.resources.get_hardware_info", return_value=MOCK_SYSTEM_INFO) client = await aiohttp_client(await mock_app_with_pool) response: web.Response = await client.get("/about/usage/system") @@ -82,17 +84,162 @@ async def test_system_usage(aiohttp_client, mocker, mock_app_with_pool): assert resp["cpu"]["count"] > 0 +MOCK_SYSTEM_INFO = { + "cpu": { + "id": "cpu", + "class": "processor", + "claimed": True, + "handle": "DMI:0400", + "description": "CPU", + "product": "AMD EPYC 7763 64-Core Processor", + "vendor": "Advanced Micro Devices [AMD]", + "physid": "400", + "businfo": "cpu@0", + "version": "25.1.1", + "slot": "CPU 0", + "units": "Hz", + "size": 2000000000, + "capacity": 2000000000, + "width": 64, + "configuration": {"cores": "8", "enabledcores": "8", "microcode": "167776681", "threads": "1"}, + "capabilities": { + "x86-64": "64bits extensions (x86-64)", + "fpu": "mathematical co-processor", + "fpu_exception": "FPU exceptions reporting", + "wp": True, + "vme": "virtual mode extensions", + "de": "debugging extensions", + "pse": "page size extensions", + "tsc": "time stamp counter", + "msr": "model-specific registers", + "pae": "4GB+ memory addressing (Physical Address Extension)", + "mce": "machine check exceptions", + "cx8": "compare and exchange 8-byte", + "apic": "on-chip advanced programmable interrupt controller (APIC)", + "sep": "fast system calls", + "mtrr": "memory type range registers", + "pge": "page global enable", + "mca": "machine check architecture", + "cmov": "conditional move instruction", + "pat": "page attribute table", + "pse36": "36-bit page size extensions", + "clflush": True, + "mmx": "multimedia extensions (MMX)", + "fxsr": "fast floating point save/restore", + "sse": "streaming SIMD extensions (SSE)", + "sse2": "streaming SIMD extensions (SSE2)", + "ht": "HyperThreading", + "syscall": "fast system calls", + "nx": "no-execute bit (NX)", + "mmxext": 
"multimedia extensions (MMXExt)", + "fxsr_opt": True, + "pdpe1gb": True, + "rdtscp": True, + "rep_good": True, + "nopl": True, + "cpuid": True, + "extd_apicid": True, + "tsc_known_freq": True, + "pni": True, + "pclmulqdq": True, + "ssse3": True, + "fma": True, + "cx16": True, + "pcid": True, + "sse4_1": True, + "sse4_2": True, + "x2apic": True, + "movbe": True, + "popcnt": True, + "tsc_deadline_timer": True, + "aes": True, + "xsave": True, + "avx": True, + "f16c": True, + "rdrand": True, + "hypervisor": True, + "lahf_lm": True, + "cmp_legacy": True, + "svm": True, + "cr8_legacy": True, + "abm": True, + "sse4a": True, + "misalignsse": True, + "3dnowprefetch": True, + "osvw": True, + "perfctr_core": True, + "invpcid_single": True, + "ssbd": True, + "ibrs": True, + "ibpb": True, + "stibp": True, + "vmmcall": True, + "fsgsbase": True, + "tsc_adjust": True, + "bmi1": True, + "avx2": True, + "smep": True, + "bmi2": True, + "erms": True, + "invpcid": True, + "rdseed": True, + "adx": True, + "clflushopt": True, + "clwb": True, + "sha_ni": True, + "xsaveopt": True, + "xsavec": True, + "xgetbv1": True, + "xsaves": True, + "clzero": True, + "xsaveerptr": True, + "wbnoinvd": True, + "arat": True, + "npt": True, + "nrip_save": True, + "umip": True, + "pku": True, + "vaes": True, + "vpclmulqdq": True, + "rdpid": True, + "fsrm": True, + "arch_capabilities": True, + }, + }, + "memory": { + "id": "memory", + "class": "memory", + "claimed": True, + "handle": "DMI:1000", + "description": "System Memory", + "physid": "1000", + "units": "bytes", + "size": 17179869184, + "configuration": {"errordetection": "multi-bit-ecc"}, + "capabilities": {"ecc": "Multi-bit error-correcting code (ECC)"}, + "children": [ + { + "id": "bank", + "class": "memory", + "claimed": True, + "handle": "DMI:1100", + "description": "DIMM RAM", + "vendor": "QEMU", + "physid": "0", + "slot": "DIMM 0", + "units": "bytes", + "size": 17179869184, + } + ], + }, +} + + @pytest.mark.asyncio async def 
@pytest.mark.asyncio
async def test_system_capability_mock(aiohttp_client, mocker):
    """Test the values returned by the capability endpoint with mocked hardware. No auth needed"""
    from aleph.vm.orchestrator import resources

    # get_machine_capability() is memoized for the whole process: route the endpoint to the
    # undecorated coroutine so a result cached by another test cannot leak into this one.
    mocker.patch.object(resources, "get_machine_capability", resources.get_machine_capability.__wrapped__)
    mocker.patch("aleph.vm.orchestrator.resources.get_hardware_info", return_value=MOCK_SYSTEM_INFO)
    mocker.patch("aleph.vm.orchestrator.resources.check_amd_sev_supported", return_value=True)
    mocker.patch("aleph.vm.orchestrator.resources.check_amd_sev_es_supported", return_value=True)
    mocker.patch("aleph.vm.orchestrator.resources.check_amd_sev_snp_supported", return_value=False)
    mocker.patch(
        "psutil.cpu_count",
        lambda: 200,
    )
    app = setup_webapp(pool=None)
    client = await aiohttp_client(app)
    response: web.Response = await client.get("/about/capability")
    assert response.status == 200
    # check if it is valid json
    resp = await response.json()
    assert resp == {
        "cpu": {
            "architecture": "x86_64",
            "vendor": "AuthenticAMD",
            "features": ["sev", "sev_es"],
            "model": "AMD EPYC 7763 64-Core Processor",
            "frequency": 2000000000,
            "count": 200,
        },
        "memory": {"size": 17179869184, "units": "bytes", "type": "", "clock": None, "clock_units": None},
    }


@pytest.mark.asyncio
async def test_system_capability_real(aiohttp_client, mocker):
    """Test the capability endpoint against the real system values.

    Nothing is mocked, so the definitive values are unknown; we only want to
    see that the endpoint works and returns the main fields."""
    if os.environ.get("GITHUB_JOB"):
        pytest.xfail("Test fail inside GITHUB CI because of invalid lshw return inside worker")

    from aleph.vm.orchestrator import resources

    # Bypass the process-wide cache, otherwise this test would only see the
    # mocked capability cached by a previous test and never run lshw.
    mocker.patch.object(resources, "get_machine_capability", resources.get_machine_capability.__wrapped__)

    app = setup_webapp(pool=None)
    client = await aiohttp_client(app)
    response: web.Response = await client.get("/about/capability")
    assert response.status == 200
    # check if it is valid json
    resp = await response.json()
    assert resp.get("cpu"), resp
    assert resp["cpu"].get("architecture")
    assert resp.get("memory")
    assert resp["memory"].get("size")