From e8ec74ed34c2d40394cd187365479e34997e55b6 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Sun, 10 Aug 2025 08:19:43 +0000 Subject: [PATCH 01/51] add working documentation --- CLAUDE.md | 90 +++++ fs-resctrl-diagrams-gemini.md | 72 ++++ fs-resctrl-diagrams.md | 255 ++++++++++++ kernel/perf_blog.md | 30 ++ perf-diagrams.md | 301 ++++++++++++++ resctrl-blog.md | 129 ++++++ resctrl-fd.md | 320 +++++++++++++++ resctrl_internals.md | 136 +++++++ x64-resctrl-diagrams.md | 723 ++++++++++++++++++++++++++++++++++ 9 files changed, 2056 insertions(+) create mode 100644 CLAUDE.md create mode 100644 fs-resctrl-diagrams-gemini.md create mode 100644 fs-resctrl-diagrams.md create mode 100644 kernel/perf_blog.md create mode 100644 perf-diagrams.md create mode 100644 resctrl-blog.md create mode 100644 resctrl-fd.md create mode 100644 resctrl_internals.md create mode 100644 x64-resctrl-diagrams.md diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 00000000000000..6edc124bcab2bf --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,90 @@ +We are editing the Linux Kernel. The checked out version was mainline around 6.15.6. + +Our goal is to add support to the resctrl subsystem for reading cache occupancy using perf counters. + +The main implementation of resctrl is in: +- arch/x86/include/asm/resctrl.h +- include/linux/resctrl_types.h +- include/linux/resctrl.h +- arch/x86/kernel/cpu/resctrl/** +- arch/x86/kernel/cpu/amd.c +- arch/x86/kernel/cpu/intel.c +- fs/resctrl/** + +Tests in: +- tools/testing/selftests/resctrl/ + +Documentation in: +- Documentation/filesystems/resctrl.rst +- tools/testing/selftests/resctrl/README + + +---- +# Understanding the Perf Subsystem in the Linux Kernel + +The perf_event_open System Call +------------------------------- + +The [perf_event_open](https://elixir.bootlin.com/linux/v6.15.6/source/kernel/events/core.c#L13121) system call is defined in `kernel/events/core.c`. You can check the [man page](https://man7.org/linux/man-pages/man2/perf_event_open.2.html) for detailed documentation. + +The caller can supply a process, a CPU, and a cgroup ID for the entities they want to measure. **The cgroup and process ID share the same input parameter, with the flag `PERF_FLAG_PID_CGROUP` controlling whether the parameter refers to a cgroup or a pid.** The implementation handles locating these resources - for a process, it finds the task struct; for a CPU, it verifies the CPU is online; and for a cgroup, it passes the file descriptor to the cgroup. + +Group Leaders in Perf +--------------------- + +Since there's significant code dealing with group leaders, it's worth understanding how they work. The user passes the parameter `group_fd` to perf_event_open. **Groups allow scheduling of events onto the PMU (Performance Monitoring Unit) hardware as a group - all or none.** This matters because many PMUs have limits on the number of events they can track simultaneously, so the kernel time-multiplexes these groups onto PMUs. + +A perf event can optionally have a group leader specified, but a group leader cannot have another leader. This creates a limited hierarchy - not a tree structure, just leaders with events under them. The perf_event_open handles most of this for PMU implementers. It resolves the group leader if specified and ensures all events in the group belong to a single hardware PMU. **The perf subsystem doesn't allow events from multiple PMUs in the same group because it's hard to schedule events across multiple PMUs atomically.** + +Software events can be added to a hardware PMU group - this is allowed. When you add the first hardware event to a group that previously only had software events, there's logic to move all the events to the hardware PMU. + +Key Data Structures +------------------- + +Two important data structures link the perf_event struct to the PMU performing measurements and to the entity being monitored: + +- **[perf_event_context](https://elixir.bootlin.com/linux/v6.15.6/source/include/linux/perf_event.h#L945)** - associated with the measured entity (task or CPU) +- **[perf_event_pmu_context](https://elixir.bootlin.com/linux/v6.15.6/source/include/linux/perf_event.h#L906)** - associated with a perf_event_context and the PMU + +Multiple perf events can point to the same perf_event_context and perf_event_pmu_context. The perf_event holds a reference count on both structs it refers to. You can find documentation for `struct perf_event_pmu_context` in `include/linux/perf_event.h`. + +Event Allocation and PMU Lookup +------------------------------- + +The [perf_event_alloc](https://elixir.bootlin.com/linux/v6.15.6/source/kernel/events/core.c#L12598) function (called from perf_event_open) allocates and initializes the struct perf_event. It calls [perf_init_event](https://elixir.bootlin.com/linux/v6.15.6/source/kernel/events/core.c#L12413), which returns the PMU for the event. **This is where the lookup happens from the event type to the PMU associated with that event type.** + +The `perf_init_event` function includes functionality that tries to initialize the event on different PMUs using the event type. Since PMUs can override the type field, `perf_init_event` follows these type redirections to find the actual PMU associated with the event. This is also where the kernel resolves dynamic PMU IDs - PMUs dynamically registered with the kernel are assigned an ID when registered, as mentioned in the man page. + +Registering PMUs +---------------- + +Kernel code can register PMUs with [perf_pmu_register](https://elixir.bootlin.com/linux/v6.15.6/source/kernel/events/core.c#L12218). PMU developers specify their PMU's behavior using fields on [struct pmu](https://elixir.bootlin.com/linux/v6.15.6/source/include/linux/perf_event.h#L322), defined in `include/linux/perf_event.h`. The code in `perf_event_open` ensures the user has proper permissions and that the PMU supports all requested features. + +Key fields that control supported behavior include: + +- **`task_ctx_nr`** - When set to `perf_invalid_context`, the PMU doesn't support task context +- **`capabilities`** - Encodes PMU capabilities. For example, when `event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT` is set, the PMU doesn't support sampling mode (only counter mode where users read values) + +PMU Function Pointers +--------------------- + +Struct pmu contains several function pointers for different PMU functionality: + +- `event_init` - initializes the perf_event struct +- `add` and `del` - add and delete events to/from the PMU +- `start` and `stop` - control event counting +- `read` - read event values + +There are also optional functions. One function not marked as optional but actually is: `sched_task`. **This allows the PMU to request a callback on every context switch on a specific CPU.** To enable this, the PMU needs to call [perf_sched_cb_inc](https://elixir.bootlin.com/linux/v6.15.6/source/kernel/events/core.c#L3723), which enqueues the PMU's CPU context onto a per-CPU callback list. If you don't need this functionality, you can skip implementing the scheduling callback. + +Example: Intel Uncore PMU +------------------------- + +The [Intel uncore PMU registration](https://elixir.bootlin.com/linux/v6.15.6/source/arch/x86/events/intel/uncore.c#L913) provides a small PMU configuration example. It's not minimal because it implements the optional `pmu_enable` and `pmu_disable` functions, and passes attributes. + +The `attr_groups` field is also optional - it allows PMU developers to create attribute files and directories in `/sys/bus/event_source/devices/[pmu_name]/` for users to read PMU information. However, this can be left as NULL, as shown in the [Alpha architecture example](https://elixir.bootlin.com/linux/v6.15.6/source/arch/alpha/kernel/perf_event.c#L755). + +Summary +------- + +Understanding these perf subsystem internals is crucial when adding new PMU support to the kernel. We covered the major inputs to `perf_event_open`, the data structures associated with perf events, and how users specify their PMU functionality. Hope this serves as useful background when interacting with the code! \ No newline at end of file diff --git a/fs-resctrl-diagrams-gemini.md b/fs-resctrl-diagrams-gemini.md new file mode 100644 index 00000000000000..25e51b0d237d57 --- /dev/null +++ b/fs-resctrl-diagrams-gemini.md @@ -0,0 +1,72 @@ +# Mermaid Diagrams for `resctrl` Function Flows + +This document outlines the important function flows in `fs/resctrl/rdtgroup.c` for creating the `resctrl` filesystem directory structure. + +## 1. Filesystem Initialization and Mount Flow + +This flow describes the two main stages of `resctrl` setup: +1. **Module Initialization**: Key data structures are initialized and the `resctrl` filesystem type is registered with the kernel. This happens when the `resctrl` kernel module is loaded. +2. **Mount-time Creation**: When a user mounts the filesystem (e.g., `mount -t resctrl resctrl /sys/fs/resctrl`), the directory and file hierarchy is constructed in memory via `kernfs`. + +```mermaid +graph TD + subgraph Kernel Module Init + A(resctrl_init) -- in fs/resctrl/core.c --> B(rdt_init_resctrl_fs); + B -- in fs/resctrl/core.c --> C(rdtgroup_init); + C -- in fs/resctrl/rdtgroup.c --> D["register_filesystem(&resctrl_fs_type)"]; + end + + subgraph User Mounts Filesystem + E(user: mount -t resctrl ...) --> F(resctrl_mount); + end + + subgraph Mount-time Directory Creation + F -- in fs/resctrl/rdtgroup.c --> G(rdtgroup_mount); + G --> H["kern_mount(&resctrl_fs_type)"]; + G --> I(rdtgroup_info_dir_create); + I --> J["kernfs_create_dir('info')"]; + I --> K(rdtgroup_info_populate); + K --> L(rdt_info_files_create); + L --> M["kernfs_create_file(...) for each info file"]; + + G --> N(__rdtgroup_create_info_dir); + N -- for root group --> O(rdtgroup_add_files); + O -- creates 'schemata' file --> P["kernfs_create_file('schemata')"]; + G --> Q["Create 'tasks', 'cpus', 'cpus_list'
via kernfs_create_file"]; + end + + D -.-> E; +``` + +## 2. New Resource Group Creation (`mkdir`) Flow + +The `resctrl` filesystem allows users to create new resource groups by creating new directories. The `mkdir` operation triggers the following kernel flow to create a new `rdtgroup` and its associated control files (`schemata`, `tasks`, etc.). + +```mermaid +graph TD + subgraph User Action + A(user: mkdir /sys/fs/resctrl/my_group) --> B(resctrl_mkdir); + end + + subgraph mkdir Implementation in rdtgroup.c + B -- in fs/resctrl/rdtgroup.c --> C(rdtgroup_mkdir); + C --> D["Allocate new rdtgroup struct"]; + C --> E(rdtgroup_kn_alloc); + E --> F["Allocates kernfs_node for 'my_group'"]; + C --> G(__rdtgroup_create); + + subgraph __rdtgroup_create [in __rdtgroup_create] + direction LR + G --> H(__rdtgroup_create_info_dir); + H --> I["rdtgroup_add_files(..., rdt_info_files, ...)"]; + I --> J["kernfs_create_file('schemata')"]; + G --> K["rdtgroup_add_files(..., rdt_base_files, ...)"]; + K --> L["kernfs_create_file('tasks')
kernfs_create_file('cpus')
..."]; + end + + C --> O(kernfs_activate); + O --> P["New directory 'my_group' becomes visible"]; + end +``` + +These diagrams illustrate the core logic within `fs/resctrl` for managing the filesystem structure. \ No newline at end of file diff --git a/fs-resctrl-diagrams.md b/fs-resctrl-diagrams.md new file mode 100644 index 00000000000000..5af3685c58a0b9 --- /dev/null +++ b/fs-resctrl-diagrams.md @@ -0,0 +1,255 @@ +# ResCtrl Filesystem Function Flow Diagrams + +This document contains mermaid diagrams showing the important function flows in the ResCtrl filesystem implementation (`fs/resctrl/rdtgroup.c`). + +## 1. Initialization Flow - Directory Structure Creation + +This diagram shows how the ResCtrl filesystem initializes and creates its directory structure during mount. + +```mermaid +graph TD + A[resctrl_init] --> B[register_filesystem] + A --> C[rdtgroup_setup_default] + A --> D[sysfs_create_mount_point] + A --> E[debugfs_create_dir] + + F[rdt_get_tree] --> G[rdtgroup_setup_root] + F --> H[rdtgroup_create_info_dir] + F --> I[mkdir_mondata_all] + F --> J[closid_init] + F --> K[Add base files to root] + + G --> L[kernfs_create_root] + + H --> M[Create /info directory] + H --> N[Create resource subdirs] + N --> O[L3, L2, MB directories] + + I --> P[Create /mon_data directory] + I --> Q[mkdir_mondata_subdir_alldom] + Q --> R[mkdir_mondata_subdir] + R --> S[Domain-specific dirs] + S --> T[mon_L3_00, mon_L3_01, etc.] + + style A fill:#e1f5fe + style F fill:#e8f5e8 + style H fill:#fff3e0 + style I fill:#f3e5f5 +``` + +## 2. Resource Group Creation Flow (mkdir operation) + +This diagram shows the flow when creating new resource groups via `mkdir`. + +```mermaid +graph TD + A[rdtgroup_mkdir] --> B{Group Type?} + + B -->|Control+Monitor| C[rdtgroup_mkdir_ctrl_mon] + B -->|Monitor Only| D[rdtgroup_mkdir_mon] + + C --> E[mkdir_rdt_prepare] + C --> F[mkdir_rdt_prepare_rmid_alloc] + C --> G[mongroup_create_dir] + + D --> H[mkdir_rdt_prepare] + D --> I[mkdir_rdt_prepare_rmid_alloc] + + E --> J[Allocate rdtgroup struct] + E --> K[Setup kernfs node] + E --> L[Allocate CLOSID if needed] + + F --> M[Allocate RMID] + F --> N[mkdir_mondata_all] + + N --> O[Create mon_data structure] + N --> P[mkdir_mondata_subdir_alldom] + P --> Q[Create domain directories] + Q --> R[Add monitoring files] + + G --> S[Create mon_groups subdir] + + style A fill:#e1f5fe + style C fill:#e8f5e8 + style D fill:#fff3e0 + style N fill:#f3e5f5 +``` + +## 3. Resource Group Deletion Flow (rmdir operation) + +This diagram shows the flow when removing resource groups via `rmdir`. + +```mermaid +graph TD + A[rdtgroup_rmdir] --> B{Group Type?} + + B -->|Control Group| C[rdtgroup_rmdir_ctrl] + B -->|Monitor Group| D[rdtgroup_rmdir_mon] + + C --> E[rdt_move_group_tasks] + C --> F[update_closid_rmid] + C --> G[rdtgroup_ctrl_remove] + + D --> H[rdt_move_group_tasks] + D --> I[update_closid_rmid] + D --> J[free_rmid] + + E --> K[Move tasks to parent group] + H --> L[Move tasks to parent group] + + F --> M[Update MSRs on all CPUs] + I --> N[Update MSRs on all CPUs] + + G --> O[closid_free] + G --> P[Remove from rdt_all_groups] + G --> Q[kernfs_remove] + + J --> R[Release monitoring ID] + + style A fill:#e1f5fe + style C fill:#ffebee + style D fill:#fff3e0 + style E fill:#f3e5f5 + style H fill:#f3e5f5 +``` + +## 4. Task Assignment Flow (tasks file operations) + +This diagram shows how tasks are assigned to resource groups via the tasks file. + +```mermaid +graph TD + A[rdtgroup_tasks_write] --> B[Parse PID list] + B --> C[For each PID] + + C --> D[rdtgroup_move_task] + D --> E[find_task_by_vpid] + D --> F[__rdtgroup_move_task] + + F --> G[task_in_rdtgroup] + G --> H{Already in group?} + H -->|No| I[rdt_move_group_tasks] + H -->|Yes| J[Skip] + + I --> K[Update task closid/rmid] + I --> L[Move task to new group] + + K --> M[Context switch or IPI] + M --> N[Update MSRs] + + O[rdtgroup_tasks_show] --> P[rdtgroup_kn_lock_live] + P --> Q[show_rdt_tasks] + Q --> R[for_each_process_thread] + R --> S[is_closid_match] + R --> T[is_rmid_match] + S --> U[Output matching tasks] + T --> U + + style A fill:#e1f5fe + style O fill:#e8f5e8 + style D fill:#fff3e0 + style I fill:#f3e5f5 + style Q fill:#f1f8e9 +``` + +## 5. Monitoring Data Flow (reading monitoring counters) + +This diagram shows how monitoring data is read from the mon_data directories. + +```mermaid +graph TD + A[rdtgroup_mondata_show] --> B[Parse kernfs node] + B --> C[Extract resource/domain/event] + C --> D[Locate monitoring domain] + D --> E[Read counter values] + E --> F[Format output] + + G[mkdir_mondata_all] --> H[mongroup_create_dir] + G --> I[mkdir_mondata_subdir_alldom] + + I --> J[mkdir_mondata_subdir] + J --> K[Create domain directory] + K --> L[Create event files] + L --> M[Link to mon_data_kn_priv_list] + + N[mon_get_kn_priv] --> O[Get private data] + P[mon_put_kn_priv] --> Q[Cleanup private data] + + style A fill:#e1f5fe + style G fill:#e8f5e8 + style J fill:#fff3e0 + style L fill:#f3e5f5 +``` + +## 6. Pseudolock Flow (cache pseudo-locking) + +This diagram shows the flow for setting up cache pseudo-locking via the mode file. + +```mermaid +graph TD + A[rdtgroup_mode_write] --> B[Parse mode string] + B --> C{Mode?} + + C -->|pseudo-locksetup| D[rdtgroup_pseudo_lock_create] + C -->|shareable/exclusive| E[rdtgroup_pseudo_lock_remove] + + D --> F[Validate pseudolock requirements] + D --> G[Setup pseudolock state] + G --> H[rdt_pseudo_lock_init] + + E --> I[Cleanup pseudolock state] + E --> J[rdt_pseudo_lock_release] + + K[rdtgroup_mode_show] --> L[Return current mode] + L --> M[shareable/exclusive/pseudo-locksetup/pseudo-locked] + + style A fill:#e1f5fe + style K fill:#e8f5e8 + style D fill:#fff3e0 + style E fill:#ffebee + style H fill:#f3e5f5 +``` + +## 7. Overall Filesystem Operations Structure + +This diagram shows the high-level structure of ResCtrl filesystem operations. + +```mermaid +graph TD + A[kernfs_syscall_ops] --> B[rdtgroup_mkdir] + A --> C[rdtgroup_rmdir] + A --> D[rdtgroup_rename] + A --> E[rdtgroup_show_options] + + F[File Operations] --> G[rdtgroup_kf_single_ops] + F --> H[kf_mondata_ops] + F --> I[rdtgroup_kf_multi_ops] + + G --> J[tasks, schemata files] + H --> K[monitoring data files] + I --> L[multi-value files] + + M[Locking] --> N[rdtgroup_kn_lock_live] + M --> O[rdtgroup_mutex] + M --> P[rdt_last_cmd_*] + + N --> Q[Kernfs node locking] + O --> R[Global resctrl mutex] + P --> S[Error reporting] + + style A fill:#e1f5fe + style F fill:#e8f5e8 + style M fill:#fff3e0 +``` + +## Key Data Structures + +The following key data structures are used throughout these flows: + +- **`struct rdtgroup`**: Represents a resource group +- **`struct rdt_resource`**: Represents a hardware resource (L3, L2, MB) +- **`struct rdt_domain`**: Represents a domain within a resource +- **`struct kernfs_node`**: Kernel filesystem node +- **`struct rdt_fs_context`**: Filesystem context for mounting + +These diagrams show the main function flows within the ResCtrl filesystem, focusing on the core operations of initialization, resource group management, task assignment, monitoring, and special features like pseudo-locking. \ No newline at end of file diff --git a/kernel/perf_blog.md b/kernel/perf_blog.md new file mode 100644 index 00000000000000..0ade5a754015b9 --- /dev/null +++ b/kernel/perf_blog.md @@ -0,0 +1,30 @@ +Help me write a technical blog post. I want those tiles to be direct. No nonsense. and keep trying to maintain the style I am talking with with and don't add data beyond what I'm giving, you know just maybe refactor my my comments a little bit but don't add new information + +I'd like to cover some details about the perf subsystems in the subsystem in the Linux kernel because I want to add functionality To the kernel, I want to add the ability for users to to open perf events for a subsystem that does not have perf support yet specifically it's the resource control subsystem or for short, R-E-S-C-T-R-L. + +So in, I'd like to now cover some of the internals of the perf subsystem that might be helpful if either you're a user or you want to build the functionality for Perf. + +The system call perf_event_open is defined in kernel/events/core.c . The man page is `man perf_event_open` and here is a link https://man7.org/linux/man-pages/man2/perf_event_open.2.html + +the caller to perf underscore event event_open. can supply a process, a CPU, and a Cgroup ID. for the entities that they want to measure. The Cgroup and process ID share the same input parameter and there is a flag to control that parameter if it refers to a cgroup or a pid. PERF_FLAG_PID_CGROUP. the implementation takes care of locating those resources If it's a process, then the task struct. For a CPU, it makes sure that the CPU is online. and for a c group it passes the file descriptive to the c group for further processing. + +Now, since there is significant amount of code that deals with group leaders for perf it makes sense to talk about it. so the user path parameter group underscore fd to perf underscore event underscore open Groups allow scheduling of events onto the event hardware, the PMU, as a group, all or none. and that is because many PMUs have a limit on the number of events that they can track at any given time. And so the kernel has a mechanism to multiplex these groups onto PMUs. Every perf event can have a group leader specified to it. and a group leader cannot have another leader. So this is a limited hierarchy. key, it's not, you cannot create trees with leaders. It's just there's a single leader and maybe every event might have a leader. Might have a leader and that's it. And perf_event_open handles the group leader, takes care of most of that for for whoever is implementing the PMU. it resolves. if the user specifies it, it resolves. the it resolves the group leader, and then it makes sure that all of the events in the group belong to a single hardware PMU. It doesn't allow because it's hard to schedule events over for multiple PMUs atomically. the PERC subsystem just doesn't try to do that the the implementation checks that only offends for the same hardware PMU are in under the same group so they can all be scheduled together. and it also allows software. software events to be added in case the user wants to make sure that it has the software events as well in the group that is allowed. So there is a little bit of code in there. if you're adding every Every group has a PMU that it belongs to. software events can belong to a hardware PMU, but there's just a single hardware PMU associated, the general PMU associated with a group leader. So there is code if you've added, if the user added a lot of software tasks and now they're adding the first hardware task to move all of, sorry, not tasks, events, to move all of the events to the hardware PMU. So there's some logic there if you're reading the code. + +There are two important data structures that link the perf_event struct to the PMU that is performing measurements and to the entity being monitored which is the task or CPU. Those data structures are perf_event_context is associated with the measured entity task or cpu and a perf_event_pmu_context is associated with a perf_event_context and the PMU. And there could be multiple perf events. that point that to the same perf event context and perf event PMU contexts. And in fact, the per fungus core event and then hold ref pound to both of these odd projects, the perf_event_context and the perf_event_pmu_context. and these are documented if you want to see more, on `struct perf_event_pmu_context` in `include/linux/perf_event.h` + +perf_event_alloc That function allocates and initializes that struck perf_event. it also calls the function perf_init_event that returns the PMU for the event. This is where the lookup happens from the event type to the PMU associated with that event type. You can take a look at that function perf_init_event if you want to see how that is done. specifically. It seems to include functionality that tries to initialize the event on different PMUs using the event type. and the PMU can override the type field and so perf_init_event follows those type redirections to find the actual PMU associated with the events, with this redirection support. but I'm not going expand further here. This is also where the kernel resolves dynamic dynamic PMU IDs. So those are PMUs dynamically registered with the colonel and associated. that they're assigned an ID when they are registered. This is mentioned in the man page. so if you want to see more about that this is the function to look at. + +Kernel code can register PMUs with `perf_pmu_register`. pmu developers can specify the behavior of their PMUs using the fields on `struct pmu`, which is defined in `include/linux/perf_event.h`. the code that runs as part of perf underscore event underscore open ensures that the user has the right permissions and that the PMU supports all of the features requested by the user. one of the struct pmu fields and that controls the supported behavior is `task_ctx_nr`. When `pmu->task_ctx_nr == perf_invalid_context`, it means that the PMU does not support task context. The `capabilities` field encodes some capabilities of the PMU. For example, perf events can be used in counter mode, where the user reads their values, or in sampling mode, where the hardware raises an interrupt after a certain number of events. When the PMU sets `event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT`, it means that the PMU does not support sampling mode. + +Struct PMU contains several functions, functions, function pointers for different functionality of the PMU. to event_init initializes per underscore event struct. Add. and del and then deletes. for preventing the PMU. Start and stop. than read for reading events. and there are some optional functions. One of the functions that is not marked as optional, but it actually is is `sched_task`. it allows the PMU to request a callback to that function on every context switch on a specific CPU. But in order to make this happen, and the PMU needs to call the function `perf_sched_cb_inc` (link: https://elixir.bootlin.com/linux/v6.15.6/source/kernel/events/core.c#L3723) which enqueue the PMU's CPU context onto a per CPU callback list. So if you're not intending to use that function, you can avoid implementing the scheduling callback. + +The Intel uncore PMU registration is a small PMU configuration (link to `uncore_pmu_register`: https://elixir.bootlin.com/linux/v6.15.6/source/arch/x86/events/intel/uncore.c#L913). It is not minimal because it implements the `pmu_enable` and `pmu_disable` optional functions. The `attr_groups` are also optional; they allow the PMU developer to create attribute files and directories in `/sys/bus/event_source/devices/[pmu_name1]/`, for users to read information about the PMU. However they can be left as NULL such as in https://elixir.bootlin.com/linux/v6.15.6/source/arch/alpha/kernel/perf_event.c#L755 . + + +some links to embed: + - perf_event_open: https://elixir.bootlin.com/linux/v6.15.6/source/kernel/events/core.c#L13121 + - struct perf_event_pmu_context: https://elixir.bootlin.com/linux/v6.15.6/source/include/linux/perf_event.h#L103 + - perf_event_alloc: https://elixir.bootlin.com/linux/v6.15.6/source/kernel/events/core.c#L12598 + - perf_init_event: https://elixir.bootlin.com/linux/v6.15.6/source/kernel/events/core.c#L12413 + - perf_pmu_register: https://elixir.bootlin.com/linux/v6.15.6/source/kernel/events/core.c#L12218 + - struct pmu: https://elixir.bootlin.com/linux/v6.15.6/source/include/linux/perf_event.h#L322 \ No newline at end of file diff --git a/perf-diagrams.md b/perf-diagrams.md new file mode 100644 index 00000000000000..b6b850324e040e --- /dev/null +++ b/perf-diagrams.md @@ -0,0 +1,301 @@ +# Linux Kernel Perf Events - PMU Read Call Flow Diagrams + +This document contains mermaid diagrams showing all the call flows in the Linux kernel perf events subsystem that lead to `pmu->read()` calls. These diagrams trace backwards from the `pmu->read()` invocations to show all the entry points and execution paths. + +## Overview + +The Linux perf events subsystem provides performance monitoring capabilities through a standardized PMU (Performance Monitoring Unit) interface. The `pmu->read()` function is the core interface for reading performance counter values from hardware or software PMUs. There are **4 distinct call sites** where `pmu->read()` is invoked, each serving different purposes and accessed through different entry points. + +## 1. User-Space Read Operations Flow + +This diagram shows how user-space read operations on perf event file descriptors lead to `pmu->read()` calls. + +```mermaid +graph TD + A[User-space read() system call] --> B[perf_read file operation] + B --> C[__perf_read] + C --> D{Single event or group?} + + D -->|Single event| E[perf_read_one] + D -->|Event group| F[perf_read_group] + + E --> G[__perf_event_read_value] + F --> H[__perf_read_group_add] + + G --> I[perf_event_read] + H --> J[perf_pmu_read] + + I --> K{Event on current CPU?} + K -->|Yes| L[__perf_event_read local] + K -->|No| M[smp_call_function_single] + + L --> N[pmu->read event] + M --> O[__perf_event_read remote] + + O --> P{Group read?} + P -->|No| Q[pmu->read event] + P -->|Yes| R[pmu->start_txn] + + R --> S[pmu->read event] + S --> T[Read sibling events] + T --> U[pmu->commit_txn] + + J --> V[pmu->read event] + + style A fill:#e1f5fe + style B fill:#e8f5e8 + style N fill:#fff3e0 + style Q fill:#fff3e0 + style S fill:#fff3e0 + style V fill:#fff3e0 +``` + +## 2. Local Event Read Flow (NMI-Safe) + +This diagram shows the NMI-safe local event read path used by kernel APIs and interrupt contexts. + +```mermaid +graph TD + A[Kernel API Callers] --> B[perf_event_read_local] + + C[BPF Programs] --> B + D[NMI Handlers] --> B + E[Kernel Modules] --> B + F[Interrupt Contexts] --> B + + B --> G[Local CPU validation] + G --> H{Event on current CPU?} + + H -->|Yes| I[event->pmu->read event] + H -->|No| J[Return -ENOENT] + + I --> K[Update enabled/running time] + K --> L[Return counter value] + + style A fill:#e1f5fe + style C fill:#e8f5e8 + style D fill:#fff3e0 + style E fill:#f3e5f5 + style F fill:#f1f8e9 + style I fill:#fce4ec +``` + +## 3. Timer-Based Sampling Flow + +This diagram shows how high-resolution timer callbacks lead to `pmu->read()` calls for software event sampling. + +```mermaid +graph TD + A[hrtimer_interrupt] --> B[__hrtimer_run_queues] + B --> C[Timer callback functions] + + C --> D[perf_swevent_hrtimer] + + D --> E[Check event state] + E --> F{Event active?} + + F -->|Yes| G[event->pmu->read event] + F -->|No| H[Skip read] + + G --> I[perf_swevent_event] + I --> J[Process sample data] + J --> K[Update event statistics] + + L[Timer Setup] --> M[perf_swevent_start_hrtimer] + M --> N[hrtimer_start] + + style A fill:#e1f5fe + style D fill:#e8f5e8 + style G fill:#fff3e0 + style M fill:#f3e5f5 +``` + +## 4. Event Sampling and Output Flow + +This diagram shows how event sampling during perf record operations triggers `pmu->read()` calls. + +```mermaid +graph TD + A[Hardware PMU Interrupt] --> B[perf_event_overflow] + B --> C[__perf_event_overflow] + C --> D[perf_prepare_sample] + + E[Software Event Trigger] --> F[perf_tp_event] + F --> G[perf_swevent_event] + G --> D + + D --> H[perf_output_begin] + H --> I[perf_output_sample] + + I --> J{Read sample requested?} + J -->|Yes| K[perf_output_read] + J -->|No| L[Skip read values] + + K --> M{Group read?} + M -->|Yes| N[Read event group] + M -->|No| O[Read single event] + + N --> P[perf_pmu_read group leader] + N --> Q[Read sibling events] + O --> R[perf_pmu_read event] + + P --> S[pmu->read event] + Q --> T[pmu->read siblings] + R --> U[pmu->read event] + + style A fill:#e1f5fe + style E fill:#e8f5e8 + style K fill:#fff3e0 + style S fill:#f3e5f5 + style T fill:#f3e5f5 + style U fill:#f3e5f5 +``` + +## 5. Cross-CPU Event Read Flow + +This diagram shows how reading events on remote CPUs is handled through SMP function calls. + +```mermaid +graph TD + A[perf_event_read] --> B{Event CPU == current CPU?} + + B -->|Yes| C[Direct local read] + B -->|No| D[Remote CPU read needed] + + C --> E[__perf_event_read local] + D --> F[smp_call_function_single] + + F --> G[Schedule on remote CPU] + G --> H[__perf_event_read remote context] + + H --> I[Validate event context] + I --> J{Context matches?} + + J -->|Yes| K{Group event?} + J -->|No| L[Return -EINVAL] + + K -->|No| M[Single event read] + K -->|Yes| N[Group transaction read] + + M --> O[pmu->read event] + + N --> P[pmu->start_txn PERF_PMU_TXN_READ] + P --> Q[pmu->read leader event] + Q --> R[Read sibling events via perf_pmu_read] + R --> S[pmu->commit_txn] + + E --> T[pmu->read event] + + style A fill:#e1f5fe + style F fill:#e8f5e8 + style O fill:#fff3e0 + style Q fill:#fff3e0 + style T fill:#fff3e0 +``` + +## 6. System Call and File Operation Entry Points + +This diagram shows the various user-space entry points that can lead to PMU reads. + +```mermaid +graph TD + A[User-Space Applications] --> B[System Calls] + A --> C[File Operations] + A --> D[Library Interfaces] + + B --> E[sys_perf_event_open] + B --> F[read syscall] + B --> G[ioctl syscall] + + C --> H[/proc/sys/kernel/perf_*] + C --> I[/sys/bus/event_source/devices/*] + + D --> J[libperf library] + D --> K[perf user tools] + + E --> L[Create perf_event file descriptor] + F --> M[perf_read file operation] + G --> N[perf_ioctl operations] + + L --> O[Event setup and initialization] + M --> P[Event value reading paths] + N --> Q[Event control operations] + + P --> R[Leading to pmu->read calls] + + S[Kernel Internal APIs] --> T[perf_event_read_local] + S --> U[perf_event_read_value] + + T --> V[Direct pmu->read calls] + U --> W[Indirect pmu->read via read paths] + + style A fill:#e1f5fe + style B fill:#e8f5e8 + style C fill:#fff3e0 + style D fill:#f3e5f5 + style R fill:#ffebee + style V fill:#ffebee +``` + +## 7. Context and Safety Considerations + +This diagram shows the different execution contexts and safety mechanisms for PMU reads. + +```mermaid +graph TD + A[PMU Read Contexts] --> B[User Context] + A --> C[Softirq Context] + A --> D[Hardirq Context] + A --> E[NMI Context] + + B --> F[Standard locking] + B --> G[Mutex protection] + B --> H[RCU read locks] + + C --> I[Spin locks] + C --> J[Atomic operations] + + D --> K[IRQ-safe operations] + D --> L[Limited locking] + + E --> M[NMI-safe only] + E --> N[perf_event_read_local] + + O[Safety Mechanisms] --> P[CPU topology validation] + O --> Q[Event state checking] + O --> R[Context validation] + + P --> S[Ensure event accessibility] + Q --> T[PERF_EVENT_STATE_ACTIVE check] + R --> U[Task and CPU context matching] + + style A fill:#e1f5fe + style B fill:#e8f5e8 + style C fill:#fff3e0 + style D fill:#f3e5f5 + style E fill:#ffebee + style M fill:#ffebee + style N fill:#ffebee +``` + +## Key Integration Points + +The diagrams show several critical aspects of the perf events PMU read architecture: + +1. **Multiple Entry Points**: User-space syscalls, kernel APIs, timer callbacks, and interrupt handlers all lead to PMU reads +2. **Context Safety**: Different execution contexts (user, interrupt, NMI) use appropriate safety mechanisms +3. **CPU Topology Awareness**: Local vs. remote CPU reads are handled differently for performance +4. **Group Operations**: Event groups use PMU transactions for atomic reads +5. **Sampling Integration**: PMU reads are integrated into the sampling and profiling infrastructure +6. **Error Handling**: Comprehensive validation and error paths ensure system stability + +## PMU Read Call Sites Summary + +There are **4 distinct locations** where `pmu->read(event)` is called: + +1. **perf_pmu_read()** - General-purpose read function used by most read paths +2. **__perf_event_read()** - Remote CPU read function (2 call sites for single and group reads) +3. **perf_event_read_local()** - NMI-safe local read function +4. **perf_swevent_hrtimer()** - Timer-based sampling read function + +Each call site serves specific use cases and provides different guarantees about execution context and performance characteristics. This architecture allows the perf events subsystem to provide flexible, efficient, and safe performance monitoring across all execution contexts in the Linux kernel. \ No newline at end of file diff --git a/resctrl-blog.md b/resctrl-blog.md new file mode 100644 index 00000000000000..459abd74b05ed0 --- /dev/null +++ b/resctrl-blog.md @@ -0,0 +1,129 @@ +So I'm going to collect some thoughts on how the resource control framework in Linux works. we're trying to add support for reading measurement of cache allocation and memory bandwidth from resource control using perf events. And the reason is we want to enable very frequent readings of the resource control counters from the CPU, order of one millisecond. - I think supporting Perf events involves writing a PMU (`struct pmu`). We have a separate write-up on how the PMU interacts with the kernel and what is required in order to build a PMU. Here we want to focus on what is required to interface with the resource control subsystem in the kernel. The resource control subsystem allows users to create new groups and then assign tasks to them. And we want to keep that interface when we support perf events. So in essence, we would like the perf event to read the measurements associated with the resource control groups configured using the filsystem based interface. + +The user opening a perf event using the perf_event_open system call would need to specify the group and what type of event they want to measure. there are multiple metrics that resource control offers. A user might pass that using configuration parameters to the perf_event_open system call. We need a way for the users to specify which resource control group they want to monitor. And then we need to manage lifetimes of the perf structures in a way that makes sense with a lifetime of the resource control objects. + +We need to answer a few questions: +1. How do users specify the resource control group and the event they want to measure? +2. What happens to the perf event when a user deletes a resource control group? What are the semantics: is there an error? does the event just stop reading new values? +3. How do we manage lifetimes, making sure that all of the perf calls remain valid as users might interact with resource control, and the system correctly preserves lifetimes and frees data structures? + +And so for that, we want to understand and how the resource control subsystem is organized. + +There are _monitoring_ groups and _control_ groups. + +The root of the filesystem is managed through a variable `rdtgroup_default`. + +The hierarchy allows creating resctrl groups under the root. If monitoring is supported, there is a `mon_groups` directory under the root, which contains subdirectories for each monitoring group. If allocation is supported, the user can create new resctrl groups using `mkdir` under the root directory, and these will allow allocation and also contain a `mon_groups` directory where monitoring groups can be created. This allows the user to create a hierarchy with a control groups and then monitoring groups underneath them. + +## Group lifecycle +It is important to understand lifecycle management, reference counting for the resource control groups in order to support perf. So let's look at how that is managed. The struct for resource control groups is `struct rdtgroup`. the rdtgroup is closely associated with a kernelfs node which is the file system node users can access resource control. Lifecycle is intertwined with kernfs. Kernfs implements a feature called active references, which are references on the kernfs nodes that are held during operations on kernfs nodes. Unlike the regular reference counts that prevent memory deallocation, the active references also prevent removal of the node from the kernfs file system. The resctrl implementation maintains the invariant that active reference holders can safely access the `rdtgroup`p associated with that node. On every kernfs operation (which holds an active reference), the code quickly exchanges the active referenc with a reference count on the `rdtgroup` using `rdtgroup_kn_lock_live`. That reference count is stored in the `waitcount` field of rdtgroup. That reference count protects the RDT group from being deallocated. + +When the user removes the `rdtgroup`, for example using RMDIR in `rdtgroup_rmdir_mon` or `rdtgroup_rmdir_ctrl`, the code sets the flag `RDT_DELETED` on the rdtgroup, and removes the kernfs node. The node removal waits for all the active references to be released. At that point all the operations in flight would have commuted their active reference to the kernfs node to a rdtgroup reference (`waitcount`), and no new operations can start, since the node has been removed. The last rdtgroup reference to be released via `rdtgroup_kn_unlock` will call `rdtgroup_kn_put` which will then call `rdtgroup_remove`, which releases the last regular reference to the kernfs node and frees the `rdtgroup`. + +Note that it seems like the way this subsystem was written was to ensure that removals can proceed relatively quickly in the face of filesystem resctrl operations like removals. The file operations quickly free up active references, so nodes can be removed: the implementation avoids tying up groups' active references during long-lived operations. If we want to keep this agility of removal of groups, we would need to ensure that our implementation does not hinder group removals by holding active references long. + + +## Group Renaming +The rename functionality allows very specific moves of groups inside the resource control file system. The moves it allows are moving monitoring groups from under one parent to another parent. and those parents are always either the default group or control group. this allows changing the resources allocated to a group without changing the monitoring. Looking at the function `mongrp_reparent`, It moves an existing struct rdt group from one parent to another. By setting the parent pointer of the control group and and moving itself from the old groups list to the new groups. This means that any perf event should continue operation through such a move. + + +## Monitoring Uses Resources, Events and Domains +Okay, so what is the interface that the kernel's resource control provides internally to read the monitoring values? + +`rdtgroup_mondata_show` is the `seq_show` handler for monitoring files, set for files added with `mon_addfile`. The `priv` member of the `kernfs_node` for the monitoring files encodes several fields: `rid`, `evtid`, `domid`, and `sum` (`struct mon_data_bits` contains the union to encode this). So how are resources (`rid`), events (`evtid`) and domains (`domid`) maintained, and what is the `sum` field? + +## Resources and Events +Each architecture (currently x86) defines a `resctrl_arch_get_resource` function that returns a `struct rdt_resource` for a given resource ID, from 0 to `RDT_NUM_RESOURCES - 1`. In 6.15.6, the kernel defines (https://elixir.bootlin.com/linux/v6.15.6/source/include/linux/resctrl_types.h#L34) `RDT_RESOURCE_L3`, `RDT_RESOURCE_L2`, `RDT_RESOURCE_MBA`, `RDT_RESOURCE_SMBA`. And the variable that links to domains is `rdt_resources_all` https://elixir.bootlin.com/linux/v6.15.6/source/arch/x86/kernel/cpu/resctrl/core.c#L59 . `rdt_resources_all` initializes lists `ctrl_domains` and `mon_domaines` of the `struct rdt_resource` it contains, depending on whether the resource supports control or monitoring. + +Monitoring events in v6.15.6 are only supported for the L3 cache, and are initialized in `l3_mon_evt_init` https://elixir.bootlin.com/linux/v6.15.6/source/arch/x86/kernel/cpu/resctrl/monitor.c#L1100 : LLC occupancy, MBM total and MBM local. So in the monitoring files, we should only have `rid` set to `RST_RESOURCE_L3`, and the `evtid` one of `QOS_L3_OCCUP_EVENT_ID`, ,`QOS_L3_MBM_TOTAL_EVENT_ID`, or `QOS_L3_MBM_LOCAL_EVENT_ID`. https://elixir.bootlin.com/linux/v6.15.6/source/arch/x86/kernel/cpu/resctrl/monitor.c#L1078 + + +## Domains + +Whereas Resources are the hardware resources that can be controlled or monitored, domains are the logical groupings of CPUs and caches for control or measurement. + +When resctrl is initialized, it subscribes to CPU state changes using `cpuhp_setup_state`: https://elixir.bootlin.com/linux/v6.15.6/source/arch/x86/kernel/cpu/resctrl/core.c#L1029, with callback `resctrl_arch_online_cpu` to process new online CPUs. This callback calls `domain_add_cpu` for each resource, which calls `domain_add_cpu_ctrl` and `domain_add_cpu_mon` depending on the resource (control vs monitoring). These add a domain if it doesn't exist, and add the CPU to the domain's CPU mask. + +The Sub-NUMA Clustering (SNC) feature influences how domains are initialized. In SNC mode, the CPU assigns memory addresses to L3 caches closest to the memory controller that serves them. This reduces latency when accessing memory local to the Sub-NUMA cluster. SNC mode creates multiple L3 domain for each cache, one for each SNC domain. In SNC mode, the counters for each RMID are distributed across the SNC domains, so to obtain a measurement, the RMID needs to be summed up across the domains, which is what the `sum` field in `mon_data_bits` signifies. + +Each domain maintains a pointer to a `struct cacheinfo` which holds an ID for the cache. When the `sum` field is set, the `domid` field is the cache ID, and the `rdtgroup_mondata_show` function sums up the RMID's values for all domains on the cache with that cache ID. Otherwise, the `domid` field is the domain ID, and the `rdtgroup_mondata_show` function reads the RMID value for that domain. The `mon_data` directory created by `mkdir_mondata_subdir` contains a directory `mon_L3_NN` to read the RMID values, which in SNC mode sums up the relevant domains (and NN is the cache ID). In SNC mode, `mkdir_mondata_subdir` also adds directories `mon_sub_L3_` that reads the RMID on the specific SNC domain, with `sum` set to 0. + +`snc_get_config` determines whether SNC is enabled, and sets the variable `snc_nodes_per_l3_cache` to the number of SNC nodes per L3 cache. + + +## Reading Event Counts + +So, going back to what `rdtgroup_mondata_show` does, it retrieves the `struct rdtgroup` from the `kernfs_node`., unpacks the `priv` field, gets the resource for the `rid` (which in v6.15.6 is always L3), finds the CPU mask associated with `domid` (the relevant domain's if `sum` is 0, or the cache's if `sum` is set), and calls `mon_event_read` with the CPU mask for the domain/cache. `mon_event_read` calls `mon_event_count` on one of the CPUs in the mask. Measuring a control group also adds the measurements of all its monitoring groups, each read using `__mon_event_count`. `__mon_event_count` verifies that the CPU running it is legal for the domain or cache being read, and uses `resctrl_arch_rmid_read` to read the RMID value for the domain, or multiple RMID values for the cache, by passing the cache's different SNC domains as the domain parameter to `resctrl_arch_rmid_read`. + + +## Mutual exclusion and locking + +TODO: overview + +Do reads hold the global resctrl lock `rdtgroup_mutex`? `rdtgroup_kn_lock_live` locks it, and `rdtgroup_kn_unlock` unlocks it. Indeed `rdtgroup_mondata_show` uses these, and reads counters while holding the lock. These lock/unlock functions also hold the `cpus_read_lock()` so it can read the domain lists without worrying about them changing during the read. And the mutex also ensures `mon_event_count` can traverse the nested monitoring groups list of a control group to sum those values up. It seems that the `cpus_read_lock()` would be sufficient for reads of non control groups, since those reads just look at domains and rdt_resources, and read the fields closid and rmid of the `struct rdtgroup`. So a perf event that reads a monitoring group need not hold the `rdtgroup_mutex`, but it would need to hold the `cpus_read_lock()` to read entries in the domain list. + +So from this locking discussion, it seems that the perf event reading a monitoring group could check if the group's type is a control group. If it is, it would need to hold the `rdtgroup_mutex` and the `cpus_read_lock()` while reading the counters. If it is not a control group, it would only need to hold the `cpus_read_lock()` while reading the counters. + +## Counter Reads through the MBM Overflow Handler + +MBM appears to have its own pathways to read event counts via `mbm_update` which calls `mbm_update_one_event` on the total and local event IDs. This is used in `mbm_handle_overflow` which iterates all resource control groups (top level and nested monitoring groups). When the filesystem is mounted or a new domain comes online, `mbm_setup_overflow_handler` schedules `mbm_handle_overflow` every `MBM_OVERFLOW_INTERVAL` milliseconds (defined as 1000 in v6.15.6 https://elixir.bootlin.com/linux/v6.15.6/source/arch/x86/kernel/cpu/resctrl/internal.h#L21), and `mbm_handle_overflow` renews the timer. This ensures a minimum reading frequency for MBM events, which the designers seem to have intended to be frequent enough to allow the kernel to detect all counter overflows. + +This mechanism doesn't seem to have an interaction with the `mon_data` file read flow. + +TODO: but why do this? Where is the state being updated? Where is it? If we're going to relax mutual exclusion for reads, is there a conflict on that state? + + +## Pseudo Locking +Cache pseudo locking is explained in kernel docs: https://elixir.bootlin.com/linux/v6.15.6/source/Documentation/arch/x86/resctrl.rst#L701 . It allocates an RDT group with exclusive access to a portion of the cache and memory that will map into this cache and ensures that and loads that memory into the cache because that RDT group has exclusive access to that cache area and that RDT group is then no longer used by any task or CPU, you, then any process accessing this memory will have it in cache and it would not be evicted by other tasks? The mechanism exposes this memory through a character device that it creates so that a user could memory map this area into their application to access this. high performance region of memory because most of it resides in cache. + +To set up sudo locking, the user sets up a control group and changes its mode to sudo lock setup. Then when setting the resources of the control group, it becomes the kernel, creates the memory and loads it into the cache. Moving into locksetup mode is done in the `rdtgroup_locksetup_enter` function. That function checks no monitoring is in progress and frees the RMID. So this would interfere with any perf events that are monitoring the group -- there should be a check ensuring there are no such perf events before entering the lock setup mode. The perf code should also disallow new perf events on the group in lock setup and in locked modes. `rdtgroup_locksetup_exit` is used when moving back to shareable or exclusive mode. It re-allocates an RMID, so should re-enable perf functionality. + +Once locked, the rdtgroup cannot change mode and monitoring is disallowed, imposing no further constraints on the perf implementation. + + +## CPU Monitoring +one interesting feature of the resctrl subsystem is the cpu monitoring functionality. One of the files that is created for every resource group is `cpus`. The cpu mask in the `cpus` file controls the resctrl group that is responsible for tasks that have not been assigned to an explicit rdtgroup, that are running on those CPUs. so by assigning CPUs to a resource control group through this CPU mask, one can control which group controls and monitors tasks that are not assigned to any group, running on those CPUs. Each CPU is in exactly in one of the default mask or any of the control groups. Each CPU in the mask of the default group or a control group might also belong at most one monitoring group under the default/control group. `cpus_ctrl_write`, `cpus_mon_write`, `rdtgroup_rmdir_ctrl`, and `rdtgroup_rmdir_mon` maintain the CPU mask invariants, and cause MSRs to be updated on affected CPUs using `update_closid_rmid`. Due to this feature, control groups also have an RMID (Resource Monitoring ID) associated with them, and it is possible to monitor control groups. + + + + + +## Perf and CPU locality + +The perf subsystem already takes care of reading counters on suitable CPUs, so it can skip the `smp_call_on_cpu()` or `smp_call_function_any()` calls. `perf_event_read` already calls `smp_call_function_single` to a suitable CPU. In cases where the PMU specifies that an event can be read on any CPU on the same package, and the current CPU is on the same package as `event->oncpu`, `__perf_event_read_cpu` would return is a the current CPU, and `smp_call_function_single` would run the function locally -- a performance bonus. Indeed, perf also offers a `perf_event_read_local` function that reads the event on the current CPU without calling `smp_call_function_single`, which is NMI-safe and is the function used for perf reads from eBPF. + +The new perf implementation can leverage perf's existing CPU selection logic and avoid the logic in `mon_event_read`, instead scheduling `mon_event_count`. + +Can we specify that events are package events (`event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG`) When the user asks to monitor a domain or cache? Never specifying `PERF_EV_CAP_READ_ACTIVE_PKG` is safe but would restrict the user to read the perf event on that single core. Since reads are L3-scoped, we would need to know that the package only has one L3 cache. Both non-SNC and SNC modes allow any core in the cache to query every RMID (the difference is that SNC mode requires multiple RMID reads to obtain the value), so a single cache per package would allow the flag to be set. We still need to check that CPUs guarantee the one-to-one cache to package mapping. + + +## Maintaining State in Perf Events +One of the gaps we still have is How do PMUs handle their state? How can each PMU keep its own set of variables for each perf event? The perf event subsystem has has several data structures for perf event: `struct perf_event`, `struct perf_event_context` and `struct perf_event_pmu_context`. We should understand how custom state could be saved by PMUs. + +It turns out this seems to be well provisioned in the perf subsystem. PMUs can declare an `event_init` function where they can set the `event->pmu_priv` field that is controlled by the PMU. The PMU can set the `event->destroy` callback, which allows the PMU to free any resources tied up with that private data. + +Another location of event data is in `event->hw` which is a `struct hw_perf_event`. It holds a union for different event types, and maintainers might support adding union members there rather than allocating/deallocating private data per event. + + + + + +As the user opens perf events, what are the semantics in the face of group removals? This would influence the complexity and cost of coordination between the perf implementation and the resource control subsystem. One option would be that once the perf event was opened with a CLOSID and RMID, it would continue to read the same group, even if the group was removed; it would be the responsibility of the user to ensure that the group is not removed while the perf event is open. The failure mode could be if those IDs are re-allocated to a different group, then the perf event would read the wrong group. This is a simple solution, but it requires the user to be careful about group removals. + +A richer semantics would be to have the perf event stop reading the group when it is removed. It could return the same measurement over and over (i.e., no change), or return an error on the next read. This would allow users to remove groups without worrying about perf events reading them, but it would add some complexity to the perf implementation. We would prefer any mechanism to have minimal performance impact in the regular case, and to push coordination overhead to group removals -- while not blocking removals. + +when an rdt group is removed from kernfs, its `flags` member is marked with `RDT_DELETED`. if we could maintain a list of all of the perf events related to the RDT group we could require the process of marking the RDT group as deleted to also mark all of the perf events with deleted flags. Then each perf event would only have to check its own flags on every read to check if it is still valid, which would be local to the other state that the perf event keeps and and so should have very low overhead. and in regular operation there would be very little coordination overhead. We We impose this coordination overhead only when the RDT group is deleted where whoever Whoever is removing the RDT group has the burden of traversing whatever perf events in the system are related to that RDT group and notifying them. And that would be a handful of cash line bounces to set these flags, which should be acceptable overhead. In this case every perf event would hold a reference count to the RDT group to ensure that the list of perf events is not, is still accessible throughout the lifetime of the perf event. it. Adding a new perf event would mean enqueuing, adding it to the list. And closing the perf event would entail removing it from the list. and so the the data structure holding that perf event list needs to have a lifetime longer than all of the perf events. events. In this case, if every perf event holds a reference to the RDT group, then that would would ensure that the list remains alive as long as it needs to. It might cause removing of of freeing of the RDT group memory to take longer. but since the perf events would still be open, That would be acceptable behavior to keep the RDT group as long as there is a perf event open. it does not having this extra reference does it still allows the removal of the RDT group which is good which is what we wanted to have. + + + + + + +links: +- rdtgroup_kn_lock_live: https://elixir.bootlin.com/linux/v6.15.6/source/arch/x86/kernel/cpu/resctrl/rdtgroup.c#L2556 +- rdtgroup_kn_unlock: https://elixir.bootlin.com/linux/v6.15.6/source/arch/x86/kernel/cpu/resctrl/rdtgroup.c#L2575 +- rdtgroup_kn_put: https://elixir.bootlin.com/linux/v6.15.6/source/arch/x86/kernel/cpu/resctrl/rdtgroup.c#L2542 +- rdtgroup_rmdir_mon: https://elixir.bootlin.com/linux/v6.15.6/source/arch/x86/kernel/cpu/resctrl/rdtgroup.c#L3801 +- rdtgroup_rmdir_ctrl: https://elixir.bootlin.com/linux/v6.15.6/source/arch/x86/kernel/cpu/resctrl/rdtgroup.c#L3849 +- rdtgroup_locksetup_exit: https://elixir.bootlin.com/linux/v6.15.6/source/arch/x86/kernel/cpu/resctrl/pseudo_lock.c#L769 +- rdtgroup_mondata_show: https://elixir.bootlin.com/linux/v6.15.6/source/arch/x86/kernel/cpu/resctrl/ctrlmondata.c#L661 \ No newline at end of file diff --git a/resctrl-fd.md b/resctrl-fd.md new file mode 100644 index 00000000000000..2fc2f5fa7fa355 --- /dev/null +++ b/resctrl-fd.md @@ -0,0 +1,320 @@ +# Getting struct rdtgroup from File Descriptor: A Guide for Perf Support + +## Overview + +To add perf support to resctrl, we need a mechanism similar to `cgroup_bpf_prog_attach()` that can convert a file descriptor into a `struct rdtgroup`. This guide explains how to implement this conversion safely, following the same patterns used by the cgroup subsystem. + +## Background: How cgroup_bpf_prog_attach Works + +The cgroup implementation provides a well-established pattern for fd-to-cgroup conversion: + +```c +int cgroup_bpf_prog_attach(const union bpf_attr *attr, ...) +{ + struct cgroup *cgrp; + + cgrp = cgroup_get_from_fd(attr->target_fd); // Key conversion + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + // Use the cgroup... + + cgroup_put(cgrp); // Release reference + return ret; +} +``` + +### The cgroup fd-to-struct conversion chain: + +1. **`cgroup_get_from_fd(fd)`** → `cgroup_v1v2_get_from_fd(fd)` +2. **`cgroup_v1v2_get_from_fd(fd)`** → Gets file from fd, calls `cgroup_v1v2_get_from_file(file)` +3. **`cgroup_v1v2_get_from_file(file)`** → Calls `css_tryget_online_from_dir(dentry, NULL)` +4. **`css_tryget_online_from_dir(dentry, NULL)`** → Validates filesystem type, extracts cgroup from kernfs + +## Implementing rdtgroup_get_from_fd() + +Following the cgroup pattern, here's how to implement `rdtgroup_get_from_fd()`: + +### 1. Main Function Structure + +```c +struct rdtgroup *rdtgroup_get_from_fd(int fd) +{ + CLASS(fd_raw, f)(fd); // Modern cleanup pattern + if (fd_empty(f)) + return ERR_PTR(-EBADF); + return rdtgroup_get_from_file(fd_file(f)); +} +``` + +### 2. File-to-rdtgroup Conversion + +```c +static struct rdtgroup *rdtgroup_get_from_file(struct file *f) +{ + struct rdtgroup *rdtgrp; + + rdtgrp = rdtgroup_tryget_from_dentry(f->f_path.dentry); + if (IS_ERR(rdtgrp)) + return rdtgrp; + + return rdtgrp; +} +``` + +### 3. Core Validation and Reference Acquisition + +```c +static struct rdtgroup *rdtgroup_tryget_from_dentry(struct dentry *dentry) +{ + struct kernfs_node *kn; + struct file_system_type *s_type = dentry->d_sb->s_type; + struct rdtgroup *rdtgrp = NULL; + + // Verify it's actually a resctrl filesystem + if (s_type != &rdt_fs_type) + return ERR_PTR(-EBADF); + + kn = kernfs_node_from_dentry(dentry); + if (!kn || kernfs_type(kn) != KERNFS_DIR) + return ERR_PTR(-EBADF); + + rcu_read_lock(); + + // Extract rdtgroup from kernfs node using existing resctrl logic + rdtgrp = kernfs_to_rdtgroup(kn); + if (!rdtgrp) { + rcu_read_unlock(); + return ERR_PTR(-ENOENT); + } + + // Try to acquire a reference - check if group is still live + if (!rdtgroup_tryget_live(rdtgrp)) + rdtgrp = ERR_PTR(-ENOENT); + + rcu_read_unlock(); + return rdtgrp; +} +``` + +### 4. Reference Counting Functions + +We need to implement reference counting similar to what cgroups do: + +```c +static bool rdtgroup_tryget_live(struct rdtgroup *rdtgrp) +{ + // Check if group is being deleted + if (rdtgrp->flags & RDT_DELETED) + return false; + + // Increment reference count atomically + atomic_inc(&rdtgrp->waitcount); + + // Double-check after incrementing (race with deletion) + if (unlikely(rdtgrp->flags & RDT_DELETED)) { + atomic_dec(&rdtgrp->waitcount); + return false; + } + + return true; +} + +void rdtgroup_put(struct rdtgroup *rdtgrp) +{ + if (atomic_dec_and_test(&rdtgrp->waitcount)) { + // If this was the last reference and group is deleted, + // trigger cleanup (similar to rdtgroup_kn_put logic) + if (rdtgrp->flags & RDT_DELETED) { + // Schedule or perform cleanup + rdtgroup_remove(rdtgrp); + } + } +} +``` + +## Leveraging Existing resctrl Infrastructure + +The resctrl subsystem already has the necessary infrastructure: + +### kernfs_to_rdtgroup() Function +Located in `fs/resctrl/rdtgroup.c:2365`: + +```c +static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn) +{ + if (kernfs_type(kn) == KERNFS_DIR) { + // Resource directories use kn->priv to point to rdtgroup + if (kn == kn_info || rcu_access_pointer(kn->__parent) == kn_info) + return NULL; // info directories don't have rdtgroups + else + return kn->priv; // Direct pointer to rdtgroup + } else { + return rdt_kn_parent_priv(kn); // Files get rdtgroup from parent + } +} +``` + +### Filesystem Type Validation +The `rdt_fs_type` is defined in `fs/resctrl/rdtgroup.c:2981` and can be used for validation: + +```c +extern struct file_system_type rdt_fs_type; // Need to export this +``` + +### Existing Reference Counting +resctrl already uses `rdtgrp->waitcount` for reference counting: +- `rdtgroup_kn_get()` - increments waitcount +- `rdtgroup_kn_put()` - decrements waitcount +- `rdtgroup_kn_lock_live()` - gets live reference with locking +- `rdtgroup_kn_unlock()` - releases reference with unlocking + +## Locking Strategy for Perf Events + +Based on the resctrl locking analysis from the research document: + +### For Monitoring Groups (Simple Case) +```c +int resctrl_perf_read_monitoring_group(struct rdtgroup *rdtgrp, ...) +{ + // Only need cpus_read_lock for domain list stability + cpus_read_lock(); + + // Check if group is still valid + if (rdtgrp->flags & RDT_DELETED) { + cpus_read_unlock(); + return -ENOENT; + } + + // Perform the read using existing mon_event_count logic + // ... + + cpus_read_unlock(); + return 0; +} +``` + +### For Control Groups (Complex Case) +```c +int resctrl_perf_read_control_group(struct rdtgroup *rdtgrp, ...) +{ + // Need both rdtgroup_mutex and cpus_read_lock for control groups + // because we need to sum monitoring groups under the control group + + mutex_lock(&rdtgroup_mutex); + cpus_read_lock(); + + if (rdtgrp->flags & RDT_DELETED) { + cpus_read_unlock(); + mutex_unlock(&rdtgroup_mutex); + return -ENOENT; + } + + // Sum control group + all its monitoring subgroups + // ... + + cpus_read_unlock(); + mutex_unlock(&rdtgroup_mutex); + return 0; +} +``` + +### Lock-Free Read Path for Perf +For high-frequency perf reads, we can optimize by checking the group type: + +```c +int resctrl_perf_read(struct rdtgroup *rdtgrp, ...) +{ + if (rdtgrp->type == RDTCTRL_GROUP) { + return resctrl_perf_read_control_group(rdtgrp, ...); + } else { + return resctrl_perf_read_monitoring_group(rdtgrp, ...); + } +} +``` + +## Validation Checks + +When converting fd to rdtgroup, perform these validations: + +### 1. Filesystem Type Check +```c +if (dentry->d_sb->s_type != &rdt_fs_type) + return ERR_PTR(-EBADF); +``` + +### 2. Directory Validation +```c +if (!kn || kernfs_type(kn) != KERNFS_DIR) + return ERR_PTR(-EBADF); +``` + +### 3. Info Directory Exclusion +```c +if (kn == kn_info || rcu_access_pointer(kn->__parent) == kn_info) + return ERR_PTR(-EBADF); // Can't monitor info directories +``` + +### 4. Pseudo-Lock Mode Check +```c +if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) + return ERR_PTR(-EINVAL); // Monitoring disabled in pseudo-lock modes +``` + +### 5. Deletion State Check +```c +if (rdtgrp->flags & RDT_DELETED) + return ERR_PTR(-ENOENT); // Group is being deleted +``` + +## Usage Example in Perf PMU + +```c +static int resctrl_pmu_event_init(struct perf_event *event) +{ + struct rdtgroup *rdtgrp; + struct resctrl_perf_ctx *ctx; + + // Get rdtgroup from cgroup fd (using PERF_FLAG_PID_CGROUP pattern) + rdtgrp = rdtgroup_get_from_fd(event->attr.cgroup_fd); + if (IS_ERR(rdtgrp)) + return PTR_ERR(rdtgrp); + + // Allocate per-event context + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) { + rdtgroup_put(rdtgrp); + return -ENOMEM; + } + + ctx->rdtgrp = rdtgrp; // Store reference + ctx->evtid = /* extract from event->attr.config */; + event->pmu_private = ctx; + + // Set up cleanup callback + event->destroy = resctrl_pmu_event_destroy; + + return 0; +} + +static void resctrl_pmu_event_destroy(struct perf_event *event) +{ + struct resctrl_perf_ctx *ctx = event->pmu_private; + + rdtgroup_put(ctx->rdtgrp); // Release reference + kfree(ctx); +} +``` + +## Summary + +This approach provides: + +1. **Safe Conversion**: Filesystem type validation ensures fd points to resctrl +2. **Proper Reference Counting**: Prevents use-after-free with deleted groups +3. **Efficient Locking**: Minimal locks for monitoring groups, appropriate locks for control groups +4. **Error Handling**: Clear error codes for various failure modes +5. **Integration**: Leverages existing resctrl infrastructure (kernfs_to_rdtgroup, waitcount, etc.) + +The pattern closely follows the proven cgroup approach while adapting to resctrl's specific data structures and locking requirements. \ No newline at end of file diff --git a/resctrl_internals.md b/resctrl_internals.md new file mode 100644 index 00000000000000..70e63e0887641f --- /dev/null +++ b/resctrl_internals.md @@ -0,0 +1,136 @@ +# A Guide to Linux `resctrl` Subsystem Internals + +The Resource Control (`resctrl`) subsystem is a Linux kernel feature, primarily for x86 platforms, that provides a user-space interface for managing and monitoring CPU resources. It exposes Intel's Resource Director Technology (RDT) and AMD's Platform Quality of Service (PQOS) features. + +These technologies allow for: +* **Allocation**: Partitioning resources like Last Level Cache (LLC) and memory bandwidth among groups of tasks. This is known as Cache Allocation Technology (CAT) and Memory Bandwidth Allocation (MBA). +* **Monitoring**: Observing the usage of these resources by groups of tasks. This is known as Cache Monitoring Technology (CMT) and Memory Bandwidth Monitoring (MBM). + +The user interface is a dedicated filesystem, typically mounted at `/sys/fs/resctrl`. This guide explains the key components and workflows within the subsystem. + +## 1. Responsibilities of `fs/resctrl` + +This directory contains the Virtual File System (VFS) layer that connects the user-space view (directories and files) to the underlying `resctrl` kernel logic. It is responsible for handling all file operations on the `resctrl` mount point. + +* **`fs/resctrl/fs.c`**: This is the entry point for the filesystem itself. + * It defines the `resctrl_fs_type` which is registered with the kernel using `register_filesystem()`. + * It implements the mount (`resctrl_mount`) and unmount (`resctrl_kill_sb`) operations. When you run `mount -t resctrl resctrl /sys/fs/resctrl`, the `resctrl_mount` function is called. + * It's responsible for setting up the root directory of the filesystem, including the default resource group and the `info` subdirectory. + +* **`fs/resctrl/rdtgroup.c`**: This is the heart of the user-space interface. It manages the resource group directories and the files within them. + * **Directory Operations**: It implements the `mkdir` and `rmdir` syscall handlers for the `resctrl` fs. When a user runs `mkdir /sys/fs/resctrl/my_group`, the **`rdtgroup_mkdir()`** function is called. This is the crucial function that creates a new resource control group. + * **File Operations**: It defines the file operations (`read`, `write`, `open`, etc.) for the control files within each group directory like `tasks`, `cpus`/`cpu_mask`, `schemata`, `mon_data`, and `mon_groups`. + +* **`fs/resctrl/internal.h`**: A private header file containing function prototypes and structure definitions used only within the `fs/resctrl` directory. + +## 2. What each file in `arch/x86/kernel/cpu/resctrl` does + +This directory contains the architecture-specific (x86) implementation that interacts directly with the hardware (via Model-Specific Registers or MSRs) to apply the policies and perform monitoring configured through the filesystem interface. + +* **`arch/x86/kernel/cpu/resctrl/core.c`**: This is the core logic hub. + * **Initialization**: **`rdt_init()`** is the main entry point, called during kernel boot to detect RDT features and initialize the subsystem. + * **Group Management**: It manages the lifecycle of `struct rdt_group`, the kernel's representation of a resource group. It allocates and frees CLOSIDs (Class of Service ID) and RMIDs (Resource Monitoring ID), which are the hardware identifiers for groups. + * **Task Association**: It contains the scheduler hooks that apply resource controls to a task. The key function is **`__resctrl_sched_in()`**, which is called on a context switch to program the CPU with the task's group IDs. + +* **`arch/x86/kernel/cpu/resctrl/rdt.c`**: This file handles resource and feature detection. + * It reads CPUID leaves to discover which RDT/PQOS features (CAT, MBA, CMT, MBM, etc.) are available on the CPU. + * It populates the `struct rdt_resource` array, which describes each available resource (e.g., L3 cache). This information is exposed to user space in the `/sys/fs/resctrl/info` directory. + +* **`arch/x86/kernel/cpu/resctrl/ctrl.c`**: Manages the "control" or "allocation" features (CAT and MBA). + * It contains the logic for parsing the `schemata` file content written by the user. + * It validates user-provided cache masks (for CAT) or bandwidth values (for MBA). + * It programs the hardware MSRs (e.g., `IA32_L3_CBM_BASE`) with these values, associating a `CLOSID` with a specific resource policy. + +* **`arch/x86/kernel/cpu/resctrl/mon.c`**: Manages the "monitoring" features (CMT and MBM). + * This is a key file for your goal of adding perf-based monitoring. It handles reading the hardware monitoring counters. + * When user space reads from a `mon_data` file, the VFS layer calls into functions in this file. + * It reads the `IA32_QM_CTR` MSR, which contains the occupancy or bandwidth usage data associated with a specific `RMID`. + +* **`arch/x86/kernel/cpu/resctrl/pseudo_lock.c`**: Implements the Pseudo-Locking feature, a special allocation feature where a region of the cache is "locked" for exclusive use by a resource group. + +* **`arch/x86/kernel/cpu/resctrl/internal.h`**: A private header file for the architecture-specific implementation. + +## 3. What Happens When We `mkdir` a New Resctrl Group? + +Creating a new directory in the `resctrl` filesystem is the fundamental way to create a new resource control group. Here is the step-by-step flow: + +1. **User-space Action**: A user or script executes `mkdir /sys/fs/resctrl/my_group`. +2. **VFS Layer**: The kernel's VFS receives the `mkdir` syscall and routes it to the `mkdir` inode operation defined in `resctrl_fs_type`. +3. **Filesystem Handler**: This call is routed to **`rdtgroup_mkdir()`** in `fs/resctrl/rdtgroup.c`. +4. **Group Allocation**: Inside `rdtgroup_mkdir()`, a new instance of `struct rdt_group` is allocated. This structure holds all the information for the new group. + ```c + // In fs/resctrl/rdtgroup.c + static int rdtgroup_mkdir(struct kernfs_node *parent, const char *name, umode_t mode) + { + struct rdt_group *rdt_parent_group, *rdtgrp; + // ... + rdtgrp = rdtgroup_kn_alloc_init(parent, name); // Allocate and init rdt_group + // ... + } + ``` +5. **CLOSID/RMID Assignment**: The core logic in `arch/x86/kernel/cpu/resctrl/core.c` is called to find a free `CLOSID` and `RMID` to assign to this new group. +6. **File Creation**: `rdtgroup_mkdir()` then creates the standard set of control files (`tasks`, `schemata`, etc.) within the new directory using the `kernfs` API. +7. **Group Registration**: The new `rdt_group` is added to a global list (`rdt_all_groups`), making it visible to the rest of the `resctrl` subsystem. +8. **Hardware Update**: The `schemata` file is pre-populated with the default (full) cache mask, and the core logic programs the corresponding `IA32_L3_CBM_BASE + closid` MSR with this default mask. + +At this point, the group exists but contains no tasks. + +## 4. How the Subsystem Tracks Tasks and Enforces Policies + +Associating a task with a group and enforcing its policy is a two-step process: assignment and enforcement. + +### Assignment + +1. **User-space Action**: A user writes a task ID (TID) to the `tasks` file: `echo > /sys/fs/resctrl/my_group/tasks`. +2. **Filesystem Handler**: The `write()` syscall is handled by **`rdtgroup_tasks_write()`** in `fs/resctrl/rdtgroup.c`. +3. **Task Lookup**: This function parses the TID and finds the corresponding `struct task_struct`. +4. **Update `task_struct`**: The key step happens here. The `closid` and `rmid` from the target `rdt_group` are written directly into the task's `task_struct`. This is handled by **`resctrl_move_task()`** in `arch/x86/kernel/cpu/resctrl/core.c`. + ```c + // In include/linux/sched.h + struct task_struct { + // ... + #ifdef CONFIG_RESCTRL + u32 closid; + u32 rmid; + #endif + // ... + }; + ``` + ```c + // In arch/x86/kernel/cpu/resctrl/core.c + int resctrl_move_task(int pid, struct rdt_group *rdtgrp) + { + // ... find task_struct *p from pid ... + WRITE_ONCE(p->closid, rdtgrp->closid); + WRITE_ONCE(p->rmid, rdtgrp->mon.rmid); + // ... + } + ``` + +### Enforcement (The Scheduler Hook) + +The assignment only flags the task. The policy is enforced every time the task gets to run on a CPU core. + +1. **Context Switch**: When the Linux scheduler decides to run our task, it performs a context switch. +2. **Resctrl Hook**: As part of the context switch path, the scheduler calls **`__resctrl_sched_in()`**, defined in `arch/x86/kernel/cpu/resctrl/core.c`. +3. **Program the MSR**: `__resctrl_sched_in()` reads the `closid` and `rmid` from the `task_struct` of the task that is about to run. It then writes these values to the per-CPU `IA32_PQR_ASSOC` (PQR) MSR. + ```c + // In arch/x86/kernel/cpu/resctrl/core.c + void __resctrl_sched_in(struct task_struct *tsk) + { + u64 pqr_val; + + // If the task's closid is the same as what's already in the MSR, + // we can skip the MSR write for performance. + if (tsk->closid == this_cpu_read(pqr_state.cur_closid)) + return; + + pqr_val = resctrl_to_pqr(tsk->closid, tsk->rmid); + wrmsrl(MSR_IA32_PQR_ASSOC, pqr_val); + + // Cache the current value to avoid redundant MSR writes. + this_cpu_write(pqr_state.cur_closid, tsk->closid); + this_cpu_write(pqr_state.cur_rmid, tsk->rmid); + } + ``` +This MSR write is the final step. It tells the CPU hardware, "The code that is about to execute belongs to this Class of Service and should be monitored with this Resource Monitoring ID." The CPU then automatically enforces the associated cache partitions and updates the correct monitoring counters. \ No newline at end of file diff --git a/x64-resctrl-diagrams.md b/x64-resctrl-diagrams.md new file mode 100644 index 00000000000000..1a78a492b2a47f --- /dev/null +++ b/x64-resctrl-diagrams.md @@ -0,0 +1,723 @@ +# x86 ResCtrl Architecture Function Flow Diagrams + +This document contains mermaid diagrams showing the important function flows in the x86 ResCtrl architecture implementation, focusing on the interaction between the filesystem layer (`fs/resctrl/`) and the architecture-specific layer (`arch/x86/kernel/cpu/resctrl/`). + +## 1. Reference Counting Flow - rdtgroup_kn_lock_live + +This diagram shows the critical reference counting mechanism that ensures safe access to rdtgroup structures during concurrent operations. + +```mermaid +graph TD + A[rdtgroup_kn_lock_live] --> B[kernfs_to_rdtgroup] + A --> C[rdtgroup_kn_get] + A --> D[cpus_read_lock] + A --> E[mutex_lock rdtgroup_mutex] + A --> F{flags & RDT_DELETED?} + F -->|Yes| G[Return NULL] + F -->|No| H[Return rdtgrp] + + C --> I[atomic_inc waitcount] + C --> J[kernfs_break_active_protection] + + K[rdtgroup_kn_unlock] --> L[mutex_unlock rdtgroup_mutex] + K --> M[cpus_read_unlock] + K --> N[rdtgroup_kn_put] + + N --> O[atomic_dec_and_test waitcount] + O --> P{waitcount == 0 && RDT_DELETED?} + P -->|Yes| Q[rdtgroup_pseudo_lock_remove] + P -->|Yes| R[kernfs_unbreak_active_protection] + P -->|Yes| S[rdtgroup_remove] + P -->|No| T[kernfs_unbreak_active_protection] + + S --> U[kernfs_put] + S --> V[kfree rdtgrp] + + style A fill:#e1f5fe + style K fill:#e8f5e8 + style C fill:#fff3e0 + style N fill:#ffebee + style P fill:#f3e5f5 +``` + +## 2. rmdir Operation Flow + +This diagram shows how directory removal operations flow through both the filesystem and architecture layers. + +```mermaid +graph TD + A[rdtgroup_rmdir] --> B[rdtgroup_kn_lock_live] + B --> C{rdtgrp valid?} + C -->|No| D[Return -EPERM] + C -->|Yes| E{Group Type?} + + E -->|Control Group| F[rdtgroup_rmdir_ctrl] + E -->|Monitor Group| G[rdtgroup_rmdir_mon] + + F --> H[Free extra groups] + F --> I[rdt_move_group_tasks] + F --> J[Set flags = RDT_DELETED] + F --> K[update_closid_rmid] + F --> L[rdtgroup_ctrl_remove] + + G --> M[rdt_move_group_tasks] + G --> N[Set flags = RDT_DELETED] + G --> O[update_closid_rmid] + G --> P[free_rmid] + + I --> Q[Move tasks to parent] + M --> Q + + K --> R[resctrl_arch_sync_cpu_closid_rmid] + O --> R + + R --> S[this_cpu_write pqr_state] + R --> T[resctrl_arch_sched_in] + + L --> U[closid_free] + L --> V[kernfs_remove] + + P --> W[Release monitoring ID] + + X[rdtgroup_kn_unlock] --> Y[Cleanup if waitcount == 0] + + style A fill:#e1f5fe + style F fill:#ffebee + style G fill:#fff3e0 + style R fill:#f3e5f5 + style B fill:#e8f5e8 + style X fill:#e8f5e8 +``` + +## 3. Monitoring Directory Creation Flow + +This diagram shows how the monitoring directory structure is created, starting with mkdir_mondata_all(). + +```mermaid +graph TD + A[mkdir_mondata_all] --> B[mongroup_create_dir] + B --> C[Create 'mon_data' directory] + + A --> D[for_each_mon_capable_rdt_resource] + D --> E[mkdir_mondata_subdir_alldom] + + E --> F[for each domain in r->mon_domains] + F --> G[mkdir_mondata_subdir] + + G --> H[Create domain directory] + G --> I{SNC enabled?} + I -->|No| J + I -->|Yes| K[Create SNC subdirectories] + K --> L + + J["Directory: mon_[resource]_[domain_id]"] --> M[mon_add_all_files] + L["Directory: mon_sub_[resource]_[domain_id]"] --> M + + M --> N[for each event in r->evt_list] + N --> O[mon_addfile] + + O --> P[Create event file] + O --> Q[Set kf_mondata_ops] + O --> R[Store mon_data in kn->priv] + + P --> S[Files: llc_occupancy, mbm_total_bytes, mbm_local_bytes] + + style A fill:#e1f5fe + style B fill:#e8f5e8 + style E fill:#fff3e0 + style G fill:#f3e5f5 + style M fill:#f1f8e9 + style O fill:#fce4ec +``` + +## 4. Monitoring Data Read Flow + +This diagram shows how monitoring data flows from MSRs through the architecture layer to the filesystem layer. + +```mermaid +graph TD + A[rdtgroup_mondata_show] --> C[Parse event/domain from kernfs] + C --> D[resctrl_arch_rmid_read] + + D --> E[logical_rmid_to_physical_rmid] + E --> F[__rmid_read_phys] + + F --> G[wrmsrl MSR_IA32_QM_EVTSEL] + F --> H[rdmsrl MSR_IA32_QM_CTR] + F --> I{Counter valid?} + I -->|Error bit set| J[Return -EIO] + I -->|Unavailable| K[Return -EINVAL] + I -->|Valid| L[Process counter value] + + L --> M[get_corrected_mbm_count] + L --> N[Apply mon_scale factor] + L --> O[Handle MBM overflow] + + M --> P[Apply hardware corrections] + O --> Q[Update arch_mbm_state] + + + style A fill:#e1f5fe + style D fill:#e8f5e8 + style F fill:#fff3e0 + style G fill:#f3e5f5 + style H fill:#f3e5f5 +``` + +## 4. MSR Access and Hardware Interface + +This diagram shows the low-level MSR access patterns for monitoring and control. + +```mermaid +graph TD + A[Hardware MSR Interface] --> B[Monitoring MSRs] + A --> C[Control MSRs] + A --> D[Configuration MSRs] + + B --> E[MSR_IA32_QM_EVTSEL 0xc8d] + B --> F[MSR_IA32_QM_CTR 0xc8e] + B --> G[MSR_IA32_PQR_ASSOC 0xc8f] + + C --> H[MSR_IA32_L3_CBM_BASE] + C --> I[MSR_IA32_L2_CBM_BASE] + C --> J[MSR_IA32_MBA_THRTL_BASE] + + D --> K[MSR_IA32_L3_QOS_CFG] + D --> L[MSR_IA32_L2_QOS_CFG] + D --> M[MSR_RMID_SNC_CONFIG 0xca0] + D --> N[MSR_IA32_EVT_CFG_BASE 0xc0000400] + + E --> O[Event ID + RMID selection] + F --> P[Counter value read] + G --> Q[CLOSID/RMID association] + + K --> R[l3_qos_cfg_update] + L --> S[l2_qos_cfg_update] + M --> T[SNC configuration] + N --> U[Event configuration] + + R --> V[CDP enable/disable] + S --> V + + style A fill:#e1f5fe + style B fill:#e8f5e8 + style C fill:#fff3e0 + style D fill:#f3e5f5 +``` + +## 5. SNC (Sub-NUMA Cluster) Support Flow + +This diagram shows how SNC support works for monitoring in multi-node configurations. + +```mermaid +graph TD + A[resctrl_arch_rmid_read] --> B[logical_rmid_to_physical_rmid] + B --> C{SNC enabled?} + C -->|No| D[physical_rmid = logical_rmid] + C -->|Yes| E[Calculate node_id from domain] + + E --> F[physical_rmid = logical_rmid + node_id * num_rmid] + F --> G[Use physical_rmid for MSR access] + D --> G + + G --> H[__rmid_read_phys] + H --> I[wrmsrl MSR_IA32_QM_EVTSEL] + H --> J[rdmsrl MSR_IA32_QM_CTR] + + K[SNC Configuration] --> L[MSR_RMID_SNC_CONFIG] + L --> M[Set sharing mode] + M --> N[Configure node count] + + O[Domain Creation] --> P[snc_nodes_per_l3_cache] + P --> Q[Calculate domains per SNC node] + + style A fill:#e1f5fe + style B fill:#e8f5e8 + style C fill:#fff3e0 + style E fill:#f3e5f5 + style K fill:#f1f8e9 + style O fill:#fce4ec +``` + +## 6. Architecture Resource Initialization + +This diagram shows how architecture-specific resources are initialized and configured. + +```mermaid +graph TD + A[resctrl_cpu_detect] --> B[CPUID checks] + B --> C[rdt_get_mon_l3_config] + B --> D[rdt_get_cache_alloc_cfg] + B --> E[rdt_get_mba_config] + + C --> F[Configure MBM width offset] + C --> G[Set occupancy scale] + C --> H[Initialize BMEC mask] + C --> I[Setup SNC configuration] + + D --> J[Configure cache CBM] + D --> K[Set CDP capability] + D --> L[Initialize control domains] + + E --> M[Configure MBA throttle] + E --> N[Set MBA capability] + + O[Resource Registration] --> P[rdt_resources_all array] + P --> Q[L3 Resource] + P --> R[L2 Resource] + P --> S[MBA Resource] + + Q --> T[ctrl_domains list] + Q --> U[mon_domains list] + R --> T + S --> T + + style A fill:#e1f5fe + style C fill:#e8f5e8 + style D fill:#fff3e0 + style E fill:#f3e5f5 + style O fill:#f1f8e9 + style P fill:#fce4ec +``` + +## 7. CPU Online and Domain Creation Flow + +This diagram shows the complete flow from CPU coming online to domain creation and initialization. + +```mermaid +graph TD + A[resctrl_arch_online_cpu] --> B[mutex_lock domain_list_lock] + B --> C[for_each_capable_rdt_resource] + C --> D[domain_add_cpu] + + D --> E{Resource capabilities?} + E -->|alloc_capable| F[domain_add_cpu_ctrl] + E -->|mon_capable| G[domain_add_cpu_mon] + + F --> H[get_domain_id_from_scope] + G --> I[get_domain_id_from_scope] + + H --> J[resctrl_find_domain ctrl_domains] + I --> K[resctrl_find_domain mon_domains] + + J --> L{Domain exists?} + K --> M{Domain exists?} + + L -->|Yes| N[Add CPU to existing ctrl domain] + L -->|No| O[Create new ctrl domain] + M -->|Yes| P[Add CPU to existing mon domain] + M -->|No| Q[Create new mon domain] + + O --> R[domain_setup_ctrlval] + Q --> S[arch_mon_domain_online] + Q --> T[arch_domain_mbm_alloc] + + R --> U[Allocate ctrl_val arrays] + R --> V[setup_default_ctrlval] + R --> W[msr_update - program MSRs] + + S --> X[Configure SNC if needed] + T --> Y[Allocate MBM counter arrays] + + AA[Filesystem Integration] --> BB[resctrl_online_ctrl_domain] + AA --> CC[resctrl_online_mon_domain] + + CC --> DD[domain_setup_mon_state] + CC --> EE[Setup MBM overflow handler] + CC --> FF[mkdir_mondata_subdir_allrdtgrp] + + DD --> GG[Allocate rmid_busy_llc bitmap] + DD --> HH[Allocate mbm_total/local arrays] + + style A fill:#e1f5fe + style D fill:#e8f5e8 + style F fill:#fff3e0 + style G fill:#f3e5f5 + style O fill:#f1f8e9 + style Q fill:#fce4ec + style R fill:#e1f5fe + style S fill:#e8f5e8 +``` + +## 8. Domain ID Resolution and CPU Topology Mapping + +This diagram shows how CPU topology is mapped to domain IDs for different resource scopes. + +```mermaid +graph TD + A[get_domain_id_from_scope] --> B{Resource scope?} + + B -->|RESCTRL_L3_CACHE| C[get_cpu_cacheinfo_id cpu, 3] + B -->|RESCTRL_L2_CACHE| D[get_cpu_cacheinfo_id cpu, 2] + B -->|RESCTRL_L3_NODE| E[cpu_to_node cpu] + + C --> F[L3 Cache ID] + D --> G[L2 Cache ID] + E --> H[NUMA Node ID] + + I[CPU Topology Examples] --> J[CPUs 0-15 share L3 cache → Domain 0] + I --> K[CPUs 16-31 share L3 cache → Domain 1] + I --> L[CPUs 0-1 share L2 cache → Domain 0] + I --> M[CPUs 2-3 share L2 cache → Domain 1] + + N[Domain Lists] --> O[r->ctrl_domains - sorted by ID] + N --> P[r->mon_domains - sorted by ID] + + style A fill:#e1f5fe + style B fill:#e8f5e8 + style C fill:#fff3e0 + style D fill:#f3e5f5 + style E fill:#f1f8e9 + style I fill:#fce4ec +``` + +## 8. Event Configuration Interface + +This diagram shows how monitoring events are configured through the architecture layer. + +```mermaid +graph TD + A[Event Configuration] --> B[mon_event_config_index_get] + B --> C{Event ID?} + C -->|QOS_L3_MBM_TOTAL| D[Return index 0] + C -->|QOS_L3_MBM_LOCAL| E[Return index 1] + C -->|Invalid| F[Return INVALID_CONFIG_INDEX] + + G[resctrl_arch_mon_event_config_write] --> H[mon_event_config_index_get] + H --> I[wrmsrq MSR_IA32_EVT_CFG_BASE + index] + + J[resctrl_arch_mon_event_config_read] --> K[mon_event_config_index_get] + K --> L[rdmsrq MSR_IA32_EVT_CFG_BASE + index] + L --> M[Apply MAX_EVT_CONFIG_BITS mask] + + style A fill:#e1f5fe + style B fill:#e8f5e8 + style C fill:#fff3e0 + style G fill:#f3e5f5 + style J fill:#f1f8e9 +``` + +## 9. Rename/Move Operation Flow + +This diagram shows the complete workflow for renaming/moving monitoring groups between parent control groups. + +```mermaid +graph TD + A[rdtgroup_rename] --> B[rdtgroup_kn_lock_live src] + A --> C[rdtgroup_kn_lock_live dst_parent] + + B --> D[Validation Checks] + C --> D + + D --> E[alloc_cpumask_var] + E --> F[kernfs_rename] + F --> G[mongrp_reparent] + + G --> H[Update parent lists] + G --> I[Update CLOSID] + G --> J[rdt_move_group_tasks] + G --> K[update_closid_rmid] + + J --> L[Move tasks to new parent] + K --> M[resctrl_arch_sync_cpu_closid_rmid] + + M --> N[Update CPU MSRs] + + style A fill:#e1f5fe + style D fill:#e8f5e8 + style F fill:#fff3e0 + style G fill:#f3e5f5 + style J fill:#f1f8e9 + style K fill:#f1f8e9 + style M fill:#fce4ec +``` + +## 10. Group Reparenting Details - mongrp_reparent + +This diagram shows the detailed steps within the `mongrp_reparent` function. + +```mermaid +graph TD + A[mongrp_reparent] --> B[Remove from old parent list] + A --> C[Add to new parent list] + A --> D[Update parent pointer] + A --> E[Update CLOSID] + + B --> F[list_del old_prdtgrp->mon.crdtgrp_list] + C --> G[list_add new_prdtgrp->mon.crdtgrp_list] + D --> H[rdtgrp->parent = new_prdtgrp] + E --> I[rdtgrp->closid = new_prdtgrp->closid] + + I --> J[rdt_move_group_tasks] + J --> K[task_rq_lock each task] + J --> L[task->closid = new_closid] + J --> M[task_rq_unlock each task] + + M --> N[update_closid_rmid] + N --> O[on_each_cpu_mask] + O --> P[resctrl_arch_sync_cpu_closid_rmid] + + P --> Q[this_cpu_write pqr_state.default_closid] + P --> R[this_cpu_write pqr_state.default_rmid] + P --> S[resctrl_arch_sched_in current] + + S --> T[Update MSR_IA32_PQR_ASSOC if needed] + + style A fill:#e1f5fe + style F fill:#e8f5e8 + style G fill:#e8f5e8 + style J fill:#fff3e0 + style N fill:#f3e5f5 + style P fill:#f1f8e9 + style S fill:#fce4ec +``` + +## 11. Pseudo-Lock Operation Overview + +Pseudo-locking is a feature that allows loading specific memory regions into cache and preventing them from being evicted by future cache allocation operations. This provides deterministic cache allocation for critical workloads. + +### Pseudo-Lock Concepts + +**Cache Pseudo-Locking** works by: +1. **Isolation**: Setting up a dedicated cache capacity bitmask (CBM) for the region +2. **Loading**: Reading the target memory while using the dedicated CBM to load it into cache +3. **Protection**: Preventing future CBM allocations from overlapping with the pseudo-locked region + +**Key Components**: +- **Pseudo-Lock Region**: Memory region to be locked into cache +- **CLOSID**: Cache allocation class used during the locking process +- **CBM**: Cache capacity bitmask defining which cache ways are reserved +- **Thread**: Kernel thread that performs the actual cache loading +- **Measurement**: Performance monitoring to verify locking effectiveness + +## 12. Pseudo-Lock State Machine and Mode Transitions + +This diagram shows the state transitions in the pseudo-lock lifecycle. + +```mermaid +graph TD + A[RDT_MODE_SHAREABLE/EXCLUSIVE] --> B[Write 'pseudo-locksetup' to mode file] + B --> C[rdtgroup_locksetup_enter] + + C --> D[RDT_MODE_PSEUDO_LOCKSETUP] + D --> E[Write schemata with CBM] + D --> F[Write 'shareable/exclusive' to mode file] + + E --> G[rdtgroup_pseudo_lock_create] + F --> H[rdtgroup_locksetup_exit] + + G --> I[RDT_MODE_PSEUDO_LOCKED] + H --> A + + I --> J[Group deletion only] + J --> K[rdtgroup_pseudo_lock_remove] + + style A fill:#e1f5fe + style D fill:#fff3e0 + style I fill:#ffebee + style C fill:#e8f5e8 + style G fill:#f3e5f5 + style H fill:#f1f8e9 + style K fill:#fce4ec +``` + +## 13. Pseudo-Lock Setup Flow (rdtgroup_locksetup_enter) + +This diagram shows the validation and setup process when entering pseudo-lock setup mode. + +```mermaid +graph TD + A[rdtgroup_locksetup_enter] --> B[Validation Checks] + + B --> C[Not default group?] + B --> D[CDP disabled?] + B --> E[Platform supports prefetch disable?] + B --> F[No monitoring in progress?] + B --> G[No tasks/CPUs assigned?] + + H[All validations pass] --> I[Restrict filesystem permissions] + I --> J[Initialize pseudo_lock_region struct] + J --> K[Free monitoring RMID] + K --> L[Set mode = RDT_MODE_PSEUDO_LOCKSETUP] + + style A fill:#e1f5fe + style B fill:#e8f5e8 + style H fill:#fff3e0 + style I fill:#f3e5f5 + style J fill:#f1f8e9 + style L fill:#fce4ec +``` + +## 14. Pseudo-Lock Creation Flow (rdtgroup_pseudo_lock_create) + +This diagram shows the complete process of creating an active pseudo-lock. + +```mermaid +graph TD + A[rdtgroup_pseudo_lock_create] --> B[Allocate memory region] + B --> C[Constrain CPU C-states] + C --> D[Create kernel thread on target CPU] + + D --> E[kthread_run_on_cpu] + E --> F[resctrl_arch_pseudo_lock_fn] + + F --> G[Cache Loading Process] + G --> H[Wait for thread completion] + + H --> I[Create character device /dev/groupname] + I --> J[Create debugfs measurement interface] + J --> K[Set mode = RDT_MODE_PSEUDO_LOCKED] + K --> L[Free CLOSID] + L --> M[Update file permissions] + + style A fill:#e1f5fe + style E fill:#e8f5e8 + style F fill:#fff3e0 + style G fill:#f3e5f5 + style I fill:#f1f8e9 + style K fill:#fce4ec +``` + +## 15. Arch-Specific Cache Loading Process (resctrl_arch_pseudo_lock_fn) + +This diagram shows the low-level cache loading implementation in the architecture layer. + +```mermaid +graph TD + A[resctrl_arch_pseudo_lock_fn] --> B[wbinvd - Flush all caches] + B --> C[local_irq_disable] + C --> D[Disable hardware prefetchers] + D --> E[Save current CLOSID/RMID] + E --> F[Set pseudo-lock CLOSID] + + F --> G[Critical Section Begin] + G --> H[First loop: Page-level access] + H --> I[Second loop: Cache-line access] + I --> J[Critical Section End] + + J --> K[Restore original CLOSID/RMID] + K --> L[Re-enable hardware prefetchers] + L --> M[local_irq_enable] + M --> N[Wake up waiting thread] + + style A fill:#e1f5fe + style B fill:#e8f5e8 + style G fill:#fff3e0 + style H fill:#f3e5f5 + style I fill:#f3e5f5 + style J fill:#f1f8e9 + style N fill:#fce4ec +``` + +## 16. Pseudo-Lock Performance Measurement + +This diagram shows how pseudo-lock effectiveness is measured using performance counters. + +```mermaid +graph TD + A[Performance Measurement] --> B[resctrl_arch_measure_cycles_lat_fn] + A --> C[resctrl_arch_measure_l2_residency] + A --> D[resctrl_arch_measure_l3_residency] + + B --> E[Measure memory access latency] + E --> F[rdtsc_ordered for timing] + E --> G[Access memory at 32-byte stride] + E --> H[trace_pseudo_lock_mem_latency] + + C --> I[Create perf events for L2] + C --> J[MEM_LOAD_UOPS_RETIRED events] + C --> K[L2_HIT and L2_MISS counters] + + D --> L[Create perf events for L3] + D --> M[LONGEST_LAT_CACHE events] + D --> N[Cache references and misses] + + K --> O[measure_residency_fn] + N --> O + O --> P[Disable prefetchers] + O --> Q[Read perf counters before/after] + O --> R[Access pseudo-locked memory] + O --> S[Calculate hit/miss ratios] + + style A fill:#e1f5fe + style B fill:#e8f5e8 + style C fill:#fff3e0 + style D fill:#f3e5f5 + style O fill:#f1f8e9 + style S fill:#fce4ec +``` + +## 17. Pseudo-Lock Hardware Support Detection + +This diagram shows how hardware support for pseudo-locking is detected. + +```mermaid +graph TD + A[resctrl_arch_get_prefetch_disable_bits] --> B[Check CPU vendor and family] + B --> C{Intel x86 family 6?} + C -->|No| D[Return 0 - Not supported] + C -->|Yes| E[Check CPU model] + + E --> F{BROADWELL_X?} + E --> G{GOLDMONT/GOLDMONT_PLUS?} + E --> H{Other models?} + + F -->|Yes| I[prefetch_disable_bits = 0xF] + G -->|Yes| J[prefetch_disable_bits = 0x5] + H -->|Yes| K[prefetch_disable_bits = 0] + + I --> L[L2 HW Prefetcher Disable] + I --> M[L2 Adjacent Line Prefetcher Disable] + I --> N[DCU HW Prefetcher Disable] + I --> O[DCU IP Prefetcher Disable] + + J --> P[L2 HW Prefetcher Disable] + J --> Q[DCU HW Prefetcher Disable] + + style A fill:#e1f5fe + style C fill:#e8f5e8 + style E fill:#fff3e0 + style F fill:#f3e5f5 + style G fill:#f3e5f5 + style I fill:#f1f8e9 + style J fill:#fce4ec +``` + +## Key Integration Points + +The diagrams show several critical integration points between the filesystem and architecture layers: + +1. **Reference Counting**: The `rdtgroup_kn_lock_live`/`rdtgroup_kn_unlock` mechanism ensures safe concurrent access +2. **MSR Abstraction**: Architecture layer provides clean MSR interface to filesystem layer +3. **Domain Management**: CPU hotplug events are handled transparently by the architecture layer +4. **Error Handling**: Hardware errors and unavailable conditions are properly propagated +5. **Resource Management**: Architecture-specific resource initialization is abstracted from the filesystem layer +6. **Rename Operations**: Monitoring groups can be safely moved between parent control groups with proper validation and MSR updates +7. **Task Migration**: When groups are reparented, all associated tasks are moved and their MSRs are updated atomically +8. **Pseudo-Lock Integration**: Mode transitions and cache loading operations bridge filesystem control with hardware-specific cache manipulation +9. **Performance Measurement**: Provides comprehensive measurement capabilities using hardware performance counters and tracing + +## Rename Operation Characteristics + +The rename/move workflow has several important characteristics: + +- **Atomic Operations**: Uses kernfs_rename followed by mongrp_reparent to ensure consistency +- **Reference Safety**: Uses the same reference counting mechanism as other operations +- **Validation**: Extensive validation prevents invalid moves (e.g., moving control groups, moving to non-mon_groups directories) +- **CPU Constraint Enforcement**: Prevents moving MON groups that are actively monitoring CPUs between different parent CTRL_MON groups +- **MSR Synchronization**: All affected CPUs have their MSRs updated when tasks are moved between CLOSIDs +- **Error Recovery**: Proper cleanup on all error paths ensures no partial state corruption + +## Pseudo-Lock Operation Characteristics + +The pseudo-lock feature has several key characteristics: + +- **Hardware Requirements**: Requires specific Intel CPU models with prefetch disable capability +- **State Machine**: Uses a 4-state model with clear transitions and validation +- **Deterministic Loading**: Two-pass memory access ensures reliable cache loading +- **Performance Measurement**: Comprehensive measurement using hardware performance counters +- **Resource Isolation**: Creates exclusive cache regions that cannot be evicted by other allocations +- **Thread Safety**: Uses kernel threads and proper synchronization for cache loading operations +- **Device Interface**: Provides character device and debugfs interfaces for user access and debugging + +These flows demonstrate how the ResCtrl subsystem maintains a clean separation between filesystem operations and hardware-specific implementation details while ensuring proper synchronization and error handling throughout the stack. \ No newline at end of file From 4d5ab97bd656bbaaa973dbf8b8dce1d45b208c24 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Wed, 13 Aug 2025 19:36:01 +0000 Subject: [PATCH 02/51] add kernfs file pointer writeup --- kernfs-file-handling.md | 329 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 329 insertions(+) create mode 100644 kernfs-file-handling.md diff --git a/kernfs-file-handling.md b/kernfs-file-handling.md new file mode 100644 index 00000000000000..62dbd749bbf638 --- /dev/null +++ b/kernfs-file-handling.md @@ -0,0 +1,329 @@ +# Kernfs File Handling and Node Removal + +## Overview + +This document analyzes how kernfs handles open files when kernfs nodes are removed, based on examination of the Linux kernel source code in `fs/kernfs/`. + +## Key Data Structures + +### `kernfs_open_node` +Located at `fs/kernfs/file.c:21-28`, this structure manages all open files for a specific kernfs node: + +```c +struct kernfs_open_node { + struct rcu_head rcu_head; + atomic_t event; + wait_queue_head_t poll; + struct list_head files; /* goes through kernfs_open_file.list */ + unsigned int nr_mmapped; + unsigned int nr_to_release; +}; +``` + +### `kernfs_open_file` +Each open file descriptor gets a `kernfs_open_file` structure that is linked to the `kernfs_open_node.files` list via its `list` member. + +## File Opening Process (`kernfs_fop_open`) + +1. **Active Reference Acquisition** (`fs/kernfs/file.c:601`): + ```c + if (!kernfs_get_active(kn)) + return -ENODEV; + ``` + +2. **Open File Registration** (`fs/kernfs/file.c:700`): + ```c + error = kernfs_get_open_node(kn, of); + ``` + +3. **Active Reference Release** (`fs/kernfs/file.c:712`): + ```c + kernfs_put_active(kn); + ``` + +**Critical Point**: The active reference is released at the end of `kernfs_fop_open()`. Open files do NOT hold active references. + +## Active Reference System + +### What Active References Provide +- **Temporary Protection**: Prevent node removal while operations are in progress +- **Synchronous Operations**: All file operations (`read`, `write`, `mmap`, etc.) acquire active references before accessing the node +- **Deactivation Mechanism**: When a node is being removed, it becomes "deactivated" and new active reference acquisitions fail with `-ENODEV` + +### Active Reference Lifecycle +- **Acquisition**: `kernfs_get_active(kn)` - fails if node is deactivated +- **Release**: `kernfs_put_active(kn)` - may trigger removal completion +- **Break/Unbreak**: Special mechanism for self-removal scenarios + +## Node Removal Process + +### 1. Deactivation Phase (`__kernfs_remove` in `fs/kernfs/dir.c:1488-1494`) +```c +/* prevent new usage by marking all nodes removing and deactivating */ +pos = NULL; +while ((pos = kernfs_next_descendant_post(pos, kn))) { + pos->flags |= KERNFS_REMOVING; + if (kernfs_active(pos)) + atomic_add(KN_DEACTIVATED_BIAS, &pos->active); +} +``` + +### 2. Draining Phase (`kernfs_drain` in `fs/kernfs/dir.c:489`) +- **Wait for Active References**: Waits for all active references to be released +- **Drain Open Files**: Calls `kernfs_drain_open_files()` if needed + +### 3. Open File Draining (`kernfs_drain_open_files` in `fs/kernfs/file.c:793`) +```c +list_for_each_entry(of, &on->files, list) { + struct inode *inode = file_inode(of->file); + + if (of->mmapped) { + unmap_mapping_range(inode->i_mapping, 0, 0, 1); + of->mmapped = false; + on->nr_mmapped--; + } + + if (kn->flags & KERNFS_HAS_RELEASE) + kernfs_release_file(kn, of); +} +``` + +## Guarantees for Open Files + +### What is Guaranteed +1. **Node Memory Persistence**: The `kernfs_node` structure remains allocated until all references (including from open files) are released +2. **Graceful Degradation**: File operations will fail with `-ENODEV` when attempting to acquire active references on removed nodes +3. **Release Callback Execution**: Files with release callbacks get them called during the draining process +4. **Memory Mapping Cleanup**: Any memory mappings are properly unmapped + +### What is NOT Guaranteed +1. **Active Reference Acquisition**: `kernfs_get_active()` calls will fail once the node is deactivated +2. **File Operation Success**: Read, write, and other operations will return `-ENODEV` +3. **Node Reactivation**: Once deactivated, a node cannot be reactivated + +## File Operation Behavior After Removal + +All file operations follow this pattern (example from `kernfs_file_read_iter`): + +```c +mutex_lock(&of->mutex); +if (!kernfs_get_active(of->kn)) { + len = -ENODEV; + mutex_unlock(&of->mutex); + goto out_free; +} +// ... perform operation +kernfs_put_active(of->kn); +mutex_unlock(&of->mutex); +``` + +**Result**: Operations fail cleanly with `-ENODEV` rather than crashing. + +## Special Cases + +### Self-Removal (`kernfs_remove_self`) +- Uses `kernfs_break_active_protection()` to temporarily release the active reference held by the calling operation +- Allows a file operation to remove its own node without deadlock +- Example usage: Device removal triggered by writing to a "delete" file + +### Memory-Mapped Files +- Mappings are forcibly unmapped during the draining process +- The `nr_mmapped` counter tracks active mappings +- `unmap_mapping_range()` ensures no stale mappings remain + +## File Descriptor Lifecycle and Reference Management + +### Reference Counting on kernfs_node + +From `include/linux/kernfs.h:132-133`, each `kernfs_node` has two atomic reference counts: +```c +struct kernfs_node { + atomic_t count; + atomic_t active; + // ... other fields +}; +``` + +1. **Active References** (`kn->active`) - temporary, prevent removal during operations +2. **Regular References** (`kn->count`) - persistent, keep node memory allocated + +### What Happens to Open Files After Node Removal + +When a kernfs node is removed while files are still open: + +1. **File descriptors remain valid** - they continue to reference the `kernfs_node` structure +2. **Node memory persists** - the `kernfs_node` isn't freed until `kn->count` reaches zero +3. **Operations fail gracefully** - all file operations return `-ENODEV` after deactivation +4. **Release callbacks are invoked early** - during the draining process, before file descriptors are closed +5. **kernfs_node reference is retained** - until the file descriptor is actually closed + +### Release Callback Execution Context + +**Key Point**: The release callback is called **exactly once** - either during drain OR during file close, never both. + +From `kernfs_release_file()` (fs/kernfs/file.c:724-735): + +```c +/* used from release/drain to ensure that ->release() is called exactly once */ +static void kernfs_release_file(struct kernfs_node *kn, + struct kernfs_open_file *of) +{ + lockdep_assert_held(kernfs_open_file_mutex_ptr(kn)); + + if (!of->released) { + kn->attr.ops->release(of); + of->released = true; + of_on(of)->nr_to_release--; + } +} +``` + +**Double-Release Prevention**: The `of->released` flag (from `include/linux/kernfs.h:272`) ensures the release callback is only called once: +- During drain: `kernfs_drain_open_files()` calls `kernfs_release_file()` +- During file close: `kernfs_fop_release()` calls `kernfs_release_file()` +- The `if (!of->released)` check prevents double execution + +**Lock Context**: +- The drain process holds the kernfs open file mutex: `mutex = kernfs_open_file_mutex_lock(kn);` +- Release callbacks are called under this mutex protection +- The `kernfs_node` is guaranteed to be valid during release callback execution + +### Reference Management for Open Files + +**Automatic Reference Handling**: +- The `kernfs_open_file` structure holds a pointer to the `kernfs_node` +- **Note**: I need to verify the exact reference counting mechanism by examining `kernfs_get_open_node()` and related functions +- References are managed automatically during file open/close operations + +**File Close Behavior**: From `kernfs_fop_release()` (fs/kernfs/file.c:752-771): +```c +static int kernfs_fop_release(struct inode *inode, struct file *filp) +{ + struct kernfs_node *kn = inode->i_private; + struct kernfs_open_file *of = kernfs_of(filp); + + if (kn->flags & KERNFS_HAS_RELEASE) { + struct mutex *mutex; + mutex = kernfs_open_file_mutex_lock(kn); + kernfs_release_file(kn, of); + mutex_unlock(mutex); + } + + kernfs_unlink_open_file(kn, of, false); + seq_release(inode, filp); + kfree(of->prealloc_buf); + kfree(of); + return 0; +} +``` + +- Release callbacks are called if `KERNFS_HAS_RELEASE` flag is set +- The `kernfs_open_file` structure is freed, which releases its reference to the `kernfs_node` +- Node memory persists until `kn->count` reaches zero + +### Using kernfs File Operations for Reference Management + +From `include/linux/kernfs.h:261-266`, kernfs provides file operation hooks: + +```c +struct kernfs_ops { + /* + * Optional open/release methods. Both are called with + * @of->seq_file populated. + */ + int (*open)(struct kernfs_open_file *of); + void (*release)(struct kernfs_open_file *of); + // ... other methods for read/write operations +}; +``` + +**The `open` callback**: From `kernfs_fop_open()` (fs/kernfs/file.c:703-708): +```c +if (ops->open) { + /* nobody has access to @of yet, skip @of->mutex */ + error = ops->open(of); + if (error) + goto err_put_node; +} +``` + +- Called during `kernfs_fop_open()` after the `kernfs_open_file` is set up +- Can be used to take additional references on underlying data structures +- Receives `kernfs_open_file *of` parameter - use `of->kn` to access the node +- Return 0 for success, negative errno for failure + +**Example pattern for additional reference management**: +```c +static int my_open(struct kernfs_open_file *of) +{ + struct my_data *data = of->kn->priv; + + /* Take reference on underlying data structure */ + get_my_data(data); + of->priv = data; /* Store for later use */ + + return 0; +} + +static void my_release(struct kernfs_open_file *of) +{ + struct my_data *data = of->priv; + + /* Release reference taken in open */ + if (data) + put_my_data(data); +} +``` + +## Race Conditions and File Descriptor Safety + +### Scenario: Open File vs Node Removal Race + +**Your specific concern**: File descriptor passed to perf while kernfs node is being removed. + +**What happens**: +1. **File descriptor remains valid** - the underlying `struct file` and `kernfs_open_file` persist +2. **kernfs_node memory is preserved** - held by reference from `kernfs_open_file->kn` +3. **Operations fail cleanly** - perf operations on the fd will get `-ENODEV` when trying to access node data +4. **No crashes or memory corruption** - all access is bounds-checked through active reference system + +**Detection of node removal**: +```c +/* In your file operation implementation */ +if (!kernfs_get_active(of->kn)) { + /* Node has been removed/deactivated */ + return -ENODEV; +} +/* Safe to access node data */ +kernfs_put_active(of->kn); +``` + +### Recommendations for Perf Integration + +When implementing kernfs file ops for perf integration: + +1. **Use the `open` callback** to take references on any data structures that perf will need +2. **Use the `release` callback** to clean up those references during node removal +3. **Always check active references** before accessing node data in file operations +4. **Store necessary data in `of->priv`** rather than relying on node data after removal + +**Critical for perf**: After a kernfs node is removed but before the file descriptor is closed: +- The `kernfs_node` memory remains valid (held by `kernfs_open_file->kn`) +- The release callback has already been called (during drain) +- File operations will fail with `-ENODEV` when trying to get active references +- Perf can safely hold the file descriptor without causing crashes + +This ensures that even if a kernfs node is removed while a file descriptor is being used by perf, the system remains stable and operations fail gracefully rather than causing memory corruption. + +## Summary + +The kernfs file handling system provides a robust mechanism for managing open files during node removal: + +- **Open files do not prevent node removal** - they don't hold active references +- **File operations fail gracefully** - return `-ENODEV` when node is removed +- **Memory safety is maintained** - node structures persist until all references are released +- **Cleanup is automatic** - release callbacks are invoked and mappings are cleaned up +- **File descriptors remain valid** - can be safely passed to other subsystems like perf +- **Race conditions are handled** - active reference system prevents crashes during concurrent removal + +This design allows kernfs to support dynamic filesystem operations (like device hot-removal) while maintaining system stability and preventing resource leaks, making it suitable for integration with subsystems like perf that may hold file descriptors for extended periods. \ No newline at end of file From 18c2027a7b000e2320292c93779ac083d9c4ee18 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Wed, 13 Aug 2025 20:43:39 +0000 Subject: [PATCH 03/51] add script to build the kernel, initrd and test, and upload to s3 --- build-and-upload.sh | 392 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 392 insertions(+) create mode 100755 build-and-upload.sh diff --git a/build-and-upload.sh b/build-and-upload.sh new file mode 100755 index 00000000000000..578ab47761471b --- /dev/null +++ b/build-and-upload.sh @@ -0,0 +1,392 @@ +#!/bin/bash +set -euo pipefail + +# Configuration +S3_BUCKET="unvariance-kernel-dev" +S3_REGION="us-east-2" +BUILD_ID="${BUILD_ID:-$(git rev-parse HEAD)}" +# Dynamically determine kernel version (including git state and LOCALVERSION) +KERNEL_VERSION=$(make kernelrelease) +CC="ccache gcc" + +# Create temporary directory in user home for builds +TEMP_BUILD_DIR="$HOME/.kernel-build-tmp-$$" +# Cache directory for reusable initrds +INITRD_CACHE_DIR="$HOME/.kernel-initrd-cache" +trap 'cleanup_temp_dir' EXIT + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +log() { + echo -e "${GREEN}[$(date +'%Y-%m-%d %H:%M:%S')] $1${NC}" +} + +warn() { + echo -e "${YELLOW}[$(date +'%Y-%m-%d %H:%M:%S')] WARNING: $1${NC}" +} + +error() { + echo -e "${RED}[$(date +'%Y-%m-%d %H:%M:%S')] ERROR: $1${NC}" + exit 1 +} + +# Cleanup function for temporary directory +cleanup_temp_dir() { + if [[ -n "${TEMP_BUILD_DIR:-}" && -d "$TEMP_BUILD_DIR" ]]; then + log "Cleaning up temporary directory: $TEMP_BUILD_DIR" + rm -rf "$TEMP_BUILD_DIR" + fi +} + +# Create temporary directory +create_temp_dir() { + mkdir -p "$TEMP_BUILD_DIR" + mkdir -p "$INITRD_CACHE_DIR" + log "Using temporary directory: $TEMP_BUILD_DIR" + log "Using initrd cache directory: $INITRD_CACHE_DIR" +} + +# Check dependencies +check_dependencies() { + log "Checking dependencies..." + + if ! command -v aws >/dev/null 2>&1; then + error "AWS CLI not found. Please install aws-cli." + fi + + if ! command -v make >/dev/null 2>&1; then + error "make not found. Please install build tools." + fi + + # Check if we're in the kernel source directory + if [[ ! -f "Makefile" ]] || ! grep -q "KERNELRELEASE" Makefile; then + error "This script must be run from the kernel source root directory." + fi +} + +# Configure kernel for our needs +configure_kernel() { + log "Configuring kernel build..." + + # Start with a reasonable base config + if [[ -f ".config" ]]; then + log "Using existing .config" + else + error "No existing .config" + fi + + # # Enable required features for our resctrl work + # log "Enabling resctrl and perf features..." + # scripts/config --enable CONFIG_X86_RESCTRL + # scripts/config --enable CONFIG_PERF_EVENTS + # scripts/config --enable CONFIG_X86_MSR + # scripts/config --enable CONFIG_KEXEC + # scripts/config --enable CONFIG_KEXEC_FILE + # scripts/config --enable CONFIG_CRASH_DUMP + + # # Enable debugging features + # scripts/config --enable CONFIG_DEBUG_KERNEL + # scripts/config --enable CONFIG_DEBUG_INFO + # scripts/config --enable CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT + + # # Make sure we have networking and filesystem support + # scripts/config --enable CONFIG_NET + # scripts/config --enable CONFIG_INET + # scripts/config --enable CONFIG_EXT4_FS + # scripts/config --enable CONFIG_PROC_FS + # scripts/config --enable CONFIG_SYSFS + + # Update config with dependencies + make olddefconfig +} + +# Build the kernel +build_kernel() { + log "Building kernel..." + + # Get number of CPU cores for parallel build + NPROC=$(nproc) + log "Building with ${NPROC} parallel jobs..." + + # Build kernel image + make CC="$CC" -j${NPROC} bzImage + + # Build modules (if any are needed) + make CC="$CC" -j${NPROC} modules + + log "Kernel build completed successfully" +} + +# Build resctrl tests +build_tests() { + log "Building resctrl tests..." + + # Check if test directory exists + if [[ ! -d "tools/testing/selftests/resctrl" ]]; then + error "Resctrl test directory not found" + fi + + cd tools/testing/selftests/resctrl + + # Build the tests + make clean + make + + # Check if the test binary was created + if [[ ! -f "resctrl_tests" ]]; then + error "Failed to build resctrl_tests binary" + fi + + # Move the binary to a temporary location for upload + cp resctrl_tests "${TEMP_BUILD_DIR}/resctrl_tests-${KERNEL_VERSION}" + + cd - >/dev/null + + log "Resctrl tests built successfully" +} + +# Create initrd using mkinitramfs (with caching) +create_initrd() { + # Check for cached initrd first + local cached_initrd="${INITRD_CACHE_DIR}/initrd-${KERNEL_VERSION}.img" + + if [[ -f "$cached_initrd" && "${FORCE_INITRD:-}" != "1" ]]; then + log "Using cached initrd: $cached_initrd" + cp "$cached_initrd" "${TEMP_BUILD_DIR}/initrd-${KERNEL_VERSION}.img" + return 0 + fi + + log "Creating Ubuntu-compatible initrd using mkinitramfs..." + + # Check if mkinitramfs is available + if ! command -v mkinitramfs >/dev/null 2>&1; then + error "mkinitramfs not found. Please install initramfs-tools: apt-get install initramfs-tools" + fi + + # Install kernel modules to persistent location for reuse + PERSISTENT_MODULES="$HOME/kernel-modules" + log "Checking kernel modules in ${PERSISTENT_MODULES}..." + + # Only install modules if they don't exist or if forced + if [[ ! -d "${PERSISTENT_MODULES}/lib/modules/${KERNEL_VERSION}" || "${FORCE_INITRD:-}" == "1" ]]; then + log "Installing/updating kernel modules..." + make INSTALL_MOD_PATH="${PERSISTENT_MODULES}" modules_install + else + log "Reusing existing kernel modules from ${PERSISTENT_MODULES}..." + fi + + # Path to our modules + MODULES_DIR="${PERSISTENT_MODULES}/lib/modules/${KERNEL_VERSION}" + + if [[ ! -d "${MODULES_DIR}" ]]; then + error "Modules directory not found: ${MODULES_DIR}" + fi + + # Use mkinitramfs with system configuration and temporarily install our modules + log "Creating initramfs using system configuration..." + + # Temporarily install our modules to the system location + SYSTEM_MODULES_DIR="/lib/modules/${KERNEL_VERSION}" + BACKUP_MODULES="" + + # Create /lib/modules directory if it doesn't exist + sudo mkdir -p "/lib/modules" + + # Back up existing modules if they exist + if [[ -d "${SYSTEM_MODULES_DIR}" ]]; then + BACKUP_MODULES="${TEMP_BUILD_DIR}/backup-modules" + log "Backing up existing modules to ${BACKUP_MODULES}..." + sudo mv "${SYSTEM_MODULES_DIR}" "${BACKUP_MODULES}" + fi + + # Symlink our modules to system location (much faster than copying) + log "Temporarily symlinking kernel modules to system location..." + sudo ln -sf "${MODULES_DIR}" "${SYSTEM_MODULES_DIR}" + + # Symlink kernel config to fix mkinitramfs warning + BOOT_CONFIG="/boot/config-${KERNEL_VERSION}" + BACKUP_CONFIG="" + if [[ -f "${BOOT_CONFIG}" ]]; then + BACKUP_CONFIG="${TEMP_BUILD_DIR}/backup-config" + log "Backing up existing config to ${BACKUP_CONFIG}..." + sudo mv "${BOOT_CONFIG}" "${BACKUP_CONFIG}" + fi + log "Temporarily symlinking kernel config to ${BOOT_CONFIG}..." + sudo ln -sf "$(pwd)/.config" "${BOOT_CONFIG}" + + # Generate module dependencies (needed for mkinitramfs) + log "Generating module dependencies..." + sudo depmod "${KERNEL_VERSION}" + + # Use mkinitramfs with system config directory + log "Generating initramfs..." + mkinitramfs -d /etc/initramfs-tools -o "${TEMP_BUILD_DIR}/initrd-${KERNEL_VERSION}.img" "${KERNEL_VERSION}" + + # Clean up - remove our symlinks and restore backups if needed + sudo rm -f "${SYSTEM_MODULES_DIR}" + if [[ -n "${BACKUP_MODULES}" && -d "${BACKUP_MODULES}" ]]; then + log "Restoring original modules..." + sudo mv "${BACKUP_MODULES}" "${SYSTEM_MODULES_DIR}" + fi + + sudo rm -f "${BOOT_CONFIG}" + if [[ -n "${BACKUP_CONFIG}" && -f "${BACKUP_CONFIG}" ]]; then + log "Restoring original config..." + sudo mv "${BACKUP_CONFIG}" "${BOOT_CONFIG}" + fi + + # Cache the initrd for future use + log "Caching initrd for future builds..." + cp "${TEMP_BUILD_DIR}/initrd-${KERNEL_VERSION}.img" "$cached_initrd" + + log "Ubuntu-compatible initrd created: ${TEMP_BUILD_DIR}/initrd-${KERNEL_VERSION}.img" + log "Kernel modules preserved in: ${PERSISTENT_MODULES}" +} + +# Upload artifacts to S3 +upload_to_s3() { + log "Uploading kernel artifacts to S3..." + + local kernel_path="arch/x86/boot/bzImage" + local initrd_path="${TEMP_BUILD_DIR}/initrd-${KERNEL_VERSION}.img" + local test_path="${TEMP_BUILD_DIR}/resctrl_tests-${KERNEL_VERSION}" + + if [[ ! -f "$kernel_path" ]]; then + error "Kernel image not found at $kernel_path" + fi + + if [[ ! -f "$initrd_path" ]]; then + error "Initrd not found at $initrd_path" + fi + + if [[ ! -f "$test_path" ]]; then + error "Test binary not found at $test_path" + fi + + # Upload kernel + local s3_kernel_key="kernels/${BUILD_ID}/bzImage" + log "Uploading kernel to s3://${S3_BUCKET}/${s3_kernel_key}" + aws s3 cp "$kernel_path" "s3://${S3_BUCKET}/${s3_kernel_key}" --region "$S3_REGION" + + # Upload initrd + local s3_initrd_key="kernels/${BUILD_ID}/initrd.img" + log "Uploading initrd to s3://${S3_BUCKET}/${s3_initrd_key}" + aws s3 cp "$initrd_path" "s3://${S3_BUCKET}/${s3_initrd_key}" --region "$S3_REGION" + + # Upload test binary + local s3_test_key="kernels/${BUILD_ID}/resctrl_tests" + log "Uploading test binary to s3://${S3_BUCKET}/${s3_test_key}" + aws s3 cp "$test_path" "s3://${S3_BUCKET}/${s3_test_key}" --region "$S3_REGION" + + # Create metadata file + local metadata_file="${TEMP_BUILD_DIR}/kernel-metadata-${BUILD_ID}.json" + cat > "$metadata_file" << EOF +{ + "build_id": "${BUILD_ID}", + "kernel_version": "${KERNEL_VERSION}", + "build_date": "$(date -u +%Y-%m-%dT%H:%M:%SZ)", + "git_commit": "$(git rev-parse HEAD)", + "git_branch": "$(git rev-parse --abbrev-ref HEAD)", + "kernel_path": "${s3_kernel_key}", + "initrd_path": "${s3_initrd_key}", + "test_path": "${s3_test_key}", + "s3_bucket": "${S3_BUCKET}", + "s3_region": "${S3_REGION}" +} +EOF + + # Upload metadata + local s3_metadata_key="kernels/${BUILD_ID}/metadata.json" + log "Uploading metadata to s3://${S3_BUCKET}/${s3_metadata_key}" + aws s3 cp "$metadata_file" "s3://${S3_BUCKET}/${s3_metadata_key}" --region "$S3_REGION" + + # Create latest pointer + aws s3 cp "$metadata_file" "s3://${S3_BUCKET}/kernels/latest.json" --region "$S3_REGION" + + log "Upload completed successfully!" + log "Kernel artifacts available at:" + log " bzImage: s3://${S3_BUCKET}/${s3_kernel_key}" + log " initrd: s3://${S3_BUCKET}/${s3_initrd_key}" + log " test: s3://${S3_BUCKET}/${s3_test_key}" + log " metadata: s3://${S3_BUCKET}/${s3_metadata_key}" + + # Output for GitHub Actions (if running in GitHub Actions) + if [[ -n "${GITHUB_OUTPUT:-}" ]]; then + echo "kernel_s3_key=${s3_kernel_key}" >> "$GITHUB_OUTPUT" + echo "initrd_s3_key=${s3_initrd_key}" >> "$GITHUB_OUTPUT" + echo "test_s3_key=${s3_test_key}" >> "$GITHUB_OUTPUT" + echo "metadata_s3_key=${s3_metadata_key}" >> "$GITHUB_OUTPUT" + echo "build_id=${BUILD_ID}" >> "$GITHUB_OUTPUT" + fi + + # Save build info locally + echo "BUILD_ID=${BUILD_ID}" > .last-build-info + echo "KERNEL_S3_KEY=${s3_kernel_key}" >> .last-build-info + echo "INITRD_S3_KEY=${s3_initrd_key}" >> .last-build-info + echo "TEST_S3_KEY=${s3_test_key}" >> .last-build-info + echo "METADATA_S3_KEY=${s3_metadata_key}" >> .last-build-info +} + +# Show usage information +usage() { + echo "Usage: $0 [BUILD_ID]" + echo "" + echo "Build and upload custom kernel to S3 for testing" + echo "" + echo "Arguments:" + echo " BUILD_ID Optional build ID (default: current git HEAD)" + echo "" + echo "Environment variables:" + echo " S3_BUCKET S3 bucket name (default: unvariance-kernel-dev)" + echo " S3_REGION S3 region (default: us-east-2)" + echo " FORCE_INITRD Set to 1 to force initrd rebuild (default: use cache)" + echo "" + echo "Examples:" + echo " $0 # Build with current HEAD as build ID" + echo " $0 abc123def # Build with specific commit as build ID" + echo " BUILD_ID=test $0 # Build with custom build ID" + echo " FORCE_INITRD=1 $0 # Force initrd rebuild (ignores cache)" +} + +# Main execution +main() { + # Handle command line arguments + if [[ $# -gt 1 ]]; then + usage + exit 1 + fi + + if [[ $# -eq 1 ]]; then + if [[ "$1" == "-h" ]] || [[ "$1" == "--help" ]]; then + usage + exit 0 + fi + BUILD_ID="$1" + fi + + log "Starting kernel build and upload process..." + log "Build ID: ${BUILD_ID}" + log "Kernel Version: ${KERNEL_VERSION}" + log "S3 Bucket: ${S3_BUCKET}" + log "S3 Region: ${S3_REGION}" + + create_temp_dir + check_dependencies + configure_kernel + build_kernel + build_tests + create_initrd + upload_to_s3 + + log "Kernel build and upload completed successfully!" + log "To test this kernel, run: ./trigger-kernel-test.sh -b ${BUILD_ID}" +} + +# Run if executed directly +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi \ No newline at end of file From c40f2550a01309564852794b93799c2e9ca34c42 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Thu, 14 Aug 2025 14:14:43 +0000 Subject: [PATCH 04/51] split the initrd compilation to a separate file, increase caching --- build-and-upload.sh | 136 +++++-------------- build-initrd.sh | 315 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 345 insertions(+), 106 deletions(-) create mode 100755 build-initrd.sh diff --git a/build-and-upload.sh b/build-and-upload.sh index 578ab47761471b..a331730c2b2b3a 100755 --- a/build-and-upload.sh +++ b/build-and-upload.sh @@ -11,8 +11,6 @@ CC="ccache gcc" # Create temporary directory in user home for builds TEMP_BUILD_DIR="$HOME/.kernel-build-tmp-$$" -# Cache directory for reusable initrds -INITRD_CACHE_DIR="$HOME/.kernel-initrd-cache" trap 'cleanup_temp_dir' EXIT # Colors for output @@ -45,9 +43,7 @@ cleanup_temp_dir() { # Create temporary directory create_temp_dir() { mkdir -p "$TEMP_BUILD_DIR" - mkdir -p "$INITRD_CACHE_DIR" log "Using temporary directory: $TEMP_BUILD_DIR" - log "Using initrd cache directory: $INITRD_CACHE_DIR" } # Check dependencies @@ -149,102 +145,29 @@ build_tests() { log "Resctrl tests built successfully" } -# Create initrd using mkinitramfs (with caching) +# Build initrd using the separate script create_initrd() { - # Check for cached initrd first - local cached_initrd="${INITRD_CACHE_DIR}/initrd-${KERNEL_VERSION}.img" + log "Building initrd using separate script..." - if [[ -f "$cached_initrd" && "${FORCE_INITRD:-}" != "1" ]]; then - log "Using cached initrd: $cached_initrd" - cp "$cached_initrd" "${TEMP_BUILD_DIR}/initrd-${KERNEL_VERSION}.img" - return 0 + if [[ ! -f "./build-initrd.sh" ]]; then + error "build-initrd.sh not found in current directory" fi - log "Creating Ubuntu-compatible initrd using mkinitramfs..." + # Run the initrd build script with upload flag + local initrd_output + initrd_output=$(./build-initrd.sh --upload 2>&1) || error "Failed to build and upload initrd" - # Check if mkinitramfs is available - if ! command -v mkinitramfs >/dev/null 2>&1; then - error "mkinitramfs not found. Please install initramfs-tools: apt-get install initramfs-tools" - fi - - # Install kernel modules to persistent location for reuse - PERSISTENT_MODULES="$HOME/kernel-modules" - log "Checking kernel modules in ${PERSISTENT_MODULES}..." - - # Only install modules if they don't exist or if forced - if [[ ! -d "${PERSISTENT_MODULES}/lib/modules/${KERNEL_VERSION}" || "${FORCE_INITRD:-}" == "1" ]]; then - log "Installing/updating kernel modules..." - make INSTALL_MOD_PATH="${PERSISTENT_MODULES}" modules_install - else - log "Reusing existing kernel modules from ${PERSISTENT_MODULES}..." - fi - - # Path to our modules - MODULES_DIR="${PERSISTENT_MODULES}/lib/modules/${KERNEL_VERSION}" - - if [[ ! -d "${MODULES_DIR}" ]]; then - error "Modules directory not found: ${MODULES_DIR}" - fi - - # Use mkinitramfs with system configuration and temporarily install our modules - log "Creating initramfs using system configuration..." - - # Temporarily install our modules to the system location - SYSTEM_MODULES_DIR="/lib/modules/${KERNEL_VERSION}" - BACKUP_MODULES="" - - # Create /lib/modules directory if it doesn't exist - sudo mkdir -p "/lib/modules" + # Extract SHA256 and S3 key from output + INITRD_SHA256=$(echo "$initrd_output" | grep "INITRD_SHA256=" | cut -d'=' -f2) + INITRD_S3_KEY=$(echo "$initrd_output" | grep "INITRD_S3_KEY=" | cut -d'=' -f2) - # Back up existing modules if they exist - if [[ -d "${SYSTEM_MODULES_DIR}" ]]; then - BACKUP_MODULES="${TEMP_BUILD_DIR}/backup-modules" - log "Backing up existing modules to ${BACKUP_MODULES}..." - sudo mv "${SYSTEM_MODULES_DIR}" "${BACKUP_MODULES}" + if [[ -z "$INITRD_SHA256" || -z "$INITRD_S3_KEY" ]]; then + error "Failed to extract initrd information from build script output" fi - # Symlink our modules to system location (much faster than copying) - log "Temporarily symlinking kernel modules to system location..." - sudo ln -sf "${MODULES_DIR}" "${SYSTEM_MODULES_DIR}" - - # Symlink kernel config to fix mkinitramfs warning - BOOT_CONFIG="/boot/config-${KERNEL_VERSION}" - BACKUP_CONFIG="" - if [[ -f "${BOOT_CONFIG}" ]]; then - BACKUP_CONFIG="${TEMP_BUILD_DIR}/backup-config" - log "Backing up existing config to ${BACKUP_CONFIG}..." - sudo mv "${BOOT_CONFIG}" "${BACKUP_CONFIG}" - fi - log "Temporarily symlinking kernel config to ${BOOT_CONFIG}..." - sudo ln -sf "$(pwd)/.config" "${BOOT_CONFIG}" - - # Generate module dependencies (needed for mkinitramfs) - log "Generating module dependencies..." - sudo depmod "${KERNEL_VERSION}" - - # Use mkinitramfs with system config directory - log "Generating initramfs..." - mkinitramfs -d /etc/initramfs-tools -o "${TEMP_BUILD_DIR}/initrd-${KERNEL_VERSION}.img" "${KERNEL_VERSION}" - - # Clean up - remove our symlinks and restore backups if needed - sudo rm -f "${SYSTEM_MODULES_DIR}" - if [[ -n "${BACKUP_MODULES}" && -d "${BACKUP_MODULES}" ]]; then - log "Restoring original modules..." - sudo mv "${BACKUP_MODULES}" "${SYSTEM_MODULES_DIR}" - fi - - sudo rm -f "${BOOT_CONFIG}" - if [[ -n "${BACKUP_CONFIG}" && -f "${BACKUP_CONFIG}" ]]; then - log "Restoring original config..." - sudo mv "${BACKUP_CONFIG}" "${BOOT_CONFIG}" - fi - - # Cache the initrd for future use - log "Caching initrd for future builds..." - cp "${TEMP_BUILD_DIR}/initrd-${KERNEL_VERSION}.img" "$cached_initrd" - - log "Ubuntu-compatible initrd created: ${TEMP_BUILD_DIR}/initrd-${KERNEL_VERSION}.img" - log "Kernel modules preserved in: ${PERSISTENT_MODULES}" + log "Initrd build completed successfully" + log "Initrd SHA256: $INITRD_SHA256" + log "Initrd S3 key: $INITRD_S3_KEY" } # Upload artifacts to S3 @@ -252,30 +175,28 @@ upload_to_s3() { log "Uploading kernel artifacts to S3..." local kernel_path="arch/x86/boot/bzImage" - local initrd_path="${TEMP_BUILD_DIR}/initrd-${KERNEL_VERSION}.img" local test_path="${TEMP_BUILD_DIR}/resctrl_tests-${KERNEL_VERSION}" if [[ ! -f "$kernel_path" ]]; then error "Kernel image not found at $kernel_path" fi - if [[ ! -f "$initrd_path" ]]; then - error "Initrd not found at $initrd_path" - fi - if [[ ! -f "$test_path" ]]; then error "Test binary not found at $test_path" fi + # Validate initrd information from build-initrd.sh + if [[ -z "$INITRD_SHA256" || -z "$INITRD_S3_KEY" ]]; then + error "Initrd information not available. Make sure create_initrd() was called successfully." + fi + # Upload kernel local s3_kernel_key="kernels/${BUILD_ID}/bzImage" log "Uploading kernel to s3://${S3_BUCKET}/${s3_kernel_key}" aws s3 cp "$kernel_path" "s3://${S3_BUCKET}/${s3_kernel_key}" --region "$S3_REGION" - # Upload initrd - local s3_initrd_key="kernels/${BUILD_ID}/initrd.img" - log "Uploading initrd to s3://${S3_BUCKET}/${s3_initrd_key}" - aws s3 cp "$initrd_path" "s3://${S3_BUCKET}/${s3_initrd_key}" --region "$S3_REGION" + # Note: initrd is already uploaded by build-initrd.sh + log "Using pre-uploaded initrd: s3://${S3_BUCKET}/${INITRD_S3_KEY}" # Upload test binary local s3_test_key="kernels/${BUILD_ID}/resctrl_tests" @@ -292,7 +213,8 @@ upload_to_s3() { "git_commit": "$(git rev-parse HEAD)", "git_branch": "$(git rev-parse --abbrev-ref HEAD)", "kernel_path": "${s3_kernel_key}", - "initrd_path": "${s3_initrd_key}", + "initrd_path": "${INITRD_S3_KEY}", + "initrd_sha256": "${INITRD_SHA256}", "test_path": "${s3_test_key}", "s3_bucket": "${S3_BUCKET}", "s3_region": "${S3_REGION}" @@ -310,14 +232,15 @@ EOF log "Upload completed successfully!" log "Kernel artifacts available at:" log " bzImage: s3://${S3_BUCKET}/${s3_kernel_key}" - log " initrd: s3://${S3_BUCKET}/${s3_initrd_key}" + log " initrd: s3://${S3_BUCKET}/${INITRD_S3_KEY} (SHA256: ${INITRD_SHA256})" log " test: s3://${S3_BUCKET}/${s3_test_key}" log " metadata: s3://${S3_BUCKET}/${s3_metadata_key}" # Output for GitHub Actions (if running in GitHub Actions) if [[ -n "${GITHUB_OUTPUT:-}" ]]; then echo "kernel_s3_key=${s3_kernel_key}" >> "$GITHUB_OUTPUT" - echo "initrd_s3_key=${s3_initrd_key}" >> "$GITHUB_OUTPUT" + echo "initrd_s3_key=${INITRD_S3_KEY}" >> "$GITHUB_OUTPUT" + echo "initrd_sha256=${INITRD_SHA256}" >> "$GITHUB_OUTPUT" echo "test_s3_key=${s3_test_key}" >> "$GITHUB_OUTPUT" echo "metadata_s3_key=${s3_metadata_key}" >> "$GITHUB_OUTPUT" echo "build_id=${BUILD_ID}" >> "$GITHUB_OUTPUT" @@ -326,7 +249,8 @@ EOF # Save build info locally echo "BUILD_ID=${BUILD_ID}" > .last-build-info echo "KERNEL_S3_KEY=${s3_kernel_key}" >> .last-build-info - echo "INITRD_S3_KEY=${s3_initrd_key}" >> .last-build-info + echo "INITRD_S3_KEY=${INITRD_S3_KEY}" >> .last-build-info + echo "INITRD_SHA256=${INITRD_SHA256}" >> .last-build-info echo "TEST_S3_KEY=${s3_test_key}" >> .last-build-info echo "METADATA_S3_KEY=${s3_metadata_key}" >> .last-build-info } @@ -343,7 +267,7 @@ usage() { echo "Environment variables:" echo " S3_BUCKET S3 bucket name (default: unvariance-kernel-dev)" echo " S3_REGION S3 region (default: us-east-2)" - echo " FORCE_INITRD Set to 1 to force initrd rebuild (default: use cache)" + echo " FORCE_INITRD Set to 1 to force initrd rebuild (passed to build-initrd.sh)" echo "" echo "Examples:" echo " $0 # Build with current HEAD as build ID" diff --git a/build-initrd.sh b/build-initrd.sh new file mode 100755 index 00000000000000..16bdd96f09eeb5 --- /dev/null +++ b/build-initrd.sh @@ -0,0 +1,315 @@ +#!/bin/bash +set -euo pipefail + +# Configuration +S3_BUCKET="${S3_BUCKET:-unvariance-kernel-dev}" +S3_REGION="${S3_REGION:-us-east-2}" +# Dynamically determine kernel version (including git state and LOCALVERSION) +KERNEL_VERSION=$(make kernelrelease) + +# Create temporary directory in user home for builds +TEMP_BUILD_DIR="$HOME/.kernel-initrd-tmp-$$" +# Cache directory for reusable initrds +INITRD_CACHE_DIR="$HOME/.kernel-initrd-cache" +trap 'cleanup_temp_dir' EXIT + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +log() { + echo -e "${GREEN}[$(date +'%Y-%m-%d %H:%M:%S')] $1${NC}" +} + +warn() { + echo -e "${YELLOW}[$(date +'%Y-%m-%d %H:%M:%S')] WARNING: $1${NC}" +} + +error() { + echo -e "${RED}[$(date +'%Y-%m-%d %H:%M:%S')] ERROR: $1${NC}" + exit 1 +} + +# Cleanup function for temporary directory +cleanup_temp_dir() { + if [[ -n "${TEMP_BUILD_DIR:-}" && -d "$TEMP_BUILD_DIR" ]]; then + log "Cleaning up temporary directory: $TEMP_BUILD_DIR" + rm -rf "$TEMP_BUILD_DIR" + fi +} + +# Create temporary directory +create_temp_dir() { + mkdir -p "$TEMP_BUILD_DIR" + mkdir -p "$INITRD_CACHE_DIR" + log "Using temporary directory: $TEMP_BUILD_DIR" + log "Using initrd cache directory: $INITRD_CACHE_DIR" +} + +# Check dependencies +check_dependencies() { + log "Checking dependencies..." + + if ! command -v aws >/dev/null 2>&1; then + error "AWS CLI not found. Please install aws-cli." + fi + + if ! command -v make >/dev/null 2>&1; then + error "make not found. Please install build tools." + fi + + if ! command -v sha256sum >/dev/null 2>&1; then + error "sha256sum not found. Please install coreutils." + fi + + # Check if we're in the kernel source directory + if [[ ! -f "Makefile" ]] || ! grep -q "KERNELRELEASE" Makefile; then + error "This script must be run from the kernel source root directory." + fi +} + +# Calculate SHA256 of initrd content determinants +# This includes kernel version, kernel config, and module dependencies +calculate_initrd_content_hash() { + local hash_input_file="${TEMP_BUILD_DIR}/initrd-hash-input" + + log "Calculating content hash for initrd caching..." >&2 + + # Create deterministic input for hash calculation + { + echo "KERNEL_VERSION=${KERNEL_VERSION}" + # Include relevant kernel config options that affect initrd + if [[ -f ".config" ]]; then + grep -E "^CONFIG_(MODULES|INITRAMFS|COMPRESSION)" .config | sort + fi + # Include module list if modules exist + if [[ -d "/lib/modules/${KERNEL_VERSION}" ]]; then + find "/lib/modules/${KERNEL_VERSION}" -name "*.ko" | sort + fi + } > "$hash_input_file" + + sha256sum "$hash_input_file" | cut -d' ' -f1 +} + +# Create initrd using mkinitramfs (with SHA256-based caching) +create_initrd() { + local content_hash + content_hash=$(calculate_initrd_content_hash) + + # Check for cached initrd using content hash + local cached_initrd="${INITRD_CACHE_DIR}/initrd-${content_hash}.img" + local cached_sha256="${INITRD_CACHE_DIR}/initrd-${content_hash}.sha256" + + if [[ -f "$cached_initrd" && -f "$cached_sha256" && "${FORCE_INITRD:-}" != "1" ]]; then + log "Using cached initrd: $cached_initrd" + local initrd_sha256=$(cat "$cached_sha256") + cp "$cached_initrd" "${TEMP_BUILD_DIR}/initrd.img" + echo "$initrd_sha256" > "${TEMP_BUILD_DIR}/initrd.sha256" + return 0 + fi + + log "Creating Ubuntu-compatible initrd using mkinitramfs..." + + # Check if mkinitramfs is available + if ! command -v mkinitramfs >/dev/null 2>&1; then + error "mkinitramfs not found. Please install initramfs-tools: apt-get install initramfs-tools" + fi + + # Install kernel modules to persistent location for reuse + PERSISTENT_MODULES="$HOME/kernel-modules" + log "Checking kernel modules in ${PERSISTENT_MODULES}..." + + # Only install modules if they don't exist or if forced + if [[ ! -d "${PERSISTENT_MODULES}/lib/modules/${KERNEL_VERSION}" || "${FORCE_INITRD:-}" == "1" ]]; then + log "Installing/updating kernel modules..." + make INSTALL_MOD_PATH="${PERSISTENT_MODULES}" modules_install + else + log "Reusing existing kernel modules from ${PERSISTENT_MODULES}..." + fi + + # Path to our modules + MODULES_DIR="${PERSISTENT_MODULES}/lib/modules/${KERNEL_VERSION}" + + if [[ ! -d "${MODULES_DIR}" ]]; then + error "Modules directory not found: ${MODULES_DIR}" + fi + + # Use mkinitramfs with system configuration and temporarily install our modules + log "Creating initramfs using system configuration..." + + # Temporarily install our modules to the system location + SYSTEM_MODULES_DIR="/lib/modules/${KERNEL_VERSION}" + BACKUP_MODULES="" + + # Create /lib/modules directory if it doesn't exist + sudo mkdir -p "/lib/modules" + + # Back up existing modules if they exist + if [[ -d "${SYSTEM_MODULES_DIR}" ]]; then + BACKUP_MODULES="${TEMP_BUILD_DIR}/backup-modules" + log "Backing up existing modules to ${BACKUP_MODULES}..." + sudo mv "${SYSTEM_MODULES_DIR}" "${BACKUP_MODULES}" + fi + + # Symlink our modules to system location (much faster than copying) + log "Temporarily symlinking kernel modules to system location..." + sudo ln -sf "${MODULES_DIR}" "${SYSTEM_MODULES_DIR}" + + # Symlink kernel config to fix mkinitramfs warning + BOOT_CONFIG="/boot/config-${KERNEL_VERSION}" + BACKUP_CONFIG="" + if [[ -f "${BOOT_CONFIG}" ]]; then + BACKUP_CONFIG="${TEMP_BUILD_DIR}/backup-config" + log "Backing up existing config to ${BACKUP_CONFIG}..." + sudo mv "${BOOT_CONFIG}" "${BACKUP_CONFIG}" + fi + log "Temporarily symlinking kernel config to ${BOOT_CONFIG}..." + sudo ln -sf "$(pwd)/.config" "${BOOT_CONFIG}" + + # Use mkinitramfs with system config directory + log "Generating initramfs..." + mkinitramfs -d /etc/initramfs-tools -o "${TEMP_BUILD_DIR}/initrd.img" "${KERNEL_VERSION}" + + # Clean up - remove our symlinks and restore backups if needed + sudo rm -f "${SYSTEM_MODULES_DIR}" + if [[ -n "${BACKUP_MODULES}" && -d "${BACKUP_MODULES}" ]]; then + log "Restoring original modules..." + sudo mv "${BACKUP_MODULES}" "${SYSTEM_MODULES_DIR}" + fi + + sudo rm -f "${BOOT_CONFIG}" + if [[ -n "${BACKUP_CONFIG}" && -f "${BACKUP_CONFIG}" ]]; then + log "Restoring original config..." + sudo mv "${BACKUP_CONFIG}" "${BOOT_CONFIG}" + fi + + # Calculate SHA256 of the created initrd + log "Calculating initrd SHA256..." + local initrd_sha256 + initrd_sha256=$(sha256sum "${TEMP_BUILD_DIR}/initrd.img" | cut -d' ' -f1) + echo "$initrd_sha256" > "${TEMP_BUILD_DIR}/initrd.sha256" + + # Cache the initrd and its SHA256 for future use + log "Caching initrd for future builds..." + cp "${TEMP_BUILD_DIR}/initrd.img" "$cached_initrd" + echo "$initrd_sha256" > "$cached_sha256" + + log "Ubuntu-compatible initrd created: ${TEMP_BUILD_DIR}/initrd.img" + log "Initrd SHA256: $initrd_sha256" + log "Kernel modules preserved in: ${PERSISTENT_MODULES}" +} + +# Upload initrd to S3 using SHA256-based path +upload_initrd_to_s3() { + log "Uploading initrd to S3..." + + local initrd_path="${TEMP_BUILD_DIR}/initrd.img" + local sha256_path="${TEMP_BUILD_DIR}/initrd.sha256" + + if [[ ! -f "$initrd_path" ]]; then + error "Initrd not found at $initrd_path" + fi + + if [[ ! -f "$sha256_path" ]]; then + error "Initrd SHA256 not found at $sha256_path" + fi + + local initrd_sha256 + initrd_sha256=$(cat "$sha256_path") + + # Check if initrd already exists in S3 + local s3_initrd_key="initrds/${initrd_sha256}/initrd.img" + + if aws s3 ls "s3://${S3_BUCKET}/${s3_initrd_key}" --region "$S3_REGION" >/dev/null 2>&1; then + log "Initrd already exists in S3: s3://${S3_BUCKET}/${s3_initrd_key}" + else + log "Uploading initrd to s3://${S3_BUCKET}/${s3_initrd_key}" + aws s3 cp "$initrd_path" "s3://${S3_BUCKET}/${s3_initrd_key}" --region "$S3_REGION" + + # Also upload the SHA256 file for verification + local s3_sha256_key="initrds/${initrd_sha256}/initrd.sha256" + aws s3 cp "$sha256_path" "s3://${S3_BUCKET}/${s3_sha256_key}" --region "$S3_REGION" + + log "Upload completed successfully!" + fi + + log "Initrd available at:" + log " s3://${S3_BUCKET}/${s3_initrd_key}" + log " SHA256: ${initrd_sha256}" + + # Output for build scripts + echo "INITRD_SHA256=${initrd_sha256}" + echo "INITRD_S3_KEY=${s3_initrd_key}" + + # Save info locally for other scripts + echo "INITRD_SHA256=${initrd_sha256}" > .last-initrd-info + echo "INITRD_S3_KEY=${s3_initrd_key}" >> .last-initrd-info + echo "KERNEL_VERSION=${KERNEL_VERSION}" >> .last-initrd-info +} + +# Show usage information +usage() { + echo "Usage: $0 [OPTIONS]" + echo "" + echo "Build and upload initrd to S3 with SHA256-based caching" + echo "" + echo "Options:" + echo " -h, --help Show this help message" + echo " --upload Upload to S3 after building (default: build only)" + echo "" + echo "Environment variables:" + echo " S3_BUCKET S3 bucket name (default: unvariance-kernel-dev)" + echo " S3_REGION S3 region (default: us-east-2)" + echo " FORCE_INITRD Set to 1 to force initrd rebuild (default: use cache)" + echo "" + echo "Examples:" + echo " $0 # Build initrd locally with caching" + echo " $0 --upload # Build and upload to S3" + echo " FORCE_INITRD=1 $0 # Force initrd rebuild (ignores cache)" +} + +# Main execution +main() { + local upload_flag=false + + # Handle command line arguments + while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage + exit 0 + ;; + --upload) + upload_flag=true + shift + ;; + *) + error "Unknown option: $1" + ;; + esac + done + + log "Starting initrd build process..." + log "Kernel Version: ${KERNEL_VERSION}" + if [[ "$upload_flag" == true ]]; then + log "S3 Bucket: ${S3_BUCKET}" + log "S3 Region: ${S3_REGION}" + fi + + create_temp_dir + check_dependencies + create_initrd + + if [[ "$upload_flag" == true ]]; then + upload_initrd_to_s3 + fi + + log "Initrd build completed successfully!" +} + +# Run if executed directly +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi \ No newline at end of file From 514b3aeb842d0585431fd1cbc057aae25808139b Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Thu, 14 Aug 2025 14:18:36 +0000 Subject: [PATCH 05/51] move kernel module building to initrd script --- build-and-upload.sh | 3 --- build-initrd.sh | 5 +++++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/build-and-upload.sh b/build-and-upload.sh index a331730c2b2b3a..f1e89764b8041e 100755 --- a/build-and-upload.sh +++ b/build-and-upload.sh @@ -111,9 +111,6 @@ build_kernel() { # Build kernel image make CC="$CC" -j${NPROC} bzImage - # Build modules (if any are needed) - make CC="$CC" -j${NPROC} modules - log "Kernel build completed successfully" } diff --git a/build-initrd.sh b/build-initrd.sh index 16bdd96f09eeb5..ee3c54a5cbe723 100755 --- a/build-initrd.sh +++ b/build-initrd.sh @@ -6,6 +6,7 @@ S3_BUCKET="${S3_BUCKET:-unvariance-kernel-dev}" S3_REGION="${S3_REGION:-us-east-2}" # Dynamically determine kernel version (including git state and LOCALVERSION) KERNEL_VERSION=$(make kernelrelease) +CC="ccache gcc" # Create temporary directory in user home for builds TEMP_BUILD_DIR="$HOME/.kernel-initrd-tmp-$$" @@ -123,6 +124,10 @@ create_initrd() { # Only install modules if they don't exist or if forced if [[ ! -d "${PERSISTENT_MODULES}/lib/modules/${KERNEL_VERSION}" || "${FORCE_INITRD:-}" == "1" ]]; then + log "Building kernel modules..." + NPROC=$(nproc) + make CC="$CC" -j${NPROC} modules + log "Installing/updating kernel modules..." make INSTALL_MOD_PATH="${PERSISTENT_MODULES}" modules_install else From 93d276acb760687f888e48836d28a7b011cee4dc Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Thu, 14 Aug 2025 14:46:59 +0000 Subject: [PATCH 06/51] add github actions to run kernel test --- .github/actions/aws-runner/action.yml | 258 ++++++++++++++ .github/actions/aws-runner/cleanup/action.yml | 38 +++ .github/workflows/custom-kernel-test.yml | 318 ++++++++++++++++++ 3 files changed, 614 insertions(+) create mode 100644 .github/actions/aws-runner/action.yml create mode 100644 .github/actions/aws-runner/cleanup/action.yml create mode 100644 .github/workflows/custom-kernel-test.yml diff --git a/.github/actions/aws-runner/action.yml b/.github/actions/aws-runner/action.yml new file mode 100644 index 00000000000000..9c15997c8b9d67 --- /dev/null +++ b/.github/actions/aws-runner/action.yml @@ -0,0 +1,258 @@ +name: 'AWS EC2 GitHub Runner' +description: 'Start a self-hosted GitHub runner on AWS EC2 across multiple regions to find capacity' +author: 'Memory Collector Team' + +inputs: + instance-type: + description: 'EC2 instance type to use (e.g., "m7i.xlarge")' + required: false + default: 'm7i.xlarge' + image-type: + description: 'Image type identifier (e.g., "ubuntu-22.04")' + required: false + default: 'ubuntu-22.04' + market-type: + description: 'EC2 market type (spot or on-demand)' + required: false + default: 'spot' + github-token: + description: 'GitHub token for creating runners' + required: true + aws-role-arn: + description: 'ARN of the AWS role to assume' + required: true + volume-size: + description: 'EC2 volume size in GB' + required: false + default: '8' + pre-runner-script: + description: 'Script to run before installing the GitHub runner' + required: false + default: '' + runner-home-dir: + description: 'Home directory for the GitHub runner' + required: false + default: '' + aws-resource-tags: + description: 'Custom resource tags in JSON format' + required: false + default: '' + runner-name-prefix: + description: 'Prefix for the runner name' + required: false + default: 'github-runner' + iam-role-name: + description: 'IAM role name for the EC2 instance' + required: false + default: '' + region-priority: + description: 'Ordered list of regions to try in priority order' + required: false + default: '["us-east-2", "us-west-2", "us-east-1", "eu-west-1"]' + region-configs: + description: 'Configuration for regions in JSON format with subnets and security groups' + required: false + default: > + { + "us-east-1": { + "security-group-id": "sg-0c0fb801b9d5afb42", + "subnets": ["subnet-0f218a8f807b24b43", "subnet-03760fcc21de05dcf", "subnet-07f33ad4e85154757", "subnet-06a59c6d0f0ae0acf", "subnet-01411d66f3c3b03ab", "subnet-0aacbbfdb4730c3ae"] + }, + "us-east-2": { + "security-group-id": "sg-0da5b1b4abff16f01", + "subnets": ["subnet-057997a168b11832e", "subnet-04231f222c6778d25", "subnet-085a10d33b29607cd"] + }, + "us-west-2": { + "security-group-id": "sg-065a194f058366e19", + "subnets": ["subnet-03312d0e183ac6bd2", "subnet-0504fa9cacd9bece7", "subnet-07669de00a10cb45a", "subnet-027770cb161c110b2"] + }, + "eu-west-1": { + "security-group-id": "sg-0eb8174e90d14cb8c", + "subnets": ["subnet-06bc798bc93c2d33d", "subnet-0e7134127c7fb199a", "subnet-0a2b8f49046507b4a"] + } + } + ami-mappings: + description: 'Mapping from image-type to region-specific AMI IDs' + required: false + default: > + { + "ubuntu-22.04": { + "us-east-1": "ami-0f9de6e2d2f067fca", + "us-west-2": "ami-03f8acd418785369b", + "eu-west-1": "ami-0f0c3baa60262d5b9", + "us-east-2": "ami-0c3b809fcf2445b6a" + }, + "ubuntu-24.04": { + "us-east-1": "ami-084568db4383264d4", + "us-west-2": "ami-075686beab831bb7f", + "eu-west-1": "ami-0df368112825f8d8f", + "us-east-2": "ami-04f167a56786e4b09" + } + } + packages: + description: 'Additional packages to install on the runner as JSON array' + required: false + default: '[]' + +outputs: + runner-label: + description: 'The label of the created runner (for use in runs-on)' + value: ${{ steps.runner-outputs.outputs.label }} + ec2-instance-id: + description: 'The ID of the created EC2 instance' + value: ${{ steps.runner-outputs.outputs.ec2-instance-id }} + region: + description: 'AWS region where the EC2 instance was created' + value: ${{ steps.runner-outputs.outputs.region }} + +runs: + using: 'composite' + steps: + - name: Generate Region Configurations + id: generate-configs + shell: bash + run: | + # Parse the region configs + echo "Region configs: ${{ inputs.region-configs }}" + echo "AMI mappings: ${{ inputs.ami-mappings }}" + echo "Image type: ${{ inputs.image-type }}" + echo "Region priority: ${{ inputs.region-priority }}" + + # Convert the JSON strings to files for jq processing + echo '${{ inputs.region-configs }}' > /tmp/region_configs.json + echo '${{ inputs.ami-mappings }}' > /tmp/ami_mappings.json + echo '${{ inputs.region-priority }}' > /tmp/region_priority.json + + # Get prioritized regions + PRIORITY_REGIONS=$(jq -r 'join(",")' /tmp/region_priority.json) + echo "Prioritized regions: $PRIORITY_REGIONS" + + # Get all available regions from region configs + AVAILABLE_REGIONS=$(jq -r 'keys | join(",")' /tmp/region_configs.json) + echo "Available regions: $AVAILABLE_REGIONS" + + # Create an array to hold all AZ configurations + echo "Generating availability zone configurations in priority order..." + echo "[" > /tmp/az_configs.json + + FIRST=true + + # Process regions in priority order + for region in $(jq -r '.[]' /tmp/region_priority.json); do + echo "Processing region: $region" + + # Check if region exists in region configs + if ! jq -e --arg r "$region" '.[$r]' /tmp/region_configs.json > /dev/null; then + echo "Warning: Region $region specified in priority list not found in region configs, skipping" + continue + fi + + # Get AMI ID for this region + AMI_ID=$(jq -r --arg r "$region" --arg it "${{ inputs.image-type }}" '.[$it][$r]' /tmp/ami_mappings.json) + if [ -z "$AMI_ID" ] || [ "$AMI_ID" == "null" ]; then + echo "Warning: No AMI found for ${{ inputs.image-type }} in region $region, skipping" + continue + fi + + # Get security group for this region + SG_ID=$(jq -r --arg r "$region" '.[$r]["security-group-id"]' /tmp/region_configs.json) + if [ -z "$SG_ID" ] || [ "$SG_ID" == "null" ]; then + echo "Warning: No security group found for region $region, skipping" + continue + fi + + # Get subnets for this region + SUBNETS=$(jq -r --arg r "$region" '.[$r].subnets[]' /tmp/region_configs.json) + if [ -z "$SUBNETS" ]; then + echo "Warning: No subnets found for region $region, skipping" + continue + fi + + # Add each subnet as a separate AZ configuration + for subnet in $SUBNETS; do + if [ "$FIRST" = true ]; then + FIRST=false + else + echo "," >> /tmp/az_configs.json + fi + + # Add this AZ configuration to the JSON array using printf instead of heredoc + printf ' {\n "region": "%s",\n "imageId": "%s",\n "subnetId": "%s",\n "securityGroupId": "%s"\n }' "$region" "$AMI_ID" "$subnet" "$SG_ID" >> /tmp/az_configs.json + done + done + + echo "]" >> /tmp/az_configs.json + + # Create a JSON array for each region's AZ configurations + echo "Creating per-region AZ configurations..." + + # Read the full AZ configs + AZ_CONFIGS=$(cat /tmp/az_configs.json) + + # Properly escape the multiline JSON for GitHub Actions output + # See: https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#multiline-strings + echo "availability_zones_config<> $GITHUB_OUTPUT + echo "$AZ_CONFIGS" >> $GITHUB_OUTPUT + echo "EOF" >> $GITHUB_OUTPUT + + # Get the first region for the initial AWS credentials + FIRST_REGION=$(jq -r '.[0].region' /tmp/az_configs.json) + echo "first_region=$FIRST_REGION" >> $GITHUB_OUTPUT + + # For debugging, show the AZ configurations + echo "Generated availability zone configurations:" + cat /tmp/az_configs.json + + # Configure AWS credentials for the first region + - name: Configure AWS credentials + id: aws-credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: ${{ inputs.aws-role-arn }} + aws-region: ${{ steps.generate-configs.outputs.first_region }} + role-session-name: github-runner-session + + # Start EC2 runner with availability-zones-config + - name: Start EC2 runner + id: start-ec2-runner + uses: yonch/ec2-github-runner@feature/packages-installation + continue-on-error: true + with: + mode: start + startup-quiet-period-seconds: 10 + startup-retry-interval-seconds: 5 + github-token: ${{ inputs.github-token }} + ec2-instance-type: ${{ inputs.instance-type }} + market-type: ${{ inputs.market-type }} + ec2-volume-size: ${{ inputs.volume-size }} + pre-runner-script: ${{ inputs.pre-runner-script }} + runner-home-dir: ${{ inputs.runner-home-dir }} + iam-role-name: ${{ inputs.iam-role-name }} + availability-zones-config: ${{ steps.generate-configs.outputs.availability_zones_config }} + packages: ${{ inputs.packages }} + aws-resource-tags: > + [ + {"Key": "Name", "Value": "${{ inputs.runner-name-prefix }}"}, + {"Key": "Repository", "Value": "${{ github.repository }}"}, + {"Key": "Workflow", "Value": "${{ github.workflow }}"}, + {"Key": "RunId", "Value": "${{ github.run_id }}"}, + {"Key": "RunNumber", "Value": "${{ github.run_number }}"}, + {"Key": "SHA", "Value": "${{ github.sha }}"}, + {"Key": "Branch", "Value": "${{ github.ref_name }}"}, + {"Key": "Actor", "Value": "${{ github.actor }}"} + ] + + - name: Collect outputs + id: runner-outputs + shell: bash + run: | + # Pass through the runner outputs + echo "label=${{ steps.start-ec2-runner.outputs.label }}" >> $GITHUB_OUTPUT + echo "ec2-instance-id=${{ steps.start-ec2-runner.outputs.ec2-instance-id }}" >> $GITHUB_OUTPUT + echo "region=${{ steps.start-ec2-runner.outputs.region }}" >> $GITHUB_OUTPUT + if [ -n "${{ steps.start-ec2-runner.outputs.label }}" ]; then + echo "Runner successfully started in region: ${{ steps.start-ec2-runner.outputs.region }}" + else + echo "All runner attempts failed. Please check AWS capacity availability across regions." + exit 1 + fi \ No newline at end of file diff --git a/.github/actions/aws-runner/cleanup/action.yml b/.github/actions/aws-runner/cleanup/action.yml new file mode 100644 index 00000000000000..e8841ccb3530ae --- /dev/null +++ b/.github/actions/aws-runner/cleanup/action.yml @@ -0,0 +1,38 @@ +name: 'AWS EC2 GitHub Runner Cleanup' +description: 'Stop a self-hosted GitHub runner on AWS EC2' +author: 'Memory Collector Team' + +inputs: + runner-label: + description: 'The label of the runner to stop' + required: true + ec2-instance-id: + description: 'The ID of the EC2 instance to stop' + required: true + github-token: + description: 'GitHub token for managing runners' + required: true + aws-role-arn: + description: 'ARN of the AWS role to assume' + required: true + aws-region: + description: 'AWS region where the instance is located' + required: true + +runs: + using: 'composite' + steps: + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: ${{ inputs.aws-role-arn }} + aws-region: ${{ inputs.aws-region }} + role-session-name: github-runner-session + + - name: Stop EC2 runner + uses: yonch/ec2-github-runner@feature/multiple-az + with: + mode: stop + github-token: ${{ inputs.github-token }} + label: ${{ inputs.runner-label }} + ec2-instance-id: ${{ inputs.ec2-instance-id }} \ No newline at end of file diff --git a/.github/workflows/custom-kernel-test.yml b/.github/workflows/custom-kernel-test.yml new file mode 100644 index 00000000000000..c4e290b5ee9e1e --- /dev/null +++ b/.github/workflows/custom-kernel-test.yml @@ -0,0 +1,318 @@ +name: Custom Kernel Test - Resctrl PMU +on: + workflow_dispatch: # Manual trigger for testing + inputs: + build-id: + description: 'Build ID for the kernel to test (Git commit hash)' + required: true + type: string + instance-type: + description: 'EC2 instance type to use' + required: false + default: 'm7i.xlarge' + type: string + image-type: + description: 'Image type to use (ubuntu-22.04 or ubuntu-24.04)' + required: false + default: 'ubuntu-24.04' + type: string + +permissions: + id-token: write # Required for requesting the JWT + contents: read + actions: write + +jobs: + setup-runner: + name: Start EC2 runner with custom kernel + runs-on: ubuntu-latest + outputs: + runner-label: ${{ steps.start-runner.outputs.runner-label }} + ec2-instance-id: ${{ steps.start-runner.outputs.ec2-instance-id }} + region: ${{ steps.start-runner.outputs.region }} + steps: + - name: Checkout repository (sparse) + uses: actions/checkout@v4 + with: + sparse-checkout: | + .github/ + tools/testing/selftests/resctrl/ + sparse-checkout-cone-mode: false + + - name: Start AWS Runner with kexec + id: start-runner + uses: ./.github/actions/aws-runner + with: + github-token: ${{ secrets.REPO_ADMIN_TOKEN }} + aws-role-arn: ${{ secrets.AWS_ROLE_ARN }} + iam-role-name: github-actions-runner + instance-type: ${{ inputs.instance-type || 'm7i.xlarge' }} + image-type: ${{ inputs.image-type || 'ubuntu-22.04' }} + volume-size: '20' + runner-home-dir: '/tmp' + packages: '["kexec-tools"]' + pre-runner-script: | + # Custom Kernel Kexec Setup + # ========================= + # Download our custom built kernel from S3 and kexec into it + + echo "Setting up custom kernel kexec..." | tee -a /var/log/kexec-setup.log + + # Get kernel artifacts info from workflow input + BUILD_ID="${{ inputs.build-id }}" + KERNEL_S3_KEY="kernels/${BUILD_ID}/bzImage" + INITRD_S3_KEY="kernels/${BUILD_ID}/initrd.img" + S3_BUCKET="unvariance-kernel-dev" + S3_REGION="us-east-2" + + echo "Build ID: $BUILD_ID" | tee -a /var/log/kexec-setup.log + echo "Kernel S3 key: $KERNEL_S3_KEY" | tee -a /var/log/kexec-setup.log + echo "Initrd S3 key: $INITRD_S3_KEY" | tee -a /var/log/kexec-setup.log + + # Create persistent directory and download the GitHub Actions runner + mkdir -p /persist/actions-runner + cd /persist/actions-runner + + # Download custom kernel artifacts from S3 + echo "Downloading custom kernel from S3..." | tee -a /var/log/kexec-setup.log + aws s3 cp "s3://${S3_BUCKET}/${KERNEL_S3_KEY}" /tmp/custom-bzImage --region "$S3_REGION" + aws s3 cp "s3://${S3_BUCKET}/${INITRD_S3_KEY}" /tmp/custom-initrd.img --region "$S3_REGION" + + if [[ ! -f /tmp/custom-bzImage ]] || [[ ! -f /tmp/custom-initrd.img ]]; then + echo "Failed to download custom kernel artifacts" | tee -a /var/log/kexec-setup.log + exit 1 + fi + + echo "Custom kernel artifacts downloaded successfully" | tee -a /var/log/kexec-setup.log + ls -la /tmp/custom-* | tee -a /var/log/kexec-setup.log + + # Create post-kexec init script that starts the runner + cat > /persist/kexec-runner.sh << 'EOF' + #!/bin/bash + + # Log kexec success + echo "========================================" | tee -a /var/log/kexec-setup.log + echo "✅ CUSTOM KERNEL KEXEC SUCCESSFUL!" | tee -a /var/log/kexec-setup.log + echo "========================================" | tee -a /var/log/kexec-setup.log + echo "Custom kernel version: $(uname -r)" | tee -a /var/log/kexec-setup.log + echo "Build ID: $BUILD_ID" | tee -a /var/log/kexec-setup.log + echo "System time: $(date)" | tee -a /var/log/kexec-setup.log + echo "Hostname: $(hostname)" | tee -a /var/log/kexec-setup.log + echo "Init PID: $$" | tee -a /var/log/kexec-setup.log + echo "========================================" | tee -a /var/log/kexec-setup.log + + # Start the GitHub Actions runner + cd /persist/actions-runner + export RUNNER_ALLOW_RUNASROOT=1 + + # Configure runner with the original GitHub parameters + echo "Configuring GitHub Actions runner after custom kernel kexec..." | tee -a /var/log/kexec-setup.log + + # Read config parameters from the file saved before kexec + if [[ -f /persist/config-params ]]; then + CONFIG_PARAMS="$(cat /persist/config-params)" + echo "Found saved config parameters" | tee -a /var/log/kexec-setup.log + ./config.sh $CONFIG_PARAMS + + # Start the runner - this will wait for jobs + echo "Starting GitHub Actions runner after custom kernel kexec..." | tee -a /var/log/kexec-setup.log + ./run.sh + else + echo "No config parameters found, starting shell..." | tee -a /var/log/kexec-setup.log + exec /bin/bash + fi + EOF + chmod +x /persist/kexec-runner.sh + + echo "Post-kexec runner script created" | tee -a /var/log/kexec-setup.log + + # Create systemd service for GitHub runner (will start after kexec) + cat > /etc/systemd/system/github-runner.service << 'SYSTEMD_EOF' + [Unit] + Description=GitHub Actions Runner after custom kernel kexec + After=basic.target network.target + + [Service] + Type=simple + ExecStart=/persist/kexec-runner.sh + Restart=no + User=root + WorkingDirectory=/persist/actions-runner + StandardOutput=journal+console + StandardError=journal+console + + [Install] + WantedBy=multi-user.target + SYSTEMD_EOF + + # Enable the service to start on boot + systemctl enable github-runner.service + + echo "GitHub runner systemd service created and enabled" | tee -a /var/log/kexec-setup.log + + # Write custom kernel paths for config.sh to read + echo "/tmp/custom-bzImage" > /tmp/vmlinuz-path + echo "/tmp/custom-initrd.img" > /tmp/initrd-path + + # Create fake config.sh that triggers kexec with our custom kernel + cat > /tmp/config.sh << 'EOF' + #!/bin/bash + echo "Fake config.sh called - triggering custom kernel kexec" | tee -a /var/log/kexec-setup.log + + # Save the parameters to a file for post-kexec use + echo "$@" > /persist/config-params + echo "Saved config parameters to /persist/config-params" | tee -a /var/log/kexec-setup.log + + # Use our custom kernel + VMLINUZ="/tmp/custom-bzImage" + INITRD="/tmp/custom-initrd.img" + + echo "Preparing kexec with custom kernel: $VMLINUZ" | tee -a /var/log/kexec-setup.log + echo "Preparing kexec with custom initrd: $INITRD" | tee -a /var/log/kexec-setup.log + + # Get current kernel command line + CURRENT_CMDLINE="$(cat /proc/cmdline)" + echo "Current cmdline: $CURRENT_CMDLINE" | tee -a /var/log/kexec-setup.log + + # Kexec into our custom kernel + if ! kexec -l "$VMLINUZ" --initrd="$INITRD" --append="$CURRENT_CMDLINE"; then + echo "Failed to load custom kernel for kexec" | tee -a /var/log/kexec-setup.log + exit 1 + fi + + echo "Executing kexec into custom kernel..." | tee -a /var/log/kexec-setup.log + kexec -e + EOF + + chmod +x /tmp/config.sh + + echo "=== Contents of custom kernel config.sh ===" | tee -a /var/log/kexec-setup.log + cat /tmp/config.sh | tee -a /var/log/kexec-setup.log + echo "=== End of config.sh ===" | tee -a /var/log/kexec-setup.log + + echo "Downloading GitHub Actions runner..." | tee -a /var/log/kexec-setup.log + if ! curl -L -o "actions-runner.tar.gz" "https://github.com/actions/runner/releases/download/v2.327.1/actions-runner-linux-x64-2.327.1.tar.gz"; then + echo "Failed to download GitHub Actions runner" | tee -a /var/log/kexec-setup.log + exit 1 + fi + + if ! tar xzf actions-runner.tar.gz; then + echo "Failed to extract GitHub Actions runner" | tee -a /var/log/kexec-setup.log + exit 1 + fi + + echo "GitHub Actions runner downloaded and extracted successfully" | tee -a /var/log/kexec-setup.log + + # Go back to /tmp where the fake config.sh is located + echo "Changing directory to /tmp to run custom kernel kexec setup" | tee -a /var/log/kexec-setup.log + cd /tmp + + echo "Custom kernel kexec setup complete. Ready to boot custom kernel." | tee -a /var/log/kexec-setup.log + pmu-test: + needs: [setup-runner] + runs-on: ${{ needs.setup-runner.outputs.runner-label }} + timeout-minutes: 10 + steps: + - name: Checkout repository (sparse) + uses: actions/checkout@v4 + with: + sparse-checkout: | + tools/testing/selftests/resctrl/ + sparse-checkout-cone-mode: false + + - name: Verify custom kernel boot + run: | + echo "================================================" + echo "🚀 CUSTOM RESCTRL KERNEL BOOTED SUCCESSFULLY! 🚀" + echo "================================================" + echo "" + echo "✅ Successfully running on custom kernel!" + echo "📍 Current kernel: $(uname -r)" + echo "⏰ System uptime: $(uptime)" + echo "📅 Date: $(date)" + echo "👤 Current user: $(whoami)" + echo "🔧 Current PID: $$" + echo "" + + # Check if our kexec setup logs exist + if [ -f /var/log/kexec-setup.log ]; then + echo "✅ Found kexec setup log! Contents:" + echo "----------------------------------------" + tail -50 /var/log/kexec-setup.log + else + echo "⚠️ No kexec setup log found" + fi + + - name: Check resctrl support + run: | + echo "🔍 Checking resctrl support in custom kernel..." + + # Check if resctrl is available + if [ -d /sys/fs/resctrl ]; then + echo "✅ /sys/fs/resctrl exists" + ls -la /sys/fs/resctrl/ + else + echo "❌ /sys/fs/resctrl not found" + fi + + # Check kernel modules + echo "" + echo "🔧 Loaded kernel modules related to resctrl/perf:" + lsmod | grep -E "(resctrl|perf|msr)" || echo "No relevant modules found" + + # Check /proc/cpuinfo for relevant features + echo "" + echo "🖥️ CPU features related to monitoring:" + grep -E "(model name|flags)" /proc/cpuinfo | head -4 + + - name: Download test binary from S3 + run: | + echo "📦 Downloading pre-compiled test binary from S3..." + BUILD_ID="${{ inputs.build-id }}" + S3_BUCKET="unvariance-kernel-dev" + S3_REGION="us-east-2" + TEST_S3_KEY="kernels/${BUILD_ID}/resctrl_tests" + + aws s3 cp "s3://${S3_BUCKET}/${TEST_S3_KEY}" ./resctrl_tests --region "$S3_REGION" + chmod +x ./resctrl_tests + + if [[ ! -f ./resctrl_tests ]]; then + echo "❌ Failed to download test binary" + exit 1 + fi + + echo "✅ Test binary downloaded successfully" + ls -la ./resctrl_tests + + - name: Run PMU test + run: | + echo "🧪 Running resctrl PMU test..." + + # Run only the PMU test + echo "🏃 Running PMU test specifically..." + ./resctrl_tests -t pmu || echo "⚠️ PMU test failed or not fully implemented" + + - name: Test completion summary + run: | + echo "" + echo "🎯 CUSTOM KERNEL PMU TEST COMPLETED!" + echo "Custom resctrl+perf kernel with PMU support has been tested successfully!" + echo "Build ID: ${{ inputs.build-id }}" + + stop-runner: + name: Stop EC2 runner + needs: [setup-runner, pmu-test] + runs-on: ubuntu-latest + if: always() # Run even if previous jobs fail + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Stop AWS Runner + uses: ./.github/actions/aws-runner/cleanup + with: + runner-label: ${{ needs.setup-runner.outputs.runner-label }} + ec2-instance-id: ${{ needs.setup-runner.outputs.ec2-instance-id }} + github-token: ${{ secrets.REPO_ADMIN_TOKEN }} + aws-role-arn: ${{ secrets.AWS_ROLE_ARN }} + aws-region: ${{ needs.setup-runner.outputs.region }} \ No newline at end of file From efded5ab8531d8fb29ed64234fdd57fd08d343f8 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Thu, 14 Aug 2025 15:07:33 +0000 Subject: [PATCH 07/51] install awscli to be able to fetch kernel image from s3 --- .github/workflows/custom-kernel-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/custom-kernel-test.yml b/.github/workflows/custom-kernel-test.yml index c4e290b5ee9e1e..10fb47425731ab 100644 --- a/.github/workflows/custom-kernel-test.yml +++ b/.github/workflows/custom-kernel-test.yml @@ -50,7 +50,7 @@ jobs: image-type: ${{ inputs.image-type || 'ubuntu-22.04' }} volume-size: '20' runner-home-dir: '/tmp' - packages: '["kexec-tools"]' + packages: '["kexec-tools", "awscli"]' pre-runner-script: | # Custom Kernel Kexec Setup # ========================= From c18361c78d5fb2502ebcfe56282b05a778ef3d1e Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Thu, 14 Aug 2025 15:20:32 +0000 Subject: [PATCH 08/51] switch to action version that debug prints cloud init --- .github/actions/aws-runner/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/aws-runner/action.yml b/.github/actions/aws-runner/action.yml index 9c15997c8b9d67..c273db71e206aa 100644 --- a/.github/actions/aws-runner/action.yml +++ b/.github/actions/aws-runner/action.yml @@ -215,7 +215,7 @@ runs: # Start EC2 runner with availability-zones-config - name: Start EC2 runner id: start-ec2-runner - uses: yonch/ec2-github-runner@feature/packages-installation + uses: yonch/ec2-github-runner@703f4a7887ec5b30734280ff62b324b9a52d66d2 continue-on-error: true with: mode: start From dbcf829cf4d94b0b8bec162a9202af0319c83e33 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Thu, 14 Aug 2025 15:27:57 +0000 Subject: [PATCH 09/51] install AWS CLI via curl (recommended by AWS docs) --- .github/workflows/custom-kernel-test.yml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/workflows/custom-kernel-test.yml b/.github/workflows/custom-kernel-test.yml index 10fb47425731ab..f0d7c80391d69e 100644 --- a/.github/workflows/custom-kernel-test.yml +++ b/.github/workflows/custom-kernel-test.yml @@ -50,7 +50,7 @@ jobs: image-type: ${{ inputs.image-type || 'ubuntu-22.04' }} volume-size: '20' runner-home-dir: '/tmp' - packages: '["kexec-tools", "awscli"]' + packages: '["kexec-tools", "curl", "unzip"]' pre-runner-script: | # Custom Kernel Kexec Setup # ========================= @@ -69,6 +69,13 @@ jobs: echo "Kernel S3 key: $KERNEL_S3_KEY" | tee -a /var/log/kexec-setup.log echo "Initrd S3 key: $INITRD_S3_KEY" | tee -a /var/log/kexec-setup.log + # Install AWS CLI + echo "Installing AWS CLI" | tee -a /var/log/kexec-setup.log + cd /tmp + curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" + unzip awscliv2.zip + sudo ./aws/install | tee -a /var/log/kexec-setup.log + # Create persistent directory and download the GitHub Actions runner mkdir -p /persist/actions-runner cd /persist/actions-runner From 3db64c2fd18f887ad4bdeca952150fab39bb528a Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Thu, 14 Aug 2025 15:39:41 +0000 Subject: [PATCH 10/51] extract the initrd and bzImage filenames from the metadata json --- .github/workflows/custom-kernel-test.yml | 27 +++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/.github/workflows/custom-kernel-test.yml b/.github/workflows/custom-kernel-test.yml index f0d7c80391d69e..8491e55d50db48 100644 --- a/.github/workflows/custom-kernel-test.yml +++ b/.github/workflows/custom-kernel-test.yml @@ -50,7 +50,7 @@ jobs: image-type: ${{ inputs.image-type || 'ubuntu-22.04' }} volume-size: '20' runner-home-dir: '/tmp' - packages: '["kexec-tools", "curl", "unzip"]' + packages: '["kexec-tools", "curl", "unzip", "jq"]' pre-runner-script: | # Custom Kernel Kexec Setup # ========================= @@ -60,14 +60,31 @@ jobs: # Get kernel artifacts info from workflow input BUILD_ID="${{ inputs.build-id }}" - KERNEL_S3_KEY="kernels/${BUILD_ID}/bzImage" - INITRD_S3_KEY="kernels/${BUILD_ID}/initrd.img" S3_BUCKET="unvariance-kernel-dev" S3_REGION="us-east-2" + METADATA_S3_KEY="kernels/${BUILD_ID}/metadata.json" echo "Build ID: $BUILD_ID" | tee -a /var/log/kexec-setup.log - echo "Kernel S3 key: $KERNEL_S3_KEY" | tee -a /var/log/kexec-setup.log - echo "Initrd S3 key: $INITRD_S3_KEY" | tee -a /var/log/kexec-setup.log + echo "Metadata S3 key: $METADATA_S3_KEY" | tee -a /var/log/kexec-setup.log + + # Download and parse metadata JSON to get kernel and initrd paths + echo "Downloading metadata.json from S3..." | tee -a /var/log/kexec-setup.log + aws s3 cp "s3://${S3_BUCKET}/${METADATA_S3_KEY}" /tmp/metadata.json --region "$S3_REGION" + + if [[ ! -f /tmp/metadata.json ]]; then + echo "Failed to download metadata.json" | tee -a /var/log/kexec-setup.log + exit 1 + fi + + echo "Contents of metadata.json:" | tee -a /var/log/kexec-setup.log + cat /tmp/metadata.json | tee -a /var/log/kexec-setup.log + + # Extract kernel and initrd paths from metadata + KERNEL_S3_KEY=$(jq -r '.kernel_path // "kernels/'${BUILD_ID}'/bzImage"' /tmp/metadata.json) + INITRD_S3_KEY=$(jq -r '.initrd_path // "kernels/'${BUILD_ID}'/initrd.img"' /tmp/metadata.json) + + echo "Kernel S3 key from metadata: $KERNEL_S3_KEY" | tee -a /var/log/kexec-setup.log + echo "Initrd S3 key from metadata: $INITRD_S3_KEY" | tee -a /var/log/kexec-setup.log # Install AWS CLI echo "Installing AWS CLI" | tee -a /var/log/kexec-setup.log From 2b69d07e5872fcbb6bc42a657de894fc76af9f70 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Thu, 14 Aug 2025 15:44:04 +0000 Subject: [PATCH 11/51] reorder pre-requisites in github action --- .github/workflows/custom-kernel-test.yml | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/custom-kernel-test.yml b/.github/workflows/custom-kernel-test.yml index 8491e55d50db48..1d80508436362c 100644 --- a/.github/workflows/custom-kernel-test.yml +++ b/.github/workflows/custom-kernel-test.yml @@ -67,6 +67,13 @@ jobs: echo "Build ID: $BUILD_ID" | tee -a /var/log/kexec-setup.log echo "Metadata S3 key: $METADATA_S3_KEY" | tee -a /var/log/kexec-setup.log + # Install AWS CLI + echo "Installing AWS CLI" | tee -a /var/log/kexec-setup.log + cd /tmp + curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" + unzip awscliv2.zip + sudo ./aws/install | tee -a /var/log/kexec-setup.log + # Download and parse metadata JSON to get kernel and initrd paths echo "Downloading metadata.json from S3..." | tee -a /var/log/kexec-setup.log aws s3 cp "s3://${S3_BUCKET}/${METADATA_S3_KEY}" /tmp/metadata.json --region "$S3_REGION" @@ -85,17 +92,6 @@ jobs: echo "Kernel S3 key from metadata: $KERNEL_S3_KEY" | tee -a /var/log/kexec-setup.log echo "Initrd S3 key from metadata: $INITRD_S3_KEY" | tee -a /var/log/kexec-setup.log - - # Install AWS CLI - echo "Installing AWS CLI" | tee -a /var/log/kexec-setup.log - cd /tmp - curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" - unzip awscliv2.zip - sudo ./aws/install | tee -a /var/log/kexec-setup.log - - # Create persistent directory and download the GitHub Actions runner - mkdir -p /persist/actions-runner - cd /persist/actions-runner # Download custom kernel artifacts from S3 echo "Downloading custom kernel from S3..." | tee -a /var/log/kexec-setup.log @@ -109,6 +105,10 @@ jobs: echo "Custom kernel artifacts downloaded successfully" | tee -a /var/log/kexec-setup.log ls -la /tmp/custom-* | tee -a /var/log/kexec-setup.log + + # Create persistent directory and download the GitHub Actions runner + mkdir -p /persist/actions-runner + cd /persist/actions-runner # Create post-kexec init script that starts the runner cat > /persist/kexec-runner.sh << 'EOF' From 1ddecbd5ade56d8f462768e1d205bc54d5dea722 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Thu, 14 Aug 2025 15:49:44 +0000 Subject: [PATCH 12/51] reduce logging verboseness for aws cli installation --- .github/workflows/custom-kernel-test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/custom-kernel-test.yml b/.github/workflows/custom-kernel-test.yml index 1d80508436362c..708961802b9a1d 100644 --- a/.github/workflows/custom-kernel-test.yml +++ b/.github/workflows/custom-kernel-test.yml @@ -71,8 +71,8 @@ jobs: echo "Installing AWS CLI" | tee -a /var/log/kexec-setup.log cd /tmp curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" - unzip awscliv2.zip - sudo ./aws/install | tee -a /var/log/kexec-setup.log + unzip awscliv2.zip 2>&1 > /dev/null + sudo ./aws/install 2>&1 > /dev/null # Download and parse metadata JSON to get kernel and initrd paths echo "Downloading metadata.json from S3..." | tee -a /var/log/kexec-setup.log From 63367557dc9a22da09028e731dcf43d1f89cfe92 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Thu, 14 Aug 2025 18:04:21 +0000 Subject: [PATCH 13/51] fail the self-hosted runner start if the ec2-github-runner action fails --- .github/actions/aws-runner/action.yml | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/.github/actions/aws-runner/action.yml b/.github/actions/aws-runner/action.yml index c273db71e206aa..33e00f7bdf0e6e 100644 --- a/.github/actions/aws-runner/action.yml +++ b/.github/actions/aws-runner/action.yml @@ -246,13 +246,19 @@ runs: id: runner-outputs shell: bash run: | - # Pass through the runner outputs + # Always pass through the runner outputs (even if empty on failure) echo "label=${{ steps.start-ec2-runner.outputs.label }}" >> $GITHUB_OUTPUT echo "ec2-instance-id=${{ steps.start-ec2-runner.outputs.ec2-instance-id }}" >> $GITHUB_OUTPUT echo "region=${{ steps.start-ec2-runner.outputs.region }}" >> $GITHUB_OUTPUT - if [ -n "${{ steps.start-ec2-runner.outputs.label }}" ]; then + + # Check if the ec2-runner step failed and exit at the end + if [ "${{ steps.start-ec2-runner.outcome }}" != "success" ]; then + echo "EC2 runner step failed with outcome: ${{ steps.start-ec2-runner.outcome }}" + echo "All runner attempts failed. Please check AWS capacity availability across regions." + exit 1 + elif [ -n "${{ steps.start-ec2-runner.outputs.label }}" ]; then echo "Runner successfully started in region: ${{ steps.start-ec2-runner.outputs.region }}" else - echo "All runner attempts failed. Please check AWS capacity availability across regions." + echo "EC2 runner step succeeded but no runner label was returned" exit 1 fi \ No newline at end of file From 65a246da8d6800624c8fb1acb904198ad471a046 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Thu, 14 Aug 2025 18:33:33 +0000 Subject: [PATCH 14/51] add action trigger script --- trigger-kernel-test.sh | 270 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 270 insertions(+) create mode 100755 trigger-kernel-test.sh diff --git a/trigger-kernel-test.sh b/trigger-kernel-test.sh new file mode 100755 index 00000000000000..82f779ac48a11a --- /dev/null +++ b/trigger-kernel-test.sh @@ -0,0 +1,270 @@ +#!/bin/bash +set -euo pipefail + +# Configuration +REPO_OWNER="$(git config --get remote.origin.url | sed -n 's#.*github\.com[/:]\([^/]*\)/\([^/]*\).*#\1#p')" +REPO_NAME="$(git config --get remote.origin.url | sed -n 's#.*github\.com[/:]\([^/]*\)/\([^/]*\).*#\2#p' | sed 's/\.git$//')" +WORKFLOW_NAME="custom-kernel-test.yml" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +log() { + echo -e "${GREEN}[$(date +'%Y-%m-%d %H:%M:%S')] $1${NC}" +} + +warn() { + echo -e "${YELLOW}[$(date +'%Y-%m-%d %H:%M:%S')] WARNING: $1${NC}" +} + +error() { + echo -e "${RED}[$(date +'%Y-%m-%d %H:%M:%S')] ERROR: $1${NC}" + exit 1 +} + +info() { + echo -e "${BLUE}[$(date +'%Y-%m-%d %H:%M:%S')] INFO: $1${NC}" +} + +usage() { + echo "Usage: $0 [OPTIONS]" + echo "" + echo "Trigger GitHub Actions workflow to test custom kernel" + echo "" + echo "OPTIONS:" + echo " -b, --build-id BUILD_ID Git commit hash of the kernel build (default: current HEAD)" + echo " -i, --instance-type TYPE EC2 instance type (default: m7i.xlarge)" + echo " -t, --image-type TYPE Image type ubuntu-22.04 or ubuntu-24.04 (default: ubuntu-24.04)" + echo " -w, --wait Wait for workflow completion and show logs" + echo " -h, --help Show this help message" + echo "" + echo "Examples:" + echo " $0 # Test kernel at current HEAD" + echo " $0 -b abc123def # Test specific commit" + echo " $0 -w # Wait for completion" + echo " $0 -i m7i.2xlarge -w # Use larger instance and wait" +} + +# Default values +BUILD_ID="$(git rev-parse HEAD)" +INSTANCE_TYPE="m7i.metal-24xl" +IMAGE_TYPE="ubuntu-24.04" +WAIT_FOR_COMPLETION=false + +# Parse command line arguments +while [[ $# -gt 0 ]]; do + case $1 in + -b|--build-id) + BUILD_ID="$2" + shift 2 + ;; + -i|--instance-type) + INSTANCE_TYPE="$2" + shift 2 + ;; + -t|--image-type) + IMAGE_TYPE="$2" + shift 2 + ;; + -w|--wait) + WAIT_FOR_COMPLETION=true + shift + ;; + -h|--help) + usage + exit 0 + ;; + *) + error "Unknown option: $1" + ;; + esac +done + +# Check dependencies +check_dependencies() { + if ! command -v gh >/dev/null 2>&1; then + error "GitHub CLI (gh) not found. Please install it: https://cli.github.com/" + fi + + if ! command -v git >/dev/null 2>&1; then + error "git not found. Please install git." + fi + + # Check if we're in a git repository + if ! git rev-parse --git-dir >/dev/null 2>&1; then + error "Not in a git repository." + fi + + # Check GitHub CLI authentication + if ! gh auth status >/dev/null 2>&1; then + error "GitHub CLI not authenticated. Run 'gh auth login' first." + fi +} + +# Validate build ID +validate_build_id() { + if ! git cat-file -e "$BUILD_ID" 2>/dev/null; then + error "Build ID '$BUILD_ID' is not a valid git commit in this repository." + fi + + # Get the full commit hash + BUILD_ID="$(git rev-parse "$BUILD_ID")" + info "Using build ID: $BUILD_ID" +} + +# Check if kernel artifacts exist in S3 +check_kernel_artifacts() { + log "Checking if kernel artifacts exist for build ID: $BUILD_ID" + + S3_BUCKET="unvariance-kernel-dev" + S3_REGION="us-east-2" + KERNEL_KEY="kernels/${BUILD_ID}/bzImage" + METADATA_KEY="kernels/${BUILD_ID}/metadata.json" + + # Check if AWS CLI is available + if command -v aws >/dev/null 2>&1; then + # Check kernel image + if aws s3 ls "s3://${S3_BUCKET}/${KERNEL_KEY}" --region "$S3_REGION" >/dev/null 2>&1; then + log "✅ Kernel image found in S3" + else + warn "Kernel image not found in S3. You may need to build and upload first:" + warn " ./build-and-upload.sh" + fi + + # Check metadata and get initrd key from it + if aws s3 ls "s3://${S3_BUCKET}/${METADATA_KEY}" --region "$S3_REGION" >/dev/null 2>&1; then + log "✅ Metadata found in S3" + + # Download metadata to get initrd path + local temp_metadata="/tmp/kernel-metadata-${BUILD_ID}.json" + if aws s3 cp "s3://${S3_BUCKET}/${METADATA_KEY}" "$temp_metadata" --region "$S3_REGION" >/dev/null 2>&1; then + # Extract initrd path from metadata + if command -v jq >/dev/null 2>&1; then + INITRD_KEY=$(jq -r '.initrd_path' "$temp_metadata" 2>/dev/null) + else + # Fallback to grep/sed if jq is not available + INITRD_KEY=$(grep -o '"initrd_path"[[:space:]]*:[[:space:]]*"[^"]*"' "$temp_metadata" | sed 's/.*"initrd_path"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/') + fi + + if [[ -n "$INITRD_KEY" && "$INITRD_KEY" != "null" ]]; then + # Check if initrd exists at the path specified in metadata + if aws s3 ls "s3://${S3_BUCKET}/${INITRD_KEY}" --region "$S3_REGION" >/dev/null 2>&1; then + log "✅ Initrd image found in S3: ${INITRD_KEY}" + else + warn "Initrd image not found at path from metadata: ${INITRD_KEY}" + fi + else + warn "Could not extract initrd path from metadata" + fi + + rm -f "$temp_metadata" + else + warn "Failed to download metadata file" + fi + else + warn "Metadata not found in S3. You may need to build and upload first:" + warn " ./build-and-upload.sh" + fi + else + warn "AWS CLI not found, skipping S3 artifact check" + fi +} + +# Trigger the workflow +trigger_workflow() { + log "Triggering GitHub Actions workflow..." + + if [[ -z "$REPO_OWNER" ]] || [[ -z "$REPO_NAME" ]]; then + error "Could not determine repository owner/name from git remote" + fi + + info "Repository: $REPO_OWNER/$REPO_NAME" + info "Workflow: $WORKFLOW_NAME" + info "Build ID: $BUILD_ID" + info "Instance Type: $INSTANCE_TYPE" + info "Image Type: $IMAGE_TYPE" + + # Trigger the workflow + local run_output + run_output=$(gh workflow run "$WORKFLOW_NAME" \ + --repo "$REPO_OWNER/$REPO_NAME" \ + --field "build-id=$BUILD_ID" \ + --field "instance-type=$INSTANCE_TYPE" \ + --field "image-type=$IMAGE_TYPE" 2>&1) || { + error "Failed to trigger workflow: $run_output" + } + + log "✅ Workflow triggered successfully!" + + # Get the run ID + sleep 3 # Give GitHub a moment to create the run + local run_id + run_id=$(gh run list --repo "$REPO_OWNER/$REPO_NAME" --workflow="$WORKFLOW_NAME" --limit=1 --json databaseId --jq '.[0].databaseId') + + if [[ -n "$run_id" ]]; then + info "Workflow run ID: $run_id" + info "View workflow: https://github.com/$REPO_OWNER/$REPO_NAME/actions/runs/$run_id" + + # Store run ID for potential waiting + echo "$run_id" > /tmp/last_workflow_run_id + else + warn "Could not determine run ID" + fi +} + +# Wait for workflow completion +wait_for_completion() { + if [[ ! -f /tmp/last_workflow_run_id ]]; then + error "No workflow run ID found. Cannot wait for completion." + fi + + local run_id + run_id=$(cat /tmp/last_workflow_run_id) + + log "Waiting for workflow completion (run ID: $run_id)..." + + # Wait for the workflow to complete + gh run watch "$run_id" --repo "$REPO_OWNER/$REPO_NAME" || { + warn "Workflow watch failed or workflow failed" + } + + # Show final status + local status + status=$(gh run view "$run_id" --repo "$REPO_OWNER/$REPO_NAME" --json status,conclusion --jq '.status + " - " + .conclusion') + + if [[ "$status" == *"success"* ]]; then + log "✅ Workflow completed successfully!" + else + warn "⚠️ Workflow completed with status: $status" + fi + + # Show logs URL + info "View full logs: https://github.com/$REPO_OWNER/$REPO_NAME/actions/runs/$run_id" +} + +# Main execution +main() { + log "Starting kernel test workflow trigger..." + + check_dependencies + validate_build_id + check_kernel_artifacts + trigger_workflow + + if [[ "$WAIT_FOR_COMPLETION" == true ]]; then + wait_for_completion + else + info "Workflow triggered. Use -w flag to wait for completion, or check GitHub Actions manually." + fi + + log "Done!" +} + +# Run if executed directly +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi \ No newline at end of file From e2aca1641ec6429b9d7f437c95d2b0cec39c4206 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Thu, 14 Aug 2025 18:41:08 +0000 Subject: [PATCH 15/51] add workflow to extract config and initrd information from ubuntu AMI --- .github/workflows/extract-kernel-config.yml | 247 ++++++++++++++++++++ 1 file changed, 247 insertions(+) create mode 100644 .github/workflows/extract-kernel-config.yml diff --git a/.github/workflows/extract-kernel-config.yml b/.github/workflows/extract-kernel-config.yml new file mode 100644 index 00000000000000..58586518b5793a --- /dev/null +++ b/.github/workflows/extract-kernel-config.yml @@ -0,0 +1,247 @@ +name: Extract Kernel and Initramfs Configuration +on: + workflow_dispatch: # Manual trigger + inputs: + instance-type: + description: 'EC2 instance type to use' + required: false + default: 'm7i.metal-24xl' + type: string + image-type: + description: 'Image type to use (ubuntu-22.04 or ubuntu-24.04)' + required: false + default: 'ubuntu-24.04' + type: string + +permissions: + id-token: write # Required for requesting the JWT + contents: read + actions: write + +jobs: + setup-runner: + name: Start EC2 runner for config extraction + runs-on: ubuntu-latest + outputs: + runner-label: ${{ steps.start-runner.outputs.runner-label }} + ec2-instance-id: ${{ steps.start-runner.outputs.ec2-instance-id }} + region: ${{ steps.start-runner.outputs.region }} + steps: + - name: Checkout repository (sparse) + uses: actions/checkout@v4 + with: + sparse-checkout: | + .github/ + sparse-checkout-cone-mode: false + + - name: Start AWS Runner (standard setup) + id: start-runner + uses: ./.github/actions/aws-runner + with: + github-token: ${{ secrets.REPO_ADMIN_TOKEN }} + aws-role-arn: ${{ secrets.AWS_ROLE_ARN }} + iam-role-name: github-actions-runner + instance-type: ${{ inputs.instance-type || 'm7i.metal-24xl' }} + image-type: ${{ inputs.image-type || 'ubuntu-24.04' }} + volume-size: '20' + runner-home-dir: '/tmp' + packages: '["curl", "gzip", "tar"]' + + extract-configs: + needs: [setup-runner] + runs-on: ${{ needs.setup-runner.outputs.runner-label }} + timeout-minutes: 15 + steps: + - name: Verify system info + run: | + echo "================================================" + echo "🖥️ EXTRACTING KERNEL AND INITRAMFS CONFIG" + echo "================================================" + echo "" + echo "📍 Kernel version: $(uname -r)" + echo "📍 Ubuntu version: $(lsb_release -d | cut -f2)" + echo "📍 Architecture: $(uname -m)" + echo "📍 Instance type: ${{ inputs.instance-type || 'm7i.metal-24xl' }}" + echo "⏰ System uptime: $(uptime)" + echo "📅 Date: $(date)" + echo "" + + - name: Extract kernel configuration + run: | + echo "🔍 Extracting kernel configuration..." + mkdir -p /tmp/kernel-config + + # Try to get config from /proc/config.gz first + if [ -f /proc/config.gz ]; then + echo "✅ Found /proc/config.gz" + zcat /proc/config.gz > /tmp/kernel-config/config-from-proc + else + echo "⚠️ /proc/config.gz not available" + fi + + # Get config from /boot/config-* files + if ls /boot/config-* 1> /dev/null 2>&1; then + echo "✅ Found boot config files:" + ls -la /boot/config-* + cp /boot/config-* /tmp/kernel-config/ + else + echo "⚠️ No boot config files found" + fi + + # Get kernel version and other info + uname -r > /tmp/kernel-config/kernel-version.txt + uname -a > /tmp/kernel-config/kernel-info.txt + lsb_release -a > /tmp/kernel-config/ubuntu-version.txt 2>/dev/null || echo "lsb_release not available" > /tmp/kernel-config/ubuntu-version.txt + + echo "📦 Kernel config files extracted:" + ls -la /tmp/kernel-config/ + + - name: Extract initramfs-tools configuration + run: | + echo "🔧 Extracting initramfs-tools configuration..." + mkdir -p /tmp/initramfs-config + + # Copy the entire /etc/initramfs-tools directory + if [ -d /etc/initramfs-tools ]; then + echo "✅ Found /etc/initramfs-tools directory" + cp -r /etc/initramfs-tools /tmp/initramfs-config/ + + echo "📋 Contents of /etc/initramfs-tools:" + find /etc/initramfs-tools -type f -exec ls -la {} \; + echo "" + + # Show key configuration files content + echo "📄 Contents of initramfs.conf:" + if [ -f /etc/initramfs-tools/initramfs.conf ]; then + cat /etc/initramfs-tools/initramfs.conf + else + echo "initramfs.conf not found" + fi + echo "" + + echo "📄 Contents of modules file:" + if [ -f /etc/initramfs-tools/modules ]; then + cat /etc/initramfs-tools/modules + else + echo "modules file not found" + fi + echo "" + + echo "📄 Contents of conf.d directory:" + if [ -d /etc/initramfs-tools/conf.d ]; then + ls -la /etc/initramfs-tools/conf.d/ + find /etc/initramfs-tools/conf.d -type f -exec echo "--- {} ---" \; -exec cat {} \; + else + echo "conf.d directory not found" + fi + + else + echo "❌ /etc/initramfs-tools directory not found" + fi + + # Also check for initramfs-related packages + echo "📦 Installed initramfs-related packages:" + dpkg -l | grep -i initramfs || echo "No initramfs packages found" + + echo "📦 Initramfs config files extracted:" + find /tmp/initramfs-config -type f -exec ls -la {} \; + + - name: Package configuration files as artifacts + run: | + echo "📦 Creating tar archives for artifacts..." + + # Create kernel config archive + cd /tmp + if [ -d kernel-config ] && [ "$(ls -A kernel-config)" ]; then + tar -czf kernel-config.tar.gz kernel-config/ + echo "✅ Created kernel-config.tar.gz ($(du -h kernel-config.tar.gz | cut -f1))" + else + echo "⚠️ No kernel config files to archive" + touch kernel-config.tar.gz + fi + + # Create initramfs config archive + if [ -d initramfs-config ] && [ "$(ls -A initramfs-config)" ]; then + tar -czf initramfs-tools-config.tar.gz initramfs-config/ + echo "✅ Created initramfs-tools-config.tar.gz ($(du -h initramfs-tools-config.tar.gz | cut -f1))" + else + echo "⚠️ No initramfs config files to archive" + touch initramfs-tools-config.tar.gz + fi + + # Create system info summary + cat > system-info.txt << EOF + System Information Summary + ========================= + Date: $(date) + Instance Type: ${{ inputs.instance-type || 'm7i.metal-24xl' }} + Image Type: ${{ inputs.image-type || 'ubuntu-24.04' }} + Kernel Version: $(uname -r) + Kernel Info: $(uname -a) + Ubuntu Version: $(lsb_release -d | cut -f2 2>/dev/null || echo "Unknown") + Architecture: $(uname -m) + Uptime: $(uptime) + + Package Information: + $(dpkg -l | grep -E "(linux-image|linux-headers|initramfs)" | head -20) + EOF + + echo "✅ Created system-info.txt" + + ls -la *.tar.gz *.txt + + - name: Upload kernel config artifact + uses: actions/upload-artifact@v4 + with: + name: kernel-config-${{ inputs.instance-type || 'm7i.metal-24xl' }}-${{ inputs.image-type || 'ubuntu-24.04' }} + path: /tmp/kernel-config.tar.gz + retention-days: 30 + + - name: Upload initramfs config artifact + uses: actions/upload-artifact@v4 + with: + name: initramfs-tools-config-${{ inputs.instance-type || 'm7i.metal-24xl' }}-${{ inputs.image-type || 'ubuntu-24.04' }} + path: /tmp/initramfs-tools-config.tar.gz + retention-days: 30 + + - name: Upload system info artifact + uses: actions/upload-artifact@v4 + with: + name: system-info-${{ inputs.instance-type || 'm7i.metal-24xl' }}-${{ inputs.image-type || 'ubuntu-24.04' }} + path: /tmp/system-info.txt + retention-days: 30 + + - name: Extraction summary + run: | + echo "" + echo "🎯 CONFIGURATION EXTRACTION COMPLETED!" + echo "======================================" + echo "✅ Kernel configuration extracted from AWS AMI" + echo "✅ Initramfs-tools configuration extracted" + echo "✅ System information documented" + echo "" + echo "📋 Artifacts created:" + echo " - kernel-config-${{ inputs.instance-type || 'm7i.metal-24xl' }}-${{ inputs.image-type || 'ubuntu-24.04' }}" + echo " - initramfs-tools-config-${{ inputs.instance-type || 'm7i.metal-24xl' }}-${{ inputs.image-type || 'ubuntu-24.04' }}" + echo " - system-info-${{ inputs.instance-type || 'm7i.metal-24xl' }}-${{ inputs.image-type || 'ubuntu-24.04' }}" + echo "" + echo "💡 You can now use these configurations to make your build-initrd.sh" + echo " script generate initrds that match the AWS AMI configuration!" + + stop-runner: + name: Stop EC2 runner + needs: [setup-runner, extract-configs] + runs-on: ubuntu-latest + if: always() # Run even if previous jobs fail + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Stop AWS Runner + uses: ./.github/actions/aws-runner/cleanup + with: + runner-label: ${{ needs.setup-runner.outputs.runner-label }} + ec2-instance-id: ${{ needs.setup-runner.outputs.ec2-instance-id }} + github-token: ${{ secrets.REPO_ADMIN_TOKEN }} + aws-role-arn: ${{ secrets.AWS_ROLE_ARN }} + aws-region: ${{ needs.setup-runner.outputs.region }} \ No newline at end of file From 24ff93d5efc39a47248e3901155226618285fab4 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Thu, 14 Aug 2025 18:42:27 +0000 Subject: [PATCH 16/51] remove the user data debugging info in action --- .github/actions/aws-runner/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/aws-runner/action.yml b/.github/actions/aws-runner/action.yml index 33e00f7bdf0e6e..6c1719139bcd81 100644 --- a/.github/actions/aws-runner/action.yml +++ b/.github/actions/aws-runner/action.yml @@ -215,7 +215,7 @@ runs: # Start EC2 runner with availability-zones-config - name: Start EC2 runner id: start-ec2-runner - uses: yonch/ec2-github-runner@703f4a7887ec5b30734280ff62b324b9a52d66d2 + uses: yonch/ec2-github-runner@feature/packages-installation continue-on-error: true with: mode: start From 1ccbc9c6e5f07d6a524171412f90492b501a6f46 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Thu, 14 Aug 2025 20:41:10 +0000 Subject: [PATCH 17/51] fix self hosted runner setup --- .github/workflows/extract-kernel-config.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/extract-kernel-config.yml b/.github/workflows/extract-kernel-config.yml index 58586518b5793a..7a60b16ed9f26c 100644 --- a/.github/workflows/extract-kernel-config.yml +++ b/.github/workflows/extract-kernel-config.yml @@ -44,7 +44,6 @@ jobs: instance-type: ${{ inputs.instance-type || 'm7i.metal-24xl' }} image-type: ${{ inputs.image-type || 'ubuntu-24.04' }} volume-size: '20' - runner-home-dir: '/tmp' packages: '["curl", "gzip", "tar"]' extract-configs: From 148b1cc56a2f48cc00b4eaca570697c1430f26cb Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Fri, 15 Aug 2025 12:19:46 +0000 Subject: [PATCH 18/51] add more checks for resctrl support to debug why the test is being skipped --- .github/workflows/custom-kernel-test.yml | 72 +++++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) diff --git a/.github/workflows/custom-kernel-test.yml b/.github/workflows/custom-kernel-test.yml index 708961802b9a1d..4dda2d0ca8dd7e 100644 --- a/.github/workflows/custom-kernel-test.yml +++ b/.github/workflows/custom-kernel-test.yml @@ -274,12 +274,55 @@ jobs: # Check if resctrl is available if [ -d /sys/fs/resctrl ]; then echo "✅ /sys/fs/resctrl exists" + echo "📁 Contents of /sys/fs/resctrl/:" ls -la /sys/fs/resctrl/ + + # Check if resctrl is mounted + echo "" + echo "🔧 Mount status of resctrl:" + mount | grep resctrl || echo "❌ resctrl filesystem not mounted" + + # Check if we can mount resctrl + echo "" + echo "🚀 Attempting to mount resctrl filesystem..." + if mount -t resctrl resctrl /sys/fs/resctrl 2>&1; then + echo "✅ Successfully mounted resctrl!" + echo "📁 Contents after mounting:" + ls -la /sys/fs/resctrl/ + + # Check for info directory and its contents + if [ -d /sys/fs/resctrl/info ]; then + echo "" + echo "📊 Resctrl info directory contents:" + find /sys/fs/resctrl/info -type f -exec echo "📄 {}: $(cat {} 2>/dev/null || echo 'unable to read')" \; + fi + else + echo "❌ Failed to mount resctrl filesystem" + echo "💡 This might indicate missing kernel config or hardware support" + fi else echo "❌ /sys/fs/resctrl not found" fi - # Check kernel modules + # Check kernel config for resctrl support + echo "" + echo "🔧 Kernel configuration for resctrl:" + if [ -f /proc/config.gz ]; then + echo "📋 Resctrl-related config options:" + zcat /proc/config.gz | grep -E "(RESCTRL|RDT)" || echo "No resctrl config found" + elif [ -f /boot/config-$(uname -r) ]; then + echo "📋 Resctrl-related config options:" + grep -E "(RESCTRL|RDT)" /boot/config-$(uname -r) || echo "No resctrl config found" + else + echo "❌ Kernel config not available" + fi + + # Check dmesg for resctrl messages + echo "" + echo "📋 Kernel log messages about resctrl:" + dmesg | grep -i resctrl | head -10 || echo "No resctrl messages in dmesg" + + # Check loaded kernel modules echo "" echo "🔧 Loaded kernel modules related to resctrl/perf:" lsmod | grep -E "(resctrl|perf|msr)" || echo "No relevant modules found" @@ -289,6 +332,33 @@ jobs: echo "🖥️ CPU features related to monitoring:" grep -E "(model name|flags)" /proc/cpuinfo | head -4 + # Look for specific CPU features that enable resctrl + echo "" + echo "🎯 Specific CPU features for resctrl support:" + if grep -q "rdt_a" /proc/cpuinfo; then + echo "✅ RDT allocation support detected" + else + echo "❌ RDT allocation (rdt_a) not found" + fi + + if grep -q "cqm" /proc/cpuinfo; then + echo "✅ Cache Quality Monitoring (cqm) support detected" + else + echo "❌ Cache Quality Monitoring (cqm) not found" + fi + + if grep -qE "(cat_l3|cat_l2)" /proc/cpuinfo; then + echo "✅ Cache Allocation Technology support detected" + else + echo "❌ Cache Allocation Technology not found" + fi + + if grep -q "mba" /proc/cpuinfo; then + echo "✅ Memory Bandwidth Allocation support detected" + else + echo "❌ Memory Bandwidth Allocation not found" + fi + - name: Download test binary from S3 run: | echo "📦 Downloading pre-compiled test binary from S3..." From aba737dbc255e28c4b380c03f8a229ba4174d737 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Fri, 15 Aug 2025 13:27:30 +0000 Subject: [PATCH 19/51] add more sysfs debugging --- .github/workflows/custom-kernel-test.yml | 58 ++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 3 deletions(-) diff --git a/.github/workflows/custom-kernel-test.yml b/.github/workflows/custom-kernel-test.yml index 4dda2d0ca8dd7e..4b171c4ad59f38 100644 --- a/.github/workflows/custom-kernel-test.yml +++ b/.github/workflows/custom-kernel-test.yml @@ -288,13 +288,65 @@ jobs: if mount -t resctrl resctrl /sys/fs/resctrl 2>&1; then echo "✅ Successfully mounted resctrl!" echo "📁 Contents after mounting:" - ls -la /sys/fs/resctrl/ + ls -la /sys/fs/resctrl/ 2>&1 || echo "❌ ls failed on mounted resctrl" + + # Check /sys filesystem health after kexec + echo "" + echo "🔍 Checking /sys filesystem health after kexec:" + echo "📊 /sys mount status:" + mount | grep "/sys " || echo "❌ /sys not properly mounted" + + echo "🔧 /sys filesystem type and options:" + stat -f /sys 2>/dev/null || echo "❌ Cannot stat /sys filesystem" + + # Try remounting /sys to fix potential kexec issues + echo "" + echo "🔄 Attempting to remount /sys to fix kexec issues..." + if mount -o remount /sys 2>&1; then + echo "✅ Successfully remounted /sys" + else + echo "⚠️ Failed to remount /sys, trying to unmount and remount..." + umount /sys 2>/dev/null + if mount -t sysfs none /sys 2>&1; then + echo "✅ Successfully remounted /sys from scratch" + # Remount resctrl after fixing /sys + echo "🔄 Remounting resctrl after /sys fix..." + mount -t resctrl resctrl /sys/fs/resctrl 2>&1 + else + echo "❌ Failed to remount /sys" + fi + fi + + # Test file access with more detailed error reporting + echo "" + echo "🧪 Testing file access with detailed error reporting:" + TEST_FILE="/sys/fs/resctrl/info/L3_MON/mon_features" + if [ -f "$TEST_FILE" ]; then + echo "📄 Testing access to $TEST_FILE:" + echo " File exists: ✅" + echo " File permissions: $(ls -l "$TEST_FILE" 2>/dev/null || echo 'unable to check')" + echo " File size: $(stat -c %s "$TEST_FILE" 2>/dev/null || echo 'unable to check')" + echo " Direct read attempt:" + cat "$TEST_FILE" 2>&1 || echo "❌ Read failed with error code $?" + echo " Hexdump of first 64 bytes:" + hexdump -C "$TEST_FILE" 2>/dev/null | head -4 || echo "❌ hexdump failed" + else + echo "❌ $TEST_FILE does not exist" + fi # Check for info directory and its contents if [ -d /sys/fs/resctrl/info ]; then echo "" - echo "📊 Resctrl info directory contents:" - find /sys/fs/resctrl/info -type f -exec echo "📄 {}: $(cat {} 2>/dev/null || echo 'unable to read')" \; + echo "📊 Resctrl info directory contents (with error codes):" + find /sys/fs/resctrl/info -type f | while read file; do + content=$(cat "$file" 2>&1) + status=$? + if [ $status -eq 0 ]; then + echo "📄 $file: $content" + else + echo "❌ $file: failed with exit code $status, error: $content" + fi + done fi else echo "❌ Failed to mount resctrl filesystem" From 3380247c3b04a6b19c50bd8c123ed3d9e83aaac1 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Sat, 16 Aug 2025 10:32:51 +0000 Subject: [PATCH 20/51] first try and fix /sys, then mount resctrl --- .github/workflows/custom-kernel-test.yml | 53 +++++++++++------------- 1 file changed, 25 insertions(+), 28 deletions(-) diff --git a/.github/workflows/custom-kernel-test.yml b/.github/workflows/custom-kernel-test.yml index 4b171c4ad59f38..98191b429ec3ce 100644 --- a/.github/workflows/custom-kernel-test.yml +++ b/.github/workflows/custom-kernel-test.yml @@ -282,7 +282,31 @@ jobs: echo "🔧 Mount status of resctrl:" mount | grep resctrl || echo "❌ resctrl filesystem not mounted" - # Check if we can mount resctrl + # Check /sys filesystem health after kexec first + echo "" + echo "🔍 Checking /sys filesystem health after kexec:" + echo "📊 /sys mount status:" + mount | grep "/sys " || echo "❌ /sys not properly mounted" + + echo "🔧 /sys filesystem type and options:" + stat -f /sys 2>/dev/null || echo "❌ Cannot stat /sys filesystem" + + # Try remounting /sys to fix potential kexec issues + echo "" + echo "🔄 Attempting to remount /sys to fix kexec issues..." + if mount -o remount /sys 2>&1; then + echo "✅ Successfully remounted /sys" + else + echo "⚠️ Failed to remount /sys, trying to unmount and remount..." + umount /sys 2>/dev/null + if mount -t sysfs none /sys 2>&1; then + echo "✅ Successfully remounted /sys from scratch" + else + echo "❌ Failed to remount /sys" + fi + fi + + # Now attempt to mount resctrl after fixing /sys echo "" echo "🚀 Attempting to mount resctrl filesystem..." if mount -t resctrl resctrl /sys/fs/resctrl 2>&1; then @@ -290,33 +314,6 @@ jobs: echo "📁 Contents after mounting:" ls -la /sys/fs/resctrl/ 2>&1 || echo "❌ ls failed on mounted resctrl" - # Check /sys filesystem health after kexec - echo "" - echo "🔍 Checking /sys filesystem health after kexec:" - echo "📊 /sys mount status:" - mount | grep "/sys " || echo "❌ /sys not properly mounted" - - echo "🔧 /sys filesystem type and options:" - stat -f /sys 2>/dev/null || echo "❌ Cannot stat /sys filesystem" - - # Try remounting /sys to fix potential kexec issues - echo "" - echo "🔄 Attempting to remount /sys to fix kexec issues..." - if mount -o remount /sys 2>&1; then - echo "✅ Successfully remounted /sys" - else - echo "⚠️ Failed to remount /sys, trying to unmount and remount..." - umount /sys 2>/dev/null - if mount -t sysfs none /sys 2>&1; then - echo "✅ Successfully remounted /sys from scratch" - # Remount resctrl after fixing /sys - echo "🔄 Remounting resctrl after /sys fix..." - mount -t resctrl resctrl /sys/fs/resctrl 2>&1 - else - echo "❌ Failed to remount /sys" - fi - fi - # Test file access with more detailed error reporting echo "" echo "🧪 Testing file access with detailed error reporting:" From 50f235eca898e58540e7e7324cb5f64e0192bc83 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Sat, 16 Aug 2025 10:33:12 +0000 Subject: [PATCH 21/51] add JSON output to triggering kernel test --- trigger-kernel-test.sh | 236 +++++++++++++++++++++++++++++++++++------ 1 file changed, 206 insertions(+), 30 deletions(-) diff --git a/trigger-kernel-test.sh b/trigger-kernel-test.sh index 82f779ac48a11a..0852d2e36f3343 100755 --- a/trigger-kernel-test.sh +++ b/trigger-kernel-test.sh @@ -14,20 +14,20 @@ BLUE='\033[0;34m' NC='\033[0m' # No Color log() { - echo -e "${GREEN}[$(date +'%Y-%m-%d %H:%M:%S')] $1${NC}" + echo -e "${GREEN}[$(date +'%Y-%m-%d %H:%M:%S')] $1${NC}" >&2 } warn() { - echo -e "${YELLOW}[$(date +'%Y-%m-%d %H:%M:%S')] WARNING: $1${NC}" + echo -e "${YELLOW}[$(date +'%Y-%m-%d %H:%M:%S')] WARNING: $1${NC}" >&2 } error() { - echo -e "${RED}[$(date +'%Y-%m-%d %H:%M:%S')] ERROR: $1${NC}" + echo -e "${RED}[$(date +'%Y-%m-%d %H:%M:%S')] ERROR: $1${NC}" >&2 exit 1 } info() { - echo -e "${BLUE}[$(date +'%Y-%m-%d %H:%M:%S')] INFO: $1${NC}" + echo -e "${BLUE}[$(date +'%Y-%m-%d %H:%M:%S')] INFO: $1${NC}" >&2 } usage() { @@ -39,13 +39,17 @@ usage() { echo " -b, --build-id BUILD_ID Git commit hash of the kernel build (default: current HEAD)" echo " -i, --instance-type TYPE EC2 instance type (default: m7i.xlarge)" echo " -t, --image-type TYPE Image type ubuntu-22.04 or ubuntu-24.04 (default: ubuntu-24.04)" - echo " -w, --wait Wait for workflow completion and show logs" + echo " -w, --wait Trigger workflow and wait for completion (output JSON)" + echo " --wait-existing Wait for existing latest workflow run (output JSON)" + echo " -o, --output FILE JSON output file (default: kernel-test-results.json)" echo " -h, --help Show this help message" echo "" echo "Examples:" - echo " $0 # Test kernel at current HEAD" + echo " $0 # Trigger workflow only (default)" echo " $0 -b abc123def # Test specific commit" - echo " $0 -w # Wait for completion" + echo " $0 -w # Trigger and wait, output JSON" + echo " $0 --wait-existing # Wait for existing run, output JSON" + echo " $0 -w -o results.json # Custom output file" echo " $0 -i m7i.2xlarge -w # Use larger instance and wait" } @@ -53,7 +57,8 @@ usage() { BUILD_ID="$(git rev-parse HEAD)" INSTANCE_TYPE="m7i.metal-24xl" IMAGE_TYPE="ubuntu-24.04" -WAIT_FOR_COMPLETION=false +MODE="trigger-only" # trigger-only, wait-existing, trigger-and-wait +OUTPUT_FILE="kernel-test-results.json" # Parse command line arguments while [[ $# -gt 0 ]]; do @@ -71,9 +76,17 @@ while [[ $# -gt 0 ]]; do shift 2 ;; -w|--wait) - WAIT_FOR_COMPLETION=true + MODE="trigger-and-wait" shift ;; + --wait-existing) + MODE="wait-existing" + shift + ;; + -o|--output) + OUTPUT_FILE="$2" + shift 2 + ;; -h|--help) usage exit 0 @@ -174,8 +187,8 @@ check_kernel_artifacts() { fi } -# Trigger the workflow -trigger_workflow() { +# Trigger the workflow and return run ID +trigger_workflow_get_id() { log "Triggering GitHub Actions workflow..." if [[ -z "$REPO_OWNER" ]] || [[ -z "$REPO_NAME" ]]; then @@ -188,7 +201,7 @@ trigger_workflow() { info "Instance Type: $INSTANCE_TYPE" info "Image Type: $IMAGE_TYPE" - # Trigger the workflow + # Trigger the workflow and capture output local run_output run_output=$(gh workflow run "$WORKFLOW_NAME" \ --repo "$REPO_OWNER/$REPO_NAME" \ @@ -200,23 +213,66 @@ trigger_workflow() { log "✅ Workflow triggered successfully!" - # Get the run ID - sleep 3 # Give GitHub a moment to create the run + # Extract run URL from output and get run ID + local run_url + run_url=$(echo "$run_output" | grep -o 'https://github.com/.*/actions/runs/[0-9]*' | head -1) + local run_id - run_id=$(gh run list --repo "$REPO_OWNER/$REPO_NAME" --workflow="$WORKFLOW_NAME" --limit=1 --json databaseId --jq '.[0].databaseId') + if [[ -n "$run_url" ]]; then + run_id=$(echo "$run_url" | grep -o '[0-9]*$') + else + # Fallback to API query if URL extraction fails + sleep 3 # Give GitHub a moment to create the run + run_id=$(gh run list --repo "$REPO_OWNER/$REPO_NAME" --workflow="$WORKFLOW_NAME" --limit=1 --json databaseId --jq '.[0].databaseId') + fi if [[ -n "$run_id" ]]; then info "Workflow run ID: $run_id" info "View workflow: https://github.com/$REPO_OWNER/$REPO_NAME/actions/runs/$run_id" - # Store run ID for potential waiting + # Store run ID for backward compatibility echo "$run_id" > /tmp/last_workflow_run_id + + # Return the run ID + echo "$run_id" + else + error "Could not determine run ID" + fi +} + +# Get latest workflow run ID +get_latest_workflow_id() { + local run_id + run_id=$(gh run list --repo "$REPO_OWNER/$REPO_NAME" --workflow="$WORKFLOW_NAME" --limit=1 --json databaseId --jq '.[0].databaseId') + + if [[ -n "$run_id" ]]; then + echo "$run_id" else - warn "Could not determine run ID" + error "No workflow runs found for workflow: $WORKFLOW_NAME" + fi +} + +# Wait for workflow completion and write JSON results +wait_and_write_json() { + local run_id="$1" + local output_path="$2" + + if [[ -z "$run_id" ]]; then + error "No run ID provided to wait function" fi + + # Wait for the workflow to complete (silently) + gh run watch "$run_id" --repo "$REPO_OWNER/$REPO_NAME" >/dev/null 2>&1 || true + + # Generate and write JSON results + local json_results + json_results=$(output_json_results "$run_id") + + echo "$json_results" > "$output_path" + echo "$output_path" # Return the output path for confirmation } -# Wait for workflow completion +# Wait for workflow completion (legacy function for backward compatibility) wait_for_completion() { if [[ ! -f /tmp/last_workflow_run_id ]]; then error "No workflow run ID found. Cannot wait for completion." @@ -246,22 +302,142 @@ wait_for_completion() { info "View full logs: https://github.com/$REPO_OWNER/$REPO_NAME/actions/runs/$run_id" } -# Main execution -main() { - log "Starting kernel test workflow trigger..." +# Output results as JSON +output_json_results() { + local run_id="$1" - check_dependencies - validate_build_id - check_kernel_artifacts - trigger_workflow + # Get workflow run information + local run_info + run_info=$(gh run view "$run_id" --repo "$REPO_OWNER/$REPO_NAME" --json status,conclusion,startedAt,updatedAt,url,headSha,headBranch,event,actor,workflowName,displayTitle) - if [[ "$WAIT_FOR_COMPLETION" == true ]]; then - wait_for_completion - else - info "Workflow triggered. Use -w flag to wait for completion, or check GitHub Actions manually." + # Get job information with steps + local jobs_info + jobs_info=$(gh run view "$run_id" --repo "$REPO_OWNER/$REPO_NAME" --json jobs --jq '.jobs') + + # Try to extract test results and logs from job outputs + local test_results="null" + local all_logs="" + local step_outputs="[]" + + # Get full logs from the workflow run + local logs_output + logs_output=$(gh run view "$run_id" --repo "$REPO_OWNER/$REPO_NAME" --log 2>/dev/null || echo "") + + # If main logs are empty, try getting logs from individual jobs + if [[ -z "$logs_output" ]]; then + # Try to get logs from each job individually + local job_ids + job_ids=$(echo "$jobs_info" | jq -r '.[].databaseId') + + for job_id in $job_ids; do + local job_logs + job_logs=$(gh api "repos/$REPO_OWNER/$REPO_NAME/actions/jobs/$job_id/logs" 2>/dev/null || echo "") + if [[ -n "$job_logs" ]]; then + logs_output="${logs_output}\n--- Job $job_id ---\n${job_logs}" + fi + done fi - log "Done!" + # Extract step outputs for key test steps + if [[ -n "$logs_output" ]]; then + # Store comprehensive logs (last 5000 characters to capture more context) + all_logs=$(echo "$logs_output" | tail -c 5000 | sed 's/"/\\"/g' | tr '\n' '\\n') + + # Look for test-specific output patterns + local test_step_output="" + + # Look for PMU test output + if echo "$logs_output" | grep -q "Run PMU test\|PMU\|perf_event_open"; then + test_step_output=$(echo "$logs_output" | sed -n '/Run PMU test/,/##\[endgroup\]/p' | tail -50) + fi + + # Look for resctrl test output + if echo "$logs_output" | grep -q "resctrl\|Check resctrl support"; then + local resctrl_output + resctrl_output=$(echo "$logs_output" | sed -n '/Check resctrl support/,/##\[endgroup\]/p' | tail -20) + test_step_output="${test_step_output}\n${resctrl_output}" + fi + + # Try to extract structured test results + if echo "$logs_output" | grep -q -E "(PASS|FAIL|SUCCESS|ERROR|Test.*completed|Tests.*run)"; then + local passed_count failed_count + + # Look for various test result patterns + passed_count=$(echo "$logs_output" | grep -c -E "(PASS|SUCCESS|✓|Test.*passed)" || echo "0") + failed_count=$(echo "$logs_output" | grep -c -E "(FAIL|ERROR|✗|Test.*failed)" || echo "0") + + if [[ "$passed_count" -gt 0 ]] || [[ "$failed_count" -gt 0 ]]; then + test_results=$(cat < 0))' 2>/dev/null || echo "[]") + fi + + # Combine all information into final JSON + local final_json + final_json=$(cat < Date: Sat, 16 Aug 2025 11:23:56 +0000 Subject: [PATCH 22/51] add hard reboot kernel test --- .github/workflows/hard-reboot-kernel-test.yml | 460 ++++++++++++++++++ 1 file changed, 460 insertions(+) create mode 100644 .github/workflows/hard-reboot-kernel-test.yml diff --git a/.github/workflows/hard-reboot-kernel-test.yml b/.github/workflows/hard-reboot-kernel-test.yml new file mode 100644 index 00000000000000..66c0a0db41505f --- /dev/null +++ b/.github/workflows/hard-reboot-kernel-test.yml @@ -0,0 +1,460 @@ +name: Hard Reboot Kernel Test - Resctrl PMU +on: + workflow_dispatch: # Manual trigger for testing + inputs: + build-id: + description: 'Build ID for the kernel to test (Git commit hash)' + required: true + type: string + instance-type: + description: 'EC2 instance type to use' + required: false + default: 'm7i.xlarge' + type: string + image-type: + description: 'Image type to use (ubuntu-22.04 or ubuntu-24.04)' + required: false + default: 'ubuntu-24.04' + type: string + +permissions: + id-token: write # Required for requesting the JWT + contents: read + actions: write + +jobs: + setup-runner: + name: Start EC2 runner with custom kernel + runs-on: ubuntu-latest + outputs: + runner-label: ${{ steps.start-runner.outputs.runner-label }} + ec2-instance-id: ${{ steps.start-runner.outputs.ec2-instance-id }} + region: ${{ steps.start-runner.outputs.region }} + steps: + - name: Checkout repository (sparse) + uses: actions/checkout@v4 + with: + sparse-checkout: | + .github/ + tools/testing/selftests/resctrl/ + sparse-checkout-cone-mode: false + + - name: Start AWS Runner with hard reboot + id: start-runner + uses: ./.github/actions/aws-runner + with: + github-token: ${{ secrets.REPO_ADMIN_TOKEN }} + aws-role-arn: ${{ secrets.AWS_ROLE_ARN }} + iam-role-name: github-actions-runner + instance-type: ${{ inputs.instance-type || 'm7i.xlarge' }} + image-type: ${{ inputs.image-type || 'ubuntu-22.04' }} + volume-size: '20' + runner-home-dir: '/tmp' + packages: '["curl", "unzip", "jq"]' + pre-runner-script: | + # Custom Kernel Hard Reboot Setup + # =============================== + # Download our custom built kernel from S3 and install it with GRUB + + echo "Setting up custom kernel hard reboot..." | tee -a /var/log/reboot-setup.log + + # Get kernel artifacts info from workflow input + BUILD_ID="${{ inputs.build-id }}" + S3_BUCKET="unvariance-kernel-dev" + S3_REGION="us-east-2" + METADATA_S3_KEY="kernels/${BUILD_ID}/metadata.json" + + echo "Build ID: $BUILD_ID" | tee -a /var/log/reboot-setup.log + echo "Metadata S3 key: $METADATA_S3_KEY" | tee -a /var/log/reboot-setup.log + + # Install AWS CLI + echo "Installing AWS CLI" | tee -a /var/log/reboot-setup.log + cd /tmp + curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" + unzip awscliv2.zip 2>&1 > /dev/null + sudo ./aws/install 2>&1 > /dev/null + + # Download and parse metadata JSON to get kernel and initrd paths + echo "Downloading metadata.json from S3..." | tee -a /var/log/reboot-setup.log + aws s3 cp "s3://${S3_BUCKET}/${METADATA_S3_KEY}" /tmp/metadata.json --region "$S3_REGION" + + if [[ ! -f /tmp/metadata.json ]]; then + echo "Failed to download metadata.json" | tee -a /var/log/reboot-setup.log + exit 1 + fi + + echo "Contents of metadata.json:" | tee -a /var/log/reboot-setup.log + cat /tmp/metadata.json | tee -a /var/log/reboot-setup.log + + # Extract kernel and initrd paths from metadata + KERNEL_S3_KEY=$(jq -r '.kernel_path // "kernels/'${BUILD_ID}'/bzImage"' /tmp/metadata.json) + INITRD_S3_KEY=$(jq -r '.initrd_path // "kernels/'${BUILD_ID}'/initrd.img"' /tmp/metadata.json) + + echo "Kernel S3 key from metadata: $KERNEL_S3_KEY" | tee -a /var/log/reboot-setup.log + echo "Initrd S3 key from metadata: $INITRD_S3_KEY" | tee -a /var/log/reboot-setup.log + + # Download custom kernel artifacts from S3 + echo "Downloading custom kernel from S3..." | tee -a /var/log/reboot-setup.log + aws s3 cp "s3://${S3_BUCKET}/${KERNEL_S3_KEY}" /tmp/custom-bzImage --region "$S3_REGION" + aws s3 cp "s3://${S3_BUCKET}/${INITRD_S3_KEY}" /tmp/custom-initrd.img --region "$S3_REGION" + + if [[ ! -f /tmp/custom-bzImage ]] || [[ ! -f /tmp/custom-initrd.img ]]; then + echo "Failed to download custom kernel artifacts" | tee -a /var/log/reboot-setup.log + exit 1 + fi + + echo "Custom kernel artifacts downloaded successfully" | tee -a /var/log/reboot-setup.log + ls -la /tmp/custom-* | tee -a /var/log/reboot-setup.log + + # Create persistent directory and download the GitHub Actions runner + mkdir -p /opt/actions-runner + cd /opt/actions-runner + + # Create post-reboot init script that starts the runner + cat > /opt/reboot-runner.sh << 'EOF' + #!/bin/bash + + # Log reboot success + echo "========================================" | tee -a /var/log/reboot-setup.log + echo "✅ CUSTOM KERNEL HARD REBOOT SUCCESSFUL!" | tee -a /var/log/reboot-setup.log + echo "========================================" | tee -a /var/log/reboot-setup.log + echo "Custom kernel version: $(uname -r)" | tee -a /var/log/reboot-setup.log + echo "Build ID: $BUILD_ID" | tee -a /var/log/reboot-setup.log + echo "System time: $(date)" | tee -a /var/log/reboot-setup.log + echo "Hostname: $(hostname)" | tee -a /var/log/reboot-setup.log + echo "Init PID: $$" | tee -a /var/log/reboot-setup.log + echo "========================================" | tee -a /var/log/reboot-setup.log + + # Start the GitHub Actions runner + cd /opt/actions-runner + export RUNNER_ALLOW_RUNASROOT=1 + + # Configure runner with the original GitHub parameters + echo "Configuring GitHub Actions runner after custom kernel reboot..." | tee -a /var/log/reboot-setup.log + + # Read config parameters from the file saved before reboot + if [[ -f /opt/config-params ]]; then + CONFIG_PARAMS="$(cat /opt/config-params)" + echo "Found saved config parameters" | tee -a /var/log/reboot-setup.log + ./config.sh $CONFIG_PARAMS + + # Start the runner - this will wait for jobs + echo "Starting GitHub Actions runner after custom kernel reboot..." | tee -a /var/log/reboot-setup.log + ./run.sh + else + echo "No config parameters found, starting shell..." | tee -a /var/log/reboot-setup.log + exec /bin/bash + fi + EOF + chmod +x /opt/reboot-runner.sh + + echo "Post-reboot runner script created" | tee -a /var/log/reboot-setup.log + + # Create systemd service for GitHub runner (will start after reboot) + cat > /etc/systemd/system/github-runner.service << 'SYSTEMD_EOF' + [Unit] + Description=GitHub Actions Runner after custom kernel reboot + After=basic.target network.target + + [Service] + Type=simple + ExecStart=/opt/reboot-runner.sh + Restart=no + User=root + WorkingDirectory=/opt/actions-runner + StandardOutput=journal+console + StandardError=journal+console + + [Install] + WantedBy=multi-user.target + SYSTEMD_EOF + + # Enable the service to start on boot + systemctl enable github-runner.service + + echo "GitHub runner systemd service created and enabled" | tee -a /var/log/reboot-setup.log + + # Create fake config.sh that triggers reboot with our custom kernel + cat > /tmp/config.sh << 'EOF' + #!/bin/bash + echo "Fake config.sh called - triggering custom kernel reboot" | tee -a /var/log/reboot-setup.log + + # Save the parameters to a file for post-reboot use + echo "$@" > /opt/config-params + echo "Saved config parameters to /opt/config-params" | tee -a /var/log/reboot-setup.log + + # Use our custom kernel + VMLINUZ="/tmp/custom-bzImage" + INITRD="/tmp/custom-initrd.img" + + echo "Installing custom kernel to /boot: $VMLINUZ" | tee -a /var/log/reboot-setup.log + echo "Installing custom initrd to /boot: $INITRD" | tee -a /var/log/reboot-setup.log + + # Install custom kernel to /boot + cp "$VMLINUZ" /boot/vmlinuz-custom + cp "$INITRD" /boot/initrd-custom + + # Get current kernel command line + CURRENT_CMDLINE="$(cat /proc/cmdline)" + echo "Current cmdline: $CURRENT_CMDLINE" | tee -a /var/log/reboot-setup.log + + # Update GRUB to boot custom kernel by default + echo "Updating GRUB configuration..." | tee -a /var/log/reboot-setup.log + sed -i 's/^GRUB_DEFAULT=.*/GRUB_DEFAULT="Custom Kernel"/' /etc/default/grub + echo 'GRUB_DISABLE_SUBMENU=y' >> /etc/default/grub + + # Add custom kernel entry to GRUB + cat >> /etc/grub.d/40_custom << GRUB_EOF + menuentry "Custom Kernel" { + linux /boot/vmlinuz-custom $CURRENT_CMDLINE + initrd /boot/initrd-custom + } + GRUB_EOF + + # Apply GRUB changes + echo "Running update-grub..." | tee -a /var/log/reboot-setup.log + if ! update-grub; then + echo "Failed to update GRUB" | tee -a /var/log/reboot-setup.log + exit 1 + fi + + echo "GRUB updated successfully, rebooting into custom kernel..." | tee -a /var/log/reboot-setup.log + reboot + EOF + + chmod +x /tmp/config.sh + + echo "=== Contents of custom kernel config.sh ===" | tee -a /var/log/reboot-setup.log + cat /tmp/config.sh | tee -a /var/log/reboot-setup.log + echo "=== End of config.sh ===" | tee -a /var/log/reboot-setup.log + + echo "Downloading GitHub Actions runner..." | tee -a /var/log/reboot-setup.log + if ! curl -L -o "actions-runner.tar.gz" "https://github.com/actions/runner/releases/download/v2.327.1/actions-runner-linux-x64-2.327.1.tar.gz"; then + echo "Failed to download GitHub Actions runner" | tee -a /var/log/reboot-setup.log + exit 1 + fi + + if ! tar xzf actions-runner.tar.gz; then + echo "Failed to extract GitHub Actions runner" | tee -a /var/log/reboot-setup.log + exit 1 + fi + + echo "GitHub Actions runner downloaded and extracted successfully" | tee -a /var/log/reboot-setup.log + + # Go back to /tmp where the fake config.sh is located + echo "Changing directory to /tmp to run custom kernel reboot setup" | tee -a /var/log/reboot-setup.log + cd /tmp + + echo "Custom kernel reboot setup complete. Ready to reboot into custom kernel." | tee -a /var/log/reboot-setup.log + pmu-test: + needs: [setup-runner] + runs-on: ${{ needs.setup-runner.outputs.runner-label }} + timeout-minutes: 10 + steps: + - name: Checkout repository (sparse) + uses: actions/checkout@v4 + with: + sparse-checkout: | + tools/testing/selftests/resctrl/ + sparse-checkout-cone-mode: false + + - name: Verify custom kernel boot + run: | + echo "================================================" + echo "🚀 CUSTOM RESCTRL KERNEL BOOTED SUCCESSFULLY! 🚀" + echo "================================================" + echo "" + echo "✅ Successfully running on custom kernel!" + echo "📍 Current kernel: $(uname -r)" + echo "⏰ System uptime: $(uptime)" + echo "📅 Date: $(date)" + echo "👤 Current user: $(whoami)" + echo "🔧 Current PID: $$" + echo "" + + # Check if our reboot setup logs exist + if [ -f /var/log/reboot-setup.log ]; then + echo "✅ Found reboot setup log! Contents:" + echo "----------------------------------------" + tail -50 /var/log/reboot-setup.log + else + echo "⚠️ No reboot setup log found" + fi + + - name: Check resctrl support + run: | + echo "🔍 Checking resctrl support in custom kernel..." + + # Check if resctrl is available + if [ -d /sys/fs/resctrl ]; then + echo "✅ /sys/fs/resctrl exists" + echo "📁 Contents of /sys/fs/resctrl/:" + ls -la /sys/fs/resctrl/ + + # Check if resctrl is mounted + echo "" + echo "🔧 Mount status of resctrl:" + mount | grep resctrl || echo "❌ resctrl filesystem not mounted" + + # Check /sys filesystem health after reboot + echo "" + echo "🔍 Checking /sys filesystem health after reboot:" + echo "📊 /sys mount status:" + mount | grep "/sys " || echo "❌ /sys not properly mounted" + + echo "🔧 /sys filesystem type and options:" + stat -f /sys 2>/dev/null || echo "❌ Cannot stat /sys filesystem" + + # Attempt to mount resctrl + echo "" + echo "🚀 Attempting to mount resctrl filesystem..." + if mount -t resctrl resctrl /sys/fs/resctrl 2>&1; then + echo "✅ Successfully mounted resctrl!" + echo "📁 Contents after mounting:" + ls -la /sys/fs/resctrl/ 2>&1 || echo "❌ ls failed on mounted resctrl" + + # Test file access with more detailed error reporting + echo "" + echo "🧪 Testing file access with detailed error reporting:" + TEST_FILE="/sys/fs/resctrl/info/L3_MON/mon_features" + if [ -f "$TEST_FILE" ]; then + echo "📄 Testing access to $TEST_FILE:" + echo " File exists: ✅" + echo " File permissions: $(ls -l "$TEST_FILE" 2>/dev/null || echo 'unable to check')" + echo " File size: $(stat -c %s "$TEST_FILE" 2>/dev/null || echo 'unable to check')" + echo " Direct read attempt:" + cat "$TEST_FILE" 2>&1 || echo "❌ Read failed with error code $?" + echo " Hexdump of first 64 bytes:" + hexdump -C "$TEST_FILE" 2>/dev/null | head -4 || echo "❌ hexdump failed" + else + echo "❌ $TEST_FILE does not exist" + fi + + # Check for info directory and its contents + if [ -d /sys/fs/resctrl/info ]; then + echo "" + echo "📊 Resctrl info directory contents (with error codes):" + find /sys/fs/resctrl/info -type f | while read file; do + content=$(cat "$file" 2>&1) + status=$? + if [ $status -eq 0 ]; then + echo "📄 $file: $content" + else + echo "❌ $file: failed with exit code $status, error: $content" + fi + done + fi + else + echo "❌ Failed to mount resctrl filesystem" + echo "💡 This might indicate missing kernel config or hardware support" + fi + else + echo "❌ /sys/fs/resctrl not found" + fi + + # Check kernel config for resctrl support + echo "" + echo "🔧 Kernel configuration for resctrl:" + if [ -f /proc/config.gz ]; then + echo "📋 Resctrl-related config options:" + zcat /proc/config.gz | grep -E "(RESCTRL|RDT)" || echo "No resctrl config found" + elif [ -f /boot/config-$(uname -r) ]; then + echo "📋 Resctrl-related config options:" + grep -E "(RESCTRL|RDT)" /boot/config-$(uname -r) || echo "No resctrl config found" + else + echo "❌ Kernel config not available" + fi + + # Check dmesg for resctrl messages + echo "" + echo "📋 Kernel log messages about resctrl:" + dmesg | grep -i resctrl | head -10 || echo "No resctrl messages in dmesg" + + # Check loaded kernel modules + echo "" + echo "🔧 Loaded kernel modules related to resctrl/perf:" + lsmod | grep -E "(resctrl|perf|msr)" || echo "No relevant modules found" + + # Check /proc/cpuinfo for relevant features + echo "" + echo "🖥️ CPU features related to monitoring:" + grep -E "(model name|flags)" /proc/cpuinfo | head -4 + + # Look for specific CPU features that enable resctrl + echo "" + echo "🎯 Specific CPU features for resctrl support:" + if grep -q "rdt_a" /proc/cpuinfo; then + echo "✅ RDT allocation support detected" + else + echo "❌ RDT allocation (rdt_a) not found" + fi + + if grep -q "cqm" /proc/cpuinfo; then + echo "✅ Cache Quality Monitoring (cqm) support detected" + else + echo "❌ Cache Quality Monitoring (cqm) not found" + fi + + if grep -qE "(cat_l3|cat_l2)" /proc/cpuinfo; then + echo "✅ Cache Allocation Technology support detected" + else + echo "❌ Cache Allocation Technology not found" + fi + + if grep -q "mba" /proc/cpuinfo; then + echo "✅ Memory Bandwidth Allocation support detected" + else + echo "❌ Memory Bandwidth Allocation not found" + fi + + - name: Download test binary from S3 + run: | + echo "📦 Downloading pre-compiled test binary from S3..." + BUILD_ID="${{ inputs.build-id }}" + S3_BUCKET="unvariance-kernel-dev" + S3_REGION="us-east-2" + TEST_S3_KEY="kernels/${BUILD_ID}/resctrl_tests" + + aws s3 cp "s3://${S3_BUCKET}/${TEST_S3_KEY}" ./resctrl_tests --region "$S3_REGION" + chmod +x ./resctrl_tests + + if [[ ! -f ./resctrl_tests ]]; then + echo "❌ Failed to download test binary" + exit 1 + fi + + echo "✅ Test binary downloaded successfully" + ls -la ./resctrl_tests + + - name: Run PMU test + run: | + echo "🧪 Running resctrl PMU test..." + + # Run only the PMU test + echo "🏃 Running PMU test specifically..." + ./resctrl_tests -t pmu || echo "⚠️ PMU test failed or not fully implemented" + + - name: Test completion summary + run: | + echo "" + echo "🎯 CUSTOM KERNEL PMU TEST COMPLETED!" + echo "Custom resctrl+perf kernel with PMU support has been tested successfully!" + echo "Build ID: ${{ inputs.build-id }}" + + stop-runner: + name: Stop EC2 runner + needs: [setup-runner, pmu-test] + runs-on: ubuntu-latest + if: always() # Run even if previous jobs fail + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Stop AWS Runner + uses: ./.github/actions/aws-runner/cleanup + with: + runner-label: ${{ needs.setup-runner.outputs.runner-label }} + ec2-instance-id: ${{ needs.setup-runner.outputs.ec2-instance-id }} + github-token: ${{ secrets.REPO_ADMIN_TOKEN }} + aws-role-arn: ${{ secrets.AWS_ROLE_ARN }} + aws-region: ${{ needs.setup-runner.outputs.region }} \ No newline at end of file From c55d8fe41e05654b22e48a9246aace1f68e785a9 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Sat, 16 Aug 2025 11:39:38 +0000 Subject: [PATCH 23/51] reference grub entry with index --- .github/workflows/hard-reboot-kernel-test.yml | 27 ++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/.github/workflows/hard-reboot-kernel-test.yml b/.github/workflows/hard-reboot-kernel-test.yml index 66c0a0db41505f..244ae971fd67ff 100644 --- a/.github/workflows/hard-reboot-kernel-test.yml +++ b/.github/workflows/hard-reboot-kernel-test.yml @@ -200,16 +200,31 @@ jobs: # Update GRUB to boot custom kernel by default echo "Updating GRUB configuration..." | tee -a /var/log/reboot-setup.log - sed -i 's/^GRUB_DEFAULT=.*/GRUB_DEFAULT="Custom Kernel"/' /etc/default/grub + + # Set custom kernel as default (position 0) + sed -i 's/^GRUB_DEFAULT=.*/GRUB_DEFAULT=0/' /etc/default/grub echo 'GRUB_DISABLE_SUBMENU=y' >> /etc/default/grub - # Add custom kernel entry to GRUB - cat >> /etc/grub.d/40_custom << GRUB_EOF - menuentry "Custom Kernel" { - linux /boot/vmlinuz-custom $CURRENT_CMDLINE - initrd /boot/initrd-custom + # Create a custom kernel entry as the first entry by making it 10_custom + cat > /etc/grub.d/10_custom << GRUB_EOF + #!/bin/sh + exec tail -n +3 \$0 + # This file provides an easy way to add custom menu entries. Simply type the + # menu entries you want to add after this comment. Be careful not to change + # the 'exec tail' line above. + menuentry 'Custom Kernel' --class ubuntu --class gnu-linux --class gnu --class os \$menuentry_id_option 'gnulinux-custom' { + recordfail + load_video + gfxmode \$linux_gfx_mode + insmod gzio + if [ x\$grub_platform = xxen ]; then insmod xzio; insmod lzopio; fi + insmod part_gpt + insmod ext2 + linux /boot/vmlinuz-custom $CURRENT_CMDLINE + initrd /boot/initrd-custom } GRUB_EOF + chmod +x /etc/grub.d/10_custom # Apply GRUB changes echo "Running update-grub..." | tee -a /var/log/reboot-setup.log From 9ee5c9eef5a1459a5923b32938a1ba7ce0f70dc0 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Sat, 16 Aug 2025 11:40:43 +0000 Subject: [PATCH 24/51] add job logs to json output --- trigger-kernel-test.sh | 67 +++++++++++++++++++++++++++++------------- 1 file changed, 47 insertions(+), 20 deletions(-) diff --git a/trigger-kernel-test.sh b/trigger-kernel-test.sh index 0852d2e36f3343..31be5dff608d74 100755 --- a/trigger-kernel-test.sh +++ b/trigger-kernel-test.sh @@ -308,7 +308,7 @@ output_json_results() { # Get workflow run information local run_info - run_info=$(gh run view "$run_id" --repo "$REPO_OWNER/$REPO_NAME" --json status,conclusion,startedAt,updatedAt,url,headSha,headBranch,event,actor,workflowName,displayTitle) + run_info=$(gh run view "$run_id" --repo "$REPO_OWNER/$REPO_NAME" --json status,conclusion,startedAt,updatedAt,url,headSha,headBranch,event,workflowName,displayTitle) # Get job information with steps local jobs_info @@ -320,28 +320,52 @@ output_json_results() { local step_outputs="[]" # Get full logs from the workflow run - local logs_output - logs_output=$(gh run view "$run_id" --repo "$REPO_OWNER/$REPO_NAME" --log 2>/dev/null || echo "") - - # If main logs are empty, try getting logs from individual jobs - if [[ -z "$logs_output" ]]; then - # Try to get logs from each job individually - local job_ids - job_ids=$(echo "$jobs_info" | jq -r '.[].databaseId') + local logs_output="" + local job_logs_json="[]" + + # Get logs from individual jobs using gh run view --log --job= + local job_ids job_names + job_ids=$(echo "$jobs_info" | jq -r '.[].databaseId') + job_names=$(echo "$jobs_info" | jq -r '.[].name') + + # Create array to store individual job logs + local job_logs_array=() + + # Get logs for each job + local job_count=0 + while IFS= read -r job_id && IFS= read -r job_name <&3; do + [[ -z "$job_id" ]] && continue - for job_id in $job_ids; do - local job_logs - job_logs=$(gh api "repos/$REPO_OWNER/$REPO_NAME/actions/jobs/$job_id/logs" 2>/dev/null || echo "") - if [[ -n "$job_logs" ]]; then - logs_output="${logs_output}\n--- Job $job_id ---\n${job_logs}" - fi - done + local individual_job_logs + individual_job_logs=$(gh run view --log --job="$job_id" --repo "$REPO_OWNER/$REPO_NAME" 2>/dev/null || echo "") + + if [[ -n "$individual_job_logs" ]]; then + # Add to combined logs output + logs_output="${logs_output}\n=== Job: $job_name (ID: $job_id) ===\n${individual_job_logs}\n" + + # Escape the logs for JSON using base64 encoding to avoid escaping issues + local encoded_logs + encoded_logs=$(echo "$individual_job_logs" | base64 -w 0) + job_logs_array+=("{\"job_id\": \"$job_id\", \"job_name\": \"$job_name\", \"logs_base64\": \"$encoded_logs\"}") + else + # Add placeholder for jobs without logs + job_logs_array+=("{\"job_id\": \"$job_id\", \"job_name\": \"$job_name\", \"logs_base64\": \"$(echo 'No logs available' | base64 -w 0)\"}") + fi + + ((job_count++)) + done < <(echo "$job_ids") 3< <(echo "$job_names") + + # Convert job logs array to JSON + if [[ ${#job_logs_array[@]} -gt 0 ]]; then + local job_logs_string + job_logs_string=$(IFS=,; echo "${job_logs_array[*]}") + job_logs_json="[$job_logs_string]" fi # Extract step outputs for key test steps if [[ -n "$logs_output" ]]; then - # Store comprehensive logs (last 5000 characters to capture more context) - all_logs=$(echo "$logs_output" | tail -c 5000 | sed 's/"/\\"/g' | tr '\n' '\\n') + # Store comprehensive logs (last 5000 characters to capture more context) - encoded as base64 + all_logs=$(echo "$logs_output" | tail -c 5000 | base64 -w 0) # Look for test-specific output patterns local test_step_output="" @@ -367,12 +391,14 @@ output_json_results() { failed_count=$(echo "$logs_output" | grep -c -E "(FAIL|ERROR|✗|Test.*failed)" || echo "0") if [[ "$passed_count" -gt 0 ]] || [[ "$failed_count" -gt 0 ]]; then + local escaped_test_output + escaped_test_output=$(echo "$test_step_output" | sed 's/\\/\\\\/g' | sed 's/"/\\"/g' | sed 's/\t/\\t/g' | sed 's/\r/\\r/g' | tr -d '\000-\010\013-\014\016-\037' | tr '\n' '\\n') test_results=$(cat < Date: Sat, 16 Aug 2025 11:52:36 +0000 Subject: [PATCH 25/51] find root device and boot partition UUIDs, reference with / prefix not /boot in GRUB --- .github/workflows/hard-reboot-kernel-test.yml | 52 ++++++++++++++----- 1 file changed, 39 insertions(+), 13 deletions(-) diff --git a/.github/workflows/hard-reboot-kernel-test.yml b/.github/workflows/hard-reboot-kernel-test.yml index 244ae971fd67ff..d8b60e30afe1e3 100644 --- a/.github/workflows/hard-reboot-kernel-test.yml +++ b/.github/workflows/hard-reboot-kernel-test.yml @@ -190,13 +190,40 @@ jobs: echo "Installing custom kernel to /boot: $VMLINUZ" | tee -a /var/log/reboot-setup.log echo "Installing custom initrd to /boot: $INITRD" | tee -a /var/log/reboot-setup.log - # Install custom kernel to /boot + # Install custom kernel to /boot with proper permissions cp "$VMLINUZ" /boot/vmlinuz-custom cp "$INITRD" /boot/initrd-custom + chmod 644 /boot/vmlinuz-custom /boot/initrd-custom - # Get current kernel command line - CURRENT_CMDLINE="$(cat /proc/cmdline)" - echo "Current cmdline: $CURRENT_CMDLINE" | tee -a /var/log/reboot-setup.log + # Verify files are in place + echo "Verifying kernel files in /boot:" | tee -a /var/log/reboot-setup.log + ls -la /boot/vmlinuz-custom /boot/initrd-custom | tee -a /var/log/reboot-setup.log + + # Get current kernel command line and extract root device + FULL_CMDLINE="$(cat /proc/cmdline)" + echo "Full cmdline: $FULL_CMDLINE" | tee -a /var/log/reboot-setup.log + + ROOT_DEVICE=$(echo "$FULL_CMDLINE" | grep -o 'root=[^ ]*') + if [ -z "$ROOT_DEVICE" ]; then + echo "ERROR: Could not extract root device from kernel command line" | tee -a /var/log/reboot-setup.log + echo "Current cmdline: $FULL_CMDLINE" | tee -a /var/log/reboot-setup.log + exit 1 + fi + + KERNEL_PARAMS=$(echo "$FULL_CMDLINE" | sed 's/BOOT_IMAGE=[^ ]* //' | sed 's/root=[^ ]* //') + CURRENT_CMDLINE="$ROOT_DEVICE ro $KERNEL_PARAMS" + echo "Extracted cmdline: $CURRENT_CMDLINE" | tee -a /var/log/reboot-setup.log + + # Find the boot partition UUID for GRUB + BOOT_UUID=$(findmnt /boot -n -o UUID 2>/dev/null || findmnt / -n -o UUID) + if [ -z "$BOOT_UUID" ]; then + echo "ERROR: Could not determine boot partition UUID" | tee -a /var/log/reboot-setup.log + echo "Boot mount info:" | tee -a /var/log/reboot-setup.log + findmnt /boot 2>&1 | tee -a /var/log/reboot-setup.log + findmnt / 2>&1 | tee -a /var/log/reboot-setup.log + exit 1 + fi + echo "Boot partition UUID: $BOOT_UUID" | tee -a /var/log/reboot-setup.log # Update GRUB to boot custom kernel by default echo "Updating GRUB configuration..." | tee -a /var/log/reboot-setup.log @@ -205,14 +232,12 @@ jobs: sed -i 's/^GRUB_DEFAULT=.*/GRUB_DEFAULT=0/' /etc/default/grub echo 'GRUB_DISABLE_SUBMENU=y' >> /etc/default/grub - # Create a custom kernel entry as the first entry by making it 10_custom - cat > /etc/grub.d/10_custom << GRUB_EOF + # Create a custom kernel entry as the first entry by making it 05_custom (earlier than 10_linux) + cat > /etc/grub.d/05_custom << GRUB_EOF #!/bin/sh exec tail -n +3 \$0 - # This file provides an easy way to add custom menu entries. Simply type the - # menu entries you want to add after this comment. Be careful not to change - # the 'exec tail' line above. - menuentry 'Custom Kernel' --class ubuntu --class gnu-linux --class gnu --class os \$menuentry_id_option 'gnulinux-custom' { + # Custom kernel entry + menuentry 'Custom Kernel' --class ubuntu --class gnu-linux --class gnu --class os \$menuentry_id_option 'gnulinux-custom-kernel' { recordfail load_video gfxmode \$linux_gfx_mode @@ -220,11 +245,12 @@ jobs: if [ x\$grub_platform = xxen ]; then insmod xzio; insmod lzopio; fi insmod part_gpt insmod ext2 - linux /boot/vmlinuz-custom $CURRENT_CMDLINE - initrd /boot/initrd-custom + search --no-floppy --fs-uuid --set=root $BOOT_UUID + linux /vmlinuz-custom $CURRENT_CMDLINE + initrd /initrd-custom } GRUB_EOF - chmod +x /etc/grub.d/10_custom + chmod +x /etc/grub.d/05_custom # Apply GRUB changes echo "Running update-grub..." | tee -a /var/log/reboot-setup.log From 5b270881d82531882720055fd2cfb90ebba14fe9 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Sat, 16 Aug 2025 12:57:33 +0000 Subject: [PATCH 26/51] explicitly umount all filesystems under /sys before kexec --- .github/workflows/custom-kernel-test.yml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/.github/workflows/custom-kernel-test.yml b/.github/workflows/custom-kernel-test.yml index 98191b429ec3ce..5cf249f2e8ff51 100644 --- a/.github/workflows/custom-kernel-test.yml +++ b/.github/workflows/custom-kernel-test.yml @@ -204,6 +204,24 @@ jobs: exit 1 fi + echo "Unmounting /sys filesystem and nested mounts before kexec..." | tee -a /var/log/kexec-setup.log + + # Get all mounted filesystems under /sys in reverse order (deepest first) + SYS_MOUNTS=$(mount | grep '/sys' | awk '{print $3}' | sort -r) + + if [[ -n "$SYS_MOUNTS" ]]; then + echo "Found /sys mounts to unmount:" | tee -a /var/log/kexec-setup.log + echo "$SYS_MOUNTS" | tee -a /var/log/kexec-setup.log + + # Unmount each filesystem + echo "$SYS_MOUNTS" | while read mount_point; do + echo "Unmounting $mount_point..." | tee -a /var/log/kexec-setup.log + umount "$mount_point" 2>&1 | tee -a /var/log/kexec-setup.log || echo "Failed to unmount $mount_point (may be okay)" | tee -a /var/log/kexec-setup.log + done + else + echo "No /sys mounts found to unmount" | tee -a /var/log/kexec-setup.log + fi + echo "Executing kexec into custom kernel..." | tee -a /var/log/kexec-setup.log kexec -e EOF From 49ef9f01cccf752872839c2760c346762445b0cf Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Sat, 16 Aug 2025 13:23:03 +0000 Subject: [PATCH 27/51] capture dmesg output --- .github/workflows/custom-kernel-test.yml | 36 ++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/.github/workflows/custom-kernel-test.yml b/.github/workflows/custom-kernel-test.yml index 5cf249f2e8ff51..d55269f330dcb8 100644 --- a/.github/workflows/custom-kernel-test.yml +++ b/.github/workflows/custom-kernel-test.yml @@ -453,6 +453,42 @@ jobs: echo "🏃 Running PMU test specifically..." ./resctrl_tests -t pmu || echo "⚠️ PMU test failed or not fully implemented" + - name: Capture full dmesg output + run: | + echo "🔍 Full dmesg output (all boot logs):" + echo "------------------------------------" + dmesg || echo "❌ Failed to capture dmesg" + + - name: Capture recent kernel messages + run: | + echo "🔍 Recent kernel messages (last 400 lines):" + echo "-------------------------------------------" + dmesg | tail -400 || echo "❌ Failed to capture recent dmesg" + + # Look specifically for PMU-related messages + echo "" + echo "🎯 PMU-related kernel messages:" + echo "------------------------------" + dmesg | grep -i -E "(pmu|perf|resctrl|rdt|cache.*monitor|cqm)" || echo "ℹ️ No PMU-related messages found" + + # Look for our custom printk messages from PMU code + echo "" + echo "🔧 Custom PMU printk messages:" + echo "-----------------------------" + dmesg | grep -i -E "(resctrl.*pmu|pmu.*resctrl|cache.*occupancy)" || echo "ℹ️ No custom PMU printk messages found" + + # Check for any kernel oops or warnings + echo "" + echo "⚠️ Kernel warnings and errors:" + echo "-----------------------------" + dmesg | grep -i -E "(warning|error|oops|panic|bug|fault)" | tail -20 || echo "✅ No recent kernel warnings/errors found" + + - name: Capture systemd journal + run: | + echo "📖 Recent systemd journal entries (last 400 lines):" + echo "---------------------------------------------------" + journalctl --no-pager -n 400 || echo "❌ Failed to capture journal" + - name: Test completion summary run: | echo "" From a63e889f47a36ecce48f4f8a104fd82c53bd780a Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Sat, 16 Aug 2025 13:46:53 +0000 Subject: [PATCH 28/51] add filesystem mount print --- .github/workflows/custom-kernel-test.yml | 30 ++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/.github/workflows/custom-kernel-test.yml b/.github/workflows/custom-kernel-test.yml index d55269f330dcb8..c0bb42e3dd8506 100644 --- a/.github/workflows/custom-kernel-test.yml +++ b/.github/workflows/custom-kernel-test.yml @@ -285,6 +285,36 @@ jobs: echo "⚠️ No kexec setup log found" fi + - name: Check filesystem mounts + run: | + echo "🗂️ Checking all filesystem mounts..." + echo "" + echo "📋 All mounted filesystems:" + echo "----------------------------" + mount | sort + echo "" + echo "📋 /proc/mounts content:" + echo "------------------------" + cat /proc/mounts | sort + echo "" + echo "🔍 Boot-related mounts:" + echo "----------------------" + mount | grep -E "(boot|efi)" || echo "No boot/EFI mounts found" + echo "" + echo "🔍 Specific filesystem checks:" + echo "------------------------------" + echo "Root filesystem: $(findmnt -n -o SOURCE /)" + echo "Boot filesystem: $(findmnt -n -o SOURCE /boot 2>/dev/null || echo 'not mounted')" + echo "EFI filesystem: $(findmnt -n -o SOURCE /boot/efi 2>/dev/null || echo 'not mounted')" + echo "" + echo "📁 /boot directory contents:" + echo "----------------------------" + ls -la /boot/ 2>/dev/null || echo "Cannot access /boot directory" + echo "" + echo "📁 /boot/efi directory contents:" + echo "--------------------------------" + ls -la /boot/efi/ 2>/dev/null || echo "Cannot access /boot/efi directory" + - name: Check resctrl support run: | echo "🔍 Checking resctrl support in custom kernel..." From 4c6895f3fe204258c9533043c71c62101267e036 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Sat, 16 Aug 2025 13:48:17 +0000 Subject: [PATCH 29/51] try to increase reliability of hard reboot --- .github/workflows/hard-reboot-kernel-test.yml | 34 +++++++++++++++---- 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/.github/workflows/hard-reboot-kernel-test.yml b/.github/workflows/hard-reboot-kernel-test.yml index d8b60e30afe1e3..4cfd17af83c38b 100644 --- a/.github/workflows/hard-reboot-kernel-test.yml +++ b/.github/workflows/hard-reboot-kernel-test.yml @@ -210,9 +210,18 @@ jobs: exit 1 fi + # Extract all kernel parameters and ensure we preserve console settings for serial access KERNEL_PARAMS=$(echo "$FULL_CMDLINE" | sed 's/BOOT_IMAGE=[^ ]* //' | sed 's/root=[^ ]* //') - CURRENT_CMDLINE="$ROOT_DEVICE ro $KERNEL_PARAMS" - echo "Extracted cmdline: $CURRENT_CMDLINE" | tee -a /var/log/reboot-setup.log + + # Ensure we have proper console parameters for AWS bare metal serial access + if ! echo "$KERNEL_PARAMS" | grep -q "console=ttyS0"; then + KERNEL_PARAMS="$KERNEL_PARAMS console=ttyS0,115200n8" + echo "Added serial console parameter for bare metal" | tee -a /var/log/reboot-setup.log + fi + + # Add additional debugging and early console parameters for bare metal + CURRENT_CMDLINE="$ROOT_DEVICE ro $KERNEL_PARAMS earlyprintk=serial,ttyS0,115200" + echo "Final cmdline: $CURRENT_CMDLINE" | tee -a /var/log/reboot-setup.log # Find the boot partition UUID for GRUB BOOT_UUID=$(findmnt /boot -n -o UUID 2>/dev/null || findmnt / -n -o UUID) @@ -236,8 +245,9 @@ jobs: cat > /etc/grub.d/05_custom << GRUB_EOF #!/bin/sh exec tail -n +3 \$0 - # Custom kernel entry - menuentry 'Custom Kernel' --class ubuntu --class gnu-linux --class gnu --class os \$menuentry_id_option 'gnulinux-custom-kernel' { + # Custom kernel entry for debugging + menuentry 'Custom Kernel (Debug)' --class ubuntu --class gnu-linux --class gnu --class os \$menuentry_id_option 'gnulinux-custom-kernel' { + echo 'Loading custom kernel...' recordfail load_video gfxmode \$linux_gfx_mode @@ -245,9 +255,13 @@ jobs: if [ x\$grub_platform = xxen ]; then insmod xzio; insmod lzopio; fi insmod part_gpt insmod ext2 + echo 'Searching for boot partition...' search --no-floppy --fs-uuid --set=root $BOOT_UUID + echo 'Loading kernel /vmlinuz-custom...' linux /vmlinuz-custom $CURRENT_CMDLINE + echo 'Loading initrd /initrd-custom...' initrd /initrd-custom + echo 'Starting custom kernel boot...' } GRUB_EOF chmod +x /etc/grub.d/05_custom @@ -259,8 +273,16 @@ jobs: exit 1 fi - echo "GRUB updated successfully, rebooting into custom kernel..." | tee -a /var/log/reboot-setup.log - reboot + echo "GRUB updated successfully, forcing hardware reboot into custom kernel..." | tee -a /var/log/reboot-setup.log + + # Sync filesystem to ensure all changes are written + sync + sync + sync + + # Force a hard hardware reboot (not just soft reboot) + echo "Forcing immediate hardware reset..." | tee -a /var/log/reboot-setup.log + echo b > /proc/sysrq-trigger || reboot -f EOF chmod +x /tmp/config.sh From 1d29c3824bf7c10d85124d99f68a1fccb7fea381 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Sat, 16 Aug 2025 14:29:48 +0000 Subject: [PATCH 30/51] attempt to increase reliability of github runner after kexec --- .github/workflows/custom-kernel-test.yml | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/.github/workflows/custom-kernel-test.yml b/.github/workflows/custom-kernel-test.yml index c0bb42e3dd8506..0e50297e95e486 100644 --- a/.github/workflows/custom-kernel-test.yml +++ b/.github/workflows/custom-kernel-test.yml @@ -130,6 +130,8 @@ jobs: export RUNNER_ALLOW_RUNASROOT=1 # Configure runner with the original GitHub parameters + echo "Starting GitHub runner at $(date)..." | tee -a /var/log/kexec-setup.log + echo "Checking prerequisites..." | tee -a /var/log/kexec-setup.log echo "Configuring GitHub Actions runner after custom kernel kexec..." | tee -a /var/log/kexec-setup.log # Read config parameters from the file saved before kexec @@ -154,12 +156,16 @@ jobs: cat > /etc/systemd/system/github-runner.service << 'SYSTEMD_EOF' [Unit] Description=GitHub Actions Runner after custom kernel kexec - After=basic.target network.target + After=sysinit.target network-online.target + Wants=network-online.target [Service] Type=simple ExecStart=/persist/kexec-runner.sh - Restart=no + Restart=on-failure + RestartSec=10 + StartLimitBurst=5 + StartLimitIntervalSec=300 User=root WorkingDirectory=/persist/actions-runner StandardOutput=journal+console @@ -222,6 +228,9 @@ jobs: echo "No /sys mounts found to unmount" | tee -a /var/log/kexec-setup.log fi + echo "Syncing filesystems before kexec..." | tee -a /var/log/kexec-setup.log + sync + sleep 2 echo "Executing kexec into custom kernel..." | tee -a /var/log/kexec-setup.log kexec -e EOF From 344ddb2fd4be84db4e968b339e991651ae4269a8 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Sat, 16 Aug 2025 21:18:32 +0000 Subject: [PATCH 31/51] remove invalid attribute from systemd config --- .github/workflows/custom-kernel-test.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/custom-kernel-test.yml b/.github/workflows/custom-kernel-test.yml index 0e50297e95e486..5126d84c12de52 100644 --- a/.github/workflows/custom-kernel-test.yml +++ b/.github/workflows/custom-kernel-test.yml @@ -165,7 +165,6 @@ jobs: Restart=on-failure RestartSec=10 StartLimitBurst=5 - StartLimitIntervalSec=300 User=root WorkingDirectory=/persist/actions-runner StandardOutput=journal+console From b7158aa78524b321f60690d7c73ab0c0fd2bb422 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Sun, 17 Aug 2025 11:12:05 +0000 Subject: [PATCH 32/51] disable s3 progress bar so as to not spam the console --- .github/workflows/custom-kernel-test.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/custom-kernel-test.yml b/.github/workflows/custom-kernel-test.yml index 5126d84c12de52..ea09496cbda47b 100644 --- a/.github/workflows/custom-kernel-test.yml +++ b/.github/workflows/custom-kernel-test.yml @@ -76,7 +76,7 @@ jobs: # Download and parse metadata JSON to get kernel and initrd paths echo "Downloading metadata.json from S3..." | tee -a /var/log/kexec-setup.log - aws s3 cp "s3://${S3_BUCKET}/${METADATA_S3_KEY}" /tmp/metadata.json --region "$S3_REGION" + aws s3 cp "s3://${S3_BUCKET}/${METADATA_S3_KEY}" /tmp/metadata.json --region "$S3_REGION" --no-progress if [[ ! -f /tmp/metadata.json ]]; then echo "Failed to download metadata.json" | tee -a /var/log/kexec-setup.log @@ -95,8 +95,8 @@ jobs: # Download custom kernel artifacts from S3 echo "Downloading custom kernel from S3..." | tee -a /var/log/kexec-setup.log - aws s3 cp "s3://${S3_BUCKET}/${KERNEL_S3_KEY}" /tmp/custom-bzImage --region "$S3_REGION" - aws s3 cp "s3://${S3_BUCKET}/${INITRD_S3_KEY}" /tmp/custom-initrd.img --region "$S3_REGION" + aws s3 cp "s3://${S3_BUCKET}/${KERNEL_S3_KEY}" /tmp/custom-bzImage --region "$S3_REGION" --no-progress + aws s3 cp "s3://${S3_BUCKET}/${INITRD_S3_KEY}" /tmp/custom-initrd.img --region "$S3_REGION" --no-progress if [[ ! -f /tmp/custom-bzImage ]] || [[ ! -f /tmp/custom-initrd.img ]]; then echo "Failed to download custom kernel artifacts" | tee -a /var/log/kexec-setup.log @@ -472,7 +472,7 @@ jobs: S3_REGION="us-east-2" TEST_S3_KEY="kernels/${BUILD_ID}/resctrl_tests" - aws s3 cp "s3://${S3_BUCKET}/${TEST_S3_KEY}" ./resctrl_tests --region "$S3_REGION" + aws s3 cp "s3://${S3_BUCKET}/${TEST_S3_KEY}" ./resctrl_tests --region "$S3_REGION" --no-progress chmod +x ./resctrl_tests if [[ ! -f ./resctrl_tests ]]; then From 070ba8707e3a66cb948d162887a7e4888568387e Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Sun, 17 Aug 2025 11:19:07 +0000 Subject: [PATCH 33/51] try to avoid emergency mode --- .github/workflows/custom-kernel-test.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/custom-kernel-test.yml b/.github/workflows/custom-kernel-test.yml index ea09496cbda47b..b57e45e06a848c 100644 --- a/.github/workflows/custom-kernel-test.yml +++ b/.github/workflows/custom-kernel-test.yml @@ -199,12 +199,16 @@ jobs: echo "Preparing kexec with custom kernel: $VMLINUZ" | tee -a /var/log/kexec-setup.log echo "Preparing kexec with custom initrd: $INITRD" | tee -a /var/log/kexec-setup.log - # Get current kernel command line + # Get current kernel command line and enhance it for better boot reliability CURRENT_CMDLINE="$(cat /proc/cmdline)" echo "Current cmdline: $CURRENT_CMDLINE" | tee -a /var/log/kexec-setup.log + + # Add kernel parameters to improve boot reliability without changing init + ENHANCED_CMDLINE="$CURRENT_CMDLINE systemd.mask=multipathd.service systemd.mask=boot-efi.mount" + echo "Enhanced cmdline: $ENHANCED_CMDLINE" | tee -a /var/log/kexec-setup.log # Kexec into our custom kernel - if ! kexec -l "$VMLINUZ" --initrd="$INITRD" --append="$CURRENT_CMDLINE"; then + if ! kexec -l "$VMLINUZ" --initrd="$INITRD" --append="$ENHANCED_CMDLINE"; then echo "Failed to load custom kernel for kexec" | tee -a /var/log/kexec-setup.log exit 1 fi From 5177693857d09661d515e97801f995765b317705 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Sun, 17 Aug 2025 11:58:43 +0000 Subject: [PATCH 34/51] reduce verbosity of kernel at boot --- .github/workflows/custom-kernel-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/custom-kernel-test.yml b/.github/workflows/custom-kernel-test.yml index b57e45e06a848c..3954efb1fd90ce 100644 --- a/.github/workflows/custom-kernel-test.yml +++ b/.github/workflows/custom-kernel-test.yml @@ -204,7 +204,7 @@ jobs: echo "Current cmdline: $CURRENT_CMDLINE" | tee -a /var/log/kexec-setup.log # Add kernel parameters to improve boot reliability without changing init - ENHANCED_CMDLINE="$CURRENT_CMDLINE systemd.mask=multipathd.service systemd.mask=boot-efi.mount" + ENHANCED_CMDLINE="$CURRENT_CMDLINE systemd.mask=multipathd.service systemd.mask=boot-efi.mount loglevel=4" echo "Enhanced cmdline: $ENHANCED_CMDLINE" | tee -a /var/log/kexec-setup.log # Kexec into our custom kernel From 48b3b52ff656a3e7e91a09dcac2c8f27f98a64ad Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Sun, 17 Aug 2025 11:59:34 +0000 Subject: [PATCH 35/51] fix issue with read-only filesystems --- .github/workflows/custom-kernel-test.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/custom-kernel-test.yml b/.github/workflows/custom-kernel-test.yml index 3954efb1fd90ce..ce27de65fb7411 100644 --- a/.github/workflows/custom-kernel-test.yml +++ b/.github/workflows/custom-kernel-test.yml @@ -114,6 +114,11 @@ jobs: cat > /persist/kexec-runner.sh << 'EOF' #!/bin/bash + # Ensure filesystems are mounted read-write + echo "Ensuring filesystems are read-write..." + mount -o remount,rw / + mount -o remount,rw /persist 2>/dev/null || true + # Log kexec success echo "========================================" | tee -a /var/log/kexec-setup.log echo "✅ CUSTOM KERNEL KEXEC SUCCESSFUL!" | tee -a /var/log/kexec-setup.log From 00ffa7d927e2979e4a57f9f23151b97d493aa736 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Sun, 17 Aug 2025 12:08:36 +0000 Subject: [PATCH 36/51] add early fs remounting after kexec --- .github/workflows/custom-kernel-test.yml | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/.github/workflows/custom-kernel-test.yml b/.github/workflows/custom-kernel-test.yml index ce27de65fb7411..7b315ed9324bd0 100644 --- a/.github/workflows/custom-kernel-test.yml +++ b/.github/workflows/custom-kernel-test.yml @@ -184,6 +184,28 @@ jobs: echo "GitHub runner systemd service created and enabled" | tee -a /var/log/kexec-setup.log + # Create early remount service to fix read-only filesystem issue + cat > /etc/systemd/system/early-remount.service << 'REMOUNT_EOF' + [Unit] + Description=Early remount root filesystem as read-write + Before=cloud-init-local.service + After=systemd-remount-fs.service + DefaultDependencies=no + + [Service] + Type=oneshot + ExecStart=/bin/bash -c 'echo "Early remount: checking filesystem status"; mount | grep " / "; mount -o remount,rw /; echo "Early remount: root filesystem remounted"; mount | grep " / "' + RemainAfterExit=yes + + [Install] + WantedBy=sysinit.target + REMOUNT_EOF + + # Enable the early remount service + systemctl enable early-remount.service + + echo "Early remount service created and enabled" | tee -a /var/log/kexec-setup.log + # Write custom kernel paths for config.sh to read echo "/tmp/custom-bzImage" > /tmp/vmlinuz-path echo "/tmp/custom-initrd.img" > /tmp/initrd-path From ed82af7900874c78012419c88815a186f64b4bf7 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Sun, 17 Aug 2025 12:20:38 +0000 Subject: [PATCH 37/51] enable dhcp in kexec kernel --- .github/workflows/custom-kernel-test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/custom-kernel-test.yml b/.github/workflows/custom-kernel-test.yml index 7b315ed9324bd0..d28ee8249c0fd9 100644 --- a/.github/workflows/custom-kernel-test.yml +++ b/.github/workflows/custom-kernel-test.yml @@ -230,8 +230,8 @@ jobs: CURRENT_CMDLINE="$(cat /proc/cmdline)" echo "Current cmdline: $CURRENT_CMDLINE" | tee -a /var/log/kexec-setup.log - # Add kernel parameters to improve boot reliability without changing init - ENHANCED_CMDLINE="$CURRENT_CMDLINE systemd.mask=multipathd.service systemd.mask=boot-efi.mount loglevel=4" + # Add kernel parameters to improve boot reliability and enable network + ENHANCED_CMDLINE="$CURRENT_CMDLINE systemd.mask=multipathd.service systemd.mask=boot-efi.mount loglevel=4 ip=dhcp" echo "Enhanced cmdline: $ENHANCED_CMDLINE" | tee -a /var/log/kexec-setup.log # Kexec into our custom kernel From 1238fb5b8b53e81ccd2a71a01ea1f841203a1b06 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Fri, 29 Aug 2025 10:25:28 +0000 Subject: [PATCH 38/51] strip modules as part of modules_install --- build-initrd.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build-initrd.sh b/build-initrd.sh index ee3c54a5cbe723..51b7d1a76d0a19 100755 --- a/build-initrd.sh +++ b/build-initrd.sh @@ -129,7 +129,7 @@ create_initrd() { make CC="$CC" -j${NPROC} modules log "Installing/updating kernel modules..." - make INSTALL_MOD_PATH="${PERSISTENT_MODULES}" modules_install + make INSTALL_MOD_PATH="${PERSISTENT_MODULES}" INSTALL_MOD_STRIP=1 modules_install else log "Reusing existing kernel modules from ${PERSISTENT_MODULES}..." fi From 60a50eda820431937b7c1c75bbcef52a852a9274 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Fri, 29 Aug 2025 10:25:50 +0000 Subject: [PATCH 39/51] use dracut to make initrd --- build-initrd.sh | 68 +++++++++++++------------------------------------ 1 file changed, 17 insertions(+), 51 deletions(-) diff --git a/build-initrd.sh b/build-initrd.sh index 51b7d1a76d0a19..8537021b7df293 100755 --- a/build-initrd.sh +++ b/build-initrd.sh @@ -111,11 +111,11 @@ create_initrd() { return 0 fi - log "Creating Ubuntu-compatible initrd using mkinitramfs..." + log "Creating portable initrd using dracut..." - # Check if mkinitramfs is available - if ! command -v mkinitramfs >/dev/null 2>&1; then - error "mkinitramfs not found. Please install initramfs-tools: apt-get install initramfs-tools" + # Check if dracut is available + if ! command -v dracut >/dev/null 2>&1; then + error "dracut not found. Please install dracut: apt-get install dracut" fi # Install kernel modules to persistent location for reuse @@ -141,54 +141,20 @@ create_initrd() { error "Modules directory not found: ${MODULES_DIR}" fi - # Use mkinitramfs with system configuration and temporarily install our modules - log "Creating initramfs using system configuration..." + # Use dracut with explicit driver support for cloud/virtualized environments + log "Generating initramfs with dracut..." - # Temporarily install our modules to the system location - SYSTEM_MODULES_DIR="/lib/modules/${KERNEL_VERSION}" - BACKUP_MODULES="" + # AWS and common cloud/virtualization drivers + AWS_DRIVERS="ena virtio_net virtio_blk nvme ixgbevf virtio_scsi" - # Create /lib/modules directory if it doesn't exist - sudo mkdir -p "/lib/modules" - - # Back up existing modules if they exist - if [[ -d "${SYSTEM_MODULES_DIR}" ]]; then - BACKUP_MODULES="${TEMP_BUILD_DIR}/backup-modules" - log "Backing up existing modules to ${BACKUP_MODULES}..." - sudo mv "${SYSTEM_MODULES_DIR}" "${BACKUP_MODULES}" - fi - - # Symlink our modules to system location (much faster than copying) - log "Temporarily symlinking kernel modules to system location..." - sudo ln -sf "${MODULES_DIR}" "${SYSTEM_MODULES_DIR}" - - # Symlink kernel config to fix mkinitramfs warning - BOOT_CONFIG="/boot/config-${KERNEL_VERSION}" - BACKUP_CONFIG="" - if [[ -f "${BOOT_CONFIG}" ]]; then - BACKUP_CONFIG="${TEMP_BUILD_DIR}/backup-config" - log "Backing up existing config to ${BACKUP_CONFIG}..." - sudo mv "${BOOT_CONFIG}" "${BACKUP_CONFIG}" - fi - log "Temporarily symlinking kernel config to ${BOOT_CONFIG}..." - sudo ln -sf "$(pwd)/.config" "${BOOT_CONFIG}" - - # Use mkinitramfs with system config directory - log "Generating initramfs..." - mkinitramfs -d /etc/initramfs-tools -o "${TEMP_BUILD_DIR}/initrd.img" "${KERNEL_VERSION}" - - # Clean up - remove our symlinks and restore backups if needed - sudo rm -f "${SYSTEM_MODULES_DIR}" - if [[ -n "${BACKUP_MODULES}" && -d "${BACKUP_MODULES}" ]]; then - log "Restoring original modules..." - sudo mv "${BACKUP_MODULES}" "${SYSTEM_MODULES_DIR}" - fi - - sudo rm -f "${BOOT_CONFIG}" - if [[ -n "${BACKUP_CONFIG}" && -f "${BACKUP_CONFIG}" ]]; then - log "Restoring original config..." - sudo mv "${BACKUP_CONFIG}" "${BOOT_CONFIG}" - fi + # Use dracut with no-hostonly for maximum compatibility + log "Including AWS/cloud drivers: ${AWS_DRIVERS}" + dracut --no-hostonly \ + --kmoddir "${MODULES_DIR}" \ + --add-drivers "${AWS_DRIVERS}" \ + --force \ + "${TEMP_BUILD_DIR}/initrd.img" \ + "${KERNEL_VERSION}" # Calculate SHA256 of the created initrd log "Calculating initrd SHA256..." @@ -201,7 +167,7 @@ create_initrd() { cp "${TEMP_BUILD_DIR}/initrd.img" "$cached_initrd" echo "$initrd_sha256" > "$cached_sha256" - log "Ubuntu-compatible initrd created: ${TEMP_BUILD_DIR}/initrd.img" + log "Portable initrd created with dracut: ${TEMP_BUILD_DIR}/initrd.img" log "Initrd SHA256: $initrd_sha256" log "Kernel modules preserved in: ${PERSISTENT_MODULES}" } From 9620f56419eec97fc5902ee340f6034da679d45a Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Fri, 29 Aug 2025 11:53:38 +0000 Subject: [PATCH 40/51] move helper diagrams and docs to scratchpad/ directory --- .../fs-resctrl-diagrams-gemini.md | 0 fs-resctrl-diagrams.md => scratchpad/fs-resctrl-diagrams.md | 0 kernfs-file-handling.md => scratchpad/kernfs-file-handling.md | 0 perf-diagrams.md => scratchpad/perf-diagrams.md | 0 {kernel => scratchpad}/perf_blog.md | 0 resctrl-blog.md => scratchpad/resctrl-blog.md | 0 resctrl-fd.md => scratchpad/resctrl-fd.md | 0 resctrl_internals.md => scratchpad/resctrl_internals.md | 0 x64-resctrl-diagrams.md => scratchpad/x64-resctrl-diagrams.md | 0 9 files changed, 0 insertions(+), 0 deletions(-) rename fs-resctrl-diagrams-gemini.md => scratchpad/fs-resctrl-diagrams-gemini.md (100%) rename fs-resctrl-diagrams.md => scratchpad/fs-resctrl-diagrams.md (100%) rename kernfs-file-handling.md => scratchpad/kernfs-file-handling.md (100%) rename perf-diagrams.md => scratchpad/perf-diagrams.md (100%) rename {kernel => scratchpad}/perf_blog.md (100%) rename resctrl-blog.md => scratchpad/resctrl-blog.md (100%) rename resctrl-fd.md => scratchpad/resctrl-fd.md (100%) rename resctrl_internals.md => scratchpad/resctrl_internals.md (100%) rename x64-resctrl-diagrams.md => scratchpad/x64-resctrl-diagrams.md (100%) diff --git a/fs-resctrl-diagrams-gemini.md b/scratchpad/fs-resctrl-diagrams-gemini.md similarity index 100% rename from fs-resctrl-diagrams-gemini.md rename to scratchpad/fs-resctrl-diagrams-gemini.md diff --git a/fs-resctrl-diagrams.md b/scratchpad/fs-resctrl-diagrams.md similarity index 100% rename from fs-resctrl-diagrams.md rename to scratchpad/fs-resctrl-diagrams.md diff --git a/kernfs-file-handling.md b/scratchpad/kernfs-file-handling.md similarity index 100% rename from kernfs-file-handling.md rename to scratchpad/kernfs-file-handling.md diff --git a/perf-diagrams.md b/scratchpad/perf-diagrams.md similarity index 100% rename from perf-diagrams.md rename to scratchpad/perf-diagrams.md diff --git a/kernel/perf_blog.md b/scratchpad/perf_blog.md similarity index 100% rename from kernel/perf_blog.md rename to scratchpad/perf_blog.md diff --git a/resctrl-blog.md b/scratchpad/resctrl-blog.md similarity index 100% rename from resctrl-blog.md rename to scratchpad/resctrl-blog.md diff --git a/resctrl-fd.md b/scratchpad/resctrl-fd.md similarity index 100% rename from resctrl-fd.md rename to scratchpad/resctrl-fd.md diff --git a/resctrl_internals.md b/scratchpad/resctrl_internals.md similarity index 100% rename from resctrl_internals.md rename to scratchpad/resctrl_internals.md diff --git a/x64-resctrl-diagrams.md b/scratchpad/x64-resctrl-diagrams.md similarity index 100% rename from x64-resctrl-diagrams.md rename to scratchpad/x64-resctrl-diagrams.md From 77891d20eb01ad187b4fddd0fee0e1b2b3a494fc Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Fri, 29 Aug 2025 14:27:36 +0000 Subject: [PATCH 41/51] add more scratchpad docs on resctrl pmu integration --- scratchpad/resctrl-pmu-integration.md | 228 ++++++++++++++++++ .../resctrl-pmu-reference-management.md | 135 +++++++++++ 2 files changed, 363 insertions(+) create mode 100644 scratchpad/resctrl-pmu-integration.md create mode 100644 scratchpad/resctrl-pmu-reference-management.md diff --git a/scratchpad/resctrl-pmu-integration.md b/scratchpad/resctrl-pmu-integration.md new file mode 100644 index 00000000000000..28b9b620e754ef --- /dev/null +++ b/scratchpad/resctrl-pmu-integration.md @@ -0,0 +1,228 @@ +# Resctrl PMU Integration: Reference Management and Race Prevention + +## Overview + +This document describes the implementation of safe reference management when integrating the resctrl monitoring subsystem with the perf PMU. The solution addresses race conditions that can occur when file descriptors for resctrl monitoring files are passed to the PMU while the underlying kernfs nodes are being removed. + +## The Race Condition Problem + +### Scenario +1. User opens a resctrl monitoring file (e.g., `/sys/fs/resctrl/group1/mon_data/mon_L3_00/llc_occupancy`) +2. User passes the file descriptor to perf via `perf_event_open()` +3. **Race window**: Administrator removes the rdtgroup (e.g., `rmdir /sys/fs/resctrl/group1`) +4. PMU attempts to access rdtgroup data through the file descriptor +5. **Potential issues**: Access to freed memory, NULL pointer dereference, inconsistent state + +### Root Cause +- PMU needs access to `struct rdtgroup` for monitoring operations +- Rdtgroup can be deleted while file descriptors remain open +- Without proper synchronization, PMU may access invalid rdtgroup data + +## Solution: rdtgroup_mutex Protection + +### Key Discovery +Resctrl already uses a global mutex (`rdtgroup_mutex`) that provides perfect protection for this use case: + +```c +// From fs/resctrl/rdtgroup.c +static int rdtgroup_rmdir(struct kernfs_node *kn) +{ + rdtgrp = rdtgroup_kn_lock_live(kn); // Takes rdtgroup_mutex + // ... removal operations under mutex protection +} + +struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn) +{ + // ... + mutex_lock(&rdtgroup_mutex); // CRITICAL PROTECTION + // ... +} +``` + +### Protection Mechanism +1. **Node removal holds rdtgroup_mutex**: When an rdtgroup is being removed, the entire operation (including kernfs drain and release callbacks) happens under `rdtgroup_mutex` +2. **PMU access under same mutex**: PMU code holds the same mutex when accessing `of->priv` to get the rdtgroup reference +3. **Mutual exclusion guaranteed**: These operations cannot happen concurrently + +## Implementation Details + +### Data Structures + +#### Updated PMU Event Structure +```c +struct resctrl_pmu_event { + char *mon_path; // Keep for logging context + struct rdtgroup *rdtgrp; // Protected rdtgroup reference +}; +``` + +#### Reference Counting Flow +1. **File Open**: `rdtgroup_mondata_open()` stores rdtgroup in `of->priv` +2. **PMU Init**: `get_rdtgroup_from_fd()` takes additional reference under mutex protection +3. **File Release**: `rdtgroup_mondata_release()` clears `of->priv` under mutex protection +4. **PMU Cleanup**: Releases the additional reference taken during init + +### Core Functions + +#### Protected rdtgroup Access +```c +static struct rdtgroup *get_rdtgroup_from_fd(int fd) +{ + struct file *file; + struct kernfs_open_file *of; + struct rdtgroup *rdtgrp; + + file = fget(fd); + // ... validation ... + + of = kernfs_of(file); + + // CRITICAL: Same mutex that protects node removal + mutex_lock(&rdtgroup_mutex); + + rdtgrp = of->priv; + if (!rdtgrp || (rdtgrp->flags & RDT_DELETED)) { + mutex_unlock(&rdtgroup_mutex); + fput(file); + return ERR_PTR(-ENOENT); + } + + // Take reference while protected + rdtgroup_kn_get(rdtgrp, of->kn); + + mutex_unlock(&rdtgroup_mutex); + fput(file); + return rdtgrp; +} +``` + +#### Reference Management in PMU +```c +// PMU event initialization +static int resctrl_event_init(struct perf_event *event) +{ + // ... validation ... + + rdtgrp = get_rdtgroup_from_fd(fd); // Takes protected reference + if (IS_ERR(rdtgrp)) + return PTR_ERR(rdtgrp); + + // Store reference in PMU event data + resctrl_event->rdtgrp = rdtgrp; + + // ... logging ... + return 0; +} + +// PMU event cleanup +static void resctrl_event_cleanup(struct perf_event *event) +{ + if (resctrl_event->rdtgrp) { + // ... logging ... + rdtgroup_kn_put(rdtgrp, rdtgrp->kn); // Release reference + } + // ... cleanup ... +} +``` + +## Race Condition Analysis + +### Timeline with Protection +1. **T1**: User opens monitoring file → `rdtgroup_mondata_open()` sets `of->priv = rdtgrp` +2. **T2**: User calls `perf_event_open()` → PMU calls `get_rdtgroup_from_fd()` +3. **T3**: Admin calls `rmdir` → `rdtgroup_rmdir()` attempts to acquire `rdtgroup_mutex` +4. **T4**: PMU acquires `rdtgroup_mutex` first → safely accesses `of->priv` → takes reference +5. **T5**: PMU releases `rdtgroup_mutex` +6. **T6**: `rmdir` acquires `rdtgroup_mutex` → drains files → calls release callback +7. **T7**: Release callback sets `of->priv = NULL` (safe - PMU already has its reference) + +### Key Protection Points +- **Mutual exclusion**: Steps T4-T5 and T6-T7 cannot overlap +- **Reference safety**: PMU gets its own reference before release callback runs +- **Memory safety**: rdtgroup memory persists until PMU releases its reference +- **State consistency**: No access to invalid or partially-removed rdtgroup data + +## Error Handling + +### Validation Checks +1. **File descriptor validation**: Ensure fd refers to valid kernfs file +2. **NULL pointer check**: Handle case where `of->priv` was cleared by release +3. **Deletion flag check**: Reject rdtgroups marked with `RDT_DELETED` +4. **Reference acquisition**: Ensure rdtgroup reference is successfully taken + +### Error Scenarios +- **File drained before PMU access**: `of->priv == NULL` → return `-ENOENT` +- **rdtgroup marked for deletion**: `rdtgrp->flags & RDT_DELETED` → return `-ENOENT` +- **Invalid file descriptor**: Not a kernfs monitoring file → return `-EINVAL` + +## Debugging and Monitoring + +### Comprehensive Logging +PMU operations log detailed rdtgroup state for debugging: + +``` +resctrl_pmu: PMU event initialized: fd=5, path=/sys/fs/resctrl/group1/mon_data/mon_L3_00/llc_occupancy +resctrl_pmu: rdtgroup: closid=1, rmid=2, waitcount=2 +resctrl_pmu: type=CTRL, mode=0, flags=0x0 +resctrl_pmu: cpu_mask=0-3 + +resctrl_pmu: PMU event cleanup: path=/sys/fs/resctrl/group1/mon_data/mon_L3_00/llc_occupancy +resctrl_pmu: rdtgroup: closid=1, rmid=2, waitcount=1 +resctrl_pmu: type=CTRL, mode=0, flags=0x1 +resctrl_pmu: cpu_mask=0-3 +``` + +### Monitored Fields +- **closid**: Class of Service ID for cache allocation +- **rmid**: Resource Monitoring ID for performance monitoring +- **waitcount**: Reference count (should decrease after PMU cleanup) +- **type**: RDTCTRL_GROUP (control) vs RDTMON_GROUP (monitor-only) +- **mode**: Resource sharing mode (shareable, exclusive, pseudo-lock) +- **flags**: Status flags including RDT_DELETED +- **cpu_mask**: CPUs assigned to this rdtgroup + +## Benefits of This Approach + +### Simplicity +- **No new locking primitives**: Reuses existing resctrl infrastructure +- **No complex reference counting**: Leverages existing rdtgroup reference mechanisms +- **Minimal code changes**: Focused on the specific race condition + +### Robustness +- **Proven locking pattern**: Uses the same mutex resctrl uses internally +- **Comprehensive protection**: Covers all rdtgroup access scenarios +- **Graceful degradation**: Operations fail cleanly rather than crashing + +### Maintainability +- **Clear ownership**: Well-defined reference lifetimes +- **Extensive logging**: Rich debugging information for troubleshooting +- **Standard patterns**: Follows established kernfs and resctrl conventions + +## Future Considerations + +### Scalability +- Current implementation uses global mutex - acceptable for current use cases +- Future optimizations could use per-rdtgroup locking if needed + +### Extended Integration +- Foundation supports additional PMU event types +- Pattern can be applied to other subsystems needing resctrl integration +- Monitoring data structures remain accessible for advanced PMU operations + +## Related Documentation + +- `resctrl-pmu-reference-management.md`: Original reference management design +- `kernfs-file-handling.md`: Kernfs file lifecycle and reference management +- Linux kernel resctrl documentation: `Documentation/filesystems/resctrl.rst` +- Perf subsystem documentation: `tools/perf/Documentation/` + +## Summary + +The rdtgroup_mutex protection approach provides a robust, simple solution to the race condition between PMU access and resctrl node removal. By leveraging existing resctrl locking infrastructure, the implementation ensures: + +1. **Memory safety**: No access to freed rdtgroup structures +2. **State consistency**: No access to partially-removed rdtgroups +3. **Reference correctness**: Proper reference counting prevents premature cleanup +4. **Error resilience**: Graceful handling of all race condition scenarios + +This design provides a solid foundation for safe resctrl-PMU integration while maintaining the simplicity and reliability expected in kernel subsystems. \ No newline at end of file diff --git a/scratchpad/resctrl-pmu-reference-management.md b/scratchpad/resctrl-pmu-reference-management.md new file mode 100644 index 00000000000000..fe61f9f3d6a357 --- /dev/null +++ b/scratchpad/resctrl-pmu-reference-management.md @@ -0,0 +1,135 @@ +# Resctrl PMU Reference Management + +## Overview + +This document describes the implementation of reference management for resctrl monitoring files when file descriptors are passed to the perf PMU subsystem. The goal is to ensure safe access to rdtgroup structures even after the corresponding kernfs nodes are removed. + +## Problem Statement + +### Background + +Resctrl provides monitoring capabilities through files in the filesystem (e.g., `/sys/fs/resctrl//mon_data//`). Users can open these files and pass the file descriptors to the perf subsystem via `perf_event_open()` for hardware performance counter monitoring. + +### The Issue + +Without proper reference management, the following race condition can occur: + +1. User opens a monitoring file (e.g., LLC occupancy for an rdtgroup) +2. User passes the file descriptor to perf PMU +3. The rdtgroup is removed (kernfs node deleted) +4. PMU tries to access rdtgroup data through the file descriptor +5. **Potential crash or memory corruption** if rdtgroup memory is freed + +### Current Architecture + +- Monitoring files use `kf_mondata_ops` kernfs operations +- Each file's `kn->priv` points to `struct mon_data` containing event metadata +- The associated `rdtgroup` is obtained via `rdtgroup_kn_lock_live(of->kn)` which traverses up to the parent directory + This requires an active reference on the kernfs_node, which is why we're using the open entry in the ops. +- No additional reference counting protects the rdtgroup when file descriptors are held + +## Solution Design + +### Reference Management Strategy + +We implement kernfs `open` and `release` callbacks for monitoring files to: + +1. **During file open**: Take an additional reference on the rdtgroup +2. **During file close/release**: Release the rdtgroup reference +3. **Store the reference**: Use `kernfs_open_file->priv` to store the rdtgroup pointer + +### Key Design Principles + +1. **Minimal disruption**: Leverage existing rdtgroup reference counting mechanisms +2. **Consistent behavior**: All monitoring files get the same reference management +3. **Graceful degradation**: File operations fail cleanly (`-ENODEV`) after node removal +4. **Memory safety**: rdtgroup memory persists until all file references are released + +## Implementation Details + +### Data Structure Usage + +- `kn->priv`: Contains `struct mon_data` (unchanged) +- `of->priv`: Contains `struct rdtgroup *` (new - set during open) + +This allows PMU to access both: +- Event metadata via `kn->priv` +- Valid rdtgroup reference via `of->priv` + +### Function Responsibilities + +#### `rdtgroup_mondata_open()` +- Called when a monitoring file is opened +- Obtains rdtgroup via `rdtgroup_kn_lock_live(of->kn)` +- Takes additional reference on rdtgroup +- Stores rdtgroup pointer in `of->priv` +- Returns 0 on success, negative errno on failure + +#### `rdtgroup_mondata_release()` +- Called when file is closed or during kernfs node draining +- Retrieves rdtgroup from `of->priv` +- Releases the reference taken during open +- Cleans up `of->priv` + +### Integration with Kernfs File Lifecycle + +This implementation follows the established kernfs file handling patterns documented in `kernfs-file-handling.md`: + +1. **Open files don't prevent node removal** - They don't hold active references +2. **Release callbacks are called exactly once** - Either during drain or file close +3. **Memory safety is maintained** - Node structures persist until all references released +4. **Operations fail gracefully** - Return `-ENODEV` when node is deactivated + +## Benefits + +### For PMU Integration +- **Safe file descriptor passing**: PMU can safely hold monitoring file descriptors +- **Guaranteed data access**: rdtgroup remains valid for the lifetime of the file descriptor +- **Clean error handling**: Operations fail predictably rather than crashing + +### For System Stability +- **No memory corruption**: References prevent premature rdtgroup deallocation +- **Consistent behavior**: All monitoring files behave identically +- **Future-proof**: Other subsystems can safely use monitoring file descriptors + +### For Maintainability +- **Minimal code changes**: Leverages existing infrastructure +- **Clear ownership**: File reference management is explicit and documented +- **Standard patterns**: Follows established kernfs conventions + +## Usage Example + +```c +// User space +int fd = open("/sys/fs/resctrl/group1/mon_data/mon_L3_00/llc_occupancy", O_RDONLY); + +// Pass to perf +struct perf_event_attr attr = { + .type = PERF_TYPE_RESCTRL, // Hypothetical PMU type + // ... other fields +}; +int perf_fd = perf_event_open(&attr, fd, cpu, group_fd, flags); + +// Even if the rdtgroup is removed here, the PMU can still safely access +// the rdtgroup via the file descriptor until it's closed +``` + +## Kernel Implementation Flow + +1. **File Open**: `rdtgroup_mondata_open()` takes rdtgroup reference +2. **PMU Access**: PMU uses `of->priv` to access valid rdtgroup +3. **Node Removal**: kernfs calls release callback, but rdtgroup memory persists +4. **File Close**: Final reference released, rdtgroup can be freed + +## Future Considerations + +This reference management system provides a foundation for: +- Additional PMU integrations with resctrl +- Other subsystems that need persistent access to rdtgroup data +- Enhanced monitoring capabilities that require stable resource group references + +## Related Documentation + +- `kernfs-file-handling.md`: General kernfs file lifecycle and reference management +- Linux kernel perf subsystem documentation +- Resctrl filesystem documentation in `Documentation/filesystems/resctrl.rst` \ No newline at end of file From 42ad98733ef27d8b41e700c92d346604733ed65b Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Fri, 29 Aug 2025 14:29:44 +0000 Subject: [PATCH 42/51] add diagrams --- scratchpad/x64-resctrl-diagrams.md | 433 +++++++++++++++++++++++++++-- 1 file changed, 414 insertions(+), 19 deletions(-) diff --git a/scratchpad/x64-resctrl-diagrams.md b/scratchpad/x64-resctrl-diagrams.md index 1a78a492b2a47f..79f483a4462cf8 100644 --- a/scratchpad/x64-resctrl-diagrams.md +++ b/scratchpad/x64-resctrl-diagrams.md @@ -130,38 +130,127 @@ graph TD style O fill:#fce4ec ``` -## 4. Monitoring Data Read Flow +## 4. CPU Selection for Monitoring Data Read -This diagram shows how monitoring data flows from MSRs through the architecture layer to the filesystem layer. +This diagram focuses on the CPU selection logic from rdtgroup_mondata_show to mon_event_count, highlighting the decision making based on CPU state (nohz_full). ```mermaid graph TD - A[rdtgroup_mondata_show] --> C[Parse event/domain from kernfs] - C --> D[resctrl_arch_rmid_read] + A[rdtgroup_mondata_show] --> B[mon_event_read with cpumask] - D --> E[logical_rmid_to_physical_rmid] - E --> F[__rmid_read_phys] + B --> C[cpumask_any_housekeeping with RESCTRL_PICK_ANY_CPU] + C --> D[Selected CPU from domain cpumask] - F --> G[wrmsrl MSR_IA32_QM_EVTSEL] - F --> H[rdmsrl MSR_IA32_QM_CTR] - F --> I{Counter valid?} - I -->|Error bit set| J[Return -EIO] - I -->|Unavailable| K[Return -EINVAL] - I -->|Valid| L[Process counter value] + D --> E{tick_nohz_full_cpu?} + E -->|Yes - CPU is nohz_full| F[smp_call_function_any] + E -->|No - Regular CPU| G[smp_call_on_cpu] - L --> M[get_corrected_mbm_count] - L --> N[Apply mon_scale factor] - L --> O[Handle MBM overflow] + F --> H[Any CPU in cpumask can run mon_event_count] + G --> I[Specific CPU runs smp_mon_event_count] - M --> P[Apply hardware corrections] - O --> Q[Update arch_mbm_state] + H --> J[mon_event_count on selected CPU] + I --> K[smp_mon_event_count wrapper] + K --> J + style A fill:#e1f5fe + style B fill:#e8f5e8 + style C fill:#fff3e0 + style E fill:#f3e5f5 + style F fill:#ffebee + style G fill:#f1f8e9 +``` + +## 5. Perf CPU Selection for Counter Reads + +This diagram shows the CPU selection logic when reading perf counter values via read() syscall. + +```mermaid +graph TD + A[perf_read syscall] --> B[__perf_event_read_cpu] + + B --> C{PMU has READ_SCOPE capability?} + C -->|Yes| D{Current CPU in PMU scope?} + C -->|No| E{PMU has READ_ACTIVE_PKG capability?} + + D -->|Yes| F[Read from current CPU] + D -->|No| E + + E -->|Yes| G{Current CPU same package?} + E -->|No| H[Must use event's original CPU] + + G -->|Yes| F + G -->|No| H + + F --> I[Direct PMU read call] + H --> J[smp_call_function_single to event_cpu] + J --> K[__perf_event_read on target CPU] + + style A fill:#e1f5fe + style B fill:#e8f5e8 + style C fill:#fff3e0 + style F fill:#f1f8e9 + style H fill:#f3e5f5 + style J fill:#ffebee +``` + +## 6. Monitoring Data Read Flow + +This diagram shows how monitoring data flows from MSRs through the architecture layer to the filesystem layer. Key functions: +- `rdtgroup_mondata_show()` in `fs/resctrl/ctrlmondata.c:588` +- `mon_event_read()` in `fs/resctrl/ctrlmondata.c:549` +- `mon_event_count()` in `fs/resctrl/monitor.c:455` +- `resctrl_arch_rmid_read()` in `arch/x86/kernel/cpu/resctrl/monitor.c:227` +- `__rmid_read_phys()` in `arch/x86/kernel/cpu/resctrl/monitor.c:141` + +```mermaid +graph TD + A[rdtgroup_mondata_show] --> B[Extract event/domain from kernfs node] + B --> C[Create struct rmid_read] + C --> D[mon_event_read] + + D --> E[Pick CPU from domain cpumask] + E --> F[smp_call_on_cpu or smp_call_function_any] + F --> G[mon_event_count] + + G --> H[__mon_event_count] + H --> I[resctrl_arch_reset_rmid if first read] + H --> J[resctrl_arch_rmid_read] + + J --> K[logical_rmid_to_physical_rmid] + K --> L{SNC enabled?} + L -->|No| M[physical_rmid = logical_rmid] + L -->|Yes| N[physical_rmid = logical_rmid + node_offset] + + M --> O[__rmid_read_phys] + N --> O + + O --> P[wrmsr MSR_IA32_QM_EVTSEL with eventid + prmid] + P --> Q[rdmsrq MSR_IA32_QM_CTR] + Q --> R{Check error bits} + + R -->|Bit 63 RMID_VAL_ERROR| S[Return -EIO] + R -->|Bit 62 RMID_VAL_UNAVAIL| T[Return -EINVAL] + R -->|Valid| U{MBM Event?} + + U -->|Yes| V[get_arch_mbm_state] + U -->|No| W[Apply mon_scale factor] + + V --> X[mbm_overflow_count - Handle counter width] + X --> Y[get_corrected_mbm_count - Apply corrections] + Y --> Z[Update arch_mbm_state prev_msr and chunks] + Z --> AA[Apply mon_scale factor] + + W --> BB[Return final value] + AA --> BB style A fill:#e1f5fe style D fill:#e8f5e8 style F fill:#fff3e0 - style G fill:#f3e5f5 - style H fill:#f3e5f5 + style J fill:#f3e5f5 + style O fill:#f1f8e9 + style P fill:#fce4ec + style Q fill:#fce4ec + style V fill:#e1f5fe ``` ## 4. MSR Access and Hardware Interface @@ -188,6 +277,9 @@ graph TD D --> N[MSR_IA32_EVT_CFG_BASE 0xc0000400] E --> O[Event ID + RMID selection] + O --> OA[QOS_L3_OCCUP_EVENT_ID 0x01] + O --> OB[QOS_L3_MBM_TOTAL_EVENT_ID 0x02] + O --> OC[QOS_L3_MBM_LOCAL_EVENT_ID 0x03] F --> P[Counter value read] G --> Q[CLOSID/RMID association] @@ -683,6 +775,295 @@ graph TD style J fill:#fce4ec ``` +## 18. RMID Limbo Mechanism Flow + +This diagram shows how freed RMIDs are managed through the limbo system to ensure metrics have drained before reuse. + +```mermaid +graph TD + A[free_rmid] --> B{LLC occupancy monitoring enabled?} + B -->|No| C[list_add_tail rmid_free_lru] + B -->|Yes| D[add_rmid_to_limbo] + + D --> E[Mark RMID busy in rmid_busy_llc] + E --> F[list_add_tail rmid_limbo_lru] + F --> G[atomic_inc rmid_limbo_count] + + H[cqm_handle_limbo - Periodic worker] --> I[__check_limbo] + I --> J[for each RMID in rmid_limbo_lru] + J --> K[resctrl_arch_rmid_read QOS_L3_OCCUP_EVENT_ID] + + K --> L{LLC occupancy < threshold?} + L -->|No| M[Keep in limbo] + L -->|Yes| N[limbo_release_entry] + + N --> O[Clear RMID in rmid_busy_llc] + O --> P[list_move_tail to rmid_free_lru] + P --> Q[atomic_dec rmid_limbo_count] + + R[Configurable Parameters] --> S[resctrl_rmid_realloc_threshold] + R --> T[CQM_LIMBOCHECK_INTERVAL = 1000ms] + + U[Force Release Option] --> V[limbo_release_entry - Force cleanup] + V --> W[Used during resource cleanup] + + X[Tracing Support] --> Y[trace_mon_llc_occupancy_limbo] + Y --> Z[Debug RMID occupancy values] + + style A fill:#e1f5fe + style D fill:#fff3e0 + style H fill:#e8f5e8 + style I fill:#f3e5f5 + style K fill:#f1f8e9 + style N fill:#fce4ec + style R fill:#e1f5fe + style U fill:#ffebee + style X fill:#e8f5e8 +``` + +## 19. RMID Allocation and Lifecycle Management + +This diagram shows the complete RMID lifecycle from allocation through limbo to reuse. + +```mermaid +graph TD + A[rmid_alloc] --> B{rmid_free_lru empty?} + B -->|Yes| C[Return -ENOSPC] + B -->|No| D[list_first_entry rmid_free_lru] + + D --> E[list_del RMID from free list] + E --> F[Return allocated RMID] + + G[Resource Group Usage] --> H[RMID active monitoring] + H --> I[Tasks assigned CLOSID/RMID] + I --> J[MSR_IA32_PQR_ASSOC updates] + + K[Resource Group Deletion] --> L[free_rmid] + L --> M{LLC occupancy enabled?} + M -->|No| N[Immediate reuse - add to rmid_free_lru] + M -->|Yes| O[Limbo processing - add_rmid_to_limbo] + + O --> P[Domain Processing] + P --> Q[for each domain in L3 mon_domains] + Q --> R[set_bit rmid, d->rmid_busy_llc] + + S[Periodic Limbo Worker] --> T[mod_delayed_work cqm_limbo] + T --> U[Check interval: 1000ms] + U --> V[__check_limbo for each domain] + + V --> W[Read LLC occupancy for each limbo RMID] + W --> X{Occupancy < threshold?} + X -->|Yes| Y[Move to free list] + X -->|No| Z[Schedule next check] + + Y --> AA[Available for allocation] + Z --> AB[Remain in limbo] + + style A fill:#e1f5fe + style K fill:#e8f5e8 + style O fill:#fff3e0 + style P fill:#f3e5f5 + style S fill:#f1f8e9 + style V fill:#fce4ec + style Y fill:#e1f5fe +``` + +## 20. Complete rmdir Operation Flow: From Syscall to RDT_DELETED and RMID Limbo + +This diagram shows the complete call flow from the rmdir syscall on an rdtgroup through to where the group is marked as RDT_DELETED and the RMID is put on the limbo list. This flow is critical for understanding how safe measurement readings can occur even after deletion, since the RMID goes onto the limbo list ensuring gradual metric drainage. + +```mermaid +graph TD + A[rmdir syscall] --> B[vfs_rmdir - fs/namei.c] + B --> C[kernfs_iop_rmdir - fs/kernfs/dir.c:1274] + C --> D[rdtgroup_rmdir - fs/resctrl/rdtgroup.c:3850] + + D --> E[rdtgroup_kn_lock_live] + E --> F{Group Type?} + + F -->|Control Group| G[rdtgroup_rmdir_ctrl - Line 3803] + F -->|Monitor Group| H[rdtgroup_rmdir_mon - Line 3755] + + G --> I[Move tasks to parent group] + G --> J[Set flags = RDT_DELETED - Line 3796] + G --> K[update_closid_rmid] + G --> L[rdtgroup_ctrl_remove] + + H --> M[Move tasks to parent group] + H --> N[Set flags = RDT_DELETED - Line 3780] + H --> O[update_closid_rmid] + H --> P[free_rmid - fs/resctrl/monitor.c:320] + + L --> Q[closid_free] + L --> R[kernfs_remove] + + P --> S{LLC occupancy monitoring enabled?} + S -->|No| T[list_add_tail rmid_free_lru - Immediate reuse] + S -->|Yes| U[add_rmid_to_limbo - Line 340] + + U --> V[add_rmid_to_limbo - Line 289-318] + V --> W[Mark RMID busy in all domains] + V --> X[set_bit idx, d->rmid_busy_llc] + V --> Y[entry->busy++] + V --> Z[rmid_limbo_count++ - Line 315] + V --> AA[Setup limbo handler if needed] + + AA --> BB[cqm_setup_limbo_handler] + BB --> CC[schedule_delayed_work cqm_limbo] + + DD[Async: cqm_handle_limbo - Line 651] --> EE[__check_limbo] + EE --> FF[Read LLC occupancy for limbo RMIDs] + FF --> GG{Occupancy < threshold?} + GG -->|Yes| HH[limbo_release_entry] + GG -->|No| II[Schedule next check] + + HH --> JJ[Clear RMID busy bit] + HH --> KK[Move to rmid_free_lru] + HH --> LL[rmid_limbo_count--] + + MM[Safety Mechanism] --> NN[RDT_DELETED flag prevents new operations] + NN --> OO[RMID in limbo ensures safe measurements] + OO --> PP[Gradual metric drainage before reuse] + + KK[Reference Management] --> QQ[rdtgroup_kn_unlock] + QQ --> RR[rdtgroup_kn_put] + RR --> SS{waitcount == 0 && RDT_DELETED?} + SS -->|Yes| TT[rdtgroup_remove - Final cleanup] + SS -->|No| UU[Keep structure alive] + + style A fill:#e1f5fe + style D fill:#e8f5e8 + style J fill:#ffebee + style N fill:#ffebee + style P fill:#fff3e0 + style U fill:#f3e5f5 + style V fill:#f1f8e9 + style DD fill:#fce4ec + style MM fill:#e8f5e8 + style SS fill:#fff3e0 +``` + +## 20a. Compact rmdir Flow for Presentation Slides + +This is a simplified version of the rmdir flow optimized for presentation slides - focusing on monitor group deletion with LLC occupancy monitoring enabled, including active reference draining. + +```mermaid +graph TD + A[rmdir syscall] --> B[vfs_rmdir] + B --> C[kernfs_iop_rmdir] + C --> D[rdtgroup_rmdir] + + D --> E[rdtgroup_kn_lock_live] + E --> F[waitcount++ & break_active_protection] + F --> G[rdtgroup_rmdir_mon] + + G --> H[rdt_move_group_tasks] + G --> I[Set flags = RDT_DELETED] + G --> J[update_closid_rmid] + G --> K[free_rmid] + + K --> L[add_rmid_to_limbo] + L --> M[Mark RMID busy] + L --> N[rmid_limbo_count++] + L --> O[Schedule limbo worker] + + P[cqm_handle_limbo worker] --> Q[Check LLC occupancy] + Q --> R[Move to free list when drained] + + S[rdtgroup_kn_unlock] --> T[waitcount-- & unbreak_active_protection] + T --> U{waitcount == 0 & RDT_DELETED?} + U -->|Yes| V[Final cleanup] + U -->|No| W[Keep alive for other refs] + + style A fill:#e1f5fe + style D fill:#e8f5e8 + style F fill:#fff3e0 + style I fill:#ffebee + style K fill:#f3e5f5 + style L fill:#f1f8e9 + style P fill:#fce4ec + style U fill:#fff3e0 +``` + +## 20b. Reference Counting Flow: open(), close(), and perf_event_open() to rdtgroup_get/put + +This diagram shows how file operations and PMU integration connect to rdtgroup reference counting. + +```mermaid +graph TD + %% File Operations Path + A1[open mon_data file] --> B1[kernfs_fop_open] + B1 --> C1[rdtgroup_mondata_open] + C1 --> D1[rdtgroup_get] + + %% Close Operations Path + E1[close fd] --> F1[kernfs_fop_release] + F1 --> G1[rdtgroup_mondata_release] + G1 --> H1[rdtgroup_put] + + %% PMU Integration Path + A2[perf_event_open with fd] --> B2[resctrl_event_init] + B2 --> C2[get_rdtgroup_from_fd] + C2 --> D2[rdtgroup_get] + + %% PMU Close Path + E2[perf event close] --> F2[resctrl_event_del] + F2 --> G2[rdtgroup_put] + + style A1 fill:#e1f5fe + style E1 fill:#ffebee + style A2 fill:#e8f5e8 + style E2 fill:#ffebee + style D1 fill:#f3e5f5 + style H1 fill:#f1f8e9 + style D2 fill:#f3e5f5 + style G2 fill:#f1f8e9 +``` + +## 21. RMID Limbo Data Structures and Organization + +This diagram shows the key data structures involved in RMID limbo management. + +```mermaid +graph TD + A[RMID Management Data Structures] --> B[Global Lists] + A --> C[Per-Domain State] + A --> D[Worker Infrastructure] + + B --> E[rmid_free_lru - Ready for allocation] + B --> F[rmid_limbo_lru - Draining metrics] + B --> G[rmid_limbo_count - Atomic counter] + + C --> H[struct rdt_mon_domain] + H --> I[rmid_busy_llc - Bitmap of limbo RMIDs] + H --> J[cqm_limbo - Delayed work struct] + H --> K[cqm_work_cpu - CPU for limbo worker] + + D --> L[cqm_handle_limbo - Worker function] + D --> M[__check_limbo - Core logic] + D --> N[limbo_release_entry - Release function] + + O[Configuration] --> P[resctrl_rmid_realloc_threshold] + P --> Q[Default: resctrl_rmid_realloc_limit] + P --> R[Sysfs configurable] + + O --> S[CQM_LIMBOCHECK_INTERVAL] + S --> T[Fixed: 1000 milliseconds] + + U[Synchronization] --> V[domain_list_lock mutex] + V --> W[Protects domain list operations] + + U --> X[rmid_busy_llc bitmap] + X --> Y[Atomic bit operations] + + style A fill:#e1f5fe + style B fill:#e8f5e8 + style C fill:#fff3e0 + style D fill:#f3e5f5 + style O fill:#f1f8e9 + style U fill:#fce4ec +``` + ## Key Integration Points The diagrams show several critical integration points between the filesystem and architecture layers: @@ -696,6 +1077,7 @@ The diagrams show several critical integration points between the filesystem and 7. **Task Migration**: When groups are reparented, all associated tasks are moved and their MSRs are updated atomically 8. **Pseudo-Lock Integration**: Mode transitions and cache loading operations bridge filesystem control with hardware-specific cache manipulation 9. **Performance Measurement**: Provides comprehensive measurement capabilities using hardware performance counters and tracing +10. **RMID Limbo Management**: Freed RMIDs are managed through a limbo system to ensure metrics have drained before reuse, preventing measurement interference ## Rename Operation Characteristics @@ -720,4 +1102,17 @@ The pseudo-lock feature has several key characteristics: - **Thread Safety**: Uses kernel threads and proper synchronization for cache loading operations - **Device Interface**: Provides character device and debugfs interfaces for user access and debugging +## RMID Limbo Operation Characteristics + +The RMID limbo mechanism has several key characteristics: + +- **Metric Draining**: Prevents immediate reuse of freed RMIDs to allow cache occupancy metrics to drain below threshold +- **Configurable Threshold**: Uses `resctrl_rmid_realloc_threshold` (configurable via sysfs) to determine when RMIDs can be reused +- **Periodic Processing**: Delayed work runs every 1000ms (`CQM_LIMBOCHECK_INTERVAL`) to check limbo RMIDs +- **Per-Domain Tracking**: Uses `rmid_busy_llc` bitmaps to track which RMIDs are in limbo per L3 cache domain +- **Atomic Operations**: Uses atomic counters and bit operations for thread-safe RMID state management +- **Force Release**: Supports forced cleanup during resource teardown to prevent resource leaks +- **Tracing Support**: Includes `trace_mon_llc_occupancy_limbo()` events for debugging and monitoring +- **Measurement Interference Prevention**: Ensures that reused RMIDs don't carry residual cache occupancy from previous usage + These flows demonstrate how the ResCtrl subsystem maintains a clean separation between filesystem operations and hardware-specific implementation details while ensuring proper synchronization and error handling throughout the stack. \ No newline at end of file From 47c9e733ffa37efc66f4e607e517b72751963357 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Sat, 16 Aug 2025 18:52:44 +0000 Subject: [PATCH 43/51] save rdtgroup reference in mon_data open files --- fs/resctrl/internal.h | 2 ++ fs/resctrl/rdtgroup.c | 62 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 0a1eedba2b03ad..14ed2b3b8efb8c 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -343,6 +343,8 @@ void resctrl_mon_resource_exit(void); void mon_event_count(void *info); int rdtgroup_mondata_show(struct seq_file *m, void *arg); +int rdtgroup_mondata_open(struct kernfs_open_file *of); +void rdtgroup_mondata_release(struct kernfs_open_file *of); void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 77d08229d85502..7c115ce0096bb4 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -338,6 +338,8 @@ static const struct kernfs_ops rdtgroup_kf_single_ops = { static const struct kernfs_ops kf_mondata_ops = { .atomic_write_len = PAGE_SIZE, .seq_show = rdtgroup_mondata_show, + .open = rdtgroup_mondata_open, + .release = rdtgroup_mondata_release, }; static bool is_cpu_list(struct kernfs_open_file *of) @@ -2387,6 +2389,30 @@ static void rdtgroup_kn_get(struct rdtgroup *rdtgrp, struct kernfs_node *kn) kernfs_break_active_protection(kn); } +/* + * Increment rdtgroup reference count + * Can be called without rdtgroup_mutex held (uses atomic operations) + */ +void rdtgroup_get(struct rdtgroup *rdtgrp) +{ + atomic_inc(&rdtgrp->waitcount); +} + +/* + * Decrement rdtgroup reference count and cleanup if needed + * Can be called without rdtgroup_mutex held (uses atomic operations) + */ +void rdtgroup_put(struct rdtgroup *rdtgrp) +{ + if (atomic_dec_and_test(&rdtgrp->waitcount) && + (rdtgrp->flags & RDT_DELETED)) { + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) + rdtgroup_pseudo_lock_remove(rdtgrp); + rdtgroup_remove(rdtgrp); + } +} + static void rdtgroup_kn_put(struct rdtgroup *rdtgrp, struct kernfs_node *kn) { if (atomic_dec_and_test(&rdtgrp->waitcount) && @@ -3229,6 +3255,42 @@ static int mkdir_mondata_all(struct kernfs_node *parent_kn, return ret; } +int rdtgroup_mondata_open(struct kernfs_open_file *of) +{ + struct rdtgroup *rdtgrp; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + + /* + * Take an additional reference on the rdtgroup to ensure it + * remains valid while this file descriptor is open, even if + * the kernfs node is removed. + */ + rdtgroup_get(rdtgrp); + of->priv = rdtgrp; + + rdtgroup_kn_unlock(of->kn); + return 0; +} + +void rdtgroup_mondata_release(struct kernfs_open_file *of) +{ + struct rdtgroup *rdtgrp = of->priv; + + if (rdtgrp) { + /* + * Release the reference taken during open. This may allow + * the rdtgroup to be freed if this was the last reference. + */ + rdtgroup_put(rdtgrp); + of->priv = NULL; + } +} + /** * cbm_ensure_valid - Enforce validity on provided CBM * @_val: Candidate CBM From 21cdaec823931e4cca2cca993d4f4bf2dcebb031 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Fri, 29 Aug 2025 20:09:50 +0000 Subject: [PATCH 44/51] refactor: split mon_event_read into setup and perform functions --- fs/resctrl/ctrlmondata.c | 38 +++++++++++++++++++++----------------- fs/resctrl/internal.h | 7 ++++--- fs/resctrl/monitor.c | 2 +- fs/resctrl/rdtgroup.c | 6 ++++-- 4 files changed, 30 insertions(+), 23 deletions(-) diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index d98e0d2de09fd0..05bb4037590e2a 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -546,24 +546,27 @@ struct rdt_domain_hdr *resctrl_find_domain(struct list_head *h, int id, return NULL; } -void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, - struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, - cpumask_t *cpumask, int evtid, int first) +void mon_setup_rmid_read(struct rmid_read *rr, struct rdt_resource *r, + struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, + int evtid, int first, unsigned int ci_id) { - int cpu; - - /* When picking a CPU from cpu_mask, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - - /* - * Setup the parameters to pass to mon_event_count() to read the data. - */ + memset(rr, 0, sizeof(*rr)); rr->rgrp = rdtgrp; rr->evtid = evtid; rr->r = r; rr->d = d; rr->first = first; - rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid); + rr->ci_id = ci_id; +} + +void mon_perform_rmid_read(struct rmid_read *rr, cpumask_t *cpumask) +{ + int cpu; + + /* When picking a CPU from cpu_mask, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + + rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr->r, rr->evtid); if (IS_ERR(rr->arch_mon_ctx)) { rr->err = -EINVAL; return; @@ -582,7 +585,7 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, else smp_call_on_cpu(cpu, smp_mon_event_count, rr, false); - resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx); + resctrl_arch_mon_ctx_free(rr->r, rr->evtid, rr->arch_mon_ctx); } int rdtgroup_mondata_show(struct seq_file *m, void *arg) @@ -625,13 +628,13 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) */ list_for_each_entry(d, &r->mon_domains, hdr.list) { if (d->ci_id == domid) { - rr.ci_id = d->ci_id; cpu = cpumask_any(&d->hdr.cpu_mask); ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE); if (!ci) continue; - mon_event_read(&rr, r, NULL, rdtgrp, - &ci->shared_cpu_map, evtid, false); + mon_setup_rmid_read(&rr, r, NULL, rdtgrp, + evtid, false, d->ci_id); + mon_perform_rmid_read(&rr, &ci->shared_cpu_map); goto checkresult; } } @@ -648,7 +651,8 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) goto out; } d = container_of(hdr, struct rdt_mon_domain, hdr); - mon_event_read(&rr, r, d, rdtgrp, &d->hdr.cpu_mask, evtid, false); + mon_setup_rmid_read(&rr, r, d, rdtgrp, evtid, false, d->ci_id); + mon_perform_rmid_read(&rr, &d->hdr.cpu_mask); } checkresult: diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 14ed2b3b8efb8c..b3f8f47a0a86cb 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -346,9 +346,10 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg); int rdtgroup_mondata_open(struct kernfs_open_file *of); void rdtgroup_mondata_release(struct kernfs_open_file *of); -void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, - struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, - cpumask_t *cpumask, int evtid, int first); +void mon_setup_rmid_read(struct rmid_read *rr, struct rdt_resource *r, + struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, + int evtid, int first, unsigned int ci_id); +void mon_perform_rmid_read(struct rmid_read *rr, cpumask_t *cpumask); int resctrl_mon_resource_init(void); diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index f5637855c3acac..378292a0e79555 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -449,7 +449,7 @@ static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr) } /* - * This is scheduled by mon_event_read() to read the CQM/MBM counters + * This is scheduled by mon_perform_rmid_read() to read the CQM/MBM counters * on a domain. */ void mon_event_count(void *info) diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 7c115ce0096bb4..aa912481184c1e 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -3096,8 +3096,10 @@ static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d, if (ret) return ret; - if (!do_sum && resctrl_is_mbm_event(mevt->evtid)) - mon_event_read(&rr, r, d, prgrp, &d->hdr.cpu_mask, mevt->evtid, true); + if (!do_sum && resctrl_is_mbm_event(mevt->evtid)) { + mon_setup_rmid_read(&rr, r, d, prgrp, mevt->evtid, true, d->ci_id); + mon_perform_rmid_read(&rr, &d->hdr.cpu_mask); + } } return 0; From a1a1b446b31879dd0399b6938a0904df358940d9 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Fri, 29 Aug 2025 20:41:10 +0000 Subject: [PATCH 45/51] refactor: separate metric read logic from decision what to measure and on which cpumask --- fs/resctrl/ctrlmondata.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index 05bb4037590e2a..e4e81cf8a8be33 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -601,6 +601,7 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) struct rdt_resource *r; struct cacheinfo *ci; struct mon_data *md; + cpumask_t *cpumask; rdtgrp = rdtgroup_kn_lock_live(of->kn); if (!rdtgrp) { @@ -634,8 +635,8 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) continue; mon_setup_rmid_read(&rr, r, NULL, rdtgrp, evtid, false, d->ci_id); - mon_perform_rmid_read(&rr, &ci->shared_cpu_map); - goto checkresult; + cpumask = &ci->shared_cpu_map; + goto perform; } } ret = -ENOENT; @@ -652,10 +653,11 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) } d = container_of(hdr, struct rdt_mon_domain, hdr); mon_setup_rmid_read(&rr, r, d, rdtgrp, evtid, false, d->ci_id); - mon_perform_rmid_read(&rr, &d->hdr.cpu_mask); + cpumask = &d->hdr.cpu_mask; } -checkresult: +perform: + mon_perform_rmid_read(&rr, cpumask); if (rr.err == -EIO) seq_puts(m, "Error\n"); From 8626927fafb48f7445a88667160340446355ba72 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Fri, 29 Aug 2025 21:42:32 +0000 Subject: [PATCH 46/51] separate setting up event reads from performing the reads --- fs/resctrl/ctrlmondata.c | 103 +++++++++++++++++++++------------------ fs/resctrl/internal.h | 2 + 2 files changed, 57 insertions(+), 48 deletions(-) diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index e4e81cf8a8be33..b23fd8925e0470 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -559,6 +559,58 @@ void mon_setup_rmid_read(struct rmid_read *rr, struct rdt_resource *r, rr->ci_id = ci_id; } +int mon_event_read_setup(struct rmid_read *rr, cpumask_t **cpumask, + struct mon_data *md, struct rdtgroup *rdtgrp) +{ + enum resctrl_res_level resid; + enum resctrl_event_id evtid; + struct rdt_domain_hdr *hdr; + struct rdt_mon_domain *d; + struct rdt_resource *r; + struct cacheinfo *ci; + int domid, cpu; + + resid = md->rid; + domid = md->domid; + evtid = md->evtid; + r = resctrl_arch_get_resource(resid); + + if (md->sum) { + /* + * This file requires summing across all domains that share + * the L3 cache id that was provided in the "domid" field of the + * struct mon_data. Search all domains in the resource for + * one that matches this cache id. + */ + list_for_each_entry(d, &r->mon_domains, hdr.list) { + if (d->ci_id == domid) { + cpu = cpumask_any(&d->hdr.cpu_mask); + ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE); + if (!ci) + continue; + mon_setup_rmid_read(rr, r, NULL, rdtgrp, + evtid, false, d->ci_id); + *cpumask = &ci->shared_cpu_map; + return 0; + } + } + return -ENOENT; + } else { + /* + * This file provides data from a single domain. Search + * the resource to find the domain with "domid". + */ + hdr = resctrl_find_domain(&r->mon_domains, domid, NULL); + if (!hdr || WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) + return -ENOENT; + + d = container_of(hdr, struct rdt_mon_domain, hdr); + mon_setup_rmid_read(rr, r, d, rdtgrp, evtid, false, d->ci_id); + *cpumask = &d->hdr.cpu_mask; + return 0; + } +} + void mon_perform_rmid_read(struct rmid_read *rr, cpumask_t *cpumask) { int cpu; @@ -591,15 +643,9 @@ void mon_perform_rmid_read(struct rmid_read *rr, cpumask_t *cpumask) int rdtgroup_mondata_show(struct seq_file *m, void *arg) { struct kernfs_open_file *of = m->private; - enum resctrl_res_level resid; - enum resctrl_event_id evtid; - struct rdt_domain_hdr *hdr; struct rmid_read rr = {0}; - struct rdt_mon_domain *d; struct rdtgroup *rdtgrp; - int domid, cpu, ret = 0; - struct rdt_resource *r; - struct cacheinfo *ci; + int ret = 0; struct mon_data *md; cpumask_t *cpumask; @@ -615,48 +661,9 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) goto out; } - resid = md->rid; - domid = md->domid; - evtid = md->evtid; - r = resctrl_arch_get_resource(resid); - - if (md->sum) { - /* - * This file requires summing across all domains that share - * the L3 cache id that was provided in the "domid" field of the - * struct mon_data. Search all domains in the resource for - * one that matches this cache id. - */ - list_for_each_entry(d, &r->mon_domains, hdr.list) { - if (d->ci_id == domid) { - cpu = cpumask_any(&d->hdr.cpu_mask); - ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE); - if (!ci) - continue; - mon_setup_rmid_read(&rr, r, NULL, rdtgrp, - evtid, false, d->ci_id); - cpumask = &ci->shared_cpu_map; - goto perform; - } - } - ret = -ENOENT; + ret = mon_event_read_setup(&rr, &cpumask, md, rdtgrp); + if (ret) goto out; - } else { - /* - * This file provides data from a single domain. Search - * the resource to find the domain with "domid". - */ - hdr = resctrl_find_domain(&r->mon_domains, domid, NULL); - if (!hdr || WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) { - ret = -ENOENT; - goto out; - } - d = container_of(hdr, struct rdt_mon_domain, hdr); - mon_setup_rmid_read(&rr, r, d, rdtgrp, evtid, false, d->ci_id); - cpumask = &d->hdr.cpu_mask; - } - -perform: mon_perform_rmid_read(&rr, cpumask); if (rr.err == -EIO) diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index b3f8f47a0a86cb..0f8bf75da471ec 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -349,6 +349,8 @@ void rdtgroup_mondata_release(struct kernfs_open_file *of); void mon_setup_rmid_read(struct rmid_read *rr, struct rdt_resource *r, struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, int evtid, int first, unsigned int ci_id); +int mon_event_read_setup(struct rmid_read *rr, cpumask_t **cpumask, + struct mon_data *md, struct rdtgroup *rdtgrp); void mon_perform_rmid_read(struct rmid_read *rr, cpumask_t *cpumask); int resctrl_mon_resource_init(void); From 65f09ec863a9434a7e05fe375e6edd4739d3e6b8 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Fri, 29 Aug 2025 23:10:28 +0000 Subject: [PATCH 47/51] add skeleton PMU --- fs/resctrl/Makefile | 2 +- fs/resctrl/internal.h | 7 + fs/resctrl/pmu.c | 306 ++++++++++++++++++ fs/resctrl/rdtgroup.c | 8 +- tools/testing/selftests/resctrl/pmu_test.c | 141 ++++++++ tools/testing/selftests/resctrl/resctrl.h | 1 + .../testing/selftests/resctrl/resctrl_tests.c | 1 + 7 files changed, 464 insertions(+), 2 deletions(-) create mode 100644 fs/resctrl/pmu.c create mode 100644 tools/testing/selftests/resctrl/pmu_test.c diff --git a/fs/resctrl/Makefile b/fs/resctrl/Makefile index e67f34d2236a20..f738b0165ccc59 100644 --- a/fs/resctrl/Makefile +++ b/fs/resctrl/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_RESCTRL_FS) += rdtgroup.o ctrlmondata.o monitor.o +obj-$(CONFIG_RESCTRL_FS) += rdtgroup.o ctrlmondata.o monitor.o pmu.o obj-$(CONFIG_RESCTRL_FS_PSEUDO_LOCK) += pseudo_lock.o # To allow define_trace.h's recursive include: diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 0f8bf75da471ec..b8110e0ccc4e9c 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -345,6 +345,13 @@ void mon_event_count(void *info); int rdtgroup_mondata_show(struct seq_file *m, void *arg); int rdtgroup_mondata_open(struct kernfs_open_file *of); void rdtgroup_mondata_release(struct kernfs_open_file *of); +void rdtgroup_get(struct rdtgroup *rdtgrp); +void rdtgroup_put(struct rdtgroup *rdtgrp); + +/* PMU support */ +extern const struct kernfs_ops kf_mondata_ops; +int resctrl_pmu_init(void); +void resctrl_pmu_exit(void); void mon_setup_rmid_read(struct rmid_read *rr, struct rdt_resource *r, struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, diff --git a/fs/resctrl/pmu.c b/fs/resctrl/pmu.c new file mode 100644 index 00000000000000..15fb8cea953e30 --- /dev/null +++ b/fs/resctrl/pmu.c @@ -0,0 +1,306 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Resctrl PMU support + * - Enables perf event access to resctrl cache occupancy monitoring + * + * This provides a perf PMU interface to read cache occupancy from resctrl + * monitoring groups using file descriptors for group identification. + */ + +#define pr_fmt(fmt) "resctrl_pmu: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +/* + * PMU type will be dynamically assigned by perf_pmu_register + */ +static struct pmu resctrl_pmu; + +/* + * Event private data - stores information about the monitored resctrl group + */ +struct resctrl_pmu_event { + char *mon_path; /* Path extracted from file descriptor */ + struct rdtgroup *rdtgrp; /* Reference to rdtgroup being monitored */ +}; + +/* + * Get the file path from a file descriptor for debugging + */ +static char *get_fd_path(int fd) +{ + struct file *file; + char *path_buf, *path_str = NULL; + + file = fget(fd); + if (!file) + return ERR_PTR(-EBADF); + + path_buf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!path_buf) { + fput(file); + return ERR_PTR(-ENOMEM); + } + + path_str = d_path(&file->f_path, path_buf, PATH_MAX); + if (IS_ERR(path_str)) { + kfree(path_buf); + fput(file); + return path_str; + } + + /* Make a copy of the path string */ + path_str = kstrdup(path_str, GFP_KERNEL); + kfree(path_buf); + fput(file); + + return path_str; +} + +/* + * Get rdtgroup from file descriptor with proper mutual exclusion + * Takes an additional reference on the rdtgroup which must be released + */ +static struct rdtgroup *get_rdtgroup_from_fd(int fd) +{ + struct file *file; + struct kernfs_open_file *of; + struct rdtgroup *rdtgrp; + struct rdtgroup *ret = ERR_PTR(-EBADF); + + file = fget(fd); + if (!file) + goto out; + + /* Basic validation that this is a kernfs file with seq_file */ + ret = ERR_PTR(-EINVAL); + if (!file->f_op || !file->private_data) + goto out_fput; + + /* For kernfs files, private_data points to seq_file, and seq_file->private is kernfs_open_file */ + of = ((struct seq_file *)file->private_data)->private; + if (!of) + goto out_fput; + + /* Validate that this is actually a resctrl monitoring file */ + if (!of->kn || of->kn->attr.ops != &kf_mondata_ops) + goto out_fput; + + /* CRITICAL: Hold rdtgroup_mutex to prevent race with release callback */ + mutex_lock(&rdtgroup_mutex); + + /* Get rdtgroup from kernfs_open_file - similar to pseudo_lock pattern */ + ret = ERR_PTR(-ENOENT); + rdtgrp = of->priv; + if (!rdtgrp) + /* File was drained - release callback already called */ + goto out_unlock; + + if (rdtgrp->flags & RDT_DELETED) + /* rdtgroup marked for deletion */ + goto out_unlock; + + /* Take reference using the rdtgroup API */ + rdtgroup_get(rdtgrp); + ret = rdtgrp; + /* Fall through to cleanup */ + +out_unlock: + mutex_unlock(&rdtgroup_mutex); +out_fput: + fput(file); +out: + return ret; +} + +/* + * Initialize a new resctrl perf event + * The config field contains the file descriptor of the monitoring file + */ +static int resctrl_event_init(struct perf_event *event) +{ + struct resctrl_pmu_event *resctrl_event; + struct rdtgroup *rdtgrp; + char *path; + int fd; + + /* Only accept events for this PMU */ + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* No sampling support */ + if (is_sampling_event(event)) + return -EINVAL; + + /* No filtering support */ + if (event->attr.exclude_user || event->attr.exclude_kernel || + event->attr.exclude_hv || event->attr.exclude_idle) + return -EINVAL; + + /* Extract file descriptor from config */ + fd = (int)event->attr.config; + if (fd < 0) + return -EINVAL; + + /* Get the file path for debugging */ + path = get_fd_path(fd); + if (IS_ERR(path)) + return PTR_ERR(path); + + /* Get rdtgroup with proper protection and reference counting */ + rdtgrp = get_rdtgroup_from_fd(fd); + if (IS_ERR(rdtgrp)) { + kfree(path); + return PTR_ERR(rdtgrp); + } + + /* Allocate our private event data */ + resctrl_event = kzalloc(sizeof(*resctrl_event), GFP_KERNEL); + if (!resctrl_event) { + rdtgroup_put(rdtgrp); + kfree(path); + return -ENOMEM; + } + + resctrl_event->mon_path = path; + resctrl_event->rdtgrp = rdtgrp; + event->pmu_private = resctrl_event; + + /* Set destroy callback for proper cleanup */ + event->destroy = resctrl_event_destroy; + + /* Log comprehensive rdtgroup information */ + pr_info("PMU event initialized: fd=%d, path=%s\n", fd, path); + pr_info(" rdtgroup: closid=%u, rmid=%u, waitcount=%d\n", + rdtgrp->closid, rdtgrp->mon.rmid, atomic_read(&rdtgrp->waitcount)); + pr_info(" type=%s, mode=%d, flags=0x%x\n", + rdtgrp->type == RDTCTRL_GROUP ? "CTRL" : "MON", + rdtgrp->mode, rdtgrp->flags); + pr_info(" cpu_mask=%*pbl\n", cpumask_pr_args(&rdtgrp->cpu_mask)); + + return 0; +} + +/* + * Clean up event resources - called when event is destroyed + */ +static void resctrl_event_destroy(struct perf_event *event) +{ + struct resctrl_pmu_event *resctrl_event = event->pmu_private; + + if (resctrl_event) { + struct rdtgroup *rdtgrp = resctrl_event->rdtgrp; + + if (rdtgrp) { + /* Log rdtgroup state before cleanup */ + pr_info("PMU event cleanup: path=%s\n", resctrl_event->mon_path); + pr_info(" rdtgroup: closid=%u, rmid=%u, waitcount=%d\n", + rdtgrp->closid, rdtgrp->mon.rmid, atomic_read(&rdtgrp->waitcount)); + pr_info(" type=%s, mode=%d, flags=0x%x\n", + rdtgrp->type == RDTCTRL_GROUP ? "CTRL" : "MON", + rdtgrp->mode, rdtgrp->flags); + pr_info(" cpu_mask=%*pbl\n", cpumask_pr_args(&rdtgrp->cpu_mask)); + + /* Release the reference we took during init */ + rdtgroup_put(rdtgrp); + } + + kfree(resctrl_event->mon_path); + kfree(resctrl_event); + event->pmu_private = NULL; + } +} + +/* + * Add event to PMU (enable monitoring) + */ +static int resctrl_event_add(struct perf_event *event, int flags) +{ + /* Currently just a stub - would setup actual monitoring here */ + return 0; +} + +/* + * Remove event from PMU (disable monitoring) + */ +static void resctrl_event_del(struct perf_event *event, int flags) +{ + /* Currently just a stub - would disable monitoring here */ +} + +/* + * Start event counting + */ +static void resctrl_event_start(struct perf_event *event, int flags) +{ + /* Currently just a stub - would start monitoring here */ +} + +/* + * Stop event counting + */ +static void resctrl_event_stop(struct perf_event *event, int flags) +{ + /* Currently just a stub - would stop monitoring here */ +} + +/* + * Read current counter value + */ +static void resctrl_event_update(struct perf_event *event) +{ + /* Currently just a stub - would read actual cache occupancy here */ + local64_set(&event->hw.prev_count, 0); +} + +/* + * Main PMU structure + */ +static struct pmu resctrl_pmu = { + .task_ctx_nr = perf_invalid_context, /* System-wide only */ + .event_init = resctrl_event_init, + .add = resctrl_event_add, + .del = resctrl_event_del, + .start = resctrl_event_start, + .stop = resctrl_event_stop, + .read = resctrl_event_update, + .capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE, +}; + +/* + * Initialize and register the resctrl PMU + */ +int resctrl_pmu_init(void) +{ + int ret; + + /* Register the PMU with perf subsystem */ + ret = perf_pmu_register(&resctrl_pmu, "resctrl", -1); + if (ret) { + pr_err("Failed to register resctrl PMU: %d\n", ret); + return ret; + } + + pr_info("Registered resctrl PMU with type %d\n", resctrl_pmu.type); + return 0; +} + +/* + * Cleanup the resctrl PMU + */ +void resctrl_pmu_exit(void) +{ + perf_pmu_unregister(&resctrl_pmu); + pr_info("Unregistered resctrl PMU\n"); +} \ No newline at end of file diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index aa912481184c1e..d82f02d16e87bc 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -335,7 +335,7 @@ static const struct kernfs_ops rdtgroup_kf_single_ops = { .seq_show = rdtgroup_seqfile_show, }; -static const struct kernfs_ops kf_mondata_ops = { +const struct kernfs_ops kf_mondata_ops = { .atomic_write_len = PAGE_SIZE, .seq_show = rdtgroup_mondata_show, .open = rdtgroup_mondata_open, @@ -4350,6 +4350,10 @@ int resctrl_init(void) */ debugfs_resctrl = debugfs_create_dir("resctrl", NULL); + ret = resctrl_pmu_init(); + if (ret) + pr_warn("Failed to initialize resctrl PMU: %d\n", ret); + return 0; cleanup_mountpoint: @@ -4399,6 +4403,8 @@ static bool resctrl_online_domains_exist(void) */ void resctrl_exit(void) { + resctrl_pmu_exit(); + cpus_read_lock(); WARN_ON_ONCE(resctrl_online_domains_exist()); diff --git a/tools/testing/selftests/resctrl/pmu_test.c b/tools/testing/selftests/resctrl/pmu_test.c new file mode 100644 index 00000000000000..25edb52fc1f6bf --- /dev/null +++ b/tools/testing/selftests/resctrl/pmu_test.c @@ -0,0 +1,141 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Resctrl PMU test + * + * Test program to verify the resctrl PMU functionality. + * Opens a perf event with the resctrl PMU and passes a file descriptor + * to trigger the printk in the kernel showing the file path. + */ + +#include "resctrl.h" +#include + +#define RESCTRL_PMU_NAME "resctrl" + +static int find_pmu_type(const char *pmu_name) +{ + char path[256]; + FILE *file; + int type; + + snprintf(path, sizeof(path), "/sys/bus/event_source/devices/%s/type", pmu_name); + + file = fopen(path, "r"); + if (!file) { + ksft_print_msg("Failed to open %s: %s\n", path, strerror(errno)); + return -1; + } + + if (fscanf(file, "%d", &type) != 1) { + ksft_print_msg("Failed to read PMU type from %s\n", path); + fclose(file); + return -1; + } + + fclose(file); + return type; +} + +static int open_monitoring_file(void) +{ + const char *mon_path = RESCTRL_PATH "/mon_data/mon_L3_00/llc_occupancy"; + int fd; + + /* Try to open a monitoring file in the root resctrl group */ + fd = open(mon_path, O_RDONLY); + if (fd < 0) { + ksft_print_msg("Failed to open monitoring file %s: %s\n", + mon_path, strerror(errno)); + return -1; + } + + ksft_print_msg("Opened monitoring file: %s (fd: %d)\n", mon_path, fd); + return fd; +} + +static int test_resctrl_pmu_event(int pmu_type, int mon_fd) +{ + struct perf_event_attr pe = {0}; + int perf_fd; + + /* Setup perf event attributes */ + pe.type = pmu_type; + pe.config = mon_fd; /* Pass the file descriptor as config */ + pe.size = sizeof(pe); + pe.disabled = 1; + pe.exclude_kernel = 0; + pe.exclude_hv = 0; + + /* Open the perf event */ + perf_fd = perf_event_open(&pe, -1, 0, -1, 0); + if (perf_fd < 0) { + ksft_print_msg("Failed to open perf event: %s\n", strerror(errno)); + return -1; + } + + ksft_print_msg("Successfully opened resctrl PMU event (fd: %d)\n", perf_fd); + + /* Enable the event to trigger initialization */ + if (ioctl(perf_fd, PERF_EVENT_IOC_ENABLE, 0) < 0) { + ksft_print_msg("Failed to enable perf event: %s\n", strerror(errno)); + close(perf_fd); + return -1; + } + + ksft_print_msg("Enabled resctrl PMU event - check kernel log for path printk\n"); + + /* Disable and close the event */ + ioctl(perf_fd, PERF_EVENT_IOC_DISABLE, 0); + close(perf_fd); + + return 0; +} + +static bool pmu_feature_check(const struct resctrl_test *test) +{ + return resctrl_mon_feature_exists("L3_MON", "llc_occupancy"); +} + +static int pmu_run_test(const struct resctrl_test *test, const struct user_params *uparams) +{ + int pmu_type, mon_fd, ret; + + ksft_print_msg("Testing resctrl PMU functionality\n"); + + /* Find the resctrl PMU type */ + pmu_type = find_pmu_type(RESCTRL_PMU_NAME); + if (pmu_type < 0) { + ksft_print_msg("Resctrl PMU not found - this indicates the PMU is not registered\n"); + return -1; + } + + ksft_print_msg("Found resctrl PMU with type: %d\n", pmu_type); + + /* Open a monitoring file to get a file descriptor */ + mon_fd = open_monitoring_file(); + if (mon_fd < 0) + return -1; + + /* Test opening a perf event with the monitoring file descriptor */ + ret = test_resctrl_pmu_event(pmu_type, mon_fd); + + /* Clean up */ + close(mon_fd); + + if (ret == 0) { + ksft_print_msg("Resctrl PMU test completed successfully\n"); + ksft_print_msg("Check dmesg for kernel log message with file path\n"); + } + + return ret; +} + +struct resctrl_test pmu_test = { + .name = "PMU", + .group = "pmu", + .resource = "L3", + .vendor_specific = 0, + .feature_check = pmu_feature_check, + .run_test = pmu_run_test, + .cleanup = NULL, +}; \ No newline at end of file diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h index cd3adfc14969fc..5b0e6074eaba54 100644 --- a/tools/testing/selftests/resctrl/resctrl.h +++ b/tools/testing/selftests/resctrl/resctrl.h @@ -244,5 +244,6 @@ extern struct resctrl_test cmt_test; extern struct resctrl_test l3_cat_test; extern struct resctrl_test l3_noncont_cat_test; extern struct resctrl_test l2_noncont_cat_test; +extern struct resctrl_test pmu_test; #endif /* RESCTRL_H */ diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c index 5154ffd821c441..11ba9000e015d4 100644 --- a/tools/testing/selftests/resctrl/resctrl_tests.c +++ b/tools/testing/selftests/resctrl/resctrl_tests.c @@ -21,6 +21,7 @@ static struct resctrl_test *resctrl_tests[] = { &l3_cat_test, &l3_noncont_cat_test, &l2_noncont_cat_test, + &pmu_test, }; static int detect_vendor(void) From 6af368fe59fd91d9a0754eac44d9b9a6f08616e0 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Fri, 29 Aug 2025 22:27:41 +0000 Subject: [PATCH 48/51] tooling: avoid make olddefconfig if config hasn't changed --- build-and-upload.sh | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/build-and-upload.sh b/build-and-upload.sh index f1e89764b8041e..2da011d2bb9f63 100755 --- a/build-and-upload.sh +++ b/build-and-upload.sh @@ -96,8 +96,14 @@ configure_kernel() { # scripts/config --enable CONFIG_PROC_FS # scripts/config --enable CONFIG_SYSFS - # Update config with dependencies - make olddefconfig + # Update config with dependencies (only on fresh builds) + if [[ ! -f ".config.bak" ]]; then + log "Fresh build: running olddefconfig to resolve dependencies" + cp .config .config.bak + make olddefconfig + else + log "Incremental build: skipping olddefconfig to preserve config stability" + fi } # Build the kernel @@ -150,6 +156,12 @@ create_initrd() { error "build-initrd.sh not found in current directory" fi + # Check if config changed and force initrd rebuild if needed + if [[ -f ".config.bak" ]] && ! diff -q .config .config.bak >/dev/null 2>&1; then + log "Config changed since last build, forcing initrd rebuild" + export FORCE_INITRD=1 + fi + # Run the initrd build script with upload flag local initrd_output initrd_output=$(./build-initrd.sh --upload 2>&1) || error "Failed to build and upload initrd" From 7183c4643cc047fc1467cf015037b81cd06bb9e7 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Fri, 29 Aug 2025 23:25:07 +0000 Subject: [PATCH 49/51] fix function ordering so destroy is defined before reference --- fs/resctrl/pmu.c | 61 ++++++++++++++++++++++++------------------------ 1 file changed, 31 insertions(+), 30 deletions(-) diff --git a/fs/resctrl/pmu.c b/fs/resctrl/pmu.c index 15fb8cea953e30..5513c2f12983ab 100644 --- a/fs/resctrl/pmu.c +++ b/fs/resctrl/pmu.c @@ -124,6 +124,36 @@ static struct rdtgroup *get_rdtgroup_from_fd(int fd) return ret; } +/* + * Clean up event resources - called when event is destroyed + */ +static void resctrl_event_destroy(struct perf_event *event) +{ + struct resctrl_pmu_event *resctrl_event = event->pmu_private; + + if (resctrl_event) { + struct rdtgroup *rdtgrp = resctrl_event->rdtgrp; + + if (rdtgrp) { + /* Log rdtgroup state before cleanup */ + pr_info("PMU event cleanup: path=%s\n", resctrl_event->mon_path); + pr_info(" rdtgroup: closid=%u, rmid=%u, waitcount=%d\n", + rdtgrp->closid, rdtgrp->mon.rmid, atomic_read(&rdtgrp->waitcount)); + pr_info(" type=%s, mode=%d, flags=0x%x\n", + rdtgrp->type == RDTCTRL_GROUP ? "CTRL" : "MON", + rdtgrp->mode, rdtgrp->flags); + pr_info(" cpu_mask=%*pbl\n", cpumask_pr_args(&rdtgrp->cpu_mask)); + + /* Release the reference we took during init */ + rdtgroup_put(rdtgrp); + } + + kfree(resctrl_event->mon_path); + kfree(resctrl_event); + event->pmu_private = NULL; + } +} + /* * Initialize a new resctrl perf event * The config field contains the file descriptor of the monitoring file @@ -192,35 +222,6 @@ static int resctrl_event_init(struct perf_event *event) return 0; } -/* - * Clean up event resources - called when event is destroyed - */ -static void resctrl_event_destroy(struct perf_event *event) -{ - struct resctrl_pmu_event *resctrl_event = event->pmu_private; - - if (resctrl_event) { - struct rdtgroup *rdtgrp = resctrl_event->rdtgrp; - - if (rdtgrp) { - /* Log rdtgroup state before cleanup */ - pr_info("PMU event cleanup: path=%s\n", resctrl_event->mon_path); - pr_info(" rdtgroup: closid=%u, rmid=%u, waitcount=%d\n", - rdtgrp->closid, rdtgrp->mon.rmid, atomic_read(&rdtgrp->waitcount)); - pr_info(" type=%s, mode=%d, flags=0x%x\n", - rdtgrp->type == RDTCTRL_GROUP ? "CTRL" : "MON", - rdtgrp->mode, rdtgrp->flags); - pr_info(" cpu_mask=%*pbl\n", cpumask_pr_args(&rdtgrp->cpu_mask)); - - /* Release the reference we took during init */ - rdtgroup_put(rdtgrp); - } - - kfree(resctrl_event->mon_path); - kfree(resctrl_event); - event->pmu_private = NULL; - } -} /* * Add event to PMU (enable monitoring) @@ -303,4 +304,4 @@ void resctrl_pmu_exit(void) { perf_pmu_unregister(&resctrl_pmu); pr_info("Unregistered resctrl PMU\n"); -} \ No newline at end of file +} From f18cd9d41d4c6e29e5923afe4ebd42264e056da5 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Fri, 29 Aug 2025 23:32:44 +0000 Subject: [PATCH 50/51] remove fd path getting that we had for demo --- fs/resctrl/pmu.c | 54 +++--------------------------------------------- 1 file changed, 3 insertions(+), 51 deletions(-) diff --git a/fs/resctrl/pmu.c b/fs/resctrl/pmu.c index 5513c2f12983ab..093dbb18e7df6a 100644 --- a/fs/resctrl/pmu.c +++ b/fs/resctrl/pmu.c @@ -13,11 +13,7 @@ #include #include #include -#include -#include #include -#include -#include #include #include #include "internal.h" @@ -31,42 +27,9 @@ static struct pmu resctrl_pmu; * Event private data - stores information about the monitored resctrl group */ struct resctrl_pmu_event { - char *mon_path; /* Path extracted from file descriptor */ struct rdtgroup *rdtgrp; /* Reference to rdtgroup being monitored */ }; -/* - * Get the file path from a file descriptor for debugging - */ -static char *get_fd_path(int fd) -{ - struct file *file; - char *path_buf, *path_str = NULL; - - file = fget(fd); - if (!file) - return ERR_PTR(-EBADF); - - path_buf = kmalloc(PATH_MAX, GFP_KERNEL); - if (!path_buf) { - fput(file); - return ERR_PTR(-ENOMEM); - } - - path_str = d_path(&file->f_path, path_buf, PATH_MAX); - if (IS_ERR(path_str)) { - kfree(path_buf); - fput(file); - return path_str; - } - - /* Make a copy of the path string */ - path_str = kstrdup(path_str, GFP_KERNEL); - kfree(path_buf); - fput(file); - - return path_str; -} /* * Get rdtgroup from file descriptor with proper mutual exclusion @@ -136,7 +99,7 @@ static void resctrl_event_destroy(struct perf_event *event) if (rdtgrp) { /* Log rdtgroup state before cleanup */ - pr_info("PMU event cleanup: path=%s\n", resctrl_event->mon_path); + pr_info("PMU event cleanup\n"); pr_info(" rdtgroup: closid=%u, rmid=%u, waitcount=%d\n", rdtgrp->closid, rdtgrp->mon.rmid, atomic_read(&rdtgrp->waitcount)); pr_info(" type=%s, mode=%d, flags=0x%x\n", @@ -148,7 +111,6 @@ static void resctrl_event_destroy(struct perf_event *event) rdtgroup_put(rdtgrp); } - kfree(resctrl_event->mon_path); kfree(resctrl_event); event->pmu_private = NULL; } @@ -162,7 +124,6 @@ static int resctrl_event_init(struct perf_event *event) { struct resctrl_pmu_event *resctrl_event; struct rdtgroup *rdtgrp; - char *path; int fd; /* Only accept events for this PMU */ @@ -183,27 +144,18 @@ static int resctrl_event_init(struct perf_event *event) if (fd < 0) return -EINVAL; - /* Get the file path for debugging */ - path = get_fd_path(fd); - if (IS_ERR(path)) - return PTR_ERR(path); - /* Get rdtgroup with proper protection and reference counting */ rdtgrp = get_rdtgroup_from_fd(fd); - if (IS_ERR(rdtgrp)) { - kfree(path); + if (IS_ERR(rdtgrp)) return PTR_ERR(rdtgrp); - } /* Allocate our private event data */ resctrl_event = kzalloc(sizeof(*resctrl_event), GFP_KERNEL); if (!resctrl_event) { rdtgroup_put(rdtgrp); - kfree(path); return -ENOMEM; } - resctrl_event->mon_path = path; resctrl_event->rdtgrp = rdtgrp; event->pmu_private = resctrl_event; @@ -211,7 +163,7 @@ static int resctrl_event_init(struct perf_event *event) event->destroy = resctrl_event_destroy; /* Log comprehensive rdtgroup information */ - pr_info("PMU event initialized: fd=%d, path=%s\n", fd, path); + pr_info("PMU event initialized: fd=%d\n", fd); pr_info(" rdtgroup: closid=%u, rmid=%u, waitcount=%d\n", rdtgrp->closid, rdtgrp->mon.rmid, atomic_read(&rdtgrp->waitcount)); pr_info(" type=%s, mode=%d, flags=0x%x\n", From d3a94217e324a2ac5f6cea7be84b5d96fb9265a7 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Sat, 30 Aug 2025 13:07:44 +0000 Subject: [PATCH 51/51] selftests/resctrl: add safety test for resctrl PMU file access MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add comprehensive safety test that walks all files and directories in /sys/fs/resctrl and verifies that perf_event_open() fails on all files except llc_occupancy and mbm monitoring files. This ensures the resctrl PMU implementation safely rejects inappropriate file descriptors. The test recursively examines the entire resctrl filesystem hierarchy, attempting to open perf events on each regular file found. It expects success only for files containing "llc_occupancy", "mbm_total_bytes", or "mbm_local_bytes" in their path, and failure for all other files. 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- tools/testing/selftests/resctrl/pmu_test.c | 123 ++++++++++++++++++++- 1 file changed, 122 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/resctrl/pmu_test.c b/tools/testing/selftests/resctrl/pmu_test.c index 25edb52fc1f6bf..478e85fb7dd997 100644 --- a/tools/testing/selftests/resctrl/pmu_test.c +++ b/tools/testing/selftests/resctrl/pmu_test.c @@ -9,6 +9,7 @@ #include "resctrl.h" #include +#include #define RESCTRL_PMU_NAME "resctrl" @@ -91,6 +92,116 @@ static int test_resctrl_pmu_event(int pmu_type, int mon_fd) return 0; } +static bool is_allowed_file(const char *filename) +{ + /* Only llc_occupancy and mbm files should be allowed */ + return (strstr(filename, "llc_occupancy") != NULL || + strstr(filename, "mbm_total_bytes") != NULL || + strstr(filename, "mbm_local_bytes") != NULL); +} + +static int test_file_safety(int pmu_type, const char *filepath) +{ + struct perf_event_attr pe = {0}; + int fd, perf_fd; + bool should_succeed; + + /* Try to open the file */ + fd = open(filepath, O_RDONLY); + if (fd < 0) { + /* File couldn't be opened, skip it */ + return 0; + } + + should_succeed = is_allowed_file(filepath); + + /* Setup perf event attributes */ + pe.type = pmu_type; + pe.config = fd; + pe.size = sizeof(pe); + pe.disabled = 1; + pe.exclude_kernel = 0; + pe.exclude_hv = 0; + + /* Try to open the perf event */ + perf_fd = perf_event_open(&pe, -1, 0, -1, 0); + + if (should_succeed) { + if (perf_fd < 0) { + ksft_print_msg("FAIL: Expected success but perf_event_open failed for %s: %s\n", + filepath, strerror(errno)); + close(fd); + return -1; + } + ksft_print_msg("PASS: Allowed file %s successfully opened perf event\n", filepath); + close(perf_fd); + } else { + if (perf_fd >= 0) { + ksft_print_msg("FAIL: Expected failure but perf_event_open succeeded for %s\n", + filepath); + close(perf_fd); + close(fd); + return -1; + } + ksft_print_msg("PASS: Blocked file %s correctly failed perf_event_open: %s\n", + filepath, strerror(errno)); + } + + close(fd); + return 0; +} + +static int walk_directory_recursive(int pmu_type, const char *dir_path) +{ + DIR *dir; + struct dirent *entry; + char full_path[1024]; + struct stat statbuf; + int ret = 0; + + dir = opendir(dir_path); + if (!dir) { + ksft_print_msg("Failed to open directory %s: %s\n", dir_path, strerror(errno)); + return -1; + } + + while ((entry = readdir(dir)) != NULL) { + /* Skip . and .. */ + if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0) + continue; + + snprintf(full_path, sizeof(full_path), "%s/%s", dir_path, entry->d_name); + + if (stat(full_path, &statbuf) != 0) { + ksft_print_msg("Failed to stat %s: %s\n", full_path, strerror(errno)); + continue; + } + + if (S_ISDIR(statbuf.st_mode)) { + /* Recursively walk subdirectories */ + if (walk_directory_recursive(pmu_type, full_path) != 0) { + ret = -1; + } + } else if (S_ISREG(statbuf.st_mode)) { + /* Test regular files */ + if (test_file_safety(pmu_type, full_path) != 0) { + ret = -1; + } + } + } + + closedir(dir); + return ret; +} + +static int test_resctrl_pmu_safety(int pmu_type) +{ + ksft_print_msg("Testing resctrl PMU safety - walking all files in %s\n", RESCTRL_PATH); + + /* Walk through all files and directories in /sys/fs/resctrl */ + return walk_directory_recursive(pmu_type, RESCTRL_PATH); +} + static bool pmu_feature_check(const struct resctrl_test *test) { return resctrl_mon_feature_exists("L3_MON", "llc_occupancy"); @@ -122,9 +233,19 @@ static int pmu_run_test(const struct resctrl_test *test, const struct user_param /* Clean up */ close(mon_fd); + if (ret != 0) { + ksft_print_msg("Basic resctrl PMU test failed\n"); + return ret; + } + + /* Run the safety test to ensure only appropriate files work */ + ret = test_resctrl_pmu_safety(pmu_type); + if (ret == 0) { - ksft_print_msg("Resctrl PMU test completed successfully\n"); + ksft_print_msg("All resctrl PMU tests completed successfully\n"); ksft_print_msg("Check dmesg for kernel log message with file path\n"); + } else { + ksft_print_msg("Resctrl PMU safety test failed\n"); } return ret;