diff --git a/.gitignore b/.gitignore index 98e28f56..92fd6557 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,5 @@ __pycache__/*.lock **.lock **__pycache__** .idea/ -.vscode/ \ No newline at end of file +.vscode/ +**.swp diff --git "a/src/Il\303\272vatar/.gitignore" "b/src/Il\303\272vatar/.gitignore" index 943c004b..6adb73b5 100644 --- "a/src/Il\303\272vatar/.gitignore" +++ "b/src/Il\303\272vatar/.gitignore" @@ -4,5 +4,5 @@ iluvatar_worker/src/worker.dev*json iluvatar_worker_library/tests/resources/worker.dev*json iluvatar_controller/src/controller.dev*json target/ -**.swp -.vscode/ \ No newline at end of file +**.sw* +.vscode/ diff --git "a/src/Il\303\272vatar/Cargo.toml" "b/src/Il\303\272vatar/Cargo.toml" index ccbb5fae..b3464e71 100644 --- "a/src/Il\303\272vatar/Cargo.toml" +++ "b/src/Il\303\272vatar/Cargo.toml" @@ -8,7 +8,9 @@ members = [ "iluvatar_library", "iluvatar_worker_library", "iluvatar_energy_mon", - "iluvatar_rpc" + "iluvatar_rpc", + "iluvatar_bpf_library", + "fs_policy_tsksz", ] resolver = "2" diff --git "a/src/Il\303\272vatar/Cross.toml" "b/src/Il\303\272vatar/Cross.toml" index 2343fab3..fa877878 100644 --- "a/src/Il\303\272vatar/Cross.toml" +++ "b/src/Il\303\272vatar/Cross.toml" @@ -5,5 +5,9 @@ passthrough = ["ARCH=amd64", "GO_VERSION=1.22.0", "CNI_VERSION=v1.1.1", "GOPATH= default-target = "x86_64-unknown-linux-gnu" # use this target if none is explicitly provided pre-build = [ # additional commands to run prior to building the package "dpkg --add-architecture $CROSS_DEB_ARCH", - "apt-get update && apt-get --assume-yes install protobuf-compiler iproute2 wget curl runc bridge-utils iptables net-tools sysstat" + "apt-get update && apt-get --assume-yes install protobuf-compiler iproute2 wget curl runc bridge-utils iptables net-tools sysstat libelf-dev lsb-release wget software-properties-common gnupg", + "wget https://apt.llvm.org/llvm.sh && chmod +x llvm.sh && ./llvm.sh all && ln -s /usr/bin/ld.lld-18 /usr/bin/ld.lld && ln -s /usr/bin/clang-18 /usr/bin/clang && ln -s /usr/bin/clang++-18 /usr/bin/clang++ && rm llvm.sh" ] + + + diff --git "a/src/Il\303\272vatar/Makefile" "b/src/Il\303\272vatar/Makefile" index 1c9f9de8..a6598a1e 100644 --- "a/src/Il\303\272vatar/Makefile" +++ "b/src/Il\303\272vatar/Makefile" @@ -12,7 +12,7 @@ endif RUST_C?=cross DEBUG_FLAGS=--all-targets $(TARGET) -j $(NPROCS) RELEASE_FLAGS=--lib --bins $(TARGET) -j $(NPROCS) --release -CARGO_ARGS?="" +CARGO_ARGS?= default: debug @@ -23,7 +23,7 @@ clean: @$(RUST_C) clean check: @echo "Checking" - @RUSTFLAGS=$(RUST_FLAGS) $(RUST_C) check --all-features $(DEBUG_FLAGS) $(CARGO_ARGS) + @RUSTFLAGS=$(RUST_FLAGS) $(RUST_C) check $(DEBUG_FLAGS) $(CARGO_ARGS) release: @echo "Building release" @RUSTFLAGS=$(RUST_FLAGS) $(RUST_C) build $(RELEASE_FLAGS) $(CARGO_ARGS) @@ -37,6 +37,10 @@ tiny: debug: @echo "Building debug" @RUSTFLAGS=$(RUST_FLAGS) $(RUST_C) build $(DEBUG_FLAGS) $(CARGO_ARGS) +fix: + @echo "Fixing lint errors" + @RUSTFLAGS=$(RUST_FLAGS) $(RUST_C) fix $(DEBUG_FLAGS) $(CARGO_ARGS) + @RUSTFLAGS=$(RUST_FLAGS) $(RUST_C) clippy --fix --allow-no-vcs --workspace --examples --benches --no-deps -- -Dclippy::suspicious -Dclippy::correctness -Dclippy::perf -Aclippy::single_match -Aclippy::new_without_default -Aclippy::too_many_arguments -Aclippy::type-complexity -Dclippy::from_over_into -Aclippy::redundant-field-names -Dwarnings spans: @echo "Building full_spans" @RUSTFLAGS=$(RUST_FLAGS) $(RUST_C) build --features full_spans $(RELEASE_FLAGS) $(CARGO_ARGS) diff --git "a/src/Il\303\272vatar/ansible/worker.yml" 
"b/src/Il\303\272vatar/ansible/worker.yml" index e3ee4f63..a1ecce46 100644 --- "a/src/Il\303\272vatar/ansible/worker.yml" +++ "b/src/Il\303\272vatar/ansible/worker.yml" @@ -123,6 +123,14 @@ remote_src: "{{__remote_bin_src}}" become: yes + - name: copy fs_policy_tsksz + ansible.builtin.copy: + src: "{{__bin_src}}/fs_policy_tsksz" + dest: "{{bin_dir}}/" + mode: "preserve" + remote_src: "{{__remote_bin_src}}" + become: yes + - name: copy worker config ansible.builtin.copy: src: "{{__bin_src}}/{{worker.config_name}}" diff --git "a/src/Il\303\272vatar/fs_policy_tsksz/.gitignore" "b/src/Il\303\272vatar/fs_policy_tsksz/.gitignore" new file mode 100644 index 00000000..e69de29b diff --git "a/src/Il\303\272vatar/fs_policy_tsksz/Cargo.toml" "b/src/Il\303\272vatar/fs_policy_tsksz/Cargo.toml" new file mode 100644 index 00000000..c7ed8870 --- /dev/null +++ "b/src/Il\303\272vatar/fs_policy_tsksz/Cargo.toml" @@ -0,0 +1,33 @@ +[package] +name = "fs_policy_tsksz" +version = "0.0.3" +authors = ["Abdul Rehman "] +edition = "2021" +description = "A simple scheduler that preserves locality for a function cgroup" +license = "GPL-2.0-only" + +[dependencies] +anyhow = "1.0.65" +plain = "0.2.3" +ctrlc = { version = "3.1", features = ["termination"] } +libbpf-rs = "0.24.1" +libc = "0.2.137" +scx_utils = { version = "1.0.7" } +scx_rustland_core = { version = "2.2.3" } + +# Specific to iluvatar +iluvatar_library = { path = "../iluvatar_library" } +iluvatar_worker_library = { path = "../iluvatar_worker_library" } +clap = { version = "4.5.4", features = ["derive"] } +ipc-channel = { version = "0.18.1", features = ["memfd"] } +serde = { version = "1.0" } + +[build-dependencies] +scx_utils = { version = "1.0.7" } +scx_rustland_core = { version = "2.2.3" } + +[features] +enable_backtrace = [] + + + diff --git "a/src/Il\303\272vatar/fs_policy_tsksz/LICENSE" "b/src/Il\303\272vatar/fs_policy_tsksz/LICENSE" new file mode 120000 index 00000000..5853aaea --- /dev/null +++ "b/src/Il\303\272vatar/fs_policy_tsksz/LICENSE" @@ -0,0 +1 @@ +../../../LICENSE \ No newline at end of file diff --git "a/src/Il\303\272vatar/fs_policy_tsksz/README.md" "b/src/Il\303\272vatar/fs_policy_tsksz/README.md" new file mode 100644 index 00000000..f57115fc --- /dev/null +++ "b/src/Il\303\272vatar/fs_policy_tsksz/README.md" @@ -0,0 +1,20 @@ +# scx_rlfifo + +This is a single user-defined scheduler used within [sched_ext](https://github.com/sched-ext/scx/tree/main), which is a Linux kernel feature which enables implementing kernel thread schedulers in BPF and dynamically loading them. [Read more about sched_ext](https://github.com/sched-ext/scx/tree/main). + +## Overview + +scx_rlfifo is a simple FIFO scheduler runs in user-space, based on the +scx_rustland_core framework. + +## Typical Use Case + +This scheduler is provided as a simple template that can be used as a baseline +to test more complex scheduling policies. + +## Production Ready? + +Definitely not. Using this scheduler in a production environment is not +recommended, unless there are specific requirements that necessitate a basic +FIFO scheduling approach. Even then, it's still recommended to use the kernel's +SCHED_FIFO real-time class. diff --git "a/src/Il\303\272vatar/fs_policy_tsksz/build.rs" "b/src/Il\303\272vatar/fs_policy_tsksz/build.rs" new file mode 100644 index 00000000..42e96b71 --- /dev/null +++ "b/src/Il\303\272vatar/fs_policy_tsksz/build.rs" @@ -0,0 +1,11 @@ +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2. 
+
+fn main() {
+    scx_utils::BpfBuilder::new()
+        .unwrap()
+        .enable_intf("src/bpf/intf.h", "bpf_intf.rs")
+        .enable_skel("src/bpf/main.bpf.c", "bpf")
+        .build()
+        .unwrap();
+}
diff --git "a/src/Il\303\272vatar/fs_policy_tsksz/meson.build" "b/src/Il\303\272vatar/fs_policy_tsksz/meson.build"
new file mode 100644
index 00000000..25e1d4a1
--- /dev/null
+++ "b/src/Il\303\272vatar/fs_policy_tsksz/meson.build"
@@ -0,0 +1,14 @@
+if serialize
+    sched_deps = [libbpf, bpftool_target, sched]
+else
+    sched_deps = [libbpf, bpftool_target]
+endif
+
+sched = custom_target('scx_rlfifo',
+                      output: '@PLAINNAME@.__PHONY__',
+                      input: 'Cargo.toml',
+                      command: [cargo, 'build', '--manifest-path=@INPUT@', '--target-dir=@OUTDIR@',
+                                cargo_build_args],
+                      env: cargo_env,
+                      depends: sched_deps,
+                      build_always_stale: true)
diff --git "a/src/Il\303\272vatar/fs_policy_tsksz/rustfmt.toml" "b/src/Il\303\272vatar/fs_policy_tsksz/rustfmt.toml"
new file mode 100644
index 00000000..b7258ed0
--- /dev/null
+++ "b/src/Il\303\272vatar/fs_policy_tsksz/rustfmt.toml"
@@ -0,0 +1,8 @@
+# Get help on options with `rustfmt --help=config`
+# Please keep these in alphabetical order.
+edition = "2021"
+group_imports = "StdExternalCrate"
+imports_granularity = "Item"
+merge_derives = false
+use_field_init_shorthand = true
+version = "Two"
diff --git "a/src/Il\303\272vatar/fs_policy_tsksz/src/bpf.rs" "b/src/Il\303\272vatar/fs_policy_tsksz/src/bpf.rs"
new file mode 100644
index 00000000..16f759e3
--- /dev/null
+++ "b/src/Il\303\272vatar/fs_policy_tsksz/src/bpf.rs"
@@ -0,0 +1,189 @@
+// Copyright (c) Andrea Righi
+
+// This software may be used and distributed according to the terms of the
+// GNU General Public License version 2.
+
+use scx_utils::enums::scx_enums;
+use scx_utils::import_enums;
+use std::mem::MaybeUninit;
+
+use crate::bpf_intf;
+use crate::bpf_skel::*;
+
+use anyhow::Context;
+use anyhow::Result;
+
+use libbpf_rs::skel::OpenSkel;
+use libbpf_rs::skel::Skel;
+use libbpf_rs::skel::SkelBuilder;
+use libbpf_rs::OpenObject;
+
+use libc::{pthread_self, pthread_setschedparam, sched_param};
+
+#[cfg(target_env = "musl")]
+use libc::timespec;
+
+use scx_utils::scx_ops_attach;
+use scx_utils::scx_ops_load;
+use scx_utils::scx_ops_open;
+use scx_utils::uei_exited;
+use scx_utils::uei_report;
+use scx_utils::UserExitInfo;
+
+use scx_rustland_core::ALLOCATOR;
+
+// Defined in UAPI
+const SCHED_EXT: i32 = 7;
+
+pub struct BpfScheduler<'cb> {
+    pub skel: BpfSkel<'cb>,                   // Low-level BPF connector
+    struct_ops: Option<libbpf_rs::Link>,      // Low-level BPF methods
+    queued_stats: libbpf_rs::RingBuffer<'cb>, // ring buffer of policy stats pushed by the BPF side
+}
+
+#[derive(Clone, Copy, Debug)]
+#[allow(non_camel_case_types, dead_code)]
+pub struct lpolicy_stats(bpf_intf::policy_stats);
+
+macro_rules! define_buffer {
+    ( $bufname: ident, $abufname: ident, $abuf: ident, $callback: ident, $tdst: ty ) => {
+        const $bufname: usize = std::mem::size_of::<$tdst>();
+        #[repr(align(8))]
+        struct $abufname([u8; $bufname]);
+        static mut $abuf: $abufname = $abufname([0; $bufname]);
+        fn $callback(data: &[u8]) -> i32 {
+            unsafe {
+                $abuf.0.copy_from_slice(data);
+            }
+            LIBBPF_STOP
+        }
+    };
+}
+
+define_buffer!(
+    BUFSIZE_STATS,
+    AlignedBufferstats,
+    BUF_STATS,
+    callback_stats,
+    bpf_intf::policy_stats
+);
+fn fetch_stats(bytes: &[u8]) -> lpolicy_stats {
+    let ps = unsafe { *(bytes.as_ptr() as *const bpf_intf::policy_stats) };
+    lpolicy_stats(ps)
+}
+
+// Special negative error code for libbpf to stop after consuming just one item from a BPF
+// ring buffer.
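+// Returning a negative value from a ring-buffer callback makes libbpf stop
+// consuming further records, and libbpf-rs surfaces that value as the return
+// of consume_raw(); dequeue_stats() below matches on it to read exactly one
+// policy_stats record per call. A minimal consumer sketch (hypothetical):
+//
+//     while let Ok(Some(stats)) = sched.dequeue_stats() {
+//         println!("per-DSQ queue lengths: {:?}", stats.0.tsks_Q);
+//     }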
+const LIBBPF_STOP: i32 = -255;
+
+impl<'cb> BpfScheduler<'cb> {
+    pub fn init(
+        open_object: &'cb mut MaybeUninit<OpenObject>,
+        slice_us: u64,
+        exit_dump_len: u32,
+        verbose: bool,
+    ) -> Result<Self> {
+        // Open the BPF prog first for verification.
+        let mut skel_builder = BpfSkelBuilder::default();
+        skel_builder.obj_builder.debug(verbose);
+        let mut skel = scx_ops_open!(skel_builder, open_object, tsksz_ops)?;
+
+        // Lock all the memory to prevent page faults that could trigger potential deadlocks during
+        // scheduling.
+        ALLOCATOR.lock_memory();
+
+        skel.struct_ops.tsksz_ops_mut().exit_dump_len = exit_dump_len;
+        skel.maps.bss_data.usersched_pid = std::process::id();
+        skel.maps.rodata_data.effective_slice_ns = slice_us * 1000;
+
+        let path = "/sys/fs/bpf/func_metadata";
+        let func_metadata = &mut skel.maps.func_metadata;
+        assert!(func_metadata.reuse_pinned_map("/asdf").is_err());
+        func_metadata
+            .reuse_pinned_map(path)
+            .expect("failed to reuse map");
+
+        // Attach the BPF scheduler.
+        let mut skel = scx_ops_load!(skel, tsksz_ops, uei)?;
+        let struct_ops = Some(scx_ops_attach!(skel, tsksz_ops)?);
+
+        // Build the ring buffer of queued stats.
+        let rb_map = &mut skel.maps.queued_stats;
+        let mut builder = libbpf_rs::RingBufferBuilder::new();
+        builder.add(rb_map, callback_stats).unwrap();
+        let queued_stats = builder.build().unwrap();
+
+        // Make sure to use the SCHED_EXT class at least for the scheduler itself.
+        match Self::use_sched_ext() {
+            0 => Ok(Self {
+                skel,
+                struct_ops,
+                queued_stats,
+            }),
+            err => Err(anyhow::Error::msg(format!(
+                "sched_setscheduler error: {}",
+                err
+            ))),
+        }
+    }
+
+    // Receive a stats record published by the BPF scheduler.
+    pub fn dequeue_stats(&mut self) -> Result<Option<lpolicy_stats>, i32> {
+        match self.queued_stats.consume_raw() {
+            0 => Ok(None),
+            LIBBPF_STOP => {
+                // A stats record was received; copy it out of the static buffer.
+                let stats = unsafe { fetch_stats(&BUF_STATS.0) };
+                Ok(Some(stats))
+            }
+            res if res < 0 => Err(res),
+            res => panic!(
+                "Unexpected return value from libbpf-rs::consume_raw(): {}",
+                res
+            ),
+        }
+    }
+
+    // Set the scheduling class of the scheduler itself to SCHED_EXT.
+    fn use_sched_ext() -> i32 {
+        #[cfg(target_env = "gnu")]
+        let param: sched_param = sched_param { sched_priority: 0 };
+        #[cfg(target_env = "musl")]
+        let param: sched_param = sched_param {
+            sched_priority: 0,
+            sched_ss_low_priority: 0,
+            sched_ss_repl_period: timespec {
+                tv_sec: 0,
+                tv_nsec: 0,
+            },
+            sched_ss_init_budget: timespec {
+                tv_sec: 0,
+                tv_nsec: 0,
+            },
+            sched_ss_max_repl: 0,
+        };
+
+        unsafe { pthread_setschedparam(pthread_self(), SCHED_EXT, &param as *const sched_param) }
+    }
+
+    // Read the exit code from the BPF part.
+    pub fn exited(&mut self) -> bool {
+        uei_exited!(&self.skel, uei)
+    }
+
+    // Called on exit to shut down and report the exit message from the BPF part.
+    pub fn shutdown_and_report(&mut self) -> Result<UserExitInfo> {
+        self.struct_ops.take();
+        uei_report!(&self.skel, uei)
+    }
+}
+
+// Disconnect the low-level BPF scheduler.
+impl<'a> Drop for BpfScheduler<'a> {
+    fn drop(&mut self) {
+        if let Some(struct_ops) = self.struct_ops.take() {
+            drop(struct_ops);
+        }
+        ALLOCATOR.unlock_memory();
+    }
+}
diff --git "a/src/Il\303\272vatar/fs_policy_tsksz/src/bpf/intf.h" "b/src/Il\303\272vatar/fs_policy_tsksz/src/bpf/intf.h"
new file mode 100644
index 00000000..ea17d040
--- /dev/null
+++ "b/src/Il\303\272vatar/fs_policy_tsksz/src/bpf/intf.h"
@@ -0,0 +1,133 @@
+// This software may be used and distributed according to the terms of the
+// GNU General Public License version 2.
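+//
+// intf.h is shared between the BPF program (main.bpf.c) and user space:
+// build.rs feeds it through BpfBuilder::enable_intf(), so the structs below
+// (e.g. policy_stats) are also visible to Rust via the generated bpf_intf
+// bindings.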
+ +#ifndef __INTF_H +#define __INTF_H + + +//////////////////////////////// +// Macros + +#define MAX_LOCKS 1 +#define STATS_LOCK 0 + +#define LOCK_HEADER( lock_name ) \ + u32 lock_key = lock_name; \ + struct lock_wrapper *lockw; \ + lockw = bpf_map_lookup_elem(&global_locks, &lock_key); \ + if( lockw ) + +#define MAX_NAME_LEN 16 +#define MAX_FUNCS 50 +#define FUNC_METADATA_KEYSIZE MAX_NAME_LEN // because the kernel fs inode name is 15 characters +#define MAX_ENQUEUED_TASKS 8192 +#define MAX_CGROUPS 64 + +#define SHARED_DSQ MAX_CPUS/2 +#define USCHED_DSQ SHARED_DSQ + 1 +#define USCHED_CORE MAX_CPUS - 1 + +#define QMAX_THRESHOLD 80 + +// info msg with a specific tag +#define info_msg(_fmt, ...) \ + do { \ + bpf_printk("[info-tsksz] " _fmt, ##__VA_ARGS__); \ + } while (0) + +// see comment over e2e_thresholds +#define MAX_E2E_BUCKETS 4 +#define RESERVED_E2E_BUCKET 0 + + +#define MAX(x, y) ((x) > (y) ? (x) : (y)) +#define MIN(x, y) ((x) < (y) ? (x) : (y)) + +#define NSEC_PER_SEC 1000000000L +#define ONE_MSEC 1000000L +#define ONE_NSEC 1000L +#define CLOCK_BOOTTIME 7 + +#include +#ifndef __kptr +#ifdef __KERNEL__ +#error "__kptr_ref not defined in the kernel" +#endif +#define __kptr +#endif + +#ifndef __KERNEL__ +typedef unsigned char u8; +typedef unsigned int u32; +typedef int s32; +typedef unsigned long long u64; +typedef long long s64; +#endif + +/* Check a condition at build time */ +#define BUILD_BUG_ON(expr) \ + do { \ + extern char __build_assert__[(expr) ? -1 : 1] \ + __attribute__((unused)); \ + } while( 0 ) + +/* + * Maximum amount of CPUs supported by this scheduler (this defines the size of + * cpu_map that is used to store the idle state and CPU ownership). + */ +#define MAX_CPUS 48 + +/* Special dispatch flags */ +enum { + /* + * Do not assign any specific CPU to the task. + * + * The task will be dispatched to the global shared DSQ and it will run + * on the first CPU available. + */ + RL_CPU_ANY = 1 << 0, + + /* + * Allow to preempt the target CPU when dispatching the task. + */ + RL_PREEMPT_CPU = 1 << 1, +}; + +/* + * Task sent to the user-space scheduler by the BPF dispatcher. + * + * All attributes are collected from the kernel by the the BPF component. + */ +struct queued_task_ctx { + s32 pid; + s32 cpu; /* CPU where the task is running (-1 = exiting) */ + u64 cpumask_cnt; /* cpumask generation counter */ + u64 sum_exec_runtime; /* Total cpu time */ + u64 nvcsw; /* Voluntary context switches */ + u64 weight; /* Task static priority */ +}; + +/* + * Task sent to the BPF dispatcher by the user-space scheduler. + * + * This struct can be easily extended to send more information to the + * dispatcher (i.e., a target CPU, a variable time slice, etc.). 
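+ *
+ * Note: in this scheduler the user-space side currently only drains stats
+ * and function characteristics; dispatching decisions are made entirely in
+ * BPF, so this struct appears to be retained from the scx_rustland_core
+ * template.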
 */
+struct dispatched_task_ctx {
+	s32 pid;
+	s32 cpu; /* CPU where the task should be dispatched */
+	u64 flags; /* special dispatch flags */
+	u64 cpumask_cnt; /* cpumask generation counter */
+	u64 slice_ns; /* time slice assigned to the task (0=default) */
+};
+
+typedef struct packet_pid {
+	int pid;
+} packet_pid_t;
+
+typedef struct policy_stats {
+	unsigned int timestamp_ms;
+	int tsks_Q[SHARED_DSQ];
+} stats_t;
+
+#endif /* __INTF_H */
diff --git "a/src/Il\303\272vatar/fs_policy_tsksz/src/bpf/main.bpf.c" "b/src/Il\303\272vatar/fs_policy_tsksz/src/bpf/main.bpf.c"
new file mode 100644
index 00000000..6243fa05
--- /dev/null
+++ "b/src/Il\303\272vatar/fs_policy_tsksz/src/bpf/main.bpf.c"
@@ -0,0 +1,1296 @@
+/* Copyright (c) Abdul Rehman */
+/*
+   Task Size Interval Assignment BPF Scheduler
+   that assigns a specific queue (Q) to each function cgroup
+   based on its metadata.
+
+   Each Q dispatches tasks to a fixed pair of cores.
+   Whichever is idle runs the task, giving a power-of-two choice.
+
+   Tasks run for a fixed timeslice.
+
+   This software may be used and distributed according to the terms of the
+   GNU General Public License version 2.
+ */
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "intf.h"
+
+char _license[] SEC("license") = "GPL";
+
+UEI_DEFINE(uei);
+
+////////////////////////////////////////
+// Structures
+
+typedef struct CharVal{
+	u32 prio;
+	u32 e2e;
+	u32 loc;
+} MetaVal_t;
+
+// hash map of per-function metadata,
+// keyed by cgroup (function) name
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(max_entries, MAX_FUNCS);
+	__uint(key_size, sizeof(char)*FUNC_METADATA_KEYSIZE); /* cgroup name */
+	__uint(value_size, sizeof(MetaVal_t)); /* Value Structure */
+} func_metadata SEC(".maps");
+
+typedef struct CgroupInfo {
+	u64 id;
+	s32 qid;
+	s32 tsk_cnt;
+	char name[MAX_NAME_LEN];
+} CgroupInfo_t;
+
+/*
+   HashMap to keep track of the Cgroups whose
+   tasks we are capturing.
+*/
+struct {
+	__uint(type, BPF_MAP_TYPE_LRU_HASH);
+	__uint(max_entries, MAX_CGROUPS);
+	__type(key, __u64);
+	__type(value, CgroupInfo_t);
+} CgroupsHashMap SEC(".maps");
+
+typedef struct TaskInfo {
+	s32 pid;
+	s32 qid_cur;
+	CgroupInfo_t *cgroupctx;
+} TaskInfo_t;
+
+/*
+   HashMap to keep track of the task context.
+*/
+struct {
+	__uint(type, BPF_MAP_TYPE_LRU_HASH);
+	__uint(max_entries, MAX_ENQUEUED_TASKS);
+	__type(key, __s32);        // pid of the task
+	__type(value, TaskInfo_t); // context of the task
+} TasksHashMap SEC(".maps");
+
+/*
+ * Heartbeat timer used to periodically trigger the check to run the user-space
+ * scheduler.
+ *
+ * Without this timer we may starve the scheduler if the system is completely
+ * idle and hit the watchdog that would auto-kill this scheduler.
+ */
+struct usersched_timer {
+	struct bpf_timer timer;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(max_entries, 1);
+	__type(key, u32);
+	__type(value, struct usersched_timer);
+} usersched_timer SEC(".maps");
+
+// The map containing pids of tasks that are to be switched to the SchedEXT policy;
+// it is drained by the user space thread
+struct {
+	__uint(type, BPF_MAP_TYPE_RINGBUF);
+	__uint(max_entries, MAX_ENQUEUED_TASKS);
+} queued_pids SEC(".maps");
+
+stats_t global_stats;
+
+struct lock_wrapper {
+	struct bpf_spin_lock lock;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__type(key, u32);
+	__type(value, struct lock_wrapper);
+	__uint(max_entries, 1);
+	__uint(map_flags, 0);
+} global_locks SEC(".maps");
+
+// The map containing periodic scheduler stats;
+// it is drained by the user space thread +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, MAX_ENQUEUED_TASKS); +} queued_stats SEC(".maps"); + +///////////////////////////////////////////// +// Global Variables + +/* + * Scheduler attributes and statistics. + */ +u32 usersched_pid; /* User-space scheduler PID */ + +/* + * Effective time slice: allow the scheduler to override the default time slice + * (slice_ns) if this one is set. + */ +// const volatile u64 effective_slice_ns = SCX_SLICE_DFL; /* SCX_SLICE_DFL is 20 ms */ +const volatile u64 effective_slice_ns = 10 * ONE_MSEC; + +// Number of tasks being handled by the bpf scheduler +volatile u64 nr_tasks = 0; +volatile u64 nr_eq_tasks = 0; + +/* + +Buckets / Groups + + Unassigned 0 2000 4000 INF -- Thresholds + + │ Group Reserved │ Group 1 │ Group 2 │ Group Rest │ + │ │ │ │ │ + │ No E2E │ │ │ all > 4000 │ + │ │ │ │ │ + │ ┌┐ ┌┐ ┌┐ ┌┐ │ ┌┐ ┌┐ ┌┐ ┌┐ │ ┌┐ ┌┐ ┌┐ ┌┐ │ ┌┐ ┌┐ ┌┐ ┌┐ │ + │ ││ ││ ││ ││ │ ││ ││ ││ ││ │ ││ ││ ││ ││ │ ││ ││ ││ ││ │ + │ ││ ││ ││ ││ │ ││ ││ ││ ││ │ ││ ││ ││ ││ │ ││ ││ ││ ││ │ + │ ││ ││ ││ ││ │ ││ ││ ││ ││ │ ││ ││ ││ ││ │ ││ ││ ││ ││ │ + │ └┘ └┘ └┘ └┘ │ └┘ └┘ └┘ └┘ │ └┘ └┘ └┘ └┘ │ └┘ └┘ └┘ └┘ │ + +*/ + +// Thresholds for each bucket. +volatile u32 e2e_thresholds[MAX_E2E_BUCKETS -2]; // we don't need thresholds + // for reserved and rest + // buckets +// Next Qid to assign for a given bucket +// it's initialized in init_scheduler +static volatile s32 bkt_next_qid[MAX_E2E_BUCKETS] = {0}; + +/* + * Flag used to wake-up the user-space scheduler. + */ +static volatile u32 usersched_needed; + +// verbose debug output flag +bool verbose = true; + +///////////////////////////////// +// Function Declarations + +static __always_inline CgroupInfo_t get_task_cgroupinfo( struct task_struct *p ); +static __always_inline CgroupInfo_t * get_chashmap( __u64 cid ); +static __always_inline MetaVal_t * get_func_metadata( char *key_name ); +static __always_inline bool verify_qid( s32 qid ); + +void scx_bpf_switch_to_scx(struct task_struct *p) __ksym; +struct kernfs_node *bpf_get_parent_kernfs(struct cgroup *cgrp) __ksym; +void bpf_release_kernfs(struct kernfs_node *kn) __ksym; + +///////////////////////////////// +// Function Definitions + +static __always_inline char readchar( const char *str, u32 index ) +{ + char c; + bpf_probe_read_kernel( &c, sizeof(c), str + index ); + return c; +} + +static __always_inline bool str_is_docker( const char *str ) +{ + // 0123456 + char * docker = "docker"; + int i; + bool is_match = true; + + if (!str) { + return false; + } + + bpf_for(i, 0, 6){ + char c = readchar( str, i ); + if ( c == '\0' ){ + is_match = false; + break; + } + if (docker[i] != c){ + is_match = false; + break; + } + } + return is_match; +} + +static __always_inline bool is_docker_parent( struct cgroup *cgrp ) +{ + struct kernfs_node *parent; + bool result; + + parent = bpf_get_parent_kernfs( cgrp ); + if (!parent){ + return false; + } + + result = str_is_docker( parent->name ); + + bpf_release_kernfs( parent ); + return result; +} + +void global_stats_update_tsks_Q( s32 qid, s32 tsks_cnt ){ + if( verify_qid(qid) ){ + global_stats.tsks_Q[qid] = tsks_cnt; + } +} + +void global_stats_add_tsks_Q( s32 qid, s32 tsks_cnt ){ + if( verify_qid(qid) ){ + __sync_fetch_and_add( &global_stats.tsks_Q[qid], tsks_cnt ); + } +} + +void global_stats_sub_tsks_Q( s32 qid, s32 tsks_cnt ){ + if( verify_qid(qid) ){ + __sync_fetch_and_sub( &global_stats.tsks_Q[qid], tsks_cnt ); + } +} + +s32 get_groupid( u32 e2e ) { + int i; + + // can 
setup if else branch for thresholds + if( e2e == 0 ){ + return RESERVED_E2E_BUCKET; + }; + + if( e2e < e2e_thresholds[0] ) { + return 1; + } else if ( e2e < e2e_thresholds[1] ) { + return 2; + } + + return 3; +} + +/* + Test Results + root@v-021:/data2/ar/workspace/temp# cat /sys/kernel/debug/tracing/trace_pipe | grep -i test + fs_policy_tsksz-2481710 [002] ...11 227122.741565: bpf_trace_printk: [info-tsksz] [test][get_groupid] e2e: 0 -> gid 0 -- should be 0 -- passed: 1 + fs_policy_tsksz-2481710 [002] ...11 227122.741567: bpf_trace_printk: [info-tsksz] [test][get_groupid] e2e: 100 -> gid 1 -- should be 1 -- passed: 1 + fs_policy_tsksz-2481710 [002] ...11 227122.741568: bpf_trace_printk: [info-tsksz] [test][get_groupid] e2e: 1000 -> gid 1 -- should be 1 -- passed: 1 + fs_policy_tsksz-2481710 [002] ...11 227122.741569: bpf_trace_printk: [info-tsksz] [test][get_groupid] e2e: 2000 -> gid 2 -- should be 2 -- passed: 1 + fs_policy_tsksz-2481710 [002] ...11 227122.741570: bpf_trace_printk: [info-tsksz] [test][get_groupid] e2e: 3000 -> gid 2 -- should be 2 -- passed: 1 + fs_policy_tsksz-2481710 [002] ...11 227122.741571: bpf_trace_printk: [info-tsksz] [test][get_groupid] e2e: 4000 -> gid 3 -- should be 3 -- passed: 1 + fs_policy_tsksz-2481710 [002] ...11 227122.741572: bpf_trace_printk: [info-tsksz] [test][get_groupid] e2e: 5000 -> gid 3 -- should be 3 -- passed: 1 +*/ +static __always_inline void verify_get_groupid(){ + s32 gid; + s32 sgid; + +#define TESTCASE_get_groupid( e2e, sgid ) \ + gid = get_groupid( e2e ); \ + info_msg("[test][get_groupid] e2e: %d -> gid %d -- should be %d -- passed: %d ", \ + e2e, \ + gid, \ + sgid, \ + (gid == sgid) \ + ); + + TESTCASE_get_groupid( 0, RESERVED_E2E_BUCKET ) + TESTCASE_get_groupid( 100, 1 ) + TESTCASE_get_groupid( 1000, 1 ) + TESTCASE_get_groupid( 2000, 2 ) + TESTCASE_get_groupid( 3000, 2 ) + TESTCASE_get_groupid( 4000, 3 ) + TESTCASE_get_groupid( 5000, 3 ) +} + +static __always_inline s32 gen_qid_new( s32 gid ) +{ + if( !(0 <= gid && gid < MAX_E2E_BUCKETS) ){ + return -1; + } + + s32 t = bkt_next_qid[gid]++; + s32 gap = SHARED_DSQ / MAX_E2E_BUCKETS; // 6 + s32 lower = gap * gid; // 0,6 + s32 upper = lower + gap; // 6,12 + + if ( bkt_next_qid[gid] == upper ) { + bkt_next_qid[gid] = lower; + } + + return t; +} + +/* + Test Results + root@v-021:/data2/ar/workspace/temp# cat /sys/kernel/debug/tracing/trace_pipe | grep -i gen_qid_new + fs_policy_tsksz-2491375 [047] ...11 231851.589544: bpf_trace_printk: [info-tsksz] [test][gen_qid_new] gid: -1 -> qid -1 -- should be -1 -- passed: 1 + fs_policy_tsksz-2491375 [047] ...11 231851.589545: bpf_trace_printk: [info-tsksz] [test][gen_qid_new] gid: 0 -> qid 0 -- should be 0 -- passed: 1 + fs_policy_tsksz-2491375 [047] ...11 231851.589546: bpf_trace_printk: [info-tsksz] [test][gen_qid_new] gid: 0 -> qid 1 -- should be 1 -- passed: 1 + fs_policy_tsksz-2491375 [047] ...11 231851.589547: bpf_trace_printk: [info-tsksz] [test][gen_qid_new] gid: 0 -> qid 2 -- should be 2 -- passed: 1 + fs_policy_tsksz-2491375 [047] ...11 231851.589547: bpf_trace_printk: [info-tsksz] [test][gen_qid_new] gid: 0 -> qid 3 -- should be 3 -- passed: 1 + fs_policy_tsksz-2491375 [047] ...11 231851.589548: bpf_trace_printk: [info-tsksz] [test][gen_qid_new] gid: 0 -> qid 4 -- should be 4 -- passed: 1 + fs_policy_tsksz-2491375 [047] ...11 231851.589549: bpf_trace_printk: [info-tsksz] [test][gen_qid_new] gid: 0 -> qid 5 -- should be 5 -- passed: 1 + fs_policy_tsksz-2491375 [047] ...11 231851.589550: bpf_trace_printk: [info-tsksz] [test][gen_qid_new] 
gid: 0 -> qid 0 -- should be 0 -- passed: 1 + fs_policy_tsksz-2491375 [047] ...11 231851.589551: bpf_trace_printk: [info-tsksz] [test][gen_qid_new] gid: 1 -> qid 6 -- should be 6 -- passed: 1 + fs_policy_tsksz-2491375 [047] ...11 231851.589552: bpf_trace_printk: [info-tsksz] [test][gen_qid_new] gid: 1 -> qid 7 -- should be 7 -- passed: 1 + fs_policy_tsksz-2491375 [047] ...11 231851.589552: bpf_trace_printk: [info-tsksz] [test][gen_qid_new] gid: 1 -> qid 8 -- should be 8 -- passed: 1 + fs_policy_tsksz-2491375 [047] ...11 231851.589553: bpf_trace_printk: [info-tsksz] [test][gen_qid_new] gid: 1 -> qid 9 -- should be 9 -- passed: 1 + fs_policy_tsksz-2491375 [047] ...11 231851.589554: bpf_trace_printk: [info-tsksz] [test][gen_qid_new] gid: 1 -> qid 10 -- should be 10 -- passed: 1 + fs_policy_tsksz-2491375 [047] ...11 231851.589555: bpf_trace_printk: [info-tsksz] [test][gen_qid_new] gid: 1 -> qid 11 -- should be 11 -- passed: 1 + fs_policy_tsksz-2491375 [047] ...11 231851.589556: bpf_trace_printk: [info-tsksz] [test][gen_qid_new] gid: 1 -> qid 6 -- should be 6 -- passed: 1 + fs_policy_tsksz-2491375 [047] ...11 231851.589557: bpf_trace_printk: [info-tsksz] [test][gen_qid_new] gid: 2 -> qid 12 -- should be 12 -- passed: 1 + fs_policy_tsksz-2491375 [047] ...11 231851.589558: bpf_trace_printk: [info-tsksz] [test][gen_qid_new] gid: 2 -> qid 13 -- should be 13 -- passed: 1 + fs_policy_tsksz-2491375 [047] ...11 231851.589558: bpf_trace_printk: [info-tsksz] [test][gen_qid_new] gid: 2 -> qid 14 -- should be 14 -- passed: 1 + fs_policy_tsksz-2491375 [047] ...11 231851.589559: bpf_trace_printk: [info-tsksz] [test][gen_qid_new] gid: 2 -> qid 15 -- should be 15 -- passed: 1 + fs_policy_tsksz-2491375 [047] ...11 231851.589560: bpf_trace_printk: [info-tsksz] [test][gen_qid_new] gid: 2 -> qid 16 -- should be 16 -- passed: 1 + fs_policy_tsksz-2491375 [047] ...11 231851.589561: bpf_trace_printk: [info-tsksz] [test][gen_qid_new] gid: 2 -> qid 17 -- should be 17 -- passed: 1 + fs_policy_tsksz-2491375 [047] ...11 231851.589562: bpf_trace_printk: [info-tsksz] [test][gen_qid_new] gid: 2 -> qid 12 -- should be 12 -- passed: 1 + fs_policy_tsksz-2491375 [047] ...11 231851.589563: bpf_trace_printk: [info-tsksz] [test][gen_qid_new] gid: 3 -> qid 18 -- should be 18 -- passed: 1 + fs_policy_tsksz-2491375 [047] ...11 231851.589563: bpf_trace_printk: [info-tsksz] [test][gen_qid_new] gid: 3 -> qid 19 -- should be 19 -- passed: 1 + fs_policy_tsksz-2491375 [047] ...11 231851.589564: bpf_trace_printk: [info-tsksz] [test][gen_qid_new] gid: 3 -> qid 20 -- should be 20 -- passed: 1 + fs_policy_tsksz-2491375 [047] ...11 231851.589565: bpf_trace_printk: [info-tsksz] [test][gen_qid_new] gid: 3 -> qid 21 -- should be 21 -- passed: 1 + fs_policy_tsksz-2491375 [047] ...11 231851.589566: bpf_trace_printk: [info-tsksz] [test][gen_qid_new] gid: 3 -> qid 22 -- should be 22 -- passed: 1 + fs_policy_tsksz-2491375 [047] ...11 231851.589567: bpf_trace_printk: [info-tsksz] [test][gen_qid_new] gid: 3 -> qid 23 -- should be 23 -- passed: 1 + fs_policy_tsksz-2491375 [047] ...11 231851.589568: bpf_trace_printk: [info-tsksz] [test][gen_qid_new] gid: 3 -> qid 18 -- should be 18 -- passed: 1 +*/ +static __always_inline void verify_gen_qid_new(){ + s32 qid; + s32 sgid; + +#define TESTCASE_gen_qid_new( gid, sqid ) \ + qid = gen_qid_new( gid ); \ + info_msg("[test][gen_qid_new] gid: %d -> qid %d -- should be %d -- passed: %d ", \ + gid, \ + qid, \ + sqid, \ + (qid == sqid) \ + ); + // for config max dsqs 24 and - max buckets 4 + // 0 -> [0,6) + // 1 -> 
[6,12) + // 2 -> [12,18) + // 3 -> [18,24) + +#define TESTCASES_gen_qid_new( gid, sqid ) \ + TESTCASE_gen_qid_new( gid, sqid + 0 ) \ + TESTCASE_gen_qid_new( gid, sqid + 1 ) \ + TESTCASE_gen_qid_new( gid, sqid + 2 ) \ + TESTCASE_gen_qid_new( gid, sqid + 3 ) \ + TESTCASE_gen_qid_new( gid, sqid + 4 ) \ + TESTCASE_gen_qid_new( gid, sqid + 5 ) \ + TESTCASE_gen_qid_new( gid, sqid + 0 ) + + TESTCASE_gen_qid_new( -1, -1 ) + TESTCASES_gen_qid_new( 0, 0 ) + TESTCASES_gen_qid_new( 1, 6 ) + TESTCASES_gen_qid_new( 2, 12 ) + TESTCASES_gen_qid_new( 3, 18 ) +} + + +static __always_inline s32 qid_to_groupid( s32 qid ){ + s32 gid = 0; + + if( !(0 <= qid && qid < SHARED_DSQ) ){ + return -1; + } + + s32 gap = SHARED_DSQ / MAX_E2E_BUCKETS; // 6 + s32 lower; + s32 upper; + + bpf_for(gid, 0, MAX_E2E_BUCKETS){ + lower = gap * gid; // 0,6 + upper = lower + gap; // 6,12 + + if( lower <= qid && qid < upper ){ + break; + } + } + if ( gid == MAX_E2E_BUCKETS ){ + return -1; + } + return gid; +} + +/* + Test Results: + root@v-021:/data2/ar/workspace/temp# cat /sys/kernel/debug/tracing/trace_pipe | grep -i qid_to_groupid + fs_policy_tsksz-2497763 [026] ...11 235008.361905: bpf_trace_printk: [info-tsksz] [test][qid_to_groupid] qid: -1 -> gid -1 -- should be -1 -- passed: 1 + fs_policy_tsksz-2497763 [026] ...11 235008.361906: bpf_trace_printk: [info-tsksz] [test][qid_to_groupid] qid: 0 -> gid 0 -- should be 0 -- passed: 1 + fs_policy_tsksz-2497763 [026] ...11 235008.361907: bpf_trace_printk: [info-tsksz] [test][qid_to_groupid] qid: 1 -> gid 0 -- should be 0 -- passed: 1 + fs_policy_tsksz-2497763 [026] ...11 235008.361908: bpf_trace_printk: [info-tsksz] [test][qid_to_groupid] qid: 2 -> gid 0 -- should be 0 -- passed: 1 + fs_policy_tsksz-2497763 [026] ...11 235008.361909: bpf_trace_printk: [info-tsksz] [test][qid_to_groupid] qid: 3 -> gid 0 -- should be 0 -- passed: 1 + fs_policy_tsksz-2497763 [026] ...11 235008.361910: bpf_trace_printk: [info-tsksz] [test][qid_to_groupid] qid: 4 -> gid 0 -- should be 0 -- passed: 1 + fs_policy_tsksz-2497763 [026] ...11 235008.361911: bpf_trace_printk: [info-tsksz] [test][qid_to_groupid] qid: 5 -> gid 0 -- should be 0 -- passed: 1 + fs_policy_tsksz-2497763 [026] ...11 235008.361912: bpf_trace_printk: [info-tsksz] [test][qid_to_groupid] qid: 0 -> gid 0 -- should be 0 -- passed: 1 + fs_policy_tsksz-2497763 [026] ...11 235008.361912: bpf_trace_printk: [info-tsksz] [test][qid_to_groupid] qid: 6 -> gid 1 -- should be 1 -- passed: 1 + fs_policy_tsksz-2497763 [026] ...11 235008.361913: bpf_trace_printk: [info-tsksz] [test][qid_to_groupid] qid: 7 -> gid 1 -- should be 1 -- passed: 1 + fs_policy_tsksz-2497763 [026] ...11 235008.361914: bpf_trace_printk: [info-tsksz] [test][qid_to_groupid] qid: 8 -> gid 1 -- should be 1 -- passed: 1 + fs_policy_tsksz-2497763 [026] ...11 235008.361915: bpf_trace_printk: [info-tsksz] [test][qid_to_groupid] qid: 9 -> gid 1 -- should be 1 -- passed: 1 + fs_policy_tsksz-2497763 [026] ...11 235008.361916: bpf_trace_printk: [info-tsksz] [test][qid_to_groupid] qid: 10 -> gid 1 -- should be 1 -- passed: 1 + fs_policy_tsksz-2497763 [026] ...11 235008.361917: bpf_trace_printk: [info-tsksz] [test][qid_to_groupid] qid: 11 -> gid 1 -- should be 1 -- passed: 1 + fs_policy_tsksz-2497763 [026] ...11 235008.361918: bpf_trace_printk: [info-tsksz] [test][qid_to_groupid] qid: 6 -> gid 1 -- should be 1 -- passed: 1 + fs_policy_tsksz-2497763 [026] ...11 235008.361919: bpf_trace_printk: [info-tsksz] [test][qid_to_groupid] qid: 12 -> gid 2 -- should be 2 -- passed: 1 + fs_policy_tsksz-2497763 
[026] ...11 235008.361920: bpf_trace_printk: [info-tsksz] [test][qid_to_groupid] qid: 13 -> gid 2 -- should be 2 -- passed: 1 + fs_policy_tsksz-2497763 [026] ...11 235008.361921: bpf_trace_printk: [info-tsksz] [test][qid_to_groupid] qid: 14 -> gid 2 -- should be 2 -- passed: 1 + fs_policy_tsksz-2497763 [026] ...11 235008.361922: bpf_trace_printk: [info-tsksz] [test][qid_to_groupid] qid: 15 -> gid 2 -- should be 2 -- passed: 1 + fs_policy_tsksz-2497763 [026] ...11 235008.361923: bpf_trace_printk: [info-tsksz] [test][qid_to_groupid] qid: 16 -> gid 2 -- should be 2 -- passed: 1 + fs_policy_tsksz-2497763 [026] ...11 235008.361924: bpf_trace_printk: [info-tsksz] [test][qid_to_groupid] qid: 17 -> gid 2 -- should be 2 -- passed: 1 + fs_policy_tsksz-2497763 [026] ...11 235008.361925: bpf_trace_printk: [info-tsksz] [test][qid_to_groupid] qid: 12 -> gid 2 -- should be 2 -- passed: 1 + fs_policy_tsksz-2497763 [026] ...11 235008.361926: bpf_trace_printk: [info-tsksz] [test][qid_to_groupid] qid: 18 -> gid 3 -- should be 3 -- passed: 1 + fs_policy_tsksz-2497763 [026] ...11 235008.361927: bpf_trace_printk: [info-tsksz] [test][qid_to_groupid] qid: 19 -> gid 3 -- should be 3 -- passed: 1 + fs_policy_tsksz-2497763 [026] ...11 235008.361927: bpf_trace_printk: [info-tsksz] [test][qid_to_groupid] qid: 20 -> gid 3 -- should be 3 -- passed: 1 + fs_policy_tsksz-2497763 [026] ...11 235008.361928: bpf_trace_printk: [info-tsksz] [test][qid_to_groupid] qid: 21 -> gid 3 -- should be 3 -- passed: 1 + fs_policy_tsksz-2497763 [026] ...11 235008.361929: bpf_trace_printk: [info-tsksz] [test][qid_to_groupid] qid: 22 -> gid 3 -- should be 3 -- passed: 1 + fs_policy_tsksz-2497763 [026] ...11 235008.361930: bpf_trace_printk: [info-tsksz] [test][qid_to_groupid] qid: 23 -> gid 3 -- should be 3 -- passed: 1 + fs_policy_tsksz-2497763 [026] ...11 235008.361931: bpf_trace_printk: [info-tsksz] [test][qid_to_groupid] qid: 18 -> gid 3 -- should be 3 -- passed: 1 +*/ +static __always_inline void verify_qid_to_groupid(){ + s32 gid; + +#define TESTCASE_qid_to_groupid( qid, sgid ) \ + gid = qid_to_groupid( qid ); \ + info_msg("[test][qid_to_groupid] qid: %d -> gid %d -- should be %d -- passed: %d ", \ + qid, \ + gid, \ + sgid, \ + (gid == sgid) \ + ); + +/* + [0,6) -> 0 + [6,12) -> 1 + [12,18) -> 2 + [18,24) -> 3 +*/ +#define TESTCASES_qid_to_groupid( qid, sgid ) \ + TESTCASE_qid_to_groupid( qid + 0, sgid) \ + TESTCASE_qid_to_groupid( qid + 1, sgid) \ + TESTCASE_qid_to_groupid( qid + 2, sgid) \ + TESTCASE_qid_to_groupid( qid + 3, sgid) \ + TESTCASE_qid_to_groupid( qid + 4, sgid) \ + TESTCASE_qid_to_groupid( qid + 5, sgid) \ + TESTCASE_qid_to_groupid( qid + 0, sgid) + + TESTCASE_qid_to_groupid( -1, -1 ) + TESTCASES_qid_to_groupid( 0, 0 ) + TESTCASES_qid_to_groupid( 6, 1 ) + TESTCASES_qid_to_groupid( 12, 2 ) + TESTCASES_qid_to_groupid( 18, 3 ) +} + +static __always_inline void update_qid_assignment( struct task_struct *p ) { + + // get the cgroup to which this task belongs to + CgroupInfo_t cgrp = get_task_cgroupinfo( p ); + + // fetch associated metadata of the function + MetaVal_t *fmeta = get_func_metadata( cgrp.name ); + if ( fmeta ){ + + // verify if we should be updating the group assignment + // of this cgroup using chashmap + CgroupInfo_t * cgrp_old = get_chashmap( cgrp.id ); + if ( cgrp_old ){ + + + // Update based on the e2e changes + s32 gid = get_groupid( fmeta->e2e ); // new group id + s32 ogid = qid_to_groupid( cgrp_old->qid ); // old group id + + if ( ogid != gid ){ + s32 nqid; + + nqid = gen_qid_new( gid ); + + info_msg( 
"[qid_assignment] cgroup %d - %s now is assigned Q %d instead of old-Q %d", + cgrp_old->id, + cgrp_old->name, + nqid, + cgrp_old->qid + ); + + cgrp_old->qid = nqid; + + // todo: it's an abrupt change in stats - even though actual + // shift only happens when tasks go through the select_cpu + // callback + // so there is a lag between stats and actual shift - but it + // should be very small for coarse grained observation over a + // second + } else { + // if there were not changes based on the e2e + // we will try load balancing based on Q threshold within the + // same group + + s32 qlen = scx_bpf_dsq_nr_queued( cgrp_old->qid ); + if ( qlen > QMAX_THRESHOLD ){ + + s32 nqid; + nqid = gen_qid_new( gid ); + + info_msg( "[qid_assignment][load_balanced] cgroup %d - %s now is assigned Q %d instead of old-Q %d", + cgrp_old->id, + cgrp_old->name, + nqid, + cgrp_old->qid + ); + + cgrp_old->qid = nqid; + } + } + } // cgrp_old + } // fmeta +} + +/* + Callback for bpf_for_each_map_elem + long (\*callback_fn)(struct bpf_map \*map, const void \*key, void \*value, void \*ctx); + callback + continues : if return 0 + stops : if return 1 +*/ +static long func_metadata_dump_callback (void *map, const char *key, MetaVal_t *val, void *data){ + info_msg("[func_metadata][dump_callback] key: %s e2e: %lu", + key, + val->e2e + ); + return 0; +} + +/* + Callback for bpf_for_each_map_elem + long (\*callback_fn)(struct bpf_map \*map, const void \*key, void \*value, void \*ctx); + callback + continues : if return 0 + stops : if return 1 +*/ +static long func_cgroup_dump_callback (void *map, const __u64 *key, CgroupInfo_t *val, void *data){ + info_msg("[func_cgroup][dump_callback] key: %d id: %llu qid: %d tasks: %d name: %s", + *key, + val->id, + val->qid, + val->tsk_cnt, + val->name + ); + return 0; +} + +static __always_inline MetaVal_t * get_func_metadata( char *key_name ) +{ + MetaVal_t *cvalue = bpf_map_lookup_elem(&func_metadata, key_name); + return cvalue; +} + +static __always_inline bool verify_qid( s32 qid ) +{ + if ( 0 <= qid && qid < SHARED_DSQ ) { + return true; + } + return false; +} + +static __always_inline bool verify_cpu(s32 cpu) +{ + if ( 0 <= cpu && cpu < MAX_CPUS ) { + return true; + } + return false; +} + +static __always_inline s32 cpu_to_qid(s32 cpu) +{ + return (u32)cpu / 2; +} + +// Generate a bpf cpumask with cpus that belong to qid +// cpumask needs to be released after use +static __always_inline struct bpf_cpumask *qid_to_cpumask(s32 qid) +{ + struct bpf_cpumask *mask; + s32 cpu = qid * 2; + + mask = bpf_cpumask_create(); + if (!mask) + return NULL; + + bpf_cpumask_set_cpu(cpu, mask); + bpf_cpumask_set_cpu(cpu + 1, mask); + return mask; +} + +void push_pid_for_class_switch( int pid ) +{ + int *p = bpf_ringbuf_reserve( &queued_pids, sizeof(packet_pid_t), 0 ); + if ( p ) { + packet_pid_t *ps = (packet_pid_t *)p; + ps->pid = pid; + bpf_ringbuf_submit(ps, 0); + info_msg( "[queued_pids] pushed pid %d", pid ); + } +} + +void push_stats( stats_t *stat ) +{ + if( !stat ){ + return; + } + + int *p = bpf_ringbuf_reserve( &queued_stats, sizeof(stats_t), 0 ); + if ( p ) { + stats_t *ps = (stats_t *)p; + stat->timestamp_ms = bpf_ktime_get_ns()/ONE_MSEC; + __builtin_memcpy_inline( ps, stat, sizeof(stats_t) ); + bpf_ringbuf_submit(ps, 0); + info_msg( "[queued_stats] pushed stat %p", ps ); + } +} + +static void chashmap_insert( CgroupInfo_t *cgrp ) +{ + CgroupInfo_t *cgrp_old = bpf_map_lookup_elem( &CgroupsHashMap, &cgrp->id ); + + if ( cgrp_old ) { + __builtin_memcpy_inline(cgrp_old->name, cgrp->name, 
MAX_NAME_LEN);
+		cgrp_old->id = cgrp->id;
+		cgrp_old->qid = cgrp->qid;
+	} else {
+		CgroupInfo_t cgrp_new;
+
+		__builtin_memcpy_inline( &cgrp_new, cgrp, sizeof(CgroupInfo_t) );
+
+		// we assume for now that we don't have any info about
+		// the associated function metadata
+		cgrp_new.qid = gen_qid_new( RESERVED_E2E_BUCKET );
+
+		bpf_map_update_elem(
+			&CgroupsHashMap,
+			&cgrp->id,
+			&cgrp_new,
+			BPF_NOEXIST
+		);
+
+		info_msg("[chashmap] inserting cgroup %d - %s with Q %d",
+				cgrp->id,
+				cgrp->name,
+				cgrp_new.qid
+		);
+	}
+}
+
+static __always_inline CgroupInfo_t *get_chashmap(__u64 cid)
+{
+	CgroupInfo_t *cvalue = bpf_map_lookup_elem(&CgroupsHashMap, &cid);
+	return cvalue;
+}
+
+static __always_inline CgroupInfo_t get_task_cgroupinfo(struct task_struct *p)
+{
+	struct cgroup *cgrp;
+	CgroupInfo_t info = {0}; /* zero-initialize so callers never see garbage if the lookup fails */
+
+	bpf_rcu_read_lock();
+	// cgroups->dfl_cgrp is the cgroup-id 1;
+	// we need the cgroups->subsys[sched] cgroup
+	if( p->sched_task_group &&
+		p->sched_task_group->css.cgroup
+	){
+		cgrp = p->sched_task_group->css.cgroup;
+		info.id = cgrp->kn->id;
+		bpf_probe_read_kernel_str( info.name, MAX_NAME_LEN, cgrp->kn->name );
+	}
+	bpf_rcu_read_unlock();
+
+	info.qid = -1;
+
+	return info;
+}
+
+
+static __always_inline void thashmap_insert( TaskInfo_t *tinfo )
+{
+	TaskInfo_t *tinfo_old = bpf_map_lookup_elem( &TasksHashMap, &tinfo->pid );
+	long r = bpf_map_update_elem(
+		&TasksHashMap,
+		&tinfo->pid,
+		tinfo,
+		BPF_ANY
+	);
+	info_msg("[thashmap] inserting task %d with Q %d - status %d",
+			tinfo->pid,
+			tinfo->qid_cur,
+			r
+	);
+}
+
+static __always_inline TaskInfo_t* get_task_ctx(s32 pid)
+{
+	TaskInfo_t *tinfo = bpf_map_lookup_elem(&TasksHashMap, &pid);
+	return tinfo;
+}
+
+static __always_inline s32 task_to_qid(struct task_struct *p)
+{
+	CgroupInfo_t cgrp = get_task_cgroupinfo( p );
+
+	CgroupInfo_t * cvalue = get_chashmap( cgrp.id );
+	if (cvalue) {
+		return cvalue->qid;
+	}else{
+		// sometimes tasks switch to sched_ext even though SWITCH_PARTIAL is set;
+		// schedule those tasks through qid 0
+		info_msg("[warn] no qid found for task %d - %s belongs to cgroup %d - %s",
+				p->pid,
+				p->comm,
+				cgrp.id,
+				cgrp.name
+		);
+		return 0;
+	}
+
+	return -1;
+}
+
+// todo: make this ugly function better
+static inline bool match_prefix(const char *prefix, const char *str, u32 max_len)
+{
+	int c;
+	if (!prefix || !str) {
+		return false;
+	}
+
+	if (max_len == 0) {
+		return false;
+	}
+
+	if (max_len == 1) {
+		if (prefix[0] == '\0')
+			return false;
+		if (str[0] == '\0')
+			return false;
+	}
+
+	bpf_for(c, 0, max_len)
+	{
+		if (prefix[c] == '\0')
+			return true;
+		if (c == (max_len - 1)) {
+			return true;
+		}
+		if (str[c] != prefix[c])
+			return false;
+	}
+	return false;
+}
+
+// todo: remove if not needed
+static __always_inline int cus_strlen(const char *cs)
+{
+	int len = 0;
+	while (cs != NULL && *cs != '\0') {
+		cs++;
+		len++;
+	}
+	return len;
+}
+
+/*
+ * Return true if the target task @p is the user-space scheduler.
+ */
+static inline bool is_usersched_task( const struct task_struct *p )
+{
+	return p->pid == usersched_pid;
+}
+
+/*
+ * Set the user-space scheduler wake-up flag (equivalent to an atomic release
+ * operation).
+ */
+static void set_usersched_needed(void)
+{
+	__sync_fetch_and_or(&usersched_needed, 1);
+}
+
+/*
+ * Check and clear the user-space scheduler wake-up flag (equivalent to an
+ * atomic acquire operation).
+ */
+static bool test_and_clear_usersched_needed(void)
+{
+	return __sync_fetch_and_and(&usersched_needed, 0) == 1;
+}
+
+/*
+ * Dispatch the user-space scheduler.
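+ *
+ * The heartbeat timer sets usersched_needed once per second; the flag is
+ * test-and-cleared here so the scheduler task is dispatched at most once per
+ * tick, onto its dedicated DSQ (USCHED_DSQ) and core (USCHED_CORE).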
+ */ +static void dispatch_user_scheduler(void) +{ + struct task_struct *p; + if (!test_and_clear_usersched_needed()) + return; + + p = bpf_task_from_pid(usersched_pid); + if (!p) { + scx_bpf_error("Failed to find usersched task %d", + usersched_pid); + return; + } + /* + * Dispatch the scheduler on the first CPU available, likely the + * current one. + */ + scx_bpf_dispatch( p, USCHED_DSQ, effective_slice_ns, 0 ); + scx_bpf_kick_cpu( USCHED_CORE, SCX_KICK_IDLE ); + + bpf_task_release(p); +} + +static __always_inline inline bool is_usersched_cpu(s32 cpu) +{ + return cpu == USCHED_CORE; +} + +/* + Select the target CPU where a task can be executed. + + We use scx_bpf_pick_idle_cpu(...) to pick idle CPU from + among the CPUs that belong to the given Qid. + + Currently we associate two CPUs to each Q resulting in a + power of 2 choice. +*/ +s32 BPF_STRUCT_OPS(tsksz_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) +{ + s32 cpu; + + // Update the Qid assignment based on the + // current snapshot of the function characteristics + // BPF map + update_qid_assignment( p ); + + s32 qid = task_to_qid( p ); + if ( verify_qid(qid) ){ + + TaskInfo_t *taskctx = get_task_ctx( p->pid ); + if ( taskctx ){ + taskctx->qid_cur = qid; + } + + struct bpf_cpumask __kptr *cpumask = qid_to_cpumask(qid); + if ( cpumask ) { + cpu = scx_bpf_pick_idle_cpu( + (const struct cpumask *)cpumask, + SCX_PICK_IDLE_CORE); + if ( 0 <= cpu && cpu < MAX_CPUS ) { + info_msg( "[select_cpu] selected cpu %d for task: %d - %s", + cpu, + p->pid, + p->comm + ); + // we immediately dispatch to the local DSQ of the idle CPU + scx_bpf_dispatch( p, SCX_DSQ_LOCAL, effective_slice_ns, 0 ); + prev_cpu = cpu; + } + bpf_cpumask_release( cpumask ); + } + } else { + CgroupInfo_t cgrp = get_task_cgroupinfo( p ); + info_msg( "[select_cpu] Q id not found for task: %d - %s belongs to cgroup: %d - %s", + p->pid, + p->comm, + cgrp.id, + cgrp.name + ); + } + + // if there is no dispatch to Local DSQ enqueue callback would be called + // where we enqueue to a custom DSQ + // later when dispatch callback is called we consume from the + // custom DSQ into Local DSQ of that CPU on which that dispatch was called + return prev_cpu; +} + +/* + * Task @p becomes ready to run. We can dispatch the task directly here if the + * user-space scheduler is not required, or enqueue it to be processed by the + * scheduler. + */ +void BPF_STRUCT_OPS(tsksz_enqueue, struct task_struct *p, u64 enq_flags) +{ + info_msg("[enqueue] task: %d - %s EFlags: 0x%llx", + p->pid, + p->comm, + enq_flags + ); + + /* + * Scheduler is dispatched directly in .dispatch() when needed, so + * we can skip it here. + */ + if ( is_usersched_task(p) ) + return; + + s32 qid = task_to_qid(p); + if (verify_qid(qid)) { + TaskInfo_t *taskctx = get_task_ctx( p->pid ); + if( taskctx ){ + taskctx->qid_cur = qid; + } + + scx_bpf_dispatch( p, qid, effective_slice_ns, 0 ); + info_msg("[enqueue] enqueued task: %d - %s to Q %d", + p->pid, + p->comm, + qid + ); + global_stats_update_tsks_Q( qid, scx_bpf_dsq_nr_queued(qid) ); + + // trigger a follow up scheduling event + s32 cpu = qid*2; + scx_bpf_kick_cpu( cpu , SCX_KICK_IDLE); + scx_bpf_kick_cpu( cpu+1 , SCX_KICK_IDLE); + } +} + +/* + * Dispatch tasks that are ready to run. + * + * This function is called when a CPU's local DSQ is empty and ready to accept + * new dispatched tasks. 
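+ *
+ * Each pair of CPUs shares one custom DSQ (qid = cpu / 2, see cpu_to_qid()),
+ * so consuming from that DSQ here completes the pair-local placement set up
+ * in .select_cpu() and .enqueue().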
+ * + * We may dispatch tasks also on other CPUs from here, if the scheduler decided + * so (usually if other CPUs are idle we may want to send more tasks to their + * local DSQ to optimize the scheduling pipeline). + */ +void BPF_STRUCT_OPS(tsksz_dispatch, s32 cpu, struct task_struct *prev) +{ + if( prev ){ + info_msg("[dispatch] cpu: %d prev_task: %d - %s", + cpu, + prev->pid, + prev->comm + ); + } + + // Enqueue the scheduler task if the timer callback + // has set the need flag - it's set every second + dispatch_user_scheduler(); + + if ( is_usersched_cpu(cpu) ) { + // dispatches from custom DSQ to Local DSQ of this cpu + if ( scx_bpf_consume(USCHED_DSQ) ){ + info_msg("[dispatch] consumed user sched Q on cpu: %d ", + cpu + ); + } + } + + // dispatch tasks from custom DSQ that corresponds to this CPU + // to Local DSQ of this cpu + s32 qid = cpu_to_qid( cpu ); + if ( verify_qid(qid) ) { + if ( scx_bpf_consume(qid) ) { + info_msg("[dispatch] consumed Q %d on cpu: %d ", + qid, + cpu + ); + } + } +} + +/* + * A new task @p is being created. + * + * Allocate and initialize all the internal structures for the task (this + * function is allowed to block, so it can be used to preallocate memory). + */ +s32 BPF_STRUCT_OPS(tsksz_init_task, struct task_struct *p, + struct scx_init_task_args *args) +{ + bool push_to_ringbuf = false; + + info_msg("[init_task] initializing task %d - %s", + p->pid, + p->comm + ); + __sync_fetch_and_add(&nr_tasks, 1); + + if( p->sched_task_group ){ + struct cgroup *cgrp; + + cgrp = p->sched_task_group->css.cgroup; + + if ( cgrp && is_docker_parent( cgrp ) ){ + scx_bpf_switch_to_scx( p ); + bpf_printk("[cgroup-prog][cgroup][switch] switched %s - %d belongs to name %s", + p->comm, + p->pid, + cgrp->kn->name + ); + + CgroupInfo_t *cgrp_old = get_chashmap( cgrp->kn->id ); + if ( cgrp_old ) { + LOCK_HEADER( STATS_LOCK ) { + bpf_spin_lock(&lockw->lock); + __sync_fetch_and_add( &cgrp_old->tsk_cnt, 1 ); + bpf_spin_unlock(&lockw->lock); + } + + TaskInfo_t taskctx; + taskctx.pid = p->pid; + taskctx.qid_cur = cgrp_old->qid; + taskctx.cgroupctx = cgrp_old; + thashmap_insert( &taskctx ); + } else { + CgroupInfo_t cgrpinfo; + cgrpinfo.tsk_cnt = 0; + cgrpinfo.id = cgrp->kn->id; + chashmap_insert( &cgrpinfo ); + } + } + } + + if ( verbose ){ + // dump out the function characteritics map + u64 stackptr = 0; + bpf_for_each_map_elem( + &func_metadata, + func_metadata_dump_callback, + &stackptr, + 0 + ); + } + + return 0; +} + +/* + * Task @p is exiting. + */ +void BPF_STRUCT_OPS(tsksz_exit_task, struct task_struct *p, + struct scx_exit_task_args *args) +{ + info_msg("[exit_task] exiting task %d - %s", + p->pid, + p->comm + ); + __sync_fetch_and_sub(&nr_tasks, 1); + + CgroupInfo_t cgrp = get_task_cgroupinfo( p ); + CgroupInfo_t *cgrp_old = get_chashmap(cgrp.id); + if( cgrp_old ){ + LOCK_HEADER( STATS_LOCK ) { + bpf_spin_lock(&lockw->lock); + __sync_fetch_and_sub( &cgrp_old->tsk_cnt, 1 ); + bpf_spin_unlock(&lockw->lock); + } + } + + TaskInfo_t *tinfo = get_task_ctx( p->pid ); + if( tinfo ){ + info_msg("[thashmap] exiting task %d with Q %d ", + tinfo->pid, + tinfo->qid_cur + ); + } +} + +/* + * Heartbeat scheduler timer callback. + * + * If the system is completely idle the sched-ext watchdog may incorrectly + * detect that as a stall and automatically disable the scheduler. So, use this + * timer to periodically wake-up the scheduler and avoid long inactivity. + * + * This can also help to prevent real "stalling" conditions in the scheduler. 
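+ *
+ * Besides waking the user-space scheduler, the timer callback below also
+ * snapshots per-DSQ queue lengths into global_stats, pushes them to the
+ * queued_stats ring buffer, and kicks the CPU pair of any DSQ that still has
+ * queued tasks.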
+ */ +static int usersched_timer_fn(void *map, int *key, struct bpf_timer *timer) +{ + int err = 0; + + /* Kick the scheduler */ + set_usersched_needed(); + + // check all the dsqs - if anyone has any pending tasks + // kick the target cpus, so that we may not have any unnecessary stalls + int i; + s32 cpu; + s32 n; + bpf_for(i, 0, SHARED_DSQ){ + n = scx_bpf_dsq_nr_queued( i ); + global_stats_update_tsks_Q( i, n ); + if ( n > 0 ){ + cpu = i*2; + scx_bpf_kick_cpu( cpu, SCX_KICK_IDLE); + scx_bpf_kick_cpu( cpu+1, SCX_KICK_IDLE); + } + } + + push_stats( &global_stats ); + + if ( verbose ){ + + info_msg("[health] heartbeat message"); + + int i; + bpf_for(i, 0, SHARED_DSQ) { + info_msg("[stats] q[%d] -> %d", + i, + global_stats.tsks_Q[i] + ); + } + + // dump out the cgrouphashmap + u64 stackptr = 0; + bpf_for_each_map_elem( + &CgroupsHashMap, + func_cgroup_dump_callback, + &stackptr, + 0 + ); + + } + + /* Re-arm the timer */ + err = bpf_timer_start(timer, NSEC_PER_SEC, 0); + if (err) + scx_bpf_error("Failed to arm stats timer"); + + return 0; +} + +/* + * Initialize the heartbeat scheduler timer. + */ +static int usersched_timer_init(void) +{ + struct bpf_timer *timer; + u32 key = 0; + int err; + + timer = bpf_map_lookup_elem(&usersched_timer, &key); + if (!timer) { + scx_bpf_error("Failed to lookup scheduler timer"); + return -ESRCH; + } + bpf_timer_init(timer, &usersched_timer, CLOCK_BOOTTIME); + bpf_timer_set_callback(timer, usersched_timer_fn); + err = bpf_timer_start(timer, NSEC_PER_SEC, 0); + if (err) + scx_bpf_error("Failed to arm scheduler timer"); + + return err; +} + +/* + Init the DSQs + + Since we are using scx_bpf_dispatch(...) to dispatch to + the custom DSQs, they are being used as FIFOs instead of + priority Qs. + */ +static int dsq_init(void) +{ + int err; + int i; + + // create SHARED_DSQ number of custom DSQs + bpf_for(i, 0, SHARED_DSQ) + { + err = scx_bpf_create_dsq(i, -1); + if (err) { + scx_bpf_error("failed to create shared DSQ: %d", err); + return err; + } + } + + // create a separate DSQ for the user space scheduler thread + err = scx_bpf_create_dsq(USCHED_DSQ, -1); + if (err) { + scx_bpf_error("failed to create shared DSQ: %d", err); + return err; + } + + return 0; +} + +/* + Initialize the scheduling class. +*/ +s32 BPF_STRUCT_OPS_SLEEPABLE(tsksz_init) +{ + int err; + + info_msg("[init] initializing the tsksz scheduler"); + + /* Compile-time checks */ + BUILD_BUG_ON((MAX_CPUS % 2)); + + // init dsqs + err = dsq_init(); + if (err) + return err; + + // arm the timer callback + err = usersched_timer_init(); + if (err) + return err; + + // init thresholds for buckets + e2e_thresholds[0] = 2000; // 2 seconds + e2e_thresholds[1] = 4000; // 4 seconds + + // init the next qids array for each bucket + s32 gap = SHARED_DSQ / MAX_E2E_BUCKETS; // 6 + s32 lower; + s32 i; + bpf_for(i, 0, MAX_E2E_BUCKETS){ + lower = gap * i; // 0,6 + bkt_next_qid[i] = lower; + } + + if ( verbose ){ + // testcases for each function + verify_get_groupid(); + verify_gen_qid_new(); + verify_qid_to_groupid(); + } + + return 0; +} + +/* + * Unregister the scheduling class. + */ +void BPF_STRUCT_OPS(tsksz_exit, struct scx_exit_info *ei) +{ + info_msg("[exit] exiting the tsksz scheduler"); + + + UEI_RECORD(uei, ei); +} + +/* + * Scheduling class declaration. 
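+ *
+ * SCX_OPS_SWITCH_PARTIAL means only tasks explicitly moved to SCHED_EXT (via
+ * scx_bpf_switch_to_scx() in tsksz_init_task) are handled by this scheduler;
+ * all other tasks stay on their original scheduling class.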
+ */ +SCX_OPS_DEFINE( tsksz_ops, + .select_cpu = (void *)tsksz_select_cpu, + .enqueue = (void *)tsksz_enqueue, + .dispatch = (void *)tsksz_dispatch, + .init_task = (void *)tsksz_init_task, + .exit_task = (void *)tsksz_exit_task, + .init = (void *)tsksz_init, + .exit = (void *)tsksz_exit, + .flags = SCX_OPS_ENQ_LAST | SCX_OPS_KEEP_BUILTIN_IDLE | SCX_OPS_SWITCH_PARTIAL, + .timeout_ms = 5000, + .name = "tsksz" +); diff --git "a/src/Il\303\272vatar/fs_policy_tsksz/src/bpf_intf.rs" "b/src/Il\303\272vatar/fs_policy_tsksz/src/bpf_intf.rs" new file mode 100644 index 00000000..9db020ef --- /dev/null +++ "b/src/Il\303\272vatar/fs_policy_tsksz/src/bpf_intf.rs" @@ -0,0 +1,9 @@ +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2. + +#![allow(non_upper_case_globals)] +#![allow(non_camel_case_types)] +#![allow(non_snake_case)] +#![allow(dead_code)] + +include!(concat!(env!("OUT_DIR"), "/bpf_intf.rs")); diff --git "a/src/Il\303\272vatar/fs_policy_tsksz/src/bpf_skel.rs" "b/src/Il\303\272vatar/fs_policy_tsksz/src/bpf_skel.rs" new file mode 100644 index 00000000..c42af33d --- /dev/null +++ "b/src/Il\303\272vatar/fs_policy_tsksz/src/bpf_skel.rs" @@ -0,0 +1,4 @@ +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2. + +include!(concat!(env!("OUT_DIR"), "/bpf_skel.rs")); diff --git "a/src/Il\303\272vatar/fs_policy_tsksz/src/main.rs" "b/src/Il\303\272vatar/fs_policy_tsksz/src/main.rs" new file mode 100644 index 00000000..f6613f2b --- /dev/null +++ "b/src/Il\303\272vatar/fs_policy_tsksz/src/main.rs" @@ -0,0 +1,159 @@ +// Copyright (c) Andrea Righi + +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2. 
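+
+// User-space side of the scheduler: it loads and attaches the BPF skeleton,
+// optionally connects to an IPC server to receive per-function
+// CharacteristicsPacket updates, and roughly once per millisecond drains the
+// stats ring buffer (see Scheduler::run below).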
+mod bpf_skel;
+pub use bpf_skel::*;
+pub mod bpf_intf;
+
+mod bpf;
+use bpf::*;
+
+use scx_utils::UserExitInfo;
+
+use libbpf_rs::OpenObject;
+
+use std::mem::MaybeUninit;
+use std::sync::atomic::AtomicBool;
+use std::sync::atomic::Ordering;
+use std::sync::Arc;
+
+use iluvatar_library::clock::get_global_timestamp_ms;
+
+use anyhow::Result;
+
+use clap::Parser;
+
+use serde::{Deserialize, Serialize};
+
+use iluvatar_worker_library::utils::characteristics_map::CharacteristicsPacket;
+use iluvatar_worker_library::worker_api::fs_scheduler::Channels;
+
+use ipc_channel::ipc::{self, IpcReceiver, IpcSender};
+use std::collections::HashMap;
+
+#[derive(Debug, Deserialize, Serialize)]
+pub struct ChannelsR {
+    pub rx_chr: IpcReceiver<CharacteristicsPacket>,
+}
+
+struct Scheduler<'a> {
+    bpf: BpfScheduler<'a>,
+    characteristics: HashMap<String, CharacteristicsPacket>,
+    crecvs: Option<&'a mut ChannelsR>,
+}
+
+impl<'a> Scheduler<'a> {
+    fn init(
+        open_object: &'a mut MaybeUninit<OpenObject>,
+        crecvs: Option<&'a mut ChannelsR>,
+        slice_us: u64,
+    ) -> Result<Self> {
+        let bpf = BpfScheduler::init(
+            open_object,
+            slice_us,
+            0,    // exit_dump_len (buffer size of exit info)
+            true, // verbose output
+        )?;
+
+        Ok(Self {
+            bpf,
+            characteristics: HashMap::new(),
+            crecvs,
+        })
+    }
+
+    fn now() -> i64 {
+        get_global_timestamp_ms()
+    }
+
+    fn print_stats(&mut self) {
+        println!("-------- Userspace thread running ----------");
+
+        match self.bpf.dequeue_stats() {
+            Ok(Some(stats)) => {
+                println!("stats {:?}", stats);
+            }
+            Ok(None) | Err(_) => {}
+        }
+
+        for (k, v) in &self.characteristics {
+            println!("{}: {:?}", k, v);
+        }
+    }
+
+    fn fetch_over_channel(&mut self) {
+        if let Some(crecvs) = &self.crecvs {
+            while let Ok(chr) = crecvs.rx_chr.try_recv() {
+                self.characteristics.insert(chr.fqdn.clone(), chr);
+            }
+        }
+    }
+
+    fn run(&mut self, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
+        let mut prev_ts = Self::now();
+
+        while !shutdown.load(Ordering::Relaxed) && !self.bpf.exited() {
+            let curr_ts = Self::now();
+            if curr_ts > prev_ts {
+                // drain the IPC channel first so the freshest characteristics are printed
+                self.fetch_over_channel();
+                self.print_stats();
+
+                prev_ts = curr_ts;
+            }
+        }
+
+        self.bpf.shutdown_and_report()
+    }
+}
+
+#[derive(Parser, Debug)]
+#[command(version, about, long_about = None)]
+struct Args {
+    #[arg(long)]
+    server_name: Option<String>,
+
+    #[arg(long, default_value_t = 1000)]
+    time_slice_us: u64,
+}
+
+fn main() -> Result<()> {
+    let args = Args::parse();
+
+    println!("Starting tsksz scheduler");
+
+    let mut crecvs = None;
+    if let Some(server_name) = &args.server_name {
+        // connect back to the worker's one-shot server and hand it our sender
+        let (c_tx, c_rx): (
+            IpcSender<CharacteristicsPacket>,
+            IpcReceiver<CharacteristicsPacket>,
+        ) = ipc::channel().unwrap();
+        let server_tx = IpcSender::connect(server_name.clone()).unwrap();
+        server_tx.send(Channels { tx_chr: c_tx }).unwrap();
+        crecvs = Some(ChannelsR { rx_chr: c_rx });
+    }
+
+    let shutdown = Arc::new(AtomicBool::new(false));
+    let shutdown_clone = shutdown.clone();
+    ctrlc::set_handler(move || {
+        shutdown_clone.store(true, Ordering::Relaxed);
+    })?;
+
+    let mut open_object = MaybeUninit::uninit();
+    loop {
+        let mut sched =
+            Scheduler::init(&mut open_object, (&mut crecvs).into(), args.time_slice_us)?;
+        if !sched.run(shutdown.clone())?.should_restart() {
+            break;
+        }
+    }
+
+    Ok(())
+}
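The worker-to-scheduler traffic that feeds rx_chr above is fire-and-forget. A hedged sketch of the publishing side, assuming the worker already holds the tx_chr sender from the handshake (the helper and its error handling are illustrative):

use ipc_channel::ipc::IpcSender;
use iluvatar_worker_library::utils::characteristics_map::CharacteristicsPacket;

// Push one packet per updated function; only the `fqdn` field is visible in
// this diff, the rest of CharacteristicsPacket is treated as opaque here.
fn publish(tx_chr: &IpcSender<CharacteristicsPacket>, packets: Vec<CharacteristicsPacket>) {
    for p in packets {
        // a failed send usually means the scheduler process has exited
        if let Err(e) = tx_chr.send(p) {
            eprintln!("scheduler channel closed: {e}");
            break;
        }
    }
}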
"b/src/Il\303\272vatar/iluvatar_bpf_library/Cargo.toml" @@ -0,0 +1,29 @@ +[package] +name = "iluvatar_bpf_library" +description = "Collection of code to be used in context of bpf, but can also be referenced by other parts of the codebase." +version.workspace = true +authors.workspace = true +edition.workspace = true +keywords.workspace = true +license.workspace = true +readme.workspace = true +repository.workspace = true +categories.workspace = true + +[build-dependencies] +libbpf-cargo = { version = "0.24.4" } +vmlinux = { version = "0.0", git = "https://github.com/libbpf/vmlinux.h.git", rev = "a9c092aa771310bf8b00b5018f7d40a1fdb6ec82" } + +[dependencies] +anyhow = "1.0" +libbpf-rs = { version = "0.24.4" } + +[dev-dependencies] +rstest = "0.13.0" + +[features] +static = ["libbpf-rs/static"] + + + + diff --git "a/src/Il\303\272vatar/iluvatar_bpf_library/build.rs" "b/src/Il\303\272vatar/iluvatar_bpf_library/build.rs" new file mode 100644 index 00000000..da75fc0a --- /dev/null +++ "b/src/Il\303\272vatar/iluvatar_bpf_library/build.rs" @@ -0,0 +1,23 @@ +use std::env; +use std::ffi::OsStr; +use std::path::PathBuf; + +use libbpf_cargo::SkeletonBuilder; + +const SRC: &str = "src/bpf/charmap.bpf.c"; + +fn main() { + let out = PathBuf::from(env::var_os("CARGO_MANIFEST_DIR").expect("CARGO_MANIFEST_DIR must be set in build script")) + .join("src") + .join("bpf") + .join("charmap.skel.rs"); + + let arch = env::var("CARGO_CFG_TARGET_ARCH").expect("CARGO_CFG_TARGET_ARCH must be set in build script"); + + SkeletonBuilder::new() + .source(SRC) + .clang_args([OsStr::new("-I"), vmlinux::include_path_root().join(arch).as_os_str()]) + .build_and_generate(&out) + .unwrap(); + println!("cargo:rerun-if-changed={SRC}"); +} diff --git "a/src/Il\303\272vatar/iluvatar_bpf_library/src/bin/basic_bpf_usage.rs" "b/src/Il\303\272vatar/iluvatar_bpf_library/src/bin/basic_bpf_usage.rs" new file mode 100644 index 00000000..64505de7 --- /dev/null +++ "b/src/Il\303\272vatar/iluvatar_bpf_library/src/bin/basic_bpf_usage.rs" @@ -0,0 +1,25 @@ +use iluvatar_bpf_library::bpf::func_characs::*; +use std::mem::MaybeUninit; + +pub fn main() { + let mut open_object = MaybeUninit::uninit(); + let skel = build_and_load(&mut open_object).unwrap(); + let fcmap = skel.maps.func_metadata; + + let key: BPF_FMAP_KEY = build_bpf_key("first_key"); + let key2: BPF_FMAP_KEY = build_bpf_key("second_key"); + + let val = CharVal { + prio: 1, + e2e: 2, + loc: 3, + }; + update_map(&fcmap, &key, &val); + update_map(&fcmap, &key2, &val); + + use std::{thread, time}; + let one_sec = time::Duration::from_millis(1000); + loop { + thread::sleep(one_sec); + } +} diff --git "a/src/Il\303\272vatar/iluvatar_bpf_library/src/bpf/charmap.bpf.c" "b/src/Il\303\272vatar/iluvatar_bpf_library/src/bpf/charmap.bpf.c" new file mode 100644 index 00000000..5d3733a4 --- /dev/null +++ "b/src/Il\303\272vatar/iluvatar_bpf_library/src/bpf/charmap.bpf.c" @@ -0,0 +1,24 @@ +#include "vmlinux.h" +#include +#include + +#define MAX_FUNCS 50 +#define FUNC_METADATA_KEYSIZE 15 // because the kernel fs inode name is 15 characters + +typedef struct CharVal{ + u32 prio; + u32 e2e; + u32 loc; +} CharVal_t; + +// let's create a hashmap +// a hash map +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, MAX_FUNCS); + __uint(key_size, sizeof(char)*FUNC_METADATA_KEYSIZE); /* cgrp ID */ + __uint(value_size, sizeof(CharVal_t)); /* Value Structure */ +} func_metadata SEC(".maps"); + +char _license[] SEC("license") = "GPL"; + diff --git 
"a/src/Il\303\272vatar/iluvatar_bpf_library/src/bpf/charmap.skel.rs" "b/src/Il\303\272vatar/iluvatar_bpf_library/src/bpf/charmap.skel.rs" new file mode 100644 index 00000000..d4963671 --- /dev/null +++ "b/src/Il\303\272vatar/iluvatar_bpf_library/src/bpf/charmap.skel.rs" @@ -0,0 +1,405 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +// +// THIS FILE IS AUTOGENERATED BY CARGO-LIBBPF-GEN! + +pub use self::imp::*; + +#[allow(dead_code)] +#[allow(non_snake_case)] +#[allow(non_camel_case_types)] +#[allow(clippy::absolute_paths)] +#[allow(clippy::upper_case_acronyms)] +#[allow(clippy::zero_repeat_side_effects)] +#[warn(single_use_lifetimes)] +mod imp { + #[allow(unused_imports)] + use super::*; + use libbpf_rs::libbpf_sys; + use libbpf_rs::skel::OpenSkel; + use libbpf_rs::skel::Skel; + use libbpf_rs::skel::SkelBuilder; + use libbpf_rs::AsRawLibbpf as _; + use libbpf_rs::MapCore as _; + fn build_skel_config() -> libbpf_rs::Result> { + let mut builder = libbpf_rs::__internal_skel::ObjectSkeletonConfigBuilder::new(DATA); + builder.name("charmap_bpf").map("func_metadata", false); + builder.build() + } + pub struct OpenCharmapMaps<'obj> { + pub func_metadata: libbpf_rs::OpenMapMut<'obj>, + _phantom: std::marker::PhantomData<&'obj ()>, + } + + impl<'obj> OpenCharmapMaps<'obj> { + #[allow(unused_variables)] + unsafe fn new( + config: &libbpf_rs::__internal_skel::ObjectSkeletonConfig<'_>, + object: &mut libbpf_rs::OpenObject, + ) -> libbpf_rs::Result { + let mut func_metadata = None; + let object = + unsafe { std::mem::transmute::<&mut libbpf_rs::OpenObject, &'obj mut libbpf_rs::OpenObject>(object) }; + #[allow(clippy::never_loop)] + for map in object.maps_mut() { + let name = map.name().to_str().ok_or_else(|| { + libbpf_rs::Error::from(std::io::Error::new( + std::io::ErrorKind::InvalidData, + "map has invalid name", + )) + })?; + #[allow(clippy::match_single_binding)] + match name { + "func_metadata" => func_metadata = Some(map), + _ => panic!("encountered unexpected map: `{name}`"), + } + } + + let slf = Self { + func_metadata: func_metadata.expect("map `func_metadata` not present"), + _phantom: std::marker::PhantomData, + }; + Ok(slf) + } + } + pub struct CharmapMaps<'obj> { + pub func_metadata: libbpf_rs::MapMut<'obj>, + _phantom: std::marker::PhantomData<&'obj ()>, + } + + impl<'obj> CharmapMaps<'obj> { + #[allow(unused_variables)] + unsafe fn new( + config: &libbpf_rs::__internal_skel::ObjectSkeletonConfig<'_>, + object: &mut libbpf_rs::Object, + ) -> libbpf_rs::Result { + let mut func_metadata = None; + let object = unsafe { std::mem::transmute::<&mut libbpf_rs::Object, &'obj mut libbpf_rs::Object>(object) }; + #[allow(clippy::never_loop)] + for map in object.maps_mut() { + let name = map.name().to_str().ok_or_else(|| { + libbpf_rs::Error::from(std::io::Error::new( + std::io::ErrorKind::InvalidData, + "map has invalid name", + )) + })?; + #[allow(clippy::match_single_binding)] + match name { + "func_metadata" => func_metadata = Some(map), + _ => panic!("encountered unexpected map: `{name}`"), + } + } + + let slf = Self { + func_metadata: func_metadata.expect("map `func_metadata` not present"), + _phantom: std::marker::PhantomData, + }; + Ok(slf) + } + } + pub struct OpenCharmapProgs<'obj> { + _phantom: std::marker::PhantomData<&'obj ()>, + } + + impl<'obj> OpenCharmapProgs<'obj> { + unsafe fn new(object: &mut libbpf_rs::OpenObject) -> libbpf_rs::Result { + let object = + unsafe { std::mem::transmute::<&mut libbpf_rs::OpenObject, &'obj mut libbpf_rs::OpenObject>(object) }; + for prog in 
object.progs_mut() { + let name = prog.name().to_str().ok_or_else(|| { + libbpf_rs::Error::from(std::io::Error::new( + std::io::ErrorKind::InvalidData, + "prog has invalid name", + )) + })?; + match name { + _ => panic!("encountered unexpected prog: `{name}`"), + } + } + + let slf = Self { + _phantom: std::marker::PhantomData, + }; + Ok(slf) + } + } + pub struct CharmapProgs<'obj> { + _phantom: std::marker::PhantomData<&'obj ()>, + } + + impl<'obj> CharmapProgs<'obj> { + #[allow(unused_variables)] + fn new(open_progs: OpenCharmapProgs<'obj>) -> Self { + Self { + _phantom: std::marker::PhantomData, + } + } + } + struct OwnedRef<'obj, O> { + object: Option<&'obj mut std::mem::MaybeUninit>, + } + + impl<'obj, O> OwnedRef<'obj, O> { + /// # Safety + /// The object has to be initialized. + unsafe fn new(object: &'obj mut std::mem::MaybeUninit) -> Self { + Self { object: Some(object) } + } + + fn as_ref(&self) -> &O { + // SAFETY: As per the contract during construction, the + // object has to be initialized. + unsafe { self.object.as_ref().unwrap().assume_init_ref() } + } + + fn as_mut(&mut self) -> &mut O { + // SAFETY: As per the contract during construction, the + // object has to be initialized. + unsafe { self.object.as_mut().unwrap().assume_init_mut() } + } + + fn take(mut self) -> &'obj mut std::mem::MaybeUninit { + self.object.take().unwrap() + } + } + + impl Drop for OwnedRef<'_, O> { + fn drop(&mut self) { + if let Some(object) = &mut self.object { + unsafe { object.assume_init_drop() } + } + } + } + + #[derive(Default)] + pub struct CharmapSkelBuilder { + pub obj_builder: libbpf_rs::ObjectBuilder, + } + + impl<'obj> CharmapSkelBuilder { + fn open_opts_impl( + self, + open_opts: *const libbpf_sys::bpf_object_open_opts, + object: &'obj mut std::mem::MaybeUninit, + ) -> libbpf_rs::Result> { + let skel_config = build_skel_config()?; + let skel_ptr = skel_config.as_libbpf_object(); + + let ret = unsafe { libbpf_sys::bpf_object__open_skeleton(skel_ptr.as_ptr(), open_opts) }; + if ret != 0 { + return Err(libbpf_rs::Error::from_raw_os_error(-ret)); + } + + // SAFETY: `skel_ptr` points to a valid object after the + // open call. + let obj_ptr = unsafe { *skel_ptr.as_ref().obj }; + // SANITY: `bpf_object__open_skeleton` should have + // allocated the object. + let obj_ptr = std::ptr::NonNull::new(obj_ptr).unwrap(); + // SAFETY: `obj_ptr` points to an opened object after + // skeleton open. + let obj = unsafe { libbpf_rs::OpenObject::from_ptr(obj_ptr) }; + let _obj = object.write(obj); + // SAFETY: We just wrote initialized data to `object`. + let mut obj_ref = unsafe { OwnedRef::new(object) }; + + #[allow(unused_mut)] + let mut skel = OpenCharmapSkel { + maps: unsafe { OpenCharmapMaps::new(&skel_config, obj_ref.as_mut())? }, + progs: unsafe { OpenCharmapProgs::new(obj_ref.as_mut())? }, + obj: obj_ref, + // SAFETY: Our `struct_ops` type contains only pointers, + // which are allowed to be NULL. + // TODO: Generate and use a `Default` representation + // instead, to cut down on unsafe code. 
+ struct_ops: unsafe { std::mem::zeroed() }, + skel_config, + }; + + Ok(skel) + } + } + + impl<'obj> SkelBuilder<'obj> for CharmapSkelBuilder { + type Output = OpenCharmapSkel<'obj>; + fn open( + self, + object: &'obj mut std::mem::MaybeUninit, + ) -> libbpf_rs::Result> { + self.open_opts_impl(std::ptr::null(), object) + } + + fn open_opts( + self, + open_opts: libbpf_sys::bpf_object_open_opts, + object: &'obj mut std::mem::MaybeUninit, + ) -> libbpf_rs::Result> { + self.open_opts_impl(&open_opts, object) + } + + fn object_builder(&self) -> &libbpf_rs::ObjectBuilder { + &self.obj_builder + } + fn object_builder_mut(&mut self) -> &mut libbpf_rs::ObjectBuilder { + &mut self.obj_builder + } + } + + #[derive(Debug, Clone)] + #[repr(C)] + pub struct StructOps {} + + impl StructOps {} + pub mod types { + #[allow(unused_imports)] + use super::*; + #[derive(Debug, Copy, Clone)] + #[repr(C)] + pub struct __anon_1 { + pub r#type: *mut [i32; 1], + pub max_entries: *mut [i32; 50], + pub key_size: *mut [i32; 15], + pub value_size: *mut [i32; 12], + } + impl Default for __anon_1 { + fn default() -> Self { + Self { + r#type: std::ptr::null_mut(), + max_entries: std::ptr::null_mut(), + key_size: std::ptr::null_mut(), + value_size: std::ptr::null_mut(), + } + } + } + #[derive(Debug, Copy, Clone)] + #[repr(C)] + pub struct license { + pub _license: [i8; 4], + } + #[derive(Debug, Copy, Clone)] + #[repr(C)] + pub struct maps { + pub func_metadata: __anon_1, + } + } + pub struct OpenCharmapSkel<'obj> { + obj: OwnedRef<'obj, libbpf_rs::OpenObject>, + pub maps: OpenCharmapMaps<'obj>, + pub progs: OpenCharmapProgs<'obj>, + pub struct_ops: StructOps, + skel_config: libbpf_rs::__internal_skel::ObjectSkeletonConfig<'obj>, + } + + impl<'obj> OpenSkel<'obj> for OpenCharmapSkel<'obj> { + type Output = CharmapSkel<'obj>; + fn load(self) -> libbpf_rs::Result> { + let skel_ptr = self.skel_config.as_libbpf_object().as_ptr(); + + let ret = unsafe { libbpf_sys::bpf_object__load_skeleton(skel_ptr) }; + if ret != 0 { + return Err(libbpf_rs::Error::from_raw_os_error(-ret)); + } + + let obj_ref = self.obj.take(); + let open_obj = std::mem::replace(obj_ref, std::mem::MaybeUninit::uninit()); + // SAFETY: `open_obj` is guaranteed to be properly + // initialized as it came from an `OwnedRef`. + let obj_ptr = unsafe { open_obj.assume_init().take_ptr() }; + // SAFETY: `obj_ptr` points to a loaded object after + // skeleton load. + let obj = unsafe { libbpf_rs::Object::from_ptr(obj_ptr) }; + // SAFETY: `OpenObject` and `Object` are guaranteed to + // have the same memory layout. + let obj_ref = unsafe { + std::mem::transmute::< + &'obj mut std::mem::MaybeUninit, + &'obj mut std::mem::MaybeUninit, + >(obj_ref) + }; + let _obj = obj_ref.write(obj); + // SAFETY: We just wrote initialized data to `obj_ref`. + let mut obj_ref = unsafe { OwnedRef::new(obj_ref) }; + + Ok(CharmapSkel { + maps: unsafe { CharmapMaps::new(&self.skel_config, obj_ref.as_mut())? 
}, + progs: CharmapProgs::new(self.progs), + obj: obj_ref, + struct_ops: self.struct_ops, + skel_config: self.skel_config, + }) + } + + fn open_object(&self) -> &libbpf_rs::OpenObject { + self.obj.as_ref() + } + + fn open_object_mut(&mut self) -> &mut libbpf_rs::OpenObject { + self.obj.as_mut() + } + } + pub struct CharmapSkel<'obj> { + obj: OwnedRef<'obj, libbpf_rs::Object>, + pub maps: CharmapMaps<'obj>, + pub progs: CharmapProgs<'obj>, + struct_ops: StructOps, + skel_config: libbpf_rs::__internal_skel::ObjectSkeletonConfig<'obj>, + } + + unsafe impl Send for CharmapSkel<'_> {} + unsafe impl Sync for CharmapSkel<'_> {} + + impl<'obj> Skel<'obj> for CharmapSkel<'obj> { + fn object(&self) -> &libbpf_rs::Object { + self.obj.as_ref() + } + + fn object_mut(&mut self) -> &mut libbpf_rs::Object { + self.obj.as_mut() + } + } + impl CharmapSkel<'_> { + pub fn struct_ops_raw(&self) -> *const StructOps { + &self.struct_ops + } + + pub fn struct_ops(&self) -> &StructOps { + &self.struct_ops + } + } + const DATA: &[u8] = &[ + 127, 69, 76, 70, 2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 247, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, 0, 0, 0, 0, 0, 64, 0, 6, 0, 1, 0, 0, 46, 115, 116, 114, + 116, 97, 98, 0, 46, 115, 121, 109, 116, 97, 98, 0, 108, 105, 99, 101, 110, 115, 101, 0, 46, 109, 97, 112, 115, + 0, 99, 104, 97, 114, 109, 97, 112, 46, 98, 112, 102, 46, 99, 0, 95, 108, 105, 99, 101, 110, 115, 101, 0, 102, + 117, 110, 99, 95, 109, 101, 116, 97, 100, 97, 116, 97, 0, 46, 66, 84, 70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 31, 0, 0, 0, 4, 0, 241, 255, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 45, 0, 0, 0, 17, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 54, 0, 0, + 0, 17, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 71, 80, 76, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 159, 235, 1, 0, 24, 0, 0, 0, 0, + 0, 0, 0, 100, 1, 0, 0, 100, 1, 0, 0, 104, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, + 4, 0, 0, 0, 32, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 0, + 0, 0, 1, 4, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 2, 0, + 0, 0, 4, 0, 0, 0, 50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 2, 0, 0, + 0, 4, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 2, 0, 0, + 0, 4, 0, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 4, 32, 0, 0, 0, 25, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 30, 0, 0, + 0, 5, 0, 0, 0, 64, 0, 0, 0, 42, 0, 0, 0, 7, 0, 0, 0, 128, 0, 0, 0, 51, 0, 0, 0, 9, 0, 0, 0, 192, 0, 0, 0, 62, + 0, 0, 0, 0, 0, 0, 14, 11, 0, 0, 0, 1, 0, 0, 0, 76, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 8, 0, 0, 1, 0, 0, 0, 0, 0, + 0, 0, 3, 0, 0, 0, 0, 13, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 81, 0, 0, 0, 0, 0, 0, 14, 14, 0, 0, 0, 1, 0, 0, 0, + 90, 0, 0, 0, 1, 0, 0, 15, 4, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 98, 0, 0, 0, 1, 0, 0, 15, 32, 0, 0, + 0, 12, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 105, 110, 116, 0, 95, 95, 65, 82, 82, 65, 89, 95, 83, 73, 90, 69, + 95, 84, 89, 80, 69, 95, 95, 0, 116, 121, 112, 101, 0, 109, 97, 120, 95, 101, 110, 116, 114, 105, 101, 115, 0, + 107, 101, 121, 95, 115, 105, 122, 101, 0, 118, 97, 108, 117, 101, 95, 115, 105, 122, 101, 0, 102, 117, 110, 99, + 95, 109, 101, 116, 97, 100, 97, 116, 97, 
0, 99, 104, 97, 114, 0, 95, 108, 105, 99, 101, 110, 115, 101, 0, 108, + 105, 99, 101, 110, 115, 101, 0, 46, 109, 97, 112, 115, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 3, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 64, 0, 0, 0, 0, 0, 0, 0, 73, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 9, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 144, 0, 0, 0, 0, 0, 0, 0, + 96, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 24, 0, 0, 0, 0, 0, 0, 0, 17, 0, 0, 0, + 1, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 240, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 0, 0, 0, 1, 0, 0, 0, 3, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 248, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 68, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 24, 1, 0, 0, 0, 0, 0, 0, 228, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + ]; +} diff --git "a/src/Il\303\272vatar/iluvatar_bpf_library/src/bpf/func_characs.rs" "b/src/Il\303\272vatar/iluvatar_bpf_library/src/bpf/func_characs.rs" new file mode 100644 index 00000000..11cfee16 --- /dev/null +++ "b/src/Il\303\272vatar/iluvatar_bpf_library/src/bpf/func_characs.rs" @@ -0,0 +1,85 @@ +use anyhow::Result; + +use std::fs; +use std::mem::MaybeUninit; +use std::path::Path; + +use libbpf_rs::skel::OpenSkel; +use libbpf_rs::skel::SkelBuilder; +//use libbpf_rs::ErrorExt; +use libbpf_rs::MapCore; +use libbpf_rs::MapFlags; + +#[allow(clippy::never_loop)] +#[allow(clippy::match_single_binding)] +mod charmap { + include!(concat!(env!("CARGO_MANIFEST_DIR"), "/src/bpf/charmap.skel.rs")); +} +pub use charmap::CharmapSkel; +use charmap::*; +use std::fmt::{Debug, Error, Formatter}; + +impl Debug for CharmapSkel<'_> { + // Required method + fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), Error> { + f.debug_struct("BPF Program").finish() + } +} + +#[allow(non_camel_case_types)] +pub type BPF_FMAP_KEY = [u8; 15]; + +#[derive(Debug, Copy, Clone)] +#[repr(C)] +pub struct CharVal { + pub prio: u32, + pub e2e: u32, + pub loc: u32, +} + +/// # Safety +/// This function takes the reference and generates a raw pointer to the data. +/// It should be used with caution as it can lead to undefined behavior if the data is not valid. 
+pub unsafe fn any_as_u8_slice<T: Sized>(p: &T) -> &[u8] {
+    ::core::slice::from_raw_parts((p as *const T) as *const u8, ::core::mem::size_of::<T>())
+}
+
+pub fn build_and_load(open_object: &mut MaybeUninit<libbpf_rs::OpenObject>) -> Result<CharmapSkel<'_>> {
+    let mut skel_builder = CharmapSkelBuilder::default();
+    skel_builder.obj_builder.debug(true);
+    let open_skel = skel_builder.open(open_object)?;
+    let mut skel = open_skel.load()?;
+
+    // re-pin from scratch: drop any stale pin left by a previous run
+    let path = "/sys/fs/bpf/func_metadata";
+    if Path::new(path).exists() {
+        let _ = fs::remove_file(path);
+    }
+
+    let fcmap = &mut skel.maps.func_metadata;
+
+    fcmap.pin(path).expect("failed to pin map");
+    assert!(Path::new(path).exists());
+
+    Ok(skel)
+}
+
+pub fn build_bpf_key(key: &str) -> [u8; 15] {
+    let mut keyb: BPF_FMAP_KEY = [0; 15];
+
+    // zero-padded; keys longer than 15 bytes are silently truncated
+    for (i, k) in key.bytes().enumerate() {
+        if i < keyb.len() {
+            keyb[i] = k;
+        }
+    }
+    keyb
+}
+
+pub fn update_map<'obj>(map: &'obj libbpf_rs::MapMut<'obj>, key: &BPF_FMAP_KEY, val: &CharVal) {
+    let val: &[u8] = unsafe { any_as_u8_slice(val) };
+    match map.update(key, val, MapFlags::ANY) {
+        Ok(_) => (),
+        Err(e) => {
+            println!("error: unable to update the map {:?}", e);
+        }
+    }
+}
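A consumer in another process can open the pinned map directly. A hedged sketch: MapHandle::from_pinned_path and MapCore::lookup are stock libbpf-rs APIs, but the function itself is illustrative and assumes the value bytes were written via any_as_u8_slice on a CharVal:

use anyhow::Result;
use iluvatar_bpf_library::bpf::func_characs::{build_bpf_key, CharVal};
use libbpf_rs::{MapCore, MapFlags, MapHandle};

// Look up one function's characteristics from the pinned map.
fn read_characteristics(fqdn: &str) -> Result<Option<CharVal>> {
    let map = MapHandle::from_pinned_path("/sys/fs/bpf/func_metadata")?;
    let key = build_bpf_key(fqdn);
    Ok(map.lookup(&key, MapFlags::ANY)?.map(|bytes| {
        // assumption: the value is exactly a CharVal, as written by update_map
        unsafe { std::ptr::read_unaligned(bytes.as_ptr() as *const CharVal) }
    }))
}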
diff --git "a/src/Il\303\272vatar/iluvatar_bpf_library/src/bpf/mod.rs" "b/src/Il\303\272vatar/iluvatar_bpf_library/src/bpf/mod.rs"
new file mode 100644
index 00000000..af422abd
--- /dev/null
+++ "b/src/Il\303\272vatar/iluvatar_bpf_library/src/bpf/mod.rs"
@@ -0,0 +1 @@
+pub mod func_characs;
diff --git "a/src/Il\303\272vatar/iluvatar_bpf_library/src/lib.rs" "b/src/Il\303\272vatar/iluvatar_bpf_library/src/lib.rs"
new file mode 100644
index 00000000..c773a872
--- /dev/null
+++ "b/src/Il\303\272vatar/iluvatar_bpf_library/src/lib.rs"
@@ -0,0 +1,3 @@
+extern crate anyhow;
+
+pub mod bpf;
diff --git "a/src/Il\303\272vatar/iluvatar_library/Cargo.toml" "b/src/Il\303\272vatar/iluvatar_library/Cargo.toml"
index cab46435..6d36a890 100644
--- "a/src/Il\303\272vatar/iluvatar_library/Cargo.toml"
+++ "b/src/Il\303\272vatar/iluvatar_library/Cargo.toml"
@@ -39,6 +39,12 @@ influxdb2-structmap = "0.2"
 async-process = "2.0.0"
 async-trait = "0.1.*"
 num_cpus = "1.16"
+csv = "1.3.0"
+regex = "*"
+iluvatar_bpf_library = { path = "../iluvatar_bpf_library" }
+libbpf-rs = { version = "0.24.4" }
+glob = "0.3.1"
+array_tool = "1.0.3"
 
 [dev-dependencies]
 rstest = "0.13.0"
diff --git "a/src/Il\303\272vatar/iluvatar_library/src/cgroup_interaction.rs" "b/src/Il\303\272vatar/iluvatar_library/src/cgroup_interaction.rs"
new file mode 100644
index 00000000..110a0b52
--- /dev/null
+++ "b/src/Il\303\272vatar/iluvatar_library/src/cgroup_interaction.rs"
@@ -0,0 +1,655 @@
+/// The purpose of this file is to read all the information currently available for a given cgroup,
+/// for both cgroup v1 and v2,
+/// and for both the docker and containerd backends - wherever the given cgroup id is found.
+use crate::bail_error;
+use anyhow::Result;
+use array_tool::vec::Union;
+use glob::glob;
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
+use std::str::FromStr;
+use std::{fs::File, io::Read, path::Path};
+
+// base cgroup mount location
+const BASE_CGROUP_DIR: &str = "/sys/fs/cgroup";
+
+/*
+   v1 locations and formats
+
+   lists of u64, one entry per line:
+
+   tasks ✓
+     11753
+     11754
+     11755
+     11756
+
+   cgroup.procs ✓
+     11646
+     11673
+     11674
+     11675
+
+   key-value pairs:
+
+   cpu.stat ✓
+     nr_periods 0
+     nr_throttled 0
+     throttled_time 0
+*/
+const DOCKER_LOC_V1: &str = "cpu,cpuacct/docker";
+const V1_METRIC_SYS: &str = "cpuacct.usage_sys";
+const V1_METRIC_USR: &str = "cpuacct.usage_user";
+const V1_METRIC_PCPU_SYS: &str = "cpuacct.usage_percpu_sys";
+const V1_METRIC_PCPU_USR: &str = "cpuacct.usage_percpu_user";
+const V1_METRIC_TASKS: &str = "tasks";
+const V1_METRIC_PROCS: &str = "cgroup.procs";
+const V1_METRIC_STAT_CPU: &str = "cpu.stat";
+
+// Keys
+pub const KEY_NR_PERIODS: &str = "nr_periods";
+pub const KEY_NR_THROTTLED: &str = "nr_throttled";
+pub const KEY_THROTTLED_TIME: &str = "throttled_time";
+
+pub const KEY_USER_USEC: &str = "user_usec";
+pub const KEY_SYSTEM_USEC: &str = "system_usec";
+pub const KEY_USAGE_USEC: &str = "usage_usec";
+
+/*
+   v2 locations and formats
+   /sys/fs/cgroup/unified/docker/89e979a2b0e9fd30d9b469c48c59a7650640851559db603bc13faa70eb57576b/cgroup.events
+
+   custom parsing:
+
+   cpu.pressure
+     some avg10=0.00 avg60=0.00 avg300=0.00 total=22
+
+   io.pressure
+     some avg10=0.00 avg60=0.00 avg300=0.00 total=315303
+     full avg10=0.00 avg60=0.00 avg300=0.00 total=315297
+
+   memory.pressure
+     some avg10=0.00 avg60=0.00 avg300=0.00 total=0
+     full avg10=0.00 avg60=0.00 avg300=0.00 total=0
+
+   lists of u64, one entry per line:
+
+   cgroup.threads ✓
+     11752
+     11753
+     11754
+     11755
+     11756
+
+   cgroup.procs ✓
+     11646
+     11673
+     11674
+     11675
+
+   key-value pairs:
+
+   cpu.stat ✓
+     usage_usec 84553241
+     user_usec 25088308
+     system_usec 59464932
+*/
+const DOCKER_LOC_V2: &str = "unified/docker"; // /cgroupid/
+const V2_METRIC_PRS_CPU: &str = "cpu.pressure";
+const V2_METRIC_PRS_IO: &str = "io.pressure";
+const V2_METRIC_PRS_MEM: &str = "memory.pressure";
+const V2_METRIC_STAT_CPU: &str = "cpu.stat";
+const V2_METRIC_TASKS: &str = "cgroup.threads";
+const V2_METRIC_PROCS: &str = "cgroup.procs";
+
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+pub struct CGROUPV2PsiVal {
+    pub avg10: f32, // % of time the group waited for the resource to become available,
+    // averaged over the last 10 seconds
+    pub avg60: f32,
+    pub avg300: f32,
+    pub total: u64, // accumulated microseconds of stall
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+pub struct CGROUPV2Psi {
+    pub some: CGROUPV2PsiVal,
+    pub full: CGROUPV2PsiVal,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+pub struct CGROUPReadingV2 {
+    pub threads: Vec<u64>,
+    pub procs: Vec<u64>,
+    pub cpustats: HashMap<String, u64>,
+    pub cpupsi: CGROUPV2Psi,
+    pub mempsi: CGROUPV2Psi,
+    pub iopsi: CGROUPV2Psi,
+}
+
+// the derived Serialize/Deserialize impls already make it easy to dump
+// this structure as json (or csv) for offline analysis
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+pub struct CGROUPReading {
+    // v1
+    pub usr: u64,
+    pub sys: u64,
+    pub pcpu_usr: Vec<u64>,
+    pub pcpu_sys: Vec<u64>,
+    pub threads: Vec<u64>,
+    pub procs: Vec<u64>,
+    pub cpustats: HashMap<String, u64>,
+
+    // v2
+    pub v2: CGROUPReadingV2,
+}
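With Serialize derived, the JSON export mentioned above is a one-liner. Illustrative only, and serde_json is an assumed extra dependency (it is not added anywhere in this diff):

use iluvatar_library::cgroup_interaction::CGROUPReading;

// Dump a reading as pretty JSON for offline analysis.
fn to_json(reading: &CGROUPReading) -> anyhow::Result<String> {
    Ok(serde_json::to_string_pretty(reading)?)
}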
+impl Default for CGROUPReading {
+    fn default() -> Self {
+        CGROUPReading {
+            usr: 0,
+            sys: 0,
+            pcpu_usr: vec![0, 0],
+            pcpu_sys: vec![0, 0],
+            threads: vec![],
+            procs: vec![],
+            cpustats: [
+                (KEY_NR_PERIODS.to_string(), 0),
+                (KEY_NR_THROTTLED.to_string(), 0),
+                (KEY_THROTTLED_TIME.to_string(), 0),
+            ]
+            .iter()
+            .cloned()
+            .collect(),
+
+            v2: CGROUPReadingV2 {
+                threads: vec![],
+                procs: vec![],
+                cpustats: [
+                    (KEY_NR_PERIODS.to_string(), 0),
+                    (KEY_NR_THROTTLED.to_string(), 0),
+                    (KEY_THROTTLED_TIME.to_string(), 0),
+                ]
+                .iter()
+                .cloned()
+                .collect(),
+                cpupsi: CGROUPV2Psi {
+                    some: CGROUPV2PsiVal {
+                        avg10: 0.00,
+                        avg60: 0.00,
+                        avg300: 0.00,
+                        total: 0,
+                    },
+                    full: CGROUPV2PsiVal {
+                        avg10: 0.00,
+                        avg60: 0.00,
+                        avg300: 0.00,
+                        total: 0,
+                    },
+                },
+                mempsi: CGROUPV2Psi {
+                    some: CGROUPV2PsiVal {
+                        avg10: 0.00,
+                        avg60: 0.00,
+                        avg300: 0.00,
+                        total: 0,
+                    },
+                    full: CGROUPV2PsiVal {
+                        avg10: 0.00,
+                        avg60: 0.00,
+                        avg300: 0.00,
+                        total: 0,
+                    },
+                },
+                iopsi: CGROUPV2Psi {
+                    some: CGROUPV2PsiVal {
+                        avg10: 0.00,
+                        avg60: 0.00,
+                        avg300: 0.00,
+                        total: 0,
+                    },
+                    full: CGROUPV2PsiVal {
+                        avg10: 0.00,
+                        avg60: 0.00,
+                        avg300: 0.00,
+                        total: 0,
+                    },
+                },
+            },
+        }
+    }
+}
+
+pub fn build_path(cgroupid: &String, metric: &str, docker_loc: &str) -> String {
+    format!("{}/{}/{}*/{}", BASE_CGROUP_DIR, docker_loc, cgroupid, metric)
+}
+
+fn read_to_string(path: &String) -> Result<String> {
+    let pth = Path::new(path);
+    let mut opened = match File::open(pth) {
+        Ok(b) => b,
+        Err(_e) => {
+            bail_error!(path=%path, "couldn't open path for reading");
+        }
+    };
+    let mut buff = String::new();
+    match opened.read_to_string(&mut buff) {
+        Ok(_) => Ok(buff),
+        Err(e) => {
+            bail_error!(error=%e, "Unable to read cgroup file into buffer")
+        }
+    }
+}
+
+fn read_to_string_first_match(cgroupid: &String, metric: &str, docker_loc: &str) -> Option<String> {
+    let path = build_path(cgroupid, metric, docker_loc);
+    for entry in glob(path.as_str()).expect("Failed to read glob pattern") {
+        match entry {
+            Ok(pathb) => match read_to_string(&pathb.into_os_string().into_string().unwrap()) {
+                Ok(r) => return Some(r),
+                Err(_) => return None,
+            },
+            Err(e) => println!("{:?}", e),
+        };
+    }
+    None
+}
+
+pub fn read_as_u64(cgroupid: &String, metric: &str, docker_loc: &str) -> u64 {
+    match read_to_string_first_match(cgroupid, metric, docker_loc) {
+        Some(r) => {
+            //println!("{} -> {}", cgroupid, r);
+            return r.trim().parse::<u64>().unwrap_or(0);
+        }
+        None => 0,
+    }
+}
+
+pub fn read_as_u64_vec(cgroupid: &String, metric: &str, docker_loc: &str) -> Vec<u64> {
+    match read_to_string_first_match(cgroupid, metric, docker_loc) {
+        Some(r) => {
+            // the file is either space-separated (percpu usage) or newline-separated (tasks/procs)
+            let mut nums: Vec<u64> = r
+                .trim()
+                .split(" ")
+                .map(|n| n.trim().parse::<u64>().unwrap_or(0))
+                .collect();
+            if nums.len() == 1 {
+                nums = r
+                    .trim()
+                    .split("\n")
+                    .map(|n| n.trim().parse::<u64>().unwrap_or(0))
+                    .collect();
+            }
+            nums
+        }
+        None => vec![],
+    }
+}
+
+pub fn read_as_u64_hashmap(cgroupid: &String, metric: &str, docker_loc: &str) -> HashMap<String, u64> {
+    match read_to_string_first_match(cgroupid, metric, docker_loc) {
+        Some(r) => {
+            let pairs: Vec<&str> = r.trim().split("\n").collect();
+            let tuples: Vec<Vec<&str>> = pairs.iter().map(|x| x.trim().split(" ").collect()).collect();
+            let mut result = HashMap::<String, u64>::new();
+            for pair in tuples.iter() {
+                result.insert(pair[0].to_string(), pair[1].trim().parse::<u64>().unwrap_or(0));
+            }
+            result
+        }
+        None => HashMap::new(),
+    }
+}
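To make the parsing contract concrete, a small illustrative test of the key-value format these helpers expect; the sample text mirrors the cpu.stat excerpts in the comments above, and the logic is extracted so it runs without touching /sys/fs/cgroup:

use std::collections::HashMap;

// Illustrative only: the core of read_as_u64_hashmap as a pure function.
fn parse_kv(raw: &str) -> HashMap<String, u64> {
    let mut result = HashMap::new();
    for line in raw.trim().split('\n') {
        let mut parts = line.trim().split(' ');
        if let (Some(k), Some(v)) = (parts.next(), parts.next()) {
            result.insert(k.to_string(), v.trim().parse::<u64>().unwrap_or(0));
        }
    }
    result
}

#[test]
fn parses_cpu_stat_sample() {
    let sample = "usage_usec 84553241\nuser_usec 25088308\nsystem_usec 59464932";
    let m = parse_kv(sample);
    assert_eq!(m["usage_usec"], 84553241);
    assert_eq!(m["system_usec"], 59464932);
}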
+#[allow(non_snake_case)]
+pub fn read_as_CGROUPV2Psi(cgroupid: &String, metric: &str, docker_loc: &str) -> CGROUPV2Psi {
+    let mut data = CGROUPV2Psi {
+        some: CGROUPV2PsiVal {
+            avg10: 0.0,
+            avg60: 0.0,
+            avg300: 0.0,
+            total: 0,
+        },
+        full: CGROUPV2PsiVal {
+            avg10: 0.0,
+            avg60: 0.0,
+            avg300: 0.0,
+            total: 0,
+        },
+    };
+
+    match read_to_string_first_match(cgroupid, metric, docker_loc) {
+        Some(r) => {
+            let lines: Vec<&str> = r.trim().split("\n").collect();
+            let fillval = |line: &str, val: &mut CGROUPV2PsiVal| {
+                // given line: "some avg10=0.00 avg60=0.00 avg300=0.00 total=22\n"
+                let stuff = &line.split(" ").collect::<Vec<&str>>()[1..];
+                let pairs: Vec<Vec<&str>> = stuff.iter().map(|x| x.split("=").collect()).collect();
+                val.avg10 = f32::from_str(pairs[0][1]).unwrap_or(0.0);
+                val.avg60 = f32::from_str(pairs[1][1]).unwrap_or(0.0);
+                val.avg300 = f32::from_str(pairs[2][1]).unwrap_or(0.0);
+                val.total = pairs[3][1].parse::<u64>().unwrap_or(0);
+            };
+            // fill in "some"
+            fillval(lines[0], &mut data.some);
+            if lines.len() == 2 {
+                // "full" is also present, fill it in as well
+                fillval(lines[1], &mut data.full);
+            }
+            data
+        }
+
+        None => data,
+    }
+}
+
+// TODO: remove hardcoding for docker location
+// TODO: dynamic location for cgroup (it's different in case of containerd and docker)
+pub fn read_cgroup(cgroupid: String) -> Result<CGROUPReading> {
+    Ok(CGROUPReading {
+        usr: read_as_u64(&cgroupid, V1_METRIC_USR, DOCKER_LOC_V1),
+        sys: read_as_u64(&cgroupid, V1_METRIC_SYS, DOCKER_LOC_V1),
+        pcpu_usr: read_as_u64_vec(&cgroupid, V1_METRIC_PCPU_USR, DOCKER_LOC_V1),
+        pcpu_sys: read_as_u64_vec(&cgroupid, V1_METRIC_PCPU_SYS, DOCKER_LOC_V1),
+        threads: read_as_u64_vec(&cgroupid, V1_METRIC_TASKS, DOCKER_LOC_V1),
+        procs: read_as_u64_vec(&cgroupid, V1_METRIC_PROCS, DOCKER_LOC_V1),
+        cpustats: read_as_u64_hashmap(&cgroupid, V1_METRIC_STAT_CPU, DOCKER_LOC_V1),
+
+        v2: CGROUPReadingV2 {
+            threads: read_as_u64_vec(&cgroupid, V2_METRIC_TASKS, DOCKER_LOC_V2),
+            procs: read_as_u64_vec(&cgroupid, V2_METRIC_PROCS, DOCKER_LOC_V2),
+            cpustats: read_as_u64_hashmap(&cgroupid, V2_METRIC_STAT_CPU, DOCKER_LOC_V2),
+            cpupsi: read_as_CGROUPV2Psi(&cgroupid, V2_METRIC_PRS_CPU, DOCKER_LOC_V2),
+            mempsi: read_as_CGROUPV2Psi(&cgroupid, V2_METRIC_PRS_MEM, DOCKER_LOC_V2),
+            iopsi: read_as_CGROUPV2Psi(&cgroupid, V2_METRIC_PRS_IO, DOCKER_LOC_V2),
+        },
+    })
+}
+
+/// Computes val1 - val0 field-wise: counters are differenced, psi averages are
+/// averaged, and thread/proc lists are unioned.
+pub fn diff_cgroupreading(val0: &CGROUPReading, val1: &CGROUPReading) -> CGROUPReading {
+    fn diff_vec(v0: &[u64], v1: &[u64]) -> Vec<u64> {
+        // note: assumes monotonically increasing counters of equal length
+        let mut diffvec = Vec::<u64>::new();
+        if v0.len() != v1.len() {
+            return vec![];
+        }
+        for (i, v) in v0.iter().enumerate() {
+            diffvec.push(v1[i] - v);
+        }
+        diffvec
+    }
+
+    fn diff_hashmap(v0: &HashMap<String, u64>, v1: &HashMap<String, u64>) -> HashMap<String, u64> {
+        let mut diffmap = HashMap::<String, u64>::new();
+        for (k, v) in v0.iter() {
+            if let Some(o) = v1.get(k) {
+                diffmap.insert(k.clone(), o - v);
+            }
+        }
+        diffmap
+    }
+
+    fn avg_psi(v0: &CGROUPV2PsiVal, v1: &CGROUPV2PsiVal) -> CGROUPV2PsiVal {
+        let avg = |a, b| (a + b) / 2.0;
+        CGROUPV2PsiVal {
+            avg10: avg(v1.avg10, v0.avg10),
+            avg60: avg(v1.avg60, v0.avg60),
+            avg300: avg(v1.avg300, v0.avg300),
+            total: v1.total - v0.total,
+        }
+    }
+
+    CGROUPReading {
+        usr: (val1.usr - val0.usr),
+        sys: (val1.sys - val0.sys),
+        pcpu_usr: diff_vec(&val0.pcpu_usr, &val1.pcpu_usr),
+        pcpu_sys: diff_vec(&val0.pcpu_sys, &val1.pcpu_sys),
+        threads: val1.threads.union(val0.threads.clone()),
+        procs: val1.procs.union(val0.procs.clone()),
+        cpustats: diff_hashmap(&val0.cpustats, &val1.cpustats),
+        v2: CGROUPReadingV2 {
+            threads: val1.v2.threads.union(val0.v2.threads.clone()),
+            procs: val1.v2.procs.union(val0.v2.procs.clone()),
+            cpustats: diff_hashmap(&val0.v2.cpustats, &val1.v2.cpustats),
+            cpupsi: CGROUPV2Psi {
+                some: avg_psi(&val0.v2.cpupsi.some, &val1.v2.cpupsi.some),
+                full: avg_psi(&val0.v2.cpupsi.full, &val1.v2.cpupsi.full),
+            },
+            mempsi: CGROUPV2Psi {
+                some: avg_psi(&val0.v2.mempsi.some, &val1.v2.mempsi.some),
+                full: avg_psi(&val0.v2.mempsi.full, &val1.v2.mempsi.full),
+            },
+            iopsi: CGROUPV2Psi {
+                some:
avg_psi(&val0.v2.iopsi.some, &val1.v2.iopsi.some), + full: avg_psi(&val0.v2.iopsi.full, &val1.v2.iopsi.full), + }, + }, + } +} + +#[cfg(test)] +mod cgroup_interaction_tests { + use crate::cgroup_interaction::*; + + #[test] + fn test_cg_diff() { + let v0 = CGROUPReading { + usr: 20, + sys: 20, + pcpu_usr: vec![20, 30], + pcpu_sys: vec![20, 30], + threads: vec![120, 130], + procs: vec![120, 130], + cpustats: [ + (KEY_NR_PERIODS.to_string(), 0), + (KEY_NR_THROTTLED.to_string(), 0), + (KEY_THROTTLED_TIME.to_string(), 0), + ] + .iter() + .cloned() + .collect(), + + v2: CGROUPReadingV2 { + threads: vec![120, 130], + procs: vec![120, 130], + cpustats: [ + (KEY_NR_PERIODS.to_string(), 0), + (KEY_NR_THROTTLED.to_string(), 0), + (KEY_THROTTLED_TIME.to_string(), 0), + ] + .iter() + .cloned() + .collect(), + cpupsi: CGROUPV2Psi { + some: CGROUPV2PsiVal { + avg10: 0.00, + avg60: 0.00, + avg300: 0.00, + total: 123, + }, + full: CGROUPV2PsiVal { + avg10: 0.00, + avg60: 0.00, + avg300: 0.00, + total: 123, + }, + }, + mempsi: CGROUPV2Psi { + some: CGROUPV2PsiVal { + avg10: 0.00, + avg60: 0.00, + avg300: 0.00, + total: 123, + }, + full: CGROUPV2PsiVal { + avg10: 0.00, + avg60: 0.00, + avg300: 0.00, + total: 123, + }, + }, + iopsi: CGROUPV2Psi { + some: CGROUPV2PsiVal { + avg10: 0.00, + avg60: 0.00, + avg300: 0.00, + total: 123, + }, + full: CGROUPV2PsiVal { + avg10: 0.00, + avg60: 0.00, + avg300: 0.00, + total: 123, + }, + }, + }, + }; + let v1 = CGROUPReading { + usr: 30, + sys: 30, + pcpu_usr: vec![30, 40], + pcpu_sys: vec![30, 40], + threads: vec![121, 131], + procs: vec![121, 131], + cpustats: [ + (KEY_NR_PERIODS.to_string(), 1), + (KEY_NR_THROTTLED.to_string(), 1), + (KEY_THROTTLED_TIME.to_string(), 1), + ] + .iter() + .cloned() + .collect(), + + v2: CGROUPReadingV2 { + threads: vec![121, 131], + procs: vec![121, 131], + cpustats: [ + (KEY_NR_PERIODS.to_string(), 1), + (KEY_NR_THROTTLED.to_string(), 1), + (KEY_THROTTLED_TIME.to_string(), 1), + ] + .iter() + .cloned() + .collect(), + cpupsi: CGROUPV2Psi { + some: CGROUPV2PsiVal { + avg10: 0.20, + avg60: 0.20, + avg300: 0.20, + total: 124, + }, + full: CGROUPV2PsiVal { + avg10: 0.20, + avg60: 0.20, + avg300: 0.20, + total: 124, + }, + }, + mempsi: CGROUPV2Psi { + some: CGROUPV2PsiVal { + avg10: 0.20, + avg60: 0.20, + avg300: 0.20, + total: 124, + }, + full: CGROUPV2PsiVal { + avg10: 0.20, + avg60: 0.20, + avg300: 0.20, + total: 124, + }, + }, + iopsi: CGROUPV2Psi { + some: CGROUPV2PsiVal { + avg10: 0.20, + avg60: 0.20, + avg300: 0.20, + total: 124, + }, + full: CGROUPV2PsiVal { + avg10: 0.20, + avg60: 0.20, + avg300: 0.20, + total: 124, + }, + }, + }, + }; + + let shouldbe = CGROUPReading { + usr: 10, + sys: 10, + pcpu_usr: vec![10, 10], + pcpu_sys: vec![10, 10], + threads: vec![121, 131, 120, 130], + procs: vec![121, 131, 120, 130], + cpustats: [ + (KEY_NR_PERIODS.to_string(), 1), + (KEY_NR_THROTTLED.to_string(), 1), + (KEY_THROTTLED_TIME.to_string(), 1), + ] + .iter() + .cloned() + .collect(), + + v2: CGROUPReadingV2 { + threads: vec![121, 131, 120, 130], + procs: vec![121, 131, 120, 130], + cpustats: [ + (KEY_NR_PERIODS.to_string(), 1), + (KEY_NR_THROTTLED.to_string(), 1), + (KEY_THROTTLED_TIME.to_string(), 1), + ] + .iter() + .cloned() + .collect(), + cpupsi: CGROUPV2Psi { + some: CGROUPV2PsiVal { + avg10: 0.10, + avg60: 0.10, + avg300: 0.10, + total: 1, + }, + full: CGROUPV2PsiVal { + avg10: 0.10, + avg60: 0.10, + avg300: 0.10, + total: 1, + }, + }, + mempsi: CGROUPV2Psi { + some: CGROUPV2PsiVal { + avg10: 0.10, + avg60: 0.10, + avg300: 0.10, + 
total: 1, + }, + full: CGROUPV2PsiVal { + avg10: 0.10, + avg60: 0.10, + avg300: 0.10, + total: 1, + }, + }, + iopsi: CGROUPV2Psi { + some: CGROUPV2PsiVal { + avg10: 0.10, + avg60: 0.10, + avg300: 0.10, + total: 1, + }, + full: CGROUPV2PsiVal { + avg10: 0.10, + avg60: 0.10, + avg300: 0.10, + total: 1, + }, + }, + }, + }; + + let diff = diff_cgroupreading(&v0, &v1); + assert_eq!(diff, shouldbe); + } +} diff --git "a/src/Il\303\272vatar/iluvatar_library/src/clock.rs" "b/src/Il\303\272vatar/iluvatar_library/src/clock.rs" index 97f1ae88..a76f9c4c 100644 --- "a/src/Il\303\272vatar/iluvatar_library/src/clock.rs" +++ "b/src/Il\303\272vatar/iluvatar_library/src/clock.rs" @@ -27,6 +27,14 @@ pub fn get_global_clock(tid: &TransactionId) -> Result { Ok(clk) } +/// Get the current timestamp in milliseconds based off global clock +pub fn get_global_timestamp_ms() -> i64 { + match get_global_clock(&gen_tid()) { + Ok(clk) => (clk.now().unix_timestamp_nanos() / 1000000) as i64, + Err(_) => 0, + } +} + /// Get the current [Instance] #[inline(always)] pub fn now() -> Instant { diff --git "a/src/Il\303\272vatar/iluvatar_library/src/lib.rs" "b/src/Il\303\272vatar/iluvatar_library/src/lib.rs" index e59d050d..8932d98f 100644 --- "a/src/Il\303\272vatar/iluvatar_library/src/lib.rs" +++ "b/src/Il\303\272vatar/iluvatar_library/src/lib.rs" @@ -6,7 +6,7 @@ pub mod transaction; pub mod utils; #[macro_use] pub mod macros; -pub mod characteristics_map; +pub mod cgroup_interaction; pub mod clock; pub mod continuation; pub mod energy; diff --git "a/src/Il\303\272vatar/iluvatar_library/src/utils/mod.rs" "b/src/Il\303\272vatar/iluvatar_library/src/utils/mod.rs" index 5a24187d..694e203c 100644 --- "a/src/Il\303\272vatar/iluvatar_library/src/utils/mod.rs" +++ "b/src/Il\303\272vatar/iluvatar_library/src/utils/mod.rs" @@ -15,41 +15,11 @@ use crate::utils::port::Port; use anyhow::Result; use async_process::Command as AsyncCommand; use std::collections::HashMap; -use std::num::ParseIntError; use std::process::{Child, Command, Output, Stdio}; -use std::{str, thread, time}; +use std::str; use tokio::signal::unix::{signal, Signal, SignalKind}; use tracing::{debug, info}; -pub fn get_child_pid(ppid: u32) -> Result { - let output = Command::new("pgrep") - .arg("-P") - .arg(ppid.to_string()) - .output() - .expect("failed to execute process"); - - str::from_utf8(&output.stdout).unwrap().trim().parse::() -} - -pub fn try_get_child_pid(ppid: u32, timeout_ms: u64, tries: u32) -> u32 { - let millis = time::Duration::from_millis(timeout_ms); - let mut tries = tries; - - while tries > 0 { - let r = get_child_pid(ppid); - - let cpid = r.unwrap_or(0); - if cpid != 0 { - return cpid; - } - - tries -= 1; - thread::sleep(millis); - } - - 0 -} - lazy_static::lazy_static! { // TODO: This probably shouldn't exist. Process-level global state causes weirdness, is generally bad programming, and prevents in-proc simulation on alternate threads. 
    static ref SIMULATION_CHECK: parking_lot::Mutex<bool> = parking_lot::Mutex::new(false);
@@ -208,7 +178,6 @@ where
     debug!(tid=%tid, command=%cmd_pth, args=?args, environment=?env, "executing host command");
     let mut cmd = prepare_cmd(cmd_pth, args, env, tid)?;
     cmd.stdout(Stdio::null()).stdin(Stdio::null()).stderr(Stdio::null());
-
     match cmd.spawn() {
         Ok(out) => Ok(out),
         Err(e) => {
diff --git "a/src/Il\303\272vatar/iluvatar_worker_library/Cargo.toml" "b/src/Il\303\272vatar/iluvatar_worker_library/Cargo.toml"
index f8411e68..8ea28598 100644
--- "a/src/Il\303\272vatar/iluvatar_worker_library/Cargo.toml"
+++ "b/src/Il\303\272vatar/iluvatar_worker_library/Cargo.toml"
@@ -46,6 +46,13 @@ bollard = "0.16"
 prost-types = "0.12"
 num_cpus = "1.16.0"
 async-trait = "0.1.80"
+ipc-channel = { version = "0.18.1", features = ["memfd"] }
+iluvatar_bpf_library = { path = "../iluvatar_bpf_library" }
+num = "0.4.3"
+
+[build-dependencies]
+tonic-build = "0.7.2"
+prost-build = "0.11"
 
 [dev-dependencies]
 rstest = "0.13.0"
@@ -54,6 +61,7 @@ anyhow = { version = "1.0.13", features = ["backtrace"] }
 reqwest = { version = "0.12.4", default-features = false, features = ["json", "rustls-tls"] }
 more-asserts = "0.3"
 rand = "0.8"
+float-cmp = "0.9.0"
 
 [features]
 full_spans = []
diff --git "a/src/Il\303\272vatar/iluvatar_worker_library/src/lib.rs" "b/src/Il\303\272vatar/iluvatar_worker_library/src/lib.rs"
index 6a653035..fef03989 100644
--- "a/src/Il\303\272vatar/iluvatar_worker_library/src/lib.rs"
+++ "b/src/Il\303\272vatar/iluvatar_worker_library/src/lib.rs"
@@ -1,2 +1,7 @@
 pub mod services;
+pub mod utils;
 pub mod worker_api;
+use crate::worker_api::fs_scheduler::Channels;
+use std::sync::RwLock;
+
+pub static mut SCHED_CHANNELS: Option<RwLock<Channels>> = None;
diff --git "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/containers/container_pool.rs" "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/containers/container_pool.rs"
index 4f330aec..0ef4f96f 100644
--- "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/containers/container_pool.rs"
+++ "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/containers/container_pool.rs"
@@ -1,6 +1,7 @@
 use super::structs::{Container, ContainerState};
 use anyhow::Result;
 use dashmap::DashMap;
+use iluvatar_bpf_library::bpf::func_characs::BPF_FMAP_KEY;
 use iluvatar_library::{bail_error, transaction::TransactionId, types::Compute};
 use std::sync::{
     atomic::{AtomicU32, Ordering},
@@ -21,15 +22,18 @@ enum PoolType {
 pub struct ContainerPool {
     idle_pool: Pool,
     running_pool: Pool,
+    tid_map: DashMap<TransactionId, BPF_FMAP_KEY>, // tid -> cgroup id
     /// fqdn->Vec<Container>
     len: AtomicU32,
     pool_name: String,
 }
+
 impl ContainerPool {
     pub fn new(compute: Compute) -> Self {
         ContainerPool {
             idle_pool: DashMap::new(),
             running_pool: DashMap::new(),
+            tid_map: DashMap::new(),
             len: AtomicU32::new(0),
             pool_name: format!("{:?}", compute),
         }
@@ -111,6 +115,8 @@
             Some(c) => {
                 debug!(tid=%tid, container_id=%c.container_id(), name=%self.pool_name, pool_type=?PoolType::Idle, "Removing random container from pool");
                 self.add_container(c.clone(), &self.running_pool, tid, PoolType::Running);
+                // the pool is the best place to maintain a tid -> cgroup id mapping
+                self.tid_map.insert(tid.clone(), c.get_cgroupid());
                 Some(c)
             }
             None => None,
@@ -226,6 +232,10 @@
         let (pos, pool_len) = self.find_container_pos(container, pool_list);
         if pos < pool_len {
             debug!(tid=%tid, container_id=%container.container_id(), name=%self.pool_name, pool_type=?pool_type, "Removing container from pool");
+            // 
self.tid_map.remove( tid ); + // we are not removing the tid here because + // charmap would query it after the invocation is done + // and we don't want the value to be absent Some(pool_list.remove(pos)) } else { None @@ -234,6 +244,15 @@ impl ContainerPool { None => None, } } + + pub fn get_cgroupid_against_tid(&self, tid: &TransactionId) -> Option { + self.tid_map.get(tid).map(|v| *v) + } + + pub fn remove_cgroupid_against_tid(&self, tid: &TransactionId) -> Option { + self.tid_map.remove(tid).map(|(_k, v)| v) + } + fn find_container_pos(&self, container: &Container, pool_list: &Subpool) -> (usize, usize) { let pool_len = pool_list.len(); let mut pos = usize::MAX; @@ -245,6 +264,22 @@ impl ContainerPool { } (pos, pool_len) } + + /// get all the cgroup_ids corresponding to given fqdn in this pool + pub fn get_cgroup_ids(&self, fqdn: &str) -> Vec { + let mut cgroup_ids = Vec::new(); + if let Some(c) = self.idle_pool.get(fqdn) { + for cont in &*c { + cgroup_ids.push(cont.get_cgroupid()); + } + } + if let Some(c) = self.running_pool.get(fqdn) { + for cont in &*c { + cgroup_ids.push(cont.get_cgroupid()); + } + } + cgroup_ids + } } #[cfg(test)] diff --git "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/containers/containerd/containerd.rs" "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/containers/containerd/containerd.rs" index 08216bb3..b7fc12dd 100644 --- "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/containers/containerd/containerd.rs" +++ "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/containers/containerd/containerd.rs" @@ -5,7 +5,7 @@ use crate::services::containers::structs::{Container, ContainerState}; use crate::services::network::namespace_manager::NamespaceManager; use crate::services::registration::RegisteredFunction; use crate::services::resources::gpu::GPU; -use crate::worker_api::worker_config::{ContainerResourceConfig, FunctionLimits}; +use crate::worker_api::worker_config::{ContainerResourceConfig, FunctionLimits, WorkerConfig}; use anyhow::Result; use client::services::v1::container::Runtime; use client::services::v1::snapshots::{snapshots_client::SnapshotsClient, PrepareSnapshotRequest}; @@ -31,44 +31,30 @@ use iluvatar_library::utils::{ cgroup::cgroup_namespace, file::{temp_file_pth, touch, try_remove_pth}, port::Port, - try_get_child_pid, }; use iluvatar_library::{bail_error, bail_error_value, error_value, transaction::TransactionId, types::MemSizeMb}; use inotify::{Inotify, WatchMask}; use oci_spec::image::{ImageConfiguration, ImageIndex, ImageManifest}; -use serde::Deserialize; use sha2::{Digest, Sha256}; use std::collections::HashMap; -use std::sync::mpsc; -use std::sync::mpsc::sync_channel; use std::sync::Arc; -use std::thread; use std::time::Duration; use tracing::{debug, error, info, warn}; pub mod containerdstructs; const CONTAINERD_SOCK: &str = "/run/containerd/containerd.sock"; -#[derive(Debug, Deserialize)] -pub struct BGPacket { - pid: u32, - fqdn: String, - container_id: String, - tid: TransactionId, -} - #[derive(Debug)] #[allow(dead_code)] pub struct ContainerdIsolation { channel: Option, namespace_manager: Arc, + worker_config: WorkerConfig, config: Arc, limits_config: Arc, docker_config: Option, downloaded_images: Arc>, creation_sem: Option, - tx: Arc>, - bg_workqueue: thread::JoinHandle>, } /// A service to handle the low-level details of containerd container lifecycles: @@ -93,17 +79,9 @@ impl ContainerdIsolation { true } - fn send_bg_packet(&self, pid: u32, fqdn: &str, container_id: &str, tid: 
&TransactionId) { - let _ = self.tx.send(BGPacket { - pid, - fqdn: String::from(fqdn), - container_id: container_id.to_owned(), - tid: tid.clone(), - }); - } - pub fn new( ns_man: Arc, + worker_config: WorkerConfig, config: Arc, limits_config: Arc, docker_config: Option, @@ -113,37 +91,17 @@ impl ContainerdIsolation { i => Some(tokio::sync::Semaphore::new(i as usize)), }; - let (send, recv) = sync_channel(30); - ContainerdIsolation { // this is threadsafe if we clone channel // https://docs.rs/tonic/0.4.0/tonic/transport/struct.Channel.html#multiplexing-requests channel: None, namespace_manager: ns_man, + worker_config, config, limits_config, docker_config, downloaded_images: Arc::new(DashMap::new()), creation_sem: sem, - tx: Arc::new(send), - bg_workqueue: thread::spawn(move || loop { - match recv.recv() { - Ok(x) => { - let ccpid = try_get_child_pid(x.pid, 1, 500); - info!( - tid=%x.tid, - fqdn=%x.fqdn, - container_id=%x.container_id, - pid=%x.pid, - cpid=%ccpid, - "tag_pid_mapping" - ); - } - Err(e) => { - bail_error!(error=%e, "background receive channel broken!"); - } - } - }), } } @@ -371,8 +329,12 @@ impl ContainerdIsolation { ctd_namespace: &str, tid: &TransactionId, ) -> Result<()> { - info!(tid=%tid, container_id=%container_id, "Removing container"); let mut client = TasksClient::new(self.channel()); + // container_id is of the form rodinia-needle-0.0.1-2DA0B908-D363-B1BD-34A3-C5F6B264FEFE + // we only need rodinia-needle-0.0.1 + let fqdn = container_id.split('-').take(3).collect::>().join("-"); + info!(tid=%tid, container_id=%container_id, fqdn=%fqdn, "Removing container"); + //self.dtx.send( fqdn ); self.kill_task(&mut client, container_id, ctd_namespace, tid).await?; self.delete_task(&mut client, container_id, ctd_namespace, tid).await?; @@ -737,12 +699,6 @@ impl ContainerIsolationService for ContainerdIsolation { Ok(r) => { debug!("Task {}: {:?} started", container.container_id, r); container.task.running = true; - self.send_bg_packet( - container.task.pid, - fqdn, - &container.task.container_id.clone().unwrap(), - tid, - ); Ok(Arc::new(container)) } Err(e) => { @@ -799,7 +755,9 @@ impl ContainerIsolationService for ContainerdIsolation { let mut handles = vec![]; for container in resp.into_inner().containers { let container_id = container.id.clone(); - info!(tid=%tid, container_id=%container_id, "Removing container"); + let fqdn = container_id.split('-').take(3).collect::>().join("-"); + info!(tid=%tid, container_id=%container_id, fqdn=%fqdn, "Removing container"); + //self.dtx.send( fqdn ); let svc_clone = self_src.clone(); let ns_clone = ctd_namespace.to_string(); diff --git "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/containers/containerd/containerdstructs.rs" "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/containers/containerd/containerdstructs.rs" index 2a6ad462..a65e2cbc 100644 --- "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/containers/containerd/containerdstructs.rs" +++ "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/containers/containerd/containerdstructs.rs" @@ -9,6 +9,7 @@ use crate::services::{ resources::gpu::GPU, }; use anyhow::Result; +use iluvatar_bpf_library::bpf::func_characs::{build_bpf_key, BPF_FMAP_KEY}; use iluvatar_library::clock::now; use iluvatar_library::types::{err_val, ResultErrorVal}; use iluvatar_library::{ @@ -49,6 +50,7 @@ pub struct ContainerdContainer { compute: Compute, device: RwLock>, drop_on_remove: Mutex>, + cgroup_id: BPF_FMAP_KEY, } impl ContainerdContainer { @@ -87,6 +89,7 @@ 
impl ContainerdContainer {
             state: Mutex::new(state),
             device: RwLock::new(device),
             drop_on_remove: Mutex::new(vec![]),
+            cgroup_id: build_bpf_key("containerd"), // NOTE: fixed placeholder key for containerd-backed containers
         })
     }
@@ -112,6 +115,10 @@ impl ContainerT for ContainerdContainer {
         }
     }
 
+    fn get_cgroupid(&self) -> BPF_FMAP_KEY {
+        self.cgroup_id
+    }
+
     fn container_id(&self) -> &String {
         &self.container_id
     }
diff --git "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/containers/containermanager.rs" "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/containers/containermanager.rs"
index 6f4731d6..c704906f 100644
--- "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/containers/containermanager.rs"
+++ "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/containers/containermanager.rs"
@@ -9,11 +9,13 @@ use crate::worker_api::worker_config::ContainerResourceConfig;
 use anyhow::{bail, Result};
 use dashmap::DashMap;
 use futures::Future;
+use iluvatar_bpf_library::bpf::func_characs::BPF_FMAP_KEY;
 use iluvatar_library::threading::{tokio_notify_thread, tokio_runtime, tokio_sender_thread, EventualItem};
 use iluvatar_library::types::{Compute, Isolation, MemSizeMb};
 use iluvatar_library::{bail_error, transaction::TransactionId, utils::calculate_fqdn};
 use parking_lot::RwLock;
 use std::cmp::Ordering;
+use std::fmt;
 use std::sync::{atomic::AtomicU32, Arc};
 use tokio::sync::mpsc::UnboundedSender;
 use tokio::sync::Notify;
@@ -51,7 +53,33 @@ pub struct ContainerManager {
     outstanding_containers: DashMap<String, AtomicU32>,
 }
 
+impl fmt::Debug for ContainerManager {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("ContainerManager").finish()
+    }
+}
+
 impl ContainerManager {
+    pub fn get_cgroupid_against_tid(&self, tid: &TransactionId) -> Option<BPF_FMAP_KEY> {
+        if let Some(v) = self.cpu_containers.get_cgroupid_against_tid(tid) {
+            return Some(v);
+        }
+        if let Some(v) = self.gpu_containers.get_cgroupid_against_tid(tid) {
+            return Some(v);
+        }
+        None
+    }
+
+    pub fn remove_cgroupid_against_tid(&self, tid: &TransactionId) -> Option<BPF_FMAP_KEY> {
+        if let Some(v) = self.cpu_containers.remove_cgroupid_against_tid(tid) {
+            return Some(v);
+        }
+        if let Some(v) = self.gpu_containers.remove_cgroupid_against_tid(tid) {
+            return Some(v);
+        }
+        None
+    }
+
     pub async fn boxed(
         resources: Arc<ContainerResourceConfig>,
         cont_isolations: ContainerIsolationCollection,
@@ -189,6 +217,18 @@ impl ContainerManager {
         ret
     }
 
+    /// Returns the cgroup ids of every container (idle or running) for the given
+    /// fqdn on the requested compute resource; empty if the fqdn is unknown
+    pub fn container_cgroup_ids(&self, fqdn: &str, compute: Compute) -> Vec<BPF_FMAP_KEY> {
+        let mut cgroupids = vec![];
+        if compute == Compute::CPU {
+            cgroupids.extend(self.cpu_containers.get_cgroup_ids(fqdn));
+        } else if compute == Compute::GPU {
+            cgroupids.extend(self.gpu_containers.get_cgroup_ids(fqdn));
+        }
+        cgroupids
+    }
+
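To show how these accessors are meant to compose with the iluvatar_bpf_library half of this diff, a hypothetical glue function (not part of the diff itself; the CharVal contents and the map handle would come from elsewhere in the worker):

use iluvatar_bpf_library::bpf::func_characs::{update_map, CharVal, BPF_FMAP_KEY};
use iluvatar_library::types::Compute;
use iluvatar_worker_library::services::containers::containermanager::ContainerManager;

// Refresh the pinned func_metadata entries for every container cgroup
// currently backing `fqdn`; last writer wins per cgroup key.
fn refresh_function_entries<'obj>(
    cm: &ContainerManager,
    map: &'obj libbpf_rs::MapMut<'obj>,
    fqdn: &str,
    val: &CharVal,
) {
    let ids: Vec<BPF_FMAP_KEY> = cm.container_cgroup_ids(fqdn, Compute::CPU);
    for key in &ids {
        update_map(map, key, val);
    }
}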
    /// The number of containers for the given FQDN that are not idle,
     /// i.e. they are executing an invocation;
     /// 0 if the fqdn is unknown
diff --git "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/containers/docker/docker.rs" "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/containers/docker/docker.rs"
index b3e7794c..db457b80 100644
--- "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/containers/docker/docker.rs"
+++ "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/containers/docker/docker.rs"
@@ -5,6 +5,7 @@ use crate::{
     services::{containers::structs::ContainerState, registration::RegisteredFunction},
     worker_api::worker_config::{ContainerResourceConfig, FunctionLimits},
 };
+use anyhow::bail;
 use anyhow::Result;
 use bollard::Docker;
 use bollard::{
@@ -20,14 +21,18 @@ use bollard::{
 use dashmap::DashSet;
 use futures::StreamExt;
 use guid_create::GUID;
+use iluvatar_bpf_library::bpf::func_characs::{build_bpf_key, BPF_FMAP_KEY};
 use iluvatar_library::clock::now;
 use iluvatar_library::types::{err_val, ResultErrorVal};
 use iluvatar_library::{
     bail_error, bail_error_value, error_value,
     transaction::TransactionId,
     types::{Compute, Isolation, MemSizeMb},
+    utils::execute_cmd_async,
     utils::port::free_local_port,
 };
+
+use iluvatar_library::utils::execute_cmd;
 use std::collections::HashMap;
 use std::sync::Arc;
 use tracing::{debug, error, info, warn};
@@ -114,7 +119,7 @@ impl DockerIsolation {
         ports: BollardPortBindings,
         host_config: Option<HostConfig>,
         entrypoint: Option<Vec<String>>,
-    ) -> Result<()> {
+    ) -> Result<BPF_FMAP_KEY> {
         let mut host_config = host_config.unwrap_or_default();
         host_config.cpu_shares = Some((cpus * 1024) as i64);
         host_config.memory = Some(mem_limit_mb * 1024 * 1024);
@@ -213,7 +218,22 @@
             Err(e) => bail_error!(tid=%tid, error=%e, "Error starting container"),
         };
         debug!(tid=%tid, container_id=%container_id, "Container started");
-        Ok(())
+
+        let inspect_container = |cid: &str, field: &str| {
+            let pargs = vec!["inspect", "-f", field, cid];
+            // just an inspect command, no env needed
+            if let Ok(output) = execute_cmd("/usr/bin/docker", pargs, None, tid) {
+                return String::from_utf8_lossy(&output.stdout)
+                    .trim()
+                    .trim_matches('\'')
+                    .to_string();
+            }
+            "".to_string()
+        };
+        // the full container id is 64 hex characters; the map key keeps the first 15
+        let cgroup_id_output = inspect_container(container_id, "'{{.Id}}'");
+        let cgroup_id: BPF_FMAP_KEY = build_bpf_key(&cgroup_id_output[0..15]);
+
+        Ok(cgroup_id)
     }
 
     /// Get the stdout and stderr of a container
@@ -254,6 +274,31 @@
         }
     }
 }
 
+async fn check_if_image_pulled(img: &str, tid: &TransactionId) -> Result<()> {
+    // img is docker.io/alfuerst/rodinia-iluvatar-gpu:latest
+    // remove the leading docker.io/ registry prefix
+    let mut img = img.split("/");
+    img.next();
+    let img: Vec<_> = img.collect();
+    let img = img.join("/");
+
+    let output = execute_cmd_async("/usr/bin/docker", vec!["images", img.as_str()], None, tid).await?;
+    let outstr = String::from_utf8_lossy(&output.stdout)
+        .trim()
+        .trim_matches('\'')
+        .to_string();
+
+    // `docker images <img>` prints a header line; a second line means the image exists
+    if let Some(0) = output.status.code() {
+        let mut lines = outstr.split("\n");
+        lines.next();
+        if lines.next().is_some() {
+            return Ok(());
+        }
+    };
+
+    bail!("image not present locally")
+}
+
 #[tonic::async_trait]
 impl ContainerIsolationService for DockerIsolation {
     fn backend(&self) -> Vec<Isolation> {
@@ -316,7 +361,7 @@
             None => None,
         };
 
-        if let Err(e) = self
+        let cgroup_id = match self
             .docker_run(
                 tid,
                 image_name,
@@ -331,9 +376,14 @@
             )
             .await
         {
-            bail_error_value!(error=%e, tid=%tid, "Error trying to acquire docker creation semaphore",
+ Ok(cgid) => cgid, + Err(e) => { + bail_error_value!(error=%e, tid=%tid, "Error trying to launch docker container", device_resource); + } }; + println!("fqdn {} -> cgroup_id {:?}", fqdn, cgroup_id); + drop(permit); unsafe { let c = match DockerContainer::new( @@ -348,6 +398,7 @@ compute, device_resource, tid, + cgroup_id, ) { Ok(c) => c, Err((e, d)) => return err_val(e, d), @@ -399,14 +450,18 @@ None => None, }; - let mut stream = self.docker_api.create_image(options, None, auth); - while let Some(res) = stream.next().await { - match res { - Ok(_) => (), - Err(e) => bail_error!(tid=%tid, error=%e, "Failed to pull image"), + if (check_if_image_pulled(&rf.image_name, tid).await).is_err() { + let mut stream = self.docker_api.create_image(options, None, auth); + while let Some(res) = stream.next().await { + match res { + Ok(_) => (), + Err(e) => bail_error!(tid=%tid, error=%e, "Failed to pull image"), + } } + info!(tid=%tid, name=%rf.image_name, "Docker image pulled successfully"); + } else { + info!(tid=%tid, name=%rf.image_name, "Docker image already present locally, skipping pull"); } - info!(tid=%tid, name=%rf.image_name, "Docker image pulled successfully"); self.pulled_images.insert(rf.image_name.clone()); Ok(()) } diff --git "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/containers/docker/dockerstructs.rs" "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/containers/docker/dockerstructs.rs" index 208abd0e..c16ec881 100644 --- "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/containers/docker/dockerstructs.rs" +++ "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/containers/docker/dockerstructs.rs" @@ -6,6 +6,7 @@ use crate::services::{ resources::gpu::GPU, }; use anyhow::Result; +use iluvatar_bpf_library::bpf::func_characs::BPF_FMAP_KEY; use iluvatar_library::clock::now; use iluvatar_library::types::{err_val, DroppableToken, ResultErrorVal}; use iluvatar_library::{ @@ -34,6 +35,7 @@ pub struct DockerContainer { device: RwLock<Option<GPU>>, mem_usage: RwLock<MemSizeMb>, drop_on_remove: Mutex<Vec<DroppableToken>>, + cgroup_id: BPF_FMAP_KEY, } impl DockerContainer { @@ -49,6 +51,7 @@ impl DockerContainer { compute: Compute, device: Option<GPU>, tid: &TransactionId, + cgroup_id: BPF_FMAP_KEY, ) -> ResultErrorVal<Self, Option<GPU>> { let client = match HttpContainerClient::new(&container_id, port, &address, invoke_timeout, tid) { Ok(c) => c, @@ -67,6 +70,7 @@ state: Mutex::new(state), device: RwLock::new(device), drop_on_remove: Mutex::new(vec![]), + cgroup_id, }; Ok(r) } @@ -101,6 +105,10 @@ impl ContainerT for DockerContainer { *lock = now(); } + fn get_cgroupid(&self) -> BPF_FMAP_KEY { + self.cgroup_id + } + fn container_id(&self) -> &String { &self.container_id } diff --git "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/containers/mod.rs" "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/containers/mod.rs" index 53d0133a..feb4187f 100644 --- "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/containers/mod.rs" +++ "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/containers/mod.rs" @@ -111,6 +111,7 @@ impl IsolationFactory { let netm = NamespaceManager::boxed(networking.clone(), tid, ensure_bridge)?; let mut lifecycle = ContainerdIsolation::new( netm, + self.worker_config.clone(), self.worker_config.container_resources.clone(), self.worker_config.limits.clone(),
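The `check_if_image_pulled` helper introduced above relies on the shape of `docker images <name>` output: a header row, then one row per matching image, so any second line means the image is already local and the pull in the hunk above can be skipped. A minimal, self-contained version of that parse (the sample output is illustrative):

```rust
// `docker images NAME` prints a REPOSITORY/TAG/... header followed by one row
// per matching image; a non-empty second line therefore means "already pulled".
fn image_listed(stdout: &str) -> bool {
    let mut lines = stdout.split('\n');
    lines.next(); // skip the header row
    lines.next().map(|l| !l.trim().is_empty()).unwrap_or(false)
}

fn main() {
    let present = "REPOSITORY                      TAG     IMAGE ID      CREATED     SIZE\n\
                   alfuerst/rodinia-iluvatar-gpu   latest  0123456789ab  2 days ago  1.2GB\n";
    let absent = "REPOSITORY  TAG  IMAGE ID  CREATED  SIZE\n";
    assert!(image_listed(present));
    assert!(!image_listed(absent));
}
```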
self.worker_config.container_resources.docker_config.clone(), diff --git "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/containers/simulation/simstructs.rs" "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/containers/simulation/simstructs.rs" index 0c92a59e..ae656d9b 100644 --- "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/containers/simulation/simstructs.rs" +++ "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/containers/simulation/simstructs.rs" @@ -5,6 +5,8 @@ use crate::services::{ resources::gpu::GPU, }; use anyhow::Result; +use iluvatar_bpf_library::bpf::func_characs::build_bpf_key; +use iluvatar_bpf_library::bpf::func_characs::BPF_FMAP_KEY; use iluvatar_library::clock::{now, ContainerTimeFormatter, GlobalClock}; use iluvatar_library::types::ResultErrorVal; use iluvatar_library::{ @@ -189,6 +191,10 @@ impl ContainerT for SimulatorContainer { Ok((result, code_dur)) } + fn get_cgroupid(&self) -> BPF_FMAP_KEY { + build_bpf_key("fromsimstruct") + } + fn touch(&self) { let mut lock = self.last_used.write(); *lock = now(); diff --git "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/containers/structs.rs" "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/containers/structs.rs" index edebe0b0..86c03231 100644 --- "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/containers/structs.rs" +++ "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/containers/structs.rs" @@ -1,6 +1,7 @@ use crate::services::resources::gpu::ProtectedGpuRef; use crate::services::{containers::containermanager::ContainerManager, registration::RegisteredFunction}; use anyhow::Result; +use iluvatar_bpf_library::bpf::func_characs::BPF_FMAP_KEY; use iluvatar_library::{ bail_error, transaction::TransactionId, @@ -17,6 +18,9 @@ pub trait ContainerT: ToAny + Send + Sync { /// Invoke the function within the container, passing the json args to it async fn invoke(&self, json_args: &str, tid: &TransactionId) -> Result<(ParsedResult, Duration)>; + /// get cgroup id + fn get_cgroupid(&self) -> BPF_FMAP_KEY; + /// indicate that the container as been "used" or internal datatsructures should be updated such that it has fn touch(&self); /// the unique ID for this container diff --git "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/cpu_q_invoke.rs" "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/cpu_q_invoke.rs" index f3955ab0..bde7163e 100644 --- "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/cpu_q_invoke.rs" +++ "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/cpu_q_invoke.rs" @@ -11,9 +11,9 @@ use crate::services::containers::{ use crate::services::invocation::energy_limiter::EnergyLimiter; use crate::services::invocation::invoke_on_container; use crate::services::{registration::RegisteredFunction, resources::cpu::CpuResourceTracker}; +use crate::utils::characteristics_map::CharacteristicsMap; use crate::worker_api::worker_config::{FunctionLimits, InvocationConfig}; use anyhow::Result; -use iluvatar_library::characteristics_map::CharacteristicsMap; use iluvatar_library::clock::{get_global_clock, now, Clock}; use iluvatar_library::{threading::tokio_runtime, threading::EventualItem, transaction::TransactionId, types::Compute}; use parking_lot::Mutex; @@ -345,6 +345,7 @@ impl CpuQueueingInvoker { &self.clock, ) .await?; + self.running.fetch_sub(1, std::sync::atomic::Ordering::Relaxed); drop(permit); self.signal.notify_waiters(); diff --git 
"a/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/energy_limiter.rs" "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/energy_limiter.rs" index a37011b3..c343bb1e 100644 --- "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/energy_limiter.rs" +++ "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/energy_limiter.rs" @@ -1,5 +1,5 @@ +use crate::utils::characteristics_map::CharacteristicsMap; use anyhow::Result; -use iluvatar_library::characteristics_map::CharacteristicsMap; use iluvatar_library::energy::energy_logging::EnergyLogger; use std::sync::Arc; diff --git "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/gpu_q_invoke.rs" "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/gpu_q_invoke.rs" index 0581823e..f989853f 100644 --- "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/gpu_q_invoke.rs" +++ "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/gpu_q_invoke.rs" @@ -14,6 +14,7 @@ use crate::services::{ }, invocation::invoke_on_container, }; +use crate::utils::characteristics_map::CharacteristicsMap; use crate::worker_api::worker_config::{FunctionLimits, InvocationConfig}; use crate::{ services::{containers::structs::ContainerLock, registration::RegisteredFunction}, @@ -21,8 +22,10 @@ use crate::{ }; use anyhow::Result; use iluvatar_library::clock::{get_global_clock, now, Clock}; -use iluvatar_library::{characteristics_map::CharacteristicsMap, types::DroppableToken}; -use iluvatar_library::{threading::tokio_runtime, threading::EventualItem, transaction::TransactionId, types::Compute}; +use iluvatar_library::{ + threading::tokio_runtime, threading::EventualItem, transaction::TransactionId, types::Compute, + types::DroppableToken, +}; use parking_lot::Mutex; use std::collections::VecDeque; use std::{ diff --git "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/mod.rs" "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/mod.rs" index bf3acf7b..2715b763 100644 --- "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/mod.rs" +++ "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/mod.rs" @@ -8,11 +8,12 @@ use crate::services::{ containers::structs::{ContainerState, ParsedResult}, registration::RegisteredFunction, }; -use crate::worker_api::worker_config::{FunctionLimits, GPUResourceConfig, InvocationConfig}; +use crate::utils::characteristics_map::CharacteristicsMap; +use crate::utils::characteristics_map::{Characteristics, Values}; +use crate::worker_api::worker_config::{FunctionLimits, GPUResourceConfig, InvocationConfig, WorkerConfig}; use anyhow::Result; -use iluvatar_library::characteristics_map::{Characteristics, Values}; use iluvatar_library::clock::Clock; -use iluvatar_library::{characteristics_map::CharacteristicsMap, transaction::TransactionId, types::Compute}; +use iluvatar_library::{transaction::TransactionId, types::Compute}; use parking_lot::Mutex; use std::{sync::Arc, time::Duration}; use time::OffsetDateTime; @@ -56,6 +57,7 @@ pub trait Invoker: Send + Sync { pub struct InvokerFactory { cont_manager: Arc, function_config: Arc, + worker_config: WorkerConfig, invocation_config: Arc, cmap: Arc, cpu: Arc, @@ -69,6 +71,7 @@ impl InvokerFactory { pub fn new( cont_manager: Arc, function_config: Arc, + worker_config: WorkerConfig, invocation_config: Arc, cmap: Arc, cpu: Arc, @@ -79,6 +82,7 @@ impl InvokerFactory { InvokerFactory { cont_manager, 
function_config, + worker_config, invocation_config, cmap, cpu, @@ -93,6 +97,7 @@ impl InvokerFactory { let invoker = QueueingDispatcher::new( self.cont_manager.clone(), self.function_config.clone(), + self.worker_config.clone(), self.invocation_config.clone(), tid, self.cmap.clone(), @@ -190,6 +195,8 @@ async fn invoke_on_container_2( cmap: &Arc, clock: &Clock, ) -> Result<(ParsedResult, Duration, Container)> { + cmap.start_invoke(®.fqdn, tid); + info!(tid=%tid, insert_time=%clock.format_time(queue_insert_time)?, remove_time=%remove_time, "Item starting to execute"); let (data, duration) = ctr_lock.invoke(json_args).await?; let (char, time) = match ctr_lock.container.state() { @@ -197,6 +204,7 @@ async fn invoke_on_container_2( ContainerState::Prewarm => (Characteristics::PreWarmTime, data.duration_sec), _ => (Characteristics::ColdTime, cold_time_start.elapsed().as_secs_f64()), }; + cmap.add(®.fqdn, char, Values::F64(time), true); cmap.add( ®.fqdn, @@ -206,5 +214,7 @@ async fn invoke_on_container_2( ); let e2etime = (clock.now() - queue_insert_time).as_seconds_f64(); cmap.add(®.fqdn, Characteristics::E2ECpu, Values::F64(e2etime), true); + cmap.end_invoke(®.fqdn, tid); + Ok((data, duration, ctr_lock.container.clone())) } diff --git "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/avail_scale.rs" "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/avail_scale.rs" index 282a86f1..ecde9dd3 100644 --- "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/avail_scale.rs" +++ "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/avail_scale.rs" @@ -1,7 +1,8 @@ use super::{EnqueuedInvocation, InvokerCpuQueuePolicy, MinHeapEnqueuedInvocation, MinHeapFloat}; use crate::services::containers::{containermanager::ContainerManager, structs::ContainerState}; +use crate::utils::characteristics_map::CharacteristicsMap; use anyhow::Result; -use iluvatar_library::{characteristics_map::CharacteristicsMap, transaction::TransactionId}; +use iluvatar_library::transaction::TransactionId; use parking_lot::Mutex; use std::collections::BinaryHeap; use std::sync::Arc; diff --git "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/cold_priority.rs" "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/cold_priority.rs" index 3e105a2e..522ced59 100644 --- "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/cold_priority.rs" +++ "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/cold_priority.rs" @@ -1,7 +1,8 @@ use super::{EnqueuedInvocation, InvokerCpuQueuePolicy, MinHeapEnqueuedInvocation, MinHeapFloat}; use crate::services::containers::{containermanager::ContainerManager, structs::ContainerState}; +use crate::utils::characteristics_map::CharacteristicsMap; use anyhow::Result; -use iluvatar_library::{characteristics_map::CharacteristicsMap, transaction::TransactionId}; +use iluvatar_library::transaction::TransactionId; use parking_lot::Mutex; use std::collections::BinaryHeap; use std::sync::Arc; diff --git "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/concur_mqfq.rs" "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/concur_mqfq.rs" index 4cceeea6..c1c34e50 100644 --- "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/concur_mqfq.rs" +++ 
"b/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/concur_mqfq.rs" @@ -7,10 +7,10 @@ use crate::services::invocation::invoke_on_container; use crate::services::registration::RegisteredFunction; use crate::services::resources::cpu::CpuResourceTracker; use crate::services::resources::gpu::{GpuResourceTracker, GPU}; +use crate::utils::characteristics_map::CharacteristicsMap; use crate::worker_api::worker_config::{GPUResourceConfig, InvocationConfig}; use anyhow::Result; use dashmap::DashMap; -use iluvatar_library::characteristics_map::CharacteristicsMap; use iluvatar_library::clock::{get_global_clock, now, Clock}; use iluvatar_library::mindicator::Mindicator; use iluvatar_library::threading::EventualItem; diff --git "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/dynamic_batching.rs" "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/dynamic_batching.rs" index fe53ad11..b05947f1 100644 --- "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/dynamic_batching.rs" +++ "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/dynamic_batching.rs" @@ -3,9 +3,9 @@ use crate::services::{ invocation::gpu_q_invoke::{GpuBatch, GpuQueuePolicy}, registration::RegisteredFunction, }; +use crate::utils::characteristics_map::CharacteristicsMap; use anyhow::Result; use dashmap::DashMap; -use iluvatar_library::characteristics_map::CharacteristicsMap; use parking_lot::Mutex; use std::collections::VecDeque; use std::sync::{atomic::AtomicUsize, Arc}; diff --git "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/fcfs.rs" "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/fcfs.rs" index da7d5f5f..55d42a6e 100644 --- "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/fcfs.rs" +++ "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/fcfs.rs" @@ -1,6 +1,6 @@ use crate::services::containers::containermanager::ContainerManager; +use crate::utils::characteristics_map::CharacteristicsMap; use anyhow::Result; -use iluvatar_library::characteristics_map::CharacteristicsMap; use parking_lot::Mutex; use std::collections::BinaryHeap; use std::sync::Arc; diff --git "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/fcfs_gpu.rs" "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/fcfs_gpu.rs" index 4095cf13..6eeea37a 100644 --- "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/fcfs_gpu.rs" +++ "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/fcfs_gpu.rs" @@ -4,8 +4,8 @@ use crate::services::{ invocation::gpu_q_invoke::{GpuBatch, GpuQueuePolicy}, registration::RegisteredFunction, }; +use crate::utils::characteristics_map::CharacteristicsMap; use anyhow::Result; -use iluvatar_library::characteristics_map::CharacteristicsMap; use parking_lot::Mutex; use std::{collections::VecDeque, sync::Arc}; diff --git "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/gpu_mqfq.rs" "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/gpu_mqfq.rs" index c4eac4d7..48b26b14 100644 --- "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/gpu_mqfq.rs" +++ "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/gpu_mqfq.rs" @@ -7,13 +7,14 @@ use 
crate::services::invocation::{completion_time_tracker::CompletionTimeTracker use crate::services::registration::RegisteredFunction; use crate::services::resources::cpu::CpuResourceTracker; use crate::services::resources::gpu::{GpuResourceTracker, GpuToken}; +use crate::utils::characteristics_map::CharacteristicsMap; use crate::worker_api::worker_config::{GPUResourceConfig, InvocationConfig}; use anyhow::Result; use dashmap::{mapref::multiple::RefMutMulti, DashMap}; use iluvatar_library::clock::{get_global_clock, now, Clock}; +use iluvatar_library::transaction::TransactionId; use iluvatar_library::types::{Compute, DroppableToken}; use iluvatar_library::utils::missing_default; -use iluvatar_library::{characteristics_map::CharacteristicsMap, transaction::TransactionId}; use iluvatar_library::{ mindicator::Mindicator, threading::{tokio_runtime, tokio_thread, EventualItem}, diff --git "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/minheap.rs" "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/minheap.rs" index 963d07c6..f89177ae 100644 --- "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/minheap.rs" +++ "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/minheap.rs" @@ -1,6 +1,7 @@ use crate::services::containers::containermanager::ContainerManager; +use crate::utils::characteristics_map::CharacteristicsMap; use anyhow::Result; -use iluvatar_library::{characteristics_map::CharacteristicsMap, transaction::TransactionId}; +use iluvatar_library::transaction::TransactionId; use parking_lot::Mutex; use std::sync::Arc; use tracing::debug; diff --git "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/minheap_ed.rs" "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/minheap_ed.rs" index 969d5a87..913f2137 100644 --- "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/minheap_ed.rs" +++ "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/minheap_ed.rs" @@ -1,6 +1,7 @@ use crate::services::containers::containermanager::ContainerManager; +use crate::utils::characteristics_map::CharacteristicsMap; use anyhow::Result; -use iluvatar_library::{characteristics_map::CharacteristicsMap, transaction::TransactionId}; +use iluvatar_library::transaction::TransactionId; use parking_lot::Mutex; use std::sync::Arc; use tracing::debug; diff --git "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/minheap_iat.rs" "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/minheap_iat.rs" index d4ec487e..c88c5863 100644 --- "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/minheap_iat.rs" +++ "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/minheap_iat.rs" @@ -1,6 +1,7 @@ use crate::services::containers::containermanager::ContainerManager; +use crate::utils::characteristics_map::CharacteristicsMap; use anyhow::Result; -use iluvatar_library::{characteristics_map::CharacteristicsMap, transaction::TransactionId}; +use iluvatar_library::transaction::TransactionId; use parking_lot::Mutex; use std::sync::Arc; use tracing::debug; diff --git "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/mod.rs" "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/mod.rs" index 82959c80..4c0293cb 100644 --- 
"a/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/mod.rs" +++ "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/mod.rs" @@ -2,8 +2,8 @@ use super::{InvocationResult, InvocationResultPtr}; use crate::services::containers::containermanager::ContainerManager; use crate::services::containers::structs::{ContainerState, ParsedResult}; use crate::services::registration::RegisteredFunction; +use crate::utils::characteristics_map::CharacteristicsMap; use anyhow::Result; -use iluvatar_library::characteristics_map::CharacteristicsMap; use iluvatar_library::transaction::TransactionId; use iluvatar_library::types::Compute; use ordered_float::OrderedFloat; diff --git "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/oldest_gpu.rs" "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/oldest_gpu.rs" index 4403bacb..fafd2492 100644 --- "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/oldest_gpu.rs" +++ "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/oldest_gpu.rs" @@ -3,9 +3,9 @@ use crate::services::{ invocation::gpu_q_invoke::{GpuBatch, GpuQueuePolicy}, registration::RegisteredFunction, }; +use crate::utils::characteristics_map::CharacteristicsMap; use anyhow::Result; use dashmap::DashMap; -use iluvatar_library::characteristics_map::CharacteristicsMap; use parking_lot::Mutex; use std::sync::{atomic::AtomicUsize, Arc}; @@ -87,7 +87,7 @@ impl GpuQueuePolicy for BatchGpuQueue { #[cfg(test)] mod oldest_batch { use super::*; - use iluvatar_library::characteristics_map::{Characteristics, Values}; + use crate::utils::characteristics_map::{Characteristics, Values}; use iluvatar_library::clock::get_global_clock; use iluvatar_library::transaction::gen_tid; use std::collections::HashMap; @@ -110,7 +110,7 @@ mod oldest_batch { #[test] fn single_item_cold() { - let m = CharacteristicsMap::new(iluvatar_library::characteristics_map::AgExponential::new(0.6)); + let m = CharacteristicsMap::new(crate::utils::characteristics_map::AgExponential::new(0.6), None, None); let name = "t1"; let rf = reg(name); @@ -135,7 +135,7 @@ mod oldest_batch { #[test] fn two_item_mix() { - let m = CharacteristicsMap::new(iluvatar_library::characteristics_map::AgExponential::new(0.6)); + let m = CharacteristicsMap::new(crate::utils::characteristics_map::AgExponential::new(0.6), None, None); let name = "t1"; let rf = reg(name); let invoke = Arc::new(EnqueuedInvocation::new( @@ -166,7 +166,7 @@ mod oldest_batch { #[test] fn two_func_mix() { - let m = CharacteristicsMap::new(iluvatar_library::characteristics_map::AgExponential::new(0.6)); + let m = CharacteristicsMap::new(crate::utils::characteristics_map::AgExponential::new(0.6), None, None); let name = "t1"; let rf = reg(name); let invoke = Arc::new(EnqueuedInvocation::new( diff --git "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/paella.rs" "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/paella.rs" index a417e272..13b226c2 100644 --- "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/paella.rs" +++ "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/paella.rs" @@ -3,10 +3,10 @@ use crate::services::{ invocation::gpu_q_invoke::{GpuBatch, GpuQueuePolicy}, registration::RegisteredFunction, }; +use crate::utils::characteristics_map::CharacteristicsMap; use anyhow::Result; use 
dashmap::mapref::multiple::RefMutMulti; use dashmap::DashMap; -use iluvatar_library::characteristics_map::CharacteristicsMap; use parking_lot::Mutex; use std::{ collections::VecDeque, diff --git "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/sized_batches_gpu.rs" "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/sized_batches_gpu.rs" index f4b97a6b..c285e9e1 100644 --- "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/sized_batches_gpu.rs" +++ "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/sized_batches_gpu.rs" @@ -3,9 +3,9 @@ use crate::services::{ invocation::gpu_q_invoke::{GpuBatch, GpuQueuePolicy}, registration::RegisteredFunction, }; +use crate::utils::characteristics_map::CharacteristicsMap; use anyhow::Result; use dashmap::DashMap; -use iluvatar_library::characteristics_map::CharacteristicsMap; use parking_lot::Mutex; use std::{ collections::VecDeque, @@ -113,7 +113,7 @@ impl GpuQueuePolicy for SizedBatchGpuQueue { #[cfg(test)] mod oldest_batch { use super::*; - use iluvatar_library::characteristics_map::{Characteristics, Values}; + use crate::utils::characteristics_map::{Characteristics, Values}; use iluvatar_library::clock::get_global_clock; use iluvatar_library::transaction::gen_tid; use std::collections::HashMap; @@ -136,7 +136,7 @@ mod oldest_batch { #[test] fn single_item_cold() { - let m = CharacteristicsMap::new(iluvatar_library::characteristics_map::AgExponential::new(0.6)); + let m = CharacteristicsMap::new(crate::utils::characteristics_map::AgExponential::new(0.6), None, None); let name = "t1"; let rf = reg(name); @@ -161,7 +161,7 @@ mod oldest_batch { #[test] fn two_item_mix() { - let m = CharacteristicsMap::new(iluvatar_library::characteristics_map::AgExponential::new(0.6)); + let m = CharacteristicsMap::new(crate::utils::characteristics_map::AgExponential::new(0.6), None, None); let name = "t1"; let rf = reg(name); let invoke = Arc::new(EnqueuedInvocation::new( @@ -192,7 +192,7 @@ mod oldest_batch { #[test] fn two_func_mix() { - let m = CharacteristicsMap::new(iluvatar_library::characteristics_map::AgExponential::new(0.6)); + let m = CharacteristicsMap::new(crate::utils::characteristics_map::AgExponential::new(0.6), None, None); let name = "t1"; let rf = reg(name); let invoke = Arc::new(EnqueuedInvocation::new( diff --git "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/wfq.rs" "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/wfq.rs" index 92671815..3b1eb933 100644 --- "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/wfq.rs" +++ "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing/wfq.rs" @@ -1,6 +1,6 @@ use crate::services::containers::containermanager::ContainerManager; use anyhow::Result; -use iluvatar_library::characteristics_map::CharacteristicsMap; +use crate::utils::characteristics_map::CharacteristicsMap; use parking_lot::Mutex; use std::collections::BinaryHeap; use std::sync::Arc; diff --git "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing_dispatcher.rs" "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing_dispatcher.rs" index bda4afcc..2977f96e 100644 --- "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing_dispatcher.rs" +++ "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/invocation/queueing_dispatcher.rs" @@ -9,9 +9,9 @@ 
use crate::services::invocation::energy_limiter::EnergyLimiter; use crate::services::registration::RegisteredFunction; use crate::services::resources::{cpu::CpuResourceTracker, gpu::GpuResourceTracker}; use crate::services::{containers::containermanager::ContainerManager, invocation::queueing::EnqueueingPolicy}; -use crate::worker_api::worker_config::{FunctionLimits, GPUResourceConfig, InvocationConfig}; +use crate::utils::characteristics_map::CharacteristicsMap; +use crate::worker_api::worker_config::{FunctionLimits, GPUResourceConfig, InvocationConfig, WorkerConfig}; use anyhow::Result; -use iluvatar_library::characteristics_map::CharacteristicsMap; use iluvatar_library::clock::{get_global_clock, Clock}; use iluvatar_library::types::ComputeEnum; use iluvatar_library::{transaction::TransactionId, types::Compute}; @@ -20,6 +20,7 @@ use rand::Rng; use std::{collections::HashMap, sync::Arc}; use time::OffsetDateTime; use tracing::{debug, info}; +//use crate::{SCHED_CHANNELS, get_pid_from_fqdn}; lazy_static::lazy_static! { pub static ref INVOKER_CPU_QUEUE_WORKER_TID: TransactionId = "InvokerCPUQueue".to_string(); @@ -65,6 +66,7 @@ impl PolymDispatchCtx { pub struct QueueingDispatcher { async_functions: AsyncHelper, + _worker_config: WorkerConfig, invocation_config: Arc, cmap: Arc, clock: Clock, @@ -80,6 +82,7 @@ impl QueueingDispatcher { pub fn new( cont_manager: Arc, function_config: Arc, + worker_config: WorkerConfig, invocation_config: Arc, tid: &TransactionId, cmap: Arc, @@ -111,6 +114,7 @@ impl QueueingDispatcher { )?, async_functions: AsyncHelper::new(), clock: get_global_clock(tid)?, + _worker_config: worker_config, invocation_config, dispatch_state: RwLock::new(PolymDispatchCtx::boxed(&cmap)), cmap, @@ -229,7 +233,6 @@ impl QueueingDispatcher { self.clock.now(), )); let mut enqueues = 0; - if reg.supported_compute == Compute::CPU { self.enqueue_cpu_check(&enqueue)?; return Ok(enqueue); diff --git "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/registration.rs" "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/registration.rs" index 298e39ea..772426f6 100644 --- "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/registration.rs" +++ "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/registration.rs" @@ -1,8 +1,8 @@ use super::containers::{containermanager::ContainerManager, ContainerIsolationCollection}; +use crate::utils::characteristics_map::{Characteristics, CharacteristicsMap, Values}; use crate::worker_api::worker_config::{ContainerResourceConfig, FunctionLimits}; use anyhow::Result; use iluvatar_library::{ - characteristics_map::{Characteristics, CharacteristicsMap, Values}, transaction::TransactionId, types::{Compute, Isolation, MemSizeMb, ResourceTimings}, utils::calculate_fqdn, diff --git "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/resources/gpu.rs" "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/resources/gpu.rs" index 3d0daea2..a658d0c2 100644 --- "a/src/Il\303\272vatar/iluvatar_worker_library/src/services/resources/gpu.rs" +++ "b/src/Il\303\272vatar/iluvatar_worker_library/src/services/resources/gpu.rs" @@ -373,7 +373,7 @@ impl GpuResourceTracker { }; let img_name = "docker.io/nvidia/cuda:11.8.0-base-ubuntu20.04"; let entrypoint = vec!["/usr/bin/nvidia-cuda-mps-control".to_owned(), "-f".to_owned()]; - docker + match docker .docker_run( tid, img_name, @@ -387,6 +387,10 @@ impl GpuResourceTracker { Some(entrypoint), ) .await + { + Ok(_cgroup_id) => Ok(()), + Err(e) => Err(e), + } } fn set_gpus_shared(tid: 
&TransactionId) -> Result<()> { diff --git "a/src/Il\303\272vatar/iluvatar_library/src/characteristics_map.rs" "b/src/Il\303\272vatar/iluvatar_worker_library/src/utils/characteristics_map.rs" similarity index 64% rename from "src/Il\303\272vatar/iluvatar_library/src/characteristics_map.rs" rename to "src/Il\303\272vatar/iluvatar_worker_library/src/utils/characteristics_map.rs" index 54478fb7..8967774d 100644 --- "a/src/Il\303\272vatar/iluvatar_library/src/characteristics_map.rs" +++ "b/src/Il\303\272vatar/iluvatar_worker_library/src/utils/characteristics_map.rs" @@ -1,11 +1,43 @@ -use crate::clock::now; +use crate::services::containers::containermanager::ContainerManager; +use csv::Writer; use dashmap::DashMap; +use iluvatar_library::clock::get_global_timestamp_ms; +use iluvatar_library::clock::now; use ordered_float::OrderedFloat; +use serde::{Deserialize, Serialize}; use std::cmp::{min, Ordering}; +use std::io; +use std::io::Write; +use std::sync::Arc; +use std::thread; use std::time::Duration; use tokio::time::Instant; use tracing::{debug, error}; +use iluvatar_bpf_library::bpf::func_characs::*; +use iluvatar_library::cgroup_interaction::{ + diff_cgroupreading, read_cgroup, CGROUPReading, CGROUPReadingV2, CGROUPV2Psi, CGROUPV2PsiVal, KEY_NR_PERIODS, + KEY_NR_THROTTLED, KEY_SYSTEM_USEC, KEY_THROTTLED_TIME, KEY_USAGE_USEC, KEY_USER_USEC, +}; +use iluvatar_library::transaction::TransactionId; +use iluvatar_library::types::Compute; + +use num::cast::AsPrimitive; +use std::default::Default; + +use std::sync::mpsc; +use std::sync::mpsc::Receiver; +use std::sync::mpsc::Sender; + +use serde_json; +use std::fs::File; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct CharacteristicsPacket { + pub fqdn: String, + pub e2e: f64, +} + #[derive(Debug, Clone)] pub enum Values { Duration(Duration), @@ -72,12 +104,146 @@ impl AgExponential { AgExponential { alpha } } - fn accumulate(&self, old: &f64, new: &f64) -> f64 { - (new * self.alpha) + (old * (1.0 - self.alpha)) + fn accumulate(&self, old: T, new: T) -> T + where + T: AsPrimitive, + f64: AsPrimitive, + { + let oldf: f64 = old.as_(); + let newf: f64 = new.as_(); + let r = (newf * self.alpha) + (oldf * (1.0 - self.alpha)); + r.as_() + } + + fn accumulate_vec(&self, old: &[T], new: &[T]) -> Vec + where + T: AsPrimitive, + f64: AsPrimitive, + { + let mut result = vec![]; + if old.len() != new.len() { + return new.to_vec(); + } + for (i, val) in old.iter().enumerate() { + result.push(self.accumulate(*val, new[i])) + } + result } + fn accumulate_dur(&self, old: &Duration, new: &Duration) -> Duration { new.mul_f64(self.alpha) + old.mul_f64(1.0 - self.alpha) } + + fn accumulate_cgroupreading(&self, old: &CGROUPReading, new: &CGROUPReading) -> CGROUPReading { + CGROUPReading { + usr: self.accumulate(old.usr, new.usr), + sys: self.accumulate(old.sys, new.sys), + pcpu_usr: self.accumulate_vec(&old.pcpu_usr, &new.pcpu_usr), + pcpu_sys: self.accumulate_vec(&old.pcpu_sys, &new.pcpu_sys), + threads: vec![], // if we do accumulate threads it would just + // bloat + procs: vec![], // same goes for the procs + cpustats: [ + ( + KEY_NR_PERIODS.to_string(), + self.accumulate( + *old.cpustats.get(KEY_NR_PERIODS).unwrap_or(&0), + *new.cpustats.get(KEY_NR_PERIODS).unwrap_or(&0), + ), + ), + ( + KEY_NR_THROTTLED.to_string(), + self.accumulate( + *old.cpustats.get(KEY_NR_THROTTLED).unwrap_or(&0), + *new.cpustats.get(KEY_NR_THROTTLED).unwrap_or(&0), + ), + ), + ( + KEY_THROTTLED_TIME.to_string(), + self.accumulate( + 
*old.cpustats.get(KEY_THROTTLED_TIME).unwrap_or(&0), + *new.cpustats.get(KEY_THROTTLED_TIME).unwrap_or(&0), + ), + ), + ] + .iter() + .cloned() + .collect(), + v2: CGROUPReadingV2 { + threads: vec![], + procs: vec![], + cpustats: [ + ( + KEY_USER_USEC.to_string(), + self.accumulate( + *old.v2.cpustats.get(KEY_USER_USEC).unwrap_or(&0), + *new.v2.cpustats.get(KEY_USER_USEC).unwrap_or(&0), + ), + ), + ( + KEY_SYSTEM_USEC.to_string(), + self.accumulate( + *old.v2.cpustats.get(KEY_SYSTEM_USEC).unwrap_or(&0), + *new.v2.cpustats.get(KEY_SYSTEM_USEC).unwrap_or(&0), + ), + ), + ( + KEY_USAGE_USEC.to_string(), + self.accumulate( + *old.v2.cpustats.get(KEY_USAGE_USEC).unwrap_or(&0), + *new.v2.cpustats.get(KEY_USAGE_USEC).unwrap_or(&0), + ), + ), + ] + .iter() + .cloned() + .collect(), + + cpupsi: CGROUPV2Psi { + some: CGROUPV2PsiVal { + avg10: self.accumulate(old.v2.cpupsi.some.avg10, new.v2.cpupsi.some.avg10), + avg60: self.accumulate(old.v2.cpupsi.some.avg60, new.v2.cpupsi.some.avg60), + avg300: self.accumulate(old.v2.cpupsi.some.avg300, new.v2.cpupsi.some.avg300), + total: self.accumulate(old.v2.cpupsi.some.total, new.v2.cpupsi.some.total), + }, + full: CGROUPV2PsiVal { + avg10: self.accumulate(old.v2.cpupsi.full.avg10, new.v2.cpupsi.full.avg10), + avg60: self.accumulate(old.v2.cpupsi.full.avg60, new.v2.cpupsi.full.avg60), + avg300: self.accumulate(old.v2.cpupsi.full.avg300, new.v2.cpupsi.full.avg300), + total: self.accumulate(old.v2.cpupsi.full.total, new.v2.cpupsi.full.total), + }, + }, + mempsi: CGROUPV2Psi { + some: CGROUPV2PsiVal { + avg10: self.accumulate(old.v2.mempsi.some.avg10, new.v2.mempsi.some.avg10), + avg60: self.accumulate(old.v2.mempsi.some.avg60, new.v2.mempsi.some.avg60), + avg300: self.accumulate(old.v2.mempsi.some.avg300, new.v2.mempsi.some.avg300), + total: self.accumulate(old.v2.mempsi.some.total, new.v2.mempsi.some.total), + }, + full: CGROUPV2PsiVal { + avg10: self.accumulate(old.v2.mempsi.full.avg10, new.v2.mempsi.full.avg10), + avg60: self.accumulate(old.v2.mempsi.full.avg60, new.v2.mempsi.full.avg60), + avg300: self.accumulate(old.v2.mempsi.full.avg300, new.v2.mempsi.full.avg300), + total: self.accumulate(old.v2.mempsi.full.total, new.v2.mempsi.full.total), + }, + }, + iopsi: CGROUPV2Psi { + some: CGROUPV2PsiVal { + avg10: self.accumulate(old.v2.iopsi.some.avg10, new.v2.iopsi.some.avg10), + avg60: self.accumulate(old.v2.iopsi.some.avg60, new.v2.iopsi.some.avg60), + avg300: self.accumulate(old.v2.iopsi.some.avg300, new.v2.iopsi.some.avg300), + total: self.accumulate(old.v2.iopsi.some.total, new.v2.iopsi.some.total), + }, + full: CGROUPV2PsiVal { + avg10: self.accumulate(old.v2.iopsi.full.avg10, new.v2.iopsi.full.avg10), + avg60: self.accumulate(old.v2.iopsi.full.avg60, new.v2.iopsi.full.avg60), + avg300: self.accumulate(old.v2.iopsi.full.avg300, new.v2.iopsi.full.avg300), + total: self.accumulate(old.v2.iopsi.full.total, new.v2.iopsi.full.total), + }, + }, + }, + } + } } //////////////////////////////////////////////////////////////// @@ -124,22 +290,63 @@ pub enum Characteristics { E2EGpu, } +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct InvokeDiff { + timestamp: i64, + fqdn: String, + cgroupid: BPF_FMAP_KEY, + cgroupstat: CGROUPReading, +} + /// Historical execution characteristics of functions. Cold/warm times, energy, etc. 
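The rename above also generalizes `accumulate` from `&f64` pairs to any numeric type, which is what lets the same moving average run over the integer cgroup counters in `accumulate_cgroupreading`. A standalone rendering of that widen-blend-narrow round trip, with the `AsPrimitive` bounds written out in full (`T: AsPrimitive<f64>`, `f64: AsPrimitive<T>`) and a worked example at alpha = 0.6:

```rust
use num::cast::AsPrimitive;

// Widen both operands to f64, blend with the EMA weight, narrow back to T.
fn accumulate<T>(alpha: f64, old: T, new: T) -> T
where
    T: AsPrimitive<f64>,
    f64: AsPrimitive<T>,
{
    let r = new.as_() * alpha + old.as_() * (1.0 - alpha);
    r.as_()
}

fn main() {
    // f64: 0.6 * 20 + 0.4 * 10 = 16.0
    assert!((accumulate(0.6, 10.0_f64, 20.0_f64) - 16.0).abs() < 1e-9);
    // u64 counters round-trip through f64 and truncate back to an integer: 16
    assert_eq!(accumulate(0.6, 10_u64, 20_u64), 16);
}
```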
/// TODO: make get/set functions for Characteristics auto-generated #[derive(Debug)] pub struct CharacteristicsMap { /// Most recent fn->{char->value} - map: DashMap>, + pub map: DashMap>, /// Moving average values agmap: DashMap>, /// Minimum of the values minmap: DashMap>, ag: AgExponential, creation_time: Instant, + fcmap_tx: Option>, + container_man: Option>, + snapshot_invk_start: DashMap, + diff_invk: Arc>, + avg10_invk: Arc>, // it's an exponential moving average of the invoke diff + invk_json_tx: Sender, + avg10_json_tx: Sender, +} + +fn build_sink_thread(filename: String) -> Sender { + let (tx, rx): (Sender, Receiver) = mpsc::channel(); + thread::spawn(move || { + let mut sink = match File::create(filename) { + Ok(f) => Box::new(f) as Box, + Err(_) => Box::new(io::stdout()) as Box, + }; + + // unbounded receiver waiting for all senders to complete. + while let Ok(val) = rx.recv() { + // Serialize it to a JSON string. + let j = match serde_json::to_string(&val) { + Ok(j) => j, + Err(_) => "".to_string(), + }; + let _ = sink.write_all(j.as_bytes()); + let _ = sink.write_all(b"\n"); + } + }); + tx } impl CharacteristicsMap { - pub fn new(ag: AgExponential) -> Self { + pub fn new( + ag: AgExponential, + fcmap_tx: Option>, + container_man: Option>, + ) -> Self { // TODO: Implement file restore functionality here CharacteristicsMap { @@ -148,6 +355,13 @@ impl CharacteristicsMap { minmap: DashMap::new(), ag, creation_time: now(), + fcmap_tx, + container_man, + snapshot_invk_start: DashMap::new(), + diff_invk: Arc::new(DashMap::new()), + avg10_invk: Arc::new(DashMap::new()), + invk_json_tx: build_sink_thread("/tmp/iluvatar/bin/invoke.json".to_string()), + avg10_json_tx: build_sink_thread("/tmp/iluvatar/bin/invoke_avg10.json".to_string()), } } @@ -158,6 +372,21 @@ impl CharacteristicsMap { let e0 = self.map.get_mut(fqdn); + if let Some(cm) = &self.container_man { + let cgids = cm.container_cgroup_ids(fqdn, Compute::CPU); + for cgid in cgids { + if let Some(tx) = self.fcmap_tx.as_ref() { + let v = self.avg_cpu_e2e_t(fqdn) * 1000.0; // in ms + let cv = CharVal { + prio: 1, + loc: 2, + e2e: v as u32, + }; + let _ = tx.send((cgid, cv)); + } + } + } + match e0 { // dashself.map of given fqdn Some(v0) => { @@ -201,7 +430,7 @@ impl CharacteristicsMap { Some(mut v1) => { *v1 = match &v1.value() { Values::Duration(d) => Values::Duration(self.ag.accumulate_dur(d, &unwrap_val_dur(&value))), - Values::F64(f) => Values::F64(self.ag.accumulate(f, &unwrap_val_f64(&value))), + Values::F64(f) => Values::F64(self.ag.accumulate(*f, unwrap_val_f64(&value))), Values::U64(_) => todo!(), Values::Str(_) => todo!(), }; @@ -255,6 +484,52 @@ impl CharacteristicsMap { self } + pub fn start_invoke(&self, _fqdn: &str, tid: &TransactionId) { + if let Some(cm) = &self.container_man { + let cgroup_id = cm.get_cgroupid_against_tid(tid); + if let Some(cgid) = cgroup_id { + let reading = read_cgroup(std::str::from_utf8(&cgid).unwrap().to_string()).unwrap(); + self.snapshot_invk_start.insert(tid.clone(), reading); + } + } + } + + pub fn end_invoke(&self, fqdn: &str, tid: &TransactionId) { + if let Some(cm) = &self.container_man { + let cgroup_id = cm.remove_cgroupid_against_tid(tid); + if let Some(cgid) = cgroup_id { + let reading = read_cgroup(std::str::from_utf8(&cgid).unwrap().to_string()).unwrap(); + if let Some(start_reading) = self.snapshot_invk_start.get(tid) { + let diff = diff_cgroupreading(&start_reading, &reading); + + let time_now = get_global_timestamp_ms(); + let idiff = InvokeDiff { + timestamp: time_now, + 
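An aside on `build_sink_thread`, defined above: it moves JSON serialization and file I/O off the invocation path. Callers `send` a record and forget it; one thread owns the sink and appends a JSON line per record, falling back to stdout if the file cannot be created. The same pattern made generic and self-contained (the patch's version is specialized to its `InvokeDiff` records; the path below is only an example):

```rust
// Generic sketch of the sink-thread pattern: senders enqueue any
// serde-serializable record; one thread owns the file and writes JSON lines.
use serde::Serialize;
use std::fs::File;
use std::io::{self, Write};
use std::sync::mpsc::{channel, Sender};
use std::thread;

fn build_sink_thread<T: Serialize + Send + 'static>(filename: String) -> Sender<T> {
    let (tx, rx) = channel::<T>();
    thread::spawn(move || {
        let mut sink: Box<dyn Write> = match File::create(filename) {
            Ok(f) => Box::new(f),
            Err(_) => Box::new(io::stdout()), // fall back to stdout
        };
        // runs until every Sender clone has been dropped
        while let Ok(val) = rx.recv() {
            if let Ok(j) = serde_json::to_string(&val) {
                let _ = sink.write_all(j.as_bytes());
                let _ = sink.write_all(b"\n");
            }
        }
    });
    tx
}

fn main() {
    let tx = build_sink_thread::<serde_json::Value>("/tmp/example_sink.json".into());
    let _ = tx.send(serde_json::json!({"fqdn": "hello.0.0.1", "e2e": 0.42}));
    drop(tx); // close the channel so the sink thread drains and exits
    std::thread::sleep(std::time::Duration::from_millis(50)); // crude: let it flush
}
```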
fqdn: fqdn.to_string(), + cgroupid: cgid, + cgroupstat: diff.clone(), + }; + let _ = self.invk_json_tx.send(idiff.clone()); + self.diff_invk.insert(time_now, idiff); + + let olddiff = match self.avg10_invk.get(fqdn) { + Some(v) => v.cgroupstat.clone(), + None => InvokeDiff::default().cgroupstat, + }; + let mvavgdiff = self.ag.accumulate_cgroupreading(&olddiff, &diff); + let imvavgdiff = InvokeDiff { + timestamp: get_global_timestamp_ms(), + fqdn: fqdn.to_string(), + cgroupid: cgid, + cgroupstat: mvavgdiff, + }; + let _ = self.avg10_json_tx.send(imvavgdiff.clone()); + self.avg10_invk.insert(fqdn.to_string(), imvavgdiff); + } + } + } + } + pub fn add_iat(&self, fqdn: &str) { let time_now = now(); let time_now_elapsed = time_now.duration_since(self.creation_time); @@ -493,6 +768,18 @@ impl CharacteristicsMap { } } } + + pub fn write_csv(&self, _filename: &str) -> Result<(), Box> { + let mut wtr = Writer::from_path(_filename)?; + wtr.write_record(["func_name", "e2e_time"])?; + for e0 in self.map.iter() { + let fqdn = e0.key(); + let exec_time = self.get_exec_time(fqdn); + wtr.write_record([fqdn, &exec_time.to_string()])?; + } + wtr.flush()?; + Ok(()) + } } #[cfg(test)] @@ -502,7 +789,7 @@ mod charmap { #[test] fn duration() { // Test 4 using Duration datatype for ExecTime - let m = CharacteristicsMap::new(AgExponential::new(0.6)); + let m = CharacteristicsMap::new(AgExponential::new(0.6), None, None); println!("--------------------------------------------------------------------"); println!("Test 4: Using Duration Datatype for ExecTime"); @@ -553,7 +840,7 @@ mod charmap { #[test] fn lookup_agg() { - let m = CharacteristicsMap::new(AgExponential::new(0.6)); + let m = CharacteristicsMap::new(AgExponential::new(0.6), None, None); let push_video = || { m.add( @@ -626,7 +913,7 @@ mod charmap { #[test] fn accumulation() { - let m = CharacteristicsMap::new(AgExponential::new(0.6)); + let m = CharacteristicsMap::new(AgExponential::new(0.6), None, None); m.add( "video_processing.0.0.1", @@ -703,7 +990,7 @@ mod charmap { use float_cmp::approx_eq; use std::thread::sleep; - let m = CharacteristicsMap::new(AgExponential::new(0.6)); + let m = CharacteristicsMap::new(AgExponential::new(0.6), None, None); let fjd_011 = "json_dump.0.1.1".to_string(); let verify_iat_lookup = |fname: &str, val_expc: f64| { diff --git "a/src/Il\303\272vatar/iluvatar_worker_library/src/utils/mod.rs" "b/src/Il\303\272vatar/iluvatar_worker_library/src/utils/mod.rs" new file mode 100644 index 00000000..694c429d --- /dev/null +++ "b/src/Il\303\272vatar/iluvatar_worker_library/src/utils/mod.rs" @@ -0,0 +1,2 @@ +pub mod characteristics_map; +// pub mod stats_reader; diff --git "a/src/Il\303\272vatar/iluvatar_worker_library/src/worker_api/fs_scheduler.rs" "b/src/Il\303\272vatar/iluvatar_worker_library/src/worker_api/fs_scheduler.rs" new file mode 100644 index 00000000..2fccddc8 --- /dev/null +++ "b/src/Il\303\272vatar/iluvatar_worker_library/src/worker_api/fs_scheduler.rs" @@ -0,0 +1,116 @@ +use crate::utils::characteristics_map::CharacteristicsPacket; +use crate::worker_api::worker_config::WorkerConfig; +use iluvatar_bpf_library::bpf::func_characs::*; +use iluvatar_library::utils::execute_cmd_nonblocking; +use ipc_channel::ipc::{IpcOneShotServer, IpcSender}; + +use crate::SCHED_CHANNELS; +use serde::{Deserialize, Serialize}; +use std::fs::File; +use std::io::Read; +use std::io::Write; +use std::mem::MaybeUninit; +use std::path::Path; +use std::sync::mpsc::{channel, Sender}; +use std::sync::RwLock; +use std::thread; + +use 
tracing::debug; +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct Channels { + pub tx_chr: IpcSender<CharacteristicsPacket>, +} +pub fn launch_scheduler(worker_config: &WorkerConfig) { + match &worker_config.finescheduling { + Some(fconfig) => { + let fconfig = fconfig.clone(); + + if Path::new(&fconfig.binary).exists() { + // create a oneshot IPC server + let (server, name) = IpcOneShotServer::new().unwrap(); + debug!(name=%name, status="server waiting", "fine_scheduler"); + + let mut args = Vec::<String>::new(); + + // default args for all the policies + args.push("--server-name".to_string()); + args.push(name.clone()); + + // construct args for different policies as needed + match fconfig.binary.as_str() { + "/tmp/iluvatar/bin/fs_policy_constrained" => { + if let Some(cores) = fconfig.cores.as_ref() { + for c in cores { + args.push("-c".to_string()); + args.push(c.to_string()); + } + } + } + _ => {} + } + + // launch the policy in a separate thread + let bname = fconfig.binary.clone(); + thread::spawn(move || { + let mut _child = execute_cmd_nonblocking(&bname, &args, None, &String::from("none")).unwrap(); + + let mut buffer = [0; 1024]; + let mut log = File::create("/tmp/iluvatar/bin/sched.log").expect("failed to open log"); + let mut elog = File::create("/tmp/iluvatar/bin/sched.elog").expect("failed to open log"); + + loop { + let read = _child.stdout.as_mut().unwrap().read(&mut buffer).unwrap_or(0); + if read > 0 { + let _ = log.write(&buffer[..read]); + let _ = log.flush(); + } + match _child.try_wait() { + Ok(Some(_status)) => { + let read = _child.stderr.as_mut().unwrap().read(&mut buffer).unwrap_or(0); + if read > 0 { + let _ = elog.write(&buffer[..read]); + let _ = elog.flush(); + } + } + Ok(None) => {} + Err(_e) => {} + } + } + }); + + // block until the policy process connects back and hands over its channels + let (_, channels): (_, Channels) = server.accept().unwrap(); + debug!(name=%name, status="channels established", "fine_scheduler"); + + // save it in the global variable + unsafe { + SCHED_CHANNELS = Some(RwLock::new(channels)); + } + } + } + None => (), // no binary config found + }; // end of config match +}
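`launch_scheduler` above is the server half of a one-shot handshake: it passes the `IpcOneShotServer` name to the policy binary via `--server-name` and blocks in `accept()` until the policy connects back. The policy-side counterpart is not part of this diff; a sketch of what it plausibly looks like, assuming the `Channels` and `CharacteristicsPacket` types from this patch are importable by the policy crate:

```rust
// Hypothetical policy-side half of the handshake (e.g. inside fs_policy_tsksz):
// create a channel for characteristics records, connect to the worker's
// one-shot server by name, and hand over the sending end wrapped in Channels.
use ipc_channel::ipc::{self, IpcReceiver, IpcSender};

fn connect_back(server_name: String) -> IpcReceiver<CharacteristicsPacket> {
    let (tx_chr, rx_chr) = ipc::channel::<CharacteristicsPacket>().expect("ipc channel");
    let bootstrap: IpcSender<Channels> =
        IpcSender::connect(server_name).expect("connect to worker");
    bootstrap.send(Channels { tx_chr }).expect("send channels");
    rx_chr // the policy now receives CharacteristicsPacket records here
}
```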
+ +pub fn create_shared_map() -> Sender<(BPF_FMAP_KEY, CharVal)> { + // create a multi-producer, single-consumer channel + let (tx, rx) = channel::<(BPF_FMAP_KEY, CharVal)>(); + + // move the consumer end to a separate thread + // where data is pushed to the map + thread::spawn(move || { + // build the bpf characteristics map + let mut open_object = MaybeUninit::uninit(); + let skel = build_and_load(&mut open_object).unwrap(); + let fcmap = skel.maps.func_metadata; + + // unbounded receiver waiting for all senders to complete. + while let Ok((key, val)) = rx.recv() { + update_map(&fcmap, &key, &val); + } + }); + + tx +} diff --git "a/src/Il\303\272vatar/iluvatar_worker_library/src/worker_api/iluvatar_worker.rs" "b/src/Il\303\272vatar/iluvatar_worker_library/src/worker_api/iluvatar_worker.rs" index 07915d85..6825955a 100644 --- "a/src/Il\303\272vatar/iluvatar_worker_library/src/worker_api/iluvatar_worker.rs" +++ "b/src/Il\303\272vatar/iluvatar_worker_library/src/worker_api/iluvatar_worker.rs" @@ -5,12 +5,11 @@ use crate::services::invocation::Invoker; use crate::services::resources::gpu::GpuResourceTracker; use crate::services::status::status_service::StatusService; use crate::services::{registration::RegistrationService, worker_health::WorkerHealthService}; +use crate::utils::characteristics_map::CharacteristicsMap; use crate::worker_api::config::WorkerConfig; use iluvatar_library::transaction::TransactionId; use iluvatar_library::types::{Compute, Isolation}; -use iluvatar_library::{ - characteristics_map::CharacteristicsMap, energy::energy_logging::EnergyLogger, utils::calculate_fqdn, -}; +use iluvatar_library::{energy::energy_logging::EnergyLogger, utils::calculate_fqdn}; use iluvatar_rpc::rpc::iluvatar_worker_server::IluvatarWorker; use iluvatar_rpc::rpc::{ CleanRequest, HealthRequest, InvokeAsyncLookupRequest, InvokeAsyncRequest, InvokeRequest, PingRequest, diff --git "a/src/Il\303\272vatar/iluvatar_worker_library/src/worker_api/mod.rs" "b/src/Il\303\272vatar/iluvatar_worker_library/src/worker_api/mod.rs" index ef1494c8..68e06af4 100644 --- "a/src/Il\303\272vatar/iluvatar_worker_library/src/worker_api/mod.rs" +++ "b/src/Il\303\272vatar/iluvatar_worker_library/src/worker_api/mod.rs" @@ -8,25 +8,33 @@ use crate::services::registration::RegistrationService; use crate::services::resources::{cpu::CpuResourceTracker, gpu::GpuResourceTracker}; use crate::services::status::status_service::{build_load_avg_signal, StatusService}; use crate::services::worker_health::WorkerHealthService; +use crate::utils::characteristics_map::{AgExponential, CharacteristicsMap}; use crate::worker_api::iluvatar_worker::IluvatarWorkerImpl; use anyhow::Result; +use iluvatar_library::bail_error; +use iluvatar_library::energy::energy_logging::EnergyLogger; use iluvatar_library::influx::InfluxClient; use iluvatar_library::types::{Compute, HealthStatus, Isolation, ResourceTimings}; -use iluvatar_library::{bail_error, characteristics_map::CharacteristicsMap}; -use iluvatar_library::{characteristics_map::AgExponential, energy::energy_logging::EnergyLogger}; use iluvatar_library::{transaction::TransactionId, types::MemSizeMb}; use iluvatar_rpc::rpc::{CleanResponse, InvokeResponse, StatusResponse}; + use std::sync::Arc; pub mod worker_config; pub use worker_config as config; +pub mod fs_scheduler; pub mod iluvatar_worker; pub mod rpc; pub mod sim_worker; pub mod worker_comm; +use fs_scheduler::{create_shared_map, launch_scheduler}; + pub async fn create_worker(worker_config: WorkerConfig, tid: &TransactionId) -> Result<IluvatarWorkerImpl> { - let cmap = Arc::new(CharacteristicsMap::new(AgExponential::new(0.6))); + // launch the fine-grained scheduler + launch_scheduler(&worker_config); + + let tx = create_shared_map(); let factory = IsolationFactory::new(worker_config.clone()); let load_avg = build_load_avg_signal(); @@ -56,6 +64,14 @@ pub async fn create_worker(worker_config: WorkerConfig, tid: &TransactionId) -> .await .or_else(|e| bail_error!(tid=%tid, error=%e, "Failed to make container manger"))?;
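`create_shared_map` above gives the worker a cheap, clonable handle for publishing per-cgroup characteristics; only the consumer thread ever touches libbpf. Producer-side usage, as `create_worker` wires it into the `CharacteristicsMap` just below, looks roughly like this fragment (the `CharVal` field values mirror the `add()` hunk earlier in this patch; exact field types are assumptions):

```rust
// Illustrative producer-side use of the channel returned by create_shared_map();
// the consumer thread performs the actual BPF map update via update_map().
let tx = create_shared_map();
let key = build_bpf_key("containerd");
let val = CharVal { prio: 1, loc: 2, e2e: 250 }; // e2e in milliseconds
let _ = tx.send((key, val));
```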
+ // characteristics map: hand it the producer end of the BPF channel + // and the container manager so it can resolve cgroup ids + let cmap = Arc::new(CharacteristicsMap::new( + AgExponential::new(0.6), + Some(tx), + Some(container_man.clone()), + )); + let reg = RegistrationService::new( container_man.clone(), isos.clone(), @@ -75,6 +91,7 @@ pub async fn create_worker(worker_config: WorkerConfig, tid: &TransactionId) -> let invoker_fact = InvokerFactory::new( container_man.clone(), worker_config.limits.clone(), + worker_config.clone(), worker_config.invocation.clone(), cmap.clone(), cpu, diff --git "a/src/Il\303\272vatar/iluvatar_worker_library/src/worker_api/rpc.rs" "b/src/Il\303\272vatar/iluvatar_worker_library/src/worker_api/rpc.rs" index 1aa68219..fb270a46 100644 --- "a/src/Il\303\272vatar/iluvatar_worker_library/src/worker_api/rpc.rs" +++ "b/src/Il\303\272vatar/iluvatar_worker_library/src/worker_api/rpc.rs" @@ -95,6 +95,7 @@ impl WorkerAPI for RPCWorkerAPI { args: String, tid: TransactionId, ) -> Result { + println!("invoking {:?} {:?}", function_name, args); let request = Request::new(InvokeRequest { function_name, function_version: version, diff --git "a/src/Il\303\272vatar/iluvatar_worker_library/src/worker_api/worker_config.rs" "b/src/Il\303\272vatar/iluvatar_worker_library/src/worker_api/worker_config.rs" index 488bb166..5743b696 100644 --- "a/src/Il\303\272vatar/iluvatar_worker_library/src/worker_api/worker_config.rs" +++ "b/src/Il\303\272vatar/iluvatar_worker_library/src/worker_api/worker_config.rs" @@ -47,6 +47,7 @@ pub struct Configuration { pub energy_cap: Option>, pub status: Arc, pub influx: Option>, + pub finescheduling: Option<Arc<FineSchedConfig>>, } #[derive(Debug, Deserialize, Default)] @@ -210,6 +211,13 @@ pub struct StatusConfig { pub report_freq_ms: u64, } +#[derive(Debug, Deserialize)] +/// Config for the optional fine-grained (sched_ext) scheduling policy binary +pub struct FineSchedConfig { + pub binary: String, + pub cores: Option<Vec<u32>>, +} + /// A wrapper type for the loaded global worker configuration pub type WorkerConfig = Arc<Configuration>; diff --git "a/src/Il\303\272vatar/iluvatar_worker_library/tests/container_sim_tests.rs" "b/src/Il\303\272vatar/iluvatar_worker_library/tests/container_sim_tests.rs" index 625f167a..f8f0c563 100644 --- "a/src/Il\303\272vatar/iluvatar_worker_library/tests/container_sim_tests.rs" +++ "b/src/Il\303\272vatar/iluvatar_worker_library/tests/container_sim_tests.rs" @@ -4,12 +4,12 @@ pub mod utils; use crate::utils::{ background_test_invoke, full_sim_invoker, resolve_invoke, sim_args, sim_invoker_svc, wait_for_queue_len, }; -use iluvatar_library::characteristics_map::{Characteristics, Values}; use iluvatar_library::transaction::gen_tid; use iluvatar_library::types::{Compute, Isolation}; use iluvatar_library::{threading::EventualItem, transaction::TEST_TID}; use iluvatar_rpc::rpc::{LanguageRuntime, RegisterRequest}; use iluvatar_worker_library::services::containers::structs::ContainerState; +use iluvatar_worker_library::utils::characteristics_map::{Characteristics, Values}; use rstest::rstest; fn cpu_reg() -> RegisterRequest { diff --git "a/src/Il\303\272vatar/iluvatar_worker_library/tests/utils.rs" "b/src/Il\303\272vatar/iluvatar_worker_library/tests/utils.rs" index a600d622..0d8af4ce 100644 --- "a/src/Il\303\272vatar/iluvatar_worker_library/tests/utils.rs" +++ "b/src/Il\303\272vatar/iluvatar_worker_library/tests/utils.rs" @@ -3,7 +3,6 @@ use iluvatar_library::clock::ContainerTimeFormatter; use iluvatar_library::energy::energy_logging::EnergyLogger; use iluvatar_library::types::{Compute, Isolation, MemSizeMb}; use iluvatar_library::{ -
characteristics_map::{AgExponential, CharacteristicsMap}, logging::{start_tracing, LoggingConfig}, transaction::{TransactionId, TEST_TID}, }; @@ -16,14 +15,17 @@ use iluvatar_worker_library::services::{ containers::{containermanager::ContainerManager, IsolationFactory}, invocation::{Invoker, InvokerFactory}, }; -use iluvatar_worker_library::services::{ - invocation::InvocationResult, - resources::{cpu::CpuResourceTracker, gpu::GpuResourceTracker}, -}; use iluvatar_worker_library::{ services::registration::{RegisteredFunction, RegistrationService}, worker_api::config::{Configuration, WorkerConfig}, }; +use iluvatar_worker_library::{ + services::{ + invocation::InvocationResult, + resources::{cpu::CpuResourceTracker, gpu::GpuResourceTracker}, + }, + utils::characteristics_map::{AgExponential, CharacteristicsMap}, +}; use parking_lot::Mutex; use std::{sync::Arc, time::Duration}; use time::OffsetDateTime; @@ -80,7 +82,7 @@ pub async fn full_sim_invoker( false => None, }; let load_avg = build_load_avg_signal(); - let cmap = Arc::new(CharacteristicsMap::new(AgExponential::new(0.6))); + let cmap = Arc::new(CharacteristicsMap::new(AgExponential::new(0.6), None, None)); let cpu = CpuResourceTracker::new(&cfg.container_resources.cpu_resource, load_avg.clone(), &TEST_TID) .unwrap_or_else(|e| panic!("Failed to create cpu resource man: {}", e)); let factory = IsolationFactory::new(cfg.clone()); @@ -123,6 +125,7 @@ pub async fn full_sim_invoker( let invoker_fact = InvokerFactory::new( cm.clone(), cfg.limits.clone(), + cfg.clone(), cfg.invocation.clone(), cmap.clone(), cpu.clone(), @@ -174,7 +177,7 @@ pub async fn sim_invoker_svc( None => None, }; let load_avg = build_load_avg_signal(); - let cmap = Arc::new(CharacteristicsMap::new(AgExponential::new(0.6))); + let cmap = Arc::new(CharacteristicsMap::new(AgExponential::new(0.6), None, None)); let cpu = CpuResourceTracker::new(&cfg.container_resources.cpu_resource, load_avg, &TEST_TID) .unwrap_or_else(|e| panic!("Failed to create cpu resource man: {}", e)); let factory = IsolationFactory::new(cfg.clone()); @@ -217,6 +220,7 @@ pub async fn sim_invoker_svc( let invoker_fact = InvokerFactory::new( cm.clone(), cfg.limits.clone(), + cfg.clone(), cfg.invocation.clone(), cmap.clone(), cpu, @@ -267,7 +271,7 @@ pub async fn test_invoker_svc( false => None, }; let load_avg = build_load_avg_signal(); - let cmap = Arc::new(CharacteristicsMap::new(AgExponential::new(0.6))); + let cmap = Arc::new(CharacteristicsMap::new(AgExponential::new(0.6), None, None)); let cpu = CpuResourceTracker::new(&cfg.container_resources.cpu_resource, load_avg, &TEST_TID) .unwrap_or_else(|e| panic!("Failed to create cpu resource man: {}", e)); @@ -311,6 +315,7 @@ pub async fn test_invoker_svc( let invoker_fact = InvokerFactory::new( cm.clone(), cfg.limits.clone(), + cfg.clone(), cfg.invocation.clone(), cmap, cpu,
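For reference, enabling the new scheduler path requires the optional `finescheduling` section in the worker's JSON config. The field names follow `FineSchedConfig` above; the binary path and core list here are illustrative, and per `launch_scheduler`, `cores` is only forwarded to the `fs_policy_constrained` policy:

```json
{
  "finescheduling": {
    "binary": "/tmp/iluvatar/bin/fs_policy_tsksz",
    "cores": [0, 1, 2, 3]
  }
}
```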