Skip to content

Commit

Permalink
Add enough to have decent simulation support for GPU-based invocations (#28)
Browse files Browse the repository at this point in the history

However, this does not yet properly emulate GPU utilization.

---------

Co-authored-by: Alex Fuerst <alfuerst@iu.edu>
  • Loading branch information
aFuerst and Alex Fuerst authored Aug 30, 2024
1 parent 32eeb79 commit ba8fb72
Show file tree
Hide file tree
Showing 58 changed files with 1,049 additions and 533 deletions.
2 changes: 2 additions & 0 deletions src/Ilúvatar/ansible/group_vars/all.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
iluvatar_home: "{{ lookup('env', 'ILU_HOME') | default(playbook_dir ~ '/..', true) }}"
mode: deploy
target: release
cluster: false

controller:
bin_name: "iluvatar_controller"
Expand All @@ -10,6 +11,7 @@ controller:
algorithm: "{{ controller_algorithm | default('LeastLoaded') }}"
load_metric: "{{ controller_load_metric | default('loadavg') }}"
environment: "{{ controller_environment | default({}) }}"
# address: "http://{{ host }}:{{ port }}"

worker:
bin_name: "iluvatar_worker"
Expand Down
1 change: 1 addition & 0 deletions src/Ilúvatar/ansible/iluvatar.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Start Ilúvatar services
- cluster: true

- import_playbook: influx.yml

Expand Down
37 changes: 26 additions & 11 deletions src/Ilúvatar/ansible/worker.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
- hosts: workers
vars_files:
- group_vars/all.yml
- group_vars/host_addresses.yml
vars:
expected: "Connection failure: [Errno 104] Connection reset by peer"
host_group: "{{ groups['workers'] }}"
Expand All @@ -12,6 +15,7 @@
__remote_bin_src: "{{ remote_bin_src | default(false) }}"
__worker_address: "{{ worker_address | default(servers[ansible_host].internal_ip) }}"
__hardware_interface: "{{ hardware_interface | default(servers[ansible_host].hardware_interface) }}"
__controller_address: "http://{{ controller.host }}:{{ controller.port }}"
SIGINT: 2
coded_proxy_env:
"ILUVATAR_WORKER__name": "{{ inventory_hostname }}"
Expand All @@ -20,10 +24,18 @@
"ILUVATAR_WORKER__logging__basename": "worker_{{ inventory_hostname }}"
"ILUVATAR_WORKER__logging__directory": "{{ worker_log_dir | default('/tmp/iluvatar/logs') }}"
"ILUVATAR_WORKER__logging__level": "{{ worker_log_level | default('info') }}"
"ILUVATAR_WORKER__load_balancer_url" : "http://{{ controller.host }}:{{ controller.port }}"
"ILUVATAR_WORKER__timeout_sec" : "{{ worker_timeout_sec | default(6000) }}"
"ILUVATAR_WORKER__tokio_event_interval" : "{{ worker_tokio_event_interval | default(20) }}"
"ILUVATAR_WORKER__tokio_queue_interval" : "{{ worker_tokio_queue_interval | default(20) }}"
"ILUVATAR_WORKER__load_balancer_url" : "{{ __controller_address if cluster else ''}}"

# Influx config
"ILUVATAR_WORKER__influx__host" : "{{ influx.address }}"
"ILUVATAR_WORKER__influx__org" : "{{ influx.organization }}"
"ILUVATAR_WORKER__influx__token" : "{{ hostvars[ groups['influx']|first ].influx_token | default('') }}"
"ILUVATAR_WORKER__influx__enabled" : "{{ influx.enabled }}"
"ILUVATAR_WORKER__influx__update_freq_ms" : "{{ influx.update_freq_ms }}"

# Limits
"ILUVATAR_WORKER__limits__mem_max_mb" : "{{ worker_mem_max | default(5000) }}"

Expand Down Expand Up @@ -73,12 +85,6 @@
"ILUVATAR_WORKER__energy_cap__power_cap" : "{{ worker_power_cap | default(0.0) }}"
"ILUVATAR_WORKER__energy_cap__power_cap_version" : "{{ worker_power_cap_version | default('V0') }}"

# Influx config
"ILUVATAR_WORKER__influx__host" : "{{ influx.address }}"
"ILUVATAR_WORKER__influx__org" : "{{ influx.organization }}"
"ILUVATAR_WORKER__influx__token" : "{{ hostvars[ groups['influx']|first ].influx_token | default('') }}"
"ILUVATAR_WORKER__influx__enabled" : "{{ influx.enabled }}"
"ILUVATAR_WORKER__influx__update_freq_ms" : "{{ influx.update_freq_ms }}"
# Invocation config
"ILUVATAR_WORKER__invocation__retries" : "{{ worker_invoke_retries | default(0) }}"
"ILUVATAR_WORKER__invocation__queue_sleep_ms" : "{{ worker_queue_sleep_ms | default(100) }}"
Expand All @@ -99,11 +105,8 @@
"ILUVATAR_WORKER__invocation__mqfq_config__flow_select_cnt" : "{{ mqfq_flow_select_cnt | default(0) }}"
proxy_env: "{{ coded_proxy_env | combine(worker.environment, recursive=True) }}"

vars_files:
- group_vars/all.yml
- group_vars/host_addresses.yml

tasks:

- name: Create bin directory
file:
path: "{{bin_dir}}"
Expand Down Expand Up @@ -184,6 +187,12 @@
environment: "{{proxy_env}}"
register: worker_output

- name: debug print
# when: ansible_host == "127.0.0.1" or ansible_host == "localhost"
run_once: true
debug:
msg: "{{ worker_output }}"

- name: run worker executable through perf
ansible.builtin.command:
argv:
Expand All @@ -206,6 +215,12 @@
environment: "{{proxy_env}}"
register: worker_output

- name: debug print
# when: ansible_host == "127.0.0.1" or ansible_host == "localhost"
run_once: true
debug:
msg: "{{ worker_output }}"

- name: wait until the worker on this host is up and running
ansible.builtin.uri:
url:
Expand Down
6 changes: 0 additions & 6 deletions src/Ilúvatar/ansible/worker_perf.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,6 @@
register: running_worker
when: mode == "clean"

- name: debug print
# when: ansible_host == "127.0.0.1" or ansible_host == "localhost"
run_once: true
debug:
msg: "{{ __worker_address }}"

- name: Kill running worker process on localhost
shell: "kill --signal SIGINT {{ item }}"
with_items: "{{ running_worker.stdout_lines }}"
Expand Down
10 changes: 6 additions & 4 deletions src/Ilúvatar/docs/SETUP.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ These steps are required on a system that is going to *run* a worker.

```bash
sudo apt-get update -y
sudo apt-get install -y curl runc bridge-utils iptables zfsutils-linux net-tools sysstat
sudo apt-get install -y curl runc bridge-utils iptables zfsutils-linux net-tools sysstat jq
```

Optional dependencies.
Expand Down Expand Up @@ -39,7 +39,7 @@ Start by installing go if it isn't available.

```bash
ARCH=amd64
GO_VERSION=1.18.3
GO_VERSION=$(curl -s https://go.dev/dl/?mode=json | jq -r '.[0].version')
tar="go${GO_VERSION}.linux-${ARCH}.tar.gz"

wget https://go.dev/dl/${tar}
Expand All @@ -58,7 +58,7 @@ sudo mkdir -p /opt/cni/bin
sudo mv ${gopth}/bin/cnitool /opt/cni/bin

ARCH=amd64
CNI_VERSION=v1.1.1
CNI_VERSION=$(curl -s https://api.github.com/repos/containernetworking/plugins/releases/latest | jq --raw-output '.tag_name')

curl -sSL https://github.com/containernetworking/plugins/releases/download/${CNI_VERSION}/cni-plugins-linux-${ARCH}-${CNI_VERSION}.tgz | sudo tar -xz -C /opt/cni/bin
```
Expand All @@ -70,7 +70,7 @@ If you didn't install Docker earlier, then you will need to install Containerd m
To check if it's needed, run `containerd -version`.

```bash
export VER=1.6.4
export VER=$(curl -s https://api.github.com/repos/containerd/containerd/releases/latest | jq --raw-output '.tag_name')
curl -sSL https://github.com/containerd/containerd/releases/download/v$VER/containerd-$VER-linux-amd64.tar.gz > /tmp/containerd.tar.gz \
&& sudo tar -xvf /tmp/containerd.tar.gz -C /usr/local/bin/ --strip-components=1

Expand All @@ -86,6 +86,8 @@ Run these commands, then re-run the `systemctl` commands.
wget https://raw.githubusercontent.com/containerd/containerd/main/containerd.service
sudo mkdir -p /usr/local/lib/systemd/system/
sudo mv containerd.service /usr/local/lib/systemd/system/containerd.service
sudo rm -f /etc/systemd/system/containerd.service
sudo ln /usr/local/lib/systemd/system/containerd.service /etc/systemd/system/containerd.service
```

**ZFS and file system.**
Expand Down
4 changes: 2 additions & 2 deletions src/Ilúvatar/docs/examples/azure-trace/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ ILU_HOME="../../.."
CORES=4
MEMORY=4096

results_dir="."
worker_log_dir="/tmp/iluvatar/logs/ansible"
results_dir=$(pwd)
worker_log_dir=$results_dir
environment='local'
hosts="-e @../../../ansible/group_vars/local_addresses.yml"
host_file="../../../ansible/environments/$environment/hosts.ini"
Expand Down
3 changes: 1 addition & 2 deletions src/Ilúvatar/docs/examples/basic-trace/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ CORES=2
MEMORY=4096

results_dir="."
worker_log_dir="/tmp/iluvatar/logs/ansible"
worker_log_dir=$(pwd)
environment='local'
hosts="-e @../../../ansible/group_vars/local_addresses.yml"
host_file="../../../ansible/environments/$environment/hosts.ini"
Expand All @@ -22,7 +22,6 @@ source ../examples-venv/bin/activate

cleanup(){
echo "cleanup"
cp $worker_log_dir/* $results_dir >> $log_file
# remove system parts
ansible-playbook -i $host_file $ILU_HOME/ansible/worker.yml -e mode=clean $hosts >> $log_file
}
Expand Down
5 changes: 2 additions & 3 deletions src/Ilúvatar/docs/examples/benchmark/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ ILU_HOME="../../.."
CORES=2
MEMORY=4096

results_dir="."
worker_log_dir="/tmp/iluvatar/logs/ansible"
results_dir=$(pwd)
worker_log_dir=$results_dir
environment='local'
hosts="-e @$ILU_HOME/ansible/group_vars/local_addresses.yml"
host_file="$ILU_HOME/ansible/environments/$environment/hosts.ini"
Expand All @@ -22,7 +22,6 @@ source ../examples-venv/bin/activate

cleanup(){
echo "cleanup"
cp $worker_log_dir/* $results_dir >> $log_file
# remove system parts
ansible-playbook -i $host_file $ILU_HOME/ansible/worker.yml -e mode=clean $hosts >> $log_file
}
Expand Down
4 changes: 2 additions & 2 deletions src/Ilúvatar/docs/examples/detailed-spans/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ ILU_HOME="../../.."
CORES=2
MEMORY=4096

results_dir="."
worker_log_dir="/tmp/iluvatar/logs/ansible"
results_dir=$(pwd)
worker_log_dir=$results_dir
environment='local'
hosts="-e @../../../ansible/group_vars/local_addresses.yml"
host_file="../../../ansible/environments/$environment/hosts.ini"
Expand Down
10 changes: 7 additions & 3 deletions src/Ilúvatar/docs/examples/sample_setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ if ! [ -x "$(command -v go)" ];
then
echo "go not found, installing"
ARCH=amd64
GO_VERSION=1.18.3
GO_VERSION=1.22.5
tar="go${GO_VERSION}.linux-${ARCH}.tar.gz"

wget https://go.dev/dl/${tar}
Expand All @@ -27,9 +27,13 @@ CNI_VERSION=v1.1.1

curl -sSL https://github.com/containernetworking/plugins/releases/download/${CNI_VERSION}/cni-plugins-linux-${ARCH}-${CNI_VERSION}.tgz | sudo tar -xz -C /opt/cni/bin

sudo apt install -y jq ensurepip
python3 -m pip install virtualenv
python3 -m venv --clear examples-venv
examples-venv/bin/python3 -m pip install --upgrade pip --no-warn-script-location
examples-venv/bin/python3 -m pip install ansible numpy pandas matplotlib --no-warn-script-location
source ./examples-venv/bin/activate
python3 -m pip install --upgrade pip --no-warn-script-location
python3 -m pip install ansible numpy pandas matplotlib --no-warn-script-location
deactivate

name=$(ip route get 8.8.8.8 | awk '{ print $5; exit }')

Expand Down
14 changes: 6 additions & 8 deletions src/Ilúvatar/docs/examples/scaling/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,25 +4,22 @@ ILU_HOME="../../.."
CORES=2
MEMORY=4096

results_dir="."
worker_log_dir="/tmp/iluvatar/logs/ansible"
results_dir=$(pwd)
worker_log_dir=$results_dir
environment='local'
hosts="-e @../../../ansible/group_vars/local_addresses.yml"
host_file="../../../ansible/environments/$environment/hosts.ini"
host="127.0.0.1"
log_file="$results_dir/orchestration.log"

ret=$(pwd)
cd $ILU_HOME
pushd $ILU_HOME
make release
cd $ret
popd

echo "Running scaling"
source ../examples-venv/bin/activate

cleanup(){
echo "cleanup"
cp $worker_log_dir/* $results_dir >> $log_file
# remove system parts
ansible-playbook -i $host_file $ILU_HOME/ansible/worker.yml -e mode=clean $hosts >> $log_file
}
Expand All @@ -38,7 +35,8 @@ ansible-playbook -i $host_file $ILU_HOME/ansible/worker.yml -e worker_log_dir=$w
ansible-playbook -i $host_file $ILU_HOME/ansible/worker.yml $hosts -e mode=deploy -e worker_memory_mb=$MEMORY \
-e worker_cores=$CORES -e worker_status_ms=500 -e worker_memory_buffer=1024 -e worker_queue_policy="fcfs" -e worker_snapshotter='overlayfs' \
-e influx_enabled=false -e worker_log_dir=$worker_log_dir >> $log_file &&
$ILU_HOME/target/x86_64-unknown-linux-gnu/release/iluvatar_load_gen scaling --out-folder $results_dir --port 8070 --host $host --target worker --start 1 --end 4 --image docker.io/alfuerst/json_dumps_loads-iluvatar-action:latest --compute cpu --isolation containerd --duration=60 >> $log_file
$ILU_HOME/target/x86_64-unknown-linux-gnu/release/iluvatar_load_gen scaling --out-folder $results_dir --port 8070 --host $host --target worker --start 1 \
--end 4 --image docker.io/alfuerst/json_dumps_loads-iluvatar-action:latest --memory-mb 1024 --compute cpu --isolation containerd --duration=60 >> $log_file

sleep 30
cleanup
Expand Down
2 changes: 1 addition & 1 deletion src/Ilúvatar/docs/examples/simulation-trace/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ MEMORY=4096
host="127.0.0.1"
PORT=8080

results_dir="."
results_dir=$(pwd)
log_file="$results_dir/orchestration.log"

ret=$(pwd)
Expand Down
2 changes: 1 addition & 1 deletion src/Ilúvatar/iluvatar_controller_library/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ parking_lot = "0.12.1"
config = "0.13"
clap = "4.1"
reqwest = { version = "0.12.4", default-features = false, features = ["json", "rustls-tls"] }
dashmap = "5.3.4"
dashmap = "6.0"
tracing = "0.1"
tonic = "0.11"

Expand Down
2 changes: 1 addition & 1 deletion src/Ilúvatar/iluvatar_library/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ tracing-subscriber = { version = "0.3", features = ["json", "tracing-log", "park
tracing-appender = "0.2.2"
tracing-flame="0.2"
pin-project = "1"
dashmap = "5.3.4"
dashmap = "6.0"
parking_lot = "0.12.1"
tokio = { version = "1.19", features = ["macros", "rt-multi-thread", "test-util", "sync", "parking_lot", "signal"] }
ordered-float = "3.4.0"
Expand Down
70 changes: 0 additions & 70 deletions src/Ilúvatar/iluvatar_library/src/api_register.rs

This file was deleted.

Loading

0 comments on commit ba8fb72

Please sign in to comment.