Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AMDGPU Rocm changes for postscripts and Accelerator #2457

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
bdc00ec
rocm Postscript changes
suman-square Feb 7, 2025
a020c8e
Merge branch 'dell:pub/new_architecture' into pub/new_architecture
suman-square Feb 7, 2025
fd9bd3f
updating proxy_status check in omnia_rocm
suman-square Feb 7, 2025
1577cb4
Merge branch 'pub/new_architecture' of https://github.com/suman-squar…
suman-square Feb 7, 2025
a51a39a
Validation based on pulp distribution list
suman-square Feb 10, 2025
54e2bb4
Updated pulp location
suman-square Feb 10, 2025
8e46b29
Update pulp command to localhost
suman-square Feb 10, 2025
b5bb952
pulp command change for rpm and deb
suman-square Feb 11, 2025
1d45a9b
AMD Rocm validation changes
suman-square Feb 12, 2025
319ce68
Updated oim_os to provision_os
suman-square Feb 12, 2025
b8e5410
Removed Comments
suman-square Feb 12, 2025
0a17147
Changed templates file path
suman-square Feb 12, 2025
eccf081
Update set fact for provision os
suman-square Feb 12, 2025
9997e6d
Update validate amd
suman-square Feb 12, 2025
de93dfa
Fail message update
suman-square Feb 12, 2025
c5656be
Updated Performance Profile
suman-square Feb 12, 2025
f865cb9
Updating - update_user_repo
suman-square Feb 12, 2025
c8356d6
Update - update_user_repo utility
suman-square Feb 12, 2025
1c9182a
Updating - dnf makecache only when repos change
suman-square Feb 12, 2025
74c008a
Update accelerator - update_user_repo utility
suman-square Feb 12, 2025
5446042
symbolic link for rocky
suman-square Feb 13, 2025
ada4e2a
Symbolic link creation
Feb 13, 2025
0e34942
Update local_repo_access_path
suman-square Feb 13, 2025
44490f8
Update lint issue in repo templates
suman-square Feb 13, 2025
fa80246
Task Name changes
suman-square Feb 14, 2025
dfbe9c5
Merge branch 'dell:pub/new_architecture' into pub/new_architecture
suman-square Feb 14, 2025
2d52023
Added Amdgpu json in rhel 9.4
Feb 14, 2025
69e280f
Merge branch 'dell:pub/new_architecture' into pub/new_architecture
suman-square Feb 14, 2025
8e8c70b
Copyright changes
suman-square Feb 14, 2025
08c7a53
Merge branch 'pub/new_architecture' of https://github.com/suman-squar…
suman-square Feb 14, 2025
a7736da
Merge branch 'dell:pub/new_architecture' into pub/new_architecture
suman-square Feb 14, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions accelerator/accelerator.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -13,9 +13,8 @@
# limitations under the License.
---

- name: Check if virtual environment is active
ansible.builtin.import_playbook: ../utils/check_venv.yml
when: not ( check_venv_executed | default(false) | bool )
- name: Include input project directory
ansible.builtin.import_playbook: ../utils/include_input_dir.yml

- name: Update Inventory with ansible_host information
ansible.builtin.import_playbook: ../utils/servicetag_host_mapping.yml
Expand Down
101 changes: 46 additions & 55 deletions accelerator/roles/accelerator_validation/tasks/validate_amd.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -25,8 +25,9 @@
file: "{{ software_config_json_file }}"
name: software_config

- name: Include vars for {{ oim_os }}
ansible.builtin.include_vars: "{{ role_path }}/vars/{{ oim_os }}.yml"
- name: Set fact provision_os
ansible.builtin.set_fact:
provision_os: "{{ software_config.cluster_os_type }}"

- name: Get amdgpu status
ansible.builtin.set_fact:
Expand All @@ -47,35 +48,28 @@
loop_control:
loop_var: item

- name: Check if the rocm offline repo exists
ansible.builtin.stat:
path: "{{ offline_rocm_directory }}/rocm/"
register: check_rocm_repo
when: rocm_input_status

- name: Set amdgpu_config_status
when: amdgpu_input_status
block:
- name: Fetch amdgpu_version
ansible.builtin.set_fact:
amdgpu_version: "{{ software_config.softwares | selectattr('name', 'equalto', 'amdgpu') | map(attribute='version') | first }}"

- name: Set amdgpu_version
ansible.builtin.set_fact:
amdgpu_directory: "{{ offline_rocm_directory }}/amdgpu/{{ amdgpu_version }}/"

- name: Check amdgpu version directory exists or not
ansible.builtin.stat:
path: "{{ amdgpu_directory }}"
register: check_amdgpu_dir
- name: Get amdgpu repository details from Pulp
ansible.builtin.command: "{{ pulp_bin_path }} {{ os_package_map[provision_os] }} distribution list --name amdgpu_{{ amdgpu_version }}"
delegate_to: localhost
register: pulp_amdgpu_output
changed_when: false
no_log: true

- name: Set amdgpu_config_status based on directory existence
- name: Set amdgpu_config_status based on pulp rpm distribution
ansible.builtin.set_fact:
amdgpu_config_status: "{{ check_amdgpu_dir.stat.exists | ternary(true, false) }}"
amdgpu_config_status: true
when: pulp_amdgpu_output.stdout | length > 0
rescue:
- name: Log an error message
ansible.builtin.debug:
msg: " {{ amdgpu_fail_msg }} "
msg: " {{ amdgpu_repo_fail_msg }} "

- name: Set amdgpu_config_status to false
ansible.builtin.set_fact:
Expand All @@ -84,53 +78,50 @@
- name: Set rocm_config_status
when:
- rocm_input_status
- software_config.repo_config == 'always' or software_config.repo_config == 'partial'
- check_rocm_repo.stat.exists
block:
- name: Fetch rocm_version
ansible.builtin.set_fact:
rocm_version: "{{ software_config.amdgpu | selectattr('name', 'equalto', 'rocm') | map(attribute='version') | first }}"

- name: Set rocm_directory
ansible.builtin.set_fact:
rocm_directory: "{{ offline_rocm_directory }}/rocm/{{ rocm_version }}/"

- name: Check rocm_directory exists or not
ansible.builtin.stat:
path: "{{ rocm_directory }}"
register: check_rocm_dir
- name: Get ROCm repository details from Pulp
ansible.builtin.command: "{{ pulp_bin_path }} {{ os_package_map[provision_os] }} distribution list --name rocm_{{ rocm_version }}"
delegate_to: localhost
register: check_rocm_repo
changed_when: false
no_log: true

- name: Set rocm_config_status based on directory existence
ansible.builtin.set_fact:
rocm_config_status: "{{ check_rocm_dir.stat.exists | ternary(true, false) }}"

rescue:
- name: Log an error message
ansible.builtin.debug:
msg: " {{ amdgpu_fail_msg }} "

- name: Set rocm_config_status to false
ansible.builtin.set_fact:
rocm_config_status: false

- name: Set rocm_config_status
when:
- rocm_input_status
- software_config.repo_config == 'never' or software_config.repo_config == 'partial'
- not check_rocm_repo.stat.exists
block:
- name: Fetch rocm_version
ansible.builtin.set_fact:
rocm_version: "{{ software_config.amdgpu | selectattr('name', 'equalto', 'rocm') | map(attribute='version') | first }}"

- name: Set rocm_config_status to true
- name: Set rocm_config_status based on pulp rpm distribution
ansible.builtin.set_fact:
rocm_config_status: true
when: check_rocm_repo.stdout | length > 0
rescue:
- name: Log an error message
ansible.builtin.debug:
msg: " {{ amdgpu_fail_msg }} "
msg: " {{ rocm_repo_fail_msg }} "

- name: Set rocm_config_status to false
ansible.builtin.set_fact:
rocm_config_status: false

# This will be used in the future
# - name: Set rocm_config_status
# when:
# - rocm_input_status
# - software_config.repo_config == 'never' or software_config.repo_config == 'partial'
# - not check_rocm_repo.stat.exists
# block:
# - name: Fetch rocm_version
# ansible.builtin.set_fact:
# rocm_version: "{{ software_config.amdgpu | selectattr('name', 'equalto', 'rocm') | map(attribute='version') | first }}"

# - name: Set rocm_config_status to true
# ansible.builtin.set_fact:
# rocm_config_status: true
# rescue:
# - name: Log an error message
# ansible.builtin.debug:
# msg: " {{ amdgpu_fail_msg }} "

# - name: Set rocm_config_status to false
# ansible.builtin.set_fact:
# rocm_config_status: false
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2024 Intel Corporation.
# Copyright 2025 Intel Corporation.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -25,9 +25,6 @@
file: "{{ software_config_json_file }}"
name: software_config

- name: Include vars for {{ oim_os }}
ansible.builtin.include_vars: "{{ role_path }}/vars/{{ oim_os }}.yml"

- name: Get Intel Gaudi status
ansible.builtin.set_fact:
intel_gaudi_input_status: true
Expand Down
18 changes: 12 additions & 6 deletions accelerator/roles/accelerator_validation/vars/main.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -13,10 +13,13 @@
# limitations under the License.
---

# Usage: amd_validation.yml
software_config_json_file: "{{ role_path }}/../../../input/software_config.json"

# Usage: validate_amdgpu_rocm_repo.yml
# Usage: validate_amd.yml
software_config_json_file: "{{ input_project_dir }}/software_config.json"
pulp_bin_path: /usr/local/bin/pulp
os_package_map:
rhel: rpm
rocky: rpm
ubuntu: deb
amdgpu_input_fail_msg: "Failed, software_config.json does not have the amdgpu software stack."
amdgpu_version_fail_msg: "Failed, software_config.json does not have the version for AMDGPU."
amdgpu_repo_fail_msg: "Failed, local_repo.yml is not executed for downloading AMDGPU packages."
Expand All @@ -25,10 +28,13 @@ rocm_repo_fail_msg: "Failed, local_repo.yml is not executed for downloading ROCM
amdgpu_fail_msg: "An error occurred while setting the rocm_config_status."

# Usage: include_local_repo_config.yml
local_repo_config_file: "{{ role_path }}/../../../input/local_repo_config.yml"
local_repo_config_file: "{{ input_project_dir }}/local_repo_config.yml"
local_repo_config_syntax_fail_msg: "Failed. Syntax errors present in local_repo_config.yml. Fix errors and re-run playbook again."

# Usage: validate_intel_gaudi.yml
offline_intelgaudi_directory: "{{ repo_store_path }}/cluster/apt"
offline_gaudi_directory: "{{ repo_store_path }}/cluster/{{ oim_os }}/{{ oim_os_version }}/deb"
gaudi_search_pattern: "habanalabs*.deb"
intel_gaudi_input_fail_msg: "Failed, software_config.json does not have the intelgaudi software stack."
intel_gaudi_repo_fail_msg: "Failed, local_repo.yml is not executed for downloading Intel Gaudi driver packages."

Expand Down
1 change: 0 additions & 1 deletion accelerator/roles/accelerator_validation/vars/rocky.yml

This file was deleted.

4 changes: 2 additions & 2 deletions discovery/roles/discovery_validations/common/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,8 @@
# - name: Validate OFED and CUDA repo
# ansible.builtin.include_tasks: validate_ofed_cuda_repo.yml

# - name: Validate AMDGPU and ROCm repo
# ansible.builtin.include_tasks: validate_amdgpu_rocm_repo.yml
- name: Validate AMDGPU and ROCm repo
ansible.builtin.include_tasks: validate_amdgpu_rocm_repo.yml

# - name: Validate Broadcom repo
# ansible.builtin.include_tasks: validate_broadcom_repo.yml
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -34,25 +34,30 @@
ansible.builtin.set_fact:
amdgpu_version: "{{ user_config.softwares | selectattr('name', 'equalto', 'amdgpu') | map(attribute='version') | first }}"

- name: Set amdgpu_directory
- name: Get amdgpu repository details from Pulp
ansible.builtin.command: "{{ pulp_bin_path }} {{ os_package_map[provision_os] }} distribution list --name amdgpu_{{ amdgpu_version }}"
delegate_to: localhost
register: pulp_amdgpu_output
changed_when: false

- name: Parse amdgpu repository details
ansible.builtin.set_fact:
amdgpu_directory: "{{ offline_rocm_directory }}/amdgpu/{{ amdgpu_version }}/"
amdgpu_repo_list: "{{ pulp_amdgpu_output.stdout | from_json }}"
when: pulp_amdgpu_output.stdout | length > 0

- name: Check amdgpu_directory exists or not
ansible.builtin.stat:
path: "{{ amdgpu_directory }}"
register: check_amdgpu_dir
- name: Set amdgpu_base_url and amdgpu_name if repository exists
ansible.builtin.set_fact:
amdgpu_base_url: "{{ amdgpu_repo_list[0]['base_url'] | default('') }}"
amdgpu_name: "{{ amdgpu_repo_list[0]['name'] | default('') }}"
amdgpu_config_status: true
when: amdgpu_repo_list | length > 0

- name: Warning - Please wait, This task will take few seconds
ansible.builtin.pause:
seconds: "{{ warning_wait_time }}"
prompt: "{{ amdgpu_repo_warning_msg }}"
when: not check_amdgpu_dir.stat.exists
when: pulp_amdgpu_output.stdout | length == 0

- name: Set amdgpu_config_status to true
ansible.builtin.set_fact:
amdgpu_config_status: true
when: check_amdgpu_dir.stat.exists
rescue:
- name: Warning - Please wait, This task will take few seconds
ansible.builtin.pause:
Expand Down
6 changes: 5 additions & 1 deletion discovery/roles/discovery_validations/common/vars/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,7 @@ CUDA will not be installed on the nodes post provisioning."
offline_iso_directory: "{{ repo_store_path }}/cluster/{{ provision_os }}/{{ provision_os_version }}/iso"

# Usage: validate_amdgpu_rocm_repo.yml
pulp_bin_path: /usr/local/bin/pulp
amdgpu_input_warning_msg: "[WARNING] software_config.json does not have the amdgpu software stack.
Hence ROCm will not be installed on the nodes post provisioning."
amdgpu_version_warning_msg: "[WARNING] software_config.json does not have the version for AMDGPU.
Expand All @@ -228,7 +229,10 @@ rocm_version_warning_msg: "[WARNING] software_config.json does not have the vers
Hence ROCm will not be installed on the nodes post provisioning."
rocm_repo_warning_msg: "[WARNING] local_repo.yml is not executed for downloading ROCM packages.
ROCm will not be installed on the nodes post provisioning."

os_package_map:
rhel: rpm
rocky: rpm
ubuntu: deb
# Usage: validate_intelgaudi_repo.yml
intelgaudi_version_warning_msg: "[WARNING] software_config.json does not have the version for 'intelgaudi'.
Hence Habana stack will not be installed on the nodes post provisioning."
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
changed_when: true

- name: Configure ubuntu postscripts
ansible.builtin.include_tasks: "{{ role_path }}/../{{ oim_os }}/tasks/configure_postscripts.yml"
ansible.builtin.include_tasks: "{{ role_path }}/../{{ provision_os }}/tasks/configure_postscripts.yml"

- name: Configure ntp postscripts
ansible.builtin.include_tasks: configure_postscripts_ntp.yml
Expand Down Expand Up @@ -73,10 +73,10 @@
# - cuda_config_status
# - cuda_repo_stat.stat.exists
#
# - name: Configure postscripts for ROCm
# ansible.builtin.command: "{{ xcat_path }}/chdef all -p postscripts=omnia_rocm"
# changed_when: true
# when: amdgpu_config_status
- name: Configure postscripts for ROCm
ansible.builtin.command: "{{ xcat_path }}/chdef all -p postscripts=omnia_rocm"
changed_when: true
when: amdgpu_config_status
#
# - name: Configure postscripts for Intel Gaudi
# ansible.builtin.command: "{{ xcat_path }}/chdef all -p postscripts=omnia_intelgaudi"
Expand Down
11 changes: 5 additions & 6 deletions discovery/roles/postscripts/common/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,17 @@
MANPATH: "{{ xcat_manpath_env }}"
PERL_BADLANG: "{{ perl_badlang_env }}"
block:
- name: Include vars for {{ oim_os }}
ansible.builtin.include_vars: "{{ role_path }}/../{{ oim_os }}/vars/main.yml"
- name: Include vars for {{ provision_os }}
ansible.builtin.include_vars: "{{ role_path }}/../{{ provision_os }}/vars/main.yml"

# These tasks will be updated after local repo changes
# - name: Configure postscripts on {{ oim_os }}
# ansible.builtin.include_tasks: "{{ role_path }}/../{{ oim_os }}/tasks/main.yml"
- name: Configure postscripts on {{ provision_os }}
ansible.builtin.include_tasks: "{{ role_path }}/../{{ provision_os }}/tasks/main.yml"

- name: Check all node group status
ansible.builtin.include_tasks: check_nodes_all.yml

- name: Configuration of postbootscripts
ansible.builtin.include_tasks: "{{ role_path }}/../{{ oim_os }}/tasks/configure_postbootscripts.yml"
ansible.builtin.include_tasks: "{{ role_path }}/../{{ provision_os }}/tasks/configure_postbootscripts.yml"
when: all_node_status

- name: Configuration of postscripts
Expand Down
12 changes: 6 additions & 6 deletions discovery/roles/postscripts/common/templates/omnia_rocm.j2
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@ if [[ $amd_check_display_ctrlr == *"Advanced Micro Devices"* || $amd_check_proce
if [[ $validate_ubuntu_os == "1" ]]
then

echo "deb [trusted=yes] http://{{ admin_nic_ip }}:80/install{{ repo_store_path }}/cluster/apt/amdgpu/{{ amdgpu_version }} ./" >> /etc/apt/sources.list.d/amdgpu.list
echo "deb [trusted=yes] http://{{ amdgpu_base_url }} ./" >> /etc/apt/sources.list.d/amdgpu.list

sudo apt-get update
sudo apt install "linux-headers-$(uname -r)" "linux-modules-extra-$(uname -r)" -y
sudo apt install amdgpu-dkms -y

rm /etc/apt/sources.list.d/amdgpu.list

apt-get update
Expand All @@ -28,12 +28,12 @@ if [[ $amd_check_display_ctrlr == *"Advanced Micro Devices"* || $amd_check_proce
else

sudo tee /etc/yum.repos.d/amdgpu.repo <<EOF
[amdgpu]
name=amdgpu
baseurl=http://{{ admin_nic_ip }}:80/install{{ repo_store_path }}/cluster/yum/amdgpu/{{ amdgpu_version }}
[{{ amdgpu_name }}]
name={{ amdgpu_name }}
baseurl={{ amdgpu_base_url }}
enabled=1
gpgcheck=0
{% if proxy_status %}
{% if proxy_status is defined and proxy_status %}
proxy=_none_
{% endif %}
EOF
Expand Down
Loading