Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into resize_scheduler_opt
Browse files Browse the repository at this point in the history
  • Loading branch information
naoyam committed Feb 4, 2025
2 parents 30a05a3 + 212ac38 commit 52c19a6
Show file tree
Hide file tree
Showing 182 changed files with 9,631 additions and 3,632 deletions.
173 changes: 87 additions & 86 deletions .github/workflows/nvfuser-ci-trigger.yml
Original file line number Diff line number Diff line change
@@ -1,86 +1,87 @@
# SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause

# A workflow to trigger ci on hybrid infra (github + self hosted runner)
name: Nvfuser-CI Trigger
on:
issue_comment:
types: [created]
jobs:
Authorization:
name: Authorization
runs-on: blossom
outputs:
args: ${{ env.args }}

# This job only runs for pull request comments
if: >-
( startsWith(github.event.comment.body, '!build') ||
startsWith(github.event.comment.body, '!test')
) &&
( github.actor == 'xwang233' ||
github.actor == 'jjsjann123' ||
github.actor == 'chang-l' ||
github.actor == 'csarofeen' ||
github.actor == 'drzejan2' ||
github.actor == 'IvanYashchuk' ||
github.actor == 'jacobhinkle' ||
github.actor == 'kevinstephano' ||
github.actor == 'liqiangxl' ||
github.actor == 'mmigdal-nv' ||
github.actor == 'naoyam' ||
github.actor == 'ptrblck' ||
github.actor == 'rdspring1' ||
github.actor == 'samnordmann' ||
github.actor == 'zasdfgbnm' ||
github.actor == 'crcrpar' ||
github.actor == 'nWEIdia' ||
github.actor == 'Priya2698' ||
github.actor == 'wujingyue' ||
github.actor == 'tfogal' ||
github.actor == 'protonu' ||
github.actor == 'cowanmeg' ||
github.actor == 'nsarka'
)
steps:
- name: Check if comment is issued by authorized person
run: blossom-ci
env:
OPERATION: 'AUTH'
REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
REPO_KEY_DATA: ${{ secrets.BLOSSOM_KEY }}

Vulnerability-scan:
name: Vulnerability scan
needs: [Authorization]
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v2
with:
repository: ${{ fromJson(needs.Authorization.outputs.args).repo }}
ref: ${{ fromJson(needs.Authorization.outputs.args).ref }}
lfs: 'true'

- name: Run blossom action
uses: NVIDIA/blossom-action@main
env:
REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
REPO_KEY_DATA: ${{ secrets.BLOSSOM_KEY }}
with:
args1: ${{ fromJson(needs.Authorization.outputs.args).args1 }}
args2: ${{ fromJson(needs.Authorization.outputs.args).args2 }}
args3: ${{ fromJson(needs.Authorization.outputs.args).args3 }}

Job-trigger:
name: Start ci job
needs: [Vulnerability-scan]
runs-on: blossom
steps:
- name: Start ci job
run: blossom-ci
env:
OPERATION: 'START-CI-JOB'
CI_SERVER: ${{ secrets.CI_SERVER }}
REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause

# A workflow to trigger ci on hybrid infra (github + self hosted runner)
name: Nvfuser-CI Trigger
on:
issue_comment:
types: [created]
jobs:
Authorization:
name: Authorization
runs-on: blossom
outputs:
args: ${{ env.args }}

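# This job runs only when the comment starts with '!build' or '!test' and the commenter is on the allowlist below (note: issue_comment fires for both issue and pull request comments)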
if: >-
( startsWith(github.event.comment.body, '!build') ||
startsWith(github.event.comment.body, '!test')
) &&
( github.actor == 'xwang233' ||
github.actor == 'jjsjann123' ||
github.actor == 'chang-l' ||
github.actor == 'csarofeen' ||
github.actor == 'drzejan2' ||
github.actor == 'IvanYashchuk' ||
github.actor == 'jacobhinkle' ||
github.actor == 'kevinstephano' ||
github.actor == 'liqiangxl' ||
github.actor == 'mmigdal-nv' ||
github.actor == 'naoyam' ||
github.actor == 'ptrblck' ||
github.actor == 'rdspring1' ||
github.actor == 'samnordmann' ||
github.actor == 'zasdfgbnm' ||
github.actor == 'crcrpar' ||
github.actor == 'nWEIdia' ||
github.actor == 'Priya2698' ||
github.actor == 'wujingyue' ||
github.actor == 'tfogal' ||
github.actor == 'protonu' ||
github.actor == 'cowanmeg' ||
github.actor == 'nsarka' ||
github.actor == 'syed-ahmed'
)
steps:
- name: Check if comment is issued by authorized person
run: blossom-ci
env:
OPERATION: 'AUTH'
REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
REPO_KEY_DATA: ${{ secrets.BLOSSOM_KEY }}

Vulnerability-scan:
name: Vulnerability scan
needs: [Authorization]
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v2
with:
repository: ${{ fromJson(needs.Authorization.outputs.args).repo }}
ref: ${{ fromJson(needs.Authorization.outputs.args).ref }}
lfs: 'true'

- name: Run blossom action
uses: NVIDIA/blossom-action@main
env:
REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
REPO_KEY_DATA: ${{ secrets.BLOSSOM_KEY }}
with:
args1: ${{ fromJson(needs.Authorization.outputs.args).args1 }}
args2: ${{ fromJson(needs.Authorization.outputs.args).args2 }}
args3: ${{ fromJson(needs.Authorization.outputs.args).args3 }}

Job-trigger:
name: Start ci job
needs: [Vulnerability-scan]
runs-on: blossom
steps:
- name: Start ci job
run: blossom-ci
env:
OPERATION: 'START-CI-JOB'
CI_SERVER: ${{ secrets.CI_SERVER }}
REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
61 changes: 56 additions & 5 deletions .github/workflows/pull.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ jobs:
password: ${{ secrets.GITHUB_TOKEN }}
env:
GITHUB__USER_TOKEN: ${{ secrets.GITHUB_TOKEN }}
CONFIG__PUBLISH_OUTPUT: true
CONFIG__PUBLISH_OUTPUT: false
CONFIG__PUBLISH_COMBINED_OUTPUT: true

OPENAI__KEY: ${{ secrets.LLM_OPENAI__KEY }}
OPENAI__API_BASE: ${{ secrets.LLM_OPENAI__API_BASE }}
Expand All @@ -54,9 +55,9 @@ jobs:

PR_REVIEWER__REQUIRE_SCORE_REVIEW: false
PR_REVIEWER__REQUIRE_TESTS_REVIEW: true
PR_REVIEWER__REQUIRE_ESTIMATE_EFFORT_TO_REVIEW: true
PR_REVIEWER__REQUIRE_CAN_BE_SPLIT_REVIEW: false
PR_REVIEWER__REQUIRE_SECURITY_REVIEW: false
PR_REVIEWER__REQUIRE_ESTIMATE_EFFORT_TO_REVIEW: false
PR_REVIEWER__REQUIRE_TICKET_ANALYSIS_REVIEW: false

PR_REVIEWER__ENABLE_REVIEW_LABELS_EFFORT: false
Expand All @@ -66,8 +67,58 @@ jobs:
PR_REVIEWER__FINAL_UPDATE_MESSAGE: false

PR_REVIEWER__EXTRA_INSTRUCTIONS: |
Focus on potential logic change, especially on changes to function signatures.
To review a pull request (PR) for the Nvfuser project effectively, follow this structured approach:
Overall Principles Check:
Ensure the PR provides actual data and focuses on significant performance aspects.
Verify that a clear performance goal is set and that feedback was sought early.
Preliminary Evaluation:
Scope and Context: Confirm the PR clearly states its purpose and the significance of the problem it addresses.
SOL Analysis: Check if the PR uses a roofline model or existing implementations (e.g., CUTLASS) as a target for expected performance.
Current Status and Data: Ensure the PR includes current performance metrics and reasons for any gaps.
Approach Assessment:
Technical Details: Review the technical aspects of the approach and expected performance gains.
Limitations and Comparisons: Evaluate if multiple approaches were considered and if trade-offs are discussed.
Results Analysis:
Correctness and Tests: Confirm all tests pass and new tests were added if necessary.
Performance Data: Check for thorough performance evaluation and clear data presentation.
Gap Analysis: Assess the explanation of performance gaps and their importance.
Regressions: Evaluate the impact of any regressions and whether benefits outweigh drawbacks, supported by quantitative data.
Conclusion and Documentation:
Summarize the PR's effectiveness and potential next steps.
Ensure all guidelines are met and documentation is comprehensive.
Insight from Failed Approaches:
Consider insights from discarded methods, even if they weren't the final solution.
PR_DESCRIPTION__ADD_ORIGINAL_USER_DESCRIPTION: false
PR_DESCRIPTION__GENERATE_AI_TITLE: true
PR_DESCRIPTION__USE_BULLET_POINTS: true
PR_DESCRIPTION__EXTRA_INSTRUCTIONS: ""
PR_DESCRIPTION__ENABLE_PR_TYPE: false
PR_DESCRIPTION__FINAL_UPDATE_MESSAGE: true
PR_DESCRIPTION__ENABLE_HELP_TEXT: false
PR_DESCRIPTION__ENABLE_HELP_COMMENT: false
PR_DESCRIPTION__PUBLISH_DESCRIPTION_AS_COMMENT: true
PR_DESCRIPTION__PUBLISH_DESCRIPTION_AS_COMMENT_Persistent: true
PR_DESCRIPTION__ENABLE_SEMANTIC_FILES_TYPES: true
PR_DESCRIPTION__COLLAPSIBLE_FILE_LIST: 'adaptive'
PR_DESCRIPTION__COLLAPSIBLE_FILE_LIST_THRESHOLD: 8
PR_DESCRIPTION__INLINE_FILE_SUMMARY: false
PR_DESCRIPTION__USE_DESCRIPTION_MARKERS: false
PR_DESCRIPTION__INCLUDE_GENERATED_BY_HEADER: true
PR_DESCRIPTION__ENABLE_LARGE_PR_HANDLING: true

steps:
- name: PR Agent review
run: python /app/pr_agent/cli.py --pr_url ${{ github.event.pull_request.html_url }} review
- name: PR Agent combined action
run: python /app/pr_agent/cli.py --pr_url ${{ github.event.pull_request.html_url }} combined
2 changes: 1 addition & 1 deletion .lintrunner.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
'clang-format==18.1.8',
'clang-format==19.1.7',
]
is_formatter = true

Expand Down
43 changes: 43 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ list(APPEND NVFUSER_SRCS
${NVFUSER_SRCS_DIR}/device_lower/analysis/index_compute.cpp
${NVFUSER_SRCS_DIR}/device_lower/analysis/predicate_elimination.cpp
${NVFUSER_SRCS_DIR}/device_lower/analysis/sync_information.cpp
${NVFUSER_SRCS_DIR}/device_lower/analysis/tensor_memory.cpp
${NVFUSER_SRCS_DIR}/device_lower/analysis/thread_predicate.cpp
${NVFUSER_SRCS_DIR}/device_lower/analysis/tma.cpp
${NVFUSER_SRCS_DIR}/device_lower/analysis/trivial_broadcast.cpp
Expand Down Expand Up @@ -203,6 +204,7 @@ list(APPEND NVFUSER_SRCS
${NVFUSER_SRCS_DIR}/preseg_passes/translate_repeat_to_expand.cpp
${NVFUSER_SRCS_DIR}/rng.cpp
${NVFUSER_SRCS_DIR}/runtime/allocations.cpp
${NVFUSER_SRCS_DIR}/runtime/compiled_kernel.cpp
${NVFUSER_SRCS_DIR}/runtime/executor.cpp
${NVFUSER_SRCS_DIR}/runtime/executor_dispatch.cpp
${NVFUSER_SRCS_DIR}/runtime/executor_kernel_arg.cpp
Expand Down Expand Up @@ -273,6 +275,7 @@ endif()

if(BUILD_PYTHON)
list(APPEND NVFUSER_SRCS
${NVFUSER_SRCS_DIR}/python_frontend/distributed_tensor.cpp
${NVFUSER_SRCS_DIR}/python_frontend/fusion_cache.cpp
${NVFUSER_SRCS_DIR}/python_frontend/fusion_definition.cpp
${NVFUSER_SRCS_DIR}/python_frontend/fusion_state.cpp
Expand Down Expand Up @@ -548,6 +551,7 @@ list(APPEND JIT_TEST_SRCS
${NVFUSER_ROOT}/tests/cpp/test_circular_buffering.cpp
${NVFUSER_ROOT}/tests/cpp/test_abstract_tensor.cpp
${NVFUSER_ROOT}/tests/cpp/test_dynamic_transform.cpp
${NVFUSER_ROOT}/tests/cpp/test_embedding_node.cpp
${NVFUSER_ROOT}/tests/cpp/test_evaluator.cpp
${NVFUSER_ROOT}/tests/cpp/test_exceptions.cpp
${NVFUSER_ROOT}/tests/cpp/test_expr_simplifier.cpp
Expand Down Expand Up @@ -795,6 +799,43 @@ if(BUILD_NVFUSER_BENCHMARK)
-Werror -Wno-deprecated-copy
)
endif()

# multidevice transformer benchmark
#
# Defines the nvfuser_multidevice_bench executable. It is only configured when
# NVFUSER_DISTRIBUTED is set. The benchmark driver is compiled together with
# sources taken from tests/cpp, which are listed directly in the target's
# source list.
if (NVFUSER_DISTRIBUTED)
  set(MULTIDEVICE_BENCHMARK_SRCS)
  list(APPEND MULTIDEVICE_BENCHMARK_SRCS
    ${NVFUSER_ROOT}/benchmarks/cpp/transformer.cpp
    ${NVFUSER_ROOT}/tests/cpp/multidevice_transformer.cpp
    ${NVFUSER_ROOT}/tests/cpp/utils.cpp
  )
  add_executable(nvfuser_multidevice_bench ${MULTIDEVICE_BENCHMARK_SRCS})
  set_target_properties(nvfuser_multidevice_bench PROPERTIES
    C_STANDARD ${NVFUSER_C_STANDARD}
    CUDA_STANDARD ${NVFUSER_CUDA_STANDARD}
    CXX_STANDARD ${NVFUSER_CPP_STANDARD}
    CXX_STANDARD_REQUIRED ON
    CXX_VISIBILITY_PRESET hidden
    POSITION_INDEPENDENT_CODE Yes
    VISIBILITY_INLINES_HIDDEN Yes
  )
  # Third-party headers are added as SYSTEM includes.
  target_include_directories(nvfuser_multidevice_bench SYSTEM PRIVATE
    ${CMAKE_SOURCE_DIR}/third_party/benchmark/include
    ${CMAKE_SOURCE_DIR}/third_party/flatbuffers/include
    ${CMAKE_SOURCE_DIR}/third_party/googletest/googletest/include
  )
  target_include_directories(nvfuser_multidevice_bench PUBLIC ${NVFUSER_ROOT})
  target_link_libraries(nvfuser_multidevice_bench PRIVATE
    benchmark::benchmark
    codegen_internal
  )
  add_dependencies(nvfuser_multidevice_bench flatc build_flatbuffer_config)
  if(NOT MSVC)
    # Apply the warning flags to the target defined in this block; naming
    # nvfuser_bench here would leave nvfuser_multidevice_bench without them.
    target_compile_options(nvfuser_multidevice_bench PRIVATE
      -Wall -Wno-unused-function
      -Werror -Wno-deprecated-copy
    )
  endif()
endif()
endif()

# --- generate runtime files
Expand All @@ -810,6 +851,7 @@ list(APPEND NVFUSER_RUNTIME_FILES
${NVFUSER_ROOT}/runtime/block_sync_default.cu
${NVFUSER_ROOT}/runtime/block_welford_outer.cu
${NVFUSER_ROOT}/runtime/broadcast.cu
${NVFUSER_ROOT}/runtime/cluster.cu
${NVFUSER_ROOT}/runtime/complex_number.cu
${NVFUSER_ROOT}/runtime/fp16_support.cu
${NVFUSER_ROOT}/runtime/fp8_support.cu
Expand All @@ -825,6 +867,7 @@ list(APPEND NVFUSER_RUNTIME_FILES
${NVFUSER_ROOT}/runtime/mbarrier.cu
${NVFUSER_ROOT}/runtime/memory.cu
${NVFUSER_ROOT}/runtime/random_numbers.cu
${NVFUSER_ROOT}/runtime/tensor_memory.cu
${NVFUSER_ROOT}/runtime/tensor.cu
${NVFUSER_ROOT}/runtime/tuple.cu
${NVFUSER_ROOT}/runtime/type_traits.cu
Expand Down
9 changes: 6 additions & 3 deletions benchmarks/cpp/matmul.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,8 @@ static void SingleMatmulBase(
KernelExecutor ke;
ke.compile(fusion, args, launch_constraints, cparams);
NVF_CHECK(
getBankConflictInfo(ke.kernel(), launch_constraints).empty(),
getBankConflictInfo(ke.compiledKernel()->kernel(), launch_constraints)
.empty(),
"Shared memory bank conflict not removed.");

std::vector<c10::IValue> aten_inputs({inputs.first, inputs.second});
Expand Down Expand Up @@ -358,7 +359,7 @@ static void SingleMatmulPartitionedK(
auto lparams = LaunchParams();
ke.compile(fusion, args, lparams, cparams);
NVF_CHECK(
getBankConflictInfo(ke.kernel(), lparams).empty(),
getBankConflictInfo(ke.compiledKernel()->kernel(), lparams).empty(),
"Shared memory bank conflict not removed.");

// Warm up run
Expand Down Expand Up @@ -471,7 +472,9 @@ static void NvFuserScheduler_MatmulSplitKReduction(
fusion, args, heuristic_params->lparams, heuristic_params->cparams);

NVF_CHECK(
getBankConflictInfo(ke.kernel(), heuristic_params->lparams).empty(),
getBankConflictInfo(
ke.compiledKernel()->kernel(), heuristic_params->lparams)
.empty(),
"Shared memory bank conflict not removed.");

// Warm up run
Expand Down
Loading

0 comments on commit 52c19a6

Please sign in to comment.