diff --git a/.gitattributes b/.gitattributes index 922c66e..312053f 100644 --- a/.gitattributes +++ b/.gitattributes @@ -77,4 +77,4 @@ ## Identifying file types # *.txt text -# *.jpg binary +# *.jpg binary \ No newline at end of file diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..823f61b --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,17 @@ +version: 2 + +updates: + # Maintain dependencies for GitHub Actions as recommended in SPEC8: + # https://github.com/scientific-python/specs/pull/325 + # At the time of writing, release critical workflows such as + # pypa/gh-action-pypi-publish should use hash-based versioning for security + # reasons. This strategy may be generalized to all other github actions + # in the future. + - package-ecosystem: github-actions + directory: / + schedule: + interval: daily + commit-message: + prefix: "MAINT" + labels: + - "03 - Maintenance" \ No newline at end of file diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml deleted file mode 100644 index f52d9cf..0000000 --- a/.github/workflows/ci.yml +++ /dev/null @@ -1,30 +0,0 @@ -# name: CI - -# on: -# push: -# branches: -# - main -# pull_request: - -# jobs: -# build: -# runs-on: ubuntu-latest -# steps: -# - name: Checkout code -# uses: actions/checkout@v3 - -# - name: Set up Python -# uses: actions/setup-python@v4 -# with: -# python-version: 3.10 - -# - name: Install dependencies -# run: pip install meson meson-python - -# - name: Build the project -# run: | -# meson setup builddir -# meson compile -C builddir - -# - name: Run tests -# run: python -m unittest discover lightnumpy/tests \ No newline at end of file diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml new file mode 100644 index 0000000..0ed772a --- /dev/null +++ b/.github/workflows/wheels.yml @@ -0,0 +1,522 @@ +# Workflow to build and test wheels. +# To work on the wheel building infrastructure on a fork, comment out: +# +# if: github.repository == 'scikit-plots/lightnumpy' +# +# in the get_commit_message job. Be sure to include [wheel build] in your commit +# message to trigger the build. All files related to wheel building are located +# at tools/wheels/ +# Alternatively, you can add labels to the pull request in order to trigger wheel +# builds. +# +# The labels that trigger builds are: +# 36 - Build(for changes to the building process, +# 14 - Release(ensure wheels build before release) +name: Wheel Builder Conda + +on: + # schedule: + # # Automatically built and released every night (or at a regular interval, often nightly) + # # Cron schedule for automated runs + # # https://crontab.cronhub.io + # # ┌───────────── minute (0 - 59) + # # │ ┌───────────── hour (0 - 23) + # # │ │ ┌───────────── day of the month (1 - 31) + # # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + # # │ │ │ │ │ + # # cron: "11 1 * * SUN,WED" + # # Run every Monday at 11:11 AM + # - cron: '11 11 * * 1' + # triggers a workflow when a pull request (PR) is opened + # Code Review: Use this to run tests, checks, or CI pipelines + # Pre-Merge Validation: Ensure that changes introduced in the PR won't break + # Pull Requests to Ensure Code Quality + # pull_request: + # branches: + # - main + # - maintenance/** + # triggers a workflow when commits are pushed to specific branches or tags. + # Releases: Use this to automate packaging or deployment when pushing a versioned tag. 
+ # Continuous Deployment (CD): Automatically deploy changes in main or maintenance/ branches. + # Push for Continuous Integration or Tagged Releases for Versioned Deployments + # push: + # # Release by branches|tags + # branches: + # - main + # - maintenance/** + # tags: + # - v* + # manually trigger the workflow from the GitHub Actions UI. + # Ad-Hoc Scenarios: Use this to rerun workflows with specific parameters or to debug a workflow. + # Manual Jobs: When you need a flexible way to trigger a job, such as running a custom deployment or testing process. + # Manual Trigger for Flexibility + workflow_dispatch: # Allows manual triggering of the workflow + +permissions: # Grants read-only permission to repository contents + contents: read # to fetch code (actions/checkout) + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +# https://learn.microsoft.com/en-us/azure/devops/pipelines/process/expressions +# https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/evaluate-expressions-in-workflows-and-actions#example-of-literals +# https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/store-information-in-variables#default-environment-variables +jobs: + get_commit_message: + name: Get Commit Message + # To enable this job and subsequent jobs on a fork, comment out: + if: github.repository == 'scikit-plots/lightnumpy' + runs-on: ubuntu-latest + steps: + - name: Checkout lightnumpy + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + # Gets the correct commit message for pull request + with: + ref: ${{ github.event.pull_request.head.sha }} + - name: Get Commit Message + id: commit_message + run: | + # OUTPUT Contains '[wheel build]' or '1' + set -xe + # (Optional) Use Utility script by source it + # source .github/scripts/utils.sh + COMMIT_MSG=$(git log --no-merges -1 --oneline) + echo "message=$COMMIT_MSG" >> $GITHUB_OUTPUT + RUN="0" + if [[ "$COMMIT_MSG" == *"[wheel build]"* ]]; then + RUN="1" + fi + echo "message=$RUN" >> $GITHUB_OUTPUT + echo github.ref ${{ github.ref }} + outputs: + message: ${{ steps.commit_message.outputs.message }} + + # Build the wheels for Linux, Windows and macOS for Python 3.9 and newer + build_wheels: + name: >- + Wheel-${{ matrix.python[0] }}-${{ join(matrix.buildplat, '-') }} + # Allows splitting a string across multiple lines while treating it as a single logical line. 
+ # ${{ matrix.buildplat[1] && format('-{0}', matrix.buildplat[1]) || '' }} + # ${{ matrix.buildplat[2] && format('-{0}', matrix.buildplat[2]) || '' }} + # ${{ matrix.buildplat[3] && format('-{0}', matrix.buildplat[3]) || '' }} + # ${{ matrix.buildplat[4] && format('-{0}', matrix.buildplat[4]) || '' }} + # ${{ matrix.buildplat[1] != '' && '-' || '' }}${{ matrix.buildplat[1] }} + # Wheel-cp310-macosx-arm64-accelerate-14.0 + if: >- + contains(needs.get_commit_message.outputs.message, '[wheel build]') || + contains(needs.get_commit_message.outputs.message, '1') || + github.event_name == 'schedule' || + github.event_name == 'workflow_dispatch' || + (github.event_name == 'push' && + startsWith(github.ref, 'refs/tags/v') && !endsWith(github.ref, 'dev0') + ) + needs: get_commit_message # OUTPUT Contains '[wheel build]' or '1' + runs-on: ${{ matrix.buildplat[0] }} + strategy: + # Ensure that a wheel builder finishes even if another fails + fail-fast: false + # Dynamic Environment Variables Using Matrix Context + matrix: + # Github Actions doesn't support pairing matrix values together, let's improvise + # https://github.com/github/feedback/discussions/7835#discussioncomment-1769026 + # python[0] is used to specify the python versions made by cibuildwheel + python: [ + ["cp310", "3.10"], # cp310 (CPython 3.10) Broadest library support + ["cp311", "3.11"], # pp310 (PyPy 3.10) Slower Just-In-Time (JIT) warm-up + ["cp312", "3.12"], + ["cp313", "3.13"], + # ["cp313t", "3.13"] # See: https://github.com/scipy/scipy/issues/21970 + ] + buildplat: + # should also be able to do multi-archs on a single entry, e.g. + # [windows-2019, win*, "AMD64 x86"]. However, those two require a different compiler setup + # so easier to separate out here. + - [ubuntu-22.04, manylinux, x86_64] + - [ubuntu-22.04, musllinux, x86_64] + # - [macos-13, macosx, x86_64, openblas, "10.13"] # Numcpp: C/C++ error: 'path' is unavailable: introduced in macOS 10.15 + - [macos-13, macosx, x86_64, accelerate, "14.0"] + - [macos-14, macosx, arm64, openblas, "12.3"] + - [macos-14, macosx, arm64, accelerate, "14.0"] + - [windows-2019, win, AMD64] # Window 64 bit ("*-win_amd64", "*-x86_64") + exclude: + # (Optional) Don't build PyPy 32-bit windows ("*-win32", "*-x86") + # Don't build PyPy 32-bit windows + - buildplat: [windows-2019, win32,] + python: "pp310" + + env: # Job level enviroment NAME: '', $NAME, ${{ env.NAME }} + IS_32_BIT: ${{ matrix.buildplat[2] == 'x86' }} # Build platform is 32-bit (e.g., 'x86', 'win32') + # Check current event is a push event with a tag starting with 'v' + # upload to staging if it's a push to a maintenance branch + # and the lastcommit message contains '[wheel build]' or '1' + # ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }} + # Triggered for specific branches (e.g., main, maintenance/**) + # Triggered for tags starting with 'v' + IS_PUSH: >- + ${{ + github.event_name == 'push' && + (startsWith(github.ref, 'refs/heads/main') || + contains( github.ref, 'maintenance') || + startsWith(github.ref, 'refs/tags/v') ) && + contains(needs.get_commit_message.outputs.message, '1') + }} + # Checks if the event is a schedule or manual workflow dispatch + IS_SCHEDULE_DISPATCH: >- + ${{ + github.event_name == 'schedule' || + github.event_name == 'workflow_dispatch' + }} + + steps: + - name: Checkout Repository with all submodules recursively + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + # Fetches all nested submodules or submodules: true + submodules: 
recursive + fetch-tags: true + + - name: Python Setup # Used to push the built wheels + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: "3.x" + + - name: Windows Setup MSVC (32|64-bit) + if: ${{ runner.os == 'Windows' }} + uses: bus1/cabuild/action/msdevshell@e22aba57d6e74891d059d66501b6b5aed8123c4d # v1 + with: + architecture: ${{ env.IS_32_BIT == 'true' && 'x86' || 'x64' }} + + - name: Windows Setup RTools (32|64-bit) + if: ${{ runner.os == 'Windows' }} + run: | + if ($env:IS_32_BIT -eq "true") { + # mingw-w32 + choco install --no-progress --skip-automatic -y rtools + echo "c:\rtools40\i386\bin;" >> $env:GITHUB_PATH + echo "PKG_CONFIG_PATH=c:/rtools40/i386/lib/pkgconfig" >> $env:GITHUB_ENV + } else { + # mingw-w64 + choco install --no-progress --skip-automatic -y rtools + echo "c:\rtools40\ucrt64\bin;" >> $env:GITHUB_PATH + echo "PKG_CONFIG_PATH=c:/rtools40/ucrt64/lib/pkgconfig" >> $env:GITHUB_ENV + } + # Verify Tools + echo cl /? >> $GITHUB_OUTPUT + echo gcc --version >> $GITHUB_OUTPUT + + - name: Windows Setup Environment Variables (set PKG_CONFIG_PATH) + if: ${{ runner.os == 'Windows' }} + run: | + # Install pkg-config via Chocolatey + choco install --no-progress --skip-automatic --stoponfirstfailure --checksum 6004DF17818F5A6DBF19CB335CC92702 pkgconfiglite + # Verify pkg-config installation and its version + pkg-config --version + # Set up the CIBW environment variable, in PowerShell (Windows) + # pkgconfig needs a complete path, and not just "./openblas since the + $env:CIBW = "${{ github.workspace }}/.openblas" + # Handle backslash in paths (convert to forward slash) + # build is run in a tmp dir (?) + # It seems somewhere in the env passing, `\` is not + # passed through, so convert it to '/' + $env:CIBW = $env:CIBW.Replace("\", "/") + # Export the environment variable for GitHub Actions + echo "PKG_CONFIG_PATH=$env:CIBW" >> $env:GITHUB_ENV + echo CIBW_ENVIRONMENT="PKG_CONFIG_PATH=$env:CIBW" >> $env:GITHUB_ENV + echo CIBW_ENVIRONMENT_WINDOWS="PKG_CONFIG_PATH=$env:CIBW" >> $env:GITHUB_ENV + # Debug the PKG_CONFIG_PATH and check if it's set correctly + echo "$PKG_CONFIG_PATH" >> $GITHUB_OUTPUT + + - name: macOS Setup + if: startsWith( matrix.buildplat[0], 'macos-' ) + run: | + # Needed due to https://github.com/actions/runner-images/issues/3371 + # Supported versions: https://github.com/actions/runner-images/blob/main/images/macos/macos-14-arm64-Readme.md + echo "FC=gfortran-13" >> "$GITHUB_ENV" + echo "F77=gfortran-13" >> "$GITHUB_ENV" + echo "F90=gfortran-13" >> "$GITHUB_ENV" + if [[ ${{ matrix.buildplat[3] }} == 'accelerate' ]]; then + echo CIBW_CONFIG_SETTINGS=\"setup-args=-Dblas=accelerate\" >> "$GITHUB_ENV" + # Always use preinstalled gfortran for Accelerate builds + ln -s $(which gfortran-13) gfortran + export PATH=$PWD:$PATH + echo "PATH=$PATH" >> "$GITHUB_ENV" + LIB_PATH=$(dirname $(gfortran --print-file-name libgfortran.dylib)) + fi + if [[ ${{ matrix.buildplat[4] }} == '10.13' ]]; then + # 20241017 macos-13 images span Xcode 14.1-->15.2 + # XCODE_VER='14.1' + # Check for Xcode 14 and set the highest available version dynamically + XCODE_VER=$(ls /Applications | grep -E 'Xcode_14\.' | sort -V | tail -n 1) + # Extract the full version number (e.g., 14.3.1 from Xcode_14.3.1.app) + XCODE_VER=$(echo "$XCODE_VER" | sed -E 's/Xcode_([0-9]+\.[0-9]+\.[0-9]+)\.app/\1/') + else + # XCODE_VER='15.2' + # Check for Xcode 15 and set the highest available version dynamically + XCODE_VER=$(ls /Applications | grep -E 'Xcode_15\.' 
| sort -V | tail -n 1) + # Extract the full version number (e.g., 15.4.0 from Xcode_15.4.0.app) + XCODE_VER=$(echo "$XCODE_VER" | sed -E 's/Xcode_([0-9]+\.[0-9]+\.[0-9]+)\.app/\1/') + fi + echo "Selected Xcode version: $XCODE_VER" + # Use the selected Xcode version in the xcode-select command + CIBW="sudo xcode-select -s /Applications/Xcode_${XCODE_VER}.app" + echo "CIBW_BEFORE_ALL=$CIBW" >> $GITHUB_ENV + # setting SDKROOT necessary when using the gfortran compiler + # installed in cibw_before_build_macos.sh + sudo xcode-select -s /Applications/Xcode_${XCODE_VER}.app + CIBW="MACOSX_DEPLOYMENT_TARGET=${{ matrix.buildplat[4] }}\ + SDKROOT=$(xcrun --sdk macosx --show-sdk-path)\ + PKG_CONFIG_PATH=${{ github.workspace }}/.openblas" + echo "CIBW_ENVIRONMENT=$CIBW" >> "$GITHUB_ENV" + + echo "REPAIR_PATH=$LIB_PATH" >> "$GITHUB_ENV" + + PREFIX=DYLD_LIBRARY_PATH="\$(dirname \$(gfortran --print-file-name libgfortran.dylib))" + # remove libgfortran from location used for linking (if any), to + # check wheel has bundled things correctly and all tests pass without + # needing installed gfortran + POSTFIX=" sudo rm -rf /opt/gfortran-darwin-x86_64-native &&\ + sudo rm -rf /usr/local/gfortran/lib" + CIBW="$PREFIX delocate-listdeps -d {wheel} && echo "-----------" &&\ + $PREFIX delocate-wheel -v $EXCLUDE --require-archs \ + {delocate_archs} -w {dest_dir} {wheel} && echo "-----------" &&\ + delocate-listdeps -d {dest_dir}/*.whl && echo "-----------" &&\ + $POSTFIX" + + # Rename x86 Accelerate wheel to test on macOS 13 runner + if [[ ${{ matrix.buildplat[0] }} == 'macos-13' && ${{ matrix.buildplat[4] }} == '14.0' ]]; then + CIBW+=" && mv {dest_dir}/\$(basename {wheel}) \ + {dest_dir}/\$(echo \$(basename {wheel}) | sed 's/14_0/13_0/')" + fi + + # macos-arm64-openblas wheels that target macos-12 need a + # MACOS_DEPLOYMENT_TARGET of 12.3 otherwise delocate complains. + # Unclear of cause, possibly build tool related. + # This results in wheels that have 12_3 in their name. Since Python + # has no concept of minor OS versions in packaging land rename the + # wheel back to 12. + if [[ ${{ matrix.buildplat[0] }} == 'macos-14' && ${{ matrix.buildplat[4] }} == '12.3' ]]; then + CIBW+=" && echo \$(ls {dest_dir}) && \ + mv {dest_dir}/*.whl \$(find {dest_dir} -type f -name '*.whl' | sed 's/12_3/12_0/')" + fi + echo "CIBW_REPAIR_WHEEL_COMMAND_MACOS=$CIBW" >> "$GITHUB_ENV" + + # - name: Inject environment variable for python dev version + # if: ${{ matrix.python[1] == '3.14-dev' + # shell: bash + # run: | + # # For dev versions of python need to use wheels from scientific-python-nightly-wheels + # # When the python version is released please comment out this section, but do not remove + # # (there will soon be another dev version to target). 
+ # DEPS0="pip install --pre -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy" + # DEPS1="pip install ninja meson-python pybind11 pythran cython" + + # CIBW="$DEPS0;$DEPS1;bash {project}/tools/wheels/cibw_before_build_linux.sh {project}" + # echo "CIBW_BEFORE_BUILD_LINUX=$CIBW" >> "$GITHUB_ENV" + + # CIBW="$DEPS0 && $DEPS1 && bash {project}/tools/wheels/cibw_before_build_win.sh {project}" + # echo "CIBW_BEFORE_BUILD_WINDOWS=$CIBW" >> "$GITHUB_ENV" + + # CIBW="$DEPS0;$DEPS1;bash {project}/tools/wheels/cibw_before_build_macos.sh {project}" + # echo "CIBW_BEFORE_BUILD_MACOS=$CIBW" >> "$GITHUB_ENV" + + # echo "CIBW_BEFORE_TEST=$DEPS0" >> "$GITHUB_ENV" + + - name: Disable build isolation for python free-threaded version + if: endsWith(matrix.python[0], 't') || matrix.python == 'cp313t' + shell: bash -el {0} + run: | + CIBW="pip; args: --no-build-isolation" + echo "CIBW_BUILD_FRONTEND=$CIBW" >> "$GITHUB_ENV" + + - name: Build Wheels Packages + uses: pypa/cibuildwheel@ee63bf16da6cddfb925f542f2c7b59ad50e93969 # v2.22.0 + env: # Step level enviroment NAME: '', $NAME, ${{ env.NAME }} + # Set CIBW_PRERELEASE_PYTHONS to True to build for pre-release versions of Python + CIBW_PRERELEASE_PYTHONS: True + # Enable support for free-threaded builds (specific to some platforms) + CIBW_FREE_THREADED_SUPPORT: True + # Specify target platform + CIBW_BUILD: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }}* + CIBW_ARCHS: ${{ matrix.buildplat[2] }} + + - name: macOS Wheels Packages Rename + if: startsWith( matrix.buildplat[0], 'macos-' ) + run: | + # macos-x86_64-accelerate wheels targeting macos-14 were renamed to 13 + # so they could be tested. Shift wheel name back to targeting 14. + if [[ ${{ matrix.buildplat[0] }} == 'macos-13' && ${{ matrix.buildplat[4] }} == '14.0' ]]; then + mv ./wheelhouse/*.whl $(find ./wheelhouse -type f -name '*.whl' | sed 's/13_0/14_0/') + fi + + - name: Upload Wheelhouse Wheels Packages + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 + with: + name: >- + ${{ matrix.python[0] }}-${{ join(matrix.buildplat, '-') }} + # ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }} + # ${{ matrix.buildplat[2] }} ${{ matrix.buildplat[3] }} + # ${{ matrix.buildplat[4] }} + path: ./wheelhouse/*.whl + + - name: Conda environment creation and activation anaconda-client + uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0 + with: + # for installation of anaconda-client, required for upload to + # anaconda.org + # Note that this step is *after* specific pythons have been used to + # build and test the wheel + # for installation of anaconda-client, for upload to anaconda.org + # environment will be activated after creation, and in future bash steps + # environment-file: devtools/conda-envs/build_env.yaml # Path to the build conda environment + # auto-activate-base: false + auto-update-conda: true + show-channel-urls: true + miniforge-version: latest + python-version: "3.10" + + - name: Upload Conda Wheels Packages + if: success() + # see https://github.com/marketplace/actions/setup-miniconda for why + # `-el {0}` is required. 
+ shell: bash -el {0} + run: | + # https://github.com/marketplace/actions/build-and-upload-conda-packages + # # https://docs.anaconda.com/anacondaorg/user-guide/packages/standard-python-packages/ + conda install -y anaconda-client + source tools/wheels/upload_wheels.sh + set_upload_vars # Required TOKEN, USERNAME + # For cron jobs (restricted to main branch) or "Run workflow" trigger + # an upload to: + # + # https://anaconda.org/scikit-plots-wheels-staging-nightly/lightnumpy + # + # Pushes to a maintenance branch that contain '[wheel build]' will + # cause wheels to be built and uploaded to: + # + # https://anaconda.org/scikit-plots-wheels-staging/lightnumpy + # + # The tokens were originally generated at anaconda.org + upload_wheels # Required WHEEL_FILE_PATH + env: + # ${{ secrets. }} retrieves the value of a secret stored securely + # in the Repository's Secrets or Enviroments Secrets settings on GitHub. + SKPLT_STAGING_UPLOAD_TOKEN: ${{ secrets.SKPLT_STAGING_UPLOAD_TOKEN }} # Use as TOKEN + # commented out so the sdist doesn't upload to nightly + SKPLT_STAGING_UPLOAD_TOKEN_NIGHTLY: ${{ secrets.SKPLT_STAGING_UPLOAD_TOKEN_NIGHTLY }} # Use as TOKEN + + build_sdist: + name: Build Source (sdist) Packages + if: >- + contains(needs.get_commit_message.outputs.message, '[wheel build]') || + github.event_name == 'schedule' || + github.event_name == 'workflow_dispatch' || + (github.event_name == 'pull_request' && + (contains(github.event.pull_request.labels.*.name, '36 - Build') || + contains(github.event.pull_request.labels.*.name, '14 - Release'))) || + (github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && ( ! endsWith(github.ref, 'dev0'))) + needs: get_commit_message + runs-on: ubuntu-latest + env: + # Check current event is a push event with a tag starting with 'v' + # upload to staging if it's a push to a maintenance branch + # and the lastcommit message contains '[wheel build]' or '1' + # ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }} + # Triggered for specific branches (e.g., main, maintenance/**) + # Triggered for tags starting with 'v' + IS_PUSH: >- + ${{ + github.event_name == 'push' && + (startsWith(github.ref, 'refs/heads/main') || + contains( github.ref, 'maintenance') || + startsWith(github.ref, 'refs/tags/v') ) && + contains(needs.get_commit_message.outputs.message, '1') + }} + # Checks if the event is a schedule or manual workflow dispatch + IS_SCHEDULE_DISPATCH: >- + ${{ + github.event_name == 'schedule' || + github.event_name == 'workflow_dispatch' + }} + + steps: + - name: Checkout lightnumpy + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + submodules: true + # Used to push the built wheels + - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + # Build sdist on lowest supported Python + python-version: "3.10" + - name: Build sdist + run: | + python -m pip install -U pip build setuptools wheel + python -m build --sdist -Csetup-args=-Dallow-noblas=true + - name: Test the sdist + run: | + # TODO: Don't run test suite, and instead build wheels from sdist + # Depends on pypa/cibuildwheel#1020 + python -m pip install dist/*.gz -Csetup-args=-Dallow-noblas=true + pip install -r requirements/test_requirements.txt + cd .. 
# Can't import "scikitplot" within "scikitplot" src directory + # python -c "import scikitplot, sys; print(scikitplot.__version__); sys.exit(numpy.test() is False)" + python -c "import scikitplot, sys; print(scikitplot.__version__);" + + - name: PyPI Check README rendering + run: | + python -m pip install twine + twine check dist/* + + - name: Upload Wheelhouse Wheels Packages + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 + with: + name: sdist + path: ./dist/* + + - name: Conda environment creation and activation anaconda-client + uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0 + with: + # for installation of anaconda-client, required for upload to + # anaconda.org + # Note that this step is *after* specific pythons have been used to + # build and test the wheel + # for installation of anaconda-client, for upload to anaconda.org + # environment will be activated after creation, and in future bash steps + # environment-file: devtools/conda-envs/build_env.yaml # Path to the build conda environment + # auto-activate-base: false + auto-update-conda: true + show-channel-urls: true + miniforge-version: latest + python-version: "3.10" + + - name: Upload Conda Source (sdist) Packages + if: success() + # see https://github.com/marketplace/actions/setup-miniconda for why + # `-el {0}` is required. + shell: bash -el {0} + run: | + # https://github.com/marketplace/actions/build-and-upload-conda-packages + # # https://docs.anaconda.com/anacondaorg/user-guide/packages/standard-python-packages/ + conda install -y anaconda-client + source tools/wheels/upload_wheels.sh + set_upload_vars # Required TOKEN, USERNAME + # For cron jobs (restricted to main branch) or "Run workflow" trigger + # an upload to: + # + # https://anaconda.org/scikit-plots-wheels-staging-nightly/lightnumpy + # + # Pushes to a maintenance branch that contain '[wheel build]' will + # cause wheels to be built and uploaded to: + # + # https://anaconda.org/scikit-plots-wheels-staging/lightnumpy + # + # The tokens were originally generated at anaconda.org + upload_wheels # Required WHEEL_FILE_PATH + env: + # ${{ secrets. }} retrieves the value of a secret stored securely + # in the Repository's Secrets or Enviroments Secrets settings on GitHub. + SKPLT_STAGING_UPLOAD_TOKEN: ${{ secrets.SKPLT_STAGING_UPLOAD_TOKEN }} # Use as TOKEN + # commented out so the sdist doesn't upload to nightly + SKPLT_STAGING_UPLOAD_TOKEN_NIGHTLY: ${{ secrets.SKPLT_STAGING_UPLOAD_TOKEN_NIGHTLY }} # Use as TOKEN \ No newline at end of file diff --git a/.gitignore b/.gitignore index 04af9b2..9db47c7 100644 --- a/.gitignore +++ b/.gitignore @@ -105,4 +105,4 @@ pip-delete-this-directory.txt atlassian-ide-plugin.xml # OS droppings -**.DS_Store \ No newline at end of file +**.DS_Store \ No newline at end of file diff --git a/CITATION.bib b/CITATION.bib index 170ac81..721f642 100644 --- a/CITATION.bib +++ b/CITATION.bib @@ -1,9 +1,3 @@ -% --------------------------------------------------------- -% CITATION.bib file for lightnumpy -% This file provides citation information for users -% who want to cite the library, related papers, and books. 
-% --------------------------------------------------------- - @misc{lightnumpy:vlatest, author = { scikit-plots developers }, title = { lightnumpy: A lightweight version of NumPy (or similar functionality) }, diff --git a/CITATION.cff b/CITATION.cff index a7db41d..55f9f5b 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -1,9 +1,3 @@ -# --------------------------------------------------------- -# CITATION.cff file for lightnumpy -# This file provides citation information for users -# who want to cite the library, related papers, and books. -# --------------------------------------------------------- - cff-version: 1.2.0 # The version of the CFF format used. message: "If you use this software, please cite it using the following metadata." title: "lightnumpy: A lightweight version of NumPy (or similar functionality)" diff --git a/LICENSE b/LICENSE index 0ec7b9a..8caa6cc 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ BSD-3 Clause License -Copyright (c) [2024 - ], [scikit-plots Developers] +Copyright (c) [2024 - ], [scikit-plots Developers] Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/MANIFEST.in b/MANIFEST.in index 2c1a288..233a03d 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -10,7 +10,7 @@ recursive-include lightnumpy * ## Include all non-Python files in your package directory (e.g., data files, config files) ## Replace `your_package_name` with the actual name of your package -# recursive-include scikitplot *.txt *.csv *.json *.yaml +# recursive-include lightnumpy *.txt *.csv *.json *.yaml ## Exclude unnecessary files or directories (useful for keeping the package clean) # prune tests # Exclude the tests directory from the distribution diff --git a/docs/source/faq.rst b/docs/source/faq.rst index d24bfb7..f6aebf8 100644 --- a/docs/source/faq.rst +++ b/docs/source/faq.rst @@ -2,4 +2,4 @@ Frequently Asked Questions ========================== Q: How is LightNumPy different from NumPy? -A: LightNumPy is lightweight, modular, and supports GPU/TPU operations. +A: LightNumPy is lightweight, modular, and supports GPU/TPU operations. \ No newline at end of file diff --git a/galleries/examples/tensor_example.py b/galleries/examples/tensor_example.py deleted file mode 100644 index e69de29..0000000 diff --git a/lightnumpy/_core/meson.build b/lightnumpy/_core/meson.build index 845f9b2..fa223bd 100644 --- a/lightnumpy/_core/meson.build +++ b/lightnumpy/_core/meson.build @@ -11,7 +11,7 @@ if get_option('lnpy_use_numcpp').enabled() _global_numcpp_args += [ # The -D flag defines a macro that is globally available during compilation. # The -U flag removes a macro definition during compilation, effectively ignoring it. - '-DLNPY_USE_NUMCPP', # Default disable all NumCpp features + '-DLNPY_USE_NUMCPP', # Enable all NumCpp features '-DNUMCPP_INCLUDE_PYBIND_PYTHON_INTERFACE', # Include PyBind11 Python interface helper functions # The -f option in compilers like gcc or g++ is used to enable # or disable specific compiler features or behaviors. 
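(Aside: a minimal sketch, not taken from the lightnumpy sources, of what the `-D`/`-U` macro flags discussed in this meson.build hunk mean for the compiled code; the message strings below are hypothetical.)

```cpp
// Illustrative only: effect of building with -DLNPY_USE_NUMCPP vs. -U LNPY_USE_NUMCPP.
#include <cstdio>

int main() {
#ifdef LNPY_USE_NUMCPP
    // Present when meson passes '-DLNPY_USE_NUMCPP': NumCpp-backed paths are compiled in.
    std::puts("NumCpp features enabled");
#else
    // Present when the macro is removed with '-U LNPY_USE_NUMCPP': NumCpp paths are compiled out.
    std::puts("NumCpp features disabled");
#endif
    return 0;
}
```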
@@ -48,7 +48,7 @@ if get_option('lnpy_use_numcpp').enabled() endif else _global_numcpp_args += [ - '-U LNPY_USE_NUMCPP', # Default disable all NumCpp features + '-U LNPY_USE_NUMCPP', # Disable all NumCpp features ] endif diff --git a/lightnumpy/_gpu_core/__init__.py b/lightnumpy/_gpu_core/__init__.py index e69de29..912d677 100644 --- a/lightnumpy/_gpu_core/__init__.py +++ b/lightnumpy/_gpu_core/__init__.py @@ -0,0 +1,2 @@ +# Here Sample: numC++: Accelerated GPU Library for C++, with NumPy syntax +# https://github.com/Sha-x2-nk/numCpp \ No newline at end of file diff --git a/lightnumpy/_gpu_core/include/.gitkeep b/lightnumpy/_gpu_core/include/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/lightnumpy/_gpu_core/include/b.h b/lightnumpy/_gpu_core/include/b.h new file mode 100644 index 0000000..d7a8b49 --- /dev/null +++ b/lightnumpy/_gpu_core/include/b.h @@ -0,0 +1,5 @@ +#define N 8 + +extern __device__ int g[N]; + +extern __device__ void bar(void); \ No newline at end of file diff --git a/lightnumpy/_gpu_core/include/cuda_helpers.hpp b/lightnumpy/_gpu_core/include/cuda_helpers.cuh similarity index 100% rename from lightnumpy/_gpu_core/include/cuda_helpers.hpp rename to lightnumpy/_gpu_core/include/cuda_helpers.cuh diff --git a/lightnumpy/_gpu_core/include/gpu_array_ops.cuh b/lightnumpy/_gpu_core/include/gpu_array_ops.cuh new file mode 100644 index 0000000..28cd185 --- /dev/null +++ b/lightnumpy/_gpu_core/include/gpu_array_ops.cuh @@ -0,0 +1,38 @@ +#pragma once // Ensure the file is included only once by the compiler + +#ifndef GPU_ARRAY_OPS_CUH +#define GPU_ARRAY_OPS_CUH + +#include "numC++/npGPUArray.cuh" +#include "numC++/gpuConfig.cuh" + +#ifdef __cplusplus +extern "C" { +#endif + +// Wrapper for initializing GPU configuration +void init_gpu_config(int device_id); + +// Create a GPU array with default values +void* create_gpu_array(int rows, int cols, float default_value); + +// Create a GPU array from a 1D array +void* create_gpu_array_1d(const float* data, int size); + +// Create a GPU array from a 2D array +void* create_gpu_array_2d(const float* data, int rows, int cols); + +// Get GPU array metadata +int get_gpu_array_rows(void* array); +int get_gpu_array_cols(void* array); +int get_gpu_array_size(void* array); +float* get_gpu_array_data(void* array); + +// Clean up GPU array +void delete_gpu_array(void* array); + +#ifdef __cplusplus +} +#endif + +#endif // GPU_ARRAY_OPS_CUH \ No newline at end of file diff --git a/lightnumpy/_gpu_core/include/gpu_ops.hpp b/lightnumpy/_gpu_core/include/gpu_ops.hpp deleted file mode 100644 index b092983..0000000 --- a/lightnumpy/_gpu_core/include/gpu_ops.hpp +++ /dev/null @@ -1 +0,0 @@ -// Template: Declare GPU operations using CUDA. \ No newline at end of file diff --git a/lightnumpy/_gpu_core/include/numC++/README.md b/lightnumpy/_gpu_core/include/numC++/README.md new file mode 100644 index 0000000..c8f9870 --- /dev/null +++ b/lightnumpy/_gpu_core/include/numC++/README.md @@ -0,0 +1,258 @@ +# numC++: Accelerated GPU Library for C++, with NumPy syntax +numC++ is a C++ library inspired by numpy, designed for accelerated numerical computations on GPUs. It provides a familiar syntax similar to numpy, making it easy for users familiar with numpy to transition to GPU-accelerated computing seamlessly while working on C++. + +Currently, numC++ only supports upto 2D arrays. + +## Installation +To use numC++, follow these steps: + +1. Clone the repository: + + `git clone https://github.com/Sha-x2-nk/numC++.git` +2. 
Include whatever numC++ headers / functions you require: + ```cpp + #include "numC++/npGPUArray.cuh" + ``` + +3. compile you numC++ including program using nvcc, and also compile and link gpuConfig.cu file. + +## Features +### ArrayGPU Class +The ArrayGPU class provides functionalities for creating, manipulating, and performing operations on GPU-accelerated arrays. Here are some key features: + +* Creation and initialization of arrays + +```cpp +#include "numC++/npGPUArray.cuh" +#include "numC++/gpuConfig.cuh" + +int main(){ + np::getGPUConfig(0); + /* mandatory call, once in main function. this function gets gpu config, i.e. number + of cuda cores and number of SMs to efficiently schedule algorithms on GPU. */ + + auto A = np::ArrayGPU(10, 2); // creates a 10x2 array filled with 0s' + A = np::ArrayGPU(10, 2, 5); // 10x2 array with all elements as 5. + A = np::ArrayGPU({1, 2, 4, 5, 6}); // supplying matrix + A = np::ArrayGPU({ + {1.5, 2.5, 3.5, 4.5, 5.5, 6.5}, + {0.1, 0.2, 0.3, 0.4, 0.5, 0.6}, + }); // supplying 2d matrix + + + + int r = A.rows; // gets number of rows + int c = A.cols; // gets number of cols + int sz = A.size(); // gets size, rows * cols + float *m = A.mat; // pointer pointing to array in global memory of GPU +} +``` +* Reshaping arrays +```cpp + A.reshape(5, 4); // reshapes it to 5x4 array. + // reshape fails when new size does not match old size +``` +* Getter, setter +```cpp +auto A = np::ArrayGPU(10, 5); +float v = A.at(0); // assumes array as linear and gives number at position. +v = A.at(5, 4); // gives number at 5, 4 position. +/* above functions return the value as typename(float in this case), and the variable is +transferred to CPU RAM. Hence printing by direct for loop will take time - each variable +will be moved to CPU RAM from GPU RAM, numC++ has print and cout for that..*/ + +/* numC++ also has support for indexing multiple elements at once, like numpy. This will +become helpful, when we introduce np::arange function*/ + +auto C = A.at(np::ArrayGPU idxs); // will return a 1D ArrayGPU with all the elements + // from idxs given as parameter. order maintained. +// Ci = Ai where i is from idxs. + +auto C = A.at(np::ArrayGPU row, np::ArrayGPU col); // Ci = A(rowi, coli). + +// set function also has similar APIs +C.set(0, 5); // sets 0th index element as 5 +C.set(np::ArrayGPU idxs, np::ArrayGPU val); +C.set(np::ArrayGPU rows, np::ArrayGPU cols, np::ArrayGPU val); +``` +* print function. +```cpp +auto A = np::ArrayGPU(1024, 1024); +A.print(); // prints whole array +std::cout<(10, 2); + auto B = np::ArrayGPU(10, 2); + // currently only same type arrays are supported for operators. + auto C = A + B; + + // broadcasting + C = A + 5; + C = A + np::ArrayGPU(1, 2); + C = A + np::ArrayGPU(10, 1); + C = A + np::ArrayGPU(1, 1, 0); + + // shown for +, but also works with -, *, / operators +``` +* Comparison operations (>, <, >=, <=, ==, !=). Returns array of 0s and 1s, depending on condition. (Supports broadcasting) +```cpp + auto A = np::ArrayGPU(10, 2); + auto B = np::ArrayGPU(10, 2); + // currently only same type arrays are supported for operators. + auto C = A < B; + + // broadcasting + C = A < 5; + C = A < np::ArrayGPU(1, 2); + C = A < np::ArrayGPU(10, 1); + C = A < np::ArrayGPU(1, 1, 0); + + // shown for <, but also works with <=, >, >=, ==, != operators +``` +* Transpose. returns a new transposed array. Transposition kernel is optimised and delivers ~97% of copy speeds during transposition. 
+```cpp + auto A = np::ArrayGPU(10, 2); + auto AT = A.T(); +``` +* dot product - only supported for float32 dtype. +```cpp +auto A = np::ArrayGPU(128, 1024); +auto B = np::ArrayGPU(1024, 128); +auto C = A.dot(B); + +// other dot functions +B = np::ArrayGPU(128, 1024); +C = A.Tdot(B); // A transpose dot B + +C = A.dotT(B); // A dot B transpose + +``` +* Statistical functions (sum, max, min, argmax, argmin). Returns a new array. additional argument - axis +```cpp +auto A = np::ArrayGPU(128, 1024); + +// default axis = -1. i.e. total sum +auto C = A.sum(); // returns 1x1 array +C = A.sum(0); // column wise sum. returns array of dimension 1x1024 +C = A.sum(1); // row wise sum. returns array of dimension 128x1 + +/* works similarly with sum, max, min, argmax, argmin. +argmax, argmin return indexes of element instead of elements. return type is mandatorily +np::ArrayGPU for these functions.*/ +``` + +### npFunctions header +* ones, zeros and arange +```cpp +#include "numC++/npFunctions.cuh" + +// call gpuConfig + +auto C = np::ones(10); // 1d array of 1s +C = np::ones(10, 10); // 2d array of 1s + +C = np::zeros(10, 10); // 1d array of 0s +C = np::zeros(10, 10); // 2d array of 0s + +C = np::arange(10); // 1d array with numbers from 0 to 9, all at their respective + // indexes. Immensely powerful for collective indexing, as + // shown earlier +``` + +* maximum, minimum +```cpp +#include "numC++/npFunctions.cuh" + +// call gpuConfig +auto A = np::ArrayGPU(10, 5, 7); // 10x5 array, fill with 7 +auto B = np::ArrayGPU(10, 5, 6); + +auto C = np::maximum(A, B); + +// broadcasting +C = np::maximum(A, 0); +C = np::maximum(A, np::ArrayGPU(10, 1)); +C = np::maximum(A, np::ArrayGPU(1, 5)); + +// works +``` +* exp, log, square, sqrt, pow +```cpp +#include "numC++/npFunctions.cuh" + +// call gpuConfig +auto A = np::ArrayGPU(10, 5, 7); // 10x5 array, fill with 7 + +auto C = np::sqrt(A); // returns an array after applying function element wise. +// similar syntax for square, log, exp. + +C = np::pow(A, 15); // returns an array after raising all elements by a power of pow. + // (pow is float) +``` +* shuffle +```cpp +#include "numC++/npFunctions.cuh" + +// call gpuConfig +auto A = np::arange(1000); // array with numbers from 0 - 999 + +auto C = np::shuffle(A); // shuffles array randomly. (permutes) +``` +* array_split +```cpp +#include "numC++/npFunctions.cuh" + +// call gpuConfig +auto A = np::arange(1000); // array with numbers from 0 - 999 + +auto batches = np::array_split(A, 5, 0); // array, num_parts, axis. + // currently only axis = 0 is supported. +// returns a std::vector of arrays. +// will split even if arrays formed will be unequal. +// will create i%n arrays of size i/n + 1 and rest of size i/n +``` +### Random Class +Random array generation (uniform and normal distributions) +* Uniform distribution +```cpp +#include "numC++/npFunctions.cuh" + +// call gpuConfig + +auto A = np::Random::rand(1, 100); // filled with numbers from uniform distribution + // between 0 and 1 + // third argument can also be given - seed. +auto A = np::Random::rand(1, 100, 20, 50); // filled with numbers from uniform + // distribution between 20 and 50 + // fifth argument can also be given - seed. +``` +* Normal distribution +```cpp +#include "numC++/npFunctions.cuh" + +// call gpuConfig + +auto A = np::Random::randn(1, 100); // filled with numbers from normal distribution + // between 0 and 1 + // third argument can also be given - seed. 
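+// Added example (not in the upstream README): the optional third 'seed' argument
+// mentioned above can be passed for reproducible draws; the value 42 is arbitrary.
+auto B = np::Random::randn(1, 100, 42);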
+``` + +### GPU Config header +Initialises variables of NUM_CUDA_CORES and NUM_SMS to launch gpu functions effectively. Also Initialises cublas_handle to do dot products using cubals sgemm API. + +### Custom Kernels header +This has definitions of kernels of all functions we have used in numC++ which runs on GPU (except dot, dot is from cublas). + +## Contribution and Future Development +While NumPy offers a vast array of commonly used functions such as sort, argsort, and more, this project currently focuses on a specific set of functionalities. For my immediate needs, I've implemented the necessary functions; however, I may revisit this project in the future to expand its capabilities. + +Contributions to enhance and add new functionalities are welcome! If you find areas where additional features could benefit the project or have ideas for improvements, feel free to contribute by opening an issue or submitting a pull request on GitHub. + +## Acknowledgements +- CuBLAS, cuda blas library for dot product +- The kernels used here have been a result of lots of code browsing and book - programming massively parallel processors. Acknowledgement for most signification resources has been done in my cuda-projects repository. + + diff --git a/lightnumpy/_gpu_core/include/numC++/customKernels.cuh b/lightnumpy/_gpu_core/include/numC++/customKernels.cuh new file mode 100644 index 0000000..848786f --- /dev/null +++ b/lightnumpy/_gpu_core/include/numC++/customKernels.cuh @@ -0,0 +1,1964 @@ +#ifndef CUSTOMKERNELS_CUH +#define CUSTOMKERNELS_CUH + +#include // for Operator types + +#include +#include +#include +#include + +#include // printf +#include // for std::is_same + +// curand kernels +// for uniform distribution +template +__global__ void kernelInitializeRandomUnif(TP *arr, const int size, const unsigned long long seed); + +// for uniform distribution range = [lo, hi] +template +__global__ void kernelInitializeRandomUnif(TP *arr, const int size, const int lo, const int hi, const unsigned long long seed); + +// for normal distribution +template +__global__ void kernelInitializeRandomNorm(TP *arr, const int size, const unsigned long long seed); + +// 1 x 1 grid to print matrices +template +__global__ void kernelPrintMat(const TP *in, const int M, const int N); + +// kernel to transpose an array. +template +// TILE_WIDTH = 32. 1 block copies 32 x 32 elements. +__global__ void kernelTransposeInMem(const TP *in, TP *out, const int M, const int N); + +// kernel to initialise all values of array to val. +template +__global__ void kernelInitMatBroadcast(TP *in, const TP Val, const int size); + +// kernel to initialised arange array -> [0 to range), spaced by 1 +template +__global__ void kernelInitMatArange(TP *in, const int range); + +// kernel to get values at a range of indexes. +template +__global__ void kernelGetMatValues(const TP *in, TP *out, const int *idxs, const int size); + +// kernel to get values at a range of indexes. 
+template +__global__ void kernelGetMatValues(const TP *in, const int rdin, TP *out, const int *rows, const int *cols, const int size); + +// sets A[i] to A[i] OP B, for i in idxs +template +__global__ void kernelSetMat(TP *A, const TP B, const int *idxs, const int size); + +// sets A[i] to A[i] OP B[i], for i in idxs +template +__global__ void kernelSetMat(TP *A, const TP *B, const int *idxs, const int size); + +// sets A[r][c] to A[r][c] OP B, for r, c in zip(r_idxs, c_idxs) +template +__global__ void kernelSetMat(TP *A, const int rdin, const TP B, const int *r_idxs, const int *c_idxs, const int size); + +// sets A[r][c] to A[r][c] OP B[r][c], for r, c in zip(r_idxs, c_idxs) +template +__global__ void kernelSetMat(TP *A, const int rdin, const TP *B, const int *r_idxs, const int *c_idxs, const int size); + +// operator functions + +// perform operator on corrosponding elements of 2 matrices +// C = A op B +template +__global__ void kernelMatOpMat(const TP *A, const TP *B, TP *C, const int size); +template +__global__ void kernelScalarOpMat(const TP Scal, const TP *A, TP *C, const int size); +// operator on matrix and a scalar. +// C = A op Scal (broadcasting) +// op scalar to all values of the matrix +template +__global__ void kernelMatOpScalar(const TP *A, const TP Scal, TP *C, const int size); +template +__global__ void kernelScalarOpMat(const TP Scal, const TP *A, TP *C, const int size); + +// op on matrix and vector, mat.rows = vec.dim +// C = A op V (broadcasting) +// shapeA = M x N matrix +template +__global__ void kernelMatOpVecAlongCols(const TP *A, const TP *V, TP *C, const int size, const int N); +template +__global__ void kernelVecOpMatAlongCols(const TP *V, const TP *A, TP *C, const int size, const int N); + +// operator on matrix and vector, mat.cols = vec.dim +// C = A op V (broadcasting) +// shapeA = M x N matrix +template +__global__ void kernelMatOpVecAlongRows(const TP *A, const TP *V, TP *C, const int size, const int N); +template +__global__ void kernelVecOpMatAlongRows(const TP *V, const TP *A, TP *C, const int size, const int N); + +// maxmin +// compare 2 matrix ( element wise ) and put max / min value in result matrix. +// A = MxN +// B = MxN +// Ci = max(Ai, Bi). 
(elementwise) +template +__global__ void kernelMatMaxminMat(const TP *A, const TP *B, TP *C, const int size); + +template +__global__ void kernelMatMaxminScalar(const TP *A, const TP B, TP *C, const int size); + +template +__global__ void kernelMatMaxminVecAlongCols(const TP *A, const TP *V, TP *C, const int size, const int N); + +template +__global__ void kernelMatMaxminVecAlongRows(const TP *A, const TP *V, TP *C, const int size, const int N); + +// npfunctions +// functions per element +// A = MxN +// C = MxN +// Ci = F(Ai) +template +__global__ void kernelFMat(const TP *A, TP *C, const int size); + +// np.pow +// A = MxN +// C = MxN +// Ci = square(Ai) +template +__global__ void kernelPowMat(const TP *A, const float power, TP *C, const int size); + +// REDUCTION +// warp unroll +template +__device__ void kernelWarpReduceF(volatile TP *s_A, const int tid); + +template +__global__ void kernelReduceF(const TP *A, TP *output, const int size); + +// warp unroll +template +__device__ void kernelWarpReduceArgF(volatile TP *s_A, volatile int *s_Idx, const int tid); + +template +__global__ void kernelReduceArgF(const TP *A, TP *outputMax, int *outputIdx, const int size); + +// second reduction k time p -> idx serial nhi h, to ek idx ka bhi array dena hoga +template +__global__ void kernelReduceArgF(const TP *A, const int *A_idx, TP *outputMax, int *outputIdx, const int size); + +// np.shuffle +template +__global__ void kernelMatShuffle(TP *A, const int size); + +// ########## FUNCTION DEFINITIONS + +// for uniform distribution +template +__global__ void kernelInitializeRandomUnif(TP *arr, const int size, int lo, int hi, const unsigned long long seed) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + idx *= 50; + if (idx < size) + { + curandState state; + curand_init(seed, idx, 0, &state); // Initialize curand state for each thread + arr[idx] = curand_uniform(&state); // Generate a random value + + for(int i= 1; i< 50 && idx + i< size; ++i){ + arr[idx + i] = ( curand_uniform(&state) * (hi - lo) ) + lo; // generate random value + } + } +} + +// for normal distribution +template +__global__ void kernelInitializeRandomNorm(TP *arr, const int size, const unsigned long long seed) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + idx *= 50; + if (idx < size) + { + curandState state; + curand_init(seed, idx, 0, &state); // Initialize curand state for each thread + arr[idx] = curand_normal(&state); // Generate a random value + for(int i= 1; i < 50 && idx + i< size; ++i){ + arr[idx + i] = curand_normal(&state); // generate random value + } + } +} + +// 1 x 1 grid to print matrices +template +__global__ void kernelPrintMat(const TP *in, const int M, const int N) +{ + // 1, 1 grid. only to print matrix + for (int r = 0; r < M; ++r) + { + for (int c = 0; c < N; ++c) + { + if constexpr (std::is_same::value) + { + printf("%d ", in[r * N + c]); + } + else if constexpr (std::is_same::value) + { + printf("%f ", in[r * N + c]); + } + else if constexpr (std::is_same::value) + { + printf("%lf ", in[r * N + c]); + } + else if constexpr (std::is_same::value) + { + printf("%c ", in[r * N + c]); + } + else + { + // Handle other types here + printf("Unsupported type in kernelPrintMat"); + } + } + printf("\n"); + } +} + +// kernel to transpose an array. +template +// TILE_WIDTH = 32. 
32 x 32 copy krega ek matrix +__global__ void kernelTransposeInMem(const TP *in, TP *out, const int M, const int N) +{ + __shared__ TP tile[TILE_DIM][TILE_DIM + 1]; + int x = blockIdx.x * TILE_DIM + threadIdx.x; + int y = blockIdx.y * TILE_DIM + threadIdx.y; + + for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) + { + if (y + j < M && x < N) + { + tile[threadIdx.y + j][threadIdx.x] = in[(y + j) * N + x]; + } + } + + __syncthreads(); + + x = blockIdx.y * TILE_DIM + threadIdx.x; // transpose block offset + y = blockIdx.x * TILE_DIM + threadIdx.y; + + for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) + { + if (y + j < N && x < M) + { + out[(y + j) * M + x] = tile[threadIdx.x][threadIdx.y + j]; + } + } +} + +// kernel to initialise all values of array to val. +template +__global__ void kernelInitMatBroadcast(TP *in, const TP Val, const int size) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int grid_size = blockDim.x * gridDim.x; + + while(idx < size){ + in[idx] = Val; + idx += grid_size; + } +} + +// kernel to initialised arange array -> [0, range), spaced by 1 +template +__global__ void kernelInitMatArange(TP *in, const int range) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int grid_size = blockDim.x * gridDim.x; + while (idx < range) + { + in[idx] = idx; + idx += grid_size; + } +} + +// get values at idxs in A, store C +template +__global__ void kernelGetMat(const TP *A, TP* C, const int *idxs, const int size){ + int idx = blockDim.x * blockIdx.x + threadIdx.x; + int grid_size = blockDim.x * gridDim.x; + + while (idx < size){ + C[idx] = A[idxs[idx]]; + idx += grid_size; + } +} + +// get values at idxs in A, store C +template +__global__ void kernelGetMat(const TP *A, const int rdin, TP* C, const int *r_idxs, const int *c_idxs, const int size){ + int idx = blockDim.x * blockIdx.x + threadIdx.x; + int grid_size = blockDim.x * gridDim.x; + + while (idx < size){ + C[idx] = A[r_idxs[idx] * rdin + c_idxs[idx]]; + idx += grid_size; + } +} + +// sets A[i] to A[i] OP B, for i in idxs +template +__global__ void kernelSetMat(TP *A, const TP B, const int *idxs, const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int grid_size = blockDim.x * gridDim.x; + + while(idx < size){ + if constexpr (OP == np::NP_OP_ADD) + A[idxs[idx]] += B; + else if constexpr (OP == np::NP_OP_SUB) + A[idxs[idx]] -= B; + else if constexpr (OP == np::NP_OP_MUL) + A[idxs[idx]] *= B; + else if constexpr (OP == np::NP_OP_DIV) + A[idxs[idx]] /= B; + else if constexpr (OP == np::NP_OP_LESS_THAN) + A[idxs[idx]] = A[idxs[idx]] < B; + else if constexpr (OP == np::NP_OP_LESS_THAN_EQ) + A[idxs[idx]] = A[idxs[idx]] <= B; + else if constexpr (OP == np::NP_OP_GREATER_THAN) + A[idxs[idx]] = A[idxs[idx]] > B; + else if constexpr (OP == np::NP_OP_GREATER_THAN_EQ) + A[idxs[idx]] = A[idxs[idx]] >= B; + else if constexpr (OP == np::NP_OP_EQ_EQ) + A[idxs[idx]] = A[idxs[idx]] == B; + else if constexpr (OP == np::NP_OP_NOT_EQ) + A[idxs[idx]] = A[idxs[idx]] != B; + else if constexpr (OP == np::NP_OP_MINIMUM) + A[idxs[idx]] = (A[idxs[idx]] < B) ? A[idxs[idx]] : B; + else if constexpr (OP == np::NP_OP_MAXIMUM) + A[idxs[idx]] = (A[idxs[idx]] > B) ? 
A[idxs[idx]] : B; + else if constexpr (OP == np::NP_OP_EQ) + A[idxs[idx]] = B; + else if constexpr (OP == np::NP_F_EXP) + A[idxs[idx]] = expf(A[idxs[idx]]); + else if constexpr (OP == np::NP_F_LOG) + A[idxs[idx]] = logf(A[idxs[idx]]); + else if constexpr (OP == np::NP_F_SQUARE) + A[idxs[idx]] *= A[idxs[idx]]; + else if constexpr (OP == np::NP_F_SQRT) + { + if constexpr (std::is_same::value) + A[idxs[idx]] = static_cast(sqrtf(A[idxs[idx]])); + else if constexpr (std::is_same::value) + A[idxs[idx]] = sqrtf(A[idxs[idx]]); + else if constexpr (std::is_same::value) + A[idxs[idx]] = sqrt(A[idxs[idx]]); + else + // Handle other types here + printf("Unsupported type in kernelSetMat, where F = 22(sqrt)"); + } + else if constexpr (OP == np::NP_F_POW){ + if constexpr (std::is_same::value) + A[idxs[idx]] = static_cast(powf(A[idxs[idx]], B)); + else if constexpr (std::is_same::value) + A[idxs[idx]] = powf(A[idxs[idx]], B); + else if constexpr (std::is_same::value) + A[idxs[idx]] = pow(A[idxs[idx]], static_cast(B)); + else + // Handle other types here + printf("Unsupported type in kernelSetMat F=23(power)"); + } + else + printf("ERROR! INVALID OPERATOR/FUNCTION IN kernelSet.\n"); + idx += grid_size; + } +} + +// sets A[i] to A[i] OP B[i], for i in idxs +template +__global__ void kernelSetMat(TP *A, const TP *B, const int *idxs, const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int grid_size = blockDim.x * gridDim.x; + + while(idx < size){ + if constexpr (OP == np::NP_OP_ADD) + A[idxs[idx]] += B[idx]; + else if constexpr (OP == np::NP_OP_SUB) + A[idxs[idx]] -= B[idx]; + else if constexpr (OP == np::NP_OP_MUL) + A[idxs[idx]] *= B[idx]; + else if constexpr (OP == np::NP_OP_DIV) + A[idxs[idx]] /= B[idx]; + else if constexpr (OP == np::NP_OP_LESS_THAN) + A[idxs[idx]] = A[idxs[idx]] < B[idx]; + else if constexpr (OP == np::NP_OP_LESS_THAN_EQ) + A[idxs[idx]] = A[idxs[idx]] <= B[idx]; + else if constexpr (OP == np::NP_OP_GREATER_THAN) + A[idxs[idx]] = A[idxs[idx]] > B[idx]; + else if constexpr (OP == np::NP_OP_GREATER_THAN_EQ) + A[idxs[idx]] = A[idxs[idx]] >= B[idx]; + else if constexpr (OP == np::NP_OP_EQ_EQ) + A[idxs[idx]] = A[idxs[idx]] == B[idx]; + else if constexpr (OP == np::NP_OP_NOT_EQ) + A[idxs[idx]] = A[idxs[idx]] != B[idx]; + else if constexpr (OP == np::NP_OP_MINIMUM) + A[idxs[idx]] = (A[idxs[idx]] < B[idx]) ? A[idxs[idx]] : B[idx]; + else if constexpr (OP == np::NP_OP_MAXIMUM) + A[idxs[idx]] = (A[idxs[idx]] > B[idx]) ? 
A[idxs[idx]] : B[idx]; + else if constexpr (OP == np::NP_OP_EQ) + A[idxs[idx]] = B[idx]; + else if constexpr (OP == np::NP_F_EXP) + A[idxs[idx]] = expf(A[idxs[idx]]); + else if constexpr (OP == np::NP_F_LOG) + A[idxs[idx]] = logf(A[idxs[idx]]); + else if constexpr (OP == np::NP_F_SQUARE) + A[idxs[idx]] *= A[idxs[idx]]; + else if constexpr (OP == np::NP_F_SQRT) + { + if constexpr (std::is_same::value) + A[idxs[idx]] = static_cast(sqrtf(A[idxs[idx]])); + else if constexpr (std::is_same::value) + A[idxs[idx]] = sqrtf(A[idxs[idx]]); + else if constexpr (std::is_same::value) + A[idxs[idx]] = sqrt(A[idxs[idx]]); + else + // Handle other types here + printf("Unsupported type in kernelSetMat, where F = 22(sqrt)"); + } + else if constexpr (OP == np::NP_F_POW){ + if constexpr (std::is_same::value) + A[idxs[idx]] = static_cast(powf(A[idxs[idx]], B[idx])); + else if constexpr (std::is_same::value) + A[idxs[idx]] = powf(A[idxs[idx]], B[idx]); + else if constexpr (std::is_same::value) + A[idxs[idx]] = pow(A[idxs[idx]], static_cast(B[idx])); + else + // Handle other types here + printf("Unsupported type in kernelSetMat F=23(power)"); + } + else + printf("ERROR! INVALID OPERATOR/FUNCTION IN kernelSet.\n"); + idx += grid_size; + } +} + +// sets A[r][c] to A[r][c] OP B, for r, c in zip(r_idxs, c_idxs) +template +__global__ void kernelSetMat(TP *A, const int rdin, const TP B, const int *r_idxs, const int *c_idxs, const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int grid_size = blockDim.x * gridDim.x; + + while(idx < size){ + if constexpr (OP == np::NP_OP_ADD) + A[r_idxs[idx] * rdin + c_idxs[idx]] += B; + else if constexpr (OP == np::NP_OP_SUB) + A[r_idxs[idx] * rdin + c_idxs[idx]] -= B; + else if constexpr (OP == np::NP_OP_MUL) + A[r_idxs[idx] * rdin + c_idxs[idx]] *= B; + else if constexpr (OP == np::NP_OP_DIV) + A[r_idxs[idx] * rdin + c_idxs[idx]] /= B; + else if constexpr (OP == np::NP_OP_LESS_THAN) + A[r_idxs[idx] * rdin + c_idxs[idx]] = A[r_idxs[idx] * rdin + c_idxs[idx]] < B; + else if constexpr (OP == np::NP_OP_LESS_THAN_EQ) + A[r_idxs[idx] * rdin + c_idxs[idx]] = A[r_idxs[idx] * rdin + c_idxs[idx]] <= B; + else if constexpr (OP == np::NP_OP_GREATER_THAN) + A[r_idxs[idx] * rdin + c_idxs[idx]] = A[r_idxs[idx] * rdin + c_idxs[idx]] > B; + else if constexpr (OP == np::NP_OP_GREATER_THAN_EQ) + A[r_idxs[idx] * rdin + c_idxs[idx]] = A[r_idxs[idx] * rdin + c_idxs[idx]] >= B; + else if constexpr (OP == np::NP_OP_EQ_EQ) + A[r_idxs[idx] * rdin + c_idxs[idx]] = A[r_idxs[idx] * rdin + c_idxs[idx]] == B; + else if constexpr (OP == np::NP_OP_NOT_EQ) + A[r_idxs[idx] * rdin + c_idxs[idx]] = A[r_idxs[idx] * rdin + c_idxs[idx]] != B; + else if constexpr (OP == np::NP_OP_MINIMUM) + A[r_idxs[idx] * rdin + c_idxs[idx]] = (A[r_idxs[idx] * rdin + c_idxs[idx]] < B) ? A[r_idxs[idx] * rdin + c_idxs[idx]] : B; + else if constexpr (OP == np::NP_OP_MAXIMUM) + A[r_idxs[idx] * rdin + c_idxs[idx]] = (A[r_idxs[idx] * rdin + c_idxs[idx]] > B) ? 
A[r_idxs[idx] * rdin + c_idxs[idx]] : B; + else if constexpr (OP == np::NP_OP_EQ) + A[r_idxs[idx] * rdin + c_idxs[idx]] = B; + else if constexpr (OP == np::NP_F_EXP) + A[r_idxs[idx] * rdin + c_idxs[idx]] = expf(A[r_idxs[idx] * rdin + c_idxs[idx]]); + else if constexpr (OP == np::NP_F_LOG) + A[r_idxs[idx] * rdin + c_idxs[idx]] = logf(A[r_idxs[idx] * rdin + c_idxs[idx]]); + else if constexpr (OP == np::NP_F_SQUARE) + A[r_idxs[idx] * rdin + c_idxs[idx]] *= A[r_idxs[idx] * rdin + c_idxs[idx]]; + else if constexpr (OP == np::NP_F_SQRT) + { + if constexpr (std::is_same::value) + A[r_idxs[idx] * rdin + c_idxs[idx]] = static_cast(sqrtf(A[r_idxs[idx] * rdin + c_idxs[idx]])); + else if constexpr (std::is_same::value) + A[r_idxs[idx] * rdin + c_idxs[idx]] = sqrtf(A[r_idxs[idx] * rdin + c_idxs[idx]]); + else if constexpr (std::is_same::value) + A[r_idxs[idx] * rdin + c_idxs[idx]] = sqrt(A[r_idxs[idx] * rdin + c_idxs[idx]]); + else + // Handle other types here + printf("Unsupported type in kernelSetMat, where F = 22(sqrt)"); + } + else if constexpr (OP == np::NP_F_POW){ + if constexpr (std::is_same::value) + A[r_idxs[idx] * rdin + c_idxs[idx]] = static_cast(powf(A[r_idxs[idx] * rdin + c_idxs[idx]], B)); + else if constexpr (std::is_same::value) + A[r_idxs[idx] * rdin + c_idxs[idx]] = powf(A[r_idxs[idx] * rdin + c_idxs[idx]], B); + else if constexpr (std::is_same::value) + A[r_idxs[idx] * rdin + c_idxs[idx]] = pow(A[r_idxs[idx] * rdin + c_idxs[idx]], static_cast(B)); + else + // Handle other types here + printf("Unsupported type in kernelSetMat F=23(power)"); + } + else + printf("ERROR! INVALID OPERATOR/FUNCTION IN kernelSet.\n"); + idx += grid_size; + } +} + +// sets A[r][c] to A[r][c] OP B[r][c], for r, c in zip(r_idxs, c_idxs) +template +__global__ void kernelSetMat(TP *A, const int rdin, const TP *B, const int *r_idxs, const int *c_idxs, const int size){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int grid_size = blockDim.x * gridDim.x; + + while(idx < size){ + if constexpr (OP == np::NP_OP_ADD) + A[r_idxs[idx] * rdin + c_idxs[idx]] += B[idx]; + else if constexpr (OP == np::NP_OP_SUB) + A[r_idxs[idx] * rdin + c_idxs[idx]] -= B[idx]; + else if constexpr (OP == np::NP_OP_MUL) + A[r_idxs[idx] * rdin + c_idxs[idx]] *= B[idx]; + else if constexpr (OP == np::NP_OP_DIV) + A[r_idxs[idx] * rdin + c_idxs[idx]] /= B[idx]; + else if constexpr (OP == np::NP_OP_LESS_THAN) + A[r_idxs[idx] * rdin + c_idxs[idx]] = A[r_idxs[idx] * rdin + c_idxs[idx]] < B[idx]; + else if constexpr (OP == np::NP_OP_LESS_THAN_EQ) + A[r_idxs[idx] * rdin + c_idxs[idx]] = A[r_idxs[idx] * rdin + c_idxs[idx]] <= B[idx]; + else if constexpr (OP == np::NP_OP_GREATER_THAN) + A[r_idxs[idx] * rdin + c_idxs[idx]] = A[r_idxs[idx] * rdin + c_idxs[idx]] > B[idx]; + else if constexpr (OP == np::NP_OP_GREATER_THAN_EQ) + A[r_idxs[idx] * rdin + c_idxs[idx]] = A[r_idxs[idx] * rdin + c_idxs[idx]] >= B[idx]; + else if constexpr (OP == np::NP_OP_EQ_EQ) + A[r_idxs[idx] * rdin + c_idxs[idx]] = A[r_idxs[idx] * rdin + c_idxs[idx]] == B[idx]; + else if constexpr (OP == np::NP_OP_NOT_EQ) + A[r_idxs[idx] * rdin + c_idxs[idx]] = A[r_idxs[idx] * rdin + c_idxs[idx]] != B[idx]; + else if constexpr (OP == np::NP_OP_MINIMUM) + A[r_idxs[idx] * rdin + c_idxs[idx]] = (A[r_idxs[idx] * rdin + c_idxs[idx]] < B[idx]) ? A[r_idxs[idx] * rdin + c_idxs[idx]] : B[idx]; + else if constexpr (OP == np::NP_OP_MAXIMUM) + A[r_idxs[idx] * rdin + c_idxs[idx]] = (A[r_idxs[idx] * rdin + c_idxs[idx]] > B[idx]) ? 
A[r_idxs[idx] * rdin + c_idxs[idx]] : B[idx]; + else if constexpr (OP == np::NP_OP_EQ) + A[r_idxs[idx] * rdin + c_idxs[idx]] = B[idx]; + else if constexpr (OP == np::NP_F_EXP) + A[r_idxs[idx] * rdin + c_idxs[idx]] = expf(A[r_idxs[idx] * rdin + c_idxs[idx]]); + else if constexpr (OP == np::NP_F_LOG) + A[r_idxs[idx] * rdin + c_idxs[idx]] = logf(A[r_idxs[idx] * rdin + c_idxs[idx]]); + else if constexpr (OP == np::NP_F_SQUARE) + A[r_idxs[idx] * rdin + c_idxs[idx]] *= A[r_idxs[idx] * rdin + c_idxs[idx]]; + else if constexpr (OP == np::NP_F_SQRT) + { + if constexpr (std::is_same::value) + A[r_idxs[idx] * rdin + c_idxs[idx]] = static_cast(sqrtf(A[r_idxs[idx] * rdin + c_idxs[idx]])); + else if constexpr (std::is_same::value) + A[r_idxs[idx] * rdin + c_idxs[idx]] = sqrtf(A[r_idxs[idx] * rdin + c_idxs[idx]]); + else if constexpr (std::is_same::value) + A[r_idxs[idx] * rdin + c_idxs[idx]] = sqrt(A[r_idxs[idx] * rdin + c_idxs[idx]]); + else + // Handle other types here + printf("Unsupported type in kernelSetMat, where F = 22(sqrt)"); + } + else if constexpr (OP == np::NP_F_POW){ + if constexpr (std::is_same::value) + A[r_idxs[idx] * rdin + c_idxs[idx]] = static_cast(powf(A[r_idxs[idx] * rdin + c_idxs[idx]], B[idx])); + else if constexpr (std::is_same::value) + A[r_idxs[idx] * rdin + c_idxs[idx]] = powf(A[r_idxs[idx] * rdin + c_idxs[idx]], B[idx]); + else if constexpr (std::is_same::value) + A[r_idxs[idx] * rdin + c_idxs[idx]] = pow(A[r_idxs[idx] * rdin + c_idxs[idx]], static_cast(B[idx])); + else + // Handle other types here + printf("Unsupported type in kernelSetMat F=23(power)"); + } + else + printf("ERROR! INVALID OPERATOR/FUNCTION IN kernelSet.\n"); + idx += grid_size; + } +} +// operator functions + +// perform operator on corrosponding elements of 2 matrices +// C = A op B +template +__global__ void kernelMatOpMat(const TP *A, const TP *B, TP *C, const int size) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int grid_size = gridDim.x * blockDim.x; + + while (idx < size) + { + if constexpr (OP == np::NP_OP_ADD) + C[idx] = A[idx] + B[idx]; + else if constexpr (OP == np::NP_OP_SUB) + C[idx] = A[idx] - B[idx]; + else if constexpr (OP == np::NP_OP_MUL) + C[idx] = A[idx] * B[idx]; + else if constexpr (OP == np::NP_OP_DIV) + C[idx] = A[idx] / B[idx]; + else if constexpr (OP == np::NP_OP_LESS_THAN) + C[idx] = A[idx] < B[idx]; + else if constexpr (OP == np::NP_OP_LESS_THAN_EQ) + C[idx] = A[idx] <= B[idx]; + else if constexpr (OP == np::NP_OP_GREATER_THAN) + C[idx] = A[idx] > B[idx]; + else if constexpr (OP == np::NP_OP_GREATER_THAN_EQ) + C[idx] = A[idx] >= B[idx]; + else if constexpr (OP == np::NP_OP_EQ_EQ) + C[idx] = A[idx] == B[idx]; + else if constexpr (OP == np::NP_OP_NOT_EQ) + C[idx] = A[idx] != B[idx]; + else if constexpr (OP == np::NP_OP_MINIMUM) + C[idx] = (A[idx] < B[idx]) ? A[idx] : B[idx]; + else if constexpr (OP == np::NP_OP_MAXIMUM) + C[idx] = (A[idx] > B[idx]) ? A[idx] : B[idx]; + else + printf("ERROR! INVALID OPERATOR IN kernelMatOPMat.\n"); + idx += grid_size; + } +} + +// operator on matrix and a scalar. 
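[Editorial aside, not part of the patch] The element-wise kernels in this file all follow the same pattern: a grid-stride loop over the flattened array combined with `if constexpr` dispatch on a compile-time operator tag, so each template instantiation compiles down to a single branch-free body. The following is a minimal, self-contained sketch of that pattern under assumed names — `Op`, `applyScalarOp`, and the fixed 256×256 launch are illustrative only; the real kernels dispatch on the `np::NP_OP_*` constants and derive their grid size from `GPU_NUM_SM` / `GPU_NUM_CUDA_CORE` as shown later in this diff. Requires C++17 for `if constexpr`.

```cpp
// sketch.cu -- illustrative only, compile with: nvcc -std=c++17 sketch.cu
#include <cstdio>
#include <cuda_runtime.h>

enum class Op { Add, Mul, Maximum };   // stand-in for the np::NP_OP_* constants

// C[i] = A[i] OP scal, chosen at compile time; grid-stride loop covers any size.
template <typename T, Op OP>
__global__ void applyScalarOp(const T *A, const T scal, T *C, const int size)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int stride = gridDim.x * blockDim.x;      // total threads in the grid
    while (idx < size)
    {
        if constexpr (OP == Op::Add)
            C[idx] = A[idx] + scal;
        else if constexpr (OP == Op::Mul)
            C[idx] = A[idx] * scal;
        else                                         // Op::Maximum
            C[idx] = (A[idx] > scal) ? A[idx] : scal;
        idx += stride;                               // grid-stride step
    }
}

int main()
{
    const int n = 1 << 20;
    float *dA = nullptr, *dC = nullptr;
    cudaMalloc((void **)&dA, n * sizeof(float));
    cudaMalloc((void **)&dC, n * sizeof(float));
    cudaMemset(dA, 0, n * sizeof(float));            // placeholder input

    // One instantiation per operator; the launch shape is arbitrary here.
    applyScalarOp<float, Op::Add><<<256, 256>>>(dA, 1.0f, dC, n);
    cudaDeviceSynchronize();

    cudaFree(dA);
    cudaFree(dC);
    return 0;
}
```

Because the operator is a template parameter rather than a runtime argument, the dead branches are discarded at compile time; the cost is one kernel instantiation per operator, which is the trade-off this header makes throughout.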
+// C = A op Scal (broadcasting) +// op scalar to all values of the matrix +template +__global__ void kernelMatOpScalar(const TP *A, const TP Scal, TP *C, const int size) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int grid_size = gridDim.x * blockDim.x; + + while (idx < size) + { + if constexpr (OP == np::NP_OP_ADD) + C[idx] = A[idx] + Scal; + else if constexpr (OP == np::NP_OP_SUB) + C[idx] = A[idx] - Scal; + else if constexpr (OP == np::NP_OP_MUL) + C[idx] = A[idx] * Scal; + else if constexpr (OP == np::NP_OP_DIV) + C[idx] = A[idx] / Scal; + else if constexpr (OP == np::NP_OP_LESS_THAN) + C[idx] = A[idx] < Scal; + else if constexpr (OP == np::NP_OP_LESS_THAN_EQ) + C[idx] = A[idx] <= Scal; + else if constexpr (OP == np::NP_OP_GREATER_THAN) + C[idx] = A[idx] > Scal; + else if constexpr (OP == np::NP_OP_GREATER_THAN_EQ) + C[idx] = A[idx] >= Scal; + else if constexpr (OP == np::NP_OP_EQ_EQ) + C[idx] = A[idx] == Scal; + else if constexpr (OP == np::NP_OP_NOT_EQ) + C[idx] = A[idx] != Scal; + else if constexpr (OP == np::NP_OP_MINIMUM) + C[idx] = (A[idx] < Scal) ? A[idx] : Scal; + else if constexpr (OP == np::NP_OP_MAXIMUM) + C[idx] = (A[idx] > Scal) ? A[idx] : Scal; + else + printf("ERROR! INVALID OPERATOR IN kernelMatOPScalar.\n"); + idx += grid_size; + } +} +template +__global__ void kernelMatOpScalar(const TP *A, const TP *Scal_a, TP *C, const int size) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int grid_size = gridDim.x * blockDim.x; + const TP Scal = Scal_a[0]; + while (idx < size) + { + if constexpr (OP == np::NP_OP_ADD) + C[idx] = A[idx] + Scal; + else if constexpr (OP == np::NP_OP_SUB) + C[idx] = A[idx] - Scal; + else if constexpr (OP == np::NP_OP_MUL) + C[idx] = A[idx] * Scal; + else if constexpr (OP == np::NP_OP_DIV) + C[idx] = A[idx] / Scal; + else if constexpr (OP == np::NP_OP_LESS_THAN) + C[idx] = A[idx] < Scal; + else if constexpr (OP == np::NP_OP_LESS_THAN_EQ) + C[idx] = A[idx] <= Scal; + else if constexpr (OP == np::NP_OP_GREATER_THAN) + C[idx] = A[idx] > Scal; + else if constexpr (OP == np::NP_OP_GREATER_THAN_EQ) + C[idx] = A[idx] >= Scal; + else if constexpr (OP == np::NP_OP_EQ_EQ) + C[idx] = A[idx] == Scal; + else if constexpr (OP == np::NP_OP_NOT_EQ) + C[idx] = A[idx] != Scal; + else if constexpr (OP == np::NP_OP_MINIMUM) + C[idx] = (A[idx] < Scal) ? A[idx] : Scal; + else if constexpr (OP == np::NP_OP_MAXIMUM) + C[idx] = (A[idx] > Scal) ? A[idx] : Scal; + else + printf("ERROR! INVALID OPERATOR IN kernelMatOPScalar.\n"); + idx += grid_size; + } +} + +template +__global__ void kernelScalarOpMat(const TP Scal, const TP *A, TP *C, const int size) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int grid_size = gridDim.x * blockDim.x; + + while (idx < size) + { + if constexpr (OP == np::NP_OP_ADD) + C[idx] = Scal + A[idx]; + else if constexpr (OP == np::NP_OP_SUB) + C[idx] = Scal - A[idx]; + else if constexpr (OP == np::NP_OP_MUL) + C[idx] = Scal * A[idx]; + else if constexpr (OP == np::NP_OP_DIV) + C[idx] = Scal / A[idx]; + else if constexpr (OP == np::NP_OP_LESS_THAN) + C[idx] = Scal < A[idx]; + else if constexpr (OP == np::NP_OP_LESS_THAN_EQ) + C[idx] = Scal <= A[idx]; + else if constexpr (OP == np::NP_OP_GREATER_THAN) + C[idx] = Scal > A[idx]; + else if constexpr (OP == np::NP_OP_GREATER_THAN_EQ) + C[idx] = Scal >= A[idx]; + else if constexpr (OP == np::NP_OP_EQ_EQ) + C[idx] = Scal == A[idx]; + else if constexpr (OP == np::NP_OP_NOT_EQ) + C[idx] = Scal != A[idx]; + else if constexpr (OP == np::NP_OP_MINIMUM) + C[idx] = (A[idx] < Scal) ? 
A[idx] : Scal; + else if constexpr (OP == np::NP_OP_MAXIMUM) + C[idx] = (A[idx] > Scal) ? A[idx] : Scal; + else + printf("ERROR! INVALID OPERATOR IN kernelScalarOpMat.\n"); + idx += grid_size; + } +} +template +__global__ void kernelScalarOpMat(const TP *Scal_a, const TP *A, TP *C, const int size) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int grid_size = gridDim.x * blockDim.x; + + const TP Scal = Scal_a[0]; + + while (idx < size) + { + if constexpr (OP == np::NP_OP_ADD) + C[idx] = Scal + A[idx]; + else if constexpr (OP == np::NP_OP_SUB) + C[idx] = Scal - A[idx]; + else if constexpr (OP == np::NP_OP_MUL) + C[idx] = Scal * A[idx]; + else if constexpr (OP == np::NP_OP_DIV) + C[idx] = Scal / A[idx]; + else if constexpr (OP == np::NP_OP_LESS_THAN) + C[idx] = Scal < A[idx]; + else if constexpr (OP == np::NP_OP_LESS_THAN_EQ) + C[idx] = Scal <= A[idx]; + else if constexpr (OP == np::NP_OP_GREATER_THAN) + C[idx] = Scal > A[idx]; + else if constexpr (OP == np::NP_OP_GREATER_THAN_EQ) + C[idx] = Scal >= A[idx]; + else if constexpr (OP == np::NP_OP_EQ_EQ) + C[idx] = Scal == A[idx]; + else if constexpr (OP == np::NP_OP_NOT_EQ) + C[idx] = Scal != A[idx]; + else if constexpr (OP == np::NP_OP_MINIMUM) + C[idx] = (A[idx] < Scal) ? A[idx] : Scal; + else if constexpr (OP == np::NP_OP_MAXIMUM) + C[idx] = (A[idx] > Scal) ? A[idx] : Scal; + else + printf("ERROR! INVALID OPERATOR IN kernelScalarOpMat.\n"); + idx += grid_size; + } +} + +// op on matrix and vector, mat.rows = vec.dim +// C = A op V (broadcasting) +// shapeA = M x N matrix +template +__global__ void kernelMatOpVecAlongCols(const TP *A, const TP *V, TP *C, const int size, const int N) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int grid_size = gridDim.x * blockDim.x; + int r; + while (idx < size) + { + r = idx / N; + // int c = idx % N; + if constexpr (OP == np::NP_OP_ADD) + C[idx] = A[idx] + V[r]; + else if constexpr (OP == np::NP_OP_SUB) + C[idx] = A[idx] - V[r]; + else if constexpr (OP == np::NP_OP_MUL) + C[idx] = A[idx] * V[r]; + else if constexpr (OP == np::NP_OP_DIV) + C[idx] = A[idx] / V[r]; + else if constexpr (OP == np::NP_OP_LESS_THAN) + C[idx] = A[idx] < V[r]; + else if constexpr (OP == np::NP_OP_LESS_THAN_EQ) + C[idx] = A[idx] <= V[r]; + else if constexpr (OP == np::NP_OP_GREATER_THAN) + C[idx] = A[idx] > V[r]; + else if constexpr (OP == np::NP_OP_GREATER_THAN_EQ) + C[idx] = A[idx] >= V[r]; + else if constexpr (OP == np::NP_OP_EQ_EQ) + C[idx] = A[idx] == V[r]; + else if constexpr (OP == np::NP_OP_NOT_EQ) + C[idx] = A[idx] != V[r]; + else if constexpr (OP == np::NP_OP_MINIMUM) + C[idx] = (A[idx] < V[r]) ? A[idx] : V[r]; + else if constexpr (OP == np::NP_OP_MAXIMUM) + C[idx] = (A[idx] > V[r]) ? A[idx] : V[r]; + else + printf("ERROR! 
INVALID OPERATOR IN kernelScalarOpMat.\n"); + idx += grid_size; + } +} +template +__global__ void kernelVecOpMatAlongCols(const TP *V, const TP *A, TP *C, const int size, const int N) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int grid_size = blockDim.x * gridDim.x; + int r; + while (idx < size) + { + r = idx / N; + // int c = idx % N; + if constexpr (OP == np::NP_OP_ADD) + C[idx] = V[r] + A[idx]; + else if constexpr (OP == np::NP_OP_SUB) + C[idx] = V[r] - A[idx]; + else if constexpr (OP == np::NP_OP_MUL) + C[idx] = V[r] * A[idx]; + else if constexpr (OP == np::NP_OP_DIV) + C[idx] = V[r] / A[idx]; + else if constexpr (OP == np::NP_OP_LESS_THAN) + C[idx] = V[r] < A[idx]; + else if constexpr (OP == np::NP_OP_LESS_THAN_EQ) + C[idx] = V[r] <= A[idx]; + else if constexpr (OP == np::NP_OP_GREATER_THAN) + C[idx] = V[r] > A[idx]; + else if constexpr (OP == np::NP_OP_GREATER_THAN_EQ) + C[idx] = V[r] >= A[idx]; + else if constexpr (OP == np::NP_OP_EQ_EQ) + C[idx] = V[r] == A[idx]; + else if constexpr (OP == np::NP_OP_NOT_EQ) + C[idx] = V[r] != A[idx]; + else if constexpr (OP == np::NP_OP_MINIMUM) + C[idx] = (A[idx] < V[r]) ? A[idx] : V[r]; + else if constexpr (OP == np::NP_OP_MAXIMUM) + C[idx] = (A[idx] > V[r]) ? A[idx] : V[r]; + else + printf("ERROR! INVALID OPERATOR IN kernelScalarOpMat.\n"); + idx += grid_size; + } +} + +// operator on matrix and vector, mat.cols = vec.dim +// C = A op V (broadcasting) +// shapeA = M x N matrix +template +__global__ void kernelMatOpVecAlongRows(const TP *A, const TP *V, TP *C, const int size, const int N) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int grid_size = gridDim.x * blockDim.x; + int c; + while (idx < size) + { + // int r = idx / N; + c = idx % N; + if constexpr (OP == np::NP_OP_ADD) + C[idx] = A[idx] + V[c]; + else if constexpr (OP == np::NP_OP_SUB) + C[idx] = A[idx] - V[c]; + else if constexpr (OP == np::NP_OP_MUL) + C[idx] = A[idx] * V[c]; + else if constexpr (OP == np::NP_OP_DIV) + C[idx] = A[idx] / V[c]; + else if constexpr (OP == np::NP_OP_LESS_THAN) + C[idx] = A[idx] < V[c]; + else if constexpr (OP == np::NP_OP_LESS_THAN_EQ) + C[idx] = A[idx] <= V[c]; + else if constexpr (OP == np::NP_OP_GREATER_THAN) + C[idx] = A[idx] > V[c]; + else if constexpr (OP == np::NP_OP_GREATER_THAN_EQ) + C[idx] = A[idx] >= V[c]; + else if constexpr (OP == np::NP_OP_EQ_EQ) + C[idx] = A[idx] == V[c]; + else if constexpr (OP == np::NP_OP_NOT_EQ) + C[idx] = A[idx] != V[c]; + else if constexpr (OP == np::NP_OP_MINIMUM) + C[idx] = (A[idx] < V[c]) ? A[idx] : V[c]; + else if constexpr (OP == np::NP_OP_MAXIMUM) + C[idx] = (A[idx] > V[c]) ? A[idx] : V[c]; + else + printf("ERROR! 
INVALID OPERATOR IN kernelScalarOpMat.\n"); + idx += grid_size; + } +} + +template +__global__ void kernelVecOpMatAlongRows(const TP *V, const TP *A, TP *C, const int size, const int N) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int grid_size = blockDim.x * gridDim.x; + int c; + while (idx < size) + { + // int r = idx / N; + c = idx % N; + if constexpr (OP == np::NP_OP_ADD) + C[idx] = V[c] + A[idx]; + else if constexpr (OP == np::NP_OP_SUB) + C[idx] = V[c] - A[idx]; + else if constexpr (OP == np::NP_OP_MUL) + C[idx] = V[c] * A[idx]; + else if constexpr (OP == np::NP_OP_DIV) + C[idx] = V[c] / A[idx]; + else if constexpr (OP == np::NP_OP_LESS_THAN) + C[idx] = V[c] < A[idx]; + else if constexpr (OP == np::NP_OP_LESS_THAN_EQ) + C[idx] = V[c] <= A[idx]; + else if constexpr (OP == np::NP_OP_GREATER_THAN) + C[idx] = V[c] > A[idx]; + else if constexpr (OP == np::NP_OP_GREATER_THAN_EQ) + C[idx] = V[c] >= A[idx]; + else if constexpr (OP == np::NP_OP_EQ_EQ) + C[idx] = V[c] == A[idx]; + else if constexpr (OP == np::NP_OP_NOT_EQ) + C[idx] = V[c] != A[idx]; + else if constexpr (OP == np::NP_OP_MINIMUM) + C[idx] = (A[idx] < V[c]) ? A[idx] : V[c]; + else if constexpr (OP == np::NP_OP_MAXIMUM) + C[idx] = (A[idx] > V[c]) ? A[idx] : V[c]; + else + printf("ERROR! INVALID OPERATOR IN kernelScalarOpMat.\n"); + idx += grid_size; + } +} + +// npfunctions +// A = MxN +// C = MxN +// Ci = F(Ai) +template +__global__ void kernelFMat(const TP *A, TP *C, const int size) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int grid_size = blockDim.x * gridDim.x; + + while (idx < size) + { + if constexpr (OP == np::NP_F_EXP) + C[idx] = expf(A[idx]); + else if constexpr (OP == np::NP_F_LOG) + C[idx] = logf(A[idx]); + else if constexpr (OP == np::NP_F_SQUARE) + C[idx] = A[idx] * A[idx]; + else if constexpr (OP == np::NP_F_SQRT) + { + if constexpr (std::is_same::value) + C[idx] = static_cast(sqrtf(A[idx])); + else if constexpr (std::is_same::value) + C[idx] = sqrtf(A[idx]); + else if constexpr (std::is_same::value) + C[idx] = sqrt(A[idx]); + else + // Handle other types here + printf("Unsupported type in kernelFMat, where F = 22(sqrt)"); + } + else + printf("Unsupported function type in kernelFMat\n"); + idx += grid_size; + } +} + +// np.pow +// A = MxN +// C = MxN +// Ci = square(Ai) +template +__global__ void kernelPowMat(const TP *A, const float power, TP *C, const int size) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int grid_size = blockDim.x * gridDim.x; + while (idx < size) + { + if constexpr (std::is_same::value) + { + C[idx] = static_cast(powf(A[idx], power)); + } + else if constexpr (std::is_same::value) + { + C[idx] = powf(A[idx], power); + } + else if constexpr (std::is_same::value) + { + C[idx] = pow(A[idx], static_cast(power)); + } + else + { + // Handle other types here + printf("Unsupported type in kernelPowMat"); + } + idx += grid_size; + } +} + +// REDUCTION +template +__device__ void kernelWarpReduceF(volatile TP *s_A, const int tid) +{ // warp reduce for kernel + if constexpr (OP == np::NP_REDUCE_SUM) + { + s_A[tid] += s_A[tid + 32]; + s_A[tid] += s_A[tid + 16]; + s_A[tid] += s_A[tid + 8]; + s_A[tid] += s_A[tid + 4]; + s_A[tid] += s_A[tid + 2]; + s_A[tid] += s_A[tid + 1]; + } + else if constexpr (OP == np::NP_REDUCE_MIN) + { + s_A[tid] = min(s_A[tid], s_A[tid + 32]); + s_A[tid] = min(s_A[tid], s_A[tid + 16]); + s_A[tid] = min(s_A[tid], s_A[tid + 8]); + s_A[tid] = min(s_A[tid], s_A[tid + 4]); + s_A[tid] = min(s_A[tid], s_A[tid + 2]); + s_A[tid] = min(s_A[tid], s_A[tid + 1]); + 
} + else if constexpr (OP == np::NP_REDUCE_MAX) + { + s_A[tid] = max(s_A[tid], s_A[tid + 32]); + s_A[tid] = max(s_A[tid], s_A[tid + 16]); + s_A[tid] = max(s_A[tid], s_A[tid + 8]); + s_A[tid] = max(s_A[tid], s_A[tid + 4]); + s_A[tid] = max(s_A[tid], s_A[tid + 2]); + s_A[tid] = max(s_A[tid], s_A[tid + 1]); + } + else + printf("INVALID ARGUMENT! in kernelWarpReduceF\n"); +} + +// warp unroll +template +__global__ void kernelReduceF(const TP *A, TP *output, const int size) +{ + if constexpr (OP != np::NP_REDUCE_SUM && OP != np::NP_REDUCE_MIN && OP != np::NP_REDUCE_MAX) + { + printf("INVALID ARGUMENT! in kernelReduceF\n"); + return; + } + const int bx = blockIdx.x; + const int tx = threadIdx.x; + int idx = bx * BLOCK_SIZE * 2 + tx; + const int grid_size = BLOCK_SIZE * 2 * gridDim.x; + __shared__ TP s_A[BLOCK_SIZE]; + + if constexpr (OP == np::NP_REDUCE_SUM) + s_A[tx] = 0; + else if constexpr (OP == np::NP_REDUCE_MIN) + s_A[tx] = INT_MAX; + else if constexpr (OP == np::NP_REDUCE_MAX) + s_A[tx] = INT_MIN; + // assume 1 hi grid launch kr rha h tu + while (idx < size) + { + if constexpr (OP == np::NP_REDUCE_SUM) + s_A[tx] += (A[idx] + ((idx + BLOCK_SIZE < size) ? A[idx + BLOCK_SIZE] : 0)); + else if constexpr (OP == np::NP_REDUCE_MIN) + { + s_A[tx] = min(s_A[tx], A[idx]); + if (idx + BLOCK_SIZE < size) + s_A[tx] = min(s_A[tx], A[idx + BLOCK_SIZE]); + } + else if constexpr (OP == np::NP_REDUCE_MAX) + { + s_A[tx] = max(s_A[tx], A[idx]); + if (idx + BLOCK_SIZE < size) + s_A[tx] = max(s_A[tx], A[idx + BLOCK_SIZE]); + } + + idx += grid_size; + } + __syncthreads(); + + if constexpr (BLOCK_SIZE > 511) + { + if (tx < 256) + { + if constexpr (OP == np::NP_REDUCE_SUM) + s_A[tx] += s_A[tx + 256]; + else if constexpr (OP == np::NP_REDUCE_MIN) + s_A[tx] = min(s_A[tx], s_A[tx + 256]); + else if constexpr (OP == np::NP_REDUCE_MAX) + s_A[tx] = max(s_A[tx], s_A[tx + 256]); + } + __syncthreads(); + } + + if constexpr (BLOCK_SIZE > 255) + { + if (tx < 128) + { + if constexpr (OP == np::NP_REDUCE_SUM) + s_A[tx] += s_A[tx + 128]; + else if constexpr (OP == np::NP_REDUCE_MIN) + s_A[tx] = min(s_A[tx], s_A[tx + 128]); + else if constexpr (OP == np::NP_REDUCE_MAX) + s_A[tx] = max(s_A[tx], s_A[tx + 128]); + } + __syncthreads(); + } + if constexpr (BLOCK_SIZE > 127) + { + if (tx < 64) + { + if constexpr (OP == np::NP_REDUCE_SUM) + s_A[tx] += s_A[tx + 64]; + else if constexpr (OP == np::NP_REDUCE_MIN) + s_A[tx] = min(s_A[tx], s_A[tx + 64]); + else if constexpr (OP == np::NP_REDUCE_MAX) + s_A[tx] = max(s_A[tx], s_A[tx + 64]); + } + __syncthreads(); + } + + if (tx < 32) + kernelWarpReduceF(s_A, tx); + + if (tx == 0) + output[bx] = s_A[0]; +} + +// warp unroll +template +__global__ void kernelReduceFAxis1(const TP *A, TP *output, const int size, const int rows) +{ + if constexpr (OP != np::NP_REDUCE_SUM && OP != np::NP_REDUCE_MIN && OP != np::NP_REDUCE_MAX) + { + printf("INVALID ARGUMENT! in kernelReduceF\n"); + return; + } + const int bx = blockIdx.x; + const int tx = threadIdx.x; + int _idx = bx * BLOCK_SIZE * 2 + tx; + const int grid_size = BLOCK_SIZE * 2 * gridDim.x; + __shared__ TP s_A[BLOCK_SIZE]; + + for(int r = 0; r < rows; ++r){ + int idx = _idx; + if constexpr (OP == np::NP_REDUCE_SUM) + s_A[tx] = 0; + else if constexpr (OP == np::NP_REDUCE_MIN) + s_A[tx] = INT_MAX; + else if constexpr (OP == np::NP_REDUCE_MAX) + s_A[tx] = INT_MIN; + // assume 1 hi grid launch kr rha h tu + while (idx < size) + { + if constexpr (OP == np::NP_REDUCE_SUM) + s_A[tx] += (A[idx] + ((idx + BLOCK_SIZE < size) ? 
A[idx + BLOCK_SIZE] : 0)); + else if constexpr (OP == np::NP_REDUCE_MIN) + { + s_A[tx] = min(s_A[tx], A[idx]); + if (idx + BLOCK_SIZE < size) + s_A[tx] = min(s_A[tx], A[idx + BLOCK_SIZE]); + } + else if constexpr (OP == np::NP_REDUCE_MAX) + { + s_A[tx] = max(s_A[tx], A[idx]); + if (idx + BLOCK_SIZE < size) + s_A[tx] = max(s_A[tx], A[idx + BLOCK_SIZE]); + } + + idx += grid_size; + } + __syncthreads(); + + if constexpr (BLOCK_SIZE > 511) + { + if (tx < 256) + { + if constexpr (OP == np::NP_REDUCE_SUM) + s_A[tx] += s_A[tx + 256]; + else if constexpr (OP == np::NP_REDUCE_MIN) + s_A[tx] = min(s_A[tx], s_A[tx + 256]); + else if constexpr (OP == np::NP_REDUCE_MAX) + s_A[tx] = max(s_A[tx], s_A[tx + 256]); + } + __syncthreads(); + } + + if constexpr (BLOCK_SIZE > 255) + { + if (tx < 128) + { + if constexpr (OP == np::NP_REDUCE_SUM) + s_A[tx] += s_A[tx + 128]; + else if constexpr (OP == np::NP_REDUCE_MIN) + s_A[tx] = min(s_A[tx], s_A[tx + 128]); + else if constexpr (OP == np::NP_REDUCE_MAX) + s_A[tx] = max(s_A[tx], s_A[tx + 128]); + } + __syncthreads(); + } + if constexpr (BLOCK_SIZE > 127) + { + if (tx < 64) + { + if constexpr (OP == np::NP_REDUCE_SUM) + s_A[tx] += s_A[tx + 64]; + else if constexpr (OP == np::NP_REDUCE_MIN) + s_A[tx] = min(s_A[tx], s_A[tx + 64]); + else if constexpr (OP == np::NP_REDUCE_MAX) + s_A[tx] = max(s_A[tx], s_A[tx + 64]); + } + __syncthreads(); + } + + if (tx < 32) + kernelWarpReduceF(s_A, tx); + + if (tx == 0) + output[bx] = s_A[0]; + A+= size; + output += gridDim.x; + __syncthreads(); + } +} + + +template +__device__ void kernelWarpReduceArgF(volatile TP *s_A, volatile int *s_Idx, const int tid) +{ // warp reduce for kernel + if constexpr (OP == np::NP_REDUCE_ARGMIN) + { + if (s_A[tid] > s_A[tid + 32]) + { + s_A[tid] = s_A[tid + 32]; + s_Idx[tid] = s_Idx[tid + 32]; + } + if (s_A[tid] > s_A[tid + 16]) + { + s_A[tid] = s_A[tid + 16]; + s_Idx[tid] = s_Idx[tid + 16]; + } + if (s_A[tid] > s_A[tid + 16]) + { + s_A[tid] = s_A[tid + 16]; + s_Idx[tid] = s_Idx[tid + 16]; + } + if (s_A[tid] > s_A[tid + 8]) + { + s_A[tid] = s_A[tid + 8]; + s_Idx[tid] = s_Idx[tid + 8]; + } + if (s_A[tid] > s_A[tid + 4]) + { + s_A[tid] = s_A[tid + 4]; + s_Idx[tid] = s_Idx[tid + 4]; + } + if (s_A[tid] > s_A[tid + 2]) + { + s_A[tid] = s_A[tid + 2]; + s_Idx[tid] = s_Idx[tid + 2]; + } + if (s_A[tid] > s_A[tid + 1]) + { + s_A[tid] = s_A[tid + 1]; + s_Idx[tid] = s_Idx[tid + 1]; + } + } + else if constexpr (OP == np::NP_REDUCE_ARGMAX) + { + if (s_A[tid] < s_A[tid + 32]) + { + s_A[tid] = s_A[tid + 32]; + s_Idx[tid] = s_Idx[tid + 32]; + } + if (s_A[tid] < s_A[tid + 16]) + { + s_A[tid] = s_A[tid + 16]; + s_Idx[tid] = s_Idx[tid + 16]; + } + if (s_A[tid] < s_A[tid + 16]) + { + s_A[tid] = s_A[tid + 16]; + s_Idx[tid] = s_Idx[tid + 16]; + } + if (s_A[tid] < s_A[tid + 8]) + { + s_A[tid] = s_A[tid + 8]; + s_Idx[tid] = s_Idx[tid + 8]; + } + if (s_A[tid] < s_A[tid + 4]) + { + s_A[tid] = s_A[tid + 4]; + s_Idx[tid] = s_Idx[tid + 4]; + } + if (s_A[tid] < s_A[tid + 2]) + { + s_A[tid] = s_A[tid + 2]; + s_Idx[tid] = s_Idx[tid + 2]; + } + if (s_A[tid] < s_A[tid + 1]) + { + s_A[tid] = s_A[tid + 1]; + s_Idx[tid] = s_Idx[tid + 1]; + } + } +} + +// warp unroll +template +__global__ void kernelReduceArgF(const TP *A, TP *outputMax, int *outputIdx, const int size) +{ + if constexpr (OP != np::NP_REDUCE_ARGMIN && OP != np::NP_REDUCE_ARGMAX) + { + printf("INVALID ARGUMENT! 
in kernelReduceArgF\n"); + return; + } + const int bx = blockIdx.x; + const int tx = threadIdx.x; + int idx = bx * BLOCK_SIZE * 2 + tx; + const int grid_size = BLOCK_SIZE * 2 * gridDim.x; + __shared__ TP s_A[BLOCK_SIZE]; + __shared__ int s_Idx[BLOCK_SIZE]; + + if constexpr (OP == np::NP_REDUCE_ARGMIN) + s_A[tx] = INT_MAX; + else if constexpr (OP == np::NP_REDUCE_ARGMAX) + s_A[tx] = INT_MIN; + s_Idx[tx] = -1; + // assume 1 hi grid launch kr rha h tu + while (idx < size) + { + if constexpr (OP == np::NP_REDUCE_ARGMIN) + { + if (s_A[tx] > A[idx]) + { + s_A[tx] = A[idx]; + s_Idx[tx] = idx; + } + if (idx + BLOCK_SIZE < size) + { + if (s_A[tx] > A[idx + BLOCK_SIZE]) + { + s_A[tx] = A[idx + BLOCK_SIZE]; + s_Idx[tx] = idx + BLOCK_SIZE; + } + } + } + else if constexpr (OP == np::NP_REDUCE_ARGMAX) + { + if (s_A[tx] < A[idx]) + { + s_A[tx] = A[idx]; + s_Idx[tx] = idx; + } + if (idx + BLOCK_SIZE < size) + { + if (s_A[tx] < A[idx + BLOCK_SIZE]) + { + s_A[tx] = A[idx + BLOCK_SIZE]; + s_Idx[tx] = idx + BLOCK_SIZE; + } + } + } + + idx += grid_size; + } + __syncthreads(); + + if constexpr (BLOCK_SIZE > 511) + { + if (tx < 256) + { + if constexpr (OP == np::NP_REDUCE_ARGMIN) + { + if (s_A[tx] > s_A[tx + 256]) + { + s_A[tx] = s_A[tx + 256]; + s_Idx[tx] = s_Idx[tx + 256]; + } + } + else if constexpr (OP == np::NP_REDUCE_ARGMAX) + { + if (s_A[tx] < s_A[tx + 256]) + { + s_A[tx] = s_A[tx + 256]; + s_Idx[tx] = s_Idx[tx + 256]; + + } + } + } + __syncthreads(); + } + + if constexpr (BLOCK_SIZE > 255) + { + if (tx < 128) + { + if constexpr (OP == np::NP_REDUCE_ARGMIN) + { + if (s_A[tx] > s_A[tx + 128]) + { + s_A[tx] = s_A[tx + 128]; + s_Idx[tx] = s_Idx[tx + 128]; + } + } + else if constexpr (OP == np::NP_REDUCE_ARGMAX) + { + if (s_A[tx] < s_A[tx + 128]) + { + s_A[tx] = s_A[tx + 128]; + s_Idx[tx] = s_Idx[tx + 128]; + } + } + } + __syncthreads(); + } + + if constexpr (BLOCK_SIZE > 127) + { + if (tx < 64) + { + if constexpr (OP == np::NP_REDUCE_ARGMIN) + { + if (s_A[tx] > s_A[tx + 64]) + { + s_A[tx] = s_A[idx + 64]; + s_Idx[tx] = s_Idx[tx + 64]; + } + } + else if constexpr (OP == np::NP_REDUCE_ARGMAX) + { + if (s_A[tx] < s_A[tx + 64]) + { + s_A[tx] = s_A[tx + 64]; + s_Idx[tx] = s_Idx[tx + 64]; + } + } + } + __syncthreads(); + } + + if (tx < 32) + kernelWarpReduceArgF(s_A, s_Idx, tx); + + if (tx == 0) + { + outputMax[bx] = s_A[0]; + outputIdx[bx] = s_Idx[0]; + } +} + +template +__global__ void kernelReduceArgFAxis1(const TP *A, TP *outputMax, int *outputIdx, const int size, const int rows){ + if constexpr (OP != np::NP_REDUCE_ARGMIN && OP != np::NP_REDUCE_ARGMAX) + { + printf("INVALID ARGUMENT! 
in kernelReduceArgF\n"); + return; + } + const int bx = blockIdx.x; + const int tx = threadIdx.x; + int _idx = bx * BLOCK_SIZE * 2 + tx; + const int grid_size = BLOCK_SIZE * 2 * gridDim.x; + __shared__ TP s_A[BLOCK_SIZE]; + __shared__ int s_Idx[BLOCK_SIZE]; + + for(int r = 0; r < rows; ++r){ + int idx = _idx; + if constexpr (OP == np::NP_REDUCE_ARGMIN) + s_A[tx] = INT_MAX; + else if constexpr (OP == np::NP_REDUCE_ARGMAX) + s_A[tx] = INT_MIN; + s_Idx[tx] = -1; + // assume 1 hi grid launch kr rha h tu + while (idx < size) + { + if constexpr (OP == np::NP_REDUCE_ARGMIN) + { + if (s_A[tx] > A[idx]) + { + s_A[tx] = A[idx]; + s_Idx[tx] = idx; + } + if (idx + BLOCK_SIZE < size) + { + if (s_A[tx] > A[idx + BLOCK_SIZE]) + { + s_A[tx] = A[idx + BLOCK_SIZE]; + s_Idx[tx] = idx + BLOCK_SIZE; + } + } + } + else if constexpr (OP == np::NP_REDUCE_ARGMAX) + { + if (s_A[tx] < A[idx]) + { + s_A[tx] = A[idx]; + s_Idx[tx] = idx; + } + if (idx + BLOCK_SIZE < size) + { + if (s_A[tx] < A[idx + BLOCK_SIZE]) + { + s_A[tx] = A[idx + BLOCK_SIZE]; + s_Idx[tx] = idx + BLOCK_SIZE; + } + } + } + + idx += grid_size; + } + __syncthreads(); + + if constexpr (BLOCK_SIZE > 511) + { + if (tx < 256) + { + if constexpr (OP == np::NP_REDUCE_ARGMIN) + { + if (s_A[tx] > s_A[tx + 256]) + { + s_A[tx] = s_A[tx + 256]; + s_Idx[tx] = s_Idx[tx + 256]; + } + } + else if constexpr (OP == np::NP_REDUCE_ARGMAX) + { + if (s_A[tx] < s_A[tx + 256]) + { + s_A[tx] = s_A[tx + 256]; + s_Idx[tx] = s_Idx[tx + 256]; + + } + } + } + __syncthreads(); + } + + if constexpr (BLOCK_SIZE > 255) + { + if (tx < 128) + { + if constexpr (OP == np::NP_REDUCE_ARGMIN) + { + if (s_A[tx] > s_A[tx + 128]) + { + s_A[tx] = s_A[tx + 128]; + s_Idx[tx] = s_Idx[tx + 128]; + } + } + else if constexpr (OP == np::NP_REDUCE_ARGMAX) + { + if (s_A[tx] < s_A[tx + 128]) + { + s_A[tx] = s_A[tx + 128]; + s_Idx[tx] = s_Idx[tx + 128]; + } + } + } + __syncthreads(); + } + + if constexpr (BLOCK_SIZE > 127) + { + if (tx < 64) + { + if constexpr (OP == np::NP_REDUCE_ARGMIN) + { + if (s_A[tx] > s_A[tx + 64]) + { + s_A[tx] = s_A[idx + 64]; + s_Idx[tx] = s_Idx[tx + 64]; + } + } + else if constexpr (OP == np::NP_REDUCE_ARGMAX) + { + if (s_A[tx] < s_A[tx + 64]) + { + s_A[tx] = s_A[tx + 64]; + s_Idx[tx] = s_Idx[tx + 64]; + } + } + } + __syncthreads(); + } + + if (tx < 32) + kernelWarpReduceArgF(s_A, s_Idx, tx); + + if (tx == 0) + { + outputMax[bx] = s_A[0]; + outputIdx[bx] = s_Idx[0]; + } + A += size; + outputIdx += gridDim.x; + outputMax += gridDim.x; + } +} + +// second reduction k time p -> idx serial nhi h, to ek idx ka bhi array dena hoga +template +__global__ void kernelReduceArgF(const TP *A, const int *A_idx, TP *outputMax, int *outputIdx, const int size) +{ + if constexpr (OP != np::NP_REDUCE_ARGMIN && OP != np::NP_REDUCE_ARGMAX) + { + printf("INVALID ARGUMENT! 
in kernelReduceArgF\n"); + return; + } + const int bx = blockIdx.x; + const int tx = threadIdx.x; + int idx = bx * BLOCK_SIZE * 2 + tx; + const int grid_size = BLOCK_SIZE * 2 * gridDim.x; + __shared__ TP s_A[BLOCK_SIZE]; + __shared__ int s_Idx[BLOCK_SIZE]; + + if constexpr (OP == np::NP_REDUCE_ARGMIN) + s_A[tx] = INT_MAX; + else if constexpr (OP == np::NP_REDUCE_ARGMAX) + s_A[tx] = INT_MIN; + s_Idx[tx] = -1; + + // assume 1 hi grid launch kr rha h tu + while (idx < size) + { + if constexpr (OP == np::NP_REDUCE_ARGMIN) + { + if (s_A[tx] > A[idx]) + { + s_A[tx] = A[idx]; + s_Idx[tx] = A_idx[idx]; + } + if (idx + BLOCK_SIZE < size && s_A[tx] > A[idx + BLOCK_SIZE]) + { + s_A[tx] = A[idx + BLOCK_SIZE]; + s_Idx[tx] = A_idx[idx + BLOCK_SIZE]; + } + } + else if constexpr (OP == np::NP_REDUCE_ARGMAX) + { + if (s_A[tx] < A[idx]) + { + s_A[tx] = A[idx]; + s_Idx[tx] = A_idx[idx]; + } + if (idx + BLOCK_SIZE < size) + { + if (s_A[tx] < A[idx + BLOCK_SIZE]) + { + s_A[tx] = A[idx + BLOCK_SIZE]; + s_Idx[tx] = A_idx[idx + BLOCK_SIZE]; + } + } + } + + idx += grid_size; + } + __syncthreads(); + + if constexpr (BLOCK_SIZE > 511) + { + if (tx < 256) + { + if constexpr (OP == np::NP_REDUCE_ARGMIN) + { + if (s_A[tx] > s_A[tx + 256]) + { + s_A[tx] = s_A[tx + 256]; + s_Idx[tx] = s_Idx[tx + 256]; + } + } + else if constexpr (OP == np::NP_REDUCE_ARGMAX) + { + if (s_A[tx] < s_A[tx + 256]) + { + s_A[tx] = s_A[tx + 256]; + s_Idx[tx] = s_Idx[tx + 256]; + } + } + } + __syncthreads(); + } + + if constexpr (BLOCK_SIZE > 255) + { + if (tx < 128) + { + if constexpr (OP == np::NP_REDUCE_ARGMIN) + { + if (s_A[tx] > s_A[tx + 128]) + { + s_A[tx] = s_A[tx + 128]; + s_Idx[tx] = s_Idx[tx + 128]; + } + } + else if constexpr (OP == np::NP_REDUCE_ARGMAX) + { + if (s_A[tx] < s_A[tx + 128]) + { + s_A[tx] = s_A[tx + 128]; + s_Idx[tx] = s_Idx[tx + 128]; + } + } + } + __syncthreads(); + } + if constexpr (BLOCK_SIZE > 127) + { + if (tx < 64) + { + if constexpr (OP == np::NP_REDUCE_ARGMIN) + { + if (s_A[tx] > s_A[tx + 64]) + { + s_A[tx] = s_A[tx + 64]; + s_Idx[tx] = s_A[tx + 64]; + } + } + else if constexpr (OP == np::NP_REDUCE_ARGMAX) + { + if (s_A[tx] < s_A[tx + 64]) + { + s_A[tx] = s_A[tx + 64]; + s_Idx[tx] = s_Idx[tx + 64]; + } + } + } + __syncthreads(); + } + + if (tx < 32) + kernelWarpReduceArgF(s_A, s_Idx, tx); + + if (tx == 0) + { + outputMax[bx] = s_A[0]; + outputIdx[bx] = s_Idx[0]; + } +} + + +// second reduction k time p -> idx serial nhi h, to ek idx ka bhi array dena hoga +template +__global__ void kernelReduceArgFAxis1(const TP *A, const int *A_idx, TP *outputMax, int *outputIdx, const int size, const int rows) +{ + if constexpr (OP != np::NP_REDUCE_ARGMIN && OP != np::NP_REDUCE_ARGMAX) + { + printf("INVALID ARGUMENT! 
in kernelReduceArgF\n"); + return; + } + const int bx = blockIdx.x; + const int tx = threadIdx.x; + int _idx = bx * BLOCK_SIZE * 2 + tx; + const int grid_size = BLOCK_SIZE * 2 * gridDim.x; + __shared__ TP s_A[BLOCK_SIZE]; + __shared__ int s_Idx[BLOCK_SIZE]; + + + for(int r = 0; r < rows; ++r){ + int idx = _idx; + if constexpr (OP == np::NP_REDUCE_ARGMIN) + s_A[tx] = INT_MAX; + else if constexpr (OP == np::NP_REDUCE_ARGMAX) + s_A[tx] = INT_MIN; + s_Idx[tx] = -1; + + // assume 1 hi grid launch kr rha h tu + while (idx < size) + { + if constexpr (OP == np::NP_REDUCE_ARGMIN) + { + if (s_A[tx] > A[idx]) + { + s_A[tx] = A[idx]; + s_Idx[tx] = A_idx[idx]; + } + if (idx + BLOCK_SIZE < size && s_A[tx] > A[idx + BLOCK_SIZE]) + { + s_A[tx] = A[idx + BLOCK_SIZE]; + s_Idx[tx] = A_idx[idx + BLOCK_SIZE]; + } + } + else if constexpr (OP == np::NP_REDUCE_ARGMAX) + { + if (s_A[tx] < A[idx]) + { + s_A[tx] = A[idx]; + s_Idx[tx] = A_idx[idx]; + } + if (idx + BLOCK_SIZE < size) + { + if (s_A[tx] < A[idx + BLOCK_SIZE]) + { + s_A[tx] = A[idx + BLOCK_SIZE]; + s_Idx[tx] = A_idx[idx + BLOCK_SIZE]; + } + } + } + + idx += grid_size; + } + __syncthreads(); + + if constexpr (BLOCK_SIZE > 511) + { + if (tx < 256) + { + if constexpr (OP == np::NP_REDUCE_ARGMIN) + { + if (s_A[tx] > s_A[tx + 256]) + { + s_A[tx] = s_A[tx + 256]; + s_Idx[tx] = s_Idx[tx + 256]; + } + } + else if constexpr (OP == np::NP_REDUCE_ARGMAX) + { + if (s_A[tx] < s_A[tx + 256]) + { + s_A[tx] = s_A[tx + 256]; + s_Idx[tx] = s_Idx[tx + 256]; + } + } + } + __syncthreads(); + } + + if constexpr (BLOCK_SIZE > 255) + { + if (tx < 128) + { + if constexpr (OP == np::NP_REDUCE_ARGMIN) + { + if (s_A[tx] > s_A[tx + 128]) + { + s_A[tx] = s_A[tx + 128]; + s_Idx[tx] = s_Idx[tx + 128]; + } + } + else if constexpr (OP == np::NP_REDUCE_ARGMAX) + { + if (s_A[tx] < s_A[tx + 128]) + { + s_A[tx] = s_A[tx + 128]; + s_Idx[tx] = s_Idx[tx + 128]; + } + } + } + __syncthreads(); + } + if constexpr (BLOCK_SIZE > 127) + { + if (tx < 64) + { + if constexpr (OP == np::NP_REDUCE_ARGMIN) + { + if (s_A[tx] > s_A[tx + 64]) + { + s_A[tx] = s_A[tx + 64]; + s_Idx[tx] = s_A[tx + 64]; + } + } + else if constexpr (OP == np::NP_REDUCE_ARGMAX) + { + if (s_A[tx] < s_A[tx + 64]) + { + s_A[tx] = s_A[tx + 64]; + s_Idx[tx] = s_Idx[tx + 64]; + } + } + } + __syncthreads(); + } + + if (tx < 32) + kernelWarpReduceArgF(s_A, s_Idx, tx); + + if (tx == 0) + { + outputMax[bx] = s_A[0]; + outputIdx[bx] = s_Idx[0]; + } + + A += size; + A_idx += size; + outputIdx += gridDim.x; + outputMax += gridDim.x; + __syncthreads(); + } +} + +template +__global__ void kernelMatShuffle(TP *A, const int size, const unsigned long long seed) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + idx *= 4; + if(idx < size){ + // Seed the random number generator + curandState state; + curand_init(seed, idx, 0, &state); // Initialize curand state for each thread + + int j = curand_uniform(&state) * idx; // generate random number b/w [0, idx] + + // Swap array[i] and array[j] + TP temp = A[idx]; + A[idx] = A[j]; + A[j] = temp; + + ++idx; + if(idx < size){ + j = curand_uniform(&state) * idx; // generate random number b/w [0, idx] + temp = A[idx]; + A[idx] = A[j]; + A[j] = temp; + } + ++idx; + if(idx < size){ + j = curand_uniform(&state) * idx; // generate random number b/w [0, idx] + temp = A[idx]; + A[idx] = A[j]; + A[j] = temp; + } + ++idx; + if(idx < size){ + j = curand_uniform(&state) * idx; // generate random number b/w [0, idx] + temp = A[idx]; + A[idx] = A[j]; + A[j] = temp; + } + } +} + +template +__global__ void 
kernelReLUBackward(TP *dout, TP *cache, TP *dX, int N){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int grid_size = blockDim.x * gridDim.x; + + while(idx < N){ + dX[idx] = dout[idx] * (cache[idx] > 0); + idx += grid_size; + } +} + +// scores = scores + 1e-8; // epsilon to prevent -log(0) +// auto loss = (-np::log(scores.get(np::arange(x.rows()), y))); +// dx.set(np::arange(x.rows()), y, NP_OP_SUB, 1); +template +__global__ void kernelSoftMaxUtils(TP *scores, const int *y, float *scores_for_loss, const int NUM_CLASSES, const int N){ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int grid_size = blockDim.x * gridDim.x; + + while(idx < N){ + scores_for_loss[idx] = -(logf(scores[idx * NUM_CLASSES + y[idx]] + 1e-8)); + scores[idx * NUM_CLASSES + y[idx]] -= 1; + idx += grid_size; + } +} + +#endif \ No newline at end of file diff --git a/lightnumpy/_gpu_core/include/numC++/gpuConfig.cuh b/lightnumpy/_gpu_core/include/numC++/gpuConfig.cuh new file mode 100644 index 0000000..babc743 --- /dev/null +++ b/lightnumpy/_gpu_core/include/numC++/gpuConfig.cuh @@ -0,0 +1,16 @@ +#ifndef CUDA_CONFIG_CUH +#define CUDA_CONFIG_CUH + +#include + +namespace np +{ + // getting GPU Config to launch kernels with the most optimal + extern int GPU_NUM_CUDA_CORE; + extern int GPU_NUM_SM; + extern cublasHandle_t cbls_handle; + + int _ConvertSMVer2Cores(int major, int minor); + void getGPUConfig(int devId = 0); +} +#endif \ No newline at end of file diff --git a/lightnumpy/_gpu_core/include/numC++/npFunctions.cuh b/lightnumpy/_gpu_core/include/numC++/npFunctions.cuh new file mode 100644 index 0000000..9f6591a --- /dev/null +++ b/lightnumpy/_gpu_core/include/numC++/npFunctions.cuh @@ -0,0 +1,398 @@ +#ifndef NPFUNCTIONS_CUH +#define NPFUNCTIONS_CUH + +#include +#include +#include +#include + +#include + +#include +#include +namespace np +{ + template + ArrayGPU ones(const int rows = 1, const int cols = 1); + + template + ArrayGPU zeros(const int rows = 1, const int cols = 1); + + // numbers from [0, range) spaced by 1 + template + ArrayGPU arange(const int range); + + // numbers from [start, stop) spaced by step + template + ArrayGPU arange(const float start, const float stop, const float step = 1); + + template + ArrayGPU _maxmin(const ArrayGPU &A, const ArrayGPU &B); + + template + ArrayGPU _maxmin(const ArrayGPU &A, const TP Scalar); + + // max(a, b). element wise maximum + template + ArrayGPU maximum(const ArrayGPU &A, const ArrayGPU &B); + + template + ArrayGPU maximum(const ArrayGPU &A, const TP Scalar); + + template + ArrayGPU maximum(const TP Scalar, const ArrayGPU &A); + + // min(a, b). 
element wise minimum + template + ArrayGPU minimum(const ArrayGPU &A, const ArrayGPU &B); + + template + ArrayGPU minimum(const ArrayGPU &A, const TP Scalar); + + template + ArrayGPU minimum(const TP Scalar, const ArrayGPU &A); + + /* + functions per element + F + #define NP_F_EXP 17 + #define NP_F_LOG 18 + #define NP_F_SQUARE 19 + #define NP_F_SQRT 20 + #define NP_F_POW 21 + */ + template + ArrayGPU _F(const ArrayGPU &A); + + // np.exp + template + ArrayGPU exp(const ArrayGPU &A); + + // np.log + template + ArrayGPU log(const ArrayGPU &A); + + // np.square + template + ArrayGPU square(const ArrayGPU &A); + + // np.sqrt + template + ArrayGPU sqrt(const ArrayGPU &A); + + // np.pow + template + ArrayGPU pow(const ArrayGPU &A, const int pow); + + // np.shuffle + template + void shuffle(ArrayGPU &A, unsigned long long seed = static_cast(time(NULL))); + + // np.array_split + template + std::vector> array_split(const ArrayGPU &A, const int num_parts, int axis = 0); + + template + ArrayGPU ones(const int rows, const int cols) + { + return ArrayGPU(rows, cols, static_cast(1)); + } + + template + ArrayGPU zeros(const int rows, const int cols) + { + return ArrayGPU(rows, cols, static_cast(0)); + } + + template + ArrayGPU arange(const int range) + { + ArrayGPU ans(range); + + const int BLOCK_SIZE = GPU_NUM_CUDA_CORE; + dim3 block(BLOCK_SIZE); + dim3 grid(std::min(GPU_NUM_SM * 2, np_ceil(range, block.x))); + + kernelInitMatArange<<>>(ans.mat, range); + cudaDeviceSynchronize(); + + return ans; + } + + template + ArrayGPU arange(const float start, const float stop, const float step) + { + const int sz = static_cast((stop - start) / step); + return start + (arange(sz) * step); + } + + /* + maxmin + F + #define NP_OP_MAXIMUM 11 + #define NP_OP_MINIMUM 12 + */ + template + ArrayGPU _maxmin(const ArrayGPU &A, const ArrayGPU &B) + { + if (A.rows() == 1 && A.cols() == 1) + { + // A is scalar + ArrayGPU res(B.rows(), B.cols()); + + const int BLOCK_SIZE = GPU_NUM_CUDA_CORE; + dim3 block(BLOCK_SIZE); + dim3 grid(std::min(GPU_NUM_SM * 2, np_ceil(res.size(), block.x))); + + kernelMatOpScalar<<>>(B.mat, A.mat, res.mat, res.size()); + cudaDeviceSynchronize(); + return res; + } + else if (B.rows() == 1 && B.cols() == 1) + { + // B is scalar + ArrayGPU res(A.rows(), A.cols()); + + const int BLOCK_SIZE = GPU_NUM_CUDA_CORE; + dim3 block(BLOCK_SIZE); + dim3 grid(std::min(GPU_NUM_SM * 2, np_ceil(res.size(), block.x))); + + kernelMatOpScalar<<>>(A.mat, B.mat, res.mat, res.size()); + cudaDeviceSynchronize(); + return res; + } + // A is vector + // A vector ki dim, is equal to either col or row of B + // row vector. will extend along cols if possible. 
(prioritising in case of square matrix) + // vice versa for cols + + else if ((A.cols() == 1 && A.rows() == B.rows()) || (A.rows() == 1 && A.cols() == B.rows())) + { + // along rows add kr + ArrayGPU res(B.rows(), B.cols()); + const int BLOCK_SIZE = GPU_NUM_CUDA_CORE; + dim3 block(BLOCK_SIZE); + dim3 grid(std::min(GPU_NUM_SM * 2, np_ceil(res.size(), block.x))); + kernelMatOpVecAlongCols<<>>(A.mat, B.mat, res.mat, res.size(), B.cols()); + cudaDeviceSynchronize(); + + return res; + } + else if ((A.cols() == 1 && A.rows() == B.cols()) || (A.rows() == 1 && A.cols() == B.cols())) + { + // along cols add kr + ArrayGPU res(B.rows(), B.cols()); + const int BLOCK_SIZE = GPU_NUM_CUDA_CORE; + dim3 block(BLOCK_SIZE); + dim3 grid(std::min(GPU_NUM_SM * 2, np_ceil(res.size(), block.x))); + kernelMatOpVecAlongRows<<>>(A.mat, B.mat, res.mat, res.size(), B.cols()); + cudaDeviceSynchronize(); + + return res; + } + // B is vetor + // B vector ki dim, is eq to either col or row of B + // row vector. will extend along cols if possible. (prioritising in case of square matrix) + else if ((B.cols() == 1 && A.rows() == B.rows()) || (B.rows() == 1 && A.rows() == B.cols())) + { + // along rows add kr + ArrayGPU res(A.rows(), A.cols()); + const int BLOCK_SIZE = GPU_NUM_CUDA_CORE; + dim3 block(BLOCK_SIZE); + dim3 grid(std::min(GPU_NUM_SM * 2, np_ceil(res.size(), block.x))); + kernelMatOpVecAlongCols<<>>(A.mat, B.mat, res.mat, res.size(), A.cols()); + cudaDeviceSynchronize(); + + return res; + } + else if ((B.cols() == 1 && A.cols() == B.rows()) || (B.rows() == 1 && A.cols() == B.cols())) + { + // along cols add kr + ArrayGPU res(A.rows(), A.cols()); + const int BLOCK_SIZE = GPU_NUM_CUDA_CORE; + dim3 block(BLOCK_SIZE); + dim3 grid(std::min(GPU_NUM_SM * 2, np_ceil(res.size(), block.x))); + kernelMatOpVecAlongRows<<>>(A.mat, B.mat, res.mat, res.size(), A.cols()); + cudaDeviceSynchronize(); + + return res; + } + else if (A.rows() == B.rows() && A.cols() == B.cols()) + { + // A and B both are matrices of same dimensions + ArrayGPU res(A.rows(), A.cols()); + const int BLOCK_SIZE = GPU_NUM_CUDA_CORE; + dim3 block(BLOCK_SIZE); + dim3 grid(std::min(GPU_NUM_SM*2, np_ceil(res.size(), block.x))); + kernelMatOpMat<<>>(A.mat, B.mat, res.mat, res.size()); + cudaDeviceSynchronize(); + return res; + } + else + { + std::cerr << "\nError in maximum! 
Check arguments"; + return np::ArrayGPU(1, 1, 0); + } + } + + template + ArrayGPU _maxmin(const ArrayGPU &A, const TP Scalar) + { + ArrayGPU tmp(1, 1, Scalar); + return _maxmin(A, tmp); + } + + // np.minimum + template + ArrayGPU minimum(const ArrayGPU &A, const ArrayGPU &B) + { + return _maxmin(A, B); + } + + template + ArrayGPU minimum(const ArrayGPU &A, const TP Scalar) + { + return _maxmin(A, Scalar); + } + + template + ArrayGPU minimum(const TP Scalar, const ArrayGPU &A) + { + return _maxmin(A, Scalar); + } + + // np.maximum + template + ArrayGPU maximum(const ArrayGPU &A, const ArrayGPU &B) + { + return _maxmin(A, B); + } + + template + ArrayGPU maximum(const ArrayGPU &A, const TP Scalar) + { + return _maxmin(A, Scalar); + } + + template + ArrayGPU maximum(const TP Scalar, const ArrayGPU &A) + { + return _maxmin(A, Scalar); + } + + /* + functions per element + F + #define NP_F_EXP 19 + #define NP_F_LOG 20 + #define NP_F_SQUARE 21 + #define NP_F_SQRT 22 + */ + template + ArrayGPU _F(const ArrayGPU &A) + { + ArrayGPU res(A.rows(), A.cols()); + + const int BLOCK_SIZE = GPU_NUM_CUDA_CORE; + dim3 block(BLOCK_SIZE); + dim3 grid(std::min(GPU_NUM_SM * 2, np_ceil(res.size(), block.x))); + kernelFMat<<>>(A.mat, res.mat, res.size()); + cudaDeviceSynchronize(); + return res; + } + + // np.exp + template + ArrayGPU exp(const ArrayGPU &A) + { + return _F(A); + } + + // np.log + template + ArrayGPU log(const ArrayGPU &A) + { + return _F(A); + } + + // np.square + template + ArrayGPU square(const ArrayGPU &A) + { + return _F(A); + } + + // np.sqrt + template + ArrayGPU sqrt(const ArrayGPU &A) + { + return _F(A); + } + + // np.pow + template + ArrayGPU pow(const ArrayGPU &A, const float pow) + { + ArrayGPU res(A.rows(), A.cols()); + + const int BLOCK_SIZE = GPU_NUM_CUDA_CORE; + dim3 block(BLOCK_SIZE); + dim3 grid(std::min(GPU_NUM_SM * 2, np_ceil(res.size(), block.x))); + + kernelPowMat<<>>(A.mat, pow, res.mat, res.size()); + cudaDeviceSynchronize(); + return res; + } + + // np.shuffle + template + void shuffle(ArrayGPU &A, unsigned long long seed) + { + if(A.size() <= 1) return; + const int BLOCK_SIZE = GPU_NUM_CUDA_CORE; + dim3 block(BLOCK_SIZE); + dim3 grid(np_ceil(A.size(), block.x * 4)); + + kernelMatShuffle<<>>(A.mat, A.size(), seed); + cudaDeviceSynchronize(); + } + + template + std::vector> array_split(const ArrayGPU &A, const int num_parts, int axis) + { + if(num_parts <= 0){ + std::cerr << "INVALID NUM_PARTS ARGUMENT IN array_split!\n"; + return {}; + } + else if(axis != 0){ + std::cerr << "INVALID AXIS ARGUMENT IN array_split!\n"; + return {}; + } + else if(num_parts == 1) + return {A}; + // returns length % n sub-arrays of size length/n + 1 and the rest of size length/n. + else if (axis == 0) + { + std::vector> splitted_arrays; + + int tot_size = A.rows(); + int part_size = tot_size / num_parts; + int remainder = tot_size % num_parts; + + int st_idx = 0; + for (int i = 0; i < num_parts; ++i) + { + int this_part_size = part_size + (i < remainder ? 
1 : 0); + + np::ArrayGPU tmp(A.mat + st_idx, this_part_size, A.cols(), "gpu"); + + splitted_arrays.push_back(tmp); + + st_idx += tmp.size(); + } + return splitted_arrays; + } + } + +} +#endif \ No newline at end of file diff --git a/lightnumpy/_gpu_core/include/numC++/npGPUArray.cuh b/lightnumpy/_gpu_core/include/numC++/npGPUArray.cuh new file mode 100644 index 0000000..03e700a --- /dev/null +++ b/lightnumpy/_gpu_core/include/numC++/npGPUArray.cuh @@ -0,0 +1,2311 @@ +#ifndef NPGPUARRAY_CUH +#define NPGPUARRAY_CUH + +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace np +{ + template + class ArrayGPU + { + private: + // reference counter -> Implemented so that all arrays can share the same memory. array will be deleted when this becomes 0. + int *ref_count; + // _rows and _cols of array. + int _rows, _cols; + + template + ArrayGPU applyOp(const ArrayGPU &B) const; + template + ArrayGPU applyOp(const TP Scalar) const; + + template + ArrayGPU applyReductionF(const int axis) const; + template + ArrayGPU applyReductionArgF(const int axis) const; + + public: + // main array. + TP *mat; + friend class Random; + + // ####################### CONSTRUCTORS ############################ + + /* + Parameterised constructor + Creates a 1D array + Arguments: + * sz = size of array + Ex: ArrayGPU(10); + */ + ArrayGPU(const int sz = 1); + + /* + Parameterised constructor + Creates a 2D array + Arguments: + * _rows = _rows in array + * _cols = _cols in array + Ex: ArrayGPU(3, 4); + */ + ArrayGPU(const int _rows, const int _cols); + + /* + Parameterised constructor + Creates a 2D array fill with a default value. + Arguments: + * _rows = _rows in array + * _cols = _cols in array + * Val = Scalar to fill the array with + Ex: ArrayGPU(3, 4, 0); + */ + ArrayGPU(const int _rows, const int _cols, const TP Val); + + /* + Parameterised constructor + Creates a 1D array with values taking from std::vector. + Arguments: + * std::vector<> + Ex: ArrayGPU({0, 1, 2, 3, 4, 5}); + */ + ArrayGPU(const std::vector &A); + + /* + Parameterised constructor + Creates a 2D array with values taking from std::vector. + Arguments: + * std::vector> + Ex: ArrayGPU({{0, 1, 2, 3, 4, 5}, + {6, 7, 8, 9, 10, 11}); + */ + ArrayGPU(const std::vector> &A); + + /* + Parameterised constructor + Creates a 1D array and copies data from a pointer to array in memory + Arguments: + * TP* + * sz = size of array + * loc = "cpu" or "gpu" + Ex: ArrayGPU(array, 5, "cpu"); + */ + ArrayGPU(const TP *h_array, const int sz, const std::string &loc = "cpu"); + + /* + Parameterised constructor + Creates a 2D array and copies data from a pointer to array in memory + Arguments: + * TP* + * _rows = _rows of mat + * _cols = _cols of mat + * loc = "cpu" or "gpu" + Ex: ArrayGPU(array, 5, 6, "gpu"); + */ + ArrayGPU(const TP *h_array, int _rows, const int _cols, const std::string &loc = "cpu"); + + /* + Copy constructor + */ + ArrayGPU(const ArrayGPU &A); + + /* + assignment operator overload + */ + void operator=(const ArrayGPU &A); + + /* + assignment operator overload. this fills the array with Scal value. + created to be used with indexing + */ + void operator=(const TP Scal); + + // ####################### GETTER FUNCTIONS ############################ + + /* + returns size of array. + Ex: A.size(); + */ + unsigned int size() const; + + /* + returns _rows of array. + Ex: A._rows(); + */ + unsigned int rows() const; + + /* + returns _cols of array. 
+ Ex: A._cols(); + */ + unsigned int cols() const; + + /* + returns reference count of array. + Ex: A.refCount(); + */ + unsigned int refCount() const; + + // ####################### ARRAY UTILITY FUNCTIONS ############################ + + /* + Prints the array on stdout. + Ex: A.print(); + */ + void print() const; + + /* + Overloaded cout + Ex: std::cout< &A); + + /* + Returns a copy of array. + Ex: auto B = A.copy(); + */ + ArrayGPU copy() const; + + /* + Returns transpose of array. + Ex: auto AT = A.T(); + */ + ArrayGPU T() const; + + /* + Reshapes the array. Org size has to be same as new size. + Arguments: + * newRows - number of _rows + * newCols - number of _cols + Ex: A.reshape(5, 10); + */ + void reshape(const int newRows, const int newCols); + + /* + Returns a copy of array as cpu pointer. + Ex: float *a = A.cpu(); + */ + TP *cpu() const; + + // ####################### ARRAY ELEMENT GETTER SETTER FUNCTIONS ############################ + + /* + Returns element at idx as array object. + Arguments: + * idx - idx of element + Ex: auto B = A.at(0); + */ + ArrayGPU at(const int idx) ; + + /* + Returns element at (r, c) as array object. + Arguments: + * r - r to access from + * c - c to access from + Ex: auto B = A.at(5, 2); + */ + ArrayGPU at(const int r, const int c) ; + + /* + Returns element at idx as array object. + Arguments: + * idx - idx of element + Ex: auto B = A.get(0); + Note: at returns reference, get returns copy. + */ + TP get(const int idx) const; + + /* + Returns element at (r, c) as array object. + Arguments: + * r - r to access from + * c - c to access from + Ex: auto B = A.get(5, 2); + Note: at returns reference, get returns copy. + */ + TP get(const int r, const int c) const; + + /* + Returns elements at std::vector indexes as array object. + Arguments: + * std::vector - indexes to fetch from + Ex: auto B = A.get({1, 2, 3, 4, 5}); + */ + ArrayGPU get(const std::vector &idxs) const; + + /* + Returns elements at ArrayGPU indexes as array object. + Arguments: + * ArrayGPU - indexes to fetch from + + Ex: auto B = A.get({1, 2, 3, 4, 5}); + */ + ArrayGPU get(const ArrayGPU &idxs) const; + + /* + Returns elements at ArrayGPU && ArrayGPU indexes as array object. + Arguments: + * idx - idx of element + Ex: auto B = A.get(R, C); + */ + ArrayGPU get(const ArrayGPU &r, const ArrayGPU &c) const; + + /* + Returns elements at std::vector && ArrayGPU indexes as array object. + Arguments: + * idx - idx of element + Ex: auto B = A.get({1, 2, 3, 4, 5}, C); + */ + ArrayGPU get(const std::vector &r, const ArrayGPU &c) const; + + /* + Returns elements at ArrayGPU && std::vector indexes as array object. + Arguments: + * idx - idx of element + Ex: auto B = A.get(R, {1, 2, 3, 4, 5}); + */ + ArrayGPU get(const ArrayGPU &r, const std::vector &c) const; + + /* + Returns elements at std::vector<> && ArrayGPU indexes as array object. + Arguments: + * idx - idx of element + Ex: auto B = A.get({1, 2, 3, 4, 5}, {1, 2, 3, 4, 5, 6, 7, 8}); + */ + ArrayGPU get(const std::vector &r, const std::vector &c) const; + + /* + modifies element at idx. 
+ Arguments: + * idx - idx of element + * operator + * Val - Scalar + Ex: A.set(0, NP_OP_ADD, 1); // adds one + */ + void set(const int idx, const Operation op, const TP operand = 0); + + /* + modifies element at (r, c) + Arguments: + * r - r to access from + * c - c to access from + * operator + * Val - Scalar + Ex: A.set(5, 2, NP_OP_SUB, 2); + */ + void set(const int r, const int c, const Operation op, const TP operand = 0); + + /* + modifies element at a list of indexes + Arguments: + * idxs - indexes to modify - (ArrayGPU or vector) + * operator + * Val - Scalar + Ex: A.set(idxs, NP_OP_MUL, 2); + */ + void set(const ArrayGPU &idxs, const Operation op, const TP operand = 0); + + /* + modifies element at a list of indexes + Arguments: + * idxs - indexes to modify - (ArrayGPU or vector) + * operator + * Val - Scalar + Ex: A.set({1, 2, 3}, NP_OP_MUL, 2); + */ + void set(const std::vector &idxs, const Operation op, const TP operand = 0); + + /* + modifies element at a list of indexes + Arguments: + * r_idxs - row indexes to modify - (ArrayGPU or vector) + * c_idxs - col indexes to modify - (ArrayGPU or vector) + * operator + * Val - Scalar + Ex: A.set(r_idxs, c_idxs, NP_OP_MUL, 2); + */ + void set(const ArrayGPU &r, const ArrayGPU &c, const Operation op, const TP operand = 0); + + /* + modifies element at a ArrayGPU of indexes + Arguments: + * r_idxs - row indexes to modify + * c_idxs - col indexes to modify + * operator + * Val - Scalar + Ex: A.set(r_idxs, c_idxs, NP_OP_MUL, 2); + */ + void set(const std::vector &r, const ArrayGPU &c, Operation op, TP operand = 0); + + /* + modifies element at a ArrayGPU of indexes + Arguments: + * r_idxs - row indexes to modify - (ArrayGPU or vector) + * c_idxs - col indexes to modify - (ArrayGPU or vector) + * operator + * Val - Scalar + Ex: A.set(r_idxs, c_idxs, NP_OP_MUL, 2); + */ + void set(const ArrayGPU &r, const std::vector &c, Operation op, TP operand = 0); + + /* + modifies element at a ArrayGPU of indexes + Arguments: + * r_idxs - row indexes to modify + * c_idxs - col indexes to modify + * operator + * Val - Scalar + Ex: A.set(r_idxs, c_idxs, NP_OP_MUL, 2); + */ + void set(const std::vector &r, const std::vector &c, Operation op, TP operand = 0); + + /* + modifies element at a list of indexes + Arguments: + * idxs - indexes to modify - (ArrayGPU or vector) + * operator + * Val - ArrayGPU + Ex: A.set({1, 2, 3}, NP_OP_MUL, ar); + */ + void set(const std::vector &idxs, Operation op, const ArrayGPU & operand); + + /* + modifies element at a list of indexes + Arguments: + * idxs - indexes to modify - (ArrayGPU or vector) + * operator + * Val - ArrayGPU + Ex: A.set(idxs, NP_OP_MUL, ar); + */ + void set(const ArrayGPU &idxs, Operation op, const ArrayGPU & operand); + + /* + modifies element at a list of indexes + Arguments: + * r_idxs - row indexes to modify - (ArrayGPU or vector) + * c_idxs - col indexes to modify - (ArrayGPU or vector) + * operator + * Val - ArrayGPU + Ex: A.set(r_idxs, c_idxs, NP_OP_MUL, ar); + */ + void set(const ArrayGPU &r, const ArrayGPU &c, Operation op, const ArrayGPU & operand); + + /* + modifies element at a ArrayGPU of indexes + Arguments: + * r_idxs - row indexes to modify - (ArrayGPU or vector) + * c_idxs - col indexes to modify - (ArrayGPU or vector) + * operator + * Val - ArrayGPU + Ex: A.set(r_idxs, c_idxs, NP_OP_MUL, ar); + */ + void set(const ArrayGPU &r, const std::vector &c, Operation op, const ArrayGPU & operand); + + /* + modifies element at a ArrayGPU of indexes + Arguments: + * r_idxs - row indexes to modify + 
* c_idxs - col indexes to modify + * operator + * Val - Scalar + Ex: A.set(r_idxs, c_idxs, NP_OP_MUL, ar); + */ + void set(const std::vector &r, const ArrayGPU &c, Operation op, const ArrayGPU & operand); + + /* + modifies element at a ArrayGPU of indexes + Arguments: + * r_idxs - row indexes to modify + * c_idxs - col indexes to modify + * operator + * Val - Scalar + Ex: A.set(r_idxs, c_idxs, NP_OP_MUL, ar); + */ + void set(const std::vector &r, const std::vector &c, Operation op, const ArrayGPU & operand); + + // ####################### DOT PRODUCT ############################ + + /* + Returns dot product of two arrays + Arguments: + * B - second array + Ex: auto C = A.dot(B); + */ + ArrayGPU dot(const ArrayGPU &B) const; + /* + Returns dot product of two arrays. First is transposed + Arguments: + * B - second array + Ex: auto C = A.Tdot(B); + Is same as - auto C = A.T().dot(B) + */ + ArrayGPU Tdot(const ArrayGPU &B) const; + /* + Returns dot product of two arrays. Second is transposed + Arguments: + * B - second array + Ex: auto C = A.dotT(B); + Is same as - auto C = A.dot(B.T()) + */ + ArrayGPU dotT(const ArrayGPU &B) const; + + // TO BE DONE. + + // add functions + ArrayGPU operator+(const ArrayGPU &B) const; + ArrayGPU operator+(const TP Scalar) const; + + // minus + ArrayGPU operator-(const ArrayGPU &B) const; + ArrayGPU operator-(const TP Scalar) const; + + // unary negation operator + ArrayGPU operator-() const; + + // multiply + ArrayGPU operator*(const ArrayGPU &B) const; + ArrayGPU operator*(const TP Scalar) const; + + // divide + ArrayGPU operator/(const ArrayGPU &B) const; + ArrayGPU operator/(const TP Scalar) const; + + // returns an array of 0s and 1s depending on true or false of the conditions. + // element wise comparison + + // > + ArrayGPU operator>(const ArrayGPU &B) const; + ArrayGPU operator>(const TP Scalar) const; + + // < + ArrayGPU operator<(const ArrayGPU &B) const; + ArrayGPU operator<(const TP Scalar) const; + + // >= + ArrayGPU operator>=(const ArrayGPU &B) const; + ArrayGPU operator>=(const TP Scalar) const; + + // <= + ArrayGPU operator<=(const ArrayGPU &B) const; + ArrayGPU operator<=(const TP Scalar) const; + + // == + ArrayGPU operator==(const ArrayGPU &B) const; + ArrayGPU operator==(const TP Scalar) const; + + // != + ArrayGPU operator!=(const ArrayGPU &B) const; + ArrayGPU operator!=(const TP Scalar) const; + + // sum. along axis or total + ArrayGPU sum(const int axis = -1) const; + + // max. along axis or total + ArrayGPU max(const int axis = -1) const; + + // min. 
along axis or total + ArrayGPU min(const int axis = -1) const; + + // argmax + ArrayGPU argmax(const int axis = -1) const; + // argmin + ArrayGPU argmin(const int axis = -1) const; + + // sort + // argsort + + ~ArrayGPU(); + }; + + // ####################### CONSTRUCTORS ############################ + + /* + Parameterised constructor + Creates a 1D array + Arguments: + * sz = size of array + Ex: ArrayGPU(10); + */ + template + ArrayGPU::ArrayGPU(const int sz) + { + this->_rows = 1; + this->_cols = sz; + + CUDA_CALL(cudaMalloc((void **)&this->mat, this->_rows * this->_cols * sizeof(TP))); + + // initialising ref_count + this->ref_count = (int *)malloc(sizeof(int)); + (*this->ref_count) = 1; + } + + /* + Parameterised constructor + Creates a 2D array + Arguments: + * _rows = _rows in array + * _cols = _cols in array + Ex: ArrayGPU(3, 4); + */ + template + ArrayGPU::ArrayGPU(const int rows, const int cols) + { + this->_rows = rows; + this->_cols = cols; + + CUDA_CALL(cudaMalloc((void **)&this->mat, this->_rows * this->_cols * sizeof(TP))); + + // initialising ref_count + this->ref_count = (int *)malloc(sizeof(int)); + (*this->ref_count) = 1; + } + + /* + Parameterised constructor + Creates a 2D array fill with a default value. + Arguments: + * _rows = _rows in array + * _cols = _cols in array + * Val = Scalar to fill the array with + Ex: ArrayGPU(3, 4, 0); + */ + template + ArrayGPU::ArrayGPU(const int rows, const int cols, const TP Val) + { + this->_rows = rows; + this->_cols = cols; + + CUDA_CALL(cudaMalloc((void **)&this->mat, this->_rows * this->_cols * sizeof(TP))); + + const int BLOCK_SIZE = GPU_NUM_CUDA_CORE; + dim3 block(BLOCK_SIZE); + dim3 grid(std::min(GPU_NUM_SM * 2, np_ceil( (this->_rows * this->_cols), block.x))); + + kernelInitMatBroadcast<<>>(mat, Val, this->_rows * this->_cols); + cudaDeviceSynchronize(); + + // initialising ref_count + this->ref_count = (int *)malloc(sizeof(int)); + (*this->ref_count) = 1; + } + + /* + Parameterised constructor + Creates a 1D array with values taking from std::vector. + Arguments: + * std::vector<> + Ex: ArrayGPU({0, 1, 2, 3, 4, 5}); + */ + template + ArrayGPU::ArrayGPU(const std::vector &A) + { + this->_rows = 1; + this->_cols = A.size(); + + + CUDA_CALL(cudaMalloc((void **)&this->mat, this->_rows * this->_cols * sizeof(TP))); + + CUDA_CALL(cudaMemcpy(this->mat, A.data(), this->_rows * this->_cols * sizeof(TP), cudaMemcpyHostToDevice)); + + // initialising ref_count + this->ref_count = (int *)malloc(sizeof(int)); + (*this->ref_count) = 1; + } + + /* + Parameterised constructor + Creates a 2D array with values taking from std::vector. 
+ Arguments: + * std::vector> + Ex: ArrayGPU({{0, 1, 2, 3, 4, 5}, + {6, 7, 8, 9, 10, 11}); + */ + template + ArrayGPU::ArrayGPU(const std::vector> &A) + { + this->_rows = A.size(); + this->_cols = A[0].size(); + + CUDA_CALL(cudaMalloc((void **)&this->mat, this->_rows * this->_cols * sizeof(TP))); + + for (int rowIdx = 0; rowIdx < this->_rows; ++rowIdx) + CUDA_CALL(cudaMemcpy(mat + rowIdx * this->_cols, A[rowIdx].data(), this->_cols * sizeof(TP), cudaMemcpyHostToDevice)); + + // initialising ref_count + this->ref_count = (int *)malloc(sizeof(int)); + (*this->ref_count) = 1; + } + + /* + Parameterised constructor + Creates a 1D array and copies data from a pointer to array in memory + Arguments: + * TP* + * sz = size of array + * loc = "cpu" or "gpu" + Ex: ArrayGPU(array, 5, "cpu"); + */ + template + ArrayGPU::ArrayGPU(const TP *array, const int sz, const std::string &loc) + { + this->_rows = 1; + this->_cols = sz; + + if (loc == "cpu") + { + CUDA_CALL(cudaMalloc((void **)&this->mat, this->_rows * this->_cols * sizeof(float))); + CUDA_CALL(cudaMemcpy(this->mat, array, this->_rows * this->_cols * cudaMemcpyHostToDevice)); + // initialising ref_count + this->ref_count = (int *)malloc(sizeof(int)); + (*this->ref_count) = 1; + } + else if (loc == "gpu") + { + CUDA_CALL(cudaMalloc((void **)&this->mat, this->_rows * this->_cols * sizeof(float))); + CUDA_CALL(cudaMemcpy(this->mat, array, this->_rows * this->_cols * cudaMemcpyDeviceToDevice)); + // initialising ref_count + this->ref_count = (int *)malloc(sizeof(int)); + (*this->ref_count) = 1; + } + else + std::cerr << "INVALID PARAM LOC: POSSIBLE VALUES \"cpu\", \"gpu\"\n"; + } + + /* + Parameterised constructor + Creates a 2D array and copies data from a pointer to array in memory + Arguments: + * TP* + * _rows = _rows of mat + * _cols = _cols of mat + * loc = "cpu" or "gpu" + Ex: ArrayGPU(array, 5, 6, "gpu"); + */ + template + ArrayGPU::ArrayGPU(const TP *array, const int _rows, const int _cols, const std::string &loc) + { + this->_rows = _rows; + this->_cols = _cols; + + if (loc == "cpu") + { + CUDA_CALL(cudaMalloc((void **)&this->mat, this->_rows * this->_cols * sizeof(TP))); + CUDA_CALL(cudaMemcpy(this->mat, array, this->_rows * this->_cols * sizeof(TP), cudaMemcpyHostToDevice)); + // initialising ref_count + this->ref_count = (int *)malloc(sizeof(int)); + (*this->ref_count) = 1; + } + else if (loc == "gpu") + { + CUDA_CALL(cudaMalloc((void **)&this->mat, this->_rows * this->_cols * sizeof(TP))); + CUDA_CALL(cudaMemcpy(this->mat, array, this->_rows * this->_cols * sizeof(TP), cudaMemcpyDeviceToDevice)); + // initialising ref_count + this->ref_count = (int *)malloc(sizeof(int)); + (*this->ref_count) = 1; + } + else + std::cerr << "INVALID PARAM LOC: POSSIBLE VALUES \"cpu\", \"gpu\"\n"; + } + + /* + Copy constructor + */ + template + ArrayGPU::ArrayGPU(const ArrayGPU &A) + { + this->_rows = A._rows; + this->_cols = A._cols; + this->mat = A.mat; + this->ref_count = A.ref_count; + ++(*this->ref_count); + } + + /* + assignment operator overload + */ + template + void ArrayGPU::operator=(const ArrayGPU &A) + { + if (this != &A) + { + --(*this->ref_count); + if (*this->ref_count == 0) + { + CUDA_CALL(cudaFree(this->mat)); + free(this->ref_count); + } + this->_rows = A._rows; + this->_cols = A._cols; + this->mat = A.mat; + this->ref_count = A.ref_count; + ++(*this->ref_count); + } + } + + /* + assignment operator overload. this fills the array with Scal value. 
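A minimal usage sketch of the reference-counted copy semantics defined above (illustrative only; the element type float and the variable names are assumptions, since the template parameters are elided in this patch):

np::ArrayGPU<float> A(3, 4, 1.0f);      // ref count of A's buffer is 1
{
    np::ArrayGPU<float> B = A;          // copy ctor: B aliases A.mat, shared count becomes 2
    B = np::ArrayGPU<float>(2, 2);      // assignment releases B's claim on A's buffer (count back to 1)
}                                       // leaving scope frees only the 2x2 buffer B last owned
// A's original 3x4 device allocation is still valid here; A.refCount() == 1.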
+ created to be used with indexing + */ + template + void ArrayGPU::operator=(const TP Scal) + { + const int BLOCK_SIZE = GPU_NUM_CUDA_CORE; + dim3 block(BLOCK_SIZE); + dim3 grid(std::min(GPU_NUM_SM * 2, np_ceil((this->_rows * this->_cols), block.x))); + + kernelInitMatBroadcast<<>>(mat, Scal, this->_rows * this->_cols); + cudaDeviceSynchronize(); + } + + // ####################### GETTER FUNCTIONS ############################ + + /* + returns size of array. + Ex: A.size(); + */ + template + unsigned int ArrayGPU::size() const + { + return this->_rows * this->_cols; + } + + /* + returns _rows of array. + Ex: A._rows(); + */ + template + unsigned int ArrayGPU::rows() const + { + return this->_rows; + } + + /* + returns _cols of array. + Ex: A._cols(); + */ + template + unsigned int ArrayGPU::cols() const + { + return this->_cols; + } + + /* + returns reference count of array. + Ex: A.refCount(); + */ + template + unsigned int ArrayGPU::refCount() const + { + return (*this->ref_count); + } + + // ####################### ARRAY UTILITY FUNCTIONS ############################ + + /* + Prints the array on stdout. + Ex: A.print(); + */ + template + void ArrayGPU::print() const + { + kernelPrintMat<<<1, 1>>>(mat, this->_rows, this->_cols); + cudaDeviceSynchronize(); + } + + /* + Overloaded cout + Ex: std::cout< + std::ostream &operator<<(std::ostream &out, const ArrayGPU &A) + { + A.print(); + return out; + } + + /* + Returns a copy of array. + Ex: auto B = A.copy(); + */ + template + ArrayGPU ArrayGPU::copy() const + { + return ArrayGPU(this->mat, this->_rows, this->_cols, "gpu"); + } + + /* + Returns transpose of array. + Ex: auto AT = A.T(); + */ + template + ArrayGPU ArrayGPU::T() const + { + ArrayGPU out(this->_cols, this->_rows); + const int TILE_WIDTH = (GPU_NUM_CUDA_CORE == 64) ? 8 : 16; + const int ROW_BLOCK = (GPU_NUM_CUDA_CORE == 64) ? 4 : 8; + dim3 block(TILE_WIDTH, ROW_BLOCK); + dim3 grid(np_ceil(this->_cols, TILE_WIDTH), np_ceil(this->_rows, TILE_WIDTH)); + + switch (GPU_NUM_CUDA_CORE) + { + case 64: + kernelTransposeInMem<<>>(this->mat, out.mat, this->_rows, this->_cols); + break; + + default: + kernelTransposeInMem<<>>(this->mat, out.mat, this->_rows, this->_cols); + break; + } + cudaDeviceSynchronize(); + + return out; + } + + /* + Reshapes the array. Org size has to be same as new size. + Arguments: + * newRows - number of _rows + * newCols - number of _cols + Ex: A.reshape(5, 10); + */ + template + void ArrayGPU::reshape(const int newRows, const int newCols) + { + if (newRows * newCols == this->_rows * this->_cols) + { + this->_rows = newRows; + this->_cols = newCols; + } + else if (newRows == -1 && (this->_rows * this->_cols) % newCols == 0) + { + this->_rows = (this->_rows * this->_cols) / newCols; + this->_cols = newCols; + } + else if (newCols == -1 && (this->_rows * this->_cols) % newRows == 0) + { + this->_cols = (this->_rows * this->_cols) / newRows; + this->_rows = newRows; + } + else + std::cerr << "\nError! New size and old size are not equal."; + } + + /* + Returns a copy of array as cpu pointer. + Ex: float *a = A.cpu(); + */ + template + TP *ArrayGPU::cpu() const + { + TP *array_h = (TP *)malloc(this->_rows * this->_cols * sizeof(TP)); + + CUDA_CALL(cudaMemcpy(array_h, this->mat, this->_rows * this->_cols * sizeof(TP), cudaMemcpyDeviceToHost)); + return array_h; + } + + // ####################### ARRAY ELEMENT GETTER SETTER FUNCTIONS ############################ + + /* + Returns element at idx as array object. 
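A short sketch of the utility functions above (reshape with -1, transpose, host copy-out); the element type and names are assumed for illustration:

np::ArrayGPU<float> A(2, 6, 3.0f);
A.reshape(3, -1);            // -1 infers the remaining dimension, giving a 3 x 4 view of the same buffer
auto AT = A.T();             // 4 x 3 transposed copy computed on the device
float *host = A.cpu();       // malloc'd host copy of all 12 elements
free(host);                  // the caller owns (and must free) the pointer returned by cpu()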
+ Arguments: + * idx - idx of element + Ex: auto B = A.at(0); + */ + template + ArrayGPU ArrayGPU::at(const int idx) + { + + ArrayGPU res(1, 1); + res.mat = (this->mat + idx); + res.ref_count = this->ref_count; + ++(*this->ref_count); + return res; + } + + /* + Returns element at (r, c) as array object. + Arguments: + * r - r to access from + * c - c to access from + Ex: auto B = A.at(5, 2); + */ + template + ArrayGPU ArrayGPU::at(const int r, const int c) + { + return this->at(r * this->_cols + c); + } + + /* + Returns element at idx as array object. + Arguments: + * idx - idx of element + Ex: auto B = A.get(0); + Note: at returns reference, get returns copy. + */ + template + TP ArrayGPU::get(const int idx) const + { + TP val; + CUDA_CALL(cudaMemcpy(&val, mat + idx, sizeof(TP), cudaMemcpyDeviceToHost)); + return val; + } + + /* + Returns element at (r, c) as array object. + Arguments: + * r - r to access from + * c - c to access from + Ex: auto B = A.get(5, 2); + Note: at returns reference, get returns copy. + */ + template + TP ArrayGPU::get(const int r, const int c) const + { + return this->get(r * this->_cols + c); + } + + /* + Returns elements at ArrayGPU indexes as array object. + Arguments: + * ArrayGPU - indexes to fetch from + + Ex: auto B = A.get({1, 2, 3, 4, 5}); + */ + template + ArrayGPU ArrayGPU::get(const ArrayGPU &idxs) const + { + int sz = std::max(idxs.rows(), idxs.cols()); + ArrayGPU res(sz); + + const int BLOCK_SIZE = (GPU_NUM_CUDA_CORE == 64) ? 64 : 128; + dim3 block(BLOCK_SIZE); + dim3 grid(std::min(GPU_NUM_SM * 2, np_ceil(sz, block.x))); + + kernelGetMat<<>>(this->mat, res.mat, idxs.mat, sz); + cudaDeviceSynchronize(); + + return res; + } + + /* + Returns elements at std::vector indexes as array object. + Arguments: + * std::vector - indexes to fetch from + Ex: auto B = A.get({1, 2, 3, 4, 5}); + */ + template + ArrayGPU ArrayGPU::get(const std::vector &idxs) const + { + return this->get(ArrayGPU(idxs)); + } + + /* + Returns elements at ArrayGPU && ArrayGPU indexes as array object. + Arguments: + * idx - idx of element + Ex: auto B = A.get(R, C); + */ + template + ArrayGPU ArrayGPU::get(const ArrayGPU &r, const ArrayGPU &c) const + { + int sz = std::max(r.rows(), r.cols()); + ArrayGPU res(sz); + + const int BLOCK_SIZE = (GPU_NUM_CUDA_CORE == 64) ? 64 : 128; + dim3 block(BLOCK_SIZE); + dim3 grid(std::min(GPU_NUM_SM * 2, np_ceil(sz, block.x))); + + kernelGetMat<<>>(this->mat, this->_cols, res.mat, r.mat, c.mat, sz); + cudaDeviceSynchronize(); + + return res; + } + + /* + Returns elements at std::vector && ArrayGPU indexes as array object. + Arguments: + * idx - idx of element + Ex: auto B = A.get({1, 2, 3, 4, 5}, C); + */ + template + ArrayGPU ArrayGPU::get(const std::vector &r, const ArrayGPU &c) const + { + return this->get(ArrayGPU(r), c); + } + + /* + Returns elements at ArrayGPU && std::vector indexes as array object. + Arguments: + * idx - idx of element + Ex: auto B = A.get(R, {1, 2, 3, 4, 5}); + */ + template + ArrayGPU ArrayGPU::get(const ArrayGPU &r, const std::vector &c) const + { + return this->get(r, ArrayGPU(c)); + } + + /* + Returns elements at std::vector<> && ArrayGPU indexes as array object. + Arguments: + * idx - idx of element + Ex: auto B = A.get({1, 2, 3, 4, 5}, {1, 2, 3, 4, 5, 6, 7, 8}); + */ + template + ArrayGPU ArrayGPU::get(const std::vector &r, const std::vector &c) const + { + return this->get(ArrayGPU(r), ArrayGPU(c)); + } + + /* + modifies element at idx. 
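To make the at()/get() distinction above concrete, a hedged sketch (assuming ArrayGPU<float>): at() hands back a 1x1 array that aliases the parent's device buffer and shares its reference count, while get() copies the value back to the host.

np::ArrayGPU<float> A(4, 4, 0.0f);
auto v = A.at(1, 2);         // 1x1 view into A's buffer at row 1, column 2
v = 7.0f;                    // operator=(TP) writes through the view, modifying A in place
float x = A.get(1, 2);       // device-to-host copy; x is now 7.0f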
+ Arguments: + * idx - idx of element + * operator + * Val - Scalar + Ex: A.set(0, NP_OP_ADD, 1); // adds one + */ + template + void ArrayGPU::set(const int idx, const Operation op, const TP operand){ + ArrayGPU idxs(1, 1, idx); + int sz = 1; + dim3 block(1); + dim3 grid(1); + + switch(op){ + case NP_OP_ADD: + kernelSetMat<<>>(this->mat, operand, idxs.mat, sz); + break; + case NP_OP_SUB: + kernelSetMat<<>>(this->mat, operand, idxs.mat, sz); + break; + case NP_OP_MUL: + kernelSetMat<<>>(this->mat, operand, idxs.mat, sz); + break; + case NP_OP_DIV: + kernelSetMat<<>>(this->mat, operand, idxs.mat, sz); + break; + case NP_OP_LESS_THAN: + kernelSetMat<<>>(this->mat, operand, idxs.mat, sz); + break; + case NP_OP_LESS_THAN_EQ: + kernelSetMat<<>>(this->mat, operand, idxs.mat, sz); + break; + case NP_OP_GREATER_THAN: + kernelSetMat<<>>(this->mat, operand, idxs.mat, sz); + break; + case NP_OP_GREATER_THAN_EQ: + kernelSetMat<<>>(this->mat, operand, idxs.mat, sz); + break; + case NP_OP_EQ_EQ : + kernelSetMat<<>>(this->mat, operand, idxs.mat, sz); + break; + case NP_OP_NOT_EQ : + kernelSetMat<<>>(this->mat, operand, idxs.mat, sz); + break; + case NP_OP_MINIMUM: + kernelSetMat<<>>(this->mat, operand, idxs.mat, sz); + break; + case NP_OP_MAXIMUM: + kernelSetMat<<>>(this->mat, operand, idxs.mat, sz); + break; + case NP_OP_EQ: + kernelSetMat<<>>(this->mat, operand, idxs.mat, sz); + break; + case NP_F_EXP: + kernelSetMat<<>>(this->mat, operand, idxs.mat, sz); + break; + case NP_F_LOG: + kernelSetMat<<>>(this->mat, operand, idxs.mat, sz); + break; + case NP_F_SQUARE: + kernelSetMat<<>>(this->mat, operand, idxs.mat, sz); + break; + case NP_F_SQRT: + kernelSetMat<<>>(this->mat, operand, idxs.mat, sz); + break; + case NP_F_POW: + kernelSetMat<<>>(this->mat, operand, idxs.mat, sz); + break; + default: + std::cerr<<"\nINVALID OPERAND PASSED IN SET."; + } + cudaDeviceSynchronize(); + } + + /* + modifies element at (r, c) + Arguments: + * r - r to access from + * c - c to access from + * operator + * Val - Scalar + Ex: A.set(5, 2, NP_OP_SUB, 2); + */ + template + void ArrayGPU::set(const int r, const int c, const Operation op, const TP operand){ + this->set(r * this->_cols + c, op, operand); + } + + /* + modifies element at a list of indexes + Arguments: + * idxs - indexes to modify - (ArrayGPU or vector) + * operator + * Val - Scalar + Ex: A.set(idxs, NP_OP_MUL, 2); + */ + template + void ArrayGPU::set(const ArrayGPU &idxs, const Operation op, const TP operand){ + int sz = std::max(idxs.cols(), idxs.rows()); + + const int BLOCK_SIZE = (GPU_NUM_CUDA_CORE == 64) ? 
64 : 128; + dim3 block(BLOCK_SIZE); + dim3 grid(std::min(GPU_NUM_SM * 2, np_ceil(sz, block.x))); + std::cout<<"\nBLOCK: "<<<>>(this->mat, operand, idxs.mat, sz); + break; + case NP_OP_SUB: + kernelSetMat<<>>(this->mat, operand, idxs.mat, sz); + break; + case NP_OP_MUL: + kernelSetMat<<>>(this->mat, operand, idxs.mat, sz); + break; + case NP_OP_DIV: + kernelSetMat<<>>(this->mat, operand, idxs.mat, sz); + break; + case NP_OP_LESS_THAN: + kernelSetMat<<>>(this->mat, operand, idxs.mat, sz); + break; + case NP_OP_LESS_THAN_EQ: + kernelSetMat<<>>(this->mat, operand, idxs.mat, sz); + break; + case NP_OP_GREATER_THAN: + kernelSetMat<<>>(this->mat, operand, idxs.mat, sz); + break; + case NP_OP_GREATER_THAN_EQ: + kernelSetMat<<>>(this->mat, operand, idxs.mat, sz); + break; + case NP_OP_EQ_EQ : + kernelSetMat<<>>(this->mat, operand, idxs.mat, sz); + break; + case NP_OP_NOT_EQ : + kernelSetMat<<>>(this->mat, operand, idxs.mat, sz); + break; + case NP_OP_MINIMUM: + kernelSetMat<<>>(this->mat, operand, idxs.mat, sz); + break; + case NP_OP_MAXIMUM: + kernelSetMat<<>>(this->mat, operand, idxs.mat, sz); + break; + case NP_OP_EQ: + kernelSetMat<<>>(this->mat, operand, idxs.mat, sz); + break; + case NP_F_EXP: + kernelSetMat<<>>(this->mat, operand, idxs.mat, sz); + break; + case NP_F_LOG: + kernelSetMat<<>>(this->mat, operand, idxs.mat, sz); + break; + case NP_F_SQUARE: + kernelSetMat<<>>(this->mat, operand, idxs.mat, sz); + break; + case NP_F_SQRT: + kernelSetMat<<>>(this->mat, operand, idxs.mat, sz); + break; + case NP_F_POW: + kernelSetMat<<>>(this->mat, operand, idxs.mat, sz); + break; + default: + std::cerr<<"\nINVALID OPERAND PASSED IN SET."; + } + cudaDeviceSynchronize(); + } + + /* + modifies element at a list of indexes + Arguments: + * idxs - indexes to modify - (ArrayGPU or vector) + * operator + * Val - Scalar + Ex: A.set({1, 2, 3}, NP_OP_MUL, 2); + */ + template + void ArrayGPU::set(const std::vector &idxs, const Operation op, const TP operand){ + this->set(ArrayGPU(idxs), op, operand); + } + + /* + modifies element at a list of indexes + Arguments: + * r_idxs - row indexes to modify - (ArrayGPU or vector) + * c_idxs - col indexes to modify - (ArrayGPU or vector) + * operator + * Val - Scalar + Ex: A.set(r_idxs, c_idxs, NP_OP_MUL, 2); + */ + template + void ArrayGPU::set(const ArrayGPU &r, const ArrayGPU &c, const Operation op, const TP operand){ + int sz = std::max(r.cols(), r.rows()); + + const int BLOCK_SIZE = (GPU_NUM_CUDA_CORE == 64) ? 
64 : 128; + dim3 block(BLOCK_SIZE); + dim3 grid(std::min(GPU_NUM_SM * 2, np_ceil(sz, block.x))); + + switch(op){ + case NP_OP_ADD: + kernelSetMat<<>>(this->mat, this->_cols, operand, r.mat, c.mat, sz); + break; + case NP_OP_SUB: + kernelSetMat<<>>(this->mat, this->_cols, operand, r.mat, c.mat, sz); + break; + case NP_OP_MUL: + kernelSetMat<<>>(this->mat, this->_cols, operand, r.mat, c.mat, sz); + break; + case NP_OP_DIV: + kernelSetMat<<>>(this->mat, this->_cols, operand, r.mat, c.mat, sz); + break; + case NP_OP_LESS_THAN: + kernelSetMat<<>>(this->mat, this->_cols, operand, r.mat, c.mat, sz); + break; + case NP_OP_LESS_THAN_EQ: + kernelSetMat<<>>(this->mat, this->_cols, operand, r.mat, c.mat, sz); + break; + case NP_OP_GREATER_THAN: + kernelSetMat<<>>(this->mat, this->_cols, operand, r.mat, c.mat, sz); + break; + case NP_OP_GREATER_THAN_EQ: + kernelSetMat<<>>(this->mat, this->_cols, operand, r.mat, c.mat, sz); + break; + case NP_OP_EQ_EQ : + kernelSetMat<<>>(this->mat, this->_cols, operand, r.mat, c.mat, sz); + break; + case NP_OP_NOT_EQ : + kernelSetMat<<>>(this->mat, this->_cols, operand, r.mat, c.mat, sz); + break; + case NP_OP_MINIMUM: + kernelSetMat<<>>(this->mat, this->_cols, operand, r.mat, c.mat, sz); + break; + case NP_OP_MAXIMUM: + kernelSetMat<<>>(this->mat, this->_cols, operand, r.mat, c.mat, sz); + break; + case NP_OP_EQ: + kernelSetMat<<>>(this->mat, this->_cols, operand, r.mat, c.mat, sz); + break; + case NP_F_EXP: + kernelSetMat<<>>(this->mat, this->_cols, operand, r.mat, c.mat, sz); + break; + case NP_F_LOG: + kernelSetMat<<>>(this->mat, this->_cols, operand, r.mat, c.mat, sz); + break; + case NP_F_SQUARE: + kernelSetMat<<>>(this->mat, this->_cols, operand, r.mat, c.mat, sz); + break; + case NP_F_SQRT: + kernelSetMat<<>>(this->mat, this->_cols, operand, r.mat, c.mat, sz); + break; + case NP_F_POW: + kernelSetMat<<>>(this->mat, this->_cols, operand, r.mat, c.mat, sz); + break; + default: + std::cerr<<"\nINVALID OPERAND PASSED IN SET."; + } + cudaDeviceSynchronize(); + } + + /* + modifies element at a ArrayGPU of indexes + Arguments: + * r_idxs - row indexes to modify - (ArrayGPU or vector) + * c_idxs - col indexes to modify - (ArrayGPU or vector) + * operator + * Val - Scalar + Ex: A.set(r_idxs, c_idxs, NP_OP_MUL, 2); + */ + template + void ArrayGPU::set(const ArrayGPU &r, const std::vector &c, Operation op, TP operand){ + this->set(r, ArrayGPU(c), op, operand); + } + + /* + modifies element at a ArrayGPU of indexes + Arguments: + * r_idxs - row indexes to modify + * c_idxs - col indexes to modify + * operator + * Val - Scalar + Ex: A.set(r_idxs, c_idxs, NP_OP_MUL, 2); + */ + template + void ArrayGPU::set(const std::vector &r, const ArrayGPU &c, Operation op, TP operand){ + this->set(ArrayGPU(r), c, op, operand); + } + + /* + modifies element at a ArrayGPU of indexes + Arguments: + * r_idxs - row indexes to modify + * c_idxs - col indexes to modify + * operator + * Val - Scalar + Ex: A.set(r_idxs, c_idxs, NP_OP_MUL, 2); + */ + template + void ArrayGPU::set(const std::vector &r, const std::vector &c, Operation op, TP operand){ + this->set(ArrayGPU(r), ArrayGPU(c), op, operand); + } + + /* + modifies element at a list of indexes + Arguments: + * idxs - indexes to modify - (ArrayGPU or vector) + * operator + * Val - ArrayGPU + Ex: A.set({1, 2, 3}, NP_OP_MUL, ar); + */ + template + void ArrayGPU::set(const ArrayGPU &idxs, Operation op, const ArrayGPU & operand){ + if(operand.size() == 1){ + TP* operand_ = operand.at(0).cpu(); + this->set(idxs, op, operand_[0]); + 
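// operand.at(0).cpu() malloc'd a one-element host copy of the scalar operand;
// now that it has been forwarded to the scalar set() overload, release that host buffer.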
free(operand_); + return; + } + int sz = std::max(idxs._cols, idxs._rows); + + const int BLOCK_SIZE = (GPU_NUM_CUDA_CORE == 64) ? 64 : 128; + dim3 block(BLOCK_SIZE); + dim3 grid(std::min(GPU_NUM_SM * 2, np_ceil(sz, block.x))); + + switch(op){ + case NP_OP_ADD: + kernelSetMat<<>>(this->mat, operand.mat, idxs, sz); + break; + case NP_OP_SUB: + kernelSetMat<<>>(this->mat, operand.mat, idxs, sz); + break; + case NP_OP_MUL: + kernelSetMat<<>>(this->mat, operand.mat, idxs, sz); + break; + case NP_OP_DIV: + kernelSetMat<<>>(this->mat, operand.mat, idxs, sz); + break; + case NP_OP_LESS_THAN: + kernelSetMat<<>>(this->mat, operand.mat, idxs, sz); + break; + case NP_OP_LESS_THAN_EQ: + kernelSetMat<<>>(this->mat, operand.mat, idxs, sz); + break; + case NP_OP_GREATER_THAN: + kernelSetMat<<>>(this->mat, operand.mat, idxs, sz); + break; + case NP_OP_GREATER_THAN_EQ: + kernelSetMat<<>>(this->mat, operand.mat, idxs, sz); + break; + case NP_OP_EQ_EQ : + kernelSetMat<<>>(this->mat, operand.mat, idxs, sz); + break; + case NP_OP_NOT_EQ : + kernelSetMat<<>>(this->mat, operand.mat, idxs, sz); + break; + case NP_OP_MINIMUM: + kernelSetMat<<>>(this->mat, operand.mat, idxs, sz); + break; + case NP_OP_MAXIMUM: + kernelSetMat<<>>(this->mat, operand.mat, idxs, sz); + break; + case NP_OP_EQ: + kernelSetMat<<>>(this->mat, operand.mat, idxs, sz); + break; + default: + std::cerr<<"\nINVALID OPERAND PASSED IN SET."; + } + cudaDeviceSynchronize(); + } + + /* + modifies element at a list of indexes + Arguments: + * idxs - indexes to modify - (ArrayGPU or vector) + * operator + * Val - ArrayGPU + Ex: A.set(idxs, NP_OP_MUL, ar); + */ + template + void ArrayGPU::set(const std::vector &idxs, Operation op, const ArrayGPU & operand){ + this->set(ArrayGPU(idxs), op, operand); + } + + /* + modifies element at a list of indexes + Arguments: + * r_idxs - row indexes to modify - (ArrayGPU or vector) + * c_idxs - col indexes to modify - (ArrayGPU or vector) + * operator + * Val - ArrayGPU + Ex: A.set(r_idxs, c_idxs, NP_OP_MUL, ar); + */ + template + void ArrayGPU::set(const ArrayGPU &r, const ArrayGPU &c, Operation op, const ArrayGPU &operand){ + if(operand.size() == 1){ + TP* operand_ = operand.at(0).cpu(); + this->set(r, c, op, operand_[0]); + free(operand_); + return; + } + int sz = std::max(r.cols(), r.rows()); + + const int BLOCK_SIZE = (GPU_NUM_CUDA_CORE == 64) ? 
64 : 128; + dim3 block(BLOCK_SIZE); + dim3 grid(std::min(GPU_NUM_SM * 2, np_ceil(sz, block.x))); + + std::cout<<"\nGRID: "<<<>>(this->mat, this->_cols, operand.mat, r.mat, c.mat, sz); + break; + case NP_OP_SUB: + kernelSetMat<<>>(this->mat, this->_cols, operand.mat, r.mat, c.mat, sz); + break; + case NP_OP_MUL: + kernelSetMat<<>>(this->mat, this->_cols, operand.mat, r.mat, c.mat, sz); + break; + case NP_OP_DIV: + kernelSetMat<<>>(this->mat, this->_cols, operand.mat, r.mat, c.mat, sz); + break; + case NP_OP_LESS_THAN: + kernelSetMat<<>>(this->mat, this->_cols, operand.mat, r.mat, c.mat, sz); + break; + case NP_OP_LESS_THAN_EQ: + kernelSetMat<<>>(this->mat, this->_cols, operand.mat, r.mat, c.mat, sz); + break; + case NP_OP_GREATER_THAN: + kernelSetMat<<>>(this->mat, this->_cols, operand.mat, r.mat, c.mat, sz); + break; + case NP_OP_GREATER_THAN_EQ: + kernelSetMat<<>>(this->mat, this->_cols, operand.mat, r.mat, c.mat, sz); + break; + case NP_OP_EQ_EQ : + kernelSetMat<<>>(this->mat, this->_cols, operand.mat, r.mat, c.mat, sz); + break; + case NP_OP_NOT_EQ : + kernelSetMat<<>>(this->mat, this->_cols, operand.mat, r.mat, c.mat, sz); + break; + case NP_OP_MINIMUM: + kernelSetMat<<>>(this->mat, this->_cols, operand.mat, r.mat, c.mat, sz); + break; + case NP_OP_MAXIMUM: + kernelSetMat<<>>(this->mat, this->_cols, operand.mat, r.mat, c.mat, sz); + break; + case NP_OP_EQ: + kernelSetMat<<>>(this->mat, this->_cols, operand.mat, r.mat, c.mat, sz); + break; + default: + std::cerr<<"\nINVALID OPERAND PASSED IN SET."; + } + cudaDeviceSynchronize(); + } + + /* + modifies element at a ArrayGPU of indexes + Arguments: + * r_idxs - row indexes to modify - (ArrayGPU or vector) + * c_idxs - col indexes to modify - (ArrayGPU or vector) + * operator + * Val - ArrayGPU + Ex: A.set(r_idxs, c_idxs, NP_OP_MUL, ar); + */ + template + void ArrayGPU::set(const ArrayGPU &r, const std::vector &c, Operation op, const ArrayGPU & operand){ + this->set(r, ArrayGPU(c), op, operand); + } + + /* + modifies element at a ArrayGPU of indexes + Arguments: + * r_idxs - row indexes to modify + * c_idxs - col indexes to modify + * operator + * Val - Scalar + Ex: A.set(r_idxs, c_idxs, NP_OP_MUL, ar); + */ + template + void ArrayGPU::set(const std::vector &r, const ArrayGPU &c, Operation op, const ArrayGPU & operand){ + this->set(ArrayGPU(r), c, op, operand); + } + + /* + modifies element at a ArrayGPU of indexes + Arguments: + * r_idxs - row indexes to modify + * c_idxs - col indexes to modify + * operator + * Val - Scalar + Ex: A.set(r_idxs, c_idxs, NP_OP_MUL, ar); + */ + template + void ArrayGPU::set(const std::vector &r, const std::vector &c, Operation op, const ArrayGPU & operand){ + this->set(ArrayGPU(r), ArrayGPU(c), op, operand); + } + + + + // ####################### DOT PRODUCT ############################ + /* + Returns dot product of two arrays + Arguments: + * B - second array + Ex: auto C = A.dot(B); + */ + template + ArrayGPU ArrayGPU::dot(const ArrayGPU &B) const + { + // condition for dot product + if (this->_cols == B._rows) + { + ArrayGPU res(this->_rows, B._cols); + + const float alpha = 1.0f; + const float beta = 0.0f; + + // C = A . B k lie. + cublasSgemm(cbls_handle, // + CUBLAS_OP_N, CUBLAS_OP_N, + B._cols, this->_rows, this->_cols, // B _cols, A _rows, A _cols + &alpha, + B.mat, B._cols, // B, B _cols + this->mat, this->_cols, // A, A _cols + &beta, + res.mat, B._cols); // C, B _cols + + return res; + } + else + { + std::cerr << "\nError in dot! 
Check arguments"; + return np::ArrayGPU(1, 1, 0); + } + } + /* + Returns dot product of two arrays. First is transposed + Arguments: + * B - second array + Ex: auto C = A.Tdot(B); + Is same as - auto C = A.T().dot(B) + */ + template + ArrayGPU ArrayGPU::Tdot(const ArrayGPU &B) const + { + if (this->_rows == B._rows) + { + ArrayGPU res(this->_cols, B._cols); + + const float alpha = 1.0f; + const float beta = 0.0f; + + // C = AT . B + cublasSgemm(cbls_handle, // + CUBLAS_OP_N, CUBLAS_OP_T, + B._cols, this->_cols, this->_rows, // B _cols, A _cols, A _rows + &alpha, + B.mat, B._cols, // B, B _cols + this->mat, this->_cols, // A, A _cols + &beta, + res.mat, B._cols); // C, B _cols + + return res; + } + else + { + std::cerr << "\nError in Tdot! Check arguments"; + return np::ArrayGPU(1, 1, 0); + } + } + /* + Returns dot product of two arrays. Second is transposed + Arguments: + * B - second array + Ex: auto C = A.dotT(B); + Is same as - auto C = A.dot(B.T()) + */ + template + ArrayGPU ArrayGPU::dotT(const ArrayGPU &B) const + { + if (this->_cols == B._cols) + { + ArrayGPU res(this->_rows, B._rows); + + const float alpha = 1.0f; + const float beta = 0.0f; + + cublasSgemm(cbls_handle, // + CUBLAS_OP_T, CUBLAS_OP_N, + B._rows, this->_rows, this->_cols, // B _cols, A _rows, A _cols + &alpha, + B.mat, B._cols, // B, B _cols + this->mat, this->_cols, // A, A _cols + &beta, + res.mat, B._rows); // C, B _cols + + return res; + } + else + { + std::cerr << "\nError in dotT ! Check arguments"; + return np::ArrayGPU(1, 1, 0); + } + } + + template + template + ArrayGPU ArrayGPU::applyOp(const ArrayGPU &B) const + { + if (this->_rows == 1 && this->_cols == 1) + { + // A is scalar + ArrayGPU res(B._rows, B._cols); + + const int BLOCK_SIZE = GPU_NUM_CUDA_CORE; + dim3 block(BLOCK_SIZE); + dim3 grid(std::min(GPU_NUM_SM * 2, np_ceil(res.size(), block.x))); + + kernelScalarOpMat<<>>(this->mat, B.mat, res.mat, res.size()); + cudaDeviceSynchronize(); + return res; + } + else if (B._rows == 1 && B._cols == 1) + { + // B is scalar + ArrayGPU res(this->_rows, this->_cols); + + const int BLOCK_SIZE = GPU_NUM_CUDA_CORE; + dim3 block(BLOCK_SIZE); + dim3 grid(std::min(GPU_NUM_SM * 2, np_ceil(res.size(), block.x))); + + kernelMatOpScalar<<>>(this->mat, B.mat, res.mat, res.size()); + cudaDeviceSynchronize(); + return res; + } + // if A is vector + // A vector ki dim, is equal to either col or row of B + // row vector. will extend along _cols if possible. (prioritising in case of square matrix) + // vice versa for _cols + + else if ((this->_cols == 1 && this->_rows == B._rows) || (this->_rows == 1 && this->_cols == B._rows)) + { + // along _rows add kr + ArrayGPU res(B._rows, B._cols); + const int BLOCK_SIZE = GPU_NUM_CUDA_CORE; + dim3 block(BLOCK_SIZE); + dim3 grid(std::min(GPU_NUM_SM * 2, np_ceil(res.size(), block.x))); + kernelVecOpMatAlongCols<<>>(this->mat, B.mat, res.mat, res.size(), B._cols); + cudaDeviceSynchronize(); + + return res; + } + else if ((this->_cols == 1 && this->_rows == B._cols) || (this->_rows == 1 && this->_cols == B._cols)) + { + // along _cols add kr + ArrayGPU res(B._rows, B._cols); + const int BLOCK_SIZE = GPU_NUM_CUDA_CORE; + dim3 block(BLOCK_SIZE); + dim3 grid(std::min(GPU_NUM_SM * 2, np_ceil(res.size(), block.x))); + kernelVecOpMatAlongRows<<>>(this->mat, B.mat, res.mat, res.size(), B._cols); + cudaDeviceSynchronize(); + + return res; + } + // B is vetor + // B vector ki dim, is eq to either col or row of B + // row vector. will extend along _cols if possible. 
(prioritising in case of square matrix) + else if ((B._cols == 1 && this->_rows == B._rows) || (B._rows == 1 && this->_rows == B._cols)) + { + // along _rows add kr + ArrayGPU res(this->_rows, this->_cols); + const int BLOCK_SIZE = GPU_NUM_CUDA_CORE; + dim3 block(BLOCK_SIZE); + dim3 grid(std::min(GPU_NUM_SM * 2, np_ceil(res.size(), block.x))); + kernelMatOpVecAlongCols<<>>(this->mat, B.mat, res.mat, res.size(), this->_cols); + cudaDeviceSynchronize(); + + return res; + } + else if ((B._cols == 1 && this->_cols == B._rows) || (B._rows == 1 && this->_cols == B._cols)) + { + // along _cols add kr + ArrayGPU res(this->_rows, this->_cols); + const int BLOCK_SIZE = GPU_NUM_CUDA_CORE; + dim3 block(BLOCK_SIZE); + dim3 grid(std::min(GPU_NUM_SM * 2, np_ceil(res.size(), block.x))); + kernelMatOpVecAlongRows<<>>(this->mat, B.mat, res.mat, res.size(), this->_cols); + cudaDeviceSynchronize(); + + return res; + } + else if (this->_rows == B._rows && this->_cols == B._cols) + { + // A and B both are matrices of same dimensions + ArrayGPU res(this->_rows, this->_cols); + const int BLOCK_SIZE = GPU_NUM_CUDA_CORE; + dim3 block(BLOCK_SIZE); + dim3 grid(std::min(GPU_NUM_SM * 2, np_ceil(res.size(), block.x))); + kernelMatOpMat<<>>(this->mat, B.mat, res.mat, res.size()); + cudaDeviceSynchronize(); + return res; + } + else + { + std::cerr << "\nError in applyOP! Check arguments"; + return np::ArrayGPU(1, 1, 0); + } + } + + template + template + ArrayGPU ArrayGPU::applyOp(const TP Scalar) const + { + ArrayGPU res(this->_rows, this->_cols); + const int BLOCK_SIZE = GPU_NUM_CUDA_CORE; + dim3 block(BLOCK_SIZE); + dim3 grid(std::min(GPU_NUM_SM * 2, np_ceil(res.size(), block.x))); + kernelMatOpScalar<<>>(this->mat, Scalar, res.mat, res.size()); + cudaDeviceSynchronize(); + return res; + } + + // add functions + template + ArrayGPU ArrayGPU::operator+(const ArrayGPU &B) const + { + return this->applyOp(B); + } + + template + ArrayGPU ArrayGPU::operator+(const TP Scalar) const + { + return this->applyOp(Scalar); + } + + template + ArrayGPU operator+(const TP Scal, const ArrayGPU &B) + { + // A is scalar + ArrayGPU res(B._rows, B._cols); + + const int BLOCK_SIZE = GPU_NUM_CUDA_CORE; + dim3 block(BLOCK_SIZE); + dim3 grid(std::min(GPU_NUM_SM * 2, np_ceil(res.size(), block.x))); + + kernelScalarOpMat<<>>(Scal, B.mat, res.mat, res.size()); + cudaDeviceSynchronize(); + return res; + } + + // subtraction + template + ArrayGPU ArrayGPU::operator-(const ArrayGPU &B) const + { + return this->applyOp(B); + } + + template + ArrayGPU ArrayGPU::operator-(const TP Scalar) const + { + return this->applyOp(Scalar); + } + + template + ArrayGPU operator-(const TP Scal, const ArrayGPU &B) + { + // A is scalar + ArrayGPU res(B._rows, B._cols); + + const int BLOCK_SIZE = GPU_NUM_CUDA_CORE; + dim3 block(BLOCK_SIZE); + dim3 grid(std::min(GPU_NUM_SM * 2, np_ceil(res.size(), block.x))); + + kernelScalarOpMat<<>>(Scal, B.mat, res.mat, res.size()); + cudaDeviceSynchronize(); + return res; + } + + // multiplication + template + ArrayGPU ArrayGPU::operator*(const ArrayGPU &B) const + { + return this->applyOp(B); + } + + template + ArrayGPU ArrayGPU::operator*(const TP Scalar) const + { + return this->applyOp(Scalar); + } + + template + ArrayGPU operator*(const TP Scal, const ArrayGPU &B) + { + // A is scalar + ArrayGPU res(B._rows, B._cols); + + const int BLOCK_SIZE = GPU_NUM_CUDA_CORE; + dim3 block(BLOCK_SIZE); + dim3 grid(std::min(GPU_NUM_SM * 2, np_ceil(res.size(), block.x))); + + kernelScalarOpMat<<>>(Scal, B.mat, res.mat, res.size()); + 
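// The kernel launch above is asynchronous; synchronizing here ensures res.mat holds the
// finished element-wise result before it is returned to the caller.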
cudaDeviceSynchronize(); + return res; + } + + // division + template + ArrayGPU ArrayGPU::operator/(const ArrayGPU &B) const + { + return this->applyOp(B); + } + + template + ArrayGPU ArrayGPU::operator/(const TP Scalar) const + { + return this->applyOp(Scalar); + } + + template + ArrayGPU operator/(const TP Scal, const ArrayGPU &B) + { + // A is scalar + ArrayGPU res(B._rows, B._cols); + + const int BLOCK_SIZE = GPU_NUM_CUDA_CORE; + dim3 block(BLOCK_SIZE); + dim3 grid(std::min(GPU_NUM_SM * 2, np_ceil(res.size(), block.x))); + + kernelScalarOpMat<<>>(Scal, B.mat, res.mat, res.size()); + cudaDeviceSynchronize(); + return res; + } + + // unary negation operator + template + ArrayGPU ArrayGPU::operator-() const + { + ArrayGPU res(this->_rows, this->_cols); + const int BLOCK_SIZE = GPU_NUM_CUDA_CORE; + dim3 block(BLOCK_SIZE); + dim3 grid(std::min(GPU_NUM_SM * 2, np_ceil(res.size(), block.x))); + kernelMatOpScalar<<>>(this->mat, -1, res.mat, res.size()); + cudaDeviceSynchronize(); + return res; + } + + // returns an array of 0s and 1s depending on true or false of the conditions. + // element wise comparison + + // < + template + ArrayGPU ArrayGPU::operator<(const ArrayGPU &B) const + { + return this->applyOp(B); + } + + template + ArrayGPU ArrayGPU::operator<(const TP Scalar) const + { + return this->applyOp(Scalar); + } + + template + ArrayGPU operator<(const TP Scal, const ArrayGPU &B) + { + // A is scalar + ArrayGPU res(B._rows, B._cols); + + const int BLOCK_SIZE = GPU_NUM_CUDA_CORE; + dim3 block(BLOCK_SIZE); + dim3 grid(std::min(GPU_NUM_SM * 2, np_ceil(res.size(), block.x))); + + kernelScalarOpMat<<>>(Scal, B.mat, res.mat, res.size()); + cudaDeviceSynchronize(); + return res; + } + + // <= + template + ArrayGPU ArrayGPU::operator<=(const ArrayGPU &B) const + { + return this->applyOp(B); + } + + template + ArrayGPU ArrayGPU::operator<=(const TP Scalar) const + { + return this->applyOp(Scalar); + } + + template + ArrayGPU operator<=(const TP Scal, const ArrayGPU &B) + { + // A is scalar + ArrayGPU res(B._rows, B._cols); + + const int BLOCK_SIZE = GPU_NUM_CUDA_CORE; + dim3 block(BLOCK_SIZE); + dim3 grid(std::min(GPU_NUM_SM * 2, np_ceil(res.size(), block.x))); + + kernelScalarOpMat<<>>(Scal, B.mat, res.mat, res.size()); + cudaDeviceSynchronize(); + return res; + } + + // > + template + ArrayGPU ArrayGPU::operator>(const ArrayGPU &B) const + { + return this->applyOp(B); + } + + template + ArrayGPU ArrayGPU::operator>(const TP Scalar) const + { + return this->applyOp(Scalar); + } + + template + ArrayGPU operator>(const TP Scal, const ArrayGPU &B) + { + // A is scalar + ArrayGPU res(B._rows, B._cols); + + const int BLOCK_SIZE = GPU_NUM_CUDA_CORE; + dim3 block(BLOCK_SIZE); + dim3 grid(std::min(GPU_NUM_SM * 2, np_ceil(res.size(), block.x))); + + kernelScalarOpMat<<>>(Scal, B.mat, res.mat, res.size()); + cudaDeviceSynchronize(); + return res; + } + + // >= + template + ArrayGPU ArrayGPU::operator>=(const ArrayGPU &B) const + { + return this->applyOp(B); + } + + template + ArrayGPU ArrayGPU::operator>=(const TP Scalar) const + { + return this->applyOp(Scalar); + } + + template + ArrayGPU operator>=(const TP Scal, const ArrayGPU &B) + { + // A is scalar + ArrayGPU res(B._rows, B._cols); + + const int BLOCK_SIZE = GPU_NUM_CUDA_CORE; + dim3 block(BLOCK_SIZE); + dim3 grid(std::min(GPU_NUM_SM * 2, np_ceil(res.size(), block.x))); + + kernelScalarOpMat<<>>(Scal, B.mat, res.mat, res.size()); + cudaDeviceSynchronize(); + return res; + } + + // == + template + ArrayGPU ArrayGPU::operator==(const ArrayGPU &B) 
const + { + return this->applyOp(B); + } + + template + ArrayGPU ArrayGPU::operator==(const TP Scalar) const + { + return this->applyOp(Scalar); + } + + template + ArrayGPU operator==(const TP Scal, const ArrayGPU &B) + { + // A is scalar + ArrayGPU res(B._rows, B._cols); + + const int BLOCK_SIZE = GPU_NUM_CUDA_CORE; + dim3 block(BLOCK_SIZE); + dim3 grid(std::min(GPU_NUM_SM * 2, np_ceil(res.size(), block.x))); + + kernelScalarOpMat<<>>(Scal, B.mat, res.mat, res.size()); + cudaDeviceSynchronize(); + return res; + } + + // != + template + ArrayGPU ArrayGPU::operator!=(const ArrayGPU &B) const + { + return this->applyOp(B); + } + + template + ArrayGPU ArrayGPU::operator!=(const TP Scalar) const + { + return this->applyOp(Scalar); + } + + template + ArrayGPU operator!=(const TP Scal, const ArrayGPU &B) + { + // A is scalar + ArrayGPU res(B._rows, B._cols); + + const int BLOCK_SIZE = GPU_NUM_CUDA_CORE; + dim3 block(BLOCK_SIZE); + dim3 grid(std::min(GPU_NUM_SM * 2, np_ceil(res.size(), block.x))); + + kernelScalarOpMat<<>>(Scal, B.mat, res.mat, res.size()); + cudaDeviceSynchronize(); + return res; + } + + template + template + ArrayGPU ArrayGPU::applyReductionF(const int axis) const + { + if (axis == -1) + { + // return total sum + const int BLOCK_SIZE = ((GPU_NUM_CUDA_CORE == 64) ? 64 : 128) * 2; + dim3 block(BLOCK_SIZE); + dim3 grid(std::min(np_ceil(this->size(), block.x), GPU_NUM_SM * 2)); + + ArrayGPU res(1); + + // device pointer tmp + TP *tmp_d; + CUDA_CALL(cudaMalloc((void **)&tmp_d, sizeof(TP) * grid.x)); + switch (GPU_NUM_CUDA_CORE) + { + case 64: + kernelReduceF<<>>(this->mat, tmp_d, this->size()); + cudaDeviceSynchronize(); + // please guarantee that BLOCK_SIZE > grid.x. otherwise multiple kernel calls will have to be made. + kernelReduceF<<<1, block>>>(tmp_d, res.mat, grid.x); + cudaDeviceSynchronize(); + break; + default: + kernelReduceF<<>>(this->mat, tmp_d, this->size()); + cudaDeviceSynchronize(); + // please guarantee that BLOCK_SIZE > grid.x. otherwise multiple kernel calls will have to be made. + kernelReduceF<<<1, block>>>(tmp_d, res.mat, grid.x); + cudaDeviceSynchronize(); + break; + } + CUDA_CALL(cudaFree(tmp_d)); + return res; + } + else if (axis == 0) + { + // sum along columns. dimension=numCols + auto ans = (this->T()).applyReductionF(1); + ans.reshape(1, -1); + return ans; + } + else if (axis == 1) + { + // reduction along _rows. output dim = numRows + ArrayGPU res(this->_rows, 1); + + const int BLOCK_SIZE = ((GPU_NUM_CUDA_CORE == 64) ? 64 : 128) * 2; + dim3 block(BLOCK_SIZE); + dim3 grid(std::min(np_ceil(this->_cols, block.x), GPU_NUM_SM * 2)); + TP *tmp_d; + CUDA_CALL(cudaMalloc((void **)&tmp_d, sizeof(TP) * this->_rows * grid.x)); + + switch (GPU_NUM_CUDA_CORE) + { + case 64: + kernelReduceFAxis1<<>>(this->mat, tmp_d, this->_cols, this->_rows); + cudaDeviceSynchronize(); + + kernelReduceFAxis1<<<1, block>>>(tmp_d, res.mat, grid.x, this->_rows); + cudaDeviceSynchronize(); + + break; + default: + kernelReduceFAxis1<<>>(this->mat, tmp_d, this->_cols, this->_rows); + cudaDeviceSynchronize(); + + kernelReduceFAxis1<<<1, block>>>(tmp_d, res.mat, grid.x, this->_rows); + cudaDeviceSynchronize(); + } + + cudaFree(tmp_d); + return res; + } + else + { + std::cerr << "\nError in applyReductionF! Check arguments"; + return np::ArrayGPU(1, 1, 0); + } + } + + // sum. along axis or total + template + ArrayGPU ArrayGPU::sum(const int axis) const + { + return this->applyReductionF(axis); + } + + // min. 
along axis or total + template + ArrayGPU ArrayGPU::min(const int axis) const + { + return this->applyReductionF(axis); + } + + // max. along axis or total + template + ArrayGPU ArrayGPU::max(const int axis) const + { + return this->applyReductionF(axis); + } + + template + template + ArrayGPU ArrayGPU::applyReductionArgF(const int axis) const + { + if (axis == -1) + { + // return total sum + const int BLOCK_SIZE = ((GPU_NUM_CUDA_CORE == 64) ? 64 : 128) * 2; + dim3 block(BLOCK_SIZE); + dim3 grid(std::min(np_ceil(this->size(), block.x), GPU_NUM_SM * 2)); + + ArrayGPU res(1); + ArrayGPU resIdx(1); + // device pointer tmp + TP *tmp_A_d; + CUDA_CALL(cudaMalloc((void **)&tmp_A_d, sizeof(TP) * grid.x)); + int *tmp_A_Idx_d; + CUDA_CALL(cudaMalloc((void **)&tmp_A_Idx_d, sizeof(int) * grid.x)); + + switch (GPU_NUM_CUDA_CORE) + { + case 64: + kernelReduceArgF<<>>(this->mat, tmp_A_d, tmp_A_Idx_d, this->size()); + cudaDeviceSynchronize(); + // please guarantee that BLOCK_SIZE > grid.x. otherwise multiple kernel calls will have to be made. + if (grid.x == 1) + { + resIdx.mat = tmp_A_Idx_d; + return resIdx; + } + kernelReduceArgF<<<1, block>>>(tmp_A_d, tmp_A_Idx_d, res.mat, resIdx.mat, grid.x); + cudaDeviceSynchronize(); + break; + default: + kernelReduceArgF<<>>(this->mat, tmp_A_d, tmp_A_Idx_d, this->size()); + cudaDeviceSynchronize(); + // please guarantee that BLOCK_SIZE > grid.x. otherwise multiple kernel calls will have to be made. + kernelReduceArgF<<<1, block>>>(tmp_A_d, tmp_A_Idx_d, res.mat, resIdx.mat, grid.x); + cudaDeviceSynchronize(); + } + + CUDA_CALL(cudaFree(tmp_A_d)); + CUDA_CALL(cudaFree(tmp_A_Idx_d)); + + return resIdx; + } + else if (axis == 0) + { + // sum along columns. dimension=numCols + auto ans = this->T().applyReductionArgF(1); + ans.reshape(1, -1); + return ans; + } + else if (axis == 1) + { + // sum along _rows. output dim = numRows + ArrayGPU res(this->_rows, 1); + ArrayGPU resIdx(this->_rows, 1); + + const int BLOCK_SIZE = ((GPU_NUM_CUDA_CORE == 128) ? 128 : 64) * 2; + dim3 block(BLOCK_SIZE); + dim3 grid(std::min(GPU_NUM_SM * 2, np_ceil(this->_cols, block.x))); + + TP *tmp_A_d; + CUDA_CALL(cudaMalloc((void **)&tmp_A_d, sizeof(TP) * this->_rows * grid.x)); + int *tmp_A_Idx_d; + CUDA_CALL(cudaMalloc((void **)&tmp_A_Idx_d, sizeof(int) * this->_rows * grid.x)); + + switch (GPU_NUM_CUDA_CORE) + { + case 64: + kernelReduceArgFAxis1<<>>(this->mat, tmp_A_d, tmp_A_Idx_d, this->_cols, this->_rows); + cudaDeviceSynchronize(); + + kernelReduceArgFAxis1<<<1, block>>>(tmp_A_d, tmp_A_Idx_d, res.mat, resIdx.mat, grid.x, this->_rows); + cudaDeviceSynchronize(); + break; + default: + kernelReduceArgFAxis1<<>>(this->mat, tmp_A_d, tmp_A_Idx_d, this->_cols, this->_rows); + cudaDeviceSynchronize(); + + kernelReduceArgFAxis1<<<1, block>>>(tmp_A_d, tmp_A_Idx_d, res.mat, resIdx.mat, grid.x, this->_rows); + cudaDeviceSynchronize(); + } + + + CUDA_CALL(cudaFree(tmp_A_d)); + CUDA_CALL(cudaFree(tmp_A_Idx_d)); + + return resIdx; + } + else + { + std::cerr << "\nError in applyReductionArgF! 
Check arguments"; + return np::ArrayGPU(1, 1, 0); + } + } + + // argmin + // min along axis or total + template + ArrayGPU ArrayGPU::argmin(const int axis) const + { + return this->applyReductionArgF(axis); + } + + // argmax + template + ArrayGPU ArrayGPU::argmax(const int axis) const + { + return this->applyReductionArgF(axis); + } + + template + ArrayGPU::~ArrayGPU() + { + --(*this->ref_count); + if (*this->ref_count == 0) + { + CUDA_CALL(cudaFree(this->mat)); + free(ref_count); + } + } +} + +#endif \ No newline at end of file diff --git a/lightnumpy/_gpu_core/include/numC++/npRandom.cuh b/lightnumpy/_gpu_core/include/numC++/npRandom.cuh new file mode 100644 index 0000000..86e73a0 --- /dev/null +++ b/lightnumpy/_gpu_core/include/numC++/npRandom.cuh @@ -0,0 +1,56 @@ +#ifndef NPRANDOM_CUH +#define NPRANDOM_CUH + +#include +#include +#include +#include +#include + +#include + +#include + +namespace np +{ + class Random + { + public: + // from uniform distribution + template + static ArrayGPU rand(const unsigned int rows = 1, const unsigned int cols = 1, const unsigned int lo = 0, const unsigned int hi = 1, const unsigned long long seed = static_cast(time(NULL))); + + // from normal distribution + template + static ArrayGPU randn(const unsigned int rows = 1, const unsigned int cols = 1, const unsigned long long seed = static_cast(time(NULL))); + }; + + template + ArrayGPU Random::rand(const unsigned int rows, const unsigned int cols, const unsigned int lo, const unsigned int hi, const unsigned long long seed) + { + ArrayGPU ar(rows, cols); + + const int BLOCK_SIZE = (GPU_NUM_CUDA_CORE == 64)?64:128; + dim3 block(BLOCK_SIZE); + dim3 grid(np_ceil( ar.size(), (block.x * 50))); + kernelInitializeRandomUnif<<>>(ar.mat, rows * cols, lo, hi, seed); + cudaDeviceSynchronize(); + + return ar; + } + + // from normal distribution + template + ArrayGPU Random::randn(const unsigned int rows, const unsigned int cols, const unsigned long long seed) + { + ArrayGPU ar(rows, cols); + const int BLOCK_SIZE = (GPU_NUM_CUDA_CORE == 64)?64:128; + dim3 block(BLOCK_SIZE); + dim3 grid(np_ceil(ar.size(), (block.x * 50))); + kernelInitializeRandomNorm<<>>(ar.mat, ar.size(), seed); + cudaDeviceSynchronize(); + return ar; + } +} + +#endif \ No newline at end of file diff --git a/lightnumpy/_gpu_core/include/numC++/utils.cuh b/lightnumpy/_gpu_core/include/numC++/utils.cuh new file mode 100644 index 0000000..41c777b --- /dev/null +++ b/lightnumpy/_gpu_core/include/numC++/utils.cuh @@ -0,0 +1,65 @@ +#ifndef ERRCHECKUTILS_CUH +#define ERRCHECKUTILS_CUH + +// cuda includes +#include +#include + +// std includes +#include + +// cuda error checking macro +#define CUDA_CALL(x) \ + do \ + { \ + if ((x) != cudaSuccess) \ + { \ + cudaError_t err = (x); \ + printf("Error at %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ + } \ + } while (0) + +// curand error checking macro +#define CURAND_CALL(x) \ + do \ + { \ + if ((x) != CURAND_STATUS_SUCCESS) \ + { \ + printf("Error at %s:%d\n", __FILE__, __LINE__); \ + } \ + } while (0) + +// utility function to compute +#define np_ceil(x, y) ((x + y - 1) / y) + +namespace np{ + enum Operation { + NP_OP_ADD, + NP_OP_SUB, + NP_OP_MUL, + NP_OP_DIV, + NP_OP_LESS_THAN, + NP_OP_LESS_THAN_EQ, + NP_OP_GREATER_THAN, + NP_OP_GREATER_THAN_EQ, + NP_OP_EQ_EQ, + NP_OP_NOT_EQ, + NP_OP_MINIMUM, + NP_OP_MAXIMUM, + NP_OP_EQ, + + NP_REDUCE_SUM, + NP_REDUCE_MIN, + NP_REDUCE_MAX, + NP_REDUCE_ARGMIN, + NP_REDUCE_ARGMAX, + + NP_F_EXP, + NP_F_LOG, + NP_F_SQUARE, + NP_F_SQRT, + NP_F_POW + }; +} + +#endif 
\ No newline at end of file diff --git a/lightnumpy/_gpu_core/meson.build b/lightnumpy/_gpu_core/meson.build new file mode 100644 index 0000000..d368015 --- /dev/null +++ b/lightnumpy/_gpu_core/meson.build @@ -0,0 +1,214 @@ +# example here is inspired by Nvidia's blog post: +# https://developer.nvidia.com/blog/separate-compilation-linking-cuda-device-code/ +# code: +# https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#examples + +# # test that optional initialization of cuda works to disable thin archives +# add_languages('cuda') + +# # https://github.com/mesonbuild/meson/issues/13240 +# # nvcc = meson.get_compiler('cuda').find_library('doesnotexist', required : false) +# # assert(not nvcc.found()) +# nvcc = meson.get_compiler('cuda') +# cuda = import('unstable-cuda') + +# arch_flags = cuda.nvcc_arch_flags(nvcc.version(), 'Common') + +# message('NVCC version: ' + nvcc.version()) +# message('NVCC flags: ' + ' '.join(arch_flags)) + +# # test device linking with -dc (which is equivalent to `--relocatable-device-code true`) +# _st_lib_nvcc = static_library('_st_lib_nvcc', ['src/b.cu'], +# cuda_args : ['-dc'] + arch_flags, +# ) + + +# arch_flags = cuda.nvcc_arch_flags(nvcc.version(), 'Auto', detected: ['6.0']) +# arch_readable = cuda.nvcc_arch_readable(nvcc.version(), 'Auto', detected: ['6.0']) +# driver_version = cuda.min_driver_version(nvcc.version()) + +# message('NVCC version: ' + nvcc.version()) +# message('NVCC flags: ' + ' '.join(arch_flags)) +# message('NVCC readable: ' + ' '.join(arch_readable)) +# message('Driver version: >=' + driver_version) + +###################################################################### +## Find CUDA and CuBLAS Dependencies +## https://github.com/mesonbuild/meson/tree/master/test%20cases/cuda +## pkg-config --list-all | grep cuda +###################################################################### + +# if host_machine.system() == 'linux' +# # Linux: Check for CUDA or ROCm +# # dep_cuda = dependency('cuda', modules: ['cublas']) +# # dep_cuda = dependencies: dependency('cuda', modules: ['cublas']), link_language: 'cpp' +# # dep_cuda = dependency('cuda', version: ['>=10.1'], required: false, disabler: true) +# # dep_cuda = dependency('cuda', required: false) # Try to find CUDA +# # dep_rocm = dependency('rocm-opencl', required: false) # Try to find ROCm OpenCL +# cuda_dep = dependency('cuda', required: false) +# rocm_dep = dependency('rocm-opencl', required: false) +# # Check for dependencies +# if cuda_dep.found() +# gpu_library = declare_dependency( +# include_directories: include_directories('/usr/local/cuda/include'), +# link_with: [cuda_dep] +# ) +# message('Using CUDA on Linux') +# elif rocm_dep.found() +# gpu_library = declare_dependency( +# include_directories: include_directories('/opt/rocm/include'), +# link_with: [rocm_dep] +# ) +# message('Using ROCm on Linux') +# else +# error('No GPU library found on Linux!') +# endif + +# elif host_machine.system() == 'darwin' +# # macOS: Use Metal or CUDA (if on older macOS) +# if meson.version().version_compare('10.14') +# metal_dep = dependency('metal', required: true) +# gpu_library = declare_dependency( +# include_directories: include_directories('/System/Library/Frameworks/Metal.framework/Headers'), +# link_args: ['-framework', 'Metal'] # Link Metal framework +# ) +# message('Using Metal on macOS') +# else +# cuda_dep = dependency('cuda', required: true) +# gpu_library = declare_dependency( +# include_directories: include_directories('/usr/local/cuda/include'), +# link_with: [cuda_dep] 
+# ) +# message('Using CUDA on macOS') +# endif + +# elif host_machine.system() == 'windows' +# # Windows: Use CUDA only +# cuda_dep = dependency('cuda', required: true) +# gpu_library = declare_dependency( +# include_directories: include_directories('C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.8/include'), +# link_with: [cuda_dep] +# ) +# message('Using CUDA on Windows') +# endif + + +###################################################################### +## Define dependencies and Compiler Flags for gpu numC++ +###################################################################### + +# Define args and dependencies for NumCpp +_global_gpu_numcpp_args = [] +_dep_list_gpu_numcpp = [] + +if get_option('lnpy_use_gpu_numcpp').enabled() + # Initialize an empty list for compiler flags + _global_gpu_numcpp_args += [ + # The -D flag defines a macro that is globally available during compilation. + # The -U flag removes a macro definition during compilation, effectively ignoring it. + '-DLNPY_USE_GPU_NUMCPP', # Enable all GPU NumCpp features + ] +else + _global_gpu_numcpp_args += [ + '-U LNPY_USE_GPU_NUMCPP', # Disable all NumCpp features + ] +endif + +cython_cpp_flags += _global_gpu_numcpp_args + +###################################################################### +## Include the Headers directories for gpu _core, NumCpp +###################################################################### + +# include directories for Specific Subfolders +_gpu_core_inc_dir = [ + 'src', + 'include', +] + +###################################################################### +## Define include_directories for gpu _core, NumCpp +###################################################################### + +# Use the include directory in your build setup +# Specify Include directories where your headers are located +# include_directories(header) -> static_library(mix), library(mix), declare_dependency(mix) +inc_dir_gpu_core = include_directories( + _gpu_core_inc_dir +) +# Optionally add a message to confirm the installation +message( + '\nCpp Core Header compatibility files defined successfully: ' + + '@0@ '.format('lightnumpy/_gpu_core/include') + + 'with NumCpp: @0@'.format(get_option('lnpy_use_gpu_numcpp').enabled()) +) + +###################################################################### +## NumCpp is a header-only library, which means it does not have any precompiled binary +## (like .so, .dll, or .a). +###################################################################### + +# # Static library with C++ source file implementing bindings +# _st_lib_gpu_core = static_library('_st_lib_gpu_core', [], +# include_directories: inc_dir_gpu_core, +# dependencies: [], +# c_args: [], +# cpp_args: [], +# gnu_symbol_visibility: 'inlineshidden', +# link_with: [], +# link_args: [], +# # install: true, +# # install_dir: 'lightnumpy/_gpu_core', +# ) +# # Shared (dynamic) library with C++ source file implementing bindings +# _dyn_lib_gpu_core = library('_dyn_lib_gpu_core', [], +# include_directories: inc_dir_gpu_core, +# dependencies: [], +# c_args: [], +# cpp_args: [], +# gnu_symbol_visibility: 'hidden', +# gnu_symbol_visibility: 'inlineshidden', +# # link_with: [_st_lib_gpu_core], # Link with the static library +# # link_args: ['-shared'], # shared library that can be used by other programs at runtime. 
+# install: true, +# install_dir: 'lightnumpy/_gpu_core', +# ) + +###################################################################### +## Define Dependencies +###################################################################### + +# Get the NumCpp Dependencies with/without library +dep_gpu_core = declare_dependency( + compile_args: cython_cpp_flags, + dependencies: _dep_list_gpu_numcpp, + include_directories: inc_dir_gpu_core, + # link_with: [_st_lib_gpu_core], # Link with the static/shared library + # link_args: ['-shared'], # shared library that can be used by other programs at runtime. +) +# Optionally add a message to confirm the installation +message( + '\nCpp Core Header dependency defined successfully: ' + + '@0@ '.format('dep_gpu_core') + + 'with NumCpp: @0@'.format(get_option('lnpy_use_gpu_numcpp').enabled()) +) + +###################################################################### +## Next +###################################################################### + +###################################################################### +## Optional format, tidy +###################################################################### + +# # Custom targets (optional) +# custom_target('format', +# command: ['clang-format', '-i', '-style=file:.clang-format', '@INPUT@'], +# output: 'formatted', +# input: files('NumCpp.hpp'), # Adjust this pattern to match the actual header files +# ) +# custom_target('tidy', +# command: ['clang-tidy', '-p', meson.build_dir(), '-extra-arg=-std=c++17', '@INPUT@'], +# output: 'tidy', +# input: files('NumCpp.hpp'), # Adjust this pattern to match the actual source files +# ) \ No newline at end of file diff --git a/lightnumpy/_gpu_core/src/.gitkeep b/lightnumpy/_gpu_core/src/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/lightnumpy/_gpu_core/src/b.cu b/lightnumpy/_gpu_core/src/b.cu new file mode 100644 index 0000000..fa3cf69 --- /dev/null +++ b/lightnumpy/_gpu_core/src/b.cu @@ -0,0 +1,5 @@ +#include "b.h" + +__device__ int g[N]; + +__device__ void bar(void) { g[threadIdx.x]++; } \ No newline at end of file diff --git a/lightnumpy/_gpu_core/src/gpuConfig.cu b/lightnumpy/_gpu_core/src/gpuConfig.cu new file mode 100644 index 0000000..3d2cb4b --- /dev/null +++ b/lightnumpy/_gpu_core/src/gpuConfig.cu @@ -0,0 +1,77 @@ +#include + +// https://docs.nvidia.com/cuda/cublas/index.html +#include + +namespace np +{ + // getting GPU Config to launch kernels with the most optimal + int GPU_NUM_SM = 0; + int GPU_NUM_CUDA_CORE = 0; + + // https://docs.nvidia.com/cuda/cublas/index.html#example-code + cublasHandle_t cbls_handle; + + int _ConvertSMVer2Cores(int major, int minor) + { + // Refer to the CUDA Compute Capability documentation for the number of cores per multiprocessor + // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities + switch ((major << 4) + minor) + { + case 0x10: + return 8; // Tesla + case 0x11: + return 8; // Tesla + case 0x12: + return 8; // Tesla + case 0x13: + return 8; // Tesla + case 0x20: + return 32; // Fermi + case 0x21: + return 48; // Fermi + case 0x30: + return 192; // Kepler + case 0x32: + return 192; // Kepler + case 0x35: + return 192; // Kepler + case 0x37: + return 192; // Kepler + case 0x50: + return 128; // Maxwell + case 0x52: + return 128; // Maxwell + case 0x53: + return 128; // Maxwell + case 0x60: + return 64; // Pascal + case 0x61: + return 128; // Pascal + case 0x62: + return 128; // Pascal + case 0x70: + return 64; // Volta + case 0x72: + return 64; // Volta + case 
0x75: + return 64; // Turing + case 0x80: + return 64; // Ampere + case 0x86: + return 128; // Ampere + default: + return -1; // Unknown + } + } + void getGPUConfig(int devId) + { + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, devId); + + GPU_NUM_SM = deviceProp.multiProcessorCount; + GPU_NUM_CUDA_CORE = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor); + + cublasCreate(&cbls_handle); + } +} \ No newline at end of file diff --git a/lightnumpy/_gpu_core/src/gpu_array_ops.cu b/lightnumpy/_gpu_core/src/gpu_array_ops.cu new file mode 100644 index 0000000..721389d --- /dev/null +++ b/lightnumpy/_gpu_core/src/gpu_array_ops.cu @@ -0,0 +1,50 @@ +#include "gpu_array_ops.cuh" + +// Initialize GPU configuration +void init_gpu_config(int device_id) { + np::getGPUConfig(device_id); +} + +// Create a GPU array with default values +void* create_gpu_array(int rows, int cols, float default_value) { + return new np::ArrayGPU<float>(rows, cols, default_value); +} + +// Create a GPU array from a 1D array +void* create_gpu_array_1d(const float* data, int size) { + std::vector<float> vec(data, data + size); + return new np::ArrayGPU<float>(vec); +} + +// Create a GPU array from a 2D array +void* create_gpu_array_2d(const float* data, int rows, int cols) { + std::vector<std::vector<float>> vec(rows, std::vector<float>(cols)); + for (int i = 0; i < rows; ++i) { + for (int j = 0; j < cols; ++j) { + vec[i][j] = data[i * cols + j]; + } + } + return new np::ArrayGPU<float>(vec); +} + +// Get metadata from a GPU array +int get_gpu_array_rows(void* array) { + return static_cast<np::ArrayGPU<float>*>(array)->rows; +} + +int get_gpu_array_cols(void* array) { + return static_cast<np::ArrayGPU<float>*>(array)->cols; +} + +int get_gpu_array_size(void* array) { + return static_cast<np::ArrayGPU<float>*>(array)->size(); +} + +float* get_gpu_array_data(void* array) { + return static_cast<np::ArrayGPU<float>*>(array)->mat; +} + +// Clean up GPU array +void delete_gpu_array(void* array) { + delete static_cast<np::ArrayGPU<float>*>(array); +} diff --git a/lightnumpy/_gpu_core/src/gpu_ops.cu b/lightnumpy/_gpu_core/src/gpu_ops.cu deleted file mode 100644 index 0e5b266..0000000 --- a/lightnumpy/_gpu_core/src/gpu_ops.cu +++ /dev/null @@ -1 +0,0 @@ -// Template: Implement GPU operations using CUDA.
\ No newline at end of file diff --git a/lightnumpy/meson.build b/lightnumpy/meson.build index c4f28a6..3d4fc6e 100644 --- a/lightnumpy/meson.build +++ b/lightnumpy/meson.build @@ -191,6 +191,7 @@ _cython_tree = [ # Subdirectories for different cores subdir('_core') +subdir('_gpu_core') # Subdirectories for python module subdir('cy_bindings') diff --git a/lightnumpy/py_bindings/meson.build b/lightnumpy/py_bindings/meson.build index f238e76..b15c990 100644 --- a/lightnumpy/py_bindings/meson.build +++ b/lightnumpy/py_bindings/meson.build @@ -71,6 +71,26 @@ py_bindings_extension_metadata = { 'install': true, # Whether to install the .so file executable after building 'subdir': 'lightnumpy/py_bindings', # Path where the module is located }, + # Not Implemented + # 'py_gpu_numcpp_api': + # { + # 'sources': [ # C++ source file with NumCpp + Pybind11 bindings + # 'src/py_gpu_numcpp_api.cpp', + # ], + # 'include_directories': [ # Include dirs for compilation + # inc_dir_py_bindings, inc_dir_gpu_core + # ], + # 'dependencies': dep_list, # External libraries and dependencies + # 'link_with': [], # Link with the created static library + # 'override_options': [ + # 'cython_language=cpp', # Ensure Cython knows to generate C++ code + # 'optimization=3' # Optimization level '-O3' + # ], + # 'c_args': cython_c_flags, # Additional C/C++ arguments + # 'cpp_args': cython_cpp_flags, # Additional C/C++ arguments + # 'install': true, # Whether to install the .so file executable after building + # 'subdir': 'lightnumpy/py_bindings', # Path where the module is located + # }, } # https://mesonbuild.com/Syntax.html#foreach-with-a-dictionary diff --git a/lightnumpy/py_bindings/src/py_gpu_numcpp_api.cpp b/lightnumpy/py_bindings/src/py_gpu_numcpp_api.cpp new file mode 100644 index 0000000..afd5e9c --- /dev/null +++ b/lightnumpy/py_bindings/src/py_gpu_numcpp_api.cpp @@ -0,0 +1,40 @@ +#include <pybind11/pybind11.h> +#include <pybind11/numpy.h> +namespace py = pybind11; + +#include "gpu_array_ops.cuh" + + +PYBIND11_MODULE(py_gpu_numcpp_api, m) { + m.doc() = "Python bindings for GPU array operations using numC++"; + + m.def("init_gpu_config", &init_gpu_config, py::arg("device_id"), + "Initialize GPU configuration for the specified device."); + + m.def("create_gpu_array", [](int rows, int cols, float default_value) { + return reinterpret_cast<uintptr_t>(create_gpu_array(rows, cols, default_value)); + }, py::arg("rows"), py::arg("cols"), py::arg("default_value"), + "Create a GPU array with default values."); + + m.def("create_gpu_array_1d", [](py::array_t<float> data) { + py::buffer_info buf = data.request(); + return reinterpret_cast<uintptr_t>(create_gpu_array_1d(static_cast<const float*>(buf.ptr), static_cast<int>(buf.size))); + }, py::arg("data"), "Create a GPU array from a 1D NumPy array."); + + m.def("create_gpu_array_2d", [](py::array_t<float> data, int rows, int cols) { + py::buffer_info buf = data.request(); + return reinterpret_cast<uintptr_t>(create_gpu_array_2d(static_cast<const float*>(buf.ptr), rows, cols)); + }, py::arg("data"), py::arg("rows"), py::arg("cols"), + "Create a GPU array from a 2D NumPy array."); + + m.def("get_gpu_array_metadata", [](uintptr_t array_ptr) { + void* array = reinterpret_cast<void*>(array_ptr); + return py::make_tuple(get_gpu_array_rows(array), + get_gpu_array_cols(array), + get_gpu_array_size(array)); + }, py::arg("array_ptr"), "Get metadata (rows, cols, size) of a GPU array."); + + m.def("delete_gpu_array", [](uintptr_t array_ptr) { + delete_gpu_array(reinterpret_cast<void*>(array_ptr)); + }, py::arg("array_ptr"), "Clean up GPU array."); +}
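Note: the 'py_gpu_numcpp_api' entry in py_bindings/meson.build above is still commented out ("Not Implemented"), so the bindings in py_gpu_numcpp_api.cpp are not built by default yet. For orientation only, a minimal sketch of how those bindings are meant to be driven from Python is shown below; the import path and the availability of CUDA device 0 are assumptions, not something this diff guarantees.

import numpy as np
from lightnumpy.py_bindings import py_gpu_numcpp_api as gpu  # assumed import path once the extension is enabled

gpu.init_gpu_config(0)                                 # select CUDA device 0 and create the cuBLAS handle
host = np.arange(12, dtype=np.float32)                 # host data to copy to the GPU
handle = gpu.create_gpu_array_2d(host, 3, 4)           # opaque handle returned as an integer (uintptr_t)
rows, cols, size = gpu.get_gpu_array_metadata(handle)  # expected: (3, 4, 12)
gpu.delete_gpu_array(handle)                           # handles are not garbage collected; free them explicitly

The handle-based design mirrors the void* C API in gpu_array_ops.cu: Python only ever sees an integer, and ownership stays with the caller until delete_gpu_array is called.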
diff --git a/meson.options b/meson.options index 03afcfe..77f3af0 100644 --- a/meson.options +++ b/meson.options @@ -57,6 +57,7 @@ option('pythran', type: 'feature', value: 'auto', 'the implementation). ' + 'A feature option has three states: "enabled", "disabled" or "auto".') # numcpp +# https://github.com/dpilger26/NumCpp option('lnpy_use_numcpp', type: 'feature', value: 'enabled', description: 'If set to "disabled", disables using numcpp (it falls back ' + 'to either pure Python code or Cython code, depending on ' + @@ -65,4 +66,11 @@ option('lnpy_use_numcpp', type: 'feature', value: 'enabled', option('numcpp_no_use_boost', type: 'boolean', value: true, description: 'Do not use Boost libraries.') option('numcpp_use_multithread', type: 'boolean', value: false, - description: 'Enable multithreading support.') \ No newline at end of file + description: 'Enable multithreading support.') +# gpu numcpp +# https://github.com/Sha-x2-nk/numCpp +option('lnpy_use_gpu_numcpp', type: 'feature', value: 'enabled', + description: 'If set to "disabled", disables using gpu numC++ (it falls back ' + + 'to either pure Python code or Cython code, depending on ' + + 'the implementation). ' + + 'A feature option has three states: "enabled", "disabled" or "auto".') \ No newline at end of file diff --git a/requirements/all_requirements.txt b/requirements/all_requirements.txt new file mode 100644 index 0000000..e2bb14b --- /dev/null +++ b/requirements/all_requirements.txt @@ -0,0 +1,6 @@ +-r build_requirements.txt +-r release_requirements.txt +-r test_requirements.txt +-r ci_requirements.txt +# -r doc_requirements.txt +# -r linter_requirements.txt \ No newline at end of file diff --git a/requirements/build_requirements.txt b/requirements/build_requirements.txt new file mode 100644 index 0000000..47d6a47 --- /dev/null +++ b/requirements/build_requirements.txt @@ -0,0 +1,15 @@ +meson +ninja +meson-python>=0.16.0 +Cython>=3.0.8 +pybind11>=2.13.2 +pythran>=0.14.0 +scipy>=1.6.0 +numpy>=2.0.0 +pandas>=1.1.5 +#pyarrow +matplotlib>=1.4.0 +scikit-learn>=1.4 +joblib +spin==0.13 +build \ No newline at end of file diff --git a/requirements/ci32_requirements.txt b/requirements/ci32_requirements.txt new file mode 100644 index 0000000..032af2d --- /dev/null +++ b/requirements/ci32_requirements.txt @@ -0,0 +1,3 @@ +spin==0.13 +# Keep this in sync with ci_requirements.txt +scipy-openblas32==0.3.28.0.2 \ No newline at end of file diff --git a/requirements/ci_requirements.txt b/requirements/ci_requirements.txt new file mode 100644 index 0000000..74e6738 --- /dev/null +++ b/requirements/ci_requirements.txt @@ -0,0 +1,3 @@ +-r ci32_requirements.txt +# Keep this in sync with ci32_requirements.txt +scipy-openblas64==0.3.28.0.2 \ No newline at end of file diff --git a/requirements/emscripten_test_requirements.txt b/requirements/emscripten_test_requirements.txt new file mode 100644 index 0000000..e684f6c --- /dev/null +++ b/requirements/emscripten_test_requirements.txt @@ -0,0 +1,4 @@ +hypothesis==6.81.1 +pytest==7.4.0 +pytz==2023.3.post1 +pytest-xdist \ No newline at end of file diff --git a/requirements/environment.yml b/requirements/environment.yml new file mode 100644 index 0000000..df86417 --- /dev/null +++ b/requirements/environment.yml @@ -0,0 +1,11 @@ +name: skplt-env +## Library Dependencies +dependencies: +- python=3 +- numpy +- pandas +#- pyarrow +- scipy +- matplotlib +- scikit-learn +- joblib \ No newline at end of file diff --git a/requirements/release_requirements.txt b/requirements/release_requirements.txt new file mode 100644 index 0000000..12886bd --- /dev/null +++
b/requirements/release_requirements.txt @@ -0,0 +1,19 @@ +# These packages are needed for a release in addition to those needed +# for building, testing, and the creation of documentation. + +# download-wheels.py +urllib3 +beautifulsoup4 + +# changelog.py +pygithub +gitpython>=3.1.30 + +# uploading wheels +twine + +# building and notes +Paver + +# uploading release documentation +packaging \ No newline at end of file diff --git a/requirements/requirements.txt b/requirements/requirements.txt new file mode 100644 index 0000000..8ddd39e --- /dev/null +++ b/requirements/requirements.txt @@ -0,0 +1,7 @@ +## Dependencies +scipy>=0.9 +pandas +#pyarrow +matplotlib>=1.4.0 +scikit-learn>=0.24 +joblib>=0.10 \ No newline at end of file diff --git a/requirements/test_requirements.txt b/requirements/test_requirements.txt new file mode 100644 index 0000000..de10147 --- /dev/null +++ b/requirements/test_requirements.txt @@ -0,0 +1,21 @@ +Cython +meson +ninja; sys_platform != "emscripten" +wheel==0.38.1 +setuptools==70.0.0 ; python_version < '3.12' +setuptools ; python_version >= '3.12' +hypothesis>=6.104 +pytest +pytest-cov +pytest-xdist +pytest-timeout +pytz==2023.3.post1 +# for numpy.random.test.test_extending +cffi; python_version < '3.10' +# For testing types. Notes on the restrictions: +# - Mypy relies on C API features not present in PyPy +# NOTE: Keep mypy in sync with environment.yml +mypy==1.13.0; platform_python_implementation != "PyPy" +typing_extensions>=4.2.0 +# for optional f2py encoding detection +charset-normalizer
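The pinned test stack above (pytest, hypothesis, pytest-xdist, mypy) is what would exercise the new GPU bindings in CI. A minimal smoke test could look like the following sketch; the test file name is hypothetical, and it assumes the py_gpu_numcpp_api extension was actually built and a CUDA device is present (it skips cleanly otherwise).

# tests/test_gpu_bindings.py (hypothetical)
import numpy as np
import pytest

# Skip the whole module when the GPU extension is not built/importable.
gpu = pytest.importorskip("lightnumpy.py_bindings.py_gpu_numcpp_api")

def test_gpu_array_roundtrip_metadata():
    gpu.init_gpu_config(0)  # assumes a CUDA-capable device 0
    data = np.ones(8, dtype=np.float32)
    handle = gpu.create_gpu_array_1d(data)
    try:
        _, _, size = gpu.get_gpu_array_metadata(handle)
        assert size == 8
    finally:
        gpu.delete_gpu_array(handle)  # always release the device allocation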