diff --git a/custom-ops-introduction/.gitignore b/custom-ops-introduction/.gitignore
new file mode 100644
index 0000000..ba386de
--- /dev/null
+++ b/custom-ops-introduction/.gitignore
@@ -0,0 +1,8 @@
+# pixi environments
+.pixi
+*.egg-info
+# magic environments
+.magic
+.env
+# build products
+operations.mojopkg
diff --git a/custom-ops-introduction/README.md b/custom-ops-introduction/README.md
new file mode 100644
index 0000000..0fab5f7
--- /dev/null
+++ b/custom-ops-introduction/README.md
@@ -0,0 +1,115 @@
+# Custom Operations: An Introduction to Programming GPUs and CPUs with Mojo
+
+In this recipe, we will cover:
+
+* How to extend a MAX Graph using custom operations.
+* Using Mojo to write high-performance calculations that run on GPUs and CPUs.
+* The basics of GPU programming in MAX.
+
+We'll walk through running three examples that show how to
+
+* add one to every number in an input tensor,
+* perform hardware-specific addition of two vectors,
+* and calculate the Mandelbrot set on CPU and GPU.
+
+Let's get started.
+
+## Requirements
+
+Please make sure your system meets our
+[system requirements](https://docs.modular.com/max/get-started).
+
+To proceed, ensure you have the `magic` CLI installed:
+
+```bash
+curl -ssL https://magic.modular.com/ | bash
+```
+
+or update it via:
+
+```bash
+magic self-update
+```
+
+### GPU requirements
+
+These examples can all be run on either a CPU or GPU. To run them on a GPU,
+ensure your system meets
+[these GPU requirements](https://docs.modular.com/max/faq/#gpu-requirements):
+
+* Officially supported GPUs: NVIDIA Ampere A-series (A100/A10) or Ada
+  L4-series (L4/L40) data center GPUs. Unofficially, RTX 30XX and 40XX series
+  GPUs have been reported to work well with MAX.
+* NVIDIA GPU driver version 555 or higher. [Installation guide here](https://www.nvidia.com/download/index.aspx).
+
+## Quick start
+
+1. Download the code for this recipe using git:
+
+```bash
+git clone https://github.com/modular/max-recipes.git
+cd max-recipes/custom-ops-introduction
+```
+
+2. Run each of the examples:
+
+```bash
+magic run add_one
+magic run vector_addition
+magic run mandelbrot
+```
+
+3. Browse through the commented source code to see how they work.
+
+## Custom operation examples
+
+Graphs in MAX can be extended to use custom operations written in Mojo. The
+following examples are shown here:
+
+* **add_one**: Adding 1 to every element of an input tensor.
+* **vector_addition**: Performing vector addition using a manual GPU function.
+* **mandelbrot**: Calculating the Mandelbrot set.
+
+Custom operations have been written in Mojo to carry out these calculations. For
+each example, a simple graph containing a single operation is constructed
+in Python. This graph is compiled and dispatched onto a supported GPU if one is
+available, or the CPU if not. Input tensors, if there are any, are moved from
+the host to the device on which the graph is running. The graph then runs and
+the results are copied back to the host for display.
+
+One thing to note is that the same Mojo code runs on CPU as well as GPU. When
+the graph is compiled, it is placed on a supported accelerator if one is
+available, falling back to the CPU if not, and no code changes are needed for
+either path. The `vector_addition` example shows how this works under the hood
+for common MAX abstractions, where compile-time specialization lets MAX choose
+the optimal code path for a given hardware architecture.
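+
+As a preview of what that looks like, here is the compile-time branch from the
+`vector_addition` kernel later in this recipe: a single registered operation
+selects its CPU or GPU implementation based on the `target` parameter.
+
+```mojo
+@parameter
+if target == "cpu":
+    _vector_addition_cpu(out, lhs, rhs, ctx)
+elif target == "gpu":
+    _vector_addition_gpu(out, lhs, rhs, ctx)
+```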
+
+The `operations/` directory contains the custom kernel implementations, and the
+graph construction occurs in the Python files in the base directory. These
+examples are designed to stand on their own, so they can be used as templates
+for experimentation.
+
+The execution has two phases: first, an `operations.mojopkg` is compiled from
+the custom Mojo kernels, and then the graph is constructed and run in Python.
+The inference session is pointed at `operations.mojopkg` so that it can load
+the custom operations.
+
+## Conclusion
+
+In this recipe, we've introduced the basics of how to write custom MAX Graph
+operations using Mojo, place them in a one-operation graph in Python, and run
+them on an available CPU or GPU.
+
+## Next Steps
+
+* Follow [our tutorial for building a custom operation from scratch](https://docs.modular.com/max/tutorials/build-custom-ops).
+
+* Explore MAX's [documentation](https://docs.modular.com/max/) for additional
+  features. The [`gpu`](https://docs.modular.com/mojo/stdlib/gpu/) module has
+  detail on Mojo's GPU programming functions and types, and the documentation
+  on [`@compiler.register`](https://docs.modular.com/max/api/mojo-decorators/compiler-register/)
+  shows how to register custom graph operations.
+
+* Join our [Modular Forum](https://forum.modular.com/) and [Discord community](https://discord.gg/modular) to share your experiences and get support.
+
+We're excited to see what you'll build with MAX! Share your projects and experiences with us using `#ModularAI` on social media.
diff --git a/custom-ops-introduction/add_one.py b/custom-ops-introduction/add_one.py
new file mode 100644
index 0000000..15e755e
--- /dev/null
+++ b/custom-ops-introduction/add_one.py
@@ -0,0 +1,74 @@
+# ===----------------------------------------------------------------------=== #
+# Copyright (c) 2025, Modular Inc. All rights reserved.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions:
+# https://llvm.org/LICENSE.txt
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===----------------------------------------------------------------------=== #
+
+from pathlib import Path
+
+import numpy as np
+from max.driver import CPU, Accelerator, Tensor, accelerator_count
+from max.dtype import DType
+from max.engine import InferenceSession
+from max.graph import Graph, TensorType, ops
+
+if __name__ == "__main__":
+    path = Path(__file__).parent / "operations.mojopkg"
+
+    rows = 5
+    columns = 10
+    dtype = DType.float32
+
+    # Configure our simple one-operation graph.
+    graph = Graph(
+        "addition",
+        # The custom Mojo operation is referenced by its string name, and we
+        # need to provide inputs as a list as well as expected output types.
+        forward=lambda x: ops.custom(
+            name="add_one",
+            values=[x],
+            out_types=[TensorType(dtype=x.dtype, shape=x.tensor.shape)],
+        )[0].tensor,
+        input_types=[
+            TensorType(dtype, shape=[rows, columns]),
+        ],
+    )
+
+    # Place the graph on a GPU, if available. Fall back to CPU if not.
+    device = CPU() if accelerator_count() == 0 else Accelerator()
+
+    # Set up an inference session for running the graph.
+    session = InferenceSession(
+        devices=[device],
+        custom_extensions=path,
+    )
+
+    # Compile the graph.
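+    # Loading the graph compiles it for the selected device and resolves the
+    # `add_one` custom operation against the `operations.mojopkg` supplied to
+    # the session above.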
+    model = session.load(graph)
+
+    # Fill an input matrix with random values.
+    x_values = np.random.uniform(size=(rows, columns)).astype(np.float32)
+
+    # Create a driver tensor from this, and move it to the target device.
+    x = Tensor.from_numpy(x_values).to(device)
+
+    # Perform the calculation on the target device.
+    result = model.execute(x)[0]
+
+    # Copy values back to the CPU to be read.
+    assert isinstance(result, Tensor)
+    result = result.to(CPU())
+
+    print("Graph result:")
+    print(result.to_numpy())
+    print()
+
+    print("Expected result:")
+    print(x_values + 1)
diff --git a/custom-ops-introduction/mandelbrot.py b/custom-ops-introduction/mandelbrot.py
new file mode 100644
index 0000000..ba3117b
--- /dev/null
+++ b/custom-ops-introduction/mandelbrot.py
@@ -0,0 +1,94 @@
+# ===----------------------------------------------------------------------=== #
+# Copyright (c) 2025, Modular Inc. All rights reserved.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions:
+# https://llvm.org/LICENSE.txt
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===----------------------------------------------------------------------=== #
+
+from pathlib import Path
+
+from max.driver import CPU, Accelerator, Tensor, accelerator_count
+from max.dtype import DType
+from max.engine import InferenceSession
+from max.graph import Graph, TensorType, ops
+
+
+def create_mandelbrot_graph(
+    width: int,
+    height: int,
+    min_x: float,
+    min_y: float,
+    scale_x: float,
+    scale_y: float,
+    max_iterations: int,
+) -> Graph:
+    """Configure a graph to run a Mandelbrot kernel."""
+    output_dtype = DType.int32
+    with Graph(
+        "mandelbrot",
+    ) as graph:
+        # The custom Mojo operation is referenced by its string name, and we
+        # need to provide inputs as a list as well as expected output types.
+        result = ops.custom(
+            name="mandelbrot",
+            values=[
+                ops.constant(min_x, dtype=DType.float32),
+                ops.constant(min_y, dtype=DType.float32),
+                ops.constant(scale_x, dtype=DType.float32),
+                ops.constant(scale_y, dtype=DType.float32),
+                ops.constant(max_iterations, dtype=DType.int32),
+            ],
+            out_types=[TensorType(dtype=output_dtype, shape=[height, width])],
+        )[0].tensor
+
+        # Return the result of the custom operation as the output of the graph.
+        graph.output(result)
+        return graph
+
+
+if __name__ == "__main__":
+    path = Path(__file__).parent / "operations.mojopkg"
+
+    # Establish Mandelbrot set ranges.
+    WIDTH = 15
+    HEIGHT = 15
+    MAX_ITERATIONS = 100
+    MIN_X = -1.5
+    MAX_X = 0.7
+    MIN_Y = -1.12
+    MAX_Y = 1.12
+
+    # Configure our simple graph.
+    scale_x = (MAX_X - MIN_X) / WIDTH
+    scale_y = (MAX_Y - MIN_Y) / HEIGHT
+    graph = create_mandelbrot_graph(
+        WIDTH, HEIGHT, MIN_X, MIN_Y, scale_x, scale_y, MAX_ITERATIONS
+    )
+
+    # Place the graph on a GPU, if available. Fall back to CPU if not.
+    device = CPU() if accelerator_count() == 0 else Accelerator()
+
+    # Set up an inference session that runs the graph on a GPU, if available.
+    session = InferenceSession(
+        devices=[device],
+        custom_extensions=path,
+    )
+    # Compile the graph.
+    model = session.load(graph)
+
+    # Perform the calculation on the target device.
+    result = model.execute()[0]
+
+    # Copy values back to the CPU to be read.
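+    # NumPy can only read host memory, so the device tensor is copied back
+    # before display.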
+    assert isinstance(result, Tensor)
+    result = result.to(CPU())
+
+    print("Iterations to escape:")
+    print(result.to_numpy())
+    print()
diff --git a/custom-ops-introduction/metadata.yaml b/custom-ops-introduction/metadata.yaml
new file mode 100644
index 0000000..4f2a2a1
--- /dev/null
+++ b/custom-ops-introduction/metadata.yaml
@@ -0,0 +1,17 @@
+version: 1.0
+long_title: "Custom Operations: An Introduction to Programming GPUs and CPUs with Mojo"
+short_title: "Custom Operations: An Introduction"
+author: "Brad Larson"
+author_image: "author/bradlarson.jpg"
+author_url: "https://www.linkedin.com/in/brad-larson-3549a5291/"
+github_repo: "https://github.com/modular/max-recipes/tree/main/custom-ops-introduction"
+date: "23-02-2025"
+difficulty: "beginner"
+tags:
+  - max-graph
+  - gpu-programming
+
+tasks:
+  - magic run add_one
+  - magic run vector_addition
+  - magic run mandelbrot
diff --git a/custom-ops-introduction/operations/__init__.mojo b/custom-ops-introduction/operations/__init__.mojo
new file mode 100644
index 0000000..75c4f82
--- /dev/null
+++ b/custom-ops-introduction/operations/__init__.mojo
@@ -0,0 +1,12 @@
+# ===----------------------------------------------------------------------=== #
+# Copyright (c) 2025, Modular Inc. All rights reserved.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions:
+# https://llvm.org/LICENSE.txt
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===----------------------------------------------------------------------=== #
diff --git a/custom-ops-introduction/operations/add_one.mojo b/custom-ops-introduction/operations/add_one.mojo
new file mode 100644
index 0000000..d96969f
--- /dev/null
+++ b/custom-ops-introduction/operations/add_one.mojo
@@ -0,0 +1,42 @@
+# ===----------------------------------------------------------------------=== #
+# Copyright (c) 2025, Modular Inc. All rights reserved.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions:
+# https://llvm.org/LICENSE.txt
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===----------------------------------------------------------------------=== #
+
+import compiler
+from max.tensor import ManagedTensorSlice, foreach
+from runtime.asyncrt import DeviceContextPtr
+
+from utils.index import IndexList
+
+
+@compiler.register("add_one", num_dps_outputs=1)
+struct AddOne:
+    @staticmethod
+    fn execute[
+        # The kind of device this will be run on: "cpu" or "gpu"
+        target: StringLiteral,
+    ](
+        # Because num_dps_outputs=1, the first argument is the "output".
+        out: ManagedTensorSlice,
+        # The list of inputs starts here.
+        x: ManagedTensorSlice[type = out.type, rank = out.rank],
+        # The context is needed for some GPU calls.
+        ctx: DeviceContextPtr,
+    ):
+        @parameter
+        @always_inline
+        fn elementwise_add_one[
+            width: Int
+        ](idx: IndexList[x.rank]) -> SIMD[x.type, width]:
+            return x.load[width](idx) + 1
+
+        foreach[elementwise_add_one, target=target](out, ctx)
diff --git a/custom-ops-introduction/operations/mandelbrot.mojo b/custom-ops-introduction/operations/mandelbrot.mojo
new file mode 100644
index 0000000..f6ec259
--- /dev/null
+++ b/custom-ops-introduction/operations/mandelbrot.mojo
@@ -0,0 +1,72 @@
+# ===----------------------------------------------------------------------=== #
+# Copyright (c) 2025, Modular Inc. All rights reserved.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions:
+# https://llvm.org/LICENSE.txt
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===----------------------------------------------------------------------=== #
+
+from math import iota
+
+import compiler
+from complex import ComplexSIMD
+from max.tensor import ManagedTensorSlice, foreach
+from runtime.asyncrt import DeviceContextPtr
+
+from utils.index import IndexList
+
+
+alias float_dtype = DType.float32
+
+
+@compiler.register("mandelbrot", num_dps_outputs=1)
+struct Mandelbrot:
+    @staticmethod
+    fn execute[
+        # The kind of device this will be run on: "cpu" or "gpu"
+        target: StringLiteral,
+    ](
+        # Because num_dps_outputs=1, the first argument is the "output".
+        out: ManagedTensorSlice,
+        # The list of inputs starts here.
+        min_x: Float32,
+        min_y: Float32,
+        scale_x: Float32,
+        scale_y: Float32,
+        max_iterations: Int32,
+        # The context is needed for some GPU calls.
+        ctx: DeviceContextPtr,
+    ):
+        @parameter
+        @always_inline
+        fn elementwise_mandelbrot[
+            width: Int
+        ](idx: IndexList[out.rank]) -> SIMD[out.type, width]:
+            var row = idx[0]
+            var col = idx[1]
+            var cx = min_x.cast[float_dtype]() + (
+                col + iota[float_dtype, width]()
+            ) * scale_x.cast[float_dtype]()
+            var cy = min_y.cast[float_dtype]() + row * SIMD[float_dtype, width](
+                scale_y.cast[float_dtype]()
+            )
+            var c = ComplexSIMD[float_dtype, width](cx, cy)
+            var z = ComplexSIMD[float_dtype, width](0, 0)
+            var iters = SIMD[out.type, width](0)
+
+            var in_set_mask: SIMD[DType.bool, width] = True
+            for _ in range(max_iterations):
+                if not any(in_set_mask):
+                    break
+                in_set_mask = z.squared_norm() <= 4
+                iters = in_set_mask.select(iters + 1, iters)
+                z = z.squared_add(c)
+
+            return iters
+
+        foreach[elementwise_mandelbrot, target=target](out, ctx)
diff --git a/custom-ops-introduction/operations/vector_addition.mojo b/custom-ops-introduction/operations/vector_addition.mojo
new file mode 100644
index 0000000..1b19e1d
--- /dev/null
+++ b/custom-ops-introduction/operations/vector_addition.mojo
@@ -0,0 +1,98 @@
+# ===----------------------------------------------------------------------=== #
+# Copyright (c) 2025, Modular Inc. All rights reserved.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions:
+# https://llvm.org/LICENSE.txt
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===----------------------------------------------------------------------=== #
+
+from math import ceildiv
+
+import compiler
+from gpu import block_dim, block_idx, thread_idx
+from gpu.host import DeviceContext
+from max.tensor import ManagedTensorSlice, foreach
+from runtime.asyncrt import DeviceContextPtr
+
+from utils.index import IndexList
+
+
+fn _vector_addition_cpu(
+    out: ManagedTensorSlice,
+    lhs: ManagedTensorSlice[type = out.type, rank = out.rank],
+    rhs: ManagedTensorSlice[type = out.type, rank = out.rank],
+    ctx: DeviceContextPtr,
+):
+    # Warning: This is an extremely inefficient implementation! It's merely an
+    # instructional example of how a dedicated CPU-only path can be specified
+    # for basic vector addition.
+    var vector_length = out.dim_size(0)
+    for i in range(vector_length):
+        out[i] = lhs[i] + rhs[i]
+
+
+fn _vector_addition_gpu(
+    out: ManagedTensorSlice,
+    lhs: ManagedTensorSlice[type = out.type, rank = out.rank],
+    rhs: ManagedTensorSlice[type = out.type, rank = out.rank],
+    ctx: DeviceContextPtr,
+) raises:
+    # Note: The following has not been tuned for any GPU hardware, and is an
+    # instructional example for how a simple GPU function can be constructed
+    # and dispatched.
+    alias BLOCK_SIZE = 16
+    var gpu_ctx = ctx.get_device_context()
+    var vector_length = out.dim_size(0)
+
+    # The function that will be launched and distributed across GPU threads.
+    @parameter
+    fn vector_addition_gpu_kernel(length: Int):
+        var tid = block_dim.x * block_idx.x + thread_idx.x
+        if tid < length:
+            out[tid] = lhs[tid] + rhs[tid]
+
+    # The vector is divided into blocks, rounding up so that any remainder
+    # gets a final partial block.
+    var num_blocks = ceildiv(vector_length, BLOCK_SIZE)
+
+    # The GPU function is compiled and enqueued to run on the GPU across the
+    # 1-D vector, split into blocks of `BLOCK_SIZE` width.
+    gpu_ctx.enqueue_function[vector_addition_gpu_kernel](
+        vector_length, grid_dim=num_blocks, block_dim=BLOCK_SIZE
+    )
+
+
+@compiler.register("vector_addition", num_dps_outputs=1)
+struct VectorAddition:
+    @staticmethod
+    fn execute[
+        # The kind of device this will be run on: "cpu" or "gpu"
+        target: StringLiteral,
+    ](
+        # Because num_dps_outputs=1, the first argument is the "output".
+        out: ManagedTensorSlice[rank=1],
+        # The list of inputs starts here.
+        lhs: ManagedTensorSlice[type = out.type, rank = out.rank],
+        rhs: ManagedTensorSlice[type = out.type, rank = out.rank],
+        # The context is needed for some GPU calls.
+        ctx: DeviceContextPtr,
+    ) raises:
+        # For a simple elementwise operation like this, the `foreach` function
+        # does much more rigorous hardware-specific tuning. We recommend using
+        # that abstraction, with this example serving purely as an illustration
+        # of how lower-level functions can be used to program GPUs via Mojo.
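+        #
+        # As a rough sketch (following the same pattern as the `add_one`
+        # kernel in this recipe, with a hypothetical helper named `add`),
+        # the `foreach`-based equivalent would look like:
+        #
+        #     @parameter
+        #     @always_inline
+        #     fn add[
+        #         width: Int
+        #     ](idx: IndexList[out.rank]) -> SIMD[out.type, width]:
+        #         return lhs.load[width](idx) + rhs.load[width](idx)
+        #
+        #     foreach[add, target=target](out, ctx)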
+
+        # At graph compilation time, we will know what device we are compiling
+        # this operation for, so we can specialize it for the target hardware.
+        @parameter
+        if target == "cpu":
+            _vector_addition_cpu(out, lhs, rhs, ctx)
+        elif target == "gpu":
+            _vector_addition_gpu(out, lhs, rhs, ctx)
+        else:
+            raise Error("No known target:", target)
diff --git a/custom-ops-introduction/pyproject.toml b/custom-ops-introduction/pyproject.toml
new file mode 100644
index 0000000..a9ab84f
--- /dev/null
+++ b/custom-ops-introduction/pyproject.toml
@@ -0,0 +1,31 @@
+[project]
+authors = [{ name = "Modular Inc", email = "hello@modular.com" }]
+description = "Custom Operations: An Introduction"
+name = "custom-ops-introduction"
+requires-python = ">= 3.9,<3.13"
+version = "0.1.0"
+
+[build-system]
+build-backend = "hatchling.build"
+requires = ["hatchling"]
+
+[tool.hatch.build.targets.wheel]
+packages = ["."]
+
+[tool.pixi.project]
+channels = [
+    "conda-forge",
+    "https://conda.modular.com/max-nightly",
+    "https://conda.modular.com/max",
+    "https://repo.prefix.dev/modular-community",
+]
+platforms = ["linux-64", "osx-arm64", "linux-aarch64"]
+
+[tool.pixi.tasks]
+package = "mojo package operations/ -o operations.mojopkg"
+add_one = { cmd = "python add_one.py", depends-on = ["package"] }
+mandelbrot = { cmd = "python mandelbrot.py", depends-on = ["package"] }
+vector_addition = { cmd = "python vector_addition.py", depends-on = ["package"] }
+
+[tool.pixi.dependencies]
+max = ">=25.2.0.dev2025022205"
diff --git a/custom-ops-introduction/vector_addition.py b/custom-ops-introduction/vector_addition.py
new file mode 100644
index 0000000..994061e
--- /dev/null
+++ b/custom-ops-introduction/vector_addition.py
@@ -0,0 +1,87 @@
+# ===----------------------------------------------------------------------=== #
+# Copyright (c) 2025, Modular Inc. All rights reserved.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions:
+# https://llvm.org/LICENSE.txt
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===----------------------------------------------------------------------=== #
+
+from pathlib import Path
+
+import numpy as np
+from max.driver import CPU, Accelerator, Tensor, accelerator_count
+from max.dtype import DType
+from max.engine import InferenceSession
+from max.graph import Graph, TensorType, ops
+
+if __name__ == "__main__":
+    path = Path(__file__).parent / "operations.mojopkg"
+
+    vector_width = 10
+    dtype = DType.float32
+
+    # Configure our simple one-operation graph.
+    with Graph(
+        "vector_addition",
+        input_types=[
+            TensorType(dtype, shape=[vector_width]),
+            TensorType(dtype, shape=[vector_width]),
+        ],
+    ) as graph:
+        # Take in the two inputs to the graph.
+        lhs, rhs = graph.inputs
+        output = ops.custom(
+            name="vector_addition",
+            values=[lhs, rhs],
+            out_types=[
+                TensorType(dtype=lhs.tensor.dtype, shape=lhs.tensor.shape)
+            ],
+        )[0].tensor
+        graph.output(output)
+
+    # Place the graph on a GPU, if available. Fall back to CPU if not.
+    device = CPU() if accelerator_count() == 0 else Accelerator()
+
+    # Set up an inference session for running the graph.
+    session = InferenceSession(
+        devices=[device],
+        custom_extensions=path,
+    )
+
+    # Compile the graph.
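+    # Loading fixes the custom operation's `target` parameter to "cpu" or
+    # "gpu", selecting the matching code path in
+    # operations/vector_addition.mojo.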
+    model = session.load(graph)
+
+    # Fill input vectors with random values.
+    lhs_values = np.random.uniform(size=(vector_width)).astype(np.float32)
+    rhs_values = np.random.uniform(size=(vector_width)).astype(np.float32)
+
+    # Create driver tensors from these, and move them to the target device.
+    lhs_tensor = Tensor.from_numpy(lhs_values).to(device)
+    rhs_tensor = Tensor.from_numpy(rhs_values).to(device)
+
+    # Perform the calculation on the target device.
+    result = model.execute(lhs_tensor, rhs_tensor)[0]
+
+    # Copy values back to the CPU to be read.
+    assert isinstance(result, Tensor)
+    result = result.to(CPU())
+
+    print("Left-hand-side values:")
+    print(lhs_values)
+    print()
+
+    print("Right-hand-side values:")
+    print(rhs_values)
+    print()
+
+    print("Graph result:")
+    print(result.to_numpy())
+    print()
+
+    print("Expected result:")
+    print(lhs_values + rhs_values)
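+
+    # Both printouts should match, since the graph performs the same float32
+    # elementwise addition as the NumPy expression above.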