From 8afd65317794e0c08d7959b0d2b84b18e078f688 Mon Sep 17 00:00:00 2001 From: Brad Larson Date: Sun, 23 Feb 2025 13:51:08 -0600 Subject: [PATCH 1/3] Initial custom ops introductory recipe. --- custom-ops-introduction/.gitignore | 8 ++ custom-ops-introduction/README.md | 115 ++++++++++++++++++ custom-ops-introduction/add_one.py | 74 +++++++++++ custom-ops-introduction/mandelbrot.py | 94 ++++++++++++++ custom-ops-introduction/metadata.yaml | 17 +++ .../operations/__init__.mojo | 12 ++ .../operations/add_one.mojo | 42 +++++++ .../operations/mandelbrot.mojo | 72 +++++++++++ .../operations/vector_addition.mojo | 98 +++++++++++++++ custom-ops-introduction/pyproject.toml | 31 +++++ custom-ops-introduction/vector_addition.py | 87 +++++++++++++ 11 files changed, 650 insertions(+) create mode 100644 custom-ops-introduction/.gitignore create mode 100644 custom-ops-introduction/README.md create mode 100644 custom-ops-introduction/add_one.py create mode 100644 custom-ops-introduction/mandelbrot.py create mode 100644 custom-ops-introduction/metadata.yaml create mode 100644 custom-ops-introduction/operations/__init__.mojo create mode 100644 custom-ops-introduction/operations/add_one.mojo create mode 100644 custom-ops-introduction/operations/mandelbrot.mojo create mode 100644 custom-ops-introduction/operations/vector_addition.mojo create mode 100644 custom-ops-introduction/pyproject.toml create mode 100644 custom-ops-introduction/vector_addition.py diff --git a/custom-ops-introduction/.gitignore b/custom-ops-introduction/.gitignore new file mode 100644 index 0000000..ba386de --- /dev/null +++ b/custom-ops-introduction/.gitignore @@ -0,0 +1,8 @@ +# pixi environments +.pixi +*.egg-info +# magic environments +.magic +.env +# build products +operations.mojopkg diff --git a/custom-ops-introduction/README.md b/custom-ops-introduction/README.md new file mode 100644 index 0000000..0fab5f7 --- /dev/null +++ b/custom-ops-introduction/README.md @@ -0,0 +1,115 @@ +# Custom Operations: An Introduction to Programming GPUs and CPUs with Mojo + +In this recipe, we will cover: + +* How to extend a MAX Graph using custom operations. +* Using Mojo to write high-performance calculations that run on GPUs and CPUs. +* The basics of GPU programming in MAX. + +We'll walk through running three examples that show + +* adding one to every number in an input tensor +* performing hardware-specific addition of two vectors +* and calculating the Mandelbrot set on CPU and GPU. + +Let's get started. + +## Requirements + +Please make sure your system meets our +[system requirements](https://docs.modular.com/max/get-started). + +To proceed, ensure you have the `magic` CLI installed: + +```bash +curl -ssL https://magic.modular.com/ | bash +``` + +or update it via: + +```bash +magic self-update +``` + +### GPU requirements + +These examples can all be run on either a CPU or GPU. To run them on a GPU, +ensure your system meets +[these GPU requirements](https://docs.modular.com/max/faq/#gpu-requirements): + +* Officially supported GPUs: NVIDIA Ampere A-series (A100/A10), or Ada + L4-series (L4/L40) data center GPUs. Unofficially, RTX 30XX and 40XX series + GPUs have been reported to work well with MAX. +* NVIDIA GPU driver version 555 or higher. [Installation guide here](https://www.nvidia.com/download/index.aspx). + +## Quick start + +1. Download the code for this recipe using git: + +```bash +git clone https://github.com/modular/max-recipes.git +cd max-recipes/custom-ops-introduction +``` + +2. 
Run each of the examples: + +```bash +magic run add_one +magic run vector_addition +magic run mandelbrot +``` + +3. Browse through the commented source code to see how they work. + +## Custom operation examples + +Graphs in MAX can be extended to use custom operations written in Mojo. The +following examples are shown here: + +* **add_one**: Adding 1 to every element of an input tensor. +* **vector_addition**: Performing vector addition using a manual GPU function. +* **mandelbrot**: Calculating the Mandelbrot set. + +Custom operations have been written in Mojo to carry out these calculations. For +each example, a simple graph containing a single operation is constructed +in Python. This graph is compiled and dispatched onto a supported GPU if one is +available, or the CPU if not. Input tensors, if there are any, are moved from +the host to the device on which the graph is running. The graph then runs and +the results are copied back to the host for display. + +One thing to note is that this same Mojo code runs on CPU as well as GPU. In +the construction of the graph, it runs on a supported accelerator if one is +available or falls back to the CPU if not. No code changes for either path. +The `vector_addition` example shows how this works under the hood for common +MAX abstractions, where compile-time specialization lets MAX choose the optimal +code path for a given hardware architecture. + +The `operations/` directory contains the custom kernel implementations, and the +graph construction occurs in the Python files in the base directory. These +examples are designed to stand on their own, so that they can be used as +templates for experimentation. + +The execution has two phases: first an `operations.mojopkg` is compiled from the +custom Mojo kernel, and then the graph is constructed and run in Python. The +inference session is pointed to the `operations.mojopkg` in order to load the +custom operations. + +## Conclusion + +In this recipe, we've introduced the basics of how to write custom MAX Graph +operations using Mojo, place them in a one-operation graph in Python, and run +them on an available CPU or GPU. + +## Next Steps + +* Follow [our tutorial for building a custom operation from scratch](https://docs.modular.com/max/tutorials/build-custom-ops). + +* Explore MAX's [documentation](https://docs.modular.com/max/) for additional + features. The [`gpu`](https://docs.modular.com/mojo/stdlib/gpu/) module has + detail on Mojo's GPU programming functions and types, and the documentation + on [`@compiler.register`](https://docs.modular.com/max/api/mojo-decorators/compiler-register/) + shows how to register custom graph operations. + +* Join our [Modular Forum](https://forum.modular.com/) and [Discord community](https://discord.gg/modular) to share your experiences and get support. + +We're excited to see what you'll build with MAX! Share your projects and experiences with us using `#ModularAI` on social media. diff --git a/custom-ops-introduction/add_one.py b/custom-ops-introduction/add_one.py new file mode 100644 index 0000000..15e755e --- /dev/null +++ b/custom-ops-introduction/add_one.py @@ -0,0 +1,74 @@ +# ===----------------------------------------------------------------------=== # +# Copyright (c) 2025, Modular Inc. All rights reserved. 
+# +# Licensed under the Apache License v2.0 with LLVM Exceptions: +# https://llvm.org/LICENSE.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ===----------------------------------------------------------------------=== # + +from pathlib import Path + +import numpy as np +from max.driver import CPU, Accelerator, Tensor, accelerator_count +from max.dtype import DType +from max.engine import InferenceSession +from max.graph import Graph, TensorType, ops + +if __name__ == "__main__": + path = Path(__file__).parent / "operations.mojopkg" + + rows = 5 + columns = 10 + dtype = DType.float32 + + # Configure our simple one-operation graph. + graph = Graph( + "addition", + # The custom Mojo operation is referenced by its string name, and we + # need to provide inputs as a list as well as expected output types. + forward=lambda x: ops.custom( + name="add_one", + values=[x], + out_types=[TensorType(dtype=x.dtype, shape=x.tensor.shape)], + )[0].tensor, + input_types=[ + TensorType(dtype, shape=[rows, columns]), + ], + ) + + # Place the graph on a GPU, if available. Fall back to CPU if not. + device = CPU() if accelerator_count() == 0 else Accelerator() + + # Set up an inference session for running the graph. + session = InferenceSession( + devices=[device], + custom_extensions=path, + ) + + # Compile the graph. + model = session.load(graph) + + # Fill an input matrix with random values. + x_values = np.random.uniform(size=(rows, columns)).astype(np.float32) + + # Create a driver tensor from this, and move it to the accelerator. + x = Tensor.from_numpy(x_values).to(device) + + # Perform the calculation on the target device. + result = model.execute(x)[0] + + # Copy values back to the CPU to be read. + assert isinstance(result, Tensor) + result = result.to(CPU()) + + print("Graph result:") + print(result.to_numpy()) + print() + + print("Expected result:") + print(x_values + 1) diff --git a/custom-ops-introduction/mandelbrot.py b/custom-ops-introduction/mandelbrot.py new file mode 100644 index 0000000..ba3117b --- /dev/null +++ b/custom-ops-introduction/mandelbrot.py @@ -0,0 +1,94 @@ +# ===----------------------------------------------------------------------=== # +# Copyright (c) 2025, Modular Inc. All rights reserved. +# +# Licensed under the Apache License v2.0 with LLVM Exceptions: +# https://llvm.org/LICENSE.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ===----------------------------------------------------------------------=== # + +from pathlib import Path + +from max.driver import CPU, Accelerator, Tensor, accelerator_count +from max.dtype import DType +from max.engine import InferenceSession +from max.graph import Graph, TensorType, ops + + +def create_mandelbrot_graph( + width: int, + height: int, + min_x: float, + min_y: float, + scale_x: float, + scale_y: float, + max_iterations: int, +) -> Graph: + """Configure a graph to run a Mandelbrot kernel.""" + output_dtype = DType.int32 + with Graph( + "mandelbrot", + ) as graph: + # The custom Mojo operation is referenced by its string name, and we + # need to provide inputs as a list as well as expected output types. + result = ops.custom( + name="mandelbrot", + values=[ + ops.constant(min_x, dtype=DType.float32), + ops.constant(min_y, dtype=DType.float32), + ops.constant(scale_x, dtype=DType.float32), + ops.constant(scale_y, dtype=DType.float32), + ops.constant(max_iterations, dtype=DType.int32), + ], + out_types=[TensorType(dtype=output_dtype, shape=[height, width])], + )[0].tensor + + # Return the result of the custom operation as the output of the graph. + graph.output(result) + return graph + + +if __name__ == "__main__": + path = Path(__file__).parent / "operations.mojopkg" + + # Establish Mandelbrot set ranges. + WIDTH = 15 + HEIGHT = 15 + MAX_ITERATIONS = 100 + MIN_X = -1.5 + MAX_X = 0.7 + MIN_Y = -1.12 + MAX_Y = 1.12 + + # Configure our simple graph. + scale_x = (MAX_X - MIN_X) / WIDTH + scale_y = (MAX_Y - MIN_Y) / HEIGHT + graph = create_mandelbrot_graph( + WIDTH, HEIGHT, MIN_X, MIN_Y, scale_x, scale_y, MAX_ITERATIONS + ) + + # Place the graph on a GPU, if available. Fall back to CPU if not. + device = CPU() if accelerator_count() == 0 else Accelerator() + + # Set up an inference session that runs the graph on a GPU, if available. + session = InferenceSession( + devices=[device], + custom_extensions=path, + ) + # Compile the graph. + model = session.load(graph) + + # Perform the calculation on the target device. + result = model.execute()[0] + + # Copy values back to the CPU to be read. + assert isinstance(result, Tensor) + result = result.to(CPU()) + + print("Iterations to escape:") + print(result.to_numpy()) + print() diff --git a/custom-ops-introduction/metadata.yaml b/custom-ops-introduction/metadata.yaml new file mode 100644 index 0000000..4f2a2a1 --- /dev/null +++ b/custom-ops-introduction/metadata.yaml @@ -0,0 +1,17 @@ +version: 1.0 +long_title: "Custom Operations: An Introduction to Programming GPUs and CPUs with Mojo" +short_title: "Custom Operations: An Introduction" +author: "Brad Larson" +author_image: "author/bradlarson.jpg" +author_url: "https://www.linkedin.com/in/brad-larson-3549a5291/" +github_repo: "https://github.com/modular/max-recipes/tree/main/custom-ops-introduction" +date: "23-02-2025" +difficulty: "beginner" +tags: + - max-graph + - gpu-programming + +tasks: + - magic run add_one + - magic run vector_addition + - magic run mandelbrot diff --git a/custom-ops-introduction/operations/__init__.mojo b/custom-ops-introduction/operations/__init__.mojo new file mode 100644 index 0000000..75c4f82 --- /dev/null +++ b/custom-ops-introduction/operations/__init__.mojo @@ -0,0 +1,12 @@ +# ===----------------------------------------------------------------------=== # +# Copyright (c) 2025, Modular Inc. All rights reserved. 
+# +# Licensed under the Apache License v2.0 with LLVM Exceptions: +# https://llvm.org/LICENSE.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ===----------------------------------------------------------------------=== # diff --git a/custom-ops-introduction/operations/add_one.mojo b/custom-ops-introduction/operations/add_one.mojo new file mode 100644 index 0000000..d96969f --- /dev/null +++ b/custom-ops-introduction/operations/add_one.mojo @@ -0,0 +1,42 @@ +# ===----------------------------------------------------------------------=== # +# Copyright (c) 2025, Modular Inc. All rights reserved. +# +# Licensed under the Apache License v2.0 with LLVM Exceptions: +# https://llvm.org/LICENSE.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ===----------------------------------------------------------------------=== # + +import compiler +from max.tensor import ManagedTensorSlice, foreach +from runtime.asyncrt import DeviceContextPtr + +from utils.index import IndexList + + +@compiler.register("add_one", num_dps_outputs=1) +struct AddOne: + @staticmethod + fn execute[ + # The kind of device this will be run on: "cpu" or "gpu" + target: StringLiteral, + ]( + # as num_dps_outputs=1, the first argument is the "output" + out: ManagedTensorSlice, + # starting here are the list of inputs + x: ManagedTensorSlice[type = out.type, rank = out.rank], + # the context is needed for some GPU calls + ctx: DeviceContextPtr, + ): + @parameter + @always_inline + fn elementwise_add_one[ + width: Int + ](idx: IndexList[x.rank]) -> SIMD[x.type, width]: + return x.load[width](idx) + 1 + + foreach[elementwise_add_one, target=target](out, ctx) diff --git a/custom-ops-introduction/operations/mandelbrot.mojo b/custom-ops-introduction/operations/mandelbrot.mojo new file mode 100644 index 0000000..f6ec259 --- /dev/null +++ b/custom-ops-introduction/operations/mandelbrot.mojo @@ -0,0 +1,72 @@ +# ===----------------------------------------------------------------------=== # +# Copyright (c) 2025, Modular Inc. All rights reserved. +# +# Licensed under the Apache License v2.0 with LLVM Exceptions: +# https://llvm.org/LICENSE.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ===----------------------------------------------------------------------=== # + +from math import iota + +import compiler +from complex import ComplexSIMD +from max.tensor import ManagedTensorSlice, foreach +from runtime.asyncrt import DeviceContextPtr + +from utils.index import IndexList + + +alias float_dtype = DType.float32 + + +@compiler.register("mandelbrot", num_dps_outputs=1) +struct Mandelbrot: + @staticmethod + fn execute[ + # The kind of device this will be run on: "cpu" or "gpu" + target: StringLiteral, + ]( + # as num_dps_outputs=1, the first argument is the "output" + out: ManagedTensorSlice, + # starting here are the list of inputs + min_x: Float32, + min_y: Float32, + scale_x: Float32, + scale_y: Float32, + max_iterations: Int32, + # the context is needed for some GPU calls + ctx: DeviceContextPtr, + ): + @parameter + @always_inline + fn elementwise_mandelbrot[ + width: Int + ](idx: IndexList[out.rank]) -> SIMD[out.type, width]: + var row = idx[0] + var col = idx[1] + var cx = min_x.cast[float_dtype]() + ( + col + iota[float_dtype, width]() + ) * scale_x.cast[float_dtype]() + var cy = min_y.cast[float_dtype]() + row * SIMD[float_dtype, width]( + scale_y.cast[float_dtype]() + ) + var c = ComplexSIMD[float_dtype, width](cx, cy) + var z = ComplexSIMD[float_dtype, width](0, 0) + var iters = SIMD[out.type, width](0) + + var in_set_mask: SIMD[DType.bool, width] = True + for _ in range(max_iterations): + if not any(in_set_mask): + break + in_set_mask = z.squared_norm() <= 4 + iters = in_set_mask.select(iters + 1, iters) + z = z.squared_add(c) + + return iters + + foreach[elementwise_mandelbrot, target=target](out, ctx) diff --git a/custom-ops-introduction/operations/vector_addition.mojo b/custom-ops-introduction/operations/vector_addition.mojo new file mode 100644 index 0000000..1b19e1d --- /dev/null +++ b/custom-ops-introduction/operations/vector_addition.mojo @@ -0,0 +1,98 @@ +# ===----------------------------------------------------------------------=== # +# Copyright (c) 2025, Modular Inc. All rights reserved. +# +# Licensed under the Apache License v2.0 with LLVM Exceptions: +# https://llvm.org/LICENSE.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ===----------------------------------------------------------------------=== # + +from math import ceildiv + +import compiler +from gpu import block_dim, block_idx, thread_idx +from gpu.host import DeviceContext +from runtime.asyncrt import DeviceContextPtr +from tensor import ManagedTensorSlice, foreach + +from utils.index import IndexList + + +fn _vector_addition_cpu( + out: ManagedTensorSlice, + lhs: ManagedTensorSlice[type = out.type, rank = out.rank], + rhs: ManagedTensorSlice[type = out.type, rank = out.rank], + ctx: DeviceContextPtr, +): + # Warning: This is an extremely inefficient implementation! It's merely an + # instructional example of how a dedicated CPU-only path can be specified + # for basic vector addition. 
+ var vector_length = out.dim_size(0) + for i in range(vector_length): + out[i] = lhs[i] + rhs[i] + + +fn _vector_addition_gpu( + out: ManagedTensorSlice, + lhs: ManagedTensorSlice[type = out.type, rank = out.rank], + rhs: ManagedTensorSlice[type = out.type, rank = out.rank], + ctx: DeviceContextPtr, +) raises: + # Note: The following has not been tuned for any GPU hardware, and is an + # instructional example for how a simple GPU function can be constructed + # and dispatched. + alias BLOCK_SIZE = 16 + var gpu_ctx = ctx.get_device_context() + var vector_length = out.dim_size(0) + + # The function that will be launched and distributed across GPU threads. + @parameter + fn vector_addition_gpu_kernel(length: Int): + var tid = block_dim.x * block_idx.x + thread_idx.x + if tid < length: + out[tid] = lhs[tid] + rhs[tid] + + # The vector is divided up into blocks, making sure there's an extra + # full block for any remainder. + var num_blocks = ceildiv(vector_length, BLOCK_SIZE) + + # The GPU function is compiled and enqueued to run on the GPU across the + # 1-D vector, split into blocks of `BLOCK_SIZE` width. + gpu_ctx.enqueue_function[vector_addition_gpu_kernel]( + vector_length, grid_dim=num_blocks, block_dim=BLOCK_SIZE + ) + + +@compiler.register("vector_addition", num_dps_outputs=1) +struct VectorAddition: + @staticmethod + fn execute[ + # The kind of device this will be run on: "cpu" or "gpu" + target: StringLiteral, + ]( + # as num_dps_outputs=1, the first argument is the "output" + out: ManagedTensorSlice[rank=1], + # starting here are the list of inputs + lhs: ManagedTensorSlice[type = out.type, rank = out.rank], + rhs: ManagedTensorSlice[type = out.type, rank = out.rank], + # the context is needed for some GPU calls + ctx: DeviceContextPtr, + ) raises: + # For a simple elementwise operation like this, the `foreach` function + # does much more rigorous hardware-specific tuning. We recommend using + # that abstraction, with this example serving purely as an illustration + # of how lower-level functions can be used to program GPUs via Mojo. + + # At graph compilation time, we will know what device we are compiling + # this operation for, so we can specialize it for the target hardware. 
+ @parameter + if target == "cpu": + _vector_addition_cpu(out, lhs, rhs, ctx) + elif target == "gpu": + _vector_addition_gpu(out, lhs, rhs, ctx) + else: + raise Error("No known target:", target) diff --git a/custom-ops-introduction/pyproject.toml b/custom-ops-introduction/pyproject.toml new file mode 100644 index 0000000..a9ab84f --- /dev/null +++ b/custom-ops-introduction/pyproject.toml @@ -0,0 +1,31 @@ +[project] +authors = [{ name = "Modular Inc", email = "hello@modular.com" }] +description = "Custom Operations: An Introduction" +name = "custom-ops-introduction" +requires-python = ">= 3.9,<3.13" +version = "0.1.0" + +[build-system] +build-backend = "hatchling.build" +requires = ["hatchling"] + +[tool.hatch.build.targets.wheel] +packages = ["."] + +[tool.pixi.project] +channels = [ + "conda-forge", + "https://conda.modular.com/max-nightly", + "https://conda.modular.com/max", + "https://repo.prefix.dev/modular-community", +] +platforms = ["linux-64", "osx-arm64", "linux-aarch64"] + +[tool.pixi.tasks] +package = "mojo package operations/ -o operations.mojopkg" +add_one = { cmd = "python add_one.py", depends-on = ["package"] } +mandelbrot = { cmd = "python mandelbrot.py", depends-on = ["package"] } +vector_addition = { cmd = "python vector_addition.py", depends-on = ["package"] } + +[tool.pixi.dependencies] +max = ">=25.2.0.dev2025022205" diff --git a/custom-ops-introduction/vector_addition.py b/custom-ops-introduction/vector_addition.py new file mode 100644 index 0000000..994061e --- /dev/null +++ b/custom-ops-introduction/vector_addition.py @@ -0,0 +1,87 @@ +# ===----------------------------------------------------------------------=== # +# Copyright (c) 2025, Modular Inc. All rights reserved. +# +# Licensed under the Apache License v2.0 with LLVM Exceptions: +# https://llvm.org/LICENSE.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ===----------------------------------------------------------------------=== # + +from pathlib import Path + +import numpy as np +from max.driver import CPU, Accelerator, Tensor, accelerator_count +from max.dtype import DType +from max.engine import InferenceSession +from max.graph import Graph, TensorType, ops + +if __name__ == "__main__": + path = Path(__file__).parent / "operations.mojopkg" + + vector_width = 10 + dtype = DType.float32 + + # Configure our simple one-operation graph. + with Graph( + "vector_addition", + input_types=[ + TensorType(dtype, shape=[vector_width]), + TensorType(dtype, shape=[vector_width]), + ], + ) as graph: + # Take in the two inputs to the graph. + lhs, rhs = graph.inputs + output = ops.custom( + name="vector_addition", + values=[lhs, rhs], + out_types=[ + TensorType(dtype=lhs.tensor.dtype, shape=lhs.tensor.shape) + ], + )[0].tensor + graph.output(output) + + # Place the graph on a GPU, if available. Fall back to CPU if not. + device = CPU() if accelerator_count() == 0 else Accelerator() + + # Set up an inference session for running the graph. + session = InferenceSession( + devices=[device], + custom_extensions=path, + ) + + # Compile the graph. + model = session.load(graph) + + # Fill input matrices with random values. 
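+    # The arrays are created as float32 values to match the dtype declared
+    # for the graph's input tensor types above.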
+ lhs_values = np.random.uniform(size=(vector_width)).astype(np.float32) + rhs_values = np.random.uniform(size=(vector_width)).astype(np.float32) + + # Create driver tensors from this, and move them to the accelerator. + lhs_tensor = Tensor.from_numpy(lhs_values).to(device) + rhs_tensor = Tensor.from_numpy(rhs_values).to(device) + + # Perform the calculation on the target device. + result = model.execute(lhs_tensor, rhs_tensor)[0] + + # Copy values back to the CPU to be read. + assert isinstance(result, Tensor) + result = result.to(CPU()) + + print("Left-hand-side values:") + print(lhs_values) + print() + + print("Right-hand-side values:") + print(rhs_values) + print() + + print("Graph result:") + print(result.to_numpy()) + print() + + print("Expected result:") + print(lhs_values + rhs_values) From f77f00caf82123ea50123141790fd3aad1e49395 Mon Sep 17 00:00:00 2001 From: Brad Larson Date: Thu, 27 Feb 2025 14:01:05 -0600 Subject: [PATCH 2/3] Expand the documentation of the examples, remove the underscores to clean up two function names. --- custom-ops-introduction/README.md | 288 +++++++++++++++++- .../operations/vector_addition.mojo | 8 +- 2 files changed, 279 insertions(+), 17 deletions(-) diff --git a/custom-ops-introduction/README.md b/custom-ops-introduction/README.md index 0fab5f7..09a5513 100644 --- a/custom-ops-introduction/README.md +++ b/custom-ops-introduction/README.md @@ -61,29 +61,42 @@ magic run mandelbrot 3. Browse through the commented source code to see how they work. -## Custom operation examples +## Getting started with MAX Graph custom operations on GPU / CPU -Graphs in MAX can be extended to use custom operations written in Mojo. The -following examples are shown here: +AI models in [MAX](https://docs.modular.com/max/intro) are built as +computational graphs using the +[MAX Graph API](https://docs.modular.com/max/tutorials/get-started-with-max-graph-in-python). +MAX contains within it a powerful graph compiler that can take these graphs +and optimize them for best performance on a wide range of hardware. + +Each node in a MAX Graph is defined by an operation that performs a calculation +on zero or more inputs and produces one or more outputs. These inputs and +outputs tend to be in the form of tensors, and the operations are usually +data-parallel calculations that are accelerated on CPUs or GPUs. In MAX, +these operations are written using [Mojo](https://docs.modular.com/mojo/manual/), +a Python-family language built for high-performance computation. + +Anyone can write their own custom operations in Mojo and place them in a graph +to run accelerated compute on CPUs or GPUs. The examples in this recipe show +very basic custom operations and how to run them on CPUs or GPUs within a +graph. More advanced use cases are available +[in other recipes](https://builds.modular.com/?category=recipes). + +For a guided tour on how to create a custom operation, see +[our introductory step-by-step tutorial](https://docs.modular.com/max/tutorials/build-custom-ops). + +The following examples of custom operations are shown here: * **add_one**: Adding 1 to every element of an input tensor. * **vector_addition**: Performing vector addition using a manual GPU function. * **mandelbrot**: Calculating the Mandelbrot set. -Custom operations have been written in Mojo to carry out these calculations. For -each example, a simple graph containing a single operation is constructed +For each example, a simple graph containing a single operation is constructed in Python. 
This graph is compiled and dispatched onto a supported GPU if one is
 available, or the CPU if not. Input tensors, if there are any, are moved from
 the host to the device on which the graph is running. The graph then runs and
 the results are copied back to the host for display.
 
-One thing to note is that this same Mojo code runs on CPU as well as GPU. In
-the construction of the graph, it runs on a supported accelerator if one is
-available or falls back to the CPU if not. No code changes for either path.
-The `vector_addition` example shows how this works under the hood for common
-MAX abstractions, where compile-time specialization lets MAX choose the optimal
-code path for a given hardware architecture.
-
 The `operations/` directory contains the custom kernel implementations, and the
 graph construction occurs in the Python files in the base directory. These
 examples are designed to stand on their own, so that they can be used as
 templates for experimentation.
 
 The execution has two phases: first an `operations.mojopkg` is compiled from the
 custom Mojo kernel, and then the graph is constructed and run in Python. The
 inference session is pointed to the `operations.mojopkg` in order to load the
 custom operations.
 
+Let's examine each example in detail:
+
+### Adding one to each element in a tensor
+
+In this example, we present a pretty simple "hello world" use case where we
+add the number 1 to each element of a tensor in parallel. The operation itself
+is defined in Mojo within `operations/add_one.mojo`:
+
+```mojo
+@compiler.register("add_one", num_dps_outputs=1)
+struct AddOne:
+    @staticmethod
+    fn execute[
+        target: StringLiteral,
+    ](
+        out: ManagedTensorSlice,
+        x: ManagedTensorSlice[type = out.type, rank = out.rank],
+        ctx: DeviceContextPtr,
+    ):
+        @parameter
+        @always_inline
+        fn elementwise_add_one[
+            width: Int
+        ](idx: IndexList[x.rank]) -> SIMD[x.type, width]:
+            return x.load[width](idx) + 1
+
+        foreach[elementwise_add_one, target=target](out, ctx)
+```
+
+Much of the code that you see is used to describe the computational node to
+the graph compiler: what the inputs, outputs, and parameters are. The actual
+calculation is in the latter half of the above. A closure named
+`elementwise_add_one()` is defined that loads a SIMD vector of values from the
+input tensor, adds one to each position in the vector, and then returns the
+result.
+
+This closure is then run against every position in the output tensor using an
+algorithmic abstraction defined in MAX called `foreach()`. At compile time,
+this elementwise calculation is specialized for the hardware it is being run
+on, with separate execution paths for CPUs and GPUs, as well as
+device-specific optimizations. This means that this same code will run with
+optimal performance on CPU or GPU with no changes required.
+
+The graph that runs this operation is defined and run in the `add_one.py`
+Python file. In this case, a single-operation graph that takes in one input
+tensor and provides one output tensor is created:
+
+```python
+graph = Graph(
+    "addition",
+    # The custom Mojo operation is referenced by its string name, and we
+    # need to provide inputs as a list as well as expected output types.
+    forward=lambda x: ops.custom(
+        name="add_one",
+        values=[x],
+        out_types=[TensorType(dtype=x.dtype, shape=x.tensor.shape)],
+    )[0].tensor,
+    input_types=[
+        TensorType(dtype, shape=[rows, columns]),
+    ],
+)
+```
+
+To see the entire example in action, it can be built and run using one command:
+
+```sh
+magic run add_one
+```
+
+This will build the Mojo custom operation, create the graph, and run it on a
+randomly-initialized tensor. The same calculation is performed in parallel
+using NumPy, and the results from both NumPy and the MAX Graph with this
+operation will be printed to the console and should match.
+
+### Hardware-specific manual vector addition
+
+The first example seen above used a high-level, device-independent abstraction
+for performing calculations on each element of a tensor, one that provides
+great out-of-the-box performance for a range of CPUs and GPUs. However, it's
+possible to write your own hardware-specific data-parallel algorithms if you
+desire to work at a lower level. This next example illustrates how to perform
+parallel addition of elements in two vectors on GPUs using a programming model
+that may be more familiar to those used to general-purpose GPU programming in
+CUDA and similar frameworks.
+
+The vector addition operation is defined in `operations/vector_addition.mojo`.
+Within the body of the operation is the following code:
+
+```mojo
+@parameter
+if target == "cpu":
+    vector_addition_cpu(out, lhs, rhs, ctx)
+elif target == "gpu":
+    vector_addition_gpu(out, lhs, rhs, ctx)
+else:
+    raise Error("No known target:", target)
+```
+
+When the operation is compiled within the graph, a `target` will be provided.
+Using the `@parameter if` language construct in Mojo, the
+`vector_addition_gpu()` path in the operation will only be compiled if the
+operation will be running on a GPU. Compile-time specialization like this is a
+unique and powerful feature of Mojo that makes it easy to optimize code for
+specific hardware.
+
+The body of the `vector_addition_gpu()` function that performs the addition
+looks like the following:
+
+```mojo
+alias BLOCK_SIZE = 16
+var gpu_ctx = ctx.get_device_context()
+var vector_length = out.dim_size(0)
+
+@parameter
+fn vector_addition_gpu_kernel(length: Int):
+    var tid = block_dim.x * block_idx.x + thread_idx.x
+    if tid < length:
+        out[tid] = lhs[tid] + rhs[tid]
+
+var num_blocks = ceildiv(vector_length, BLOCK_SIZE)
+gpu_ctx.enqueue_function[vector_addition_gpu_kernel](
+    vector_length, grid_dim=num_blocks, block_dim=BLOCK_SIZE
+)
+```
+
+This may look familiar if you've worked with CUDA C kernels and the way they
+are dispatched onto a GPU. A `vector_addition_gpu_kernel()` closure is defined
+that will run once per thread on the GPU, adding an element from the `lhs`
+vector to the matching element in the `rhs` vector and then saving the result
+at the correct position in the `out` vector. This function is then run across
+a grid of `num_blocks` blocks, each containing `BLOCK_SIZE` threads.
+
+The block size is arbitrary here, and is not tuned for the specific GPU
+hardware this will be run on. The previously-mentioned `foreach()` abstraction
+will do hardware-specific tuning for this style of dispatch, and is what we
+recommend for simple elementwise calculations like this. However, this example
+shows how you might mentally map CUDA C code to thread-level operations in MAX.
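+
+For comparison, the same addition could be written against the `foreach()`
+abstraction, following the pattern from the `add_one` example above. This is
+only a sketch under that assumption, not code from this recipe, and the
+`elementwise_vector_addition` name is our own:
+
+```mojo
+@parameter
+@always_inline
+fn elementwise_vector_addition[
+    width: Int
+](idx: IndexList[out.rank]) -> SIMD[out.type, width]:
+    # Load a SIMD vector of elements from each input and add them together.
+    return lhs.load[width](idx) + rhs.load[width](idx)
+
+foreach[elementwise_vector_addition, target=target](out, ctx)
+```
+
+In that form, MAX selects the dispatch parameters for the target hardware,
+rather than relying on the manually-chosen block size used here.
+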
+This example can be built and run using one command:
+
+```sh
+magic run vector_addition
+```
+
+Once the graph has been compiled and run, the two randomly-initialized input
+vectors will be printed, along with the results from both the graph
+calculation and NumPy addition.
+
+### Calculating the Mandelbrot set
+
+The final example in this recipe shows a slightly more complex calculation
+(pun intended):
+[the Mandelbrot set fractal](https://en.wikipedia.org/wiki/Mandelbrot_set).
+This custom operation takes no input tensors, only a set of scalar arguments,
+and returns a 2-D matrix of integer values representing the number of
+iterations it took to escape at that location in complex number space.
+
+The calculation itself is present in `operations/mandelbrot.mojo`. It is
+performed in parallel at each location in the 2-D output matrix using
+`foreach()` and the following function:
+
+```mojo
+fn elementwise_mandelbrot[
+    width: Int
+](idx: IndexList[out.rank]) -> SIMD[out.type, width]:
+    var row = idx[0]
+    var col = idx[1]
+    var cx = min_x.cast[float_dtype]() + (
+        col + iota[float_dtype, width]()
+    ) * scale_x.cast[float_dtype]()
+    var cy = min_y.cast[float_dtype]() + row * SIMD[float_dtype, width](
+        scale_y.cast[float_dtype]()
+    )
+    var c = ComplexSIMD[float_dtype, width](cx, cy)
+    var z = ComplexSIMD[float_dtype, width](0, 0)
+    var iters = SIMD[out.type, width](0)
+
+    var in_set_mask: SIMD[DType.bool, width] = True
+    for _ in range(max_iterations):
+        if not any(in_set_mask):
+            break
+        in_set_mask = z.squared_norm() <= 4
+        iters = in_set_mask.select(iters + 1, iters)
+        z = z.squared_add(c)
+
+    return iters
+```
+
+This begins by calculating the complex number C that represents a given
+location in the output grid. Then, starting from `Z=0`, the calculation
+`Z=Z^2 + C` is iterated until the squared norm of Z exceeds 4, the threshold
+we're using for when Z will escape the set. This continues up to a maximum
+number of iterations, and the number of iterations to escape (or not, if the
+maximum is hit) is then returned for each location in the grid.
+
+The area to examine in complex space, the resolution of the grid, and the
+maximum number of iterations are all provided as scalars at run time to this
+custom operation. This would potentially allow someone to create an animation
+zooming in on a section of the Mandelbrot set, for example.
+
+The single-operation graph that contains the Mandelbrot set calculation is
+constructed using the following:
+
+```python
+def create_mandelbrot_graph(
+    width: int,
+    height: int,
+    min_x: float,
+    min_y: float,
+    scale_x: float,
+    scale_y: float,
+    max_iterations: int,
+) -> Graph:
+    output_dtype = DType.int32
+    with Graph(
+        "mandelbrot",
+    ) as graph:
+        result = ops.custom(
+            name="mandelbrot",
+            values=[
+                ops.constant(min_x, dtype=DType.float32),
+                ops.constant(min_y, dtype=DType.float32),
+                ops.constant(scale_x, dtype=DType.float32),
+                ops.constant(scale_y, dtype=DType.float32),
+                ops.constant(max_iterations, dtype=DType.int32),
+            ],
+            out_types=[TensorType(dtype=output_dtype, shape=[height, width])],
+        )[0].tensor
+
+        graph.output(result)
+        return graph
+```
+
+The above shows how scalar values can be provided into a graph as
+`ops.constant` inputs. No input tensors are provided, and the output is sized
+based on the width and height provided.
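+
+Because these values enter the graph as constants, examining a different
+region means constructing a new graph. The following is a rough sketch that
+reuses the names from `mandelbrot.py`; the zoomed-in coordinates are purely
+illustrative:
+
+```python
+# Hypothetical zoom: render a smaller window of complex space at the same
+# grid resolution by recomputing the per-pixel scale factors.
+ZOOM_MIN_X, ZOOM_MAX_X = -0.75, -0.70
+ZOOM_MIN_Y, ZOOM_MAX_Y = 0.10, 0.15
+zoom_graph = create_mandelbrot_graph(
+    WIDTH,
+    HEIGHT,
+    ZOOM_MIN_X,
+    ZOOM_MIN_Y,
+    (ZOOM_MAX_X - ZOOM_MIN_X) / WIDTH,
+    (ZOOM_MAX_Y - ZOOM_MIN_Y) / HEIGHT,
+    MAX_ITERATIONS,
+)
+session = InferenceSession(devices=[device], custom_extensions=path)
+model = session.load(zoom_graph)
+result = model.execute()[0]
+```
+
+Each rebuilt graph carries its own constants, so every change of region
+requires a fresh `session.load()`.
+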
+ +This example can be built and run using one command: + +```sh +magic run mandelbrot +``` + +Once the graph is compiled and run, a visual representation of the Mandelbrot +set will be printed to the console. + ## Conclusion In this recipe, we've introduced the basics of how to write custom MAX Graph operations using Mojo, place them in a one-operation graph in Python, and run -them on an available CPU or GPU. +them on an available CPU or GPU. Three operations with differing inputs and +outputs, as well as different styles of calculation, have been demonstrated +to begin to show the versatility of the MAX programming interfaces for custom +graph operations. ## Next Steps diff --git a/custom-ops-introduction/operations/vector_addition.mojo b/custom-ops-introduction/operations/vector_addition.mojo index 1b19e1d..6ef3ec0 100644 --- a/custom-ops-introduction/operations/vector_addition.mojo +++ b/custom-ops-introduction/operations/vector_addition.mojo @@ -22,7 +22,7 @@ from tensor import ManagedTensorSlice, foreach from utils.index import IndexList -fn _vector_addition_cpu( +fn vector_addition_cpu( out: ManagedTensorSlice, lhs: ManagedTensorSlice[type = out.type, rank = out.rank], rhs: ManagedTensorSlice[type = out.type, rank = out.rank], @@ -36,7 +36,7 @@ fn _vector_addition_cpu( out[i] = lhs[i] + rhs[i] -fn _vector_addition_gpu( +fn vector_addition_gpu( out: ManagedTensorSlice, lhs: ManagedTensorSlice[type = out.type, rank = out.rank], rhs: ManagedTensorSlice[type = out.type, rank = out.rank], @@ -91,8 +91,8 @@ struct VectorAddition: # this operation for, so we can specialize it for the target hardware. @parameter if target == "cpu": - _vector_addition_cpu(out, lhs, rhs, ctx) + vector_addition_cpu(out, lhs, rhs, ctx) elif target == "gpu": - _vector_addition_gpu(out, lhs, rhs, ctx) + vector_addition_gpu(out, lhs, rhs, ctx) else: raise Error("No known target:", target) From ec3886fc5092d45483b0dda72eb884bdad87139e Mon Sep 17 00:00:00 2001 From: Brad Larson Date: Sat, 1 Mar 2025 11:45:56 -0600 Subject: [PATCH 3/3] Enhance the Mandelbrot set display. --- custom-ops-introduction/README.md | 43 ++++++++++++++++++- custom-ops-introduction/mandelbrot.py | 25 ++++++++--- .../operations/mandelbrot.mojo | 9 ++-- 3 files changed, 67 insertions(+), 10 deletions(-) diff --git a/custom-ops-introduction/README.md b/custom-ops-introduction/README.md index 09a5513..61c71d9 100644 --- a/custom-ops-introduction/README.md +++ b/custom-ops-introduction/README.md @@ -351,7 +351,48 @@ magic run mandelbrot ``` Once the graph is compiled and run, a visual representation of the Mandelbrot -set will be printed to the console. +set will be printed to the console: + +``` +...................................,,,,c@8cc,,,............. +...............................,,,,,,cc8M @Mjc,,,,.......... +............................,,,,,,,ccccM@aQaM8c,,,,,........ +..........................,,,,,,,ccc88g.o. Owg8ccc,,,,...... +.......................,,,,,,,,c8888M@j, ,wMM8cccc,,..... +.....................,,,,,,cccMQOPjjPrgg, OrwrwMMMjjc,.... +..................,,,,cccccc88MaP @ ,pGa.g8c,... +...............,,cccccccc888MjQp. o@8cc,.. +..........,,,,c8jjMMMMMMMMM@@w. aj8c,,. +.....,,,,,,ccc88@QEJwr.wPjjjwG w8c,,. +..,,,,,,,cccccMMjwQ EpQ .8c,,, +.,,,,,,cc888MrajwJ MMcc,,, +.cc88jMMM@@jaG. oM8cc,,, +.cc88jMMM@@jaG. oM8cc,,, +.,,,,,,cc888MrajwJ MMcc,,, +..,,,,,,,cccccMMjwQ EpQ .8c,,, +.....,,,,,,ccc88@QEJwr.wPjjjwG w8c,,. +..........,,,,c8jjMMMMMMMMM@@w. aj8c,,. +...............,,cccccccc888MjQp. o@8cc,.. 
+..................,,,,cccccc88MaP @ ,pGa.g8c,... +.....................,,,,,,cccMQOEjjPrgg, OrwrwMMMjjc,.... +.......................,,,,,,,,c8888M@j, ,wMM8cccc,,..... +..........................,,,,,,,ccc88g.o. Owg8ccc,,,,...... +............................,,,,,,,ccccM@aQaM8c,,,,,........ +...............................,,,,,,cc8M @Mjc,,,,.......... +``` + +Try experimenting with the parameters in `mandelbrot.py` to visualize +different parts of the Mandelbrot set, at different resolutions: + +```python +WIDTH = 60 +HEIGHT = 25 +MAX_ITERATIONS = 100 +MIN_X = -2.0 +MAX_X = 0.7 +MIN_Y = -1.12 +MAX_Y = 1.12 +``` ## Conclusion diff --git a/custom-ops-introduction/mandelbrot.py b/custom-ops-introduction/mandelbrot.py index ba3117b..cba6778 100644 --- a/custom-ops-introduction/mandelbrot.py +++ b/custom-ops-introduction/mandelbrot.py @@ -19,6 +19,21 @@ from max.graph import Graph, TensorType, ops +def draw_mandelbrot(tensor: Tensor, width: int, height: int, iterations: int): + """A helper function to visualize the Mandelbrot set in ASCII art.""" + sr = "....,c8M@jawrpogOQEPGJ" + for row in range(height): + for col in range(width): + v = tensor[row, col].item() + if v < iterations: + idx = int(v % len(sr)) + p = sr[idx] + print(p, end="") + else: + print(" ", end="") + print("") + + def create_mandelbrot_graph( width: int, height: int, @@ -56,10 +71,10 @@ def create_mandelbrot_graph( path = Path(__file__).parent / "operations.mojopkg" # Establish Mandelbrot set ranges. - WIDTH = 15 - HEIGHT = 15 + WIDTH = 60 + HEIGHT = 25 MAX_ITERATIONS = 100 - MIN_X = -1.5 + MIN_X = -2.0 MAX_X = 0.7 MIN_Y = -1.12 MAX_Y = 1.12 @@ -89,6 +104,4 @@ def create_mandelbrot_graph( assert isinstance(result, Tensor) result = result.to(CPU()) - print("Iterations to escape:") - print(result.to_numpy()) - print() + draw_mandelbrot(result, WIDTH, HEIGHT, MAX_ITERATIONS) diff --git a/custom-ops-introduction/operations/mandelbrot.mojo b/custom-ops-introduction/operations/mandelbrot.mojo index f6ec259..3022faf 100644 --- a/custom-ops-introduction/operations/mandelbrot.mojo +++ b/custom-ops-introduction/operations/mandelbrot.mojo @@ -11,13 +11,12 @@ # limitations under the License. # ===----------------------------------------------------------------------=== # -from math import iota import compiler from complex import ComplexSIMD +from math import iota from max.tensor import ManagedTensorSlice, foreach from runtime.asyncrt import DeviceContextPtr - from utils.index import IndexList @@ -47,8 +46,11 @@ struct Mandelbrot: fn elementwise_mandelbrot[ width: Int ](idx: IndexList[out.rank]) -> SIMD[out.type, width]: + # Obtain the position in the grid from the X, Y thread locations. var row = idx[0] var col = idx[1] + + # Calculate the complex C corresponding to that grid location. var cx = min_x.cast[float_dtype]() + ( col + iota[float_dtype, width]() ) * scale_x.cast[float_dtype]() @@ -57,8 +59,9 @@ struct Mandelbrot: ) var c = ComplexSIMD[float_dtype, width](cx, cy) var z = ComplexSIMD[float_dtype, width](0, 0) - var iters = SIMD[out.type, width](0) + # Perform the Mandelbrot iteration loop calculation. + var iters = SIMD[out.type, width](0) var in_set_mask: SIMD[DType.bool, width] = True for _ in range(max_iterations): if not any(in_set_mask):