|
1 | 1 | # pytorch_cmspepr
|
| 2 | + |
2 | 3 | pytorch bindings for optimized knn and aggregation kernels
|
| 4 | + |
| 5 | + |
| 6 | +## Example |
| 7 | + |
| 8 | +```python |
| 9 | +>>> import torch |
| 10 | +>>> import torch_cmspepr |
| 11 | + |
| 12 | +# Two events with 5 nodes and 4 nodes, respectively. |
| 13 | +# Nodes here are on a diagonal line in 2D, with d^2 = 0.02 between them. |
| 14 | +>>> nodes = torch.FloatTensor([ |
| 15 | + # Event 0 |
| 16 | + [.1, .1], |
| 17 | + [.2, .2], |
| 18 | + [.3, .3], |
| 19 | + [.4, .4], |
| 20 | + [100., 100.], |
| 21 | + # Event 1 |
| 22 | + [.1, .1], |
| 23 | + [.2, .2], |
| 24 | + [.3, .3], |
| 25 | + [.4, .4] |
| 26 | + ]) |
| 27 | +# Designate which nodes belong to which event |
| 28 | +>>> batch = torch.LongTensor([0,0,0,0,0,1,1,1,1]) |
| 29 | + |
| 30 | +# Generate edges: k=2, max_radius^2 of 0.04 |
| 31 | +>>> torch_cmspepr.knn_graph(nodes, 2, batch, max_radius=.2) |
| 32 | +tensor([[0, 1, 1, 2, 2, 3, 5, 6, 6, 7, 7, 8], |
| 33 | + [1, 0, 2, 1, 3, 2, 6, 5, 7, 6, 8, 7]]) |
| 34 | + |
| 35 | +# Generate edges: k=3 with loops allowed |
| 36 | +>>> torch_cmspepr.knn_graph(nodes, 3, batch, max_radius=.2, loop=True) |
| 37 | +tensor([[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8], |
| 38 | + [0, 1, 1, 0, 2, 2, 1, 3, 3, 2, 4, 5, 6, 6, 5, 7, 7, 6, 8, 8, 7]]) |
| 39 | + |
| 40 | +# If the inputs are on a CUDA device, the CUDA version of knn_graph is used automatically: |
| 41 | +>>> gpu = torch.device('cuda') |
| 42 | +>>> torch_cmspepr.knn_graph(nodes.to(gpu), 2, batch.to(gpu), max_radius=.2) |
| 43 | +tensor([[0, 1, 1, 2, 2, 3, 5, 6, 6, 7, 7, 8], |
| 44 | + [1, 0, 2, 1, 3, 2, 6, 5, 7, 6, 8, 7]], device='cuda:0') |
| 45 | +``` |
| 46 | + |
| 47 | + |
| 48 | +## Installation and requirements |
| 49 | + |
| 50 | +v1 is tested with CUDA 11.7 and pytorch 2.0. |
| 51 | +You should verify `nvcc` is available: |
| 52 | + |
| 53 | +```console |
| 54 | +$ nvcc --version |
| 55 | +nvcc: NVIDIA (R) Cuda compiler driver |
| 56 | +Copyright (c) 2005-2022 NVIDIA Corporation |
| 57 | +Built on Wed_Jun__8_16:49:14_PDT_2022 |
| 58 | +Cuda compilation tools, release 11.7, V11.7.99 |
| 59 | +Build cuda_11.7.r11.7/compiler.31442593_0 |
| 60 | +``` |
| 61 | + |
| 62 | +A `gcc` version of 5 or higher is also recommended. |
| 63 | + |
| 64 | +The package is not (yet) available on PyPI, so a local installation is currently the |
| 65 | +preferred installation method: |
| 66 | + |
| 67 | +```bash |
| 68 | +git clone git@github.com:cms-pepr/pytorch_cmspepr.git |
| 69 | +cd pytorch_cmspepr |
| 70 | +pip install -e . |
| 71 | +``` |
| 72 | + |
| 73 | +Installing _only_ the CPU or CUDA extensions is supported: |
| 74 | + |
| 75 | +```bash |
| 76 | +FORCE_CPU_ONLY=1 pip install -e . # Only compile C++ extensions |
| 77 | +FORCE_CUDA_ONLY=1 pip install -e . # Only compile CUDA extensions |
| 78 | +FORCE_CUDA=1 pip install -e . # Try to compile CUDA extension even if no device found |
| 79 | +``` |
| 80 | + |
| 81 | +If you only want to test the compilation of the extensions: |
| 82 | + |
| 83 | +```bash |
| 84 | +python setup.py develop |
| 85 | +``` |
| 86 | + |
| 87 | +### Containerization |
| 88 | + |
| 89 | +It is recommended to install and run inside a container. |
| 90 | +At the time of writing (29 Sep 2023), the [pytorch/pytorch:2.0.0-cuda11.7-cudnn8-devel](https://hub.docker.com/layers/pytorch/pytorch/2.0.0-cuda11.7-cudnn8-devel/images/sha256-96ccb2997a131f2455d70fb78dbb284bafe4529aaf265e344bae932c8b32b2a4?context=explore) |
| 91 | +Docker image works well. |
| 92 | + |
| 93 | +Example Singularity instructions: |
| 94 | + |
| 95 | +```bash |
| 96 | +singularity pull docker://pytorch/pytorch:2.0.0-cuda11.7-cudnn8-devel |
| 97 | +singularity run --nv pytorch_2.0.0-cuda11.7-cudnn8-devel.sif |
| 98 | +``` |
| 99 | + |
| 100 | +And then once in the container: |
| 101 | + |
| 102 | +```bash |
| 103 | +export PYTHONPATH="/opt/conda/lib/python3.10/site-packages" |
| 104 | +python -m venv env |
| 105 | +source env/bin/activate |
| 106 | +pip install torch_geometric |
| 107 | +pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.0.0+cu117.html # Make sure to pick the right torch and CUDA versions here |
| 108 | +git clone git@github.com:cms-pepr/pytorch_cmspepr.git |
| 109 | +cd pytorch_cmspepr |
| 110 | +pip install -e . |
| 111 | +``` |
| 112 | + |
| 113 | + |
| 114 | +## Tests |
| 115 | + |
| 116 | +```bash |
| 117 | +pip install pytest |
| 118 | +pytest tests |
| 119 | +``` |
| 120 | + |
| 121 | + |
| 122 | +## Performance |
| 123 | + |
| 124 | +The following profiling code can be used: |
| 125 | + |
| 126 | +```python |
| 127 | +import time |
| 128 | +import torch |
| 129 | +import torch_cmspepr |
| 130 | +import torch_cluster |
| 131 | +gpu = torch.device('cuda') |
| 132 | + |
| 133 | +def gen(cuda=False): |
| 134 | + # 10k nodes with 5 node features |
| 135 | + x = torch.rand((10000, 5)) |
| 136 | + # Split nodes over 4 events with 2500 nodes/evt |
| 137 | + batch = torch.repeat_interleave(torch.arange(4), 2500) |
| 138 | + if cuda: x, batch = x.to(gpu), batch.to(gpu) |
| 139 | + return x, batch |
| 140 | + |
| 141 | +def profile(name, unit): |
| 142 | + t0 = time.time() |
| 143 | + for _ in range(100): unit() |
| 144 | + print(f'{name} took {(time.time() - t0)/100.} sec/evt') |
| 145 | + |
| 146 | +def cpu_cmspepr(): |
| 147 | + x, batch = gen() |
| 148 | + torch_cmspepr.knn_graph(x, k=10, batch=batch) |
| 149 | +profile('CPU (torch_cmspepr)', cpu_cmspepr) |
| 150 | + |
| 151 | +def cpu_cluster(): |
| 152 | + x, batch = gen() |
| 153 | + torch_cluster.knn_graph(x, k=10, batch=batch) |
| 154 | +profile('CPU (torch_cluster)', cpu_cluster) |
| 155 | + |
| 156 | +def cuda_cmspepr(): |
| 157 | + x, batch = gen(cuda=True) |
| 158 | + torch_cmspepr.knn_graph(x, k=10, batch=batch) |
| 159 | +profile('CUDA (torch_cmspepr)', cuda_cmspepr) |
| 160 | + |
| 161 | +def cuda_cluster(): |
| 162 | + x, batch = gen(cuda=True) |
| 163 | + torch_cluster.knn_graph(x, k=10, batch=batch) |
| 164 | +profile('CUDA (torch_cluster)', cuda_cluster) |
| 165 | +``` |
| 166 | + |
| 167 | +On an NVIDIA Tesla P100 with 12GB of RAM, this produces: |
| 168 | + |
| 169 | +``` |
| 170 | +CPU (torch_cmspepr) took 0.22623349189758302 sec/evt |
| 171 | +CPU (torch_cluster) took 0.2259768319129944 sec/evt |
| 172 | +CUDA (torch_cmspepr) took 0.026673252582550048 sec/evt |
| 173 | +CUDA (torch_cluster) took 0.22262062072753908 sec/evt |
| 174 | +``` |
0 commit comments