# Configure a Release build of the project in ./build with the Kokkos CUDA
# backend enabled for the Ampere (compute capability 8.6) architecture.
# CMAKE_EXPORT_COMPILE_COMMANDS writes compile_commands.json for editor/LSP
# tooling; -Wno-deprecated-declarations silences deprecation warnings.
cmake \
  -S . \
  -B build \
  -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
  -DCMAKE_BUILD_TYPE=Release \
  -DKokkos_ENABLE_CUDA=ON \
  -DKokkos_ARCH_AMPERE86=ON \
  -DCMAKE_CXX_FLAGS="-Wno-deprecated-declarations"
# Compile the configured tree in ./build using up to 48 parallel jobs.
cmake --build build --parallel 48

# Build the container image from ./Dockerfile and tag it cg-cuda1281:latest.
# The trailing "." makes the build context (current directory) explicit.
podman build -f Dockerfile -t cg-cuda1281:latest .
# Start an interactive container on a GPU node through Slurm.
#   srun --pty        allocates a pseudo terminal and runs in the foreground
#   -p batch          submit to the "batch" partition
#   --gres=gpu:1      request one GPU for the job
#   --rm -it          interactive TTY; remove the container when it exits
#   -v "${PWD}:/cg"   bind-mount the current directory at /cg (quoted so a
#                     working directory containing spaces is not word-split)
#   --device=...      expose the GPUs to the container via CDI
#   -e CUDA_VISIBLE_DEVICES  forward the host's value of this variable
srun --pty -p batch --gres=gpu:1 \
  podman run --rm -it -v "${PWD}:/cg" \
  --device=nvidia.com/gpu=all \
  -e CUDA_VISIBLE_DEVICES \
  cg-cuda1281:latest
# Configure and compile a Release build from the sources at /cg, with the
# Kokkos CUDA backend enabled for the Hopper (compute capability 9.0)
# architecture. The build tree goes to ./build relative to the current
# directory.
# NOTE(review): /cg is the container mount point used by the podman run
# command, so these are presumably typed inside the container shell -- confirm.
cmake \
  -S /cg \
  -B build \
  -DCMAKE_BUILD_TYPE=Release \
  -DKokkos_ENABLE_CUDA=ON \
  -DKokkos_ARCH_HOPPER90=ON \
  -DCMAKE_CXX_FLAGS="-Wno-deprecated-declarations"

# Compile with up to 48 parallel jobs.
cmake --build build --parallel 48
# Maya says 64 tiles per core
# result at max problem size 7 * 8 * 64 * 1024 = 3670016 elements and 100 iterations
# weak scaling: 7*8* {16,32,64} * 1024 elements
# So split the fixed factor 7 * 8 * 1024 between x and y, and vary only z
# over 16, 32, 64:
#   7 * 8 * 1024 = 56 * 1024 = 57344
#   -> 112 * 512 (= 57344) for x and y
# Weak-scaling sweep: the first two size arguments stay fixed at 512 and 112
# while the third takes each z value in turn; the element type is f32.
# Each run's output is shown on the terminal and saved to <z>.out via tee.
# NOTE(review): the leading zeros presumably keep the .out files sorted by
# name -- confirm build/main parses "016" as decimal 16, not octal.
for z in 016 032 064
do
  build/main 512 112 "${z}" f32 | tee "${z}.out"
done