diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 86e430b..51a3137 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -20,6 +20,9 @@ jobs: node-version: 18 cache: npm + - name: test thing + run: node gen-webring-routes.js + - name: Install dependencies run: npm ci - name: Build website diff --git a/CNAME b/CNAME deleted file mode 100644 index 227ad91..0000000 --- a/CNAME +++ /dev/null @@ -1 +0,0 @@ -graphics-programming.org diff --git a/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/centered_comparison.png b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/centered_comparison.png new file mode 100644 index 0000000..4969f15 Binary files /dev/null and b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/centered_comparison.png differ diff --git a/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/convolved.png b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/convolved.png new file mode 100644 index 0000000..033912d Binary files /dev/null and b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/convolved.png differ diff --git a/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/convolved_256.png b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/convolved_256.png new file mode 100644 index 0000000..f2a273b Binary files /dev/null and b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/convolved_256.png differ diff --git a/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/convolved_512.png b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/convolved_512.png new file mode 100644 index 0000000..dcbfc35 Binary files /dev/null and b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/convolved_512.png differ diff --git a/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/convolved_ring.png b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/convolved_ring.png new file mode 100644 index 0000000..58df418 Binary files /dev/null and b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/convolved_ring.png differ diff --git a/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/dif_diagram.png b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/dif_diagram.png new file mode 100644 index 0000000..4920699 Binary files /dev/null and b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/dif_diagram.png differ diff --git a/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/dif_diagram_color.png b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/dif_diagram_color.png new file mode 100644 index 0000000..7652ed9 Binary files /dev/null and b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/dif_diagram_color.png differ diff --git a/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/heart.png b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/heart.png new file mode 100644 index 0000000..092fd3e Binary files /dev/null and b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/heart.png differ diff --git a/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/image.png b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/image.png new file mode 100644 index 0000000..6329abc Binary files /dev/null and b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/image.png differ diff --git a/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/index.md 
b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/index.md new file mode 100644 index 0000000..a1cbee6 --- /dev/null +++ b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/index.md @@ -0,0 +1,833 @@
---
title: 'FFT Bloom Optimized to the Bone in Nabla'
slug: 'fft-bloom-optimized-to-the-bone-in-nabla'
description: 'Understanding and using the Nabla FFT'
date: '2025-01-24'
authors: ['fletterio']
tags: ['nabla', 'vulkan', 'article', 'tutorial', 'showcase']
image: 'https://raw.githubusercontent.com/graphicsprogramming/blog/main/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/convolved.png'
last_update:
  date: '2025-01-24'
  author: Fletterio
---

Described as "the most important numerical algorithm of our lifetime", the FFT has applications in a plethora of domains.

In this article I show how to run an FFT in Nabla, talk about different optimizations, and showcase one application in graphics: FFT Bloom.

## The Fast Fourier Transform

First, one must know what a Fourier Transform is. It's a clever way of decomposing periodic signals into their frequency components, essentially nothing more than an orthonormal change of basis. This might be weird to think about, so here's a good intro to the topic by 3B1B:

Don't dwell too much on the continuous case, because we're mostly interested in the [Discrete Fourier Transform](https://en.wikipedia.org/wiki/Discrete_Fourier_transform) (DFT for short). It's a centerpiece of Digital Signal Processing. As a quick summary, the DFT is nothing but a change of basis in some vector space. Given a signal defined over some domain (spatial or temporal, usually), the "natural" representation of it is its "canonical basis decomposition" - which means mapping each point in space or time to the signal's value at that point. Thanks to Fourier, we have another very useful representation for the same signal, which involves its "spectral decomposition" - periodic functions defined over certain domains can always be written as a linear combination of some special orthogonal (w.r.t. some metric) functions over the same domain.

The DFT is a linear transform that maps an $n$-dimensional vector $x$ representing some periodic signal to another $n$-dimensional vector $X$ such that the coordinates of $X$ are the coefficients of the linear decomposition of $x$ in this special basis. Those of you familiar with linear algebra will probably immediately recognize this is just a change of basis, computed as a matrix product $D \cdot x$, where $D$ is the matrix associated with the linear transform (the DFT).

As posed, however, the DFT is quite an expensive operation: a matrix product is $O(n^2)$. That's where the Fast Fourier Transform (FFT for short) comes in! The Cooley-Tukey FFT (one of many FFT algorithms) exploits symmetries in the DFT's associated matrix to design a divide and conquer algorithm bringing its complexity down to $O(n\log n)$. There are actually a bunch of FFT algorithms that work in a similar fashion and achieve the same time complexity, but in this article we will restrict the discussion to the classic Radix-2 Cooley-Tukey algorithm, which is what we use in Nabla.

The Radix-2 Cooley-Tukey FFT algorithm (from now on, when I say FFT I will mean specifically this algorithm) is easier to implement than other FFT algorithms and maps nicely to the way parallelization is done on a GPU. Its main drawback is that it only works on Power-of-Two sized (PoT for short) arrays.
This requires us to pad any array we want to run an FFT on up to the next power of two. In some cases this wastes a good amount of compute (in the worst case you do almost twice the work you actually need), but that's the price of convenience and simplicity, especially regarding the use of GPU hardware. Still, it runs pretty fast :).

Now you might be asking, why would I care about computing the DFT really fast? Well, there are a lot of operations that are accelerated with the FFT. One of those, as you might have guessed from the title of this article, is convolution.

## Convolution and The Convolution Theorem

The convolution of two signals $f$ and $g$, denoted by $f * g$, is a special type of product. My favourite way of reasoning about it (and one I have, surprisingly, rarely come across) is that it's just the superposition of many copies of $f$: for each point $x$ in your space, you take a copy of $f$ centered at $x$, $f(t-x)$ (as a function of a parameter $t$), and scale it by the value of $g$ at that point, $g(x)$, then sum all of these copies together. 3B1B again has a great introductory video, although he presents convolution in a more "standard" way, which is by sliding inverted copies of one signal over the other:

[The Convolution Theorem](https://en.wikipedia.org/wiki/Convolution_theorem#Periodic_convolution) states that we can perform a (circular) convolution as a Hadamard (element-wise) product in the spectral domain. This means that convolution goes from an $O(nm)$ operation ($n$ being the number of pixels of a signal and $m$ being the number of pixels of a filter) down to $O(n \log n)$ (assuming $n \ge m$): You do Forward FFT, then Hadamard product, then Inverse FFT, with the FFTs being $O(n \log n)$ and the product being $O(n)$. For small filters the FFT convolution ends up being slower, but for larger ones the speedup is massive.

Our Lead Build System and Test Engineer, Arkadiusz, has a Vulkanised talk giving a recap of the Convolution Theorem and the usage of the FFT in Nabla:

## FFT Bloom

[Bloom](https://en.wikipedia.org/wiki/Bloom_(shader_effect)) is a visual artifact in imaging. Bloom in Computer Graphics can be simulated as a post-processing effect by convolving an incoming image with a point spread function (PSF) which, as its name implies, spreads light from each pixel (point) all over the image. With my view of convolution, convolving the image with a PSF means that at each pixel of the image you "paste" a scaled copy of the PSF, then sum all of them together. The PSF is a non-negative function that integrates to $1$, which is very strong towards the center and decays rapidly towards the edges.

"Pasting" a copy of a PSF at pixel $p$ of the resulting convolved image, scaled by the value of the original image at pixel $p$, essentially has the effect of leaving much of the original pixel's light close to where it originally was, while also spreading some of it throughout the rest of the pixels affected by this copy of the PSF.

Even though convolution is commutative, when performing one it is common to distinguish the signal we're interested in (in this case, our image) from the *kernel* we're convolving it against (in this case, the PSF). Therefore, whenever you read the word *kernel* in this article, remember we're talking about an image / texture representing our PSF and NOT a program running on the GPU.
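
Since this "pasting" view of convolution carries most of the intuition in the rest of the article, here's a minimal standalone C++ sketch of a 1D discrete convolution written literally as a superposition of scaled, shifted copies (an illustration, not Nabla code):

```cpp
#include <vector>

// "Pasting" view of 1D discrete convolution: for every sample x of g,
// paste a copy of f shifted to x and scaled by g[x], then sum them all.
std::vector<float> convolve(const std::vector<float>& f, const std::vector<float>& g)
{
    std::vector<float> out(f.size() + g.size() - 1, 0.0f);
    for (size_t x = 0; x < g.size(); x++)      // for each point of g...
        for (size_t t = 0; t < f.size(); t++)  // ...paste a scaled copy of f
            out[x + t] += g[x] * f[t];
    return out;
}
```
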
Since Bloom is done with a convolution, and we're considering pretty big PSFs (think `256x256` or `512x512`), we set out to accelerate the convolution with the FFT (if you were using smaller kernels such as `3x3` or `5x5`, you'd be better off doing the convolution in the spatial domain). The algorithm to perform FFT Bloom is roughly as follows:

```
1. compute the spectrum of the kernel via FFT
2. compute the spectrum of the image via FFT
3. compute the Hadamard product of both spectra
4. compute the IFFT of the resulting product
5. FFTShift the result
```

Here's what convolving this image ![Balls](image.png "Raytraced Balls") against this kernel

![Kernel](kernel.png "Kernel")

looks like:

![Convolved](convolved.png "Convolved")

I'm showing the kernel with a white point of $10^{-6}$ so you can appreciate its shape, but it's actually a PSF that's super strong in the middle and rapidly decaying towards the edges. For reference, here's the same kernel with a white point of $10^{-4}$:

![Small Kernel](kernel_small.png "Kernel with a higher white point")

The spectra of both signals are computed on a per-channel basis, meaning that we compute one spectrum for each of the R,G,B channels, since convolution is performed on each channel independently.

Since the DFT is separable over Euclidean domains, the spectrum of each signal is computed by performing the FFT along one axis at a time. This is important when we consider optimizations.

### Padding

One important detail about the FFT accelerated convolution we must not overlook is that it uses what's known as a circular convolution - in English, it assumes our signals are periodic (sampled with `GL_REPEAT`). If you were to naively compute both spectra as they come, multiply them, then run the IFFT back, you'd find artifacts such as this:

![Unpadded](unpadded.png "Wraparound artifact")

What's going on is that, since we assume the image to be periodic, any light that would "spill over" from the top of the image ends up wrapping around and contaminating the pixels below. Essentially it's equivalent to padding the image with `GL_REPEAT`, convolving the result with the kernel, then keeping the central pixels. Here's a neat visualization for that:

![Wrapped](wrapped.png "Repeat padding and convolution")

To avoid this kind of artifact, one must first pad the image with at least half the kernel size on each side, so that the image is now bigger and this "spillover" happens only in the padding pixels, which get cut off when we go back to the original image dimensions. Wrapping modes that make sense here are mirror padding and zero padding.

Mirror padding has the nice property that total luminance in the image is (kind of) preserved: take a pixel $p$ such that when "pasting" a copy of the kernel at its location, some pixels of this kernel copy end up in the padding. That luminance is lost, since those pixels get cut off from the end result. But for each one of those pixels in the padding, when pasting a kernel copy over them, some of that copy overlaps $p$, adding back some lost luminance. If the kernel was perfectly radially symmetric, this would mean no lost luminance. In practice it means a very slight variation in luminance.
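
To make the two padding policies concrete, here's a hedged 1D sketch (standalone helper functions, not Nabla's API; the mirror variant assumes the padding is no wider than the image):

```cpp
#include <cstdint>

// Zero padding: out-of-range reads contribute no luminance.
float fetchZeroPad(const float* row, int32_t i, int32_t size)
{
    return (i < 0 || i >= size) ? 0.0f : row[i];
}

// Mirror padding: out-of-range reads reflect back into the image.
float fetchMirrorPad(const float* row, int32_t i, int32_t size)
{
    if (i < 0) i = -i - 1;               // reflect below 0
    if (i >= size) i = 2 * size - 1 - i; // reflect past the end (valid while padding <= size)
    return row[i];
}
```
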
However, mirror padding can cause artifacts such as this when there are pixels that are too bright near the borders:

![Mirror Padding Artifact](mirror_padding_artifact.png "Artifact caused by mirror padding")

As you can see with the "Sun" ball there, there's an abnormal amount of light towards the border. This is because a "copy" of the Sun is spawned right on the other side of that border and a lot of light from it is contaminating the pixels below:

![Mirror Padding](mirror_padding.png "Full convolved image with mirror padding")

Zero padding is much nicer, and it's what's used in the video at the start of this article. We do lose some luminance (since light spilling over to the padding zone is not compensated for) and, given the shape of the PSF, pixels near the border lose much more luminance, which can turn the borders of the image darker. This vignetting effect is actually desirable (it's often added as a post-process), all the while avoiding artifacts like the one showcased for mirror padding. Furthermore, autoexposure methods handle the lost luminance in the image.

![Zero Padding](zero_padded.png "Full convolved image with zero padding")

### The FFTShift

Why do you need to "FFTShift" the result, and what even is the FFTShift? First, let's get the following undiscussed step out of the way: the Hadamard product.

To compute the Hadamard product, both matrices must have the same dimensions, which is almost never the case. We'll talk about the optimization done in our example later, but for now you can think that instead of the kernel $K$ for the convolution we're using another kernel $K'$, which has the same dimensions as our image and consists of a copy of $K$ in the centre, and zero padding all around it. The result of the convolution between our image and $K'$ is the same as the convolution with $K$, except for a lot of zero padding to the sides:

![Padded kernel](padded_kernel.png "Our kernel K'")

The DFT assumes the "origin" of our signal is the "top-left" or "bottom-left" element of the signal, depending on which coordinate you set as $(0,0)$. This is true for both image and kernel.

In practice what this means is that for each pixel $p$ in our image the convolution is still doing this process of "pasting" a scaled copy of the kernel, but instead of being centered at that pixel's location we paste a copy that has its $(0,0)$ position at $p$.

Here's a comparison between pasting a centered copy (left) vs pasting a copy starting at $(0,0)$ (when the origin is set to be the top left of the image):

![Centered Comparison](centered_comparison.png "Comparison between kernel pasting")

where $p$ would be the red pixel. When you convolve using the DFT, you're actually using the "pasting" method on the right. Furthermore, since both image and kernel have the same size, what's going to happen when pasting kernel copies in this manner is that a bunch of pixels from some copies will go into "the padding area" and wrap around to the other side of the image.

Due to the way this pasting works, the wraparound only happens in one direction for each axis. For example, if $(0,0)$ is the top-left corner as in the example above, wraparound can only happen from pixels being too far to the right or too far below. But no matter where $(0,0)$ is, if it's a corner of the image (which is what convolution via DFT assumes) the result will be the same.
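
Here's a minimal 1D sketch of this corner-anchored circular convolution - the one the DFT actually computes - with both signals assumed pre-padded to a common length; note how wraparound only ever happens "forwards" through the modulo:

```cpp
#include <vector>

// Corner-anchored circular convolution: each kernel copy has its element 0
// at p, and anything past the end wraps around to the start of the axis.
std::vector<float> circularConvolve(const std::vector<float>& img, const std::vector<float>& kernel)
{
    const size_t N = img.size(); // == kernel.size()
    std::vector<float> out(N, 0.0f);
    for (size_t p = 0; p < N; p++)
        for (size_t q = 0; q < N; q++)
            out[(p + q) % N] += img[p] * kernel[q]; // (p + q) wraps modulo N
    return out;
}
```
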
The result of doing the convolution in this way is the following:

![Unshifted](unshifted.png "Result of convolution with no FFTShift")

So one possible fix is to take that image and apply what's colloquially known as the FFTShift: for each axis, swap halves around. You can see in the image above that if you swap the upper half with the lower half and then swap the left half with the right half, you'll get the correct result (in this case there was no padding, to simplify stuff, so you'll still see the wraparound artifact we talked about before).

An equivalent "fix" would be to apply the FFTShift to either the kernel or the image before computing their spectrum. When shifting the kernel, for example, the effect is that the center of the kernel ends up at $(0,0)$:

![Padded shifted kernel](padded_kernel_shifted.png "Our kernel K', FFTShifted")

This is still just as expensive, because it's a shift of $K'$, which has the same dimensions as the output image. If you're precomputing the kernel just once, however, it's totally fine.

Much cheaper, however, is a trick that computes the effect of the FFTShift in the spectral domain. The [Time Shift property](https://en.wikipedia.org/wiki/Fourier_transform#Time_shifting) of the Fourier transform lets us compute the FFTShift right after computing the spectrum of the kernel. In the discrete domain this property becomes the [Shift Property](https://en.wikipedia.org/wiki/Discrete_Fourier_transform#Shift_theorem) of the DFT.

Essentially, along an axis with length $N$, we're shifting all elements by $\frac N 2$ to either the left or right: it's the same because the signal is periodic, so you move the whole axis $\frac N 2$ to either side and the half that falls outside the range $[0, N-1]$ just wraps around to the other side. Since $2 \pi$ is a whole period of our signal and we're shifting it by half a period, the result in space of this shift yields

$\mathcal F (\text{FFTShift}(x))_k = e^{- i \pi k} \cdot \mathcal F(x)_k$

in the spectral domain. Since this is done for each axis, the result is essentially the same as the Hadamard product $M \cdot \mathcal F(K')$ of the matrix $M$ and the spectrum of the kernel $\mathcal F(K')$, where $M_{xy} = (-1)^{x+y}$ (due to the particular form the complex exponentials take along each axis).

In practice this doesn't even have to be computed as a product: you can just flip the sign of the odd positions (those whose coordinates sum to an odd number) in $\mathcal F(K')$ as you compute it.
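
A sketch of that last trick (standalone illustration operating on a row-major spectrum array, not Nabla code):

```cpp
#include <complex>

// The shift-theorem fix in practice: instead of multiplying the kernel's
// spectrum by M, flip the sign of entries whose coordinates sum to an odd number.
void fftShiftViaSignFlip(std::complex<float>* spectrum, int width, int height)
{
    for (int y = 0; y < height; y++)
        for (int x = 0; x < width; x++)
            if ((x + y) & 1) // M_xy = (-1)^(x+y), so only odd positions change
                spectrum[y * width + x] = -spectrum[y * width + x];
}
```
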
## Optimization 1: Exploiting the Hermitian Symmetry of the spectra of real signals

[The DFT of a real sequence is Hermitian](https://en.wikipedia.org/wiki/Discrete_Fourier_transform#DFT_of_real_and_purely_imaginary_signals), meaning that if $\text{DFT}$ is the DFT of a sequence of real numbers $f$, then it satisfies $\text{DFT}[T] = \text{DFT}[-T]^*$ ([click here](https://kovleventer.com/blog/fft_real/) if you don't get the notation.)

Once again, here's Arkadiusz talking about this:

This has two important implications: first, that after performing the FFT of a real signal, we only need to store half of the values, since the other half are redundant. The values we store for a sequence of length $N$, for even $N$, are those indexed $0$ through $\frac N 2$, where the latter is commonly known as the Nyquist frequency.

"But hold on a second. Those are actually half plus one values of the resulting DFT" an observant reader might say. But worry not: a nice property of even-length sequences is that $-i$ is always part of the group of roots of unity that participate in computing the FFT. An implication of this is that both $\text{DFT}[0]$ and $\text{DFT}[\frac N 2]$ are real-valued for the DFT of a real signal, which lets us pack both elements together in a single complex number as $\text{DFT}[0] + i \cdot \text{DFT}[\frac N 2]$.

The other important implication is explored in the link to [kovleventer's blog](https://kovleventer.com/blog/fft_real/) above. If you think about it, you are performing the FFT of a sequence of $N$ real numbers and keeping $\frac N 2$ complex numbers, which is the same amount of real numbers. But the FFT takes $N$ complex numbers as input and gives $N$ complex numbers as output, so we're wasting half of our compute!

Thankfully, the hermitianicity (hermiticity?) of the DFT of real signals and the linearity of the DFT allow us to exploit the computation: we can pack two real signals $x,y$ into a single complex signal $z = x + iy$, compute $Z = \text{DFT}(z)$, and then use the rules explored in that blog post to retrieve $X = \text{DFT}(x), Y = \text{DFT}(y)$. We call this process packing and unpacking.

Also, from the first important implication, we notice that we don't have to unpack the whole of $X$ and $Y$, so we limit ourselves to unpacking only the lower half (indices $0$ through Nyquist) of each.

So to perform the FFT on the first axis (considering a single channel), we pack together two rows or columns of the image (depending on which axis we're running the FFT along first), compute the FFT of this packed signal, and then unpack and save half of each. Then we run an FFT along the other axis on the result.

Say you did this on an $N \times M$ image, for even $N$ and $M$, running it along the $x$ axis first (so packing rows together). The result has a size of $\frac N 2 \times M$ (saving half of each row). So when running the FFT along the $y$ axis, you'd only need to consider $\frac N 2$ columns instead of $N$! You can rest assured that the FFT of the "implicit" columns is redundant to compute: the implicit columns are exactly the complex conjugate of one of the columns we do compute the FFT of, so [this property](https://en.wikipedia.org/wiki/Discrete_Fourier_transform#Conjugation_in_time) ensures that we can reconstruct their FFT easily (and as a consequence, we have the result that Arkadiusz showcases).

This packing/unpacking is also useful when doing the IFFT on the way back: given $X = \text{DFT}(x), Y = \text{DFT}(y)$ where we know $x,y$ are real signals, then we can once again pack $Z = X + i \cdot Y$, compute $z = \text{IDFT}(Z)$, and then unpack $x,y$ with a much simpler rule: $x = \text{Re}(z), y = \text{Im}(z)$

Another optimization done here is that after running the FFT along the first axis, we saved all of the Zero and Nyquist frequencies packed together into a single scanline. So, when running the FFT along the second axis, we compute this scanline with a special rule considering that it's the FFT of two packed real signals.
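
Here's a hedged sketch of the unpacking rule from that blog post, using `std::complex` as a stand-in for Nabla's `complex_t` (indices are taken mod $N$, and only the lower half - indices $0$ through Nyquist - is unpacked, as described above):

```cpp
#include <complex>
#include <vector>
using cf = std::complex<float>;

// Given Z = DFT(x + i*y) for two real signals x, y, recover X = DFT(x) and
// Y = DFT(y) via Hermitian symmetry. The caller sizes X and Y to hold
// at least N/2 + 1 entries each.
void unpack(const std::vector<cf>& Z, std::vector<cf>& X, std::vector<cf>& Y)
{
    const size_t N = Z.size();
    for (size_t k = 0; k <= N / 2; k++) // lower half (0 through Nyquist) suffices
    {
        const cf Zmk = std::conj(Z[(N - k) % N]); // conj(Z[-k]), index mod N
        X[k] = 0.5f * (Z[k] + Zmk);
        Y[k] = cf(0.f, -0.5f) * (Z[k] - Zmk); // == (Z[k] - conj(Z[-k])) / 2i
    }
}
```
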
## Optimization 2: Single element trade after an FFT butterfly

The Cooley-Tukey Radix-2 FFT is performed with $k$ stages for a sequence of length $N = 2^k$. There are also two complementary ways of computing the DFT with this algorithm: Decimation in Time (DIT) and Decimation in Frequency (DIF). In Nabla, our forward FFT implements the DIF algorithm, while the inverse FFT implements the DIT algorithm + normalization. Each one of these returns the output in some nonstandard order (more on that later). Furthermore, running one after the other (in either order) results in the identity function.

Here's a visualization of the DIF algorithm for a sequence of length $8$:

![DIF](dif_diagram.png "DIF Diagram")

At each stage, each cross is called a butterfly. Each cross has two inputs (the two circles on the left) and two outputs (the two circles on the right). The outputs of the butterflies become inputs for the butterflies of the next stage. I won't go into too much detail of how each butterfly is computed; go check out the FFT algorithm for those details if you're interested.

An important thing to notice here is that each stage computes twice as many FFTs as the previous stage, each with half the length, terminating when the FFT is of length two, with a single butterfly per FFT. In fact, this is exactly the point of the FFT: it's a divide-and-conquer type of algorithm. For example, if you take only the top half of the diagram starting from stage $2$ onwards, that's exactly a DIF diagram for a sequence of length 4. The same can be said about the lower half, of course.

Since we have the diagram at hand, let's also introduce the "stride". Each stage has an associated stride, which is simply the distance (in lanes) between the elements participating in a butterfly. For example, in stage $1$ this stride is $4$, since butterflies happen between elements $4$ lanes apart. The starting stride is always $\frac N 2$ for a sequence of length $N$, and each stage's stride is half the stride of the previous stage.

In the diagram above, to compute the FFT of a sequence of length $8$, first we perform some butterflies to prepare the input for the next stage, and then the next stage runs two FFTs on sequences of length $4$ independently. Each of these FFTs, in turn, does the same: perform some butterflies as input for stage $3$, then run two FFTs on sequences of length $2$ independently.

How do we map this to hardware? Well, we notice that the number of butterflies per stage is constantly $\frac N 2$. In our implementation, we make threads compute a single butterfly each at each stage. That means that we launch $\frac N 2$ threads, with the thread of ID $n$ in charge of computing the $n$-th butterfly, counting butterflies from the top. So at stage $1$, for example, thread $0$ is in charge of computing the butterfly between its inputs $x[0]$ and $x[4]$, and thread $2$ would be in charge of computing the butterfly between inputs $x[2]$ and $x[6]$.

Now let's look at stage $2$. The first butterfly of stage $2$, with index $0$ counting from the top, has to be performed by thread $0$. But to do this we require the first of thread $0$'s outputs from the previous stage, and the first of thread $2$'s outputs. Similarly, the third butterfly, with index $2$, has to be performed by thread $2$ with the second outputs of the same butterflies.

For easier nomenclature, we say that each thread at a certain stage holds two elements, "lo" and "hi". These are the elements participating in a butterfly: the one closest to the top of the diagram is "lo", and the other is "hi" (this could be counterintuitive, but it's because we count from the top). Since butterflies are performed in-place, "lo" and "hi" can be either the inputs or the outputs of the butterfly, depending on the moment (within the same stage) at which you inspect them.
So with this nomenclature, thread $0$ at stage $2$ needs its own "lo" and the output "lo" of thread $2$ at the previous stage to compute its butterfly, and similarly thread $2$ needs to do the same but with the "hi"s.

It turns out that at each stage, thread $x$ must use one of its own outputs of the previous stage and one of the values of thread $x \oplus \text{stride}$ (XOR), where we consider the stride of the current stage. For example, at stage $2$ the stride is $2$, and we can see that the pairs exchanging their elements ($(0,2)$ and $(1,3)$) can be obtained from one another by XORing their thread ID with the current stride. The same thing happens at stage $3$ with a stride of $1$.

Once a thread has figured out which other thread it must exchange a single value with, it must also figure out whether to keep its "lo" and exchange its "hi" or vice-versa. As it turns out, the thread of lower ID exchanges its "hi" and the thread of higher ID exchanges its "lo". You can see this being true in the DIF diagram as well. To figure out whether a thread is the one of lower or higher ID in the pair, just check whether `threadID & stride` is nonzero, since stride is always a power of two (the result is $0$ for the thread of lower ID and nonzero for that of higher ID).

If that gets hard to visualize, here's the same diagram with each node colored according to which thread is holding that element at that point in time. Thread $0$ is blue, $1$ is red, $2$ is green and $3$ was left white. You'll have to excuse my paint skills. Remember that for each thread, "lo" is always the element closest to the top in the diagram, and the other is "hi".

![DIF](dif_diagram_color.png "DIF Diagram")

Another thing we get is optimal reading: think about the colored diagram again. We launched $4$ threads to compute it, but what's interesting is how they read their elements in. First, they will read all their "lo" elements for the first stage, which are elements $0$ through $3$, and then they will read all their "hi" elements, which are $4$ through $7$. If the input is contiguous in memory, we get coalesced reads!
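
Putting the last few paragraphs together, here's a sketch of the per-stage trade each thread performs; `shuffleXor` is a hypothetical stand-in for a subgroup/workgroup shuffle primitive, declared but provided elsewhere:

```cpp
#include <complex>
#include <cstdint>

// Stand-in for a shuffle primitive: returns the partner thread's value.
std::complex<float> shuffleXor(std::complex<float> v, uint32_t mask);

// Single element trade after a butterfly: the partner is threadID ^ stride,
// and the lower thread of the pair trades away "hi" while the higher one
// trades away "lo".
void trade(uint32_t threadID, uint32_t stride,
           std::complex<float>& lo, std::complex<float>& hi)
{
    const bool isLower = (threadID & stride) == 0; // stride is a PoT, so this tests one bit
    std::complex<float>& traded = isLower ? hi : lo;
    traded = shuffleXor(traded, stride);
}
```
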
## Optimization 3: Exploiting Shared Memory in Compute, subgroup operations and overcoming maximum Workgroup sizes with Virtual Threads

It turns out that not all FFTs are born equal: some are much easier to handle than others. The smallest size for an FFT we consider is $2 \cdot \text{SubgroupSize}$, where $\text{SubgroupSize}$ is the smallest possible number of threads your device can run in parallel in the same subgroup. We call such an FFT "Subgroup-sized".

Thus, each thread would hold two elements "lo", "hi" at all times, and the DIF diagram (or DIT in the case of the inverse FFT) maps nicely to the hardware. If you want to compute an FFT for a sequence shorter than that, you must pad it to that size in whichever way you see fit.

This Subgroup-sized FFT is the fastest it can be, since all elements of the FFT can be made resident in registers and the element swaps we talked about in the previous section can be implemented as subgroup XOR shuffles, a SPIR-V intrinsic that modern GPUs usually have hardware to perform fast.

What happens for bigger-sized FFTs? Well, up to $2 \cdot \text{MaxWorkgroupSize}$ (where $\text{MaxWorkgroupSize}$ is the maximum number of threads you are willing to launch in a single workgroup) you can still use the same exact DIF and DIT algorithms, but using shared memory to perform the shuffles. We call these FFTs "Workgroup-sized". We implement our own generic library for what we call workgroup shuffles: they perform the operation of swapping values between threads in the same workgroup, mimicking what subgroup shuffles do but at the workgroup level.

To do this, we essentially have all threads in a workgroup write their elements to shared memory, barrier to make sure every thread is done writing, then make threads read their values from shared memory. It is slower than a subgroup shuffle since it involves barriering, but it's miles better in terms of latency than going through global memory.

We only use this when necessary: for example, if running a forward FFT on a sequence of length $4 \cdot \text{SubgroupSize}$, only the first stage needs to do such a barrier. At the second stage we run two Subgroup-sized FFTs, which we established can be done by a single subgroup with subgroup shuffling. In the case of an inverse FFT (go look at a DIT diagram if you want), this order would be reversed and only the last stage would need a workgroup shuffle.

The advantages of using subgroup shuffles at the smallest level aren't just about speed, but also about the shared memory footprint: if you wanted to do such a shuffle fast using shared memory, you'd need to multiply the amount of shared memory by the number of elements in a shared memory bank to ensure every element ends up in a different bank, otherwise you WILL run into bank conflicts. So you avoid this memory/speed tradeoff altogether.

What about "bigger than Workgroup"-sized FFTs? For example, take a sequence of length $4 \cdot \text{MaxWorkgroupSize}$. With our algorithm, we'd need $2 \cdot \text{MaxWorkgroupSize}$ threads in a single workgroup to achieve this. That's where virtual threading comes in!

Virtual threading essentially consists of emulating many threads' behaviour using a single thread. For a visualization, say our $\text{MaxWorkgroupSize}$ was $2$ and we're running an FFT on a sequence of $8 = 4 \cdot \text{MaxWorkgroupSize}$ elements like before. Here's the image again so you don't have to go fetch it.

![DIF](dif_diagram_color.png "DIF Diagram")

This time, however, there are only two real threads and four virtual threads. Thread $0$ manages virtual threads $0$ and $2$ (those which had elements tagged blue and green, respectively) and thread $1$ manages virtual threads $1$ and $3$ (red and white). The idea now is to divide the first stages into computations we can do sequentially until we recurse down to Workgroup-sized sub-FFTs (in this example this happens at stage $2$).

Virtual threads can be grouped into "virtual workgroups". We see that after stage $1$ in the diagram above, virtual threads $0$ and $1$ must compute a Workgroup-sized FFT (the top half of the diagram starting from stage $2$) and the same thing happens for virtual threads $2$ and $3$.

This gives us an idea of how to group virtual threads together, because after stage $1$ we can make our real threads compute two Workgroup-sized FFTs in sequence: once emulating virtual threads $0$ and $1$, and a second time emulating virtual threads $2$ and $3$, each time using shared memory for shuffles since the size allows it.

Computing stage $1$ is also easy: all butterflies are independent, even within the virtual workgroup, so you can compute them in any order.
To keep things consistent, we choose to emulate threads per virtual workgroup as well: in a first step, thread $0$ emulates the blue butterfly and thread $1$ the red butterfly (which are those for virtual threads $0$ and $1$), and then in a second step they do the same for the other two butterflies.

There is no element swap after butterflies, though: virtual threads read in their inputs and write out their outputs to the same place they read their inputs from. Element swapping now happens indirectly: in the next stage, virtual threads figure out which elements they need to read in to perform their computations. What's also nice is that all these writes (and the reads that come afterwards) are coalesced as well.

Even better, all memory accesses done in stages prior to running a Workgroup-sized FFT are done in the same positions for different threads. What I mean by this is that even if virtual threads access different memory locations at each of these stages, *all memory locations accessed are owned by the same thread*. You can see this in the diagram above: in stage $1$, thread $0$ owns memory locations $0,2,4,6$. After writing to these positions when computing the butterflies in that stage, it still owns those positions: virtual thread $0$ will need the elements at positions $0$ and $2$ to run the Workgroup-sized FFT in stage $2$.

The element at position $2$ was computed by virtual thread $2$, but since that virtual thread is also emulated by thread $0$, it's the same thread that owns that memory location! In practice this means that these computations don't require any sort of barriers, syncs or use of shared memory. This allows us to employ an optimization, which is to preload elements per thread - essentially reading the needed elements for each thread only once at the start and keeping them in local/private memory for the rest of the algorithm. This will be explained in more detail in the Static Polymorphism section of this article.

All of this implies that such FFTs use the same amount of shared memory as a Workgroup-sized one. The downside is either increased latency or decreased occupancy, depending on whether these reads/writes happen in global memory or local/private (preloaded) memory.

This generalizes to arbitrary "bigger than Workgroup"-sized FFTs: run all the butterflies in sequence in each "bigger than Workgroup" stage, reading and writing their inputs and outputs from and to the same place. Then, once you get down to Workgroup size, you can run the algorithm for Workgroup-sized FFTs that uses shared memory for faster swapping.

Our FFT code requires a bunch of template parameters; among those we find `ElementsPerInvocation` and `WorkgroupSize` (it actually takes their base $2$ logarithms, given that they should be powers of two). This indicates that we're going to be performing an FFT on an array of size $N = \text{ElementsPerInvocation} \cdot \text{WorkgroupSize}$, and that this size should be known at shader compilation time. `ElementsPerInvocation` is the number of elements in the output that a single thread must compute. It is equivalent to twice the number of virtual threads each launched thread emulates.

For $\text{ElementsPerInvocation} = 2$, for example, that'd be a single virtual thread per thread, which essentially is just running the best algorithm (using shared memory and subgroup ops for shuffles).
For $\text{ElementsPerInvocation} = 2^k$ you'd have $2^{k-1}$ virtual threads per thread and $k-1$ stages where you must go through global memory because of thread emulation; only the rest of the stages would use shared memory/subgroup ops for shuffles.

You can of course decompose $N = 2^k$ as different products of $\text{ElementsPerInvocation}$ and $\text{WorkgroupSize}$. Our rule of thumb is to minimize $\text{ElementsPerInvocation}$ when possible (remembering it must be at least $2$) and only increase it if $\text{MaxWorkgroupSize}$ won't allow you to keep it smaller. This avoids going through global memory when possible if reading/writing straight to global memory, or maximizes occupancy if we preload at the start.

## Optimization 4: Better occupancy and access latency with spilling preloading

To compute a per-channel spectrum in the Bloom example, we have a few different strategies:

One would be to load one channel at a time, compute the FFT for that channel, and store it. This sounds natural, and it's what we do in the Bloom example for intermediate FFTs. But for the first-axis FFT, the problem is that each thread is essentially sampling the same spots in a texture once per channel, with a long computation (the FFT of the channel) in the middle, so the parts of the texture accessed are unlikely to be cached between loads (because of other workgroups working on the same SM). The same happens at the end, when doing the IFFT along the first axis. This results in three texture writes per pixel, each time writing a different channel (AND having to retrieve the previous write before the next, since you can't just write to a single channel of a texture).

A different strategy would be to load all channels at once, keeping the values for all channels resident in registers. Although this means we're using more registers per thread, these can be spilled to global memory in the worst case, resulting in only some accesses to global memory. The upside is that each SM/CU/EU probably has its own spilling private memory arena, so it's faster than re-accessing a texture/buffer anyway.

Now that we have preloaded all channels, another important decision must be made: do we compute the FFT channel by channel, or all at once? You see, just like you can define `complex_t<float32_t>` or similar, you could use `complex_t<vector<float32_t, 3> >` with operations done on a per-element basis. The number of operations remains the same for either three FFTs with `complex_t<float32_t>` or a single FFT with `complex_t<vector<float32_t, 3> >`, but what doesn't remain the same is the memory and memory barriers required to do so.

Global memory accesses and subgroup shuffle ops remain the same in both versions. What's different is the behaviour regarding barriering.

If you used `complex_t<float32_t>` you'd have to do three FFTs, each of which incurs a barrier at every stage where memory gets traded using shared memory. If you used `complex_t<vector<float32_t, 3> >` instead, you'd have a third of the shuffles (meaning a third of the barriers), trading three times as much memory on each. Sounds good? Well, the thing is that this requires triple the amount of shared memory per workgroup. You can spill registers, but not shared memory, so this would effectively kill occupancy.

In our Bloom example, we choose to overbarrier but keep shared memory usage per workgroup lower.
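
For illustration, here's a hedged sketch of what such a vectorized complex type could look like (hypothetical, not Nabla's actual `complex_t`; only the two operations a butterfly needs are shown):

```cpp
#include <array>
#include <complex>

// A complex number whose real and imaginary parts are 3-channel vectors,
// with elementwise ops. One FFT over this type does the work of three scalar
// FFTs, trading a third of the barriers for triple the shared memory.
struct complex3
{
    std::array<std::complex<float>, 3> rgb;

    complex3 operator+(const complex3& o) const
    {
        complex3 r;
        for (int i = 0; i < 3; i++) r.rgb[i] = rgb[i] + o.rgb[i];
        return r;
    }
    complex3 operator*(const std::complex<float>& twiddle) const
    {
        complex3 r; // the same twiddle factor multiplies every channel
        for (int i = 0; i < 3; i++) r.rgb[i] = rgb[i] * twiddle;
        return r;
    }
};
```
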
This will become clearer when we talk about the Accessor pattern later on, but another important part of preloading is that (except for register spilling) we never need to go through global memory when doing an FFT, even if $\text{ElementsPerInvocation}>2$.

## Optimization 5: Dynamic Kernel rescaling via Spectral Resampling

At the start of the article we talked a bit about the Hadamard product, and that to perform such a product you'd need both spectra to be matrices of the same exact dimensions. Let's go over it again.

The result of the FFT of the image is of dimensions
```cpp
roundUpToPoT(imageDimensions+kernelDimensions)
```
while the result of the FFT of the kernel has size
```cpp
roundUpToPoT(kernelDimensions)
```

`roundUpToPoT` means rounding each dimension up to the next power of two (remember this is needed by Cooley-Tukey), and the `+kernelDimensions` is there to account for padding to avoid wraparound artifacts. To simplify the discussion a bit, we'll also assume `kernelDimensions` is a square with PoT side length.

So, how do we compute the Hadamard product? One thing you could do is create a `roundUpToPoT(imageDimensions+kernelDimensions)` image which has a copy of the kernel in the center and zero padding all around it, compute the spectrum of this, and multiply the spectrum of the padded image with it. This works, and is probably optimal if you're considering running Bloom as a post-processing effect for a fixed-size image. It's what we did in the first section, which I introduced as a way to explain the FFTShift.

However, it has a bigger memory footprint than our approach (presented in the next paragraph): this method has one padded kernel image per value of `roundUpToPoT(imageDimensions+kernelDimensions)`. To run this with images of varying sizes (where you might get different rounded-up-to-PoT sizes) you'd need a different copy of the padded kernel for each possible value.

Our approach in Nabla is more aligned with other use cases we had for the FFT. What we do is compute the spectrum of the kernel as-is (no padding or anything) and keep it resident in GPU memory. Then, once we have the spectrum of the image, to compute the Hadamard product we simply rescale the kernel's spectrum to `roundUpToPoT(imageDimensions+kernelDimensions)` by sampling it as a texture: to compute the product at pixel $p$, we get $p$'s `uv` coordinates in the image (essentially just divide $p$ by `roundUpToPoT(imageDimensions+kernelDimensions)`) and sample the kernel spectrum at those coordinates.

This allows us to keep a single copy of the spectrum resident in GPU memory, with no padding (so it's as small as it can be), and reuse it for any convolutions we might want to perform.
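
A hedged sketch of this lookup (hypothetical names; `bilinearSample` stands in for the hardware sampler and is declared but provided elsewhere):

```cpp
#include <complex>

// Stand-in for a bilinear texture fetch of the kernel spectrum.
std::complex<float> bilinearSample(const std::complex<float>* spectrum,
                                   int w, int h, float u, float v);

// Map the padded-image pixel p to uv in [0,1) and sample the (smaller,
// unpadded) kernel spectrum there, effectively rescaling it on the fly.
std::complex<float> kernelSpectrumAt(const std::complex<float>* kernelSpectrum,
                                     int kernelW, int kernelH,
                                     int px, int py, int paddedW, int paddedH)
{
    const float u = (px + 0.5f) / float(paddedW);
    const float v = (py + 0.5f) / float(paddedH);
    return bilinearSample(kernelSpectrum, kernelW, kernelH, u, v);
}
```
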
What we're doing here is essentially zooming out in the spatial domain by resampling the spectrum. Once again, Arkadiusz's video gives a bit of insight into this as well.

Since we assume (and in our Bloom example, require) the kernel to have PoT-long sides (and square, but for this discussion it could also be rectangular), it turns out that `roundUpToPoT(imageDimensions+kernelDimensions)` is exactly an integer multiple of `kernelDimensions` (of course, it might be a different multiple per axis). Let's assume

$\text{roundUpToPoT}(\text{imageDimensions}+\text{kernelDimensions}) = (N_1, N_2) \cdot \text{kernelDimensions}$.

$N_1$ and $N_2$ also turn out to be PoT, but that's irrelevant.

What this all means is that the end result is not just any kind of resampling, but rather a pure (integer-factor) upsampling process. Our spectral interpolation is exactly equivalent to upsampling with a tent filter: that is, the result is exactly the same as introducing $N_1 - 1$ zeros between samples along the $x$-axis, $N_2 - 1$ zeros between samples along the $y$-axis, and then convolving the result with a tent filter whose finite support is exactly the length between two pixels in the original spectrum (the linear interpolation performed by hardware samplers is exactly the same as a convolution with a tent).

[Pure upsampling](https://en.wikipedia.org/wiki/Upsampling) in one domain causes periodic repetition of the signal in the other domain in the expansion step (introducing zeros between samples), before attenuating the repetitions in the interpolation step. Ideally we would like to annihilate the repetitions entirely using a perfect filter $(\text{sinc})$, since convolution with it in the spectral domain becomes a product with a box function in the spatial domain, one which would exactly enclose the central copy of the kernel and annihilate the rest.

The interpolation with a tent filter, however, is equivalent to a product in the spatial domain with a $\text{sinc}^2$ function. This means that repeated copies of the kernel get attenuated, although not perfectly: copies of the kernel overlapping with secondary lobes of the $\text{sinc}^2$ get scaled down (and by quite a lot) but not completely annihilated. This causes ringing artifacts. For example, here's the result of our convolution against a kernel of size `256x256` (using mirror padding to get as much luminance as possible):

![Ringing](convolved_256.png "Ringing")

It might be hard to see, but there is some ringing going on in these areas:

![Ringing Showcase](convolved_ring.png "Ringing circled")

To better showcase this, here's the same image with a white point of $0.25$:

![Ringing Whitepoint](ring_whitepoint.png "Ringing whitepoint")

Look at the areas I circled in red before, and compare that to the same convolution performed with the other technique we mentioned at the start of this section (the one that first padded the kernel with zeros and then computed the spectrum of that), with the white point set to $0.25$ as well:

![No ringing](noring.png "No ringing")

Here's how I visualize this in my mind: copies of the kernel pasted over the very bright pixels of the Sun (and all copies of the kernel everywhere, actually, but these are the relevant ones here) are at first (before the product with the $\text{sinc}^2$) repeated all over the image. But when the product with the $\text{sinc}^2$ happens:

![Sinc square](sinc.png "Sinc squared graph")

you can imagine that the big central lobe gets multiplied against the central copy, preserving it (especially towards the center, killing it off towards the edges). But the next copy gets multiplied against the secondary lobe (the one circled in red) and doesn't get completely annihilated (again, this matters most towards the centre of the copy, where the kernel concentrates most of its luminance), causing ringing artifacts.

Arkadiusz's video at the timestamp above also showcases the spatial ringing of the kernel when doing this (albeit very exaggerated, so you can see some of it).
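
To compress the argument into symbols (one axis, integer factor $N_1$, normalization constants omitted): expanding the kernel spectrum $\hat K$ by inserting zeros replicates the spatial kernel $k$, and interpolating the expanded spectrum with a tent $T$ multiplies that replicated kernel by a $\text{sinc}^2$ envelope, since the inverse transform of a tent is (proportional to) $\text{sinc}^2$:

$$\mathcal F^{-1}\!\left\{\left(\hat K \uparrow N_1\right) * T\right\} \;=\; \mathcal F^{-1}\!\left\{\hat K \uparrow N_1\right\} \cdot \mathcal F^{-1}\{T\} \;\propto\; \underbrace{k_{\text{replicated}}}_{N_1 \text{ copies}} \cdot\, \text{sinc}^2$$
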
To avoid this ringing, you can use a bigger kernel, since that makes $(N_1, N_2)$ smaller, meaning fewer zeros in the upsampling process, leading to less spatial repetition, leading to a more "compact" $\text{sinc}^2$, which has smaller secondary lobes. Here, for example, is the result of doing our convolution against a kernel of size `512x512`, same white point:

![Less ringing](less_ring.png "Less ringing")

Some ringing still survives, but it's much less noticeable than before. Here's that image with white point $1$, in case you want to compare it against the one we did with a `256x256` kernel:

![512](convolved_512.png "With a 512 kernel")

An alternative would be to upgrade the tent filter to a better approximation of the $\text{sinc}$ function. You can do this in any manner you want, but if you want the resampling to stay relatively fast by exploiting hardware bilinear sampling, you will want to use polynomial fits of $\text{sinc}$. There's a [GPU Gems article](https://developer.nvidia.com/gpugems/gpugems2/part-iii-high-quality-rendering/chapter-20-fast-third-order-texture-filtering) on implementing a bicubic sampler using bilinear sampling, and the technique naturally extends to bi-whatever sampling (with exponential cost, however).

### Dynamic PSF Sharpening

Given the shape of the PSF we consider in this example, spatially rescaling the kernel is pointless. I have shown you an image of the kernel with a low white point, but its actual distribution is kind of a very sharp Laplacian, so something like this:

![Laplacian](laplacian.png "Laplacian distribution")

although much sharper. The kernel isn't perfectly radially symmetric with this distribution, but it's similar. A property such distributions have is that if you normalize them after rescaling, they end up very similar to how they started. This is why we say that spatial rescaling is pointless, since we want our kernel to be normalized so the convolution does not affect luminance.

What we can do, however, is dynamic sharpening. We achieve this by interpolating the kernel with a Dirac delta (the identity of the convolution). This is what we do in the video at the start of this article: it's a simple $\cos(t)^2$ interpolation.

Interpolation is done in the spectral domain. This is because $f * ((1-t) \cdot k + t \cdot \delta)$ becomes $F \cdot ((1-t)\cdot K + t \cdot \Delta)$ in the spectral domain.

An expression for $\Delta$ is easily found: it's just a matrix of ones (since it's the identity of the Hadamard product).

Since a Dirac delta integrates to $1$, the interpolated kernel also integrates to $1$ (as long as $t \in [0,1]$, of course).

## Optimization 7: Which dimension to run first? Don't pad to PoT before you run!

To elaborate on this point, I'm going to give the exact numbers for the Bloom example. The convolution of our `1280x720` image of balls against the `256x256` kernel requires us to get the spectrum of a `2048x1024 = roundUpToPoT(1280x720 + 256x256)` sized image.

Here's a table of total FFTs run using a naive approach (padding both dimensions up to PoT before running the FFTs):

| | FFTs along first axis | FFTs along second axis |
|----|---------|---------|
| $x$ axis first | $512$ of length $2048$ | $1024$ of length $1024$ |
| $y$ axis first | $1024$ of length $1024$ | $512$ of length $2048$ |

Unsurprisingly, it's the same number of FFTs (grouped by length) in total.
If you're wondering why along the second axis we perform half as many FFTs as the length of the FFTs along the first axis, remember that we keep half of each FFT along the first axis.

We can however do much better than this. Let's think about the case in which we run an FFT along the $y$-axis first. Out of the $1024$ FFTs we launch, $384$ are redundant. This is because there are $640 = \frac {1280} 2$ actual packed columns in our image.

$192 = \frac {384} 2$ packed columns to each side are in the padding area. Running an FFT along these columns either yields $0$ (if we use zero padding) or can be retrieved later, since it's going to be exactly equal to one of the actual packed columns we run an FFT along (if we use mirror padding).

So when running the FFT along the $y$-axis, we only need to run $640$ FFTs of length $1024$ (we still need to pad along the $y$-axis). The result of this operation will yield $1280$ columns of length $512$.

Similarly, in the next step, when running an FFT along the $x$-axis, we will need to run $512$ FFTs of length $2048$ (this time we can't pack them, since they're already complex, and this time we're padding $1280$ up to $2048$). The padding along the $y$-axis was done automatically by our HW sampler, but this time we must do the padding by hand, either setting zeros or retrieving mirrored values in the padding area.

Here's the table of total FFTs for that case, as well as the $x$ axis first case:

| | FFTs along first axis | FFTs along second axis |
|----|---------|---------|
| $x$ axis first | $360$ of length $2048$ | $1024$ of length $1024$ |
| $y$ axis first | $640$ of length $1024$ | $512$ of length $2048$ |

Which one is better? Well, in this particular case, $y$-axis first takes about $0.57 \; \text{ms}$ to run on my 4060 (measured with Nsight Systems), while $x$-axis first takes about $0.73 \; \text{ms}$. That's a significant difference!

If we change the kernel to a size of `512x512`, we get the following table instead:

| | FFTs along first axis | FFTs along second axis |
|----|---------|---------|
| $x$ axis first | $360$ of length $2048$ | $1024$ of length $2048$ |
| $y$ axis first | $640$ of length $2048$ | $1024$ of length $2048$ |

Unlike the previous case, in which the FFTs to compare are all different-sized, this particular case is easier to analyze: $y$-axis first runs $1664$ FFTs of length $2048$ in total, while $x$-axis first runs $1384$ FFTs of the same length, so it's reasonable to expect $x$-axis first to perform better in this case. Indeed, $x$-axis first takes about $1.04 \; \text{ms}$ to run, while $y$-axis first takes about $1.45 \; \text{ms}$.

## Keeping things modular with Nabla

[HLSL2021](https://devblogs.microsoft.com/directx/announcing-hlsl-2021/) introduced a bunch of additions to the language. In particular, with it we got access to template metaprogramming. This is huge, since it lets us write more generic and maintainable code. Before I show you some examples, let's talk a bit more about our HLSL library.

### The Nabla HLSL library

One of the highlights of Nabla is our HLSL library: most of the library is code that's shared between the host and the device, so you can access the same functions and structs from both.
We made a lot of HLSL equivalents for many of C++'s `std` headers, such as
* `<algorithm>`
* `<bit>`
* `<concepts>`
* `<functional>`
* `<limits>`
* `<tgmath>`
* `<type_traits>`

A lot of this (and especially the implementation of `<type_traits>`) is only possible via some hardcore Macro Programming - the only way of writing reflection in C++11 (the level at which DXC operates) - which is made possible by BOOST_PP.

Wave is the only C++20-conformant (implementing `__VA_OPT__` and friends) standalone preprocessor library (not part of a compiler) that's not buggy, allowing us to use BOOST_PP.

This brings a lot of host functionality to the device, but we also have some device code that can run on the host: for example, we implemented a lot of GLSL and HLSL intrinsics in C++ so you can use them on the host as well.

Besides the added functionality on both sides, this is really good for testing and debugging: as long as your code can be compiled for both host and device, you can debug the code running on your GPU by running it on the CPU, or design unit tests that run on the CPU.

### Running an FFT in Nabla

Let's go over how to use the FFT in Nabla on the GPU. This will be a walkthrough of how to set up the code to run an FFT and an explanation of most stuff found in the FFT library (all the structs detailed here are in the `workgroup` namespace).

The first thing to clarify is that since we're using Cooley-Tukey, we ONLY perform FFTs on power-of-two (PoT for short) sized arrays. If your array isn't PoT-sized, make sure to pad the array in whichever way you see fit up to a power of two.

To run an FFT, you need to call the FFT struct's static `__call` method. You do this like so:

```cpp
FFT<Inverse, ConstevalParameters>::template __call<Accessor, SharedMemoryAccessor>(accessor, sharedMemoryAccessor);
```

We use functional structs instead of plain functions because HLSL does not support partial specialization of functions, so if you want to achieve the same result you have to wrap your function in a struct, which CAN be partially specialized. Furthermore, we use a `__call()` method instead of overloading `operator()` because, due to the low (C++11) version DXC is based off of, the latter does not allow for implicit template arguments while the former does (so in the snippet above you could skip writing `<Accessor, SharedMemoryAccessor>` if you wanted to).

`Inverse` is a bool value indicating whether you're running a forward or an inverse FFT.

`ConstevalParameters` is a struct created from three compile-time constants.

`Scalar` is just the scalar type for the complex numbers involved.

`WorkgroupSizeLog2` is self-explanatory, and `ElementsPerInvocationLog2` is the (log of) the number of elements of the array each thread is tasked with computing, with the total `ElementsPerInvocation` being the length `FFTLength` of the array to perform an FFT on (remember it must be PoT) divided by the workgroup size used.

This makes both `ElementsPerInvocation` and `WorkgroupSize` PoT.
IMPORTANT: You MUST launch the kernel with a workgroup size of `ConstevalParameters::WorkgroupSize`.

`Accessor` is an accessor to the array. It MUST provide the methods
```cpp
template <typename AccessType>
void set(uint32_t idx, AccessType value);

template <typename AccessType>
void get(uint32_t idx, NBL_REF_ARG(AccessType) value);
```

These methods need to be able to be instantiated with `AccessType` being `complex_t<Scalar>` for the FFT to work properly.

`SharedMemoryAccessor` is an accessor to a shared memory array of `uint32_t` that MUST be able to fit `WorkgroupSize` many complex elements (one per thread).
It MUST provide the methods
```cpp
template<typename IndexType, typename AccessType>
void set(IndexType idx, AccessType value);

template<typename IndexType, typename AccessType>
void get(IndexType idx, NBL_REF_ARG(AccessType) value);

void workgroupExecutionAndMemoryBarrier();
```

The templates are there in case you want to use the same accessor in other ways, but for usage with FFT those methods MUST be able to be instantiated with both `IndexType` and `AccessType` being `uint32_t`.
`workgroupExecutionAndMemoryBarrier()` can be any method that ensures that whenever threads shuffle via the shared memory, all threads have reached the barrier after writing their values and before reading the values they need to get from it. In our examples it's usually a `glsl::barrier()`.

We will talk a bit more about Accessors later in this section, but we have an upcoming blogpost about them that goes deeper.

Furthermore, you must define the method `uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize()` (usually, since we know the size we will launch at compile time, we make this return values based on some compile-time known constants). This is because of an issue / bug with DXC caused by SPIR-V allowing both compile-time and runtime workgroup sizes.

With all of that said, here's an example of an FFT being run:

```cpp

struct PushConstantData
{
    uint64_t deviceBufferAddress;
};

[[vk::push_constant]] PushConstantData pushConstants;

using namespace nbl::hlsl;

// Given compile-time known constants `ElementsPerInvocationLog2`, `WorkgroupSizeLog2` (defined elsewhere), and `float32_t`,
// we give an alias to the `workgroup::fft::ConstevalParameters` struct for clarity.
using ConstevalParameters = workgroup::fft::ConstevalParameters<ElementsPerInvocationLog2, WorkgroupSizeLog2, float32_t>;

// The constexpr `ConstevalParameters::SharedMemoryDWORDs` tells us the size (in number of `uint32_t`s) that the shared memory array must have,
// so we use that to declare the array
groupshared uint32_t sharedmem[ConstevalParameters::SharedMemoryDWORDs];

// Users MUST define this method for FFT to work
uint32_t3 glsl::gl_WorkGroupSize() { return uint32_t3(uint32_t(ConstevalParameters::WorkgroupSize), 1, 1); }

struct SharedMemoryAccessor
{
    template<typename IndexType, typename AccessType>
    void set(IndexType idx, AccessType value)
    {
        sharedmem[idx] = value;
    }

    template<typename IndexType, typename AccessType>
    void get(IndexType idx, NBL_REF_ARG(AccessType) value)
    {
        value = sharedmem[idx];
    }

    void workgroupExecutionAndMemoryBarrier()
    {
        glsl::barrier();
    }

};

struct Accessor
{
    static Accessor create(const uint64_t address)
    {
        Accessor accessor;
        accessor.address = address;
        return accessor;
    }

    template<typename AccessType>
    void get(const uint32_t index, NBL_REF_ARG(AccessType) value)
    {
        value = vk::RawBufferLoad<AccessType>(address + index * sizeof(AccessType));
    }

    template<typename AccessType>
    void set(const uint32_t index, const AccessType value)
    {
        vk::RawBufferStore<AccessType>(address + index * sizeof(AccessType), value);
    }

    uint64_t address;
};

// launch `ConstevalParameters::WorkgroupSize` many threads in a workgroup, instantiate the accessors and then run FFTs
[numthreads(ConstevalParameters::WorkgroupSize,1,1)]
void main(uint32_t3 ID : SV_DispatchThreadID)
{
    Accessor accessor = Accessor::create(pushConstants.deviceBufferAddress);
    SharedMemoryAccessor sharedmemAccessor;

    // FFT
    workgroup::FFT<false, ConstevalParameters>::template __call(accessor, sharedmemAccessor);
    // We just used the sharedmemAccessor, and we're about to use it again. Therefore, we must block here to ensure all subgroups
    // have made their last read from shared memory in the algorithm above.
    // Otherwise, we could introduce a race condition.
    sharedmemAccessor.workgroupExecutionAndMemoryBarrier();
    workgroup::FFT<true, ConstevalParameters>::template __call(accessor, sharedmemAccessor);
}
```

In the snippet above, the first FFT is a Forward FFT and the second is an Inverse FFT. I'm running one after the other to showcase something important: if you're going to use the shared memory after an FFT (in this case it's going to be used to run another FFT), you MUST do an execution and memory barrier like above. We explain why in the next section.

The result of either FFT is actually exactly the same, save for normalization: the Inverse FFT divides the resulting array by $\text{FFTLength}$ at the end.

### Static Polymorphism in Nabla: the Accessor pattern

One of the great things about template metaprogramming is that it allows us to decouple behaviour as a way to generalize code and avoid overspecializing. For example, say you're writing an algorithm that has to read and write some data. This data, however, could be anywhere: on the GPU, you could be feeding the algo data from an image or buffer through a descriptor, or maybe you're using BDA and just passing a device address. On the CPU, the same applies: maybe you're providing data from a vector, or from a hashmap, or something else.

The algorithm itself, however, doesn't need to know any of this: it just needs to get and/or set data. That's where Accessors come in handy: they're a way of abstracting this process of getting and setting data so that the algorithm itself is separated from the data acquisition process, while providing reference/pointer-like behaviour.

In some cases this can even be used to create efficient specializations of an algorithm without having to rewrite anything. One such example is our real FFT specialization, which I'll talk about in a bit.

Both the `Accessor` and the `SharedMemoryAccessor` follow a similar convention. Part of that convention includes an important assumption about Accessors: they should be "clean" and "unused". This means that to ensure the algorithm's correctness, there should be no aliasing (no one else should have access to the same memory as long as the algorithm is doing its thing).

If you do in fact optimize/reuse/alias between two algorithms (or invocations of the same algorithm), then you need to worry about the potential overlap and prevent data races via appropriate synchronisation primitives, ensuring execution and memory dependencies between re-users. This is exemplified in our FFT example usage earlier: between two FFT passes, there's a barrier to ensure all threads have caught up to their work and there's no chance of a data race inbetween usages.

We limit the rest of this discussion to the `Accessor` in our FFT, to exemplify the flexibility this pattern gives us. The `SharedMemoryAccessor` is not as flexible due to the role it has.

In the FFT, the Accessor has different behaviour according to $\text{ElementsPerInvocation}$. If $\text{ElementsPerInvocation}=2$ then the Accessor is only used to read data in at the start and write it out at the end. This allows for the FFT to be done out-of-place: you don't necessarily have to make the accessor read and write to the same place.

If $\text{ElementsPerInvocation}>2$, however, then the Accessor is also used as a way to "trade" elements between threads when doing "bigger-than-Workgroup-sized" FFTs, so the FFT MUST be done in-place.
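To make the out-of-place case concrete, here's a minimal sketch (not taken from the examples; the two-address layout is assumed for illustration) of an accessor that reads the signal from one BDA address and writes the spectrum to another:

```cpp
// Only valid when ElementsPerInvocation == 2, since then the FFT never uses the
// Accessor to trade elements mid-algorithm: it reads everything in at the start
// and writes everything out at the end.
struct OutOfPlaceAccessor
{
    template<typename AccessType>
    void get(const uint32_t index, NBL_REF_ARG(AccessType) value)
    {
        value = vk::RawBufferLoad<AccessType>(inputAddress + index * sizeof(AccessType));
    }

    template<typename AccessType>
    void set(const uint32_t index, const AccessType value)
    {
        vk::RawBufferStore<AccessType>(outputAddress + index * sizeof(AccessType), value);
    }

    uint64_t inputAddress;  // BDA of the buffer holding the input signal
    uint64_t outputAddress; // BDA of the buffer receiving the output
};
```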
Now let's go over the code in [the Bloom example](https://github.com/Devsh-Graphics-Programming/Nabla-Examples-and-Tests/tree/master/28_FFTBloom) to show examples of other types of flexibility this pattern has.

The code in the Bloom example uses [preloaded accessors](https://github.com/Devsh-Graphics-Programming/Nabla-Examples-and-Tests/blob/87de8388a9082b4e3fa5566cceeebd0d8a5a3a1b/28_FFTBloom/app_resources/fft_common.hlsl#L42), meaning that they read in all their elements into private memory (likely increasing register usage) before running the FFT and write them out themselves after the FFT. This obviously decreases occupancy if preloading multiple channels, or if $\text{ElementsPerInvocation}>2$ when loading a single channel. But we get different benefits. One of them is that there are no `memoryBarrier()` calls that matter (which is why in this case we specify it can be a method that does nothing).

In all cases, the same flexibility as before stays: either when preloading before the FFT or unloading afterwards, you get to choose where they read from and where they write to. For example, the [first axis FFT](https://github.com/Devsh-Graphics-Programming/Nabla-Examples-and-Tests/blob/87de8388a9082b4e3fa5566cceeebd0d8a5a3a1b/28_FFTBloom/app_resources/image_fft_first_axis.hlsl#L31) reads from an image and writes to a buffer, the [second axis FFT](https://github.com/Devsh-Graphics-Programming/Nabla-Examples-and-Tests/blob/87de8388a9082b4e3fa5566cceeebd0d8a5a3a1b/28_FFTBloom/app_resources/fft_convolve_ifft.hlsl#L59) (which also performs the product and IFFT in the same shader) does buffer->buffer, and the [last IFFT](https://github.com/Devsh-Graphics-Programming/Nabla-Examples-and-Tests/blob/87de8388a9082b4e3fa5566cceeebd0d8a5a3a1b/28_FFTBloom/app_resources/image_ifft_first_axis.hlsl#L37) does buffer->image.

On the first axis FFT, preloading all channels at once means we only have to read each position in the image once, as opposed to three times if loading one channel at a time. On all FFTs, using preloaded accessors means that "bigger-than-Workgroup-sized" FFTs don't go through global memory but instead stay in registers.

Finally, preloaded accessors also allow us to implement an efficient specialization of a real-valued FFT. We can preload two scanlines at once using the packing trick introduced earlier and run the FFT. Then, after the FFT is done, since we're using a preloaded accessor we don't write straight to the buffer: [all writes stay in the Accessor's registers](https://github.com/Devsh-Graphics-Programming/Nabla-Examples-and-Tests/blob/87de8388a9082b4e3fa5566cceeebd0d8a5a3a1b/28_FFTBloom/app_resources/fft_common.hlsl#L45). If we did, we'd be wasting twice the space. Instead, right after the FFT we can make the preloaded accessor's unload method store only the lower half of the DFT to the buffer.

You can see that with the same FFT code we can implement a lot of different micro-optimizations or specializations of the algorithm just by modifying the behaviour the Accessor provides. This is the magic of Static Polymorphism!

## More Accessor Magic: building large utilities from small incremental blocks

Static Polymorphism is also what enables the use of virtual threading in some algorithms. Once again, let's show how this works in the case of the FFT.
[Here's the code](https://github.com/Devsh-Graphics-Programming/Nabla/blob/ae5dbadedc8817b4aebea4a5712887035472d7a8/include/nbl/builtin/hlsl/workgroup/fft.hlsl#L448) for the FFT when $\text{ElementsPerInvocation}>2$, which you can cross-reference with our discussion above. Here's the diagram for the case we had with 4 virtual threads for an FFT of length $8$:

![DIF diagram](dif_diagram_color.png "The same diagram as before")

When running such an FFT, first we do the butterflies in the first stage per virtual workgroup, and then we run one Workgroup-sized FFT per virtual workgroup. To do this, we recycle the code we have for the Workgroup-sized FFT by calling the code for it but passing an Offset Accessor. For example, in our diagram, the full FFT would be done by calling
`FFT::__call(accessor, sharedmemAccessor)`. This method will first compute the butterflies in stage 1, where each thread will perform one butterfly per virtual workgroup.

Then, it's going to compute a Workgroup-sized FFT per virtual workgroup. To achieve this, it's going to call
`FFT::__call(offsetAccessor, sharedmemAccessor)` where `offsetAccessor` can be built from the original `Accessor`.

The first such FFT will run an FFT on elements indexed $0$ through $3$, which can be done with an `OffsetAccessor` with $0$ offset. Then, to run the second FFT on elements indexed $4$ through $7$, it's going to use an `OffsetAccessor` with an offset of $4$. Essentially, this allows us to incrementally build larger utilities by exploiting the genericity of the smaller ones.

## FFT Utils

### Figuring out the storage required for an FFT

We provide the functions
```cpp
template
uint64_t getOutputBufferSize(
    uint32_t numChannels,
    vector inputDimensions,
    uint16_t passIx,
    vector axisPassOrder,
    bool realFFT,
    bool halfFloats
)

template
uint64_t getOutputBufferSizeConvolution(
    uint32_t numChannels,
    vector inputDimensions,
    vector kernelDimensions,
    uint16_t passIx,
    vector axisPassOrder,
    bool realFFT,
    bool halfFloats
)
```
in the `fft` namespace, which yield the size (in bytes) required to store the result of an FFT of a signal with `numChannels` channels of size `inputDimensions` after running the FFT along the axis `axisPassOrder[passIx]` (if you don't provide this order, it's assumed to be `xyzw`). This assumes that you don't run or store any unnecessary FFTs, since with wrapping modes it's always possible to recover the result in the padding area (sampling outside of $[0,1)$ along some axis).

It furthermore takes an argument `realFFT` which, if true, means you are doing an FFT on a real signal AND you want to store the output of the FFT along the first axis in a compact manner (knowing that FFTs of real signals are conjugate-symmetric). By default it assumes your complex numbers have `float32_t` scalars; setting `halfFloats` to true means you're using `float16_t` scalars.

`getOutputBufferSizeConvolution` furthermore takes a `kernelDimensions` argument. When convolving a signal against a kernel, the FFT has some extra padding to consider, so these methods are different.

### Figuring out compile-time parameters
We provide a
```cpp
OptimalFFTParameters optimalFFTParameters(uint32_t maxWorkgroupSize, uint32_t inputArrayLength);
```
function in the `workgroup::fft` namespace, which yields possible values for `ElementsPerInvocationLog2` and `WorkgroupSizeLog2` you might want to use to instantiate a `ConstevalParameters` struct, packed in an `OptimalFFTParameters` struct.
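As a usage sketch (the field names of `OptimalFFTParameters` are an assumption here, not taken from the headers):

```cpp
// Hypothetical host-side usage: ask for parameters that fit the device, then use
// the two log2 values to instantiate the shader's ConstevalParameters.
const workgroup::fft::OptimalFFTParameters params =
    workgroup::fft::optimalFFTParameters(maxWorkgroupSize, inputArrayLength);
// params.elementsPerInvocationLog2 and params.workgroupSizeLog2 (names assumed)
// would then be passed to the shader, e.g. via -D defines at compile time
```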
By default, we prefer to use only 2 elements per invocation when possible, and only use more if
$2 \cdot \text{maxWorkgroupSize} < \text{inputArrayLength}$. This is because using more elements per thread either results in more accesses to the array via the `Accessor` or, if using preloaded accessors, it results in lower occupancy.

`inputArrayLength` can be arbitrary, but please do note that the parameters returned will be for running an FFT on an array of length `roundUpToPoT(inputArrayLength)`, and YOU are responsible for padding your data up to that size.

You are, of course, free to choose whatever `ConstevalParameters` are better for your use case; this is just a default.

### Indexing
We made some decisions in the design of the FFT algorithm pertaining to load/store order. In particular, we wanted to keep stores linear to minimize cache misses when writing the output of an FFT. As such, the output of the FFT is neither in its normal order nor in bitreversed order (which is the standard for Cooley-Tukey implementations). Instead, it's in what we will refer to as Nabla order going forward. The Nabla order allows for coalesced writes of the output, and is essentially the "natural order" of the output of our algorithm, meaning it's the order of the output that doesn't require incurring any extra reordering operations.

This whole discussion applies to our implementation of the forward FFT only. We have not yet implemented the same functions for the inverse FFT since we didn't have a need for them. A discussion of how to compute the Nabla order for the forward FFT and a proof for it can be found in the [maintainers section of the FFT Readme](https://github.com/Devsh-Graphics-Programming/Nabla/blob/master/include/nbl/builtin/hlsl/fft/README.md#bit-ordering-of-the-nabla-fft).

The result of a forward FFT will be referred to as an $\text{NFFT}$ (N for Nabla). This $\text{NFFT}$ contains the same elements as the (properly-ordered) $\text{DFT}$ of the same signal, just in Nabla order. We provide a struct
```cpp
template
struct FFTIndexingUtils;
```
that automatically handles the math for you in case you want to go from one order to the other. It provides the following methods:

* `uint32_t getDFTIndex(uint32_t outputIdx)`: given an index $\text{outputIdx}$ into the $\text{NFFT}$, it yields its corresponding $\text{freqIdx}$ into the $\text{DFT}$, such that

   $\text{DFT}[\text{freqIdx}] = \text{NFFT}[\text{outputIdx}]$
* `uint32_t getNablaIndex(uint32_t freqIdx)`: given an index $\text{freqIdx}$ into the $\text{DFT}$, it yields its corresponding $\text{outputIdx}$ into the $\text{NFFT}$, such that

   $\text{DFT}[\text{freqIdx}] = \text{NFFT}[\text{outputIdx}]$. It's essentially just the inverse of the previous method.
* `uint32_t getDFTMirrorIndex(uint32_t freqIdx)`: a common operation you might encounter when using FFTs (especially FFTs of real signals) is to get the mirror around the middle (Nyquist frequency) of a given frequency. Given an index $\text{freqIdx}$ into the $\text{DFT}$, it returns a $\text{mirrorIndex}$ which is the index of its mirrored frequency, satisfying the equation

   $\text{freqIdx} + \text{mirrorIndex} \equiv 0 \mod \text{FFTLength}$. Two elements don't have proper mirrors and are fixed points of this function: the Zero (index $0$ in the $\text{DFT}$) and Nyquist (index $\frac{\text{FFTLength}}{2}$ in the $\text{DFT}$) frequencies.
* `uint32_t getNablaMirrorIndex(uint32_t outputIdx)`: yields the same as above, but the input and output are given in Nabla order. This is not to say we mirror $\text{outputIdx}$ around the middle frequency of the Nabla-ordered array (that operation makes zero sense), but rather that this function is just $\text{getNablaIndex}\circ\text{getDFTMirrorIndex}\circ\text{getDFTIndex}$. That is, get the corresponding index in the proper $\text{DFT}$ order, mirror THAT index around Nyquist, then go back to Nabla order.

For the next struct and its functions, let's give an example of where you might need them first. Suppose you packed two real signals $x, y$ as $x + iy$ and did a single FFT to save compute. Now you might want to unpack them to get the FFTs of each signal. If you had the $\text{DFT}$ in the right order, unpacking requires having the values $\text{DFT}[T]$ and $\text{DFT}[-T]$ to unpack the values for each FFT at those positions.

Suppose as well that you are using preloaded accessors, so the whole result of the FFT is currently resident in registers for threads in a workgroup. Each element a thread is currently holding is associated with a unique $\text{globalElementIndex}$, and to unpack some value a thread needs to know both $\text{NFFT}[\text{globalElementIndex}]$ and $\text{NFFT}[\text{getNablaMirrorIndex}(\text{globalElementIndex})]$.

Usually what you'd want to do is iterate over every $\text{localElementIndex}$ (which is associated with a $\text{globalElementIndex}$), get its mirror, and do an unpack operation (an example of this is done in the Bloom example). To get said mirror, we do a workgroup shuffle: with a shared memory array $\text{A}$, each thread with thread ID $\text{threadID}$ in a workgroup writes an element at $\text{A}[\text{threadID}]$ and reads a value from $\text{A}[\text{otherThreadID}]$, where $\text{otherThreadID}$ is the ID of the thread holding the element $\text{NFFT}[\text{getNablaMirrorIndex}(\text{globalElementIndex})]$ (again, see the Bloom example for an example of this).

This works assuming that each workgroup shuffle is associated with the same $\text{localElementIndex}$ for every thread - that is, every thread goes over its elements in the same order at the same time. The question now becomes: how does a thread know which value it has to send in this shuffle?

The functions
```cpp
NablaMirrorLocalInfo FFTMirrorTradeUtils::getNablaMirrorLocalInfo(uint32_t globalElementIndex);

NablaMirrorGlobalInfo FFTMirrorTradeUtils::getNablaMirrorGlobalInfo(uint32_t globalElementIndex);
```
handle this for you: given a $\text{globalElementIndex}$, `getNablaMirrorLocalInfo` returns a struct with a field `otherThreadID` (the one we will receive a value from in the shuffle) and a field `mirrorLocalIndex`, which is the $\text{localElementIndex}$ *of the element we should write to the shared memory array*.

`getNablaMirrorGlobalInfo` returns the same info but with a `mirrorGlobalIndex` instead, so instead of returning the $\text{localElementIndex}$ of the element we have to send, it returns its $\text{globalElementIndex}$.

In case this is hard to follow, you can copy the template function we use to trade mirrors around in `fft_mirror_common.hlsl` in the Bloom example.
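As a rough sketch of what one such trade could look like (local names like `globalIndexOf` and `unpack` are hypothetical, and we elide how a complex value gets split into the shared memory array's `uint32_t` DWORDs):

```cpp
// One shuffle round per localElementIndex; every thread runs the rounds in the
// same order, so each round trades mirrors of the same local element.
for (uint32_t localElementIndex = 0; localElementIndex < ElementsPerInvocation; localElementIndex++)
{
    const uint32_t globalElementIndex = globalIndexOf(threadID, localElementIndex); // hypothetical helper
    const NablaMirrorLocalInfo info = FFTMirrorTradeUtils::getNablaMirrorLocalInfo(globalElementIndex);

    // write the element our partner needs at our own slot ...
    sharedmemAccessor.set(threadID, preloaded[info.mirrorLocalIndex]);
    sharedmemAccessor.workgroupExecutionAndMemoryBarrier();
    // ... and read our mirror from the thread that holds it
    complex_t<Scalar> mirror;
    sharedmemAccessor.get(info.otherThreadID, mirror);

    unpack(preloaded[localElementIndex], mirror); // hypothetical unpacking operation

    // make sure everyone is done reading before the next round overwrites the array
    sharedmemAccessor.workgroupExecutionAndMemoryBarrier();
}
```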
## Results

We mentioned these already in the Optimization 7 section, but our FFT Bloom runs on an RTX 4060 in $0.57 \; \text{ms}$ (for a `1280x720` image with a `256x256` kernel) and in $1.04 \; \text{ms}$ for the same image with a `512x512` kernel, taking the best-running case for each kernel as discussed in that section.

For reference, [Froyok's implementation of CoD Bloom](https://github.com/Froyok/Bloom) takes $0.16 \; \text{ms}$ to run on an image of the same size, while our [Prefix Sum based Blur](https://github.com/Devsh-Graphics-Programming/Nabla-Examples-and-Tests/tree/master/26_Blur) takes $1.27 \; \text{ms}$ (blog post on that in the works).

When moving up to a `1920x1080` image, the time taken skyrockets to $4.4 \; \text{ms}$ regardless of kernel size or which axis the FFT is run along first. Froyok's Bloom takes $0.2 \; \text{ms}$ for that size, while our Prefix Sum based Blur takes $2.59 \; \text{ms}$.

The FFT Convolution does have some advantages over the Prefix Sum Blur: the former requires a constant amount of workgroup-accessible memory, regardless of input size, while the latter requires an amount that's proportional to the length of scanlines in the image. Also, the latter can only do radially symmetric kernels, while the FFT allows for arbitrarily shaped kernels. For example, you could convolve an image with a heart-shaped kernel:

![Heart](heart.png "Convolution with a heart-shaped kernel")

Analyzing the shaders in Nsight, we see that we get perfect occupancy for the first FFT pass (first shader), using $40$ registers for a theoretical max warp occupancy of $48$. I was also getting perfect occupancy on the last IFFT pass (third shader) until two days ago; without changing any code, I'm now getting a usage of $42$ registers, which is just $1$ over the perfect-occupancy limit, which is a bummer. With a little more optimization it might be possible to bring it back down to $40$ and achieve perfect occupancy again. The second shader, which does FFT + Hadamard + IFFT, uses $59$ registers, yielding the same theoretical max warp occupancy of $32$ for both of the last two shaders.

Out of a full pass, the first shader only takes $16\%$ of the time, the last shader takes $36\%$, and the remaining time is taken by the second shader.

Looking for bottlenecks, we find that $22\%$ of stalls in the second shader are LGSB (`long_scoreboard_not_issued`), likely due to reads thrashing the cache: threads in the same workgroup don't sample the kernel spectrum in a locally-coherent manner but rather sample it all over the place, because the image spectrum is in a weird mix of Nabla order along one axis and bitreversed along the other, while the kernel is in natural DFT order.

This suggests that it might be worth it to reorder the spectrum of the image (reordering after the FFT along each axis) so that we get the spectrum in natural order, sample the kernel spectrum coherently, and then reorder it again before the IFFTs. Of course, this would likely increase THDBAR stalls (waiting on barriers), which are already quite high.

Speaking of such barriers, for each shader in order these represent $16\%, 18\%$ and $16\%$ of stalls. A LOT of these barriers happen when shuffling elements around in unpacking operations. They are necessary to prevent data races, but frankly speaking, the time taken between each barrier was usually enough (on my GPU, at least) for all threads in the workgroup to catch up, so the image was always correct.
So, at least on my GPU, you could cut down the time taken by Bloom by removing a bunch of these barriers.

## Future Work

In between shaders, we optimized for coalesced writes. That is, along the first axis we do a coalesced write after an FFT or IFFT, which makes the next shader have to read non-coalesced. We did not try using images in optimal tiling for this intermediate storage: these are usually stored as Z-order buffers, so they *might* be better since you avoid thrashing the cache on both reads and writes, at the cost of none of them being coalesced.

There was an idea of skipping the zero-padding and getting rid of the need for scratch memory by doing the FFT entirely in-place, abusing spectral upsampling to provide the border, but:

* We would need to perform arbitrarily-sized FFTs, not just PoT. Hard to code, especially if the size can be specified dynamically.
* Ringing would worsen.
* We would waste shared memory and introduce an $O(\text{upscaleFactor}^2)$ cost for each invocation doing the Hadamard product.

Matt had also experimented with decomposing the bloom into a low-resolution FFT + high-resolution naive convolution filter, but it didn't work for arbitrary kernels. It starts by doing

$\text{Kernel} * \text{Image} \approx \text{Image} * \text{SmallKernel} + \text{Downsample}(\text{Image}) * \text{ModifiedSmallKernel}$

Downsampling happens as a convolution with a $\text{sinc}$-like function, so

$\text{Downsample}(\text{Image}) = \text{Image} * \text{Sinc-like}$

which means the convolution between image and kernel then becomes

$F(\text{Image}) \cdot F(\text{SmallKernel}) + F(\text{Image}) \cdot \text{Box-like} \cdot F(\text{ModifiedSmallKernel})$

in the spectral domain. Equating this to $F(\text{Kernel}) \cdot F(\text{Image})$ yields

$F(\text{Kernel}) = F(\text{SmallKernel}) + \text{Box-like} \cdot F(\text{ModifiedSmallKernel})$

and since the box-like function cuts off higher frequencies, you'd ideally have

$F(\text{SmallKernel}) = F(\text{Kernel}) \text{ if } k > \text{Downsampled size}$

However, downsampling introduces aliasing, and the upsampling of the result does as well (since it's not done with $\text{sinc}$), so in practice it gets tricky to find what the $\text{SmallKernel}$ and $\text{ModifiedSmallKernel}$ should be.

Last but not least, the most promising optimization: mixed-radix FFTs. Right now, since we only implement Radix-2 Cooley-Tukey, you need to pad up to the next power of two to run it, which in the worst case is almost 2x the size of your original array, and in 2D this scales to almost 4x. For example, to run an FFT on a $2049$-long array, it will be padded to $4096$. With a workgroup size of $512$ this would have you run $8$ Workgroup-sized FFTs + $3$ "bigger-than-Workgroup" stages.

If we had arbitrarily-sized radices (or at least some small primes, like $3, 5, 7$) we could, for example, only pad up to $2560 = 512 \cdot 5$, run a single Radix-5 "bigger than Workgroup"-sized stage, and then run $5$ Radix-2 Workgroup-sized FFTs.
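As a toy illustration of how much padding that saves (a host-side sketch, not part of Nabla): pick the smallest padded length of the form $2^a \cdot r$ over a handful of supported radices.

```cpp
#include <cstdint>

// Smallest candidate of the form r * 2^a (r an odd radix we'd support) that
// fits n. With radices {1} only, this degenerates to round-up-to-PoT.
uint64_t mixedRadixPad(const uint64_t n)
{
    const uint64_t radices[] = {1, 3, 5, 7};
    uint64_t best = ~0ull;
    for (uint64_t candidate : radices)
    {
        while (candidate < n)
            candidate *= 2; // only grow via the Radix-2 stages
        if (candidate < best)
            best = candidate;
    }
    return best; // e.g. mixedRadixPad(2049) == 2560 == 5 * 512
}
```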
+ diff --git a/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/kernel.png b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/kernel.png new file mode 100644 index 0000000..687f03c Binary files /dev/null and b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/kernel.png differ diff --git a/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/kernel_small.png b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/kernel_small.png new file mode 100644 index 0000000..5f45c30 Binary files /dev/null and b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/kernel_small.png differ diff --git a/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/laplacian.png b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/laplacian.png new file mode 100644 index 0000000..39f33c2 Binary files /dev/null and b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/laplacian.png differ diff --git a/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/less_ring.png b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/less_ring.png new file mode 100644 index 0000000..9a65a3c Binary files /dev/null and b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/less_ring.png differ diff --git a/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/mirror_padding.png b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/mirror_padding.png new file mode 100644 index 0000000..3d0ef4f Binary files /dev/null and b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/mirror_padding.png differ diff --git a/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/mirror_padding_artifact.png b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/mirror_padding_artifact.png new file mode 100644 index 0000000..5baa637 Binary files /dev/null and b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/mirror_padding_artifact.png differ diff --git a/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/noring.png b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/noring.png new file mode 100644 index 0000000..c362449 Binary files /dev/null and b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/noring.png differ diff --git a/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/padded_kernel.png b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/padded_kernel.png new file mode 100644 index 0000000..42b5456 Binary files /dev/null and b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/padded_kernel.png differ diff --git a/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/padded_kernel_shifted.png b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/padded_kernel_shifted.png new file mode 100644 index 0000000..dd8a03c Binary files /dev/null and b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/padded_kernel_shifted.png differ diff --git a/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/ring_whitepoint.png b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/ring_whitepoint.png new file mode 100644 index 0000000..2edf835 Binary files /dev/null and b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/ring_whitepoint.png differ diff --git a/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/sinc.png b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/sinc.png new file mode 100644 index 0000000..ac20277 Binary 
files /dev/null and b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/sinc.png differ diff --git a/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/unpadded.png b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/unpadded.png new file mode 100644 index 0000000..a3efd62 Binary files /dev/null and b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/unpadded.png differ diff --git a/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/unshifted.png b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/unshifted.png new file mode 100644 index 0000000..8a2c266 Binary files /dev/null and b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/unshifted.png differ diff --git a/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/wrapped.png b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/wrapped.png new file mode 100644 index 0000000..eec1793 Binary files /dev/null and b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/wrapped.png differ diff --git a/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/zero_padded.png b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/zero_padded.png new file mode 100644 index 0000000..8470d23 Binary files /dev/null and b/blog/2025/2025-01-24-fft-bloom-optimized-to-the-bone-in-nabla/zero_padded.png differ diff --git a/blog/2025/2025-05-09-blender-baking/bake-influence.webp b/blog/2025/2025-05-09-blender-baking/bake-influence.webp new file mode 100644 index 0000000..abca7ee Binary files /dev/null and b/blog/2025/2025-05-09-blender-baking/bake-influence.webp differ diff --git a/blog/2025/2025-05-09-blender-baking/bake-panel.webp b/blog/2025/2025-05-09-blender-baking/bake-panel.webp new file mode 100644 index 0000000..0ea6c03 Binary files /dev/null and b/blog/2025/2025-05-09-blender-baking/bake-panel.webp differ diff --git a/blog/2025/2025-05-09-blender-baking/bake-type-diffuse.webp b/blog/2025/2025-05-09-blender-baking/bake-type-diffuse.webp new file mode 100644 index 0000000..04829f2 Binary files /dev/null and b/blog/2025/2025-05-09-blender-baking/bake-type-diffuse.webp differ diff --git a/blog/2025/2025-05-09-blender-baking/bake-type-rough.webp b/blog/2025/2025-05-09-blender-baking/bake-type-rough.webp new file mode 100644 index 0000000..6a93ca1 Binary files /dev/null and b/blog/2025/2025-05-09-blender-baking/bake-type-rough.webp differ diff --git a/blog/2025/2025-05-09-blender-baking/bowl-rendered.webp b/blog/2025/2025-05-09-blender-baking/bowl-rendered.webp new file mode 100644 index 0000000..3736686 Binary files /dev/null and b/blog/2025/2025-05-09-blender-baking/bowl-rendered.webp differ diff --git a/blog/2025/2025-05-09-blender-baking/bumpy-bowl.webp b/blog/2025/2025-05-09-blender-baking/bumpy-bowl.webp new file mode 100644 index 0000000..99a9111 Binary files /dev/null and b/blog/2025/2025-05-09-blender-baking/bumpy-bowl.webp differ diff --git a/blog/2025/2025-05-09-blender-baking/compare-materials.webp b/blog/2025/2025-05-09-blender-baking/compare-materials.webp new file mode 100644 index 0000000..fcfb8ba Binary files /dev/null and b/blog/2025/2025-05-09-blender-baking/compare-materials.webp differ diff --git a/blog/2025/2025-05-09-blender-baking/compare-polygons.webp b/blog/2025/2025-05-09-blender-baking/compare-polygons.webp new file mode 100644 index 0000000..d4d0b55 Binary files /dev/null and b/blog/2025/2025-05-09-blender-baking/compare-polygons.webp differ diff --git 
a/blog/2025/2025-05-09-blender-baking/complete-bake.webp b/blog/2025/2025-05-09-blender-baking/complete-bake.webp new file mode 100644 index 0000000..ac5009d Binary files /dev/null and b/blog/2025/2025-05-09-blender-baking/complete-bake.webp differ diff --git a/blog/2025/2025-05-09-blender-baking/emission-colour.webp b/blog/2025/2025-05-09-blender-baking/emission-colour.webp new file mode 100644 index 0000000..0e88a7f Binary files /dev/null and b/blog/2025/2025-05-09-blender-baking/emission-colour.webp differ diff --git a/blog/2025/2025-05-09-blender-baking/extrusion-and-max-ray-distance.webp b/blog/2025/2025-05-09-blender-baking/extrusion-and-max-ray-distance.webp new file mode 100644 index 0000000..449ccff Binary files /dev/null and b/blog/2025/2025-05-09-blender-baking/extrusion-and-max-ray-distance.webp differ diff --git a/blog/2025/2025-05-09-blender-baking/good-values.webp b/blog/2025/2025-05-09-blender-baking/good-values.webp new file mode 100644 index 0000000..21244da Binary files /dev/null and b/blog/2025/2025-05-09-blender-baking/good-values.webp differ diff --git a/blog/2025/2025-05-09-blender-baking/hatchet.webp b/blog/2025/2025-05-09-blender-baking/hatchet.webp new file mode 100644 index 0000000..3090599 Binary files /dev/null and b/blog/2025/2025-05-09-blender-baking/hatchet.webp differ diff --git a/blog/2025/2025-05-09-blender-baking/high-poly-blender.webp b/blog/2025/2025-05-09-blender-baking/high-poly-blender.webp new file mode 100644 index 0000000..6e2c669 Binary files /dev/null and b/blog/2025/2025-05-09-blender-baking/high-poly-blender.webp differ diff --git a/blog/2025/2025-05-09-blender-baking/hit-bake.webp b/blog/2025/2025-05-09-blender-baking/hit-bake.webp new file mode 100644 index 0000000..57e79fe Binary files /dev/null and b/blog/2025/2025-05-09-blender-baking/hit-bake.webp differ diff --git a/blog/2025/2025-05-09-blender-baking/index.md b/blog/2025/2025-05-09-blender-baking/index.md new file mode 100644 index 0000000..e1aa86e --- /dev/null +++ b/blog/2025/2025-05-09-blender-baking/index.md @@ -0,0 +1,222 @@
---
title: 'The one true guide to baking materials in Blender'
slug: 'blender-baking'
description: 'How to get nice materials on low poly objects'
date: '2025-05-09'
authors: ['jaked']
tags: ['blender', 'baking', 'normal-maps', 'article', 'tutorial']
---

Real time rendering performance is often limited by 3D assets as much as it's limited by code. Good low poly assets inevitably rely on baking, the process of transferring details from a high poly mesh with a complex material to a low poly one with a much simpler material. Unfortunately, there seems to be a lack of info about baking out there, and especially in Blender things can sometimes be a bit unintuitive. In the process of working on my game, A Short Odyssey (ASO), I came up with a workflow that works quite well for me, so I will share it with you today.

For this tutorial we are going to use this wooden bowl model from the fantastic website [Polyhaven](https://polyhaven.com/a/wooden_bowl_02).

![Bowl Rendered](bowl-rendered.webp)

As with all of the free CC0 models on Polyhaven, this mesh has a fairly high number of small triangles. 4,666 to be exact. While that may not seem like a lot, think about how big it is likely to be in a real time scene. Most of the time the entire bowl might only be a few pixels tall!
On top of that, small triangles are much more expensive than large triangles (due to [quad occupancy](https://blog.selfshadow.com/2012/11/12/counting-quads/)). This is probably something we should deal with.

![Small Bowl](small-bowl.webp)

Now that we understand why we must bake, let's go ahead and do it.

# Preparing for the Bake

Open up your high poly model in Blender. I am using Blender 4.4. Other versions should work, but your UI might not match up exactly with this tutorial.

![High Poly Blender](high-poly-blender.webp)

You will then need a low poly version of the model. How to create a low poly model is outside the scope of this tutorial, but it *must* be UV-unwrapped before proceeding, and none of the polygons should be overlapping on the UV map.

![Low Poly Blender](low-poly-blender.webp)

My low-poly version uses only 272 triangles, or roughly 5.8% of the original number. These are also bigger triangles, so they should have much better quad occupancy on the GPU.

![overlapping models](overlapping-models.webp)

The first thing you need to do is make sure the high and low-poly models are directly on top of each other, just like in the image above. You must also ensure the scale of the low poly version is exactly 1.0 on all axes.

![Unit Scale](unit-scale.webp)

If it is not, you can apply the scale with Ctrl+A -> Apply -> Scale while the low poly object is selected in object mode.

![triangulate](triangulate.webp)

Next, you need to add a triangulate modifier to the low poly object. The exact options you pick here don't really matter, but if you change them after the bake you must re-bake all maps.

# Creating a Bake Target Proxy

The main way my workflow differs from what I've seen elsewhere is the use of a Bake Target or Proxy. This is not strictly necessary, but it makes the entire process far less frustrating if you need to run the baking process more than once, which you inevitably will. It involves creating a linked duplicate of our low poly object, which will allow you to preview the bake results without having to mess around with the shader nodes and reconnect things between bakes.

![Linked Duplicate](linked-duplicate.webp)

To create a linked duplicate, simply select your low-poly object and hit Alt+D. You can then move your linked duplicate off to the side somewhere.

![Outliner Names](outliner-names.webp)

I'm going to name the new object `Low Poly` and the first one `Bake Target` (the names don't matter, but it's nice to be organized).

![Object Data](object-data.webp)

This next part is very important: you must set the `Bake Target` to source its materials from "Object" instead of "Data", so that the two linked objects can have different materials. This is done as shown above in the material tab for the `Bake Target` object.

You can then create a material for it, which I will also call `Bake Target`. I will also create a new material for the `Low Poly` object and call it `Low Poly`.

# Setting up Materials

![Shading Tab](shading-tab.webp)

The rest of this process will be done in the shading tab, so we can switch there.

![Material Nodes](material-nodes.webp)

With the bake target selected, we will add 3 texture nodes to its material. Because I'm using a PBR workflow, these will be Albedo, Normal & Roughness (I will get into metalness later in this tutorial). These texture nodes should have their colour space set to "sRGB" for the Albedo and "Non-Color" for the others.
You should NOT connect these nodes to anything.

![Pasted Nodes](pasted-nodes.webp)

You can then copy & paste these nodes into the material for the `Low Poly` object. Then connect the nodes as shown here.

![Invert Green](invert-green.webp)

If you use DirectX style normal maps (like I do in ASO), you will need to add an "RGB Curves" node with the green channel flipped in order to invert the green channel of the normal map.

![Weird Shiny](weird-shiny.webp)

Your low poly will look weird and shiny. That is because our baked textures are all black at the moment, which is OK: it will look correct after we are done baking.

Now that everything is set up, we can start looking at the actual baking UI.

# The Baking UI

![Render Panel](render-panel.webp)

Baking is accessed through the Render tab on the properties panel.

![Render Engine](render-engine.webp)

In order to see the bake options you need to set the Render Engine to "Cycles". You probably also want to set Device to "GPU Compute" in order to speed things up.

![Bake Panel](bake-panel.webp)

Expanding the bake controls will give you access to several new options.

![Normal Baking](normal-baking.webp)

We will start by baking the normal map. To do so, we must first select "Normal" from the Bake Type combo box. You will also want to check "Selected to Active". For users of DirectX style normal maps, like myself, you will also need to set the G channel to "-Y". If you are using OpenGL style normal maps you can leave it as is.

# Performing the Bake

![Selected To Active](selected-to-active.webp)

OK, it's finally bake time. Select your high poly asset, then press Ctrl and select your `Bake Target`; this sets the high poly as selected and your `Bake Target` as active. If everything is selected correctly, your outliner should look like the image above, with a dark orange highlight on the high poly object and bright orange for the `Bake Target`.

![Select Normal Node](select-normal-node.webp)

Now select the normal map texture node in the shader nodes for the current material; this tells Blender to use it as the destination for baking.

![Hit Bake](hit-bake.webp)

We can finally hit bake!

![Messed Up Bake](messed-up-bake.webp)

After some amount of processing time, you should see a preview of the normal map. There is also a 99% chance it will be messed up in some way.

![Messed Up Bake Normals](messed-up-bake-normals.webp)

As you can see looking at our `Low Poly` object, something is very off.

![Extrusion And Max Ray Distance](extrusion-and-max-ray-distance.webp)

The solution to this problem is adjusting two very important parameters for baking: "Extrusion" and "Max Ray Distance".

In Blender, baking works by shooting out rays from the Bake Target. Since our low poly mesh doesn't lie completely outside the surface of the high poly object, Blender needs to effectively extrude the surfaces of the target outward so that the high poly object is completely contained within the low poly one. The amount it extrudes by is the "Extrusion", and the length of the rays is the "Max Ray Distance".

Now of course you are probably wondering at this point: how do I know what to set these numbers to? My rule of thumb is to set the Extrusion to the smallest value you can that makes the green pixels in the normal map go away, then set the Max Ray Distance to ~1.5-2 times the Extrusion.

![Good Values](good-values.webp)

In this case 0.1 and 0.2 are good values.
![Not Enough Distance](not-enough-distance.webp)

If the Max Ray Distance were too low, e.g. 0.1, we would get holes in our normal map as shown above.

![Perfect Normals](perfect-normals.webp)

If our values are set properly, we get a nice normal map without any artifacts.

![Bumpy Bowl](bumpy-bowl.webp)

We can also now look at our `Low Poly` object and see that it looks nice and bumpy. But there is one tiny problem: it's far too shiny! This is because its roughness map is entirely black, i.e. 0.0, which corresponds to a mirror-like shine. So of course our next step should be to bake a roughness map.

# Baking a Roughness Map

![Select Roughness](select-roughness.webp)

With your selection still on the high poly and your active object still the bake target, select the roughness map texture node in the shader node editor.

![Bake Type Rough](bake-type-rough.webp)

Select "Roughness" for Bake Type and hit Bake again.

![Roughness Result](roughness-result.webp)

After waiting for the bake to complete, we now have a roughness map and the shininess of our bowl looks correct. Last but certainly not least, we need to bake albedo. This is the actual surface colour of our object.

# Baking an Albedo Map

![Select Albedo](select-albedo.webp)

Just as before, we need to select the Albedo texture node in the shader node editor.

![Bake Type Diffuse](bake-type-diffuse.webp)

We set the Bake Type to "Diffuse" this time, but there is one more thing before you bake!

![Bake Influence](bake-influence.webp)

Below the bake button, under "Influence", you must uncheck "Direct" and "Indirect"; otherwise Blender will bake the lighting into your albedo texture. Now we can hit Bake.

![Complete Bake](complete-bake.webp)

If everything went well, our bowl now has a complete material!

![Compare Materials](compare-materials.webp)

Our low poly now looks much more like the high poly one.

![Compare Polygons](compare-polygons.webp)

Even though their polygon counts are radically different.

![Save Textures](save-textures.webp)

Now, before I finish, I need to remind you to save your textures; for some reason Blender doesn't do this automatically for you. You can do it from the hamburger menu in the "Image Editor" under Image -> Save. This must be done for each of your textures.

There we go, that's it! That's how to bake full materials in Blender!

# A Note on Metalness

There is, however, one tiny consideration for metallic materials. For some reason, if your high poly object has any metal on it whatsoever, it will completely break everything when baking. Luckily, there is a workaround.

![Hatchet](hatchet.webp)

Let's use this hatchet as an example. You need to take the metallic parameter for the high poly mesh's material and hook it up to the *Emission Color* output.

![Emission Colour](emission-colour.webp)

Because ASO packs roughness and metal together, I'm gonna send both through the Emission color using a "Combine Color" node (note: ASO uses R = Roughness, G = Metal, which is different from glTF). All you do now is locate the correct texture in your `Bake Target` material, and instead of baking Metal and Roughness, you bake using "Emission" as the bake type.

# Considerations for Mirrored Objects

![Mirror Modifier](mirror-modifier.webp)

If your low poly object has a mirror modifier, like the hatchet from the metal section, there is one more thing to be aware of. You should set the UV coordinate offset to 1.0 for either U or V.
This will ensure the mirrored geometry generates UV coordinates that do not overlap with the ones we already have, which would have caused problems during the bake. + +# The End + +Hope you enjoyed this tutorial! If you found it useful or wanna know about my game A Short Odyssey, please wishlist it on Steam: https://store.steampowered.com/app/2818690/A_Short_Odyssey + + diff --git a/blog/2025/2025-05-09-blender-baking/invert-green.webp b/blog/2025/2025-05-09-blender-baking/invert-green.webp new file mode 100644 index 0000000..3b941b2 Binary files /dev/null and b/blog/2025/2025-05-09-blender-baking/invert-green.webp differ diff --git a/blog/2025/2025-05-09-blender-baking/linked-duplicate.webp b/blog/2025/2025-05-09-blender-baking/linked-duplicate.webp new file mode 100644 index 0000000..81cc85d Binary files /dev/null and b/blog/2025/2025-05-09-blender-baking/linked-duplicate.webp differ diff --git a/blog/2025/2025-05-09-blender-baking/low-poly-blender.webp b/blog/2025/2025-05-09-blender-baking/low-poly-blender.webp new file mode 100644 index 0000000..f3570bd Binary files /dev/null and b/blog/2025/2025-05-09-blender-baking/low-poly-blender.webp differ diff --git a/blog/2025/2025-05-09-blender-baking/material-nodes.webp b/blog/2025/2025-05-09-blender-baking/material-nodes.webp new file mode 100644 index 0000000..132860d Binary files /dev/null and b/blog/2025/2025-05-09-blender-baking/material-nodes.webp differ diff --git a/blog/2025/2025-05-09-blender-baking/messed-up-bake-normals.webp b/blog/2025/2025-05-09-blender-baking/messed-up-bake-normals.webp new file mode 100644 index 0000000..a531eda Binary files /dev/null and b/blog/2025/2025-05-09-blender-baking/messed-up-bake-normals.webp differ diff --git a/blog/2025/2025-05-09-blender-baking/messed-up-bake.webp b/blog/2025/2025-05-09-blender-baking/messed-up-bake.webp new file mode 100644 index 0000000..c54693d Binary files /dev/null and b/blog/2025/2025-05-09-blender-baking/messed-up-bake.webp differ diff --git a/blog/2025/2025-05-09-blender-baking/mirror-modifier.webp b/blog/2025/2025-05-09-blender-baking/mirror-modifier.webp new file mode 100644 index 0000000..cfd1140 Binary files /dev/null and b/blog/2025/2025-05-09-blender-baking/mirror-modifier.webp differ diff --git a/blog/2025/2025-05-09-blender-baking/normal-baking.webp b/blog/2025/2025-05-09-blender-baking/normal-baking.webp new file mode 100644 index 0000000..be97e7b Binary files /dev/null and b/blog/2025/2025-05-09-blender-baking/normal-baking.webp differ diff --git a/blog/2025/2025-05-09-blender-baking/not-enough-distance.webp b/blog/2025/2025-05-09-blender-baking/not-enough-distance.webp new file mode 100644 index 0000000..578a5ff Binary files /dev/null and b/blog/2025/2025-05-09-blender-baking/not-enough-distance.webp differ diff --git a/blog/2025/2025-05-09-blender-baking/object-data.webp b/blog/2025/2025-05-09-blender-baking/object-data.webp new file mode 100644 index 0000000..4d121cf Binary files /dev/null and b/blog/2025/2025-05-09-blender-baking/object-data.webp differ diff --git a/blog/2025/2025-05-09-blender-baking/outliner-names.webp b/blog/2025/2025-05-09-blender-baking/outliner-names.webp new file mode 100644 index 0000000..d48554a Binary files /dev/null and b/blog/2025/2025-05-09-blender-baking/outliner-names.webp differ diff --git a/blog/2025/2025-05-09-blender-baking/overlapping-models.webp b/blog/2025/2025-05-09-blender-baking/overlapping-models.webp new file mode 100644 index 0000000..021f6aa Binary files /dev/null and 
b/blog/2025/2025-05-09-blender-baking/overlapping-models.webp differ diff --git a/blog/2025/2025-05-09-blender-baking/pasted-nodes.webp b/blog/2025/2025-05-09-blender-baking/pasted-nodes.webp new file mode 100644 index 0000000..92600d3 Binary files /dev/null and b/blog/2025/2025-05-09-blender-baking/pasted-nodes.webp differ diff --git a/blog/2025/2025-05-09-blender-baking/perfect-normals.webp b/blog/2025/2025-05-09-blender-baking/perfect-normals.webp new file mode 100644 index 0000000..6714433 Binary files /dev/null and b/blog/2025/2025-05-09-blender-baking/perfect-normals.webp differ diff --git a/blog/2025/2025-05-09-blender-baking/render-engine.webp b/blog/2025/2025-05-09-blender-baking/render-engine.webp new file mode 100644 index 0000000..99f671f Binary files /dev/null and b/blog/2025/2025-05-09-blender-baking/render-engine.webp differ diff --git a/blog/2025/2025-05-09-blender-baking/render-panel.webp b/blog/2025/2025-05-09-blender-baking/render-panel.webp new file mode 100644 index 0000000..697d786 Binary files /dev/null and b/blog/2025/2025-05-09-blender-baking/render-panel.webp differ diff --git a/blog/2025/2025-05-09-blender-baking/roughness-result.webp b/blog/2025/2025-05-09-blender-baking/roughness-result.webp new file mode 100644 index 0000000..352c78b Binary files /dev/null and b/blog/2025/2025-05-09-blender-baking/roughness-result.webp differ diff --git a/blog/2025/2025-05-09-blender-baking/save-textures.webp b/blog/2025/2025-05-09-blender-baking/save-textures.webp new file mode 100644 index 0000000..66c4acf Binary files /dev/null and b/blog/2025/2025-05-09-blender-baking/save-textures.webp differ diff --git a/blog/2025/2025-05-09-blender-baking/select-albedo.webp b/blog/2025/2025-05-09-blender-baking/select-albedo.webp new file mode 100644 index 0000000..61d6300 Binary files /dev/null and b/blog/2025/2025-05-09-blender-baking/select-albedo.webp differ diff --git a/blog/2025/2025-05-09-blender-baking/select-normal-node.webp b/blog/2025/2025-05-09-blender-baking/select-normal-node.webp new file mode 100644 index 0000000..52ea975 Binary files /dev/null and b/blog/2025/2025-05-09-blender-baking/select-normal-node.webp differ diff --git a/blog/2025/2025-05-09-blender-baking/select-roughness.webp b/blog/2025/2025-05-09-blender-baking/select-roughness.webp new file mode 100644 index 0000000..81c0f1d Binary files /dev/null and b/blog/2025/2025-05-09-blender-baking/select-roughness.webp differ diff --git a/blog/2025/2025-05-09-blender-baking/selected-to-active.webp b/blog/2025/2025-05-09-blender-baking/selected-to-active.webp new file mode 100644 index 0000000..97b7d0f Binary files /dev/null and b/blog/2025/2025-05-09-blender-baking/selected-to-active.webp differ diff --git a/blog/2025/2025-05-09-blender-baking/shading-tab.webp b/blog/2025/2025-05-09-blender-baking/shading-tab.webp new file mode 100644 index 0000000..ef4ddc5 Binary files /dev/null and b/blog/2025/2025-05-09-blender-baking/shading-tab.webp differ diff --git a/blog/2025/2025-05-09-blender-baking/small-bowl.webp b/blog/2025/2025-05-09-blender-baking/small-bowl.webp new file mode 100644 index 0000000..df36585 Binary files /dev/null and b/blog/2025/2025-05-09-blender-baking/small-bowl.webp differ diff --git a/blog/2025/2025-05-09-blender-baking/triangulate.webp b/blog/2025/2025-05-09-blender-baking/triangulate.webp new file mode 100644 index 0000000..fe0c020 Binary files /dev/null and b/blog/2025/2025-05-09-blender-baking/triangulate.webp differ diff --git a/blog/2025/2025-05-09-blender-baking/unit-scale.webp 
b/blog/2025/2025-05-09-blender-baking/unit-scale.webp new file mode 100644 index 0000000..e191f47 Binary files /dev/null and b/blog/2025/2025-05-09-blender-baking/unit-scale.webp differ diff --git a/blog/2025/2025-05-09-blender-baking/weird-shiny.webp b/blog/2025/2025-05-09-blender-baking/weird-shiny.webp new file mode 100644 index 0000000..0357c2b Binary files /dev/null and b/blog/2025/2025-05-09-blender-baking/weird-shiny.webp differ diff --git a/blog/2025/2025-06-19-subgroup-shuffle-reconvergence-on-nvidia/index.md b/blog/2025/2025-06-19-subgroup-shuffle-reconvergence-on-nvidia/index.md new file mode 100644 index 0000000..9803e1b --- /dev/null +++ b/blog/2025/2025-06-19-subgroup-shuffle-reconvergence-on-nvidia/index.md @@ -0,0 +1,366 @@
---
title: 'Nvidia SPIR-V Compiler Bug or Do Subgroup Shuffle Operations Not Imply Execution Dependency?'
slug: 'subgroup-shuffle-execution-dependency-on-nvidia'
description: "A look at the behavior behind Nabla's subgroup scan"
date: '2025-06-19'
authors: ['keptsecret', 'devshgraphicsprogramming']
tags: ['nabla', 'vulkan', 'article']
last_update:
  date: '2025-06-19'
  author: keptsecret
---

Reduce and scan operations are core building blocks in the world of parallel computing, and now [Nabla has a new release](https://github.com/Devsh-Graphics-Programming/Nabla/tree/v0.6.2-alpha1) with those operations made even faster for Vulkan at the subgroup and workgroup levels.

This article takes a brief look at the Nabla implementation for reduce and scan on the GPU in Vulkan.

Then, I discuss a missing execution dependency expected for a subgroup shuffle operation, which was only a problem on Nvidia devices in some test cases.

## Reduce and Scan

Let's give a quick introduction, or a recap for those already familiar, to reduce and scan operations.

A reduction takes a binary associative operator $\bigoplus$ and an array of $n$ elements

$\left[x_0, x_1,...,x_{n-1}\right]$,

and returns

$x_0 \bigoplus x_1 \bigoplus ... \bigoplus x_{n-1}$.

In other words, when $\bigoplus$ is an addition, a reduction of the array $X$ is then the sum of all elements of array $X$.

```
Input:      4 6 2 3 7 1 0 5
Reduction:  28
```

A scan is a generalization of reduction, and takes a binary associative operator $\bigoplus$ with identity $I$ and an array of $n$ elements.
Then, for each element, it performs the reduction from the first element to the current element.
An _exclusive_ scan does so for all elements before the current element.

$\left[I, x_0, (x_0 \bigoplus x_1), ..., (x_0 \bigoplus x_1 \bigoplus ... \bigoplus x_{n-2})\right]$.

An _inclusive_ scan then includes the current element as well.

$\left[x_0, (x_0 \bigoplus x_1), ..., (x_0 \bigoplus x_1 \bigoplus ... \bigoplus x_{n-1})\right]$.

Notice the last element of the inclusive scan is the same as the reduction.

```
Input:      4 6  2  3  7  1  0  5
Exclusive:  0 4 10 12 15 22 23 23
Inclusive:  4 10 12 15 22 23 23 28
```

## Nabla's subgroup scans

We start with the most basic of building blocks: doing a reduction or a scan in the local subgroup of a Vulkan device.
Pretty simple actually, since Vulkan already supports subgroup arithmetic operations.
Nabla exposes this via the [GLSL compatibility header](https://github.com/Devsh-Graphics-Programming/Nabla/blob/v0.6.2-alpha1/include/nbl/builtin/hlsl/glsl_compat/subgroup_arithmetic.hlsl) built on top of [HLSL SPIR-V inline intrinsics](https://github.com/Devsh-Graphics-Programming/Nabla/blob/v0.6.2-alpha1/include/nbl/builtin/hlsl/spirv_intrinsics/subgroup_arithmetic.hlsl).

```cpp
nbl::hlsl::glsl::groupAdd(T value)
nbl::hlsl::glsl::groupInclusiveAdd(T value)
nbl::hlsl::glsl::groupExclusiveAdd(T value)
etc...
```

But wait, the SPIR-V-provided operations all require your Vulkan physical device to support the `GroupNonUniformArithmetic` capability. So Nabla provides emulated versions too, and both versions are exposed through a single set of templated structs.

```cpp
template<...>
struct inclusive_scan;

template<...>
struct exclusive_scan;

template<...>
struct reduction;
```

The implementation of the emulated subgroup scans makes use of subgroup shuffle operations to access partial sums from other invocations in the subgroup. This is based on the [Kogge–Stone adder (KSA)](https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda), using $\log_2 n$ steps where $n$ is the subgroup size with all lanes active. It should also be noted that in cases like this, where the SIMD/SIMT processor pays for all lanes regardless of whether or not they're active, the KSA design is faster than more theoretically work-efficient parallel scans like Blelloch's (which we use at the workgroup granularity).

```cpp
T inclusive_scan(T value)
{
    // first Kogge-Stone step: add the value from the invocation one lane below
    rhs = shuffleUp(value, 1)
    value = value + (firstInvocation ? identity : rhs)

    [unroll]
    for (i = 1; i < SubgroupSizeLog2; i++)
    {
        // each step doubles the distance over which partial sums are combined
        nextLevelStep = 1 << i
        rhs = shuffleUp(value, nextLevelStep)
        value = value + (nextLevelStep out of bounds ? identity : rhs)
    }
    return value
}
```

In addition, Nabla also supports passing vectors into these subgroup operations, so you can perform reductions or scans on up to subgroup size * 4 (for `vec4`) elements per call. Note that it expects the elements in the vectors to be consecutive and in the same order as the input array. This is because we've found through benchmarking that instructing the GPU to do a vector load/store results in faster performance than any attempt at a coalesced load/store with striding.

We also found shuffles and vector arithmetic to be very expensive, so minimizing data exchange between invocations and pre-scanning up to 4 elements within an invocation was significantly faster.

You can find all the implementations in the [Nabla repository](https://github.com/Devsh-Graphics-Programming/Nabla/blob/v0.6.2-alpha1/include/nbl/builtin/hlsl/subgroup2/arithmetic_portability_impl.hlsl).

## An issue with subgroup sync and reconvergence

Now, onto a pretty significant, but strangely obscure, problem that I ran into during unit testing prior to release. [See the unit tests.](https://github.com/Devsh-Graphics-Programming/Nabla-Examples-and-Tests/blob/master/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl) Nabla also has implementations for workgroup reduce and scan operations that make use of the subgroup scans above, and one such section looks like this.

```cpp
... workgroup scan code ...
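// context: each workgroup processes VirtualWorkgroupSize elements in chunks
// of WorkgroupSize, one chunk (and one subgroup-level scan) per loop iteration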

debug_barrier()
for (idx = 0; idx < VirtualWorkgroupSize / WorkgroupSize; idx++)
{
    value = getValueFromDataAccessor(memoryIdx)

    value = subgroup::inclusive_scan(value)

    setValueToDataAccessor(memoryIdx)

    if (lastSubgroupInvocation)
    {
        setValueToSharedMemory(smemIdx)
    }
}
workgroup_execution_and_memory_barrier()

... workgroup scan code ...
```

_I should note that this is the first level of scans for the workgroup scope. It is only one step of the algorithm, and the data accesses are completely independent: `memoryIdx` is unique and per-invocation, and shared memory is only written to in this step, to be read in later steps._

At first glance it looks fine, and it does produce the expected results for the most part... except in some very specific cases. After some more testing and debugging to identify the cause, I found the conditions to be:

* using an Nvidia GPU
* using emulated versions of subgroup operations
* a decent number of iterations in the loop (in this case at least 8).

To be sure, I tested this on an Intel GPU, and the workgroup scan ran correctly. This was very baffling at first, and the results produced on an Nvidia device looked like a synchronization problem.

It became even more convincing when I moved the control barrier inside the loop and it immediately produced correct scan results.

```cpp
... workgroup scan code ...

debug_barrier()
for (idx = 0; idx < VirtualWorkgroupSize / WorkgroupSize; idx++)
{
    value = getValueFromDataAccessor(memoryIdx)

    value = subgroup::inclusive_scan(value)

    setValueToDataAccessor(memoryIdx)

    if (lastSubgroupInvocation)
    {
        setValueToSharedMemory(smemIdx)
    }
    workgroup_execution_and_memory_barrier()
}

... workgroup scan code ...
```

Ultimately, we came to the conclusion that the subgroup invocations were probably, somehow, drifting out of sync as the loop went on. In particular, the effect we observed was a shuffle behaving as if `value` were not in lockstep across invocations at the call site. We tested using a subgroup execution barrier as well as maximal reconvergence. Strangely enough, just a memory barrier also fixed it, which it shouldn't have, since subgroup shuffles are intrinsics that take their arguments by copy and don't really access any memory locations (they operate on values in SSA form).

```cpp
T inclusive_scan(T value)
{
    subgroup_execution_barrier()
    rhs = shuffleUp(value, 1)
    value = value + (firstInvocation ? identity : rhs)

    [unroll]
    for (i = 1; i < SubgroupSizeLog2; i++)
    {
        nextLevelStep = 1 << i
        subgroup_execution_barrier()
        rhs = shuffleUp(value, nextLevelStep)
        value = value + (nextLevelStep out of bounds ? identity : rhs)
    }
    return value
}
```

However, this problem was only observed on Nvidia devices.

As a side note, and surprisingly, using the `SPV_KHR_maximal_reconvergence` extension doesn't resolve this issue either. I feel I should point out that many presentations and code listings seem to give the impression, based on the very simple examples provided, that subgroup shuffle operations execute in lockstep.

For instance, [the example in this presentation](https://vulkan.org/user/pages/09.events/vulkanised-2025/T08-Hugo-Devillers-SaarlandUniversity.pdf) correctly demonstrates invocations in a tangle reading from and storing to an SSBO, but may mislead readers into not considering Availability and Visibility in other scenarios that need them.

Such simple examples are good enough to demonstrate the purpose of the extension, but fail to elaborate on the specifics. If the example did have a read-after-write between subgroup invocations, subgroup-scope memory dependencies would have been needed.

(With that said, since subgroup operations are SSA and take arguments "by copy", this discussion of memory dependencies and availability-visibility is not relevant to our problem, but it is something to be aware of.)

### A minor detour into the performance of native vs. emulated on Nvidia devices

Since all recent Nvidia GPUs support the subgroup arithmetic SPIR-V capability, why were we using emulation with shuffles? I think this observation warrants a small discussion section of its own. The tables below show some numbers from our benchmark, measured through Nvidia's Nsight Graphics profiler, for a subgroup inclusive scan using native SPIR-V instructions and our emulated version.

#### Native

| Workgroup size | SM throughput (%) | CS warp occupancy (%) | # registers | Dispatch time (ms) |
| :------------: | :---------------: | :-------------------: | :---------: | :----------------: |
| 256 | 41.6 | 90.5 | 16 | 27 |
| 512 | 41.4 | 89.7 | 16 | 27.15 |
| 1024 | 40.5 | 59.7 | 16 | 27.74 |

#### Emulated

| Workgroup size | SM throughput (%) | CS warp occupancy (%) | # registers | Dispatch time (ms) |
| :------------: | :---------------: | :-------------------: | :---------: | :----------------: |
| 256 | 37.9 | 90.7 | 16 | 12.22 |
| 512 | 37.7 | 90.3 | 16 | 12.3 |
| 1024 | 37.1 | 60.5 | 16 | 12.47 |

These numbers are baffling to say the least, particularly the fact that our emulated subgroup scans are twice as fast as the native solution. It should be noted that this is with the subgroup barrier before every shuffle; adding the barriers caused no marked decrease in performance.

A potential explanation may be that Nvidia has to account for any inactive invocations in a subgroup, making them behave as if they contribute the identity element $I$ to the scan. Our emulated scan instead requires callers to invoke the arithmetic in a subgroup-uniform fashion. If that is not the case, this seems like a cause for concern for Nvidia's SPIR-V to SASS compiler.

### What could cause this behavior on Nvidia? — The Independent Program Counter

We think a potential culprit could be Nvidia's Independent Program Counter (IPC), introduced with the Volta architecture.

Prior to Volta, all threads in a subgroup shared the same program counter, which scheduled instructions across all those threads. This meant all threads in the same subgroup executed the same instruction at any given time. Therefore, when the program flow branched across threads in the same subgroup, all execution paths generally had to be executed, masking off the threads that should not be active on each path.
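As a toy illustration of that masking (a hypothetical CUDA kernel, not taken from the whitepaper), any branch on a per-thread value forces pre-Volta hardware to schedule both paths for the whole warp:

```cpp
__global__ void divergentBranch(int* out)
{
    const int tid = threadIdx.x;
    // Pre-Volta: the warp runs both branches one after the other,
    // masking off the lanes that didn't take the branch currently executing
    if (tid & 1)
        out[tid] = 2 * tid; // odd lanes active, even lanes masked off
    else
        out[tid] = tid + 1; // even lanes active, odd lanes masked off
}
```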
![Pascal and prior SIMT model](pascal_simt_model.png "Pascal and prior SIMT model")

*Thread scheduling under the SIMT warp execution model of Pascal and earlier NVIDIA GPUs. Taken from [NVIDIA TESLA V100 GPU ARCHITECTURE](https://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf)*

From Volta onwards, each thread has its own program counter, allowing it to execute independently of the other threads in the same subgroup. This also opens up a new possibility on Nvidia devices: you can now synchronize threads within the same subgroup. The active invocations still have to execute the same instruction, but it can be at different locations in the program (e.g. different iterations of a loop).
![Volta Independent Thread Scheduling model](volta_scheduling_model.png "Volta Independent Thread Scheduling model")

*Independent thread scheduling in the Volta architecture onwards, interleaving execution from divergent branches and using an explicit sync to reconverge threads. Taken from [NVIDIA TESLA V100 GPU ARCHITECTURE](https://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf)*
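CUDA makes this model explicit in its warp intrinsics: since Volta, warp shuffles take a mask of participating lanes and synchronize those lanes before exchanging data, so the shuffle itself carries the execution dependency. Here's a sketch of a Kogge-Stone warp scan in those terms (illustrative, not Nabla code; assumes a 32-wide warp with all lanes participating):

```cpp
__device__ float warpInclusiveAdd(float value)
{
    const unsigned lane = threadIdx.x & 31u;
    for (unsigned step = 1u; step < 32u; step <<= 1u)
    {
        // __shfl_up_sync synchronizes the lanes named in the mask before
        // exchanging data, so `value` is read in lockstep by construction
        const float rhs = __shfl_up_sync(0xFFFFFFFFu, value, step);
        if (lane >= step) // lanes below `step` have no lane to read from
            value += rhs;
    }
    return value;
}
```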

Outside of the shuffles themselves, CUDA exposes this synchronization through `__syncwarp()`, and we can do the same in Vulkan using subgroup control barriers.

The IPC also enables starvation-free algorithms in CUDA, along with the use of mutexes, where a thread that attempts to acquire a mutex is guaranteed to eventually succeed. Consider the doubly linked list example from the Volta whitepaper:

```cpp
__device__ void insert_after(Node* a, Node* b)
{
    Node* c;
    // lock A and its successor before splicing B in between them
    lock(a);
    lock(a->next);
    c = a->next;

    a->next = b;
    b->prev = a;

    b->next = c;
    c->prev = b;

    unlock(c);
    unlock(a);
}
```

The diagram below shows how, with IPC, even if thread K holds the lock for node A, another thread J in the same subgroup (warp, in the case of CUDA) can wait for the lock to become available without affecting K's progress.
![Doubly Linked List lock](linked_list_lock.png "Doubly Linked List lock")

*Locks are acquired for nodes A and C (shown on the left) before the thread inserts node B (shown on the right). Taken from [NVIDIA TESLA V100 GPU ARCHITECTURE](https://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf)*

In our case, however, it's entirely possible that, with the branching introduced, the subgroup shuffle operations simply do not run in lockstep, which is why subgroup execution barriers are our solution to the problem for now.

Unfortunately, even after hours of scouring the SPIR-V specification, I couldn't find any explicit mention confirming whether subgroup shuffle operations actually imply an execution dependency.

So then we either have...

## This is a gray area of the Subgroup Shuffle Spec and allowed Undefined Behaviour

Consider what it means if subgroup convergence doesn't guarantee that active tangle invocations execute a subgroup operation in lockstep.

Subgroup ballot and ballot arithmetic are two cases where you don't have to consider lockstepness, because the return value of a ballot is expected to be uniform in a tangle, and, as a corollary, it is known exactly what that value should be.

Similarly, for subgroup broadcasts, the value being broadcast first needs to be computed, say by invocation K. Even if the other invocations don't run in lockstep, they can't read the value until invocation K broadcasts it, provided they want to read the same value (uniformity), and you know what value should be read (the broadcasting invocation can check it got the same value back).

On the flip side, reductions will always produce a uniform return value for all invocations, even if you reduce a stale or out-of-lockstep input value.

Meanwhile, subgroup operations that don't return tangle-uniform values, such as shuffles and scans, produce the expected result only if performed on constants or on variables written with an execution dependency. These operations can give different results per invocation, so there's no implied uniformity, which means there's no reason to expect any constraints on their apparent lockstepness to be implied transitively through the properties of the return value.

The important consideration, then, is how a subgroup operation is implemented. When the spec doesn't explicitly state that a subgroup operation has to be executed at the same time by all invocations, we can imagine a scenario where a shuffle may be as simple as the receiving invocation snooping another invocation's register, without requiring any action on the latter's part. And that comes with obvious IPC dangers: snooping the register before it gets written, or after it gets overwritten, will surely produce inconsistent results if there are no other execution dependencies.

This leads to code listings like the following becoming undefined behavior simply by changing the `Broadcast` into a `Shuffle`.

```cpp
// Broadcasting after computation
// OK, only counts active invocations in the tangle (doesn't change)
int count = subgroupBallotBitCount(subgroupBallot(true));
// OK, done on a constant
int index = subgroupExclusiveAdd(1);
int base, base_slot;
if (subgroupElect())
    base_slot = atomicAdd(dst.size, count);
// NOT OK: `base_slot` is not available and visible, and other invocations
// may even have raced ahead of the elected one. Without an ensured memory
// dependency, not every invocation will see the correct value of `base_slot`
// from the elected invocation.
base = subgroupBroadcastFirst(base_slot);
```

Similarly again, with [this example from the Khronos blog on maximal reconvergence](https://www.khronos.org/blog/khronos-releases-maximal-reconvergence-and-quad-control-extensions-for-vulkan-and-spir-v):

```cpp
// OK, thanks to subgroup uniform control flow there's no wiggle room here
// (we need to know every invocation's value)
if (subgroupAny(needs_space)) {
    // OK, narrowly, because `subgroupBallot` returns a ballot that's uniform in a tangle
    uvec4 mask = subgroupBallot(needs_space);
    // OK, because `mask` is tangle-uniform
    uint size = subgroupBallotBitCount(mask);
    uint base = 0;
    if (subgroupElect())
        base = atomicAdd(b.free, size);

    // NOT OK if Broadcast were replaced with Shuffle: non-elected invocations
    // could race ahead, or fail to see (visibility) the `base` value in the
    // elected invocation before that one executes the shuffle
    base = subgroupBroadcastFirst(base);
    // OK, but only because `mask` is tangle-uniform
    uint offset = subgroupBallotExclusiveBitCount(mask);

    if (needs_space)
        b.data[base + offset] = ...;
}
```

With all that said, it needs to be noted that one can't expect every instruction to run in lockstep, as that would negate the advantages of Nvidia's IPC.

## Or a bug in Nvidia's SPIR-V to SASS compiler

And crucially, it's impossible to know (or to discuss, in the case of a signed NDA) what's actually happening behind the bug or the performance regression on Nvidia. Unlike with AMD's RDNA ISAs, where we can verify that the compiler is doing what it should using the Radeon GPU Analyzer, Nvidia's generated SASS is inaccessible, and the compiler isn't public either.
+ +---------------------------- +_This issue was observed happening inconsistently on Nvidia driver version 576.80, released 17th June 2025._ diff --git a/blog/2025/2025-06-19-subgroup-shuffle-reconvergence-on-nvidia/linked_list_lock.png b/blog/2025/2025-06-19-subgroup-shuffle-reconvergence-on-nvidia/linked_list_lock.png new file mode 100644 index 0000000..6ecf59e Binary files /dev/null and b/blog/2025/2025-06-19-subgroup-shuffle-reconvergence-on-nvidia/linked_list_lock.png differ diff --git a/blog/2025/2025-06-19-subgroup-shuffle-reconvergence-on-nvidia/pascal_simt_model.png b/blog/2025/2025-06-19-subgroup-shuffle-reconvergence-on-nvidia/pascal_simt_model.png new file mode 100644 index 0000000..d6f4700 Binary files /dev/null and b/blog/2025/2025-06-19-subgroup-shuffle-reconvergence-on-nvidia/pascal_simt_model.png differ diff --git a/blog/2025/2025-06-19-subgroup-shuffle-reconvergence-on-nvidia/volta_scheduling_model.png b/blog/2025/2025-06-19-subgroup-shuffle-reconvergence-on-nvidia/volta_scheduling_model.png new file mode 100644 index 0000000..6ee1c2b Binary files /dev/null and b/blog/2025/2025-06-19-subgroup-shuffle-reconvergence-on-nvidia/volta_scheduling_model.png differ diff --git a/blog/authors.yml b/blog/authors.yml index d519fad..dc95fa4 100644 --- a/blog/authors.yml +++ b/blog/authors.yml @@ -23,4 +23,31 @@ jaked: image_url: https://avatars.githubusercontent.com/u/3148945?v=4 page: true socials: - github: pgrAm \ No newline at end of file + github: pgrAm + +fletterio: + name: Francisco Letterio + title: Junior Developer @ DevSH Graphics Programming Sp. z O.O. + url: https://github.com/Fletterio + image_url: https://avatars.githubusercontent.com/u/40742817?v=4 + page: true + socials: + github: Fletterio + +keptsecret: + name: Sorakrit Chonwattanagul + title: Associate Developer @ DevSH Graphics Programming Sp. z O.O. + url: https://github.com/keptsecret/ + image_url: https://avatars.githubusercontent.com/u/27181108?v=4 + page: true + socials: + github: keptsecret + +devshgraphicsprogramming: + name: Mateusz Kielan + title: CTO of DevSH Graphics Programming Sp. z O.O. + url: https://www.devsh.eu/ + image_url: https://avatars.githubusercontent.com/u/6894321?v=4 + page: true + socials: + github: devshgraphicsprogramming diff --git a/docusaurus.config.ts b/docusaurus.config.ts index ef64ed8..c5e8ed4 100644 --- a/docusaurus.config.ts +++ b/docusaurus.config.ts @@ -1,149 +1,159 @@ -import {themes as prismThemes} from 'prism-react-renderer'; -import type {Config} from '@docusaurus/types'; -import type * as Preset from '@docusaurus/preset-classic'; -import remarkMath from 'remark-math'; -import rehypeKatex from 'rehype-katex'; +import { themes as prismThemes } from "prism-react-renderer"; +import type { Config } from "@docusaurus/types"; +import type * as Preset from "@docusaurus/preset-classic"; +import remarkMath from "remark-math"; +import rehypeKatex from "rehype-katex"; const config: Config = { - title: 'Graphics Programming Discord', - tagline: 'Articles, guides, tips and tricks from and for frogs and forgis of the graphics programming discord. This is what we do:', - favicon: 'img/favicon.ico', + title: "Graphics Programming Discord", + tagline: + "Articles, guides, tips and tricks from and for frogs and forgis of the graphics programming discord. 
This is what we do:", + favicon: "img/favicon.ico", // Set the production url of your site here - url: 'https://graphicsprogramming.github.io/', + url: "https://graphicsprogramming.github.io/", // Set the // pathname under which your site is served // For GitHub pages deployment, it is often '//' - baseUrl: '/', + baseUrl: "/", // GitHub pages deployment config. - organizationName: 'GraphicsProgramming', - projectName: 'blog', - deploymentBranch: 'gh-pages', + organizationName: "GraphicsProgramming", + projectName: "blog", + deploymentBranch: "gh-pages", trailingSlash: false, - onBrokenLinks: 'warn', - onBrokenMarkdownLinks: 'warn', + onBrokenLinks: "warn", + onBrokenMarkdownLinks: "warn", - plugins: [[ require.resolve('docusaurus-lunr-search'), { - languages: ['en'] // language codes - }]], + plugins: [ + [ + require.resolve("docusaurus-lunr-search"), + { + languages: ["en"], // language codes + }, + ], + ], // Even if you don't use internationalization, you can use this field to set // useful metadata like html lang. For example, if your site is Chinese, you // may want to replace "en" with "zh-Hans". i18n: { - defaultLocale: 'en', - locales: ['en'], + defaultLocale: "en", + locales: ["en"], }, stylesheets: [ { - href: 'https://cdn.jsdelivr.net/npm/katex@0.16.11/dist/katex.min.css', - type: 'text/css', + href: "https://cdn.jsdelivr.net/npm/katex@0.16.11/dist/katex.min.css", + type: "text/css", integrity: - 'sha384-nB0miv6/jRmo5UMMR1wu3Gz6NLsoTkbqJghGIsx//Rlm+ZU03BU6SQNC66uf4l5+', - crossorigin: 'anonymous', - } + "sha384-nB0miv6/jRmo5UMMR1wu3Gz6NLsoTkbqJghGIsx//Rlm+ZU03BU6SQNC66uf4l5+", + crossorigin: "anonymous", + }, ], presets: [ [ - 'classic', + "classic", { docs: { - routeBasePath: 'docs', - sidebarPath: './sidebars.ts', + routeBasePath: "docs", + sidebarPath: "./sidebars.ts", }, blog: { showLastUpdateTime: true, showLastUpdateAuthor: true, showReadingTime: true, feedOptions: { - type: ['rss', 'atom'], + type: ["rss", "atom"], xslt: true, }, // Remove this to remove the "edit this page" links. 
- editUrl: - 'https://github.com/GraphicsProgramming/blog', + editUrl: "https://github.com/GraphicsProgramming/blog", // Useful options to enforce blogging best practices - onInlineTags: 'warn', - onInlineAuthors: 'warn', - onUntruncatedBlogPosts: 'warn', + onInlineTags: "warn", + onInlineAuthors: "warn", + onUntruncatedBlogPosts: "warn", //routeBasePath: '/', - blogSidebarCount: 'ALL', + blogSidebarCount: "ALL", remarkPlugins: [remarkMath], rehypePlugins: [rehypeKatex], }, theme: { - customCss: './src/css/custom.css', + customCss: "./src/css/custom.css", }, sitemap: { - changefreq: 'always' - } + changefreq: "always", + }, } satisfies Preset.Options, ], ], themeConfig: { colorMode: { - defaultMode: 'dark', + defaultMode: "dark", respectPrefersColorScheme: true, }, - image: 'img/social-embed.png', + image: "img/social-embed.png", navbar: { - title: 'Graphics Programming', + title: "Graphics Programming", logo: { - alt: 'Graphics Programming', - src: 'img/gp-discord-logo.webp', + alt: "Graphics Programming", + src: "img/gp-discord-logo.webp", }, items: [ { - to: '/blog', - label: 'Blog', - position: 'left' + to: "/blog", + label: "Blog", + position: "left", + }, + { + type: "docSidebar", + sidebarId: "discordServer", + position: "left", + label: "Discord Server", }, { - type: 'docSidebar', - sidebarId: 'discordServer', - position: 'left', - label: 'Discord Server', + type: "docSidebar", + sidebarId: "communityProjects", + position: "left", + label: "Community Projects", }, { - type: 'docSidebar', - sidebarId: 'communityProjects', - position: 'left', - label: 'Community Projects' - } + to: "/webring", + label: "Webring", + position: "left", + }, ], }, footer: { - style: 'dark', + style: "dark", links: [ { - title: 'Community', + title: "Community", items: [ { - label: 'Discord', - href: 'https://discord.com/invite/graphicsprogramming', + label: "Discord", + href: "https://discord.graphics-programming.org/", }, { - label: 'YouTube', - href: 'https://www.youtube.com/@graphicsprogramming9074', + label: "YouTube", + href: "https://www.youtube.com/@graphicsprogramming9074", }, { - label: 'Twitter', - href: 'https://x.com/i/communities/1500963350825472000' - } + label: "Twitter", + href: "https://x.com/i/communities/1500963350825472000", + }, ], }, { - title: 'More', + title: "More", items: [ { - label: 'Blog', - to: 'https://graphics-programming.org/', + label: "Blog", + to: "https://graphics-programming.org/", }, { - label: 'GitHub', - href: 'https://github.com/GraphicsProgramming', + label: "GitHub", + href: "https://github.com/GraphicsProgramming", }, ], }, @@ -153,9 +163,7 @@ const config: Config = { prism: { theme: prismThemes.duotoneLight, darkTheme: prismThemes.duotoneDark, - additionalLanguages: [ - 'glsl' - ] + additionalLanguages: ["glsl"], }, } satisfies Preset.ThemeConfig, }; diff --git a/gen-webring-routes.js b/gen-webring-routes.js new file mode 100644 index 0000000..7d36363 --- /dev/null +++ b/gen-webring-routes.js @@ -0,0 +1,38 @@ +const fs = require("fs"); + +// Import ze froges +const frogs = require("./static/webring/froglist.json"); + +function makeHtmlRedirect(frog) { + return ` + + + + + + + Codestin Search App + + + + If you are not redirected automatically, follow this link. 
+ + `; +} + +function makeRoutes(frog, nextFrog, prevFrog) { + fs.mkdirSync(`./static/webring/frogs/${frog.name}`, { recursive: true }); + fs.appendFileSync(`./static/webring/frogs/${frog.name}.html`, makeHtmlRedirect(frog)); + fs.appendFileSync(`./static/webring/frogs/${frog.name}/next.html`, makeHtmlRedirect(nextFrog)); + fs.appendFileSync(`./static/webring/frogs/${frog.name}/prev.html`, makeHtmlRedirect(prevFrog)); +} + +frogs.forEach((frog, i) => { + const nextFrog = frogs.at((i + 1) % frogs.length); + const prevFrog = frogs.at(i - 1); // array.at(-1) returns the last element + + makeRoutes(frog, nextFrog, prevFrog); +}); \ No newline at end of file diff --git a/package-lock.json b/package-lock.json index 5bc56ea..752b370 100644 --- a/package-lock.json +++ b/package-lock.json @@ -359,13 +359,14 @@ } }, "node_modules/@babel/code-frame": { - "version": "7.25.7", - "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.25.7.tgz", - "integrity": "sha512-0xZJFNE5XMpENsgfHYTw8FbX4kv53mFLn2i3XPoq69LyhYSCBJtitaHx9QnsVTrsogI4Z3+HtEfZ2/GFPOtf5g==", + "version": "7.27.1", + "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.27.1.tgz", + "integrity": "sha512-cjQ7ZlQ0Mv3b47hABuTevyTuYN4i+loJKGeV9flcCgIK37cCXRh+L1bd3iBHlynerhQ7BhCkn2BPbQUL+rGqFg==", "license": "MIT", "dependencies": { - "@babel/highlight": "^7.25.7", - "picocolors": "^1.0.0" + "@babel/helper-validator-identifier": "^7.27.1", + "js-tokens": "^4.0.0", + "picocolors": "^1.1.1" }, "engines": { "node": ">=6.9.0" @@ -682,18 +683,18 @@ } }, "node_modules/@babel/helper-string-parser": { - "version": "7.25.7", - "resolved": "https://registry.npmjs.org/@babel/helper-string-parser/-/helper-string-parser-7.25.7.tgz", - "integrity": "sha512-CbkjYdsJNHFk8uqpEkpCvRs3YRp9tY6FmFY7wLMSYuGYkrdUi7r2lc4/wqsvlHoMznX3WJ9IP8giGPq68T/Y6g==", + "version": "7.27.1", + "resolved": "https://registry.npmjs.org/@babel/helper-string-parser/-/helper-string-parser-7.27.1.tgz", + "integrity": "sha512-qMlSxKbpRlAridDExk92nSobyDdpPijUq2DW6oDnUqd0iOGxmQjyqhMIihI9+zv4LPyZdRje2cavWPbCbWm3eA==", "license": "MIT", "engines": { "node": ">=6.9.0" } }, "node_modules/@babel/helper-validator-identifier": { - "version": "7.25.7", - "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.25.7.tgz", - "integrity": "sha512-AM6TzwYqGChO45oiuPqwL2t20/HdMC1rTPAesnBCgPCSF1x3oN9MVUwQV2iyz4xqWrctwK5RNC8LV22kaQCNYg==", + "version": "7.27.1", + "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.27.1.tgz", + "integrity": "sha512-D2hP9eA+Sqx1kBZgzxZh0y1trbuU+JoDkiEwqhQ36nodYqJwyEIhPSdMNd7lOm/4io72luTPWH20Yda0xOuUow==", "license": "MIT", "engines": { "node": ">=6.9.0" @@ -723,111 +724,25 @@ } }, "node_modules/@babel/helpers": { - "version": "7.25.7", - "resolved": "https://registry.npmjs.org/@babel/helpers/-/helpers-7.25.7.tgz", - "integrity": "sha512-Sv6pASx7Esm38KQpF/U/OXLwPPrdGHNKoeblRxgZRLXnAtnkEe4ptJPDtAZM7fBLadbc1Q07kQpSiGQ0Jg6tRA==", - "license": "MIT", - "dependencies": { - "@babel/template": "^7.25.7", - "@babel/types": "^7.25.7" - }, - "engines": { - "node": ">=6.9.0" - } - }, - "node_modules/@babel/highlight": { - "version": "7.25.7", - "resolved": "https://registry.npmjs.org/@babel/highlight/-/highlight-7.25.7.tgz", - "integrity": "sha512-iYyACpW3iW8Fw+ZybQK+drQre+ns/tKpXbNESfrhNnPLIklLbXr7MYJ6gPEd0iETGLOK+SxMjVvKb/ffmk+FEw==", + "version": "7.27.1", + "resolved": "https://registry.npmjs.org/@babel/helpers/-/helpers-7.27.1.tgz", + 
"integrity": "sha512-FCvFTm0sWV8Fxhpp2McP5/W53GPllQ9QeQ7SiqGWjMf/LVG07lFa5+pgK05IRhVwtvafT22KF+ZSnM9I545CvQ==", "license": "MIT", "dependencies": { - "@babel/helper-validator-identifier": "^7.25.7", - "chalk": "^2.4.2", - "js-tokens": "^4.0.0", - "picocolors": "^1.0.0" + "@babel/template": "^7.27.1", + "@babel/types": "^7.27.1" }, "engines": { "node": ">=6.9.0" } }, - "node_modules/@babel/highlight/node_modules/ansi-styles": { - "version": "3.2.1", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-3.2.1.tgz", - "integrity": "sha512-VT0ZI6kZRdTh8YyJw3SMbYm/u+NqfsAxEpWO0Pf9sq8/e94WxxOpPKx9FR1FlyCtOVDNOQ+8ntlqFxiRc+r5qA==", - "license": "MIT", - "dependencies": { - "color-convert": "^1.9.0" - }, - "engines": { - "node": ">=4" - } - }, - "node_modules/@babel/highlight/node_modules/chalk": { - "version": "2.4.2", - "resolved": "https://registry.npmjs.org/chalk/-/chalk-2.4.2.tgz", - "integrity": "sha512-Mti+f9lpJNcwF4tWV8/OrTTtF1gZi+f8FqlyAdouralcFWFQWF2+NgCHShjkCb+IFBLq9buZwE1xckQU4peSuQ==", - "license": "MIT", - "dependencies": { - "ansi-styles": "^3.2.1", - "escape-string-regexp": "^1.0.5", - "supports-color": "^5.3.0" - }, - "engines": { - "node": ">=4" - } - }, - "node_modules/@babel/highlight/node_modules/color-convert": { - "version": "1.9.3", - "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-1.9.3.tgz", - "integrity": "sha512-QfAUtd+vFdAtFQcC8CCyYt1fYWxSqAiK2cSD6zDB8N3cpsEBAvRxp9zOGg6G/SHHJYAT88/az/IuDGALsNVbGg==", - "license": "MIT", - "dependencies": { - "color-name": "1.1.3" - } - }, - "node_modules/@babel/highlight/node_modules/color-name": { - "version": "1.1.3", - "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.3.tgz", - "integrity": "sha512-72fSenhMw2HZMTVHeCA9KCmpEIbzWiQsjN+BHcBbS9vr1mtt+vJjPdksIBNUmKAW8TFUDPJK5SUU3QhE9NEXDw==", - "license": "MIT" - }, - "node_modules/@babel/highlight/node_modules/escape-string-regexp": { - "version": "1.0.5", - "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-1.0.5.tgz", - "integrity": "sha512-vbRorB5FUQWvla16U8R/qgaFIya2qGzwDrNmCZuYKrbdSUMG6I1ZCGQRefkRVhuOkIGVne7BQ35DSfo1qvJqFg==", - "license": "MIT", - "engines": { - "node": ">=0.8.0" - } - }, - "node_modules/@babel/highlight/node_modules/has-flag": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-3.0.0.tgz", - "integrity": "sha512-sKJf1+ceQBr4SMkvQnBDNDtf4TXpVhVGateu0t918bl30FnbE2m4vNLX+VWe/dpjlb+HugGYzW7uQXH98HPEYw==", - "license": "MIT", - "engines": { - "node": ">=4" - } - }, - "node_modules/@babel/highlight/node_modules/supports-color": { - "version": "5.5.0", - "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-5.5.0.tgz", - "integrity": "sha512-QjVjwdXIt408MIiAqCX4oUKsgU2EqAGzs2Ppkm4aQYbjm+ZEWEcW4SfFNTr4uMNZma0ey4f5lgLrkB0aX0QMow==", - "license": "MIT", - "dependencies": { - "has-flag": "^3.0.0" - }, - "engines": { - "node": ">=4" - } - }, "node_modules/@babel/parser": { - "version": "7.25.8", - "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.25.8.tgz", - "integrity": "sha512-HcttkxzdPucv3nNFmfOOMfFf64KgdJVqm1KaCm25dPGMLElo9nsLvXeJECQg8UzPuBGLyTSA0ZzqCtDSzKTEoQ==", + "version": "7.27.1", + "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.27.1.tgz", + "integrity": "sha512-I0dZ3ZpCrJ1c04OqlNsQcKiZlsrXf/kkE4FXzID9rIOYICsAbA8mMDzhW/luRNAHdCNt7os/u8wenklZDlUVUQ==", "license": "MIT", "dependencies": { - "@babel/types": "^7.25.8" + "@babel/types": "^7.27.1" }, "bin": { "parser": "bin/babel-parser.js" @@ 
-2061,39 +1976,35 @@ } }, "node_modules/@babel/runtime": { - "version": "7.25.7", - "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.25.7.tgz", - "integrity": "sha512-FjoyLe754PMiYsFaN5C94ttGiOmBNYTf6pLr4xXHAT5uctHb092PBszndLDR5XA/jghQvn4n7JMHl7dmTgbm9w==", + "version": "7.27.1", + "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.27.1.tgz", + "integrity": "sha512-1x3D2xEk2fRo3PAhwQwu5UubzgiVWSXTBfWpVd2Mx2AzRqJuDJCsgaDVZ7HB5iGzDW1Hl1sWN2mFyKjmR9uAog==", "license": "MIT", - "dependencies": { - "regenerator-runtime": "^0.14.0" - }, "engines": { "node": ">=6.9.0" } }, "node_modules/@babel/runtime-corejs3": { - "version": "7.25.7", - "resolved": "https://registry.npmjs.org/@babel/runtime-corejs3/-/runtime-corejs3-7.25.7.tgz", - "integrity": "sha512-gMmIEhg35sXk9Te5qbGp3W9YKrvLt3HV658/d3odWrHSqT0JeG5OzsJWFHRLiOohRyjRsJc/x03DhJm3i8VJxg==", + "version": "7.27.1", + "resolved": "https://registry.npmjs.org/@babel/runtime-corejs3/-/runtime-corejs3-7.27.1.tgz", + "integrity": "sha512-909rVuj3phpjW6y0MCXAZ5iNeORePa6ldJvp2baWGcTjwqbBDDz6xoS5JHJ7lS88NlwLYj07ImL/8IUMtDZzTA==", "license": "MIT", "dependencies": { - "core-js-pure": "^3.30.2", - "regenerator-runtime": "^0.14.0" + "core-js-pure": "^3.30.2" }, "engines": { "node": ">=6.9.0" } }, "node_modules/@babel/template": { - "version": "7.25.7", - "resolved": "https://registry.npmjs.org/@babel/template/-/template-7.25.7.tgz", - "integrity": "sha512-wRwtAgI3bAS+JGU2upWNL9lSlDcRCqD05BZ1n3X2ONLH1WilFP6O1otQjeMK/1g0pvYcXC7b/qVUB1keofjtZA==", + "version": "7.27.1", + "resolved": "https://registry.npmjs.org/@babel/template/-/template-7.27.1.tgz", + "integrity": "sha512-Fyo3ghWMqkHHpHQCoBs2VnYjR4iWFFjguTDEqA5WgZDOrFesVjMhMM2FSqTKSoUSDO1VQtavj8NFpdRBEvJTtg==", "license": "MIT", "dependencies": { - "@babel/code-frame": "^7.25.7", - "@babel/parser": "^7.25.7", - "@babel/types": "^7.25.7" + "@babel/code-frame": "^7.27.1", + "@babel/parser": "^7.27.1", + "@babel/types": "^7.27.1" }, "engines": { "node": ">=6.9.0" @@ -2118,14 +2029,13 @@ } }, "node_modules/@babel/types": { - "version": "7.25.8", - "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.25.8.tgz", - "integrity": "sha512-JWtuCu8VQsMladxVz/P4HzHUGCAwpuqacmowgXFs5XjxIgKuNjnLokQzuVjlTvIzODaDmpjT3oxcC48vyk9EWg==", + "version": "7.27.1", + "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.27.1.tgz", + "integrity": "sha512-+EzkxvLNfiUeKMgy/3luqfsCWFRXLb7U6wNQTk60tovuckwB15B191tJWvpp4HjiQWdJkCxO3Wbvc6jlk3Xb2Q==", "license": "MIT", "dependencies": { - "@babel/helper-string-parser": "^7.25.7", - "@babel/helper-validator-identifier": "^7.25.7", - "to-fast-properties": "^2.0.0" + "@babel/helper-string-parser": "^7.27.1", + "@babel/helper-validator-identifier": "^7.27.1" }, "engines": { "node": ">=6.9.0" @@ -6548,9 +6458,9 @@ } }, "node_modules/estree-util-value-to-estree": { - "version": "3.1.2", - "resolved": "https://registry.npmjs.org/estree-util-value-to-estree/-/estree-util-value-to-estree-3.1.2.tgz", - "integrity": "sha512-S0gW2+XZkmsx00tU2uJ4L9hUT7IFabbml9pHh2WQqFmAbxit++YGZne0sKJbNwkj9Wvg9E4uqWl4nCIFQMmfag==", + "version": "3.3.3", + "resolved": "https://registry.npmjs.org/estree-util-value-to-estree/-/estree-util-value-to-estree-3.3.3.tgz", + "integrity": "sha512-Db+m1WSD4+mUO7UgMeKkAwdbfNWwIxLt48XF2oFU9emPfXkIu+k5/nlOj313v7wqtAPo0f9REhUvznFrPkG8CQ==", "license": "MIT", "dependencies": { "@types/estree": "^1.0.0" @@ -8373,9 +8283,9 @@ } }, "node_modules/http-proxy-middleware": { - "version": "2.0.7", - "resolved": 
"https://registry.npmjs.org/http-proxy-middleware/-/http-proxy-middleware-2.0.7.tgz", - "integrity": "sha512-fgVY8AV7qU7z/MmXJ/rxwbrtQH4jBQ9m7kp3llF0liB7glmFeVZFBepQb32T3y8n8k2+AEYuMPCpinYW+/CuRA==", + "version": "2.0.9", + "resolved": "https://registry.npmjs.org/http-proxy-middleware/-/http-proxy-middleware-2.0.9.tgz", + "integrity": "sha512-c1IyJYLYppU574+YI7R4QyX2ystMtVXZwIdzazUIPIJsHuWNd+mho2j+bKoHftndicGj9yh+xjd+l0yj7VeT1Q==", "license": "MIT", "dependencies": { "@types/http-proxy": "^1.17.8", @@ -8464,9 +8374,9 @@ } }, "node_modules/image-size": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/image-size/-/image-size-1.1.1.tgz", - "integrity": "sha512-541xKlUw6jr/6gGuk92F+mYM5zaFAc5ahphvkqvNe2bQ6gVBkd6bfrmVJ2t4KDAfikAYZyIqTnktX3i6/aQDrQ==", + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/image-size/-/image-size-1.2.1.tgz", + "integrity": "sha512-rH+46sQJ2dlwfjfhCyNx5thzrv+dtmBIhPHk0zgRUukHzZ/kRueTJXoYYsclBaKcSMBWuGbOFXtioLpzTb5euw==", "license": "MIT", "dependencies": { "queue": "6.0.2" @@ -9085,9 +8995,9 @@ } }, "node_modules/katex": { - "version": "0.16.11", - "resolved": "https://registry.npmjs.org/katex/-/katex-0.16.11.tgz", - "integrity": "sha512-RQrI8rlHY92OLf3rho/Ts8i/XvjgguEjOkO1BEXcU3N8BqPpSzBNwV/G0Ukr+P/l3ivvJUE/Fa/CwbS6HesGNQ==", + "version": "0.16.21", + "resolved": "https://registry.npmjs.org/katex/-/katex-0.16.21.tgz", + "integrity": "sha512-XvqR7FgOHtWupfMiigNzmh+MgUVmDGU2kXZm899ZkPfcuoPuFxyHmXsgATDpFZDAXCI8tvinaVcDo8PIIJSo4A==", "funding": [ "https://opencollective.com/katex", "https://github.com/sponsors/katex" @@ -12391,9 +12301,9 @@ } }, "node_modules/picocolors": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.0.tgz", - "integrity": "sha512-TQ92mBOW0l3LeMeyLV6mzy/kWr8lkd/hp3mTg7wYK7zJhuBStmGMBG0BdeDZS/dZx1IukaX6Bk11zcln25o1Aw==", + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz", + "integrity": "sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA==", "license": "ISC" }, "node_modules/picomatch": { @@ -13137,9 +13047,9 @@ } }, "node_modules/prismjs": { - "version": "1.29.0", - "resolved": "https://registry.npmjs.org/prismjs/-/prismjs-1.29.0.tgz", - "integrity": "sha512-Kx/1w86q/epKcmte75LNrEoT+lX8pBpavuAbvJWRXar7Hz8jrtF+e3vY751p0R8H9HdArwaCTNDDzHg/ScJK1Q==", + "version": "1.30.0", + "resolved": "https://registry.npmjs.org/prismjs/-/prismjs-1.30.0.tgz", + "integrity": "sha512-DEvV2ZF2r2/63V+tK8hQvrR2ZGn10srHbXviTlcv7Kpzw8jWiNTqbVgjO3IY8RxrrOUF8VPMQQFysYYYv0YZxw==", "license": "MIT", "engines": { "node": ">=6" @@ -13710,12 +13620,6 @@ "node": ">=4" } }, - "node_modules/regenerator-runtime": { - "version": "0.14.1", - "resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.14.1.tgz", - "integrity": "sha512-dYnhHh0nJoMfnkZs6GmmhFknAGRrLznOu5nc9ML+EJxGvrx6H7teuevqVqCuPcPK//3eDrrjQhehXVx9cnkGdw==", - "license": "MIT" - }, "node_modules/regenerator-transform": { "version": "0.15.2", "resolved": "https://registry.npmjs.org/regenerator-transform/-/regenerator-transform-0.15.2.tgz", @@ -15422,15 +15326,6 @@ "integrity": "sha512-lBN9zLN/oAf68o3zNXYrdCt1kP8WsiGW8Oo2ka41b2IM5JL/S1CTyX1rW0mb/zSuJun0ZUrDxx4sqvYS2FWzPA==", "license": "MIT" }, - "node_modules/to-fast-properties": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/to-fast-properties/-/to-fast-properties-2.0.0.tgz", - "integrity": 
"sha512-/OaKK0xYrs3DmxRYqL/yDc+FxFUVYhDlXMhRmv3z915w2HF1tnN1omB354j8VUGO/hbRzyD6Y3sA7v7GS/ceog==", - "license": "MIT", - "engines": { - "node": ">=4" - } - }, "node_modules/to-regex-range": { "version": "5.0.1", "resolved": "https://registry.npmjs.org/to-regex-range/-/to-regex-range-5.0.1.tgz", diff --git a/src/css/custom.css b/src/css/custom.css index 2cb3cdd..710a237 100644 --- a/src/css/custom.css +++ b/src/css/custom.css @@ -13,6 +13,7 @@ --ifm-color-primary-light: #9e47f2; --ifm-color-primary-lighter: #ab65f5; --ifm-color-primary-lightest: #c28cf8; + --ifm-color-landing: #white; --ifm-code-font-size: 95%; --docusaurus-highlighted-code-line-bg: rgba(138, 43, 226, 0.1); } @@ -25,6 +26,7 @@ --ifm-color-primary-light: #c1a7e2; --ifm-color-primary-lighter: #d1b5eb; --ifm-color-primary-lightest: #e2c8f3; + --ifm-color-landing: #1b1b1d; --docusaurus-highlighted-code-line-bg: rgba(177, 156, 217, 0.3); } diff --git a/src/pages/index.tsx b/src/pages/index.tsx index cd74dd3..91eef9a 100644 --- a/src/pages/index.tsx +++ b/src/pages/index.tsx @@ -1,31 +1,45 @@ -import clsx from 'clsx'; -import Link from '@docusaurus/Link'; -import useDocusaurusContext from '@docusaurus/useDocusaurusContext'; -import Layout from '@theme/Layout'; -import HomepageFeatures from '@site/src/components/HomepageFeatures'; -import Heading from '@theme/Heading'; +import clsx from "clsx"; +import Link from "@docusaurus/Link"; +import useDocusaurusContext from "@docusaurus/useDocusaurusContext"; +import Layout from "@theme/Layout"; +import HomepageFeatures from "@site/src/components/HomepageFeatures"; +import Heading from "@theme/Heading"; -import styles from './index.module.css'; +import styles from "./index.module.css"; function HomepageHeader() { const { siteConfig } = useDocusaurusContext(); return ( -
+
{siteConfig.title} -

Articles, guides, tips and tricks from and for frogs and forgis of the Graphics Programming discord.
This is what we do

- +

+ Articles, guides, tips and tricks from and for frogs and forgis of the + Graphics Programming discord. +
+ This is what we do +

+
- + Discover our Blog + to="https://discord.graphics-programming.org/" + > Join our Discord Server
@@ -39,10 +53,33 @@ export default function Home(): JSX.Element { return ( + description="Description will go into a meta tag in " + >
+ +
); diff --git a/src/pages/webring/index.tsx b/src/pages/webring/index.tsx new file mode 100644 index 0000000..26a2c61 --- /dev/null +++ b/src/pages/webring/index.tsx @@ -0,0 +1,69 @@ +import React from "react"; +import Layout from "@theme/Layout"; + +import froglist from "/static/webring/froglist.json"; + +export default function Hello() { + return ( + +
+

Graphics Programming Webring

+ + +
+ + + + + Join the GP webring + +
+
+ ); +} diff --git a/src/pages/webring/join.md b/src/pages/webring/join.md new file mode 100644 index 0000000..06a11ca --- /dev/null +++ b/src/pages/webring/join.md @@ -0,0 +1,91 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Join the GP webring + +Do you have a cool website or blog that can be part of the GP webring? Join us! + +## 1. Add yourself to the webring + +To join the webring, add yourself to the [froglist](https://github.com/GraphicsProgramming/blog/blob/main/static/webring/froglist.json), +a file listing all the webring members. You can leave a PR with your edits to the file, or if you're not as comfortable with Git, an issue +asking to be added. + +Simply add a new entry at the end of the JSON file with your website's data: + +```json +{ + // A short name to identify your site. + // This will be in the URL, so keep it short and url-friendly (no spaces or special characters) + "name": "your-name-here", + "url": "https://link-to-my-cool.website", + "displayName": "Your Name Here", + "description": "A short description of your cool website" +} +``` + +## 2. Add the webring links to your site + +Once you've added yourself to the froglist, add the webring links to your website. Make sure they're visible from the homepage! + +You can find templates for the links below, for plain HTML or react. Simply copy and paste the appropriate code somewhere in your +home page, or feel free to make your own links—you can style them to fit your site, just be sure to include our friendly little froge +so people know you're part of the webring. + + + + ```html + + ``` + + + ```tsx + + ``` + + + ```tsx + + ``` + + diff --git a/static/img/froge.webp b/static/img/froge.webp new file mode 100644 index 0000000..6e2deb0 Binary files /dev/null and b/static/img/froge.webp differ diff --git a/static/webring/froglist.json b/static/webring/froglist.json new file mode 100644 index 0000000..de4e856 --- /dev/null +++ b/static/webring/froglist.json @@ -0,0 +1,86 @@ +[ + { + "name": "gp-blog", + "url": "https://graphics-programming.org/", + "displayName": "Graphics Programming Blog", + "description": "The official site for the Graphics Programming discord" + }, + { + "name": "bluescreen", + "url": "https://fumagalli.ar/", + "displayName": "Teo Fumagalli (bluescreen)", + "description": "Hi! I made a website with stuff in it" + }, + { + "name": "dragonslayer0531", + "url": "https://nickclark.tech/", + "displayName": "Nick Clark (DragonSlayer0531)", + "description": "Personal site and blog for Nick Clark" + }, + { + "name": "technicjelle", + "url": "https://technicjelle.com/", + "displayName": "TechnicJelle", + "description": "On this website you'll find info about me and my projects" + }, + { + "name": "eduameli", + "url": "https://eduameli.gitlab.io", + "displayName": "eduameli", + "description": "personal site to share my very cool projects" + }, + { + "name": "jaked", + "url": "https://jakedelmastro.com", + "displayName": "Jake S. 
Del Mastro", + "description": "My website where I share some of my graphics projects" + }, + { + "name": "manpat", + "url": "https://patrick-is.cool", + "displayName": "Patrick Monaghan", + "description": "Personal website and attempted blog" + }, + { + "name": "jaker", + "url": "https://juandiegomontoya.github.io/", + "displayName": "Jake Ryan", + "description": "A blog about graphics frogramming" + }, + { + "name": "neonmoe", + "url": "https://blog.neon.moe/", + "displayName": "Jens Pitkänen", + "description": "A blog about programming, the small web, and arcane personal computing" + }, + { + "name": "geometrian", + "url": "https://geometrian.com/", + "displayName": "Agatha Mallett", + "description": "Homepage of Agatha Mallett, including computer graphics research and many other projects!" + }, + { + "name": "jmaier", + "url": "https://www.jakobmaier.at/", + "displayName": "Jakob Maier", + "description": "A website where I share my projects and blog posts." + }, + { + "name": "edthedev", + "url": "https://edward.delaporte.us/", + "displayName": "Edward Delaporte", + "description": "Interactive JavaScript art, example code, and reference links." + }, + { + "name": "rtarun9", + "url": "https://rtarun9.github.io/", + "displayName": "Tarun R", + "description": "Personal site with projects and blogs that are rarely updated" + }, + { + "name": "devsh", + "url": "https://www.devsh.eu/", + "displayName": "DevSH Graphics Programming", + "description": "Homepage of DevSH Graphics Programming: Computer Graphics, Computer Geometry & Vision and High Performance Computing Consultancy" + } +]