Fix torch BiLSTM dispatch and engage cuDNN (#22874)

MarcosAsh · web-flow · commit 28bad7aaa83e · 2026-05-14T14:31:17.000-07:00
* Fix torch BiLSTM dispatch and engage cuDNN

* Address review: docstring and remove redundant tensor conversions

* Skip cuDNN LSTM during ONNX trace

* Loosen ONNX LSTM export test tolerance to 1e-5
diff --git a/keras/src/backend/torch/rnn.py b/keras/src/backend/torch/rnn.py
@@ -559,9 +559,6 @@ def lstm(
     if mask is not None:
         raise NotImplementedError
 
-    # Get device from inputs
-    device = get_device()
-
     # Convert to torch tensors (convert_to_tensor unwraps Variables)
     kernel = convert_to_tensor(kernel)
     recurrent_kernel = convert_to_tensor(recurrent_kernel)
@@ -580,16 +577,24 @@ def lstm(
         seq_dim = 1 if batch_first else 0
         inputs = torch.flip(inputs, dims=[seq_dim])
 
-    # Move all tensors to the same device
-    inputs = inputs.to(device)
-    initial_state_h = initial_state_h.to(device)
-    initial_state_c = initial_state_c.to(device)
-
-    cudnn_supported = cudnn_ok(
-        activation,
-        recurrent_activation,
-        unroll,
-        use_bias=bias is not None,
+    # cuDNN only runs on CUDA. Skip it when inputs aren't on CUDA, or when
+    # we're inside a TorchScript / Dynamo trace -- the trace records device
+    # transfers that then fail device-consistency validation downstream
+    # (e.g. `torch.onnx.export` failing in `wrapper_CUDA_cat`).
+    device = inputs.device
+    cudnn_supported = (
+        device.type == "cuda"
+        and not torch.jit.is_tracing()
+        and not (
+            hasattr(torch.compiler, "is_compiling")
+            and torch.compiler.is_compiling()
+        )
+        and cudnn_ok(
+            activation,
+            recurrent_activation,
+            unroll,
+            use_bias=bias is not None,
+        )
     )
 
     if cudnn_supported:
@@ -647,8 +652,9 @@ def _cudnn_lstm(
 
     params = prepare_lstm_params(kernel, recurrent_kernel, bias, device)
 
-    # Use functional LSTM to maintain gradient flow through weight tensors
-    outputs, (h_n, c_n) = torch._VF.lstm(
+    # Use functional LSTM to maintain gradient flow through weight tensors.
+    # ``torch._VF.lstm`` returns a flat ``(output, h_n, c_n)`` tuple.
+    outputs, h_n, c_n = torch._VF.lstm(
         inputs,
         (initial_state_h, initial_state_c),
         params,
@@ -872,5 +878,158 @@ def _cudnn_gru(
     return last_output, outputs, [h_n]
 
 
-def bidirectional_lstm(*args, **kwargs):
-    raise NotImplementedError
+def bidirectional_lstm(
+    inputs,
+    fwd_initial_state_h,
+    fwd_initial_state_c,
+    bwd_initial_state_h,
+    bwd_initial_state_c,
+    mask,
+    fwd_kernel,
+    fwd_recurrent_kernel,
+    fwd_bias,
+    bwd_kernel,
+    bwd_recurrent_kernel,
+    bwd_bias,
+    activation,
+    recurrent_activation,
+    return_sequences=False,
+    unroll=False,
+):
+    """Fused bidirectional cuDNN LSTM for the torch backend.
+
+    Runs forward and backward passes in a single
+    ``torch._VF.lstm(..., bidirectional=True)`` call instead of dispatching
+    two unidirectional LSTM calls. Backward outputs are returned in original
+    time order, ready for the caller's ``merge_mode`` to consume directly.
+
+    Args:
+        inputs: Input tensor of shape ``(batch, time, features)``.
+        fwd_initial_state_h: Initial hidden state for the forward direction,
+            shape ``(batch, hidden)``.
+        fwd_initial_state_c: Initial cell state for the forward direction,
+            shape ``(batch, hidden)``.
+        bwd_initial_state_h: Initial hidden state for the backward direction,
+            shape ``(batch, hidden)``.
+        bwd_initial_state_c: Initial cell state for the backward direction,
+            shape ``(batch, hidden)``.
+        mask: Sequence mask. Only ``None`` is supported (see Raises).
+        fwd_kernel: Forward input kernel, shape ``(features, 4 * hidden)``.
+        fwd_recurrent_kernel: Forward recurrent kernel, shape
+            ``(hidden, 4 * hidden)``.
+        fwd_bias: Forward bias, shape ``(4 * hidden,)`` or ``None``.
+        bwd_kernel: Backward input kernel, shape ``(features, 4 * hidden)``.
+        bwd_recurrent_kernel: Backward recurrent kernel, shape
+            ``(hidden, 4 * hidden)``.
+        bwd_bias: Backward bias, shape ``(4 * hidden,)`` or ``None``.
+        activation: Output activation. Only ``tanh`` engages cuDNN.
+        recurrent_activation: Gate activation. Only ``sigmoid`` engages cuDNN.
+        return_sequences: If ``True``, the ``*_outputs`` hold every timestep;
+            otherwise only ``*_last``, kept 3-D as ``(batch, 1, hidden)``.
+        unroll: Not supported; cuDNN requires the rolled path.
+
+    Returns:
+        A pair ``((fwd_last, fwd_outputs, [fwd_h_n, fwd_c_n]), (bwd_last,
+        bwd_outputs, [bwd_h_n, bwd_c_n]))``, mirroring the JAX equivalent.
+
+    Raises:
+        NotImplementedError: Signal to fall back to the two-pass path: mask
+            set, ``cudnn_ok`` false, non-CUDA inputs, tracing, or kernel error.
+    """
+    if mask is not None:
+        raise NotImplementedError
+    if not cudnn_ok(
+        activation,
+        recurrent_activation,
+        unroll,
+        use_bias=fwd_bias is not None and bwd_bias is not None,
+    ):
+        raise NotImplementedError
+
+    fwd_kernel = convert_to_tensor(fwd_kernel)
+    fwd_recurrent_kernel = convert_to_tensor(fwd_recurrent_kernel)
+    bwd_kernel = convert_to_tensor(bwd_kernel)
+    bwd_recurrent_kernel = convert_to_tensor(bwd_recurrent_kernel)
+
+    compute_dtype = fwd_kernel.dtype  # inputs/states are cast to this
+    inputs = convert_to_tensor(inputs, dtype=compute_dtype)
+    fwd_h0 = convert_to_tensor(fwd_initial_state_h, dtype=compute_dtype)
+    fwd_c0 = convert_to_tensor(fwd_initial_state_c, dtype=compute_dtype)
+    bwd_h0 = convert_to_tensor(bwd_initial_state_h, dtype=compute_dtype)
+    bwd_c0 = convert_to_tensor(bwd_initial_state_c, dtype=compute_dtype)
+
+    # cuDNN only runs on CUDA. Fall back to the two-pass path when inputs
+    # aren't on CUDA, or when we're inside a TorchScript / Dynamo trace --
+    # the trace records device transfers that then fail device-consistency
+    # validation downstream (e.g. `torch.onnx.export` in `wrapper_CUDA_cat`).
+    device = inputs.device
+    if (
+        device.type != "cuda"
+        or torch.jit.is_tracing()
+        or (
+            hasattr(torch.compiler, "is_compiling")
+            and torch.compiler.is_compiling()
+        )
+    ):
+        raise NotImplementedError
+
+    fwd_params = prepare_lstm_params(
+        fwd_kernel, fwd_recurrent_kernel, fwd_bias, device
+    )
+    bwd_params = prepare_lstm_params(
+        bwd_kernel, bwd_recurrent_kernel, bwd_bias, device
+    )
+
+    # torch._VF.lstm with bidirectional=True expects 4 params per direction,
+    # forward direction first, then backward.
+    params = fwd_params + bwd_params
+
+    # cuDNN expects (num_layers * num_directions, batch, hidden) for h0/c0.
+    h_0 = torch.stack([fwd_h0, bwd_h0], dim=0)
+    c_0 = torch.stack([fwd_c0, bwd_c0], dim=0)
+
+    try:
+        # ``torch._VF.lstm`` returns a flat ``(output, h_n, c_n)`` tuple.
+        outputs, h_n, c_n = torch._VF.lstm(
+            inputs,
+            (h_0, c_0),
+            params,
+            True,  # has_biases
+            1,  # num_layers
+            0.0,  # dropout
+            torch.is_grad_enabled(),  # training
+            True,  # bidirectional
+            True,  # batch_first
+        )
+    except (RuntimeError, TypeError, ValueError) as e:
+        raise NotImplementedError(
+            f"cuDNN bidirectional LSTM failed: {e}"
+        ) from e
+
+    # outputs: (batch, seq_len, 2 * hidden_size). First half is the forward
+    # direction, second half is the backward direction (in original time
+    # order, courtesy of cuDNN).
+    hidden_size = fwd_recurrent_kernel.shape[0]
+    y_fwd = outputs[..., :hidden_size]
+    y_bwd = outputs[..., hidden_size:]
+
+    fwd_h_n, bwd_h_n = h_n[0], h_n[1]
+    fwd_c_n, bwd_c_n = c_n[0], c_n[1]
+
+    # Forward "last" is the last timestep of the forward sweep; backward
+    # "last" is the first timestep in original time order (i.e., the result
+    # after the full reverse sweep).
+    fwd_last = y_fwd[:, -1]
+    bwd_last = y_bwd[:, 0]
+
+    if return_sequences:
+        fwd_outputs = y_fwd
+        bwd_outputs = y_bwd
+    else:
+        fwd_outputs = fwd_last.unsqueeze(1)
+        bwd_outputs = bwd_last.unsqueeze(1)
+
+    return (
+        (fwd_last, fwd_outputs, [fwd_h_n, fwd_c_n]),
+        (bwd_last, bwd_outputs, [bwd_h_n, bwd_c_n]),
+    )
diff --git a/keras/src/export/onnx_test.py b/keras/src/export/onnx_test.py
@@ -101,9 +101,14 @@ def test_standard_model_export(self, model_type):
         ort_inputs = {
             k.name: v for k, v in zip(ort_session.get_inputs(), [ref_input])
         }
+        # cuDNN-fused LSTM reference vs. unrolled ONNX graph differ by ~3e-6.
+        atol = 1e-5 if model_type == "lstm" else 1e-6
+        rtol = 1e-5 if model_type == "lstm" else 1e-6
         self.assertAllClose(
             ort_session.run(None, ort_inputs)[0],
             ref_output,
+            atol=atol,
+            rtol=rtol,
             tpu_atol=1e-3,
             tpu_rtol=1e-2,
         )