# Source code for embedl_deploy._internal.tensorrt.patterns.fusions

# Copyright (C) 2026 Embedl AB

"""Fusion ``Pattern`` subclasses for TensorRT.

Each class declares a ``tree`` (what to match) and a ``graft`` (what to replace
with). The base ``Pattern`` class handles matching and replacement.
"""

import operator

from torch import fx, nn

from embedl_deploy._internal.core.modules import ActivationLike
from embedl_deploy._internal.core.patterns.main import Pattern, Phase
from embedl_deploy._internal.core.tree.match import node_check
from embedl_deploy._internal.core.tree.replace import get_auto_name, make_fused
from embedl_deploy._internal.core.tree.types import (
    Fork,
    Graft,
    NodeInserter,
    Tree,
    TreeMatch,
    Wildcard,
)
from embedl_deploy._internal.core.tree.utils import get_module
from embedl_deploy._internal.tensorrt.modules.attention import (
    FusedMHAInProjection,
    FusedScaledDotProductAttention,
    MHAInProjection,
    ScaledDotProductAttention,
)
from embedl_deploy._internal.tensorrt.modules.conv import (
    FusedConvBN,
    FusedConvBNAct,
    FusedConvBNActMaxPool,
    FusedConvBNAddAct,
    FusedConvBNSigmoidMul,
)
from embedl_deploy._internal.tensorrt.modules.linear import (
    FusedLayerNorm,
    FusedLinear,
    FusedLinearAct,
)
from embedl_deploy._internal.tensorrt.modules.pointwise import (
    FusedActAdd,
    FusedMulAdd,
)
from embedl_deploy._internal.tensorrt.modules.pool import (
    FusedAdaptiveAvgPool2d,
)
from embedl_deploy._internal.tensorrt.modules.swin_attention import (
    FusedSwinAttention,
    SwinAttention,
)


@node_check
def _is_add(node: fx.Node) -> bool:
    """Accept ``call_function`` nodes that target an addition operator.

    Both the plain (``operator.add``) and the in-place (``operator.iadd``)
    variants count as an add.
    """
    if node.op != "call_function":
        return False
    return node.target in (operator.add, operator.iadd)


@node_check
def _is_mul(node: fx.Node) -> bool:
    """Accept ``call_function`` nodes that target a multiplication operator.

    Both the plain (``operator.mul``) and the in-place (``operator.imul``)
    variants count as a mul.
    """
    if node.op != "call_function":
        return False
    return node.target in (operator.mul, operator.imul)


def _is_stem_conv(node: fx.Node) -> bool:
    """Tell whether *node* resolves to a 7×7 ``Conv2d`` with 3 input channels.

    The module behind the node is looked up with ``get_module``; anything that
    is not an ``nn.Conv2d`` is rejected before its attributes are inspected.
    """
    conv = get_module(node)
    if not isinstance(conv, nn.Conv2d):
        return False
    return conv.in_channels == 3 and conv.kernel_size == (7, 7)


#: Wildcard entry for an optional ``BatchNorm2d`` between convolution and
#: activation (or end of chain). Shared by every Conv-based pattern in this
#: module so that chains with and without batch normalization are declared
#: once. NOTE(review): ``quantifier="?"`` is read here as "zero or one
#: occurrence" — confirm against the ``Wildcard`` documentation.
_OPTIONAL_BN = Wildcard(nn.BatchNorm2d, quantifier="?")



[docs]
class ConvBNActPattern(Pattern):
    """Match ``Conv2d → [BatchNorm2d] → Activation`` and fuse.

    Any activation included in
    :data:`~embedl_deploy._internal.core.modules.ActivationLike` is accepted.
    The ``BatchNorm2d`` is optional.

    The replacement is described by a one-entry graft built with
    ``make_fused(FusedConvBNAct)``.
    """

    # All patterns in this module are registered for the fusion phase.
    phase = Phase.FUSION
    # Linear chain: convolution, optional batch norm, then any activation.
    tree: Tree = (nn.Conv2d, _OPTIONAL_BN, ActivationLike)
    # One-element tuple: the trailing comma is required.
    graft: Graft = (make_fused(FusedConvBNAct),)




[docs]
class ConvBNPattern(Pattern):
    """Match ``Conv2d → [BatchNorm2d]`` (no activation) and fuse.

    The ``BatchNorm2d`` is optional, so the tree also accepts a bare
    ``Conv2d``. The replacement is built with ``make_fused(FusedConvBN)``.
    """

    phase = Phase.FUSION
    # Linear chain: convolution followed by an optional batch norm.
    tree: Tree = (nn.Conv2d, _OPTIONAL_BN)
    # One-element tuple: the trailing comma is required.
    graft: Graft = (make_fused(FusedConvBN),)




[docs]
class StemConvBNActMaxPoolPattern(Pattern):
    """Match ``Conv2d(3in, 7×7) → [BatchNorm2d] → Activation → MaxPool2d``.

    Captures the common classification-network stem. The convolution is
    constrained to ``in_channels == 3, kernel_size == (7, 7)`` so only the
    actual stem is matched, not arbitrary ``Conv→Act→Pool`` chains. The
    ``BatchNorm2d`` is optional.

    The replacement is built with ``make_fused(FusedConvBNActMaxPool)``.
    """

    phase = Phase.FUSION
    tree: Tree = (
        # Predicate instead of a module class: restricts the conv to the stem.
        _is_stem_conv,
        _OPTIONAL_BN,
        ActivationLike,
        nn.MaxPool2d,
    )
    graft: Graft = (make_fused(FusedConvBNActMaxPool),)




[docs]
class ConvBNAddActPattern(Pattern):
    """Match ``Conv2d → [BN] → add(·, residual) → [Activation]``.

    Captures the tail of ResNet-style bottleneck blocks where the convolution
    path is element-wise added to a skip connection before the final
    activation. Both the ``BatchNorm2d`` and the activation are optional so
    that EfficientNet-style ``Conv → BN → Add`` blocks (no trailing activation)
    and bare ``Conv → Add`` chains are also captured.

    The replacement is built with ``make_fused(FusedConvBNAddAct)``.
    """

    phase = Phase.FUSION
    # ``Fork`` arguments, in order: the incoming branches, the check for the
    # node joining them, and the chain that follows the join.
    tree: Tree = Fork(
        (
            # Convolution branch, with an optional batch norm.
            (nn.Conv2d, _OPTIONAL_BN),
            # Empty branch: nothing is required of the other add operand
            # (presumably the residual input — confirm against ``Fork``).
            (),
        ),
        # Join node: ``operator.add`` or ``operator.iadd``.
        _is_add,
        # Optional trailing activation after the add.
        (Wildcard(ActivationLike, quantifier="?"),),
    )
    graft: Graft = (make_fused(FusedConvBNAddAct),)



class ConvBNSigmoidMulPattern(Pattern):
    """Match ``Conv2d → [BatchNorm2d] → Sigmoid → mul(·, skip)``.

    Captures the SE gate pattern: an expand convolution followed by Sigmoid
    attention, element-wise multiplied with a skip connection. The fused module
    places an internal Q/DQ between the conv output and the Sigmoid, enabling
    TensorRT's ``PWN(Sigmoid, Mul)`` fusion.

    The replacement is built with ``make_fused(FusedConvBNSigmoidMul)``.
    """

    phase = Phase.FUSION
    # ``Fork`` arguments, in order: the incoming branches, the check for the
    # node joining them, and the chain that follows the join.
    tree: Tree = Fork(
        (
            # Gate branch: convolution, optional batch norm, then Sigmoid.
            (nn.Conv2d, _OPTIONAL_BN, nn.Sigmoid),
            # Empty branch: nothing is required of the other mul operand
            # (presumably the skip input — confirm against ``Fork``).
            (),
        ),
        # Join node: ``operator.mul`` or ``operator.imul``.
        _is_mul,
        # Nothing is matched after the multiply.
        (),
    )
    graft: Graft = (make_fused(FusedConvBNSigmoidMul),)


def _is_dynamic_value(value: object) -> bool:
    """Tell whether *value* is an FX node other than a ``get_attr`` node.

    Anything that is not an ``fx.Node`` (for example a Python literal used as
    an operand) is not dynamic, and neither is a ``get_attr`` node.
    """
    if not isinstance(value, fx.Node):
        return False
    return value.op != "get_attr"


def _is_layer_scale_mul(node: fx.Node) -> bool:
    """Tell whether *node* is a mul mixing dynamic and constant operands.

    The node must pass ``_is_mul``, and its positional arguments must contain
    at least one dynamic value and at least one non-dynamic value (as judged
    by ``_is_dynamic_value``).
    """
    if not _is_mul(node):
        return False
    operands = node.args
    dynamic_count = sum(1 for operand in operands if _is_dynamic_value(operand))
    # Strictly between "none dynamic" and "all dynamic".
    return 0 < dynamic_count < len(operands)


@node_check
def _is_dynamic_add(node: fx.Node) -> bool:
    """Accept an add node whose positional operands are all dynamic values.

    The node must pass ``_is_add`` and every entry of ``node.args`` must pass
    ``_is_dynamic_value``.
    """
    return _is_add(node) and all(map(_is_dynamic_value, node.args))


def _make_mul_add(tree_match: TreeMatch) -> tuple[NodeInserter]:
    """Create the graft entry that inserts a ``FusedMulAdd`` call.

    The inserted node receives the matched mul node's arguments, in their
    original order, followed by whatever incoming nodes remain (the addend).
    Mul operands that are not ``fx.Node`` instances are captured at match time
    because the tree engine only carries ``fx.Node`` inputs through
    ``prev_args``.

    Returns a one-element tuple holding the inserter callable.
    """
    matched_mul = tree_match.get_node(0, 0)
    assert not matched_mul.kwargs
    # ``None`` marks an operand position to be filled from ``prev_args``;
    # every other entry is a non-node operand kept as is.
    slots = tuple(
        arg if not isinstance(arg, fx.Node) else None
        for arg in matched_mul.args
    )

    def _insert(
        graph_module: fx.GraphModule,
        prev_args: tuple[fx.Node, ...],
    ) -> list[fx.Node]:
        # Consume incoming nodes in order for the open slots; the leftovers
        # are appended after the mul operands.
        incoming = iter(prev_args)
        mul_operands = tuple(
            slot if slot is not None else next(incoming) for slot in slots
        )
        call_args = mul_operands + tuple(incoming)

        # Register a fresh fused module under an automatically chosen name.
        module = FusedMulAdd()
        module_name = get_auto_name(graph_module, module)
        graph_module.add_module(module_name, module)

        # Insert right after whichever input appears last in the graph, so
        # the new call comes after all of its arguments.
        graph = graph_module.graph
        position = {node: index for index, node in enumerate(graph.nodes)}
        anchor = max(prev_args, key=lambda node: position[node])
        with graph.inserting_after(anchor):
            fused_node = graph.call_module(module_name, call_args)
        return [fused_node]

    return (_insert,)


class MulAddPattern(Pattern):
    """Match a layer-scale ``mul`` feeding a dynamic residual ``add``.

    Captures the ConvNeXt layer-scale tail: a runtime tensor scaled by a
    constant (a weight or a scalar literal), then added to a residual whose
    other operand is a runtime tensor. Fully dynamic multiplies (e.g. SE-style
    gating) are left for the Conv-based fork patterns, and adds whose other
    operand is a weight (``get_attr``) are bias-like and left alone — the
    accuracy rationale only applies to layer-scale residual adds. The fused
    module keeps the operation in floating point (see
    :class:`~embedl_deploy._internal.tensorrt.modules.pointwise.FusedMulAdd`).
    """

    phase = Phase.FUSION
    # ``Fork`` arguments, in order: the incoming branches, the check for the
    # node joining them, and the chain that follows the join.
    tree: Tree = Fork(
        (
            # Scaling branch: a mul with both dynamic and constant operands.
            (_is_layer_scale_mul,),
            # Empty branch: no nodes are matched on the other add operand.
            (),
        ),
        # Join node: an add whose operands are all dynamic values.
        _is_dynamic_add,
        # Nothing is matched after the add.
        (),
    )
    # Custom graft builder rather than ``make_fused``: it also captures the
    # mul operands that are not ``fx.Node`` instances.
    graft: Graft = (_make_mul_add,)



[docs]
class ActAddPattern(Pattern):
    """Match ``Act → add(·, residual)`` and fuse into ``FusedActAdd``.

    Placing this pattern before the Conv-based fusion patterns prevents
    TensorRT from attempting to merge the upstream convolution into an
    activation-fused kernel when the activation output feeds a residual add.
    With ``Act → add`` absorbed into a single pointwise leaf, the upstream
    ``Conv → BN`` is matched by
    :class:`~embedl_deploy._internal.tensorrt.patterns.fusions.ConvBNPattern`
    and quantized independently.
    """

    phase = Phase.FUSION
    # ``Fork`` arguments, in order: the incoming branches, the check for the
    # node joining them, and the chain that follows the join.
    tree: Tree = Fork(
        (
            # Activation branch: any module accepted by ``ActivationLike``.
            (ActivationLike,),
            # Empty branch: nothing is required of the other add operand
            # (presumably the residual input — confirm against ``Fork``).
            (),
        ),
        # Join node: ``operator.add`` or ``operator.iadd``.
        _is_add,
        # Nothing is matched after the add.
        (),
    )
    graft: Graft = (make_fused(FusedActAdd),)




[docs]
class LinearActPattern(Pattern):
    """Match ``Linear → Activation`` and fuse.

    Any activation included in
    :data:`~embedl_deploy._internal.core.modules.ActivationLike` is accepted.
    The replacement is built with ``make_fused(FusedLinearAct)``.
    """

    phase = Phase.FUSION
    # Linear chain: a linear layer followed by any activation.
    tree: Tree = (nn.Linear, ActivationLike)
    # One-element tuple: the trailing comma is required.
    graft: Graft = (make_fused(FusedLinearAct),)




[docs]
class LinearPattern(Pattern):
    """Match a standalone ``Linear`` and wrap in a fused module.

    The replacement is built with ``make_fused(FusedLinear)``.
    """

    phase = Phase.FUSION
    # Single-node tree: just the linear layer.
    tree: Tree = (nn.Linear,)
    graft: Graft = (make_fused(FusedLinear),)




[docs]
class LayerNormPattern(Pattern):
    """Match ``LayerNorm`` and wrap in a fused module.

    The replacement is built with ``make_fused(FusedLayerNorm)``.
    """

    phase = Phase.FUSION
    # Single-node tree: just the layer norm.
    tree: Tree = (nn.LayerNorm,)
    graft: Graft = (make_fused(FusedLayerNorm),)




[docs]
class MHAInProjectionPattern(Pattern):
    """Match ``MHAInProjection`` and wrap in a fused module.

    The replacement is built with ``make_fused(FusedMHAInProjection)``.
    """

    phase = Phase.FUSION
    # Single-node tree: the project's own ``MHAInProjection`` module.
    tree: Tree = (MHAInProjection,)
    graft: Graft = (make_fused(FusedMHAInProjection),)




[docs]
class ScaledDotProductAttentionPattern(Pattern):
    """Match ``ScaledDotProductAttention`` and wrap in a fused module.

    The replacement is built with
    ``make_fused(FusedScaledDotProductAttention)``.
    """

    phase = Phase.FUSION
    # Single-node tree: the project's own ``ScaledDotProductAttention`` module.
    tree: Tree = (ScaledDotProductAttention,)
    graft: Graft = (make_fused(FusedScaledDotProductAttention),)



class SwinAttentionPattern(Pattern):
    """Match ``SwinAttention`` and wrap in a fused module.

    The replacement is built with ``make_fused(FusedSwinAttention)``.
    """

    phase = Phase.FUSION
    # Single-node tree: the project's own ``SwinAttention`` module.
    tree: Tree = (SwinAttention,)
    graft: Graft = (make_fused(FusedSwinAttention),)



[docs]
class AdaptiveAvgPoolPattern(Pattern):
    """Match ``AdaptiveAvgPool2d`` and wrap in a fused module.

    Although there is nothing to *fuse*, wrapping the pool in a recognized
    module allows the Q/DQ insertion pass to place quantize / dequantize stubs
    around it.

    The replacement is built with ``make_fused(FusedAdaptiveAvgPool2d)``.
    """

    phase = Phase.FUSION
    # Single-node tree: just the adaptive average pool.
    tree: Tree = (nn.AdaptiveAvgPool2d,)
    graft: Graft = (make_fused(FusedAdaptiveAvgPool2d),)