Source code for embedl_deploy._internal.lattice.patterns.conversions.general

# Copyright (C) 2026 Embedl AB

"""General-purpose Lattice conversion patterns.

These patterns rewrite standard Torch operations into the limited subset
accepted by Lattice hardware.
"""

import copy
import logging
import operator

import torch
from torch import fx, nn

from embedl_deploy._internal.core.patterns.main import (
    Pattern,
    PatternMatch,
    Phase,
)
from embedl_deploy._internal.core.tree.replace import get_auto_name
from embedl_deploy._internal.core.tree.state import get_replaced_nodes
from embedl_deploy._internal.core.tree.types import (
    Graft,
    NodeInserter,
    Tree,
    TreeMatch,
    Wildcard,
)
from embedl_deploy._internal.core.tree.utils import (
    get_input_shape,
    get_module,
    resolve_module,
)
from embedl_deploy._internal.lattice.modules.conv import (
    LatticeConv2d,
    LatticeConv2dAdvanced,
)

#: Module-level logger. The conversion helpers in this module use it to report
#: snapped convolution parameters and C2f branch splits.
_LOG = logging.getLogger(__name__)


def _is_conv2d(node: fx.Node) -> bool:
    """Tell whether `node` holds a ``Conv2d`` that is not yet converted.

    A module that is a ``LatticeConv2d`` is always rejected, so a node that
    has already been converted does not match again.
    """
    module = get_module(node)
    return isinstance(module, nn.Conv2d) and not isinstance(
        module, LatticeConv2d
    )


def _make_lattice_conv(tree_match: TreeMatch) -> tuple[nn.Module, ...]:
    """Create the ``LatticeConv2d`` that stands in for the matched conv.

    The source ``Conv2d`` is handed to the ``LatticeConv2d`` constructor. A
    warning is logged when the result differs from the source: a changed
    weight shape is reported as a snapped kernel size; otherwise a changed
    stride or padding is reported.

    :param tree_match:
        The match whose node ``0`` is the convolution to replace.
    :returns:
        A one-element tuple holding the replacement module.
    """
    matched = tree_match.get_node(0)
    source = resolve_module(matched, nn.Conv2d)
    replacement = LatticeConv2d(source)

    if replacement.weight.shape != source.weight.shape:
        # A different weight shape means the kernel itself was snapped; the
        # stride/padding message is not emitted in addition to this one.
        _LOG.warning(
            "%s: kernel_size snapped from %s to %s — "
            "weights reinitialized, output will differ; "
            "retraining recommended.",
            matched.name,
            source.kernel_size,
            replacement.kernel_size,
        )
        return (replacement,)

    geometry_changed = (
        replacement.stride != source.stride
        or replacement.padding != source.padding
    )
    if geometry_changed:
        _LOG.warning(
            "%s: stride/padding snapped from %s/%s to %s/%s — "
            "output will differ; retraining recommended.",
            matched.name,
            source.stride,
            source.padding,
            replacement.stride,
            replacement.padding,
        )
    return (replacement,)



[docs]
class LatticeConv2dPattern(Pattern):
    """Swap each plain ``Conv2d`` for a ``LatticeConv2d``.

    The pattern matches any :class:`~torch.nn.Conv2d` that is not already a
    :class:`~embedl_deploy._internal.lattice.modules.conv.LatticeConv2d` and
    replaces it with one. Lattice hardware is limited to ``1×1`` and ``3×3``
    convolutions with stride 1 or 2, and the ``1×1`` kernel must use stride
    1. The
    :class:`~embedl_deploy._internal.lattice.modules.conv.LatticeConv2d`
    constructor is responsible for snapping parameters to the nearest
    supported values.

    If only stride and/or padding had to be snapped, the weight tensor keeps
    its shape and weights and bias carry over. If the kernel size had to be
    snapped, the replacement starts from freshly initialized weights, because
    the source kernel cannot be meaningfully reused at the new shape.
    """

    # Snapping may change the computed output, so the rewrite is not flagged
    # as numerically equivalent.
    is_numerically_equivalent = False
    phase = Phase.CONVERSION
    # Single-node tree: one unconverted ``Conv2d``.
    tree: Tree = (_is_conv2d,)
    # Single-module graft: the ``LatticeConv2d`` built from the match.
    graft: Graft = (_make_lattice_conv,)



def _make_lattice_advanced_conv(
    tree_match: TreeMatch,
) -> tuple[nn.Module, ...]:
    """Create the ``LatticeConv2dAdvanced`` replacing the matched conv.

    The source ``Conv2d`` is handed to the ``LatticeConv2dAdvanced``
    constructor. A warning is logged when the result differs from the source:
    a changed weight shape is reported as a snapped kernel size; otherwise a
    changed stride or padding is reported.

    :param tree_match:
        The match whose node ``0`` is the convolution to replace.
    :returns:
        A one-element tuple holding the replacement module.
    """
    matched = tree_match.get_node(0)
    source = resolve_module(matched, nn.Conv2d)
    replacement = LatticeConv2dAdvanced(source)

    if replacement.weight.shape != source.weight.shape:
        # A different weight shape means the kernel itself was snapped; the
        # stride/padding message is not emitted in addition to this one.
        _LOG.warning(
            "%s: kernel_size snapped from %s to %s — "
            "weights reinitialized, output will differ; "
            "retraining recommended.",
            matched.name,
            source.kernel_size,
            replacement.kernel_size,
        )
        return (replacement,)

    geometry_changed = (
        replacement.stride != source.stride
        or replacement.padding != source.padding
    )
    if geometry_changed:
        _LOG.warning(
            "%s: stride/padding snapped from %s/%s to %s/%s — "
            "output will differ; retraining recommended.",
            matched.name,
            source.stride,
            source.padding,
            replacement.stride,
            replacement.padding,
        )
    return (replacement,)



[docs]
class LatticeConv2dAdvancedPattern(Pattern):
    """Swap each plain ``Conv2d`` for a ``LatticeConv2dAdvanced``.

    The pattern matches any :class:`~torch.nn.Conv2d` that is not already a
    :class:`~embedl_deploy._internal.lattice.modules.conv.LatticeConv2d` and
    replaces it with a
    :class:`~embedl_deploy._internal.lattice.modules.conv.LatticeConv2dAdvanced`.
    Lattice advanced hardware handles ``1×1``, ``3×3``, ``5×5``, and ``7×7``
    convolutions; stride 1 or 2 is allowed for ``3×3`` and stride 1 is
    required for every other kernel size. The constructor snaps the kernel to
    the nearest of ``{1, 3, 5, 7}``, picks the stride according to the snapped
    kernel size, and sets padding to ``kernel_size // 2``.

    If only stride and/or padding had to be snapped, weights and bias carry
    over. If the kernel size had to be snapped, the replacement starts from
    freshly initialized weights.
    """

    # Snapping may change the computed output, so the rewrite is not flagged
    # as numerically equivalent.
    is_numerically_equivalent = False
    phase = Phase.CONVERSION
    # Single-node tree: one unconverted ``Conv2d``.
    tree: Tree = (_is_conv2d,)
    # Single-module graft: the ``LatticeConv2dAdvanced`` built from the match.
    graft: Graft = (_make_lattice_advanced_conv,)



def _flatten_dims(node: fx.Node) -> tuple[int, int]:
    """Read the ``(start_dim, end_dim)`` pair a flatten `node` was given.

    For an ``nn.Flatten`` module the values come from the module itself.
    Otherwise they are taken from the node's keyword arguments, then from
    positional slots 1 and 2, and finally default to ``0`` and ``-1``.
    """
    module = get_module(node)
    if isinstance(module, nn.Flatten):
        return module.start_dim, module.end_dim

    positional = node.args
    keyword = node.kwargs
    # Positional slot 0 is the tensor being flattened.
    positional_start = positional[1] if len(positional) > 1 else 0
    positional_end = positional[2] if len(positional) > 2 else -1
    first = keyword.get("start_dim", positional_start)
    last = keyword.get("end_dim", positional_end)
    assert isinstance(first, int)
    assert isinstance(last, int)
    return first, last


def _is_convertible_flatten(node: fx.Node) -> bool:
    """Accept a flatten node only when the shape of its input is known.

    All three spellings are recognised: the ``torch.flatten`` function, the
    ``flatten`` tensor method, and (for any other op) an ``nn.Flatten``
    module.
    """
    if node.op == "call_function":
        if node.target is not torch.flatten:
            return False
    elif node.op == "call_method":
        if node.target != "flatten":
            return False
    elif not isinstance(get_module(node), nn.Flatten):
        return False
    # It is a flatten; it is convertible only with a recorded input shape.
    return get_input_shape(node) is not None


def _flatten_to_reshape(tree_match: TreeMatch) -> tuple[NodeInserter, ...]:
    """Build the inserter that emits a ``reshape`` in place of a flatten.

    The target shape keeps every dimension outside ``[start_dim, end_dim]``
    as a static integer taken from the flatten node's input shape and puts a
    single ``-1`` where the flattened span was.

    :param tree_match:
        The match whose node ``0`` is the flatten node.
    :returns:
        A one-element tuple holding the ``NodeInserter``.
    """
    matched = tree_match.get_node(0)
    input_shape = get_input_shape(matched)
    assert input_shape is not None
    first, last = _flatten_dims(matched)
    rank = len(input_shape)
    # Negative dims count from the end; make both absolute.
    first = first + rank if first < 0 else first
    last = last + rank if last < 0 else last
    # Dimensions outside the flattened span are baked in as static
    # values from the traced shape. Lattice targets a fixed batch
    # size of 1, so this is intentional.
    leading = [int(size) for size in input_shape[:first]]
    trailing = [int(size) for size in input_shape[last + 1 :]]
    target_shape: list[int] = leading + [-1] + trailing

    def _insert(
        graph_module: fx.GraphModule,
        prev_args: tuple[fx.Node, ...],
    ) -> list[fx.Node]:
        graph = graph_module.graph
        # Insert after whichever argument appears last in graph order.
        position = {n: idx for idx, n in enumerate(graph.nodes)}
        anchor = max(prev_args, key=position.__getitem__)
        with graph.inserting_after(anchor):
            reshaped = graph.call_method(
                "reshape", (prev_args[0], *target_shape)
            )
        return [reshaped]

    return (_insert,)



[docs]
class FlattenToReshapePattern(Pattern):
    """Rewrite a ``flatten`` as an equivalent static ``reshape``.

    Lattice has no stand-alone ``flatten`` op but does accept ``reshape`` to a
    fully-static target shape. Provided the flatten node's input shape is
    known (via :class:`~torch.fx.passes.shape_prop.ShapeProp`), every
    dimension outside ``[start_dim, end_dim]`` is kept verbatim and the
    flattened span becomes a single ``-1`` axis. The result is a
    statically-shaped ``reshape`` call equivalent to the original ``flatten``.

    All three flatten spellings are matched: the :class:`~torch.nn.Flatten`
    module, the ``torch.flatten`` function, and the ``Tensor.flatten`` method.

    .. note::

       The batch dimension from the traced shape (typically 1) is baked into
       the replacement ``reshape``. This is intentional: Lattice hardware
       targets a fixed batch size of 1, so the resulting model is not
       expected to handle variable batch sizes.
    """

    phase = Phase.CONVERSION
    # Single-node tree: a flatten whose input shape is known.
    tree: Tree = (_is_convertible_flatten,)
    # Single-inserter graft: emits the ``reshape`` call.
    graft: Graft = (_flatten_to_reshape,)



def _insert_module_chain(
    graph_module: fx.GraphModule,
    modules: list[nn.Module],
    input_node: fx.Node,
) -> list[fx.Node]:
    """Register `modules` and wire them as a ``call_module`` chain.

    Each module is added to `graph_module` under a name from
    ``get_auto_name`` and called on the output of the previous link; the
    first link is fed from `input_node`.

    :returns:
        The created nodes, in chain order.
    """
    graph = graph_module.graph
    chain: list[fx.Node] = []
    tail = input_node
    for submodule in modules:
        attr_name = get_auto_name(graph_module, submodule)
        graph_module.add_module(attr_name, submodule)
        # Each link is placed directly after the node it consumes.
        with graph.inserting_after(tail):
            tail = graph.call_module(attr_name, (tail,))
        chain.append(tail)
    return chain


def _split_batchnorm(
    bn: nn.BatchNorm2d,
    half: int,
) -> tuple[nn.BatchNorm2d, nn.BatchNorm2d]:
    """Split a ``BatchNorm2d`` into two halves along the feature dimension.

    All learnable parameters (``weight``, ``bias``) and running statistics
    (``running_mean``, ``running_var``) are sliced at the midpoint.
    ``num_batches_tracked`` is copied as-is to both halves. Both halves are
    moved to the device/dtype of the source and put in the same
    training/eval mode.

    :param bn:
        The original batch-norm to split.
    :param half:
        Number of features for each half; must be exactly half of
        ``bn.num_features``.
    :returns:
        A pair ``(bn_a, bn_b)`` of ``BatchNorm2d`` modules holding the first
        and second half of the features respectively.
    :raises ValueError:
        If ``bn.num_features`` is not ``2 * half``.
    """
    if bn.num_features != 2 * half:
        # ``Tensor.copy_`` broadcasts its source, so a mis-sized slice (for
        # example a single trailing feature) could otherwise be copied into
        # a half without any error.
        raise ValueError(
            f"cannot split BatchNorm2d with {bn.num_features} features "
            f"into two halves of {half}"
        )

    # Tensor whose device/dtype the halves adopt before values are copied.
    ref = bn.weight if bn.affine else bn.running_mean

    def _make_half(span: slice) -> nn.BatchNorm2d:
        """Build one half holding the features selected by `span`."""
        part = nn.BatchNorm2d(
            half,
            eps=bn.eps,
            momentum=bn.momentum,
            affine=bn.affine,
            track_running_stats=bn.track_running_stats,
        )
        if ref is not None:
            part.to(device=ref.device, dtype=ref.dtype)
        with torch.no_grad():
            if bn.affine:
                # The asserts only narrow ``Optional`` types.
                assert bn.weight is not None
                assert bn.bias is not None
                assert part.weight is not None
                assert part.bias is not None
                part.weight.copy_(bn.weight[span])
                part.bias.copy_(bn.bias[span])
            if bn.track_running_stats:
                mean = bn.running_mean
                var = bn.running_var
                tracked = bn.num_batches_tracked
                if mean is not None and var is not None:
                    assert part.running_mean is not None
                    assert part.running_var is not None
                    part.running_mean.copy_(mean[span])
                    part.running_var.copy_(var[span])
                if tracked is not None:
                    assert part.num_batches_tracked is not None
                    part.num_batches_tracked.copy_(tracked)
        part.train(bn.training)
        return part

    return _make_half(slice(None, half)), _make_half(slice(half, None))


def _parse_chunk_args(
    node: fx.Node,
) -> tuple[int, int] | None:
    """Extract ``(chunks, dim)`` from a chunk node, or ``None``."""
    if (node.op == "call_method" and node.target == "chunk") or (
        node.op == "call_function" and node.target is torch.chunk
    ):
        chunks = node.kwargs.get(
            "chunks",
            node.args[1] if len(node.args) > 1 else None,
        )
        dim = node.kwargs.get(
            "dim",
            node.args[2] if len(node.args) > 2 else 0,
        )
        if isinstance(chunks, int) and isinstance(dim, int):
            return (chunks, dim)
    return None


def _has_only_getitem_users(node: fx.Node) -> bool:
    """Check that `node` is consumed by exactly ``getitem`` 0 and 1.

    The node must have two users, both of them ``operator.getitem`` calls
    with integer indices, and between them the indices must be ``0`` and
    ``1``.
    """
    consumers = list(node.users)
    if len(consumers) != 2:
        return False

    seen: set[int] = set()
    for consumer in consumers:
        if consumer.op != "call_function":
            return False
        if consumer.target is not operator.getitem:
            return False
        index = consumer.args[1]
        if not isinstance(index, int):
            return False
        seen.add(index)
    return seen == {0, 1}


def _is_c2f_conv(node: fx.Node) -> bool:
    """Match a ``Conv2d`` that C2f chunk splitting can handle.

    The node is rejected when its module is a ``LatticeConv2d`` (already
    converted) or not a ``Conv2d`` at all, when the output-channel count is
    odd (no even split), when the convolution is grouped (contiguous channel
    slicing is only valid for ``groups=1``), or when the output does not feed
    exactly one consumer (the replacement erases the original node).
    """
    module = get_module(node)
    if isinstance(module, LatticeConv2d) or not isinstance(module, nn.Conv2d):
        return False
    if module.out_channels % 2 != 0:
        return False
    if module.groups != 1:
        return False
    return len(node.users) == 1


def _is_single_user_bn_or_act(node: fx.Node) -> bool:
    """Match a ``BatchNorm2d`` or activation module with exactly one user.

    Accepted activations are ``SiLU``, ``ReLU``, ``ReLU6``, ``LeakyReLU``,
    ``GELU`` and ``Mish``.
    """
    accepted = (
        nn.BatchNorm2d,
        nn.SiLU,
        nn.ReLU,
        nn.ReLU6,
        nn.LeakyReLU,
        nn.GELU,
        nn.Mish,
    )
    # The user count is checked first; the module is only looked up for
    # single-user nodes.
    return len(node.users) == 1 and isinstance(get_module(node), accepted)


#: Wildcard tree element for the nodes that sit between the ``Conv2d`` and the
#: ``chunk`` in the C2f pattern; each matched node must satisfy
#: :func:`_is_single_user_bn_or_act`. NOTE(review): ``quantifier="*"``
#: presumably means "zero or more" — confirm against ``Wildcard``.
_C2F_INTERMEDIATE = Wildcard(_is_single_user_bn_or_act, quantifier="*")


def _is_c2f_chunk(node: fx.Node) -> bool:
    """Match ``chunk(2, dim=1)`` consumed only by ``getitem`` 0 and 1."""
    # A non-chunk node parses to ``None``, which also fails this comparison.
    if _parse_chunk_args(node) != (2, 1):
        return False
    return _has_only_getitem_users(node)


def _build_split_branches(
    conv: nn.Conv2d,
    inter_mods: list[nn.Module],
    half: int,
) -> tuple[list[nn.Module], list[nn.Module]]:
    """Build the two parallel module chains (conv plus intermediates).

    Each chain starts with a convolution that has `half` output channels and
    otherwise the same hyper-parameters as `conv`; the first chain receives
    weight/bias rows ``[:half]`` and the second rows ``[half:]``. Every
    ``BatchNorm2d`` in `inter_mods` is split between the chains and every
    other module is deep-copied into each.

    :param conv:
        The convolution whose output channels are divided.
    :param inter_mods:
        Modules that follow the convolution, in order.
    :param half:
        Number of output channels per branch.
    :returns:
        The pair ``(modules_a, modules_b)`` of module lists.
    """

    def _half_conv(rows: slice) -> nn.Conv2d:
        """Create a conv producing the output channels selected by `rows`."""
        part = nn.Conv2d(
            conv.in_channels,
            half,
            kernel_size=conv.kernel_size,  # type: ignore[arg-type]
            stride=conv.stride,  # type: ignore[arg-type]
            padding=conv.padding,  # type: ignore[arg-type]
            dilation=conv.dilation,  # type: ignore[arg-type]
            groups=conv.groups,
            bias=conv.bias is not None,
            padding_mode=conv.padding_mode,
        )
        # Match the source device/dtype before copying the weights across.
        part.to(device=conv.weight.device, dtype=conv.weight.dtype)
        with torch.no_grad():
            part.weight.copy_(conv.weight[rows])
            if conv.bias is not None:
                assert part.bias is not None
                part.bias.copy_(conv.bias[rows])
        return part

    branch_a: list[nn.Module] = [_half_conv(slice(None, half))]
    branch_b: list[nn.Module] = [_half_conv(slice(half, None))]
    for module in inter_mods:
        if isinstance(module, nn.BatchNorm2d):
            for_a, for_b = _split_batchnorm(module, half)
        else:
            for_a, for_b = copy.deepcopy(module), copy.deepcopy(module)
        branch_a.append(for_a)
        branch_b.append(for_b)
    return branch_a, branch_b


#: ``tree_match.meta`` key under which the C2f inserter records the
#: ``(branch_a_output, branch_b_output)`` nodes for the ``replace`` override.
#: Written by the inserter returned from :func:`_make_c2f_branches` and read
#: back by :func:`_rewire_c2f_getitems`.
_C2F_BRANCH_OUTPUTS = "c2f_branch_outputs"


def _make_c2f_branches(tree_match: TreeMatch) -> tuple[NodeInserter]:
    """Build the ``NodeInserter`` that grows the two C2f branches.

    Each branch is an independent chain made of a convolution followed by the
    matched intermediate modules, with the convolution weights and bias sliced
    at the channel midpoint. The inserter grows the second-half branch first
    so that the first-half branch ends up ahead of it in graph order, matching
    the channel order of the original ``chunk`` output. It stores the two
    branch outputs in ``tree_match.meta`` under :data:`_C2F_BRANCH_OUTPUTS`.
    Rewiring the ``chunk``'s ``getitem`` consumers onto them is left to
    :meth:`~embedl_deploy._internal.lattice.patterns.conversions.general.LatticeC2fChunkPattern.replace`.
    """
    replaced = get_replaced_nodes()

    def _current(node: fx.Node) -> fx.Node:
        """Follow `node` to its replacement when one has been recorded."""
        return replaced.get(node, node)

    conv_node = _current(tree_match.get_node(0))
    chain_nodes = [
        _current(n) for n in tree_match.get_node(1, is_wildcard=True).nodes
    ]
    conv = resolve_module(conv_node, nn.Conv2d)
    chain_modules = [resolve_module(n) for n in chain_nodes]
    half = conv.out_channels // 2
    first_half, second_half = _build_split_branches(
        conv, chain_modules, half
    )

    chain_desc = " → ".join(
        ["Conv2d", *(type(m).__name__ for m in chain_modules)]
    )
    _LOG.info(
        "%s: %s + chunk(2) replaced with two parallel %s branches "
        "(%d→%d ch each) — weights sliced from original; "
        "output is mathematically equivalent.",
        conv_node.name,
        chain_desc,
        chain_desc,
        conv.out_channels,
        half,
    )

    def _insert(
        graph_module: fx.GraphModule,
        prev_args: tuple[fx.Node, ...],
    ) -> list[fx.Node]:
        (source,) = prev_args
        # Both chains are inserted right after ``source``, so the one grown
        # last (the first half) lands first in graph order.
        second_nodes = _insert_module_chain(graph_module, second_half, source)
        first_nodes = _insert_module_chain(graph_module, first_half, source)
        tree_match.meta[_C2F_BRANCH_OUTPUTS] = (
            first_nodes[-1],
            second_nodes[-1],
        )
        return [*first_nodes, *second_nodes]

    return (_insert,)


def _rewire_c2f_getitems(pattern_match: PatternMatch) -> None:
    """Point the ``chunk``'s ``getitem`` consumers at the two branches.

    The branch outputs are read from ``tree_match.meta`` under
    :data:`_C2F_BRANCH_OUTPUTS`; they are ordered so that entry ``i`` feeds
    ``getitem(i)``. Because the inserter returns the last branch as the
    replacement output, ``replace_tree`` leaves every ``chunk`` ``getitem``
    attached to the last entry. Each such ``getitem`` has its uses redirected
    to the entry selected by its index and is then erased.
    """
    branch_outputs = pattern_match.tree_match.meta[_C2F_BRANCH_OUTPUTS]
    parked_on = branch_outputs[-1]
    # Snapshot the users up front: the loop below mutates the graph.
    pending = [
        user
        for user in parked_on.users
        if user.op == "call_function" and user.target is operator.getitem
    ]
    for getitem in pending:
        getitem.replace_all_uses_with(branch_outputs[getitem.args[1]])
        getitem.graph.erase_node(getitem)



[docs]
class LatticeC2fChunkPattern(Pattern):
    """Replace a C2f ``chunk`` with two parallel branches.

    The C2f (CSP Bottleneck with 2 convolutions - Fast) module in YOLOv8 uses
    ``chunk`` to divide the output of a ``Conv2d → BatchNorm2d → Activation``
    chain into two channel halves. Lattice hardware does not support the ONNX
    ``Split`` operator that ``chunk`` decomposes to, so the convolution chain
    followed by a channel split is replaced with two independent parallel
    branches, each producing half the output channels.

    The replacement handles the full chain between the ``Conv2d`` and the
    ``chunk``:

    *  **Conv2d** — weights and bias are sliced at the channel
       midpoint; each branch gets half the output filters.
    *  **BatchNorm2d** — ``weight``, ``bias``, ``running_mean``, and
       ``running_var`` are split; ``num_batches_tracked`` is copied.
    *  **Activation** (``SiLU``, ``ReLU``, etc.) — deep-copied for
       each branch.

    All downstream ``getitem`` consumers of the ``chunk`` output are rewired to
    use the appropriate branch's output directly.

    This pattern requires that the convolution has an even number of output
    channels, that exactly two ``getitem`` consumers (indices 0 and 1) exist on
    the ``chunk`` node, and that every node in the chain from ``Conv2d`` to
    ``chunk`` has exactly one user.
    """

    phase = Phase.CONVERSION
    # Conv, then the wildcard run of BatchNorm/activation nodes, then the
    # ``chunk(2, dim=1)`` node.
    tree: Tree = (_is_c2f_conv, _C2F_INTERMEDIATE, _is_c2f_chunk)
    # One inserter grows both branches.
    graft: Graft = (_make_c2f_branches,)

    @classmethod
    def replace(cls, pattern_match: PatternMatch) -> list[fx.Node]:
        """Rewire the ``chunk``'s ``getitem`` consumers onto the branches.

        ``graft`` builds the two parallel branches (recording their outputs in
        ``tree_match.meta``) and ``replace_tree`` erases the matched ``Conv → …
        → chunk`` chain, pointing every ``getitem`` at the second-half branch
        output. This override then redirects ``getitem(0)`` to the first-half
        branch and ``getitem(1)`` to the second-half branch and drops the
        now-redundant ``getitem`` nodes — the two-output fan-out that the
        linear ``replace_tree`` contract cannot express on its own.

        :param pattern_match:
            The match to replace; its ``tree_match.meta`` carries the branch
            outputs once the base implementation has run.
        :returns:
            The node list returned by the base ``replace``, unchanged.
        """
        # The base implementation must run first: it is what creates the
        # branches that the rewiring step reads back.
        nodes = super().replace(pattern_match)
        _rewire_c2f_getitems(pattern_match)
        return nodes