

import collections
import functools
import unittest

import caffe2.python._import_c_extension as C
import caffe2.python.hip_test_util as hiputl
import caffe2.python.hypothesis_test_util as hu
import caffe2.python.serialized_test.serialized_test_util as serial
import hypothesis.strategies as st
import numpy as np
from caffe2.proto import caffe2_pb2
from caffe2.python import brew, core, utils, workspace
from caffe2.python.model_helper import ModelHelper
from hypothesis import assume, given, settings


def _cudnn_supports(dilation=False, nhwc=False, backward=False):
    """Return True if cuDNN supports this configuration."""
    v = workspace.GetCuDNNVersion()
    if backward:
        if nhwc:
            # nhwc isn't supported in backward ops.
            return False
    else:
        # Forward mode.
        if dilation and v < 6000:
            # Dilation not supported until v6
            return False
        if dilation and nhwc:
            # Dilation and NHWC not supported together
            return False
    return True


def _cudnn_convolution_algo_count(direction):
    """Return a hypothesis strategy over forced cuDNN algorithm indices.

    Falls back to -1 (i.e. let cuDNN choose) when the C extension does not
    expose the algorithm counts, e.g. in builds without cuDNN.
    """
    try:
        if direction == "fwd":
            return st.integers(0, C.cudnn_convolution_fwd_algo_count - 1)
        elif direction == "dgrad":
            return st.integers(0, C.cudnn_convolution_bwd_data_algo_count - 1)
        elif direction == "wgrad":
            return st.integers(0, C.cudnn_convolution_bwd_filter_algo_count - 1)
        else:
            assert False, "unknown direction: {}".format(direction)
    except Exception:
        return st.sampled_from([-1])


class TestConvolution(serial.SerializedTestCase):
    # cuDNN does not support different padding values per side (pad_t / pad_l /
    # pad_b / pad_r), so the CUDNN engine is skipped in this test.
    @given(
        op_type=st.sampled_from(["Conv", "Conv2D"]),
        stride_h=st.integers(1, 3),
        stride_w=st.integers(1, 3),
        pad_t=st.integers(0, 3),
        pad_l=st.integers(0, 3),
        pad_b=st.integers(0, 3),
        pad_r=st.integers(0, 3),
        kernel=st.integers(3, 5),
        size=st.integers(1, 8),
        input_channels=st.integers(1, 3),
        output_channels=st.integers(1, 3),
        batch_size=st.integers(0, 3),
        group=st.integers(1, 2),
        order=st.sampled_from(["NCHW", "NHWC"]),
        engine=st.sampled_from(["", "EIGEN"]),
        shared_buffer=st.booleans(),
        use_bias=st.booleans(),
        **hu.gcs
    )
    @settings(deadline=None, max_examples=50)
    def test_convolution_separate_stride_pad_gradients(
        self,
        op_type,
        stride_h,
        stride_w,
        pad_t,
        pad_l,
        pad_b,
        pad_r,
        kernel,
        size,
        input_channels,
        output_channels,
        batch_size,
        group,
        order,
        engine,
        shared_buffer,
        use_bias,
        gc,
        dc,
    ):
        # TODO: Group conv in NHWC not implemented for GPU yet.
        assume(group == 1 or order == "NCHW" or gc.device_type == caffe2_pb2.CPU)
        if group != 1 and order == "NHWC":
            dc = [d for d in dc if d.device_type == caffe2_pb2.CPU]
        # Group conv not implemented with EIGEN engine.
        assume(group == 1 or engine != "EIGEN")

        # Scale the channel counts by the group size so that each group keeps
        # the sampled number of channels per group.
        input_channels *= group
        output_channels *= group

        op = core.CreateOperator(
            op_type,
            ["X", "w", "b"] if use_bias else ["X", "w"],
            ["Y"],
            stride_h=stride_h,
            stride_w=stride_w,
            pad_t=pad_t,
            pad_l=pad_l,
            pad_b=pad_b,
            pad_r=pad_r,
            kernel=kernel,
            group=group,
            order=order,
            engine=engine,
            shared_buffer=int(shared_buffer),
        )
        X = (
            np.random.rand(batch_size, size, size, input_channels).astype(np.float32)
            - 0.5
        )
        w = (
            np.random.rand(
                output_channels, kernel, kernel, int(input_channels / group)
            ).astype(np.float32)
            - 0.5
        )
        b = np.random.rand(output_channels).astype(np.float32) - 0.5
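        # X and w are generated in NHWC layout and transposed below when the
        # NCHW order is requested.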
        if order == "NCHW":
            X = utils.NHWC2NCHW(X)
            w = utils.NHWC2NCHW(w)

        inputs = [X, w, b] if use_bias else [X, w]

        # Error handling path: if the padded input is smaller than the kernel,
        # the op must raise.
        if size + pad_r + pad_l < kernel or size + pad_t + pad_b < kernel:
            with self.assertRaises(RuntimeError):
                self.assertDeviceChecks(dc, op, inputs, [0])
            return

        self.assertDeviceChecks(dc, op, inputs, [0])
        for i in range(len(inputs)):
            self.assertGradientChecks(gc, op, inputs, i, [0])

    # cuDNN does not support different padding values per side (pad_t / pad_l /
    # pad_b / pad_r), so the CUDNN engine is skipped in this test.
    @given(
        op_type=st.sampled_from(["Conv", "Conv2D"]),
        stride_h=st.integers(1, 3),
        stride_w=st.integers(1, 3),
        pad_t=st.integers(0, 3),
        pad_l=st.integers(0, 3),
        pad_b=st.integers(0, 3),
        pad_r=st.integers(0, 3),
        kernel=st.integers(1, 5),
        size=st.integers(7, 10),
        input_channels=st.integers(1, 8),
        output_channels=st.integers(1, 8),
        batch_size=st.integers(0, 3),
        engine=st.sampled_from(["", "EIGEN"]),
        use_bias=st.booleans(),
        **hu.gcs
    )
    @settings(deadline=None)
    def test_convolution_separate_stride_pad_layout(
        self,
        op_type,
        stride_h,
        stride_w,
        pad_t,
        pad_l,
        pad_b,
        pad_r,
        kernel,
        size,
        input_channels,
        output_channels,
        batch_size,
        engine,
        use_bias,
        gc,
        dc,
    ):
        X = (
            np.random.rand(batch_size, size, size, input_channels).astype(np.float32)
            - 0.5
        )
        w = (
            np.random.rand(output_channels, kernel, kernel, input_channels).astype(
                np.float32
            )
            - 0.5
        )
        b = np.random.rand(output_channels).astype(np.float32) - 0.5
        outputs = {}
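        # Run the same convolution in both layouts and compare the outputs
        # after converting the NHWC result back to NCHW.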
        for order in ["NCHW", "NHWC"]:
            op = core.CreateOperator(
                op_type,
                ["X", "w", "b"] if use_bias else ["X", "w"],
                ["Y"],
                stride_h=stride_h,
                stride_w=stride_w,
                kernel=kernel,
                pad_t=pad_t,
                pad_l=pad_l,
                pad_b=pad_b,
                pad_r=pad_r,
                order=order,
                engine=engine,
                device_option=gc,
            )
            if order == "NCHW":
                X_f = utils.NHWC2NCHW(X)
                w_f = utils.NHWC2NCHW(w)
            else:
                X_f = X
                w_f = w
            self.ws.create_blob("X").feed(X_f, device_option=gc)
            self.ws.create_blob("w").feed(w_f, device_option=gc)
            self.ws.create_blob("b").feed(b, device_option=gc)
            self.ws.run(op)
            outputs[order] = self.ws.blobs["Y"].fetch()
        np.testing.assert_allclose(
            outputs["NCHW"], utils.NHWC2NCHW(outputs["NHWC"]), atol=1e-4, rtol=1e-4
        )

    @given(
        op_type=st.sampled_from(["Conv", "Conv2D"]),
        stride=st.integers(1, 3),
        pad=st.integers(0, 3),
        kernel=st.integers(1, 5),
        dilation=st.integers(1, 3),
        size=st.integers(7, 10),
        input_channels=st.integers(1, 8),
        output_channels=st.integers(1, 8),
        batch_size=st.integers(0, 3),
        group=st.integers(1, 2),
        order=st.sampled_from(["NCHW", "NHWC"]),
        engine=st.sampled_from(["", "CUDNN", "MKLDNN"]),
        use_bias=st.booleans(),
        force_algo_fwd=_cudnn_convolution_algo_count("fwd"),
        force_algo_dgrad=_cudnn_convolution_algo_count("dgrad"),
        force_algo_wgrad=_cudnn_convolution_algo_count("wgrad"),
        **hu.gcs
    )
    @settings(max_examples=20, deadline=None)
    def test_convolution_gradients(
        self,
        op_type,
        stride,
        pad,
        kernel,
        dilation,
        size,
        input_channels,
        output_channels,
        batch_size,
        group,
        order,
        engine,
        use_bias,
        force_algo_fwd,
        force_algo_dgrad,
        force_algo_wgrad,
        gc,
        dc,
    ):
        # TODO: Group conv in NHWC not implemented for GPU yet.
        assume(
            group == 1
            or (
                (order == "NCHW" or gc.device_type == caffe2_pb2.CPU)
                and engine != "MKLDNN"
            )
        )
        if group != 1 and order == "NHWC":
            dc = [d for d in dc if d.device_type == caffe2_pb2.CPU]

        input_channels *= group
        output_channels *= group
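        # Effective kernel extent once dilation is applied.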
        dkernel = dilation * (kernel - 1) + 1

        if engine == "CUDNN":
            if hiputl.run_in_hip(gc, dc):
                assume((order == "NCHW") and not (dilation > 1 and group > 1))
            else:
                assume(
                    _cudnn_supports(
                        dilation=(dilation > 1), nhwc=(order == "NHWC"), backward=True
                    )
                )

        assume(engine != "MKLDNN" or use_bias is True)

        op = core.CreateOperator(
            op_type,
            ["X", "w", "b"] if use_bias else ["X", "w"],
            ["Y"],
            stride=stride,
            kernel=kernel,
            dilation=dilation,
            pad=pad,
            group=group,
            order=order,
            engine=engine,
            force_algo_fwd=force_algo_fwd,
            force_algo_dgrad=force_algo_dgrad,
            force_algo_wgrad=force_algo_wgrad,
        )
        X = (
            np.random.rand(batch_size, size, size, input_channels).astype(np.float32)
            - 0.5
        )
        w = (
            np.random.rand(
                output_channels, kernel, kernel, int(input_channels / group)
            ).astype(np.float32)
            - 0.5
        )
        b = np.random.rand(output_channels).astype(np.float32) - 0.5
        if order == "NCHW":
            X = utils.NHWC2NCHW(X)
            w = utils.NHWC2NCHW(w)

        inputs = [X, w, b] if use_bias else [X, w]
        # Error handling path: if the padded input is smaller than the dilated
        # kernel, the op must raise.
        if size + pad + pad < dkernel:
            with self.assertRaises(RuntimeError):
                self.assertDeviceChecks(dc, op, inputs, [0])
            return

        try:
            self.assertDeviceChecks(dc, op, inputs, [0])
        except RuntimeError as e:
            es = str(e)
            # Only tolerate cuDNN "not supported" failures when a specific
            # algorithm was forced; CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
            # (force_algo_fwd == 0) should always have an implementation.
            if (
                "status == CUDNN_STATUS_SUCCESS" not in es
                or "CUDNN_STATUS_NOT_SUPPORTED" not in es
                or force_algo_fwd == 0
            ):
                raise e

        for i in range(len(inputs)):
            try:
                self.assertGradientChecks(gc, op, inputs, i, [0])
            except RuntimeError as e:
                es = str(e)
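                # A forced algorithm may legitimately lack an implementation;
                # tolerate only cuDNN "not supported" failures here.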
                if (
                    "status == CUDNN_STATUS_SUCCESS" not in es
                    or "CUDNN_STATUS_NOT_SUPPORTED" not in es
                ):
                    raise e

    def _nd_convolution(
        self,
        n,
        input_channels_per_group,
        output_channels_per_group,
        batch_size,
        stride,
        size,
        kernel,
        dilation,
        pad,
        group,
        order,
        use_bias,
        engine,
        force_algo_fwd,
        force_algo_dgrad,
        force_algo_wgrad,
        gc,
        dc,
    ):
        # TODO: Group conv in NHWC not implemented for GPU yet.
        # TODO: Group 1D conv in NCHW not implemented for GPU yet.
        assume(
            group == 1
            or (n != 1 and order == "NCHW")
            or gc.device_type == caffe2_pb2.CPU
        )
        if group != 1 and (n == 1 or order == "NHWC"):
            dc = [d for d in dc if d.device_type == caffe2_pb2.CPU]

        input_channels = group * input_channels_per_group
        output_channels = group * output_channels_per_group

        dkernel = dilation * (kernel - 1) + 1
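        # Exercise both the generic "Conv" op and its dimension-specific alias.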
        for op_type in ["Conv", "Conv" + str(n) + "D"]:
            op = core.CreateOperator(
                op_type,
                ["X", "w", "b"] if use_bias else ["X", "w"],
                ["Y"],
                strides=[stride] * n,
                kernels=[kernel] * n,
                dilations=[dilation] * n,
                pads=[pad] * n * 2,
                group=group,
                order=order,
                engine=engine,
                force_algo_fwd=force_algo_fwd,
                force_algo_dgrad=force_algo_dgrad,
                force_algo_wgrad=force_algo_wgrad,
            )

            input_dims = [batch_size, input_channels]
            input_dims.extend([size] * n)
            filter_dims = [output_channels, input_channels // group]
            filter_dims.extend([kernel] * n)

            X = np.random.rand(*input_dims).astype(np.float32) - 0.5
            w = np.random.rand(*filter_dims).astype(np.float32) - 0.5
            b = np.random.rand(output_channels).astype(np.float32) - 0.5
            if order == "NHWC":
                X = utils.NCHW2NHWC(X)
                w = utils.NCHW2NHWC(w)

            inputs = [X, w, b] if use_bias else [X, w]

            # If the padded input is smaller than the dilated kernel, the op
            # must raise.
            if size + pad + pad < dkernel:
                with self.assertRaises(RuntimeError):
                    self.assertDeviceChecks(dc, op, inputs, [0])
                return

            self.assertDeviceChecks(dc, op, inputs, [0])
            for i in range(len(inputs)):
                self.assertGradientChecks(gc, op, inputs, i, [0])

    @given(
        input_channels=st.integers(1, 3),
        output_channels=st.integers(1, 2),
        batch_size=st.integers(0, 3),
        stride=st.integers(1, 3),
        size=st.integers(7, 10),
        kernel=st.integers(1, 2),
        dilation=st.integers(1, 3),
        pad=st.integers(0, 3),
        group=st.integers(1, 2),
        order=st.sampled_from(["NCHW", "NHWC"]),
        use_bias=st.booleans(),
        engine=st.sampled_from(["", "CUDNN"]),
        force_algo_fwd=_cudnn_convolution_algo_count("fwd"),
        force_algo_dgrad=_cudnn_convolution_algo_count("dgrad"),
        force_algo_wgrad=_cudnn_convolution_algo_count("wgrad"),
        **hu.gcs
    )
    @settings(deadline=10000)
    def test_1d_convolution(
        self,
        input_channels,
        output_channels,
        batch_size,
        stride,
        size,
        kernel,
        dilation,
        pad,
        group,
        order,
        use_bias,
        engine,
        force_algo_fwd,
        force_algo_dgrad,
        force_algo_wgrad,
        gc,
        dc,
    ):
        if hiputl.run_in_hip(gc, dc):
            # currently miopen only supports 2d conv
            assume(engine != "CUDNN")  # CUDNN is aliased to MIOPEN for HIP
        # TODO: 1D conv in NHWC not implemented for GPU yet.
        assume(order == "NCHW" or gc.device_type == caffe2_pb2.CPU)
        if order == "NHWC":
            dc = [d for d in dc if d.device_type == caffe2_pb2.CPU]

        self._nd_convolution(
            1,
            input_channels,
            output_channels,
            batch_size,
            stride,
            size,
            kernel,
            dilation,
            pad,
            group,
            order,
            use_bias,
            engine,
            force_algo_fwd,
            force_algo_dgrad,
            force_algo_wgrad,
            gc,
            dc,
        )

    @given(
        input_channels=st.integers(1, 2),
        output_channels=st.integers(1, 2),
        batch_size=st.integers(0, 2),
        stride=st.integers(1, 2),
        size=st.integers(4, 5),
        kernel=st.integers(1, 2),
        dilation=st.integers(1, 2),
        pad=st.integers(0, 2),
        group=st.integers(1, 2),
        order=st.sampled_from(["NCHW", "NHWC"]),
        use_bias=st.booleans(),
        engine=st.sampled_from(["", "MIOPEN"]),  # TODO: add "CUDNN"
        force_algo_fwd=_cudnn_convolution_algo_count("fwd"),
        force_algo_dgrad=_cudnn_convolution_algo_count("dgrad"),
        force_algo_wgrad=_cudnn_convolution_algo_count("wgrad"),
        **hu.gcs
    )
    @settings(max_examples=20, deadline=None)
    def test_3d_convolution(
        self,
        input_channels,
        output_channels,
        batch_size,
        stride,
        size,
        kernel,
        dilation,
        pad,
        group,
        order,
        use_bias,
        engine,
        force_algo_fwd,
        force_algo_dgrad,
        force_algo_wgrad,
        gc,
        dc,
    ):
        # TODO: 3D conv in NHWC not implemented for GPU yet.
        assume(order == "NCHW" or gc.device_type == caffe2_pb2.CPU)
        if order == "NHWC":
            dc = [d for d in dc if d.device_type == caffe2_pb2.CPU]
        self._nd_convolution(
            3,
            input_channels,
            output_channels,
            batch_size,
            stride,
            size,
            kernel,
            dilation,
            pad,
            group,
            order,
            use_bias,
            engine,
            force_algo_fwd,
            force_algo_dgrad,
            force_algo_wgrad,
            gc,
            dc,
        )

    @given(
        op_type=st.sampled_from(["Conv", "Conv3D"]),
        batch_size=st.integers(0, 2),
        stride=st.integers(1, 2),
        size=st.integers(3, 5),
        kernel=st.integers(1, 2),
        dilation=st.integers(1, 2),
        pad=st.integers(0, 2),
        use_bias=st.booleans(),
        force_algo_fwd=_cudnn_convolution_algo_count("fwd"),
        force_algo_dgrad=_cudnn_convolution_algo_count("dgrad"),
        force_algo_wgrad=_cudnn_convolution_algo_count("wgrad"),
        **hu.gcs_no_hip
    )  # MIOPEN doesn't support 3D conv yet
    @settings(deadline=10000)
    def test_3d_convolution_cudnn_nchw(
        self,
        op_type,
        batch_size,
        stride,
        size,
        kernel,
        dilation,
        pad,
        use_bias,
        force_algo_fwd,
        force_algo_dgrad,
        force_algo_wgrad,
        gc,
        dc,
    ):
        input_channels = 1
        output_channels = 1
        n = 3  # number of spatial dimensions
        dkernel = dilation * (kernel - 1) + 1
        order = "NCHW"

        op = core.CreateOperator(
            op_type,
            ["X", "w", "b"] if use_bias else ["X", "w"],
            ["Y"],
            strides=[stride] * n,
            kernels=[kernel] * n,
            dilations=[dilation] * n,
            pads=[pad] * n * 2,
            order=order,
            engine="CUDNN",
            force_algo_fwd=force_algo_fwd,
            force_algo_dgrad=force_algo_dgrad,
            force_algo_wgrad=force_algo_wgrad,
        )

        input_dims = [batch_size, input_channels]
        input_dims.extend([size] * n)
        filter_dims = [output_channels, input_channels]
        filter_dims.extend([kernel] * n)
        X = np.random.rand(*input_dims).astype(np.float32) - 0.5
        w = np.random.rand(*filter_dims).astype(np.float32) - 0.5
        b = np.random.rand(output_channels).astype(np.float32) - 0.5

        inputs = [X, w, b] if use_bias else [X, w]

        # If the padded input is smaller than the dilated kernel, the op must
        # raise.
        if size + pad + pad < dkernel:
            with self.assertRaises(RuntimeError):
                self.assertDeviceChecks(dc, op, inputs, [0])
            return

        try:
            self.assertDeviceChecks(dc, op, inputs, [0])
        except RuntimeError as e:
            es = str(e)
            # Only tolerate cuDNN "not supported" failures when a specific
            # algorithm was forced; CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
            # (force_algo_fwd == 0) should always have an implementation.
            if (
                "status == CUDNN_STATUS_SUCCESS" not in es
                or "CUDNN_STATUS_NOT_SUPPORTED" not in es
                or force_algo_fwd == 0
            ):
                raise e

        for i in range(len(inputs)):
            try:
                self.assertGradientChecks(gc, op, inputs, i, [0])
            except RuntimeError as e:
                es = str(e)
                if (
                    "status == CUDNN_STATUS_SUCCESS" not in es
                    or "CUDNN_STATUS_NOT_SUPPORTED" not in es
                ):
                    raise e

    @given(
        op_type=st.sampled_from(["Conv", "Conv2D"]),
        stride=st.integers(1, 3),
        pad=st.integers(0, 3),
        kernel=st.integers(1, 5),
        dilation=st.integers(1, 3),
        size=st.integers(7, 10),
        input_channels=st.integers(1, 8),
        output_channels=st.integers(1, 8),
        batch_size=st.integers(0, 3),
        use_bias=st.booleans(),
        **hu.gcs
    )
    @settings(deadline=None, max_examples=50)
    def test_convolution_layout(
        self,
        op_type,
        stride,
        pad,
        kernel,
        dilation,
        size,
        input_channels,
        output_channels,
        batch_size,
        use_bias,
        gc,
        dc,
    ):
        assume(size >= dilation * (kernel - 1) + 1)

        X = (
            np.random.rand(batch_size, size, size, input_channels).astype(np.float32)
            - 0.5
        )
        w = (
            np.random.rand(output_channels, kernel, kernel, input_channels).astype(
                np.float32
            )
            - 0.5
        )
        b = np.random.rand(output_channels).astype(np.float32) - 0.5
        Output = collections.namedtuple("Output", ["Y", "engine", "order"])
        outputs = []

        for order in ["NCHW", "NHWC"]:
            engine_list = [""]
            if hiputl.run_in_hip(gc, dc):
                if order == "NCHW":
                    engine_list.append("MIOPEN")
            else:
                if _cudnn_supports(dilation=(dilation > 1), nhwc=(order == "NHWC")):
                    engine_list.append("CUDNN")

            for engine in engine_list:
                op = core.CreateOperator(
                    op_type,
                    ["X", "w", "b"] if use_bias else ["X", "w"],
                    ["Y"],
                    stride=stride,
                    kernel=kernel,
                    dilation=dilation,
                    pad=pad,
                    order=order,
                    engine=engine,
                    device_option=gc,
                    exhaustive_search=True,
                )
                if order == "NCHW":
                    X_f = utils.NHWC2NCHW(X)
                    w_f = utils.NHWC2NCHW(w)
                else:
                    X_f = X
                    w_f = w
                self.assertDeviceChecks(
                    dc, op, [X_f, w_f, b] if use_bias else [X_f, w_f], [0]
                )
                self.ws.create_blob("X").feed(X_f, device_option=gc)
                self.ws.create_blob("w").feed(w_f, device_option=gc)
                self.ws.create_blob("b").feed(b, device_option=gc)
                self.ws.run(op)
                outputs.append(
                    Output(Y=self.ws.blobs["Y"].fetch(), engine=engine, order=order)
                )

        def canonical(o):
            if o.order == "NHWC":
                return utils.NHWC2NCHW(o.Y)
            else:
                return o.Y

        # Every engine/order combination should produce the same result within
        # fp32 tolerance.
        for o in outputs:
            np.testing.assert_allclose(
                canonical(outputs[0]), canonical(o), atol=1e-4, rtol=1e-4
            )

    @given(
        num_workers=st.integers(1, 4),
        net_type=st.sampled_from(
            ["simple", "dag"]
            + (
                ["async_dag"]
                if workspace.has_gpu_support
                else []
            )
        ),
        engine=st.sampled_from(["CUDNN", ""]),
        **hu.gcs_no_hip
    )
    @settings(deadline=None)
    def test_convolution_sync(self, net_type, num_workers, engine, gc, dc):
        m = ModelHelper(name="test_model")
        n = 1
        d = 2
        depth = 3
        iters = 5
        h = 5
        w = 5
        workspace.ResetWorkspace()

        use_cudnn = engine == "CUDNN"

        np.random.seed(1701)
        # Build a binary tree of conv layers, summing at each node.
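        # Blobs are named "{level}_{index}"; level 0 is the root and level
        # `depth` holds the input leaves fed in run() below.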
        for i in reversed(range(depth)):
            for j in range(2 ** i):
                bottom_1 = "{}_{}".format(i + 1, 2 * j)
                bottom_2 = "{}_{}".format(i + 1, 2 * j + 1)
                mid_1 = "{}_{}_m".format(i + 1, 2 * j)
                mid_2 = "{}_{}_m".format(i + 1, 2 * j + 1)
                top = "{}_{}".format(i, j)
                w1, b1, w2, b2 = np.random.randn(4).tolist()
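                # ConstantFill initializers (with seeded values) make the
                # parameters reproducible across test runs.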
                brew.conv(
                    m,
                    bottom_1,
                    mid_1,
                    dim_in=d,
                    dim_out=d,
                    kernel=3,
                    weight_init=("ConstantFill", {"value": w1}),
                    bias_init=("ConstantFill", {"value": b1}),
                    cudnn_state=np.random.randint(0, 3),
                    stride=1,
                    pad=1,
                    deterministic=1,
                    use_cudnn=use_cudnn,
                    engine=engine,
                )
                brew.conv(
                    m,
                    bottom_2,
                    mid_2,
                    dim_in=d,
                    dim_out=d,
                    kernel=3,
                    stride=1,
                    pad=1,
                    weight_init=("ConstantFill", {"value": w2}),
                    bias_init=("ConstantFill", {"value": b2}),
                    deterministic=1,
                    cudnn_state=np.random.randint(0, 3),
                    use_cudnn=use_cudnn,
                    engine=engine,
                )
                m.net.Sum([mid_1, mid_2], top)

        m.net.Flatten(["0_0"], ["0_0_flat"])
        m.net.SquaredL2Distance(["0_0_flat", "label"], "xent")
        m.net.AveragedLoss("xent", "loss")
        input_to_grad = m.AddGradientOperators(["loss"])
        m.Proto().device_option.CopyFrom(gc)
        m.param_init_net.Proto().device_option.CopyFrom(gc)
        m.Proto().type = net_type
        m.Proto().num_workers = num_workers
        self.ws.run(m.param_init_net)

        def run():
            np.random.seed(1701)
            input_blobs = ["{}_{}".format(depth, j) for j in range(2 ** depth)]
            for input_blob in input_blobs:
                self.ws.create_blob(input_blob).feed(
                    np.random.randn(n, d, h, w).astype(np.float32), device_option=gc
                )
                self.ws.create_blob("label").feed(
                    np.random.randn(n, d * h * w).astype(np.float32), device_option=gc
                )
            self.ws.run(m.net)
            gradients = [
                self.ws.blobs[str(input_to_grad[input_blob])].fetch()
                for input_blob in input_blobs
            ]
            return gradients

        # Run the net several times with identically seeded inputs; the
        # deterministic conv settings should make the gradients match exactly.
        outputs = [run() for _ in range(iters)]
        for output in outputs[1:]:
            np.testing.assert_array_equal(outputs[0], output)
            np.testing.assert_allclose(
                np.sum(np.square(output)), 1763719461732352.0, rtol=1e-5
            )

    def test_use_cudnn_engine_interactions(self):
        """Make sure the use_cudnn and engine kwargs work as expected."""
        for model_default in [None, True, False]:
            arg_scope = {}
            if model_default is not None:
                arg_scope["use_cudnn"] = model_default
            else:
                model_default = True  # the default

            model = ModelHelper(arg_scope=arg_scope)
            self.assertEqual(model.arg_scope["use_cudnn"], model_default)
            f = functools.partial(brew.conv, model, "conv_in", "conv_out", 10, 10, 5)

            for op_cudnn in [None, True, False]:
                for op_engine in [None, "", "CUDNN"]:
                    kwargs = {}
                    if op_cudnn is not None:
                        kwargs["use_cudnn"] = op_cudnn
                    else:
                        op_cudnn = False  # the default
                    if op_engine is not None:
                        kwargs["engine"] = op_engine

                    calculated_cudnn = kwargs.get("use_cudnn", model_default)
                    expected_engine = kwargs.get(
                        "engine", "CUDNN" if calculated_cudnn else ""
                    )

                    # Conflicting use_cudnn / engine settings must raise.
                    if (calculated_cudnn is False and op_engine == "CUDNN") or (
                        calculated_cudnn is True and op_engine == ""
                    ):
                        with self.assertRaises(ValueError):
                            f(**kwargs)
                    else:
                        f(**kwargs)
                        self.assertEqual(model.Proto().op[-1].engine, expected_engine)

    @given(
        op_type=st.sampled_from(["Conv", "Conv2D"]),
        N=st.integers(0, 3),
        G=st.integers(1, 3),
        DX=st.integers(1, 3),
        DY=st.integers(1, 3),
        H=st.integers(1, 3),
        W=st.integers(1, 3),
        use_bias=st.booleans(),
        order=st.sampled_from(["NCHW", "NHWC"]),
        force_algo_fwd=_cudnn_convolution_algo_count("fwd"),
        force_algo_dgrad=_cudnn_convolution_algo_count("dgrad"),
        force_algo_wgrad=_cudnn_convolution_algo_count("wgrad"),
        **hu.gcs
    )
    @settings(deadline=10000)
    def test_1x1_conv(
        self,
        op_type,
        N,
        G,
        DX,
        DY,
        H,
        W,
        use_bias,
        order,
        force_algo_fwd,
        force_algo_dgrad,
        force_algo_wgrad,
        gc,
        dc,
    ):
        if hiputl.run_in_hip(gc, dc):
            assume(order == "NCHW")
        if order == "NHWC":
            G = 1

        C = G * DX  # total input channels
        M = G * DY  # total output channels

        op = core.CreateOperator(
            op_type,
            ["X", "filter", "bias"] if use_bias else ["X", "filter"],
            ["Y"],
            stride_h=1,
            stride_w=1,
            pad_t=0,
            pad_l=0,
            pad_b=0,
            pad_r=0,
            kernel=1,
            order=order,
            group=G,
            force_algo_fwd=force_algo_fwd,
            force_algo_dgrad=force_algo_dgrad,
            force_algo_wgrad=force_algo_wgrad,
        )

        if order == "NCHW":
            X = np.random.randn(N, C, H, W).astype(np.float32)
            filter = np.random.randn(M, DX, 1, 1).astype(np.float32)
        else:
            X = np.random.randn(N, H, W, C).astype(np.float32)
            filter = np.random.randn(M, 1, 1, DX).astype(np.float32)
        bias = np.random.randn(M).astype(np.float32)
        inputs = [X, filter, bias] if use_bias else [X, filter]

        # Reference: a 1x1 grouped conv is a per-group matrix multiply between
        # the (DY, DX) filter block and the flattened spatial positions.
        def conv_1x1_nchw_ref(X, filter, bias=None):
            if N == 0:
                Y = np.zeros(shape=(N, M, H, W), dtype=np.float32)
                return [Y]

            X = X.reshape(N, G, DX, -1)
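            # Shape (N, G, DX, H*W): split channels into groups and flatten the
            # spatial dimensions so each group is a plain matrix multiply.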
            filter = filter.reshape(G, DY, DX)
            Y = np.zeros(shape=(N, G, DY, H * W), dtype=np.float32)
            for i in range(N):
                for j in range(G):
                    Y[i, j, :, :] = np.dot(filter[j, :, :], X[i, j, :, :])
            Y = Y.reshape(N, M, H, W)
            if bias is not None:
                bias = bias.reshape(1, M, 1, 1)
                Y = np.add(Y, bias)
            return [Y]

        def conv_1x1_nhwc_ref(X, filter, bias=None):
            if N == 0:
                Y = np.zeros(shape=(N, H, W, M), dtype=np.float32)
                return [Y]

            X = X.reshape(N, -1, G, DX)
            filter = filter.reshape(G, DY, DX)
            Y = np.zeros(shape=(N, H * W, G, DY), dtype=np.float32)
            for i in range(N):
                for j in range(G):
                    Y[i, :, j, :] = np.dot(X[i, :, j, :], filter[j, :, :].transpose())
            Y = Y.reshape(N, H, W, M)
            if bias is not None:
                bias = bias.reshape(1, 1, 1, M)
                Y = np.add(Y, bias)
            return [Y]

        if order == "NCHW":
            conv_1x1_ref = conv_1x1_nchw_ref
        else:
            conv_1x1_ref = conv_1x1_nhwc_ref
        self.assertReferenceChecks(
            device_option=gc, op=op, inputs=inputs, reference=conv_1x1_ref
        )
        self.assertDeviceChecks(dc, op, inputs, [0])
        for i in range(len(inputs)):
            self.assertGradientChecks(gc, op, inputs, i, [0])


if __name__ == "__main__":
    unittest.main()
