#! /usr/bin/env python3
import os
import pathlib
import struct
import tempfile
import pickle
from collections import OrderedDict, defaultdict

import numpy as np

# 4 threads is best in tests; reduces runtime to ~1/3.
os.environ["OMP_NUM_THREADS"] = "4"

from sys_flow_v2.kneron_round import kneron_round_array
import sys_flow_v2.flow_constants as fconsts
import sys_flow_v2.flow_utils as futils
import sys_flow_v2.compiler_v2 as compiler

DEBUG = True if os.environ.get("REGRESSION_DEBUG", False) else False
import snoop
snoop.install(enabled=DEBUG)


###################################################################################
# setup binary
###################################################################################
p_dynasty_so = pathlib.Path(fconsts.BIN_SET["dynasty"]["lib.so"])
ENV_DYNASTY_LIB = f"""export LD_LIBRARY_PATH="{p_dynasty_so.parent}:$LD_LIBRARY_PATH" """
BIN_DC = fconsts.BIN_SET["data_converter"]["v2"]


###################################################################################
# convert input to bin file
###################################################################################
def array_fl2fx(np_arr, bitwidth, radix, scales=None, i_dim_is_ch=1, round_process="array"):
    """Convert (1 input) numpy float array to integer.

    .. parameters::
        - round_process (from 0.27.0):
            - "array": send in flattened array, loop in cython with kneron_round.
              0.54s for 12M points.
            - "no_btm": use np to do the int32 convert. DON'T use this: not
              bit-true-match, for speed checks only. 0.082s for 12M points
              (0.05s if no convert).

    .. seealso::
        - `dynasty clamp `_
        - `dynasty quantization `_
        - `kneron round `_
        - 15bit input: https://redmine.kneron.tw/issues/18689
    """
    # TODO: apply per-channel-scale to this_np with scales + i_dim_is_ch
    # size should match
    # get per-channel radix
    n_dim = np_arr.shape[i_dim_is_ch]
    radix = futils.expand_array(radix, n_dim).astype('float32')
    if scales is None:
        scales = 1.0
    scales = futils.expand_array(scales, n_dim)
    # radix / scales must be lists of n_dim size

    # NOTE: use 16B16R for bitwidth == 15 to comply with firmware
    #   https://redmine.kneron.tw/issues/18689
    if bitwidth == 15:
        bitwidth = 15 + 1
        radix = radix + 1

    # clamp boundaries
    v_max = (1 << (bitwidth - 1)) - 1
    v_min = -(1 << (bitwidth - 1))

    # expand per-channel values to a matrix
    scale_expand_dim = list(range(len(np_arr.shape)))
    scale_expand_dim.remove(i_dim_is_ch)
    scales *= 2 ** radix  # in-place
    scl_matx = np.expand_dims(scales, scale_expand_dim)
    np_arr *= scl_matx  # scale in-place
    np.clip(np_arr, v_min, v_max, out=np_arr)  # clip in-place
    if round_process == "array":
        # this should be fast
        return kneron_round_array(np_arr.ravel(), bitwidth)
    elif round_process == "no_btm":
        # this is fastest! NOT calling kneron_round
        return np_arr.flatten().astype(np.int32)
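

# A minimal illustrative sketch (not used by the flow): the fixed-point mapping that
# array_fl2fx applies element-wise, written out for a single value with hypothetical
# numbers.  A float x is scaled by scale * 2**radix, clamped to the signed range of
# the bitwidth, then rounded (the real flow uses kneron_round_array for bit-true-match).
def _example_fl2fx_scalar(x=1.23, bitwidth=8, radix=5, scale=1.0):
    v_max = (1 << (bitwidth - 1)) - 1   # 127 for 8 bit
    v_min = -(1 << (bitwidth - 1))      # -128 for 8 bit
    scaled = x * scale * (2 ** radix)   # 1.23 * 1.0 * 32 = 39.36
    clamped = min(max(scaled, v_min), v_max)
    return int(round(clamped))          # 39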


def bw2fmt(bitwidth):
    """Get struct format character for this bitwidth."""
    # NOTE: use struct: https://docs.python.org/3/library/struct.html
    if bitwidth <= 8:
        return "b"  # 8bit
    elif bitwidth <= 16:
        return "h"  # 16bit
    elif bitwidth <= 32:
        return "i"  # 32bit
    else:
        return "q"


def fx2bin(arr_fx, fn_bin, bitwidth):
    """Write fx data to binary sequentially."""
    # TODELETE
    # # NOTE: use struct: https://docs.python.org/3/library/struct.html
    # fmt = bw2fmt(bitwidth)
    # with open(fn_bin, "wb") as f:
    #     f.write(struct.pack(f"<{len(arr_fx)}{fmt}", *arr_fx))
    dtype_map = {8: np.int8, 16: np.int16, 32: np.int32}
    try:
        dtype = dtype_map[bitwidth]
    except KeyError:
        raise ValueError(f"Unsupported bitwidth: {bitwidth}")
    # must be C order so memory is contiguous
    arr_fx = np.asarray(arr_fx, dtype=dtype, order="C")
    arr_fx.tofile(fn_bin)


def idx2onnx_order(info_in, n_sqt_bin):
    """Guess onnx input name order."""
    # NOTE: each pair of sqt_bin has the same length as the onnx input nodes, say 2: A, B.
    #   the input number in ioinfo.json may be larger, say 5: A1, A2, B1, B2, B3.
    #   the compiler guarantees that A-related nodes go first, then B-related nodes.
    compiler_give_names = [a["name"] for a in info_in]
    if len(compiler_give_names) == n_sqt_bin:
        # most cases, just 1:1
        return ({name: i for i, name in enumerate(compiler_give_names)},
                {i: name for i, name in enumerate(compiler_give_names)})

    # now each sequential.bin may convert to multiple dram.bin
    # TODO: verify compiler_give_names is contiguous (same name stays together)
    name_order = {}
    o_order2dp = {}
    for name in compiler_give_names:
        if name not in name_order:
            n = len(name_order)
            name_order[name] = n
            o_order2dp[n] = name
    assert len(name_order) == n_sqt_bin
    return name_order, o_order2dp


def txt2bin_seq(in_pairs, ioinfo_in, p_out):
    """Convert fl text to sequential bin files.

    For compatibility with regression.
    """
    # load float txt into np
    np_in = defaultdict(list)
    # NOTE:
    #   in_pair(s) is specified by onnx, it may be [a.txt, b.txt, c.txt]
    #   the compiler may specify dp_in to be [A, A, B, B, C];
    #   the same node may appear multiple times
    # TODO: we assume the same input node always has the same quantization
    _, o_order2dp = idx2onnx_order(ioinfo_in, len(in_pairs[0]))
    for i_pair, in_pair in enumerate(in_pairs):
        # independent between pairs
        for i_txt, p_txt in enumerate(in_pair):
            dp_name = o_order2dp[i_txt]
            # use the first shape info given by compiler
            shape = [a["onnx_shape"] for a in ioinfo_in if a["name"] == dp_name][0]
            # load txt into numpy array
            np_in[dp_name].append(futils.txt2np_fl(p_txt, shape))
    bin_pair = np2bin_seq(np_in, ioinfo_in, p_out)
    # there should be only 1 element
    return bin_pair
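

# A minimal illustrative sketch (not used by the flow): how idx2onnx_order collapses
# duplicated compiler dp names into an order map.  The node names here are hypothetical.
def _example_idx2onnx_order():
    info_in = [{"name": n} for n in ["A", "A", "B", "B", "C"]]
    name_order, o_order2dp = idx2onnx_order(info_in, n_sqt_bin=3)
    # name_order == {"A": 0, "B": 1, "C": 2}
    # o_order2dp == {0: "A", 1: "B", 2: "C"}
    return name_order, o_order2dp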


def txt2bin_rgba(in_pairs: list, info_in: list, p_out):
    """Convert fl text to rgba bin files for 520.

    The info_in is from knerex json.

    Args:
        in_pairs (list): list of list, inner list is a pair of inputs.
        info_in (list): quantization info per input node.
        p_out (pathlib.Path or str): where to save converted .bin files.

    Special process for 520 csim input:

    1. minimal 4 channels.
    2. columns (w) should be a multiple of 16.
    3. channel last.
    """
    # quick check on inputs.
    l1 = len(in_pairs[0])
    l2 = len(info_in)
    assert l1 == l2, f"in_pairs len: {l1}, info_in len: {l2}"

    # Load float txt into np
    bin_pair = [[] for a in in_pairs[0]]
    for i_pair, in_pair in enumerate(in_pairs):
        for i_dp, p_txt in enumerate(in_pair):
            # NOTE: 520 only supports single input models (except stc.)
            #   so i_dp is always 0
            shape = info_in[i_dp]["onnx_shape"]
            bw = info_in[i_dp]["bitw"]
            radix = info_in[i_dp]["radix"]
            scale = info_in[i_dp]["scale"]

            # load input
            if isinstance(p_txt, (str, pathlib.Path)):
                fl = futils.txt2np_fl(p_txt, shape)
            elif isinstance(p_txt, np.ndarray):
                fl = p_txt
            else:
                raise NotImplementedError()

            # pad to >= 4 dimensions
            n_dim = len(fl.shape)
            if n_dim < 4:
                fl = np.expand_dims(fl, list(range(4 - n_dim)))
                n_dim = len(fl.shape)
            # assert n_dim == 4  # NOTE: will fail when n_dim > 4

            # channel last
            fl = np.moveaxis(fl, 1, 3)

            # pad channel to 4, columns to 16x
            d_padding = [[0, 0] for i in range(n_dim)]
            do_pad = False
            if fl.shape[-1] < 4:
                # pad channel to 4
                n_pad = 4 - fl.shape[-1]
                d_padding[-1][-1] = n_pad
                arr_pad = np.zeros(n_pad)  # zero is fine.
                radix = np.append(radix, arr_pad)
                scale = np.append(scale, arr_pad)
                do_pad = True
            if fl.shape[-2] % 16 != 0:
                # pad columns
                d_padding[-2][-1] = 16 - fl.shape[-2] % 16
                do_pad = True
            if do_pad:
                fl = np.pad(fl, d_padding, mode="constant", constant_values=0)

            # quantization (for the first and only input)
            np_fx = array_fl2fx(fl, bw, radix, scale, n_dim - 1)

            # save to bin
            p_bin = p_out / f"csim_p{i_pair:06}_i{i_pair:03}_rgba.bin"
            fx2bin(np_fx, p_bin, bw)
            bin_pair[i_dp].append(p_bin)

    # bin_pair is [[in1_pair1, in1_pair2], [in2_pair1, in2_pair2]]
    return bin_pair


def np2bin_seq(np_in, ioinfo_in, p_out=None, round_process="array"):
    """Convert numpy arrays to sequential bin files."""
    # TODO: internal process assumes np_in as list of list
    # load np_in if not yet loaded
    if isinstance(np_in, (str, pathlib.Path)):
        with open(np_in, "rb") as f:
            np_in = pickle.load(f)

    # NOTE: ioinfo_in may have duplicated dp names: [A, A, B, B, C]
    #   we assume the same DP has the SAME quantization.
    #   use ioinfo_uni for now
    def keep_unique(l):
        uni = set()
        results = []
        for a in l:
            if a["name"] not in uni:
                uni.add(a["name"])
                results.append(a)
        return results

    ioinfo_uni = keep_unique(ioinfo_in)

    # unpack data
    dp_in = [a["name"] for a in ioinfo_uni]
    bitwidth = [a["bitw"] for a in ioinfo_uni]
    radix = [a["radix"] for a in ioinfo_uni]
    scales = [a["scale"] for a in ioinfo_uni]
    i_dim_is_ch = [a["ch_dim"] for a in ioinfo_uni]
    # shape not used. maybe to check with np_in array shape?

    # sanity check
    assert set(np_in.keys()) == set(
        dp_in
    ), f"dp_in \"{dp_in}\" should be same as np_in \"{list(np_in.keys())}\""
    temp = [len(v) for k, v in np_in.items()]
    assert len(set(temp)) == 1, f"np lists should have same length, but got {temp}"
    N_pairs = temp[0]

    if p_out is None:
        p_out = tempfile.mkdtemp(prefix="csim_")
    p_out = pathlib.Path(p_out)
    p_out.mkdir(exist_ok=True, parents=True)

    # TODO: save the radix / scale quantization info
    # np_in_pairs = [[np_in[k1][n] for k1 in dp_in] for n in range(N_pairs)]
    bin_in_pairs = OrderedDict()
    for i_pair in range(N_pairs):
        pair = []
        for i_dp, dp_name in enumerate(dp_in):
            p_bin = p_out / f"seq_p{i_pair:06}_node{i_dp:03}.bin"
            pair.append(p_bin)
            this_np = np_in[dp_name][i_pair]
            # channel dimension is not in onnx shape, need to expand
            if i_dim_is_ch[i_dp] == -1:
                this_np = np.expand_dims(this_np, 1)
                # i_dim_is_ch[i_dp] = 0
            # the real work is done here
            np_fx = array_fl2fx(this_np, bitwidth[i_dp], radix[i_dp], scales[i_dp],
                                i_dim_is_ch[i_dp], round_process=round_process)
            fx2bin(np_fx, p_bin, bitwidth[i_dp])
        # NOTE: the pair_name includes the input pair order.
        #   will use this to return inference results
        bin_in_pairs[i_pair] = pair
    return bin_in_pairs
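

# A minimal illustrative sketch (not used by the flow): the inputs np2bin_seq expects,
# with hypothetical node names and quantization values.  np_in maps each input node
# name to a list of float arrays (one per input pair); each ioinfo_in entry carries
# the keys unpacked above ("name", "bitw", "radix", "scale", "ch_dim").
def _example_np2bin_seq(p_out="/tmp/example_seq"):
    np_in = {"input_0": [np.random.rand(1, 3, 32, 32).astype("float32")]}
    ioinfo_in = [{"name": "input_0", "bitw": 8, "radix": 5, "scale": 1.0, "ch_dim": 1}]
    # returns an OrderedDict like {0: [<p_out>/seq_p000000_node000.bin]}
    return np2bin_seq(np_in, ioinfo_in, p_out)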


def data_convert_1(bin_seq, bin_rgba, fmt_in: str, fmt_out: str,
                   stride_in: list, stride_out: list, shape: list, ndim: int):
    """Convert one sequential.bin to rgba.bin."""
    # TODO: what if fmt_out is RAW_FLOAT / RAW_8 / RAW_16?
    #   seems dynasty dumped it previously
    s_shape = ",".join(str(a) for a in shape)
    s_stride_in = ",".join(str(a) for a in stride_in)
    s_stride_out = ",".join(str(a) for a in stride_out)
    # always use --nhwc 1 for ng1 730
    # --is_ng1 is same as --nhwc
    cmd = f"""{BIN_DC} --fin {bin_seq} --fout {bin_rgba} --dim {ndim} --shape {s_shape} --fmt_in {fmt_in} --fmt_out {fmt_out} --stride_in {s_stride_in} --stride_out {s_stride_out} --is_ng1 1"""
    return cmd


def data_convert(sqt_bin_list: dict, ioinfo_in: list, p_out=None, n_thread=4):
    """Convert list of sequential.bin to csim_input.bin, based on compiler info.

    * NOT for 520
    """
    # prepare the output folder
    if p_out is None:
        p_out = tempfile.mkdtemp(prefix="dc_")
    p_out = pathlib.Path(p_out)
    p_out.mkdir(exist_ok=True, parents=True)

    def bw2raw_fmt(bw):
        # the in.bin in regression is always raw format.
        if bw == 8:
            return "FMT_RAW8B"
        elif bw in [15, 16]:
            # https://redmine.kneron.tw/issues/18706
            return "FMT_RAW_NPU16B"
        else:
            raise NotImplementedError(f"unsupported bitwidth: {bw}.")

    def convert_inproc_fmt(fmt1):
        """For inproc formats, convert before sending to the data converter.

        Reference:
        - https://redmine.kneron.tw/issues/23754
        - compiler_v2/get_support_formats() function.
        """
        if fmt1 == "HW1C8B":
            return "RAW8B"
        elif fmt1 == "HW4C8B_DROP_A":
            return "4C8B_DROP_A"
        elif fmt1 == "HW4C8B_KEEP_A":
            return "4C8B_DROP_A"
        elif fmt1 == "HW1C16B_LE":
            return "RAW16B"
        elif fmt1 == "HW1C16B_BE":
            return "RAW16B_BE"
        else:
            return fmt1

    # ordered dictionary
    n_sqt_bin = len(sqt_bin_list[0])
    name2o_ord, _ = idx2onnx_order(ioinfo_in, n_sqt_bin)

    cmds = []  # save the commands for debug
    list_bin_csim = defaultdict(list)
    for i_in, i_info in enumerate(ioinfo_in):
        shape = i_info["shape"]
        ndim = len(shape)
        # stride in / out are the same, per Tommy.
        # stride_in = ioinfo_in[i_in]["stride"]
        # stride_out = ioinfo_in[i_in]["stride"]
        # TEMP: use -1 for now.
        stride_in = [-1]
        # stride_aligned in .kne.no_binary.json but will convert to stride
        stride_out = i_info["stride"]
        fmt_in = bw2raw_fmt(i_info["bitw"])
        fmt_out = "FMT_{}".format(convert_inproc_fmt(i_info["data_format"]))
        sqt_idx = name2o_ord[i_info["name"]]
        for i_pair, bin_pair in sqt_bin_list.items():
            p_sqtl_bin = bin_pair[sqt_idx]
            p_csim_bin = p_out / f"csim_p{i_pair:06}_i{i_in:03}.bin"
            list_bin_csim[i_pair].append(p_csim_bin)
            cmd1 = data_convert_1(p_sqtl_bin, p_csim_bin, fmt_in, fmt_out,
                                  stride_in, stride_out, shape, ndim)
            cmds.append(cmd1)

    # save cmds then run
    fn_cmd = p_out / "data_convert.sh"
    with open(fn_cmd, "w") as f:
        f.writelines([f"{cmd}\n" for cmd in cmds])
    if n_thread is None:
        n_str = ""
    else:
        n_str = f"--jobs {n_thread}"
    command = f"parallel {n_str} --halt now,fail=1 < {fn_cmd}"
    cp = futils.run_bash_script(command)
    if cp.returncode != 0:
        raise RuntimeError(f"data converter failed with {cp.returncode}")
    return list_bin_csim, cmds
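

# A minimal illustrative sketch (not used by the flow): the kind of single-line command
# data_convert_1 builds for one input.  The file names, strides, and shape are hypothetical.
def _example_data_convert_cmd():
    cmd = data_convert_1(
        bin_seq="seq_p000000_node000.bin",
        bin_rgba="csim_p000000_i000.bin",
        fmt_in="FMT_RAW8B",
        fmt_out="FMT_4C8B_DROP_A",
        stride_in=[-1],
        stride_out=[1, 4, 256, 16384],
        shape=[1, 3, 64, 64],
        ndim=4,
    )
    # cmd starts with the configured data converter binary (BIN_DC) followed by:
    #   --fin seq_p000000_node000.bin --fout csim_p000000_i000.bin --dim 4
    #   --shape 1,3,64,64 --fmt_in FMT_RAW8B --fmt_out FMT_4C8B_DROP_A
    #   --stride_in -1 --stride_out 1,4,256,16384 --is_ng1 1
    return cmd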
""" csim_settings = {} # convert to relative path for fn_key, fn_full in cmpl_map.items(): csim_settings[fn_key] = p_cmpl_rel / fn_full.name # fill none for unused if hw_mode in fconsts.MODE_HW_LIMIT["nef_v2"]: csim_settings["command_bin"] = "None" csim_settings["weight_bin"] = "None" csim_settings["setup_bin"] = "None" csim_settings["apb_npu"] = "None" # in regression / unpack_nef, the kne will have have only 1 model. (it is possible to have multiple) # csim_settings["model_index_in_kne"] = 0 else: csim_settings["kne"] = "None" # TODO: make this flow_constants # input_location: 0 for nmem, 1 for dram csim_settings["input_location"] = 1 if hw_mode in fconsts.MODE_HW_LIMIT["input_in_dram"] else 0 csim_settings["input_bin"] = ",".join(str(a) for a in bin_pair) csim_settings["dump_core_opt"] = dump_core_opt # NOTE: if golden_text available, csim own regression may use it for quick check if golden_txt is None: csim_settings["golden_txt"] = "NONE" else: # convert relative path csim_settings["golden_txt"] = ",".join(str(a) for a in golden_txt) # prepare dynasty golden # golden_txt_fns = self.io_nodes[("btm_dynasty_golden_txt_fn", hw_mode)] # p_dynasty_golden = [p_dynasty_dump / fn for fn in golden_txt_fns] # # csim use relative path to output folder # rel_fns = ["{}/{}".format(p_dynasty_dump_relative, fn) for fn in golden_txt_fns] # csim_settings["golden_txt"] = ",".join(rel_fns) return csim_settings def gen_csim_ini(bin_pair, p_compiler, hw_mode, template, fn_ini, dump_core_opt=0, golden_txts=None): """Generate a ini for csim calling per input pair.""" if hw_mode == 520: # 520 csim doesn't use .ini it's just for regression flow usage return # check folder p_in = bin_pair[0].parent # relative to the ini file. p_ini = fn_ini.parent p_in_rel = futils.relative_path(p_in, p_ini) bin_pair_rel = [p_in_rel / a.name for a in bin_pair] p_comp_rel = futils.relative_path(p_compiler, p_ini) compiler_map = compiler.locate_compiler_dump(p_compiler, hw_mode) if golden_txts is None: p_golden_txt = None else: p_golden = golden_txts[0].parent p_golden_rel = futils.relative_path(p_golden, p_ini) p_golden_txt = [p_golden_rel / a.name for a in golden_txts] csim_settings = gen_csim_settings(bin_pair_rel, compiler_map, p_comp_rel, hw_mode, dump_core_opt=dump_core_opt, golden_txt=p_golden_txt) # render template and save to run_csim.ini output = template.render(model=csim_settings) with open(fn_ini, "w") as f: f.write(output) assert pathlib.Path(fn_ini).exists(), f"failed to create {fn_ini}" return fn_ini def run_csim(list_csim: dict, bin_csim, sh_run_csim=None, n_thread=None, dry_run=False, timeout=3600*6): """Run csim inference on given input. NOTE: we need bin_csim to pass in as it is platform-dependant so we cannot grab from fconsts.BIN_SET directly without platform info. """ cmds = [] for i_csim, (p_out, ini_csim) in list_csim.items(): # prepare folders p_out.mkdir(mode=0o770, parents=True, exist_ok=True) cmd1 = f"{ENV_DYNASTY_LIB}; pushd {p_out} && {bin_csim} {ini_csim}" cmds.append(cmd1) if sh_run_csim is None: r = np.random.randint(10000) sh_run_csim = f"/tmp/run_csim_{r:05}.sh" with open(sh_run_csim, "w") as f: f.write("\n".join(cmds)) if n_thread is None: n_str = "" else: n_str = f"--jobs {n_thread}" command = f"parallel {n_str} --halt now,fail=1 < {sh_run_csim}" if dry_run: cp = None # placeholder else: cp = futils.run_bash_script(command, timeout=timeout) return command, cp def txt2np(out_node_list, out_nodes_shape, output_list, is_520=False): """Convert the csim dumped results to np. 


def txt2np(out_node_list, out_nodes_shape, output_list, is_520=False):
    """Convert the csim dumped results to np.

    - csim will only dump fx data; extra info is needed to convert back to float.
    - csim dumps files as `dma2seq_0.seq`

    Args:
        out_node_list (list): list of output node names. As csim dumps multiple
            outputs in the same order, this list will be used as keys for the output dict.
        out_nodes_shape (dict): shapes for all output nodes, keyed by node name.
            The integer text will be reshaped to the given shape in numpy.
        output_list: where to find csim dump text files.
            basically `list(p_dump.glob(f"*.txt/csim_{hw_mode}"))`.
        is_520 (bool): different name pattern for 520 csim dump.

    Returns:
        dict of list of numpy arrays. Each array is an integer datatype with the onnx shape.
    """
    collect_txt_fx = defaultdict(list)
    for i_dp, dp_out in enumerate(out_node_list):
        shape_out = out_nodes_shape[dp_out]
        if is_520:
            fx_output = f"node_{i_dp:04d}_final_output.txt"
        else:
            # NOTE: .seq is only 8/15bit, no 16bit
            fx_output = f"dma2seq_{i_dp}.seq"
        for p_dump in output_list:
            p_fx = pathlib.Path(p_dump) / fx_output
            if not p_fx.exists():
                raise FileNotFoundError(f"missing {p_fx}")
            collect_txt_fx[dp_out].append(futils.txt2np_fx(p_fx, shape_out))
    return collect_txt_fx
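

# A minimal illustrative sketch (not used by the flow): reading csim dumps back with
# txt2np, using hypothetical node names, shapes, and dump folders.  Each dump folder
# is expected to contain dma2seq_<i>.seq (or node_<i>_final_output.txt for 520).
def _example_txt2np():
    out_node_list = ["output_0"]
    out_nodes_shape = {"output_0": [1, 10]}
    output_list = [pathlib.Path("/tmp/csim_out/pair_000")]
    return txt2np(out_node_list, out_nodes_shape, output_list, is_520=False)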