#! /usr/bin/env python3


import pathlib
from collections import OrderedDict
import os
import tempfile
import shutil

import pytz

from jinja2 import Environment, FileSystemLoader

import sys_flow_v2.flow_utils as futils
import sys_flow_v2.flow_constants as fconsts
import sys_flow_v2.dynasty_v3 as dynasty
import sys_flow_v2.csim_utils as csim

DEBUG = bool(os.environ.get("REGRESSION_DEBUG", False))
import snoop
snoop.install(enabled=DEBUG)

# find the binaries first
dynasty_bin = fconsts.BIN_SET["dynasty"]["binary"]
dynasty_so = fconsts.BIN_SET["dynasty"]["lib.so"]
dynasty_cuda_so = fconsts.BIN_SET["dynasty"]["lib_cuda.so"]
bin_dc = fconsts.BIN_SET["data_converter"]["v2"]

timezone = pytz.timezone("America/Los_Angeles")

P_TMP_MODEL = pathlib.Path("/tmp/working")


def inference_onnx_runtime(
        fn_onnx,
        input_np,
        device="gpu",
        p_working="/tmp",
        shape_in="onnx_shape",
        dump_level=0
):
    raise NotImplementedError()


def get_model_io(p_onnx, hw_mode=None):
    """Interface to get ioinfo from onnx/bie.

    dynasty / csim flatten data to one dimension and then dump it to text.
    The input node name list and the output node (onnx) shapes
    are needed when converting the flattened data back to onnx shape.

    Args:
        p_onnx
            onnx or bie file to examine.
        hw_mode: int
            520/720/...
            Required if the given p_onnx is a bie file.

    Returns:
        - list of `input_nodes`
        - list of `output_nodes`
        - shape of output nodes
        - dict of ioinfo json files per platform.

    TODO:
        move to flow_utils.

    NOTE:
        ioinfo includes radix info, which is needed by e2e.
    """
    # make sure it is pathlib
    p_onnx = pathlib.Path(p_onnx)
    # get input_nodes names
    if p_onnx.name.endswith(".onnx"):
        input_nodes, output_nodes, out_node_shape, _ = futils.get_ioinfo_from_onnx(p_onnx)
    elif p_onnx.name.endswith(".bie"):
        assert hw_mode is not None, "get_model_io: given bie file, hw_mode is needed."
        input_nodes, output_nodes, out_node_shape, _ = futils.get_ioinfo_from_bie(
            p_onnx, hw_mode, dynasty_bin)
        # dynasty dumps float results directly, no need to convert fx to fl
    else:
        raise NotImplementedError

    # 0.24.0 ioinfo is needed for dynasty.
    # keep it (d_ioinfo) here for backward compatibility
    d_ioinfo = {}

    return input_nodes, output_nodes, out_node_shape, d_ioinfo


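# Minimal usage sketch for get_model_io (the model paths below are placeholders,
# not part of the regression flow):
#
#     input_nodes, output_nodes, out_node_shape, _ = get_model_io("models/origin.onnx")
#     input_nodes, output_nodes, out_node_shape, _ = get_model_io("models/quantized.bie", hw_mode=720)
#
# out_node_shape is what the inference_* helpers below use to fold the flattened
# dynasty/csim dumps back into onnx-shaped numpy arrays.

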
def get_model_type(p_onnx):
    """Get model type from the file name.

    Why not put this in gen_dynasty_mode_settings?
    Because it would be complicated to support the many regression debug purposes there.
    """
    if p_onnx.name.endswith(".onnx"):
        return "piano_onnx"
    elif p_onnx.name.endswith(".bie"):
        return "piano_bie"
    else:
        raise NotImplementedError


def inference_dynasty_fl_so(
        fn_onnx,
        input_np,
        input_nodes=None,  # list of input nodes, to know the order of input nodes
        out_node_shape=None,  # need this to convert dynasty_fl dump to onnx shape
        device="dynasty_cpu",
        p_working=None,
        shape_in="onnx_shape",
        dump_level=0
):
    """
    Run inference on the given model (fn_onnx) with data `input_np`.

    Runs `dynasty float` mode on this model with the given input.
    Uses the dynasty shared library for the inference.

    Args:
        fn_onnx (pathlib / str): path to origin.onnx.
        input_np (dict): dict of lists of numpy arrays. See the beginning of the module for details.
        input_nodes (list): a list of input node names.
            This indicates the order of input nodes so the numpy arrays in the dict
            can be passed to dynasty in the correct order.
            If not set, this function will call `get_model_io` to
            get `input_nodes` and `out_node_shape`.
        out_node_shape (list): a list of shapes for the output nodes.
            This is necessary to convert flattened data back to onnx shape.
        device (str): choose which method for inference.

            - `dynasty_cpu` (default), using the kneron dynasty library.
              included in kneron/toolchain.
            - `dynasty_gpu`, using the kneron dynasty library.
              included in kneron/toolchain. Needs to run in a toolchain with cuda support.
            - `ort_gpu`, using onnx runtime with GPU. (not ready).
              Use this for acceleration, but only applicable when a GPU is available.
            - `ort_cpu`, using onnx runtime with CPU. (not ready).
              some kneron customized nodes may not be supported.

        p_working (pathlib / str): where to put cache files.
            The user needs to clean this folder to release disk space.
            e2e will give a different one for each image.
        shape_in (str): choose from `onnx_shape` (default) / `channel_last` (will be obsoleted).
        dump_level:

            - 0 (default): dump inference results for output nodes only.
            - 1: dump inference results for output nodes + cpu nodes.
            - 2: dump inference results for all nodes.

    Returns:
        dict of lists of numpy arrays as inference results. Each numpy array has the shape the onnx specifies.
    """
    # prepare folder and files
    if p_working is None:
        p_working = tempfile.mkdtemp("dyn_fl_")
    p_working = pathlib.Path(p_working)
    p_onnx = pathlib.Path(fn_onnx)
    assert p_onnx.exists(), f"{fn_onnx} does not exist!"

    assert device in ["dynasty_cpu", "dynasty_gpu", "ort_cpu", "ort_gpu"], \
        f"device should be dynasty_cpu/dynasty_gpu/ort_cpu/ort_gpu, but given {device}"

    model_name = futils.clean_name(p_onnx.name)
    model_id = f"tc/{model_name}"

    # setup for inference.
    np_id = futils.gen_random_string(8)
    # will run only float mode on the given onnx
    # NOTE: we assume input does not change!
    p_input = p_working / f"model_input_{np_id}"
    # NOTE: each inference will have a different results folder.
    p_output = p_working / f"res_{np_id}"

    # prepare text input
    if input_nodes is None or out_node_shape is None:
        # e2e needs to call this if it wants to run inference one by one
        input_nodes, _, out_node_shape, _ = get_model_io(p_onnx)
    ch_last = shape_in == "channel_last"
    _, grouped_input_list, _ = futils.npy2txt(input_np, input_nodes, p_input,
                                              exists_then_skip=True,
                                              compression="txt", ch_last=ch_last)

    if "gpu" in device:
        lib = dynasty_cuda_so
    else:
        lib = dynasty_so

    if "ort" in device:
        ort = True
    else:
        ort = False

    # NOTE: use grouped_input_list[0] since E2E does one input at a time
    dynasty.run_dynasty_so(lib,
                           fn_onnx,
                           len(input_nodes),
                           grouped_input_list[0],
                           input_nodes,
                           str(p_working),
                           ort)

    # convert dynasty dump text to np
    np_out = dynasty.txt2np_so(out_node_shape, p_working)

    return np_out


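# Minimal usage sketch for inference_dynasty_fl_so (hypothetical node name,
# shape, and paths; input_np maps each input node name to a list of arrays):
#
#     import numpy as np
#     input_np = {"input.1": [np.random.rand(1, 3, 224, 224).astype(np.float32)]}
#     np_out = inference_dynasty_fl_so("models/origin.onnx", input_np,
#                                      device="dynasty_cpu",
#                                      p_working="/tmp/fl_so_demo")
#     # np_out: {output_node_name: [np.ndarray in onnx shape], ...}

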
def inference_dynasty_fl(
        fn_onnx,
        input_np,
        input_nodes=None,  # list of input nodes, to know the order of input nodes
        out_node_shape=None,  # need this to convert dynasty_fl dump to onnx shape
        device="dynasty",
        p_working=None,
        shape_in="onnx_shape",
        timeout=60 * 60 * 6,
        dump_level=0
):
    """
    Run inference on the given model (fn_onnx) with data `input_np`.

    Runs `dynasty float` mode on this model with the given input.
    Currently calls the dynasty binary for the inference.
    Maybe: use dynasty.so for inference (see inference_dynasty_fl_so).

    Args:
        fn_onnx (pathlib / str): path to origin.onnx.
        input_np (dict): dict of lists of numpy arrays. See the beginning of the module for details.
        input_nodes (list): a list of input node names.
            This indicates the order of input nodes so the numpy arrays in the dict
            can be passed to the dynasty binary in the correct order.
            If not set, this function will call `get_model_io` to
            get `input_nodes` and `out_node_shape`.
        out_node_shape (list): a list of shapes for the output nodes.
            This is necessary to convert flattened data back to onnx shape.
        device (str): choose which method for inference.

            - `dynasty` (default), using the kneron dynasty binary. included in kneron/toolchain.
            - `ort_gpu`, using onnx runtime with GPU. (not ready).
              Use this for acceleration, but only applicable when a GPU is available.
            - `ort_cpu`, using onnx runtime with CPU. (not ready).
              some kneron customized nodes may not be supported.
        p_working (pathlib / str): where to put cache files.
            The user needs to clean this folder to release disk space.
            e2e will give a different one for each image.
        shape_in (str): choose from `onnx_shape` (default) / `channel_last` (will be obsoleted).
        timeout (int): time limit in seconds for the dynasty run (default 6 hours).
        dump_level:

            - 0 (default): dump inference results for output nodes only.
            - 1: dump inference results for output nodes + cpu nodes.
            - 2: dump inference results for all nodes.

    Returns:
        dict of lists of numpy arrays as inference results. Each numpy array has the shape the onnx specifies.
    """
    # prepare folder and files
    if p_working is None:
        p_working = tempfile.mkdtemp("dyn_fl_")
    p_working = pathlib.Path(p_working)
    p_onnx = pathlib.Path(fn_onnx)
    assert p_onnx.exists(), f"{fn_onnx} does not exist!"

    assert device in ["dynasty", "ort_cpu", "ort_gpu"], \
        f"device should be dynasty/ort_cpu/ort_gpu, but given {device}"
    # TODO: if device is "cpu"/"gpu", call libdynasty.so / libdynasty_gpu.so

    model_name = futils.clean_name(p_onnx.name)
    model_id = f"tc/{model_name}"

    np_id = futils.gen_random_string(8)
    # setup input for inference.
    # NOTE: we assume input does not change!
    p_input = p_working / f"model_input_{np_id}"
    # NOTE: each inference will have a different results folder.
    p_output = p_working / f"res_{np_id}"

    # prepare text input
    if input_nodes is None or out_node_shape is None:
        # e2e needs to call this if it wants to run inference one by one, to save time.
        input_nodes, _, out_node_shape, _ = get_model_io(p_onnx)
    _, grouped_input_list, _ = futils.npy2txt(input_np, input_nodes, p_input, exists_then_skip=True)

    # prepare dynasty list
    mode_float = dynasty.gen_dynasty_mode_settings(
        "float", fn_onnx=p_onnx, onnx_map=None, model_id=model_id
    )

    d_list, dir_output_list = dynasty.gen_dynasty_list(
        [mode_float],
        grouped_input_list,
        input_nodes,
        p_output,
        dump_level=dump_level,
        shape_in=shape_in,
    )

    # run dynasty
    fn_dynasty_sh = p_working / f"run_dynasty_{np_id}.sh"
    cmds = dynasty.build_dynasty_cmd(d_list, dynasty_bin, fn_dynasty_sh)
    # TODO: add error processing.
    dynasty.run_dynasty_command_parallel(model_id, fn_dynasty_sh, timeout=timeout)

    # convert dynasty dump text to np
    np_out, _ = dynasty.txt2np(
        out_node_shape,
        output_list=dir_output_list,
        dmode="float",
        load_fl=True,
        load_fx=False,
    )

    # TODO: remove dir_output_list

    return np_out


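# Usage mirrors inference_dynasty_fl_so above, except that this variant writes a
# run_dynasty_*.sh script under p_working and drives the dynasty binary
# (possibly several inputs in parallel) instead of calling the shared library.
# A hypothetical call with placeholder paths:
#
#     np_out = inference_dynasty_fl("models/origin.onnx", input_np,
#                                   p_working="/tmp/fl_demo", dump_level=1)

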
def inference_dynasty_fx(
        fn_onnx,
        hw_mode,
        input_np,
        input_nodes=None,
        out_node_shape=None,
        p_working=None,
        shape_in="onnx_shape",
        timeout=60 * 60 * 6,
        dump_level=0,
        d_ioinfo=None
):
    """
    Run inference for a kneron platform (hw_mode) on the given model (fn_onnx) with data `input_np`.

    This call uses the kneron `dynasty_fx` binary (included in the toolchain)
    for the inference. The inference simulates one of the kneron NPU chips
    (specified by hw_mode).

    Args:
        fn_onnx (pathlib / str): path to the QUANTIZED MODEL (.bie file).
            `fn_onnx` must be generated for `hw_mode`.
            Sometimes a quantized onnx file can be passed in, but the accompanying
            quantization json must be in the same folder as the onnx. (Not tested)
        hw_mode (int): specify which platform to run.
        input_np (dict): dict of lists of numpy arrays. See the beginning of the module for details.
        input_nodes (list): a list of input node names.
            This indicates the order of input nodes so the numpy arrays in the dict
            can be passed to the dynasty binary in the correct order.
            If not set, this function will call `get_model_io` to
            get `input_nodes` and `out_node_shape`.
        out_node_shape (list): a list of shapes for the output nodes.
            This is necessary to convert flattened data back to onnx shape.
        p_working (pathlib / str): where to put cache files.
            The user needs to clean this folder to release disk space.
            e2e will give a different one for each image.
        shape_in (str): choose from `onnx_shape` (default) / `channel_last`
            (will be obsoleted).
        timeout (int): time limit in seconds for the dynasty run (default 6 hours).
        dump_level:
            - 0 (default): dump inference results for output nodes only.
            - 1: dump inference results for output nodes + cpu nodes.
            - 2: dump inference results for all nodes.
        d_ioinfo: obsoleted parameter, kept for compatibility with 0.25.0. Not used.

    Returns:
        * dict of lists of numpy arrays as inference results in floating point,
          with the shape specified by the onnx.
        * dict of lists of numpy arrays as inference results in fixed point,
          with the shape specified by the onnx.

    NOTE:
        input node shape:
            included in the np_in arrays.
        output node order:
            not needed, because results are loaded into a dictionary by name.
            The dynasty dump is name based.
    """
    # check p_working.
    if p_working is None:
        p_working = tempfile.mkdtemp("dyn_fx_")
    p_working = pathlib.Path(p_working)
    # Prepare constants

    # check file / folders
    p_onnx = pathlib.Path(fn_onnx)
    assert p_onnx.exists(), f"Given fix-point model: {p_onnx} does not exist!"

    np_id = "np_" + futils.gen_random_string(8)
    # NOTE: we assume input does not change!
    p_input = p_working / f"model_input_{np_id}"
    # NOTE: each inference will have a different results folder.
    p_output = pathlib.Path(p_working) / f"res_kdp{hw_mode}_{np_id}"

    model_name = futils.remove_appendix(p_onnx.name)
    model_id = f"tc/{model_name}"

    if input_nodes is None or out_node_shape is None:
        # try to avoid this operation to save time.
        # pass hw_mode so that .bie files can be parsed (see get_model_io).
        input_nodes, _, out_node_shape, _ = get_model_io(p_onnx, hw_mode=hw_mode)
    fx_model_type = get_model_type(p_onnx)

    # prepare text input
    _, grouped_input_list, _ = futils.npy2txt(input_np, input_nodes, p_input, exists_then_skip=True)

    # prepare dynasty list
    hw_mode = str(hw_mode)
    mode_fx = dynasty.gen_dynasty_mode_settings(
        hw_mode,
        fn_onnx=p_onnx,
        which_onnx=fx_model_type,
        onnx_map=None,
        model_id=model_id,
    )

    d_list, dir_output_list = dynasty.gen_dynasty_list(
        [mode_fx],
        grouped_input_list,
        input_nodes,
        p_output,
        dump_level=dump_level,
        shape_in=shape_in,
    )

    # run dynasty
    fn_dynasty_sh = p_working / f"run_dynasty_{np_id}.sh"
    cmds = dynasty.build_dynasty_cmd(d_list, dynasty_bin, fn_dynasty_sh)
    # TODO: add error processing.
    dynasty.run_dynasty_command_parallel(model_id, fn_dynasty_sh, timeout=timeout)

    # convert dynasty dump text to np format
    np_out_fl, np_out_fx = dynasty.txt2np(
        out_node_shape,
        output_list=dir_output_list,
        dmode=hw_mode,
        load_fl=True,
        load_fx=True,
    )

    # TODO: remove dir_output_list

    return np_out_fl, np_out_fx


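# Hypothetical usage sketch for inference_dynasty_fx (placeholder paths; the
# .bie must have been quantized for the requested hw_mode):
#
#     np_fl, np_fx = inference_dynasty_fx("models/quantized_720.bie", 720, input_np,
#                                         p_working="/tmp/fx_demo")
#     # np_fl holds de-quantized float outputs, np_fx the raw fixed-point ones,
#     # both keyed by output node name and reshaped to the onnx shapes.

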
def inference_csim_v2(p_model,
                      ioinfo,
                      input_np: dict,
                      hw_mode: int,
                      out_fmt="fl",
                      cleanup=False,
                      p_working=None,
                      p_out=None):
    """Run csim inference.

    NOTE:
        Need to call `unpack_nefs()` first to break up the nef and get the model address
        and ioinfo for this model.

    Args:
        p_model (pathlib / str): path to the unpacked model.
        ioinfo (dict): dict of input/output node info, e.g., quantization, shape.
        input_np (dict): dict of lists of numpy arrays. See the beginning of the module for details.
        hw_mode (int): specify which platform to run. It must match the nef file,
            as the nef is platform dependent.
        out_fmt (str): choose from below:

            - fl: the function will return a dict of lists of numpy arrays.
              Each array holds inference results in float format,
              with the shape specified by the onnx.
            - fx: the function will return a dict of lists of numpy arrays.
              Each array holds inference results in fixed-point format,
              with the shape specified by the onnx.
            - sqtl.bin: results are flattened and saved in bin format,
              directly dumped by csim. For debug purposes. (not ready yet)
            - dram.bin: results are kept as in dram and saved in bin format,
              directly dumped by csim. For debug purposes. (not ready yet)

        cleanup (bool): remove the cache folder before returning.
            No cleanup by default; leave it to the users.
        p_working (pathlib / str): where to put cache files.
            Users may use the same folder for multiple calls on the same model.
            (e2e will give a different one for each inference call.)
            The user needs to clean this folder to release disk space.
        p_out (pathlib / str): path to the output folder. Only applicable when
            `out_fmt` is set to `sqtl.bin` or `dram.bin`. (Not ready yet)

    Returns:
        See the `out_fmt` explanation.
    """
    # detour for 520
    if hw_mode == 520:
        return inference_csim_520(p_model, ioinfo, input_np,
                                  out_fmt,
                                  cleanup, p_working, p_out)

    # check parameters
    assert out_fmt in ["fl", "fx", "sqtl.bin", "dram.bin"]
    if out_fmt.endswith(".bin"):
        assert p_out is not None, "Please set the p_out parameter to save .bin files."
        raise NotImplementedError()

    p_model = pathlib.Path(p_model)
    assert p_model.exists(), f"{p_model} does not exist!"

    if p_working is None:
        p_working = tempfile.mkdtemp(prefix="csim_")
    # actually work under a RANDOM folder under the given p_working
    np_id = "np_" + futils.gen_random_string(8)
    p_working = pathlib.Path(p_working) / np_id

    # step 3: prepare input.bin
    csim_bin_list, p_working, p_results = data_converter(input_np, ioinfo["input"],
                                                         p_working=p_working)

    # step 5: prepare run_csim.ini
    file_loader = FileSystemLoader(f"{fconsts.P_FLOW}/template")
    jinja_env = Environment(loader=file_loader)
    template = jinja_env.get_template(f"run_csim_{hw_mode}.ini")
    csim_in_list = OrderedDict()
    dir_output_list = []
    for i, bin_pair in csim_bin_list.items():
        p_csim_dump = p_results / f"in{i:06}"
        p_csim_dump.mkdir(parents=True, exist_ok=True)

        # p_model as model input; the folder contains setup.bin / etc.
        fn_ini = p_results / f"in{i:06}_csim.ini"
        csim.gen_csim_ini(bin_pair, p_model, hw_mode,
                          template=template,
                          fn_ini=fn_ini)
        csim_in_list[i] = [p_csim_dump, fn_ini]
        dir_output_list.append(p_csim_dump)

    # step 6: call run_csim with parallel
    sh_csim = p_working / "run_csim.sh"
    bin_csim = fconsts.BIN_SET["csim"][hw_mode]
    cmd, cp = csim.run_csim(csim_in_list, bin_csim, sh_csim)
    assert cp.returncode == 0, f"csim failed with return code {cp.returncode}"

    # step 7: load csim results and return float data
    # convert csim dump text to np format
    # there is no float dump in csim output
    info_out = ioinfo["output"]
    out_node_list = [a["name"] for a in info_out]
    out_node_shape = {a["name"]: a["onnx_shape"] for a in info_out}
    out_ch_dim = {a["name"]: a["ch_dim"] for a in info_out}
    out_scale = {a["name"]: a["scale"] for a in info_out}
    out_radix = {a["name"]: a["radix"] for a in info_out}

    if DEBUG:
        # NOTE: csim will only have 8 or 15 bit dumps in the .seq file.
        # nef -> ioinfo should have converted 16-bit radix to 15-bit.
        out_bw = {a["name"]: a["bitw"] for a in info_out}
        if 16 in set(out_bw.values()):
            raise ValueError(
                f"Got bitwidths {out_bw}, but the csim .seq dump only supports 8/15 bit. "
                "Check the ioinfo unpacked from the nef.")

    # TODO: return dram.bin / sqtl.bin to p_out

    # per channel support
    np_out_fx = csim.txt2np(out_node_list, out_node_shape, dir_output_list)
    if out_fmt == "fx":
        return np_out_fx

    # convert fx to fl
    np_out_fl = dynasty.np_fx2fl(np_out_fx, out_ch_dim, out_scale, out_radix)

    if cleanup:
        # off by default. e2e will clean up.
        shutil.rmtree(str(p_working))

    return np_out_fl


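# Hypothetical usage sketch for inference_csim_v2. `unpack_nefs()` is referenced
# by the docstring but lives outside this module, so the names below are
# placeholders for whatever that step actually returns:
#
#     p_model, ioinfo = ...  # from the nef-unpacking step
#     np_fl = inference_csim_v2(p_model, ioinfo, input_np, hw_mode=720)
#     np_fx = inference_csim_v2(p_model, ioinfo, input_np, hw_mode=720, out_fmt="fx")

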
def inference_dongle(p_combined_nefs,
                     model_id: int,
                     ioinfo,
                     input_np: dict,
                     hw_mode: int,
                     out_fmt="fl"):
    """Inference the model via a dongle server.

    TODO: will be available after 0.24.0.

    - The solution may prefer to use a combined nef,
      so assume p_combined_nefs may have multiple models included.
      ``model_id`` is used to pick the correct model.
    - Specify the dongle server info in the bash environment.
    """
    pass


def data_converter(input_np, info_in, p_working=None):
    """
    Convert input numpy data into the dram.bin format that the compiler specifies.

    Args:
        input_np (dict): dict of lists of numpy arrays. See the beginning of the module for details.
        info_in (dict): quantization info and dram format from the compiler.
        p_working (pathlib / str): where to put temp files.
            Same as the p_working of csim if called by csim.
            If not specified, a random folder will be used.
            If you give `p_working`, please make sure it is a different path for each call.

    Returns:
        A tuple of 3 elements.
        - output_bins (dict): dictionary of lists of paths to input bin files,
          fix-point data in dram format. Can be fed into csim/dongle.
        - p_working (pathlib): where the cache files are. The user needs to clean them later.
        - p_results (pathlib): where the csim results will be saved.
    """
    # setup folders for inference. random EACH TIME:
    # because this function may be called in a loop,
    # it should use a unique name per input_np.
    if p_working is None:
        p_working = pathlib.Path(tempfile.mkdtemp(prefix="dpc_"))
        p_working.mkdir(parents=True, exist_ok=True)
    else:
        p_working = pathlib.Path(p_working)
        # TODO: p_working should be empty
    p_bin = p_working / "sqtbin"
    p_csim_bin = p_working / "csimbin"
    p_results = p_working / "csim_out"
    for p in [p_bin, p_csim_bin, p_results]:
        p.mkdir(parents=True, exist_ok=True)
    # DEBUG: save ioinfo for debugging
    fn_ioinfo = p_bin / "ioinfo.json"
    futils.dict2json(info_in, fn_ioinfo)

    # step 3: prepare sequential.bin
    # NOTE: we assume the per-channel scale of the input is always 1.0.
    # Other scales are not supported for now.
    # (per-channel scale should be handled in pre-processing)

    sqt_bin_list = csim.np2bin_seq(input_np, info_in, p_out=p_bin)

    # step 4: convert to the compiler-specified format.bin
    csim_bin_list, cmds = csim.data_convert(
        sqt_bin_list, info_in, p_out=p_csim_bin
    )

    return csim_bin_list, p_working, p_results


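# Hypothetical sketch of how data_converter is used together with csim (it
# mirrors the steps inside inference_csim_v2; the names are placeholders):
#
#     csim_bin_list, p_working, p_results = data_converter(input_np, ioinfo["input"])
#     # csim_bin_list maps each input index to its converted dram-format bin file(s),
#     # which inference_csim_v2 then wires into a per-input run_csim .ini file.

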
#######################################################################################################
# special processing for csim 520.
#######################################################################################################

def fix_520_output_name(rslt_csim: dict, input_nodes: list):
    """Fix the node names in the csim/dongle output dict.

    The 520 nef/setup.bin does not include node names,
    so the collected text files only use "0", "1", "2", ...
    """
    assert len(input_nodes) == len(rslt_csim), \
        f"rslt_csim: {len(rslt_csim)}, input_nodes: {len(input_nodes)}"
    rst = {v: rslt_csim[str(i)] for i, v in enumerate(input_nodes)}
    return rst


def convert_520_output_list(rslt_csim: dict):
    """Get the csim/dongle output as a list (no node names included).

    The 520 nef/setup.bin does not include node names,
    so the collected text files only use "0", "1", "2", ...

    We can still get the inference results (without node names) as a list,
    in the same order as the onnx specifies.
    """
    return [rslt_csim[str(i)] for i in range(len(rslt_csim))]


def hack_np_520(input_np):
    """Update input_np for 520 nef csim inference.

    The 520 nef / setup.bin does not include the node names,
    so "0", "1", etc. are used as defaults.
    """
    assert len(input_np) == 1, f"input_np for 520 got {len(input_np)} inputs. keys: {input_np.keys()}"

    v1 = list(input_np.values())[0]
    return {"0": v1}


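# Small illustration of the 520 name handling above (values are placeholders):
#
#     hack_np_520({"input.1": [arr]})            # -> {"0": [arr]}
#     fix_520_output_name({"0": out}, ["out0"])  # -> {"out0": out}
#     convert_520_output_list({"0": a, "1": b})  # -> [a, b]

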
def inference_csim_520(p_model,
                       ioinfo,
                       input_np: dict,
                       out_fmt="fl",
                       cleanup=False,
                       p_working=None,
                       p_out=None):
    """Run csim inference for 520.

    NOTE:
        Need to call `unpack_nefs()` first to break up the nef and get the model address
        and ioinfo for this model.

    Args:
        p_model (pathlib / str): path to the unpacked model.
        ioinfo (dict): dict of input/output node info, e.g., quantization, shape.
        input_np (dict): dict of lists of numpy arrays. See the beginning of the module for details.
        out_fmt (str): choose from below:

            - fl: the function will return a dict of lists of numpy arrays.
              Each array holds inference results in float format,
              with the shape specified by the onnx.
            - fx: the function will return a dict of lists of numpy arrays.
              Each array holds inference results in fixed-point format,
              with the shape specified by the onnx.
            - sqtl.bin: results are flattened and saved in bin format,
              directly dumped by csim. For debug purposes. (not ready yet)
            - dram.bin: results are kept as in dram and saved in bin format,
              directly dumped by csim. For debug purposes. (not ready yet)

        cleanup (bool): remove the cache folder before returning.
            No cleanup by default; leave it to the users.
        p_working (pathlib / str): where to put cache files.
            Users may use the same folder for multiple calls on the same model.
            (e2e will give a different one for each inference call.)
            The user needs to clean this folder to release disk space.
        p_out (pathlib / str): path to the output folder. Only applicable when
            `out_fmt` is set to `sqtl.bin` or `dram.bin`. (Not ready yet)

    Returns:
        See the `out_fmt` explanation.
    """
    hw_mode = 520

    # check parameters
    assert out_fmt in ["fl", "fx", "sqtl.bin", "dram.bin"]
    if out_fmt.endswith(".bin"):
        assert p_out is not None, "Please set the p_out parameter to save .bin files."
        raise NotImplementedError()

    p_model = pathlib.Path(p_model)
    assert p_model.exists(), f"{p_model} does not exist!"

    if p_working is None:
        p_working = tempfile.mkdtemp(prefix="csim_")
    # actually work under a RANDOM folder under the given p_working
    np_id = "np_" + futils.gen_random_string(8)
    p_working = pathlib.Path(p_working) / np_id

    # step 3: prepare input_rgba.bin
    csim_bin_list, p_working, p_results = data_converter_520(input_np,
                                                             ioinfo["input"],
                                                             p_working=p_working)

    def find_model_bins(p_model):
        cs = {}
        for fn_key in ["command.bin", "setup.bin", "weight.bin"]:
            t = p_model.glob(f"*{fn_key}")
            cs[fn_key] = list(t)[0]
        return cs

    cs_abs = find_model_bins(p_model)
    bin_csim = fconsts.BIN_SET["csim"][520]

    def gen_csim_cmd_1(fn_input_rgba, p_csim_out):
        # get relative positions
        cs = {}
        for k, v in cs_abs.items():
            cs[k] = futils.relative_path(v, p_csim_out)

        # NOTE: only 1 input for 520, no need for ","
        c = f"""{bin_csim} -d 0 --thread 1 {cs["command.bin"]} {cs["weight.bin"]} {fn_input_rgba} --setup {cs["setup.bin"]}"""

        command = f"pushd {p_csim_out} > /dev/null && {c} && popd > /dev/null"

        return command

    # step 5: prepare the run_csim commands
    dir_output_list = []
    cmds = []
    # NOTE: there is only one input node for 520
    for i, bin_rgba in enumerate(csim_bin_list[0]):
        p_csim_dump = p_results / f"in{i:06}"
        p_csim_dump.mkdir(mode=0o770, parents=True, exist_ok=True)
        dir_output_list.append(p_csim_dump)

        # p_model as model input; the folder contains setup.bin / etc.
        cmd1 = gen_csim_cmd_1(bin_rgba, p_csim_dump)
        cmds.append(cmd1)

    # step 6: call run_csim, optionally in parallel
    if len(cmds) > 1:
        sh_csim = p_working / "run_csim.sh"
        with open(sh_csim, "w") as f:
            for cmd1 in cmds:
                f.write(f"{cmd1}\n")
        command = f"parallel --jobs 6 --halt now,fail=1 < {sh_csim}"
    else:
        command = cmds[0]

    cp = futils.run_bash_script(command)
    assert cp.returncode == 0, f"csim failed with return code {cp.returncode}"

    # step 7: load csim results and return float data
    # convert csim dump text to np format
    # there is no float dump in csim output
    info_out = ioinfo["output"]
    out_node_list = [a["name"] for a in info_out]
    out_node_shape = {a["name"]: a["onnx_shape"] for a in info_out}
    out_ch_dim = {a["name"]: a["ch_dim"] for a in info_out}
    out_scale = {a["name"]: a["scale"] for a in info_out}
    out_radix = {a["name"]: a["radix"] for a in info_out}
    # NOTE: a["name"] is "0", "1", ... since names are not available in nef_v0/setup.bin

    # TODO: return dram.bin / sqtl.bin to p_out

    # per channel support
    np_out_fx = csim.txt2np(out_node_list, out_node_shape, dir_output_list, is_520=True)
    if out_fmt == "fx":
        return np_out_fx

    # convert fx to fl
    np_out_fl = dynasty.np_fx2fl(np_out_fx, out_ch_dim, out_scale, out_radix)

    if cleanup:
        # off by default. e2e will clean up.
        shutil.rmtree(str(p_working))

    return np_out_fl


def data_converter_520(input_np, info_in, p_working=None):
    """
    Convert input numpy data into the dram.bin format that the compiler specifies.

    Note:
        - 520 only takes one RGBA.bin file, which is very different from the other platforms.

    Args:
        input_np (dict): dict of lists of numpy arrays. See the beginning of the module for details.
        info_in (dict): quantization info and dram format from the compiler.
        p_working (pathlib / str): where to put temp files.
            Same as the p_working of csim if called by csim.
            If not specified, a random folder will be used.
            If you give `p_working`, please make sure it is a different path for each call.

    Returns:
        A tuple of 3 elements.
        - output_bins (dict): dictionary of lists of paths to input bin files,
          fix-point data in dram format. Can be fed into csim/dongle.
        - p_working (pathlib): where the cache files are. The user needs to clean them later.
        - p_results (pathlib): where the csim results will be saved.
    """
    # setup folders for inference. random EACH TIME:
    # because this function may be called in a loop,
    # it should use a unique name per input_np.
    if p_working is None:
        p_working = pathlib.Path(tempfile.mkdtemp(prefix="dpc_"))
        p_working.mkdir(parents=True, exist_ok=True)
    else:
        p_working = pathlib.Path(p_working)

    # TODO: p_working should be empty
    p_csim_bin = p_working / "csimbin"
    p_results = p_working / "csim_out"
    for p in [p_csim_bin, p_results]:
        p.mkdir(parents=True, exist_ok=True)

    # step 3: prepare rgba.bin
    input_np = hack_np_520(input_np)
    # should be a list of lists.
    in_lst = [[a] for a in input_np["0"]]
    csim_bin_list = csim.txt2bin_rgba(in_lst, info_in, p_csim_bin)

    return csim_bin_list, p_working, p_results


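# Hypothetical end-to-end sketch for the 520 path (placeholder names; in the
# regression flow these values come from the nef-unpacking step):
#
#     np_fl = inference_csim_v2(p_model_520, ioinfo_520, {"input.1": [img]}, hw_mode=520)
#     # inference_csim_v2 detours to inference_csim_520, whose data_converter_520
#     # renames the single input via hack_np_520 and packs it into an RGBA .bin
#     # before invoking the 520 csim binary.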