#! /usr/bin/env python3
|
||
import os
|
||
import shutil
|
||
import copy
|
||
import tempfile
|
||
import pathlib
|
||
|
||
import json # sometime commentjson is too slow
|
||
import re
|
||
import random
|
||
from collections import OrderedDict
|
||
from dict_recursive_update import recursive_update
|
||
|
||
from blinker import signal
|
||
import subprocess
|
||
|
||
import pandas as pd
|
||
|
||
from jinja2 import Environment, FileSystemLoader
|
||
|
||
import sys_flow_v2.flow_utils as futils
|
||
import sys_flow_v2.util_lib as util_lib
|
||
import sys_flow_v2.flow_constants as fconsts
|
||
import sys_flow_v2.dynasty_v3 as dynasty
|
||
import sys_flow_v2.compiler_v2 as compiler
|
||
import sys_flow_v2.csim_utils as csim
|
||
from sys_flow_v2.exceptions import RegressionError, MultiRegressionError, GeneralError, print_err, print_command, run_module
|
||
from sys_flow_v2.onnx_op_stats import onnx_info
|
||
from sys_flow_v2.snr_calculator_v2 import combine_snr, calculate_statistics, get_case_output, get_weight_bin_stats
|
||
|
||
import snoop
|
||
|
||
# Enable verbose tracing (snoop) only when REGRESSION_DEBUG is set in the
# environment. bool() replaces the redundant `True if ... else False`.
DEBUG = bool(os.environ.get("REGRESSION_DEBUG", False))
snoop.install(enabled=DEBUG)
|
||
|
||
|
||
def release_test_case(path_to_model, path_to_base, dump_dynasty=False):
    """a helper function to release generated model.

    Copies a curated subset of the model's files (selected by glob patterns)
    into ``path_to_base/<model name>``, mirroring the relative layout.

    inputs:
    - dump_dynasty: dump the dynasty output for debug purpose, in mode 2/3.
      NOTE(review): currently unused inside this function -- confirm intent.

    Returns the destination directory as a pathlib.Path.
    """
    # glob patterns (relative to the model root) selecting what gets released
    release_patterns = [
        "input/*.origin.onnx",
        "input/knerex_input*",
        "input/simulator_input*",
        # "*/*.json",
        "output/knerex_*/*.onnx",
        "output/knerex_*/*.bie",
        "output/*.xlsx",
        "output/compiler_*/*command.bin",
        "output/compiler_*/*setup.bin",
        "output/compiler_*/*weight.bin",
        "output/compiler_*/apb.npu",
        "output/compiler_*/*.nef",
        "output/compiler_*/*.kne",
    ]

    src_root = pathlib.Path(path_to_model)
    dst_root = pathlib.Path(path_to_base) / src_root.name
    for pattern in release_patterns:
        for src in src_root.glob(pattern):
            # copy to relative path to base.
            rel = futils.relative_path(src, src_root)
            dst = dst_root / rel
            pp(f"{src} -> {dst}")  # noqa
            if dst.exists():
                pp(f"{dst} exists! skip")  # noqa
                continue
            # mkdir with exist_ok covers the "parent already there" case
            dst.parent.mkdir(exist_ok=True, parents=True)
            if src.is_symlink():
                # fn_to.symlink_to(fn.readlink()) # TODO: after toolchain use py 3.9
                # NOTE: assume all released symbolic links in released files are relatively link
                # NOTE: check symlink before check is_dir
                dst.symlink_to(os.readlink(src))
            elif src.is_dir():
                shutil.copytree(src, dst)
            else:
                shutil.copy(src, dst, follow_symlinks=False)
    return dst_root
|
||
|
||
|
||
class test_case:
|
||
"""The class to provide unified interface for test_case.
|
||
|
||
input: model path, where model and files should be orgazed already.
|
||
output: model infomation.
|
||
|
||
* run_flow is the function to run all modules, with a `config` input
|
||
* the config will define which modules to run.
|
||
"""
|
||
|
||
def __init__(self, model_path, config=None):
|
||
"""
|
||
The `test_case` class wrap up the interface of model.
|
||
It support unprocessed model and load pre-existing fx model.
|
||
|
||
"""
|
||
|
||
# the model may be unprocessed or processed (with fx model)
|
||
# the config may be string or a path to a json saved for THIS model.
|
||
if config is None:
|
||
p_regression_config = pathlib.Path(model_path) / "output" / "regression_config.json"
|
||
if p_regression_config.exists():
|
||
# use existing config
|
||
config = p_regression_config
|
||
if config and type(config) in [str, pathlib.PosixPath]:
|
||
p_config = pathlib.Path(config)
|
||
if p_config.exists():
|
||
config = futils.load_regression_json(p_config)
|
||
# TODO: or should I skip some steps? where operate on self.config
|
||
|
||
self.initial_test_case(model_path, config)
|
||
if config:
|
||
# NOTE: config will be deepcopyed. so no lock in it.
|
||
self.prepare_flow(config)
|
||
|
||
self.check_this_case()
|
||
|
||
def initial_test_case(self, model_path, config=None):
|
||
"""initial test case. set up pre-defined path for this test case.
|
||
|
||
* set up name/path for onnx / input, etc
|
||
* verify input images for knerex / dynasty
|
||
* set up logger.
|
||
|
||
NOTE: do not use self.config in this function.
|
||
Suppose to be independant from regression/config
|
||
"""
|
||
|
||
try:
|
||
self.model_path = pathlib.Path(model_path)
|
||
self.model_name = self.model_path.name
|
||
self.cat_name = self.model_path.parent.name
|
||
self.model_id = f"{self.cat_name}/{self.model_name}"
|
||
|
||
# create logger. Try to keep this as early as possible
|
||
self.logger = futils.create_logger(f"model {self.model_name}", None, "WARNING")
|
||
self.logger.info("run initial_test_case")
|
||
|
||
if not self.model_path.exists():
|
||
raise RegressionError("general/initial", self.model_id, msg="model does not exist.")
|
||
self.prepare_path(config)
|
||
|
||
# pre-defined onnx names
|
||
self.map_onnx, self.onnx_infos, self.btm_dynasty_mode, self.btm_model_opt = self.get_map_onnx(config)
|
||
|
||
self.graph_warnings = {}
|
||
|
||
except Exception as e:
|
||
self.logger.error(e) # what if logger not ready yet?
|
||
raise RegressionError("general/initial", self.model_id)
|
||
|
||
@run_module(module_name="general/model oversize")
|
||
def check_onnx_size(self, p_origin):
|
||
"""Examine the file size of origin.onnx.
|
||
Internal regression will skip onnx too large.
|
||
"""
|
||
onnx_size = int(pathlib.Path(p_origin).resolve().stat().st_size / (1024 * 1024))
|
||
max_MB = self.config["compiler_piano"]["max_onnx_MB"]
|
||
signal("data_sender").send((self.model_id, "general/onnx size (MB)", onnx_size))
|
||
self.onnx_size = onnx_size
|
||
if onnx_size > max_MB:
|
||
raise RegressionError("general/model oversize", self.model_id, msg=f"onnx {onnx_size}Mb//max size {max_MB}Mb")
|
||
|
||
def check_this_case(self):
|
||
"""Some special check on this case."""
|
||
if pathlib.Path(self.map_onnx["origin"]).name.endswith(".bie"):
|
||
# NOTE: origin.bie is only supported in only_ip_evaluator.
|
||
assert self.config["module_run"]["only_ip_evaluator"], "origin.bie is only for only_ip_evaluator !!!"
|
||
|
||
def check_csim_error(self, cp, platform):
|
||
"""Find detail reason for csim crash.
|
||
|
||
CSIM will return 33 as exit code for some known errors.
|
||
|
||
TODO: move to csim_utils.py?
|
||
"""
|
||
|
||
cat1 = f"kdp{platform}"
|
||
if cp.returncode == 0:
|
||
# success
|
||
return
|
||
elif cp.returncode == 33:
|
||
pat = re.compile("\[\[\[(.*?)\]\]\]", re.MULTILINE | re.DOTALL)
|
||
log = "\n".join([cp.stdout, cp.stderr])
|
||
msg = "\n".join(pat.findall(log))
|
||
raise RegressionError(f"{cat1}/compiler error", self.model_id, msg=msg)
|
||
elif cp.returncode == 111:
|
||
# timeout
|
||
raise RegressionError(f"{cat1}/csim", self.model_id, msg=cp.stderr)
|
||
else:
|
||
raise RegressionError(f"{cat1}/csim", self.model_id)
|
||
|
||
def check_knerex_error(self, cp, platform):
|
||
"""Find detailed report for calling knerex.
|
||
|
||
There are some submodules in knerex, e.g., datapath analysis, may went wrong.
|
||
This step is to improve debug process by reporting specific reasons.
|
||
"""
|
||
|
||
cat1 = f"kdp{platform}"
|
||
log = "\n".join([str(cp.stdout), str(cp.stderr)])
|
||
fn_log = self.path[f"knerex_output_{platform}"] / "knerex_run.log"
|
||
if self.config["path"]["internal"]:
|
||
# cp.returncode > 0 and
|
||
# now save the log if run internal
|
||
with open(fn_log, "w") as f:
|
||
f.write(f"knerex return with code {cp.returncode}\n\n")
|
||
f.writelines(log)
|
||
|
||
# check memory estimation for datapath analysis
|
||
|
||
re_mem_est = re.compile("Datapath Analysis takes (\d+)KB=\((\d+)KB for model buffer \+ (\d+)KB for results\) per thread")
|
||
try:
|
||
dpm_total, dpm_buf, dpm_rslt = re_mem_est.findall(log)[0]
|
||
# buffer related to thread number
|
||
# dpm_rslt related to image number
|
||
signal("data_sender").send((self.model_id, f"{cat1}/dp analysis total (KB)", dpm_total))
|
||
signal("data_sender").send((self.model_id, f"{cat1}/dp analysis buf (KB)", dpm_buf))
|
||
signal("data_sender").send((self.model_id, f"{cat1}/dp_analysis result (KB)", dpm_rslt))
|
||
except:
|
||
pass
|
||
|
||
# check memory estimation for sequential bias adjust
|
||
re_mem_est = re.compile("Sequential Bias Adjustment takes (\d+)KB memory to hold (\d+) samples of (\d+)KB each")
|
||
try:
|
||
spb_total, spb_n, spb_x1 = re_mem_est.findall(log)[0]
|
||
signal("data_sender").send((self.model_id, f"{cat1}/seq bias adjust total (KB)", spb_total))
|
||
signal("data_sender").send((self.model_id, f"{cat1}/seq bias adjust n", spb_n))
|
||
signal("data_sender").send((self.model_id, f"{cat1}/seq bias adjust mem x1 (KB)", spb_x1))
|
||
except:
|
||
pass
|
||
|
||
# check memory estimation for parallel bias adjust
|
||
re_mem_est = re.compile("Parallel Bias Adjustment takes (\d+)KB=\((\d+)KB for model buffer \+ (\d+)KB for results\) per thread")
|
||
try:
|
||
ppb_total, ppb_buf, ppb_rslt = re_mem_est.findall(log)[0]
|
||
signal("data_sender").send((self.model_id, f"{cat1}/prll bias adjust total (KB)", ppb_total))
|
||
signal("data_sender").send((self.model_id, f"{cat1}/prll bias adjust buf (KB)", ppb_buf))
|
||
signal("data_sender").send((self.model_id, f"{cat1}/prll bias adjust result (KB)", ppb_rslt))
|
||
except:
|
||
pass
|
||
|
||
s1 = {
|
||
"knerex": "KnerexERROR:\s*(.*)",
|
||
"HW not support": "HW_NOT_SUPPORT:\s*(.*)",
|
||
"unimplemented feature": "UNIMPLEMENTED_FEATURE:\s*(.*)"
|
||
}
|
||
for m1, p1 in s1.items():
|
||
p2 = re.compile(p1).findall(log)
|
||
if len(p2) > 0:
|
||
msg = p2[0]
|
||
self.model_fx_report[(f"{cat1}/ERROR")] = msg
|
||
raise RegressionError(f"{cat1}/{m1}", self.model_id, msg=msg)
|
||
|
||
if cp.returncode == 0:
|
||
return
|
||
elif cp.returncode == 111:
|
||
# stderr.startswith("TIMEOUT"):
|
||
raise RegressionError(f"{cat1}/knerex", self.model_id, msg=cp.stderr)
|
||
elif cp.returncode == 11:
|
||
# DELETE below
|
||
raise RegressionError(f"{cat1}/knerex", self.model_id, msg="datapath analysis failed")
|
||
elif cp.returncode == 30:
|
||
raise RegressionError(f"{cat1}/knerex", self.model_id, msg="KnerexMemoryInsufficient")
|
||
else:
|
||
# NOTE: check knerex log for specific errors
|
||
spec_err = {"deadloop": ["Deadloop", "Loop Maxed out"]}
|
||
|
||
for cat2, msgs in spec_err.items():
|
||
for msg in msgs:
|
||
if len(re.compile(msg).findall(log)) > 0:
|
||
raise RegressionError(f"{cat1}/knerex", self.model_id, msg=cat2)
|
||
|
||
# by default
|
||
raise RegressionError(f"{cat1}/knerex", self.model_id, msg=f"err: {cp.returncode}")
|
||
|
||
    def get_map_onnx(self, config):
        """There are a few onnx used/generated during the quantization process.

        This step is to create map of possible onnx.

        NOTE:
        The keys here are widely used in this project. DO NOT change any.
        Follow the name rules of "kdp{hw_mode}_{optimization}_{dev_v}_{fmt}"

        Factors:
        - dev_v: develop version. currently only "piano"
        - hw_mode: float, kdp520/kdp720/etc
        - optimization: origin / scaled / bias adjust / ...
        - format: onnx / bie

        Returns a 4-tuple:
        - map_onnx: key -> pathlib.Path for each (possible) model file
        - onnx_infos: "origin" -> onnx_info (left empty when origin is a .bie)
        - btm_dynasty_mode: hw_mode -> dynasty mode string for bit-true-match
        - btm_model_opt: hw_mode -> the configured model optimization level
        """
        map_onnx = {}
        onnx_infos = {}
        btm_dynasty_mode = {}
        btm_model_opt = {}

        # there must be a origin.onnx (or origin.bie for only_ip_evaluator)
        origin_onnx = f"{self.model_path}/input/{self.model_name}.origin.onnx"
        model_opt = config["compiler_piano"]["model_optimize"]

        p_origin = pathlib.Path(origin_onnx)
        using_bie = False
        if not p_origin.exists():
            # second choice is origin.bie
            origin_bie = f"{self.model_path}/input/{self.model_name}.origin.bie"
            p_origin = pathlib.Path(origin_bie)
            if not p_origin.exists():
                raise RegressionError("general/Missing origin.onnx", self.model_id)
            using_bie = True
        map_onnx["origin"] = p_origin

        # read in the origin.onnx for latter usage
        # TODO: can we skip to save time?
        # TODO: make this block work on bie?
        if not using_bie:
            onnx_infos["origin"] = onnx_info(p_origin)
            # estimated MAC memory (kB) is kept for later reporting
            _, _, self.est_mac_kB = onnx_infos["origin"].get_mac_memory()
            self.check_onnx_io(onnx_infos["origin"])

        for hw_mode in fconsts.MODE_HARDWARE:  # 520/720/530
            btm_dynasty_mode[hw_mode] = f"{hw_mode}{fconsts.MODEL_RELEASE[model_opt]}"
            btm_model_opt[hw_mode] = model_opt

            for fmt in fconsts.MODEL_FORMAT:  # piano, onnx / bie
                # piano, normal. the only develop version for now. treat as constant
                dev_v = "piano"

                p_knerex_out = self.path[f"knerex_output_{hw_mode}"]

                # all knerex outputs for this hw_mode share this filename prefix
                prefix = f"{self.model_name}.kdp{hw_mode}"

                # this is copied fron compiler frontend
                map_onnx[f"kdp{hw_mode}_opt_{dev_v}_{fmt}"] = p_knerex_out / f"{prefix}.graph_opt.{fmt}"

                # knerex generated for wq mode.
                map_onnx[f"kdp{hw_mode}_quan_{dev_v}_{fmt}"] = p_knerex_out / f"{prefix}.scaled.quan.{fmt}"

                k_opt_prefix = {}
                # below generated by knerex.
                # some optimized level: scaled, wqbi, hwbi, hwbi-mse
                k_opt_prefix["scaled"] = f"{prefix}.scaled"
                for bi_name in ["wqbi", "hwbi", "hwbi-mse"]:
                    k_opt_prefix[bi_name] = f"{prefix}.scaled.quan.{bi_name}"

                for opt, pref in k_opt_prefix.items():
                    # this is to speficy how knerex dump
                    map_onnx[f"kdp{hw_mode}_{opt}_{dev_v}_{fmt}"] = p_knerex_out / f"{pref}.{fmt}"
                    # move_release_bie will REPLACE _{model_opt}_ to point to .release.bie

                # model_opt is config chosen # related to BTM
                pref = k_opt_prefix[model_opt]
                # compiler input bie from knerex. will affect btm. save in this key for future use.
                map_onnx[f"kdp{hw_mode}_bie4compiler_{dev_v}_{fmt}"] = map_onnx[f"kdp{hw_mode}_{model_opt}_{dev_v}_{fmt}"]
                # will release this bie
                map_onnx[f"kdp{hw_mode}_release_{dev_v}_{fmt}"] = p_knerex_out / f"{pref}.release.{fmt}"

        return map_onnx, onnx_infos, btm_dynasty_mode, btm_model_opt
|
||
|
||
def load_per_model_config(self, p_model_config):
|
||
"""A user-config json file (model_config.json) may be provide for fine-tune quantization process. """
|
||
if p_model_config.exists():
|
||
# deep copy of origin config
|
||
config_new = copy.deepcopy(self.config)
|
||
with open(p_model_config, "r") as f:
|
||
per_model_config = json.load(f)
|
||
recursive_update(config_new, per_model_config)
|
||
self.config = config_new
|
||
|
||
def get_nef_model_id(self):
|
||
"""Get the NEF model ID.
|
||
|
||
First tries to read from model_id file in output directory.
|
||
If not found, determines ID based on configuration and saves it to the file.
|
||
|
||
NOTE:
|
||
- 不要依赖于 model_id 文件, 因为它可能被清空.
|
||
"""
|
||
p_model_id = self.model_path / "output" / "model_id"
|
||
|
||
try:
|
||
if p_model_id.exists():
|
||
with open(p_model_id, "r") as f:
|
||
model_id = int(f.read())
|
||
return model_id
|
||
except:
|
||
pass
|
||
|
||
# If file doesn't exist or is invalid, determine model ID using existing logic
|
||
model_id = None
|
||
k = (self.cat_name, self.model_name)
|
||
if k in self.config["map_model_id"]:
|
||
model_id = self.config["map_model_id"][k]
|
||
else:
|
||
if self.config["module_run"]["only_dongle"]:
|
||
raise RegressionError("general/initial", self.model_id, msg="only_dongle requires model_id recorded. please run 'helper_model_id.py' first.")
|
||
|
||
try:
|
||
# guess from model_name if from app_release.
|
||
s = re.compile("model_(\d+)")
|
||
model_id = int(s.findall(str(self.model_name))[0])
|
||
except:
|
||
if self.config["path"]["internal"]:
|
||
model_id = random.randint(20000, 30000)
|
||
|
||
if model_id is None:
|
||
# this is fallback value.
|
||
model_id = 32768
|
||
|
||
# save to file
|
||
with open(p_model_id, "w") as f:
|
||
f.write(str(model_id))
|
||
|
||
return model_id
|
||
|
||
def prepare_flow(self, config):
|
||
"""Prepare for the quantization flow.
|
||
|
||
Check the per-model config.
|
||
"""
|
||
try:
|
||
self.config = copy.deepcopy(config)
|
||
|
||
if not self.config["module_run"]["only_ip_evaluator"]:
|
||
self.check_input_files()
|
||
|
||
# update config if this model has specific config to change
|
||
p_model_config = self.model_path / "input" / "model_config.json"
|
||
self.load_per_model_config(p_model_config)
|
||
|
||
# save status to local
|
||
# TODO: send this out to report instead of signal
|
||
self.module_status = {"general": {"Success": False}}
|
||
for hw_mode in self.config["hw_mode_on"]:
|
||
self.module_status[hw_mode] = {}
|
||
|
||
# some special model types. default settings.
|
||
self.is_big_model = True
|
||
self.is_single_layer = False # for debug
|
||
self.is_multi_layer = False # for debug
|
||
self.is_multi_core = False # for debug
|
||
if self.config["path"]["internal"]:
|
||
# if internal, some special settings
|
||
self.is_big_model = "big_model" == self.config["regression"]["model_type"]
|
||
self.is_single_layer = "single_layer" == self.config["regression"]["model_type"]
|
||
self.is_multi_layer = "multi_layer" == self.config["regression"]["model_type"]
|
||
self.is_multi_core = "multi_core" == self.config["regression"]["model_type"]
|
||
|
||
# nef_model_id is needed for calling batch-compiler
|
||
self.nef_model_id = self.get_nef_model_id()
|
||
self.logger.info(f"{self.cat_name}/{self.model_name} with nef model id: {self.nef_model_id}")
|
||
if self.is_big_model:
|
||
signal("data_sender").send((self.model_id, "general/nef_model_id", str(self.nef_model_id)))
|
||
|
||
if len(str(self.path["user_config_json"])) > 4:
|
||
with open(self.path["user_config_json"], "r") as f:
|
||
self.config["user_config"] = json.load(f)
|
||
|
||
# need to check validation of onnx first
|
||
if self.config["module_run"]["validate_onnx"]:
|
||
self.check_onnx_valid()
|
||
|
||
if self.is_big_model:
|
||
self.check_onnx_size(self.map_onnx["origin"])
|
||
|
||
self.compiler_output = {}
|
||
|
||
# use model_report to save results for this fx model generating.
|
||
# then save to "output/model_fx_report.json"
|
||
self.model_fx_report = OrderedDict()
|
||
self.model_fx_report["docker_version"] = self.config["path"]["toolchain"]["version"]
|
||
if self.config["path"]["internal"]:
|
||
self.model_fx_report["binary source"] = fconsts.bin_msg
|
||
self.model_fx_report["comments"] = self.config["comments"]
|
||
self.model_fx_release = OrderedDict()
|
||
|
||
self.pre_clean_up()
|
||
|
||
# create configs for datapath analysis, csim ini, etc
|
||
# initial jinja2
|
||
file_loader = FileSystemLoader(str(self.config["path"]["template"]))
|
||
self.jinja_env = Environment(loader=file_loader)
|
||
|
||
self.save_regression_json()
|
||
|
||
# save cli commands for debug purpose
|
||
self.commands = []
|
||
|
||
except Exception as e:
|
||
self.logger.error(e)
|
||
if type(e) is RegressionError: # TODO: MultiRegressionError
|
||
raise
|
||
else:
|
||
raise RegressionError("general/prepare", self.model_id)
|
||
|
||
@run_module(module_name="general/clean_opt")
|
||
def clean_opt(self):
|
||
"""Clean up opt_compile generated by compiler submodules (fm-cut, etc)."""
|
||
# clean up opt_compile which is from fm_cut but sometime not cleaned.
|
||
p_out = self.path["dir_output"]
|
||
p_opt_cmpls = list(p_out.glob("compiler_*/opt_compile"))
|
||
for p_opt in p_opt_cmpls:
|
||
cmd = f"pkill -f {self.model_name} ; sleep 1; rm -rf {p_opt}"
|
||
cp2 = futils.run_bash_script(cmd, do_echo=False)
|
||
# cp2.returncode == -15
|
||
|
||
    @run_module(module_name="general/post_clean")
    def post_clean_up(self):
        """To clean up before finish.

        NOTE: This used be `__del__` method but it may not be triggerd immediately
        after the flow finihs. It has been renamed and put into run_flow.

        The "run_flow" will not be called multiple times according to our experience.

        This method will be called when flow success.
        If any submodule failed, this function should be called in `run_single_case` error handle.
        """
        # detour. if need to delete output folder
        if self.need_clean("all_output"):
            self.clean_all_output()
            return

        # otherwise, normal clean up process.
        # save commands to file. but dynasty related are not included yet.
        self.generate_bash_script()

        if hasattr(self, "work_in_memory") and self.work_in_memory and hasattr(self, "path"):
            # per compiler team request, dont use zip, just copy back
            d_from = self.path["dir_output_memory"].absolute()
            d_to = self.path["dir_output"].absolute()
            # if d_to.is_symlink():
            # d_to.unlink()
            # unmount the bind mount over the on-disk output (if still mounted),
            # then stream-copy the in-memory tree back with tar (preserves links)
            command = f"if mountpoint -q {d_to}; then umount {d_to}; fi; pushd {d_from} > /dev/null; tar cf - . | (mkdir -p {d_to}; cd {d_to}; tar xvf -)"
            if DEBUG:
                print("recovering from work_in_memory")
                print(command)
            cp = futils.run_bash_script(command)
            # TODO: check cp.returncode
            # remove the /dev/shm scratch folder created by create_dir_in_memory
            shutil.rmtree(d_from.parent.absolute())

        if self.config["path"]["internal"]:
            # for internal, we need to set permission to debug
            self.set_permission_output()

        # close log handlers so files are flushed and descriptors released
        for handler in self.logger.handlers[:]:
            handler.close()
            self.logger.removeHandler(handler)

        if hasattr(self, "dir_output_list"):
            self.clean_dynasty_output(self.dir_output_list)
|
||
|
||
def __repr__(self):
|
||
"""Provide brief info on the model."""
|
||
return f"Model {self.model_path}"
|
||
|
||
def prepare_output_dongle(self):
|
||
"""Prepare output_dongle for only_dongle.
|
||
|
||
If only_dongle, it should work on a new folder so that have its own flow_commands.sh
|
||
but it require links to output/compiler_xxx and output/results
|
||
"""
|
||
p_out_1 = self.model_path / "output"
|
||
p_out_2 = self.model_path / "output_dongle"
|
||
p_out_2.mkdir(parents=True, exist_ok=True)
|
||
p_links = ["results", "compiler_730", "knerex_730"]
|
||
for pname in p_links:
|
||
p_from = p_out_1 / pname
|
||
if not p_from.exists():
|
||
raise RegressionError("general/prepare", self.model_id, msg=f"only_dongle need output/{pname} ready.")
|
||
p_to = p_out_2 / pname
|
||
futils.safe_link(p_from, p_to)
|
||
return p_out_2
|
||
|
||
    def prepare_path(self, config=None):
        """Examine essential files/folders for model.
        All essential paths are saved in a dictionary.

        if config is None, this will not be a full run.
        """
        self.path = {}
        # input folder

        # output folder. this will be used many times
        dir_out = self.model_path / "output"
        if config and config["module_run"]["only_dongle"]:
            # only_dongle runs in its own folder linked back to output/
            dir_out = self.prepare_output_dongle()

        self.path["user_config_json"] = self.model_path / "input/user_config.json"
        if not pathlib.Path(self.path["user_config_json"]).exists():
            # "" is the sentinel for "no user config"
            self.path["user_config_json"] = ""

        for hw_mode in fconsts.MODE_HARDWARE:  # 520/720/530/730/630
            p_knerex_out = dir_out / f"knerex_{hw_mode}"
            self.path[f"knerex_output_{hw_mode}"] = p_knerex_out
            self.path[f"updater_{hw_mode}_json"] = p_knerex_out / f"updater_{hw_mode}.json"

        self.path["fn_json_radix"] = self.model_path / "input/input_radix.json"  # User defined json
        # NOTE: why use knerex_input instead of node_input name?
        # 1. the node_input name may include "/", which will cause great trouble if used as char in diretory name.
        # 2. the node_input name could be arbitariely ANYTHING. we cannot ganrantee safety or conflicts with our other files.
        # NOTE: for multiple inputs, we assume each PAIR/GROUP file are put into knerex_input/knerex_input_1/... with SAME name
        # here we assume knerex_input is for the 1st input node given by ONNX, and knerex_input_1 is for 2nd input node.
        # We also assume the input node given by ONNX is same as in piano graph. otherwise BIG PROBLEM.
        p_knerex_in = self.model_path / "input/knerex_input"
        self.path["dir_knerex"] = p_knerex_in
        if not p_knerex_in.exists():
            raise RegressionError("general/Missing input", self.model_id, msg="Mising knerex_input folder.")
        self.path["dir_simulator"] = self.model_path / "input/simulator_input"
        if not self.path["dir_simulator"].exists():
            # will use same as knerex_input
            self.path["dir_simulator"] = p_knerex_in

        # if dir_out is symlink, which is leftover from last UNSUCCESSFUL run, not cleaned up
        if dir_out.is_symlink():
            # NOTE: dir_out is a symlink but will not exist() if the target does not exist
            dir_out.unlink()

        self.path["dir_input"] = self.model_path / "input"
        self.path["dir_output"] = dir_out
        dir_out.mkdir(mode=0o770, parents=True, exist_ok=True)

        if config:
            skip_qat = config["knerex"]["skip_qat_json"]
            self.work_in_memory = config["regression"]["work_in_memory"]
        else:
            # partial run: no qat skipping, stay on disk
            skip_qat = False
            self.work_in_memory = False

        # HACK: work_in_memory is to make output folder in memory. to avaoid disk io block.
        # especially for big model with feature map cut. which need to write many times in compiler output
        if self.work_in_memory:
            self.path["dir_output_memory"] = self.create_dir_in_memory(dir_out)

        for hw_mode in fconsts.MODE_HARDWARE:  # 520 / 720 / 530 / etc
            p_knerex_out = dir_out / f"knerex_{hw_mode}"
            # knerex temporally analysis results
            self.path[f"temp_dpa_piano_{hw_mode}"] = p_knerex_out / f"analysis_datapath_piano_{hw_mode}.tmp"
            self.path[f"temp_wta_piano_{hw_mode}"] = p_knerex_out / f"analysis_weight_piano_{hw_mode}.tmp"

            # compiler and nef output directory
            compiler_out = dir_out / f"compiler_{hw_mode}"
            nef_out = dir_out / f"nef_{hw_mode}"
            self.path[f"compiler_piano_{hw_mode}_out"] = compiler_out
            self.path[f"compiler_piano_{hw_mode}_json"] = compiler_out / f"compiler_piano.config.kdp{hw_mode}.json"
            self.path[f"compiler_hack_{hw_mode}_json"] = self.model_path / f"input/config_hack_{hw_mode}.json"

            self.path[f"nef_output_{hw_mode}"] = nef_out

            # for backend node graph. so customers could see the datapath.
            self.path[f"model_fx_svg_{hw_mode}"] = dir_out / f"opt_stage2_{hw_mode}.svg"

            # qat config json for knerex
            self.path[f"qat_{hw_mode}_config_json"] = self.model_path / f"input/qat_{hw_mode}_config.json"
            qat_not_exist = not self.path[f"qat_{hw_mode}_config_json"].exists()
            if skip_qat or qat_not_exist:
                # "" sentinel means "no qat config" downstream
                self.path[f"qat_{hw_mode}_config_json"] = ""

        if config and (not config["module_run"]["only_ip_evaluator"]):
            # needs self.input_file_format (set by check_npy_or_txt) before find_btm_txt
            self.check_npy_or_txt(self.path["dir_knerex"])
            self.find_btm_txt(config["dynasty"]["regression_input"])

        # fx model report. for every run
        self.path["model_fx_html"] = dir_out / "model_fx_report.html"
        # for app release only
        self.path["model_fx_json"] = dir_out / "model_fx_report.json"
        # where to save self.config to this file for future reference.
        self.path["export_regression_json"] = dir_out / "regression_config.json"
        # back up bash commands
        self.path["fn_cmd"] = dir_out / "flow_commands.sh"

        # remove any stale success marker from a previous run
        p1 = dir_out / "success"
        self.path["success_sign"] = p1
        if p1.exists():
            p1.unlink()
|
||
|
||
def create_dir_in_memory(self, dir_out):
|
||
"""Create a folder to work-in-memory. avoid writing to disk many times.
|
||
|
||
NOTE: not for only_dongle
|
||
|
||
If need to work_in_memory, then work at /dev/shm
|
||
will be saved as zip file later.
|
||
the whole output folder is in memory
|
||
"""
|
||
d_temp = pathlib.Path(tempfile.mkdtemp(prefix="/dev/shm/wim_"))
|
||
dir_out_memory = d_temp / "output"
|
||
dir_out_memory.mkdir(parents=True, exist_ok=True)
|
||
|
||
# NOTE: work_in_memory means old results cleaned up.
|
||
# it used to copy datapath_analysis temp results but the folder had been changed.
|
||
# so skip it now.
|
||
|
||
# use mount
|
||
command = f"mount --bind {dir_out_memory} {dir_out}"
|
||
cp = futils.run_bash_script(command)
|
||
|
||
if DEBUG:
|
||
print(f"work_in_memory: {dir_out_memory} mount to output folder: {dir_out}")
|
||
print(command)
|
||
|
||
return dir_out_memory
|
||
|
||
def set_permission_output(self):
|
||
"""Set permission for test cases so that other users can access.
|
||
|
||
If not using docker, One can only set permissions for file created by themselves.
|
||
If using docker, you can anything
|
||
|
||
Diretory set to 755, files set to 644.
|
||
|
||
Using pathlib.Path.chmod in docker will NOT work. so we use bash
|
||
"""
|
||
dir_out = self.path["dir_output"]
|
||
try:
|
||
futils.set_folder_public(dir_out)
|
||
except Exception as e:
|
||
self.logger.error(e)
|
||
|
||
def find_simulator_input_list(self, p_txt):
|
||
"""
|
||
Find the input images in simluator_input folder.
|
||
|
||
The `simulator_input` contains input for dynasty/csim/dongle inference.
|
||
|
||
Our regression are using the file name `test_input.txt` as default file name for bit-true-match. Users may limit the number of input groups for inference. The `test_input.txt` will be used at first by default.
|
||
|
||
# TODO: refactor this function
|
||
# TODO: if no test_input.txt exist, randomly pick it for bit-true-match
|
||
"""
|
||
# default (self.btm_txt) is usually "test_input.txt"
|
||
p_default = list(p_txt.glob(self.btm_txt))
|
||
if len(p_default) == 0:
|
||
raise RegressionError("general/Missing input", self.model_id, msg=f"No {self.btm_txt} in {p_txt.name}")
|
||
|
||
if self.config["dynasty"]["regression_input"] == "default":
|
||
# just use one
|
||
sim_lists = [p_default[0]]
|
||
else: # otherwise runn dynasty on all txt
|
||
# TODO: dynasty input may take both txt and npy?
|
||
sim_lists = list(p_txt.glob(f"*.{self.input_file_format}"))
|
||
|
||
# at least there is test_input.txt
|
||
# sort input texts by names. but move "test_input.txt" to the 1st if exists
|
||
sim_lists = sorted(sim_lists, key=lambda x: "" if x.name == self.btm_txt else x.name)
|
||
if self.config["dynasty"]["sample_seed"] is not None and len(sim_lists) > 2:
|
||
# randomize
|
||
ram_list = sim_lists[1:]
|
||
random.seed(self.config["dynasty"]["sample_seed"])
|
||
random.shuffle(ram_list)
|
||
sim_lists = sim_lists[:1] + ram_list
|
||
# sim_lists[0] is always test_input.txt
|
||
|
||
list_input_simulator = [self.find_multiple_input(a) for a in sim_lists]
|
||
|
||
# apply num_input_samples to limit number of images. // to save time in regression for quicker test.
|
||
n_max_input = self.config["dynasty"]["num_input_samples"]
|
||
list_input_simulator = list_input_simulator[:n_max_input]
|
||
|
||
return list_input_simulator
|
||
|
||
def check_npy_or_txt(self, p_knerex):
|
||
"""Find out the input file format in knerex_input.
|
||
|
||
Preferred `npy`, then `txt`.
|
||
|
||
Currently there should be only one format in `knerex_input` folder.
|
||
Knerex will report error if more than one formats in it.
|
||
"""
|
||
n_npy = len(list(p_knerex.glob("*.npy")))
|
||
n_txt = len(list(p_knerex.glob("*.txt")))
|
||
if n_npy > 0:
|
||
suffix = "npy"
|
||
if n_txt > 0:
|
||
raise RegressionError("general/Missing input", self.model_id, msg=f"Found {n_npy} npy and {n_txt} txt in {p_knerex}. Knerex only support 1 format in folder.")
|
||
elif n_txt > 0:
|
||
suffix = "txt"
|
||
else:
|
||
raise RegressionError("general/Missing input", self.model_id, msg=f"No npy/txt in {p_knerex}")
|
||
|
||
self.input_file_format = suffix
|
||
|
||
def find_btm_txt(self, regression_input="default", prefix="test_input"):
|
||
"""Setup btm_txt and related."""
|
||
self.btm_txt = f"{prefix}.{self.input_file_format}"
|
||
|
||
# selected one input (test_input.txt by default) for bit-true-match
|
||
self.path["btm_dump"] = self.path["dir_output"] / "results" / prefix
|
||
|
||
# this is deferred to now because we need the info of self.btm_txt
|
||
if regression_input == "all":
|
||
dir_o = self.path["dir_output"] / "snr_analysis"
|
||
else:
|
||
dir_o = self.path["btm_dump"]
|
||
|
||
self.fn_report = dir_o / "snr_analysis_report.csv"
|
||
self.path["snr_csv"] = dir_o / "snr_analysis_per_layer.csv"
|
||
self.path["snr_excel"] = self.path["dir_output"] / f"{self.model_name}_snr_report.xlsx"
|
||
|
||
def check_input_files(self):
    """Examine the input text files in knerex_input / simulator_input folders.

    There should be at least 1 input image in knerex_input for datapath
    analysis, which is essential for quantization.

    There should be at least 1 input image in the simulator_input folder,
    which is used for dynasty / csim / dongle inference. Our regression uses
    the file name `test_input.txt` as the default file name for
    bit-true-match. If there is no file named "test_input.txt", a random
    file in the simulator_input folder will be picked and linked as
    test_input.txt.

    For models with multiple input nodes, there should be the SAME filename,
    e.g., `camera_002.txt` in

    * knerex_input / simulator_input , for 1st input node
    * knerex_input_1 / simulator_input_1, for 2nd input node
    * knerex_input_2 / simulator_input_2, for 3rd input node
    * ... if necessary

    Raises:
        RegressionError: when no input file is found for knerex.
    """
    # knerex will use all txt in knerex_input folder
    p_knerex = pathlib.Path(self.path["dir_knerex"])

    # NOTE: '**/*.txt' would find txt files at all depths; only top level is used here
    self.list_input_knerex = [self.find_multiple_input(a) for a in list(p_knerex.glob(f"*.{self.input_file_format}"))]
    if len(self.list_input_knerex) == 0:
        raise RegressionError("general/Missing input", self.model_id, msg=f"No txt in {p_knerex}")

    # dynasty will pick text from simulator_input folder
    # it needs test_input.txt
    self.list_input_simulator = self.find_simulator_input_list(pathlib.Path(self.path["dir_simulator"]))
    # `test_input.txt` in `simulator_input` will be used for bit-true-match check by default
    self.list_input_btm = [self.list_input_simulator[0]]

    # check input files
    self.logger.info(f"Found {len(self.list_input_knerex)} input image for knerex")
    self.logger.info(f"Found {len(self.list_input_simulator)} input image for simulator")

    # HACK: Create noise input
    if futils.get_switch_value(self.config["module_run"], "piano_dynasty_noise", False):
        sigma_levels = self.config["dynasty"]["noise_sigma"]
        p_input = self.model_path / "input"
        self.list_input_simulator_noise = {}
        for p_simu in p_input.glob("simulator_input*"):
            if "_sigma" in p_simu.name:  # don't repeat itself
                continue
            futils.create_noise_input_folder(p_simu, sigma_levels)
        for sigma in sigma_levels:
            p_simu = p_input / f"simulator_input_sigma{sigma}"
            assert p_simu.exists(), f"{p_simu} does not exists."
            self.list_input_simulator_noise[sigma] = self.find_simulator_input_list(p_simu)

    # create link for test_input.txt if necessary
    # as we use models linked from model_source, this may fail.
    if self.config["dynasty"]["regression_input"] == "default":
        self.fn_input_default = [self.find_multiple_input(self.path["dir_simulator"] / self.btm_txt, verify_exist=False)]
        if not pathlib.Path(self.fn_input_default[0][0]).exists():
            # fix: Logger.warn is a deprecated alias of Logger.warning
            self.logger.warning(f"missing simulator_input/{self.btm_txt}. trying to link.")
            for i_from, i_to in zip(self.list_input_simulator[0], self.fn_input_default[0]):
                futils.safe_link(i_from, i_to)
|
||
|
||
def check_onnx_io(self, origin_info):
    """Record basic input/output node info from the original onnx.

    Output:
    * self.io_nodes["input_node", "origin"]: input node names in onnx order
      (needed by knerex / dynasty before compiler)
    * self.io_nodes["out_node", "origin"]: cleaned output node names

    A more accurate source is load_compiler_ioinfo(), which refreshes
    self.io_nodes with extra information — but only after the compiler has
    produced ioinfo.csv.

    Raises:
        RegressionError: when the onnx declares no input node.
    """
    self.io_nodes = {}
    input_nodes, output_nodes, opset = origin_info.get_ioinfo()
    if not input_nodes:
        raise RegressionError("general/Missing input", self.model_id, "wrong onnx: no input nodes.")

    # NOTE: we suppose the input node order is identical for 520/720/etc.,
    # otherwise the input_lots.json would differ per hardware.
    # NOTE: DO NOT use clean_name on input_nodes —
    # the original names are needed in knerex updater and run_dynasty.
    self.io_nodes["input_node", "origin"] = input_nodes
    self.io_nodes["out_node", "origin"] = [futils.clean_name(name) for name in output_nodes]
|
||
|
||
def save_regression_json(self):
    """Persist this regression's config to json for debugging (big models only)."""
    if not self.is_big_model:
        return

    # drop bulky / derived entries before serializing
    slim_config = copy.deepcopy(self.config)
    for key in ("snr_ref", "map_model_id"):
        slim_config.pop(key, None)

    with open(self.path["export_regression_json"], "w") as f:
        json.dump(slim_config, f, indent=4, sort_keys=False, default=str)
|
||
|
||
def get_input_folders(self, input_nodes, first_input_folder):
    """Map each input node name to its knerex input folder.

    The first node uses ``first_input_folder``; node N (N >= 2) is expected
    in ``{first_input_folder}_{N-1}``.

    Raises:
        RegressionError: when an expected folder does not exist.
    """
    if not pathlib.Path(first_input_folder).exists():
        raise RegressionError("general/Missing input", self.model_id, msg=f"Missing {first_input_folder}")

    n_nodes = len(input_nodes)
    # at least one input node
    folders = {input_nodes[0]: first_input_folder}

    # additional input nodes live in numbered sibling folders
    for idx, node_name in enumerate(input_nodes[1:], start=1):
        # NOTE: verify multi input node folder
        self.logger.info(f"Check input folder {idx+1}/{n_nodes}: \"{node_name}\". ")
        candidate = f"{first_input_folder}_{idx}"
        folders[node_name] = candidate

        if not os.path.exists(candidate):
            msg = f"""MISSING input folder {idx+1}/{n_nodes}: node "{node_name}", expect txt in "{candidate}". """
            self.logger.critical(msg)
            raise RegressionError("general/Missing input", self.model_id, msg=msg)
    return folders
|
||
|
||
def generate_knerex_config(self, *, hw_mode):
    """
    Generate config json for knerex using template.
    Settings include per regression / per model.

    Output file:
    * `updater_NNN.json` for platform `NNN`.

    Raises:
        RegressionError: when the rendered json file was not created on disk.
    """
    # NOTE(review): indentation reconstructed from a whitespace-mangled
    # source — verify nesting against version control.
    input_nodes = self.io_nodes["input_node", "origin"]
    fn_json, dir_input_1st = self.path[f"updater_{hw_mode}_json"], self.path["dir_knerex"]
    fn_json.parent.mkdir(parents=True, exist_ok=True)
    input_folders = self.get_input_folders(input_nodes, dir_input_1st)

    # conf holds everything the jinja template needs to render the updater json
    conf = {}

    # TODO: remove t, use keys from config["knerex"]
    # knerex settings copied verbatim from the per-regression config
    t = [
        "verbose",
        "percentile",
        "same_scale",
        "per_channel_radix",
        "output_scale",
        "output_radix",
        "cpu_scale",
        "cpu_radix",
        "fixed_scale_mode",
        "max_scale",
        "data_analysis_threads",
        "datapath_range_method",
        "outlier_factor",
        "bn_weight_pct",
        "conv_weight_pct",
        "num_input_samples",
        "dump_level",
        "datapath_bitwidth_mode",
        "weight_bitwidth_mode",
        "model_in_bitwidth_mode",
        "model_out_bitwidth_mode",
        "cpu_bitwidth_mode",
        "datapath_mix_percentile",
        "weight_mix_percentile",
        "data_analysis_pct",  # outliers
        "need_additional_data_analysis_pct",
        "additional_data_analysis_pcts",
        "dynamic_range_based_on_bitwidth",
        "lut_high_accuracy_mode",
        "dummy_bn_remove_mode"
    ]

    # copy knerex configs from config
    for k in t:
        conf[k] = self.config["knerex"][k]

    input_shape = self.config["dynasty"]["input_shape"]
    # map the configured input layout onto knerex's shape_order flag;
    # any unrecognized value falls back to onnx order ("1")
    convert = {"onnx_shape": "1", "channel_last": "0"}
    conf["shape_order"] = convert.get(input_shape, "1")
    conf["type"] = fconsts.KNEREX_UPDATER_TYPE[hw_mode]

    # per model settings.
    # input files for knerex
    # will only use graphopt.bie from compiler frontend from 0.24.0
    conf["fn_origin_onnx"] = self.map_onnx[f"kdp{hw_mode}_opt_piano_bie"]
    conf["test_config"] = ""
    conf["user_config_json"] = self.path["user_config_json"]

    conf["qat_config"] = self.path[f"qat_{hw_mode}_config_json"]

    # temp files (datapath / weight analysis intermediates)
    conf["fn_dp_analysis_piano"] = self.path[f"temp_dpa_piano_{hw_mode}"]
    conf["fn_wt_analysis_piano"] = self.path[f"temp_wta_piano_{hw_mode}"]

    # output: the quantized (scaled) bie produced by knerex
    conf["outmodel"] = self.map_onnx[f"kdp{hw_mode}_scaled_piano_bie"]

    # render the json file
    template = self.jinja_env.get_template(f"updater_{hw_mode}.json")
    output = template.render(input_nodes=input_nodes, input_folders=input_folders, conf=conf)
    with open(fn_json, "w") as f:
        f.write(output)
    # check before finish
    if not pathlib.Path(fn_json).exists():
        raise RegressionError(f"kdp{hw_mode}/knerex", self.model_id, msg="Failed to create knerex config json.")
|
||
|
||
@run_module(module_name="auto/check compiler output")
def load_compiler_dump(self, *, hw_mode):
    """Locate and record the compiler / batch-compiler output files.

    The command.bin/etc. carry a prefix when produced by the batch
    compiler, so discovery is delegated to compiler.locate_compiler_dump().
    """
    module_name = f"kdp{hw_mode}/load compiler dump"
    self.logger.info(f"{module_name}")

    out_dir = self.path[f"compiler_piano_{hw_mode}_out"]
    self.compiler_output[hw_mode] = compiler.locate_compiler_dump(out_dir, hw_mode)
|
||
|
||
@run_module(module_name="auto/parse_ioinfo")
def load_compiler_ioinfo(self, *, hw_mode):
    """Parse compiler io info to determine input/output node shapes.

    NOTE:
        this method requires compiler output, so call it after compiler.

    This function will load the ioinfo from compiler output,

    - ~~load `ioinfo.json` in compiler output folder.~~ obsolete from 0.26.0
    - load `.no_binary.json` in compiler output folder, or extracted from models.kne, from 0.26.0
    - save to `self.io_nodes`, which includes

      - input nodes shapes / data format.
      - output nodes shapes / data format.
      - cpu nodes.

    This function will also find the corresponding dynasty dump for golden.
    It needs to decide:

    - which dynasty mode output folder (related to knerex optimization)
    - which format (fx or fl)
    """
    # NOTE(review): indentation reconstructed from a whitespace-mangled
    # source — verify nesting against version control.
    module_name = f"kdp{hw_mode}/parse_ioinfo"
    self.logger.info(f"{module_name}")

    p_compiler_out = self.path[f"compiler_piano_{hw_mode}_out"]
    # use the compiler_730/models.no_binary.json or .no_binary.json parsed from kne.
    ioinfo = compiler.convert_ioinfo(p_compiler_out, hw_mode)

    # no clean_name on input_nodes: original names are needed downstream
    input_nodes = [a["name"] for a in ioinfo["input"]]
    output_nodes = [futils.clean_name(a["name"]) for a in ioinfo["output"]]
    cpu_nodes = []  # TODO

    # diagnostics only — an empty list is logged, not raised
    if len(input_nodes) == 0:
        self.logger.critical("NO input_nodes found")
    if len(output_nodes) == 0:
        self.logger.critical("NO output_nodes found.")

    # find the golden in dynasty for btm
    dynasty_mode = self.btm_dynasty_mode[hw_mode]

    p_dump = self.path["btm_dump"]
    p_dynasty_dump = p_dump / f"mode_{dynasty_mode}_piano"
    p_csim_dump = p_dump / f"csim_{hw_mode}"
    p_pld_report = p_dump / "pld_report"

    # ini file for csim btm dump. default is test_input.txt
    self.path[f"csim_{hw_mode}_ini"] = p_csim_dump / f"run_csim_{hw_mode}.ini"
    self.path[f"csim_{hw_mode}_ini_pld"] = p_csim_dump / f"run_csim_{hw_mode}.pld.ini"

    # prepare dynasty golden
    # NOTE: 720, 530 dynasty may have golden as _fl.txt if `data_format` is `RAW_FLOAT`
    golden_txt_fns = [f"layer_output_{a}_fx.txt" for a in output_nodes]
    p_dynasty_golden = [p_dynasty_dump / fn for fn in golden_txt_fns]

    # predefined filenames
    # record information for bit-true-match. this is related to which text_input
    self.io_nodes[("btm_text_input", hw_mode)] = self.btm_txt
    self.io_nodes[("btm_dynasty_mode", hw_mode)] = dynasty_mode
    self.io_nodes[("btm_dynasty_path", hw_mode)] = p_dynasty_dump
    self.io_nodes[("btm_dynasty_golden_txt_fn", hw_mode)] = golden_txt_fns
    self.io_nodes[("btm_dynasty_golden_txt_path", hw_mode)] = p_dynasty_golden

    self.io_nodes[("btm_csim_path", hw_mode)] = p_csim_dump
    # csim input filenames for btm are fixed here; the naming must match csim_utils.py
    self.io_nodes[("btm_csim_in_bin", hw_mode)] = [p_csim_dump / f"csim_p000000_i{i:03}.bin" for i in range(len(input_nodes))]
    # both the dump folder and the ini path are needed to run csim
    self.io_nodes[("btm_csim_in", hw_mode)] = [[p_csim_dump, self.path[f"csim_{hw_mode}_ini"]]]
    self.io_nodes[("btm_csim_in_pld", hw_mode)] = [[p_csim_dump, self.path[f"csim_{hw_mode}_ini_pld"]]]

    # needed for dynasty / csim btm debug
    self.io_nodes[("pld_report", hw_mode)] = p_pld_report

    # general info
    self.io_nodes[("ioinfo", hw_mode)] = ioinfo
    self.io_nodes[("input_node", hw_mode)] = input_nodes
    self.io_nodes[("out_node", hw_mode)] = output_nodes
    self.io_nodes[("cpu_node", hw_mode)] = cpu_nodes

    # verify input / output node names (diagnostic print only)
    if DEBUG:
        self.verify_compiler_io_names(hw_mode)

    # save for reference, but only for internal regression
    if self.config["path"]["internal"]:
        self.model_fx_report[(f"kdp{hw_mode}/btm_dynasty_path")] = p_dynasty_dump

    # one dump folder per scheduled nef inference run
    for i in range(self.config["nef"]["inference_count"]):
        p_nef_dump = p_dump / f"nef_{hw_mode}_output_{i}"
        self.io_nodes[("btm_nef_path", hw_mode, i)] = p_nef_dump
        p_nef_kneron_plus_dump = p_dump / f"nef_kneron_plus_{hw_mode}_output_{i}"
        self.io_nodes[("btm_nef_kneron_plus_path", hw_mode, i)] = p_nef_kneron_plus_dump
|
||
|
||
def verify_compiler_io_names(self, hw_mode):
    """Compare input/output node names between origin.onnx and compiler dump.

    NOTE:
        Internal-regression diagnostic: the compiler output may
        legitimately differ from origin.onnx. The diff is printed when
        REGRESSION_DEBUG=1.
    """
    compiled_in = self.io_nodes[("input_node", hw_mode)]
    compiled_out = self.io_nodes[("out_node", hw_mode)]

    origin_in = self.io_nodes["input_node", "origin"]
    origin_out = self.io_nodes["out_node", "origin"]

    names_match = (compiled_in == origin_in) and (compiled_out == origin_out)
    if not names_match:
        print(f"origin.onnx specify:\n\tinput nodes: {origin_in}\n\toutput nodes: {origin_out} \n")
        print(f"compiler {hw_mode} specify:\n\tinput nodes: {compiled_in}\n\toutput nodes: {compiled_out} \n")
|
||
|
||
@run_module(module_name="auto/gen_csim_ini")
def generate_csim_ini(self, *, hw_mode):
    """
    create .ini config for csim using jinja2 template
    per 520/720/530/730/630.

    CSIM 520 will not use this .ini config
    CSIM 720/530/730/630 will use this .ini file directly

    Input files:
    * ioinfo.csv from compiler output.
    * model files for 520/720/530/530:
      * weight.bin
      * command.bin
      * setup.bin
      * apb.npu
    * model files for 540/730:
      * model_NNN.kne
    * input file for inference
      * dynasty dumped input file, prepared by `data_convert`
      * `output/results/FN_INPUT/model_520-wqbi_piano/layer_input_*.bin`

    Output files:
    * run_csim_NNN.ini
    """
    self.logger.info(f"generating csim ini for {hw_mode}")
    hw_modes_on = self.config["hw_mode_on"]
    assert hw_mode in hw_modes_on, f"hw_mode is: {hw_mode}, not in hw_mode_on {hw_modes_on}"

    # for piano compiler output
    p_compiler = self.path[f"compiler_piano_{hw_mode}_out"]
    p_csim_dump = self.io_nodes[("btm_csim_path", hw_mode)]
    bin_pair = self.io_nodes[("btm_csim_in_bin", hw_mode)]
    golden_txt = self.io_nodes[("btm_dynasty_golden_txt_path", hw_mode)]

    # RTL-release need to set this to 3
    dump_core_opt = self.config["csim"]["dump_core_opt"]

    # generate ini for normal csim
    template = self.jinja_env.get_template(f"run_csim_{hw_mode}.ini")
    # BUG: this ini is for btm pair only. not for general
    fn_ini = self.path[f"csim_{hw_mode}_ini"]
    csim.gen_csim_ini(bin_pair, p_compiler, hw_mode,
                      template=template,
                      fn_ini=fn_ini,
                      golden_txts=golden_txt,
                      dump_core_opt=dump_core_opt)

    # generate ini for pld csim
    # NOTE(review): the pld variant does not pass dump_core_opt — presumably
    # intentional (template default applies); confirm.
    template_pld_dump = self.jinja_env.get_template(f"run_csim_{hw_mode}.pld.ini")
    fn_ini_pld = self.path[f"csim_{hw_mode}_ini_pld"]
    csim.gen_csim_ini(bin_pair, p_compiler, hw_mode,
                      template=template_pld_dump,
                      fn_ini=fn_ini_pld,
                      golden_txts=golden_txt)

    # files created by this function: fn_ini / fn_ini_pld
|
||
|
||
# function created: fn_ini / fn_ini_pld
|
||
|
||
def check_csim_btm_input(self, *, hw_mode):
    """Verify the converted csim input bins exist (data_convert was skipped).

    Raises:
        RegressionError: listing the missing .bin file names, if any.
    """
    expected_bins = self.io_nodes[("btm_csim_in_bin", hw_mode)]
    missing = [p.name for p in expected_bins if not p.exists()]
    if missing:
        raise RegressionError(f"kdp{hw_mode}/dongle missing input", self.model_id, msg=f"missing: {', '.join(missing)}")
|
||
|
||
@run_module(module_name="auto/data_convert")
def data_convert(self, *, hw_mode):
    """Convert the bit-true-match input.txt pair into csim .bin files.

    * 520 is not supported here (see data_convert_520).

    Input files:
    * dynasty input text files.
    """
    module_name = f"kdp{hw_mode}/data_convert"
    self.logger.info(f"check {module_name}")

    # Input bins for csim follow the compiler's node order (ioinfo), which
    # may differ from the onnx order in self.io_nodes["input_node", "origin"].
    # NOTE: files referenced in the ini are relative to the output folder.
    dump_dir = self.io_nodes[("btm_csim_path", hw_mode)]
    input_info = self.io_nodes[("ioinfo", hw_mode)]["input"]

    bin_seq = csim.txt2bin_seq(self.list_input_btm, input_info, dump_dir)
    converted_bins, cmds = csim.data_convert(bin_seq, input_info, p_out=dump_dir)
    self.save_command(module_name, "\n".join(cmds))

    # function output
    # TODO: should make sure this equals the btm_csim_in_bin computed earlier
    self.io_nodes[("btm_csim_in_bin", hw_mode)] = converted_bins[0]

    # TODO: why would list_input_bin_rtl be needed?
    # TODO: if compiler specifies RAW_FLOAT, should this use dynasty/_fl.bin?
    return
|
||
|
||
def find_multiple_input(self, fn_input0, verify_exist=True):
    """Expand the 1st input file into the full per-node input set.

    Given the first input image path, return a list with one file path per
    onnx input node (length 1 for single-input models). Sibling folders
    are expected to be named ``{prefix}_1``, ``{prefix}_2``, ...

    todo: refactor into utils
    """
    base_name = fn_input0.name
    root_dir = fn_input0.parent.parent
    folder_prefix = fn_input0.parent.name.removesuffix("_0")

    if verify_exist:
        assert fn_input0.exists()
    inputs = [str(fn_input0)]

    input_nodes, _, _ = self.onnx_infos["origin"].get_ioinfo()

    # NOTE: currently discovered by searching input folders.
    # TODO: verify against the onnx input count
    for node_idx in range(1, len(input_nodes)):
        candidate = root_dir / f"{folder_prefix}_{node_idx}" / base_name
        if verify_exist and not candidate.exists():
            raise RegressionError("general/Missing input", self.model_id, msg=f"missing input: {candidate}")
        inputs.append(str(candidate))

    return inputs
|
||
|
||
def est_memory_dynasty_fx(self):
    """Warn when dynasty-fx inference may exceed the available memory.

    Only 520/720 need a large-memory estimate; when neither platform is
    enabled this is a no-op.
    """
    # only some platforms need the estimate
    platforms_large_memory = [520, 720]
    plts = [m for m in self.config["hw_mode_on"] if m in platforms_large_memory]
    if not plts:
        return

    est_avl_kB = futils.estimate_mem_available()
    # TODO: what if multi-thread?
    if self.est_mac_kB > est_avl_kB:
        self.logger.error(f"WARNING: Estimated max memory need for dynasty fx {plts} is {self.est_mac_kB} kB.")
        self.logger.error(f"    Current available memory is {est_avl_kB} kB.")
|
||
|
||
@run_module(module_name="general/invalid_onnx")
def check_onnx_valid(self):
    """Raise RegressionError when the original onnx fails validation."""
    is_valid = self.onnx_infos["origin"].is_valid_onnx()
    if not is_valid:
        raise RegressionError("general/invalid_onnx", self.model_id)
|
||
|
||
def run_flow(self):
    """The main function for the kneron internal quantization flow.

    Here it controls the sequence of module execution.

    `config` defines which module to run.
    For complicated process, e.g., bias adjust,
    you can define multiple configs and call `run_flow(conf1)` and `run_flow(conf2)`, etc

    Returns:
        self.model_fx_release: list of files to release after gen_fx_model.
    """
    # NOTE(review): block indentation reconstructed from a whitespace-mangled
    # source — verify every nesting level against version control.
    # TODO: better flow control per platform. aka. one platform fail will not affect another one

    # some shortcuts
    do_dynasty = self.config["module_run"]["piano_dynasty"]
    do_csim = self.config["module_run"]["csim"]
    do_dongle = self.config["module_run"]["run_nef_kneron_plus"]
    only_dongle = self.config["module_run"]["only_dongle"]

    self.logger.setLevel(self.config["regression"]["logging_level"])

    # compiler frontend is needed for only_ip_evaluator and quantization;
    # it will provide node-mapping for ip_eval
    if self.config["module_run"]["compiler_frontend"]:
        for hw_mode in self.config["hw_mode_on"]:
            # generate cpu node list and node mapping
            self.run_compiler_frontend(hw_mode=hw_mode)

    # the real quantization:
    # quantization = compiler frontend + knerex + compiler
    if self.config["module_run"]["piano_knerex"]:
        for hw_mode in self.config["hw_mode_on"]:
            # generate quantized model
            self.generate_knerex_config(hw_mode=hw_mode)
            self.run_knerex(hw_mode=hw_mode)
            if self.config["compiler_piano"]["convert_enc"]:
                self.convert_enc(hw_mode=hw_mode)

    if self.config["module_run"]["gen_nef"]:
        for hw_mode in self.config["hw_mode_on"]:
            # generate nef+release.bie for hardware
            p_out = self.path[f"compiler_piano_{hw_mode}_out"]
            self.generate_nef(hw_mode=hw_mode, p_nef=p_out)

        # some cache folders in the compiler dump need to be cleaned.
        self.clean_opt()

    if self.config["layer_statistics"]["weight_stats"]:
        self.load_weight_bin_stats()

    # now all kinds of inference
    if do_dynasty:
        self.dir_output_list = self.run_dynasty_inference()
    else:
        # if no dynasty scheduled to run, search the results folder for existing dynasty dumps.
        dir_results = self.path["dir_output"] / "results"
        self.dir_output_list = [f for f in dir_results.rglob('*') if f.is_dir()]

    if self.config["module_run"]["tflite"]:
        self.run_tflite(self.list_input_simulator)

    if self.config["module_run"]["onnxruntime"]:
        self.run_onnxruntime(self.list_input_simulator)

    if self.config["module_run"]["snr_calculation"]:
        # for SNR of dynasty v2 calling.
        self.run_dynasty_snr(self.dir_output_list)
        if self.config["dynasty"]["regression_input"] == "all":
            # combine snr into the overall report
            self.generate_snr_report()
            self.clean_dynasty_output(self.dir_output_list)
        if not self.config["path"]["internal"]:
            # used by customer in toolchain
            self.convert_snr_report()

        for hw_mode in self.config["hw_mode_on"]:
            self.verify_snr(hw_mode=hw_mode)

    if self.config["module_run"]["verify_decomp_snr"]:
        for hw_mode in self.config["hw_mode_on"]:
            self.verify_decomp_snr(hw_mode=hw_mode)

    if self.config["module_run"]["any_bi_enable"]:
        self.verify_bias_adjust_performance()

    if self.config["module_run"]["calculate_layer_statistics"]:
        self.load_layer_statistics()

    # PREPARE for csim/nef btm
    if do_csim or do_dongle:
        # NOTE: load io_info.csv from the last run (supposed to exist)
        for hw_mode in self.config["hw_mode_on"]:
            self.load_compiler_dump(hw_mode=hw_mode)
            self.load_compiler_ioinfo(hw_mode=hw_mode)

            # NOTE(review): this branch consumes the loop variable, so it is
            # assumed to run once per hw_mode — confirm against original nesting.
            if only_dongle:
                # for only_dongle, the csim should have run and the input.bin should be ready.
                # TODO: load json as saved below
                self.check_csim_btm_input(hw_mode=hw_mode)
            else:
                if hw_mode not in [520]:
                    # convert dynasty input for csim. no need for 520
                    # NOTE: in regression, we will only convert "test_input.txt" by default
                    self.data_convert(hw_mode=hw_mode)
                else:
                    self.data_convert_520(hw_mode=hw_mode)

    if do_csim:
        for hw_mode in self.config["hw_mode_on"]:
            if hw_mode == 520:
                self.run_csim_520()
            else:
                self.generate_csim_ini(hw_mode=hw_mode)
                self.run_csim(hw_mode=hw_mode)

            self.btm_dyn_csim(hw_mode=hw_mode)
            if self.config["module_run"]["csim_ci"] and hw_mode not in [520]:
                self.run_csim_ci(hw_mode=hw_mode)

            if self.config["module_run"]["rtl_cmd_check"] and hw_mode not in [520, 720]:
                self.check_rtl_cmd(hw_mode=hw_mode)

    if do_dongle:
        inference_count = self.config["nef"]["inference_count"]

        hw_dongle_available = [520, 720, 630, 730]  # 530
        for hw_mode in hw_dongle_available:
            if hw_mode in self.config["hw_mode_on"]:
                self.run_nef_kneron_plus(hw_mode=hw_mode, number_try=inference_count)
                for i in range(inference_count):
                    self.btm_csim_nef(hw_mode=hw_mode, number_try=i)
                    # self.btm_dyn_nef_kneron_plus(hw_mode=hw_mode, number_try=i)

    self.module_status["general"]["Success"] = True
    self.path["success_sign"].touch()

    self.gen_fx_report()

    self.post_clean_up()

    # model_fx_release is a list of files to be released after gen_fx_model
    return self.model_fx_release
|
||
|
||
@staticmethod
def load_graphopt_bie_json(fn_bie, hw_mode):
    """Load node-mapping json reports embedded in a compiler-frontend bie.

    TODO:

    - This file is also read once after `run_compiler_frontend`;
      consider combining into a single call.
    """
    embedded = util_lib.load_zip_jsons(fn_bie)

    reports = {
        "fe2origin": embedded["node_mapping_opt_fe_to_origin.json"],
        "fe2be": embedded["node_mapping_opt_fe_to_opt_be.json"],
        "ori_node_type": embedded["node_types_origin.json"],
    }
    if hw_mode not in [520]:
        # these two reports are not produced for 520
        reports["fe_node_type"] = embedded["node_types_opt_fe.json"]
        reports["be_node_format"] = embedded["node_format_opt_be.json"]

    return reports
|
||
|
||
@staticmethod
def load_knerex_bie_json(bie_release):
    """Load the json reports from a knerex bie2 for the fx report.

    We assume a bie is always generated (scaled, wqbi, ... optimized).
    This step does not work when knerex never ran, e.g. in mode 0
    (ip-eval-only).

    Raises:
        TypeError: when an onnx file is passed instead of a bie.
    """
    if bie_release.name.endswith(".onnx"):
        msg = f"should not release onnx: {bie_release}"
        raise TypeError(msg)

    embedded = util_lib.load_zip_jsons(bie_release)

    wanted = {
        "node_type": "model_info.json",
        # "node_shape" came from "shape_info.json" up to 0.23.0,
        # renamed to snr_shape_info.json from 0.25.0
        "node_shape": "snr_shape_info.json",
        "node_radix": "radix_info.json",
    }
    return {key: embedded[fn] for key, fn in wanted.items()}
|
||
|
||
def load_compiler_ip_eval_info(self, hw_mode):
    """Load the compiler-backend (ip eval) node analysis json, if present.

    Returns:
        dict with key "be_node_analysis" when BE_node_evaluator_result.json
        exists; empty dict otherwise.

    Raises:
        RegressionError: when the file exists but parses to json null.
    """
    p_compiler_out = self.path[f"compiler_piano_{hw_mode}_out"]
    # file list — only one report today, kept as a dict for easy extension
    report_files = {"be_node_analysis": p_compiler_out / "BE_node_evaluator_result.json"}

    loaded = {}
    for key, fn in report_files.items():
        if not fn.exists():
            continue
        with open(fn, "r") as f:
            loaded[key] = json.load(f)
        if loaded[key] is None:
            raise RegressionError(f"kdp{hw_mode}/compiler", self.model_id, msg=f"{fn.name} is empty.")

    return loaded
|
||
|
||
@staticmethod
|
||
def get_node_type(raw_reports, node_fe, nodes_origin):
|
||
"""Find the type (NPU/CPU/FUSED) for node_fe."""
|
||
try:
|
||
# get the info from knerex first
|
||
node_type = raw_reports["node_type"][node_fe]["Mode"]
|
||
except:
|
||
try:
|
||
node_type = raw_reports["fe_node_type"][node_fe]
|
||
except:
|
||
try:
|
||
# for 520, it fallback to origin_node_type
|
||
# BUG: just use the first origin node
|
||
node_type = raw_reports["ori_node_type"][nodes_origin[0]]
|
||
except:
|
||
# print(raw_reports.keys())
|
||
node_type = "FUSED"
|
||
if node_type == "NONE":
|
||
node_type = "FUSED"
|
||
|
||
return node_type
|
||
|
||
def load_snr_report(self, hw_mode, raw_reports):
    """Load per-layer snr results for hw_mode.

    Returns:
        (d_snr, snr_columns): dict keyed by datapath name, and the snr
        column index. Both empty when no snr csv is available or loading
        fails (best-effort: snr is optional for the fx report).

    Fix: the broad ``except`` used a bare ``print`` which discards the
    traceback; it now logs via ``self.logger.exception`` consistent with the
    rest of the flow, still degrading gracefully to empty results.
    """
    try:
        if "snr_csv" not in self.path or not self.path["snr_csv"].exists():
            return {}, []

        ref_name = "mode_{}_piano".format(self.config["snr"]["ref"][hw_mode])
        deg_name = "mode_{}_piano".format(self.config["snr"]["deg"][hw_mode])
        snr_types = self.config["snr"]["report_snr_col"]
        snr_result = get_case_output(self.path["snr_csv"], ref_mode=ref_name, deg_mode=deg_name, col_snr=snr_types, out_dp="all")
        d_snr = snr_result.droplevel(["Category", "Model", "Mode_deg", "Mode_ref"], axis=0).to_dict("index")
        # HACK: special process for output nodes — keep an extra copy under
        # the plain name for easier lookup.
        for dp_out in raw_reports["node_shape"]["dp_out"]:
            # NOTE: dp_out in dynasty dump / snr need to be called with clean_name
            dp_out = futils.clean_name(dp_out)
            dpo2 = f"output_{dp_out}"
            if (dp_out not in d_snr) and (dpo2 in d_snr):
                d_snr[dp_out] = d_snr[dpo2]
        return d_snr, snr_result.columns
    except Exception:
        # best-effort: keep the traceback in the regression log and
        # fall back to an empty snr report.
        self.logger.exception("Error loading SNR report")
        return {}, []
|
||
|
||
@staticmethod
|
||
def load_fe_nodes(raw_reports):
|
||
"""Load node_fe from knerex/snr_shape_info.json."""
|
||
if "node_shape" in raw_reports:
|
||
nodes_decomp, _, node_decomp2dp, _, _, _, _, _, _, _ = futils.parse_shape_info(raw_reports["node_shape"])
|
||
sort_on_cmd_idx = False
|
||
else:
|
||
# detour for ip eval. no knerex results
|
||
sort_on_cmd_idx = True
|
||
nodes_decomp = list(raw_reports["fe2origin"].keys())
|
||
node_decomp2dp = {}
|
||
return nodes_decomp, node_decomp2dp, sort_on_cmd_idx
|
||
|
||
def load_raw_json_reports(self, hw_mode):
    """Collect raw json reports from compiler frontend / knerex / ip eval.

    Prefers release.bie (which has proper quantization info) and falls
    back to opt.bie for ip-eval-only runs (mode 0). knerex and backend
    reports are only loaded when not running pure ip evaluation.
    """
    # node-mapping info from the compiler-frontend bie
    bie_path = self.map_onnx[f"kdp{hw_mode}_release_piano_bie"]
    if not bie_path.exists():
        # probably mode 0 (ip eval only): no release.bie yet, and
        # opt.bie does not carry proper quantization info
        bie_path = self.map_onnx[f"kdp{hw_mode}_opt_piano_bie"]

    raw_reports = dict(self.load_graphopt_bie_json(bie_path, hw_mode))

    if not self.config["module_run"]["only_ip_evaluator"]:
        # quantization info + snr_shape_info from the knerex bie.
        # we assume a bie was always generated (scaled, wqbi, ... optimized);
        # this step does not work when knerex never ran (mode 0).
        bie_release = self.map_onnx[f"kdp{hw_mode}_release_piano_bie"]
        raw_reports.update(self.load_knerex_bie_json(bie_release))

        # per-node hw info from the ip evaluator — actually the backend
        # node evaluation, read from `BE_node_evaluator_result.json`
        raw_reports.update(self.load_compiler_ip_eval_info(hw_mode))

    return raw_reports
|
||
|
||
@staticmethod
|
||
def record2df_fx(temp_rec, snr_cols):
|
||
"""Convert records to dataframe for fx report."""
|
||
# some columns may have NaN, not possible to use .astype
|
||
rep_dtld = pd.DataFrame.from_records(temp_rec)
|
||
|
||
# clean up. remove columns which are all None, all 0, all N/A
|
||
cols_to_drop = [
|
||
col for col in rep_dtld.columns
|
||
if all(rep_dtld[col].isna()) or all(
|
||
rep_dtld[col] == 'N/A') or all(rep_dtld[col] == 0)
|
||
]
|
||
rep_dtld.drop(columns=cols_to_drop, inplace=True)
|
||
|
||
# 将NaN值替换为空字符串
|
||
rep_dtld = rep_dtld.fillna('')
|
||
|
||
# move snr columns to front of df
|
||
for name_col in snr_cols:
|
||
if name_col in rep_dtld.columns:
|
||
t_column = rep_dtld.pop(name_col)
|
||
rep_dtld.insert(1, name_col, t_column)
|
||
|
||
|
||
# 将指定列移动到DataFrame末尾
|
||
cols_to_move_to_end = [
|
||
'CMD_node_idx',
|
||
'in_fmt',
|
||
'out_fmt',
|
||
'runtime(ms)',
|
||
'CFUNC_runtime(ms)',
|
||
'PFUNC_runtime(ms)',
|
||
'SYNC_runtime(ms)',
|
||
'MAC_cycle',
|
||
'MAC_runtime(ms)',
|
||
'RDMA_amount(Byte)',
|
||
'RDMA_runtime(ms)',
|
||
'WDMA_amount(Byte)',
|
||
'WDMA_runtime(ms)',
|
||
'Weight_amount(Byte)' # 最后一列
|
||
]
|
||
|
||
# 找到存在的列(按指定顺序)
|
||
existing_cols_to_move = [col for col in cols_to_move_to_end if col in rep_dtld.columns]
|
||
|
||
# 获取其他列(不在移动列表中的列)
|
||
other_cols = [col for col in rep_dtld.columns if col not in cols_to_move_to_end]
|
||
|
||
# 重新排列:其他列 + 移动的列
|
||
new_column_order = other_cols + existing_cols_to_move
|
||
rep_dtld = rep_dtld[new_column_order]
|
||
|
||
return rep_dtld
|
||
|
||
def collect_be_node_analysis(self, node_be, temp_rec, sort_on_cmd_idx, raw_reports):
    """Collect backend-node (ip-eval) columns for one node_be.

    In a full run, a node_be that repeats the previous row is rendered
    with "same as above" markers; otherwise it is filled as a fresh node.
    """
    fmt_col_cvrt = {"inputs": "in_fmt", "outputs": "out_fmt"}
    prev_be = self.get_last_record(temp_rec, "node backend")

    if prev_be and not sort_on_cmd_idx and node_be == prev_be:
        # full run and same node as above: fill with empty / "↑" cells
        return self.collect_be_node_same(node_be, raw_reports, fmt_col_cvrt)

    # this is a new be_node
    return self.collect_be_node_new(node_be, raw_reports, fmt_col_cvrt)
|
||
|
||
def collect_fe_node_bw(self, node_fe, raw_reports):
    """Collect datapath bitwidth columns for one frontend node.

    Returns a dict with "bw in" / "bw out" entries, or an empty dict when
    the node has no radix info. Missing info is best-effort, not an error.
    """
    temp_d = {}
    try:
        radix = raw_reports["node_radix"][node_fe]
        bw_in = radix.get("input_datapath_bitwidth", "")
        bw_out = radix.get("output_datapath_bitwidth", "")
        # weight bitwidth (radix.get("weight_bitwidth")) is intentionally
        # not reported for now.
        temp_d["bw in"] = ", ".join(str(a) for a in bw_in)
        temp_d["bw out"] = ", ".join(str(a) for a in bw_out)
    except (KeyError, TypeError):
        # was a bare `except: pass`; narrowed to the expected failures:
        # node_radix missing, node not listed, or a non-iterable value.
        pass

    return temp_d
|
||
|
||
@staticmethod
|
||
def get_last_record(temp_rec, k):
|
||
"""Get last node value for "k" key.
|
||
|
||
To make the table easier to read, the cell with same value (name) with above cell,
|
||
is shown as "↑". So if we saw "↑", we keep trace back until find the first valid name.
|
||
"""
|
||
if len(temp_rec) > 0 and k in temp_rec[-1]:
|
||
last_v = temp_rec[-1][k]
|
||
if last_v != "↑":
|
||
return temp_rec[-1][k]
|
||
|
||
i = -2
|
||
while last_v == "↑":
|
||
last_v = temp_rec[i][k]
|
||
i -= 1
|
||
return last_v
|
||
|
||
return None
|
||
|
||
def collect_be_node_info(self, node_fe, node_be, nodes_origin, node_type, this_snr, raw_reports, temp_rec, sort_on_cmd_idx):
    """Assemble one report row for the given (node_fe, node_be) pair.

    One node_fe may split into multiple node_be, and one node_origin may
    correspond to multiple node_fe, so fe/origin names can repeat across
    consecutive calls; repeats are recorded as "↑".
    """
    row = OrderedDict()

    # node_fe is the key of the table; collapse a repeat into "↑"
    prev_fe = self.get_last_record(temp_rec, "node")
    row["node"] = "↑" if node_fe == prev_fe else node_fe

    # node origin = node name(s) in the original onnx;
    # one node_fe may correspond to several of them
    prev_origin = self.get_last_record(temp_rec, "node origin")
    origin_label = ", ".join(str(a) for a in nodes_origin)
    row["node origin"] = "↑" if origin_label == prev_origin else origin_label

    row["type"] = node_type
    if this_snr:
        row.update(this_snr)

    # bitwidth columns
    row.update(self.collect_fe_node_bw(node_fe, raw_reports))

    # backend-node ip-eval columns
    row.update(self.collect_be_node_analysis(node_be, temp_rec, sort_on_cmd_idx, raw_reports))

    return row
|
||
|
||
def collect_be_node_same(self, node_be, raw_reports, fmt_col_cvrt):
    """Build the row fragment for a node_be repeating the previous row.

    Every column of a repeating backend node is filled with "↑" ("same
    as above"), which the final html renders as a merged cell.
    """
    row = {"node backend": "↑"}

    analysis = raw_reports.get("be_node_analysis", {})
    if node_be in analysis:
        for col in analysis[node_be]:
            row[col] = "↑"

    formats = raw_reports.get("be_node_format", {})
    if node_be in formats:
        for col in formats[node_be]:
            row[fmt_col_cvrt[col]] = "↑"

    return row
|
||
|
||
def collect_be_node_new(self, node_be, raw_reports, fmt_col_cvrt):
    """Build the row fragment for a backend node seen for the first time."""
    row = {"node backend": node_be}

    analysis = raw_reports.get("be_node_analysis", {})
    if node_be in analysis:
        # NOTE: no node analysis for 520
        row.update(analysis[node_be])

    formats = raw_reports.get("be_node_format", {})
    if node_be in formats:
        for fmt_key, fmt_val in formats[node_be].items():
            row[fmt_col_cvrt[fmt_key]] = futils.pprint_dict(fmt_val)

    return row
|
||
|
||
|
||
def combine_node_info(self, nodes_decomp, node_decomp2dp, raw_reports, d_snr, sort_on_cmd_idx):
    """Combine node info of frontend, origin, backend into report rows.

    Iterates frontend nodes, expands each into its backend node(s) and
    attaches origin names, node type and (when available) SNR columns.
    For ip-eval-only runs the rows are sorted by command index and
    repeated cells are collapsed afterwards.
    """
    temp_rec = []
    for node_fe in nodes_decomp:
        # node frontend is the KEY for the table

        # find all backend nodes that include this node_fe
        # (missing or empty mapping -> single placeholder entry)
        nodes_be = raw_reports["fe2be"].get(node_fe) or [None]

        # find all origin (onnx) node names
        nodes_origin = raw_reports["fe2origin"].get(node_fe, [None])

        # find node type
        node_type = self.get_node_type(raw_reports, node_fe, nodes_origin)

        # snr info, if available. this is per dp
        # TODO: currently we assume one fe -> one dp. but soon we need to support multi-output
        try:
            this_dp = futils.clean_name(node_decomp2dp.get(node_fe, [None])[0])
            this_snr = d_snr.get(this_dp, None)
        except Exception:
            # was a bare except; SNR lookup is best-effort only
            # (e.g. clean_name failing on a missing/odd dp name)
            this_snr = None

        for node_be in nodes_be:
            # loop through backend nodes
            temp_d = self.collect_be_node_info(node_fe, node_be, nodes_origin, node_type, this_snr, raw_reports, temp_rec, sort_on_cmd_idx)
            temp_rec.append(temp_d)

    if sort_on_cmd_idx:
        # for ip-eval-only, sort on cmd_idx (full runs keep natural order);
        # rows missing CMD_node_idx are safely pushed to the end
        temp_rec.sort(key=lambda x: x.get("CMD_node_idx", float('inf')))
        temp_rec = self.record_merge_same_to_above(temp_rec)

    return temp_rec
|
||
|
||
|
||
def record_merge_same_to_above(self, records):
    """Replace cells equal to the row above with "↑" (merged cell in html).

    The "type" column is always kept verbatim. Rows are processed bottom-up
    so each comparison sees the original (unmodified) values of the row above.
    """
    for idx in range(len(records) - 1, 0, -1):
        current, above = records[idx], records[idx - 1]
        for col in current:
            if col == "type":
                continue
            if col in above and current[col] == above[col]:
                current[col] = "↑"
    return records
|
||
|
||
|
||
def get_model_ins(self, hw_mode):
    """Return the set of model input names.

    Priority of sources:
    1. compiler ioinfo
    2. knerex bie input nodes
    3. origin.onnx input nodes

    TODO:
    1. maybe use raw_reports["node_shape"]["dp_in"]
    """
    key_compiler = ('ioinfo', hw_mode)
    key_knerex = ('input_node', hw_mode, 'bie')
    key_origin = ('input_node', 'origin')

    if key_compiler in self.io_nodes:
        inputs_info = self.io_nodes[key_compiler]["input"]
        return set(entry["name"] for entry in inputs_info)
    if key_knerex in self.io_nodes:
        return set(self.io_nodes[key_knerex])
    if key_origin in self.io_nodes:
        return set(self.io_nodes[key_origin])

    # no source available; report the failure but keep going
    self.logger.error("Failed to get model inputs")
    return set()
|
||
|
||
|
||
def generate_be_graph(self, raw_reports, hw_mode):
    """Render the backend node graph (svg) and return the set of ops.

    Best effort: on any failure an empty op set is returned and the
    error is logged instead of aborting report generation.
    """
    try:
        # Use SVG format for better performance, especially for large models.
        p_svg = self.path[f"model_fx_svg_{hw_mode}"]
        # no need to generate dot/svg for only_dongle
        skip_dot = self.config["module_run"]["only_dongle"]
        _, set_ops = futils.gen_backend_node_graph(raw_reports["be_node_format"], p_svg, skip_dot=skip_dot)
        if p_svg.exists():
            # only release when generated successfully
            self.model_fx_release[f"kdp{hw_mode}/backend node graph"] = p_svg
    except Exception as e:
        set_ops = set()
        # was `print(e)` + a separate logger.error; route the failure
        # through the logger so it lands in the flow log
        self.logger.error(f"Failed to generate backend node graph: {e}")
    return set_ops
|
||
|
||
|
||
def collect_node_info(self, nodes_decomp, node_decomp2dp, raw_reports, d_snr, sort_on_cmd_idx, hw_mode):
    """Collect origin/fe/be node info rows for a given hw_mode.

    sort_on_cmd_idx: whether to sort on cmd_idx; only true for ip-eval-only.
    """
    # render the backend graph and learn which ops / model inputs exist
    set_ops = self.generate_be_graph(raw_reports, hw_mode)
    model_ins = self.get_model_ins(hw_mode)

    # combine everything into the detailed report (a list of dicts)
    rows = self.combine_node_info(nodes_decomp, node_decomp2dp, raw_reports, d_snr, sort_on_cmd_idx)

    # prefix backend node names for readability
    return futils.be_node_name_add_prefix(rows, set_ops, model_ins)
|
||
|
||
|
||
@run_module(module_name="general/gen_fx_report")
def gen_fx_report(self):
    """Generate the fx report for the quantization process.

    The report will contain:

    - ModelInfo.json from knerex dump.
    - bitwidth info
    - snr info
    - hw info from ip_evaluator
    """
    detailed_reports = OrderedDict()
    for hw_mode in self.config["hw_mode_on"]:
        # collect the raw report files for this hw mode
        raw_reports = self.load_raw_json_reports(hw_mode)
        d_snr, snr_cols = self.load_snr_report(hw_mode, raw_reports)
        nodes_decomp, node_decomp2dp, sort_on_cmd_idx = self.load_fe_nodes(raw_reports)

        rows = self.collect_node_info(nodes_decomp, node_decomp2dp, raw_reports, d_snr, sort_on_cmd_idx, hw_mode)
        detailed_reports[hw_mode] = self.record2df_fx(rows, snr_cols)

    # now collect the overall summary
    self.model_fx_release["gen fx model report"] = self.path["model_fx_html"]
    self.model_fx_release["gen fx model json"] = self.path["model_fx_json"]
    for key, release_path in self.model_fx_release.items():
        # those files will be moved to the release folder, so record names only
        self.model_fx_report[key] = release_path.name

    self.dump_fx_report(detailed_reports)
|
||
|
||
def dump_fx_report(self, detailed_reports):
    """Write the fx_report to html and json."""
    # the json file is needed by app_release and the gen_fx_model call
    with open(self.path["model_fx_json"], "w") as f:
        json.dump(self.model_fx_report, f, indent=4, sort_keys=False, default=str)

    # write the summary plus one dataframe per hw mode into a single html
    df_summary = pd.DataFrame.from_dict(self.model_fx_report, orient="index", columns=["info"])
    with open(self.path["model_fx_html"], 'w') as f:
        f.write('<h1>Summary</h1><br><hr>')
        f.write(f"{df_summary.to_html(border=2)}<br><hr>")
        for k, df in detailed_reports.items():
            f.write(f"<h2>kdp{k}</h2><br><hr>")
            # merge "↑" cells into the cell above, then decorate the table
            html_string = futils.html_merge_cell(df.to_html(border=1))
            html_string = futils.html_highlight_node_backend(html_string)
            html_string = futils.html_add_footnote(html_string)
            f.write(html_string)

            if self.graph_warnings.get(k):
                self.write_compiler_warning_as_ul(f, self.graph_warnings[k], k)

            f.write("<br><hr>")
|
||
|
||
def save_summary(self):
    """Save the summary html only, when submodules failed.

    NOTE: this method will be called in run_single_case.
    Not supposed to call in run_flow here.
    """
    # collect the overall summary entries
    self.model_fx_release["gen fx model report"] = self.path["model_fx_html"]
    self.model_fx_release["gen fx model json"] = self.path["model_fx_json"]
    for key, release_path in self.model_fx_release.items():
        # those files will be moved to the release folder, so record names only
        self.model_fx_report[key] = release_path.name

    # the json file is needed by app_release and the gen_fx_model call
    with open(self.path["model_fx_json"], "w") as f:
        json.dump(self.model_fx_report, f, indent=4, sort_keys=False, default=str)

    # summary-only html (no per-mode detail tables)
    df_summary = pd.DataFrame.from_dict(self.model_fx_report, orient="index", columns=["info"])
    with open(self.path["model_fx_html"], 'w') as f:
        f.write('<h1>Summary</h1><br><hr>')
        f.write(f"{df_summary.to_html(border=2)}<br><hr>")

        for hw_mode, warnings in self.graph_warnings.items():
            self.write_compiler_warning_as_ul(f, warnings, hw_mode)

    # even when the case failed, we still try to provide a summary report
    return self.model_fx_release
|
||
|
||
|
||
@staticmethod
|
||
def write_compiler_warning_as_ul(f, warnings, hw_mode):
|
||
if warnings and type(warnings) == list and len(warnings) > 0:
|
||
f.write(f"<h2>Compiler Warnings on Graph (kdp{hw_mode})</h2><br><hr>")
|
||
f.write("<ul>")
|
||
for warning in warnings:
|
||
f.write(f"<li>{warning['content']}</li>")
|
||
f.write("</ul>")
|
||
|
||
@run_module(module_name="auto/csim_ci")
def run_csim_ci(self, *, hw_mode):
    """
    Internal use only. for csim release.
    only keep files needed by csim ci

    Copies this case's compiler output and dynasty btm dump into the csim
    CI tree (replacing any previous copy), then copies the run_csim ini
    file alongside them.

    Raises:
        RegressionError: when copying the run_csim ini file fails.
    """
    model_dir = self.model_path

    # destination layout: <csim_ci_dir>/<group>/<model>/output/...
    p_csim_ci = self.config["path"][f"csim_{hw_mode}_ci_dir"]
    target_dir = pathlib.Path(f"{p_csim_ci}/{model_dir.parent.name}/{model_dir.name}")
    target_output_dir = target_dir / "output"
    target_compiler_dir = target_output_dir / f"compiler_piano_output_{hw_mode}/"

    # TODO/DEBUG: not hw_mode
    btm_dyn_mode = self.io_nodes[("btm_dynasty_mode", hw_mode)]
    target_dynasty_dump_dir = target_output_dir / f"results/{self.btm_txt}/{btm_dyn_mode}/"

    # path in regresssion folder
    compiler_dir = f"{self.model_path}/output/compiler_piano_output_{hw_mode}/"
    dynasty_dump_dir = f"{self.model_path}/output/results/{self.btm_txt}/{btm_dyn_mode}/"

    # start from a clean target: remove any stale copy before copying
    if os.path.exists(target_dir):
        shutil.rmtree(target_dir)
    shutil.copytree(dynasty_dump_dir, target_dynasty_dump_dir)
    shutil.copytree(compiler_dir, target_compiler_dir)

    # copy the csim ini next to the copied outputs (via bash cp)
    combine_cmd = f"cp -r {model_dir}/output/run_csim_{hw_mode}.ini {target_output_dir}"
    cp = futils.run_bash_script(combine_cmd)
    if cp.returncode != 0:
        raise RegressionError(f"kdp{hw_mode}/csim ci", self.model_id, msg=f"Err: {cp.returncode}")
|
||
|
||
@run_module(module_name="auto/rtl_cmd_check")
def check_rtl_cmd(self, *, hw_mode):
    """compare command.bin inst.hex

    # Usage: python3 ./rtlCmdCmpBinTxt.py command.bin inst.hex.opt

    # TODO: check who will use this.
    """
    # TODO: link_bin had been removed.
    # NOTE(review): everything below this raise is unreachable dead code,
    # kept for reference until the link_bin replacement is decided.
    raise NotImplementedError()
    rtl_cmd_cmp = self.config["path"]["binary"]["csim"]["rtl_cmd_cmp"]
    link_bin = self.config["path"]["binary"]["compiler"]["link_bin"]
    compile_and_gen_conv_all = self.config["path"]["binary"]["compiler"]["compile_and_gen_conv_all"]

    dir_rtl = f"{self.model_path}/rtl"
    dir_rtl_cmd_cmp = pathlib.Path(f"{self.model_path}/rtl/cmd_cmp")
    inst_hex_opt = f"{dir_rtl_cmd_cmp}/output.rtl.{hw_mode}.testcase/cmd_cmp/inst.hex.opt"
    model_output_dir = f"{self.model_path}/output/"
    # start from a clean cmd_cmp working directory
    if dir_rtl_cmd_cmp.exists():
        shutil.rmtree(dir_rtl_cmd_cmp)
    pathlib.Path(dir_rtl_cmd_cmp).mkdir(mode=0o770, parents=True, exist_ok=True)
    # NOTE(review): the f-string is already interpolated, so the trailing
    # .format(...) call is a no-op left over from a refactor
    cp_case_for_rtl_gen = f"cp -r {model_output_dir} {dir_rtl_cmd_cmp}".format(model_output_dir, dir_rtl_cmd_cmp)
    subprocess.run(cp_case_for_rtl_gen, shell=True, executable="/bin/bash", check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

    compiler_bin = self.config["path"]["binary"]["compiler"]["compiler"]
    # model type selects the rtl generation flavor
    if self.is_big_model:
        model_type = "model_opt"
    elif self.is_multi_layer:
        model_type = "multi"
    elif self.is_single_layer:
        model_type = "single"
    else:
        raise ValueError("cannot determine model type: bm, multi, single?")
    gen_rtl_case_command = f"pushd {dir_rtl_cmd_cmp} > /dev/null && {link_bin} {compiler_bin}; {compile_and_gen_conv_all} {dir_rtl} {hw_mode} {model_type} && popd > /dev/null"
    # TODO: change to run_bash()
    subprocess.run(gen_rtl_case_command, shell=True, executable="/bin/bash", check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

    # finally compare command.bin against the generated inst.hex.opt
    cmd_cmp_command = f"{rtl_cmd_cmp} {self.model_path}/output/compiler_piano_output_{hw_mode}/command.bin {inst_hex_opt}"
    subprocess.run(cmd_cmp_command, shell=True, executable="/bin/bash", check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
|
||
|
||
@run_module(module_name="auto/verify_decomp_snr")
def verify_decomp_snr(self, *, hw_mode):
    """Verify the graphopt.bie correct or not.

    It used to compare the origin.onnx (float) with knerex dumped decomposed.bie.
    now compare origin.onnx (float) with compiler frontend dumped graphopt.bie.

    TODO: should this be combined into snr_calculate?
    """
    snr_min = 80  # SNR must be larger than 80dB

    df = pd.read_csv(self.fn_report, index_col=["ref", "deg", "layer"])
    out_layer_names = set(df.index.get_level_values("layer"))
    deg_modes = set(df.index.get_level_values("deg"))

    mode_ref = "mode_float_piano"
    mode_deg = f"mode_{hw_mode}graphopt_piano"

    # pairs are the SNR entries we want to verify;
    # only verify when the graphopt SNR results actually exist
    pairs = []
    if mode_deg in deg_modes:
        pairs = [(mode_ref, mode_deg, out_name) for out_name in out_layer_names]

    snr_name = "SNR_With_Mean"

    # TODO: put this into columns. NOT using assert
    for idx in pairs:
        assert df.loc[idx, snr_name] > snr_min
|
||
|
||
@run_module(module_name="auto/verify_snr")
def verify_snr(self, *, hw_mode):
    """Quick check on model snr reach threshold

    After snr_calculation, the snr_per_layer.csv is generated.
    The snr_report.csv was extract from per_layer.csv which include output nodes only.

    This function is to pick one or both snr columns from snr_report.csv
    according to settings.

    TODO:
    - should this be combined into snr_calculate?

    it used to work for multi platform/hw_mode at same time
    removed to simplify
    """
    # big models use a looser threshold (10dB vs 20dB per layer)
    snr_min = 10 if self.is_big_model else 20

    df = pd.read_csv(self.fn_report, index_col=["ref", "deg", "layer"])
    out_layer_names = set(df.index.get_level_values("layer"))
    deg_modes = set(df.index.get_level_values("deg"))

    mode_ref = "mode_{}_piano".format(self.config["snr"]["ref"][hw_mode])
    mode_deg = "mode_{}_piano".format(self.config["snr"]["deg"][hw_mode])

    # pairs are the SNR entries we want to verify;
    # only when the corresponding SNR results actually exist
    pairs = []
    if mode_deg in deg_modes:
        pairs = [(mode_ref, mode_deg, out_name) for out_name in out_layer_names]

    for snr_name in self.config["snr"]["report_snr_col"]:
        details_regression_report = []
        details_fx_report = {}
        for idx in pairs:
            # per output
            this_snr = df.loc[idx, snr_name]
            # below-threshold outputs are marked "⋖T:", others "⋗T:"
            prefix = "⋖T:" if this_snr < snr_min else "⋗T:"
            # notes in regression report, compare with threshold
            details_regression_report.append(f"{prefix} {this_snr:5.1f}dB ({idx[2]})")
            # notes for gen_fx_report, simply show snr.
            details_fx_report[idx[2]] = f"{this_snr:5.1f}"

        # update to fx_report
        self.model_fx_report[f"kdp{hw_mode}/{snr_name}(dB)"] = details_fx_report

        signal("data_sender").send((self.model_id, f"kdp{hw_mode}/{snr_name} (T={snr_min:.0f}dB)", "//".join(sorted(details_regression_report))))
|
||
|
||
@run_module(module_name="general/verify_bias_adjust")
def verify_bias_adjust_performance(self):
    """this verify step is to report on module success/fail in flow report.

    bias adjust performance detailed compare report are generated in during regression.py:
    snr_calculator.py/gather_all_bi_improve
    """
    df = pd.read_csv(self.fn_report, index_col=["ref", "deg", "layer"])
    out_layer_names = set(df.index.get_level_values("layer"))
    ref_modes = set(df.index.get_level_values("ref"))
    deg_modes = set(df.index.get_level_values("deg"))

    # build (before, after) index pairs, per output, per configured comparison
    pairs = []
    for out_name in out_layer_names:
        for comp, (ref, deg1, deg2) in fconsts.SNR_BI_IMPROVE.items():
            mode_ref = f"mode_{ref}_piano"
            mode_deg1 = f"mode_{deg1}_piano"
            mode_deg2 = f"mode_{deg2}_piano"

            # only compare when all three modes were actually run
            if mode_deg1 in deg_modes and mode_deg2 in deg_modes and mode_ref in ref_modes:
                pairs.append(((mode_ref, mode_deg1, out_name), (mode_ref, mode_deg2, out_name)))

    snr_name = "SNR_With_Mean"
    for i_ref, i_deg in pairs:
        snr_before = df.loc[i_ref, snr_name]
        snr_after = df.loc[i_deg, snr_name]
        improve = snr_after - snr_before
        self.logger.info(
            "Bias Adj improved = {} db = {} - {}. {}, {}".format(
                improve, snr_after, snr_before,
                i_deg, self.path["dir_output"]))
        # TODO: just send the improve to some column. platform independent?
        # TODO: remove run_module for this function
        if improve < -0.5:
            # Dont use assert here. it will suppress compiler/csim behind it
            self.logger.error(f" ATTENTION: Bias adjust snr drop by {improve}")
|
||
|
||
def load_weight_bin_stats(self):
    """Run weight.bin statistics for every enabled hw mode.

    Handles both single-file compiler outputs (compiler_<mode>/weight.bin)
    and multi-subgraph outputs (one non-empty *weight.bin per subgraph dir).
    """
    stats_cfg = self.config["layer_statistics"]
    do_tile = stats_cfg["tile_analysis"]
    do_4bit = stats_cfg["4bit_analysis"]

    # only some out of hw_mode_on
    for mode in self.config["hw_mode_on"]:
        compiler_output_path = self.path["dir_output"] / f"compiler_{mode}"
        weight_bin_path = compiler_output_path / "weight.bin"
        if os.path.exists(weight_bin_path):
            get_weight_bin_stats(weight_bin_path, do_tile_analysis=do_tile, do_4bit_compression=do_4bit)
            continue

        # multi-subgraph layout: one weight.bin per subgraph directory,
        # identified by the directory name
        for subg_weight_bin in compiler_output_path.glob("**/*weight.bin"):
            if subg_weight_bin.stat().st_size > 0:
                get_weight_bin_stats(
                    str(subg_weight_bin),
                    subg_weight_bin.parent.name,
                    do_tile_analysis=do_tile,
                    do_4bit_compression=do_4bit)

    return
|
||
|
||
@run_module("auto/convert_enc")
def convert_enc(self, *, hw_mode):
    """Encrypt select onnx of given platform and optimized level.

    Picks the onnx matching the configured optimize level (scaled -> wq
    onnx, wqbi -> wqbi onnx) and runs the model converter to produce
    <onnx>.enc next to it.

    Raises:
        FileNotFoundError: when the expected optimized onnx is missing
            (was an assert, which silently disappears under `python -O`).
        ValueError: on an unsupported model_optimize setting.
    """
    model_optized_type = self.config["compiler_piano"]["model_optimize"]
    if model_optized_type == "scaled":
        optimized_onnx = self.map_onnx[f"kdp{hw_mode}_quan_piano_onnx"]
        if not optimized_onnx.exists():
            raise FileNotFoundError(
                "knerex opt onnx is scaled onnx, need to convert enc based on wq onnx, but wq onnx does not exist!!!")
    elif model_optized_type == "wqbi":
        optimized_onnx = self.map_onnx[f"kdp{hw_mode}_wqbi_piano_onnx"]
        if not optimized_onnx.exists():
            raise FileNotFoundError("knerex opt onnx is wqbi onnx, but wqbi onnx does not exist!!!")
    else:
        msg = "model_optized_type only support scaled/wqbi, need to double check compiler config setting!"
        raise ValueError(msg)

    model_convertor_bin = self.config["path"]["binary"]["compiler"]["model_converter"]
    command = f"{model_convertor_bin} {optimized_onnx} {optimized_onnx}.enc > /dev/null"
    # fail_then_exit=True: the helper aborts on failure, no returncode check here
    cp = futils.run_bash_script(command, do_echo=True, fail_then_exit=True)

    module_name = f"kdp{hw_mode}/convert_enc"
    self.save_command(module_name, command)

    return
|
||
|
||
def load_layer_statistics(self, base_dump="results"):
    """Collect some analysis/statistics on the dynasty per-layer dump."""
    stats_cfg = self.config["layer_statistics"]
    dynasty_output_path = self.path["dir_output"] / base_dump
    # statistics run against the first enabled hw mode only
    hw_code = self.config["hw_mode_on"][0]

    self.logger.info("generating layer statistics, could be time consuming")
    calculate_statistics(dynasty_output_path,
                         hw_code,
                         stats_cfg["mode_on"],
                         do_per_channel=stats_cfg["per_channel"],
                         do_diff_stat=stats_cfg["do_difference_matrix"],
                         do_float=stats_cfg["do_float"],
                         stat_params=stats_cfg["params"],
                         no_plot=stats_cfg["no_plot"])
    return
|
||
|
||
@run_module(module_name="general/tflite")
def run_tflite(self, input_list, base_dump="results"):
    """Inference with tflite and dump all layer float/fix result."""
    module_name = "general/tflite"

    tflite_dir = self.model_path / "input" / f"{self.model_name}.tflite"
    tflite_dump_exec = self.config["path"]["binary"]["tflite"]["dump.py"]

    # TODO: multi-thead
    # TODO: call python function?
    # TODO: why called mode_tflite_float_noise?

    # quantized tflite models dump into the "fix" mode folder
    is_quant = "quant" in self.model_name

    for input_path in input_list:
        # DEBUG: input_path now is a list of path!!! in case for multi-inputs
        if is_quant:
            out_dir = "{}/{}/{}/mode_tflite_fix_noise/".format(self.path["dir_output"], base_dump, input_path.name)
        else:
            out_dir = "{}/{}/{}/mode_tflite_float_noise/".format(self.path["dir_output"], base_dump, input_path.name)
        pathlib.Path(out_dir).mkdir(mode=0o770, parents=True, exist_ok=True)

        command = "python3 {} -o {} -i {} -t {} -l {}".format(tflite_dump_exec, out_dir, input_path, tflite_dir, "True")

        self.save_command(module_name, command)
        cp = futils.run_bash_script(command)
        if cp.returncode != 0:
            raise RegressionError("general/tflite", self.model_id, msg=f"Err: {cp.returncode}")

    return
|
||
|
||
@run_module(module_name="general/onnxruntime")
def run_onnxruntime(self, input_list, base_dump="results"):
    """Inference with onnxruntime and dump final layer float result."""
    module_name = "general/onnxruntime"
    onnxruntime_dump_exec = self.config["path"]["binary"]["tflite"]["onnxruntime.py"]
    onnx_dir = self.map_onnx["origin"]

    # TODO: multi-thead
    # TODO: call python function?
    # TODO: why called mode_onnxruntime_noise?

    for input_path in input_list:
        # DEBUG: input_path now is a list of path!!! in case for multi-inputs
        out_dir = pathlib.Path("{}/{}/{}/mode_onnxruntime_noise/".format(self.path["dir_output"], base_dump, input_path.name))
        out_dir.mkdir(parents=True, exist_ok=True)

        command = "python3 {} -out {} -in {} -onnx {}".format(onnxruntime_dump_exec, out_dir, input_path, onnx_dir)

        self.save_command(module_name, command)
        cp = futils.run_bash_script(command)
        if cp.returncode != 0:
            raise RegressionError("general/onnxruntime", self.model_id, msg=f"Err: {cp.returncode}")

    return
|
||
|
||
@run_module(module_name="general/snr cal")
def run_dynasty_snr(self, dir_output_list):
    """Calculate snr for each input image.

    currently calculate when all input x mode done.
    TODO: calculater per input file, after all modes done
    """
    pc = "--pc" if self.config["snr"]["per_channel"] else ""
    bin_snr = fconsts.P_FLOW / "snr_calculator_v2.py"
    dynasty_timeout = self.config["dynasty"]["timeout"]

    self.logger.info(f"calculating SNR for {len(dir_output_list)} outputs.")

    # Chunk the output dirs to stay below the bash argument-length limit:
    # 1000 inputs x ~50 chars per path is already 50000 chars, which can
    # make the bash call fail.
    # Ref: https://stackoverflow.com/questions/19354870/bash-command-line-and-input-limit
    for chunk in futils.chunker(dir_output_list, 100):
        s_outs = " ".join(str(a) for a in chunk)
        command = f"python3 {bin_snr} single {pc} {s_outs}"
        cp = futils.run_bash_script(command, timeout=dynasty_timeout)
        if cp.returncode != 0:
            raise RegressionError("general/snr cal", self.model_id, msg=f"Err: {cp.returncode}")
|
||
|
||
def convert_snr_report(self):
    """Read the dynasty snr full report and export it for release.

    Keeps only the "SNR_With_Mean" column (renamed "SNR") plus the
    Input / Layer_index columns, written to the release excel file.
    """
    if "snr_csv" not in self.path or not self.path["snr_csv"].exists():
        # snr was not calculated for this run (e.g. ip evaluator only)
        return None  # will not export excel

    # NOTE: customer will run only 1 mode per regression
    df_snr = pd.read_csv(self.path["snr_csv"], index_col=["Model", "Mode_deg", "Mode_ref", "dump name"])
    keep = [col for col in df_snr.columns if col in ["Input", "Layer_index", "SNR_With_Mean"]]
    df_snr = df_snr[keep].rename(columns={"SNR_With_Mean": "SNR"})
    df_snr.to_excel(self.path["snr_excel"])

    return self.path["snr_excel"]
|
||
|
||
@run_module(module_name="general/dynasty")
def run_dynasty_inference(self):
    """Run normal dynasty as configed for this test case.

    Expands the configured modes x simulator inputs into a dynasty run
    list, optionally appends the noisy-input runs, executes everything
    in parallel through a generated shell script, and returns the list
    of per-run output directories (consumed by the SNR calculation).
    """
    module_name = "general/dynasty"
    self.logger.info(f"Run {module_name}")

    # gather everything the run list needs from config / case state
    mode_list = [k for k, v in self.config["mode_run"].items() if v]
    input_list = self.list_input_simulator
    dump_level = self.config["dynasty"]["do_dump"]
    info_in = self.io_nodes["input_node", "origin"]
    p_output = self.path["dir_output"] / "results"
    dynasty_bin = self.config["path"]["binary"]["dynasty"]["binary"]
    onnx_map = self.map_onnx
    model_id = self.model_id
    fn_dynasty_sh = self.path["dir_output"] / "run_dynasty.sh"
    n_thread = self.config["dynasty"]["n_parallel_input"]
    onnx_type = self.config["dynasty"]["piano_dynasty"]["onnx_source"]
    shape_in = self.config["dynasty"]["input_shape"]

    # prepare dynasty list: one settings entry per enabled mode
    mode_settings = [dynasty.gen_dynasty_mode_settings(mode_name,
                                                      onnx_map=onnx_map,
                                                      which_onnx=onnx_type,
                                                      model_id=model_id)
                     for mode_name in mode_list]

    # expand (mode x input) into concrete run entries + output dirs
    d_list, dir_output_list = dynasty.gen_dynasty_list(mode_settings,
                                                       input_list,
                                                       info_in,
                                                       p_output,
                                                       dump_level=dump_level,
                                                       shape_in=shape_in)

    # HACK: for noisy dynasty
    if self.config["module_run"]["piano_dynasty_noise"]:
        d_list_noise, d_out_list_noise = self.generate_dynasty_list_noise()
        d_list.extend(d_list_noise)
        dir_output_list.extend(d_out_list_noise)

    # run all the dynasty inference
    self.logger.info(f"Running dynasty with list of {len(d_list)}")
    cmds = dynasty.build_dynasty_cmd(d_list, dynasty_bin, fn_dynasty_sh)
    fn_log = p_output / "dynasty.log"
    dynasty_timeout = self.config["dynasty"]["timeout"]
    dynasty.run_dynasty_command_parallel(self.model_id, fn_dynasty_sh,
                                         n_thread=n_thread,
                                         fn_err=fn_log,
                                         timeout=dynasty_timeout)

    # save commands with others (for reproducibility)
    self.save_command(module_name, f"bash {fn_dynasty_sh}")

    return dir_output_list
|
||
|
||
@run_module(module_name="general/dynasty noise")
def run_dynasty_inference_noise(self):
    """TODO. re-write generate_dynasty_list_noise below.

    Placeholder: noisy dynasty inference is not implemented yet.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError
    # return dir_output_list
|
||
|
||
def generate_dynasty_list_noise(self):
    """Create dynasty noise list (expand mode+input) for regression.

    HACK: use noise input for dynasty float

    Raises:
        NotImplementedError: always — everything after the raise is the old
        implementation, kept as reference for the pending rewrite (see the
        TODO on run_dynasty_inference_noise).
    """
    raise NotImplementedError

    # --- unreachable legacy implementation kept for the rewrite ---
    # create mode and input_list
    # NOTE: only noise input for float inference now.
    noise_list = []
    ref_modes = ["float"]
    noise_levels = self.config["dynasty"]["noise_sigma"]
    for ref_mode in ref_modes:
        for nl in noise_levels:
            noise_mode = f"{ref_mode}_noise{nl}"
            # copy from ref mode
            i_mode = self.generate_dynasty_mode_setting(ref_mode)
            i_mode["name_mode"] = noise_mode
            i_mode["dir_out"] = f"mode_{noise_mode}"

            input_list = self.list_input_simulator_noise[nl]

            noise_list.append((i_mode, input_list))

    # create detailed dynasty run list
    dynasty_list = []
    dynasty_out_list = []
    for noise_setting, noise_input in noise_list:
        d_list, d_out_list, _ = self.generate_dynasty_list(noise_setting, noise_input)
        dynasty_list.extend(d_list)
        dynasty_out_list.extend(d_out_list)

    return dynasty_list, dynasty_out_list
|
||
|
||
@run_module(module_name="auto/dynasty btm dump2")
def run_dynasty_inference_btm_dump2(self, *, hw_mode, dry_run=True):
    """Run dynasty for pld with dump 2.

    Args:
        hw_mode: platform id (e.g. 520/720/...); only this single mode runs.
        dry_run (bool): when True only build the bash script, do not execute.

    Returns:
        list: output directories for the generated dynasty runs.
    """
    # prepare dynasty run list for later
    selected_mode = str(hw_mode)
    input_list = self.list_input_btm
    dump_level = 2  # fixed dump level for this flow
    info_in = self.io_nodes["input_node", "origin"]
    p_output = self.path["dir_output"] / "results"
    dynasty_bin = self.config["path"]["binary"]["dynasty"]["binary"]
    onnx_map = self.map_onnx
    model_id = self.model_id
    fn_dynasty_sh = self.path["dir_output"] / "run_dynasty_btm_dump2.sh"
    onnx_type = self.config["dynasty"]["piano_dynasty"]["onnx_source"]
    shape_in = self.config["dynasty"]["input_shape"]

    # prepare dynasty mode setting x1
    selected_mode_setting = dynasty.gen_dynasty_mode_settings(
        selected_mode,
        onnx_map=onnx_map,
        which_onnx=onnx_type,
        model_id=model_id)

    d_list, dir_output_list = dynasty.gen_dynasty_list([selected_mode_setting],
                                                       input_list,
                                                       info_in,
                                                       p_output,
                                                       dump_level=dump_level,
                                                       shape_in=shape_in)

    # run dynasty
    cmds = dynasty.build_dynasty_cmd(d_list, dynasty_bin, fn_dynasty_sh)
    if not dry_run:
        dynasty_timeout = self.config["dynasty"]["timeout"]
        dynasty.run_dynasty_command_parallel(self.model_id, fn_dynasty_sh,
                                             timeout=dynasty_timeout)

    return dir_output_list
|
||
|
||
@staticmethod
def compact_json(fn_json, fn_new=None):
    """Helper function to make json more human-friendly.

    Collapse every (non-nested) JSON array onto one line and normalize the
    whitespace between a key's colon and its opening bracket, then write the
    result back (in place when ``fn_new`` is None).
    """
    with open(fn_json, "r") as fh:
        text = fh.read()

    def _flatten(match):
        # strip newlines/tabs inside the matched array text
        return match.group().replace("\n", "").replace("\t", "")

    text = re.sub(r"\[.*?\]", _flatten, text, flags=re.DOTALL)
    text = re.sub(r":[ \n\t]*\[", ": [", text, flags=re.DOTALL)

    target = fn_json if fn_new is None else fn_new
    with open(target, "w") as fh:
        fh.write(text)
|
||
|
||
def postprocess_piano_knerex_json(self, hw_mode):
    """
    Helper function: Prepare/link some knerex json file for compiler use.

    Compacts every knerex-dumped json variant in place (symlinks are left
    untouched), then duplicates the scaled-bie json under the scaled-onnx
    name when the latter is missing.
    """
    suffixes = ["_scaled_piano_bie", "_scaled_piano_onnx", "_quan_piano_bie", "_quan_piano_onnx"]
    for appd in suffixes:
        fn_json_scaled = "{}.json".format(self.map_onnx[f"kdp{hw_mode}{appd}"])
        p = pathlib.Path(fn_json_scaled)
        if p.exists() and not p.is_symlink():
            self.compact_json(fn_json_scaled)

    # HACK: for kai's script.
    # TODO: confirm still needed?
    fn_json_from = "{}.json".format(self.map_onnx[f"kdp{hw_mode}_scaled_piano_bie"])
    fn_json_to = "{}.json".format(self.map_onnx[f"kdp{hw_mode}_scaled_piano_onnx"])
    if os.path.exists(fn_json_from) and not pathlib.Path(fn_json_to).exists():
        shutil.copy(fn_json_from, fn_json_to)
|
||
|
||
@run_module(module_name="auto/knerex")
def run_knerex(self, *, hw_mode):
    """run knerex piano (weight / data analysis, updater 520/720) for this model.

    For knerex, no need for multi-processing.
    (datapath analysis run multi-processing in C++, will not affect python flow).

    input:
        origin.onnx
        compiler_xxx/graph_opt.onnx

    intermedial files:
        * analysis_datapath_piano_NNN.bin
        * analysis_weight_piano_NNN.tmp
    """
    module_name = f"kdp{hw_mode}/knerex"
    self.logger.info(f"Run {module_name}")

    # cap BLAS threads for the knerex binary via env var
    openblas_num_threads = self.config["knerex"]["openblas_num_threads"]

    para_bin = self.config["path"]["binary"]["knerex"]["normal"]
    para_updater_json = self.path[f"updater_{hw_mode}_json"]

    command = f"export OPENBLAS_NUM_THREADS={openblas_num_threads}; {para_bin} -i {para_updater_json}"

    # record the command for reporting / reproduction
    self.save_command(module_name, command)

    TOS = self.config["knerex"]["timeout"]
    cp = futils.run_bash_script(command, timeout=TOS)
    self.check_knerex_error(cp, hw_mode)

    # compact / link the dumped json files for later compiler use
    self.postprocess_piano_knerex_json(hw_mode)
|
||
|
||
|
||
def parse_compiler_warnings(self, hw_mode):
    """Compiler will give some warnings/error/critical.

    Load all the warnings/error/critical into ``self.graph_warnings`` —
    only for debug runs or internal regressions.
    """
    if not (DEBUG or self.config["path"]["internal"]):
        return
    p_compiler_out = self.path[f"compiler_piano_{hw_mode}_out"]
    self.graph_warnings[hw_mode] = compiler.parse_compiler_warning(p_compiler_out)
|
||
|
||
def raise_error_from_compiler_logs(self, hw_mode):
    """Find detailed failure from gen_config/compiler log.

    common file names: batch_compile.log / compile.log / opt.log / backtrace.log

    opt.log moved to compiler_730/opt_output/image_cut_search/compile.log
    """
    # scan the known logs under the compiler output folder
    p_compiler_out = self.path[f"compiler_piano_{hw_mode}_out"]
    err = compiler.parse_compiler_logs(p_compiler_out)

    if err is None:
        # nothing actionable found in the logs
        return None

    col_name, msg = err
    # surface the message in the model report, then abort this model
    self.model_fx_report[(f"kdp{hw_mode}/ERROR")] = msg
    raise RegressionError(f"kdp{hw_mode}/{col_name}", self.model_id, msg=msg)
|
||
|
||
def get_compiler_model_type(self, need_gen_nef_config, debug):
    """Get para_model_type for compiler.

    Maps the model-kind flags (multi-layer / multi-core / single-layer /
    big model) to the ``-v <type>`` compiler argument.

    NOTE(review): if none of the four flags is set, ``para_model_type`` is
    never bound and the return raises UnboundLocalError — presumably one
    flag is always true; confirm with the flag definitions.
    """
    if self.is_multi_layer:
        para_model_type = "-v multi"
        if debug:
            para_model_type = "-v model_dbg"
    elif self.is_multi_core:
        para_model_type = "-v multi"
    elif self.is_single_layer:
        para_model_type = "-v single"
    elif self.is_big_model:
        # big model
        if need_gen_nef_config:  # batch compile to generate nef
            para_model_type = "-v model_rel"
        else:
            # normal compiler call
            para_model_type = "-v model_opt"
    return para_model_type
|
||
|
||
def get_fm_cut_parameter(self, skip_fm_cut, para_onnx):
    """Build the feature-map-cut argument string for gen_config.

    NOTE:
    1. fm_cut runs only in the compiler (gen_config) stage, never in the
       compiler frontend; it calls the compiler repeatedly and once more
       after the best config is found.
    2. ip eval runs at the end of every compiler invocation.
    3. only_ip_eval runs compiler frontend + compiler, so enabling fm_cut
       (deep_search) there is fine as well.
    """
    if skip_fm_cut:
        # no nef needed — skip the feature-map-cut search entirely
        return ""

    fm_cut_modes = {
        "default": "",
        "deep_search": f"""-m {para_onnx} --image_cut_search_args " -r -u -t -s" """,
        "partial_graph_search": f"""-m {para_onnx} --image_cut_search_args " -r -u -t -s -pgs" """,
    }
    return fm_cut_modes[self.config["compiler_piano"]["node_schedule_mode"]]
|
||
|
||
def get_envs_compiler_bin_dir(self):
    """Return the bash export lines that point the compiler at its lib/bin dirs."""
    binaries = self.config["path"]["binary"]["compiler"]
    return [
        f"""export LD_LIBRARY_PATH="{binaries["lib_dir"]}:$LD_LIBRARY_PATH" """,
        f"export COMPILER_BIN_DIR={binaries['bin_dir']}",
        f"export OPT_COMPILE_DIR={binaries['opt_bin_dir']}",
    ]
|
||
|
||
def get_envs_compiler_frontend(self, hw_mode):
    """Return the env exports specific to a compiler-frontend (skip_backend) run."""
    envs = []

    # ask compiler frontend to dump graphopt.bie
    bie_stem = self.map_onnx[f"kdp{hw_mode}_opt_piano_bie"].stem
    envs.append(f"export KNERON_GEN_OPT_BIE_NAME={bie_stem}")

    # no dump onnx for internal regression to save time
    if DEBUG or (not self.config["path"]["internal"]):
        onnx_stem = self.map_onnx[f"kdp{hw_mode}_opt_piano_onnx"].stem
        envs.append(f"export KNERON_GEN_OPT_ONNX_NAME={onnx_stem}")

    # if qat configed
    # HACK: for knerex only, stc, 520/720
    if self.config["compiler_piano"]["no_dummy_bn"] or (hw_mode in [520, 720] and self.is_single_layer):
        envs.append("export KNERON_PIANO_OPT_NO_DUMMY_BN=1")

    return envs
|
||
|
||
def get_envs_compiler(self, do_ip_eval):
    """Normal envs for compiler."""
    # the IP evaluator is toggled through a single env flag
    flag = 1 if do_ip_eval else 0
    return [f"export RUN_IP_EVAL={flag}"]
|
||
|
||
def get_envs_compiler_hack(self, hw_mode):
    """Some special case settings."""
    envs = []

    # HACK: stc compiler for 540/730, https://redmine.kneron.tw/issues/17275
    if hw_mode in [540, 730] and self.is_single_layer:
        envs.append("export KNERON_NMEM_FT_REORDER_OP=1")

    # HACK: http://eip.kneron.com:8080/redmine/issues/16360#note-5
    # for 720 16bit, knerex
    if self.is_big_model and hw_mode in [720] and self.config["knerex"]["datapath_bitwidth_mode"] in ["int16"]:
        envs.append("export KNERON_PIANO_OPT_ADD_DUMMY_BYPASS_NODE_FOR_PRELU_LRELU=1")

    return envs
|
||
|
||
def find_compiler_input_bie(self, hw_mode, skip_backend, use_quan_model, p_out):
    """Find corresponding onnx/bie/onnx+json.

    Args:
        hw_mode: target platform id.
        skip_backend (bool): True when only the compiler frontend will run.
        use_quan_model (bool): prefer the knerex-quantized model when True.
        p_out: compiler output directory; returned paths are relative to it.

    Returns:
        tuple: (para_onnx, s_para_json, use_quan_model) — the model path
        relative to p_out, the ``-r <json>`` compiler argument (or a blank
        placeholder), and the possibly-updated use_quan_model flag.
    """
    if self.config["module_run"]["only_ip_evaluator"] or (skip_backend and (not use_quan_model)):
        # no scaled onnx yet. use origin.onnx or origin.bie
        p_origin = pathlib.Path(self.map_onnx["origin"])
        para_onnx = futils.relative_path(p_origin, p_out)
        s_para_json = " "  # no json
        use_quan_model = False
    else:
        btm_bie = self.map_onnx[f"kdp{hw_mode}_bie4compiler_piano_bie"]
        para_onnx = futils.relative_path(btm_bie, p_out)
        use_quan_model = True
        if para_onnx.name.endswith(".bie"):
            # scaled.bie, no json
            s_para_json = " "
        else:
            # scaled.onnx, need json
            para_onnx_json = btm_bie.with_suffix(btm_bie.suffix + ".json")
            para_onnx_json = futils.relative_path(para_onnx_json, p_out)
            s_para_json = f"-r {para_onnx_json}"

    return para_onnx, s_para_json, use_quan_model
|
||
|
||
def get_compiler_extra_optimize(self, hw_mode):
    """Gen optimize parameters."""
    opts = {}

    # for some special STC
    if hw_mode in [720, 530, 730, 630, 540] and futils.need_compress_command_bin(self.cat_name, self.model_name):
        opts["cmd_size"] = True

    # special compiler test
    if self.config["compiler_piano"]["do_loop_for_batch"]:
        opts["do_loop_for_batch"] = True

    return opts
|
||
|
||
def get_compiler_extra_config(self, hw_mode, do_ip_eval, use_quan_model, fmt_limit, skip_backend):
    """Some special parameters.

    Assembles the ``-a '<json>'`` extra-argument for gen_config from the
    hw mode, run flags, the regression config, and an optional per-model
    ``input/extra_compiler_settings.json`` override.

    Returns:
        str: the ``-a '...'`` argument, or "" when nothing special applies.
    """
    extra_d = dict()

    if hw_mode == 720:
        extra_d["gen_setup_fbs"] = True

    if do_ip_eval:
        extra_d["ip_evaluator_cfg"] = self.config["compiler_piano"]["ip_evaluator_json"][hw_mode]

    if self.config["module_run"]["only_ip_evaluator"]:
        # NOTE: normal regression will have it as False,
        # so batch compiler will fail at unsupported cpu nodes.
        extra_d["skip_fw_cpu_op_impl_check"] = True

    if hw_mode in fconsts.MODE_HW_LIMIT["weight_compress"] and self.config["compiler_piano"]["weight_compress"]:
        extra_d["weight_compress"] = True

    extra_optimize = self.get_compiler_extra_optimize(hw_mode)
    if len(extra_optimize) > 0:
        extra_d["optimize"] = extra_optimize

    if (not use_quan_model) and self.config["knerex"]["datapath_bitwidth_mode"] == "int16":
        # run 16bit ip evaluator for only_ip_evaluator
        extra_d["def_data_bitw"] = 16

    if fmt_limit:
        # should not be in only_ip_evaluator
        # NOTE: it seems never send in via gen_nef()
        extra_d["input_fmt"] = fmt_limit

    if not skip_backend:
        # dont do this for compiler frontend
        # send the regression config given input_fmt etc to compiler config.
        # set up input/output format directly from config
        for k1 in ["input_fmt", "output_fmt", "use_ch_compact_fmt"]:
            if k1 in self.config["compiler_piano"]:
                v1 = self.config["compiler_piano"][k1]
                if DEBUG:
                    print(f"HACK: regression config override compiler config! {k1}: {v1}")
                extra_d[k1] = v1

    extra_d["model_id"] = self.nef_model_id

    if hw_mode == 720 and skip_backend:
        # https://redmine.kneron.tw/issues/19020 for MO3
        do_change = False
        for case_end in ["1W16C8BHL_INTLV", "i15o15_INTLV", "1W16C8BHL_colAcc_INTLV"]:
            if self.model_name.endswith(case_end):
                do_change = True
                break
        if do_change:
            extra_d["output_fmt"] = "1W16C8B_INTLV"

    if skip_backend:
        # this is for frontend. first run
        extra_d["skip_backend"] = True

    # read per model compiler extra settings and update to extra_d
    # now only used for app_release, need to prepare this json ourself
    p_extra_compiler_settings_config = self.path["dir_input"] / "extra_compiler_settings.json"
    if p_extra_compiler_settings_config.exists():
        with open(p_extra_compiler_settings_config, "r") as f:
            extra_compiler_settings_config = json.load(f)
        if DEBUG:
            print("Special compiler config loaded:")
            print(extra_compiler_settings_config)
        recursive_update(extra_d, extra_compiler_settings_config)

    if len(extra_d) > 0:
        extra_para = "-a '{}'".format(json.dumps(extra_d, default=str))
    else:
        extra_para = ""

    return extra_para
|
||
|
||
def get_gen_cfg_cmds(self, hw_mode, para_model_type,
                     s_para_json, fm_cut_conf, extra_para, need_gen_nef_config, p_out):
    """Build the gen_config command (plus copy commands for any customized
    config files) and resolve the compiler-config paths.

    Returns:
        tuple: (cmd_1, cmd_2, p_compiler_json, p_compiler_json_custom).
        cmd_2 is "echo" (a bash no-op) when only one real step is needed.
    """
    # generated config file for compiler
    # example: compiler_piano.config.kdp530.json
    compiler_json_name = self.path[f"compiler_piano_{hw_mode}_json"].name
    # may save to different folder
    p_compiler_json = p_out / compiler_json_name
    para_compiler_json = f"-o {compiler_json_name}"

    hack_json = self.path[f"compiler_hack_{hw_mode}_json"]
    para_hack_json = f"-k {hack_json.absolute()}" if hack_json.exists() else ""

    p_img_cut_json = p_out / "image_cut_config.json"

    gen_py = self.config["path"]["binary"]["compiler"]["gen_py"]

    cmd_gen_cfg = f"{gen_py} -t {hw_mode} {para_model_type} {s_para_json} {para_compiler_json} {para_hack_json} {fm_cut_conf} {extra_para} 2>&1 > gen_config.log"

    # HACK: some hack files. may be used for some special models
    p_input = self.model_path / "input"
    p_in_compiler_customize = p_input / f"compiler_piano.config.kdp{hw_mode}.json"
    p_in_img_cut_customize = p_input / "image_cut_config.json"

    p_compiler_json_custom = None

    cp_cmds = ["echo"]  # echo is placeholder in bash
    if p_in_compiler_customize.exists():
        if need_gen_nef_config:
            # for nef gen, p_compiler_json_custom is used
            p_compiler_json_custom = p_out / "compiler_custom_config.json"
            cp_1 = f"cp {p_in_compiler_customize} {p_compiler_json_custom}"
            # normal p_compiler_json will be generated anyway
        else:
            # for normal compiler
            # normal p_compiler_json will be copied from input. not generated
            cp_1 = f"cp {p_in_compiler_customize} {p_compiler_json}"
        cp_cmds.append(cp_1)

    if p_in_img_cut_customize.exists():  # put inside above if?
        cp_1 = f"cp {p_in_img_cut_customize} {p_img_cut_json}"
        cp_cmds.append(cp_1)

    # has customized files?
    cp_cmd = " && ".join(cp_cmds)
    has_customized = len(cp_cmds) > 1

    if need_gen_nef_config:
        # for nef config. will run both
        return cmd_gen_cfg, cp_cmd, p_compiler_json, p_compiler_json_custom
    else:
        # normal compiler calling
        if has_customized:
            return cp_cmd, "echo", p_compiler_json, p_compiler_json_custom
        else:
            return cmd_gen_cfg, "echo", p_compiler_json, p_compiler_json_custom
|
||
|
||
def get_compiler_config_helper1(self,
                                hw_mode,
                                p_out=None,
                                debug=False,
                                need_gen_nef_config=False,
                                skip_backend=False,
                                use_quan_model=True,
                                fmt_limit=None,
                                do_ip_eval=False):
    """Helper function to generate compiler config.

    Args:
        hw_mode: target platform id.
        p_out: compiler output dir; defaults to the standard per-mode path.
        debug (bool): forwarded to model-type selection.
        need_gen_nef_config (bool): True when config is for nef generation.
        skip_backend (bool): True to run frontend only.
        use_quan_model (bool): only valid when skip_backend is True.
            set to True to use quantized model for accurate input bin format. (if needed.)
        fmt_limit: optional input format override.
        do_ip_eval (bool): enable the IP evaluator env.

    Returns:
        tuple: (cmd_gen_cfg, cmd_compiler, cmd_batch, p_out, env string).
    """
    # FIX: was `type(p_out) is not pathlib.PosixPath`, which breaks on
    # Windows (WindowsPath) and rejects Path subclasses. isinstance covers
    # every concrete Path flavour and still treats None as "use default".
    if not isinstance(p_out, pathlib.Path):
        p_out = self.path[f"compiler_piano_{hw_mode}_out"]
        p_out.mkdir(mode=0o770, parents=True, exist_ok=True)

    if len(str(self.path[f"qat_{hw_mode}_config_json"])) > 10:
        # is using qat.json
        self.config["compiler_piano"]["no_dummy_bn"] = True

    para_model_type = self.get_compiler_model_type(need_gen_nef_config, debug)

    para_onnx, s_para_json, use_quan_model = self.find_compiler_input_bie(hw_mode, skip_backend, use_quan_model, p_out)

    compiler_envs = ["echo"]  # placeholder for bash
    compiler_envs.extend(self.get_envs_compiler(do_ip_eval) + self.get_envs_compiler_bin_dir() + self.get_envs_compiler_hack(hw_mode))
    if skip_backend:
        compiler_envs.extend(self.get_envs_compiler_frontend(hw_mode))

    extra_para = self.get_compiler_extra_config(hw_mode, do_ip_eval, use_quan_model, fmt_limit, skip_backend)

    # feature map cut
    fm_cut_conf = self.get_fm_cut_parameter(skip_backend, para_onnx)
    # no need for get_cmd_gen_apb

    (cmd_gen_cfg,
     cmd_gen_cfg_custom,
     p_compiler_json,
     p_compiler_json_custom) = self.get_gen_cfg_cmds(hw_mode,
                                                     para_model_type,
                                                     s_para_json,
                                                     fm_cut_conf,
                                                     extra_para,
                                                     need_gen_nef_config,
                                                     p_out)

    compiler_bin = "{} {}".format(self.config["path"]["binary"]["compiler"]["compiler"], hw_mode)
    if self.config["path"]["internal"] and (not self.config["path"]["use_toolchain"]):
        cmd_compiler = f"{compiler_bin} {para_onnx} {p_compiler_json.name} debug"
    else:
        cmd_compiler = f"{compiler_bin} {para_onnx} {p_compiler_json.name}"

    # batch compiler json is generated by regression.
    p_batch_config = self.generate_batch_compiler_json(hw_mode=hw_mode,
                                                       p_out=p_out,
                                                       p_compiler_json=p_compiler_json,
                                                       p_config_to_custom=p_compiler_json_custom)

    # batch compiler command
    cmd_batch = self.generate_batch_compiler_cmd_v1(hw_mode=hw_mode,
                                                    p_out=p_out,
                                                    p_batch_config=p_batch_config)

    return cmd_gen_cfg, cmd_compiler, cmd_batch, p_out, "; ".join(compiler_envs)
|
||
|
||
def generate_batch_compiler_cmd_v1(self, *, hw_mode, p_out, p_batch_config):
    """batch_compile to support ALL (+540/730) platforms since 0.21.1. """
    commit = self.config["path"]["compiler_commit"]
    bc_bin = self.config["path"]["binary"]["compiler"]["batch_compiler"]
    # run from inside p_out so relative paths in the config resolve
    return (f"pushd {p_out} > /dev/null && "
            f"{bc_bin} {p_batch_config} -T {hw_mode} -t {commit} -o -D && "
            f"popd > /dev/null")
|
||
|
||
def generate_batch_compiler_json(self, *, hw_mode, p_out, p_compiler_json, p_config_to_custom):
    """Use template to generate batch_compile.json.

    Returns:
        str: path of the written ``batch_compile.json``.
    """
    # create batch_compile.json

    # figure out which bie to use.
    # TODO: call self.find_compiler_input_bie()
    if self.config["module_run"]["only_ip_evaluator"]:
        # no scaled bie yet. use opt.bie > origin.onnx|origin.bie
        p_origin = self.map_onnx[f"kdp{hw_mode}_opt_piano_bie"]
        if not p_origin.exists():
            p_origin = self.map_onnx["origin"]
        fn_knerex_onnx = futils.relative_path(p_origin, p_out)
        fn_knerex_json = ""
    else:
        # knerex should be ready now
        # TODO: only bie, no onnx
        fn_knerex_onnx = self.map_onnx[f"kdp{hw_mode}_bie4compiler_piano_bie"]
        if fn_knerex_onnx.name.endswith(".onnx"):
            # onnx input needs its sibling json
            fn_knerex_json = fn_knerex_onnx.with_suffix(fn_knerex_onnx.suffix + ".json")
        else:
            fn_knerex_json = ""

    c = {}
    # nef are used for verify board output against csim.
    c["flow_path"] = self.config["path"]["flow"]
    c["hw_mode"] = hw_mode
    c["model_id"] = self.nef_model_id
    c["stamp"] = "1"
    c["bie_path"] = str(fn_knerex_onnx)
    c["json"] = str(fn_knerex_json)
    # TODO: make this relative path
    c["gen_config_path"] = str(p_compiler_json)

    # save using template
    if p_config_to_custom and p_config_to_custom.exists():
        template = self.jinja_env.get_template("batch_compile_bconfig_custom.json")
        c["custom_config_path"] = str(p_config_to_custom)
    else:
        template = self.jinja_env.get_template("batch_compile_bconfig.json")

    output = template.render(config=c)
    fn_json_save = f"{p_out}/batch_compile.json"
    with open(fn_json_save, "w") as f:
        f.write(output)

    return fn_json_save
|
||
|
||
def save_cp_log(self, p_log, cp):
    """Persist a completed-process result (return code, stdout, stderr) to p_log.

    Args:
        p_log: destination log file path.
        cp: a ``subprocess.CompletedProcess``-like object; ``stdout`` /
            ``stderr`` may be None (e.g. when not captured).
    """
    with open(p_log, "w") as f:
        # FIX: trailing newline so stdout does not run into the header line
        f.write(f"bash run return code: {cp.returncode}\n")
        # FIX: skip None streams — joining None raised TypeError before
        f.write("\n".join(s for s in [cp.stdout, cp.stderr] if s is not None))
|
||
|
||
@run_module(module_name="auto/compiler_cfg")
def generate_compiler_config(self, *, hw_mode, command):
    """Generate config for compiler. may do feature-map cut which is time consuming.

    Some optimize modules may be available.
    - feature-map cut deep search.
        - script will iterate compiler to find the best cut.
        - script will copy opt_compile.log to compiler output folder (even if failed).
        - This is time-consuming, may be killed by timeout. Will not have opt_compile.log if so.

    Args:
        hw_mode: target platform id.
        command: full bash command string built by the caller.
    """
    module_name = f"kdp{hw_mode}/compiler_cfg"
    self.save_command(module_name, command)

    # NOTE: usually generate compiler config is very fast.
    # however, it maybe too long if fm_cut turned on. (deep_search)
    TOS = self.config["compiler_piano"]["timeout"]
    cp = futils.run_bash_script(command, timeout=TOS)

    # in case fm_cut ran, get the report
    self.check_fm_cut_report(hw_mode)

    self.check_compiler_gen_config_error(hw_mode, cp)
    self.clean_opt_compile(hw_mode)

    if cp.returncode != 0:
        self.check_compiler_error(cp, hw_mode, module="compiler_cfg")
|
||
|
||
def check_fm_cut_report(self, hw_mode):
    """Exact time and iteration from Summary.txt ."""
    p_compiler_out = self.path[f"compiler_piano_{hw_mode}_out"]
    p_summary = p_compiler_out / "opt_output/image_cut_search/Summary.txt"

    # the feature-map-cut search may not have run at all
    if not p_summary.exists():
        return

    time_total, n_total, n_fm_cut = compiler.parse_fm_cut_summary(p_summary)
    sender = signal("data_sender")
    if time_total:
        sender.send((self.model_id, f"kdp{hw_mode}/fm_cut:time min", time_total))
    if n_total:
        sender.send((self.model_id, f"kdp{hw_mode}/fm_cut:iteration", f"{n_fm_cut}/{n_total}"))
|
||
|
||
def check_compiler_gen_config_error(self, hw_mode, cp):
    """Validate the generated compiler config; save the run log on any failure."""
    p_json = self.path[f"compiler_piano_{hw_mode}_json"]
    # save log for debug
    p_log = p_json.parent / "compiler_gen_config.log"

    # DEBUG: check size of config. if empty, save log for debug
    if not p_json.exists():
        self.save_cp_log(p_log, cp)
        raise RegressionError(f"kdp{hw_mode}/compiler_cfg", self.model_id, msg="no config generated.")
    if p_json.stat().st_size == 0:
        self.save_cp_log(p_log, cp)
        raise RegressionError(f"kdp{hw_mode}/compiler_cfg", self.model_id, msg="config empty.")
    if cp.returncode != 0:
        # save log first.
        self.save_cp_log(p_log, cp)
        # will do detailed check below
|
||
|
||
def clean_opt_compile(self, hw_mode):
    """Clean up opt_compile which is from fm_cut but sometime not cleaned. """
    p_json = self.path[f"compiler_piano_{hw_mode}_json"]
    p_opt_cmpl = p_json.parent / "opt_compile"
    if not p_opt_cmpl.exists():
        return

    # kill any leftover compiler processes before removing their workdir
    cmd = f"pkill -f {self.model_name} ; sleep 1; rm -rf {p_opt_cmpl}"
    cp2 = futils.run_bash_script(cmd, do_echo=True)

    # TODO: examine cp2 return code
    # cp2.returncode == -15:
|
||
|
||
def check_compiler_error(self, cp, hw_mode, module="compiler"):
    """Examine the return code of batch-compiler.

    TODO: what about normal compiler frontend?
    """
    # load all the warnings/error/critical which will be send to model_fx_report.html
    self.parse_compiler_warnings(hw_mode)

    if cp.returncode == 0:
        return  # success

    # NOTE: there are two steps below to look for detailed error for compiler.
    # 1. usually log files will have more details for FAILED reason.
    self.raise_error_from_compiler_logs(hw_mode)
    # 2. use the return code to find the detailed error.
    report_col, msg = compiler.lookup_compiler_error(cp, hw_mode, module)
    raise RegressionError(f"kdp{hw_mode}/{report_col}", self.model_id, msg=msg)
|
||
|
||
@run_module(module_name="auto/kne2nef")
def convert_kne2nef(self, *, hw_mode, p_kne, p_nef):
    """Convert kne to nef.

    No more nef auto-gen since 0.27.0 .
    """
    src = pathlib.Path(p_kne)
    dst = pathlib.Path(p_nef)
    compiler.kne2nef(src, dst, hw_mode)
|
||
|
||
@run_module(module_name="auto/compiler")
def run_batch_compile_command(self, *, hw_mode, command, dir_out):
    """Run the batch compiler command and register the produced artifacts.

    Args:
        hw_mode: target platform id.
        command: batch-compile bash command string.
        dir_out: compiler output directory holding the kne/nef results.

    Raises:
        RegressionError: when the compiler fails or an expected artifact
        (kne/nef) is missing.
    """
    module_name = f"kdp{hw_mode}/run batch compiler"
    self.save_command(module_name, command)

    TOS = self.config["compiler_piano"]["timeout"]
    cp = futils.run_bash_script(command, timeout=TOS)

    self.check_compiler_error(cp, hw_mode, module="compiler")

    fn_outs = {}
    if hw_mode in [540, 730]:
        # for 730/540, no setup.bin, command.bin is optional if last one is cpu node
        # and csim/firmware both use kne
        fn_outs[f"kdp{hw_mode}/kne"] = f"{dir_out}/models_{hw_mode}.kne"
        fn_outs[f"kdp{hw_mode}/nef"] = f"{dir_out}/models_{hw_mode}.nef"

        # convert kne to nef from 0.27.0
        self.convert_kne2nef(hw_mode=hw_mode,
                             p_kne=fn_outs[f"kdp{hw_mode}/kne"],
                             p_nef=fn_outs[f"kdp{hw_mode}/nef"])

    else:
        # old setup + nefv1, setup.bin+command.bin for csim
        # nef for firmware
        fn_outs[f"kdp{hw_mode}/nef"] = f"{dir_out}/models_{hw_mode}.nef"

    if self.config["module_run"]["only_ip_evaluator"]:
        # no need to release nef file which is useless
        return

    for k, fn_check in fn_outs.items():
        p_check = pathlib.Path(fn_check)
        if not p_check.exists():
            raise RegressionError(f"kdp{hw_mode}/compiler", self.model_id, msg=f"{p_check.name} missing.")

        self.model_fx_release[k] = p_check
|
||
|
||
@run_module("auto/compiler hw info")
def load_hw_stats(self, *, dir_out, hw_mode):
    """Collect FPS info / weight size / cpu nodes from compiler log.

    Sends the collected metrics to the "data_sender" signal and mirrors the
    ip-eval values into ``self.model_fx_report``.

    Args:
        dir_out: compiler output directory containing the logs.
        hw_mode: target platform id; stats only collected when enabled in
            ``config["hw_mode_on"]``.
    """
    if hw_mode in self.config["hw_mode_on"]:
        ip_eval_report = compiler.collect_FPS(dir_out, hw_mode)
        if "fps" in ip_eval_report:
            # this is a valid report
            signal("data_sender").send((self.model_id, f"kdp{hw_mode}/FPS", ip_eval_report["fps"]))
        # Check cpu node info
        # TODO: simplify this. it must be compulsary
        k = "cpu_node"
        if k in ip_eval_report:
            signal("data_sender").send((self.model_id, f"kdp{hw_mode}/{k}", ip_eval_report[k]))

        # patch up 520 using preset value
        if hw_mode == 520:
            try:
                ip_eval_bw = self.config["compiler_piano"]["ip_evaluator_bw"][hw_mode]
                preset_keys = {
                    "bw_weight": "GETW bandwidth GB/s",
                    "bw_rdma": "RDMA bandwidth GB/s",
                    "bw_wdma": "WDMA bandwidth GB/s"}
                for k1, k2 in preset_keys.items():
                    if ip_eval_bw[k1] is not None:
                        ip_eval_report[k2] = ip_eval_bw[k1]
            except Exception:
                # FIX: was a bare `except:`, which also swallowed
                # SystemExit/KeyboardInterrupt. The patch-up stays
                # best-effort: missing preset keys are not fatal.
                pass

        for k, v in ip_eval_report.items():
            self.model_fx_report[f"kdp{hw_mode}/ip_eval/{k}"] = v

        fps_improved = compiler.collect_fps_improve(dir_out)
        if fps_improved:
            signal("data_sender").send((self.model_id, f"kdp{hw_mode}/FPS_improved", fps_improved))

        # Collect command size and weight size info
        if self.is_big_model:
            cmd_size, weight_size = compiler.collect_command_weight_size(dir_out)
            if cmd_size:
                signal("data_sender").send((self.model_id, f"kdp{hw_mode}/cmd_size(KB)", cmd_size))
            if weight_size:
                signal("data_sender").send((self.model_id, f"kdp{hw_mode}/wt_size(MB)", weight_size))
                # TEMP: some temp analsysis on weight size. 8bit fx weight vs 32bit float
                if self.onnx_size > 0:
                    wt_overhead = int(100 * (4 * weight_size / self.onnx_size - 1))
                else:
                    wt_overhead = 0
                signal("data_sender").send((self.model_id, f"kdp{hw_mode}/wt_overhead (%)", wt_overhead))

    # if self.config["module_run"]["filter_cpu_cases"]:
    #     if cpu_node_list_str not in ["None", "N/A"]:
    #         # there are cpu nodes
    #         raise RegressionError(f"kdp{hw_mode}/filter_cpu_node", self.model_id)
|
||
|
||
def move_graphopt_bie(self, hw_mode, dir_out):
    """Copy the compiler frontend generated graphopt file.

    Moves the graphopt bie (mandatory) and graphopt onnx (optional, may not
    be dumped) from the compiler output dir into the knerex folder paths
    recorded in ``self.map_onnx``.

    Raises:
        RegressionError: when the mandatory graphopt bie is missing.
    """
    # copy to knerex folder
    p_knerex = self.path[f"knerex_output_{hw_mode}"]
    p_knerex.mkdir(exist_ok=True)

    # graphopt bie
    k = "opt"
    p_to = self.map_onnx[f"kdp{hw_mode}_{k}_piano_bie"]
    p_from = dir_out / p_to.name
    if not p_from.exists():
        raise RegressionError(f"kdp{hw_mode}/compiler frontend", self.model_id, msg=f"NO {p_from.name} generated.")
    # use move is faster than .copyfile ?
    shutil.move(p_from, p_to)

    # graphopt onnx. may not dumped.
    p_to = self.map_onnx[f"kdp{hw_mode}_{k}_piano_onnx"]
    p_from = dir_out / p_to.name
    if DEBUG and not p_from.exists():
        self.logger.error(f"compiler frontend {hw_mode}: no {p_from.name} generated.")
    if p_from.exists():
        # use move is faster than .copyfile ?
        shutil.move(p_from, p_to)
|
||
|
||
def move_release_bie(self, hw_mode, dir_out):
    """Copy the compiler generated final file.

    Very similar to above `move_graphopt_bie`.

    Moves the release bie (mandatory) and onnx (optional) into the knerex
    folder, re-points the ``map_onnx`` "opt" entries at the released files,
    and registers both for release.

    Raises:
        RegressionError: when the mandatory release bie is missing.
    """
    # copy to knerex folder
    p_knerex = self.path[f"knerex_output_{hw_mode}"]
    # p_knerex.mkdir(exist_ok=True)

    # chosen model_opt for BTM
    model_opt = self.config["compiler_piano"]["model_optimize"]
    k_opt = f"kdp{hw_mode}_{model_opt}_piano"
    k_release = f"kdp{hw_mode}_release_piano"

    # final bie
    p_to = self.map_onnx[f"{k_release}_bie"]
    p_from = dir_out / p_to.name
    if not p_from.exists():
        raise RegressionError(f"kdp{hw_mode}/compiler", self.model_id, msg=f"NO {p_from.name} generated.")
    # use move is faster than .copyfile ?
    shutil.move(p_from, p_to)
    # override the file from knerex dumped to compiler dumped
    self.map_onnx[f"{k_opt}_bie"] = self.map_onnx[f"{k_release}_bie"]
    # will be used by dynasty afterwards

    if DEBUG:
        k_bie = f"{k_release}_bie"
        self.verify_knerex_io_names(hw_mode, k_bie)

    # graphopt onnx. may not dumped.
    p_to = self.map_onnx[f"{k_release}_onnx"]
    p_from = dir_out / p_to.name
    if DEBUG and not p_from.exists():
        self.logger.error(f"compiler {hw_mode}: no {p_from.name} generated.")
    if p_from.exists():
        # use move is faster than .copyfile ?
        shutil.move(p_from, p_to)
        # override the file from knerex dumped to compiler dumped
        self.map_onnx[f"{k_opt}_onnx"] = self.map_onnx[f"{k_release}_onnx"]

    # release this bie
    self.model_fx_release[f"kdp{hw_mode}/bie"] = self.map_onnx[f"{k_release}_bie"]
    # this is decomposed float onnx
    self.model_fx_release[f"kdp{hw_mode}/onnx"] = self.map_onnx[f"kdp{hw_mode}_opt_piano_onnx"]
|
||
|
||
def verify_knerex_io_names(self, hw_mode, k_bie):
    """Cross-check input/output node names between origin.onnx and the knerex bie.

    NOTE: the compiler frontend may rename output tensors (for example by
    adding a dummy bn), so a mismatch is only printed, never raised.
    """
    bie_path = self.map_onnx[k_bie]
    in_names, out_names, _out_shapes, _ = futils.get_ioinfo_from_bie2(bie_path)

    # input names are stored verbatim — do NOT apply clean_name to them
    in_bie = in_names
    out_bie = [futils.clean_name(name) for name in out_names]
    self.io_nodes[("input_node", hw_mode, "bie")] = in_bie
    self.io_nodes[("out_node", hw_mode, "bie")] = out_bie

    in_ori = self.io_nodes["input_node", "origin"]
    out_ori = self.io_nodes["out_node", "origin"]

    if (in_bie, out_bie) == (in_ori, out_ori):
        return

    # report only — see docstring for why no exception is raised
    print(f"origin.onnx specify:\n\tinput nodes: {in_ori}\n\toutput nodes: {out_ori} \n")
    print(f"{bie_path.name} specify:\n\tinput nodes: {in_bie}\n\toutput nodes: {out_bie} \n")
    # raise ValueError(f"origin.onnx and knerex/bie {hw_mode} give different input / output node names.")
@run_module(module_name="auto/compiler frontend")
def run_compiler_frontend(self, *, hw_mode, use_quan_model=False):
    """Call compiler frontend to generate cpu node list and decomposed node mapping.

    compiler has two steps:
    * generate config: `generate_compiler_config`
        * (optional) feature map search during gen_config, for better fps.
    * actual compiler run: `run_batch_compiler_command`

    Inputs:
    - hw_mode: 520/530/... supported platform
    - use_quan_model (bool): True if use knerex generated scaled.bie/onnx.
        When True, input bin formats and bitwidth settings are also
        collected into io_nodes / model_fx_report.

    Output files:
    - decomposed.bie
    - decomposed.onnx (for release)
    """
    module_name = f"kdp{hw_mode}/compiler frontend"

    # frontend-only invocation: skip backend, no ip evaluation
    (cmd_gen_cfg, cmd_compiler, cmd_batch_compiler, dir_out,
        envs) = self.get_compiler_config_helper1(
        hw_mode,
        skip_backend=True,
        use_quan_model=use_quan_model,
        do_ip_eval=False)

    command1 = f"pushd {dir_out} > /dev/null; {envs}; {cmd_gen_cfg}"
    command2 = f"pushd {dir_out} > /dev/null; {envs}; {cmd_compiler}"

    # step 1: generate compiler config (may run feature-map search)
    self.generate_compiler_config(command=command1, hw_mode=hw_mode)

    # step 2: actual frontend compile
    self.save_command(module_name, command2)
    cp = futils.run_bash_script(command2)

    self.check_compiler_error(cp, hw_mode, module="compiler frontend")

    self.move_graphopt_bie(hw_mode, dir_out)

    # load basic_info.json to check how many input bin formats for each input
    if use_quan_model:
        # load jsons from compiler frontend generated bie
        jsons = util_lib.load_zip_jsons(self.map_onnx[f"kdp{hw_mode}_opt_piano_bie"])
        basic_info = jsons["basic_info.json"]
        self.io_nodes[("input_format", hw_mode)] = basic_info["input_fmt"]

        # prepare for fx_report
        kv = {
            # customer readable key: knerex config key
            "input bitwidth": "model_in_bitwidth_mode",
            "output bitwidth": "model_out_bitwidth_mode",
            "cpu bitwidth": "cpu_bitwidth_mode",
            "datapath bitwidth": "datapath_bitwidth_mode",
            "weight bitwidth": "weight_bitwidth_mode"
        }
        for k, v in kv.items():
            self.model_fx_report[f"kdp{hw_mode}/{k}"] = self.config["knerex"][v]

    # clean up folder
    shutil.rmtree(dir_out)
@run_module(module_name="auto/pick bin format")
|
||
def pick_in_bin_format(self, *, hw_mode, limited_input):
    """Pick 1 format for each limited_input.

    For each requested input name, drop compiler-proposed formats that
    start with "4W4C" and keep the first remaining one. Inputs that
    cannot be constrained (unknown name, only one format, nothing left
    after filtering) are logged as critical and skipped.

    see https://redmine.kneron.tw/issues/18306

    Inputs:
    - hw_mode: platform id; used to look up compiler-generated formats.
    - limited_input: iterable of input names to constrain.

    Returns:
        dict: input name -> chosen format string.
    """
    k1 = ("input_format", hw_mode)
    assert k1 in self.io_nodes, "Input formats are not generated with compiler frontend on quantized model. Check flow settings."
    cmpl_fmts = self.io_nodes[k1]
    results = {}
    for in_name in limited_input:
        if in_name not in cmpl_fmts:
            self.logger.critical(f"Constraint on input format not applied!!! Given {in_name} not in {list(cmpl_fmts.keys())} given by compiler.")
            continue
        if len(cmpl_fmts[in_name]) == 1:
            self.logger.critical(f"Constraint on input format not applied!!! Given {in_name} has only 1 format: {cmpl_fmts[in_name][0]}.")
            continue
        fmts = [f for f in cmpl_fmts[in_name] if not f.startswith("4W4C")]
        if len(fmts) == 0:
            # BUGFIX: message previously said "remove 4W4B*" although the
            # filter above removes formats starting with "4W4C".
            self.logger.critical(f"Constraint on input format not applied!!! Given {in_name} has no valid format to limit: {cmpl_fmts[in_name]} -> remove 4W4C* -> [].")
            continue
        results[in_name] = fmts[0]
    return results
def export_gen_release_bie(self, hw_mode):
    """Build the env-var exports that make compiler dump release.bie.

    Used for the second compiler invocation (parts of frontend + backend);
    the dumped bie carries `calculation_info.json` for dynasty fx.
    Returns a "; "-joined string of `export` statements, or "echo" when
    only the ip evaluator runs (keeps the composed shell command valid).
    """
    if self.config["module_run"]["only_ip_evaluator"]:
        return "echo"

    bie_stem = self.map_onnx[f"kdp{hw_mode}_release_piano_bie"].stem
    exports = [f"export KNERON_GEN_OPT_BIE_NAME={bie_stem}"]

    if DEBUG:
        # graph_opt.onnx dump is debug-only, to save time in regression
        onnx_stem = self.map_onnx[f"kdp{hw_mode}_release_piano_onnx"].stem
        exports.append(f"export KNERON_GEN_OPT_ONNX_NAME={onnx_stem}")

    return "; ".join(exports)
@run_module(module_name="auto/compiler")
def generate_nef(self, *, hw_mode, p_nef=None, fmt_limit=None):
    """call batch compiler to generate nef.

    The last and full run of compiler.

    Inputs:
    * hw_mode: supported platform id.
    * p_nef: output folder for the nef; defaults to self.path[f"nef_output_{hw_mode}"].
    * fmt_limit: optional input-format constraint forwarded to config generation.

    Output files:
    * model_NNN.nef
    * model_NNN.kne
    """

    module_name = f"kdp{hw_mode}/gen_nef"
    self.logger.info(f"run {module_name}")

    if p_nef is None:  # default path
        # TODO: move to compiler_piano_
        # p_nef = pathlib.Path(self.path["compiler_piano_{}_out".format(hw_mode)])
        p_nef = pathlib.Path(self.path[f"nef_output_{hw_mode}"])
    p_nef.mkdir(mode=0o770, parents=True, exist_ok=True)

    # generate compiler nef configs
    do_ip_eval = self.config["compiler_piano"]["ip_evaluator"]
    cmd_gen_cfg, cmd_compiler, cmd_batch_compiler, dir_out, envs = self.get_compiler_config_helper1(hw_mode,
                                                                                                    need_gen_nef_config=True,
                                                                                                    p_out=p_nef,
                                                                                                    fmt_limit=fmt_limit,
                                                                                                    do_ip_eval=do_ip_eval)

    # command1 is generate compiler config, which may call fm_cut.
    command1 = f"pushd {dir_out} > /dev/null; {envs}; {cmd_gen_cfg}"
    # set envs to dump release.bie
    envs_dump_release = self.export_gen_release_bie(hw_mode)
    command3 = f"pushd {dir_out} > /dev/null; {envs}; {envs_dump_release}; {cmd_batch_compiler}"

    # below functions has decorated by run_module. will calculate time and report specific columns
    # this one may include fm_cut, which is time consuming
    self.generate_compiler_config(command=command1, hw_mode=hw_mode)
    self.run_batch_compile_command(command=command3, dir_out=dir_out, hw_mode=hw_mode)
    self.load_hw_stats(dir_out=dir_out, hw_mode=hw_mode)

    if not self.config["module_run"]["only_ip_evaluator"]:
        self.move_release_bie(hw_mode, dir_out)
@run_module(module_name="auto/csim")
def run_csim(self, *, hw_mode):
    """Run csim for one platform.

    Input files:
    * run_csim_NNN.ini
        * pointing to files needed for csim.
        * refer to `generate_csim_ini` for reference.

    Output files:
    * `output/results/FN_INPUT/csim_NNN_output`

    NOTE: platform 520 uses the dedicated `run_csim_520` instead.
    """
    module_name = f"kdp{hw_mode}/csim"
    self.logger.info(f"run {module_name}")

    # index -> ini mapping expected by csim.run_csim
    csim_inputs = dict(enumerate(self.io_nodes[("btm_csim_in", hw_mode)]))
    csim_binary = fconsts.BIN_SET["csim"][hw_mode]
    script_path = self.path["btm_dump"] / f"csim_{hw_mode}" / f"run_csim_{hw_mode}.sh"

    _cmd, proc = csim.run_csim(csim_inputs, csim_binary, script_path)
    self.check_csim_error(proc, hw_mode)
@run_module(module_name="kdp520/csim")
def run_csim_520(self):
    """run csim 520.

    520 is our first platform. This is different from later platforms.

    Input files:
    * command.bin
    * setup.bin
    * weight.bin
    * dynasty dumped input file at `output/results/FN_INPUT/model_520-wqbi_piano/layer_input_*.bin`

    Output files:
    * `output/results/FN_INPUT/csim_520_output`
    """

    hw_mode = 520
    module_name = f"kdp{hw_mode}/csim"
    self.logger.info(f"run {module_name}")

    p_csim_out = pathlib.Path(self.io_nodes[("btm_csim_path", hw_mode)])
    p_compiler_output = self.path[f"compiler_piano_{hw_mode}_out"]
    # csim is launched from p_csim_out, so compiler binaries are referenced relatively
    p_rel_compiler = futils.relative_path(p_compiler_output, p_csim_out)

    # relative paths to the three compiler artifacts csim consumes
    cs = {}
    for fn_key in ["command_bin", "setup_bin", "weight_bin"]:
        p_bin = self.compiler_output[hw_mode][fn_key].name
        cs[fn_key] = f"{p_rel_compiler}/{p_bin}"

    para_bin = self.config["path"]["binary"]["csim"][520]
    p_csim_out.mkdir(mode=0o770, parents=True, exist_ok=True)

    # csim 520 links against the dynasty shared library at runtime
    p_dynasty_so = pathlib.Path(self.config["path"]["binary"]["dynasty"]["lib.so"])
    ENV_DYNASTY_LIB = f"""export LD_LIBRARY_PATH="{p_dynasty_so.parent}:$LD_LIBRARY_PATH" """

    if self.is_big_model:
        # NOTE: only 1 input for 520. no need for ","?
        fn_input_rgba = ",".join([str(a) for a in self.io_nodes[("btm_csim_in_bin", hw_mode)]])
        c = f"""{para_bin} -d 0 --thread 1 {cs["command_bin"]} {cs["weight_bin"]} {fn_input_rgba} --setup {cs["setup_bin"]}"""
    else:
        # NOTE: 520 stc to use sequential.bin.
        # NOTE: v016 category will have TWO inputs!!!
        fn_input_sqtl = " ".join([str(a) for a in self.io_nodes[("btm_csim_in_bin", hw_mode)]])
        c = f"""{para_bin} -d 0 --thread 1 {cs["command_bin"]} {cs["weight_bin"]} -t {fn_input_sqtl}"""

    command = f"{ENV_DYNASTY_LIB}; pushd {p_csim_out} > /dev/null && {c} && popd > /dev/null"
    self.save_command(module_name, command)

    TOS = self.config["csim"]["timeout"]
    cp = futils.run_bash_script(command, timeout=TOS)
    self.check_csim_error(cp, hw_mode)
@run_module(module_name="kdp520/btm dyn_csim")
|
||
def btm_dyn_csim_520(self):
    """
    run bit-true-match check between dynasty / csim fix point results (520).

    Compares the integer text dumps line by line; any mismatch raises
    (AssertionError), which the flow reports as a regression failure.
    On success, writes a confirmation line into model_fx_report.
    """
    module_name = "kdp520/btm dyn_csim"
    self.logger.info(f"check {module_name}")
    hw_mode = 520
    dir_csim_output = self.io_nodes[("btm_csim_path", hw_mode)]

    if self.is_big_model:
        # Multiple outputs possible
        golden_list = self.io_nodes[("btm_dynasty_golden_txt_path", 520)]
        for i in range(len(golden_list)):
            fn_csim_out = f"{dir_csim_output}/node_{i:04d}_final_output.txt"
            fn_d520_out = golden_list[i]
            assert os.path.exists(fn_d520_out), f"dynasty 520 output ({fn_d520_out}) does not exist!"
            # TODO: use futils.md5sum for bit-true-match? faster?
            with open(fn_csim_out, "r") as f_csim, open(fn_d520_out, "r") as f_dyn:
                out_csim = [int(a) for a in f_csim]
                out_dyna = [int(a) for a in f_dyn]

            # do report
            cond1 = len(out_csim) == len(out_dyna)
            # BUGFIX: this was a plain string, so the sizes were never interpolated
            msg1 = f"dynasty dump size ({len(out_dyna)}) != csim dump size ({len(out_csim)})"
            cond2 = all(a == b for a, b in zip(out_csim, out_dyna))
            msg2 = "dynasty-csim mismatch! "

            for cond, msg in [(cond1, msg1), (cond2, msg2)]:
                if not cond:
                    self.model_fx_report["btm_520"] = msg
                assert cond, msg
            # all checks passed for this output
            self.model_fx_report["kdp520/btm"] = "bit-true-match (520) verified between dynasty and csim."

    else:
        # single layer. BUG: we assume only one output.
        fn_csim_out = f"{dir_csim_output}/Lastlayer_final_output.txt"
        fn_d520_out = self.io_nodes[("btm_dynasty_golden_txt_path", 520)][0]
        assert os.path.exists(fn_d520_out), f"dynasty 520 output ({fn_d520_out}) does not exist!"

        with open(fn_csim_out, "r") as f_csim, open(fn_d520_out, "r") as f_dyn:
            out_csim = [int(a) for a in f_csim]
            out_dyna = [int(a) for a in f_dyn]
        assert len(out_csim) == len(out_dyna), f"dynasty dump size ({len(out_dyna)}) != csim dump size ({len(out_csim)})"
        assert all(a == b for a, b in zip(out_csim, out_dyna)), "dynasty-csim mismatch! "

    # best-effort cleanup; never fail the BTM result over a delete error
    try:
        if self.config["post_clean_up"]["csim_output"]:
            shutil.rmtree(dir_csim_output)
    except Exception:
        # BUGFIX: was a bare `except:` (also swallowed KeyboardInterrupt);
        # also fixed "dum" -> "dump" typo in the log message.
        self.logger.error(f"Failed to delete csim 520 dump folder. {dir_csim_output}")
@run_module(module_name="auto/btm dyn_csim")
def btm_dyn_csim(self, *, hw_mode):
    """
    run bit-true-match check between dynasty / csim fix point results.

    Compares md5 checksums of dynasty golden dumps against csim's
    dma2seq_*.seq dumps as two sets (fast path; does not pair files
    one-to-one).

    Will raise RegressionError if mismatch.

    NOTE: platform 520 see btm_dyn_csim_520
    """

    # detour for 520
    if hw_mode == 520:
        self.btm_dyn_csim_520()
        return

    self.logger.info(f"check kdp{hw_mode}/btm_dym_csim")

    # dynasty golden
    p_d = self.io_nodes[("btm_dynasty_golden_txt_path", hw_mode)]

    # the quick way.
    # suppose all the text files are EXACTLY same, with same futils.md5sum
    p_csim_dump = self.io_nodes[("btm_csim_path", hw_mode)]

    # compare data from dma2seq. most easy.
    p_c = pathlib.Path(p_csim_dump).glob("dma2seq_*.seq")
    set_d = set(futils.md5sum(str(a)) for a in p_d)
    set_c = set(futils.md5sum(str(a)) for a in p_c)

    # DEBUG: if internal regression, mismatch will triger pld report automatically
    if self.config["path"]["internal"]:
        if set_d != set_c:
            try:
                self.generate_pld_report(hw_mode)
            except Exception as e:
                # pld report is best-effort: forward the failure to the
                # regression collector instead of masking the BTM result
                signal("data_sender").send((self.model_id, f"kdp{hw_mode}/pld dump", str(e)))

    if set_d != set_c:
        # do the report
        msg = f"mismatched results: {len(set_d.difference(set_c))}"
        self.model_fx_report[f"kdp{hw_mode}/btm"] = msg
        self.module_status[hw_mode]["btm_dyn_csim"] = False
        raise RegressionError(f"kdp{hw_mode}/btm dyn_csim", self.model_id, msg=msg)
    else:
        self.model_fx_report[f"kdp{hw_mode}/btm"] = f"bit-true-match ({hw_mode}) verified between dynasty and csim."

    # NOTE: the hard way, for loop to compare
    # self.io_nodes[("btm_dynasty_golden_txt_path", hw_mode)]
    # dma2seq_*.seq
#################################################################################
|
||
@run_module(module_name="auto/kneron+")
def run_nef_kneron_plus(self, *, hw_mode, number_try=0):
    """run nef on kneron plus (dongle server).

    NEF inference request send to kneron internal server,
    which call hardware dongle to do the inference.

    Dongle firmware may return either float or fix-point data on different request.
    Current format: `BCHW`.

    NOTE: the server will RESET dongle then sleep 15s !!!

    Inputs:
    * hw_mode: platform id.
    * number_try: how many inference repetitions to request (one output dir each).

    Input files:
    * For 520/720/530/630:
        * model_NNN.nef
    * For 540/730, dongle:
        * model_NNN.kne
    * dynasty dumped input bin at `output/results/FN_INPUT/model_NNN-wqbi_piano/layer_input_*.bin`

    Output files:
    * dongle inferenced results in BCHW, float or fix-point
    """
    # imported lazily so the flow works without the nef_utils package installed
    from nef_utils.dongle_inference import dongle_inference

    module_name = f"kdp{hw_mode}/kneron+"
    self.logger.info(f"run {module_name}")

    dongle_server = self.config["nef"]["dongle_server"]
    npu_timeout = self.config["nef"]["npu_timeout"]
    if hw_mode != 730 and npu_timeout != 3:
        self.logger.info("only 730 npu can adjust timeout, setting to 3 sec by default")
        npu_timeout = 3

    dir_rgba_list = [f"{rgba_input}" for rgba_input in self.io_nodes[("btm_csim_in_bin", hw_mode)]]
    s_rgba = " ".join(dir_rgba_list)

    p_compiler = self.path[f'compiler_piano_{hw_mode}_out']
    p_nef_model = f"{p_compiler}/models_{hw_mode}.nef"

    # one output folder per inference try
    dir_nef_out_list = []
    for i in range(number_try):
        dir_nef_out_list.append(self.io_nodes[("btm_nef_kneron_plus_path", hw_mode, i)])
        dir_nef_out_list[i].mkdir(parents=True, exist_ok=True)

    # NOTE(review): [:-2] presumably strips the trailing try-index suffix
    # ("_0") so the server can append its own per-try suffix — TODO confirm
    dir_nef_out = str(self.io_nodes[("btm_nef_kneron_plus_path", hw_mode, 0)])[:-2]

    # update in load_compiler_ioinfo
    output_order = self.io_nodes[("out_node", hw_mode)]

    # save the bash command for debug. regression will actually call python functions
    # TODO: why no output folder specified?
    dir_nef_script = self.config["path"]["binary"]["nef"]["nef_client.py"]
    command = f"python3 {dir_nef_script} -i {s_rgba} -m {p_nef_model} -p {hw_mode} -mid {self.nef_model_id} -g {dongle_server} -fix --npu_timeout {npu_timeout}"
    self.save_command(module_name, command)

    # acutally call dongle inference server from python function
    try:
        fix_output_list, dongle_client_log = dongle_inference(
            p_nef_model,
            dir_rgba_list,
            model_id=self.nef_model_id,
            platform=hw_mode,
            group=dongle_server,
            inference_times=number_try,
            npu_timeout=npu_timeout,
            is_fixed_output=True,
            output_path=dir_nef_out,
            output_order=output_order)
    except GeneralError as e:
        self.logger.error(e.details)
        raise RegressionError(f"kdp{hw_mode}/{e.msg}", self.model_id, msg=e.details)

    # persist the dongle client's log next to the other btm artifacts
    fn_log = self.path["btm_dump"] / "dongle_client.log"
    with open(fn_log, "w") as f:
        f.writelines([line + '\n' for line in dongle_client_log])
def generate_pld_report(self, hw_mode, dry_run=False):
    """
    Internal process of generating pld report when dynasty/csim mismatch.

    Inputs:
    - hw_mode: platform (520 not supported)
    - dry_run: True to only create scripts. False will actually run them

    Steps included:
    * re-run dynasty per layer
    * re-run csim per layer
    * run pld.py to generate pld report

    Output files:
    * pld report

    Raises:
        NotImplementedError: when called with hw_mode 520.
    """
    if hw_mode == 520:
        self.logger.error("PLD dump does not support 520")
        raise NotImplementedError

    module_name = f"kdp{hw_mode}/pld dump"
    self.logger.info(f"run {module_name}")

    # re-run csim with special config, already generated when run normal csim
    list_csim = self.io_nodes[("btm_csim_in_pld", hw_mode)]
    d_csim = {i: v for i, v in enumerate(list_csim)}
    bin_csim = self.config["path"]["binary"]["csim"][hw_mode]
    fn_sh = self.path["dir_output"] / f"run_csim_{hw_mode}_pld.sh"
    cmd, cp = csim.run_csim(d_csim, bin_csim, fn_sh, dry_run=dry_run)
    # self.check_csim_error(cp, hw_mode)

    # re-run dynasty on test_input.txt with dump 2
    # (only needed when the normal run did not already dump per-layer data)
    if self.config["dynasty"]["do_dump"] < 2:
        # it maybe 730 or 730-wqbi or ...
        btm_mode = self.btm_dynasty_mode[hw_mode]
        # if dry_run, the dynasty script will be created without running.
        self.run_dynasty_inference_btm_dump2(hw_mode=btm_mode, dry_run=dry_run)

    # run pld.py for report
    p_compiler = self.path[f"compiler_piano_{hw_mode}_out"]
    p_dynasty = self.io_nodes[("btm_dynasty_path", hw_mode)]
    p_csim = self.io_nodes[("btm_csim_path", hw_mode)]
    p_report = self.io_nodes[("pld_report", hw_mode)]
    p_report.mkdir(parents=True, exist_ok=True)
    bin_pld_report = "python3 {}".format(self.config["path"]["binary"]["pld"]["pld.py"])
    command_pld_report = f"{bin_pld_report} {hw_mode} {p_compiler} {p_csim} {p_dynasty} {p_report}"
    self.save_command(module_name, command_pld_report)
    # always leave the script behind for manual debugging
    fn_cmd = self.path["dir_output"] / f"run_pld_report_{hw_mode}.sh"
    with open(fn_cmd, "w") as f:
        f.write(f"{command_pld_report}\n\n")
    # if not dry_run:
    if False:  # TODO: temporally disable csim pld dump.
        TOS = self.config["csim"]["pld_timeout"]
        cp = futils.run_bash_script(command_pld_report, do_echo=False, timeout=TOS)
        # run generate_pld_report scrip failed, save the .sh file for debug
        if cp.returncode != 0:
            fn_log = self.path["dir_output"] / f"run_pld_report_{hw_mode}.log"
            with open(fn_log, "w") as f:
                f.write("\n".join([cp.stdout, cp.stderr]))
            if cp.returncode == 111:
                msg = cp.stderr
            else:
                msg = f"Err: {cp.returncode}"
            signal("data_sender").send((self.model_id, f"kdp{hw_mode}/pld dump", msg))
@run_module(module_name="auto/btm csim_vs_dongle")
def btm_csim_nef(self, *, hw_mode, number_try):
    """csim vs nef, 520/530/720

    Compares md5 checksums of csim dumps against dongle (nef) dumps as
    sets; raises RegressionError on mismatch or any unexpected failure.

    # NOTE: we suppose NEF will only run on big_model
    # if need to run on stc, the csim reference may need to adjust, refer to btm_dyn_csim
    """
    try:
        module_name = f"kdp{hw_mode}/btm_csim_nef/try{number_try}"
        self.logger.info(f"check {module_name}")

        # find all nef inferenced results
        p_nef = pathlib.Path(self.io_nodes[("btm_nef_kneron_plus_path", hw_mode, number_try)]).glob("layer_*_fx.txt")

        # find all csim inferenced results (520 uses a different dump layout)
        if hw_mode != 520:
            str_search = "dma2seq_*.seq"
        else:
            str_search = "node_*_final_output.txt"
        p_csim = pathlib.Path(self.io_nodes[("btm_csim_path", hw_mode)]).glob(str_search)
        ## if csim dump .16B output result, use it as golden for dongle output
        p_csim = [pathlib.Path(str(a) + ".16B") if pathlib.Path(str(a) + ".16B").exists() else pathlib.Path(a) for a in p_csim]

        # NOTE: does not btm on dynasty here
        # p_dynasty = self.io_nodes[("btm_dynasty_golden_txt_path", hw_mode)]
        # set_dynasty = set(futils.md5sum(str(a)) for a in p_dynasty)

        set_nef = set(futils.md5sum(str(a)) for a in p_nef)
        set_csim = set(futils.md5sum(str(a)) for a in p_csim)

        if set_nef != set_csim:
            msg = f"mismatched results: {len(set_nef.difference(set_csim))}"
            self.model_fx_report[f"kdp{hw_mode}/btm"] = msg
            raise RegressionError(f"kdp{hw_mode}/btm csim_vs_dongle", self.model_id, msg=msg)

    except RegressionError:
        # BUGFIX: the broad handler below used to catch our own
        # RegressionError and re-raise it WITHOUT the mismatch message.
        raise
    except Exception as e:
        print_err(e, self.config["regression"]["print_error"])
        raise RegressionError(f"kdp{hw_mode}/btm csim_vs_dongle", self.model_id)
@run_module(module_name="auto/btm_dyn_kneron+")
|
||
def btm_dyn_nef_kneron_plus(self, *, hw_mode, number_try):
    """dynasty vs nef bit-true-match, 520/530/720.

    Compares each dynasty golden txt against the same-named dongle dump,
    value by value (dongle values come back as floats and are truncated
    to int for comparison).

    # NOTE: we suppose NEF will only run on big_model
    # if need to run on stc, the csim reference may need to adjust, refer to btm_dyn_csim
    """

    module_name = f"kdp{hw_mode}/btm dyn_vs_kneron+ ({number_try})"
    self.logger.info(f"check {module_name}")

    try:
        dir_kp = self.io_nodes[("btm_nef_kneron_plus_path", hw_mode, number_try)]

        # Multiple outputs possible
        goldens = self.io_nodes[("btm_dynasty_golden_txt_path", hw_mode)]
        for golden in goldens:
            fn_dyn = str(golden)
            p_dyn = pathlib.Path(fn_dyn)
            if not p_dyn.exists():
                raise RegressionError(f"kdp{hw_mode}/dynasty", self.model_id, msg=f"Missing output ({fn_dyn})")

            # dongle dump carries the same file name as the golden
            fn_kp = "{}/{}".format(dir_kp, p_dyn.name)

            # TODO: @weijie we can use futils.md5sum for fx results now.
            with open(fn_kp, "r") as f_kp, open(fn_dyn, "r") as f_dyn:
                vals_kp = [int(float(line)) for line in f_kp]
                vals_dyn = [int(line) for line in f_dyn]
            assert len(vals_kp) == len(vals_dyn), "dynasty dump size ({}) != kneron plus dump size ({})".format(len(vals_dyn), len(vals_kp))
            assert all(a == b for a, b in zip(vals_kp, vals_dyn)), "dynasty-kneron plus mismatch! "

    except Exception as e:
        print_err(e, self.config["regression"]["print_error"])
        raise RegressionError(module_name, self.model_id)
@run_module(module_name="general/combine_snr")
def generate_snr_report(self, base_dump="results"):
    """Aggregate per-input-group snr reports into one overall report."""
    self.logger.info("generate snr report")

    snr_cfg = self.config["snr"]
    combine_snr(
        "{}/{}".format(self.path["dir_output"], base_dump),
        do_per_channel=snr_cfg["per_channel"],
        do_plot_per_channel=snr_cfg["plot_snr_per_channel"],
    )
def save_command(self, module_name, command):
    """Record a shell command for later export and optionally echo it.

    The (module, command) pairs collected here are dumped into one
    replayable script by `generate_bash_script`.
    """
    entry = (module_name, command)
    self.commands.append(entry)
    print_command(command, self.config["regression"]["print_command"])
def generate_bash_script(self):
    """put all bash script called for this model in the flow into a bash script for future debug.

    Scripts specified for this model:
    - knerex: weight analysis, data analysis ...
    - dynasty: multiple inputs, multiple modes ...

    Each command are saved to self.commands before been executed.
    Does nothing when no commands were recorded.
    """
    recorded = getattr(self, "commands", None)
    if not recorded:
        return

    chunks = []
    for submodule, command in recorded:
        chunks.append(f"# {submodule}\n")
        chunks.append(command)
        chunks.append("\n\n")

    with open(self.path["fn_cmd"], "w") as f:
        f.writelines(chunks)
def pre_clean_up(self, base_dump="results"):
    """delete temp files / outputs before flow actually start.

    Which artifacts are removed is controlled by config["pre_clean_up"].
    When cleanup is not configured (missing keys / wrong types), log an
    error and continue without deleting anything further.
    """
    try:
        flags = self.config["pre_clean_up"]
        dir_o = pathlib.Path(self.path["dir_output"])
        # self.logger.debug("pre clean up {}/{}".format(self.cat_name, self.model_name))

        if flags["all_output"]:
            # wipe the whole output folder, then recreate it
            cp = futils.run_bash_script(f"rm -rf {dir_o}")
            if cp.returncode > 0:
                self.logger.warn(f"output folder ({dir_o}) cannot be deleted.")
            dir_o.mkdir(mode=0o770, parents=True, exist_ok=True)
            return

        if flags["knerex_analysis"]:
            for fn in dir_o.glob("analysis_*"):
                fn.unlink()

        if flags["knerex_output"]:
            patterns = (f"{self.model_name}*scale*.onnx*",
                        f"{self.model_name}*scale*.bie*")
            for pattern in patterns:
                for fn in dir_o.glob(pattern):
                    fn.unlink()

        if flags["dynasty_output"]:
            for fn in dir_o.glob(base_dump):
                shutil.rmtree(str(fn), ignore_errors=True)

        if flags["compiler_output"]:
            for fn in dir_o.glob("compiler_output_*"):
                shutil.rmtree(str(fn), ignore_errors=True)

    except (KeyError, TypeError):
        self.logger.error("pre clean up not configured. skip ...")
def clean_knerex_output(self):
    """Remove knerex-generated artifacts.

    Not implemented yet.
    """
    # TODO: implement once the knerex artifact layout is settled
    raise NotImplementedError
def need_clean(self, k="dynasty_output"):
    """Examine config and status to see necessary to delete.

    Always success-then-clean: a category is cleaned only when both
    config["post_clean_up"][k] enables it AND the overall run succeeded.
    Missing/odd config or status simply means "do not clean".

    Inputs:
        - k: one of all_output / dynasty_output / knerex_output / csim_output

    Raises:
        ValueError: when `k` is not a supported category.
    """
    available_keys = [
        "all_output",
        "dynasty_output",
        "knerex_output",
        "csim_output"
    ]
    if k not in available_keys:
        raise ValueError(f"post_clean_up key {k} is not in {available_keys}")

    try:
        config_clean = self.config["post_clean_up"][k]
        is_success = self.module_status["general"]["Success"]
        do_clean = config_clean and is_success
    except (KeyError, TypeError, AttributeError):
        # BUGFIX: was a bare `except:` which also swallowed
        # KeyboardInterrupt/SystemExit; only lookup failures should
        # fall back to "do not clean".
        do_clean = False

    return do_clean
def clean_dynasty_output(self, dir_output_list):
    """Delete dynasty "mode_*" dump folders under each given output dir.

    Only runs when need_clean("dynasty_output") allows it (config enabled
    and the run succeeded); missing directories are skipped silently.

    TODO: this function is not callled properly.
    """
    if not self.need_clean("dynasty_output"):
        return

    for dir_o in dir_output_list:
        root = pathlib.Path(dir_o)
        if not root.exists():
            continue
        for dump_dir in root.glob("mode_*"):
            shutil.rmtree(str(dump_dir))
def clean_all_output(self):
    """Delete output folder to save space."""
    # memory-backed workspace: unmount the bind target, then drop the tmpfs dir
    if getattr(self, "work_in_memory", False) and hasattr(self, "path"):
        d_from = self.path["dir_output_memory"].absolute()
        d_to = self.path["dir_output"].absolute()
        command = f"if mountpoint -q {d_to}; then umount {d_to}; fi; rm -rf {d_from.parent}"
        futils.run_bash_script(command)
        return

    # normal case: plain on-disk directory
    shutil.rmtree(self.path["dir_output"].absolute())
|