#! /usr/bin/env python3
import os
import shutil
import copy
import tempfile
import pathlib
import json  # sometimes commentjson is too slow
import re
import random
from collections import OrderedDict, defaultdict
from dict_recursive_update import recursive_update
from blinker import signal
import subprocess
import pandas as pd
from jinja2 import Environment, FileSystemLoader
import sys_flow.flow_utils as futils
import sys_flow.util_lib as util_lib
import sys_flow.flow_constants as fconsts
import sys_flow.dynasty_v3 as dynasty
import sys_flow.compiler_v2 as compiler
import sys_flow.csim_utils as csim
from sys_flow.exceptions import RegressionError, MultiRegressionError, GeneralError, print_err, print_command, run_module
from sys_flow.onnx_op_stats import onnx_info
from sys_flow.snr_calculator_v2 import combine_snr, calculate_statistics, get_case_output, get_weight_bin_stats
import snoop
DEBUG = bool(os.environ.get("REGRESSION_DEBUG", False))
snoop.install(enabled=DEBUG)
def release_test_case(path_to_model, path_to_base, dump_dynasty=False):
"""a helper function to release generated model.
inputs:
- dump_dynasty: dump the dynasty output for debug purpose, in mode 2/3.
"""
files_selected = [
"input/*.origin.onnx",
"input/knerex_input*",
"input/simulator_input*",
# "*/*.json",
"output/knerex_*/*.onnx",
"output/knerex_*/*.bie",
"output/*.xlsx",
"output/compiler_*/*command.bin",
"output/compiler_*/*setup.bin",
"output/compiler_*/*weight.bin",
"output/compiler_*/apb.npu",
"output/compiler_*/*.nef",
"output/compiler_*/*.kne",
]
p_from = pathlib.Path(path_to_model)
p_to = pathlib.Path(path_to_base) / p_from.name
for pat in files_selected:
fns = p_from.glob(pat)
for fn in fns:
# copy to relative path to base.
fn_r = futils.relative_path(fn, p_from)
fn_to = p_to / fn_r
pp(f"{fn} -> {fn_to}") # noqa
if fn_to.exists():
pp(f"{fn_to} exists! skip") # noqa
continue
if not fn_to.parent.exists():
fn_to.parent.mkdir(exist_ok=True, parents=True)
if fn.is_symlink():
# fn_to.symlink_to(fn.readlink()) # TODO: after toolchain use py 3.9
                # NOTE: assume all symbolic links in released files are relative links
# NOTE: check symlink before check is_dir
fn_to.symlink_to(os.readlink(fn))
elif fn.is_dir():
shutil.copytree(fn, fn_to)
else:
shutil.copy(fn, fn_to, follow_symlinks=False)
return p_to
class test_case:
"""The class to provide unified interface for test_case.
input: model path, where model and files should be orgazed already.
output: model infomation.
* run_flow is the function to run all modules, with a `config` input
* the config will define which modules to run.
"""
def __init__(self, model_path, config=None):
"""
The `test_case` class wrap up the interface of model.
It support unprocessed model and load pre-existing fx model.
"""
# the model may be unprocessed or processed (with fx model)
# the config may be string or a path to a json saved for THIS model.
if config is None:
p_regression_config = pathlib.Path(model_path) / "output" / "regression_config.json"
if p_regression_config.exists():
# use existing config
config = p_regression_config
        if config and isinstance(config, (str, pathlib.Path)):
p_config = pathlib.Path(config)
if p_config.exists():
config = futils.load_regression_json(p_config)
# TODO: or should I skip some steps? where operate on self.config
self.initial_test_case(model_path, config)
if config:
            # NOTE: config will be deep-copied, so no lock objects in it.
self.prepare_flow(config)
self.check_this_case()
def initial_test_case(self, model_path, config=None):
"""initial test case. set up pre-defined path for this test case.
* set up name/path for onnx / input, etc
* verify input images for knerex / dynasty
* set up logger.
NOTE: do not use self.config in this function.
Suppose to be independant from regression/config
"""
try:
self.model_path = pathlib.Path(model_path)
self.model_name = self.model_path.name
self.cat_name = self.model_path.parent.name
self.model_id = "{}/{}".format(self.cat_name, self.model_name)
self.btm_txt = "test_input.txt" # default input text file.
# create logger. Try to keep this as early as possible
self.logger = futils.create_logger("model {}".format(self.model_name), None, "WARNING")
self.logger.info("run initial_test_case")
if not self.model_path.exists():
raise RegressionError("general/initial", self.model_id, msg="model does not exist.")
self.prepare_path(config)
# pre-defined onnx names
self.map_onnx, self.onnx_infos = self.get_onnx_name_map()
except Exception as e:
self.logger.error(e) # what if logger not ready yet?
raise RegressionError("general/initial", self.model_id)
@run_module(module_name="general/model oversize")
def check_onnx_size(self, p_origin):
"""Examine the file size of origin.onnx.
Internal regression will skip onnx too large.
"""
onnx_size = int(pathlib.Path(p_origin).resolve().stat().st_size / (1024 * 1024))
max_MB = self.config["compiler_piano"]["max_onnx_MB"]
signal("data_sender").send((self.model_id, "general/onnx size (MB)", onnx_size))
self.onnx_size = onnx_size
if onnx_size > max_MB:
raise RegressionError("general/model oversize", self.model_id, msg=f"onnx {onnx_size}Mb//max size {max_MB}Mb")
def check_this_case(self):
"""Some special check on this case."""
if pathlib.Path(self.map_onnx["origin"]).name.endswith(".bie"):
# NOTE: origin.bie is only supported in only_ip_evaluator.
assert self.config["module_run"]["only_ip_evaluator"], "origin.bie is only for only_ip_evaluator !!!"
def check_csim_error(self, cp, platform):
"""Find detail reason for csim crash.
CSIM will return 33 as exit code for some known errors.
TODO: move to csim_utils.py?
"""
cat1 = f"kdp{platform}"
if cp.returncode == 0:
# success
return
elif cp.returncode == 33:
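            # known compiler errors are wrapped as [[[ message ]]] in the log;
            # extract every such block as the failure message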
            pat = re.compile(r"\[\[\[(.*?)\]\]\]", re.MULTILINE | re.DOTALL)
log = "\n".join([cp.stdout, cp.stderr])
msg = "\n".join(pat.findall(log))
raise RegressionError(f"{cat1}/compiler error", self.model_id, msg=msg)
elif cp.returncode == 111:
# timeout
raise RegressionError(f"{cat1}/csim", self.model_id, msg=cp.stderr)
else:
raise RegressionError(f"{cat1}/csim", self.model_id)
def check_knerex_error(self, cp, platform):
"""Find detailed report for calling knerex.
There are some submodules in knerex, e.g., datapath analysis, may went wrong.
This step is to improve debug process by reporting specific reasons.
"""
cat1 = f"kdp{platform}"
log = "\n".join([str(cp.stdout), str(cp.stderr)])
fn_log = self.path[f"knerex_output_{platform}"] / "knerex_run.log"
if self.config["path"]["internal"]:
            # save the log when running internally
with open(fn_log, "w") as f:
f.write(f"knerex return with code {cp.returncode}\n\n")
f.writelines(log)
# check memory estimation for datapath analysis
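        # a matching log line looks like (illustrative numbers):
        #   "Datapath Analysis takes 2048KB=(1024KB for model buffer + 1024KB for results) per thread"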
re_mem_est = re.compile("Datapath Analysis takes (\d+)KB=\((\d+)KB for model buffer \+ (\d+)KB for results\) per thread")
try:
dpm_total, dpm_buf, dpm_rslt = re_mem_est.findall(log)[0]
# buffer related to thread number
# dpm_rslt related to image number
signal("data_sender").send((self.model_id, f"{cat1}/dp analysis total (KB)", dpm_total))
signal("data_sender").send((self.model_id, f"{cat1}/dp analysis buf (KB)", dpm_buf))
signal("data_sender").send((self.model_id, f"{cat1}/dp_analysis result (KB)", dpm_rslt))
        except Exception:
pass
# check memory estimation for sequential bias adjust
re_mem_est = re.compile("Sequential Bias Adjustment takes (\d+)KB memory to hold (\d+) samples of (\d+)KB each")
try:
spb_total, spb_n, spb_x1 = re_mem_est.findall(log)[0]
signal("data_sender").send((self.model_id, f"{cat1}/seq bias adjust total (KB)", spb_total))
signal("data_sender").send((self.model_id, f"{cat1}/seq bias adjust n", spb_n))
signal("data_sender").send((self.model_id, f"{cat1}/seq bias adjust mem x1 (KB)", spb_x1))
        except Exception:
pass
# check memory estimation for parallel bias adjust
re_mem_est = re.compile("Parallel Bias Adjustment takes (\d+)KB=\((\d+)KB for model buffer \+ (\d+)KB for results\) per thread")
try:
ppb_total, ppb_buf, ppb_rslt = re_mem_est.findall(log)[0]
signal("data_sender").send((self.model_id, f"{cat1}/prll bias adjust total (KB)", ppb_total))
signal("data_sender").send((self.model_id, f"{cat1}/prll bias adjust buf (KB)", ppb_buf))
signal("data_sender").send((self.model_id, f"{cat1}/prll bias adjust result (KB)", ppb_rslt))
        except Exception:
pass
        s1 = {
            "knerex": r"KnerexERROR:\s*(.*)",
            "HW not support": r"HW_NOT_SUPPORT:\s*(.*)",
            "unimplemented feature": r"UNIMPLEMENTED_FEATURE:\s*(.*)"
        }
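        # e.g. a log line such as "KnerexERROR: op not supported" (illustrative message)
        # raises RegressionError("kdp{platform}/knerex", ...) carrying that message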
for m1, p1 in s1.items():
p2 = re.compile(p1).findall(log)
if len(p2) > 0:
msg = p2[0]
self.model_fx_report[(f"{cat1}/ERROR")] = msg
raise RegressionError(f"{cat1}/{m1}", self.model_id, msg=msg)
if cp.returncode == 0:
return
elif cp.returncode == 111:
# stderr.startswith("TIMEOUT"):
raise RegressionError(f"{cat1}/knerex", self.model_id, msg=cp.stderr)
elif cp.returncode == 11:
# DELETE below
raise RegressionError(f"{cat1}/knerex", self.model_id, msg="datapath analysis failed")
elif cp.returncode == 30:
raise RegressionError(f"{cat1}/knerex", self.model_id, msg="KnerexMemoryInsufficient")
else:
# NOTE: check knerex log for specific errors
spec_err = {"deadloop": ["Deadloop", "Loop Maxed out"]}
for cat2, msgs in spec_err.items():
for msg in msgs:
if len(re.compile(msg).findall(log)) > 0:
raise RegressionError(f"{cat1}/knerex", self.model_id, msg=cat2)
# by default
raise RegressionError(f"{cat1}/knerex", self.model_id, msg=f"err: {cp.returncode}")
def get_onnx_name_map(self):
"""
There are a few onnx used/generated during the quantization process.
This step is to create map of possible onnx.
NOTE:
The keys here are widely used in this project. DO NOT change any.
Follow the name rules of "kdp{hw_mode}_{optimization}_{dev_v}_{fmt}"
Factors:
- dev_v: develop version. currently only "piano"
- hw_mode: float, kdp520/kdp720/etc
- optimization: origin / scaled / bias adjust / ...
- format: onnx / bie
"""
map_onnx = {}
onnx_infos = {}
        # there must be an origin.onnx (or origin.bie for only_ip_evaluator)
origin_onnx = f"{self.model_path}/input/{self.model_name}.origin.onnx"
p_origin = pathlib.Path(origin_onnx)
using_bie = False
if not p_origin.exists():
# second choice is origin.bie
origin_bie = f"{self.model_path}/input/{self.model_name}.origin.bie"
p_origin = pathlib.Path(origin_bie)
if not p_origin.exists():
raise RegressionError("general/Missing origin.onnx", self.model_id)
using_bie = True
map_onnx["origin"] = p_origin
        # read in the origin.onnx for later use
        # TODO: can we skip this to save time?
# TODO: make this block work on bie?
if not using_bie:
onnx_infos["origin"] = onnx_info(p_origin)
_, _, self.est_mac_kB = onnx_infos["origin"].get_mac_memory()
self.check_onnx_io(onnx_infos["origin"])
for hw_mode in fconsts.MODE_HARDWARE: # 520/720/530
for fmt in fconsts.MODEL_FORMAT: # piano, onnx / bie
# piano, normal. the only develop version for now. treat as constant
dev_v = "piano"
p_knerex_out = self.path[f"knerex_output_{hw_mode}"]
prefix = f"{self.model_name}.kdp{hw_mode}"
                # this is copied from the compiler frontend
map_onnx[f"kdp{hw_mode}_opt_{dev_v}_{fmt}"] = p_knerex_out / f"{prefix}.graph_opt.{fmt}"
# below generated by knerex
map_onnx[f"kdp{hw_mode}_scaled_{dev_v}_{fmt}"] = p_knerex_out / f"{prefix}.scaled.{fmt}"
map_onnx[f"kdp{hw_mode}_decomp_{dev_v}_{fmt}"] = p_knerex_out / f"{prefix}.decomposed.{fmt}"
map_onnx[f"kdp{hw_mode}_quan_{dev_v}_{fmt}"] = p_knerex_out / f"{prefix}.scaled.quan.{fmt}"
map_onnx[f"kdp{hw_mode}_release_{dev_v}_{fmt}"] = p_knerex_out / f"{prefix}.release.{fmt}"
# piano, bias_adjust
for bi_name in ["wqbi", "hwbi", "hwbi-mse"]:
map_onnx[f"kdp{hw_mode}_{bi_name}_{dev_v}_{fmt}"] = p_knerex_out / f"{prefix}.scaled.quan.{bi_name}.{fmt}"
# NOTE: the quantized model to release should have ".scaled" in it.
# example: kdp720.scaled.bie, kdp530.scaled.quan.wqbi.onnx
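        # Example entries generated above (illustrative, for hw_mode=720):
        #   map_onnx["kdp720_scaled_piano_bie"] -> output/knerex_720/<model>.kdp720.scaled.bie
        #   map_onnx["kdp720_wqbi_piano_onnx"]  -> output/knerex_720/<model>.kdp720.scaled.quan.wqbi.onnx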
return map_onnx, onnx_infos
def load_per_model_config(self, p_model_config):
"""A user-config json file (model_config.json) may be provide for fine-tune quantization process. """
if p_model_config.exists():
# deep copy of origin config
config_new = copy.deepcopy(self.config)
with open(p_model_config, "r") as f:
per_model_config = json.load(f)
recursive_update(config_new, per_model_config)
self.config = config_new
def get_nef_model_id(self):
"""As name implies.
HACK: get model_id for kneron solutions
may in pre-defined.
we should try best to assign one model id for internal cases.
"""
k = (self.cat_name, self.model_name)
if k in self.config["map_model_id"]:
return self.config["map_model_id"][k]
        s = re.compile(r"model_(\d+)")
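        # e.g. a model named "model_00123" yields nef model id 123 (illustrative name)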
try:
# come here if kneron app release
return int(s.findall(str(self.model_name))[0])
        except Exception:
if self.config["path"]["internal"]:
return random.randint(20000, 30000)
else:
# 32768 is default
return 32768
def prepare_flow(self, config):
"""Prepare for the quantization flow.
Check the per-model config.
"""
try:
self.config = copy.deepcopy(config)
# update config if this model has specific config to change
p_model_config = self.model_path / "input" / "model_config.json"
self.load_per_model_config(p_model_config)
# save status to local
# TODO: send this out to report instead of signal
self.module_status = {"general": {"Success": False}}
for hw_mode in self.config["hw_mode_on"]:
self.module_status[hw_mode] = {}
# some special model types. default settings.
self.is_big_model = True
self.is_single_layer = False # for debug
self.is_multi_layer = False # for debug
self.is_multi_core = False # for debug
if self.config["path"]["internal"]:
# if internal, some special settings
self.is_big_model = "big_model" == self.config["regression"]["model_type"]
self.is_single_layer = "single_layer" == self.config["regression"]["model_type"]
self.is_multi_layer = "multi_layer" == self.config["regression"]["model_type"]
self.is_multi_core = "multi_core" == self.config["regression"]["model_type"]
# nef_model_id is needed for calling batch-compiler
self.nef_model_id = self.get_nef_model_id()
self.logger.info(f"{self.cat_name}/{self.model_name} with nef model id: {self.nef_model_id}")
if self.is_big_model:
signal("data_sender").send((self.model_id, "general/nef_model_id", str(self.nef_model_id)))
if len(str(self.path["user_config_json"])) > 4:
with open(self.path["user_config_json"], "r") as f:
self.config["user_config"] = json.load(f)
            # need to check the validity of the onnx first
if self.config["module_run"]["validate_onnx"]:
self.check_onnx_valid()
if self.is_big_model:
self.check_onnx_size(self.map_onnx["origin"])
self.compiler_output = {}
            # use model_fx_report to save the results of generating this fx model,
            # then save it to "output/model_fx_report.json"
self.model_fx_report = OrderedDict()
self.model_fx_report["docker_version"] = self.config["path"]["toolchain"]["version"]
self.model_fx_report["comments"] = self.config["comments"]
self.model_fx_release = OrderedDict()
self.pre_clean_up()
# create configs for datapath analysis, csim ini, etc
            # initialize jinja2
file_loader = FileSystemLoader(str(self.config["path"]["template"]))
self.jinja_env = Environment(loader=file_loader)
if not self.config["module_run"]["only_ip_evaluator"]:
self.check_input_files()
if self.config["dynasty"]["regression_input"] == "all":
self.fn_report = "{}/output/snr_analysis/snr_analysis_report.csv".format(self.model_path)
else:
self.fn_report = "{}/output/results/{}/snr_analysis_report.csv".format(self.model_path, self.btm_txt)
self.save_regression_json()
# save cli commands for debug purpose
self.commands = []
except Exception as e:
self.logger.error(e)
if type(e) is RegressionError: # TODO: MultiRegressionError
raise
else:
raise RegressionError("general/prepare", self.model_id)
@run_module(module_name="general/clean_opt")
def clean_opt(self):
"""Clean up opt_compile generated by compiler submodules (fm-cut, etc)."""
        # clean up opt_compile, which comes from fm_cut but is sometimes not cleaned.
p_out = self.path["dir_output"]
p_opt_cmpls = list(p_out.glob("compiler_*/opt_compile"))
for p_opt in p_opt_cmpls:
cmd = f"pkill -f {self.model_name} ; sleep 1; rm -rf {p_opt}"
            cp2 = futils.run_bash_script(cmd, do_echo=False)
            # NOTE: cp2.returncode may be -15 (SIGTERM), likely because the pkill matches this command itself
@run_module(module_name="general/post_clean")
def post_clean_up(self):
"""To clean up before finish.
This used be `__del__` method but it may not be triggerd immediately
after the flow finihs. It has been renamed and put into run_flow.
The "run_flow" will not be called multiple times according to our experience.
If any submodule failed, this function will be called in `run_single_case`
"""
        # save commands to a file. dynasty-related commands are not included yet.
self.generate_bash_script()
if hasattr(self, "work_in_memory") and self.work_in_memory and hasattr(self, "path"):
            # per compiler team request, don't use zip, just copy back
d_from = self.path["dir_output_memory"].absolute()
d_to = self.path["dir_output"].absolute()
# if d_to.is_symlink():
# d_to.unlink()
command = f"if mountpoint -q {d_to}; then umount {d_to}; fi; pushd {d_from} > /dev/null; tar cf - . | (mkdir -p {d_to}; cd {d_to}; tar xvf -)"
if DEBUG:
print("recovering from work_in_memory")
print(command)
cp = futils.run_bash_script(command)
# TODO: check cp.returncode
shutil.rmtree(self.path["dir_output_memory"].parent.absolute())
self.set_permission_output()
for handler in self.logger.handlers[:]:
handler.close()
self.logger.removeHandler(handler)
if hasattr(self, "dir_output_list"):
self.clean_dynasty_output(self.dir_output_list)
def __repr__(self):
"""Provide brief info on the model."""
return "Model {}".format(self.model_path)
def prepare_path(self, config=None):
"""
Examine essential files/folders for model.
All essential paths are saved in a dictionary.
"""
self.path = {}
# input folder
# output folder. this will be used many times
dir_out = self.model_path / "output"
self.path["user_config_json"] = self.model_path / "input/user_config.json"
if not pathlib.Path(self.path["user_config_json"]).exists():
self.path["user_config_json"] = ""
for hw_mode in fconsts.MODE_HARDWARE: # 520/720/530/730/630
p_knerex_out = dir_out / f"knerex_{hw_mode}"
self.path[f"knerex_output_{hw_mode}"] = p_knerex_out
self.path[f"updater_{hw_mode}_json"] = p_knerex_out / f"updater_{hw_mode}.json"
self.path["fn_json_radix"] = self.model_path / "input/input_radix.json" # User defined json
        # NOTE: why use knerex_input instead of the node_input name?
        # 1. the node_input name may include "/", which would cause great trouble if used as a character in a directory name.
        # 2. the node_input name could be arbitrarily ANYTHING; we cannot guarantee safety or absence of conflicts with our other files.
        # NOTE: for multiple inputs, we assume each PAIR/GROUP of files is put into knerex_input/knerex_input_1/... with the SAME name.
        # Here we assume knerex_input is for the 1st input node given by ONNX, and knerex_input_1 is for the 2nd input node.
        # We also assume the input node order given by ONNX is the same as in the piano graph. Otherwise, BIG PROBLEM.
p_knerex_in = self.model_path / "input/knerex_input"
self.path["dir_knerex"] = p_knerex_in
if not p_knerex_in.exists():
raise RegressionError("general/Missing input", self.model_id, msg="Mising knerex_input folder.")
self.path["dir_simulator"] = self.model_path / "input/simulator_input"
if not self.path["dir_simulator"].exists():
# will use same as knerex_input
self.path["dir_simulator"] = p_knerex_in
        # if dir_out is a symlink, it is a leftover from a previous UNSUCCESSFUL run that was not cleaned up
if dir_out.is_symlink():
# NOTE: dir_out is a symlink but will not exist() if the target does not exist
dir_out.unlink()
        # HACK: work_in_memory puts the output folder in memory, to avoid disk-io blocking.
        # Especially useful for big models with feature-map cut, which need many writes to the compiler output.
try:
self.work_in_memory = config["regression"]["work_in_memory"]
        except (TypeError, KeyError):
self.work_in_memory = False
if self.work_in_memory:
            # if work_in_memory is needed, work in /dev/shm
            # the contents will be copied back to disk later.
            # the whole output folder lives in memory
d_temp = pathlib.Path(tempfile.mkdtemp(prefix="/dev/shm/wim_"))
dir_out_memory = d_temp / "output"
dir_out_memory.mkdir(parents=True, exist_ok=True)
dir_out.mkdir(parents=True, exist_ok=True)
            # NOTE: work_in_memory means old results are cleaned up.
            # it used to copy the datapath_analysis temp results, but that folder has been changed,
            # so we skip it now.
# TODELETE
# dir_out will be deleted if exists
# futils.safe_link(dir_out_memory, dir_out, relative=False, delete_exists=True)
# use mount
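            # bind-mounting the tmpfs folder onto dir_out makes every write to the output
            # folder transparently land in /dev/shm; post_clean_up() later umounts it and
            # tars the contents back to the on-disk folder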
command = f"mount --bind {dir_out_memory} {dir_out}"
cp = futils.run_bash_script(command)
# save for future usage
self.path["dir_output_memory"] = dir_out_memory
if DEBUG:
print(f"work_in_memory: {dir_out_memory} mount to output folder: {dir_out}")
print(command)
self.path["dir_input"] = self.model_path / "input"
self.path["dir_output"] = dir_out
dir_out.mkdir(mode=0o770, parents=True, exist_ok=True)
# selected one input (test_input.txt by default) for bit-true-match
p_btm_dump = dir_out / "results" / self.btm_txt
self.path["btm_dump"] = p_btm_dump
# TODO: remove platform variables
platform = "_piano" # only support piano platform now. no more renaissance
for hw_mode in fconsts.MODE_HARDWARE: # 520 / 720 / 530 / etc
p_knerex_out = dir_out / f"knerex_{hw_mode}"
            # knerex temporary analysis results
self.path[f"temp_dpa{platform}_{hw_mode}"] = p_knerex_out / f"analysis_datapath{platform}_{hw_mode}.tmp"
self.path[f"temp_wta{platform}_{hw_mode}"] = p_knerex_out / f"analysis_weight{platform}_{hw_mode}.tmp"
# compiler and nef output directory
compiler_out = dir_out / f"compiler_{hw_mode}"
nef_out = dir_out / f"nef_{hw_mode}"
self.path[f"compiler{platform}_{hw_mode}_out"] = compiler_out
# example: compiler_piano_output_530/compiler_piano.config.kdp530.json
self.path[f"compiler{platform}_{hw_mode}_json"] = compiler_out / f"compiler{platform}.config.kdp{hw_mode}.json"
self.path[f"nef_output_{hw_mode}"] = nef_out
# to fill in later after run compiler
self.path["ioinfo_json"] = {}
self.path["calculation_json"] = {}
# qat config json for knerex
self.path[f"qat_{hw_mode}_config_json"] = self.model_path / "input/qat_{}_config.json".format(hw_mode)
if not self.path[f"qat_{hw_mode}_config_json"].exists():
self.path[f"qat_{hw_mode}_config_json"] = ""
# snr file to check.
if config:
if config["dynasty"]["regression_input"] == "all":
self.path["snr_csv"] = dir_out / "snr_analysis" / "snr_analysis_per_layer.csv"
else:
self.path["snr_csv"] = dir_out / "results" / self.btm_txt / "snr_analysis_per_layer.csv"
self.path["snr_excel"] = dir_out / f"{self.model_name}_snr_report.xlsx"
# fx model report. for every run
self.path["model_fx_html"] = dir_out / "model_fx_report.html"
# for app release only
self.path["model_fx_json"] = dir_out / "model_fx_report.json"
# where to save self.config to this file for future reference.
self.path["export_regression_json"] = dir_out / "regression_config.json"
# back up bash commands
self.path["fn_cmd"] = self.model_path / "output/flow_commands.sh"
def set_permission_output(self):
"""Set permission for test cases so that other users can access.
If not using docker, One can only set permissions for file created by themselves.
If using docker, you can anything
Diretory set to 755, files set to 644.
Using pathlib.Path.chmod in docker will NOT work. so we use bash
"""
dir_out = self.path["dir_output"]
try:
futils.set_folder_public(dir_out)
except Exception as e:
self.logger.error(e)
def find_simulator_input_list(self, p_txt):
"""
Find the input images in simluator_input folder.
The `simulator_input` contains input for dynasty/csim/dongle inference.
Our regression are using the file name `test_input.txt` as default file name for bit-true-match. Users may limit the number of input groups for inference. The `test_input.txt` will be used at first by default.
# TODO: refactor this function
# TODO: if no test_input.txt exist, randomly pick it for bit-true-match
"""
if self.config["dynasty"]["regression_input"] == "default":
default_txt = list(p_txt.glob(self.btm_txt))[0]
sim_lists = [default_txt]
        else:  # otherwise run dynasty on all txt
sim_lists = list(p_txt.glob("*.txt"))
# sort input texts by names. but move "test_input.txt" to the 1st if exists
sim_lists = sorted(sim_lists, key=lambda x: "" if x.name == self.btm_txt else x.name)
if self.config["dynasty"]["sample_seed"] is not None and len(sim_lists) > 2:
# randomize
ram_list = sim_lists[1:]
random.seed(self.config["dynasty"]["sample_seed"])
random.shuffle(ram_list)
sim_lists = sim_lists[:1] + ram_list
list_input_simulator = [self.find_multiple_input(a) for a in sim_lists]
assert len(list_input_simulator) > 0, "NO input images in simulator_input folder."
# apply num_input_samples to limit number of images. // to save time in regression for quicker test.
n_max_input = self.config["dynasty"]["num_input_samples"]
list_input_simulator = list_input_simulator[:n_max_input]
return list_input_simulator
def check_input_files(self):
"""Examine the input text files in knerex_input / simlulator_input folder
There should be at least 1 input images in knerex_input for datapath analysis, which is essential for quantization.
There should be at least 1 input images in simulator_input folder, which is used for dynasty / csim / dongle inference. Our regression are using the file name `test_input.txt` as default file name for bit-true-match. If there is no file named "test_input.txt", a random file in the simulator_input folder will be picked and linked as test_input.txt.
For models with multiple input nodes, there should be SAME filename, e.g., `camera_002.txt` in
* knerex_input / simulator_input , for 1st input node
* knerex_input_1 / simulator_input_1, for 2nd input node
* knerex_input_2 / simulator_input_2, for 3rd input node
* ... if necessary
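        Example layout for a model with 2 input nodes (illustrative file name):
            input/knerex_input/camera_002.txt          (1st input node)
            input/knerex_input_1/camera_002.txt        (2nd input node)
            input/simulator_input/test_input.txt
            input/simulator_input_1/test_input.txt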
"""
# '**/*.txt' will find all txt files
# knerex will use all txt in knerex_input folder
self.list_input_knerex = [self.find_multiple_input(a) for a in list(pathlib.Path(self.path["dir_knerex"]).glob("*.txt"))]
assert len(self.list_input_knerex) > 0, "NO input images in knerex_input folder."
# dynasty will pick text from simulator_input folder
self.list_input_simulator = self.find_simulator_input_list(pathlib.Path(self.path["dir_simulator"]))
assert len(self.list_input_simulator) > 0, "NO input images in simulator_input folder."
# `test_input.txt` in `simulator_input` will be used for bit-true-match check by default
self.list_input_btm = [self.find_multiple_input(a) for a in list(pathlib.Path(self.path["dir_simulator"]).glob("test_input.txt"))]
assert len(self.list_input_btm) == 1, f"""NO test_input.txt in {self.path["dir_simulator"]} folder."""
# check input files
self.logger.info("Found {} input image for knerex".format(len(self.list_input_knerex)))
self.logger.info("Found {} input image for simulator".format(len(self.list_input_simulator)))
# HACK: Create noise input
if futils.get_switch_value(self.config["module_run"], "piano_dynasty_noise", False):
sigma_levels = self.config["dynasty"]["noise_sigma"]
p_input = self.model_path / "input"
self.list_input_simulator_noise = {}
for p_simu in p_input.glob("simulator_input*"):
if "_sigma" in p_simu.name: # don't repeat itself
continue
futils.create_noise_input_folder(p_simu, sigma_levels)
for sigma in sigma_levels:
p_simu = p_input / "simulator_input_sigma{}".format(sigma)
                assert p_simu.exists(), f"{p_simu} does not exist."
self.list_input_simulator_noise[sigma] = self.find_simulator_input_list(p_simu)
        # create a link for test_input.txt if necessary
        # since models may be linked from model_source, this may fail.
if self.config["dynasty"]["regression_input"] == "default":
self.fn_input_default = [self.find_multiple_input(self.path["dir_simulator"] / self.btm_txt, verify_exist=False)]
if not pathlib.Path(self.fn_input_default[0][0]).exists():
                self.logger.warning("missing simulator_input/{}. trying to link.".format(self.btm_txt))
for i_from, i_to in zip(self.list_input_simulator[0], self.fn_input_default[0]):
futils.safe_link(i_from, i_to)
def check_onnx_io(self, origin_info):
"""Get onnx ioinfo from onnx file. This will only get some simple information about input/output nodes. Example: .
Output:
* self.io_nodes["input"] will contain input nodes name and their order
* needed by knerex / dynasty before compiler
A more accurate way is to call load_compiler_ioinfo() which will update self.io_nodes with more information. However this must run after compiler generate ioinfo.csv
"""
self.io_nodes = {}
input_nodes, output_nodes, opset = origin_info.get_ioinfo()
        assert len(input_nodes) > 0, "Onnx: found no input nodes!"
        # NOTE: we assume the input nodes are in the same order for 520/720/etc.
        # otherwise the input_lots.json would differ between hardware platforms
self.io_nodes["input"] = input_nodes
def save_regression_json(self):
"""Dump this regression config for debug"""
if self.is_big_model:
with open(self.path["export_regression_json"], "w") as f:
# remove "snr_ref" from self.config before saving.
d = copy.deepcopy(self.config)
d.pop('snr_ref', None)
d.pop('map_model_id', None)
# d.pop('hw_mode_on', None)
json.dump(d, f, indent=4, sort_keys=False, default=str)
def get_scaled_onnx_source(self, hw_mode):
""" Find the targeted onnx file by config for btm.
- Format: onnx/bie
- Optimization: scaled/wqbi
"""
model_format = futils.get_switch_value(self.config["compiler_piano"], "model_format", "bie")
model_opt = futils.get_switch_value(self.config["compiler_piano"], "model_optimize", "wqbi")
model_key = "kdp{}_{}_piano_{}".format(hw_mode, model_opt, model_format)
fn_knerex = self.map_onnx[model_key]
fn_json = "{}.json".format(fn_knerex)
dynasty_mode = "{}{}".format(hw_mode, fconsts.MODEL_RELEASE[model_opt])
# need to release this in toolchain
decomp_onnx = pathlib.Path(self.map_onnx[f"kdp{hw_mode}_decomp_piano_onnx"])
return pathlib.Path(fn_knerex), pathlib.Path(fn_json), dynasty_mode, decomp_onnx
def get_input_folders(self, input_nodes, first_input_folder):
"""Generate dictionary of input folders for knerex."""
if not os.path.exists(first_input_folder):
raise RegressionError("general/Missing input", self.model_id)
input_folders = {}
# at least one input
input_folders[input_nodes[0]] = first_input_folder
# if multi inputs
for i_name, this_name in enumerate(input_nodes[1:]):
# NOTE: verify multi input node folder
self.logger.info("Check input folder {}/{}: \"{}\". ".format(i_name + 2, len(input_nodes), this_name))
this_dir = "{}_{}".format(first_input_folder, i_name + 1)
input_folders[this_name] = this_dir
if not os.path.exists(this_dir):
self.logger.critical(
"MISSING input folder {}/{}: node \"{}\", input folder expect at \"{}\". "
.format(i_name + 2, len(input_nodes), this_name, this_dir))
raise RegressionError("general/Missing input", self.model_id)
return input_folders
def generate_knerex_config(self, *, hw_mode):
"""
Generate config json for knerex using template.
Settings include per regression / per model.
Output file:
* `updater_NNN.json` for platform `NNN`.
"""
input_nodes = self.io_nodes["input"]
fn_json, dir_input_1st = self.path[f"updater_{hw_mode}_json"], self.path["dir_knerex"]
fn_json.parent.mkdir(parents=True, exist_ok=True)
input_folders = self.get_input_folders(input_nodes, dir_input_1st)
conf = {}
# TODO: remove t, use keys from config["knerex"]
t = [
"verbose",
"percentile",
"same_scale",
"per_channel_radix",
"output_scale",
"output_radix",
"cpu_scale",
"cpu_radix",
"fixed_scale_mode",
"max_scale",
"data_analysis_threads",
"datapath_range_method",
"outlier_factor",
"bn_weight_pct",
"conv_weight_pct",
"num_input_samples",
"dump_level",
"datapath_bitwidth_mode",
"weight_bitwidth_mode",
"model_in_bitwidth_mode",
"model_out_bitwidth_mode",
"cpu_bitwidth_mode",
"datapath_mix_percentile",
"weight_mix_percentile",
"data_analysis_pct", # outliers
"need_additional_data_analysis_pct",
"additional_data_analysis_pcts",
"dynamic_range_based_on_bitwidth"
]
# copy knerex configs from config
for k in t:
conf[k] = self.config["knerex"][k]
input_shape = self.config["dynasty"]["input_shape"]
convert = {"onnx_shape": "1", "channel_last": "0"}
conf["shape_order"] = convert.get(input_shape, "1")
conf["type"] = fconsts.KNEREX_UPDATER_TYPE[hw_mode]
# TODELETE
# def get_test_config():
# # test_config.json for stc, but with some exceptions.
# if self.is_big_model or hw_mode in [520]:
# test_config = ""
# else:
# # for stc / mtc / etc
# test_config = self.path[f"json_hack_{hw_mode}"]
# bw_dp = self.config["knerex"]["datapath_bitwidth_mode"]
# if hw_mode in [720, 730] and bw_dp in ["int16"]:
# test_config = ""
# return test_config
# per model settings.
# input files for knerex
        # from 0.24.0 on, will only use the decomposed bie from the compiler frontend
conf["fn_origin_onnx"] = self.map_onnx[f"kdp{hw_mode}_opt_piano_bie"]
conf["test_config"] = ""
conf["user_config_json"] = self.path["user_config_json"]
conf["qat_config"] = self.path[f"qat_{hw_mode}_config_json"]
# temp files.
conf["fn_dp_analysis_piano"] = self.path[f"temp_dpa_piano_{hw_mode}"]
conf["fn_wt_analysis_piano"] = self.path[f"temp_wta_piano_{hw_mode}"]
# output
conf["outmodel"] = self.map_onnx[f"kdp{hw_mode}_scaled_piano_bie"]
# render the json file
template = self.jinja_env.get_template(f"updater_{hw_mode}.json")
output = template.render(input_nodes=input_nodes, input_folders=input_folders, conf=conf)
with open(fn_json, "w") as f:
f.write(output)
# check before finish
assert pathlib.Path(fn_json).exists(), f"failed to create {fn_json}"
@run_module(module_name="auto/check compiler output")
def load_compiler_dump(self, *, hw_mode):
"""Check the output of compiler / batch compiler.
The command.bin/etc had a prefix if generate by batch compiler
"""
module_name = f"kdp{hw_mode}/load compiler dump"
self.logger.info(f"{module_name}")
dir_out = self.path["compiler_piano_{}_out".format(hw_mode)]
self.compiler_output[hw_mode] = compiler.locate_compiler_dump(dir_out, hw_mode)
def load_ioinfo_520(self):
"""Load ioinfo from radix.json.
Will use knerex generated radix.json and shape.json.
"""
hw_mode = 520
module_name = f"kdp{hw_mode}/load_ioinfo"
self.logger.info(f"check {module_name}")
_, fn_knerex_json, _, _ = self.get_scaled_onnx_source(hw_mode)
with open(fn_knerex_json, "r") as f:
d_radix = json.load(f)
t = list(self.path[f"knerex_output_{hw_mode}"].glob("*kdp520*SnrShapeInfo.json"))
fn_json_shape = t[0]
with open(fn_json_shape, "r") as f:
d_shape = json.load(f)
ioinfo = futils.get_ioinfo_from_knerex_json(d_radix, d_shape)
return ioinfo
@run_module(module_name="auto/parse_ioinfo")
def load_compiler_ioinfo(self, *, hw_mode):
"""Parse `ioinfo.csv` yielded by compiler to determine input nodes shapes.
NOTE:
this method requires compiler ouptut, so call it after compiler.
This function will load the ioinfo from compiler output,
- load `ioinfo.json` in compier output folder
- save to `self.io_nodes`, which include
- input nodes shapes / data format.
- output nodes shapes / data format.
- cpu nodes.
This function will also find corresponding the dynasty dump for golden.
It need to decide:
- which dynasty mode output folder (related to knerex optimization)
- which format (fx or fl)
"""
assert hw_mode in self.config["hw_mode_on"], "hw_mode is: {}, not in hw_mode_on {}".format(hw_mode, self.config["hw_mode_on"])
module_name = f"kdp{hw_mode}/parse_ioinfo"
self.logger.info(f"{module_name}")
if hw_mode in [520]:
ioinfo = self.load_ioinfo_520()
else:
fn_ioinfo = self.compiler_output[hw_mode]["ioinfo_json"]
ioinfo = compiler.load_ioinfo_json(fn_ioinfo)
# TODO: patch dp_in_names for later reference
input_nodes = [a["name"] for a in ioinfo["input"]]
output_nodes = [a["name"] for a in ioinfo["output"]]
cpu_nodes = [] # TODO
if len(input_nodes) == 0:
self.logger.critical("Input nodes cannot be found")
if len(output_nodes) == 0:
self.logger.critical("Output nodes cannot be found")
# find the golden in dynasty for btm
_, _, dynasty_mode, _ = self.get_scaled_onnx_source(hw_mode)
p_dump = self.path["btm_dump"]
p_dynasty_dump = p_dump / "mode_{}_piano".format(dynasty_mode)
p_csim_dump = p_dump / f"csim_{hw_mode}"
p_pld_report = p_dump / "pld_report"
# ini file for csim btm dump. default is test_input.txt
self.path[f"csim_{hw_mode}_ini"] = p_csim_dump / f"run_csim_{hw_mode}.ini"
self.path[f"csim_{hw_mode}_ini_pld"] = p_csim_dump / f"run_csim_{hw_mode}.pld.ini"
# prepare dynasty golden
if hw_mode in [720, 530]:
# could be fx.txt or fl.txt
golden_txt_fns = []
for i_dp, info_o in enumerate(ioinfo["output"]):
fmt = info_o["data_format"]
# TODO: confirm with Kai
if fmt == "RAW_FLOAT":
fn_output = "layer_output_{}_fl.txt".format(info_o["name"])
else:
fn_output = "layer_output_{}_fx.txt".format(info_o["name"])
golden_txt_fns.append(fn_output)
else: # only fx txt
golden_txt_fns = ["layer_output_{}_fx.txt".format(a["name"]) for a in ioinfo["output"]]
p_dynasty_golden = [p_dynasty_dump / fn for fn in golden_txt_fns]
# record information for bit-true-match. this is related to which text_input
self.io_nodes[("btm_text_input", hw_mode)] = self.btm_txt
self.io_nodes[("btm_dynasty_mode", hw_mode)] = dynasty_mode
self.io_nodes[("btm_dynasty_path", hw_mode)] = p_dynasty_dump
self.io_nodes[("btm_dynasty_golden_txt_fn", hw_mode)] = golden_txt_fns
self.io_nodes[("btm_dynasty_golden_txt_path", hw_mode)] = p_dynasty_golden
self.io_nodes[("btm_csim_path", hw_mode)] = p_csim_dump
# need for dynasty / csim btm debug
self.io_nodes[("pld_report", hw_mode)] = p_pld_report
# general info
self.io_nodes[("ioinfo", hw_mode)] = ioinfo
self.io_nodes[("input_node", hw_mode)] = input_nodes
self.io_nodes[("out_node", hw_mode)] = output_nodes
self.io_nodes[("cpu_node", hw_mode)] = cpu_nodes
# save for reference but only internal regression
if self.config["path"]["internal"]:
self.model_fx_report[(f"kdp{hw_mode}/btm_dynasty_path")] = p_dynasty_dump
for i in range(self.config["nef"]["inference_count"]):
p_nef_dump = p_dump / "nef_{}_output_{}".format(hw_mode, i)
self.io_nodes[("btm_nef_path", hw_mode, i)] = p_nef_dump
p_nef_kneron_plus_dump = p_dump / "nef_kneron_plus_{}_output_{}".format(hw_mode, i)
self.io_nodes[("btm_nef_kneron_plus_path", hw_mode, i)] = p_nef_kneron_plus_dump
@run_module("auto/gen_csim_ini")
def generate_csim_ini(self, *, hw_mode):
"""
create .ini config for csim using jinja2 template
per 520/720/530/730/630.
CSIM 520 will not use this .ini config
CSIM 720/530/730/630 will use this .ini file directly
Input files:
* ioinfo.csv from compiler output.
* model files for 520/720/530/530:
* weight.bin
* command.bin
* setup.bin
* apb.npu
* model files for 540/730:
* model_NNN.kne
* input file for inference
* dynasty dumped input file, prepared by `data_convert`
* `output/results/FN_INPUT/model_520-wqbi_piano/layer_input_*.bin`
Output files:
* run_csim_NNN.ini
"""
self.logger.info(f"generating csim ini for {hw_mode}")
assert hw_mode in self.config["hw_mode_on"], "hw_mode is: {}, not in hw_mode_on {}".format(hw_mode, self.config["hw_mode_on"])
# for piano compiler output
p_compiler = pathlib.Path(self.path["compiler_piano_{}_out".format(hw_mode)])
p_csim_dump = self.io_nodes[("btm_csim_path", hw_mode)]
bin_pair = self.io_nodes[("btm_csim_in_bin", hw_mode)]
golden_txt = self.io_nodes[("btm_dynasty_golden_txt_path", hw_mode)]
# RTL-release need to set this to 3
dump_core_opt = self.config["csim"]["dump_core_opt"]
# generate ini for normal csim
template = self.jinja_env.get_template(f"run_csim_{hw_mode}.ini")
fn_ini = self.path["csim_{}_ini".format(hw_mode)]
csim.gen_csim_ini(bin_pair, p_compiler, hw_mode,
template=template,
fn_ini=fn_ini,
golden_txts=golden_txt,
dump_core_opt=dump_core_opt)
# function output
self.io_nodes[("btm_csim_in", hw_mode)] = [[p_csim_dump, fn_ini]]
# generate ini for pld csim
template_pld_dump = self.jinja_env.get_template(f"run_csim_{hw_mode}.pld.ini")
fn_ini_pld = self.path["csim_{}_ini_pld".format(hw_mode)]
csim.gen_csim_ini(bin_pair, p_compiler, hw_mode,
template=template_pld_dump,
fn_ini=fn_ini_pld,
golden_txts=golden_txt)
# function output
self.io_nodes[("btm_csim_in_pld", hw_mode)] = [[p_csim_dump, fn_ini_pld]]
@run_module(module_name="kdp520/convert_rgba")
def data_convert_520(self, *, hw_mode):
"""Convert input.txt pair to csim.bin. """
module_name = "kdp520/data_convert"
self.logger.info(f"check {module_name}")
# Generate input bins for csim
        # previously this used self.io_nodes["input"], which has the same order as the onnx input nodes
p_csim_dump = self.io_nodes[("btm_csim_path", hw_mode)]
p_csim_dump.mkdir(exist_ok=True, parents=True)
info_in = self.io_nodes[("ioinfo", hw_mode)]["input"]
if self.is_big_model:
list_input_bin = csim.txt2bin_rgba(self.list_input_btm, info_in, p_csim_dump)
else: # only stc, no mtc
list_input_bin = csim.txt2bin_seq(self.list_input_btm, info_in, p_csim_dump)
# assert list_input_bin.keys() == [0]
# function output
self.io_nodes[("btm_csim_in_bin", hw_mode)] = list_input_bin[0]
# TODO: why we need list_input_bin_rtl?
# TODO: if compiler specify RAW_FLOAT, need to use dynasty/_fl.bin?
return
@run_module(module_name="auto/data_convert")
def data_convert(self, *, hw_mode):
"""Convert input.txt pair to csim.bin.
* no supporting 520.
Input files:
* dynasty input text files.
"""
module_name = f"kdp{hw_mode}/data_convert"
self.logger.info(f"check {module_name}")
# Get input bins for csim
        # previously this used self.io_nodes["input"], which has the same order as the onnx input nodes,
        # but the compiler may use a different order; refer to ioinfo.csv
        # NOTE: files referred to in the ini are written as paths relative to the ini (a.k.a. the output folder)
p_csim_dump = self.io_nodes[("btm_csim_path", hw_mode)]
info_in = self.io_nodes[("ioinfo", hw_mode)]["input"]
csim_bin_sqt = csim.txt2bin_seq(self.list_input_btm, info_in, p_csim_dump)
list_input_bin, cmds = csim.data_convert(csim_bin_sqt,
info_in,
p_out=p_csim_dump)
self.save_command(module_name, "\n".join(cmds))
# assert list_input_bin.keys() == [0]
# function output
self.io_nodes[("btm_csim_in_bin", hw_mode)] = list_input_bin[0]
# TODO: why we need list_input_bin_rtl?
# TODO: if compiler specify RAW_FLOAT, need to use dynasty/_fl.bin?
return
def find_multiple_input(self, fn_input0, verify_exist=True):
"""Look for (possible) multiple INPUT NODES for this MODEL.
give 1st input image name, give a list with whole input set (might be 1 or more.)
TODO: need refactor into utils
"""
fn_base = fn_input0.name
p_base = fn_input0.parent.parent
        # NOTE: str.rstrip strips a set of characters, not a suffix; remove a literal trailing "_0"
        path_prefix = re.sub(r"_0$", "", fn_input0.parent.name)
if verify_exist:
assert fn_input0.exists()
list_inputs = [str(fn_input0)]
input_nodes, _, _ = self.onnx_infos["origin"].get_ioinfo()
        # NOTE: currently determined by searching input folders.
# TODO: verify with onnx input number
for i_dir in range(1, len(input_nodes)):
next_input = p_base / f"{path_prefix}_{i_dir}" / fn_base
if verify_exist and not next_input.exists():
raise RegressionError("general/Missing input", self.model_id, msg="missing input: {}".format(next_input))
list_inputs.append(str(next_input))
return list_inputs
def est_memory_dynasty_fx(self):
"""
        Estimate how much memory is needed for dynasty-fx inference.
"""
# only some need to estimate
platforms_large_memory = [520, 720]
plts = [hw_mode for hw_mode in self.config["hw_mode_on"] if hw_mode in platforms_large_memory]
if len(plts) == 0:
return
est_avl_kB = futils.estimate_mem_available()
# TODO: what if multi-thread?
if self.est_mac_kB > est_avl_kB:
self.logger.error(f"WARNING: Estimated max memory need for dynasty fx {plts} is {self.est_mac_kB} kB.")
self.logger.error(f" Current available memory is {est_avl_kB} kB.")
@run_module(module_name="general/invalid_onnx")
def check_onnx_valid(self):
"""Report if this onnx is invalid
"""
if not self.onnx_infos["origin"].is_valid_onnx():
raise RegressionError("general/invalid_onnx", self.model_id)
def run_flow(self):
"""The main function for the kneron internal quantization flow.
Here it controls the sequence of module execution.
`config` defines which module to run.
For complicated process, e.g., bias adjust,
you can define multiple configs and call `run_flow(conf1)` and `run_flow(conf2)`, etc
"""
        # TODO: better flow control per platform, i.e., one platform failing will not affect the others
# some shortcuts
do_dynasty = self.config["module_run"]["piano_dynasty"]
do_csim = self.config["module_run"]["csim"]
do_dongle = self.config["module_run"]["run_nef_kneron_plus"]
self.logger.setLevel(self.config["regression"]["logging_level"])
# compiler frontend
if self.config["module_run"]["only_ip_evaluator"] or self.config["module_run"]["piano_knerex"]:
for hw_mode in self.config["hw_mode_on"]:
                # generate cpu node list and node mapping
self.run_compiler_frontend(hw_mode=hw_mode)
        # quantization
if self.config["module_run"]["piano_knerex"]:
for hw_mode in self.config["hw_mode_on"]:
# generate quantized model
self.generate_knerex_config(hw_mode=hw_mode)
self.run_knerex(hw_mode=hw_mode)
if self.config["compiler_piano"]["convert_enc"]:
self.convert_enc(hw_mode=hw_mode)
        # generate nef for hardware
if self.config["module_run"]["compiler_piano"]:
for hw_mode in self.config["hw_mode_on"]:
p_out = pathlib.Path(self.path["compiler_piano_{}_out".format(hw_mode)])
self.generate_nef(hw_mode=hw_mode, p_nef=p_out)
self.clean_opt()
if self.config["layer_statistics"]["weight_stats"]:
self.load_weight_bin_stats()
if do_dynasty:
if self.is_big_model:
# provide some early warning for dynasty memory usage
self.est_memory_dynasty_fx()
self.dir_output_list = self.run_dynasty_inference()
else:
            # if no dynasty run is scheduled, search the results folder for existing dynasty dumps.
dir_results = self.path["dir_output"] / "results"
self.dir_output_list = list(dir_results.glob("*.txt"))
if self.config["module_run"]["tflite"]:
self.run_tflite(self.list_input_simulator)
if self.config["module_run"]["onnxruntime"]:
self.run_onnxruntime(self.list_input_simulator)
if self.config["module_run"]["snr_calculation"]:
# for SNR of dynasty v2 calling.
self.run_dynasty_snr(self.dir_output_list)
if self.config["dynasty"]["regression_input"] == "all":
                # combine snr into the overall report
self.generate_snr_report()
self.clean_dynasty_output(self.dir_output_list)
# self.path["snr_csv"]
# snr collection to regression report
# redundant to verify_snr. TODELETE this function
# self.load_dynasty_snr_output()
if not self.config["path"]["internal"]:
# used by customer in toolchain
self.convert_snr_report()
for hw_mode in self.config["hw_mode_on"]:
self.verify_snr(hw_mode=hw_mode)
if self.config["module_run"]["verify_decomp_snr"]:
for hw_mode in self.config["hw_mode_on"]:
self.verify_decomp_snr(hw_mode=hw_mode)
if self.config["module_run"]["any_bi_enable"]:
self.verify_bias_adjust_performance()
if self.config["module_run"]["calculate_layer_statistics"]:
self.load_layer_statistics()
# PREPARE for csim/nef btm
if do_csim or do_dongle:
            # NOTE: load the ioinfo saved by the last run (which is supposed to exist)
for hw_mode in self.config["hw_mode_on"]:
self.load_compiler_dump(hw_mode=hw_mode)
self.load_compiler_ioinfo(hw_mode=hw_mode)
if hw_mode not in [520]:
# convert dynasty input for csim. no need for 520
# NOTE: in regression, we will only convert "test_input.txt" by default
self.data_convert(hw_mode=hw_mode)
else:
self.data_convert_520(hw_mode=hw_mode)
if do_csim:
for hw_mode in self.config["hw_mode_on"]:
if hw_mode == 520:
self.run_csim_520()
else:
self.generate_csim_ini(hw_mode=hw_mode)
self.run_csim(hw_mode=hw_mode)
self.btm_dyn_csim(hw_mode=hw_mode)
if self.config["module_run"]["csim_ci"] and hw_mode not in [520]:
self.run_csim_ci(hw_mode=hw_mode)
if self.config["module_run"]["rtl_cmd_check"] and hw_mode not in [520, 720]:
self.check_rtl_cmd(hw_mode=hw_mode)
if do_dongle:
inference_count = self.config["nef"]["inference_count"]
hw_dongle_available = [520, 720, 630] # 530
for hw_mode in hw_dongle_available:
if hw_mode in self.config["hw_mode_on"]:
self.run_nef_kneron_plus(hw_mode=hw_mode, number_try=inference_count)
for i in range(inference_count):
self.btm_csim_nef(hw_mode=hw_mode, number_try=i)
# self.btm_dyn_nef_kneron_plus(hw_mode=hw_mode, number_try=i)
self.module_status["general"]["Success"] = True
self.gen_fx_report()
self.post_clean_up()
        # model_fx_release is a list of files to be released after gen_fx_model
return self.model_fx_release
@staticmethod
def load_compiler_bie_json(fn_bie, hw_mode):
"""Load js_fns from compiler frontend generated bie. """
t1_j = util_lib.load_zip_jsons(fn_bie)
raw_reports = {}
raw_reports["fe2origin"] = t1_j["node_mapping_opt_fe_to_origin.json"]
raw_reports["fe2be"] = t1_j["node_mapping_opt_fe_to_opt_be.json"]
raw_reports["ori_node_type"] = t1_j["node_types_origin.json"]
if hw_mode not in [520]:
# not available for 520
raw_reports["fe_node_type"] = t1_j["node_types_opt_fe.json"]
raw_reports["be_node_format"] = t1_j["node_format_opt_be.json"]
return raw_reports
@staticmethod
def load_knerex_bie_json(bie_release):
"""Load the jsons from knerex bie2 for fx report."""
        # we assume a bie will always be generated; the bie could be scaled, wqbi, ... optimized
        # this step will not work if knerex did not run,
        # for example, in mode 0 (ip-eval-only)
        # TODELETE: temp check. this should be a bie.
assert not bie_release.name.endswith(".onnx"), f"should not release onnx: {bie_release}"
t2_j = util_lib.load_zip_jsons(bie_release)
d = {}
for k, v in {
"node_type": "model_info.json",
"node_shape": "shape_info.json",
"node_radix": "radix_info.json"
}.items():
d[k] = t2_j[v]
return d
def load_compiler_ip_eval_info(self, hw_mode):
"""Load json from compiler backend (w iip eval) info."""
d = {} # to save results
p_compiler_out = self.path["compiler_piano_{}_out".format(hw_mode)]
js_fns = {} # file list
js_fns["be_node_analysis"] = p_compiler_out / "BE_node_evaluator_result.json"
        # load all json report files into d:
for k, p in js_fns.items():
if p.exists():
with open(p, "r") as f:
d[k] = json.load(f)
if d[k] is None:
raise RegressionError(f"kdp{hw_mode}/compiler", self.model_id, msg=f"{p.name} is empty.")
return d
@staticmethod
def get_node_type(raw_reports, node_fe, nodes_origin):
"""Find the type (NPU/CPU/FUSED) for node_fe."""
try:
# get the info from knerex first
node_type = raw_reports["node_type"][node_fe]["Mode"]
        except Exception:
try:
node_type = raw_reports["fe_node_type"][node_fe]
            except Exception:
try:
                    # for 520, it falls back to origin_node_type
# BUG: just use the first origin node
node_type = raw_reports["ori_node_type"][nodes_origin[0]]
                except Exception:
# print(raw_reports.keys())
node_type = "FUSED"
if node_type == "NONE":
node_type = "FUSED"
return node_type
def load_snr_report(self, hw_mode, raw_reports):
"""Load snr report for hw_mode."""
try:
if not self.path["snr_csv"].exists():
return {}, []
ref_name = "mode_{}_piano".format(self.config["snr"]["ref"][hw_mode])
deg_name = "mode_{}_piano".format(self.config["snr"]["deg"][hw_mode])
snr_types = self.config["snr"]["report_snr_col"]
snr_result = get_case_output(self.path["snr_csv"], ref_mode=ref_name, deg_mode=deg_name, col_snr=snr_types, out_dp="all")
d_snr = snr_result.droplevel(["Category", "Model", "Mode_deg", "Mode_ref"], axis=0).to_dict("index")
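            # d_snr now maps each datapath name to a dict of snr columns,
            # e.g. {"conv1_out": {"snr": 42.0, ...}} (illustrative names/values)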
# HACK: special process for output node. extra copy for easier lookup
for dp_out in raw_reports["node_shape"]["dp_out"]:
# NOTE: dp_out in dynasty dump / snr need to be called with clean_name
dp_out = futils.clean_name(dp_out)
dpo2 = f"output_{dp_out}"
if (dp_out not in d_snr) and (dpo2 in d_snr):
d_snr[dp_out] = d_snr[dpo2]
return d_snr, snr_result.columns
        except Exception:
return {}, []
@staticmethod
def load_fe_nodes(raw_reports):
if "node_shape" in raw_reports:
nodes_decomp, _, node_decomp2dp, _, _, _, _, _, _, _ = futils.parse_shape_info(raw_reports["node_shape"])
sort_on_cmd_idx = False
else:
# detour for ip eval. no knerex results
sort_on_cmd_idx = True
nodes_decomp = list(raw_reports["fe2origin"].keys())
node_decomp2dp = {}
return nodes_decomp, node_decomp2dp, sort_on_cmd_idx
def load_raw_json_reports(self, hw_mode):
"""Collect raw json from compiler frontend / knerex / compiler ip eval."""
raw_reports = {}
# loaded json from compiler frontend bie
f_bie = self.map_onnx[f"kdp{hw_mode}_opt_piano_bie"]
d = self.load_compiler_bie_json(f_bie, hw_mode)
raw_reports.update(d)
        # load the jsons from the knerex-generated bie
        # we assume a bie will always be generated; it could be scaled, wqbi, ... optimized
        # this step will not work if knerex did not run,
        # for example, it is not available in mode 0 (ip-eval-only)
k = f"kdp{hw_mode}/bie"
if k in self.model_fx_release:
bie_release = self.model_fx_release[k]
d = self.load_knerex_bie_json(bie_release)
raw_reports.update(d)
        # load hw info per node (from the ip evaluator)
        # actually it is the backend node evaluation
d = self.load_compiler_ip_eval_info(hw_mode)
raw_reports.update(d)
return raw_reports
@staticmethod
def record2df_fx(temp_rec, sort_on_cmd_idx, snr_cols):
"""Convert records to dataframe for fx report."""
# some columns may have NaN, not possible to use .astype
rep_dtld = pd.DataFrame.from_records(temp_rec)
# clean up. remove columns which are all None, all 0, all N/A
cols_to_drop = [
col for col in rep_dtld.columns
if all(rep_dtld[col].isna()) or all(
rep_dtld[col] == 'N/A') or all(rep_dtld[col] == 0)
]
rep_dtld.drop(columns=cols_to_drop, inplace=True)
# in case ip-eval-only
if sort_on_cmd_idx and "CMD_node_idx" in rep_dtld.columns:
rep_dtld.loc[rep_dtld['CMD_node_idx'].isna(), 'type'] = 'FUSED'
rep_dtld['CMD_node_idx'] = pd.to_numeric(rep_dtld['CMD_node_idx'], errors='coerce').astype('Int64')
rep_dtld.sort_values(by='CMD_node_idx', na_position='last', inplace=True)
# move snr columns to front of df
for name_col in snr_cols:
if name_col in rep_dtld.columns:
t_column = rep_dtld.pop(name_col)
rep_dtld.insert(1, name_col, t_column)
return rep_dtld
@run_module(module_name="general/gen_fx_report")
def gen_fx_report(self):
"""Generate the fx report for quantization process.
The report will contain:
- ModelInfo.json from knerex dump.
- bitwidth info
- snr info
- hw info from ip_evaluator
"""
detailed_reports = OrderedDict()
for hw_mode in self.config["hw_mode_on"]:
###################################################################################
# collect report files
raw_reports = self.load_raw_json_reports(hw_mode)
fmt_col_cvrt = {"inputs": "in_fmt", "outputs": "out_fmt"}
d_snr, snr_cols = self.load_snr_report(hw_mode, raw_reports)
nodes_decomp, node_decomp2dp, sort_on_cmd_idx = self.load_fe_nodes(raw_reports)
###################################################################################
# now combine all into a detailed report
temp_rec = []
for node_fe in nodes_decomp:
# node frontend is the KEY for table
# find all nodes backend that include this node_fe
if node_fe not in raw_reports["fe2be"]:
nodes_be = [None]
else:
nodes_be = raw_reports["fe2be"][node_fe]
if len(nodes_be) == 0:
nodes_be = [None]
# find all nodes origin
nodes_origin = raw_reports["fe2origin"].get(node_fe, [None])
# find node type
node_type = self.get_node_type(raw_reports, node_fe, nodes_origin)
# snr info, if available. this is per dp
# TODO: currently we assume one fe -> one dp. but soon we need to support multiple output
try:
this_dp = futils.clean_name(node_decomp2dp.get(node_fe, [None])[0])
this_snr = d_snr.get(this_dp, None)
                except Exception:
this_snr = None
# get bitwidth info
try:
bw_in = raw_reports["node_radix"][node_fe].get("input_datapath_bitwidth", "N/A")
bw_out = raw_reports["node_radix"][node_fe].get("output_datapath_bitwidth", "N/A")
bw_wt = raw_reports["node_radix"][node_fe].get("weight_bitwidth", "N/A")
add_bw = True
                except Exception:
add_bw = False
for node_be in nodes_be:
# loop through backend nodes
for node_org in nodes_origin:
# first, node mapping
temp_d = OrderedDict()
temp_d["node"] = node_fe
temp_d["node origin"] = node_org
temp_d["type"] = node_type
if this_snr:
temp_d.update(this_snr)
# insert bw info
if add_bw:
temp_d["bw in"] = bw_in
temp_d["bw out"] = bw_out
temp_d["bw weight"] = bw_wt
# backend node ip evaluate
skip_be = False
if len(temp_rec) > 0 and "node backend" in temp_rec[-1]:
i = -1
last_node_be = ""
while last_node_be == "":
last_node_be = temp_rec[i]["node backend"]
i -= 1
if (not sort_on_cmd_idx) and node_be == last_node_be:
# if full run and
# if same as above, put empty or ↑
skip_be = True
# full run
temp_d["node backend"] = ""
if "be_node_analysis" in raw_reports and node_be in raw_reports["be_node_analysis"]:
for k in raw_reports["be_node_analysis"][node_be]:
temp_d[k] = ""
if "be_node_format" in raw_reports and node_be in raw_reports["be_node_format"]:
for k in raw_reports["be_node_format"][node_be]:
temp_d[fmt_col_cvrt[k]] = ""
if not skip_be:
temp_d["node backend"] = node_be
if "be_node_analysis" in raw_reports and node_be in raw_reports["be_node_analysis"]:
# NOTE: no node analysis for 520
temp_d.update(raw_reports["be_node_analysis"][node_be])
if "be_node_format" in raw_reports and node_be in raw_reports["be_node_format"]:
iofmt = raw_reports["be_node_format"][node_be]
for k1, v1 in iofmt.items():
temp_d[fmt_col_cvrt[k1]] = futils.pprint_dict(v1)
temp_rec.append(temp_d)
detailed_reports[hw_mode] = self.record2df_fx(temp_rec, sort_on_cmd_idx, snr_cols)
        # now collect the overall summary
self.model_fx_release["gen fx model report"] = self.path["model_fx_html"]
self.model_fx_release["gen fx model json"] = self.path["model_fx_json"]
for k, v in self.model_fx_release.items():
# those files will be moved to release folder. so just print file name
self.model_fx_report[k] = v.name
df_summary = pd.DataFrame.from_dict(self.model_fx_report, orient="index", columns=["info"])
# we need this file for app_release and gen_fx_model call
with open(self.path["model_fx_json"], "w") as f:
json.dump(self.model_fx_report, f, indent=4, sort_keys=False, default=str)
# write multi-dataframe to html
with open(self.path["model_fx_html"], 'w') as f:
f.write('<h1>Summary</h1><br><hr>')
f.write(f"{df_summary.to_html(border=2)}<br><hr>")
for k, df in detailed_reports.items():
f.write(f"<h2>kdp{k}</h2><br><hr>")
f.write(f"{df.to_html(border=1)}<br><hr>")
def save_summary(self):
"""Save summary html only, when submoudles failed.
NOTE: this method will be called in run_single_case.
Not supposed to call in run_flow here.
"""
        # now collect the overall summary
self.model_fx_release["gen fx model report"] = self.path["model_fx_html"]
self.model_fx_release["gen fx model json"] = self.path["model_fx_json"]
for k, v in self.model_fx_release.items():
# those files will be moved to release folder. so just print file name
self.model_fx_report[k] = v.name
# we need this file for app_release and gen_fx_model call
with open(self.path["model_fx_json"], "w") as f:
json.dump(self.model_fx_report, f, indent=4, sort_keys=False, default=str)
df_summary = pd.DataFrame.from_dict(self.model_fx_report, orient="index", columns=["info"])
# write multi-dataframe to html
with open(self.path["model_fx_html"], 'w') as f:
f.write('<h1>Summary</h1><br><hr>')
f.write(f"{df_summary.to_html(border=2)}<br><hr>")
        # even if the case failed, we still try to provide the summary report.
return self.model_fx_release
@run_module(module_name="auto/csim_ci")
def run_csim_ci(self, *, hw_mode):
"""
        Internal use only, for csim release.
        Only keeps the files needed by csim ci.
"""
model_dir = self.model_path
target_dir = pathlib.Path("{}/{}/{}".format(self.config["path"][f"csim_{hw_mode}_ci_dir"], model_dir.parent.name, model_dir.name))
target_output_dir = pathlib.Path("{}/{}/{}/output/".format(self.config["path"][f"csim_{hw_mode}_ci_dir"], model_dir.parent.name, model_dir.name))
compiler_dir = f"{self.model_path}/output/compiler_piano_output_{hw_mode}/"
target_compiler_dir = pathlib.Path("{}/{}/{}/output/compiler_piano_output_{}/".format(self.config["path"][f"csim_{hw_mode}_ci_dir"], model_dir.parent.name, model_dir.name, hw_mode))
dynasty_dump_dir = f"{self.model_path}/output/results/{self.btm_txt}/mode_{hw_mode}_piano/"
target_dynasty_dump_dir = pathlib.Path("{}/{}/{}/output/results/{}/mode_{}_piano/".format(self.config["path"][f"csim_{hw_mode}_ci_dir"], model_dir.parent.name, model_dir.name, self.btm_txt, hw_mode))
if os.path.exists(target_dir):
shutil.rmtree(target_dir)
shutil.copytree(dynasty_dump_dir, target_dynasty_dump_dir)
shutil.copytree(compiler_dir, target_compiler_dir)
combine_cmd = f"cp -r {model_dir}/output/run_csim_{hw_mode}.ini {target_output_dir}"
cp = futils.run_bash_script(combine_cmd)
if cp.returncode != 0:
raise RegressionError(f"kdp{hw_mode}/csim ci", self.model_id, msg=f"Err: {cp.returncode}")
@run_module(module_name="auto/rtl_cmd_check")
def check_rtl_cmd(self, *, hw_mode):
"""compare command.bin inst.hex
# Usage: python3 ./rtlCmdCmpBinTxt.py command.bin inst.hex.opt
# TODO: check who will use this.
"""
        # TODO: link_bin has been removed.
raise NotImplementedError()
rtl_cmd_cmp = self.config["path"]["binary"]["csim"]["rtl_cmd_cmp"]
link_bin = self.config["path"]["binary"]["compiler"]["link_bin"]
compile_and_gen_conv_all = self.config["path"]["binary"]["compiler"]["compile_and_gen_conv_all"]
dir_rtl = "{}/rtl".format(self.model_path)
dir_rtl_cmd_cmp = pathlib.Path("{}/rtl/cmd_cmp".format(self.model_path))
inst_hex_opt = "{}/output.rtl.{}.testcase/cmd_cmp/inst.hex.opt".format(dir_rtl_cmd_cmp, hw_mode)
model_output_dir = "{}/output/".format(self.model_path)
if dir_rtl_cmd_cmp.exists():
shutil.rmtree(dir_rtl_cmd_cmp)
pathlib.Path(dir_rtl_cmd_cmp).mkdir(mode=0o770, parents=True, exist_ok=True)
cp_case_for_rtl_gen = "cp -r {} {}".format(model_output_dir, dir_rtl_cmd_cmp)
subprocess.run(cp_case_for_rtl_gen, shell=True, executable="/bin/bash", check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
compiler_bin = self.config["path"]["binary"]["compiler"]["compiler"]
if self.is_big_model:
gen_rtl_case_command = "pushd {} > /dev/null && {} {}; {} {} {} model_opt && popd > /dev/null".format(dir_rtl_cmd_cmp, link_bin, compiler_bin, compile_and_gen_conv_all, dir_rtl, hw_mode)
elif self.is_multi_layer:
gen_rtl_case_command = "pushd {} > /dev/null && {} {}; {} {} {} multi && popd > /dev/null".format(dir_rtl_cmd_cmp, link_bin, compiler_bin, compile_and_gen_conv_all, dir_rtl, hw_mode)
elif self.is_single_layer:
gen_rtl_case_command = "pushd {} > /dev/null && {} {}; {} {} {} single && popd > /dev/null".format(dir_rtl_cmd_cmp, link_bin, compiler_bin, compile_and_gen_conv_all, dir_rtl, hw_mode)
subprocess.run(gen_rtl_case_command, shell=True, executable="/bin/bash", check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
cmd_cmp_command = "{} {}/output/compiler_piano_output_{}/command.bin {}".format(rtl_cmd_cmp, self.model_path, hw_mode, inst_hex_opt)
subprocess.run(cmd_cmp_command, shell=True, executable="/bin/bash", check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
@run_module(module_name="auto/verify_decomp_snr")
def verify_decomp_snr(self, *, hw_mode):
"""
should this be combined into snr_calculate?
"""
        snr_min = 80  # SNR must be larger than 80dB
df = pd.read_csv(self.fn_report, index_col=["ref", "deg", "layer"])
out_layer_names = set(df.index.get_level_values("layer"))
deg_modes = set(df.index.get_level_values("deg"))
pairs = []
mode_ref = "mode_float_piano"
mode_deg = "mode_{}decomp_piano".format(self.config["snr"]["deg"][hw_mode])
if mode_deg in deg_modes:
            # check that corresponding SNR results exist
for out_name in out_layer_names:
pairs.append((mode_ref, mode_deg, out_name))
# pairs are SNR we want to verify
snr_name = "SNR_With_Mean"
# TODO: put this into columns. NOT using assert
        for i_deg in pairs:
            assert df.loc[i_deg, snr_name] > snr_min, f"decomp SNR {df.loc[i_deg, snr_name]:.1f}dB <= {snr_min}dB for {i_deg}"
@run_module(module_name="auto/verify_snr")
def verify_snr(self, *, hw_mode):
"""Quick check on model snr reach threshold
After snr_calculation, the snr_per_layer.csv is generated.
The snr_report.csv was extract from per_layer.csv which include output nodes only.
This function is to pick one or both snr columns from snr_report.csv
according to settings.
TODO:
- should this be combined into snr_calculate?
it used to work for multi platform/hw_mode at same time
removed to simplify
"""
        if self.is_big_model:
            snr_min = 10  # a big model must be larger than 10dB
        else:
            snr_min = 20  # a layer must be larger than 20dB
df = pd.read_csv(self.fn_report, index_col=["ref", "deg", "layer"])
out_layer_names = set(df.index.get_level_values("layer"))
deg_modes = set(df.index.get_level_values("deg"))
pairs = []
mode_ref = "mode_{}_piano".format(self.config["snr"]["ref"][hw_mode])
mode_deg = "mode_{}_piano".format(self.config["snr"]["deg"][hw_mode])
if mode_deg in deg_modes:
            # check that corresponding SNR results exist
for out_name in out_layer_names:
pairs.append((mode_ref, mode_deg, out_name))
# pairs are SNR we want to verify
snr_names = self.config["snr"]["report_snr_col"]
for snr_name in snr_names:
details = []
for i_deg in pairs:
# per output
this_snr = df.loc[i_deg, snr_name]
if this_snr < snr_min:
prefix = "⋖T:"
else:
prefix = "⋗T:"
msg = f"{prefix} {this_snr:5.1f}dB ({i_deg[2]})"
details.append(msg)
signal("data_sender").send((self.model_id, f"kdp{hw_mode}/{snr_name} (T={snr_min:.0f}dB)", "//".join(sorted(details))))
@run_module(module_name="general/verify_bias_adjust")
def verify_bias_adjust_performance(self):
"""this verify step is to report on module success/fail in flow report.
bias adjust performance detailed compare report are generated in during regression.py:
snr_calculator.py/gather_all_bi_improve
"""
df = pd.read_csv(self.fn_report, index_col=["ref", "deg", "layer"])
out_layer_names = set(df.index.get_level_values("layer"))
ref_modes = set(df.index.get_level_values("ref"))
deg_modes = set(df.index.get_level_values("deg"))
pairs = []
for out_name in out_layer_names:
for comp, (ref, deg1, deg2) in fconsts.SNR_BI_IMPROVE.items():
mode_ref = "mode_{}_piano".format(ref)
mode_deg1 = "mode_{}_piano".format(deg1)
mode_deg2 = "mode_{}_piano".format(deg2)
if mode_deg1 in deg_modes and mode_deg2 in deg_modes and mode_ref in ref_modes:
# only if all three modes are running.
pairs.append(((mode_ref, mode_deg1, out_name), (mode_ref, mode_deg2, out_name)))
snr_name = "SNR_With_Mean"
for i_ref, i_deg in pairs:
improve = df.loc[i_deg, snr_name] - df.loc[i_ref, snr_name]
self.logger.info(
"Bias Adj improved = {} db = {} - {}. {}, {}".format(
improve, df.loc[i_deg, snr_name], df.loc[i_ref, snr_name],
i_deg, self.path["dir_output"]))
# TODO: just send the improve to some column. platform independent?
# TODO: remove run_module for this function
if improve < -0.5:
            # Don't use assert here; it would suppress the compiler/csim steps after it
            self.logger.error(f" ATTENTION: Bias adjust snr dropped by {improve}")
def load_weight_bin_stats(self):
        # NOTE: may run for only some of the modes in hw_mode_on
modes_on = self.config["hw_mode_on"]
for mode in modes_on:
compiler_output_path = self.path["dir_output"] / "compiler_{}".format(mode)
weight_bin_path = compiler_output_path / "weight.bin"
if os.path.exists(weight_bin_path):
get_weight_bin_stats(weight_bin_path, do_tile_analysis=self.config["layer_statistics"]["tile_analysis"])
else:
all_weight_bins = list(compiler_output_path.glob("**/*weight.bin"))
for subg_weight_bin in all_weight_bins:
subg_index = subg_weight_bin.parent.name
if subg_weight_bin.stat().st_size > 0:
get_weight_bin_stats(
str(subg_weight_bin),
subg_index,
do_tile_analysis=self.config["layer_statistics"]
["tile_analysis"])
return
@run_module("auto/convert_enc")
def convert_enc(self, *, hw_mode):
"""Encrypt select onnx of given platform and otimized level"""
model_convertor_bin = self.config["path"]["binary"]["compiler"]["model_converter"]
model_optized_type = self.config["compiler_piano"]["model_optimize"]
if model_optized_type == "scaled":
optimized_onnx = self.model_path / "output" / "knerex_{}".format(hw_mode) / "{}.kdp{}.{}.onnx".format(self.model_name, hw_mode, "scaled.quan")
assert optimized_onnx.exists(), "knerex opt onnx is scaled onnx, need to convert enc based on wq onnx, but wq onnx does not exist!!!"
elif model_optized_type == "wqbi":
optimized_onnx = self.model_path / "output" / "knerex_{}".format(hw_mode) / "{}.kdp{}.{}.onnx".format(self.model_name, hw_mode, "scaled.quan.wqbi")
assert optimized_onnx.exists(), "knerex opt onnx is wqbi onnx, but wqbi onnx does not exist!!!"
command = f"{model_convertor_bin} {optimized_onnx} {optimized_onnx}.enc > /dev/null"
cp = futils.run_bash_script(command, do_echo=True, fail_then_exit=True)
module_name = f"kdp{hw_mode}/convert_enc"
self.save_command(module_name, command)
return
def load_layer_statistics(self, base_dump="results"):
"""
collect some analysis/statistics on dynasty per layer dump/
"""
do_per_channel = self.config["layer_statistics"]["per_channel"]
do_difference_matrix = self.config["layer_statistics"]["do_difference_matrix"]
hw_code = self.config["hw_mode_on"][0]
dynasty_output_path = self.path["dir_output"] / base_dump
do_float = self.config["layer_statistics"]["do_float"]
stat_params = self.config["layer_statistics"]["params"]
no_plot = self.config["layer_statistics"]["no_plot"]
mode_list = self.config["layer_statistics"]["mode_on"]
self.logger.info("generating layer statistics, could be time consuming")
calculate_statistics(dynasty_output_path,
hw_code,
mode_list,
do_per_channel=do_per_channel,
do_diff_stat=do_difference_matrix,
do_float=do_float,
stat_params=stat_params,
no_plot=no_plot)
return
@run_module(module_name="general/tflite")
def run_tflite(self, input_list, base_dump="results"):
"""Inference with tflite and dump all layer float/fix result."""
module_name = "general/tflite"
tflite_dir = self.model_path / "input" / "{}.tflite".format(self.model_name)
tflite_dump_exec = self.config["path"]["binary"]["tflite"]["dump.py"]
        # TODO: multi-thread
# TODO: call python function?
# TODO: why called mode_tflite_float_noise?
for input_path in input_list:
            # DEBUG: input_path may now be a list of paths, for multi-input cases!!!
if "quant" in self.model_name:
out_dir = "{}/{}/{}/mode_tflite_fix_noise/".format(self.path["dir_output"], base_dump, input_path.name)
else:
out_dir = "{}/{}/{}/mode_tflite_float_noise/".format(self.path["dir_output"], base_dump, input_path.name)
pathlib.Path(out_dir).mkdir(mode=0o770, parents=True, exist_ok=True)
command = "python3 {} -o {} -i {} -t {} -l {}".format(tflite_dump_exec, out_dir, input_path, tflite_dir, "True")
self.save_command(module_name, command)
cp = futils.run_bash_script(command)
if cp.returncode != 0:
raise RegressionError("general/tflite", self.model_id, msg=f"Err: {cp.returncode}")
return
@run_module(module_name="general/onnxruntime")
def run_onnxruntime(self, input_list, base_dump="results"):
"""Inference with onnxruntime and dump final layer float result."""
module_name = "general/onnxruntime"
onnxruntime_dump_exec = self.config["path"]["binary"]["tflite"]["onnxruntime.py"]
onnx_dir = self.map_onnx["origin"]
        # TODO: multi-thread
# TODO: call python function?
# TODO: why called mode_onnxruntime_noise?
for input_path in input_list:
            # DEBUG: input_path may now be a list of paths, for multi-input cases!!!
out_dir = pathlib.Path("{}/{}/{}/mode_onnxruntime_noise/".format(self.path["dir_output"], base_dump, input_path.name))
out_dir.mkdir(parents=True, exist_ok=True)
command = "python3 {} -out {} -in {} -onnx {}".format(onnxruntime_dump_exec, out_dir, input_path, onnx_dir)
self.save_command(module_name, command)
cp = futils.run_bash_script(command)
if cp.returncode != 0:
raise RegressionError("general/onnxruntime", self.model_id, msg=f"Err: {cp.returncode}")
return
@run_module(module_name="general/snr cal")
def run_dynasty_snr(self, dir_output_list):
"""function to calculate snr for each input image
currently calculate when all input x mode done.
TODO: calculater per input file, after all modes done
"""
pc = "--pc" if self.config["snr"]["per_channel"] else ""
bin_snr = fconsts.P_FLOW / "snr_calculator_v2.py"
self.logger.info("calculating SNR for {} outputs.".format(len(dir_output_list)))
        # precaution against the bash argument-length limit:
        # with 1000 input txt files and ~50 chars per output path,
        # the command would be at least 50000 chars,
        # and the bash call will fail if it is too long.
        # Ref: https://stackoverflow.com/questions/19354870/bash-command-line-and-input-limit
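        # e.g. with 250 output dirs, chunker is assumed to yield batches of
        # 100/100/50, keeping each bash call well below the argument limit
        # (chunk size 100 is a conservative choice, not a hard limit).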
for dol in futils.chunker(dir_output_list, 100):
s_outs = " ".join([str(a) for a in dol])
command = f"python3 {bin_snr} single {pc} {s_outs}"
cp = futils.run_bash_script(command)
if cp.returncode != 0:
raise RegressionError("general/snr cal", self.model_id, msg=f"Err: {cp.returncode}")
def load_dynasty_snr_output(self):
"""Read dynasty snr report, keeps only the output layers.
Optional:
- (internal regression) add snr reference from previous.
"""
snr_types = self.config["snr"]["report_snr_col"]
for hw_mode in self.config["hw_mode_on"]:
try:
ref_name = "mode_{}_piano".format(self.config["snr"]["ref"][hw_mode])
deg_name = "mode_{}_piano".format(self.config["snr"]["deg"][hw_mode])
snr_result = get_case_output(self.path["snr_csv"], ref_mode=ref_name, deg_mode=deg_name, col_snr=snr_types)
            except Exception:
continue
for snr_type in snr_types:
snr_vals = snr_result[snr_type].values
snr_vals_string = ",".join(str(format(snr_val, '.0f')) for snr_val in snr_vals)
snr_k = f"kdp{hw_mode}/{snr_type}(dB)"
self.model_fx_report[snr_k] = snr_vals_string
# add snr reference if internal
if self.is_big_model and self.config["path"]["internal"]:
try:
# load reference.
# TODO: need to update when use new benchmark. try to use snr_k
snr_k_old = f"{snr_type}_{hw_mode}(dB)"
snr_ref = self.config["snr_ref"][futils.clean_case_name(self.model_name)][snr_k_old]
# use // to split snr and ref_snr
snr_vals_string += "//{}".format(snr_ref)
                    except Exception:
pass
signal("data_sender").send((self.model_id, snr_k, snr_vals_string))
def convert_snr_report(self):
"""
Read dynasty snr full report for release. will use "SNR_With_Mean" col
"""
if not self.path["snr_csv"].exists():
            # snr needs to be calculated but is sometimes not turned on, e.g., ip evaluator only
return None # will not export excel
# NOTE: customer will run only 1 mode per regression
df_snr = pd.read_csv(self.path["snr_csv"], index_col=["Model", "Mode_deg", "Mode_ref", "dump name"])
cols = [col for col in df_snr.columns if col in ["Input", "Layer_index", "SNR_With_Mean"]]
df_snr = df_snr[cols]
df_snr.rename(columns={"SNR_With_Mean": "SNR"}, inplace=True)
df_snr.to_excel(self.path["snr_excel"])
return self.path["snr_excel"]
@run_module(module_name="general/dynasty")
def run_dynasty_inference(self):
"""Run normal dynasty as configed for this test case."""
module_name = "general/dynasty"
self.logger.info(f"Run {module_name}")
mode_list = [k for k, v in self.config["mode_run"].items() if v]
input_list = self.list_input_simulator
dump_level = self.config["dynasty"]["do_dump"]
info_in = self.io_nodes["input"]
p_output = self.path["dir_output"] / "results"
dynasty_bin = self.config["path"]["binary"]["dynasty"]["binary"]
onnx_map = self.map_onnx
model_id = self.model_id
fn_dynasty_sh = self.path["dir_output"] / "run_dynasty.sh"
n_thread = self.config["dynasty"]["n_parallel_input"]
onnx_type = self.config["dynasty"]["piano_dynasty"]["onnx_source"]
shape_in = self.config["dynasty"]["input_shape"]
# ioinfo.json from compiler
# OBSOLETE / TODELETE
        # mainly to check the input_fmt and whether a conv is the first layer
ioinfo_map = self.path["ioinfo_json"]
# prepare dynasty list
mode_settings = [dynasty.gen_dynasty_mode_settings(mode_name,
onnx_map=onnx_map,
ioinfo_map=ioinfo_map,
which_onnx=onnx_type,
model_id=model_id)
for mode_name in mode_list]
d_list, dir_output_list = dynasty.gen_dynasty_list(mode_settings,
input_list,
info_in,
p_output,
dump_level=dump_level,
shape_in=shape_in)
# HACK: for noisy dynasty
if self.config["module_run"]["piano_dynasty_noise"]:
d_list_noise, d_out_list_noise = self.generate_dynasty_list_noise()
d_list.extend(d_list_noise)
dir_output_list.extend(d_out_list_noise)
# run all the dynasty inference
self.logger.info("Running dynasty with list of {}".format(len(d_list)))
cmds = dynasty.build_dynasty_cmd(d_list, dynasty_bin, fn_dynasty_sh)
fn_log = p_output / "dynasty.log"
dynasty.run_dynasty_command_parallel(self.model_id, fn_dynasty_sh, n_thread=n_thread, fn_err=fn_log)
# save commands with others
self.save_command(module_name, f"bash {fn_dynasty_sh}")
return dir_output_list
@run_module(module_name="general/dynasty noise")
def run_dynasty_inference_noise(self):
"""TODO. re-write generate_dynasty_list_noise below."""
raise NotImplementedError
# return dir_output_list
def generate_dynasty_list_noise(self):
"""Create dynasty noise list (expand mode+input) for regression.
HACK: use noise input for dynasty float
TODELETE
"""
raise NotImplementedError
# create mode and input_list
# NOTE: only noise input for float inference now.
noise_list = []
ref_modes = ["float"]
noise_levels = self.config["dynasty"]["noise_sigma"]
for ref_mode in ref_modes:
for nl in noise_levels:
noise_mode = "{}_noise{}".format(ref_mode, nl)
# copy from ref mode
i_mode = self.generate_dynasty_mode_setting(ref_mode)
i_mode["name_mode"] = noise_mode
i_mode["dir_out"] = "mode_{}".format(noise_mode)
input_list = self.list_input_simulator_noise[nl]
noise_list.append((i_mode, input_list))
# create detailed dynasty run list
dynasty_list = []
dynasty_out_list = []
for noise_setting, noise_input in noise_list:
d_list, d_out_list, _ = self.generate_dynasty_list(noise_setting, noise_input)
dynasty_list.extend(d_list)
dynasty_out_list.extend(d_out_list)
return dynasty_list, dynasty_out_list
@run_module(module_name="auto/dynasty btm dump2")
def run_dynasty_inference_btm_dump2(self, *, hw_mode, dry_run=True):
"""Run dynasty for pld with dump 2."""
# prepare dynasty run list for later
selected_mode = str(hw_mode)
input_list = self.list_input_btm
dump_level = 2
info_in = self.io_nodes["input"]
p_output = self.path["dir_output"] / "results"
dynasty_bin = self.config["path"]["binary"]["dynasty"]["binary"]
onnx_map = self.map_onnx
model_id = self.model_id
fn_dynasty_sh = self.path["dir_output"] / "run_dynasty_btm_dump2.sh"
onnx_type = self.config["dynasty"]["piano_dynasty"]["onnx_source"]
shape_in = self.config["dynasty"]["input_shape"]
# ioinfo.json from compiler
ioinfo_map = self.path["ioinfo_json"]
# prepare dynasty mode setting x1
selected_mode_setting = dynasty.gen_dynasty_mode_settings(
selected_mode,
onnx_map=onnx_map,
ioinfo_map=ioinfo_map,
which_onnx=onnx_type,
model_id=model_id)
d_list, dir_output_list = dynasty.gen_dynasty_list([selected_mode_setting],
input_list,
info_in,
p_output,
dump_level=dump_level,
shape_in=shape_in)
# run dynasty
cmds = dynasty.build_dynasty_cmd(d_list, dynasty_bin, fn_dynasty_sh)
if not dry_run:
dynasty.run_dynasty_command_parallel(self.model_id, fn_dynasty_sh)
return dir_output_list
@staticmethod
def compact_json(fn_json, fn_new=None):
"""
Helper function to make json more human-friendly.
"""
def compact_array(str_array):
a = str_array.group().replace("\n", "").replace("\t", "")
return a
with open(fn_json, "r") as f:
j = f.read()
j = re.sub(r"\[.*?\]", compact_array, j, flags=re.DOTALL)
j = re.sub(r":[ \n\t]*\[", ": [", j, flags=re.DOTALL)
if fn_new is None:
fn_new = fn_json
with open(fn_new, "w") as f:
f.write(j)
def postprocess_piano_knerex_json(self, hw_mode):
"""
        Helper function: prepare/link some knerex json files for compiler use.
"""
for appd in ["_scaled_piano_bie", "_scaled_piano_onnx", "_quan_piano_bie", "_quan_piano_onnx"]:
fn_json_scaled = "{}.json".format(self.map_onnx[f"kdp{hw_mode}{appd}"])
p = pathlib.Path(fn_json_scaled)
if p.exists() and not p.is_symlink():
self.compact_json(fn_json_scaled)
# HACK: for kai's script.
# TODO: confirm still needed?
fn_json_from = "{}.json".format(self.map_onnx[f"kdp{hw_mode}_scaled_piano_bie"])
fn_json_to = "{}.json".format(self.map_onnx[f"kdp{hw_mode}_scaled_piano_onnx"])
p_to = pathlib.Path(fn_json_to)
if p_to.exists():
p_to.unlink()
if os.path.exists(fn_json_from):
shutil.copy(fn_json_from, fn_json_to)
@run_module(module_name="auto/knerex")
def run_knerex(self, *, hw_mode):
"""run knerex piano (weight / data analysis, updater 520/720) for this model.
For knerex, no need for multi-processing.
        (datapath analysis runs multi-processing in C++ and will not affect the python flow).
input:
origin.onnx
compiler_xxx/graph_opt.onnx
        intermediate files:
* analysis_datapath_piano_NNN.bin
* analysis_weight_piano_NNN.tmp
"""
module_name = f"kdp{hw_mode}/knerex"
self.logger.info(f"Run {module_name}")
openblas_num_threads = self.config["knerex"]["openblas_num_threads"]
para_bin = self.config["path"]["binary"]["knerex"]["normal"]
para_updater_json = self.path[f"updater_{hw_mode}_json"]
command = f"export OPENBLAS_NUM_THREADS={openblas_num_threads}; {para_bin} -i {para_updater_json}"
self.save_command(module_name, command)
TOS = self.config["knerex"]["timeout"]
cp = futils.run_bash_script(command, timeout=TOS)
self.check_knerex_error(cp, hw_mode)
self.postprocess_piano_knerex_json(hw_mode)
# release this bie
release_bie, _, _, release_onnx = self.get_scaled_onnx_source(hw_mode)
p_out = pathlib.Path(self.path["dir_output"])
self.model_fx_release[f"kdp{hw_mode}/bie"] = p_out / release_bie
self.model_fx_release[f"kdp{hw_mode}/onnx"] = p_out / release_onnx
def check_compiler_HardwareNotSupport(self, hw_mode):
"""Find detailed failure from gen_config/compiler log."""
p_compiler_out = pathlib.Path(self.path[f"compiler_piano_{hw_mode}_out"])
# common file names: batch_compile.log / compile.log / opt.log / backtrace.log
p_logs = list(p_compiler_out.glob("*.log"))
t = ""
for p_log in p_logs:
with open(p_log, "r") as f:
t += "".join(f.readlines())
if len(t) == 0:
return None
# t is a long line with \n in it.
prefixes_1 = {
"ERROR: run sub-module \"image_cut_search\" failed": ("fm_cut", "compiler report"),
"Invalid program input: Memory region \[weight\] .*? overlapps \[dram\]": ("compiler", "datapath oversize"),
# 720 old setup
"CSim only support CPU node in the end of model and write data to output buffer": ("compiler", "cpu node in middle"),
}
for keyw, (col_name, msg) in prefixes_1.items():
pat1 = re.compile(keyw)
if len(pat1.findall(t)) > 0:
self.model_fx_report[(f"kdp{hw_mode}/ERROR")] = msg
raise RegressionError(f"kdp{hw_mode}/{col_name}", self.model_id, msg=msg)
prefixes = {
"Common": ("compiler", ""),
"InvalidProgramInput": ("compiler", ""),
"InvalidONNXAttribute": ("compiler", ""),
"HardwareNotSupport": ("HW not support", "compiler: "),
"Hardware not support": ("HW not support", "compiler: "),
"UnexpectedGraph": ("compiler", ""),
"UnimplementedFeature": ("unimplemented feature", "compiler: "),
"ValueNotReady": ("compiler", ""),
"KnerexError": ("knerex", "compiler: "),
"UnexpectedValue": ("compiler", ""),
"creating an EmptyNode instance for op_type:": ("compiler", "unsupported nodes: //"),
}
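        # e.g. a (hypothetical) log line "HardwareNotSupport: Upsample mode xyz"
        # would be reported under the "HW not support" column as
        # "compiler: Upsample mode xyz".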
for keyw, (col_name, prefix) in prefixes.items():
pat1 = re.compile(f"{keyw}[:\s]*(.*)")
if len(pat1.findall(t)) > 0:
msg = prefix + "//".join(pat1.findall(t))
self.model_fx_report[(f"kdp{hw_mode}/ERROR")] = msg
raise RegressionError(f"kdp{hw_mode}/{col_name}", self.model_id, msg=msg)
# otherwise will raise normal compiler error
return None
def get_compiler_config_helper1(self,
hw_mode,
p_out=None,
debug=False,
gen_nef_config=False,
skip_backend=False,
use_quan_model=True,
fmt_limit=None,
do_ip_eval=False):
"""Helper function to generate compiler config.
Args:
skip_backend (bool): True to run frontend only.
use_quan_model (bool): only valid when skip_backend is True.
set to True to use quantized model for accurate input bin format. (if needed.)
"""
if type(p_out) is not pathlib.PosixPath:
p_out = pathlib.Path(self.path[f"compiler_piano_{hw_mode}_out"])
p_out.mkdir(mode=0o770, parents=True, exist_ok=True)
# para_model_type for compiler
if self.is_multi_layer:
para_model_type = "-v multi"
if debug:
para_model_type = "-v model_dbg"
elif self.is_multi_core:
para_model_type = "-v multi"
elif self.is_single_layer:
para_model_type = "-v single"
elif self.is_big_model:
# big model
if gen_nef_config: # batch compile to generate nef
para_model_type = "-v model_rel"
else:
# normal compiler call
para_model_type = "-v model_opt"
# find corresponding onnx/bie/onnx+json
if self.config["module_run"]["only_ip_evaluator"] or (skip_backend and (not use_quan_model)):
# no scaled onnx yet. use origin.onnx or origin.bie
p_origin = pathlib.Path(self.map_onnx["origin"])
para_onnx = futils.relative_path(p_origin, p_out)
s_para_json = " " # no json
use_quan_model = False
else:
para_onnx, para_onnx_json, _, _ = self.get_scaled_onnx_source(hw_mode)
para_onnx = futils.relative_path(para_onnx, p_out)
use_quan_model = True
if para_onnx.name.endswith(".bie"):
# scaled.bie, no json
s_para_json = " "
else:
# scaled.onnx, need json
para_onnx_json = futils.relative_path(para_onnx_json, p_out)
s_para_json = f"-r {para_onnx_json}"
compiler_envs = ["echo"] # placeholder for bash
# extra config
extra_d = dict()
if hw_mode == 720:
extra_d["gen_setup_fbs"] = True
# TODO
if do_ip_eval:
env_ip_eval = "export RUN_IP_EVAL=1"
extra_d["ip_evaluator_cfg"] = self.config["compiler_piano"]["ip_evaluator_json"][hw_mode]
else:
env_ip_eval = "export RUN_IP_EVAL=0"
compiler_envs.append(env_ip_eval)
if self.config["module_run"]["only_ip_evaluator"]:
# NOTE: normal regression will have it as False,
# so batch compiler will fail at unsupported cpu nodes.
extra_d["skip_fw_cpu_op_impl_check"] = True
if hw_mode in [720, 730, 630, 540] and self.config["compiler_piano"]["weight_compress"]:
extra_d["weight_compress"] = True
if hw_mode in [720, 530, 730, 630, 540] and futils.need_compress_command_bin(self.cat_name, self.model_name):
extra_d["optimize"] = {"cmd_size": True}
if fmt_limit:
# should not be in ip_eval_only
extra_d["input_fmt"] = fmt_limit
if (not use_quan_model) and self.config["knerex"]["datapath_bitwidth_mode"] == "int16":
# run 16bit ip evaluator for ip_eval_only
extra_d["def_data_bitw"] = 16
extra_d["input_fmt"] = "8W1C16B"
extra_d["model_id"] = self.nef_model_id
if hw_mode == 720 and skip_backend:
# https://redmine.kneron.tw/issues/19020 for MO3
do_change = False
for case_end in ["1W16C8BHL_INTLV", "i15o15_INTLV", "1W16C8BHL_colAcc_INTLV"]:
if self.model_name.endswith(case_end):
do_change = True
break
if do_change:
extra_d["output_fmt"] = "1W16C8B_INTLV"
if skip_backend:
extra_d["skip_backend"] = True
env_gen_opt = "export KNERON_GEN_OPT_ONNX=1"
compiler_envs.append(env_gen_opt)
if self.config["compiler_piano"]["no_dummy_bn"] or (hw_mode in [520, 720] and self.is_single_layer):
# if configed
# HACK: for knerex only, stc, 520/720
compiler_envs.append("export KNERON_PIANO_OPT_NO_DUMMY_BN=1")
        # read per-model compiler extra settings and update them into extra_d;
        # currently only used for app_release; we need to prepare this json ourselves
p_extra_compiler_settings_config = self.path["dir_input"] / "extra_compiler_settings.json"
if p_extra_compiler_settings_config.exists():
with open(p_extra_compiler_settings_config, "r") as f:
extra_compiler_settings_config = json.load(f)
recursive_update(extra_d, extra_compiler_settings_config)
if len(extra_d) > 0:
extra_para = "-a '{}'".format(json.dumps(extra_d, default=str))
else:
extra_para = ""
# example: compiler_piano.config.kdp530.json
compiler_json_name = pathlib.Path(self.path[f"compiler_piano_{hw_mode}_json"]).name
# may save to different folder
p_compiler_json = p_out / compiler_json_name
p_img_cut_json = p_out / "image_cut_config.json"
para_compiler_json = "-o {}".format(compiler_json_name)
gen_py = self.config["path"]["binary"]["compiler"]["gen_py"]
# feature map cut
def get_fm_cut_parameter(skip_fm_cut):
if hw_mode == 520:
fm_cut_conf = ""
elif skip_fm_cut:
# no need for nef
fm_cut_conf = ""
else:
fm_cut_modes = {
"default": "",
"deep_search": f"-m {para_onnx}"
}
fm_cut_k = self.config["compiler_piano"]["node_schedule_mode"]
fm_cut_conf = fm_cut_modes[fm_cut_k]
return fm_cut_conf
fm_cut_conf = get_fm_cut_parameter(skip_backend)
# no need for get_cmd_gen_apb
env_compiler_lib = """export LD_LIBRARY_PATH="{}:$LD_LIBRARY_PATH" """.format(self.config["path"]["binary"]["compiler"]["lib_dir"])
env_compile_bin_path = "export COMPILER_BIN_DIR={}".format(self.config["path"]["binary"]["compiler"]["bin_dir"])
env_opt_bin_path = "export OPT_COMPILE_DIR={}".format(self.config["path"]["binary"]["compiler"]["opt_bin_dir"])
compiler_envs.extend([env_compiler_lib, env_compile_bin_path, env_opt_bin_path])
# HACK: stc compiler for 540/730, https://redmine.kneron.tw/issues/17275
if hw_mode in [540, 730] and self.is_single_layer:
compiler_envs.append("export KNERON_NMEM_FT_REORDER_OP=1")
# HACK: http://eip.kneron.com:8080/redmine/issues/16360#note-5
# for 720 16bit, knerex
if self.is_big_model and hw_mode in [720] and self.config["knerex"]["datapath_bitwidth_mode"] in ["int16"]:
compiler_envs.append("export KNERON_PIANO_OPT_ADD_DUMMY_BYPASS_NODE_FOR_PRELU_LRELU=1")
compiler_bin = "{} {}".format(self.config["path"]["binary"]["compiler"]["compiler"], hw_mode)
def get_gen_cfg_cmds():
cmd_gen_cfg = "{} -t {} {} {} {} {} {} 2>&1 > gen_config.log".format(
gen_py, hw_mode, para_model_type, s_para_json,
para_compiler_json, fm_cut_conf, extra_para)
# HACK: some hack files. may be used for some special models
p_input = self.model_path / "input"
p_in_compiler_customize = p_input / f"compiler_piano.config.kdp{hw_mode}.json"
p_in_img_cut_customize = p_input / "image_cut_config.json"
p_compiler_json_custom = None
cp_cmds = ["echo"] # echo is placeholder in bash
if p_in_compiler_customize.exists():
if gen_nef_config:
# for nef gen, p_compiler_json_custom is used
p_compiler_json_custom = p_out / "compiler_custom_config.json"
cp_1 = "cp {} {}".format(p_in_compiler_customize, p_compiler_json_custom)
# normal p_compiler_json will be generated anyway
else:
# for normal compiler
# normal p_compiler_json will be copied from input. not generated
cp_1 = "cp {} {}".format(p_in_compiler_customize, p_compiler_json)
cp_cmds.append(cp_1)
if p_in_img_cut_customize.exists(): # put inside above if?
cp_1 = "cp {} {}".format(p_in_img_cut_customize, p_img_cut_json)
cp_cmds.append(cp_1)
# has customized files?
cp_cmd = " && ".join(cp_cmds)
has_customized = len(cp_cmds) > 1
if gen_nef_config:
# for nef config. will run both
return cmd_gen_cfg, cp_cmd, p_compiler_json_custom
else:
# normal compiler calling
if has_customized:
return cp_cmd, "echo", p_compiler_json_custom
else:
return cmd_gen_cfg, "echo", p_compiler_json_custom
cmd_gen_cfg, cmd_gen_cfg_custom, p_compiler_json_custom = get_gen_cfg_cmds()
if self.config["path"]["internal"] and (not self.config["path"]["use_toolchain"]):
cmd_compiler = f"{compiler_bin} {para_onnx} {p_compiler_json.name} debug"
else:
cmd_compiler = f"{compiler_bin} {para_onnx} {p_compiler_json.name}"
# batch compiler json is generated by regression.
p_batch_config = self.generate_batch_compiler_json(hw_mode=hw_mode, p_out=p_out, p_compiler_json=p_compiler_json, p_config_to_custom=p_compiler_json_custom)
# batch compiler command
cmd_batch = self.generate_batch_compiler_cmd_v1(hw_mode=hw_mode, p_out=p_out, p_batch_config=p_batch_config)
return cmd_gen_cfg, cmd_compiler, cmd_batch, p_out, "; ".join(compiler_envs)
def generate_batch_compiler_cmd_v1(self, *, hw_mode, p_out, p_batch_config):
"""batch_compile to support ALL (+540/730) platforms since 0.21.1. """
compiler_commit = self.config["path"]["compiler_commit"]
bin_bc = self.config["path"]["binary"]["compiler"]["batch_compiler"]
command = f"pushd {p_out} > /dev/null && {bin_bc} {p_batch_config} -T {hw_mode} -t {compiler_commit} -o -D && popd > /dev/null"
return command
def generate_batch_compiler_json(self, *, hw_mode, p_out, p_compiler_json, p_config_to_custom):
""" Use template to generate batch_compile.json."""
# create batch_compile.json
if self.config["module_run"]["only_ip_evaluator"]:
# no scaled onnx yet. use origin.onnx
fn_knerex_onnx = futils.relative_path(self.map_onnx["origin"], p_out)
fn_knerex_json = ""
else:
# knerex should be ready now
fn_knerex_onnx, fn_knerex_json, _, _ = self.get_scaled_onnx_source(hw_mode)
c = {}
        # nef files are used to verify board output against csim.
c["flow_path"] = self.config["path"]["flow"]
c["hw_mode"] = hw_mode
c["model_id"] = self.nef_model_id
c["stamp"] = "1"
c["bie_path"] = str(fn_knerex_onnx)
if fn_knerex_onnx.name.endswith(".onnx"):
c["json"] = str(fn_knerex_json)
else:
# no json needed for bie files
c["json"] = ""
# TODO: make this relative path
c["gen_config_path"] = str(p_compiler_json)
# save using template
if p_config_to_custom and p_config_to_custom.exists():
template = self.jinja_env.get_template("batch_compile_bconfig_custom.json")
c["custom_config_path"] = str(p_config_to_custom)
else:
template = self.jinja_env.get_template("batch_compile_bconfig.json")
output = template.render(config=c)
fn_json_save = "{}/batch_compile.json".format(p_out)
with open(fn_json_save, "w") as f:
f.write(output)
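        # The rendered json fills the template with (illustrative values):
        #   hw_mode=730, model_id=<nef_model_id>, bie_path=<knerex bie/onnx>,
        #   gen_config_path=<compiler config json>, plus custom_config_path
        #   when a per-model custom config exists.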
return fn_json_save
def save_cp_log(self, p_log, cp):
with open(p_log, "w") as f:
f.write(f"bash run return code: {cp.returncode}")
f.write("\n".join([cp.stdout, cp.stderr]))
@run_module(module_name="auto/compiler_cfg")
def generate_compiler_config(self, *, hw_mode, command):
"""Generate config for compiler. may do feature-map cut which is time consuming.
Some optimize modules may be available.
- feature-map cut deep search.
- script will iterate compiler to find the best cut.
- script will copy opt_compile.log to compiler output folder (even if failed).
            - This is time-consuming and may be killed by the timeout; there will be no opt_compile.log in that case.
"""
module_name = f"kdp{hw_mode}/compiler_cfg"
self.save_command(module_name, command)
        # NOTE: usually generating the compiler config is very fast;
        # however, it may take too long if fm_cut is turned on (deep_search).
TOS = self.config["compiler_piano"]["timeout"]
cp = futils.run_bash_script(command, timeout=TOS)
self.check_compiler_log(hw_mode, cp)
self.clean_opt_compile(hw_mode)
if cp.returncode != 0:
self.check_bc_returncode(cp, hw_mode, module="compiler_cfg")
def check_compiler_log(self, hw_mode, cp):
p_json = pathlib.Path(self.path[f"compiler_piano_{hw_mode}_json"])
# save log for debug
p_log = p_json.parent / "compiler_gen_config.log"
# DEBUG: check size of config. if empty, save log for debug
if not p_json.exists():
self.save_cp_log(p_log, cp)
raise RegressionError(f"kdp{hw_mode}/compiler_cfg", self.model_id, msg="no config generated.")
elif p_json.stat().st_size == 0:
self.save_cp_log(p_log, cp)
raise RegressionError(f"kdp{hw_mode}/compiler_cfg", self.model_id, msg="config empty.")
elif cp.returncode != 0:
# save log first.
self.save_cp_log(p_log, cp)
# will do detailed check below
def clean_opt_compile(self, hw_mode):
"""Clean up opt_compile which is from fm_cut but sometime not cleaned. """
p_json = pathlib.Path(self.path[f"compiler_piano_{hw_mode}_json"])
p_opt_cmpl = p_json.parent / "opt_compile"
if p_opt_cmpl.exists():
cmd = f"pkill -f {self.model_name} ; sleep 1; rm -rf {p_opt_cmpl}"
cp2 = futils.run_bash_script(cmd, do_echo=True)
# TODO: examine cp2 return code
# cp2.returncode == -15:
def check_bc_returncode(self, cp, hw_mode, module="compiler"):
"""Examine the return code of batch-compiler.
Ref: https://redmine.kneron.tw/issues/18389
Compiler return code is between 1-30.
gen_config.py will return 31-50 if fm_cut failed.
TODO: what about normal compiler frontend?
"""
rc = cp.returncode
if rc == 0:
return # success
elif rc == 1:
raise RegressionError(f"kdp{hw_mode}/compiler", self.model_id, msg="compiler common")
elif rc == 2:
raise RegressionError(f"kdp{hw_mode}/compiler", self.model_id, msg="compiler invalid input")
elif rc == 3:
raise RegressionError(f"kdp{hw_mode}/compiler", self.model_id, msg="invlid onnx attribute")
elif rc == 4:
raise RegressionError(f"kdp{hw_mode}/HW not support", self.model_id, msg="Err: 4")
elif rc == 5:
raise RegressionError(f"kdp{hw_mode}/compiler", self.model_id, msg="unexpected graph")
elif rc == 6:
raise RegressionError(f"kdp{hw_mode}/unimplemented feature", self.model_id, msg=f"compiler: {rc}")
elif rc == 7:
raise RegressionError(f"kdp{hw_mode}/compiler", self.model_id, msg="value not ready")
elif rc == 8:
raise RegressionError(f"kdp{hw_mode}/knerex", self.model_id, msg="cmplr: knerex config error")
elif rc == 9:
raise RegressionError(f"kdp{hw_mode}/compiler", self.model_id, msg="unexpected value")
elif rc >= 1 and rc <= 30:
raise RegressionError(f"kdp{hw_mode}/compiler", self.model_id, msg=f"Err: {rc}")
###################################################################################
elif rc == 111:
            # the compiler itself never times out; this is mostly the fm_cut search
raise RegressionError(f"kdp{hw_mode}/fm_cut", self.model_id, msg=cp.stderr)
elif rc == -15:
raise RegressionError(f"kdp{hw_mode}/fm_cut", self.model_id, msg="kille by SIGTERM")
###################################################################################
# gen_config.py will return 31-50 if fm_cut failed.
elif rc == 32:
msg = f"fm_cut does not support {hw_mode}."
raise RegressionError(f"kdp{hw_mode}/fm_cut", self.model_id, msg=msg)
elif rc == 33:
msg = "No info_cutting.log!"
raise RegressionError(f"kdp{hw_mode}/fm_cut", self.model_id, msg=msg)
elif rc >= 31 and rc <= 50:
# default report for fm_cut fail
msg = f"Err: {rc}"
raise RegressionError(f"kdp{hw_mode}/fm_cut", self.model_id, msg=msg)
###################################################################################
self.check_compiler_HardwareNotSupport(hw_mode)
###################################################################################
# default error
raise RegressionError(f"kdp{hw_mode}/{module}", self.model_id, msg=f"Err: {rc}")
@run_module(module_name="auto/compiler")
def run_batch_compile_command(self, *, hw_mode, command, dir_out):
module_name = f"kdp{hw_mode}/run batch compiler"
self.save_command(module_name, command)
cp = futils.run_bash_script(command, do_echo=False) # self.config["regression"]["print_error"]
self.check_bc_returncode(cp, hw_mode, module="compiler")
fn_outs = {}
if hw_mode in [540, 730]:
# for 730/540, no setup.bin, command.bin is optional if last one is cpu node
# and csim/firmware both use kne
fn_outs[f"kdp{hw_mode}/kne"] = f"{dir_out}/models_{hw_mode}.kne"
fn_outs[f"kdp{hw_mode}/nef"] = f"{dir_out}/models_{hw_mode}.nef"
else:
# old setup + nefv1, setup.bin+command.bin for csim
# nef for firmware
fn_outs[f"kdp{hw_mode}/nef"] = f"{dir_out}/models_{hw_mode}.nef"
if self.config["module_run"]["only_ip_evaluator"]:
# no need to release nef file which is useless
return
for k, fn_check in fn_outs.items():
p_check = pathlib.Path(fn_check)
if not p_check.exists():
raise RegressionError(f"kdp{hw_mode}/compiler", self.model_id, msg=f"{p_check.name} missing.")
self.model_fx_release[k] = p_check
@run_module("auto/compiler hw info")
def load_hw_stats(self, *, dir_out, hw_mode):
"""Collect FPS info / weight size / cpu nodes from compiler log."""
if hw_mode in self.config["hw_mode_on"]:
ip_eval_report = compiler.collect_FPS(dir_out, hw_mode)
if "fps" in ip_eval_report:
# this is a valid report
signal("data_sender").send((self.model_id, f"kdp{hw_mode}/FPS", ip_eval_report["fps"]))
# Check cpu node info
            # TODO: simplify this. it must be compulsory
k = "cpu_node"
if k in ip_eval_report:
signal("data_sender").send((self.model_id, f"kdp{hw_mode}/{k}", ip_eval_report[k]))
# patch up 520 using preset value
if hw_mode == 520:
try:
ip_eval_bw = self.config["compiler_piano"]["ip_evaluator_bw"][hw_mode]
preset_keys = {
"bw_weight": "GETW bandwidth GB/s",
"bw_rdma": "RDMA bandwidth GB/s",
"bw_wdma": "WDMA bandwidth GB/s"}
for k1, k2 in preset_keys.items():
if ip_eval_bw[k1] is not None:
ip_eval_report[k2] = ip_eval_bw[k1]
                except Exception:
pass
for k, v in ip_eval_report.items():
self.model_fx_report[f"kdp{hw_mode}/ip_eval/{k}"] = v
fps_improved = compiler.collect_fps_improve(dir_out)
if fps_improved:
signal("data_sender").send((self.model_id, f"kdp{hw_mode}/FPS_improved", fps_improved))
# Collect command size and weight size info
if self.is_big_model:
cmd_size, weight_size = compiler.collect_command_weight_size(dir_out)
if cmd_size:
signal("data_sender").send((self.model_id, f"kdp{hw_mode}/cmd_size(KB)", cmd_size))
if weight_size:
signal("data_sender").send((self.model_id, f"kdp{hw_mode}/wt_size(MB)", weight_size))
                # TEMP: some temporary analysis on weight size: 8bit fx weight vs 32bit float
if self.onnx_size > 0:
wt_overhead = int(100 * (4 * weight_size / self.onnx_size - 1))
else:
wt_overhead = 0
signal("data_sender").send((self.model_id, f"kdp{hw_mode}/wt_overhead (%)", wt_overhead))
# if self.config["module_run"]["filter_cpu_cases"]:
# if cpu_node_list_str not in ["None", "N/A"]:
# # there are cpu nodes
# raise RegressionError(f"kdp{hw_mode}/filter_cpu_node", self.model_id)
@run_module(module_name="auto/compiler frontend")
def run_compiler_frontend(self, *, hw_mode, use_quan_model=False):
"""Call compiler frontend to generate cpu node list and decomposed node mapping.
compiler has two steps:
* generate config: `generate_compiler_config`
* (optional) feature map search during gen_config, for better fps.
* actual compiler run: `run_batch_compiler_command`
Inputs:
- hw_mode: 520/530/... supported platform
        - use_quan_model (bool): True to use the knerex-generated scaled.bie/onnx.
          Set to False to run on the original (un-quantized) onnx.
Output files:
- decomposed.bie
- decomposed.onnx (for release)
"""
module_name = f"kdp{hw_mode}/compiler frontend"
(cmd_gen_cfg, cmd_compiler, cmd_batch_compiler, dir_out,
envs) = self.get_compiler_config_helper1(
hw_mode,
skip_backend=True,
use_quan_model=use_quan_model,
do_ip_eval=False)
command1 = f"pushd {dir_out} > /dev/null; {envs}; {cmd_gen_cfg}"
command2 = f"pushd {dir_out} > /dev/null; {envs}; {cmd_compiler}"
self.generate_compiler_config(command=command1, hw_mode=hw_mode)
self.save_command(module_name, command2)
cp = futils.run_bash_script(command2, do_echo=False)
self.check_bc_returncode(cp, hw_mode, module="compiler frontend")
# https://redmine.kneron.tw/issues/17758
# NOTE: old name is graph_opt.onnx
kvs = {
# name from compiler: new name in regression
"decomposed.onnx": self.map_onnx[f"kdp{hw_mode}_opt_piano_onnx"],
"decomposed.bie": self.map_onnx[f"kdp{hw_mode}_opt_piano_bie"],
}
# copy to knerex folder
p_knerex = self.path[f"knerex_output_{hw_mode}"]
p_knerex.mkdir(exist_ok=True)
for k, v in kvs.items():
fn_from = list(pathlib.Path(dir_out).glob(k))
if len(fn_from) == 0:
raise RegressionError(f"kdp{hw_mode}/compiler frontend", self.model_id, msg=f"NO {k} generated by frontend.")
shutil.copyfile(fn_from[0], v)
# load basic_info.json to check how many input bin formats for each input
if use_quan_model:
# load jsons from compiler frontend generated bie
jsons = util_lib.load_zip_jsons(self.map_onnx[f"kdp{hw_mode}_opt_piano_bie"])
basic_info = jsons["basic_info.json"]
self.io_nodes[("input_format", hw_mode)] = basic_info["input_fmt"]
bw_in = self.config["knerex"]["model_in_bitwidth_mode"]
bw_out = self.config["knerex"]["model_out_bitwidth_mode"]
bw_cpu = self.config["knerex"]["cpu_bitwidth_mode"]
bw_dp = self.config["knerex"]["datapath_bitwidth_mode"]
bw_wt = self.config["knerex"]["weight_bitwidth_mode"]
self.model_fx_report[f"kdp{hw_mode}/input bitwidth"] = bw_in
self.model_fx_report[f"kdp{hw_mode}/output bitwidth"] = bw_out
self.model_fx_report[f"kdp{hw_mode}/cpu bitwidth"] = bw_cpu
self.model_fx_report[f"kdp{hw_mode}/datapath bitwidth"] = bw_dp
self.model_fx_report[f"kdp{hw_mode}/weight bitwidth"] = bw_wt
# clean up folder
shutil.rmtree(dir_out)
@run_module(module_name="auto/pick bin format")
def pick_in_bin_format(self, *, hw_mode, limited_input):
"""Pick 1 format for each limited_input.
see https://redmine.kneron.tw/issues/18306
"""
k1 = ("input_format", hw_mode)
assert k1 in self.io_nodes, "Input formats are not generated with compiler frontend on quantized model. Check flow settings."
cmpl_fmts = self.io_nodes[k1]
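        # cmpl_fmts is assumed to look like (values hypothetical):
        #   {"input_1": ["4W4C8B", "1W16C8B"], "input_2": ["1W16C8B"]}
        # below we pick the first non-4W4C format for each constrained input.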
results = {}
for in_name in limited_input:
if in_name not in cmpl_fmts:
self.logger.critical(f"Constraint on input format not applied!!! Given {in_name} not in {list(cmpl_fmts.keys())} given by compiler.")
continue
if len(cmpl_fmts[in_name]) == 1:
self.logger.critical(f"Constraint on input format not applied!!! Given {in_name} has only 1 format: {cmpl_fmts[in_name][0]}.")
continue
fmts = [f for f in cmpl_fmts[in_name] if not f.startswith("4W4C")]
if len(fmts) == 0:
self.logger.critical(f"Constraint on input format not applied!!! Given {in_name} has no valid format to limit: {cmpl_fmts[in_name]} -> remove 4W4B* -> [].")
continue
results[in_name] = fmts[0]
return results
@run_module(module_name="auto/compiler")
def generate_nef(self, *, hw_mode, p_nef=None, fmt_limit=None):
"""call batch compiler to generate nef.
The last and full run of compiler.
Inputs:
* hw_mode supported.
Output files:
* model_NNN.nef
* model_NNN.kne
"""
module_name = f"kdp{hw_mode}/gen_nef"
self.logger.info(f"run {module_name}")
if p_nef is None: # default path
# TODO: move to compiler_piano_
# p_nef = pathlib.Path(self.path["compiler_piano_{}_out".format(hw_mode)])
p_nef = pathlib.Path(self.path["nef_output_{}".format(hw_mode)])
p_nef.mkdir(mode=0o770, parents=True, exist_ok=True)
# generate compiler nef configs
do_ip_eval = self.config["compiler_piano"]["ip_evaluator"]
cmd_gen_cfg, cmd_compiler, cmd_batch_compiler, dir_out, envs = self.get_compiler_config_helper1(hw_mode,
gen_nef_config=True,
p_out=p_nef,
fmt_limit=fmt_limit,
do_ip_eval=do_ip_eval)
command1 = f"pushd {dir_out} > /dev/null; {envs}; {cmd_gen_cfg}"
# command2 = f"pushd {dir_out} > /dev/null; {envs}; {cmd_compiler}"
command3 = f"pushd {dir_out} > /dev/null; {envs}; {cmd_batch_compiler}"
        # the functions below are decorated by run_module; they record elapsed time and report specific columns
self.generate_compiler_config(command=command1, hw_mode=hw_mode)
self.run_batch_compile_command(command=command3, dir_out=dir_out, hw_mode=hw_mode)
self.load_hw_stats(dir_out=dir_out, hw_mode=hw_mode)
fn_knerex_bie, _, _, _ = self.get_scaled_onnx_source(hw_mode)
# collect ioinfo.json for future usage
# needed for csim
# needed for dynasty (especially for rgba)
        # NOTE: ioinfo.json is obsolete; calculation_info.json is used instead
if fn_knerex_bie.name.endswith(".bie"):
js = [
# original name, key in regression, name in bie (for dynasty)
("ioinfo.json", "ioinfo_json", "ioinfo.json"),
("calculation.json", "calculation_json", "calculation_info.json"),
]
for n1, n2, n3 in js:
p_json = dir_out / n1
if p_json.exists():
self.path[n2][hw_mode] = p_json
# patch bie
util_lib.patch_bie_w_ioinfo_json(fn_knerex_bie, p_json, n3)
@run_module(module_name="auto/csim")
def run_csim(self, *, hw_mode):
"""
run csim for 720/530/730/630/540
Input files:
* run_csim_NNN.ini
* pointing to files needed for csim.
                * refer to `generate_csim_ini` for reference.
Output files:
* `output/results/FN_INPUT/csim_NNN_output`
if 520 given, will run `run_csim_520` instead.
"""
module_name = f"kdp{hw_mode}/csim"
self.logger.info(f"run {module_name}")
list_csim = self.io_nodes[("btm_csim_in", hw_mode)]
d_csim = {i: v for i, v in enumerate(list_csim)}
bin_csim = fconsts.BIN_SET["csim"][hw_mode]
fn_sh = self.path["btm_dump"] / f"csim_{hw_mode}" / f"run_csim_{hw_mode}.sh"
cmd, cp = csim.run_csim(d_csim, bin_csim, fn_sh)
self.check_csim_error(cp, hw_mode)
@run_module(module_name="kdp520/csim")
def run_csim_520(self):
"""run csim 520.
520 is our first platform. This is different from later platforms.
Input files:
* command.bin
* setup.bin
* weight.bin
* dynasty dumped input file at `output/results/FN_INPUT/model_520-wqbi_piano/layer_input_*.bin`
Output files:
* `output/results/FN_INPUT/csim_520_output`
"""
hw_mode = 520
module_name = f"kdp{hw_mode}/csim"
self.logger.info(f"run {module_name}")
p_csim_out = pathlib.Path(self.io_nodes[("btm_csim_path", hw_mode)])
p_compiler_output = pathlib.Path(self.path["compiler_piano_{}_out".format(hw_mode)])
p_rel_compiler = futils.relative_path(p_compiler_output, p_csim_out)
cs = {}
for fn_key in ["command_bin", "setup_bin", "weight_bin"]:
p_bin = self.compiler_output[hw_mode][fn_key].name
cs[fn_key] = f"{p_rel_compiler}/{p_bin}"
para_bin = self.config["path"]["binary"]["csim"][520]
p_csim_out.mkdir(mode=0o770, parents=True, exist_ok=True)
p_dynasty_so = pathlib.Path(self.config["path"]["binary"]["dynasty"]["lib.so"])
ENV_DYNASTY_LIB = f"""export LD_LIBRARY_PATH="{p_dynasty_so.parent}:$LD_LIBRARY_PATH" """
if self.is_big_model:
# NOTE: only 1 input for 520. no need for ","?
fn_input_rgba = ",".join([str(a) for a in self.io_nodes[("btm_csim_in_bin", hw_mode)]])
c = f"""{para_bin} -d 0 --thread 1 {cs["command_bin"]} {cs["weight_bin"]} {fn_input_rgba} --setup {cs["setup_bin"]}"""
else:
# NOTE: 520 stc to use sequential.bin.
# NOTE: v016 category will have TWO inputs!!!
fn_input_sqtl = " ".join([str(a) for a in self.io_nodes[("btm_csim_in_bin", hw_mode)]])
c = f"""{para_bin} -d 0 --thread 1 {cs["command_bin"]} {cs["weight_bin"]} -t {fn_input_sqtl}"""
command = f"{ENV_DYNASTY_LIB}; pushd {p_csim_out} > /dev/null && {c} && popd > /dev/null"
self.save_command(module_name, command)
cp = futils.run_bash_script(command, timeout=60*60*6)
self.check_csim_error(cp, hw_mode)
@run_module(module_name="kdp520/btm dyn_csim")
def btm_dyn_csim_520(self):
"""
run bit-true-match check between dynasty / csim fix point results.
Will raise RegressionError if mismatch.
"""
module_name = "kdp520/btm dyn_csim"
self.logger.info(f"check {module_name}")
hw_mode = 520
dir_csim_output = self.io_nodes[("btm_csim_path", hw_mode)]
if self.is_big_model:
# Multiple outputs possible
golden_list = self.io_nodes[("btm_dynasty_golden_txt_path", 520)]
for i in range(len(golden_list)):
fn_csim_out = "{}/node_{:04d}_final_output.txt".format(dir_csim_output, i)
fn_d520_out = golden_list[i]
assert os.path.exists(fn_d520_out), "dynasty 520 output ({}) does not exist!".format(fn_d520_out)
# TODO: use futils.md5sum for bit-true-match? faster?
with open(fn_csim_out, "r") as f_csim, open(fn_d520_out, "r") as f_dyn:
out_csim = [int(a) for a in f_csim]
out_dyna = [int(a) for a in f_dyn]
# do report
cond1 = len(out_csim) == len(out_dyna)
msg1 = "dynasty dump size ({len(out_dyna)}) != csim dump size ({len(out_csim)})"
cond2 = all(a == b for a, b in zip(out_csim, out_dyna))
msg2 = "dynasty-csim mismatch! "
for cond, msg in [(cond1, msg1), (cond2, msg2)]:
if not cond:
self.model_fx_report["btm_520"] = msg
assert cond, msg
else:
self.model_fx_report["kdp520/btm"] = "bit-true-match (520) verified between dynasty and csim."
else:
# single layer. BUG: we assume only one output.
fn_csim_out = "{}/Lastlayer_final_output.txt".format(dir_csim_output)
fn_d520_out = self.io_nodes[("btm_dynasty_golden_txt_path", 520)][0]
assert os.path.exists(fn_d520_out), "dynasty 520 output ({}) does not exist!".format(fn_d520_out)
with open(fn_csim_out, "r") as f_csim, open(fn_d520_out, "r") as f_dyn:
out_csim = [int(a) for a in f_csim]
out_dyna = [int(a) for a in f_dyn]
assert len(out_csim) == len(out_dyna), "dynasty dump size ({}) != csim dump size ({})".format(len(out_dyna), len(out_csim))
assert all(a == b for a, b in zip(out_csim, out_dyna)), "dynasty-csim mismatch! "
try:
if self.config["post_clean_up"]["csim_output"]:
shutil.rmtree(dir_csim_output)
        except Exception:
            self.logger.error("Failed to delete the csim 520 dump folder: {}".format(dir_csim_output))
@run_module(module_name="auto/btm dyn_csim")
def btm_dyn_csim(self, *, hw_mode):
"""
run bit-true-match check between dynasty / csim fix point results.
Will raise RegressionError if mismatch.
NOTE: platform 520 see btm_dyn_csim_520
"""
# detour for 520
if hw_mode == 520:
self.btm_dyn_csim_520()
return
self.logger.info(f"check kdp{hw_mode}/btm_dym_csim")
# dynasty golden
p_d = self.io_nodes[("btm_dynasty_golden_txt_path", hw_mode)]
        # the quick way: assume all the text files are EXACTLY the same,
        # i.e. they have the same futils.md5sum
p_csim_dump = self.io_nodes[("btm_csim_path", hw_mode)]
        # compare data from dma2seq; the easiest check.
p_c = pathlib.Path(p_csim_dump).glob("dma2seq_*.seq")
set_d = set(futils.md5sum(str(a)) for a in p_d)
set_c = set(futils.md5sum(str(a)) for a in p_c)
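        # comparing md5 sets makes the check order-independent: any content
        # difference between the dynasty and csim dumps shows up as a
        # non-empty set difference below.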
        # DEBUG: if internal regression, a mismatch will trigger the pld report automatically
if self.config["path"]["internal"]:
if set_d != set_c:
try:
self.generate_pld_report(hw_mode)
except Exception as e:
signal("data_sender").send((self.model_id, f"kdp{hw_mode}/pld dump", str(e)))
if set_d != set_c:
# do the report
msg = "mismatched: {}".format(set_d.difference(set_c))
self.model_fx_report[f"kdp{hw_mode}/btm"] = msg
self.module_status[hw_mode]["btm_dyn_csim"] = False
raise RegressionError(f"kdp{hw_mode}/btm dyn_csim", self.model_id, msg=msg)
else:
self.model_fx_report[f"kdp{hw_mode}/btm"] = f"bit-true-match ({hw_mode}) verified between dynasty and csim."
# NOTE: the hard way, for loop to compare
# self.io_nodes[("btm_dynasty_golden_txt_path", hw_mode)]
# dma2seq_*.seq
#################################################################################
@run_module(module_name="auto/kneron+")
def run_nef_kneron_plus(self, *, hw_mode, number_try=0):
"""run nef on kneron plus (dongle server).
NEF inference request send to kneron internal server,
which call hardware dongle to do the inference.
Dongle firmware may return either float or fix-point data on different request.
Current format: `BCHW`.
NOTE: the server will RESET dongle then sleep 15s !!!
Input files:
* For 520/720/530/630:
* model_NNN.nef
* For 540/730, dongle:
* model_NNN.kne
* dynasty dumped input bin at `output/results/FN_INPUT/model_NNN-wqbi_piano/layer_input_*.bin`
Output files:
* dongle inferenced results in BCHW, float or fix-point
"""
from nef_utils.dongle_inference import dongle_inference
module_name = f"kdp{hw_mode}/kneron+"
self.logger.info(f"run {module_name}")
dongle_server = self.config["nef"]["dongle_server"]
dir_rgba_list = ["{}".format(rgba_input) for rgba_input in self.io_nodes[("btm_csim_in_bin", hw_mode)]]
s_rgba = " ".join(dir_rgba_list)
dir_nef_model = "{}/models_{}.nef".format(self.path['compiler_piano_{}_out'.format(hw_mode)], hw_mode)
dir_nef_out_list = []
for i in range(number_try):
dir_nef_out_list.append(self.io_nodes[("btm_nef_kneron_plus_path", hw_mode, i)])
dir_nef_out_list[i].mkdir(parents=True, exist_ok=True)
dir_nef_out = str(self.io_nodes[("btm_nef_kneron_plus_path", hw_mode, 0)])[:-2]
if hw_mode == 520:
fn_ioinfo = "{}/ioinfo.csv".format(self.path["compiler_piano_{}_out".format(hw_mode)])
ioinfo = pd.read_csv(fn_ioinfo, header=None)
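            # assumed ioinfo.csv layout (no header): column 0 is "i"/"o"
            # (input/output), column 2 is the node name; we keep outputs only.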
output_order = []
for i in range(len(ioinfo)):
in_or_out = ioinfo[0][i]
if in_or_out == "o":
output_order.append(str(ioinfo[2][i]).replace("/", "_"))
else:
fn_ioinfo = "{}/ioinfo.json".format(self.path["compiler_piano_{}_out".format(hw_mode)])
with open(fn_ioinfo, "r") as f:
ioinfo = json.load(f)
output_order = []
for output_item in ioinfo["output"]:
output_order.append(output_item["name"].replace("/", "_"))
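            # Illustrative ioinfo.json shape assumed by the loop above (only the
            # "output"/"name" fields are used here; the values are hypothetical):
            #   {"output": [{"name": "conv1/out0"}, {"name": "fc/out1"}]}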
        # save the equivalent bash command for debugging; the regression actually calls the python function
# TODO: why no output folder specified?
dir_nef_script = self.config["path"]["binary"]["nef"]["nef_client.py"]
command = f"python3 {dir_nef_script} -i {s_rgba} -m {dir_nef_model} -p {hw_mode} -mid {self.nef_model_id} -g {dongle_server} -fix"
self.save_command(module_name, command)
        # actually call the dongle inference server from the python function
try:
fix_output_list, dongle_client_log = dongle_inference(
dir_nef_model,
dir_rgba_list,
model_id=self.nef_model_id,
platform=hw_mode,
group=dongle_server,
inference_times=number_try,
is_fixed_output=True,
output_path=dir_nef_out,
output_order=output_order)
except GeneralError as e:
self.logger.error(e.details)
raise RegressionError(f"kdp{hw_mode}/{e.msg}", self.model_id, msg=e.details)
fn_log = self.path["btm_dump"] / "dongle_client.log"
with open(fn_log, "w") as f:
            f.writelines([line + '\n' for line in dongle_client_log])

def generate_pld_report(self, hw_mode, dry_run=True):
"""
        Internal process for generating the pld report when dynasty/csim results mismatch.
        Inputs:
        - hw_mode: platform (520 not supported)
        - dry_run: True to only create the scripts; False to actually run them
Steps included:
* re-run dynasty per layer
* re-run csim per layer
* run pld.py to generate pld report
Output files:
* pld report
"""
if hw_mode == 520:
self.logger.error("PLD dump does not support 520")
raise NotImplementedError
module_name = f"kdp{hw_mode}/pld dump"
self.logger.info(f"run {module_name}")
        # re-run csim with a special config, already generated during the normal csim run
list_csim = self.io_nodes[("btm_csim_in_pld", hw_mode)]
d_csim = {i: v for i, v in enumerate(list_csim)}
bin_csim = self.config["path"]["binary"]["csim"][hw_mode]
fn_sh = self.path["dir_output"] / f"run_csim_{hw_mode}_pld.sh"
cmd, cp = csim.run_csim(d_csim, bin_csim, fn_sh, dry_run=dry_run)
# self.check_csim_error(cp, hw_mode)
# re-run dynasty on test_input.txt with dump 2
if self.config["dynasty"]["do_dump"] < 2:
            # it may be 730 or 730-wqbi or ...
_, _, btm_mode, _ = self.get_scaled_onnx_source(hw_mode)
# if dry_run, the dynasty script will be created without running.
self.run_dynasty_inference_btm_dump2(hw_mode=btm_mode, dry_run=dry_run)
# run pld.py for report
p_compiler = pathlib.Path(self.path["compiler_piano_{}_out".format(hw_mode)])
p_dynasty = self.io_nodes[("btm_dynasty_path", hw_mode)]
p_csim = self.io_nodes[("btm_csim_path", hw_mode)]
p_report = self.io_nodes[("pld_report", hw_mode)]
p_report.mkdir(parents=True, exist_ok=True)
bin_pld_report = "python3 {}".format(self.config["path"]["binary"]["pld"]["pld.py"])
command_pld_report = f"{bin_pld_report} {hw_mode} {p_compiler} {p_csim} {p_dynasty} {p_report}"
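        # e.g. "python3 .../pld.py 720 <compiler_out> <csim_dump> <dynasty_dump> <report_dir>"
        # (platform and paths illustrative)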
self.save_command(module_name, command_pld_report)
fn_cmd = self.path["dir_output"] / f"run_pld_report_{hw_mode}.sh"
with open(fn_cmd, "w") as f:
f.write(f"{command_pld_report}\n\n")
if not dry_run:
cp = futils.run_bash_script(command_pld_report, do_echo=False, timeout=60*60*6)
            # the pld report script failed; save the log file for debug
if cp.returncode != 0:
fn_log = self.path["dir_output"] / f"run_pld_report_{hw_mode}.log"
with open(fn_log, "w") as f:
f.write("\n".join([cp.stdout, cp.stderr]))
if cp.returncode == 111:
msg = cp.stderr
else:
msg = f"Err: {cp.returncode}"
signal("data_sender").send((self.model_id, "kdp{hw_mode}/pld dump", msg))
@run_module(module_name="auto/btm csim_vs_dongle")
def btm_csim_nef(self, *, hw_mode, number_try):
"""csim vs nef, 520/530/720
        # NOTE: we assume NEF will only run on big_model
        # if it needs to run on stc, the csim reference may need adjusting; refer to btm_dyn_csim
"""
try:
module_name = f"kdp{hw_mode}/btm_csim_nef/try{number_try}"
self.logger.info("check {}".format(module_name))
            # find all nef inference results
p_nef = pathlib.Path(self.io_nodes[("btm_nef_kneron_plus_path", hw_mode, number_try)]).glob("layer_*_fx.txt")
            # find all csim inference results
if hw_mode != 520:
if self.config["knerex"]["model_out_bitwidth_mode"] in ["int16"]:
# dongle output is 16B
str_search = "dma2seq_*.seq.16B"
else:
                    # 8B / 15B, can be compared with dynasty directly
str_search = "dma2seq_*.seq"
else:
str_search = "node_*_final_output.txt"
p_csim = pathlib.Path(self.io_nodes[("btm_csim_path", hw_mode)]).glob(str_search)
            # NOTE: no btm against dynasty here
# p_dynasty = self.io_nodes[("btm_dynasty_golden_txt_path", hw_mode)]
# set_dynasty = set(futils.md5sum(str(a)) for a in p_dynasty)
set_nef = set(futils.md5sum(str(a)) for a in p_nef)
set_csim = set(futils.md5sum(str(a)) for a in p_csim)
if set_nef != set_csim:
msg = f"mismatched: {set_nef.difference(set_csim)}"
self.model_fx_report[f"kdp{hw_mode}/btm"] = msg
raise RegressionError(f"kdp{hw_mode}/btm csim_vs_dongle", self.model_id, msg=msg)
        except RegressionError:
            # re-raise the mismatch error as-is, keeping its msg
            raise
        except Exception as e:
            print_err(e, self.config["regression"]["print_error"])
            raise RegressionError(f"kdp{hw_mode}/btm csim_vs_dongle", self.model_id)

@run_module(module_name="auto/btm_dyn_kneron+")
def btm_dyn_nef_kneron_plus(self, *, hw_mode, number_try):
"""dynasty vs nef, 520/530/720
        # NOTE: we assume NEF will only run on big_model
        # if it needs to run on stc, the csim reference may need adjusting; refer to btm_dyn_csim
"""
module_name = f"kdp{hw_mode}/btm dyn_vs_kneron+ ({number_try})"
self.logger.info("check {}".format(module_name))
try:
dir_kneron_plus_output = self.io_nodes[("btm_nef_kneron_plus_path", hw_mode, number_try)]
# Multiple outputs possible
golden_list = self.io_nodes[("btm_dynasty_golden_txt_path", hw_mode)]
for i in range(len(golden_list)):
fn_dyn_out = str(golden_list[i])
assert os.path.exists(fn_dyn_out), "dynasty {} output ({}) does not exist!".format(hw_mode, fn_dyn_out)
fn_kneron_plus = "{}/{}".format(dir_kneron_plus_output, str(golden_list[i]).split("/")[-1])
# TODO: @weijie we can use futils.md5sum for fx results now.
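                # Sketch of that md5 check (mirrors btm_dyn_csim; only valid once
                # both sides dump identical fixed-point text):
                #   assert futils.md5sum(fn_kneron_plus) == futils.md5sum(fn_dyn_out)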
with open(fn_kneron_plus, "r") as f_kneron_plus, open(fn_dyn_out, "r") as f_dyn:
out_kneron_plus = [int(float(a)) for a in f_kneron_plus]
out_dyna = [int(a) for a in f_dyn]
assert len(out_kneron_plus) == len(out_dyna), "dynasty dump size ({}) != kneron plus dump size ({})".format(len(out_dyna), len(out_kneron_plus))
                    assert all(a == b for a, b in zip(out_kneron_plus, out_dyna)), "dynasty-kneron plus mismatch!"
        except Exception as e:
            print_err(e, self.config["regression"]["print_error"])
            # keep the assertion message so the report shows what mismatched
            raise RegressionError(module_name, self.model_id, msg=str(e))

@run_module(module_name="general/combine_snr")
def generate_snr_report(self, base_dump="results"):
"""Generate an overall snr report from per-input-group snr reports.
"""
self.logger.info("generate snr report")
do_pc = self.config["snr"]["per_channel"]
do_plot_pc = self.config["snr"]["plot_snr_per_channel"]
        combine_snr("{}/{}".format(self.path["dir_output"], base_dump), do_per_channel=do_pc, do_plot_per_channel=do_plot_pc)

def save_command(self, module_name, command):
self.commands.append((module_name, command))
        print_command(command, self.config["regression"]["print_command"])

def generate_bash_script(self):
"""put all bash script called for this model in the flow into a bash script for future debug.
Scripts specified for this model:
- knerex: weight analysis, data analysis ...
- dynasty: multiple inputs, multiple modes ...
        Each command is saved to self.commands before being executed.
"""
if not hasattr(self, "commands") or len(self.commands) == 0:
return
with open(self.path["fn_cmd"], "w") as f:
for submodule, command in self.commands:
f.write(f"# {submodule}\n")
f.write(command)
f.write("\n\n")
def pre_clean_up(self, base_dump="results"):
"""delete temp files / outputs before flow actually start."""
try:
flags = self.config["pre_clean_up"]
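            # expected flag keys, matching the lookups below (all booleans):
            #   all_output, knerex_analysis, knerex_output, dynasty_output, compiler_output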
dir_o = pathlib.Path(self.path["dir_output"])
# self.logger.debug("pre clean up {}/{}".format(self.cat_name, self.model_name))
if flags["all_output"]:
command = f"rm -rf {dir_o}"
cp = futils.run_bash_script(command)
if cp.returncode > 0:
                    self.logger.warning(f"output folder ({dir_o}) cannot be deleted.")
dir_o.mkdir(mode=0o770, parents=True, exist_ok=True)
return
if flags["knerex_analysis"]:
for fn in dir_o.glob("analysis_*"):
fn.unlink()
if flags["knerex_output"]:
for fn in dir_o.glob("{}*scale*.onnx*".format(self.model_name)):
fn.unlink()
for fn in dir_o.glob("{}*scale*.bie*".format(self.model_name)):
fn.unlink()
if flags["dynasty_output"]:
for fn in dir_o.glob(base_dump):
shutil.rmtree(str(fn), ignore_errors=True)
if flags["compiler_output"]:
for fn in dir_o.glob("compiler_output_*"):
shutil.rmtree(str(fn), ignore_errors=True)
except (KeyError, TypeError):
self.logger.error("pre clean up not configured. skip ...")
def clean_knerex_output(self):
# TODO
        raise NotImplementedError

def clean_dynasty_output(self, dir_output_list):
try:
config_clean = self.config["post_clean_up"]["dynasty_output"]
clean_only_success = self.config["post_clean_up"]["clean_when_success"]
is_success = self.module_status["general"]["Success"]
do_clean = config_clean and clean_only_success and is_success
        except Exception:
do_clean = False
if do_clean:
            # in some cases skip the clean-up: keep dynasty output for debugging when btm mismatched
if self.config["path"]["internal"]:
k = "btm_dyn_csim"
for hw_mode, status in self.module_status.items():
if k in status and not status[k]:
pp(f"{k} mismatch! skip post-clean dynasty output.") # noqa
return
for dir_o in dir_output_list:
p_o = pathlib.Path(dir_o)
if not p_o.exists():
continue
for dir_dumps in p_o.glob("mode_*"):
shutil.rmtree(str(dir_dumps))