1190 lines
50 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#! /usr/bin/env python3
""" CLI interface for regression
Usage:
run.py [--all-pass] <fn_json> [<keys>...]
run.py (-h | --help)
run.py --version
Options:
--all-pass If all pass, exit with 0, otherwise with 1
-h --help Show this screen.
--version Show version.
"""
import shutil
import os
import time
import tempfile
import multiprocessing
from pathlib import Path
import json
from blinker import signal
from docopt import docopt
import sys_flow_v2.flow_utils as futils
import sys_flow_v2.flow_constants as fconsts
from sys_flow_v2.report_utils import report, generate_snr_reports, df_row2status
from sys_flow_v2.exceptions import RegressionError
from sys_flow_v2.test_case import test_case
from sys_flow_v2.regression import regression
from sys_flow_v2.compiler_v2 import check_input_fmt
from sys_flow_v2.onnx_op_stats import get_ioinfo_onnx
from sys_flow_v2.compiler_config import gen_ip_config
from sys_flow_v2.gen_regression_json import generate_conf
from sys_flow_v2 import mix_bitwidth_utils as mbu
from sys_flow_v2 import mix_bitwidth_utils_v2 as mbu_v2
import snoop
# Debug switches: any non-empty value in the environment variable turns the
# flag on (bool() of the string, identical truthiness to the previous
# `True if ... else False` form).
DEBUG = bool(os.environ.get("REGRESSION_DEBUG"))
MIXBW_DEBUG = bool(os.environ.get("MIXBW_DEBUG"))
snoop.install(enabled=DEBUG)
# Directory containing this script; used below to locate bundled
# template/*.json regression templates.
p_script = Path(__file__).resolve().parent
def rename_directory(source_path, target_path):
    """Move *source_path* to *target_path*, replacing any existing target.

    The target's parent directories are created as needed. A failure during
    the final move is printed rather than raised (callers treat the move as
    best-effort and validate the result afterwards via load/assert).

    Args:
        source_path (str / Path): directory to move; must exist.
        target_path (str / Path): destination; removed first if present.

    Raises:
        FileNotFoundError: if *source_path* does not exist.
    """
    source_path = Path(source_path)
    target_path = Path(target_path)
    if not source_path.exists():
        raise FileNotFoundError(f"The directory {source_path} does not exist.")
    # Replace the target wholesale so no stale files survive the move.
    if target_path.exists():
        shutil.rmtree(target_path)
    target_path.parent.mkdir(parents=True, exist_ok=True)
    try:
        # shutil.move also works across filesystems, unlike os.rename
        # (which fails with EXDEV when source and target are on
        # different mounts).
        shutil.move(str(source_path), str(target_path))
        print(f"Directory renamed from {source_path} to {target_path}")
    except Exception as e:
        print(f"Directory rename failed: {e}")
def run_single_case(ts_w_r):
    """Run one regression test case; used as a multiprocessing worker.

    Errors are returned (not raised) so the main process's callback can fold
    them into the report — returning values is the only way to sync messages
    back from a pool worker. This function must stay at module top level
    (not nested inside another function) so it can be pickled, and it takes
    exactly one packed argument for the same reason.

    Args:
        ts_w_r: a ``(test_case_path, r_config)`` tuple.

    Returns:
        (error, released_files): ``error`` is an exception instance. On
        success it is ``RegressionError("general/Success", model_id)`` so the
        collector handles success and failure through a single result shape.
        ``released_files`` is None when even the cleanup path failed.
    """
    test_case_path, r_config = ts_w_r
    try:
        i_case = test_case(test_case_path, r_config)
        released_files = i_case.run_flow()
        # success!
        return RegressionError("general/Success", i_case.model_id), released_files
    except Exception as e:
        # NOTE: if any submodule failed, it will reach here.
        try:
            # free up first
            i_case.clean_opt()
            released_files = i_case.save_summary()
            # released_files is probably only the model_fx_html / model_fx_json
            i_case.post_clean_up()
            if DEBUG:
                print(f"run_flow failed. Clean up {i_case}")
                print(e)
            return e, released_files
        except Exception:
            # Cleanup itself failed (e.g. test_case() raised before i_case
            # was bound). Still report the original error. Narrowed from a
            # bare `except:` so KeyboardInterrupt/SystemExit can propagate
            # and actually terminate the worker.
            return e, None
def gen_opt_model(
    p_onnx,
    np_txt,
    platform=730,  # choose "520" / "720" / "530" / "630"
    optimize="o0",  # choose from "o0", "o1", "o2"
    limit_input_formats=False,
    datapath_range_method="percentage",
    data_analysis_pct=0.999,  # set to 1.0 if detection model
    data_analysis_16b_pct=0.999999,  # set to 1.0 if detection model
    data_analysis_threads=8,
    datapath_bitwidth_mode="int8",
    weight_bitwidth_mode="int8",
    model_in_bitwidth_mode="int8",
    model_out_bitwidth_mode="int8",
    cpu_node_bitwidth_mode="int8",  # from 0.24.0
    lut_high_accuracy_mode="2",  # from 0.25.0
    percentile=0.001,
    outlier_factor=1.0,
    quantize_mode="default",  # choose from "default", "post_sigmoid"
    quan_config=None,  # let user to set constraints for quantization.
    qat_config=None,  # optional config for qat
    compiler_tiling="default",  # changed from fm_cut, since 0.24.0
    p_output="/data1/kneron_flow",
    weight_bandwidth=None,  # None will use default.
    unlock_size_limit=False,  # set to True if need to use huge onnx file.
    mode=2,  # choose from 0/1/2/3. See document for details.
    snr_max_in_pair=3,  # max number pairs of input for dynasty+snr
    snr_layer_control=None,  # to control how dynsty dump results for snr check. From 0.26.0
    opt_model_json=None,  # if given, will use this json to generate opt model. From 0.26.0
    target_snr=None  # generate fastest fx model and user_config to reach target snr. From 0.26.0
):
    """Search a mixed int8/int16 quantization for a platform-730 model.

    Workflow:
    1. run a output16 and output light
    2. get all conv value_info name via output_16
    3. run single 8bit cases
    4. run all cumulative cases and plot to cumulative snr plot
    5. if target_snr exists:
        generate a new user_config.json and run -> final.opt.onnx

    Intermediate runs are cached under the working directory (KTC_WORKDIR or
    /workspace/.tmp/models) and reloaded when present, so the function can
    resume after an interruption instead of recomputing every case.

    NOTE(review): many keyword arguments mirror gen_fx_model's signature but
    are NOT forwarded by this function (only the keys placed into
    gen_fx_model_params below are used) — presumably kept for interface
    symmetry; confirm before relying on them here.

    Returns:
        p_output: the (possibly env-overridden) output directory.
    """
    # KTC_OUTPUT_DIR overrides p_output only when the caller left the default.
    env_output = os.environ.get("KTC_OUTPUT_DIR")
    if env_output and p_output == "/data1/kneron_flow":
        p_output = env_output
    assert platform == 730, f"Only platform 730 is supported."
    assert optimize == "o0", f"Only optimize o0 is supported."
    # Base parameter set shared by every gen_fx_model run below; individual
    # steps copy it and override only what differs (quan_config, bitwidths).
    gen_fx_model_params = {
        "p_onnx": p_onnx,
        "np_txt": np_txt,
        "platform": platform,
        "optimize": optimize,
        "datapath_bitwidth_mode": "int16",
        "weight_bitwidth_mode": "int16",
        "model_in_bitwidth_mode": "int16",
        "model_out_bitwidth_mode": "int16",
        "cpu_node_bitwidth_mode": "int16",
        "lut_high_accuracy_mode": "2",
        "quan_config": None,
        "p_output": p_output,
        "mode": 55,  # internal snr-analysis mode: knerex + compiler + dynasty + snr
        "snr_max_in_pair": snr_max_in_pair,
        "snr_layer_control": 0  # dump output layer only
    }
    p_onnx = Path(p_onnx)
    m_name = p_onnx.stem
    if m_name.endswith(".origin"):
        m_name = m_name.replace(".origin", "")
    if not p_onnx.exists():
        msg = f"Given onnx {p_onnx} does not exist!"
        # NOTE(review): raised when the file is MISSING — FileNotFoundError
        # would be the accurate type; kept as-is since callers may catch it.
        raise FileExistsError(msg)
    env_workdir = os.environ.get("KTC_WORKDIR")
    p_working = Path(env_workdir or "/workspace/.tmp/models")
    # Per-variant cache directories (doubled name = <case>/<model> layout
    # expected by the regression flow).
    p_model = p_working / m_name / m_name
    p_model_16 = p_working / f"{m_name}_16" / f"{m_name}_16"
    p_model_light = p_working / f"{m_name}_light" / f"{m_name}_light"
    p_model_target = p_working / f"{m_name}_target" / f"{m_name}_target"
    p_onnx_decomposed_16 = p_model_16 /"output"/ "knerex_730" / f"{m_name}.kdp730.decomposed.onnx"
    # Previously-dumped results (step 4's json) let steps be skipped entirely.
    opt_model_data = None
    if opt_model_json is not None:
        with open(opt_model_json, 'r') as file:
            opt_model_data = json.load(file)
    # 1.1 run output 16
    # load from p_working directory, guarantee p_model_16 exists
    success_16, [snr_16, cfunc_time_16] = mbu.load_single_rst(p_model_16)
    if not success_16.value:
        gen_fx_model_params_16 = gen_fx_model_params.copy()
        # run output 16
        gen_fx_model(**gen_fx_model_params_16)
        # gen_fx_model writes to p_model; move the result into the 16b slot.
        rename_directory(p_model, p_model_16)
        success_16, [snr_16, cfunc_time_16] = mbu.load_single_rst(p_model_16)
    assert success_16.value, f"Failed to run output 16 for {m_name}"
    print('=====================================')
    print('success_16', success_16.name)
    print("snr_16", snr_16)
    print("cfunc_time_16", cfunc_time_16)
    print('=====================================')
    # # 1.2 run output mix light
    # Try the saved json first, then the on-disk cache, then a fresh run.
    success_light, [snr_light, cfunc_time_light] = mbu.load_json_rst(opt_model_data, 'mix light')
    if not success_light.value:
        success_light, [snr_light, cfunc_time_light] = mbu.load_single_rst(p_model_light)
        if not success_light.value:
            gen_fx_model_params_light = gen_fx_model_params.copy()
            gen_fx_model_params_light['datapath_bitwidth_mode']='mix light'
            gen_fx_model(**gen_fx_model_params_light)
            rename_directory(p_model, p_model_light)
            success_light, [snr_light, cfunc_time_light] = mbu.load_single_rst(p_model_light)
        assert success_light.value, f"Failed to run output light for {m_name}"
    print('=====================================')
    print('success_light', success_light.name)
    print("snr_light", snr_light)
    print("cfunc_time_light", cfunc_time_light)
    print('=====================================')
    # 2. get all conv value_info name via output_16
    p_single = p_working / f"{m_name}_single"
    conv_nodes = mbu.get_conv_nodes_skip_input(p_onnx_decomposed_16)
    snr_list_single, cfunc_time_list_single = [], []
    print("conv_nodes", [node.name for node in conv_nodes])
    print("conv_nodes ", len(conv_nodes))
    # 3. run single 8bit case
    # Reuse the json results only if they cover every conv node; otherwise
    # re-run each conv's single-node 8-bit case (with on-disk caching).
    success_single, [snr_single, cfunc_time_single] = mbu.load_json_rst(opt_model_data, 'single')
    if success_single.value and len(snr_single) == len(conv_nodes):
        snr_list_single = snr_single
        cfunc_time_list_single = cfunc_time_single
    else:
        # re-run single 8bit case
        for node in conv_nodes:
            # '/' is not filesystem-safe; flatten the node name for the dir.
            node_name = node.name.replace("/", "_")
            user_config_single = mbu.generate_user_config([node.input[0]])
            p_single_model = p_single / node_name
            success_single, [snr, cfunc_time] = mbu.load_single_rst(p_single_model)
            mbu.post_clean(p_single_model)
            if not success_single.value:
                gen_fx_model_params_single = gen_fx_model_params.copy()
                gen_fx_model_params_single['quan_config'] = user_config_single
                gen_fx_model(**gen_fx_model_params_single)
                rename_directory(p_model, p_single_model)
                success_single, [snr, cfunc_time] = mbu.load_single_rst(p_single_model)
                mbu.post_clean(p_single_model)
            assert success_single.value, f"Failed to run output single 8bit for {m_name}"
            snr_list_single.append(snr)
            cfunc_time_list_single.append(cfunc_time)
    print('=====================================')
    print('success_single', success_single.name)
    print("snr_list_single ", snr_list_single)
    print("cfunc_time_list_single ", cfunc_time_list_single)
    print('=====================================')
    # 4 run cumculative 8bit case
    # 4.1 get the dp_name_list in sorted order
    dp_name_list = [node.input[0] for node in conv_nodes]
    sorted_dp_idx_list = mbu.get_sorted_dp_idx_by_snr(dp_name_list, snr_list_single)
    print("sorted_dp_idx", sorted_dp_idx_list)
    p_cum = p_working / f"{m_name}_cum"
    snr_list_cum, cfunc_time_list_cum = [], []
    # step limits the sweep to ~40 cumulative runs regardless of model size.
    length, step = len(dp_name_list), max(len(dp_name_list)//40+1, 1)
    # 4.2 run cumulative 8bit case
    success_cum, [snr_cum, cfunc_time_cum] = mbu.load_json_rst(opt_model_data, 'cumulative')
    if success_cum.value and len(snr_cum) == len(range(0, length, step)):
        snr_list_cum = snr_cum
        cfunc_time_list_cum = cfunc_time_cum
    else:
        for i in range(0, length, step):
            # Each run switches the first `end` nodes (in sorted order) to 8-bit.
            begin, end = 0, min(i+step, length)
            user_config_cum = mbu.generate_user_config_by_idx(dp_name_list, sorted_dp_idx_list, begin, end)
            print(begin, end)
            p_cum_model = p_cum / str(i)
            success_cum, [snr, cfunc_time] = mbu.load_single_rst(p_cum_model)
            mbu.post_clean(p_cum_model)
            if not success_cum.value:
                gen_fx_model_params_cum = gen_fx_model_params.copy()
                gen_fx_model_params_cum['quan_config'] = user_config_cum
                gen_fx_model(**gen_fx_model_params_cum)
                rename_directory(p_model, p_cum_model)
                success_cum, [snr, cfunc_time] = mbu.load_single_rst(p_cum_model)
                mbu.post_clean(p_cum_model)
            assert success_cum.value, f"Failed to run output cumulative 8bit for {m_name}"
            snr_list_cum.append(snr)
            cfunc_time_list_cum.append(cfunc_time)
    print('=====================================')
    print("success_cum", success_cum.name)
    print("snr_list_cum ", snr_list_cum)
    print("cfunc_time_list_cum ", cfunc_time_list_cum)
    print('=====================================')
    # save gen_opt_json
    opt_model_json = mbu.dump_opt_json(snr_list_single, cfunc_time_list_single, snr_list_cum, cfunc_time_list_cum, snr_16, cfunc_time_16, snr_light, cfunc_time_light, sorted_dp_idx_list, m_name, p_working)
    mbu.plot_cum_snr_cfunc_time(opt_model_json, p_working)
    # 5. generate a new user_config.json and run -> target fx model
    if target_snr is not None:
        user_config_target = mbu.generate_user_config_by_target_snr(snr_16, snr_light, snr_list_cum, target_snr, dp_name_list, sorted_dp_idx_list, step)
        # None means no cumulative point reached target_snr — skip the run.
        if user_config_target is not None:
            # run with user_config_tagret_snr
            gen_fx_model_params_target = gen_fx_model_params.copy()
            gen_fx_model_params_target['quan_config'] = user_config_target
            gen_fx_model(**gen_fx_model_params_target)
            rename_directory(p_model, p_model_target)
            success_target, [snr_target, cfunc_time_target] = mbu.load_single_rst(p_model_target)
            assert success_target.value, f"Failed to run output target for {m_name}"
            print('=====================================')
            print('success_target', success_target.name)
            print("snr_target", snr_target)
            print("cfunc_time_target", cfunc_time_target)
            print('=====================================')
    return p_output
def gen_opt_model_v2(
    p_onnx,
    np_txt,
    data_analysis_threads=8,
    weight_bitwidth_mode="int16",
    mixbw_mode='data',  # ['data', 'weight', 'both']
    flops_ratio=0.5,
    p_output="/data1/kneron_flow",
    p_cache="/workspace/.tmp/models",
    clean_cache=False,  # whether to clean cache
    num_of_processors=16,
    snr_max_in_pair=3,  # max number pairs of input for dynasty+snr
):
    """Subgraph-based mixed-bitwidth search (v2, MixBitwidthUtilsFast).

    flops_ratio is the fraction of conv floating-point compute to run at
    15-bit: flops_ratio=0 -> all convs use 8-bit; flops_ratio=1.0 -> all
    convs use 15-bit.

    Pipeline: run a full 16-bit reference model, split it into subgraphs,
    batch-run fl/fx subcases through the regression flow, compute offsets
    and simulate SNR, then generate a user_config and build the final
    target fix-point model.

    Returns:
        (p_output, d_status): output directory and per-module status dict
        augmented with per-step wall-clock timings.
    """
    # Environment variables override the defaults only when the caller did
    # not pass explicit paths.
    env_output = os.environ.get("KTC_OUTPUT_DIR")
    env_workdir = os.environ.get("KTC_WORKDIR")
    if env_output and p_output == "/data1/kneron_flow":
        p_output = env_output
    if env_workdir and p_cache == "/workspace/.tmp/models":
        p_cache = env_workdir
    # Base parameter set for each gen_fx_model run; steps copy and override.
    gen_fx_model_params = {
        "p_onnx": p_onnx,
        "np_txt": np_txt,
        "platform": 730,
        "optimize": "o0",
        "data_analysis_threads": data_analysis_threads,
        "datapath_bitwidth_mode": "int16",
        "weight_bitwidth_mode": weight_bitwidth_mode,
        "model_in_bitwidth_mode": "int16",
        "model_out_bitwidth_mode": "int16",
        "cpu_node_bitwidth_mode": "int16",
        "lut_high_accuracy_mode": "2",
        "quan_config": None,
        "p_output": p_output,
        "p_cache": p_cache,
        "clean_cache": False,  # cache cleanup is handled by this function, at the end
        "mode": 55,  # internal snr-analysis mode
        "snr_max_in_pair": snr_max_in_pair,
        "snr_layer_control": 2  # dump all layers
    }
    p_onnx = Path(p_onnx)
    m_name = p_onnx.stem
    time_start = time.perf_counter()
    if m_name.endswith(".origin"):
        m_name = m_name.replace(".origin", "")
    if not p_onnx.exists():
        msg = f"Given onnx {p_onnx} does not exist!"
        # NOTE(review): FileNotFoundError would be the accurate type here;
        # kept as-is since callers may catch FileExistsError.
        raise FileExistsError(msg)
    # Cache-side layout (under p_cache) and release-side layout (under p_output).
    p_temp = Path(p_cache)
    p_subgraphs = Path(os.path.join(p_temp, f'ng5_subgraphs'))
    p_bm_txt = p_temp / f"bm_mixbw.txt"  # case list consumed by the regression flow
    p_model = p_temp / m_name / m_name
    p_model_16 = p_temp / f"{m_name}_16" / f"{m_name}_16"
    p_model_target = p_temp / f"{m_name}_target" / f"{m_name}_target"
    p_model_data_png = p_subgraphs / "data"
    p_model_weight_png = p_subgraphs / "weight"
    p_model_log = p_subgraphs / "snr_debug.txt"
    p_model_release = p_temp / "release"
    p_output_target = os.path.join(p_output, f"{m_name}_target")
    p_output_log = os.path.join(p_output, f"snr_debug.txt")
    p_output_data_png = os.path.join(p_output, f"data")
    p_output_weight_png = os.path.join(p_output, f"weight")
    # run output 16
    gen_fx_model_params_16 = gen_fx_model_params.copy()
    gen_fx_model_params_16['p_output'] = p_model_release
    gen_fx_model(**gen_fx_model_params_16)
    # gen_fx_model writes to p_model; move the result into the 16b slot.
    rename_directory(p_model, p_model_16)
    xbu_v2 = mbu_v2.MixBitwidthUtilsFast(workspace_dir=p_model_16,
                                         model_dir='.',
                                         subgraph_dir=p_temp,
                                         model_name=m_name,
                                         f_name_16='output',
                                         mixbw_mode=mixbw_mode,
                                         flops_ratio=flops_ratio
                                         )
    # step1: split the 16-bit model into subgraphs (fresh directory each run).
    # NOTE(review): path_subgraphs is unused below (p_subgraphs is already a Path).
    path_subgraphs = Path(p_subgraphs)
    if os.path.exists(p_subgraphs):
        shutil.rmtree(p_subgraphs)
    p_subgraphs.mkdir(parents=True, exist_ok=True)
    xbu_v2.run_subgraph_creation()
    time_step1_end = time.perf_counter()
    print("\n======= run subgraph creation success =======\n")
    # step2: create both variants of every subcase (fl = float input,
    # fx = fixed input).
    xbu_v2.run_sub_cases_creation(float_input=False, pre_clean=True)
    xbu_v2.run_sub_cases_creation(float_input=True, pre_clean=True)
    time_step2_end = time.perf_counter()
    print("\n======= run subcases creation success =======\n")
    # step3: run all subcases through the regression flow in chunks.
    # m_name_dict maps case name -> (model dir, ...); 'fx' suffix marks
    # the fixed-input variants we iterate over.
    m_name_dict = xbu_v2.get_m_name_dict()
    all_fx_m_names = [item for item in m_name_dict.keys() if item.endswith('fx')]
    def _process_parallel_models(config, task_mode):
        # Run the shared regression flow with task-specific module toggles:
        # "regression" runs knerex+compiler, "dynasty" runs dynasty only.
        valid_modes = ("regression", "dynasty")
        if task_mode not in valid_modes:
            raise ValueError(f"Invalid task_mode: {task_mode}. Must be one of {valid_modes}")
        mode_configurations = {
            "regression": {
                "module_run": {
                    "piano_knerex": True,
                    "compiler_piano": True
                },
                "pre_clean_up": {
                    "dynasty_output": False,
                    "all_output": True
                }
            },
            "dynasty": {
                "module_run": {
                    "piano_knerex": False,
                    "compiler_piano": False
                },
                "pre_clean_up": {
                    "dynasty_output": True,
                    "all_output": False
                }
            }
        }
        # Mutates the shared config in place (conf_mixbw is reused per chunk).
        for section, settings in mode_configurations[task_mode].items():
            config[section].update(settings)
        rfs, success_list, df_report, *_ = run_flow(config)
        n_good = len([a for a in success_list if a])
        n_all = len(success_list)
        assert n_good == n_all, f"Successed cases are {n_good}/{n_all}"
        d_status, d_time = df_row2status(df_report)
        return success_list, rfs[0], d_status, d_time
    # Shared regression config; the case list file (p_bm_txt) is rewritten
    # before each flow invocation.
    p_mixbw_template = p_script / "template" / "regression_mixbw.json"
    with open(p_mixbw_template, "r") as f:
        conf_mixbw = json.load(f)
    conf_mixbw["path"]["cases"] = str(p_temp)
    conf_mixbw["path"]["search"] = ["-f", str(p_bm_txt)]
    conf_mixbw["knerex"]["data_analysis_threads"] = data_analysis_threads
    conf_mixbw["dynasty"]["num_input_samples"] = snr_max_in_pair
    conf_mixbw["dynasty"]["n_parallel_model"] = num_of_processors
    def _chunker(seq, size):
        # Yield consecutive slices of `seq` of length `size` (last may be short).
        return (seq[pos:pos + size] for pos in range(0, len(seq), size))
    def _update_regression_case_name(bm_path, m_names):
        # Overwrite the benchmark list file with one case name per line.
        with open(bm_path, 'w') as f:
            for m_name in m_names:
                f.write(f'{m_name}\n')
    def _link_knerex_output(base_m_names):
        # Symlink every fl knerex output file into the fx case (renaming the
        # last 'fl' in the filename to 'fx') so fx dynasty can reuse it.
        for base_m_name in base_m_names:
            fx_m_name, fl_m_name = f'{base_m_name}fx', f'{base_m_name}fl'
            fl_knerex_output = os.path.join(p_subgraphs, fl_m_name, 'output', 'knerex_730')
            fx_knerex_output = os.path.join(p_subgraphs, fx_m_name, 'output', 'knerex_730')
            assert os.path.exists(fl_knerex_output), f'fl_knerex_output {fl_knerex_output} does not exist'
            os.makedirs(fx_knerex_output, exist_ok=True)
            assert os.path.exists(fx_knerex_output), f'fx_knerex_output {fx_knerex_output} does not exist'
            for f_name in os.listdir(fl_knerex_output):
                fl_f_name = os.path.join(fl_knerex_output, f_name)
                # Find the position of the last occurrence of 'fl'
                last_fl_index = f_name.rfind('fl')
                if last_fl_index != -1:
                    replaced_f_name = f_name[:last_fl_index] + 'fx' + f_name[last_fl_index + 2:]
                else:
                    replaced_f_name = f_name
                fx_f_name = os.path.join(fx_knerex_output, replaced_f_name)
                if os.path.exists(fx_f_name):
                    os.remove(fx_f_name)
                os.symlink(fl_f_name, fx_f_name)
    def _move_result(base_m_names):
        # For each result subdir, point the fx case's graphopt output at the
        # fl case's via symlink so it is not duplicated on disk.
        for base_m_name in base_m_names:
            fx_m_name, fl_m_name = f'{base_m_name}fx', f'{base_m_name}fl'
            fx_p_model, fl_p_model = m_name_dict[fx_m_name][0], m_name_dict[fl_m_name][0]
            p_source = os.path.join(fl_p_model, 'output', 'results')
            path_source = Path(p_source)
            for item in path_source.iterdir():
                if not item.is_dir():
                    continue
                # cp mode_730graphopt_piano to ori_p_model
                dir_src = os.path.join(fl_p_model, 'output', 'results', item.name, 'mode_730graphopt_piano')
                dir_dst = os.path.join(fx_p_model, 'output', 'results', item.name, 'mode_730graphopt_piano')
                # replace dir_dst (file/link/dir) with a symlink to dir_src
                if os.path.exists(dir_dst):
                    if os.path.islink(dir_dst):
                        os.remove(dir_dst)
                    elif os.path.isdir(dir_dst):
                        shutil.rmtree(dir_dst)
                os.symlink(dir_src, dir_dst)
    def _regression_snr(fx_m_names):
        # Run dynasty SNR over every result directory of each fx case and
        # write its SNR report.
        for fx_m_name in fx_m_names:
            fx_p_model = m_name_dict[fx_m_name][0]
            p_subgraph_output = os.path.join(fx_p_model, 'output')
            p_subgraph_results = os.path.join(p_subgraph_output, 'results')
            dir_output_list = [f.path for f in os.scandir(p_subgraph_results) if f.is_dir()]
            xbu_v2.run_dynasty_snr(dir_output_list)
            xbu_v2.generate_snr_report(p_subgraph_output)
    def _remove_subgraph_results(base_m_names):
        # Drop the per-chunk result directories to bound disk usage.
        for base_m_name in base_m_names:
            fx_m_name, fl_m_name = f'{base_m_name}fx', f'{base_m_name}fl'
            shutil.rmtree(os.path.join(p_subgraphs, fx_m_name, 'output', 'results'))
            shutil.rmtree(os.path.join(p_subgraphs, fl_m_name, 'output', 'results'))
    # Process subcases num_of_processors at a time to bound parallelism/disk.
    for fx_m_names in _chunker(all_fx_m_names, num_of_processors):
        print("fx_m_names", fx_m_names)
        base_m_names = [item[:-2] for item in fx_m_names]  # strip 'fx' suffix
        fl_m_names = [item + 'fl' for item in base_m_names]
        # 3.1 run fl regression
        _update_regression_case_name(p_bm_txt, fl_m_names)
        success, _, d_status, d_time = _process_parallel_models(conf_mixbw, "regression")
        # 3.2 link fl knerex_output to fx
        _link_knerex_output(base_m_names)
        # 3.3 run fx dynasty& snr
        _update_regression_case_name(p_bm_txt, fx_m_names)
        success, _, d_status, d_time = _process_parallel_models(conf_mixbw, "dynasty")
        # 3.4 move result
        _move_result(base_m_names)
        # 3.5 run snr
        _regression_snr(fx_m_names)
        # 3.6 post-process
        _remove_subgraph_results(base_m_names)
    time_step3_end = time.perf_counter()
    print("\n======= run subcases regression success =======\n")
    # step4
    xbu_v2.run_offset_calculation(dump=False)
    time_step4_end = time.perf_counter()
    print("\n======= run offset calculation success =======\n")
    # step5 run simulation
    xbu_v2.run_snr_simulation(dump_plot=True, dump_diff=False)
    time_step5_end = time.perf_counter()
    print("\n======= run snr simulation success =======\n")
    # step6 run target
    user_config_target = xbu_v2.run_user_config_generation(dump=False)
    time_step6_end = time.perf_counter()
    print("\n======= run user_config generation success =======\n")
    # step7: build the final target model with the generated quantization
    # config; full dynasty dump (snr_layer_control=0 -> output layer only,
    # but with up to 1000 input pairs).
    gen_fx_model_params_target = gen_fx_model_params.copy()
    gen_fx_model_params_target['quan_config'] = user_config_target
    gen_fx_model_params_target['snr_max_in_pair'] = 1000
    gen_fx_model_params_target['snr_layer_control'] = 0
    success, _, d_status, d_time = gen_fx_model(**gen_fx_model_params_target)
    rename_directory(p_model, p_model_target)
    time_step7_end = time.perf_counter()
    # release png, snr_debug.txt and target to p_output if MIXBW_DEBUG
    if MIXBW_DEBUG:
        def safe_copytree(src, dst):
            # copytree that tolerates a missing src and replaces an existing dst.
            if os.path.exists(dst):
                shutil.rmtree(dst)
            if os.path.exists(src):
                shutil.copytree(src, dst)
        def safe_copy(src, dst):
            # single-file copy with the same missing-src / existing-dst tolerance.
            if os.path.exists(dst):
                os.remove(dst)
            if os.path.exists(src):
                shutil.copy(src, dst)
        # Copy directories
        for src, dst in [
            (p_model_data_png, p_output_data_png),
            (p_model_weight_png, p_output_weight_png),
        ]:
            safe_copytree(src, dst)
        # Copy file
        safe_copy(p_model_log, p_output_log)
    # post-clean
    if clean_cache:
        time.sleep(10)  # waiting for test_case() to finish
        shutil.rmtree(p_temp)
    time_end = time.perf_counter()
    # Per-step timings merged into the status dict returned to the caller.
    # NOTE(review): key typos ("calculatio", "gneration") are left untouched —
    # they are runtime strings that downstream reporting may match on.
    release_time_dict = {
        'mixbw/subgraph creation:t': time_step1_end - time_start,
        'mixbw/subcases creation:t': time_step2_end - time_step1_end,
        'mixbw/subcases regression total:t': time_step3_end - time_step2_end,
        'mixbw/offset calculatio:t': time_step4_end - time_step3_end,
        'mixbw/snr simulation:t': time_step5_end - time_step4_end,
        'mixbw/user config gneration:t': time_step6_end - time_step5_end,
        'mixbw/target total:t': time_step7_end - time_step6_end,
        'mixbw/post process:t': time_end - time_step7_end,
        'mixbw/total:t': time_end - time_start
    }
    d_status.update(release_time_dict)
    return p_output, d_status
def gen_fx_model(
p_onnx,
np_txt,
platform, # choose "520" / "720" / "530" / "630"
optimize="o0", # choose from "o0", "o1", "o2"
limit_input_formats=False,
input_fmt=None, # 可以是None由编译器决定、字符串或字典类型。字符串表示统一格式字典用于指定每个输入节点的格式
datapath_range_method="percentage",
data_analysis_pct=0.999, # set to 1.0 if detection model
data_analysis_16b_pct=0.999999, # set to 1.0 if detection model
data_analysis_threads=8,
datapath_bitwidth_mode="int8",
weight_bitwidth_mode="int8",
model_in_bitwidth_mode="int8",
model_out_bitwidth_mode="int8",
cpu_node_bitwidth_mode="int8", # from 0.24.0
lut_high_accuracy_mode="2", # from 0.25.0
per_channel_radix=1,
percentile=0.001,
outlier_factor=1.0,
quantize_mode="default", # choose from "default", "post_sigmoid"
quan_config=None, # let user to set constraints for quantization.
qat_config=None, # optional config for qat
compiler_tiling="default", # changed from fm_cut, From 0.24.0
p_output="/data1/kneron_flow", # where to put result
p_cache="/workspace/.tmp/models", # where to put temp folder
clean_cache=False, # whether to clean cache
weight_bandwidth=None, # None will use default.
dma_bandwidth=None, # None will use default.
unlock_size_limit=False, # set to True if need to use huge onnx file.
mode=2, # choose from 0/1/2/3. See document for details.
snr_max_in_pair=1000, # max number pairs of input for dynasty+snr. From 0.26.0
snr_layer_control=None, # to control how dynsty dump results for snr check. From 0.26.0
need_copy_prepare_model=True
):
"""Generate fix-point model for kneron NPUs.
Entrypoint for toolchain. Suppose only 1 model per flow run.
Args:
p_onnx (pathlib / str): path to onnx file. it should have passed through onnx2onnx.py.
np_txt (dict): a dictionary of list of images in numpy format.
The keys are the names of input nodes of model.
e.g., `{"input1": [img1, img2]}`, here img1/img2 are two images -> preprocess -> numpy 3D array (HWC)
if set to None, will run ip evaluator only, ignore knerenx+dynasty+compiler+csim
platform:
- "520"
- "530"
- "540"
- "630"
- "720"
- "730"
- "1140"
mode:
- 0: run ip_evaluator only.
- 1: run knerex (for quantization) + compiler only.
- 2: run knerex + compiler + dynasty + csim + bit-true-match check.
dynasty will inference only 1 image and only check quantization accuracy of output layers.
- 3: run knerex + compiler + dynasty + csim + bit-true-match check + SNR calcuation.
dynasty will inference all images (or up to snr_max_in_pair) and dump results of all layers.
It will provide most detailed analysis but will take much longer time.
- 55: run knerex + compiler + dynasty + snr. For internal snr analysis.
- 56: run dynasty + snr.
snr_max_in_pair: max number of input to run dynasty+snr. Default is 1000. Change to smaller number to decrease time.
only valid for mode 3, 55, 56. from 0.26.0
snr_layer_control: dynasty will dump results for snr check. New from 0.26.0
- None: Each mode had a preset value for dump level. Will use the preset value if snr_layer_control is None. User can specify below to override (in some modes).
- 0: only dump output layer.
- 1: only dump cpu layer and output layer.
- 2: dump all layers.
optimize: choose "o0" / "o1" / "o2"
- "o0": the knerex generated quantization model.
- "o1": bias adjust parallel, without fm cut improve
- "o2": bias adjust parallel, with fm cut improve
- "o3": bias adjust sequential, no fm cut improve. SLOW! Not recommended.
- "o4": bias adjust sequential, w fm cut improve. SLOW! Not recommended.
limit_input_formats: Default False. If set to True, will force all
input nodes to have only one hardware format.
If a input node is connected to multiple computational nodes,
compiler may set different formats for each connection by default.
input_fmt: Default None.
- None: 由编译器自动决定输入格式
- str: 字符串类型,指定所有输入节点使用统一格式
- dict: 字典类型,用于分别指定每个输入节点的格式
可用格式请参考 `compiler_v2.get_support_formats(hw_mode)`
datapath_range_method:
- "percentage"
- "mmse"
data_analysis_pct: It is used to exclude extreme values for int8 mode.
The default setting is 0.999. It means 0.1% of absolute maximum value
will be removed among all data. set to 1.0 if detection model.
(Appliable when datapath_range_method set to "percentage").
data_analysis_16b_pct: It is used to exclude extreme values for int16 mode.
The default setting is `0.999999`. It means `0.0001%` of absolute
maximum value will be removed among all data.
set to `1.0` if `detection` model.
(Appliable when datapath_range_method set to "percentage").
data_analysis_threads: how many threads to use for data analsysis for
quantization. Default value is 8. Increase if more cpu cores / memory available.
datapath_bitwidth_mode:
- "int8", default value. (and only choice for `520`)
- "int16".
- "mix balance". A combined bitwidth of int8 and int16, with a preference for int16.
- "mix light". A combined bitwidth of int8 and int16, with a preference for int8.
weight_bitwidth_mode:
- "int8", default value. (and only choice for `520`)
- "int16".
- "int4". (not supported in `520`/`720`)
- "mix balance". A combined bitwidth of int8 and int16, with a preference for int16.
- "mix light". A combined bitwidth of int8 and int16, with a preference for int8.
model_in_bitwidth_mode:
- "int8", default value.
- "int16". (not supported in `520`).
model_out_bitwidth_mode:
- "int8", default value.
- "int16". (not supported in `520`).
cpu_node_bitwidth_mode:
- "int8", default value.
- "int16". (not supported in `520`).
lut_high_accuracy_mode:
- "0": only use one look-up table for Exp/Log.
- "1": use multiple LUTs to support high accuracy and a wide input range in Exp/Log with a 15-bit bitwidth.
- "2": (default) support Exp/Log for higher accuracy, but restrict the input andd output range.
per_channel_radix: default 1. 0 for per layer radix, 1 for per channel radix (better accuracy). Change to 0 for debug. From 0.26.0
percentile: default value 0.001. Appliable when datapath_range_method set to "mmse".
Increase this parameter will increase the search range for optimized range.
outlier_factor: default 1.0. Appliable when datapath_range_method set to "mmse".
Increase this parameter will give weight on outliers so the final range will increased. Vice vesa.
quantize_mode:
- "default": no extra tuning.
- "post_sigmoid": If a model's output nodes were ALL sigmoids and had been removed, choose "post_sigmoid" for better performance.
quan_config: Default: `None`. User can pass in a dictionary to
set constraints for quantization.
compiler_tiling: methods to search for best feature map cut. choose from:
- "default" (default)
- "deep_search" (slow when calling this function, but will improve inference speed when deployed on NPU.)
- "partial_graph_search" (search runtime optimization based on partial graph comparison. Less performance than deep_search) Available from 0.29.0.
p_output: where to save the generated fix models. Default: "/data1/kneron_flow",
weight_bandwidth: set the weight bandwidth. Set to `None` to use default value.
dma_bandwidth: set the dma bandwidth. Set to `None` to use default value.
unlock_size_limit:
- False (default), will raise exceptions if onnx is larger than 3G.
- True. the limitation of origin.onnx is 100G.
Returns:
- success: bool, whether the fix model generation is successful. The caller function should check this value before following actions.
- released: dict of released files. including the html report.
- d_status: dict of module status on each submodule.
- d_time: dict of time on each submodule.
"""
# check platforms
assert platform in fconsts.MODE_HW_LIMIT["inc_in_toolchain"]
# working directory
# NOTE: p_working must be same as specified in template/regression_tc.json/path/cases
env_workdir = os.environ.get("KTC_WORKDIR")
if env_workdir and p_cache == "/workspace/.tmp/models":
p_cache = env_workdir
p_working = Path(p_cache)
p_working.mkdir(parents=True, exist_ok=True)
# prepare working_model_folder
env_output = os.environ.get("KTC_OUTPUT_DIR")
if env_output and p_output == "/data1/kneron_flow":
p_output = env_output
p_export = Path(p_output)
p_export.mkdir(parents=True, exist_ok=True)
p_onnx = Path(p_onnx)
if not p_onnx.exists():
msg = f"Given onnx {p_onnx} does not exist!"
raise FileExistsError(msg)
m_name = futils.remove_appendix(futils.clean_file_name(p_onnx.name))
if DEBUG:
print(f"given onnx: {p_onnx.name}, use cleanup model name: {m_name}")
# check input shapes
if mode > 0:
# no need check npy if ip-eval only
assert np_txt is not None, f"mode {mode} need a valid np_input."
# Make sure input npy is dict
np_txt = futils.load_np_in(np_txt)
futils.verify_input_shape_onnx_npy(p_onnx, np_txt)
platform = int(platform) # platform must be like 520/720/... type: integers
opt_map = {
"o0": "scaled", # no bias adjust, no fmcut
"o1": "wqbi-p", # bias adjust parallel, no fmcut
"o2": "wqbi-p", # bias adjust parallel, w fmcut
"o3": "wqbi-s", # bias adjust sequential, no fmcut. slow. don't use.
"o4": "wqbi-s", # bias adjust sequential, w fmcut. slow. don't use.
}
if optimize not in opt_map:
msg = f"""Given optimize ({optimize}) not in {list(opt_map.keys())}. """
raise ValueError(msg)
# to keep same interface
user_config = quantize_mode
p_template = p_script / "template" / "regression_tc.json"
with open(p_template, "r") as f:
template = json.load(f)
template["path"]["cases"] = str(p_working)
# verify knerex parameters
# choose from mmse / percentage
valid_dp_range = ["percentage", "mmse"]
if datapath_range_method not in valid_dp_range:
raise ValueError(f"datapath_range_method should be {valid_dp_range}. But got: {datapath_range_method}")
# Percentage to keep data: 0.999 (default), 1.0 (Keep all data, e.g., for detection model)
# verify: 0.9 <= data_analysis_pct <= data_analysis_16b_pct <= 1.0
if not 0.9 <= data_analysis_pct <= 1.0:
raise ValueError(f"data_analysis_pct shoud be between 0.9 and 1.0. But got: {data_analysis_pct}")
if not 0.9 <= data_analysis_16b_pct <= 1.0:
raise ValueError(f"data_analysis_16b_pct shoud be between 0.9 and 1.0. But got: {data_analysis_16b_pct}")
if data_analysis_pct > data_analysis_16b_pct:
raise ValueError(f"data_analysis_pct should be less than or equal to data_analysis_16b_pct. But got: {data_analysis_pct} > {data_analysis_16b_pct}")
if not 0 <= percentile <= 0.2:
raise ValueError(f"percentile must be between 0 and 0.2. But got: {percentile}")
if (datapath_range_method == "percentage") and (percentile > 0):
# print(f"WARNING: using '{datapath_range_method}' datapath analysis. Percetile reset to 0.")
percentile = 0
if outlier_factor <= 0:
raise ValueError(f"outlier_factor must > 0. But got: {outlier_factor}")
# verify compiler parameters
valid_tiling = ["default", "deep_search", "partial_graph_search"]
if compiler_tiling not in valid_tiling:
raise ValueError(f"compiler_tiling should be in {valid_tiling}. But got {compiler_tiling}")
# possible override
if platform == 520:
# no compiler_tiling for 520
compiler_tiling = "default"
if optimize in ["o2", "o4"]:
# force to use deep_search
compiler_tiling = "deep_search"
# verify input_fmt
# did not check input node number here.
check_input_fmt(input_fmt, platform)
# create config for whole flow.
try:
conf_reg, _ = generate_conf(template,
platform,
optimize=opt_map[optimize],
mode=mode,
snr_max_in_pair=snr_max_in_pair,
snr_layer_control=snr_layer_control,
limit_input_formats=limit_input_formats,
input_fmt=input_fmt,
dp_bw=datapath_bitwidth_mode,
wt_bw=weight_bitwidth_mode,
in_bw=model_in_bitwidth_mode,
out_bw=model_out_bitwidth_mode,
cpu_bw=cpu_node_bitwidth_mode,
lut_high_accuracy_mode=lut_high_accuracy_mode,
per_channel_radix=per_channel_radix,
datapath_range_method=datapath_range_method,
data_analysis_pct=data_analysis_pct,
data_analysis_16b_pct=data_analysis_16b_pct,
data_analysis_threads=data_analysis_threads,
percentile=percentile,
outlier_factor=outlier_factor,
fm_cut=compiler_tiling
)
except Exception as e:
# probably bad configuration
pp(e)
raise ValueError("Wrong configuration for ktc.analysis().")
# save conf_reg to disk for debug. it will not be used in gen_fx_model later.
p_json = p_working / "regression_config.json"
futils.dict2json(conf_reg, p_json)
def update_config_ip_val(weight_bandwidth, dma_bandwidth, platform):
    """Write the IP-evaluator config JSON for *platform* into the script res dir.

    Overrides the ip_evaluator settings shipped with the toolchain;
    s1.json will be derived from this file when necessary.
    NOTE: if multiple platforms run at the same time, a single
    dma_bandwidth / weight_bandwidth setting may not be accurate.
    """
    config = gen_ip_config(platform, weight_bandwidth, dma_bandwidth)
    out_dir = os.environ.get("KTC_SCRIPT_RES", "/workspace/scripts/res")
    os.makedirs(out_dir, exist_ok=True)
    out_file = os.path.join(out_dir, f"ip_config_{platform}.json")
    futils.dict2json(config, out_file)
update_config_ip_val(weight_bandwidth, dma_bandwidth, platform)
# prepare model folder
def copy_onnx(p_onnx, p_to):
    """Copy an onnx file and, when present, its external-weight sidecar.

    The sidecar is expected next to the source as "<name>_data" and is
    written next to the destination as "<dest name>_data".
    """
    shutil.copy(p_onnx, p_to)
    sidecar = p_onnx.parent / f"{p_onnx.name}_data"
    if not sidecar.exists():
        return
    dst_sidecar = p_to.parent / f"{p_to.name}_data"
    if DEBUG:
        print(f"Found {sidecar}, copy to {dst_sidecar}")
    shutil.copy(sidecar, dst_sidecar)
def prepare_model(p_user_config=None, quan_config=None, qat_config=None, hw_mode=platform):
    """Lay out the model structure on disk: onnx / input txt / config jsons.

    Returns the model directory. The model-name convention requires a
    "cat/model" two-level folder; using m_name/m_name restricts the flow
    to a single category that contains a single model.
    """
    p_model = p_working / m_name / m_name
    if p_model.exists():
        shutil.rmtree(str(p_model))
    p_input = p_model / "input"
    p_input.mkdir(parents=True, exist_ok=False)
    # copy the onnx under its canonical ".origin" name
    dst_onnx = p_input / f"{m_name}.origin.onnx"
    copy_onnx(p_onnx, dst_onnx)
    # read the copied onnx for its input node names, then dump npy inputs as txt
    input_names, output_names, opset = get_ioinfo_onnx(str(dst_onnx))
    futils.npy2txt(np_txt, input_names, p_input)
    # user_config.json applies constraints for better performance
    if (p_user_config is not None) and p_user_config.exists():
        shutil.copy(p_user_config, p_input / "user_config.json")
    elif quan_config is not None:
        # BUG: need to merge with existing json (e.g., p_user_config from quantize_mode).
        futils.dict2json(quan_config, p_input / "user_config.json")
    # save qat_xxx_config.json
    if qat_config:
        futils.dict2json(qat_config, p_input / f"qat_{hw_mode}_config.json")
    return p_model
def prepare_model_only_ip_eval():
    """Minimal model layout for the ip-evaluator-only mode.

    Returns the model directory; no input txt files are generated.
    """
    # "cat/model" convention: m_name/m_name limits the flow to one model
    p_model = p_working / m_name / m_name
    if p_model.exists():
        shutil.rmtree(str(p_model))
    p_input = p_model / "input"  # the flow discovers models via this folder
    (p_input / "knerex_input").mkdir(parents=True, exist_ok=False)
    # TODO: only_ip_eval needs to support both onnx and bie formats,
    # but model discovery keys on the ".origin" file name.
    dst = p_input / f"{m_name}.origin{p_onnx.suffix}"
    # NOTE: only .onnx can have a *_data sidecar; bie embeds its data.
    copy_onnx(p_onnx, dst)
    return p_model
def run_ip_evaluator_only():
    """Mode 0: run the ip evaluator only.

    Returns a 3-tuple (success, released_files, df_report);
    (False, None, None) when anything in the flow raises.
    """
    try:
        prepare_model_only_ip_eval()
        rfs, ok_list, df_report, _, _ = run_flow(conf_reg, [m_name])
        # exactly one model in this mode
        result = (ok_list[0], rfs[0], df_report)
    except Exception:
        result = (False, None, None)
    return result
def copy_release_file(fn_to_release: dict, p_export):
    """Copy each release artifact into *p_export*.

    *fn_to_release* maps a key to a source Path; the returned dict maps
    the same keys to the destination Paths. An empty or None mapping
    yields {}. Symlinks are copied as symlinks (follow_symlinks=False).
    """
    released = {}
    for key, src in (fn_to_release or {}).items():
        dst = p_export / src.name
        shutil.copy(src, dst, follow_symlinks=False)
        released[key] = dst
    return released
def run_btm_and_release(need_copy_prepare_model):
    """Modes 1/2/3: generate fix-point models and collect release files.

    Returns (success, released_files, df_report) for the single model.
    TODO: init the model in the given folder and run regression there;
    currently we create it in a temp folder then copy to the given folder.
    """
    # check user_config
    assert user_config in ["default", "post_sigmoid"]
    known_configs = {
        "post_sigmoid": p_script / "template" / "user_config_post_sigmoid.json",
    }
    p_user_config = known_configs.get(user_config)
    if need_copy_prepare_model:
        prepare_model(p_user_config, quan_config, qat_config)
    # the model layout is ready; generate the fx models
    rfs, ok_list, df_report, _, _ = run_flow(conf_reg, [m_name])
    # TODO(release_time_report): dump a time report built from btm_report
    # (a dataframe) via df_only_time + df2pkl into rfs[0]["time_report"].
    # only one model
    return ok_list[0], rfs[0], df_report
# force to have same output
try:
if mode == 0:
success, fn_to_release, df_report = run_ip_evaluator_only()
else:
success, fn_to_release, df_report = run_btm_and_release(need_copy_prepare_model)
d_status, d_time = df_row2status(df_report)
except Exception as e:
pp(e)
success, fn_to_release, df_report, d_status, d_time = False, {}, None, {}, {}
released = copy_release_file(fn_to_release, p_export)
if clean_cache:
time.sleep(10) # waiting for test_case() to finish
shutil.rmtree(p_working)
return success, released, d_status, d_time
def run_flow(fn_json, keywords=None):
    """Core function for kneron regression flow.

    1. init regression config
    2. run regression on each model, using multi-processing if applicable
    3. generate compiled report on btm and snr

    Args:
        fn_json: configuration handed to regression(); callers in this
            file pass either a json path (__main__) or an already-built
            config dict (conf_reg) — regression() appears to accept both.
        keywords: optional keywords used to filter test cases; zero
            matches is fatal (process exits with status 1).

    Returns a 5-tuple:
        - released_files: per-model released-file dicts (None for a
          failed case in the sequential path)
        - success_list: per-model bool success flags
        - btm_report: dataframe of module status for each model
        - btm_summary: summary companion of btm_report
        - snr_reports: dict — key is platform (e.g. 520, 720) if turned
          on in this regression; value is a dataframe with snr of
          output nodes for each model.
    """
    r = regression(fn_json)
    time_start = time.perf_counter()
    selected_case = r.filter_cases(keywords)
    logger = futils.create_logger("flow", None, r.config["regression"]["logging_level"])
    # this object is to record status/timestamp of all models along the whole regression;
    # run_single_case reports back through these blinker signals
    rep = report()
    signal("time_sender").connect(rep.receive_time_usage)
    signal("data_sender").connect(rep.receive_info)
    if len(selected_case) == 0:
        logger.critical("Error: found 0 test case matching keywords ({}). ".format(keywords))
        exit(1)
    logger.info("total models are: {}".format(len(selected_case)))
    n_parallel_model = r.config["dynasty"]["n_parallel_model"]
    is_customer = not r.config["path"]["internal"]
    # TODO: this condition may be wrong
    is_big_model = any(["big_model" in str(test_case_path) for test_case_path in selected_case])
    # sequential customer / big-model runs compile their report after each model
    print_each_model = n_parallel_model == 1 and (is_customer or is_big_model)
    if n_parallel_model > 1:
        p = multiprocessing.Pool(n_parallel_model)
        ts_w_c = [(sc, r.config) for sc in selected_case]
        # NOTE: the run_single_case must be serializable. it should be on top level, not local function
        w = p.map_async(run_single_case, ts_w_c)  # , callback=rep.add_err_record
        w.wait()
        # collect reports and released files; each pool result is a
        # (error_record, released_files) pair
        success_list = []
        for e in [a[0] for a in w.get()]:
            rep.add_err_record([e])
            success_list.append(fconsts.is_success(e))
        released_files = [a[1] for a in w.get()]
        p.close()
    else:
        # only 1 model at 1 time
        # usually SNR regression & toolchain will be in this setting.
        released_files = []
        success_list = []
        for one_case in selected_case:
            e, rel_fn = run_single_case((one_case, r.config))
            # if run_single_case failed, rel_fn will be None
            released_files.append(rel_fn)
            rep.add_err_record([e])
            success_list.append(fconsts.is_success(e))
            if print_each_model:
                btm_report, btm_summary = rep.compile(r.report_csv)
                snr_reports = generate_snr_reports(r, rep, selected_case)
    # this run is finished.
    time_end = time.perf_counter()
    time_used_m = max(int((time_end - time_start) / 60), 1)
    r.commit_info.append(f"Duration for this run: {time_used_m} minutes\n")
    r.write_info()
    # generate reports for whole regression. not for only 1 test case.
    if not print_each_model:
        # final print of results. skip if print already.
        # compile report on errors
        btm_report, btm_summary = rep.compile(r.report_csv)
        # compile report on snr when piano_dynasty run
        snr_reports = generate_snr_reports(r, rep, selected_case)
    return released_files, success_list, btm_report, btm_summary, snr_reports
def check_reg_success_by_keys(d):
    """Quick check that a model flow executed successfully.

    Success means at least one released key ends with "/bie"; a failed
    run still releases report.html / report.json, so their presence
    alone does not count. MAYBE: use len(d) > 2 instead.
    """
    return any(key.endswith("/bie") for key in d)
if __name__ == "__main__":
    # CLI entry point: parse arguments, run the regression flow, and
    # print a short summary. Exit status matters for CI (see --all-pass).
    arguments = docopt(__doc__, version="run regression 1.2")
    # check commit folder
    fn_json = Path(arguments["<fn_json>"])
    if not fn_json.exists():
        print(f"Given config file: {fn_json} does not exist. quit...")
        exit(1)
    keywords = arguments["<keys>"]
    released_files, success_list, btm_report, btm_summary, snr_reports = run_flow(fn_json, keywords)
    n_good = len([a for a in success_list if a])
    n_all = len(success_list)
    print(f"Succeeded cases are {n_good}/{n_all} for {fn_json.name}.")
    # visual separator; the original `f"" * 140` multiplied an empty
    # string and printed nothing — a separator character was lost.
    print("=" * 140 + "\n\n\n")
    # check all cases success or not. needed in CI.
    if arguments["--all-pass"]:
        if not all(success_list):
            # non-zero status so CI marks the run as failed
            exit(99)
        # otherwise will always return 0 even if regression failed.
        exit(0)