#! /usr/bin/env python3
import os
import pathlib
import struct
import tempfile
import pickle
from collections import OrderedDict, defaultdict

import numpy as np

# 4 threads is best in tests; reduces runtime to ~1/3.
os.environ["OMP_NUM_THREADS"] = "4"

from sys_flow_v2.kneron_round import kneron_round_array
import sys_flow_v2.flow_constants as fconsts
import sys_flow_v2.flow_utils as futils
import sys_flow_v2.compiler_v2 as compiler

DEBUG = True if os.environ.get("REGRESSION_DEBUG", False) else False
import snoop
snoop.install(enabled=DEBUG)


###################################################################################
# setup binary
###################################################################################
p_dynasty_so = pathlib.Path(fconsts.BIN_SET["dynasty"]["lib.so"])
ENV_DYNASTY_LIB = f"""export LD_LIBRARY_PATH="{p_dynasty_so.parent}:$LD_LIBRARY_PATH" """
BIN_DC = fconsts.BIN_SET["data_converter"]["v2"]


###################################################################################
# convert input to bin file
###################################################################################
def array_fl2fx(np_arr, bitwidth, radix, scales=None, i_dim_is_ch=1, round_process="array"):
    """Convert (1 input) numpy float array to integer.

    .. parameters::
        - round_process (from 0.27.0):
            - "array": send in flattened array, loop in cython with kneron_round.
              0.54s for 12M points.
            - "no_btm": use np to do the int32 convert. DON'T use this: not
              bit-true-match, for speed checks only. 0.082s for 12M points
              (0.05s if no convert).

    .. seealso::
        - `dynasty clamp `_
        - `dynasty quantization `_
        - `kneron round `_
        - 15bit input: https://redmine.kneron.tw/issues/18689
    """
    # TODO: apply per-channel-scale to this_np with scales + i_dim_is_ch
    # size should match
    # get per-channel radix
    n_dim = np_arr.shape[i_dim_is_ch]
    radix = futils.expand_array(radix, n_dim).astype('float32')
    if scales is None:
        scales = 1.0
    scales = futils.expand_array(scales, n_dim)
    # radix / scales must be lists of n_dim size

    # NOTE: use 16B16R for bitwidth == 15 to comply with firmware
    #   https://redmine.kneron.tw/issues/18689
    if bitwidth == 15:
        bitwidth = 15 + 1
        radix = radix + 1

    # clamp boundaries
    v_max = (1 << (bitwidth - 1)) - 1
    v_min = -(1 << (bitwidth - 1))

    # expand per-channel values to a matrix
    scale_expand_dim = list(range(len(np_arr.shape)))
    scale_expand_dim.remove(i_dim_is_ch)
    scales *= 2 ** radix  # in-place
    scl_matx = np.expand_dims(scales, scale_expand_dim)
    np_arr *= scl_matx  # scale in-place
    np.clip(np_arr, v_min, v_max, out=np_arr)  # clip in-place
    if round_process == "array":
        # this should be fast
        return kneron_round_array(np_arr.ravel(), bitwidth)
    elif round_process == "no_btm":
        # this is fastest! NOT calling kneron_round
        return np_arr.flatten().astype(np.int32)
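

# A minimal illustrative sketch (not used by the flow): the fixed-point mapping that
# array_fl2fx applies element-wise, written out for a single value with hypothetical
# numbers.  A float x is scaled by scale * 2**radix, clamped to the signed range of
# the bitwidth, then rounded (the real flow uses kneron_round_array for bit-true-match).
def _example_fl2fx_scalar(x=1.23, bitwidth=8, radix=5, scale=1.0):
    v_max = (1 << (bitwidth - 1)) - 1   # 127 for 8 bit
    v_min = -(1 << (bitwidth - 1))      # -128 for 8 bit
    scaled = x * scale * (2 ** radix)   # 1.23 * 1.0 * 32 = 39.36
    clamped = min(max(scaled, v_min), v_max)
    return int(round(clamped))          # 39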


def bw2fmt(bitwidth):
    """Get struct format character for this bitwidth."""
    # NOTE: use struct: https://docs.python.org/3/library/struct.html
    if bitwidth <= 8:
        return "b"  # 8bit
    elif bitwidth <= 16:
        return "h"  # 16bit
    elif bitwidth <= 32:
        return "i"  # 32bit
    else:
        return "q"


def fx2bin(arr_fx, fn_bin, bitwidth):
    """Write fx data to binary sequentially."""
    # TODELETE
    # # NOTE: use struct: https://docs.python.org/3/library/struct.html
    # fmt = bw2fmt(bitwidth)
    # with open(fn_bin, "wb") as f:
    #     f.write(struct.pack(f"<{len(arr_fx)}{fmt}", *arr_fx))
    dtype_map = {8: np.int8, 16: np.int16, 32: np.int32}
    try:
        dtype = dtype_map[bitwidth]
    except KeyError:
        raise ValueError(f"Unsupported bitwidth: {bitwidth}")
    # must be C order so memory is contiguous
    arr_fx = np.asarray(arr_fx, dtype=dtype, order="C")
    arr_fx.tofile(fn_bin)


def idx2onnx_order(info_in, n_sqt_bin):
    """Guess onnx input name order."""
    # NOTE: each pair of sqt_bin has the same length as the onnx input nodes, say 2: A, B.
    #   the input number in ioinfo.json may be larger, say 5: A1, A2, B1, B2, B3.
    #   the compiler guarantees that A-related nodes go first, then B-related nodes.
    compiler_give_names = [a["name"] for a in info_in]
    if len(compiler_give_names) == n_sqt_bin:
        # most cases, just 1:1
        return ({name: i for i, name in enumerate(compiler_give_names)},
                {i: name for i, name in enumerate(compiler_give_names)})

    # now each sequential.bin may convert to multiple dram.bin
    # TODO: verify compiler_give_names is contiguous (same name stays together)
    name_order = {}
    o_order2dp = {}
    for name in compiler_give_names:
        if name not in name_order:
            n = len(name_order)
            name_order[name] = n
            o_order2dp[n] = name
    assert len(name_order) == n_sqt_bin
    return name_order, o_order2dp


def txt2bin_seq(in_pairs, ioinfo_in, p_out):
    """Convert fl text to sequential bin files.

    For compatibility with regression.
    """
    # load float txt into np
    np_in = defaultdict(list)
    # NOTE:
    #   in_pair(s) is specified by onnx, it may be [a.txt, b.txt, c.txt]
    #   the compiler may specify dp_in to be [A, A, B, B, C];
    #   the same node may appear multiple times
    # TODO: we assume the same input node always has the same quantization
    _, o_order2dp = idx2onnx_order(ioinfo_in, len(in_pairs[0]))
    for i_pair, in_pair in enumerate(in_pairs):
        # independent between pairs
        for i_txt, p_txt in enumerate(in_pair):
            dp_name = o_order2dp[i_txt]
            # use the first shape info given by compiler
            shape = [a["onnx_shape"] for a in ioinfo_in if a["name"] == dp_name][0]
            # load txt into numpy array
            np_in[dp_name].append(futils.txt2np_fl(p_txt, shape))
    bin_pair = np2bin_seq(np_in, ioinfo_in, p_out)
    # there should be only 1 element
    return bin_pair
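

# A minimal illustrative sketch (not used by the flow): how idx2onnx_order collapses
# duplicated compiler dp names into an order map.  The node names here are hypothetical.
def _example_idx2onnx_order():
    info_in = [{"name": n} for n in ["A", "A", "B", "B", "C"]]
    name_order, o_order2dp = idx2onnx_order(info_in, n_sqt_bin=3)
    # name_order == {"A": 0, "B": 1, "C": 2}
    # o_order2dp == {0: "A", 1: "B", 2: "C"}
    return name_order, o_order2dp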


def txt2bin_rgba(in_pairs: list, info_in: list, p_out):
    """Convert fl text to rgba bin files for 520.

    The info_in is from knerex json.

    Args:
        in_pairs (list): list of list, inner list is a pair of inputs.
        info_in (list): quantization info per input node.
        p_out (pathlib.Path or str): where to save converted .bin files.

    Special process for 520 csim input:

    1. minimal 4 channels.
    2. columns (w) should be a multiple of 16.
    3. channel last.
    """
    # quick check on inputs.
    l1 = len(in_pairs[0])
    l2 = len(info_in)
    assert l1 == l2, f"in_pairs len: {l1}, info_in len: {l2}"

    # Load float txt into np
    bin_pair = [[] for a in in_pairs[0]]
    for i_pair, in_pair in enumerate(in_pairs):
        for i_dp, p_txt in enumerate(in_pair):
            # NOTE: 520 only supports single input models (except stc.)
            #   so i_dp is always 0
            shape = info_in[i_dp]["onnx_shape"]
            bw = info_in[i_dp]["bitw"]
            radix = info_in[i_dp]["radix"]
            scale = info_in[i_dp]["scale"]

            # load input
            if isinstance(p_txt, (str, pathlib.Path)):
                fl = futils.txt2np_fl(p_txt, shape)
            elif isinstance(p_txt, np.ndarray):
                fl = p_txt
            else:
                raise NotImplementedError()

            # pad to >= 4 dimensions
            n_dim = len(fl.shape)
            if n_dim < 4:
                fl = np.expand_dims(fl, list(range(4 - n_dim)))
                n_dim = len(fl.shape)
            # assert n_dim == 4  # NOTE: will fail when n_dim > 4

            # channel last
            fl = np.moveaxis(fl, 1, 3)

            # pad channel to 4, columns to 16x
            d_padding = [[0, 0] for i in range(n_dim)]
            do_pad = False
            if fl.shape[-1] < 4:
                # pad channel to 4
                n_pad = 4 - fl.shape[-1]
                d_padding[-1][-1] = n_pad
                arr_pad = np.zeros(n_pad)  # zero is fine.
                radix = np.append(radix, arr_pad)
                scale = np.append(scale, arr_pad)
                do_pad = True
            if fl.shape[-2] % 16 != 0:
                # pad columns
                d_padding[-2][-1] = 16 - fl.shape[-2] % 16
                do_pad = True
            if do_pad:
                fl = np.pad(fl, d_padding, mode="constant", constant_values=0)

            # quantization (for the first and only input)
            np_fx = array_fl2fx(fl, bw, radix, scale, n_dim - 1)

            # save to bin
            p_bin = p_out / f"csim_p{i_pair:06}_i{i_pair:03}_rgba.bin"
            fx2bin(np_fx, p_bin, bw)
            bin_pair[i_dp].append(p_bin)

    # bin_pair is [[in1_pair1, in1_pair2], [in2_pair1, in2_pair2]]
    return bin_pair


def np2bin_seq(np_in, ioinfo_in, p_out=None, round_process="array"):
    """Convert numpy arrays to sequential bin files."""
    # TODO: internal process assumes np_in as list of list
    # load np_in if not yet loaded
    if isinstance(np_in, (str, pathlib.Path)):
        with open(np_in, "rb") as f:
            np_in = pickle.load(f)

    # NOTE: ioinfo_in may have duplicated dp names: [A, A, B, B, C]
    #   we assume the same DP has the SAME quantization.
    #   use ioinfo_uni for now
    def keep_unique(l):
        uni = set()
        results = []
        for a in l:
            if a["name"] not in uni:
                uni.add(a["name"])
                results.append(a)
        return results

    ioinfo_uni = keep_unique(ioinfo_in)

    # unpack data
    dp_in = [a["name"] for a in ioinfo_uni]
    bitwidth = [a["bitw"] for a in ioinfo_uni]
    radix = [a["radix"] for a in ioinfo_uni]
    scales = [a["scale"] for a in ioinfo_uni]
    i_dim_is_ch = [a["ch_dim"] for a in ioinfo_uni]
    # shape not used. maybe to check with np_in array shape?

    # sanity check
    assert set(np_in.keys()) == set(
        dp_in
    ), f"dp_in \"{dp_in}\" should be same as np_in \"{list(np_in.keys())}\""
    temp = [len(v) for k, v in np_in.items()]
    assert len(set(temp)) == 1, f"np lists should have same length, but got {temp}"
    N_pairs = temp[0]

    if p_out is None:
        p_out = tempfile.mkdtemp(prefix="csim_")
    p_out = pathlib.Path(p_out)
    p_out.mkdir(exist_ok=True, parents=True)

    # TODO: save the radix / scale quantization info
    # np_in_pairs = [[np_in[k1][n] for k1 in dp_in] for n in range(N_pairs)]
    bin_in_pairs = OrderedDict()
    for i_pair in range(N_pairs):
        pair = []
        for i_dp, dp_name in enumerate(dp_in):
            p_bin = p_out / f"seq_p{i_pair:06}_node{i_dp:03}.bin"
            pair.append(p_bin)
            this_np = np_in[dp_name][i_pair]
            # channel dimension is not in onnx shape, need to expand
            if i_dim_is_ch[i_dp] == -1:
                this_np = np.expand_dims(this_np, 1)
                # i_dim_is_ch[i_dp] = 0
            # the real work is done here
            np_fx = array_fl2fx(this_np, bitwidth[i_dp], radix[i_dp], scales[i_dp],
                                i_dim_is_ch[i_dp], round_process=round_process)
            fx2bin(np_fx, p_bin, bitwidth[i_dp])
        # NOTE: the pair_name includes the input pair order.
        #   will use this to return inference results
        bin_in_pairs[i_pair] = pair
    return bin_in_pairs
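

# A minimal illustrative sketch (not used by the flow): the inputs np2bin_seq expects,
# with hypothetical node names and quantization values.  np_in maps each input node
# name to a list of float arrays (one per input pair); each ioinfo_in entry carries
# the keys unpacked above ("name", "bitw", "radix", "scale", "ch_dim").
def _example_np2bin_seq(p_out="/tmp/example_seq"):
    np_in = {"input_0": [np.random.rand(1, 3, 32, 32).astype("float32")]}
    ioinfo_in = [{"name": "input_0", "bitw": 8, "radix": 5, "scale": 1.0, "ch_dim": 1}]
    # returns an OrderedDict like {0: [<p_out>/seq_p000000_node000.bin]}
    return np2bin_seq(np_in, ioinfo_in, p_out)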


def data_convert_1(bin_seq, bin_rgba, fmt_in: str, fmt_out: str,
                   stride_in: list, stride_out: list, shape: list, ndim: int):
    """Convert one sequential.bin to rgba.bin."""
    # TODO: what if fmt_out is RAW_FLOAT / RAW_8 / RAW_16?
    #   seems dynasty dumped it previously
    s_shape = ",".join(str(a) for a in shape)
    s_stride_in = ",".join(str(a) for a in stride_in)
    s_stride_out = ",".join(str(a) for a in stride_out)
    # always use --nhwc 1 for ng1 730
    # --is_ng1 is same as --nhwc
    cmd = f"""{BIN_DC} --fin {bin_seq} --fout {bin_rgba} --dim {ndim} --shape {s_shape} --fmt_in {fmt_in} --fmt_out {fmt_out} --stride_in {s_stride_in} --stride_out {s_stride_out} --is_ng1 1"""
    return cmd


def data_convert(sqt_bin_list: dict, ioinfo_in: list, p_out=None, n_thread=4):
    """Convert list of sequential.bin to csim_input.bin, based on compiler info.

    * NOT for 520
    """
    # prepare the output folder
    if p_out is None:
        p_out = tempfile.mkdtemp(prefix="dc_")
    p_out = pathlib.Path(p_out)
    p_out.mkdir(exist_ok=True, parents=True)

    def bw2raw_fmt(bw):
        # the in.bin in regression is always raw format.
        if bw == 8:
            return "FMT_RAW8B"
        elif bw in [15, 16]:
            # https://redmine.kneron.tw/issues/18706
            return "FMT_RAW_NPU16B"
        else:
            raise NotImplementedError(f"unsupported bitwidth: {bw}.")

    def convert_inproc_fmt(fmt1):
        """For inproc formats, convert before sending to the data converter.

        Reference:
        - https://redmine.kneron.tw/issues/23754
        - compiler_v2/get_support_formats() function.
        """
        if fmt1 == "HW1C8B":
            return "RAW8B"
        elif fmt1 == "HW4C8B_DROP_A":
            return "4C8B_DROP_A"
        elif fmt1 == "HW4C8B_KEEP_A":
            return "4C8B_DROP_A"
        elif fmt1 == "HW1C16B_LE":
            return "RAW16B"
        elif fmt1 == "HW1C16B_BE":
            return "RAW16B_BE"
        else:
            return fmt1

    # ordered dictionary
    n_sqt_bin = len(sqt_bin_list[0])
    name2o_ord, _ = idx2onnx_order(ioinfo_in, n_sqt_bin)

    cmds = []  # save the commands for debug
    list_bin_csim = defaultdict(list)
    for i_in, i_info in enumerate(ioinfo_in):
        shape = i_info["shape"]
        ndim = len(shape)
        # stride in / out are the same, per Tommy.
        # stride_in = ioinfo_in[i_in]["stride"]
        # stride_out = ioinfo_in[i_in]["stride"]
        # TEMP: use -1 for now.
        stride_in = [-1]
        # stride_aligned in .kne.no_binary.json but will convert to stride
        stride_out = i_info["stride"]
        fmt_in = bw2raw_fmt(i_info["bitw"])
        fmt_out = "FMT_{}".format(convert_inproc_fmt(i_info["data_format"]))
        sqt_idx = name2o_ord[i_info["name"]]
        for i_pair, bin_pair in sqt_bin_list.items():
            p_sqtl_bin = bin_pair[sqt_idx]
            p_csim_bin = p_out / f"csim_p{i_pair:06}_i{i_in:03}.bin"
            list_bin_csim[i_pair].append(p_csim_bin)
            cmd1 = data_convert_1(p_sqtl_bin, p_csim_bin, fmt_in, fmt_out,
                                  stride_in, stride_out, shape, ndim)
            cmds.append(cmd1)

    # save cmds then run
    fn_cmd = p_out / "data_convert.sh"
    with open(fn_cmd, "w") as f:
        f.writelines([f"{cmd}\n" for cmd in cmds])
    if n_thread is None:
        n_str = ""
    else:
        n_str = f"--jobs {n_thread}"
    command = f"parallel {n_str} --halt now,fail=1 < {fn_cmd}"
    cp = futils.run_bash_script(command)
    if cp.returncode != 0:
        raise RuntimeError(f"data converter failed with {cp.returncode}")
    return list_bin_csim, cmds
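

# A minimal illustrative sketch (not used by the flow): the kind of single-line command
# data_convert_1 builds for one input.  The file names, strides, and shape are hypothetical.
def _example_data_convert_cmd():
    cmd = data_convert_1(
        bin_seq="seq_p000000_node000.bin",
        bin_rgba="csim_p000000_i000.bin",
        fmt_in="FMT_RAW8B",
        fmt_out="FMT_4C8B_DROP_A",
        stride_in=[-1],
        stride_out=[1, 4, 256, 16384],
        shape=[1, 3, 64, 64],
        ndim=4,
    )
    # cmd starts with the configured data converter binary (BIN_DC) followed by:
    #   --fin seq_p000000_node000.bin --fout csim_p000000_i000.bin --dim 4
    #   --shape 1,3,64,64 --fmt_in FMT_RAW8B --fmt_out FMT_4C8B_DROP_A
    #   --stride_in -1 --stride_out 1,4,256,16384 --is_ng1 1
    return cmd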
""" csim_settings = {} # convert to relative path for fn_key, fn_full in cmpl_map.items(): csim_settings[fn_key] = p_cmpl_rel / fn_full.name # fill none for unused if hw_mode in fconsts.MODE_HW_LIMIT["nef_v2"]: csim_settings["command_bin"] = "None" csim_settings["weight_bin"] = "None" csim_settings["setup_bin"] = "None" csim_settings["apb_npu"] = "None" # in regression / unpack_nef, the kne will have have only 1 model. (it is possible to have multiple) # csim_settings["model_index_in_kne"] = 0 else: csim_settings["kne"] = "None" # TODO: make this flow_constants # input_location: 0 for nmem, 1 for dram csim_settings["input_location"] = 1 if hw_mode in fconsts.MODE_HW_LIMIT["input_in_dram"] else 0 csim_settings["input_bin"] = ",".join(str(a) for a in bin_pair) csim_settings["dump_core_opt"] = dump_core_opt # NOTE: if golden_text available, csim own regression may use it for quick check if golden_txt is None: csim_settings["golden_txt"] = "NONE" else: # convert relative path csim_settings["golden_txt"] = ",".join(str(a) for a in golden_txt) # prepare dynasty golden # golden_txt_fns = self.io_nodes[("btm_dynasty_golden_txt_fn", hw_mode)] # p_dynasty_golden = [p_dynasty_dump / fn for fn in golden_txt_fns] # # csim use relative path to output folder # rel_fns = ["{}/{}".format(p_dynasty_dump_relative, fn) for fn in golden_txt_fns] # csim_settings["golden_txt"] = ",".join(rel_fns) return csim_settings def gen_csim_ini(bin_pair, p_compiler, hw_mode, template, fn_ini, dump_core_opt=0, golden_txts=None): """Generate a ini for csim calling per input pair.""" if hw_mode == 520: # 520 csim doesn't use .ini it's just for regression flow usage return # check folder p_in = bin_pair[0].parent # relative to the ini file. p_ini = fn_ini.parent p_in_rel = futils.relative_path(p_in, p_ini) bin_pair_rel = [p_in_rel / a.name for a in bin_pair] p_comp_rel = futils.relative_path(p_compiler, p_ini) compiler_map = compiler.locate_compiler_dump(p_compiler, hw_mode) if golden_txts is None: p_golden_txt = None else: p_golden = golden_txts[0].parent p_golden_rel = futils.relative_path(p_golden, p_ini) p_golden_txt = [p_golden_rel / a.name for a in golden_txts] csim_settings = gen_csim_settings(bin_pair_rel, compiler_map, p_comp_rel, hw_mode, dump_core_opt=dump_core_opt, golden_txt=p_golden_txt) # render template and save to run_csim.ini output = template.render(model=csim_settings) with open(fn_ini, "w") as f: f.write(output) assert pathlib.Path(fn_ini).exists(), f"failed to create {fn_ini}" return fn_ini def run_csim(list_csim: dict, bin_csim, sh_run_csim=None, n_thread=None, dry_run=False, timeout=3600*6): """Run csim inference on given input. NOTE: we need bin_csim to pass in as it is platform-dependant so we cannot grab from fconsts.BIN_SET directly without platform info. """ cmds = [] for i_csim, (p_out, ini_csim) in list_csim.items(): # prepare folders p_out.mkdir(mode=0o770, parents=True, exist_ok=True) cmd1 = f"{ENV_DYNASTY_LIB}; pushd {p_out} && {bin_csim} {ini_csim}" cmds.append(cmd1) if sh_run_csim is None: r = np.random.randint(10000) sh_run_csim = f"/tmp/run_csim_{r:05}.sh" with open(sh_run_csim, "w") as f: f.write("\n".join(cmds)) if n_thread is None: n_str = "" else: n_str = f"--jobs {n_thread}" command = f"parallel {n_str} --halt now,fail=1 < {sh_run_csim}" if dry_run: cp = None # placeholder else: cp = futils.run_bash_script(command, timeout=timeout) return command, cp def txt2np(out_node_list, out_nodes_shape, output_list, is_520=False): """Convert the csim dumped results to np. 


def txt2np(out_node_list, out_nodes_shape, output_list, is_520=False):
    """Convert the csim dumped results to np.

    - csim will only dump fx data; extra info is needed to convert back to float.
    - csim dumps files as `dma2seq_0.seq`

    Args:
        out_node_list (list): list of output node names. As csim dumps multiple
            outputs in the same order, this list will be used as keys for the output dict.
        out_nodes_shape (dict): shapes for all output nodes, keyed by node name.
            The integer text will be reshaped to the given shape in numpy.
        output_list: where to find csim dump text files.
            basically `list(p_dump.glob(f"*.txt/csim_{hw_mode}"))`.
        is_520 (bool): different name pattern for 520 csim dump.

    Returns:
        dict of list of numpy arrays. Each array is an integer datatype with the onnx shape.
    """
    collect_txt_fx = defaultdict(list)
    for i_dp, dp_out in enumerate(out_node_list):
        shape_out = out_nodes_shape[dp_out]
        if is_520:
            fx_output = f"node_{i_dp:04d}_final_output.txt"
        else:
            # NOTE: .seq is only 8/15bit, no 16bit
            fx_output = f"dma2seq_{i_dp}.seq"
        for p_dump in output_list:
            p_fx = pathlib.Path(p_dump) / fx_output
            if not p_fx.exists():
                raise FileNotFoundError(f"missing {p_fx}")
            collect_txt_fx[dp_out].append(futils.txt2np_fx(p_fx, shape_out))
    return collect_txt_fx
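

# A minimal illustrative sketch (not used by the flow): reading csim dumps back with
# txt2np, using hypothetical node names, shapes, and dump folders.  Each dump folder
# is expected to contain dma2seq_<i>.seq (or node_<i>_final_output.txt for 520).
def _example_txt2np():
    out_node_list = ["output_0"]
    out_nodes_shape = {"output_0": [1, 10]}
    output_list = [pathlib.Path("/tmp/csim_out/pair_000")]
    return txt2np(out_node_list, out_nodes_shape, output_list, is_520=False)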