/*
 * Kneron Model API Manager
 *
 * Copyright (C) 2019 Kneron, Inc. All rights reserved.
 *
 */

#include <stdlib.h>
#include <string.h>

#include "project.h"
#include "base.h"
#include "kdrv_ipc.h"    /* for NCPU triggering */
#include "kdrv_clock.h"  /* for kdrv_delay_us() */
#include "kdev_flash.h"
#include "kmdw_ipc.h"
#include "kmdw_model.h"
#include "kmdw_console.h"  /* for dbg_msg */
#include "kmdw_memxfer.h"  /* for flash access */
#include "kmdw_memory.h"
#include "kmdw_utils_crc.h"

#define DEBUG 0

#define OUTPUT_MEM_ADDR2_SIZE 0x100000  /* 1MB, for DME parallel buffer */
#define OUTPUT_MEM_ADDR3_SIZE 0x5000    /* for MBSSD anchor data */

#define FLAG_KMDW_MODEL_ABORT     BIT(29) // Event flag to notify abort
#define FLAG_KMDW_MODEL_FROM_NCPU BIT(30) // Event flag to know NCPU is done
#define FLAG_KMDW_MODEL_FROM_NPU  BIT(28) // Event flag to know NPU is done

#define MODEL_INF_TIMEOUT (2000)  // timeout in milliseconds for waiting for the ncpu response

#define KDP_FLASH_FW_INFO_SIZE 0x1000

#ifdef EMBED_CMP_NPU
/* the following is for specific dense model wt/cmd mem modification */
/*================================================*/
#define WT_DATA_SIZE_BYTE 272

#define CONF_QUEUE 0
#define GETW_QUEUE 3

#define CONF_GETW0_CMD_OFFSET      0x0038
#define CONF_WDMA0_DST0_CMD_OFFSET 0x00f0

#define ACL_NPU_GETW0      0x2e
#define ACH_NPU_GETW0      0x2f
#define ACL_NPU_WDMA0_DST0 0x36
#define ACH_NPU_WDMA0_DST0 0x37

#define MASK_2  0x0003
#define MASK_10 0x03FF
#define MASK_16 0x00FFFF

#define VAL_ACL(x) (((x) & 0xffff))
#define VAL_ACH(x) (((x) >> 16) & 0xffff)

#define SetBitsVal(tgt, val, mask, offset)  \
    ((tgt) &= ~((mask) << (offset)));       \
    ((tgt) |= (((val) & (mask)) << (offset)))
/*================================================*/
#endif // EMBED_CMP_NPU

extern const struct s_kdp_memxfer kdp_memxfer_module;

/* Types of operations */
enum {
    NODE_TYPE_IN,
    NODE_TYPE_CPU,
    NODE_TYPE_OUTPUT,
    NODE_TYPE_DATA,
    NODE_TYPE_SUPER,
    NODE_TYPE_INPUT
};

/* Structures of data nodes */
struct super_node_s {
    uint32_t node_id;
    uint32_t addr;
    uint32_t row_start;
    uint32_t col_start;
    uint32_t ch_start;
    uint32_t row_length;
    uint32_t col_length;
    uint32_t ch_length;
};

struct data_node_s {
    uint32_t node_id;
    uint32_t supernum;
    uint32_t data_format;
    uint32_t data_radix;
    uint32_t data_scale;
    uint32_t row_start;
    uint32_t col_start;
    uint32_t ch_start;
    uint32_t row_length;
    uint32_t col_length;
    uint32_t ch_length;
    struct super_node_s node_list[1];
};

/* Structure of an input operation */
struct in_node_s {
    uint32_t node_id;
    uint32_t next_npu;
};

/* Structure of an output operation */
struct out_node_s {
    uint32_t node_id;
    uint32_t supernum;
    uint32_t data_format;
    uint32_t row_start;
    uint32_t col_start;
    uint32_t ch_start;
    uint32_t row_length;
    uint32_t col_length;
    uint32_t ch_length;
    uint32_t output_index;
    uint32_t output_radix;
    uint32_t output_scale;
    struct super_node_s node_list[1];
};

/* Structure of a CPU operation */
struct cpu_node_s {
    uint32_t node_id;
    uint32_t input_datanode_num;
    uint32_t op_type;
    /* There will be more parameters here for the cpu operation */
    uint32_t in_num_row;
    uint32_t in_num_col;
    uint32_t in_num_ch;
    uint32_t out_num_row;
    uint32_t out_num_col;
    uint32_t out_num_ch;
    uint32_t h_pad;
    uint32_t w_pad;
    uint32_t kernel_h;
    uint32_t kernel_w;
    uint32_t stride_h;
    uint32_t stride_w;
    struct data_node_s output_datanode;
    struct data_node_s input_datanode[1];
};

/* Structure of the CNN header in setup.bin */
struct cnn_header_s {
    uint32_t crc;
    uint32_t version;
    uint32_t key_offset;
    uint32_t model_type;
    uint32_t app_type;
    uint32_t dram_start;
    uint32_t dram_size;
    uint32_t input_row;
    uint32_t input_col;
    uint32_t input_channel;
    uint32_t cmd_start;
    uint32_t cmd_size;
    uint32_t weight_start;
    uint32_t weight_size;
    uint32_t input_start;
    uint32_t input_size;
    uint32_t input_radix;
    uint32_t output_nums;
};
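/*
 * Layout note (illustrative): setup.bin stores a cnn_header_s followed by a
 * stream of variable-length node records. data_node_s, out_node_s and
 * cpu_node_s end with placeholder members (node_list[1], input_datanode[1]),
 * so when the parser in kmdw_model_get_output_tensor_info() walks the stream
 * it subtracts those embedded placeholders; each super node then appears in
 * the stream as its own record. A minimal sketch of the effective record
 * sizes, assuming this layout (not part of the build):
 */
#if 0
static size_t node_record_size(uint32_t node_id)
{
    switch (node_id) {
    case NODE_TYPE_IN:     return sizeof(struct in_node_s);
    case NODE_TYPE_CPU:    return sizeof(struct cpu_node_s) - 2 * sizeof(struct data_node_s);
    case NODE_TYPE_OUTPUT: return sizeof(struct out_node_s) - sizeof(struct super_node_s);
    case NODE_TYPE_DATA:   return sizeof(struct data_node_s) - sizeof(struct super_node_s);
    case NODE_TYPE_SUPER:  return sizeof(struct super_node_s);
    default:               return 0; /* unknown node type */
    }
}
#endif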
typedef struct {
    uint32_t n_model_source;  // 0: not set, 1: from flash, 2: from ddr
    uint32_t n_model_count;   // model count
    struct kdp_model_s p_model_info[KMDW_MODEL_MAX_MODEL_COUNT];   // model info generated by the compiler
    uint8_t pn_is_model_loaded_table[KMDW_MODEL_MAX_MODEL_COUNT];  // flag table indicating whether each model is loaded
    uint32_t n_ddr_addr_model_end;  // DDR address of model end = user data start
    int32_t n_model_slot_index;     // scpu_to_ncpu->model_slot_index
} kmdw_model_data_t;

static kmdw_model_data_t s_model_data = {0};

typedef struct {
    int32_t raw_img_idx;
    osEventFlagsId_t evt_caller;  // event to know/control ncpu
    uint32_t caller_e;
    osEventFlagsId_t evt_result;  // event to know/control npu
    uint32_t result_e;
} kmdw_img_data_t;

// ptr to the buffer for fw info, uploaded from the host or read from flash
static kmdw_model_fw_info_t *s_fw_info_buf_p = NULL;
// true once fw info has been read from flash into s_fw_info_buf_p
static bool s_model_loaded_from_flash = false;

static kmdw_img_data_t s_img_data[IPC_IMAGE_ACTIVE_MAX] = {0};
static int32_t s_current_ipc_idx = 0;
static int32_t s_next_ipc_idx = 0;

static bool ModelFromDDR = false;  // model source: false = flash, true = DDR

/* ############################
 * ##    Static Functions    ##
 * ############################ */

/**
 * @brief init ddr space for s_fw_info_buf_p
 */
static void _init_fw_info_buf(void)
{
    if (NULL == s_fw_info_buf_p) {
        s_fw_info_buf_p = (kmdw_model_fw_info_t *)kmdw_ddr_reserve(KDP_FLASH_FW_INFO_SIZE);
        if (NULL == s_fw_info_buf_p)
            critical_msg("insufficient memory for reading fw_info from flash\n");
    }
}

#ifdef EMBED_CMP_NPU
static inline void udt_conf_cmd(void *cmd_addr, int reg_idx, uint16_t val16b, int queue)
{
    uint32_t val = 0x80000000;

    SetBitsVal(val, queue, MASK_2, 26);
    SetBitsVal(val, reg_idx, MASK_10, 16);
    SetBitsVal(val, val16b, MASK_16, 0);
    memcpy(cmd_addr, &val, sizeof(val));
}

static void udt_npu_model_mem(uint32_t wt_addr, uint32_t out_addr, void *cmd_addr)
{
    uint32_t *dst = (uint32_t *)((char *)cmd_addr + CONF_GETW0_CMD_OFFSET);

    udt_conf_cmd(dst, ACL_NPU_GETW0, VAL_ACL(wt_addr), GETW_QUEUE);
    udt_conf_cmd(dst + 1, ACH_NPU_GETW0, VAL_ACH(wt_addr), GETW_QUEUE);

    dst = (uint32_t *)((char *)cmd_addr + CONF_WDMA0_DST0_CMD_OFFSET);
    udt_conf_cmd(dst, ACL_NPU_WDMA0_DST0, VAL_ACL(out_addr), CONF_QUEUE);
    udt_conf_cmd(dst + 1, ACH_NPU_WDMA0_DST0, VAL_ACH(out_addr), CONF_QUEUE);
}
#endif // EMBED_CMP_NPU

/**
 * @brief load fw info from flash
 * @return ptr to the fw info buffer; NULL means failed
 */
static kmdw_model_fw_info_t *_load_flash_model_info(void)
{
    // load fw info from flash once and reuse the loaded data, until reload
    if (false == s_model_loaded_from_flash) {
        s_model_loaded_from_flash = true;
        kdp_memxfer_module.flash_to_ddr((uint32_t)s_fw_info_buf_p,
                                        FLASH_MODEL_FW_INFO_ADDR,
                                        KDP_FLASH_FW_INFO_SIZE);
    }

    return s_fw_info_buf_p;
}

/**
 * @brief reset s_model_data
 */
static void _reset_model_data(void)
{
    s_model_data.n_model_count = 0;
    s_model_data.n_model_source = 0;
    memset(s_model_data.p_model_info, 0, sizeof(s_model_data.p_model_info));
    memset(s_model_data.pn_is_model_loaded_table, 0, sizeof(s_model_data.pn_is_model_loaded_table));
    // can't reset the following variable, which maintains the DDR boundary for models:
    // n_last_model_space_end_addr
    return;
}
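/*
 * Command-word sketch (illustrative): udt_conf_cmd() packs one 32-bit NPU
 * configuration command as
 *
 *   bit 31      : fixed marker (0x80000000)
 *   bits 27..26 : queue index            (MASK_2  << 26)
 *   bits 25..16 : config register index  (MASK_10 << 16)
 *   bits 15..0  : 16-bit register value  (MASK_16 << 0)
 *
 * The field meanings are inferred from the masks/offsets used above. The
 * same packing written out without the SetBitsVal macro (not part of the
 * build):
 */
#if 0
static uint32_t pack_conf_cmd(uint32_t queue, uint32_t reg_idx, uint16_t val16b)
{
    return 0x80000000u |
           ((queue   & MASK_2)  << 26) |
           ((reg_idx & MASK_10) << 16) |
           ((uint32_t)val16b & MASK_16);
}
#endif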
/**
 * @brief check flash readiness with timeout_ms
 * @param timeout_ms timeout in ms
 * @return time in ms until the flash was ready
 *         -1 means the timeout was hit
 */
//static int32_t _flash_wait_ready(int timeout_ms)
//{
//    kdev_flash_status_t flash_status;
//    int i;
//
//    for (i = 0; i < timeout_ms; i++) {
//        flash_status = kdev_flash_get_status();
//        if (flash_status.busy == 0)
//            break;
//        kdrv_delay_us(1 * 1000);
//    }
//    if (i == timeout_ms)
//        i = -1;  // we have timed out
//
//    return i;
//}

/**
 * @brief convert a model type to an index into the model info array
 * @param model_type_p: model type (defined in model_type.h)
 * @return model info index (starts from 0)
 *         -1 means no such model type in flash
 */
static int8_t _get_model_info_array_index_by_model_type(uint32_t model_type_p)
{
    int i;

    for (i = 0; i < s_model_data.n_model_count; i++) {
        if (s_model_data.p_model_info[i].model_type == model_type_p)
            return i;
    }

    return -1;
}

/**
 * @brief get fw info extension data from a fw_info ptr
 * @param[in] fw_info_p the ptr to fw_info
 * @return the ptr to fw_info_ext
 */
static kmdw_model_fw_info_ext_t *_get_fw_info_ext_by_fw_info(kmdw_model_fw_info_t *fw_info_p)
{
    if (NULL == fw_info_p) {
        return NULL;
    } else {
        kmdw_model_fw_info_ext_t *ret = NULL;
        uint32_t count;
        uint32_t offset;

        count = fw_info_p->model_count;
        offset = sizeof(struct kdp_model_s) * count;
        ret = (kmdw_model_fw_info_ext_t *)((uint32_t)fw_info_p->models + offset);

        return ret;
    }
}
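/*
 * Layout sketch (illustrative): the fw_info blob holds a header with
 * model_count, a packed array of model_count kdp_model_s records, and then
 * the extension block, so _get_fw_info_ext_by_fw_info() finds the extension
 * right past the last model record:
 *
 *   +--------------------+  <- fw_info_p
 *   | ..., model_count   |
 *   +--------------------+  <- fw_info_p->models
 *   | kdp_model_s [0]    |
 *   | ...                |
 *   | kdp_model_s [N-1]  |
 *   +--------------------+  <- ext = models + N * sizeof(struct kdp_model_s)
 *   | ext: end address,  |
 *   |      checksum, ... |
 *   +--------------------+
 */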
/**
 * @brief load model information generated by the compiler
 * @param [in] from_ddr: true if the model comes from ddr (host command)
 * @param [in] reload: force a reload
 * @return model count
 *         0 means no model was loaded in this call
 */
static int32_t _load_model_info(bool from_ddr, bool reload)
{
    if (s_model_data.n_model_count && !reload) {
        return s_model_data.n_model_count;
    }

    if (reload) {
        _reset_model_data();
        s_model_loaded_from_flash = false;
    }

    kmdw_model_fw_info_t *model_info_p = NULL;
    kmdw_model_fw_info_ext_t *model_info2_p = NULL;

    // load model info
    if (from_ddr) {
        model_info_p = s_fw_info_buf_p;
        model_info2_p = _get_fw_info_ext_by_fw_info(model_info_p);

        if ((NULL == model_info_p) || (NULL == model_info2_p)) {
            s_model_data.n_model_count = 0;
            return 0;
        }

        // Use the version number for the new fw_info structure.
        // The model number is in use for dynamic model execution (DME).
        //if (*(uint32_t*)(base_addr + 8) == 0) {
        //    return 0; //error, model_info is not ready
        //}

        // get model count
        s_model_data.n_model_count = model_info_p->model_count;
        dbg_msg("[DBG] model info: model count:%d\n", s_model_data.n_model_count);

        if (0 == s_model_data.n_model_count) {
            info_msg("[info] model is not in DDR!!\n");
            return 0;
        } else if (s_model_data.n_model_count > KMDW_MODEL_MAX_MODEL_COUNT) {
            info_msg("[ERR] model count is over the MAX limit=%d!!\n", KMDW_MODEL_MAX_MODEL_COUNT);
            s_model_data.n_model_count = 0;
            return 0;
        } else {
            dbg_msg("[DBG] model info: model count:%d\n", s_model_data.n_model_count);
        }

        // get model info
        memcpy(s_model_data.p_model_info, (const void *)model_info_p->models,
               sizeof(struct kdp_model_s) * s_model_data.n_model_count);

        // get ddr model end addr
        s_model_data.n_ddr_addr_model_end = model_info2_p->model_dram_addr_end;
        if (s_model_data.n_ddr_addr_model_end >= kmdw_ddr_get_heap_tail()) {
            err_msg("modelInfo: DDR end address: 0x%x over (>=) boundary 0x%x\n",
                    s_model_data.n_ddr_addr_model_end, kmdw_ddr_get_heap_tail());
            return 0;
        } else {
            dbg_msg("modelInfo: DDR end address: 0x%x\n", s_model_data.n_ddr_addr_model_end);
        }

        // set model source
        s_model_data.n_model_source = 2;  // from ddr
    } else {
        // models are stored in flash
        model_info_p = _load_flash_model_info();  // this function updates data in s_fw_info_buf_p
        model_info2_p = _get_fw_info_ext_by_fw_info(model_info_p);

        if ((NULL == model_info_p) || (NULL == model_info2_p)) {
            s_model_data.n_model_count = 0;
            return 0;
        }

        // get model count
        s_model_data.n_model_count = model_info_p->model_count;
        dbg_msg("[DBG] model info: model count:%d\n", s_model_data.n_model_count);

        if (s_model_data.n_model_count == 0xFFFFFFFF) {
            err_msg("[info] model is not in flash!!\n");
            s_model_data.n_model_count = 0;
            return 0;
        } else if (s_model_data.n_model_count > KMDW_MODEL_MAX_MODEL_COUNT) {
            info_msg("[ERR] model count is over the MAX limit=%d!!\n", KMDW_MODEL_MAX_MODEL_COUNT);
            s_model_data.n_model_count = 0;
            return 0;
        } else {
            dbg_msg("[DBG] model info: model count:%d\n", s_model_data.n_model_count);
        }

        // get model info
        //FIXME, why need to clone to s_fw_info_buf_p
        //memcpy(s_fw_info_buf_p, (void *)model_info_p, KDP_FLASH_FW_INFO_SIZE);
        memcpy(s_model_data.p_model_info, model_info_p->models,
               sizeof(struct kdp_model_s) * s_model_data.n_model_count);

        // get ddr model end addr
        s_model_data.n_ddr_addr_model_end = model_info2_p->model_dram_addr_end;
        if (s_model_data.n_ddr_addr_model_end >= kmdw_ddr_get_heap_tail()) {
            err_msg("modelInfo: DDR end address: 0x%x over (>=) boundary 0x%x\n",
                    s_model_data.n_ddr_addr_model_end, kmdw_ddr_get_heap_tail());
            return 0;
        } else {
            dbg_msg("modelInfo: DDR end address: 0x%x\n", s_model_data.n_ddr_addr_model_end);
        }

        // set model source
        s_model_data.n_model_source = 1;  // from flash
    }

    // for support of dynamic model execution
    *(uint32_t *)(((char *)s_fw_info_buf_p) + 8) = 0;  // trick: this word is checked later to see whether model_info has been uploaded

    return s_model_data.n_model_count;
}
/**
 * @brief load a specific model by model info index (the order in flash)
 * @param model_index_p: model info index
 * @return 0: model not ready, 1: model is loaded
 */
static int32_t _load_model(uint8_t model_index_p /*starts from 0*/)
{
    uint32_t ddr_addr_models_head;  // start point = the 1st model's cmd.bin
    uint32_t ddr_addr_offset;
    uint32_t flash_addr;
    uint32_t len_to_load;
    struct kdp_model_s *p_model;

    if (s_model_data.n_model_count == 0)
        return 0;  // model info is not ready

    if (s_model_data.pn_is_model_loaded_table[model_index_p] == 1)
        return 1;  // model has been loaded
    else
        s_model_data.pn_is_model_loaded_table[model_index_p] = 1;

    // load the model with (index=model_index_p) from flash to DDR
    ddr_addr_models_head = s_model_data.p_model_info[0].cmd_mem_addr;  // start point = the 1st model's cmd.bin

    // load cmd + weight + setup together
    p_model = &(s_model_data.p_model_info[model_index_p]);
    ddr_addr_offset = p_model->cmd_mem_addr - ddr_addr_models_head;
    flash_addr = FLASH_MODEL_ALL_ADDR + ddr_addr_offset;
    len_to_load = ALIGN16(p_model->cmd_mem_len) +
                  ALIGN16(p_model->weight_mem_len) +
                  ALIGN16(p_model->setup_mem_len);

    // copy the model from flash to ddr
    kdp_memxfer_module.flash_to_ddr(p_model->cmd_mem_addr, flash_addr, len_to_load);

    return 1;
}

/**
 * @brief prepare output_mem_addr2 for ncpu/npu parallel-mode inference
 *
 * @return 0: OK, -1: fail
 */
static int32_t _prepare_output_mem_addr2(void)
{
    /* Allocate the parallel output buffer if the caller did not provide one */
    struct scpu_to_ncpu_s *comm_out = kmdw_ipc_get_output();
    uint32_t addr_parallel = comm_out->output_mem_addr2;

    if (addr_parallel == 0) {
        //TODO, dynamically allocate memory for output_mem_addr2
        //uint32_t addr1 = comm_out->models[model_idx].output_mem_addr;
        //uint32_t addr2 = comm_out->models[model_idx].buf_addr;
        //uint32_t len = comm_out->models[model_idx].output_mem_len;
        //if (addr1 == addr2) {
        //    // Old memory layout, use working buffer length
        //    len = comm_out->models[model_idx].buf_len;
        //}

        // reserve more space for larger model output (ty_608x608 needs 620160)
        uint32_t len = OUTPUT_MEM_ADDR2_SIZE;

        addr_parallel = kmdw_ddr_reserve(len);
        if (addr_parallel == 0) {
            err_msg("Error: ddr allocation failed for the ncpu/npu parallel buffer, len %d\n", len);
            return -1;  // error
        }

        comm_out->output_mem_addr2 = addr_parallel;
        comm_out->output_mem_len2 = len;
        dbg_msg("allocated parallel buffer: len %d, addr 0x%x", len, addr_parallel);
    }

    return 0;
}

#ifdef KL520
/**
 * @brief prepare output_mem_addr3 for MBSSD networks
 *
 * @param [in] model_type model id
 * @return 0: OK, -1: fail
 */
static int32_t _prepare_output_mem_addr3(uint32_t model_type)
{
    if (model_type == KNERON_FD_MBSSD_200_200_3 ||
        model_type == KNERON_FD_MASK_MBSSD_200_200_3 ||
        model_type == KNERON_OD_MBSSD ||
        model_type == KNERON_PD_MBSSD ||
        model_type == KNERON_CAR_DETECTION_MBSSD_224_416_3) {
        uint32_t *pMemAddr3;
        uint32_t len = OUTPUT_MEM_ADDR3_SIZE;
        struct scpu_to_ncpu_s *comm_out = kmdw_ipc_get_output();

        switch (model_type) {
        case KNERON_FD_MBSSD_200_200_3:
        case KNERON_FD_MASK_MBSSD_200_200_3: {
            static uint32_t mem_addr3_fdssd = 0;
            pMemAddr3 = &mem_addr3_fdssd;
            break;
        }
        case KNERON_OD_MBSSD: {
            static uint32_t mem_addr3_odssd = 0;
            pMemAddr3 = &mem_addr3_odssd;
            break;
        }
        case KNERON_PD_MBSSD: {
            static uint32_t mem_addr3_pdssd = 0;
            pMemAddr3 = &mem_addr3_pdssd;
            break;
        }
        case KNERON_CAR_DETECTION_MBSSD_224_416_3: {
            static uint32_t mem_addr3_vdssd = 0;
            pMemAddr3 = &mem_addr3_vdssd;
            break;
        }
        default:
            break;
        }

        if (*pMemAddr3 == 0) {
            *pMemAddr3 = kmdw_ddr_reserve(len * sizeof(uint32_t));
            if (*pMemAddr3 == 0) {
                err_msg("Error: ddr allocation failed for the MBSSD network, mem_addr3 len %d\n", len);
                return -1;  // error
            }
            *(uint32_t *)(*pMemAddr3) = 0;
        }

        comm_out->output_mem_addr3 = *pMemAddr3;
    }

    return 0;
}
#endif
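/*
 * Offset sketch (illustrative): all_models.bin in flash and its DDR image
 * share one layout, so _load_model() converts a model's DDR command address
 * into a flash address by rebasing it against the first model:
 *
 *   ddr_addr_offset = model[i].cmd_mem_addr - model[0].cmd_mem_addr
 *   flash_addr      = FLASH_MODEL_ALL_ADDR + ddr_addr_offset
 *
 * e.g. with model[0].cmd_mem_addr = 0x60000000, model[i].cmd_mem_addr =
 * 0x60120000 and FLASH_MODEL_ALL_ADDR = 0x00100000 (hypothetical numbers),
 * the copy would start at flash offset 0x00220000.
 */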
/**
 * @brief specify model information, load model info, load the model
 * @param [in] model_type: model unique ID defined by Kneron
 * @param [in] model_from_ddr: true if the model comes from ddr (host command)
 * @return model_slot_index (requested by NCPU/NPU)
 *         -1 : model not found
 */
static int32_t _config_model(uint32_t model_type, bool model_from_ddr)
{
    int model_info_idx;  // limitation (hard coded in flash)
    int model_idx;

    // check if model info is loaded
    if (0 == _load_model_info(model_from_ddr, false /*reload*/)) {
        return -1;
    }

    if (model_from_ddr == 0) {
        //FIXME, should remove application-related code
        /* Special model not in DDR but in ncpu */
        if (model_type == KNERON_2D_LIVENESS_224_224_3) {
            model_idx = 3;
            model_info_idx = 4;
            goto model_common;
        }

        model_info_idx = _get_model_info_array_index_by_model_type(model_type);
        if (model_info_idx == -1) {
            err_msg("[ERR] model_type[%d] is not found in flash\n", model_type);
            return -1;
        }

        _load_model(model_info_idx);

        // FIXME: need to remove the following hard code
        model_idx = model_info_idx;
    } else {
        model_info_idx = _get_model_info_array_index_by_model_type(model_type);
        if (model_info_idx == -1) {
            err_msg("[ERR] model_type[%d] is not found in DDR\n", model_type);
            return -1;
        }
        model_idx = model_info_idx;
    }

model_common:
    s_model_data.n_model_slot_index = model_idx;

    kmdw_ipc_set_model(s_model_data.p_model_info, model_info_idx, model_idx);

    struct kdp_img_raw_s *raw_img = kmdw_model_get_raw_img(s_img_data[s_current_ipc_idx].raw_img_idx);
    if (raw_img->inf_format & IMAGE_FORMAT_PARALLEL_PROC) {
        if (-1 == _prepare_output_mem_addr2()) {
            return -1;
        }
    }

#ifdef KL520
    if (-1 == _prepare_output_mem_addr3(model_type)) {
        return -1;
    }
#else
    struct scpu_to_ncpu_s *p_comm_out = kmdw_ipc_get_output();

    if (NULL == p_comm_out->output_mem_addr3) {
        uint32_t len = 0x5000;
        p_comm_out->output_mem_addr3 = kmdw_ddr_reserve(len * sizeof(uint32_t));
        if (NULL == p_comm_out->output_mem_addr3) {
            critical_msg("kmdw_model: failed to malloc comm_out->output_mem_addr3\n");
            return -1;
        }
    }

    if (NULL == p_comm_out->output_mem_addr4) {
        uint32_t len = 8 * (1 << 20);
        p_comm_out->output_mem_addr4 = kmdw_ddr_reserve(len);
        if (NULL == p_comm_out->output_mem_addr4) {
            critical_msg("kmdw_model: failed to malloc comm_out->output_mem_addr4\n");
            return -1;
        }
    }
#endif

    kmdw_ipc_set_model_active(model_idx);

    return model_idx;
}
/**
 * @brief run the model according to the config settings
 * @return status defined in NCPU
 * @note !!! must be called after kapp_config_model_image()
 */
static int32_t _run_model(void)
{
    int active_idx = s_current_ipc_idx;
    int raw_img_idx = s_img_data[active_idx].raw_img_idx;
    struct kdp_img_raw_s *p_raw_image = kmdw_model_get_raw_img(raw_img_idx);
    uint32_t flags, wait_evt;
    uint32_t is_abort = 0;

    // Start time for the ncpu/npu round trip
    p_raw_image->tick_start = osKernelGetTickCount();

    if (s_img_data[active_idx].evt_caller == NULL)
        s_img_data[active_idx].evt_caller = osEventFlagsNew(0);

    if (!s_img_data[active_idx].evt_caller)
        err_msg(" active_idx=%d, osEventFlagsNew evt_caller failure\n", active_idx);

    // set notification for job done
    if (s_img_data[active_idx].evt_result) {
        /* The result event is already set. Use a local event for parallel mode. */
        wait_evt = FLAG_KMDW_MODEL_FROM_NPU;
    } else {
        wait_evt = FLAG_KMDW_MODEL_FROM_NCPU;
    }

    dbg_msg(" wait %d[%d] evt %x\n", raw_img_idx, active_idx, wait_evt);

    // assign the caller event before triggering ncpu/npu
    s_img_data[active_idx].caller_e = wait_evt;

    // trigger ncpu/npu
    kmdw_ipc_trigger_int(CMD_RUN_NPU);

    // check the abort signal
    flags = osEventFlagsWait(s_img_data[active_idx].evt_caller, FLAG_KMDW_MODEL_ABORT, osFlagsWaitAll, 0);
    if (flags != osFlagsErrorResource) {
        osEventFlagsClear(s_img_data[active_idx].evt_caller, FLAG_KMDW_MODEL_ABORT);
        is_abort = 1;
    }

    uint32_t wait_timeout = (kmdw_ipc_get_output()->kp_dbg_checkpoinots == 0x0) ?
                            MODEL_INF_TIMEOUT : osWaitForever;

    // wait for the current task to finish
    flags = osEventFlagsWait(s_img_data[active_idx].evt_caller, wait_evt, osFlagsNoClear, wait_timeout);
    if (flags == osFlagsErrorTimeout) {
        err_msg("[%s] osEventFlagsWait flag 0x%08x timeout\n", __FUNCTION__, wait_evt);
        return IMAGE_STATE_TIMEOUT;
    } else if (flags != wait_evt)
        dbg_msg("[%s] 1+ events 0x%08x (%d[%d] expected)\n", __FUNCTION__, flags, wait_evt, active_idx);
    else
        dbg_msg("[DBG][%s] got: raw_img_idx[active_idx]=%d[%d]\n", __FUNCTION__, raw_img_idx, active_idx);

    osEventFlagsClear(s_img_data[active_idx].evt_caller, wait_evt);

    if (1 == is_abort) {
        dbg_msg("[DBG][%s] abort after n_model_slot_index = %d\n",
                __FUNCTION__, s_model_data.n_model_slot_index);
        return KMDW_MODEL_RUN_RC_ABORT;  // abort
    }

    return kmdw_ipc_get_input()->result.postproc.img_result.status;
}

__weak osStatus_t kmdw_fifoq_manager_result_enqueue(void *result_buf, int buf_size, bool preempt)
{
    return osOK;
}

static void _ipc_handler(struct kdp_img_raw_s *p_raw_image, int state)
{
    int ipc_idx;

    if (state == 0x999) {  // FIXME, very much a workaround
        kmdw_ipc_get_input()->kp_dbg_status = 0x0;

        osStatus_t sts = kmdw_fifoq_manager_result_enqueue(kmdw_ipc_get_output()->kp_dbg_buffer, 0, false);
        if (sts != osOK)
            kmdw_printf("send dbg data failed in ipc, err %d\n", sts);
    } else if (state == IMAGE_STATE_RECEIVING) {
        ipc_idx = p_raw_image->ref_idx;

        // End time for the ncpu/npu round trip
        p_raw_image->tick_end = osKernelGetSysTimerCount();

        if (s_img_data[ipc_idx].evt_result) {
            dbg_msg("[done: post: P] ipc_idx: %d, result_e: %d (ram %x)\n",
                    ipc_idx, s_img_data[ipc_idx].result_e, p_raw_image);
            osEventFlagsSet(s_img_data[ipc_idx].evt_result, s_img_data[ipc_idx].result_e);
        } else {
            dbg_msg("[done: post: S] ipc_idx: %d, caller_e: %x.\n",
                    ipc_idx, s_img_data[ipc_idx].caller_e);
            osEventFlagsSet(s_img_data[ipc_idx].evt_caller, s_img_data[ipc_idx].caller_e);
        }
    } else if (state == IMAGE_STATE_ACTIVE) {
        ipc_idx = s_current_ipc_idx;
        dbg_msg("[done: npu: P] ipc_idx: %d, caller_e: %x\n",
                ipc_idx, s_img_data[ipc_idx].caller_e);
        osEventFlagsSet(s_img_data[ipc_idx].evt_caller, s_img_data[ipc_idx].caller_e);
    } else {
        err_msg("[ERR] wrong state: %d\n", state);
    }
}

/* ############################
 * ##    Public Functions    ##
 * ############################ */

void kmdw_model_init(void)
{
    kmdw_ipc_initialize(_ipc_handler);
    _init_fw_info_buf();
    s_fw_info_buf_p->model_count = 0;
}
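/*
 * Handshake sketch (illustrative): _run_model() above pairs with
 * _ipc_handler(); the caller blocks on an event flag that the IPC callback
 * sets when the NCPU/NPU finishes. The same CMSIS-RTOS2 pattern in
 * isolation, with a hypothetical flag value (not part of the build):
 */
#if 0
#define EXAMPLE_DONE_FLAG (1u << 0)

static osEventFlagsId_t example_evt;

void waiter_thread(void *arg)
{
    example_evt = osEventFlagsNew(NULL);

    /* ... trigger the worker here ... */

    uint32_t flags = osEventFlagsWait(example_evt, EXAMPLE_DONE_FLAG,
                                      osFlagsNoClear, 2000 /* ticks/ms */);
    if (flags == osFlagsErrorTimeout) {
        /* the worker did not answer in time */
    } else {
        osEventFlagsClear(example_evt, EXAMPLE_DONE_FLAG);
    }
}

void worker_done_callback(void)
{
    osEventFlagsSet(example_evt, EXAMPLE_DONE_FLAG);  /* wakes the waiter */
}
#endif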
int32_t kmdw_model_load_model(int8_t model_info_index_p)
{
    int32_t ret = 0;

    if (1 != s_model_data.n_model_source ||  // check if s_model_data does not reflect flash
        0 == s_model_data.n_model_count) {
        if (0 == _load_model_info(false /*from ddr*/, true /*reload*/))
            return 0;  // error, no model is loaded
    }

    // load all models
    if (KMDW_MODEL_ALL_MODELS == model_info_index_p) {
        uint8_t i;

        for (i = 0; i < s_model_data.n_model_count; i++) {
            ret = _load_model(i);
            if (0 == ret) {
                err_msg("[ERR] %s : failed to load model array index:%d\n", __FUNCTION__, i);
                return 0;
            }
        }

        // Very slow if turned on. Maybe hardware support is needed.
        // Add a new compiler directive if the CRC32 method is also used in other scenarios (e.g. checking the FW image).
#if ENABLE_CRC32
        // check the CRC value of all_models.bin
        kmdw_model_fw_info_t *model_info_p = _load_flash_model_info();
        kmdw_model_fw_info_ext_t *model_info2_p = _get_fw_info_ext_by_fw_info(model_info_p);

        // cmd_mem_addr of the first model is the start address of all_models.bin
        uint8_t *addr = (uint8_t *)s_model_data.p_model_info[0].cmd_mem_addr;
        uint32_t crc32 = kmdw_utils_crc_gen_crc32(addr, model_info2_p->model_total_size);

        dbg_msg("[%s] crc32 calculated: 0x%x\n", __FUNCTION__, crc32);
        dbg_msg("[%s] crc32 read from flash: 0x%x\n", __FUNCTION__, model_info2_p->model_checksum);
        dbg_msg("[%s] model start address: 0x%x\n", __FUNCTION__, s_model_data.p_model_info[0].cmd_mem_addr);
        dbg_msg("[%s] model total size: %d\n", __FUNCTION__, model_info2_p->model_total_size);

        if (crc32 != model_info2_p->model_checksum) {
            err_msg("[ERR] %s: all_models.bin CRC check failed\n", __FUNCTION__);
            return 0;
        }
#endif

        return s_model_data.n_model_count;
    } else {
        // load a specific model
        ret = _load_model(model_info_index_p);
        return ret;
    }
}

int32_t kmdw_model_reload_model_info(bool from_ddr)
{
    return _load_model_info(from_ddr, true /*reload*/);
}

int32_t kmdw_model_refresh_models(void)  // reload all the models from flash again
{
    uint8_t i;

    // forcibly update s_model_data, which might be polluted by a model upload from the host
    if (0 == _load_model_info(false /*from ddr*/, true /*reload*/))
        return 0;  // error, no model is loaded

    int ret;

    for (i = 0; i < s_model_data.n_model_count; i++) {
        if (s_model_data.pn_is_model_loaded_table[i]) {  // if previously loaded
            s_model_data.pn_is_model_loaded_table[i] = 0;
            ret = _load_model(i);  // reload the model again
            if (0 == ret) {
                err_msg("[ERR] %s : failed to load model array index:%d\n", __FUNCTION__, i);
                return 0;
            }
        }
    }

    return s_model_data.n_model_count;
}

int32_t kmdw_model_config_result(osEventFlagsId_t result_evt, uint32_t result_evt_flag)
{
    int active_idx = s_current_ipc_idx;

    s_img_data[active_idx].evt_result = result_evt;
    s_img_data[active_idx].result_e = result_evt_flag;

    return 0;
}

void kmdw_model_config_img(struct kdp_img_cfg *img_cfg, void *ext_param)
{
    int act_img_idx = img_cfg->image_buf_active_index;
    struct kdp_img_raw_s *raw_img = kmdw_model_get_raw_img(act_img_idx);

    s_current_ipc_idx = s_next_ipc_idx;
    if (img_cfg->inf_format & IMAGE_FORMAT_PARALLEL_PROC)
        s_next_ipc_idx = !s_next_ipc_idx;

    kmdw_ipc_set_image_active(act_img_idx);

    s_img_data[s_current_ipc_idx].raw_img_idx = act_img_idx;

    raw_img->state = IMAGE_STATE_ACTIVE;
    raw_img->seq_num = act_img_idx;
    raw_img->ref_idx = s_current_ipc_idx;
    raw_img->num_image = img_cfg->num_image;
    raw_img->inf_format = img_cfg->inf_format;

    for (int i = 0; i < img_cfg->num_image; i++) {
        raw_img->image_list[i].input_row = img_cfg->image_list[i].input_row;
        raw_img->image_list[i].input_col = img_cfg->image_list[i].input_col;
        raw_img->image_list[i].input_channel = img_cfg->image_list[i].input_channel;
        raw_img->image_list[i].format = img_cfg->image_list[i].format;
        raw_img->image_list[i].image_mem_addr = img_cfg->image_list[i].image_mem_addr;
        raw_img->image_list[i].image_mem_len = img_cfg->image_list[i].image_mem_len;
        memcpy(&(raw_img->image_list[i].params_s), &(img_cfg->image_list[i].params_s), sizeof(parameter_t));
    }

    if (ext_param == NULL) {
        memset(raw_img->ext_params, 0, MAX_PARAMS_LEN * 4);
    } else {
        memcpy(raw_img->ext_params, ext_param, MAX_PARAMS_LEN * 4);
    }
}
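/*
 * Usage sketch (illustrative, not part of the build): a minimal single-image
 * inference flow with this API. Buffer addresses, image geometry, the model
 * type and the output pointer are hypothetical:
 */
#if 0
int run_one_inference(uint32_t image_addr, uint32_t image_len, void *output)
{
    struct kdp_img_cfg img_cfg = {0};

    img_cfg.num_image = 1;
    img_cfg.image_buf_active_index = 0;
    img_cfg.inf_format = 0;  /* sequential mode (no IMAGE_FORMAT_PARALLEL_PROC) */
    img_cfg.image_list[0].image_mem_addr = image_addr;
    img_cfg.image_list[0].image_mem_len = image_len;
    img_cfg.image_list[0].input_row = 480;
    img_cfg.image_list[0].input_col = 640;
    img_cfg.image_list[0].input_channel = 3;

    kmdw_model_config_img(&img_cfg, NULL /* no ext_params */);

    /* model type 19 is a placeholder; real IDs come from model_type.h */
    return kmdw_model_run("demo", output, 19, false /* model in flash */);
}
#endif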
struct kdp_img_raw_s *kmdw_model_get_raw_img(int idx)
{
    struct scpu_to_ncpu_s *comm_out = kmdw_ipc_get_output();

    return &(comm_out->raw_images[idx]);
}

int kmdw_model_run(const char *tag, void *output, uint32_t model_type, bool model_from_ddr)
{
    int model_idx = _config_model(model_type, model_from_ddr);
    if (model_idx < 0) {
        return KMDW_MODEL_RUN_RC_ABORT;
    }

    int img_idx = s_img_data[s_current_ipc_idx].raw_img_idx;
    struct kdp_img_raw_s *raw_img = kmdw_model_get_raw_img(img_idx);
    raw_img->results[model_idx].result_mem_addr = (uint32_t)output;

    dbg_msg("[INFO] %s:\n", tag);
    dbg_msg(" model_idx = %d\n", model_idx);
    dbg_msg(" model type = %d\n", model_type);
    dbg_msg(" ref_idx = %d\n", raw_img->ref_idx);
    dbg_msg(" inf_format = 0x%X\n", raw_img->inf_format);
    dbg_msg(" output addr = 0x%x\n", raw_img->results[model_idx].result_mem_addr);
    dbg_msg(" ext_params(first 4)= %d/%d/%d/%d\n",
            raw_img->ext_params[0], raw_img->ext_params[1],
            raw_img->ext_params[2], raw_img->ext_params[3]);

    for (int i = 0; i < raw_img->num_image; i++) {
        dbg_msg(" image index: %d\n", i);
        dbg_msg(" (row/col/ch) = %d/%d/%d\n",
                raw_img->image_list[i].input_row,
                raw_img->image_list[i].input_col,
                raw_img->image_list[i].input_channel);
        dbg_msg(" image format = 0x%x\n", raw_img->image_list[i].format);
        dbg_msg(" crop(tp/bt/lf/rt) = %d/%d/%d/%d\n",
                raw_img->image_list[i].params_s.crop_top,
                raw_img->image_list[i].params_s.crop_bottom,
                raw_img->image_list[i].params_s.crop_left,
                raw_img->image_list[i].params_s.crop_right);
        dbg_msg(" image addr = 0x%x\n", raw_img->image_list[i].image_mem_addr);
    }

    return _run_model();
}

void kmdw_model_abort(void)
{
    int active_idx = s_current_ipc_idx;

    if (0 == s_img_data[active_idx].evt_caller)
        return;

    osEventFlagsSet(s_img_data[active_idx].evt_caller, FLAG_KMDW_MODEL_ABORT);
}

struct kdp_model_s *kmdw_model_get_model_info(int model_idx_p)
{
    if (s_model_data.n_model_count == 0) {
        return NULL;
    } else if (model_idx_p >= s_model_data.n_model_count) {
        return NULL;
    } else {
        return &(s_model_data.p_model_info[model_idx_p]);
    }
}

void kmdw_model_get_run_time(int img_idx, kmdw_model_run_time_t *run_time /*out*/)
{
    struct kdp_img_raw_s *p_raw_image;

    if (run_time == NULL)
        return;

    p_raw_image = kmdw_model_get_raw_img(img_idx);
    run_time->round_trip_time = p_raw_image->tick_end - p_raw_image->tick_start;
    run_time->pre_proc_time = p_raw_image->tick_end_pre - p_raw_image->tick_start_pre;
    run_time->npu_proc_time = p_raw_image->tick_end_npu - p_raw_image->tick_start_npu;
    run_time->post_proc_time = p_raw_image->tick_end_post - p_raw_image->tick_start_post;
}

int kmdw_model_is_model_loaded(uint32_t model_type)
{
    if (_get_model_info_array_index_by_model_type(model_type) == -1)
        return 0;
    else
        return 1;
}
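/*
 * Abort sketch (illustrative): kmdw_model_abort() only raises
 * FLAG_KMDW_MODEL_ABORT on the caller event; _run_model() samples the flag
 * and returns KMDW_MODEL_RUN_RC_ABORT once the in-flight round trip
 * completes. A hypothetical watchdog thread could therefore cancel like so
 * (not part of the build):
 */
#if 0
void watchdog_thread(void *arg)
{
    /* ... decide the current inference should be cancelled ... */
    kmdw_model_abort();
    /* the thread blocked in kmdw_model_run() returns KMDW_MODEL_RUN_RC_ABORT */
}
#endif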
uint32_t *kmdw_model_get_all_model_info(bool trust_ddr_data)
{
    static uint32_t *s_p_model_id_list = NULL;  // [model_count, id0, id1, id2 ...]
    kmdw_model_fw_info_t *fw_info_ptr = NULL;

    fw_info_ptr = kmdw_model_get_fw_info(trust_ddr_data);

    if (fw_info_ptr) {
        if (NULL == s_p_model_id_list)
            s_p_model_id_list = (uint32_t *)calloc(1 + KMDW_MODEL_MAX_MODEL_COUNT, sizeof(uint32_t));

        if (NULL == s_p_model_id_list) {
            err_msg("[ERR] insufficient memory for the model id list\n");
        } else {
            int i;
            uint32_t model_id;

            s_p_model_id_list[0] = fw_info_ptr->model_count;
            dbg_msg("%s:\n", __FUNCTION__);
            dbg_msg("Model Count = %d\n", s_p_model_id_list[0]);

            for (i = 0; i < s_p_model_id_list[0]; i++) {
                model_id = fw_info_ptr->models[i].model_type;
                dbg_msg("Extract Model ID %d\n", model_id);
                s_p_model_id_list[i + 1] = model_id;
            }
        }

        return s_p_model_id_list;
    } else {
        return NULL;
    }
}

uint32_t kmdw_model_get_crc(bool trust_ddr_data)
{
    uint32_t ret = 0;
    kmdw_model_fw_info_t *fw_info_ptr;
    kmdw_model_fw_info_ext_t *fw_info_ext_ptr;

    fw_info_ptr = kmdw_model_get_fw_info(trust_ddr_data);
    fw_info_ext_ptr = _get_fw_info_ext_by_fw_info(fw_info_ptr);

    if (fw_info_ext_ptr) {
        ret = fw_info_ext_ptr->model_checksum;
    }

    dbg_msg("%s = 0x%x\n", __FUNCTION__, ret);
    return ret;
}

kmdw_model_fw_info_t *kmdw_model_get_fw_info(bool trust_ddr_data)
{
    uint32_t model_cnt;
    kmdw_model_fw_info_t *fw_info_ptr = s_fw_info_buf_p;

    if (false == trust_ddr_data) {
        if ((0 >= s_model_data.n_model_count) ||
            ((1 != s_model_data.n_model_source) && (2 != s_model_data.n_model_source))) {
            fw_info_ptr = NULL;
        } else {
            model_cnt = fw_info_ptr->model_count;
            if ((0 == model_cnt) || (model_cnt > KMDW_MODEL_MAX_MODEL_COUNT)) {
                fw_info_ptr = NULL;
            }
        }
    }

    return fw_info_ptr;
}

uint32_t kmdw_model_get_model_end_addr(bool trust_ddr_data)
{
    uint32_t ret = 0;
    kmdw_model_fw_info_t *fw_info_ptr;
    kmdw_model_fw_info_ext_t *fw_info_ext_ptr = NULL;

    if (0 != s_model_data.n_ddr_addr_model_end) {
        ret = s_model_data.n_ddr_addr_model_end;
        goto FUNC_OUT;
    }

    fw_info_ptr = kmdw_model_get_fw_info(trust_ddr_data);
    fw_info_ext_ptr = _get_fw_info_ext_by_fw_info(fw_info_ptr);

    if (fw_info_ext_ptr) {
        ret = fw_info_ext_ptr->model_dram_addr_end;
    }

FUNC_OUT:
    dbg_msg("%s = 0x%x\n", __FUNCTION__, ret);
    return ret;
}

void kmdw_model_set_location(bool model_in_ddr)
{
    ModelFromDDR = model_in_ddr;
}

bool kmdw_model_get_location(void)
{
    return ModelFromDDR;
}

int kmdw_model_get_input_tensor_num(uint32_t model_type)
{
    int model_idx = 0;

    model_idx = _get_model_info_array_index_by_model_type(model_type);

    if (model_idx >= 0) {
        /******************************************************************
         * KL520 only supports single-input models
         ******************************************************************/
        return 1;
    } else {
        err_msg("[%s] invalid model id %d\n", __FUNCTION__, model_type);
        return 0;
    }
}
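/*
 * Input-tensor sketch (illustrative, not part of the build): KL520 models
 * expose exactly one input tensor, so the query loop degenerates to a single
 * call (see kmdw_model_get_input_tensor_info() below). The model type is
 * hypothetical:
 */
#if 0
void dump_input_tensor(uint32_t model_type)
{
    kmdw_model_tensor_descriptor_t info;
    int num = kmdw_model_get_input_tensor_num(model_type);  /* 0 or 1 on KL520 */

    if (num == 1 && kmdw_model_get_input_tensor_info(model_type, 0, &info))
        dbg_msg("in: c/h/w = %d/%d/%d, radix %d\n",
                info.shape_npu[1], info.shape_npu[2], info.shape_npu[3],
                info.radix);
}
#endif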
int kmdw_model_get_input_tensor_info(uint32_t model_type, uint32_t tensor_idx,
                                     kmdw_model_tensor_descriptor_t *tensor_info)
{
    int ret = 1;
    int model_idx = 0;
    uint32_t p_setup_bin = 0;
    struct cnn_header_s *target_input_node = NULL;

    if (NULL == tensor_info) {
        err_msg("[%s] NULL tensor_info pointer\n", __FUNCTION__);
        ret = 0;
        goto FUNC_OUT;
    }

    model_idx = _get_model_info_array_index_by_model_type(model_type);

    if (model_idx >= 0) {
        struct kdp_model_s *p_model_info = kmdw_model_get_model_info(model_idx);

        if (NULL != p_model_info) {
            p_setup_bin = p_model_info->setup_mem_addr;
        } else {
            err_msg("[%s] NULL model info pointer\n", __FUNCTION__);
            return 0;
        }
    } else {
        err_msg("[%s] invalid model id %d\n", __FUNCTION__, model_type);
        ret = 0;
        goto FUNC_OUT;
    }

    if (tensor_idx >= 1) {
        err_msg("[%s] tensor index out of range %d\n", __FUNCTION__, tensor_idx);
        ret = 0;
        goto FUNC_OUT;
    }

    target_input_node = (struct cnn_header_s *)p_setup_bin;

    tensor_info->index = 1;
    tensor_info->shape_npu_len = 4;
    tensor_info->shape_npu[0] = 1;
    tensor_info->shape_npu[1] = target_input_node->input_channel;
    tensor_info->shape_npu[2] = target_input_node->input_row;
    tensor_info->shape_npu[3] = target_input_node->input_col;
    tensor_info->data_layout = DATA_FMT_4W4C8B;
    tensor_info->scale = 1.0;
    tensor_info->radix = target_input_node->input_radix;

FUNC_OUT:
    return ret;
}

int kmdw_model_get_output_tensor_num(uint32_t model_type)
{
    int model_idx = 0;
    struct kdp_model_s *p_model_info = NULL;

    model_idx = _get_model_info_array_index_by_model_type(model_type);

    if (model_idx >= 0) {
        p_model_info = kmdw_model_get_model_info(model_idx);
    } else {
        err_msg("[%s] invalid model id %d\n", __FUNCTION__, model_type);
        return 0;
    }

    /******************************************************************
     * legacy setup.bin model
     ******************************************************************/
    if (NULL != p_model_info) {
        return ((struct cnn_header_s *)p_model_info->setup_mem_addr)->output_nums;
    } else {
        err_msg("[%s] NULL model info pointer\n", __FUNCTION__);
        return 0;
    }
}
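/*
 * Enumeration sketch (illustrative, not part of the build): output
 * descriptors are fetched one at a time; the count comes from
 * kmdw_model_get_output_tensor_num() and each index is resolved by walking
 * the setup.bin node stream (see kmdw_model_get_output_tensor_info() below).
 * The model type is hypothetical:
 */
#if 0
void dump_output_tensors(uint32_t model_type)
{
    kmdw_model_tensor_descriptor_t info;
    int num = kmdw_model_get_output_tensor_num(model_type);

    for (int i = 0; i < num; i++) {
        if (kmdw_model_get_output_tensor_info(model_type, i, &info))
            dbg_msg("out[%d]: c/h/w = %d/%d/%d, radix %d\n", i,
                    info.shape_npu[1], info.shape_npu[2], info.shape_npu[3],
                    info.radix);
    }
}
#endif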
int kmdw_model_get_output_tensor_info(uint32_t model_type, uint32_t tensor_idx,
                                      kmdw_model_tensor_descriptor_t *tensor_info)
{
    int ret = 1;
    int model_idx = 0;
    uint32_t p_setup_bin = 0;
    uint32_t node_num = 0;
    uint32_t setup_buff_offset = sizeof(struct cnn_header_s);
    uint32_t setup_buff_size = 0;
    struct out_node_s *target_output_node = NULL;
    struct out_node_s *output_node = NULL;

    if (NULL == tensor_info) {
        err_msg("[%s] NULL tensor_info pointer\n", __FUNCTION__);
        ret = 0;
        goto FUNC_OUT;
    }

    model_idx = _get_model_info_array_index_by_model_type(model_type);

    if (model_idx >= 0) {
        struct kdp_model_s *p_model_info = kmdw_model_get_model_info(model_idx);

        if (NULL != p_model_info) {
            p_setup_bin = p_model_info->setup_mem_addr;
            setup_buff_size = p_model_info->setup_mem_len;
        } else {
            err_msg("[%s] NULL model info pointer\n", __FUNCTION__);
            return 0;
        }
    } else {
        err_msg("[%s] invalid model id %d\n", __FUNCTION__, model_type);
        ret = 0;
        goto FUNC_OUT;
    }

    node_num = ((struct cnn_header_s *)p_setup_bin)->output_nums;
    if (tensor_idx >= node_num) {
        err_msg("[%s] tensor index out of range %d\n", __FUNCTION__, tensor_idx);
        ret = 0;
        goto FUNC_OUT;
    }

    while ((setup_buff_offset < setup_buff_size) && (NULL == target_output_node)) {
        uintptr_t node_buff = (uintptr_t)p_setup_bin + setup_buff_offset;
        uint32_t node_id = *(uint32_t *)node_buff;
        uint32_t node_offset = 0;

        switch (node_id) {
        case NODE_TYPE_IN:  // NPU IN signal node
            dbg_msg("current node is an NPU IN signal node\n");
            node_offset = sizeof(struct in_node_s);
            break;
        case NODE_TYPE_CPU:  // CPU node
            dbg_msg("current node is a CPU node\n");
            node_offset = sizeof(struct cpu_node_s) - (2 * sizeof(struct data_node_s));
            break;
        case NODE_TYPE_OUTPUT:  // output node
            dbg_msg("current node is an output node\n");
            output_node = (struct out_node_s *)node_buff;
            node_offset = sizeof(struct out_node_s) - (sizeof(struct super_node_s));
            if (output_node->output_index == tensor_idx)
                target_output_node = output_node;
            break;
        case NODE_TYPE_DATA:  // NPU data node
            dbg_msg("current node is a network data node\n");
            node_offset = sizeof(struct data_node_s) - sizeof(struct super_node_s);
            break;
        case NODE_TYPE_SUPER:  // NPU super node
            dbg_msg("current node is a network super node\n");
            node_offset = sizeof(struct super_node_s);
            break;
        default:  // unknown node
            err_msg("[%s] unknown node type: %d\n", __FUNCTION__, node_id);
            ret = 0;
            goto FUNC_OUT;
        }

        setup_buff_offset += node_offset;
    }

    if (NULL == target_output_node) {
        err_msg("[%s] can not find target index node %d\n", __FUNCTION__, tensor_idx);
        ret = 0;
        goto FUNC_OUT;
    }

    tensor_info->index = target_output_node->output_index;
    tensor_info->shape_npu_len = 4;
    tensor_info->shape_npu[0] = 1;
    tensor_info->shape_npu[1] = target_output_node->ch_length;
    tensor_info->shape_npu[2] = target_output_node->row_length;
    tensor_info->shape_npu[3] = target_output_node->col_length;
    tensor_info->data_layout = target_output_node->data_format;
    tensor_info->scale = *(float *)&(target_output_node->output_scale);
    tensor_info->radix = target_output_node->output_radix;

FUNC_OUT:
    return ret;
}

#ifdef EMBED_CMP_NPU
int8_t kmdw_model_add_update_model(uint32_t model_type, int cmd_len, int wt_len,
                                   int input_len, int output_len, int setup_len,
                                   uint32_t cmd_mem_addr, uint32_t wt_mem_addr,
                                   uint32_t input_mem_addr, uint32_t output_mem_addr,
                                   uint32_t setup_mem_addr)
{
    int model_info_idx = _get_model_info_array_index_by_model_type(model_type);

    if (model_info_idx < 0) {
        int model_count = s_model_data.n_model_count + 1;

        s_model_data.n_model_count = model_count;
        model_info_idx = model_count - 1;
        s_model_data.p_model_info[model_info_idx].model_type = model_type;
        s_model_data.p_model_info[model_info_idx].cmd_mem_addr = cmd_mem_addr;
        s_model_data.p_model_info[model_info_idx].cmd_mem_len = cmd_len;
        s_model_data.p_model_info[model_info_idx].weight_mem_len = wt_len;
        s_model_data.p_model_info[model_info_idx].input_mem_addr = input_mem_addr;
        s_model_data.p_model_info[model_info_idx].input_mem_len = input_len;
        s_model_data.p_model_info[model_info_idx].output_mem_len = output_len;
        s_model_data.p_model_info[model_info_idx].buf_len = output_len;
        s_model_data.p_model_info[model_info_idx].setup_mem_addr = setup_mem_addr;
        s_model_data.p_model_info[model_info_idx].setup_mem_len = setup_len;
        s_model_data.pn_is_model_loaded_table[model_info_idx] = 1;
    }

    s_model_data.p_model_info[model_info_idx].weight_mem_addr = wt_mem_addr;
    s_model_data.p_model_info[model_info_idx].output_mem_addr = output_mem_addr;
    s_model_data.p_model_info[model_info_idx].buf_addr = output_mem_addr;

    dbg_msg("[%s] model cmd addr: 0x%x\n", __func__, s_model_data.p_model_info[model_info_idx].cmd_mem_addr);
    dbg_msg("[%s] model wt addr: 0x%x\n", __func__, s_model_data.p_model_info[model_info_idx].weight_mem_addr);
    dbg_msg("[%s] model input addr: 0x%x\n", __func__, s_model_data.p_model_info[model_info_idx].input_mem_addr);
    dbg_msg("[%s] model output addr: 0x%x\n", __func__, s_model_data.p_model_info[model_info_idx].output_mem_addr);
    dbg_msg("[%s] model buf addr: 0x%x\n", __func__, s_model_data.p_model_info[model_info_idx].buf_addr);
    dbg_msg("[%s] model setup addr: 0x%x\n", __func__, s_model_data.p_model_info[model_info_idx].setup_mem_addr);

    udt_npu_model_mem(wt_mem_addr, output_mem_addr, (void *)cmd_mem_addr);

    return 0;
}
#endif // EMBED_CMP_NPU
#if DEBUG
void kmdw_model_dump_model_info(void)
{
    struct kdp_model_s *p_modelInfo = 0;
    uint8_t i;

    dbg_msg("Model info Count = %d\n", s_model_data.n_model_count);

    for (i = 0; i < s_model_data.n_model_count; i++) {
        p_modelInfo = &(s_model_data.p_model_info[i]);
        dbg_msg("Model(%2d) model_type(%3d)/version(%5d):\n",
                (i + 1), p_modelInfo->model_type, p_modelInfo->model_version);
        dbg_msg("input[%x](sz:%d) -> cmd[%x](sz:%d),weight[%x](sz:%d),setup[%x](sz:%d),buf[%x](sz:%d) -> out[%x](sz:%d)\n",
                p_modelInfo->input_mem_addr, p_modelInfo->input_mem_len,
                p_modelInfo->cmd_mem_addr, p_modelInfo->cmd_mem_len,
                p_modelInfo->weight_mem_addr, p_modelInfo->weight_mem_len,
                p_modelInfo->setup_mem_addr, p_modelInfo->setup_mem_len,
                p_modelInfo->buf_addr, p_modelInfo->buf_len,
                p_modelInfo->output_mem_addr, p_modelInfo->output_mem_len);
    }

    return;
}
#endif // DEBUG