1418 lines
46 KiB
C
1418 lines
46 KiB
C
/*
|
|
* Kneron Model API Manager
|
|
*
|
|
* Copyright (C) 2019 Kneron, Inc. All rights reserved.
|
|
*
|
|
*/
|
|
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include "project.h"
|
|
|
|
#include "base.h"
|
|
#include "kdrv_ipc.h" /*for NCPU triggering */
|
|
#include "kdrv_clock.h" /* for kdrv_delay_us() */
|
|
#include "kdev_flash.h"
|
|
|
|
#include "kmdw_ipc.h"
|
|
#include "kmdw_model.h"
|
|
#include "kmdw_console.h" /*for dbg_msg */
|
|
#include "kmdw_memxfer.h" /*for flash access */
|
|
#include "kmdw_memory.h"
|
|
#include "kmdw_utils_crc.h"
|
|
|
|
#define DEBUG 0
|
|
#define OUTPUT_MEM_ADDR2_SIZE 0x100000 /* 1MB, for DME parallel buffer */
|
|
#define OUTPUT_MEM_ADDR3_SIZE 0x5000 /* for MBSSD anchor data */
|
|
|
|
#define FLAG_KMDW_MODEL_ABORT BIT(29) // Event flag to notify abort
|
|
#define FLAG_KMDW_MODEL_FROM_NCPU BIT(30) // Event flag to know NCPU is done
|
|
#define FLAG_KMDW_MODEL_FROM_NPU BIT(28) // Event flag to know NPU is done
|
|
|
|
#define MODEL_INF_TIMEOUT (2000) // timeout milli-secs for waiting npcu response
|
|
|
|
#define KDP_FLASH_FW_INFO_SIZE 0x1000
|
|
|
|
#ifdef EMBED_CMP_NPU
|
|
/* the following is for specific dense model wt/cmd mem modification */
|
|
/*================================================*/
|
|
#define WT_DATA_SIZE_BYTE 272
|
|
|
|
#define CONF_QUEUE 0
|
|
#define GETW_QUEUE 3
|
|
|
|
#define CONF_GETW0_CMD_OFFSET 0x0038
|
|
#define CONF_WDMA0_DST0_CMD_OFFSET 0x00f0
|
|
|
|
#define ACL_NPU_GETW0 0x2e
|
|
#define ACH_NPU_GETW0 0x2f
|
|
#define ACL_NPU_WDMA0_DST0 0x36
|
|
#define ACH_NPU_WDMA0_DST0 0x37
|
|
|
|
#define MASK_2 0x0003
|
|
#define MASK_10 0x03FF
|
|
#define MASK_16 0x00FFFF
|
|
|
|
#define VAL_ACL(x) (((x)&0xffff))
|
|
#define VAL_ACH(x) (((x) >> 16) & 0xffff)
|
|
|
|
#define SetBitsVal(tgt, val, mask, offset) \
|
|
((tgt) &= ~((mask) << (offset))); \
|
|
((tgt) |= (((val) & (mask)) << (offset)))
|
|
|
|
/*================================================*/
|
|
#endif // EMBED_CMP_NPU
|
|
|
|
|
|
extern const struct s_kdp_memxfer kdp_memxfer_module;
|
|
|
|
/* Type of Operations */
|
|
enum {
|
|
NODE_TYPE_IN,
|
|
NODE_TYPE_CPU,
|
|
NODE_TYPE_OUTPUT,
|
|
NODE_TYPE_DATA,
|
|
NODE_TYPE_SUPER,
|
|
NODE_TYPE_INPUT
|
|
};
|
|
|
|
/* Structures of Data Nodes */
|
|
struct super_node_s {
|
|
uint32_t node_id;
|
|
uint32_t addr;
|
|
uint32_t row_start;
|
|
uint32_t col_start;
|
|
uint32_t ch_start;
|
|
uint32_t row_length;
|
|
uint32_t col_length;
|
|
uint32_t ch_length;
|
|
};
|
|
|
|
struct data_node_s {
|
|
uint32_t node_id;
|
|
uint32_t supernum;
|
|
uint32_t data_format;
|
|
uint32_t data_radix;
|
|
uint32_t data_scale;
|
|
uint32_t row_start;
|
|
uint32_t col_start;
|
|
uint32_t ch_start;
|
|
uint32_t row_length;
|
|
uint32_t col_length;
|
|
uint32_t ch_length;
|
|
struct super_node_s node_list[1];
|
|
};
|
|
|
|
/* Structure of Input Operation */
|
|
struct in_node_s {
|
|
uint32_t node_id;
|
|
uint32_t next_npu;
|
|
};
|
|
|
|
/* Structure of Output Operation */
|
|
struct out_node_s {
|
|
uint32_t node_id;
|
|
uint32_t supernum;
|
|
uint32_t data_format;
|
|
uint32_t row_start;
|
|
uint32_t col_start;
|
|
uint32_t ch_start;
|
|
uint32_t row_length;
|
|
uint32_t col_length;
|
|
uint32_t ch_length;
|
|
uint32_t output_index;
|
|
uint32_t output_radix;
|
|
uint32_t output_scale;
|
|
struct super_node_s node_list[1];
|
|
};
|
|
|
|
/* Structure of CPU Operation */
|
|
struct cpu_node_s {
|
|
uint32_t node_id;
|
|
uint32_t input_datanode_num;
|
|
uint32_t op_type;
|
|
/* There will be more parameter here for cpu operation */
|
|
uint32_t in_num_row;
|
|
uint32_t in_num_col;
|
|
uint32_t in_num_ch;
|
|
uint32_t out_num_row;
|
|
uint32_t out_num_col;
|
|
uint32_t out_num_ch;
|
|
uint32_t h_pad;
|
|
uint32_t w_pad;
|
|
uint32_t kernel_h;
|
|
uint32_t kernel_w;
|
|
uint32_t stride_h;
|
|
uint32_t stride_w;
|
|
struct data_node_s output_datanode;
|
|
struct data_node_s input_datanode[1];
|
|
};
|
|
|
|
/* Structure of CNN Header in setup.bin */
|
|
struct cnn_header_s {
|
|
uint32_t crc;
|
|
uint32_t version;
|
|
uint32_t key_offset;
|
|
uint32_t model_type;
|
|
uint32_t app_type;
|
|
uint32_t dram_start;
|
|
uint32_t dram_size;
|
|
uint32_t input_row;
|
|
uint32_t input_col;
|
|
uint32_t input_channel;
|
|
uint32_t cmd_start;
|
|
uint32_t cmd_size;
|
|
uint32_t weight_start;
|
|
uint32_t weight_size;
|
|
uint32_t input_start;
|
|
uint32_t input_size;
|
|
uint32_t input_radix;
|
|
uint32_t output_nums;
|
|
};
|
|
|
|
typedef struct {
|
|
uint32_t n_model_source; // 0: not set, 1: from flash, 2: from ddr
|
|
uint32_t n_model_count; // model count
|
|
struct kdp_model_s p_model_info[KMDW_MODEL_MAX_MODEL_COUNT]; // save model info generated by compiler
|
|
uint8_t pn_is_model_loaded_table[KMDW_MODEL_MAX_MODEL_COUNT]; // flag table to indicate if model is loaded
|
|
uint32_t n_ddr_addr_model_end; // DDR address of model end = user data start
|
|
|
|
int32_t n_model_slot_index; // scpu_to_ncpu->model_slot_index
|
|
} kmdw_model_data_t;
|
|
|
|
static kmdw_model_data_t s_model_data = {0};
|
|
|
|
typedef struct {
|
|
int32_t raw_img_idx;
|
|
osEventFlagsId_t evt_caller; // event to know/control ncpu
|
|
uint32_t caller_e;
|
|
osEventFlagsId_t evt_result; // event to know/control npu
|
|
uint32_t result_e;
|
|
} kmdw_img_data_t;
|
|
|
|
// ptr to the buf for uploaded fw info from host
|
|
static kmdw_model_fw_info_t *s_fw_info_buf_p = NULL;
|
|
// ptr to the buf for uploaded fw info from flash
|
|
static bool s_model_loaded_from_flash = false;
|
|
|
|
static kmdw_img_data_t s_img_data[IPC_IMAGE_ACTIVE_MAX] = {0};
|
|
static int32_t s_current_ipc_idx = 0;
|
|
static int32_t s_next_ipc_idx = 0;
|
|
|
|
static bool ModelFromDDR = false; // check model is from flash : false, ddr : true
|
|
|
|
/* ############################
|
|
* ## Static Functions ##
|
|
* ############################ */
|
|
/**
|
|
* @brief init ddr space for s_fw_info_buf_p
|
|
*
|
|
*/
|
|
static void _init_fw_info_buf(void)
|
|
{
|
|
if (NULL == s_fw_info_buf_p) {
|
|
s_fw_info_buf_p = (kmdw_model_fw_info_t*)kmdw_ddr_reserve(KDP_FLASH_FW_INFO_SIZE);
|
|
|
|
if (NULL == s_fw_info_buf_p)
|
|
critical_msg("insufficent memory for reading fw_info from flash\n");
|
|
}
|
|
}
|
|
|
|
|
|
#ifdef EMBED_CMP_NPU
|
|
static inline void udt_conf_cmd(void *cmd_addr, int reg_idx, uint16_t val16b, int queue)
|
|
{
|
|
uint32_t val = 0x80000000;
|
|
|
|
SetBitsVal(val, queue, MASK_2, 26);
|
|
SetBitsVal(val, reg_idx, MASK_10, 16);
|
|
SetBitsVal(val, val16b, MASK_16, 0);
|
|
|
|
memcpy(cmd_addr, &val, sizeof(val));
|
|
}
|
|
|
|
static void udt_npu_model_mem(uint32_t wt_addr, uint32_t out_addr, void *cmd_addr)
|
|
{
|
|
uint32_t *dst = (uint32_t *)((char *)cmd_addr + CONF_GETW0_CMD_OFFSET);
|
|
udt_conf_cmd(dst, ACL_NPU_GETW0, VAL_ACL(wt_addr), GETW_QUEUE);
|
|
udt_conf_cmd(dst + 1, ACH_NPU_GETW0, VAL_ACH(wt_addr), GETW_QUEUE);
|
|
|
|
dst = (uint32_t *)((char *)cmd_addr + CONF_WDMA0_DST0_CMD_OFFSET);
|
|
udt_conf_cmd(dst, ACL_NPU_WDMA0_DST0, VAL_ACL(out_addr), CONF_QUEUE);
|
|
udt_conf_cmd(dst + 1, ACH_NPU_WDMA0_DST0, VAL_ACH(out_addr), CONF_QUEUE);
|
|
}
|
|
#endif // EMBED_CMP_NPU
|
|
|
|
/**
|
|
* @brief load fw info from flash
|
|
* @return 0: OK, -1: fail
|
|
* @note NULL means failed; non-zero ptr means OK
|
|
*/
|
|
static kmdw_model_fw_info_t* _load_flash_model_info(void)
|
|
{
|
|
//load model from flash once and reuse loaded data, until reload
|
|
if (false == s_model_loaded_from_flash ) {
|
|
s_model_loaded_from_flash = true;
|
|
kdp_memxfer_module.flash_to_ddr((uint32_t)s_fw_info_buf_p, FLASH_MODEL_FW_INFO_ADDR, KDP_FLASH_FW_INFO_SIZE);
|
|
}
|
|
|
|
return s_fw_info_buf_p;
|
|
}
|
|
|
|
|
|
/**
|
|
* @brief reset s_model_data
|
|
*/
|
|
static void _reset_model_data(void)
|
|
{
|
|
s_model_data.n_model_count = 0;
|
|
s_model_data.n_model_source = 0;
|
|
memset( s_model_data.p_model_info, 0, sizeof(s_model_data.p_model_info));
|
|
memset( s_model_data.pn_is_model_loaded_table, 0 , sizeof(s_model_data.pn_is_model_loaded_table));
|
|
|
|
//can't reset the following variable which maintains DDR boundary for model
|
|
//n_last_model_space_end_addr
|
|
return;
|
|
}
|
|
|
|
/**
|
|
* @brief check flash read with timeout_ms
|
|
* @param timeout_ms timeout in ms
|
|
* @return flash ready ready time in ms
|
|
* -1 means timeout hit
|
|
*/
|
|
//static int32_t _flash_wait_ready(int timeout_ms)
|
|
//{
|
|
// kdev_flash_status_t flash_status;
|
|
// int i;
|
|
|
|
// for (i = 0; i < timeout_ms; i++) {
|
|
// flash_status = kdev_flash_get_status();
|
|
// if (flash_status.busy == 0) break;
|
|
// kdrv_delay_us(1*1000);
|
|
// }
|
|
// if (i == timeout_ms) i = -1; // we have timed out
|
|
// return i;
|
|
//}
|
|
|
|
/**
|
|
* @brief convert modeltype to modelInfo array index
|
|
* @param model_type_p: model type (defined in model_type.h)
|
|
* @return modelInfo model index (starts from 0)
|
|
* -1 means not such modeltype in flash
|
|
*/
|
|
static int8_t _get_model_info_array_index_by_model_type(uint32_t model_type_p)
|
|
{
|
|
int i;
|
|
for(i=0 ; i < s_model_data.n_model_count; i++) {
|
|
if(s_model_data.p_model_info[i].model_type == model_type_p)
|
|
return i;
|
|
}
|
|
|
|
return -1;
|
|
}
|
|
|
|
/**
|
|
* @brief get fw info extension data from fw_info ptr
|
|
* @param[in] fw_info_p the ptr to fw_info
|
|
* @return the ptr to fw_info_ext
|
|
*/
|
|
static kmdw_model_fw_info_ext_t*
|
|
_get_fw_info_ext_by_fw_info(kmdw_model_fw_info_t* fw_info_p)
|
|
{
|
|
if(NULL == fw_info_p)
|
|
return NULL;
|
|
else {
|
|
kmdw_model_fw_info_ext_t* ret = NULL;
|
|
uint32_t count;
|
|
uint32_t offset;
|
|
|
|
count = fw_info_p->model_count;
|
|
offset = sizeof(struct kdp_model_s) * count;
|
|
ret = (kmdw_model_fw_info_ext_t *)((uint32_t)fw_info_p->models + offset);
|
|
return ret;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* @brief load model information generated by compiler
|
|
* @param [in] is_model_from_ddr: if model is from ddr/host command
|
|
* @param [in] is_reload : is force reload
|
|
* @return model count
|
|
* 0 means no model is loaded in this call
|
|
*/
|
|
/**
 * @brief Load the model information table generated by the compiler.
 *
 * Populates s_model_data (count, per-model info, DDR end address, source)
 * either from the host-uploaded DDR buffer or from flash. Cached: if a
 * table is already loaded and reload is false, returns immediately.
 *
 * @param [in] from_ddr true: read fw_info already uploaded to DDR by host;
 *                      false: read fw_info from flash
 * @param [in] reload   true: discard the cached table and force a re-read
 * @return model count; 0 means no model is loaded in this call (error or
 *         empty source)
 */
static int32_t _load_model_info(bool from_ddr, bool reload)
{
    /* fast path: table already populated and caller did not force reload */
    if (s_model_data.n_model_count && !reload) {
        return s_model_data.n_model_count;
    }

    if (reload) {
        _reset_model_data();
        s_model_loaded_from_flash = false; /* invalidate flash cache too */
    }

    kmdw_model_fw_info_t *model_info_p = NULL;
    kmdw_model_fw_info_ext_t *model_info2_p = NULL;

    // load model Info
    if (from_ddr) {
        /* host uploaded fw_info directly into s_fw_info_buf_p */
        model_info_p = s_fw_info_buf_p;
        model_info2_p = _get_fw_info_ext_by_fw_info(model_info_p);

        if((NULL == model_info_p) || (NULL == model_info2_p) ) {
            s_model_data.n_model_count = 0;
            return 0;
        }

        // Use the version number for new fw_info structure. Model number is in use for dynamic model execution (DME)
        //if (*(uint32_t*)(base_addr + 8) == 0) {
        //    return 0; //error, model_info is not ready
        //}

        // get model count
        s_model_data.n_model_count = model_info_p->model_count;
        dbg_msg("[DBG] model info: model count:%d\n", s_model_data.n_model_count);

        if(0 == s_model_data.n_model_count) {
            info_msg("[info] model is not in DDR!!\n");
            return 0;
        } else if (s_model_data.n_model_count > KMDW_MODEL_MAX_MODEL_COUNT) {
            /* reject: table arrays are sized KMDW_MODEL_MAX_MODEL_COUNT */
            info_msg("[ERR] model count is over MAX limit=%d!!\n", KMDW_MODEL_MAX_MODEL_COUNT);
            s_model_data.n_model_count = 0;
            return 0;
        } else {
            dbg_msg("[DBG] model info: model count:%d\n", s_model_data.n_model_count);
        }

        // copy per-model descriptors into the local cache
        memcpy(s_model_data.p_model_info, (const void*)model_info_p->models,
               sizeof(struct kdp_model_s)*s_model_data.n_model_count);

        // get ddr model end addr (= start of user data) and sanity-check it
        s_model_data.n_ddr_addr_model_end = model_info2_p->model_dram_addr_end;
        if (s_model_data.n_ddr_addr_model_end >= kmdw_ddr_get_heap_tail()) {
            err_msg("modelInfo: DDR end address: 0x%x over (>=) boundary 0x%x\n", s_model_data.n_ddr_addr_model_end, kmdw_ddr_get_heap_tail());
            return 0;
        } else {
            dbg_msg("modelInfo: DDR end address: 0x%x\n", s_model_data.n_ddr_addr_model_end);
        }

        // set model source
        s_model_data.n_model_source = 2; // from ddr

    } else { // models are stored in flash

        model_info_p = _load_flash_model_info(); // this function updates data on s_fw_info_buf_p
        model_info2_p = _get_fw_info_ext_by_fw_info(model_info_p);

        if((NULL == model_info_p) || (NULL == model_info2_p) ) {
            s_model_data.n_model_count = 0;
            return 0;
        }

        // get model count
        s_model_data.n_model_count = model_info_p->model_count;
        dbg_msg("[DBG] model info: model count:%d\n", s_model_data.n_model_count);

        /* 0xFFFFFFFF = erased flash, i.e. fw_info was never written */
        if (s_model_data.n_model_count == 0xFFFFFFFF) {
            err_msg("[info] model is not in flash!!\n");
            s_model_data.n_model_count = 0;
            return 0;
        } else if (s_model_data.n_model_count > KMDW_MODEL_MAX_MODEL_COUNT) {
            info_msg("[ERR] model count is over MAX limit=%d!!\n", KMDW_MODEL_MAX_MODEL_COUNT);
            s_model_data.n_model_count = 0;
            return 0;
        } else {
            dbg_msg("[DBG] model info: model count:%d\n", s_model_data.n_model_count);
        }

        // get model info
        //FIXME, why need to clone to s_fw_info_buf_p
        //memcpy(s_fw_info_buf_p, (void *)model_info_p, KDP_FLASH_FW_INFO_SIZE);

        memcpy(s_model_data.p_model_info, model_info_p->models, sizeof(struct kdp_model_s)*s_model_data.n_model_count);

        // get ddr model end addr and sanity-check it against the heap tail
        s_model_data.n_ddr_addr_model_end = model_info2_p->model_dram_addr_end;

        if (s_model_data.n_ddr_addr_model_end >= kmdw_ddr_get_heap_tail()) {
            err_msg("modelInfo: DDR end address: 0x%x over (>=) boundary 0x%x\n", s_model_data.n_ddr_addr_model_end, kmdw_ddr_get_heap_tail());
            return 0;
        } else {
            dbg_msg("modelInfo: DDR end address: 0x%x\n", s_model_data.n_ddr_addr_model_end);
        }

        // set model source
        s_model_data.n_model_source = 1; // from flash
    }

    // for support of dynamic model execution:
    // clear the word at offset 8 of the fw_info buffer
    *(uint32_t*)(((char*)s_fw_info_buf_p) + 8) = 0; //trick: we will check the work to see if model_info is uploaded

    return s_model_data.n_model_count;
}
|
|
|
|
/**
|
|
* @brief load specific model by model info index (the order in flash)
|
|
* @param model_index_p: model info index
|
|
* @return 0: model not ready, 1: model is loaded
|
|
*/
|
|
static int32_t _load_model(uint8_t model_index_p/*starts from 0*/)
|
|
{
|
|
uint32_t ddr_addr_models_head; //start point = the 1st model's cmd.bin
|
|
uint32_t ddr_addr_offset;
|
|
uint32_t flash_addr;
|
|
uint32_t len_to_load;
|
|
|
|
struct kdp_model_s *p_model;
|
|
|
|
if(s_model_data.n_model_count == 0)
|
|
return 0; // model info is not ready
|
|
|
|
if(s_model_data.pn_is_model_loaded_table[model_index_p] == 1 )
|
|
return 1; //model has been loaded
|
|
else
|
|
s_model_data.pn_is_model_loaded_table[model_index_p] = 1;
|
|
|
|
//load model with (index=model_index_p) from flash to DDR
|
|
ddr_addr_models_head = s_model_data.p_model_info[0].cmd_mem_addr; //start point = the 1st model's cmd.bin
|
|
|
|
//load cmd + weight + setup together
|
|
p_model = &(s_model_data.p_model_info[model_index_p]);
|
|
ddr_addr_offset = p_model->cmd_mem_addr - ddr_addr_models_head;
|
|
|
|
flash_addr = FLASH_MODEL_ALL_ADDR + ddr_addr_offset;
|
|
|
|
len_to_load = ALIGN16(p_model->cmd_mem_len) +
|
|
ALIGN16(p_model->weight_mem_len) +
|
|
ALIGN16(p_model->setup_mem_len);
|
|
|
|
//model from flash to ddr
|
|
kdp_memxfer_module.flash_to_ddr(p_model->cmd_mem_addr, flash_addr, len_to_load);
|
|
|
|
return 1;
|
|
}
|
|
|
|
/**
|
|
* @brief prepare ouptut_mem_addr2 for ncpu/npu parallel mode inference
|
|
*
|
|
* @return 0:OK, -1:Fail
|
|
*/
|
|
static int32_t _prepare_output_mem_addr2(void)
|
|
{
|
|
/* Allocate parallel output buffer , if caller not provide buf*/
|
|
struct scpu_to_ncpu_s* comm_out = kmdw_ipc_get_output();
|
|
uint32_t addr_parallel = comm_out->output_mem_addr2;
|
|
|
|
if (addr_parallel == 0) {
|
|
//TODO, dynamic allocate memory for output_mem_addr2
|
|
//uint32_t addr1 = comm_out->models[model_idx].output_mem_addr;
|
|
//uint32_t addr2 = comm_out->models[model_idx].buf_addr;
|
|
//uint32_t len = comm_out->models[model_idx].output_mem_len;
|
|
//if (addr1 == addr2) {
|
|
// // Old memory layout, use working buffer length
|
|
// len = comm_out->models[model_idx].buf_len;
|
|
//}
|
|
|
|
// reserve more space for larger model output (ty_608x608 need 620160)
|
|
uint32_t len = OUTPUT_MEM_ADDR2_SIZE;
|
|
|
|
addr_parallel = kmdw_ddr_reserve(len);
|
|
if (addr_parallel == 0) {
|
|
err_msg("Error ddr allocation ncpu/npu parallel buffer, len %d\n", len);
|
|
return -1; //error
|
|
}
|
|
comm_out->output_mem_addr2 = addr_parallel;
|
|
comm_out->output_mem_len2 = len;
|
|
|
|
dbg_msg("allocated Parallel buffer: len %d, addr 0x%x", len, addr_parallel);
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
#ifdef KL520
|
|
/**
|
|
* @brief prepare ouptut_mem_addr3 for MBSSD network
|
|
*
|
|
* @param [in] model_type model id
|
|
* @return 0:OK, -1:Fail
|
|
*/
|
|
static int32_t _prepare_output_mem_addr3(uint32_t model_type)
|
|
{
|
|
|
|
if (model_type == KNERON_FD_MBSSD_200_200_3 ||
|
|
model_type == KNERON_FD_MASK_MBSSD_200_200_3 ||
|
|
model_type == KNERON_OD_MBSSD ||
|
|
model_type == KNERON_PD_MBSSD ||
|
|
model_type == KNERON_CAR_DETECTION_MBSSD_224_416_3) {
|
|
|
|
uint32_t *pMemAddr3;
|
|
uint32_t len = OUTPUT_MEM_ADDR3_SIZE;
|
|
struct scpu_to_ncpu_s* comm_out = kmdw_ipc_get_output();
|
|
|
|
switch (model_type) {
|
|
case KNERON_FD_MBSSD_200_200_3 :
|
|
case KNERON_FD_MASK_MBSSD_200_200_3 :
|
|
{
|
|
static uint32_t mem_addr3_fdssd = 0;
|
|
pMemAddr3 = &mem_addr3_fdssd;
|
|
break;
|
|
}
|
|
case KNERON_OD_MBSSD :
|
|
{
|
|
static uint32_t mem_addr3_odssd = 0;
|
|
pMemAddr3 = &mem_addr3_odssd;
|
|
break;
|
|
}
|
|
case KNERON_PD_MBSSD :
|
|
{
|
|
static uint32_t mem_addr3_pdssd = 0;
|
|
pMemAddr3 = &mem_addr3_pdssd;
|
|
break;
|
|
}
|
|
case KNERON_CAR_DETECTION_MBSSD_224_416_3 :
|
|
{
|
|
static uint32_t mem_addr3_vdssd = 0;
|
|
pMemAddr3 = &mem_addr3_vdssd;
|
|
break;
|
|
}
|
|
default :
|
|
break;
|
|
}
|
|
if (*pMemAddr3 == 0) {
|
|
*pMemAddr3 = kmdw_ddr_reserve(len*sizeof(uint32_t));
|
|
if (*pMemAddr3 == 0) {
|
|
err_msg("Error ddr allocation fail for MBSSD network, mem_addr3 len %d\n", len);
|
|
return -1; //error
|
|
}
|
|
*(uint32_t*)(*pMemAddr3) = 0;
|
|
}
|
|
comm_out->output_mem_addr3 = *pMemAddr3;
|
|
}
|
|
return 0;
|
|
}
|
|
#endif
|
|
|
|
/**
|
|
* @brief specify model information, load model info, load model
|
|
* @param [in] model_type_p: model unique ID defined by Kneron
|
|
* @param [in] model_from_ddr: is model from ddr or host command
|
|
* @return model_slot_index(requested by NCPU/NPU)
|
|
* -1 : model not found
|
|
*/
|
|
/**
 * @brief Resolve a model type to its slot, load it if needed, set up IPC.
 *
 * Ensures the model info table is loaded, maps model_type to its index
 * (the flash path also copies the model's blobs into DDR via _load_model),
 * programs the IPC block with the model, and makes sure the auxiliary
 * output buffers required by the current inference format exist.
 *
 * @param [in] model_type     model unique ID defined by Kneron
 * @param [in] model_from_ddr true when the model was uploaded to DDR by host
 * @return model_slot_index (requested by NCPU/NPU)
 *         -1 : model not found, or a required buffer allocation failed
 */
static int32_t _config_model(uint32_t model_type, bool model_from_ddr)
{
    int model_info_idx; //limitation (hard coded in flash)
    int model_idx;

    //check if model info is loaded (count 0 => nothing to configure)
    if( 0 == _load_model_info(model_from_ddr, false/*reload*/)) {
        return -1;
    }

    if( model_from_ddr == 0 ) {
        //FIXME, should remove application related code
        /* Special model not in DDR but in ncpu: uses fixed slot/info indices */
        if (model_type == KNERON_2D_LIVENESS_224_224_3) {
            model_idx = 3;
            model_info_idx = 4;

            goto model_common;
        }

        model_info_idx = _get_model_info_array_index_by_model_type(model_type);
        if(model_info_idx == -1) {
            err_msg("[ERR] model_type[%d] is not found in flash\n", model_type);
            return -1;
        }
        _load_model(model_info_idx); /* copy the model's blobs flash -> DDR */

        // FIXME: need to remove the following hard code
        model_idx = model_info_idx;

    } else {
        /* model already in DDR: only resolve its index */
        model_info_idx = _get_model_info_array_index_by_model_type(model_type);
        if(model_info_idx == -1) {
            err_msg("[ERR] model_type[%d] is not found in DDR\n", model_type);
            return -1;
        }
        model_idx = model_info_idx;
    }

model_common:
    s_model_data.n_model_slot_index = model_idx;

    kmdw_ipc_set_model(s_model_data.p_model_info, model_info_idx, model_idx);


    struct kdp_img_raw_s *raw_img = kmdw_model_get_raw_img(s_img_data[s_current_ipc_idx].raw_img_idx);

    /* parallel ncpu/npu mode needs its dedicated output buffer */
    if (raw_img->inf_format & IMAGE_FORMAT_PARALLEL_PROC) {
        if (-1 == _prepare_output_mem_addr2() ) {
            return -1;
        }
    }

#ifdef KL520
    /* MBSSD anchor buffer (only allocated for MBSSD model types) */
    if ( -1 == _prepare_output_mem_addr3(model_type) ) {
        return -1;
    }

#else
    /* non-KL520: unconditionally lazy-allocate addr3/addr4 scratch buffers */
    struct scpu_to_ncpu_s* p_comm_out = kmdw_ipc_get_output();

    if (NULL == p_comm_out->output_mem_addr3) {
        uint32_t len = 0x5000;
        p_comm_out->output_mem_addr3 = kmdw_ddr_reserve(len*sizeof(uint32_t));
        if(NULL == p_comm_out->output_mem_addr3) {
            critical_msg("kmdw_model: failed to malloc comm_out->output_mem_addr3\n");
            return -1;
        }
    }

    if (NULL == p_comm_out->output_mem_addr4) {
        uint32_t len = 8 * (1 << 20); /* 8 MiB */
        p_comm_out->output_mem_addr4 = kmdw_ddr_reserve(len);
        if (NULL == p_comm_out->output_mem_addr4) {
            critical_msg("kmdw_model: failed to malloc comm_out->output_mem_addr4\n");
            return -1;
        }
    }
#endif

    kmdw_ipc_set_model_active(model_idx);

    return model_idx;
}
|
|
|
|
|
|
/**
|
|
* @brief run model according to config settings
|
|
* @return status defined in NCPU
|
|
* @note !!! must be called after kapp_config_model_image()
|
|
*/
|
|
/**
 * @brief Run the configured model: trigger ncpu/npu and wait for completion.
 *
 * Uses the per-slot caller event flags to (1) pick which completion flag to
 * wait on (NPU-done when a separate result event is registered for parallel
 * mode, otherwise NCPU-done), (2) detect a pending abort request, and
 * (3) block until the inference finishes or times out.
 *
 * @return status defined in NCPU (img_result.status), IMAGE_STATE_TIMEOUT
 *         on timeout, or KMDW_MODEL_RUN_RC_ABORT when aborted
 * @note !!! must be called after kapp_config_model_image()
 */
static int32_t _run_model(void)
{
    int active_idx = s_current_ipc_idx;
    int raw_img_idx = s_img_data[active_idx].raw_img_idx;
    struct kdp_img_raw_s *p_raw_image = kmdw_model_get_raw_img(raw_img_idx);
    uint32_t flags, wait_evt;
    uint32_t is_abort = 0;

    // Start time for ncpu/npu round trip
    p_raw_image->tick_start = osKernelGetTickCount();

    /* lazily create the per-slot event object on first use */
    if (s_img_data[active_idx].evt_caller == NULL)
        s_img_data[active_idx].evt_caller = osEventFlagsNew(0);

    if(!s_img_data[active_idx].evt_caller)
        err_msg("<Run-Model> active_idx=%d, osEventFlagsNew evt_caller failure\n",active_idx);

    // set notify for job done
    if (s_img_data[active_idx].evt_result) {
        /* Result event already set. Let's do local event for parallel. */
        wait_evt = FLAG_KMDW_MODEL_FROM_NPU;
    } else {
        wait_evt = FLAG_KMDW_MODEL_FROM_NCPU;
    }

    dbg_msg("<Run-Model> wait %d[%d] evt %x\n", raw_img_idx, active_idx, wait_evt);

    //assign caller event before triggering ncpu/npu
    s_img_data[active_idx].caller_e = wait_evt;

    //trigger ncpu/npu
    kmdw_ipc_trigger_int(CMD_RUN_NPU);

    //check abort signal (non-blocking poll: timeout 0)
    flags = osEventFlagsWait(s_img_data[active_idx].evt_caller,
                             FLAG_KMDW_MODEL_ABORT,
                             osFlagsWaitAll, 0);
    if( flags != osFlagsErrorResource ) {
        /* abort flag was set; clear it and remember to bail out after the
           in-flight inference completes */
        osEventFlagsClear(s_img_data[active_idx].evt_caller, FLAG_KMDW_MODEL_ABORT);
        is_abort = 1;
    }

    /* with debug checkpoints enabled, wait forever so breakpoints on the
       ncpu side do not trip the timeout */
    uint32_t wait_timeout = (kmdw_ipc_get_output()->kp_dbg_checkpoinots == 0x0) ? MODEL_INF_TIMEOUT : osWaitForever;

    //wait for finish of current task
    flags = osEventFlagsWait(s_img_data[active_idx].evt_caller,
                             wait_evt,
                             osFlagsNoClear, wait_timeout);

    if(flags == osFlagsErrorTimeout){
        err_msg("[%s] osEventFlagsWait flag 0x%08x timeout\n", __FUNCTION__, wait_evt);
        return IMAGE_STATE_TIMEOUT;
    } else if (flags != wait_evt)
        dbg_msg("[%s] 1+ events 0x%08x (%d[%d] expected)\n", __FUNCTION__, flags, wait_evt, active_idx);
    else
        dbg_msg("[DBG][%s] got: raw_img_idx[active_idx]=%d[%d]\n", __FUNCTION__, raw_img_idx, active_idx);

    /* flags were waited with osFlagsNoClear; clear the handled one here */
    osEventFlagsClear(s_img_data[active_idx].evt_caller, wait_evt);

    if( 1 == is_abort ) {
        dbg_msg("[DBG][%s] abort after n_model_slot_index = %d\n", __FUNCTION__, s_model_data.n_model_slot_index);
        return KMDW_MODEL_RUN_RC_ABORT; //abort
    }

    return kmdw_ipc_get_input()->result.postproc.img_result.status;
}
|
|
|
|
/**
 * @brief Weak default for the fifoq result enqueue hook.
 *
 * Does nothing and reports success; a real implementation elsewhere in the
 * firmware overrides this symbol.
 */
__weak osStatus_t kmdw_fifoq_manager_result_enqueue(void *result_buf, int buf_size, bool preempt)
{
    (void)result_buf;
    (void)buf_size;
    (void)preempt;
    return osOK;
}
|
|
|
|
static void _ipc_handler(struct kdp_img_raw_s *p_raw_image, int state)
|
|
{
|
|
int ipc_idx;
|
|
|
|
if(state == 0x999) // FIXME, very workaround
|
|
{
|
|
kmdw_ipc_get_input()->kp_dbg_status = 0x0;
|
|
osStatus_t sts = kmdw_fifoq_manager_result_enqueue(kmdw_ipc_get_output()->kp_dbg_buffer, 0, false);
|
|
if(sts != osOK)
|
|
kmdw_printf("send dbg data failed in ipc, err %d\n", sts);
|
|
}
|
|
else if (state == IMAGE_STATE_RECEIVING) {
|
|
ipc_idx = p_raw_image->ref_idx;
|
|
|
|
// End time for ncpu/npu round trip
|
|
p_raw_image->tick_end = osKernelGetSysTimerCount();
|
|
|
|
if (s_img_data[ipc_idx].evt_result) {
|
|
dbg_msg("[done: post: P] ipc_idx: %d, result_e: %d (ram %x)\n", ipc_idx, s_img_data[ipc_idx].result_e, p_raw_image);
|
|
osEventFlagsSet(s_img_data[ipc_idx].evt_result, s_img_data[ipc_idx].result_e);
|
|
} else {
|
|
dbg_msg("[done: post: S] ipc_idx: %d, caller_e: %x.\n", ipc_idx, s_img_data[ipc_idx].caller_e);
|
|
osEventFlagsSet(s_img_data[ipc_idx].evt_caller, s_img_data[ipc_idx].caller_e);
|
|
}
|
|
} else if (state == IMAGE_STATE_ACTIVE){
|
|
ipc_idx = s_current_ipc_idx;
|
|
dbg_msg("[done: npu: P] ipc_idx: %d, caller_e: %x\n", ipc_idx, s_img_data[ipc_idx].caller_e);
|
|
osEventFlagsSet(s_img_data[ipc_idx].evt_caller, s_img_data[ipc_idx].caller_e);
|
|
} else {
|
|
err_msg("[ERR] wrong state: %d (ipc_idx %d)\n", state, ipc_idx);
|
|
}
|
|
}
|
|
|
|
/* ############################
|
|
* ## Public Functions ##
|
|
* ############################ */
|
|
|
|
void kmdw_model_init(void)
|
|
{
|
|
kmdw_ipc_initialize(_ipc_handler);
|
|
|
|
_init_fw_info_buf();
|
|
s_fw_info_buf_p->model_count = 0;
|
|
}
|
|
|
|
/**
 * @brief Load model(s) from flash into DDR.
 *
 * Refreshes the model info table first when it is missing or was last
 * populated from a non-flash source. KMDW_MODEL_ALL_MODELS loads every
 * model (optionally CRC-verifying the whole all_models.bin blob);
 * otherwise only the model at the given info index is loaded.
 *
 * @param model_info_index_p model info index, or KMDW_MODEL_ALL_MODELS
 * @return model count (all-models case) or _load_model() result (single);
 *         0 on any failure
 */
int32_t kmdw_model_load_model(int8_t model_info_index_p)
{
    int32_t ret = 0;

    if(1 != s_model_data.n_model_source || // check if s_model_data is not according to flash
       0 == s_model_data.n_model_count) {
        if(0 == _load_model_info(false/*from ddr*/, true/*reload*/))
            return 0; //error, no model is loaded
    }

    // load all models
    if (KMDW_MODEL_ALL_MODELS == model_info_index_p) {
        uint8_t i;
        for (i = 0 ; i < s_model_data.n_model_count ; i++) {
            ret = _load_model(i);
            if( 0 == ret) {
                err_msg("[ERR] %s : failed to load model array index:%d\n", __FUNCTION__, i);
                return 0;
            }
        }

        // Very slow if turn it on. Maybe hardware support is needed.
        // Add a new compiler directive if CRC32 method is also used in other scenarios (ex: check FW image)
#if ENABLE_CRC32
        // check CRC value of all_models.bin
        kmdw_model_fw_info_t *model_info_p = _load_flash_model_info();
        kmdw_model_fw_info_ext_t *model_info2_p = _get_fw_info_ext_by_fw_info(model_info_p);

        // cmd_mem_addr of first model is the start address of all_models.bin
        uint8_t *addr = (uint8_t *)s_model_data.p_model_info[0].cmd_mem_addr;

        uint32_t crc32 = kmdw_utils_crc_gen_crc32(addr, model_info2_p->model_total_size);

        dbg_msg("[%s] crc32 calculated: 0x%x\n", __FUNCTION__, crc32);
        dbg_msg("[%s] crc32 read from flash: 0x%x\n", __FUNCTION__, model_info2_p->model_checksum);
        dbg_msg("[%s] model start address: 0x%x\n", __FUNCTION__, s_model_data.p_model_info[0].cmd_mem_addr);
        dbg_msg("[%s] model total size: %d\n", __FUNCTION__, model_info2_p->model_total_size);

        if (crc32 != model_info2_p->model_checksum)
        {
            err_msg("[ERR] %s: all models.bin CRC check failed\n", __FUNCTION__);
            return 0;
        }
#endif

        return s_model_data.n_model_count;
    } else { // load specific model
        ret = _load_model(model_info_index_p);
        return ret;
    }
}
|
|
|
|
/**
 * @brief Force a fresh read of the model info table from the given source.
 *
 * @param from_ddr true: read fw_info from DDR; false: read from flash
 * @return model count, 0 when nothing could be loaded
 */
int32_t kmdw_model_reload_model_info(bool from_ddr)
{
    const bool force_reload = true;
    return _load_model_info(from_ddr, force_reload);
}
|
|
|
|
int32_t kmdw_model_refresh_models(void) // reload all the models from flash again
|
|
{
|
|
uint8_t i;
|
|
|
|
// forcedly update s_model_data which might be poluted by model upload from host
|
|
if(0 == _load_model_info(false/*from ddr*/, true/*reload*/))
|
|
return 0; //error, no model is loaded
|
|
|
|
int ret;
|
|
for (i = 0 ; i < s_model_data.n_model_count ; i++) {
|
|
if (s_model_data.pn_is_model_loaded_table[i]) { // if previously loaded
|
|
s_model_data.pn_is_model_loaded_table[i] = 0;
|
|
ret = _load_model(i); // reload the model again
|
|
if ( 0 == ret) {
|
|
err_msg("[ERR] %s : failed to load model array index:%d\n", __FUNCTION__, i);
|
|
return 0;
|
|
}
|
|
}
|
|
}
|
|
return s_model_data.n_model_count;
|
|
}
|
|
|
|
/**
 * @brief Register the event object/flag used to signal inference results
 *        for the current IPC slot (enables parallel-mode notification).
 *
 * @param result_evt      event flags object to signal on completion
 * @param result_evt_flag flag value to set on that object
 * @return always 0
 */
int32_t kmdw_model_config_result(osEventFlagsId_t result_evt, uint32_t result_evt_flag)
{
    kmdw_img_data_t *slot = &s_img_data[s_current_ipc_idx];

    slot->evt_result = result_evt;
    slot->result_e = result_evt_flag;

    return 0;
}
|
|
|
|
/**
 * @brief Stage an image configuration into the shared raw-image descriptor.
 *
 * Selects the current IPC slot (advancing the double-buffer index when the
 * inference format requests parallel processing), marks the image active in
 * IPC, and copies per-image geometry/format/address data plus optional
 * extension parameters into the shared raw image record.
 *
 * @param [in] img_cfg   image configuration (active index, image list, format)
 * @param [in] ext_param optional extension parameters (MAX_PARAMS_LEN words);
 *                       NULL clears the ext_params area
 */
void kmdw_model_config_img(struct kdp_img_cfg *img_cfg, void *ext_param)
{
    int act_img_idx = img_cfg->image_buf_active_index;
    struct kdp_img_raw_s *raw_img = kmdw_model_get_raw_img(act_img_idx);

    s_current_ipc_idx = s_next_ipc_idx;

    /* parallel mode ping-pongs between the two IPC slots */
    if (img_cfg->inf_format & IMAGE_FORMAT_PARALLEL_PROC)
        s_next_ipc_idx = !s_next_ipc_idx;

    kmdw_ipc_set_image_active(act_img_idx);
    s_img_data[s_current_ipc_idx].raw_img_idx = act_img_idx;

    raw_img->state = IMAGE_STATE_ACTIVE;
    raw_img->seq_num = act_img_idx;

    /* remember which IPC slot owns this image so the IPC handler can
       signal the right event (see _ipc_handler) */
    raw_img->ref_idx = s_current_ipc_idx;
    raw_img->num_image = img_cfg->num_image;
    raw_img->inf_format = img_cfg->inf_format;

    /* copy per-image geometry, format, buffer address and pre-proc params */
    for (int i = 0; i < img_cfg->num_image; i++) {
        raw_img->image_list[i].input_row = img_cfg->image_list[i].input_row;
        raw_img->image_list[i].input_col = img_cfg->image_list[i].input_col;
        raw_img->image_list[i].input_channel = img_cfg->image_list[i].input_channel;
        raw_img->image_list[i].format = img_cfg->image_list[i].format;
        raw_img->image_list[i].image_mem_addr = img_cfg->image_list[i].image_mem_addr;
        raw_img->image_list[i].image_mem_len = img_cfg->image_list[i].image_mem_len;

        memcpy(&(raw_img->image_list[i].params_s), &(img_cfg->image_list[i].params_s), sizeof(parameter_t));
    }

    /* ext_params is MAX_PARAMS_LEN 32-bit words (hence the * 4) */
    if (ext_param == NULL) {
        memset(raw_img->ext_params, 0, MAX_PARAMS_LEN * 4);
    } else {
        memcpy(raw_img->ext_params, ext_param, MAX_PARAMS_LEN * 4);
    }
}
|
|
|
|
struct kdp_img_raw_s* kmdw_model_get_raw_img(int idx)
|
|
{
|
|
struct scpu_to_ncpu_s *comm_out = kmdw_ipc_get_output();
|
|
return &(comm_out->raw_images[idx]);
|
|
}
|
|
|
|
/**
 * @brief Configure and run one inference of the given model type.
 *
 * Resolves/loads the model via _config_model(), points the result buffer of
 * the current raw image at 'output', dumps the run parameters at debug
 * level, then blocks in _run_model() until completion/timeout/abort.
 *
 * @param [in] tag            caller label used only in debug logging
 * @param [in] output         buffer that receives the inference result
 * @param [in] model_type     model unique ID defined by Kneron
 * @param [in] model_from_ddr true when the model was uploaded to DDR by host
 * @return _run_model() status, or KMDW_MODEL_RUN_RC_ABORT when the model
 *         could not be configured
 */
int kmdw_model_run(const char *tag, void *output, uint32_t model_type, bool model_from_ddr)
{
    int model_idx = _config_model(model_type, model_from_ddr);
    if (model_idx < 0) {
        return KMDW_MODEL_RUN_RC_ABORT;
    }

    int img_idx = s_img_data[s_current_ipc_idx].raw_img_idx;
    struct kdp_img_raw_s *raw_img = kmdw_model_get_raw_img(img_idx);

    /* hand the caller's output buffer to the ncpu/npu side */
    raw_img->results[model_idx].result_mem_addr = (uint32_t)output;

    /* debug dump of the run setup */
    dbg_msg("[INFO] %s:\n", tag);
    dbg_msg("    model_idx = %d\n", model_idx);
    dbg_msg("    model type = %d\n", model_type);
    dbg_msg("    ref_idx = %d\n", raw_img->ref_idx);
    dbg_msg("    inf_format = 0x%X\n", raw_img->inf_format);
    dbg_msg("    output addr = 0x%x\n", raw_img->results[model_idx].result_mem_addr);
    dbg_msg("    ext_params(first 4)= %d/%d/%d/%d\n", raw_img->ext_params[0], raw_img->ext_params[1],
            raw_img->ext_params[2], raw_img->ext_params[3]);

    for (int i = 0; i < raw_img->num_image; i++) {
        dbg_msg("    image index: %d\n", i);
        dbg_msg("    (row/col/ch) = %d/%d/%d\n", raw_img->image_list[i].input_row,
                raw_img->image_list[i].input_col,
                raw_img->image_list[i].input_channel);
        dbg_msg("    image format = 0x%x\n", raw_img->image_list[i].format);
        dbg_msg("    crop(tp/bt/lf/rt) = %d/%d/%d/%d\n", raw_img->image_list[i].params_s.crop_top,
                raw_img->image_list[i].params_s.crop_bottom,
                raw_img->image_list[i].params_s.crop_left,
                raw_img->image_list[i].params_s.crop_right);
        dbg_msg("    image addr = 0x%x\n", raw_img->image_list[i].image_mem_addr);
    }

    return _run_model();
}
|
|
|
|
void kmdw_model_abort(void)
|
|
{
|
|
int active_idx = s_current_ipc_idx;
|
|
|
|
if( 0 == s_img_data[active_idx].evt_caller)
|
|
return;
|
|
|
|
osEventFlagsSet(s_img_data[active_idx].evt_caller, FLAG_KMDW_MODEL_ABORT);
|
|
}
|
|
|
|
struct kdp_model_s* kmdw_model_get_model_info(int model_idx_p)
|
|
{
|
|
if (s_model_data.n_model_count == 0) {
|
|
return NULL;
|
|
} else if (model_idx_p >= s_model_data.n_model_count) {
|
|
return NULL;
|
|
} else {
|
|
return &(s_model_data.p_model_info[model_idx_p]);
|
|
}
|
|
}
|
|
|
|
/**
 * Collect the timing statistics recorded for raw image @img_idx.
 *
 * Fills @run_time with round-trip, pre-process, NPU and post-process
 * durations computed from the tick stamps stored in the raw image slot.
 * Does nothing when @run_time is NULL.
 */
void kmdw_model_get_run_time(int img_idx, kmdw_model_run_time_t *run_time/*out*/)
{
    if (NULL == run_time)
        return;

    struct kdp_img_raw_s *raw = kmdw_model_get_raw_img(img_idx);

    run_time->round_trip_time = raw->tick_end - raw->tick_start;
    run_time->pre_proc_time   = raw->tick_end_pre  - raw->tick_start_pre;
    run_time->npu_proc_time   = raw->tick_end_npu  - raw->tick_start_npu;
    run_time->post_proc_time  = raw->tick_end_post - raw->tick_start_post;
}
|
|
|
|
|
|
|
|
/**
 * Check whether a model of the given type is currently loaded.
 *
 * @return 1 when the model type is found in the model info array, 0 otherwise
 */
int kmdw_model_is_model_loaded(uint32_t model_type)
{
    return (_get_model_info_array_index_by_model_type(model_type) != -1) ? 1 : 0;
}
|
|
|
|
uint32_t *kmdw_model_get_all_model_info(bool trust_ddr_data)
|
|
{
|
|
static uint32_t *s_p_model_id_list = NULL; //[model_count, id0, id1, id2 ...]
|
|
|
|
kmdw_model_fw_info_t *fw_info_ptr = NULL;
|
|
|
|
fw_info_ptr = kmdw_model_get_fw_info(trust_ddr_data);
|
|
|
|
if (fw_info_ptr) {
|
|
|
|
if (NULL == s_p_model_id_list)
|
|
s_p_model_id_list = (uint32_t *)calloc(1+KMDW_MODEL_MAX_MODEL_COUNT, sizeof(uint32_t));
|
|
|
|
if (NULL == s_p_model_id_list) {
|
|
err_msg("[ERR] insufficent memory for model id list\n");
|
|
} else {
|
|
int i;
|
|
uint32_t model_id;
|
|
|
|
s_p_model_id_list[0] = fw_info_ptr->model_count;
|
|
dbg_msg("%s:\n", __FUNCTION__);
|
|
dbg_msg("Model Count = %d\n", s_p_model_id_list[0]);
|
|
|
|
for (i = 0 ; i < s_p_model_id_list[0]; i++) {
|
|
model_id = fw_info_ptr->models[i].model_type;
|
|
dbg_msg("Extract Model ID %d\n", model_id);
|
|
|
|
s_p_model_id_list[i+1] = model_id;
|
|
}
|
|
}
|
|
return s_p_model_id_list;
|
|
} else {
|
|
return NULL;
|
|
}
|
|
|
|
}
|
|
|
|
uint32_t kmdw_model_get_crc(bool trust_ddr_data)
|
|
{
|
|
uint32_t ret = 0;
|
|
kmdw_model_fw_info_t *fw_info_ptr;
|
|
kmdw_model_fw_info_ext_t *fw_info_ext_ptr;
|
|
|
|
fw_info_ptr = kmdw_model_get_fw_info(trust_ddr_data);
|
|
fw_info_ext_ptr = _get_fw_info_ext_by_fw_info(fw_info_ptr);
|
|
|
|
if (fw_info_ext_ptr) {
|
|
ret = fw_info_ext_ptr->model_checksum;
|
|
}
|
|
|
|
dbg_msg("%s = 0x%x\n", __FUNCTION__, ret);
|
|
|
|
return ret;
|
|
}
|
|
|
|
|
|
kmdw_model_fw_info_t *kmdw_model_get_fw_info(bool trust_ddr_data)
|
|
{
|
|
uint32_t model_cnt;
|
|
kmdw_model_fw_info_t *fw_info_ptr = s_fw_info_buf_p;
|
|
|
|
if (false == trust_ddr_data) {
|
|
if ((0 >= s_model_data.n_model_count) ||
|
|
((1 != s_model_data.n_model_source) && (2 != s_model_data.n_model_source))) {
|
|
fw_info_ptr = NULL;
|
|
} else {
|
|
model_cnt = fw_info_ptr->model_count;
|
|
|
|
if ((0 == model_cnt) || (model_cnt > KMDW_MODEL_MAX_MODEL_COUNT)) {
|
|
fw_info_ptr = NULL;
|
|
}
|
|
}
|
|
}
|
|
|
|
return fw_info_ptr;
|
|
}
|
|
|
|
uint32_t kmdw_model_get_model_end_addr(bool trust_ddr_data)
|
|
{
|
|
uint32_t ret = 0;
|
|
kmdw_model_fw_info_t* fw_info_ptr;
|
|
kmdw_model_fw_info_ext_t* fw_info_ext_ptr = NULL;
|
|
|
|
if (0 != s_model_data.n_ddr_addr_model_end) {
|
|
ret = s_model_data.n_ddr_addr_model_end;
|
|
goto FUNC_OUT;
|
|
}
|
|
|
|
fw_info_ptr = kmdw_model_get_fw_info(trust_ddr_data);
|
|
fw_info_ext_ptr = _get_fw_info_ext_by_fw_info(fw_info_ptr);
|
|
|
|
if (fw_info_ext_ptr) {
|
|
ret = fw_info_ext_ptr->model_dram_addr_end;
|
|
}
|
|
|
|
FUNC_OUT:
|
|
|
|
dbg_msg("%s = 0x%x\n", __FUNCTION__, ret);
|
|
|
|
return ret;
|
|
}
|
|
|
|
void kmdw_model_set_location(bool model_inddr)
|
|
{
|
|
ModelFromDDR = model_inddr;
|
|
}
|
|
|
|
bool kmdw_model_get_location(void)
|
|
{
|
|
return ModelFromDDR;
|
|
}
|
|
|
|
/**
 * Return the number of input tensors for the given model type.
 *
 * KL520 only supports single-input models, so this is 1 for any loaded
 * model and 0 (with an error log) for an unknown model type.
 */
int kmdw_model_get_input_tensor_num(uint32_t model_type)
{
    if (_get_model_info_array_index_by_model_type(model_type) < 0) {
        err_msg("[%s] invalid model id %d\n", __FUNCTION__, model_type);
        return 0;
    }

    /* KL520 only support single input model */
    return 1;
}
|
|
|
|
/**
 * Fill @tensor_info with the descriptor of input tensor @tensor_idx of the
 * model identified by @model_type.
 *
 * KL520 supports a single input tensor, so only tensor_idx 0 is accepted;
 * the shape/radix fields are read from the cnn header at the start of the
 * model's setup.bin buffer.
 *
 * @return 1 on success, 0 on any error (logged via err_msg)
 */
int kmdw_model_get_input_tensor_info(uint32_t model_type, uint32_t tensor_idx, kmdw_model_tensor_descriptor_t *tensor_info)
{
    int ret = 1;
    int model_idx = 0;
    uint32_t p_setup_bin = 0;
    struct cnn_header_s *target_input_node = NULL;

    if (NULL == tensor_info) {
        err_msg("[%s] NULL tensor_info pointer\n", __FUNCTION__);
        ret = 0;
        goto FUNC_OUT;
    }

    model_idx = _get_model_info_array_index_by_model_type(model_type);
    if (model_idx >= 0) {
        struct kdp_model_s *p_model_info = kmdw_model_get_model_info(model_idx);

        if (NULL != p_model_info) {
            p_setup_bin = p_model_info->setup_mem_addr;
        } else {
            /* fixed: format string had a '%d' with no matching argument
             * (undefined behavior); also use the common goto-cleanup exit */
            err_msg("[%s] NULL model info pointer\n", __FUNCTION__);
            ret = 0;
            goto FUNC_OUT;
        }
    } else {
        err_msg("[%s] invalid model id %d\n", __FUNCTION__, model_type);
        ret = 0;
        goto FUNC_OUT;
    }

    /* single-input platform: only index 0 is valid */
    if (tensor_idx >= 1) {
        err_msg("[%s] tensor index out of range %d\n", __FUNCTION__, tensor_idx);
        ret = 0;
        goto FUNC_OUT;
    }

    /* setup.bin starts with the cnn header describing the input */
    target_input_node = (struct cnn_header_s *)p_setup_bin;

    tensor_info->index = 1;
    tensor_info->shape_npu_len = 4;
    tensor_info->shape_npu[0] = 1;
    tensor_info->shape_npu[1] = target_input_node->input_channel;
    tensor_info->shape_npu[2] = target_input_node->input_row;
    tensor_info->shape_npu[3] = target_input_node->input_col;
    tensor_info->data_layout = DATA_FMT_4W4C8B;
    tensor_info->scale = 1.0;
    tensor_info->radix = target_input_node->input_radix;

FUNC_OUT:
    return ret;
}
|
|
|
|
int kmdw_model_get_output_tensor_num(uint32_t model_type)
|
|
{
|
|
int model_idx = 0;
|
|
struct kdp_model_s *p_model_info = NULL;
|
|
|
|
model_idx = _get_model_info_array_index_by_model_type(model_type);
|
|
if (model_idx >= 0) {
|
|
p_model_info = kmdw_model_get_model_info(model_idx);
|
|
} else {
|
|
err_msg("[%s] invalid model id %d\n", __FUNCTION__, model_type);
|
|
return 0;
|
|
}
|
|
|
|
/******************************************************************
|
|
* legacy setup.bin model
|
|
******************************************************************/
|
|
if (NULL != p_model_info) {
|
|
return ((struct cnn_header_s *)p_model_info->setup_mem_addr)->output_nums;
|
|
} else {
|
|
err_msg("[%s] NULL model info pointer %d\n", __FUNCTION__);
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
/**
 * Fill @tensor_info with the descriptor of output tensor @tensor_idx of the
 * model identified by @model_type.
 *
 * The legacy setup.bin blob is walked record by record until the OUTPUT
 * node whose output_index matches @tensor_idx is found; its geometry,
 * format, scale and radix are copied into @tensor_info.
 *
 * @return 1 on success, 0 on any error (logged via err_msg)
 */
int kmdw_model_get_output_tensor_info(uint32_t model_type, uint32_t tensor_idx, kmdw_model_tensor_descriptor_t *tensor_info)
{
    int ret = 1;
    int model_idx = 0;
    uint32_t p_setup_bin = 0;
    uint32_t node_num = 0;
    uint32_t setup_buff_offset = sizeof(struct cnn_header_s);  /* skip fixed header */
    uint32_t setup_buff_size = 0;
    struct out_node_s *target_output_node = NULL;
    struct out_node_s *output_node = NULL;

    if (NULL == tensor_info) {
        err_msg("[%s] NULL tensor_info pointer\n", __FUNCTION__);
        ret = 0;
        goto FUNC_OUT;
    }

    model_idx = _get_model_info_array_index_by_model_type(model_type);
    if (model_idx >= 0) {
        struct kdp_model_s *p_model_info = kmdw_model_get_model_info(model_idx);

        if (NULL != p_model_info) {
            p_setup_bin = p_model_info->setup_mem_addr;
            setup_buff_size = p_model_info->setup_mem_len;
        } else {
            /* fixed: format string had a '%d' with no matching argument
             * (undefined behavior); also use the common goto-cleanup exit */
            err_msg("[%s] NULL model info pointer\n", __FUNCTION__);
            ret = 0;
            goto FUNC_OUT;
        }
    } else {
        err_msg("[%s] invalid model id %d\n", __FUNCTION__, model_type);
        ret = 0;
        goto FUNC_OUT;
    }

    node_num = ((struct cnn_header_s *)p_setup_bin)->output_nums;

    if (tensor_idx >= node_num) {
        err_msg("[%s] tensor index out of range %d\n", __FUNCTION__, tensor_idx);
        ret = 0;
        goto FUNC_OUT;
    }

    /* walk the variable-length node records until the requested OUTPUT
     * node shows up or the buffer is exhausted */
    while ((setup_buff_offset < setup_buff_size) && (NULL == target_output_node)) {
        uintptr_t node_buff = (uintptr_t)p_setup_bin + setup_buff_offset;
        uint32_t node_id = *(uint32_t *)node_buff;
        uint32_t node_offset = 0;

        switch (node_id) {
        case NODE_TYPE_IN:
            // NPU IN Signal NODE
            dbg_msg("current node is an NPU IN Signal NODE\n");
            node_offset = sizeof(struct in_node_s);
            break;
        case NODE_TYPE_CPU:
            // CPU NODE
            dbg_msg("current node is a CPU NODE\n");
            node_offset = sizeof(struct cpu_node_s) - (2 * sizeof(struct data_node_s));
            break;
        case NODE_TYPE_OUTPUT:
            // OUTPUT NODE
            dbg_msg("current node is a output NODE\n");
            output_node = (struct out_node_s *)node_buff;
            node_offset = sizeof(struct out_node_s) - (sizeof(struct super_node_s));

            if (output_node->output_index == tensor_idx)
                target_output_node = output_node;
            break;
        case NODE_TYPE_DATA:
            // NPU DATA NODE
            dbg_msg("current node is an network data NODE\n");
            node_offset = sizeof(struct data_node_s) - sizeof(struct super_node_s);
            break;
        case NODE_TYPE_SUPER:
            // NPU SUPER NODE
            dbg_msg("current node is an network super NODE\n");
            node_offset = sizeof(struct super_node_s);
            break;
        default:
            // Unknown NODE
            err_msg("[%s] unknown node type: %d\n", __FUNCTION__, node_id);
            ret = 0;
            goto FUNC_OUT;
        }

        setup_buff_offset += node_offset;
    }

    if (NULL == target_output_node) {
        err_msg("[%s] can not find target index node %d\n", __FUNCTION__, tensor_idx);
        ret = 0;
        goto FUNC_OUT;
    }

    tensor_info->index = target_output_node->output_index;
    tensor_info->shape_npu_len = 4;
    tensor_info->shape_npu[0] = 1;
    tensor_info->shape_npu[1] = target_output_node->ch_length;
    tensor_info->shape_npu[2] = target_output_node->row_length;
    tensor_info->shape_npu[3] = target_output_node->col_length;
    tensor_info->data_layout = target_output_node->data_format;
    /* reinterpret the stored scale bit pattern as float via memcpy instead
     * of the old *(float *)& cast, which violated strict aliasing */
    memcpy(&tensor_info->scale, &target_output_node->output_scale, sizeof(tensor_info->scale));
    tensor_info->radix = target_output_node->output_radix;

FUNC_OUT:
    return ret;
}
|
|
|
|
#ifdef EMBED_CMP_NPU
|
|
|
|
/**
 * Register a new model (or refresh an existing one) in the module's model
 * info table, then push the weight/output/command addresses to the NPU.
 *
 * For an unknown model_type a new table entry is appended and all lengths
 * and addresses are recorded; for a known one only the weight address,
 * output address and working-buffer address are updated.
 *
 * NOTE(review): this calls _get_model_info_array_index_FROM_model_type
 * while the rest of this file uses _get_model_info_array_index_BY_model_type
 * — confirm both symbols exist (this one only compiles under EMBED_CMP_NPU).
 * NOTE(review): no capacity check before appending to p_model_info — confirm
 * the table cannot overflow here.
 *
 * @return always 0
 */
int8_t kmdw_model_add_update_model(uint32_t model_type,
    int cmd_len, int wt_len, int input_len, int output_len, int setup_len,
    uint32_t cmd_mem_addr, uint32_t wt_mem_addr,
    uint32_t input_mem_addr, uint32_t output_mem_addr, uint32_t setup_mem_addr)
{
    int model_info_idx = _get_model_info_array_index_from_model_type(model_type);

    if (model_info_idx < 0) {
        /* unknown model: append a fresh entry at the end of the table */
        int model_count = s_model_data.n_model_count + 1;
        s_model_data.n_model_count = model_count;
        model_info_idx = model_count - 1;

        s_model_data.p_model_info[model_info_idx].model_type = model_type;

        s_model_data.p_model_info[model_info_idx].cmd_mem_addr = cmd_mem_addr;
        s_model_data.p_model_info[model_info_idx].cmd_mem_len = cmd_len;

        s_model_data.p_model_info[model_info_idx].weight_mem_len = wt_len;

        s_model_data.p_model_info[model_info_idx].input_mem_addr = input_mem_addr;
        s_model_data.p_model_info[model_info_idx].input_mem_len = input_len;

        s_model_data.p_model_info[model_info_idx].output_mem_len = output_len;
        s_model_data.p_model_info[model_info_idx].buf_len = output_len;

        s_model_data.p_model_info[model_info_idx].setup_mem_addr = setup_mem_addr;
        s_model_data.p_model_info[model_info_idx].setup_mem_len = setup_len;

        s_model_data.pn_is_model_loaded_table[model_info_idx] = 1;
    }

    /* always refresh the relocatable addresses, even for a known model */
    s_model_data.p_model_info[model_info_idx].weight_mem_addr = wt_mem_addr;
    s_model_data.p_model_info[model_info_idx].output_mem_addr = output_mem_addr;
    s_model_data.p_model_info[model_info_idx].buf_addr = output_mem_addr;

    dbg_msg("[%s] model cmd addr: 0x%x\n", __func__, s_model_data.p_model_info[model_info_idx].cmd_mem_addr);
    dbg_msg("[%s] model wt addr: 0x%x\n", __func__, s_model_data.p_model_info[model_info_idx].weight_mem_addr);
    dbg_msg("[%s] model input addr: 0x%x\n", __func__, s_model_data.p_model_info[model_info_idx].input_mem_addr);
    dbg_msg("[%s] model output addr: 0x%x\n", __func__, s_model_data.p_model_info[model_info_idx].output_mem_addr);
    dbg_msg("[%s] model buf addr: 0x%x\n", __func__, s_model_data.p_model_info[model_info_idx].buf_addr);
    dbg_msg("[%s] model setup addr: 0x%x\n", __func__, s_model_data.p_model_info[model_info_idx].setup_mem_addr);

    /* hand the new weight/output/command locations to the NPU side */
    udt_npu_model_mem(wt_mem_addr, output_mem_addr, (void *)cmd_mem_addr);

    return 0;
}
|
|
|
|
#endif // EMBED_CMP_NPU
|
|
|
|
#if DEBUG
|
|
|
|
void kmdw_model_dump_model_info(void)
|
|
{
|
|
struct kdp_model_s *p_modelInfo = 0;
|
|
uint8_t i;
|
|
|
|
dbg_msg("Model info Count = %d\n", s_model_data.n_model_count);
|
|
|
|
for (i = 0 ; i < s_model_data.n_model_count ; i++) {
|
|
p_modelInfo = &(kmdw_model_data.p_model_info[i]);
|
|
dbg_msg("Model(%2d) model_type(%3d)/version(%5d):\n",
|
|
(i+1),
|
|
p_modelInfo->model_type, p_modelInfo->model_version);
|
|
|
|
dbg_msg("input[%x](sz:%d) -> cmd[%x](sz:%d),weight[%x](sz:%d),setup[%x](sz:%d),buf[%x](sz:%d) -> out[%x](sz:%d)\n",
|
|
(i+1),
|
|
p_modelInfo->input_mem_addr, p_modelInfo->input_mem_len,
|
|
p_modelInfo->cmd_mem_addr, p_modelInfo->cmd_mem_len,
|
|
p_modelInfo->weight_mem_addr,p_modelInfo->weight_mem_len,
|
|
p_modelInfo->setup_mem_addr, p_modelInfo->setup_mem_len,
|
|
p_modelInfo->buf_addr, p_modelInfo->buf_len,
|
|
p_modelInfo->output_mem_addr,p_modelInfo->output_mem_len);
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
#endif // DEBUG
|