JoshuaChak committed
Commit • 7c071a8 • 1 Parent(s): ddb8425
Upload folder using huggingface_hub
This view is limited to 50 files because it contains too many changes. See raw diff.
- .gitattributes +45 -0
- Baichuan2/README.md +182 -0
- Baichuan2/compile/compile.sh +186 -0
- Baichuan2/compile/export_onnx.py +182 -0
- Baichuan2/compile/files/Baichuan2-7B/config.json +29 -0
- Baichuan2/compile/files/Baichuan2-7B/modeling_baichuan.py +792 -0
- Baichuan2/compile/torch_inference.py +16 -0
- Baichuan2/demo/CMakeLists.txt +38 -0
- Baichuan2/demo/demo.cpp +472 -0
- Baichuan2/model/tokenizer.model +3 -0
- Baichuan2/requirements.txt +7 -0
- Baichuan2/src/include/bmdef.h +129 -0
- Baichuan2/src/include/bmlib_runtime.h +2581 -0
- Baichuan2/src/include/bmruntime_interface.h +404 -0
- Baichuan2/src/include/sentencepiece/sentencepiece_processor.h +727 -0
- Baichuan2/src/lib_pcie/libbmlib.so +0 -0
- Baichuan2/src/lib_pcie/libbmrt.so +3 -0
- Baichuan2/src/lib_pcie/libbmrt.so.1.0 +3 -0
- Baichuan2/src/lib_pcie/libsentencepiece.a +3 -0
- Baichuan2/src/lib_soc/libbmlib.so +0 -0
- Baichuan2/src/lib_soc/libbmrt.so +3 -0
- Baichuan2/src/lib_soc/libbmrt.so.1.0 +3 -0
- Baichuan2/src/lib_soc/libsentencepiece.a +3 -0
- Baichuan2/web_demo/CMakeLists.txt +36 -0
- Baichuan2/web_demo/chat.cpp +419 -0
- Baichuan2/web_demo/chat.py +97 -0
- Baichuan2/web_demo/web_demo.py +108 -0
- BaseModel/base_model.py +184 -0
- ChatGLM2/README.md +160 -0
- ChatGLM2/compile/compile.sh +179 -0
- ChatGLM2/compile/export_onnx.py +176 -0
- ChatGLM2/compile/files/chatglm2-6b/config.json +42 -0
- ChatGLM2/compile/files/chatglm2-6b/modeling_chatglm.py +1285 -0
- ChatGLM2/demo/CMakeLists.txt +33 -0
- ChatGLM2/demo/demo.cpp +609 -0
- ChatGLM2/run_demo.sh +27 -0
- ChatGLM2/support/include/bmdef.h +129 -0
- ChatGLM2/support/include/bmlib_runtime.h +2581 -0
- ChatGLM2/support/include/bmruntime_interface.h +404 -0
- ChatGLM2/support/include/sentencepiece/sentencepiece_processor.h +727 -0
- ChatGLM2/support/lib_pcie/libbmlib.so +0 -0
- ChatGLM2/support/lib_pcie/libbmrt.so +3 -0
- ChatGLM2/support/lib_pcie/libbmrt.so.1.0 +3 -0
- ChatGLM2/support/lib_pcie/libsentencepiece.a +3 -0
- ChatGLM2/support/lib_soc/libbmlib.so +0 -0
- ChatGLM2/support/lib_soc/libbmrt.so +3 -0
- ChatGLM2/support/lib_soc/libbmrt.so.1.0 +3 -0
- ChatGLM2/support/lib_soc/libsentencepiece.a +3 -0
- ChatGLM2/support/tokenizer/tokenization_chatglm.py +257 -0
- ChatGLM2/support/tokenizer/tokenizer.model +3 -0
.gitattributes
CHANGED
@@ -34,3 +34,48 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 qwen1.5-1.8b_int4_1dev.bmodel filter=lfs diff=lfs merge=lfs -text
+Baichuan2/src/lib_pcie/libbmrt.so filter=lfs diff=lfs merge=lfs -text
+Baichuan2/src/lib_pcie/libbmrt.so.1.0 filter=lfs diff=lfs merge=lfs -text
+Baichuan2/src/lib_pcie/libsentencepiece.a filter=lfs diff=lfs merge=lfs -text
+Baichuan2/src/lib_soc/libbmrt.so filter=lfs diff=lfs merge=lfs -text
+Baichuan2/src/lib_soc/libbmrt.so.1.0 filter=lfs diff=lfs merge=lfs -text
+Baichuan2/src/lib_soc/libsentencepiece.a filter=lfs diff=lfs merge=lfs -text
+ChatGLM2/support/lib_pcie/libbmrt.so filter=lfs diff=lfs merge=lfs -text
+ChatGLM2/support/lib_pcie/libbmrt.so.1.0 filter=lfs diff=lfs merge=lfs -text
+ChatGLM2/support/lib_pcie/libsentencepiece.a filter=lfs diff=lfs merge=lfs -text
+ChatGLM2/support/lib_soc/libbmrt.so filter=lfs diff=lfs merge=lfs -text
+ChatGLM2/support/lib_soc/libbmrt.so.1.0 filter=lfs diff=lfs merge=lfs -text
+ChatGLM2/support/lib_soc/libsentencepiece.a filter=lfs diff=lfs merge=lfs -text
+ChatGLM3/support/lib_pcie/libbmrt.so filter=lfs diff=lfs merge=lfs -text
+ChatGLM3/support/lib_pcie/libbmrt.so.1.0 filter=lfs diff=lfs merge=lfs -text
+ChatGLM3/support/lib_pcie/libsentencepiece.a filter=lfs diff=lfs merge=lfs -text
+ChatGLM3/support/lib_soc/libbmrt.so filter=lfs diff=lfs merge=lfs -text
+ChatGLM3/support/lib_soc/libbmrt.so.1.0 filter=lfs diff=lfs merge=lfs -text
+ChatGLM3/support/lib_soc/libsentencepiece.a filter=lfs diff=lfs merge=lfs -text
+DeepSeek/requirements/sophon-3.7.0-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
+LWM/support/lib_pcie/libbmrt.so filter=lfs diff=lfs merge=lfs -text
+LWM/support/lib_pcie/libbmrt.so.1.0 filter=lfs diff=lfs merge=lfs -text
+LWM/support/lib_pcie/libsentencepiece.a filter=lfs diff=lfs merge=lfs -text
+LWM/support/lib_soc/libbmrt.so filter=lfs diff=lfs merge=lfs -text
+LWM/support/lib_soc/libbmrt.so.1.0 filter=lfs diff=lfs merge=lfs -text
+LWM/support/lib_soc/libsentencepiece.a filter=lfs diff=lfs merge=lfs -text
+Llama2/assets/llama2_pcie filter=lfs diff=lfs merge=lfs -text
+Llama2/demo_parallel/lib/libsentencepiece.a filter=lfs diff=lfs merge=lfs -text
+Llama2/support/lib_pcie/libbmrt.so.1.0 filter=lfs diff=lfs merge=lfs -text
+Llama2/support/lib_pcie/libsentencepiece.a filter=lfs diff=lfs merge=lfs -text
+Llama2/support/lib_soc/libbmrt.so.1.0 filter=lfs diff=lfs merge=lfs -text
+Llama2/support/lib_soc/libsentencepiece.a filter=lfs diff=lfs merge=lfs -text
+Llama3/python_demo/build/CMakeFiles/chat.dir/chat.cpp.o filter=lfs diff=lfs merge=lfs -text
+Llama3/python_demo/build/chat.cpython-38-aarch64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
+Llama3/python_demo/chat.cpython-38-aarch64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
+Qwen1_5/python_demo/build/CMakeFiles/chat.dir/chat.cpp.o filter=lfs diff=lfs merge=lfs -text
+Qwen1_5/python_demo/build/CMakeFiles/chat_parallel.dir/chat_parallel.cpp.o filter=lfs diff=lfs merge=lfs -text
+Qwen1_5/python_demo/build/chat.cpython-38-aarch64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
+Qwen1_5/python_demo/build/chat_parallel.cpython-38-aarch64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
+Qwen1_5/python_demo/chat.cpython-38-aarch64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
+Qwen1_5/python_demo/chat_parallel.cpython-38-aarch64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
+WizardCoder/demo/lib_pcie/lib/libbmrt.so filter=lfs diff=lfs merge=lfs -text
+WizardCoder/demo/lib_pcie/lib/libbmrt.so.1.0 filter=lfs diff=lfs merge=lfs -text
+WizardCoder/demo/lib_soc/lib/libbmrt.so filter=lfs diff=lfs merge=lfs -text
+WizardCoder/demo/lib_soc/lib/libbmrt.so.1.0 filter=lfs diff=lfs merge=lfs -text
+Yi34B/demo_parallel/lib/libsentencepiece.a filter=lfs diff=lfs merge=lfs -text
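The rules added above route the listed binaries through Git LFS. As a minimal sketch (not part of the commit), a path can be checked against a hand-copied subset of these patterns; `fnmatch` is only an approximation of gitattributes glob semantics, since it does not treat `/` specially:

```python
# Check whether a path matches one of the LFS patterns added above.
# The pattern list is a hand-copied subset, not read from .gitattributes.
from fnmatch import fnmatch

LFS_PATTERNS = [
    "*.zst",
    "*tfevents*",
    "Baichuan2/src/lib_pcie/libbmrt.so",
    "Baichuan2/src/lib_soc/libsentencepiece.a",
]

def is_lfs_tracked(path: str) -> bool:
    return any(fnmatch(path, pattern) for pattern in LFS_PATTERNS)

print(is_lfs_tracked("Baichuan2/src/lib_pcie/libbmrt.so"))  # True
print(is_lfs_tracked("Baichuan2/demo/demo.cpp"))            # False
```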
Baichuan2/README.md
ADDED
@@ -0,0 +1,182 @@
![image](../../assets/sophgo_chip.png)

# Baichuan2-TPU

This project deploys the large language model [Baichuan2-7B](https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat) on the BM1684X. The model is converted into a bmodel with the [TPU-MLIR](https://github.com/sophgo/tpu-mlir) compiler and then deployed with C++ code to a BM1684X PCIe environment or SoC environment.

The text below assumes a PCIe environment by default; for a SoC environment, follow the hints where noted.

# Directory layout
```
.
├── README.md                # usage instructions
├── requirements.txt         # required Python wheel packages
├── compile
│   ├── compile.sh           # script that compiles the TPU model
│   ├── export_onnx.py       # script that exports ONNX
│   ├── torch_inference.py   # torch inference script
│   └── files
│       └── Baichuan2-7B     # backup of the files that replace their Baichuan2-7B-Chat counterparts
│           ├── config.json
│           └── modeling_baichuan.py
├── demo                     # Baichuan2 C++ sources
│   ├── CMakeLists.txt
│   └── demo.cpp             # main program
├── src                      # build dependencies
│   ├── include
│   ├── lib_pcie
│   └── lib_soc
├── model                    # model files (the bmodel must be downloaded)
│   ├── baichuan2-7b-test_int8.bmodel
│   └── tokenizer.model
└── web_demo                 # web demo, a browser chat example
    ├── chat.cpp
    ├── chat.py
    ├── CMakeLists.txt
    └── web_demo.py
```
----------------------------

# [Stage 1] Model compilation

## Notes
* Model compilation must be done inside the docker container; it cannot be done outside of docker.

### Step 1: Download the model
The Baichuan2 model is fully open source on Hugging Face. Follow the official download steps to fetch the model and its weights.
```bash
# Make sure you have git-lfs installed (https://git-lfs.com)
git lfs install
git clone https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat
```

### Step 2: Download docker

Pull the docker image and start a container as follows:

``` shell
docker pull sophgo/tpuc_dev:latest

# myname1234 is just an example, you can set your own name
docker run --privileged --name myname1234 -v $PWD:/workspace -it sophgo/tpuc_dev:latest
```

### Step 3: Download and build the TPU-MLIR code

``` shell
git clone git@github.com:sophgo/tpu-mlir.git
cd tpu-mlir
source ./envsetup.sh
./build.sh
```
* PS: Whenever you re-enter the docker environment and need to compile a model, you must run `source ./envsetup.sh` and `./build.sh` from this directory again before any further model compilation.

### Step 4: Download this project and install requirements.txt
Download transformers, sentencepiece, Baichuan2-TPU, and the .bin model from the Baidu Netdisk share, and replace modeling_baichuan.py inside transformers.

``` shell
git clone https://github.com/sophgo/Baichuan2-TPU.git
cd Baichuan2
pip3 install -r requirements.txt
```

### Step 5: Replace modeling_baichuan.py, modify config.json, and generate the ONNX files
In the config.json of the Baichuan2-7B-Chat project, change max_position_embeddings and model_max_length from 4096 to 512.

``` shell
cd compile
cp files/Baichuan2-7B/modeling_baichuan.py $BAICHUAN2_PATH
cp files/Baichuan2-7B/config.json $BAICHUAN2_PATH
python3 export_onnx.py --model_path $BAICHUAN2_PATH
```

* PS1: `$BAICHUAN2_PATH` is the path where the original model was downloaded, e.g. "../../torch2onnx/Baichuan2-7B-Chat"; you can choose the 7B or the 13B model as needed.
* PS2: If you want to debug rather than generate all the ONNX models in one go, you can change num_layers (line 240) to 1 and check whether a single block matches the torch results (see the sketch below).

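A minimal sketch of the single-block comparison mentioned in PS2, assuming the export wrote `block_0.onnx` into `./tmp` (the default folder in `export_onnx.py`) and that the torch reference is computed in a context where that script's `Block` wrapper is importable; the tolerances are illustrative:

```python
# Single-block sanity check: run the exported block_0.onnx with onnxruntime
# and compare against the torch Block(0) from export_onnx.py. SEQ_LENGTH and
# HIDDEN_SIZE match the 7B defaults; the onnx path is an assumption.
import numpy as np
import onnxruntime as ort
import torch

SEQ_LENGTH, HIDDEN_SIZE = 512, 4096

hidden_states = torch.randn(1, SEQ_LENGTH, HIDDEN_SIZE)
position_ids = torch.tensor([range(SEQ_LENGTH)], dtype=torch.long)
attention_mask = torch.randn(1, 1, SEQ_LENGTH, SEQ_LENGTH)

# torch reference (run inside export_onnx.py, where Block is defined):
# torch_out, _, _ = Block(0)(hidden_states, position_ids, attention_mask)

session = ort.InferenceSession("./tmp/block_0.onnx")
onnx_out = session.run(
    ["hidden_states"],
    {
        "input_states": hidden_states.numpy(),
        "position_ids": position_ids.numpy(),
        "attention_mask": attention_mask.numpy(),
    },
)[0]
print(onnx_out.shape)  # expected (1, 512, 4096)
# np.testing.assert_allclose(torch_out.numpy(), onnx_out, rtol=1e-2, atol=1e-2)
```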
### Step 6: Generate the bmodel file

Generate the model:

``` shell
./compile.sh --mode int8
mv baichuan2-7b_int8_1dev.bmodel ../model
```

* PS1: After compilation, a file named baichuan2-{X}b_{Y}_{Z}dev.bmodel is produced under the Baichuan2-TPU/compile path, where X is 7 or 13, Y is the data type selected via the `mode` option of `compile.sh`, and Z is the number of chips used for inference (if num_device is not specified, the {Z}dev part is omitted).
* PS2: Generating the bmodel takes roughly 3 hours or more. 64 GB of RAM and more than 200 GB of disk space are recommended, otherwise OOM or "no space left" errors are very likely.
* PS3: The provided lib_pcie and lib_soc currently contain only the single-chip dynamic libraries; the multi-chip parts will be added in a later update.

----------------------------

# Stage 2: Build the executable (can be skipped)

## Preparation
* bmodel preparation: Stage 1 yields the compiled bmodel file (you can also use the precompiled bmodel we provide), which can be downloaded as follows:
```shell
cd Baichuan2-TPU/model
pip3 install dfss
# baichuan2-7B
python3 -m dfss --url=open@sophgo.com:sophon-demo/baichuan2/baichuan2-7b-test_int8.bmodel
```
This yields the compiled single-chip int8 bmodel file.

## Building the program (C++ version)

Build as follows; the default is the PCIe version:

```shell
cd Baichuan2-TPU/demo
mkdir build
cd build
cmake ..
make
```

For the SoC version there are two ways to build:

Method 1: copy the demo directory to the SoC environment and build it with the steps above (recommended).

Method 2: cross-compile in docker as follows:

```shell
wget https://releases.linaro.org/components/toolchain/binaries/7.5-2019.12/aarch64-linux-gnu/gcc-linaro-7.5.0-2019.12-x86_64_aarch64-linux-gnu.tar.xz
tar -xvf gcc-linaro-7.5.0-2019.12-x86_64_aarch64-linux-gnu.tar.xz
mv gcc-linaro-7.5.0-2019.12-x86_64_aarch64-linux-gnu /opt/aarch64-linux-gnu-7.5.0
cd Baichuan2-TPU/demo
mkdir build
cd build
cmake .. -DTARGET_ARCH=soc # the SoC has a single chip, so multi-chip builds are not supported
make -j
```

This produces the Baichuan2 executable.

Run `baichuan2`:
```shell
./baichuan2 --model ../model/baichuan2-7b-test_int8.bmodel --dev dev_id
```

## Building the program (Python web version) [single chip]

```shell
pip3 install gradio==3.39.0
cd Baichuan2-TPU/web_demo
mkdir build
cd build
cmake ..
make -j
```

A successful build produces `libtpuchat.so*` under the `build` folder. You can then set bmodel\_path, token\_path, dev\_id, and lib\_path (the `libtpuchat.so*` file produced by the build; the default path is under `./build`) in web_demo.py.
```shell
python3 web_demo.py
```
then runs the web demo.
* PS: As long as you do not move the token\_path and lib\_path locations above, specifying bmodel\_path alone is enough to run the program.

For a SoC environment, refer to the C++ version.

* PS: Make sure to install gradio==3.39.0; other versions cause all kinds of problems!

# FAQ
* Adjust NUM_LAYERS in `demo/chat` or in `web_demo/chat.cpp` according to the actual number of blocks; the default targets Baichuan2-7B (NUM_LAYERS=32). The sketch below reads this value from the model's config.json.
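As referenced in the FAQ item above, the block count is also recorded in the downloaded checkpoint's config.json; a minimal sketch for reading it, where the checkout path is an assumption:

```python
# Read num_hidden_layers from the downloaded checkpoint so NUM_LAYERS in
# demo or web_demo/chat.cpp can be set to match (32 for Baichuan2-7B,
# 40 for Baichuan2-13B).
import json

with open("Baichuan2-7B-Chat/config.json") as f:  # assumed checkout location
    cfg = json.load(f)

print("NUM_LAYERS =", cfg["num_hidden_layers"])
```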
Baichuan2/compile/compile.sh
ADDED
@@ -0,0 +1,186 @@
#!/bin/bash
set -ex
models=
mode="f16"
folder="tmp"
num_device=1
mode_args=""
device_args=""
quantize_args="--quantize F16"
name=""
num_layers=
out_model=$name.bmodel

if [ -z "$name" ]; then
    name="baichuan2-7b"
    echo "Compile Baichuan2-7B"
else
    name="baichuan2-13b"
    echo "Compile Baichuan2-13B"
fi

while [[ $# -gt 0 ]]; do
    key="$1"

    case $key in
    --mode)
        mode="$2"
        shift 2
        ;;
    --num_device)
        num_device="$2"
        shift 2
        ;;
    --name)
        name="$2"
        shift 2
        ;;
    *)
        echo "Invalid option: $key" >&2
        exit 1
        ;;
    :)
        echo "Option -$OPTARG requires an argument." >&2
        exit 1
        ;;
    esac
done

if [ x$mode == x"int8" ] || [ x$mode == x"int4" ]; then
    if [ x$mode == x"int8" ]; then
        quantize_args="--quantize W8F16"
    else
        quantize_args="--quantize W4BF16 --q_group_size 64"
    fi
    out_model=$name'_'$mode'.bmodel'
fi

if [ x$name == x"baichuan2-7b" ] || [ x$name == x"baichuan2-13b" ]; then
    if [ x$name == x"baichuan2-7b" ]; then
        num_layers=32
    else
        num_layers=40
    fi
fi

if [ x$num_device != x1 ]; then
    device_args="--num_device $num_device"
    out_model=$name'_'$mode'_'$num_device'dev.bmodel'
else
    out_model=$name'_'$mode'_1dev.bmodel'
fi

outdir=${folder}/embedding
mkdir -p $outdir
pushd $outdir

seqlen=512
model_transform.py \
    --model_name embedding \
    --model_def ../embedding.onnx \
    --input_shapes [[1,$seqlen]] \
    --mlir embedding_${seqlen}.mlir


model_deploy.py \
    --mlir embedding_$seqlen.mlir \
    --quantize F16 \
    --chip bm1684x \
    $device_args \
    --model embedding_${seqlen}_f16.bmodel

model_transform.py \
    --model_name embedding_cache \
    --model_def ../embedding.onnx \
    --input_shapes [[1,1]] \
    --mlir embedding_1.mlir


model_deploy.py \
    --mlir embedding_1.mlir \
    --quantize F16 \
    --chip bm1684x \
    $device_args \
    --model embedding_1_f16.bmodel

rm *.npz

models=$models' '$outdir'/embedding_1_f16.bmodel '$outdir'/embedding_'$seqlen'_f16.bmodel '

popd

echo $models

outdir=${folder}/$mode"_"$num_device"dev"/lm_head
mkdir -p $outdir
pushd $outdir

model_transform.py \
    --model_name lm_head \
    --model_def ../../lm_head.onnx \
    --mlir lm_head.mlir


model_deploy.py \
    --mlir lm_head.mlir \
    --quantize F16 \
    --chip bm1684x \
    --model lm_head.bmodel

rm *.npz

models=${models}${outdir}'/lm_head.bmodel '
popd

echo $models

outdir=${folder}/$mode"_"$num_device"dev"/block
mkdir -p $outdir

pushd $outdir
mkdir -p $outdir

for ((i=0; i<$num_layers; i++))
do

    model_transform.py \
        --model_name block_$i \
        --model_def ../../block_$i.onnx \
        --mlir block_$i.mlir

    model_deploy.py \
        --mlir block_$i.mlir \
        $quantize_args \
        --chip bm1684x \
        --quant_output \
        --quant_output_list 2,3 \
        $device_args \
        --model block_$i.bmodel

    model_transform.py \
        --model_name block_cache_$i \
        --model_def ../../block_cache_${i}.onnx \
        --mlir block_cache_$i.mlir

    model_deploy.py \
        --mlir block_cache_$i.mlir \
        $quantize_args \
        --chip bm1684x \
        --quant_input \
        --quant_output \
        --quant_input_list 4,5 \
        --quant_output_list 2,3 \
        $device_args \
        --model block_cache_$i.bmodel

    rm *.npz
    # rm ../../block_$i.onnx
    # rm ../../block_cache_$i.onnx

    models=${models}${outdir}'/block_'$i'.bmodel '$outdir'/block_cache_'$i'.bmodel '

done
popd
echo $models

model_tool --combine $models -o $out_model
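For orientation, the `out_model` naming that the script builds up through its branches can be mirrored in a few lines; a sketch under the script's own conventions, not part of the commit:

```python
# Mirror compile.sh's final output naming: {name}_{mode}_{num_device}dev.bmodel.
# With num_device=1 the script still appends the _1dev suffix.
def out_model_name(name: str = "baichuan2-7b", mode: str = "f16", num_device: int = 1) -> str:
    return f"{name}_{mode}_{num_device}dev.bmodel"

assert out_model_name(mode="int8") == "baichuan2-7b_int8_1dev.bmodel"
assert out_model_name("baichuan2-13b", "int4", 2) == "baichuan2-13b_int4_2dev.bmodel"
```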
Baichuan2/compile/export_onnx.py
ADDED
@@ -0,0 +1,182 @@
#!/usr/bin/env python3
# ==============================================================================
#
# Copyright (C) 2023 Sophgo Technologies Inc. All rights reserved.
#
# TPU-MLIR is licensed under the 2-Clause BSD License except for the
# third-party components.
#
# ==============================================================================

import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.utils import GenerationConfig
import numpy as np
import argparse

folder = f"./tmp/onnx"
parser = argparse.ArgumentParser(description='export onnx.')
parser.add_argument('--model_path', type=str, help='path to the torch model.')
parser.add_argument('--seq_length', type=int, default=512, help="sequence length")

args = parser.parse_args()

model_path = args.model_path
folder = "./tmp"  # note: overrides the earlier "./tmp/onnx" value; exports land in ./tmp

origin_model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).eval()
origin_model.generation_config = GenerationConfig.from_pretrained(model_path)
config = origin_model.config
transformer = origin_model.model
layers = transformer.layers

SEQ_LENGTH = args.seq_length
NUM_LAYERS = config.num_hidden_layers
HIDDEN_SIZE = config.hidden_size
NUM_ATTENTION_HEADS = config.num_attention_heads
HEAD_DIM = HIDDEN_SIZE // NUM_ATTENTION_HEADS

for param in origin_model.parameters():
    param.requires_grad = False

tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, trust_remote_code=True)


class Embedding(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, input_ids):
        return transformer.embed_tokens(input_ids)


class Block(torch.nn.Module):

    def __init__(self, layer_id):
        super().__init__()
        # params
        self.layer_id = layer_id
        self.layer = layers[layer_id]

    def forward(self, hidden_states, position_ids, attention_mask):
        hidden_states, past_kv = self.layer(hidden_states,
                                            attention_mask,
                                            position_ids,
                                            use_cache=True)
        present_k, present_v = past_kv
        return hidden_states, present_k, present_v


class BlockCache(torch.nn.Module):

    def __init__(self, layer_id):
        super().__init__()
        # params
        self.layer_id = layer_id
        self.layer = layers[layer_id]

    def forward(self, hidden_states, position_ids, attention_mask, past_k,
                past_v):
        hidden_states, past_kv = self.layer(hidden_states,
                                            attention_mask,
                                            position_ids=position_ids,
                                            past_key_value=(past_k, past_v),
                                            use_cache=True)
        present_k, present_v = past_kv
        return hidden_states, present_k, present_v


class LmHead(torch.nn.Module):

    def __init__(self):
        super().__init__()

    def forward(self, hidden_states):
        hidden_states = transformer.norm(hidden_states)
        m_logits = origin_model.lm_head(hidden_states)
        _, token = torch.topk(m_logits, 1)
        return token


def convert_block(layer_id):
    # input
    hidden_states = torch.randn((1, SEQ_LENGTH, HIDDEN_SIZE))
    position_ids = torch.tensor([range(SEQ_LENGTH)], dtype=torch.long)
    attention_mask = torch.randn((1, 1, SEQ_LENGTH, SEQ_LENGTH))
    model = Block(layer_id)

    torch.onnx.export(
        model, (hidden_states, position_ids, attention_mask),
        f'{folder}/block_{layer_id}.onnx',
        verbose=False,
        input_names=['input_states', 'position_ids', 'attention_mask'],
        output_names=['hidden_states', 'past_k', 'past_v'],
        do_constant_folding=True,
        opset_version=15)


def convert_block_cache(layer_id):
    # input
    np.random.seed(42)
    hidden_states = torch.randn((1, 1, HIDDEN_SIZE))
    position_ids = torch.tensor([range(1)], dtype=torch.long)
    attention_mask = torch.randn((1, 1, 1, SEQ_LENGTH + 1))
    past_k = torch.randn((1, SEQ_LENGTH, NUM_ATTENTION_HEADS, HEAD_DIM))
    past_v = torch.randn((1, SEQ_LENGTH, NUM_ATTENTION_HEADS, HEAD_DIM))
    model = BlockCache(layer_id)

    torch.onnx.export(
        model, (hidden_states, position_ids, attention_mask, past_k, past_v),
        f'{folder}/block_cache_{layer_id}.onnx',
        verbose=False,
        input_names=[
            'input_states', 'position_ids', 'attention_mask', 'history_k',
            'history_v'
        ],
        output_names=['hidden_states', 'past_k', 'past_v'],
        do_constant_folding=True,
        opset_version=15)


def convert_embedding():
    model = Embedding()
    input = torch.tensor([range(SEQ_LENGTH)])
    torch.onnx.export(model, (input),
                      f'{folder}/embedding.onnx',
                      verbose=False,
                      input_names=['input_ids'],
                      output_names=['input_embed'],
                      dynamic_axes={"input_ids": {
                          0: "length"
                      }},
                      do_constant_folding=True,
                      opset_version=15)


def convert_lm_head():
    model = LmHead()
    input = torch.randn(1, HIDDEN_SIZE)
    torch.onnx.export(model, (input),
                      f'{folder}/lm_head.onnx',
                      verbose=False,
                      input_names=['hidden_states'],
                      output_names=['token'],
                      do_constant_folding=True,
                      opset_version=15)

# create folder to store onnx
if not os.path.exists(folder):
    os.makedirs(folder)

# export models
for i in range(NUM_LAYERS):
    print("convert_block_{}".format(i))
    convert_block_cache(i)
    convert_block(i)

print("convert_embedding")
convert_embedding()

print("convert_lm_head")
convert_lm_head()
Baichuan2/compile/files/Baichuan2-7B/config.json
ADDED
@@ -0,0 +1,29 @@
{
  "architectures": [
    "BaichuanForCausalLM"
  ],
  "auto_map": {
    "AutoConfig": "configuration_baichuan.BaichuanConfig",
    "AutoModelForCausalLM": "modeling_baichuan.BaichuanForCausalLM"
  },
  "tokenizer_class": "BaichuanTokenizer",
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_max_length": 4096,
  "model_type": "baichuan",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "pad_token_id": 0,
  "rms_norm_eps": 1e-06,
  "_from_model_config": true,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.29.2",
  "use_cache": true,
  "vocab_size": 125696
}
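Step 5 of the README asks to change max_position_embeddings and model_max_length from 4096 (as shown above) to 512 before exporting ONNX; a minimal sketch that applies that edit, with the checkout path as an assumption:

```python
# Patch the downloaded model's config.json as README step 5 instructs:
# max_position_embeddings and model_max_length go from 4096 to 512.
import json

path = "Baichuan2-7B-Chat/config.json"  # assumed checkout location
with open(path) as f:
    cfg = json.load(f)

cfg["max_position_embeddings"] = 512
cfg["model_max_length"] = 512

with open(path, "w") as f:
    json.dump(cfg, f, indent=2, ensure_ascii=False)
```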
Baichuan2/compile/files/Baichuan2-7B/modeling_baichuan.py
ADDED
@@ -0,0 +1,792 @@
1 |
+
# Copyright 2023 Baichuan Inc. All Rights Reserved.
|
2 |
+
|
3 |
+
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
|
4 |
+
#
|
5 |
+
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
|
6 |
+
# and OPT implementations in this library. It has been modified from its
|
7 |
+
# original forms to accommodate minor architectural differences compared
|
8 |
+
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
|
9 |
+
#
|
10 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
11 |
+
# you may not use this file except in compliance with the License.
|
12 |
+
# You may obtain a copy of the License at
|
13 |
+
#
|
14 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
15 |
+
#
|
16 |
+
# Unless required by applicable law or agreed to in writing, software
|
17 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
18 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
19 |
+
# See the License for the specific language governing permissions and
|
20 |
+
# limitations under the License.
|
21 |
+
|
22 |
+
|
23 |
+
from .configuration_baichuan import BaichuanConfig
|
24 |
+
from .generation_utils import build_chat_input, TextIterStreamer
|
25 |
+
|
26 |
+
import math
|
27 |
+
from typing import List, Optional, Tuple, Union
|
28 |
+
from threading import Thread
|
29 |
+
|
30 |
+
import torch
|
31 |
+
import torch.utils.checkpoint
|
32 |
+
from torch import nn
|
33 |
+
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
|
34 |
+
from torch.nn import functional as F
|
35 |
+
from transformers import PreTrainedModel, PretrainedConfig
|
36 |
+
from transformers.activations import ACT2FN
|
37 |
+
from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
|
38 |
+
from transformers.generation.utils import GenerationConfig
|
39 |
+
from transformers.utils import logging, ContextManagers
|
40 |
+
|
41 |
+
import os
|
42 |
+
from contextlib import contextmanager
|
43 |
+
logger = logging.get_logger(__name__)
|
44 |
+
|
45 |
+
try:
|
46 |
+
from xformers import ops as xops
|
47 |
+
except ImportError:
|
48 |
+
xops = None
|
49 |
+
logger.warning(
|
50 |
+
"Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers\npip install xformers."
|
51 |
+
)
|
52 |
+
|
53 |
+
|
54 |
+
# Copied from transformers.models.bart.modeling_bart._make_causal_mask
|
55 |
+
def _make_causal_mask(
|
56 |
+
input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
|
57 |
+
):
|
58 |
+
"""
|
59 |
+
Make causal mask used for bi-directional self-attention.
|
60 |
+
"""
|
61 |
+
bsz, tgt_len = input_ids_shape
|
62 |
+
mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min, device=device), device=device)
|
63 |
+
mask_cond = torch.arange(mask.size(-1), device=device)
|
64 |
+
mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
|
65 |
+
mask = mask.to(dtype)
|
66 |
+
|
67 |
+
if past_key_values_length > 0:
|
68 |
+
mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
|
69 |
+
return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
|
70 |
+
|
71 |
+
def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
|
72 |
+
"""
|
73 |
+
Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
|
74 |
+
"""
|
75 |
+
if len(mask.size()) == 3:
|
76 |
+
bsz, src_len, _ = mask.size()
|
77 |
+
tgt_len = tgt_len if tgt_len is not None else src_len
|
78 |
+
expanded_mask = mask[:,None,:,:].expand(bsz, 1, tgt_len, src_len).to(dtype)
|
79 |
+
else:
|
80 |
+
bsz, src_len = mask.size()
|
81 |
+
tgt_len = tgt_len if tgt_len is not None else src_len
|
82 |
+
expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
|
83 |
+
|
84 |
+
inverted_mask = 1.0 - expanded_mask
|
85 |
+
|
86 |
+
return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
|
87 |
+
|
88 |
+
|
89 |
+
class RMSNorm(nn.Module):
|
90 |
+
def __init__(self, hidden_size, eps=1e-6):
|
91 |
+
"""
|
92 |
+
RMSNorm is equivalent to T5LayerNorm
|
93 |
+
"""
|
94 |
+
super().__init__()
|
95 |
+
self.weight = nn.Parameter(torch.ones(hidden_size))
|
96 |
+
self.variance_epsilon = eps
|
97 |
+
|
98 |
+
def forward(self, hidden_states):
|
99 |
+
variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
|
100 |
+
hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
|
101 |
+
|
102 |
+
# convert into half-precision if necessary
|
103 |
+
if self.weight.dtype in [torch.float16, torch.bfloat16]:
|
104 |
+
hidden_states = hidden_states.to(self.weight.dtype)
|
105 |
+
|
106 |
+
return self.weight * hidden_states
|
107 |
+
|
108 |
+
|
109 |
+
class RotaryEmbedding(torch.nn.Module):
|
110 |
+
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
|
111 |
+
super().__init__()
|
112 |
+
self.inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim))
|
113 |
+
self.max_seq_len_cached = max_position_embeddings
|
114 |
+
t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=torch.float32)
|
115 |
+
freqs = torch.outer(t, self.inv_freq)
|
116 |
+
emb = torch.cat((freqs, freqs), dim=-1)
|
117 |
+
self.cos_cached = emb.cos()[None, None, :, :].to(torch.float32)
|
118 |
+
self.sin_cached = emb.sin()[None, None, :, :].to(torch.float32)
|
119 |
+
def forward(self, x, seq_len=None):
|
120 |
+
# x: [bs, num_attention_heads, seq_len, head_size]
|
121 |
+
# This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case.
|
122 |
+
if seq_len > self.max_seq_len_cached:
|
123 |
+
self.max_seq_len_cached = seq_len
|
124 |
+
t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=torch.float32)
|
125 |
+
freqs = torch.outer(t, self.inv_freq)
|
126 |
+
emb = torch.cat((freqs, freqs), dim=-1)
|
127 |
+
self.cos_cached = emb.cos()[None, None, :, :].to(torch.float32).to(x.device)
|
128 |
+
self.sin_cached = emb.sin()[None, None, :, :].to(torch.float32).to(x.device)
|
129 |
+
elif self.cos_cached.device != x.device:
|
130 |
+
self.cos_cached = self.cos_cached.to(x.device)
|
131 |
+
self.sin_cached = self.sin_cached.to(x.device)
|
132 |
+
return (
|
133 |
+
self.cos_cached[:, :, :seq_len, ...],
|
134 |
+
self.sin_cached[:, :, :seq_len, ...],
|
135 |
+
)
|
136 |
+
|
137 |
+
|
138 |
+
def rotate_half(x):
|
139 |
+
"""Rotates half the hidden dims of the input."""
|
140 |
+
x1 = x[..., : x.shape[-1] // 2]
|
141 |
+
x2 = x[..., x.shape[-1] // 2:]
|
142 |
+
return torch.cat((-x2, x1), dim=-1)
|
143 |
+
|
144 |
+
|
145 |
+
def apply_rotary_pos_emb(q, k, cos_, sin_, position_ids):
|
146 |
+
cos = cos_.squeeze(1).squeeze(0) # [seq_len, dim]
|
147 |
+
sin = sin_.squeeze(1).squeeze(0) # [seq_len, dim]
|
148 |
+
cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
|
149 |
+
sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
|
150 |
+
cos = cos.transpose(1, 2)
|
151 |
+
sin = sin.transpose(1, 2)
|
152 |
+
q_embed = (q.float() * cos) + (rotate_half(q.float()) * sin)
|
153 |
+
k_embed = (k.float() * cos) + (rotate_half(k.float()) * sin)
|
154 |
+
return q_embed.to(q.dtype), k_embed.to(k.dtype)
|
155 |
+
|
156 |
+
|
157 |
+
class MLP(nn.Module):
|
158 |
+
def __init__(
|
159 |
+
self,
|
160 |
+
hidden_size: int,
|
161 |
+
intermediate_size: int,
|
162 |
+
hidden_act: str,
|
163 |
+
):
|
164 |
+
super().__init__()
|
165 |
+
self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
|
166 |
+
self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
|
167 |
+
self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
|
168 |
+
self.act_fn = ACT2FN[hidden_act]
|
169 |
+
|
170 |
+
def forward(self, x):
|
171 |
+
return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
|
172 |
+
|
173 |
+
|
174 |
+
class Attention(nn.Module):
|
175 |
+
"""Multi-headed attention from 'Attention Is All You Need' paper"""
|
176 |
+
def __init__(self, config: BaichuanConfig):
|
177 |
+
super().__init__()
|
178 |
+
self.config = config
|
179 |
+
self.hidden_size = config.hidden_size
|
180 |
+
self.num_heads = config.num_attention_heads
|
181 |
+
self.head_dim = self.hidden_size // self.num_heads
|
182 |
+
self.max_position_embeddings = config.max_position_embeddings
|
183 |
+
|
184 |
+
if (self.head_dim * self.num_heads) != self.hidden_size:
|
185 |
+
raise ValueError(
|
186 |
+
f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
|
187 |
+
f" and `num_heads`: {self.num_heads})."
|
188 |
+
)
|
189 |
+
self.W_pack = nn.Linear(self.hidden_size, 3 * self.hidden_size, bias=False)
|
190 |
+
self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
|
191 |
+
self.rotary_emb = RotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings)
|
192 |
+
|
193 |
+
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
|
194 |
+
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
|
195 |
+
|
196 |
+
def forward(
|
197 |
+
self,
|
198 |
+
hidden_states: torch.Tensor,
|
199 |
+
attention_mask: Optional[torch.Tensor] = None,
|
200 |
+
position_ids: Optional[torch.LongTensor] = None,
|
201 |
+
past_key_value: Optional[Tuple[torch.Tensor]] = None,
|
202 |
+
output_attentions: bool = False,
|
203 |
+
use_cache: bool = False,
|
204 |
+
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
|
205 |
+
bsz, q_len, _ = hidden_states.size()
|
206 |
+
|
207 |
+
proj = self.W_pack(hidden_states)
|
208 |
+
proj = proj.unflatten(-1, (3, self.hidden_size)).unsqueeze(0).transpose(0, -2).squeeze(-2)
|
209 |
+
query_states = proj[0].view(bsz, q_len, self.num_heads, self.head_dim)
|
210 |
+
key_states = proj[1].view(bsz, q_len, self.num_heads, self.head_dim)
|
211 |
+
value_states = proj[2].view(bsz, q_len, self.num_heads, self.head_dim)
|
212 |
+
|
213 |
+
kv_seq_len = key_states.shape[-3]
|
214 |
+
if past_key_value is not None:
|
215 |
+
kv_seq_len = kv_seq_len + past_key_value[0].shape[-3]
|
216 |
+
if past_key_value is not None:
|
217 |
+
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len-1)
|
218 |
+
else:
|
219 |
+
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
|
220 |
+
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
|
221 |
+
# [bsz, nh, t, hd]
|
222 |
+
past_kv = (key_states, value_states) if use_cache else None
|
223 |
+
if past_key_value is not None:
|
224 |
+
# reuse k, v, self_attention
|
225 |
+
key_states = torch.cat([past_key_value[0], key_states], dim=1)
|
226 |
+
value_states = torch.cat([past_key_value[1], value_states], dim=1)
|
227 |
+
|
228 |
+
|
229 |
+
if xops is not None and self.training:
|
230 |
+
attn_weights = None
|
231 |
+
query_states = query_states.transpose(1, 2)
|
232 |
+
key_states = key_states.transpose(1, 2)
|
233 |
+
value_states = value_states.transpose(1, 2)
|
234 |
+
attn_output = xops.memory_efficient_attention(
|
235 |
+
query_states, key_states, value_states, attn_bias=xops.LowerTriangularMask()
|
236 |
+
)
|
237 |
+
else:
|
238 |
+
with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=True, enable_mem_efficient=True):
|
239 |
+
query_states = query_states.transpose(1, 2)
|
240 |
+
key_states = key_states.transpose(1, 2)
|
241 |
+
value_states = value_states.transpose(1, 2)
|
242 |
+
attn_output = F.scaled_dot_product_attention(query_states, key_states, value_states, attn_mask = attention_mask)
|
243 |
+
attn_output = attn_output.transpose(1, 2)
|
244 |
+
attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
|
245 |
+
attn_output = self.o_proj(attn_output)
|
246 |
+
|
247 |
+
if not output_attentions:
|
248 |
+
attn_weights = None
|
249 |
+
return attn_output, attn_weights, past_kv
|
250 |
+
|
251 |
+
|
252 |
+
class DecoderLayer(nn.Module):
|
253 |
+
def __init__(self, config: BaichuanConfig):
|
254 |
+
super().__init__()
|
255 |
+
self.hidden_size = config.hidden_size
|
256 |
+
self.self_attn = Attention(config=config)
|
257 |
+
self.mlp = MLP(
|
258 |
+
hidden_size=self.hidden_size,
|
259 |
+
intermediate_size=config.intermediate_size,
|
260 |
+
hidden_act=config.hidden_act,
|
261 |
+
)
|
262 |
+
self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
263 |
+
self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
264 |
+
|
265 |
+
def forward(
|
266 |
+
self,
|
267 |
+
hidden_states: torch.Tensor,
|
268 |
+
attention_mask: Optional[torch.Tensor] = None,
|
269 |
+
position_ids: Optional[torch.LongTensor] = None,
|
270 |
+
past_key_value: Optional[Tuple[torch.Tensor]] = None,
|
271 |
+
output_attentions: Optional[bool] = False,
|
272 |
+
use_cache: Optional[bool] = False,
|
273 |
+
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
|
274 |
+
|
275 |
+
residual = hidden_states
|
276 |
+
|
277 |
+
hidden_states = self.input_layernorm(hidden_states)
|
278 |
+
|
279 |
+
# Self Attention
|
280 |
+
hidden_states, self_attn_weights, present_key_value = self.self_attn(
|
281 |
+
hidden_states=hidden_states,
|
282 |
+
attention_mask=attention_mask,
|
283 |
+
position_ids=position_ids,
|
284 |
+
past_key_value=past_key_value,
|
285 |
+
output_attentions=output_attentions,
|
286 |
+
use_cache=use_cache,
|
287 |
+
)
|
288 |
+
hidden_states = residual + hidden_states
|
289 |
+
|
290 |
+
# Fully Connected
|
291 |
+
residual = hidden_states
|
292 |
+
hidden_states = self.post_attention_layernorm(hidden_states)
|
293 |
+
hidden_states = self.mlp(hidden_states)
|
294 |
+
hidden_states = residual + hidden_states
|
295 |
+
|
296 |
+
outputs = (hidden_states,)
|
297 |
+
|
298 |
+
if output_attentions:
|
299 |
+
outputs += (self_attn_weights,)
|
300 |
+
|
301 |
+
if use_cache:
|
302 |
+
outputs += (present_key_value,)
|
303 |
+
|
304 |
+
return outputs
|
305 |
+
|
306 |
+
|
307 |
+
class BaichuanPreTrainedModel(PreTrainedModel):
|
308 |
+
config_class = BaichuanConfig
|
309 |
+
base_model_prefix = "model"
|
310 |
+
supports_gradient_checkpointing = True
|
311 |
+
_no_split_modules = ["DecoderLayer"]
|
312 |
+
_keys_to_ignore_on_load_unexpected = [r"decoder\.version"]
|
313 |
+
|
314 |
+
def _init_weights(self, module):
|
315 |
+
std = self.config.initializer_range
|
316 |
+
if isinstance(module, nn.Linear):
|
317 |
+
module.weight.data.normal_(mean=0.0, std=std)
|
318 |
+
if module.bias is not None:
|
319 |
+
module.bias.data.zero_()
|
320 |
+
elif isinstance(module, nn.Embedding):
|
321 |
+
module.weight.data.normal_(mean=0.0, std=std)
|
322 |
+
if module.padding_idx is not None:
|
323 |
+
module.weight.data[module.padding_idx].zero_()
|
324 |
+
|
325 |
+
def _set_gradient_checkpointing(self, module, value=False):
|
326 |
+
if isinstance(module, BaichuanModel):
|
327 |
+
module.gradient_checkpointing = value
|
328 |
+
|
329 |
+
|
330 |
+
class BaichuanModel(BaichuanPreTrainedModel):
|
331 |
+
def __init__(self, config: BaichuanConfig):
|
332 |
+
super().__init__(config)
|
333 |
+
self.padding_idx = config.pad_token_id
|
334 |
+
self.vocab_size = config.vocab_size
|
335 |
+
|
336 |
+
self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
|
337 |
+
self.layers = nn.ModuleList([DecoderLayer(config) for _ in range(config.num_hidden_layers)])
|
338 |
+
self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
339 |
+
|
340 |
+
self.gradient_checkpointing = False
|
341 |
+
# Initialize weights and apply final processing
|
342 |
+
self.post_init()
|
343 |
+
|
344 |
+
def get_input_embeddings(self):
|
345 |
+
return self.embed_tokens
|
346 |
+
|
347 |
+
def set_input_embeddings(self, value):
|
348 |
+
self.embed_tokens = value
|
349 |
+
|
350 |
+
# Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
|
351 |
+
def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
|
352 |
+
# create causal mask
|
353 |
+
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
|
354 |
+
combined_attention_mask = None
|
355 |
+
if input_shape[-1] > 1:
|
356 |
+
combined_attention_mask = _make_causal_mask(
|
357 |
+
input_shape,
|
358 |
+
inputs_embeds.dtype,
|
359 |
+
device=inputs_embeds.device,
|
360 |
+
past_key_values_length=past_key_values_length,
|
361 |
+
)
|
362 |
+
|
363 |
+
if attention_mask is not None:
|
364 |
+
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
|
365 |
+
expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
|
366 |
+
inputs_embeds.device
|
367 |
+
)
|
368 |
+
combined_attention_mask = (
|
369 |
+
expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
|
370 |
+
)
|
371 |
+
|
372 |
+
return combined_attention_mask
|
373 |
+
|
374 |
+
def forward(
|
375 |
+
self,
|
376 |
+
input_ids: torch.LongTensor = None,
|
377 |
+
attention_mask: Optional[torch.Tensor] = None,
|
378 |
+
position_ids: Optional[torch.LongTensor] = None,
|
379 |
+
past_key_values: Optional[List[torch.FloatTensor]] = None,
|
380 |
+
inputs_embeds: Optional[torch.FloatTensor] = None,
|
381 |
+
use_cache: Optional[bool] = None,
|
382 |
+
output_attentions: Optional[bool] = None,
|
383 |
+
output_hidden_states: Optional[bool] = None,
|
384 |
+
return_dict: Optional[bool] = None,
|
385 |
+
) -> Union[Tuple, BaseModelOutputWithPast]:
|
386 |
+
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
387 |
+
output_hidden_states = (
|
388 |
+
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
389 |
+
)
|
390 |
+
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
391 |
+
|
392 |
+
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
393 |
+
|
394 |
+
# retrieve input_ids and inputs_embeds
|
395 |
+
if input_ids is not None and inputs_embeds is not None:
|
396 |
+
raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
|
397 |
+
elif input_ids is not None:
|
398 |
+
batch_size, seq_length = input_ids.shape
|
399 |
+
elif inputs_embeds is not None:
|
400 |
+
batch_size, seq_length, _ = inputs_embeds.shape
|
401 |
+
else:
|
402 |
+
raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
|
403 |
+
|
404 |
+
seq_length_with_past = seq_length
|
405 |
+
past_key_values_length = 0
|
406 |
+
|
407 |
+
if past_key_values is not None:
|
408 |
+
past_key_values_length = past_key_values[0][0].shape[2]
|
409 |
+
seq_length_with_past = seq_length_with_past + past_key_values_length
|
410 |
+
|
411 |
+
if position_ids is None:
|
412 |
+
device = input_ids.device if input_ids is not None else inputs_embeds.device
|
413 |
+
position_ids = torch.arange(
|
414 |
+
past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
|
415 |
+
)
|
416 |
+
position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
|
417 |
+
else:
|
418 |
+
position_ids = position_ids.view(-1, seq_length).long()
|
419 |
+
|
420 |
+
if inputs_embeds is None:
|
421 |
+
inputs_embeds = self.embed_tokens(input_ids)
|
422 |
+
# embed positions
|
423 |
+
if attention_mask is None:
|
424 |
+
attention_mask = torch.ones(
|
425 |
+
(batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
|
426 |
+
)
|
427 |
+
attention_mask = self._prepare_decoder_attention_mask(
|
428 |
+
attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
|
429 |
+
)
|
430 |
+
|
431 |
+
hidden_states = inputs_embeds
|
432 |
+
|
433 |
+
if self.gradient_checkpointing and self.training:
|
434 |
+
if use_cache:
|
435 |
+
logger.warning_once(
|
436 |
+
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
|
437 |
+
)
|
438 |
+
use_cache = False
|
439 |
+
|
440 |
+
# decoder layers
|
441 |
+
all_hidden_states = () if output_hidden_states else None
|
442 |
+
all_self_attns = () if output_attentions else None
|
443 |
+
next_decoder_cache = () if use_cache else None
|
444 |
+
|
445 |
+
for idx, decoder_layer in enumerate(self.layers):
|
446 |
+
if output_hidden_states:
|
447 |
+
all_hidden_states += (hidden_states,)
|
448 |
+
|
449 |
+
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
450 |
+
|
451 |
+
if self.gradient_checkpointing and self.training:
|
452 |
+
|
453 |
+
def create_custom_forward(module):
|
454 |
+
def custom_forward(*inputs):
|
455 |
+
# None for past_key_value
|
456 |
+
return module(*inputs, output_attentions, None)
|
457 |
+
|
458 |
+
return custom_forward
|
459 |
+
|
460 |
+
layer_outputs = torch.utils.checkpoint.checkpoint(
|
461 |
+
create_custom_forward(decoder_layer),
|
462 |
+
hidden_states,
|
463 |
+
attention_mask,
|
464 |
+
position_ids,
|
465 |
+
None,
|
466 |
+
)
|
467 |
+
else:
|
468 |
+
layer_outputs = decoder_layer(
|
469 |
+
hidden_states,
|
470 |
+
attention_mask=attention_mask,
|
471 |
+
position_ids=position_ids,
|
472 |
+
past_key_value=past_key_value,
|
473 |
+
output_attentions=output_attentions,
|
474 |
+
use_cache=use_cache,
|
475 |
+
)
|
476 |
+
|
477 |
+
hidden_states = layer_outputs[0]
|
478 |
+
|
479 |
+
if use_cache:
|
480 |
+
next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
|
481 |
+
|
482 |
+
if output_attentions:
|
483 |
+
all_self_attns += (layer_outputs[1],)
|
484 |
+
|
485 |
+
hidden_states = self.norm(hidden_states)
|
486 |
+
|
487 |
+
# add hidden states from the last decoder layer
|
488 |
+
if output_hidden_states:
|
489 |
+
all_hidden_states += (hidden_states,)
|
490 |
+
|
491 |
+
next_cache = next_decoder_cache if use_cache else None
|
492 |
+
if not return_dict:
|
493 |
+
return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
|
494 |
+
return BaseModelOutputWithPast(
|
495 |
+
last_hidden_state=hidden_states,
|
496 |
+
past_key_values=next_cache,
|
497 |
+
hidden_states=all_hidden_states,
|
498 |
+
attentions=all_self_attns,
|
499 |
+
)
|
500 |
+
|
501 |
+
|
502 |
+
class NormHead(nn.Module):
|
503 |
+
def __init__(self, hidden_size, vocab_size, bias=False):
|
504 |
+
super().__init__()
|
505 |
+
self.weight = nn.Parameter(torch.empty((vocab_size, hidden_size)))
|
506 |
+
nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
|
507 |
+
self.first_flag = True
|
508 |
+
|
509 |
+
def forward(self, hidden_states):
|
510 |
+
if self.training:
|
511 |
+
norm_weight = nn.functional.normalize(self.weight)
|
512 |
+
self.first_flag = True
|
513 |
+
elif self.first_flag:
|
514 |
+
self.first_flag = False
|
515 |
+
self.weight.data = nn.functional.normalize(self.weight)
|
516 |
+
norm_weight = self.weight
|
517 |
+
else:
|
518 |
+
norm_weight = self.weight
|
519 |
+
return nn.functional.linear(hidden_states, norm_weight)
|
520 |
+
|
521 |
+
_init_weights = True
|
522 |
+
@contextmanager
|
523 |
+
def no_init_weights(_enable=True):
|
524 |
+
global _init_weights
|
525 |
+
old_init_weights = _init_weights
|
526 |
+
if _enable:
|
527 |
+
_init_weights = False
|
528 |
+
try:
|
529 |
+
yield
|
530 |
+
finally:
|
531 |
+
_init_weights = old_init_weights
|
532 |
+
|
533 |
+
class BaichuanForCausalLM(BaichuanPreTrainedModel):
|
534 |
+
def __init__(self, config, *model_args, **model_kwargs):
|
535 |
+
super().__init__(config, *model_args, **model_kwargs)
|
536 |
+
self.model = BaichuanModel(config)
|
537 |
+
|
538 |
+
self.lm_head = NormHead(config.hidden_size, config.vocab_size, bias=False)
|
539 |
+
if hasattr(config, "quantization_config") and isinstance(config.quantization_config, dict) and config.quantization_config.get('load_in_4bit', False):
|
540 |
+
try:
|
541 |
+
from .quantizer import quantize_offline, init_model_weight_int4
|
542 |
+
except ImportError:
|
543 |
+
raise ImportError(f"Needs QLinear to run quantize.")
|
544 |
+
quantize_offline(self, 4)
|
545 |
+
# Initialize weights and apply final processing
|
546 |
+
self.post_init()
|
547 |
+
|
548 |
+
def get_input_embeddings(self):
|
549 |
+
return self.model.embed_tokens
|
550 |
+
|
551 |
+
def set_input_embeddings(self, value):
|
552 |
+
self.model.embed_tokens = value
|
553 |
+
|
554 |
+
def get_output_embeddings(self):
|
555 |
+
return self.lm_head
|
556 |
+
|
557 |
+
def set_output_embeddings(self, new_embeddings):
|
558 |
+
self.lm_head = new_embeddings
|
559 |
+
|
560 |
+
def set_decoder(self, decoder):
|
561 |
+
self.model = decoder
|
562 |
+
|
563 |
+
def get_decoder(self):
|
564 |
+
return self.model
|
565 |
+
|
566 |
+
@classmethod
|
567 |
+
def from_pretrained(
|
568 |
+
cls,
|
569 |
+
pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
|
570 |
+
*model_args,
|
571 |
+
config: Optional[Union[PretrainedConfig, str, os.PathLike]] = None,
|
572 |
+
cache_dir: Optional[Union[str, os.PathLike]] = None,
|
573 |
+
ignore_mismatched_sizes: bool = False,
|
574 |
+
force_download: bool = False,
|
575 |
+
local_files_only: bool = False,
|
576 |
+
token: Optional[Union[str, bool]] = None,
|
577 |
+
revision: str = "main",
|
578 |
+
use_safetensors: bool = None,
|
579 |
+
**kwargs,
|
580 |
+
):
|
581 |
+
# Load config if we don't provide a configuration
|
582 |
+
if not isinstance(config, PretrainedConfig):
|
583 |
+
config_path = config if config is not None else pretrained_model_name_or_path
|
584 |
+
config, model_kwargs = cls.config_class.from_pretrained(
|
585 |
+
config_path,
|
586 |
+
cache_dir=cache_dir,
|
587 |
+
return_unused_kwargs=True,
|
588 |
+
force_download=force_download,
|
589 |
+
resume_download=False,
|
590 |
+
proxies=None,
|
591 |
+
local_files_only=local_files_only,
|
592 |
+
token=token,
|
593 |
+
revision=revision,
|
594 |
+
subfolder="",
|
595 |
+
_from_auto=False,
|
596 |
+
_from_pipeline=None,
|
597 |
+
**kwargs,
|
598 |
+
)
|
599 |
+
else:
|
600 |
+
model_kwargs = kwargs
|
601 |
+
|
602 |
+
if hasattr(config, "quantization_config") and config.quantization_config['load_in_4bit']:
|
603 |
+
try:
|
604 |
+
from .quantizer import init_model_weight_int4
|
605 |
+
from accelerate import init_empty_weights, dispatch_model, infer_auto_device_map
|
606 |
+
from accelerate.utils import CustomDtype
|
607 |
+
from accelerate.utils import get_balanced_memory
|
608 |
+
except ImportError:
|
609 |
+
raise ImportError(f"Needs import model weight init func to run quantize.")
|
610 |
+
# Instantiate model.
|
611 |
+
init_contexts = [no_init_weights(_enable=True)]
|
612 |
+
init_contexts.append(init_empty_weights())
|
613 |
+
with ContextManagers(init_contexts):
|
614 |
+
model = cls(config)
|
615 |
+
|
616 |
+
model_file = os.path.join(pretrained_model_name_or_path, 'pytorch_model.bin')
|
617 |
+
state_dict = torch.load(model_file, map_location="cpu")
|
618 |
+
model.is_quantized = True
|
619 |
+
|
620 |
+
device_map = kwargs.pop("device_map", None)
|
621 |
+
torch_dtype = kwargs.pop("torch_dtype", None)
|
622 |
+
|
623 |
+
if device_map is not None:
|
624 |
+
kwargs = {"no_split_module_classes": model._no_split_modules}
|
625 |
+
target_dtype = CustomDtype.INT4
|
626 |
+
max_memory = get_balanced_memory(
|
627 |
+
model,
|
628 |
+
dtype=target_dtype,
|
629 |
+
low_zero=(device_map == "balanced_low_0"),
|
630 |
+
max_memory=None,
|
631 |
+
**kwargs,
|
632 |
+
)
|
633 |
+
kwargs["max_memory"] = max_memory
|
634 |
+
device_map = infer_auto_device_map(model, dtype=target_dtype, **kwargs)
|
635 |
+
|
636 |
+
model = init_model_weight_int4(config, model, state_dict)
|
637 |
+
|
638 |
+
# Set model in evaluation mode to deactivate DropOut modules by default
|
639 |
+
model.eval()
|
640 |
+
# If it is a model with generation capabilities, attempt to load the generation config
|
641 |
+
if model.can_generate():
|
642 |
+
try:
|
643 |
+
model.generation_config = GenerationConfig.from_pretrained(
|
644 |
+
pretrained_model_name_or_path,
|
645 |
+
cache_dir=cache_dir,
|
646 |
+
force_download=force_download,
|
647 |
+
resume_download=False,
|
648 |
+
proxies=None,
|
649 |
+
local_files_only=local_files_only,
|
650 |
+
token=token,
|
651 |
+
revision=revision,
|
652 |
+
subfolder="",
|
653 |
+
_from_auto=False,
|
654 |
+
_from_pipeline=None,
|
655 |
+
**kwargs,
|
656 |
+
)
|
657 |
+
except (OSError, TypeError):
|
658 |
+
logger.info(
|
659 |
+
"Generation config file not found, using a generation config created from the model config."
|
660 |
+
)
|
661 |
+
pass
|
662 |
+
|
663 |
+
if device_map is not None:
|
664 |
+
dispatch_model(model, device_map=device_map)
|
665 |
+
|
666 |
+
return model
|
667 |
+
return super(BaichuanForCausalLM, cls).from_pretrained(pretrained_model_name_or_path, *model_args,
|
668 |
+
config=config, cache_dir=cache_dir, ignore_mismatched_sizes=ignore_mismatched_sizes,
|
669 |
+
force_download=force_download, local_files_only=local_files_only, token=token, revision=revision,
|
670 |
+
use_safetensors=use_safetensors, **kwargs)
|
671 |
+
|
672 |
+
def forward(
|
673 |
+
self,
|
674 |
+
input_ids: torch.LongTensor = None,
|
675 |
+
attention_mask: Optional[torch.Tensor] = None,
|
676 |
+
position_ids: Optional[torch.LongTensor] = None,
|
677 |
+
past_key_values: Optional[List[torch.FloatTensor]] = None,
|
678 |
+
inputs_embeds: Optional[torch.FloatTensor] = None,
|
679 |
+
labels: Optional[torch.LongTensor] = None,
|
680 |
+
use_cache: Optional[bool] = None,
|
681 |
+
output_attentions: Optional[bool] = None,
|
682 |
+
output_hidden_states: Optional[bool] = None,
|
683 |
+
return_dict: Optional[bool] = None,
|
684 |
+
) -> Union[Tuple, CausalLMOutputWithPast]:
|
685 |
+
|
686 |
+
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
687 |
+
output_hidden_states = (
|
688 |
+
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
689 |
+
)
|
690 |
+
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
691 |
+
|
692 |
+
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
|
693 |
+
outputs = self.model(
|
694 |
+
input_ids=input_ids,
|
695 |
+
attention_mask=attention_mask,
|
696 |
+
position_ids=position_ids,
|
697 |
+
past_key_values=past_key_values,
|
698 |
+
inputs_embeds=inputs_embeds,
|
699 |
+
use_cache=use_cache,
|
700 |
+
output_attentions=output_attentions,
|
701 |
+
output_hidden_states=output_hidden_states,
|
702 |
+
return_dict=return_dict,
|
703 |
+
)
|
704 |
+
|
705 |
+
hidden_states = outputs[0]
|
706 |
+
logits = self.lm_head(hidden_states)
|
707 |
+
loss = None
|
708 |
+
if labels is not None:
|
709 |
+
# Shift so that tokens < n predict n
|
710 |
+
shift_logits = logits[..., :-1, :].contiguous()
|
711 |
+
shift_labels = labels[..., 1:].contiguous()
|
712 |
+
# Flatten the tokens
|
713 |
+
loss_fct = CrossEntropyLoss()
|
714 |
+
shift_logits = shift_logits.view(-1, self.config.vocab_size)
|
715 |
+
shift_labels = shift_labels.view(-1)
|
716 |
+
softmax_normalizer = shift_logits.max(-1).values ** 2
|
717 |
+
z_loss = self.config.z_loss_weight * softmax_normalizer.mean()
|
718 |
+
# Enable model parallelism
|
719 |
+
shift_labels = shift_labels.to(shift_logits.device)
|
720 |
+
loss = loss_fct(shift_logits, shift_labels) + z_loss
|
721 |
+
|
722 |
+
if not return_dict:
|
723 |
+
output = (logits,) + outputs[1:]
|
724 |
+
return (loss,) + output if loss is not None else output
|
725 |
+
|
726 |
+
return CausalLMOutputWithPast(
|
727 |
+
loss=loss,
|
728 |
+
logits=logits,
|
729 |
+
past_key_values=outputs.past_key_values,
|
730 |
+
hidden_states=outputs.hidden_states,
|
731 |
+
attentions=outputs.attentions,
|
732 |
+
)
|
733 |
+
|
734 |
+
def prepare_inputs_for_generation(
|
735 |
+
self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
|
736 |
+
):
|
737 |
+
if past_key_values:
|
738 |
+
input_ids = input_ids[:, -1:]
|
739 |
+
|
740 |
+
position_ids = kwargs.get("position_ids", None)
|
741 |
+
if attention_mask is not None and position_ids is None:
|
742 |
+
# create position_ids on the fly for batch generation
|
743 |
+
position_ids = attention_mask.long().cumsum(-1) - 1
|
744 |
+
position_ids.masked_fill_(attention_mask == 0, 1)
|
745 |
+
if past_key_values:
|
746 |
+
position_ids = position_ids[:, -1].unsqueeze(-1)
|
747 |
+
|
748 |
+
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
|
749 |
+
if inputs_embeds is not None and past_key_values is None:
|
750 |
+
model_inputs = {"inputs_embeds": inputs_embeds}
|
751 |
+
else:
|
752 |
+
model_inputs = {"input_ids": input_ids}
|
753 |
+
|
754 |
+
model_inputs.update(
|
755 |
+
{
|
756 |
+
"position_ids": position_ids,
|
757 |
+
"past_key_values": past_key_values,
|
758 |
+
"use_cache": kwargs.get("use_cache"),
|
759 |
+
"attention_mask": attention_mask,
|
760 |
+
}
|
761 |
+
)
|
762 |
+
return model_inputs
|
763 |
+
|
764 |
+
@staticmethod
|
765 |
+
def _reorder_cache(past_key_values, beam_idx):
|
766 |
+
reordered_past = ()
|
767 |
+
for layer_past in past_key_values:
|
768 |
+
reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
|
769 |
+
return reordered_past
|
770 |
+
|
771 |
+
def quantize(self, bits: int):
|
772 |
+
try:
|
773 |
+
from .quantizer import quantize_online
|
774 |
+
except ImportError:
|
775 |
+
raise ImportError(f"Needs QLinear to run quantize.")
|
776 |
+
return quantize_online(self, bits)
|
777 |
+
|
778 |
+
def chat(self, tokenizer, messages: List[dict], stream=False,
|
779 |
+
generation_config: Optional[GenerationConfig]=None):
|
780 |
+
generation_config = generation_config or self.generation_config
|
781 |
+
input_ids = build_chat_input(self, tokenizer, messages, generation_config.max_new_tokens)
|
782 |
+
if stream:
|
783 |
+
streamer = TextIterStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
|
784 |
+
Thread(target=self.generate, kwargs=dict(
|
785 |
+
inputs=input_ids, streamer=streamer,
|
786 |
+
generation_config=generation_config,
|
787 |
+
)).start()
|
788 |
+
return streamer
|
789 |
+
else:
|
790 |
+
outputs = self.generate(input_ids, generation_config=generation_config)
|
791 |
+
response = tokenizer.decode(outputs[0][len(input_ids[0]):], skip_special_tokens=True)
|
792 |
+
return response
|
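Note on the head above: NormHead differs from a plain lm_head in that each vocabulary row of the weight is L2-normalized before the projection, and in eval mode the normalized weight is written back once (the first_flag branch) so later decode steps skip the renormalization. A minimal PyTorch sketch of the computation, with toy dimensions (not the shipped module):

    import torch
    import torch.nn.functional as F

    hidden_size, vocab_size = 8, 16
    weight = torch.randn(vocab_size, hidden_size)  # lm_head weight, one row per vocab entry
    hidden = torch.randn(2, hidden_size)           # [batch, hidden]

    norm_weight = F.normalize(weight)              # L2-normalize each row (dim=1 default)
    logits = F.linear(hidden, norm_weight)         # [batch, vocab_size]
    print(logits.shape)                            # torch.Size([2, 16])

The training path above additionally adds a z-loss term, z_loss_weight * mean(max_logit ** 2), on top of the cross entropy, which penalizes large softmax normalizers.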
Baichuan2/compile/torch_inference.py
ADDED
@@ -0,0 +1,16 @@
1 |
+
import torch
|
2 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
3 |
+
from transformers.generation.utils import GenerationConfig
|
4 |
+
import argparse
|
5 |
+
|
6 |
+
parser = argparse.ArgumentParser()
|
7 |
+
parser.add_argument('model_path', help='absolute path to the downloaded model')
|
8 |
+
args = parser.parse_args()
|
9 |
+
model_path = args.model_path
|
10 |
+
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, trust_remote_code=True)
|
11 |
+
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", torch_dtype=torch.float32, trust_remote_code=True)
|
12 |
+
model.generation_config = GenerationConfig.from_pretrained(model_path)
|
13 |
+
messages = []
|
14 |
+
messages.append({"role": "user", "content": "解释一下“温故而知新”"})
|
15 |
+
response = model.chat(tokenizer, messages)
|
16 |
+
print(response)
|
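torch_inference.py runs a single-turn query (python torch_inference.py /path/to/model). A multi-turn loop on the same chat() API might look like the sketch below, assuming the model and tokenizer objects created above and that build_chat_input accepts alternating user/assistant roles, as in the upstream Baichuan2 chat utilities:

    messages = []
    while True:
        text = input("Question: ")
        if text in ("exit", "quit"):
            break
        messages.append({"role": "user", "content": text})
        response = model.chat(tokenizer, messages)
        messages.append({"role": "assistant", "content": response})
        print("Answer:", response)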
Baichuan2/demo/CMakeLists.txt
ADDED
@@ -0,0 +1,38 @@
1 |
+
cmake_minimum_required(VERSION 2.8)
|
2 |
+
project(baichuan2)
|
3 |
+
|
4 |
+
if (NOT DEFINED TARGET_ARCH)
|
5 |
+
set(TARGET_ARCH pcie)
|
6 |
+
endif()
|
7 |
+
|
8 |
+
set(CMAKE_INSTALL_PREFIX install)
|
9 |
+
|
10 |
+
if (${CMAKE_HOST_SYSTEM_PROCESSOR} STREQUAL "aarch64")
|
11 |
+
add_definitions(-DSOC_TARGET)
|
12 |
+
link_directories(${PROJECT_SOURCE_DIR}/../src/lib_soc)
|
13 |
+
message("SoC mode, starting......")
|
14 |
+
elseif (${TARGET_ARCH} STREQUAL "pcie")
|
15 |
+
add_definitions(-DPCIE_TARGET)
|
16 |
+
link_directories(${PROJECT_SOURCE_DIR}/../src/lib_pcie)
|
17 |
+
message("Pcie mode, starting......")
|
18 |
+
elseif (${TARGET_ARCH} STREQUAL "soc")
|
19 |
+
add_definitions(-DSOC_TARGET)
|
20 |
+
set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc)
|
21 |
+
set(CMAKE_ASM_COMPILER aarch64-linux-gnu-gcc)
|
22 |
+
set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++)
|
23 |
+
link_directories(${PROJECT_SOURCE_DIR}/lib_soc)
|
24 |
+
message("SoC mode, starting......")
|
25 |
+
endif()
|
26 |
+
|
27 |
+
|
28 |
+
|
29 |
+
|
30 |
+
include_directories(${PROJECT_SOURCE_DIR}/../src/include)
|
31 |
+
|
32 |
+
add_definitions(-DDEBUG --std=c++17 -fPIC -Wall -Werror)
|
33 |
+
set(CMAKE_BUILD_TYPE "Debug")
|
34 |
+
|
35 |
+
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
|
36 |
+
add_executable(baichuan2 demo.cpp)
|
37 |
+
target_link_libraries(baichuan2 bmrt bmlib sentencepiece)
|
38 |
+
|
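Build note: from Baichuan2/demo the usual flow would be mkdir build && cd build && cmake .. && make on a PCIe host, or cmake -DTARGET_ARCH=soc .. to cross-compile with the aarch64-linux-gnu toolchain selected above; the baichuan2 binary lands back in the demo directory because CMAKE_RUNTIME_OUTPUT_DIRECTORY points at the source dir.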
Baichuan2/demo/demo.cpp
ADDED
@@ -0,0 +1,472 @@
1 |
+
//===----------------------------------------------------------------------===//
|
2 |
+
//
|
3 |
+
// Copyright (C) 2023 Sophgo Technologies Inc. All rights reserved.
|
4 |
+
//
|
5 |
+
// TPU-MLIR is licensed under the 2-Clause BSD License except for the
|
6 |
+
// third-party components.
|
7 |
+
//
|
8 |
+
//===----------------------------------------------------------------------===//
|
9 |
+
|
10 |
+
#include <iostream>
|
11 |
+
#include <cstdlib>
|
12 |
+
#include <vector>
|
13 |
+
#include <assert.h>
|
14 |
+
#include <chrono>
|
15 |
+
#include <algorithm>
|
16 |
+
#include "memory.h"
|
17 |
+
#include "sentencepiece/sentencepiece_processor.h"
|
18 |
+
#include "bmruntime_interface.h"
|
19 |
+
#include <getopt.h>
|
20 |
+
#include <numeric>
|
21 |
+
|
22 |
+
static const int NUM_LAYERS = 32;
|
23 |
+
static const int MAX_LEN = 512;
|
24 |
+
static const float ATTENTION_MASK = -1000.;
|
25 |
+
|
26 |
+
static const std::string TOKENIZER_MODEL = "../model/tokenizer.model";
|
27 |
+
|
28 |
+
// #define EXPORT_RESULTS
|
29 |
+
#ifdef EXPORT_RESULTS
|
30 |
+
#include "cnpy.h"
|
31 |
+
static cnpy::npz_t map;
|
32 |
+
|
33 |
+
template <typename T>
|
34 |
+
static void add_array(std::string name, bm_handle_t bm_handle,
|
35 |
+
const bm_device_mem_t &dst) {
|
36 |
+
std::vector<T> data(dst.size / sizeof(T));
|
37 |
+
bm_memcpy_d2s(bm_handle, data.data(), dst);
|
38 |
+
cnpy::npz_add_array(map, name, data);
|
39 |
+
}
|
40 |
+
|
41 |
+
static void save_array(std::string filename) {
|
42 |
+
cnpy::npz_save_all(filename, map);
|
43 |
+
}
|
44 |
+
#endif
|
45 |
+
|
46 |
+
class Baichuan2 {
|
47 |
+
public:
|
48 |
+
void init(const std::vector<int> &devid, std::string model);
|
49 |
+
void chat();
|
50 |
+
void deinit();
|
51 |
+
|
52 |
+
private:
|
53 |
+
void answer(const std::string &input_str);
|
54 |
+
int forward_first(std::vector<int> &tokens);
|
55 |
+
int forward_next();
|
56 |
+
void load_sentencepiece();
|
57 |
+
|
58 |
+
private:
|
59 |
+
std::vector<bm_handle_t> handles;
|
60 |
+
bm_handle_t bm_handle;
|
61 |
+
void *p_bmrt;
|
62 |
+
sentencepiece::SentencePieceProcessor sentencepiece;
|
63 |
+
const bm_net_info_t *net_blocks[NUM_LAYERS];
|
64 |
+
const bm_net_info_t *net_blocks_cache[NUM_LAYERS];
|
65 |
+
const bm_net_info_t *net_embed;
|
66 |
+
const bm_net_info_t *net_embed_cache;
|
67 |
+
const bm_net_info_t *net_lm;
|
68 |
+
bm_tensor_t inputs_embed_512, outputs_embed_512;
|
69 |
+
bm_tensor_t inputs_lm, outputs_lm;
|
70 |
+
bm_tensor_t inputs_pid, next_pid, inputs_attention, next_attention;
|
71 |
+
bm_tensor_t past_key[NUM_LAYERS], past_value[NUM_LAYERS];
|
72 |
+
bm_tensor_t present_key[NUM_LAYERS], present_value[NUM_LAYERS];
|
73 |
+
bm_tensor_t present_key_cache, present_value_cache;
|
74 |
+
std::string name_embed;
|
75 |
+
std::string name_embed_cache;
|
76 |
+
std::string name_lm;
|
77 |
+
std::string name_blocks[NUM_LAYERS];
|
78 |
+
std::string name_blocks_cache[NUM_LAYERS];
|
79 |
+
int round = 0;
|
80 |
+
int token_length;
|
81 |
+
int EOS;
|
82 |
+
std::vector<std::string> history;
|
83 |
+
};
|
84 |
+
|
85 |
+
void Baichuan2::load_sentencepiece() {
|
86 |
+
printf("Load %s ... ", TOKENIZER_MODEL.c_str());
|
87 |
+
auto status = sentencepiece.Load(TOKENIZER_MODEL);
|
88 |
+
if (!status.ok()) {
|
89 |
+
std::cout << status.ToString() << std::endl;
|
90 |
+
exit(-1);
|
91 |
+
}
|
92 |
+
EOS = sentencepiece.eos_id();
|
93 |
+
printf("Done!\n");
|
94 |
+
}
|
95 |
+
|
96 |
+
void Baichuan2::init(const std::vector<int> &devices, std::string model) {
|
97 |
+
load_sentencepiece();
|
98 |
+
// request bm_handle
|
99 |
+
std::cout << "Device [ ";
|
100 |
+
for (auto d : devices) {
|
101 |
+
std::cout << d << " ";
|
102 |
+
}
|
103 |
+
std::cout << "] loading ....\n";
|
104 |
+
// int device_num = devices.size();
|
105 |
+
for (auto d : devices) {
|
106 |
+
bm_handle_t h;
|
107 |
+
bm_status_t status = bm_dev_request(&h, d);
|
108 |
+
assert(BM_SUCCESS == status);
|
109 |
+
handles.push_back(h);
|
110 |
+
}
|
111 |
+
bm_handle = handles[0];
|
112 |
+
// create bmruntime
|
113 |
+
p_bmrt = bmrt_create(bm_handle);
|
114 |
+
assert(NULL != p_bmrt);
|
115 |
+
|
116 |
+
// load bmodel by file
|
117 |
+
printf("Model[%s] loading ....\n", model.c_str());
|
118 |
+
bool ret = bmrt_load_bmodel(p_bmrt, model.c_str());
|
119 |
+
assert(true == ret);
|
120 |
+
printf("Done!\n");
|
121 |
+
// net names
|
122 |
+
name_embed = "embedding";
|
123 |
+
name_embed_cache = "embedding_cache";
|
124 |
+
name_lm = "lm_head";
|
125 |
+
for (int i = 0; i < NUM_LAYERS; i++) {
|
126 |
+
name_blocks[i] = "block_" + std::to_string(i);
|
127 |
+
name_blocks_cache[i] = "block_cache_" + std::to_string(i);
|
128 |
+
}
|
129 |
+
|
130 |
+
// net infos
|
131 |
+
net_embed = bmrt_get_network_info(p_bmrt, name_embed.c_str());
|
132 |
+
net_embed_cache = bmrt_get_network_info(p_bmrt, name_embed_cache.c_str());
|
133 |
+
net_lm = bmrt_get_network_info(p_bmrt, name_lm.c_str());
|
134 |
+
for (int i = 0; i < NUM_LAYERS; i++) {
|
135 |
+
net_blocks[i] = bmrt_get_network_info(p_bmrt, name_blocks[i].c_str());
|
136 |
+
net_blocks_cache[i] =
|
137 |
+
bmrt_get_network_info(p_bmrt, name_blocks_cache[i].c_str());
|
138 |
+
}
|
139 |
+
|
140 |
+
// net device mem
|
141 |
+
ret = bmrt_tensor(&inputs_embed_512, p_bmrt, net_embed->input_dtypes[0],
|
142 |
+
net_embed->stages[0].input_shapes[0]);
|
143 |
+
assert(true == ret);
|
144 |
+
|
145 |
+
ret = bmrt_tensor(&outputs_embed_512, p_bmrt, net_embed->output_dtypes[0],
|
146 |
+
net_embed->stages[0].output_shapes[0]);
|
147 |
+
assert(true == ret);
|
148 |
+
|
149 |
+
ret = bmrt_tensor(&inputs_pid, p_bmrt, net_blocks[0]->input_dtypes[1],
|
150 |
+
net_blocks[0]->stages[0].input_shapes[1]);
|
151 |
+
assert(true == ret);
|
152 |
+
|
153 |
+
ret = bmrt_tensor(&inputs_attention, p_bmrt, net_blocks[0]->input_dtypes[2],
|
154 |
+
net_blocks[0]->stages[0].input_shapes[2]);
|
155 |
+
assert(true == ret);
|
156 |
+
|
157 |
+
ret = bmrt_tensor(&next_pid, p_bmrt, net_blocks_cache[0]->input_dtypes[1],
|
158 |
+
net_blocks_cache[0]->stages[0].input_shapes[1]);
|
159 |
+
assert(true == ret);
|
160 |
+
|
161 |
+
ret =
|
162 |
+
bmrt_tensor(&next_attention, p_bmrt, net_blocks_cache[0]->input_dtypes[2],
|
163 |
+
net_blocks_cache[0]->stages[0].input_shapes[2]);
|
164 |
+
assert(true == ret);
|
165 |
+
|
166 |
+
for (int i = 0; i < NUM_LAYERS; i++) {
|
167 |
+
ret = bmrt_tensor(&past_key[i], p_bmrt, net_blocks[0]->output_dtypes[1],
|
168 |
+
net_blocks[0]->stages[0].output_shapes[1]);
|
169 |
+
assert(true == ret);
|
170 |
+
ret = bmrt_tensor(&past_value[i], p_bmrt, net_blocks[0]->output_dtypes[2],
|
171 |
+
net_blocks[0]->stages[0].output_shapes[2]);
|
172 |
+
assert(true == ret);
|
173 |
+
ret = bmrt_tensor(&present_key[i], p_bmrt, net_blocks[0]->output_dtypes[1],
|
174 |
+
net_blocks[0]->stages[0].output_shapes[1]);
|
175 |
+
assert(true == ret);
|
176 |
+
ret = bmrt_tensor(&present_value[i], p_bmrt, net_blocks[0]->output_dtypes[2],
|
177 |
+
net_blocks[0]->stages[0].output_shapes[2]);
|
178 |
+
assert(true == ret);
|
179 |
+
}
|
180 |
+
ret = bmrt_tensor(&present_key_cache, p_bmrt, net_blocks_cache[0]->output_dtypes[1],
|
181 |
+
net_blocks_cache[0]->stages[0].output_shapes[1]);
|
182 |
+
assert(true == ret);
|
183 |
+
ret = bmrt_tensor(&present_value_cache, p_bmrt, net_blocks_cache[0]->output_dtypes[2],
|
184 |
+
net_blocks_cache[0]->stages[0].output_shapes[2]);
|
185 |
+
assert(true == ret);
|
186 |
+
|
187 |
+
ret = bmrt_tensor(&inputs_lm, p_bmrt, net_lm->input_dtypes[0],
|
188 |
+
net_lm->stages[0].input_shapes[0]);
|
189 |
+
assert(true == ret);
|
190 |
+
ret = bmrt_tensor(&outputs_lm, p_bmrt, net_lm->output_dtypes[0],
|
191 |
+
net_lm->stages[0].output_shapes[0]);
|
192 |
+
assert(true == ret);
|
193 |
+
}
|
194 |
+
|
195 |
+
void Baichuan2::deinit() {
|
196 |
+
bm_free_device(bm_handle, inputs_embed_512.device_mem);
|
197 |
+
bm_free_device(bm_handle, outputs_embed_512.device_mem);
|
198 |
+
bm_free_device(bm_handle, inputs_lm.device_mem);
|
199 |
+
bm_free_device(bm_handle, outputs_lm.device_mem);
|
200 |
+
bm_free_device(bm_handle, inputs_pid.device_mem);
|
201 |
+
bm_free_device(bm_handle, next_pid.device_mem);
|
202 |
+
bm_free_device(bm_handle, inputs_attention.device_mem);
|
203 |
+
bm_free_device(bm_handle, next_attention.device_mem);
|
204 |
+
bm_free_device(bm_handle, present_key_cache.device_mem);
|
205 |
+
bm_free_device(bm_handle, present_value_cache.device_mem);
|
206 |
+
for (int i = 0; i < NUM_LAYERS; i++) {
|
207 |
+
bm_free_device(bm_handle, past_key[i].device_mem);
|
208 |
+
bm_free_device(bm_handle, past_value[i].device_mem);
|
209 |
+
bm_free_device(bm_handle, present_key[i].device_mem);
|
210 |
+
bm_free_device(bm_handle, present_value[i].device_mem);
|
211 |
+
}
|
212 |
+
bmrt_destroy(p_bmrt);
|
213 |
+
for (auto h : handles) {
|
214 |
+
bm_dev_free(h);
|
215 |
+
}
|
216 |
+
}
|
217 |
+
|
218 |
+
int Baichuan2::forward_first(std::vector<int> &tokens) {
|
219 |
+
int input_ids[MAX_LEN] = {0}; // start token
|
220 |
+
int position_id[MAX_LEN] = {0};
|
221 |
+
float attention_mask[MAX_LEN * MAX_LEN] = {0};
|
222 |
+
token_length = tokens.size();
|
223 |
+
|
224 |
+
std::copy(tokens.begin(), tokens.end(), input_ids);
|
225 |
+
for (int i = 0; i < token_length; i++) {
|
226 |
+
position_id[i] = i;
|
227 |
+
}
|
228 |
+
|
229 |
+
for (int i = 0; i < MAX_LEN; i++) {
|
230 |
+
for (int j = 0; j < MAX_LEN; j++) {
|
231 |
+
if (j <= i && i < token_length) {
|
232 |
+
} else {
|
233 |
+
attention_mask[i * MAX_LEN + j] = ATTENTION_MASK;
|
234 |
+
}
|
235 |
+
}
|
236 |
+
}
|
237 |
+
|
238 |
+
// forward embedding
|
239 |
+
bm_memcpy_s2d(bm_handle, inputs_embed_512.device_mem, (void *)input_ids);
|
240 |
+
auto ret =
|
241 |
+
bmrt_launch_tensor_ex(p_bmrt, name_embed.c_str(), &inputs_embed_512, 1,
|
242 |
+
&outputs_embed_512, 1, true, false);
|
243 |
+
assert(ret);
|
244 |
+
// float test_embed[MAX_LEN] = {0};
|
245 |
+
// bm_memcpy_d2s(bm_handle, (void *)&test_embed, outputs_embed_512.device_mem);
|
246 |
+
bm_thread_sync(bm_handle);
|
247 |
+
|
248 |
+
// forward blocks
|
249 |
+
bm_memcpy_s2d(bm_handle, inputs_pid.device_mem, (void *)position_id);
|
250 |
+
bm_memcpy_s2d(bm_handle, inputs_attention.device_mem, (void *)attention_mask);
|
251 |
+
auto inputs_embed = outputs_embed_512;
|
252 |
+
inputs_embed.shape = net_blocks[0]->stages[0].input_shapes[0];
|
253 |
+
bm_tensor_t inputs_block[3] = {inputs_embed, inputs_pid, inputs_attention};
|
254 |
+
for (int i = 0; i < NUM_LAYERS; i++) {
|
255 |
+
bm_tensor_t outputs_block[3] = {inputs_embed, past_key[i], past_value[i]};
|
256 |
+
ret = bmrt_launch_tensor_ex(p_bmrt, name_blocks[i].c_str(), inputs_block, 3,
|
257 |
+
outputs_block, 3, true, false);
|
258 |
+
assert(ret);
|
259 |
+
bm_thread_sync(bm_handle);
|
260 |
+
}
|
261 |
+
int bytes = inputs_embed.device_mem.size / MAX_LEN;
|
262 |
+
bm_memcpy_d2d_byte(bm_handle, inputs_lm.device_mem, 0,
|
263 |
+
inputs_embed.device_mem, (token_length - 1) * bytes,
|
264 |
+
bytes);
|
265 |
+
ret = bmrt_launch_tensor_ex(p_bmrt, name_lm.c_str(), &inputs_lm, 1,
|
266 |
+
&outputs_lm, 1, true, false);
|
267 |
+
bm_thread_sync(bm_handle);
|
268 |
+
|
269 |
+
int token = 0;
|
270 |
+
bm_memcpy_d2s(bm_handle, (void *)&token, outputs_lm.device_mem);
|
271 |
+
return token;
|
272 |
+
}
|
273 |
+
|
274 |
+
int Baichuan2::forward_next() {
|
275 |
+
float attention_mask[MAX_LEN + 1] = {0};
|
276 |
+
for (int i = token_length - 1; i < MAX_LEN; i++) {
|
277 |
+
attention_mask[i] = ATTENTION_MASK;
|
278 |
+
}
|
279 |
+
int32_t position_id = token_length - 1;
|
280 |
+
// embedding
|
281 |
+
outputs_lm.shape = net_embed_cache->stages[0].input_shapes[0];
|
282 |
+
auto ret = bmrt_launch_tensor_ex(p_bmrt, name_embed_cache.c_str(), &outputs_lm, 1,
|
283 |
+
&inputs_lm, 1, true, false);
|
284 |
+
assert(ret);
|
285 |
+
bm_thread_sync(bm_handle);
|
286 |
+
|
287 |
+
// blocks
|
288 |
+
bm_memcpy_s2d(bm_handle, next_attention.device_mem, (void *)attention_mask);
|
289 |
+
bm_memcpy_s2d(bm_handle, next_pid.device_mem, (void *)&position_id);
|
290 |
+
auto inputs_embed = inputs_lm;
|
291 |
+
inputs_embed.shape = net_blocks_cache[0]->stages[0].input_shapes[0];
|
292 |
+
int bytes = bm_mem_get_device_size(present_key_cache.device_mem);
|
293 |
+
int token_offset = (token_length - 1) * bytes;
|
294 |
+
for (int i = 0; i < NUM_LAYERS; i++) {
|
295 |
+
bm_tensor_t inputs_block[5] = {inputs_embed, next_pid, next_attention,
|
296 |
+
past_key[i], past_value[i]};
|
297 |
+
bm_tensor_t outputs_block[3] = {inputs_embed, present_key_cache, present_value_cache};
|
298 |
+
ret = bmrt_launch_tensor_ex(p_bmrt, name_blocks_cache[i].c_str(),
|
299 |
+
inputs_block, 5, outputs_block, 3, true, false);
|
300 |
+
assert(ret);
|
301 |
+
bm_thread_sync(bm_handle);
|
302 |
+
bm_memcpy_d2d_byte(bm_handle, past_key[i].device_mem, token_offset,
|
303 |
+
present_key_cache.device_mem, 0,
|
304 |
+
bytes);
|
305 |
+
bm_memcpy_d2d_byte(bm_handle, past_value[i].device_mem, token_offset,
|
306 |
+
present_value_cache.device_mem, 0,
|
307 |
+
bytes);
|
308 |
+
}
|
309 |
+
outputs_lm.shape = net_lm->stages[0].output_shapes[0];
|
310 |
+
ret = bmrt_launch_tensor_ex(p_bmrt, name_lm.c_str(), &inputs_lm, 1,
|
311 |
+
&outputs_lm, 1, true, false);
|
312 |
+
bm_thread_sync(bm_handle);
|
313 |
+
|
314 |
+
int token = 0;
|
315 |
+
bm_memcpy_d2s(bm_handle, (void *)&token, outputs_lm.device_mem);
|
316 |
+
return token;
|
317 |
+
}
|
318 |
+
|
319 |
+
void Baichuan2::chat() {
|
320 |
+
while (true) {
|
321 |
+
std::cout << "\nQuestion: ";
|
322 |
+
std::string input_str;
|
323 |
+
std::getline(std::cin, input_str);
|
324 |
+
std::string user_token = "<reserved_106>"; // user token id 195
|
325 |
+
std::string assistant_token = "<reserved_107>"; // assistant token id 196
|
326 |
+
if (input_str == "exit") {
|
327 |
+
break;
|
328 |
+
}
|
329 |
+
if (input_str == "clear") {
|
330 |
+
history.clear();
|
331 |
+
continue;
|
332 |
+
}
|
333 |
+
|
334 |
+
input_str = user_token + input_str + assistant_token;
|
335 |
+
|
336 |
+
std::cout << "\nAnswer: " << std::flush;
|
337 |
+
answer(input_str);
|
338 |
+
std::cout << std::endl;
|
339 |
+
}
|
340 |
+
}
|
341 |
+
|
342 |
+
void Baichuan2::answer(const std::string &input_str) {
|
343 |
+
int tok_num = 0;
|
344 |
+
history.emplace_back(input_str);
|
345 |
+
|
346 |
+
std::vector<int> tokens;
|
347 |
+
|
348 |
+
std::string history_input = std::accumulate(history.begin(), history.end(), std::string());
|
349 |
+
sentencepiece.Encode(history_input, &tokens);
|
350 |
+
|
351 |
+
if (tokens.empty()) {
|
352 |
+
printf("Sorry: your question is too wierd!!\n");
|
353 |
+
history.clear();
|
354 |
+
round = 0;
|
355 |
+
return;
|
356 |
+
}
|
357 |
+
// make sure the token count is not too large
|
358 |
+
if (tokens.size() > MAX_LEN - 10) {
|
359 |
+
// reset
|
360 |
+
if (round == 0) {
|
361 |
+
printf("Error: your question is too large!\n");
|
362 |
+
return;
|
363 |
+
}
|
364 |
+
round = 0;
|
365 |
+
history.clear();
|
366 |
+
answer(input_str);
|
367 |
+
return;
|
368 |
+
}
|
369 |
+
auto time_1 = std::chrono::system_clock::now();
|
370 |
+
int pre_token = 0;
|
371 |
+
int token = forward_first(tokens);
|
372 |
+
auto time_2 = std::chrono::system_clock::now();
|
373 |
+
std::string result;
|
374 |
+
while (token != EOS && token_length < MAX_LEN) {
|
375 |
+
std::string pre_word;
|
376 |
+
std::string word;
|
377 |
+
std::vector<int> pre_ids = {pre_token};
|
378 |
+
std::vector<int> ids = {pre_token, token};
|
379 |
+
sentencepiece.Decode(pre_ids, &pre_word);
|
380 |
+
sentencepiece.Decode(ids, &word);
|
381 |
+
std::string diff = word.substr(pre_word.size());
|
382 |
+
result += diff;
|
383 |
+
std::cout << diff << std::flush;
|
384 |
+
if (token_length < MAX_LEN) {
|
385 |
+
token_length++;
|
386 |
+
}
|
387 |
+
tok_num++;
|
388 |
+
token = forward_next();
|
389 |
+
}
|
390 |
+
auto time_3 = std::chrono::system_clock::now();
|
391 |
+
auto ftl_dur =
|
392 |
+
std::chrono::duration_cast<std::chrono::microseconds>(time_2 - time_1);
|
393 |
+
auto tps_dur =
|
394 |
+
std::chrono::duration_cast<std::chrono::microseconds>(time_3 - time_2);
|
395 |
+
double tps = tok_num / (tps_dur.count() * 1e-6);
|
396 |
+
if (token_length >= MAX_LEN) {
|
397 |
+
printf(" ......\nWarning: cleanup early history\n");
|
398 |
+
}
|
399 |
+
// double tht = tokens.size() / (tht_dur.count() * 1e-6);
|
400 |
+
printf("\nFTL:%f s, TPS: %f tokens/s\n", ftl_dur.count() * 1e-6, tps);
|
401 |
+
history.emplace_back(result);
|
402 |
+
if (token_length + 128 >= MAX_LEN) {
|
403 |
+
int num = (history.size() + 3) / 4 * 2;
|
404 |
+
history.erase(history.begin(), history.begin() + num);
|
405 |
+
}
|
406 |
+
}
|
407 |
+
|
408 |
+
static void split(const std::string &s, const std::string &delim,
|
409 |
+
std::vector<std::string> &ret) {
|
410 |
+
size_t last = 0;
|
411 |
+
size_t index = s.find_first_of(delim, last);
|
412 |
+
while (index != std::string::npos) {
|
413 |
+
ret.push_back(s.substr(last, index - last));
|
414 |
+
last = index + 1;
|
415 |
+
index = s.find_first_of(delim, last);
|
416 |
+
}
|
417 |
+
if (last < s.length()) {
|
418 |
+
ret.push_back(s.substr(last));
|
419 |
+
}
|
420 |
+
}
|
421 |
+
|
422 |
+
static std::vector<int> parseCascadeDevices(const std::string &str) {
|
423 |
+
std::vector<int> devices;
|
424 |
+
std::vector<std::string> sub_str;
|
425 |
+
split(str, ",", sub_str);
|
426 |
+
for (auto &s : sub_str) {
|
427 |
+
devices.push_back(std::atoi(s.c_str()));
|
428 |
+
}
|
429 |
+
return devices;
|
430 |
+
}
|
431 |
+
|
432 |
+
void processArguments(int argc, char *argv[], std::string &baichuan_model,
|
433 |
+
std::vector<int> &devices) {
|
434 |
+
struct option longOptions[] = {{"model", required_argument, nullptr, 'm'},
|
435 |
+
{"dev_id", required_argument, nullptr, 'd'},
|
436 |
+
{nullptr, 0, nullptr, 0}};
|
437 |
+
|
438 |
+
int optionIndex = 0;
|
439 |
+
int option;
|
440 |
+
|
441 |
+
while ((option = getopt_long(argc, argv, "m:d:", longOptions,
|
442 |
+
&optionIndex)) != -1) {
|
443 |
+
switch (option) {
|
444 |
+
case 'm':
|
445 |
+
baichuan_model = optarg;
|
446 |
+
break;
|
447 |
+
case 'd':
|
448 |
+
devices = parseCascadeDevices(optarg);
|
449 |
+
break;
|
450 |
+
case '?':
|
451 |
+
exit(EXIT_FAILURE);
|
452 |
+
default:
|
453 |
+
exit(EXIT_FAILURE);
|
454 |
+
}
|
455 |
+
}
|
456 |
+
}
|
457 |
+
|
458 |
+
int main(int argc, char **argv) {
|
459 |
+
// set your bmodel path here
|
460 |
+
printf("Demo for Baichuan2-7B in BM1684X\n");
|
461 |
+
std::string baichuan_model = "baichuan2-7b-test.bmodel";
|
462 |
+
std::vector<int> devices = {0};
|
463 |
+
processArguments(argc, argv, baichuan_model, devices);
|
464 |
+
|
465 |
+
Baichuan2 baichuan;
|
466 |
+
printf("Init Environment ...\n");
|
467 |
+
baichuan.init(devices, baichuan_model);
|
468 |
+
printf("==========================\n");
|
469 |
+
baichuan.chat();
|
470 |
+
baichuan.deinit();
|
471 |
+
return 0;
|
472 |
+
}
|
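The two forward paths above differ mainly in the attention mask layout: forward_first builds a full MAX_LEN x MAX_LEN causal mask over the padded prompt, while forward_next uses a single row of MAX_LEN + 1 slots in which only the cached prefix and the final slot (the token being decoded) stay unmasked. A NumPy sketch of the same masks, using the demo's constants:

    import numpy as np

    MAX_LEN, ATTENTION_MASK = 512, -1000.0
    token_length = 5  # example prompt length

    # forward_first: causal mask over the padded prompt
    first = np.full((MAX_LEN, MAX_LEN), ATTENTION_MASK, dtype=np.float32)
    for i in range(token_length):
        first[i, : i + 1] = 0.0  # row i attends to positions 0..i

    # forward_next: one row; the cache slots 0..token_length-2 and the
    # extra slot MAX_LEN (the new token itself) stay visible
    nxt = np.zeros(MAX_LEN + 1, dtype=np.float32)
    nxt[token_length - 1 : MAX_LEN] = ATTENTION_MASK

After each decode step the one-token present_key/present_value slice is copied back into past_key/past_value at byte offset (token_length - 1) * bytes, which is how the KV cache grows in place without reallocation.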
Baichuan2/model/tokenizer.model
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:79452955be6b419a65984273a9f08af86042e1c2a75ee3ba989cbf620a133cc2
|
3 |
+
size 2001107
|
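The same tokenizer.model that the demo loads through sentencepiece can be sanity-checked from Python with the sentencepiece pin from requirements.txt (a quick check, not part of the pipeline; adjust the path to your checkout):

    import sentencepiece as spm

    sp = spm.SentencePieceProcessor(model_file="Baichuan2/model/tokenizer.model")
    ids = sp.encode("hello world")
    print(ids)             # token ids of the kind fed to forward_first()
    print(sp.decode(ids))  # round-trips to the input text
    print(sp.eos_id())     # the EOS id the demo stops generation on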
Baichuan2/requirements.txt
ADDED
@@ -0,0 +1,7 @@
1 |
+
torch==2.1.2
|
2 |
+
transformers==4.36.2
|
3 |
+
sentencepiece==0.1.99
|
4 |
+
gradio==3.39.0
|
5 |
+
mdtex2html==1.2.0
|
6 |
+
accelerate
|
7 |
+
onnx
|
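Note: torch, transformers and sentencepiece are pinned (presumably the combination the export and demo scripts were validated against), gradio and mdtex2html serve the web demo, and accelerate/onnx are left unpinned; pip install -r Baichuan2/requirements.txt installs the set.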
Baichuan2/src/include/bmdef.h
ADDED
@@ -0,0 +1,129 @@
1 |
+
/*****************************************************************************
|
2 |
+
*
|
3 |
+
* Copyright (c) 2016-2026 by Sophgo Technologies Inc. All rights reserved.
|
4 |
+
*
|
5 |
+
* The material in this file is confidential and contains trade secrets
|
6 |
+
* of Sophgo Technologies Inc. This is proprietary information owned by
|
7 |
+
* Sophgo Technologies Inc. No part of this work may be disclosed,
|
8 |
+
* reproduced, copied, transmitted, or used in any way for any purpose,
|
9 |
+
* without the express written permission of Sophgo Technologies Inc.
|
10 |
+
*
|
11 |
+
*****************************************************************************/
|
12 |
+
|
13 |
+
#ifndef __BMRUNTIME_DEFINE_H__
|
14 |
+
#define __BMRUNTIME_DEFINE_H__
|
15 |
+
|
16 |
+
#include "bmlib_runtime.h"
|
17 |
+
#include <stddef.h>
|
18 |
+
#include <stdint.h>
|
19 |
+
|
20 |
+
#if defined(__cplusplus)
|
21 |
+
extern "C" {
|
22 |
+
#endif
|
23 |
+
|
24 |
+
/* --------------------------------------------------------------------------*/
|
25 |
+
/* basic definitions */
|
26 |
+
|
27 |
+
/* bm_data_type_t holds the type for a scalar value */
|
28 |
+
typedef enum bm_data_type_e {
|
29 |
+
BM_FLOAT32 = 0,
|
30 |
+
BM_FLOAT16 = 1,
|
31 |
+
BM_INT8 = 2,
|
32 |
+
BM_UINT8 = 3,
|
33 |
+
BM_INT16 = 4,
|
34 |
+
BM_UINT16 = 5,
|
35 |
+
BM_INT32 = 6,
|
36 |
+
BM_UINT32 = 7,
|
37 |
+
BM_BFLOAT16 = 8,
|
38 |
+
BM_INT4 = 9,
|
39 |
+
BM_UINT4 = 10,
|
40 |
+
} bm_data_type_t;
|
41 |
+
|
42 |
+
/* store mode definitions */
|
43 |
+
typedef enum bm_store_mode_e {
|
44 |
+
BM_STORE_1N = 0, /* default, if not sure, use 0 */
|
45 |
+
BM_STORE_2N = 1,
|
46 |
+
BM_STORE_4N = 2,
|
47 |
+
} bm_store_mode_t;
|
48 |
+
|
49 |
+
/* bm_shape_t holds the shape info */
|
50 |
+
#define BM_MAX_DIMS_NUM 8
|
51 |
+
typedef struct bm_shape_s {
|
52 |
+
int num_dims;
|
53 |
+
int dims[BM_MAX_DIMS_NUM];
|
54 |
+
} bm_shape_t;
|
55 |
+
|
56 |
+
typedef struct bm_shape_ex_s {
|
57 |
+
bm_shape_t shape;
|
58 |
+
int elem_num;
|
59 |
+
} bm_shape_ex_t;
|
60 |
+
|
61 |
+
/*
|
62 |
+
bm_tensor_t holds a multi-dimensional array of elements of a single data type
|
63 |
+
and the tensor lives in device memory */
|
64 |
+
typedef struct bm_tensor_s {
|
65 |
+
bm_data_type_t dtype;
|
66 |
+
bm_shape_t shape;
|
67 |
+
bm_device_mem_t device_mem;
|
68 |
+
bm_store_mode_t st_mode; /* user can set 0 as default store mode */
|
69 |
+
} bm_tensor_t;
|
70 |
+
|
71 |
+
/* --------------------------------------------------------------------------*/
|
72 |
+
/* network information structure */
|
73 |
+
|
74 |
+
/* bm_stage_info_t holds input/output shapes and device mems; every network can contain one or more
|
75 |
+
* stages */
|
76 |
+
typedef struct bm_stage_info_s {
|
77 |
+
bm_shape_t *input_shapes; /* input_shapes[0] / [1] / ... / [input_num-1] */
|
78 |
+
bm_shape_t *output_shapes; /* output_shapes[0] / [1] / ... / [output_num-1] */
|
79 |
+
bm_device_mem_t *input_mems; /* input_mems[0] / [1] / ... / [input_num-1] */
|
80 |
+
bm_device_mem_t *output_mems; /* output_mems[0] / [1] / ... / [output_num-1] */
|
81 |
+
} bm_stage_info_t;
|
82 |
+
|
83 |
+
/* bm_net_info_t holds all information of one net.
|
84 |
+
* scale for float type is 1.0 as default */
|
85 |
+
typedef struct bm_net_info_s {
|
86 |
+
const char* name; /* net name */
|
87 |
+
bool is_dynamic; /* dynamic or static */
|
88 |
+
int input_num; /* number of inputs */
|
89 |
+
char const** input_names; /* input_names[0] / [1] / .../ [input_num-1] */
|
90 |
+
bm_data_type_t* input_dtypes; /* input_dtypes[0] / [1] / .../ [input_num-1] */
|
91 |
+
float* input_scales; /* input_scales[0] / [1] / .../ [input_num-1] */
|
92 |
+
int output_num; /* number of outputs */
|
93 |
+
char const** output_names; /* output_names[0] / [1] / .../ [output_num-1] */
|
94 |
+
bm_data_type_t* output_dtypes; /* output_dtypes[0] / [1] / .../ [output_num-1] */
|
95 |
+
float* output_scales; /* output_scales[0] / [1] / .../ [output_num-1] */
|
96 |
+
int stage_num; /* number of stages */
|
97 |
+
bm_stage_info_t* stages; /* stages[0] / [1] / ... / [stage_num-1] */
|
98 |
+
size_t* max_input_bytes; /* max_input_bytes[0]/ [1] / ... / [input_num-1] */
|
99 |
+
size_t* max_output_bytes; /* max_output_bytes[0] / [1] / ... / [output_num-1] */
|
100 |
+
int* input_zero_point; /* input_zero_point[0] / [1] / .../ [input_num-1] */
|
101 |
+
int* output_zero_point; /* output_zero_point[0] / [1] / .../ [output_num-1] */
|
102 |
+
int *input_loc_devices; /* input_loc_device[0] / [1] / .../ [input_num-1] */
|
103 |
+
int *output_loc_devices; /* output_loc_device[0] / [1] / .../ [output_num-1] */
|
104 |
+
} bm_net_info_t;
|
105 |
+
|
106 |
+
typedef struct api_info_s {
|
107 |
+
/// @brief api_id to be sent to driver
|
108 |
+
int32_t api_id;
|
109 |
+
/// @brief api data to be sent to driver
|
110 |
+
uint8_t **api_data;
|
111 |
+
/// @brief size of the api data to be sent to driver
|
112 |
+
size_t api_data_size;
|
113 |
+
/// @brief subsize of the api data to be sent to driver
|
114 |
+
size_t *api_data_subsize;
|
115 |
+
/// @brief offset of input tensors' addr in api_data
|
116 |
+
uint32_t *input_addr_offset;
|
117 |
+
/// @brief number of the offset of input tensors' addr in api_data
|
118 |
+
size_t input_addr_offset_number;
|
119 |
+
/// @brief offset of output tensors' addr in api_data
|
120 |
+
uint32_t *output_addr_offset;
|
121 |
+
/// @brief number of the offset of output tensors' addr in api_data
|
122 |
+
size_t output_addr_offset_number;
|
123 |
+
} api_info_c;
|
124 |
+
|
125 |
+
#if defined(__cplusplus)
|
126 |
+
}
|
127 |
+
#endif
|
128 |
+
|
129 |
+
#endif /* __BM_NET_H__ */
|
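These are the structures the demo consumes: Baichuan2::init above reads each network's bm_net_info_t (input/output dtypes plus stages[0] shapes) to size its bm_tensor_t buffers via bmrt_tensor, and forward_first/forward_next then bind those tensors by position when launching with bmrt_launch_tensor_ex.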
Baichuan2/src/include/bmlib_runtime.h
ADDED
@@ -0,0 +1,2581 @@
1 |
+
/*****************************************************************************
|
2 |
+
*
|
3 |
+
* Copyright (c) 2016-2026 by Bitmain Technologies Inc. All rights reserved.
|
4 |
+
*
|
5 |
+
* The material in this file is confidential and contains trade secrets
|
6 |
+
* of Bitmain Technologies Inc. This is proprietary information owned by
|
7 |
+
* Bitmain Technologies Inc. No part of this work may be disclosed,
|
8 |
+
* reproduced, copied, transmitted, or used in any way for any purpose,
|
9 |
+
* without the express written permission of Bitmain Technologies Inc.
|
10 |
+
*
|
11 |
+
*****************************************************************************/
|
12 |
+
|
13 |
+
/**************************************************************************
|
14 |
+
* bmlib_runtime defines the interfaces for operating TPU devices.
|
15 |
+
* The functions can be divided into several categories.
|
16 |
+
* 1) device handle creation and destroy
|
17 |
+
* 2) memory help functions
|
18 |
+
* 3) global memory allocation and free
|
19 |
+
* 4) data transfer between host and device
|
20 |
+
* 5) data transfer within device memory
|
21 |
+
* 6) api send and synchronization
|
22 |
+
* 7) global memory map and coherence
|
23 |
+
* 8) trace and profile
|
24 |
+
* 9) power management
|
25 |
+
* 10) miscellaneous functions
|
26 |
+
*************************************************************************/
|
27 |
+
|
28 |
+
#ifndef BMLIB_RUNTIME_H_
|
29 |
+
#define BMLIB_RUNTIME_H_
|
30 |
+
#if defined(_WIN32) && !defined(__MINGW32__)
|
31 |
+
#include <vadefs.h>
|
32 |
+
#define DECL_EXPORT __declspec(dllexport)
|
33 |
+
#define DECL_IMPORT __declspec(dllimport)
|
34 |
+
#else
|
35 |
+
#include <stdbool.h>
|
36 |
+
#include <stddef.h>
|
37 |
+
#include <stdarg.h>
|
38 |
+
#define DECL_EXPORT
|
39 |
+
#define DECL_IMPORT
|
40 |
+
#endif
|
41 |
+
|
42 |
+
#if defined(__cplusplus)
|
43 |
+
extern "C" {
|
44 |
+
#endif
|
45 |
+
|
46 |
+
typedef enum {
|
47 |
+
MODULE_CDMA = 0,
|
48 |
+
MODULE_GDMA = 1,
|
49 |
+
MODULE_TPU = 2,
|
50 |
+
MODULE_SMMU = 3,
|
51 |
+
MODULE_SRAM = 4,
|
52 |
+
MODULE_END = 5
|
53 |
+
} MODULE_ID;
|
54 |
+
|
55 |
+
#define BM_MEM_ADDR_NULL (0xfffffffff)
|
56 |
+
|
57 |
+
#ifndef BM_MEM_DESC_T_
|
58 |
+
#define BM_MEM_DESC_T_
|
59 |
+
/* BM function return code definitions */
|
60 |
+
typedef enum {
|
61 |
+
BM_SUCCESS = 0,
|
62 |
+
BM_ERR_DEVNOTREADY = 1, /* Device not ready yet */
|
63 |
+
BM_ERR_FAILURE = 2, /* General failure */
|
64 |
+
BM_ERR_TIMEOUT = 3, /* Timeout */
|
65 |
+
BM_ERR_PARAM = 4, /* Parameters invalid */
|
66 |
+
BM_ERR_NOMEM = 5, /* Not enough memory */
|
67 |
+
BM_ERR_DATA = 6, /* Data error */
|
68 |
+
BM_ERR_BUSY = 7, /* Busy */
|
69 |
+
BM_ERR_NOFEATURE = 8, /* Not supported yet */
|
70 |
+
BM_NOT_SUPPORTED = 9
|
71 |
+
} bm_status_t;
|
72 |
+
|
73 |
+
/* BM memory type definitions */
|
74 |
+
typedef enum {
|
75 |
+
BM_MEM_TYPE_DEVICE = 0,
|
76 |
+
BM_MEM_TYPE_HOST = 1,
|
77 |
+
BM_MEM_TYPE_SYSTEM = 2,
|
78 |
+
BM_MEM_TYPE_INT8_DEVICE = 3,
|
79 |
+
BM_MEM_TYPE_INVALID = 4
|
80 |
+
} bm_mem_type_t;
|
81 |
+
|
82 |
+
typedef enum {
|
83 |
+
PERF_MONITOR_GDMA = 0,
|
84 |
+
PERF_MONITOR_TPU = 1
|
85 |
+
} PERF_MONITOR_ID;
|
86 |
+
|
87 |
+
typedef enum {
|
88 |
+
BMCPU_IDLE = 0,
|
89 |
+
BMCPU_RUNNING = 1,
|
90 |
+
BMCPU_FAULT = 2
|
91 |
+
} bm_cpu_status_t;
|
92 |
+
|
93 |
+
/*
|
94 |
+
* bm performance monitor
|
95 |
+
*/
|
96 |
+
typedef struct bm_perf_monitor {
|
97 |
+
long long buffer_start_addr; /*buffer address to store perf data*/
|
98 |
+
int buffer_size; /*buffer size*/
|
99 |
+
PERF_MONITOR_ID monitor_id; /*PERF_MONITOR_GDMA or PERF_MONITOR_TPU*/
|
100 |
+
} bm_perf_monitor_t;
|
101 |
+
|
102 |
+
typedef union {
|
103 |
+
struct {
|
104 |
+
bm_mem_type_t mem_type : 3;
|
105 |
+
unsigned int gmem_heapid : 3;
|
106 |
+
unsigned int reserved : 26;
|
107 |
+
} u;
|
108 |
+
unsigned int rawflags;
|
109 |
+
} bm_mem_flags_t;
|
110 |
+
|
111 |
+
/* BM memory descriptor definition*/
|
112 |
+
typedef struct bm_mem_desc {
|
113 |
+
union {
|
114 |
+
struct {
|
115 |
+
#ifdef __linux__
|
116 |
+
unsigned long device_addr;
|
117 |
+
#else
|
118 |
+
unsigned long long device_addr;
|
119 |
+
#endif
|
120 |
+
unsigned int reserved;
|
121 |
+
int dmabuf_fd;
|
122 |
+
} device;
|
123 |
+
|
124 |
+
struct {
|
125 |
+
void *system_addr;
|
126 |
+
unsigned int reserved0;
|
127 |
+
int reserved1;
|
128 |
+
} system;
|
129 |
+
} u;
|
130 |
+
|
131 |
+
bm_mem_flags_t flags;
|
132 |
+
unsigned int size;
|
133 |
+
} bm_mem_desc_t;
|
134 |
+
|
135 |
+
typedef struct bm_mem_desc bm_device_mem_t;
|
136 |
+
typedef struct bm_mem_desc bm_system_mem_t;
|
137 |
+
|
138 |
+
typedef struct sg_mem_desc {
|
139 |
+
union {
|
140 |
+
struct {
|
141 |
+
#ifdef __linux__
|
142 |
+
unsigned long device_addr;
|
143 |
+
#else
|
144 |
+
unsigned long long device_addr;
|
145 |
+
#endif
|
146 |
+
unsigned int reserved;
|
147 |
+
int dmabuf_fd;
|
148 |
+
} device;
|
149 |
+
|
150 |
+
struct {
|
151 |
+
void *system_addr;
|
152 |
+
unsigned int reserved0;
|
153 |
+
int reserved1;
|
154 |
+
} system;
|
155 |
+
} u;
|
156 |
+
|
157 |
+
bm_mem_flags_t flags;
|
158 |
+
unsigned long long size;
|
159 |
+
} sg_mem_desc_t;
|
160 |
+
|
161 |
+
typedef struct sg_mem_desc sg_device_mem_t;
|
162 |
+
typedef struct sg_mem_desc sg_system_mem_t;
|
163 |
+
#endif
|
164 |
+
|
165 |
+
struct bm_context;
|
166 |
+
typedef struct bm_context *bm_handle_t;
|
167 |
+
|
168 |
+
#define MD5SUM_LEN 16
|
169 |
+
#define LIB_MAX_NAME_LEN 64
|
170 |
+
#define FUNC_MAX_NAME_LEN 64
|
171 |
+
|
172 |
+
typedef struct bm_module
|
173 |
+
{
|
174 |
+
// void *lib_handle;
|
175 |
+
char lib_name[LIB_MAX_NAME_LEN];
|
176 |
+
unsigned char md5[MD5SUM_LEN];
|
177 |
+
}bm_module;
|
178 |
+
|
179 |
+
typedef struct bm_module *tpu_kernel_module_t;
|
180 |
+
typedef int tpu_kernel_function_t;
|
181 |
+
|
182 |
+
/**
|
183 |
+
* @name tpu_kernel_load_module_file
|
184 |
+
* @brief To load dyn file
|
185 |
+
* @ingroup bmlib_runtime
|
186 |
+
*
|
187 |
+
* @param [in] handle The device handle
|
188 |
+
* @param [in] module_file dyn file
|
189 |
+
* @retval dyn lib ptr
|
190 |
+
*/
|
191 |
+
tpu_kernel_module_t tpu_kernel_load_module_file(bm_handle_t handle, const char *module_file);
|
192 |
+
|
193 |
+
/**
|
194 |
+
* @name tpu_kernel_load_module_file_key
|
195 |
+
* @brief To load dyn file with key
|
196 |
+
* @ingroup bmlib_runtime
|
197 |
+
*
|
198 |
+
* @param [in] handle The device handle
|
199 |
+
* @param [in] module_file dyn file
|
200 |
+
* @param [in] key identification str
|
201 |
+
* @param [in] size key size
|
202 |
+
* @retval dyn lib ptr
|
203 |
+
*/
|
204 |
+
tpu_kernel_module_t tpu_kernel_load_module_file_key(bm_handle_t handle, const char *module_file, const char *key, int size);
|
205 |
+
|
206 |
+
/**
|
207 |
+
* @name tpu_kernel_unload_module
|
208 |
+
* @brief To unload dyn file
|
209 |
+
* @ingroup bmlib_runtime
|
210 |
+
*
|
211 |
+
* @param [in] handle The device handle
|
212 |
+
* @param [in] p_module dyn lib ptr
|
213 |
+
* @retval BM_SUCCESS Succeeds.
|
214 |
+
* Other code Fails.
|
215 |
+
*/
|
216 |
+
bm_status_t tpu_kernel_unload_module(bm_handle_t handle, tpu_kernel_module_t p_module);
|
217 |
+
|
218 |
+
/**
|
219 |
+
* @name tpu_kernel_free_module
|
220 |
+
* @brief To free p_module when it is no longer used
|
221 |
+
* @ingroup bmlib_runtime
|
222 |
+
*
|
223 |
+
* @param [in] handle The device handle
|
224 |
+
* @param [in] p_module dyn lib ptr
|
225 |
+
* @retval BM_SUCCESS Succeeds.
|
226 |
+
* Other code Fails.
|
227 |
+
*/
|
228 |
+
bm_status_t tpu_kernel_free_module(bm_handle_t handle, tpu_kernel_module_t p_module);
|
229 |
+
|
230 |
+
/**
|
231 |
+
* @name tpu_kernel_load_module
|
232 |
+
* @brief To load dyn module
|
233 |
+
* @ingroup bmlib_runtime
|
234 |
+
*
|
235 |
+
* @param [in] handle The device handle
|
236 |
+
* @param [in] data dyn module
|
237 |
+
* @param [in] length dyn module size
|
238 |
+
* @retval dyn lib ptr
|
239 |
+
*/
|
240 |
+
tpu_kernel_module_t tpu_kernel_load_module(bm_handle_t handle, const char *data, size_t length);
|
241 |
+
|
242 |
+
/**
|
243 |
+
* @name tpu_kernel_get_function
|
244 |
+
* @brief To get function from lib
|
245 |
+
* @ingroup bmlib_runtime
|
246 |
+
*
|
247 |
+
* @param [in] handle The device handle
|
248 |
+
* @param [in] module dyn module
|
249 |
+
* @param [in] function function name
|
250 |
+
* @retval function id
|
251 |
+
*/
|
252 |
+
tpu_kernel_function_t tpu_kernel_get_function(bm_handle_t handle, tpu_kernel_module_t module, const char *function);
|
253 |
+
|
254 |
+
/**
|
255 |
+
* @name tpu_kernel_launch
|
256 |
+
* @brief To launch a function synchronously
|
257 |
+
* @ingroup bmlib_runtime
|
258 |
+
*
|
259 |
+
* @param [in] handle The device handle
|
260 |
+
* @param [in] function function id
|
261 |
+
* @param [in] args funtion args
|
262 |
+
* @param [in] size args size
|
263 |
+
* @retval BM_SUCCESS Succeeds.
|
264 |
+
* Other code Fails.
|
265 |
+
*/
|
266 |
+
bm_status_t tpu_kernel_launch(bm_handle_t handle, tpu_kernel_function_t function, void *args, size_t size);
|
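/*
 * Example (non-normative sketch): load a kernel module, resolve a function and
 * launch it synchronously. The file name "my_kernel.so", the function name
 * "my_func" and the my_args_t layout are hypothetical, not part of this API.
 *
 *   typedef struct { int n; unsigned long long input_addr; } my_args_t;
 *   my_args_t args = { 1, bm_mem_get_device_addr(input_mem) };
 *   tpu_kernel_module_t mod = tpu_kernel_load_module_file(handle, "my_kernel.so");
 *   tpu_kernel_function_t fid = tpu_kernel_get_function(handle, mod, "my_func");
 *   bm_status_t ret = tpu_kernel_launch(handle, fid, &args, sizeof(args));
 *   tpu_kernel_unload_module(handle, mod);
 */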
267 |
+
|
268 |
+
/**
|
269 |
+
* @name tpu_kernel_launch_async
|
270 |
+
* @brief To launch a function asynchronously
|
271 |
+
* @ingroup bmlib_runtime
|
272 |
+
*
|
273 |
+
* @param [in] handle The device handle
|
274 |
+
* @param [in] function function id
|
275 |
+
* @param [in] args function args
|
276 |
+
* @param [in] size args size
|
277 |
+
* @retval BM_SUCCESS Succeeds.
|
278 |
+
* Other code Fails.
|
279 |
+
*/
|
280 |
+
bm_status_t tpu_kernel_launch_async(bm_handle_t handle, tpu_kernel_function_t function, void *args, size_t size);
|
281 |
+
|
282 |
+
/**
|
283 |
+
* @name tpu_kernel_launch_async_multi_cores
|
284 |
+
* @brief To launch a function asynchronously on multiple cores
|
285 |
+
* @ingroup bmlib_runtime
|
286 |
+
*
|
287 |
+
* @param [in] handle The device handle
|
288 |
+
* @param [in] func_name function name
|
289 |
+
* @param [in] api_param function params
|
290 |
+
* @param [in] api_size params size
|
291 |
+
* @param [in] core_list list of core ids
|
292 |
+
* @param [in] core_num number of cores
|
293 |
+
* @retval BM_SUCCESS Succeeds.
|
294 |
+
* Other code Fails.
|
295 |
+
*/
|
296 |
+
bm_status_t tpu_kernel_launch_async_multi_cores(bm_handle_t handle, const char *func_name, const void *api_param,
|
297 |
+
size_t api_size, const int* core_list, const int core_num);
|
298 |
+
|
299 |
+
/**
|
300 |
+
* @name tpu_kernel_launch_sync_multi_cores
|
301 |
+
* @brief To launch a function synchronously on multiple cores
|
302 |
+
* @ingroup bmlib_runtime
|
303 |
+
*
|
304 |
+
* @param [in] handle The device handle
|
305 |
+
* @param [in] func_name function name
|
306 |
+
* @param [in] api_param function params
|
307 |
+
* @param [in] api_size params size
|
308 |
+
* @param [in] core_list list of core ids
|
309 |
+
* @param [in] core_num number of cores
|
310 |
+
* @retval BM_SUCCESS Succeeds.
|
311 |
+
* Other code Fails.
|
312 |
+
*/
|
313 |
+
bm_status_t tpu_kernel_launch_sync_multi_cores(bm_handle_t handle, const char *func_name, const void *api_param,
|
314 |
+
size_t api_size, const int* core_list, const int core_num);
|
315 |
+
|
316 |
+
/**
|
317 |
+
* @name tpu_kernel_sync
|
318 |
+
* @brief To sync
|
319 |
+
* @ingroup bmlib_runtime
|
320 |
+
*
|
321 |
+
* @param [in] handle The device handle
|
322 |
+
* @retval BM_SUCCESS Succeeds.
|
323 |
+
* Other code Fails.
|
324 |
+
*/
|
325 |
+
bm_status_t tpu_kernel_sync(bm_handle_t handle);
|
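/*
 * Example (non-normative sketch): the asynchronous launch pairs with
 * tpu_kernel_sync. "fid" and "args" are assumed to come from the synchronous
 * launch example above.
 *
 *   tpu_kernel_launch_async(handle, fid, &args, sizeof(args));
 *   // ... overlap host-side work here ...
 *   tpu_kernel_sync(handle);   // block until the async launch has finished
 */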
326 |
+
void show_md5(unsigned char md5[]);
|
327 |
+
|
328 |
+
DECL_EXPORT void bmlib_log(const char *tag, int level, const char *fmt, ...);
|
329 |
+
|
330 |
+
#ifndef USING_CMODEL
|
331 |
+
#define BM_CHECK_RET(call) \
|
332 |
+
do { \
|
333 |
+
bm_status_t ret = (bm_status_t)call; \
|
334 |
+
if (ret != BM_SUCCESS) { \
|
335 |
+
bmlib_log("BM_CHECK",16,"BM_CHECK_RET fail %s: %s: %d\n", __FILE__, __func__, __LINE__); \
|
336 |
+
return ret; \
|
337 |
+
} \
|
338 |
+
} while (0)
|
339 |
+
#else
|
340 |
+
#define BM_CHECK_RET(call) \
|
341 |
+
do { \
|
342 |
+
bm_status_t ret = call; \
|
343 |
+
if (ret != BM_SUCCESS) { \
|
344 |
+
bmlib_log("BM_CHECK",16,"BM_CHECK_RET failed %d\n", ret);\
|
345 |
+
ASSERT(0); \
|
346 |
+
exit(-ret); \
|
347 |
+
} \
|
348 |
+
} while (0)
|
349 |
+
#endif
|
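/*
 * Example (non-normative sketch): BM_CHECK_RET is intended for functions that
 * themselves return bm_status_t; on failure it logs and propagates the error
 * (or asserts and exits in cmodel builds).
 *
 *   bm_status_t upload(bm_handle_t handle, bm_device_mem_t dst, void *src) {
 *       BM_CHECK_RET(bm_memcpy_s2d(handle, dst, src));
 *       return BM_SUCCESS;
 *   }
 */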
350 |
+
|
351 |
+
/*******************handle related functions **********************************/
|
352 |
+
/**
|
353 |
+
* @name bm_dev_getcount
|
354 |
+
* @brief To get the number of sophon devices in the system.
|
355 |
+
* If N is returned, valid devids are [0, N-1]
|
356 |
+
* @ingroup bmlib_runtime
|
357 |
+
*
|
358 |
+
* @param [out] count The result number of sophon devices
|
359 |
+
* @retval BM_SUCCESS Succeeds.
|
360 |
+
* Other code Fails.
|
361 |
+
*/
|
362 |
+
DECL_EXPORT bm_status_t bm_dev_getcount(int *count);
|
363 |
+
|
364 |
+
/**
|
365 |
+
* @name bm_dev_query
|
366 |
+
* @brief To query if a device is present
|
367 |
+
* @ingroup bmlib_runtime
|
368 |
+
*
|
369 |
+
* @param [in] devid The id of the device to query
|
370 |
+
* @retval BM_SUCCESS Device is present
|
371 |
+
* Other code Device is not present
|
372 |
+
*/
|
373 |
+
DECL_EXPORT bm_status_t bm_dev_query(int devid);
|
374 |
+
|
375 |
+
/**
|
376 |
+
* @name bm_dev_request
|
377 |
+
* @brief To create a handle for the given device
|
378 |
+
* @ingroup bmlib_runtime
|
379 |
+
*
|
380 |
+
* @param [out] handle The created handle
|
381 |
+
* @param [in] devid Specify on which device to create handle
|
382 |
+
* @retval BM_SUCCESS Succeeds.
|
383 |
+
* Other code Fails.
|
384 |
+
*/
|
385 |
+
DECL_EXPORT bm_status_t bm_dev_request(bm_handle_t *handle, int devid);
|
386 |
+
|
387 |
+
/**
|
388 |
+
* @name bm_get_devid
|
389 |
+
* @brief To get device index for the given handle
|
390 |
+
* @ingroup bmlib_runtime
|
391 |
+
*
|
392 |
+
* @param [in] handle The given handle
|
393 |
+
* @retval int device index that the handle points to.
|
394 |
+
*/
|
395 |
+
DECL_EXPORT int bm_get_devid(bm_handle_t handle);
|
396 |
+
|
397 |
+
/**
|
398 |
+
* @name bm_dev_free
|
399 |
+
* @brief To free a handle
|
400 |
+
* @ingroup bmlib_runtime
|
401 |
+
*
|
402 |
+
* @param [in] handle The handle to free
|
403 |
+
*/
|
404 |
+
DECL_EXPORT void bm_dev_free(bm_handle_t handle);
|
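/*
 * Example (non-normative sketch): enumerate devices, then acquire and release
 * a handle on device 0.
 *
 *   int count = 0;
 *   bm_handle_t handle = NULL;
 *   if (bm_dev_getcount(&count) == BM_SUCCESS && count > 0 &&
 *       bm_dev_request(&handle, 0) == BM_SUCCESS) {
 *       // ... use the handle ...
 *       bm_dev_free(handle);
 *   }
 */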
405 |
+
|
406 |
+
/*******************memory helper functions ***********************************/
|
407 |
+
/**
|
408 |
+
* @name bm_mem_get_type
|
409 |
+
* @brief To get a memory descriptor's type
|
410 |
+
* @ingroup bmlib_runtime
|
411 |
+
*
|
412 |
+
* @param [in] mem The memory descriptor queried
|
413 |
+
* @retval BM_MEM_TYPE_DEVICE Device global memory
|
414 |
+
* @retval BM_MEM_TYPE_SYSTEM Host user memory
|
415 |
+
*/
|
416 |
+
DECL_EXPORT bm_mem_type_t bm_mem_get_type(struct bm_mem_desc mem);
|
417 |
+
|
418 |
+
/**
|
419 |
+
* @name sg_mem_get_type
|
420 |
+
* @brief To get a memory descriptor's type
|
421 |
+
* @ingroup bmlib_runtime
|
422 |
+
*
|
423 |
+
* @param [in] mem The memory descriptor queried
|
424 |
+
* @retval BM_MEM_TYPE_DEVICE Device global memory
|
425 |
+
* @retval BM_MEM_TYPE_SYSTEM Host user memory
|
426 |
+
*/
|
427 |
+
DECL_EXPORT bm_mem_type_t sg_mem_get_type(struct sg_mem_desc mem);
|
428 |
+
|
429 |
+
/**
|
430 |
+
* @name bm_mem_get_device_addr
|
431 |
+
* @brief To get a device memory descriptor's address
|
432 |
+
* @ingroup bmlib_runtime
|
433 |
+
*
|
434 |
+
* @param [in] mem The device memory descriptor queried
|
435 |
+
* @retval unsigned long long The device memory address
|
436 |
+
*/
|
437 |
+
DECL_EXPORT unsigned long long bm_mem_get_device_addr(struct bm_mem_desc mem);
|
438 |
+
|
439 |
+
/**
|
440 |
+
* @name sg_mem_get_device_addr
|
441 |
+
* @brief To get a device memory descriptor's address
|
442 |
+
* @ingroup bmlib_runtime
|
443 |
+
*
|
444 |
+
* @param [in] mem The device memory descriptor queried
|
445 |
+
* @retval unsigned long long The device memory address
|
446 |
+
*/
|
447 |
+
DECL_EXPORT unsigned long long sg_mem_get_device_addr(struct sg_mem_desc mem);
|
448 |
+
|
449 |
+
/**
|
450 |
+
* @name bm_mem_set_device_addr
|
451 |
+
* @brief To set a device memory descriptor's address
|
452 |
+
* @ingroup bmlib_runtime
|
453 |
+
*
|
454 |
+
* @param [in] pmem The device memory descriptor pointer
|
455 |
+
* @param [in] addr The new device address of the device memory
|
456 |
+
*/
|
457 |
+
DECL_EXPORT void bm_mem_set_device_addr(struct bm_mem_desc* pmem, unsigned long long addr);
|
458 |
+
|
459 |
+
/**
|
460 |
+
* @name sg_mem_set_device_addr
|
461 |
+
* @brief To set a device memory descriptor's address
|
462 |
+
* @ingroup bmlib_runtime
|
463 |
+
*
|
464 |
+
* @param [in] pmem The device memory descriptor pointer
|
465 |
+
* @param [in] addr The new device address of the device memory
|
466 |
+
*/
|
467 |
+
DECL_EXPORT void sg_mem_set_device_addr(struct sg_mem_desc* pmem, unsigned long long addr);
|
468 |
+
|
469 |
+
/**
|
470 |
+
* @name bm_mem_get_device_size
|
471 |
+
* @brief To get a device memory descriptor's size
|
472 |
+
* @ingroup bmlib_runtime
|
473 |
+
*
|
474 |
+
* @param [in] mem The device memory descriptor queried
|
475 |
+
* @retval unsigned int The device memory's size in bytes
|
476 |
+
*/
|
477 |
+
DECL_EXPORT unsigned int bm_mem_get_device_size(struct bm_mem_desc mem);
|
478 |
+
|
479 |
+
/**
|
480 |
+
* @name sg_mem_get_device_size
|
481 |
+
* @brief To get a device memory descriptor's size
|
482 |
+
* @ingroup bmlib_runtime
|
483 |
+
*
|
484 |
+
* @param [in] mem The device memory descriptor queried
|
485 |
+
* @retval unsigned long long The device memory's size in bytes
|
486 |
+
*/
|
487 |
+
DECL_EXPORT unsigned long long sg_mem_get_device_size(struct sg_mem_desc mem);
|
488 |
+
|
489 |
+
/**
|
490 |
+
* @name bm_mem_set_device_size
|
491 |
+
* @brief To set a device memory descriptor's size
|
492 |
+
* @ingroup bmlib_runtime
|
493 |
+
*
|
494 |
+
* @param [out] pmem The device memory descriptor pointer
|
495 |
+
* @param [in] size The new device memory size (in bytes) of the device memory
|
496 |
+
*/
|
497 |
+
DECL_EXPORT void bm_mem_set_device_size(struct bm_mem_desc* pmem, unsigned int size);
|
498 |
+
|
499 |
+
/**
|
500 |
+
* @name sg_mem_set_device_size
|
501 |
+
* @brief To set a device memory descriptor's size
|
502 |
+
* @ingroup bmlib_runtime
|
503 |
+
*
|
504 |
+
* @param [out] pmem The device memory descriptor pointer
|
505 |
+
* @param [in] size The new device memory size (in bytes) of the device memory
|
506 |
+
*/
|
507 |
+
DECL_EXPORT void sg_mem_set_device_size(struct sg_mem_desc* pmem, unsigned long long size);
|
508 |
+
|
509 |
+
/**
|
510 |
+
* @name bm_set_device_mem
|
511 |
+
* @brief To fill in a device memory descriptor with size and address
|
512 |
+
* @ingroup bmlib_runtime
|
513 |
+
*
|
514 |
+
* @param [in] pmem The device memory descriptor pointer
|
515 |
+
* @param [in] size The device memory descriptor's size
|
516 |
+
* @param [in] addr The device memory descriptor's address
|
517 |
+
*/
|
518 |
+
DECL_EXPORT void bm_set_device_mem(bm_device_mem_t* pmem, unsigned int size,
|
519 |
+
unsigned long long addr);
|
520 |
+
|
521 |
+
/**
|
522 |
+
* @name sg_set_device_mem
|
523 |
+
* @brief To fill in a device memory descriptor with size and address
|
524 |
+
* @ingroup bmlib_runtime
|
525 |
+
*
|
526 |
+
* @param [in] pmem The device memory descriptor pointer
|
527 |
+
* @param [in] size The device memory descriptor's size
|
528 |
+
* @param [in] addr The device memory descriptor's address
|
529 |
+
*/
|
530 |
+
DECL_EXPORT void sg_set_device_mem(sg_device_mem_t* pmem, unsigned long long size,
|
531 |
+
unsigned long long addr);
|
532 |
+
|
533 |
+
/**
|
534 |
+
* @name bm_mem_from_device
|
535 |
+
* @brief To create a device memory descriptor from address and size
|
536 |
+
* @ingroup bmlib_runtime
|
537 |
+
*
|
538 |
+
* @param [in] device_addr The device memory address
|
539 |
+
* @param [in] len The device memory size
|
540 |
+
* @retval bm_device_mem_t The device memory descriptor created
|
541 |
+
*/
|
542 |
+
DECL_EXPORT bm_device_mem_t bm_mem_from_device(unsigned long long device_addr,
|
543 |
+
unsigned int len);
|
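/*
 * Example (non-normative sketch): wrap an already-known device address in a
 * descriptor, e.g. to read back a sub-buffer; the address 0x100000000ULL and
 * host_buf (a host buffer of at least 4096 bytes) are purely illustrative.
 *
 *   bm_device_mem_t view = bm_mem_from_device(0x100000000ULL, 4096);
 *   bm_memcpy_d2s(handle, host_buf, view);
 */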
544 |
+
|
545 |
+
/**
|
546 |
+
* @name sg_mem_from_device
|
547 |
+
* @brief To create a device memory descriptor from address and size
|
548 |
+
* @ingroup bmlib_runtime
|
549 |
+
*
|
550 |
+
* @param [in] device_addr The device memory address
|
551 |
+
* @param [in] len The device memory size
|
552 |
+
* @retval sg_device_mem_t The device memory descriptor created
|
553 |
+
*/
|
554 |
+
DECL_EXPORT sg_device_mem_t sg_mem_from_device(unsigned long long device_addr,
|
555 |
+
unsigned long long len);
|
556 |
+
|
557 |
+
/**
|
558 |
+
* @name bm_mem_get_system_addr
|
559 |
+
* @brief To get a system memory descriptor's address
|
560 |
+
* @ingroup bmlib_runtime
|
561 |
+
*
|
562 |
+
* @param [in] mem The system memory descriptor
|
563 |
+
* @retval void * The system memory descriptor's address
|
564 |
+
*/
|
565 |
+
DECL_EXPORT void *bm_mem_get_system_addr(struct bm_mem_desc mem);
|
566 |
+
|
567 |
+
/**
|
568 |
+
* @name sg_mem_get_system_addr
|
569 |
+
* @brief To get a system memory descriptor's address
|
570 |
+
* @ingroup bmlib_runtime
|
571 |
+
*
|
572 |
+
* @param [in] mem The system memory descriptor
|
573 |
+
* @retval void * The system memory descriptor's address
|
574 |
+
*/
|
575 |
+
DECL_EXPORT void *sg_mem_get_system_addr(struct sg_mem_desc mem);
|
576 |
+
|
577 |
+
/**
|
578 |
+
* @name bm_mem_set_system_addr
|
579 |
+
* @brief To set a system memory descriptor's address
|
580 |
+
* @ingroup bmlib_runtime
|
581 |
+
*
|
582 |
+
* @param [in] pmem The system memory descriptor pointer
|
583 |
+
* @param [in] addr The system memory address
|
584 |
+
*/
|
585 |
+
DECL_EXPORT void bm_mem_set_system_addr(struct bm_mem_desc* pmem, void *addr);
|
586 |
+
|
587 |
+
/**
|
588 |
+
* @name sg_mem_set_system_addr
|
589 |
+
* @brief To set a system memory descriptor's address
|
590 |
+
* @ingroup bmlib_runtime
|
591 |
+
*
|
592 |
+
* @param [in] pmem The system memory descriptor pointer
|
593 |
+
* @param [in] addr The system memory address
|
594 |
+
*/
|
595 |
+
DECL_EXPORT void sg_mem_set_system_addr(struct sg_mem_desc* pmem, void *addr);
|
596 |
+
|
597 |
+
/**
|
598 |
+
* @name bm_mem_from_system
|
599 |
+
* @brief To create a system memory descriptor with the given system address
|
600 |
+
* @ingroup bmlib_runtime
|
601 |
+
*
|
602 |
+
* @param [in] system_addr The system address in the descriptor
|
603 |
+
* @retval bm_system_mem_t The system memory descriptor created
|
604 |
+
*/
|
605 |
+
DECL_EXPORT bm_system_mem_t bm_mem_from_system(void *system_addr);
|
606 |
+
|
607 |
+
/*******************memory alloc and free functions ***************************/
|
608 |
+
/**
|
609 |
+
* @name bm_mem_null
|
610 |
+
* @brief Return an invalid device memory descriptor
|
611 |
+
* @ingroup bmlib_runtime
|
612 |
+
*
|
613 |
+
* @retval bm_device_mem_t An invalid device memory descriptor
|
614 |
+
*/
|
615 |
+
DECL_EXPORT bm_device_mem_t bm_mem_null(void);
|
616 |
+
#define BM_MEM_NULL (bm_mem_null())
|
617 |
+
|
618 |
+
/**
|
619 |
+
* @name bm_malloc_neuron_device
|
620 |
+
* @brief To malloc device memory according to a tensor shape
|
621 |
+
* (each neuron is 32 bits)
|
622 |
+
* @ingroup bmlib_runtime
|
623 |
+
*
|
624 |
+
* @param [in] handle The device handle
|
625 |
+
* @param [out] pmem The result device memory descriptor
|
626 |
+
* @param [in] n, c, h, w The shape of the input tensor
|
627 |
+
* @retval BM_SUCCESS Succeeds.
|
628 |
+
* Other code Fails.
|
629 |
+
*/
|
630 |
+
DECL_EXPORT bm_status_t bm_malloc_neuron_device(bm_handle_t handle, bm_device_mem_t *pmem,
|
631 |
+
int n, int c, int h, int w);
|
632 |
+
|
633 |
+
/**
|
634 |
+
* @name sg_malloc_neuron_device
|
635 |
+
* @brief To malloc device memory according to a tensor shape
|
636 |
+
* (each neuron is 32 bits)
|
637 |
+
* @ingroup bmlib_runtime
|
638 |
+
*
|
639 |
+
* @param [in] handle The device handle
|
640 |
+
* @param [out] pmem The result device memory descriptor
|
641 |
+
* @param [in] n, c, h, w The shape of the input tensor
|
642 |
+
* @retval BM_SUCCESS Succeeds.
|
643 |
+
* Other code Fails.
|
644 |
+
*/
|
645 |
+
DECL_EXPORT bm_status_t sg_malloc_neuron_device(bm_handle_t handle, sg_device_mem_t *pmem,
|
646 |
+
unsigned long long n, unsigned long long c,
|
647 |
+
unsigned long long h, unsigned long long w);
|
648 |
+
|
649 |
+
/**
|
650 |
+
* @name bm_malloc_device_dword
|
651 |
+
* @brief To malloc device memory in size of dword (32 bits)
|
652 |
+
* @ingroup bmlib_runtime
|
653 |
+
*
|
654 |
+
* @param [in] handle The device handle
|
655 |
+
* @param [out] pmem The result device memory descriptor
|
656 |
+
* @param [in] count The number of dwords (32 bits) to allocate
|
657 |
+
* @retval BM_SUCCESS Succeeds.
|
658 |
+
* Other code Fails.
|
659 |
+
*/
|
660 |
+
DECL_EXPORT bm_status_t bm_malloc_device_dword(bm_handle_t handle, bm_device_mem_t *pmem,
|
661 |
+
int count);
|
662 |
+
|
663 |
+
/**
|
664 |
+
* @name sg_malloc_device_dword
|
665 |
+
* @brief To malloc device memory in size of dword (32 bits)
|
666 |
+
* @ingroup bmlib_runtime
|
667 |
+
*
|
668 |
+
* @param [in] handle The device handle
|
669 |
+
* @param [out] pmem The result device memory descriptor
|
670 |
+
* @param [in] count The number of dwords (32 bits) to allocate
|
671 |
+
* @retval BM_SUCCESS Succeeds.
|
672 |
+
* Other code Fails.
|
673 |
+
*/
|
674 |
+
DECL_EXPORT bm_status_t sg_malloc_device_dword(bm_handle_t handle, sg_device_mem_t *pmem,
|
675 |
+
unsigned long long count);
|
676 |
+
|
677 |
+
/**
|
678 |
+
* @name bm_malloc_device_byte
|
679 |
+
* @brief To malloc device memory in size of byte
|
680 |
+
* @ingroup bmlib_runtime
|
681 |
+
*
|
682 |
+
* @param [in] handle The device handle
|
683 |
+
* @param [out] pmem The result device memory descriptor
|
684 |
+
* @param [in] size The number of bytes to allocate
|
685 |
+
* @retval BM_SUCCESS Succeeds.
|
686 |
+
* Other code Fails.
|
687 |
+
*/
|
688 |
+
DECL_EXPORT bm_status_t bm_malloc_device_byte(bm_handle_t handle, bm_device_mem_t *pmem,
|
689 |
+
unsigned int size);
|
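/*
 * Example (non-normative sketch): allocate and release a 1 MiB device buffer
 * (bm_free_device is declared further below).
 *
 *   bm_device_mem_t mem;
 *   if (bm_malloc_device_byte(handle, &mem, 1 << 20) == BM_SUCCESS) {
 *       // ... use mem ...
 *       bm_free_device(handle, mem);
 *   }
 */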
690 |
+
|
691 |
+
/**
|
692 |
+
* @name sg_malloc_device_byte
|
693 |
+
* @brief To malloc device memory in size of byte
|
694 |
+
* @ingroup bmlib_runtime
|
695 |
+
*
|
696 |
+
* @param [in] handle The device handle
|
697 |
+
* @param [out] pmem The result device memory descriptor
|
698 |
+
* @param [in] size The number of bytes to allocate
|
699 |
+
* @retval BM_SUCCESS Succeeds.
|
700 |
+
* Other code Fails.
|
701 |
+
*/
|
702 |
+
DECL_EXPORT bm_status_t sg_malloc_device_byte(bm_handle_t handle, sg_device_mem_t *pmem,
|
703 |
+
unsigned long long size);
|
704 |
+
|
705 |
+
/**
|
706 |
+
* @name bm_malloc_device_byte_heap
|
707 |
+
* @brief To malloc device memory in size of byte within the specified heap
|
708 |
+
* @ingroup bmlib_runtime
|
709 |
+
*
|
710 |
+
* @param [in] handle The device handle
|
711 |
+
* @param [out] pmem The result device memory descriptor
|
712 |
+
* @param [in] heap_id The heap id to allocate from (0/1/2)
|
713 |
+
* @param [in] size The number of bytes to allocate
|
714 |
+
* @retval BM_SUCCESS Succeeds.
|
715 |
+
* Other code Fails.
|
716 |
+
*/
|
717 |
+
DECL_EXPORT bm_status_t bm_malloc_device_byte_heap(bm_handle_t handle, bm_device_mem_t *pmem,
|
718 |
+
int heap_id, unsigned int size);
|
719 |
+
|
720 |
+
/**
|
721 |
+
* @name sg_malloc_device_byte_heap
|
722 |
+
* @brief To malloc device memory in size of byte within the specified heap
|
723 |
+
* @ingroup bmlib_runtime
|
724 |
+
*
|
725 |
+
* @param [in] handle The device handle
|
726 |
+
* @param [out] pmem The result device memory descriptor
|
727 |
+
* @param [in] heap_id The heap id to allocate from (0/1/2)
|
728 |
+
* @param [in] size The number of bytes to allocate
|
729 |
+
* @retval BM_SUCCESS Succeeds.
|
730 |
+
* Other code Fails.
|
731 |
+
*/
|
732 |
+
DECL_EXPORT bm_status_t sg_malloc_device_byte_heap(bm_handle_t handle, sg_device_mem_t *pmem,
|
733 |
+
int heap_id, unsigned long long size);
|
734 |
+
|
735 |
+
/**
|
736 |
+
* @name bm_malloc_device_byte_heap_mask
|
737 |
+
* @brief To malloc device memory in size of byte within the specified heaps
|
738 |
+
* @ingroup bmlib_runtime
|
739 |
+
*
|
740 |
+
* @param [in] handle The device handle
|
741 |
+
* @param [out] pmem The result device memory descriptor
|
742 |
+
* @param [in] heap_id_mask The mask of heaps to allocate from; each bit indicates one heap
|
743 |
+
* @param [in] size The number of bytes to allocate
|
744 |
+
* @retval BM_SUCCESS Succeeds.
|
745 |
+
* Other code Fails.
|
746 |
+
*/
|
747 |
+
DECL_EXPORT bm_status_t bm_malloc_device_byte_heap_mask(bm_handle_t handle, bm_device_mem_t *pmem,
|
748 |
+
int heap_id_mask, unsigned int size);
|
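/*
 * Example (non-normative sketch): restrict an allocation to heaps 0 and 1.
 * Bit i of heap_id_mask selects heap i, so 0x3 == (1 << 0) | (1 << 1).
 *
 *   bm_device_mem_t mem;
 *   bm_malloc_device_byte_heap_mask(handle, &mem, 0x3, 4096);
 */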
749 |
+
|
750 |
+
/**
|
751 |
+
* @name sg_malloc_device_byte_heap_mask
|
752 |
+
* @brief To malloc device memory in size of byte within the specified heaps
|
753 |
+
* @ingroup bmlib_runtime
|
754 |
+
*
|
755 |
+
* @param [in] handle The device handle
|
756 |
+
* @param [out] pmem The result device memory descriptor
|
757 |
+
* @param [in] heap_id_mask The mask of heaps to allocate from; each bit indicates one heap
|
758 |
+
* @param [in] size The number of bytes to allocate
|
759 |
+
* @retval BM_SUCCESS Succeeds.
|
760 |
+
* Other code Fails.
|
761 |
+
*/
|
762 |
+
DECL_EXPORT bm_status_t sg_malloc_device_byte_heap_mask(bm_handle_t handle, sg_device_mem_t *pmem,
|
763 |
+
int heap_id_mask, unsigned long long size);
|
764 |
+
|
765 |
+
/**
|
766 |
+
* @name bm_free_device
|
767 |
+
* @brief To free device memory
|
768 |
+
* @ingroup bmlib_runtime
|
769 |
+
*
|
770 |
+
* @param [in] handle The device handle
|
771 |
+
* @param [in] mem The device memory descriptor to free
|
772 |
+
*/
|
773 |
+
DECL_EXPORT void bm_free_device(bm_handle_t handle, bm_device_mem_t mem);
|
774 |
+
|
775 |
+
/**
|
776 |
+
* @name sg_free_device
|
777 |
+
* @brief To free device memory
|
778 |
+
* @ingroup bmlib_runtime
|
779 |
+
*
|
780 |
+
* @param [in] handle The device handle
|
781 |
+
* @param [in] mem The device memory descriptor to free
|
782 |
+
*/
|
783 |
+
DECL_EXPORT void sg_free_device(bm_handle_t handle, sg_device_mem_t mem);
|
784 |
+
|
785 |
+
/**
|
786 |
+
* @name bm_gmem_arm_reserved_request
|
787 |
+
* @brief To obtain the address of global memory reserved for arm926
|
788 |
+
* @param [in] handle The device handle
|
789 |
+
*
|
790 |
+
* @retval unsigned long long The absolute address of gmem reserved for arm926
|
791 |
+
*/
|
792 |
+
DECL_EXPORT unsigned long long bm_gmem_arm_reserved_request(bm_handle_t handle);
|
793 |
+
|
794 |
+
/**
|
795 |
+
* @name bm_gmem_arm_reserved_release
|
796 |
+
* @brief To release the global memory reserved for arm926
|
797 |
+
* @ingroup bmlib_runtime
|
798 |
+
*
|
799 |
+
* @param [in] handle The device handle
|
800 |
+
*/
|
801 |
+
DECL_EXPORT void bm_gmem_arm_reserved_release(bm_handle_t handle);
|
802 |
+
|
803 |
+
/*******************memory copy functions *************************************/
|
804 |
+
/**
|
805 |
+
* @name bm_memcpy_s2d
|
806 |
+
* @brief To copy data from system memory to device memory
|
807 |
+
* @ingroup bmlib_runtime
|
808 |
+
*
|
809 |
+
* @param [in] handle The device handle
|
810 |
+
* @param [in] dst The destination memory (device memory descriptor)
|
811 |
+
* @param [in] src The source memory (system memory, a void* pointer)
|
812 |
+
*
|
813 |
+
* @retval BM_SUCCESS Succeeds.
|
814 |
+
* Other code Fails.
|
815 |
+
*/
|
816 |
+
DECL_EXPORT bm_status_t bm_memcpy_s2d(bm_handle_t handle, bm_device_mem_t dst, void *src);
|
817 |
+
|
818 |
+
/**
|
819 |
+
* @name bm_memcpy_p2p
|
820 |
+
* @brief To copy data from one chip to another chip
|
821 |
+
* @ingroup bmlib_runtime
|
822 |
+
*
|
823 |
+
* @param [in] handle_src The source device handle
|
824 |
+
* @param [in] src The source memory (device memory descriptor )
|
825 |
+
* @param [in] handle_dst The destination device handle
|
826 |
+
* @param [in] dst The destination memory (device memory descriptor )
|
827 |
+
*
|
828 |
+
* @retval BM_SUCCESS Succeeds.
|
829 |
+
* Other code Fails.
|
830 |
+
*/
|
831 |
+
DECL_EXPORT bm_status_t bm_memcpy_p2p(bm_handle_t handle_src, bm_device_mem_t src, bm_handle_t handle_dst,bm_device_mem_t dst);
|
832 |
+
|
833 |
+
/**
|
834 |
+
* @name sg_memcpy_s2d
|
835 |
+
* @brief To copy data from system memory to device memory
|
836 |
+
* @ingroup bmlib_runtime
|
837 |
+
*
|
838 |
+
* @param [in] handle The device handle
|
839 |
+
* @param [in] dst The destination memory (device memory descriptor)
|
840 |
+
* @param [in] src The source memory (system memory, a void* pointer)
|
841 |
+
*
|
842 |
+
* @retval BM_SUCCESS Succeeds.
|
843 |
+
* Other code Fails.
|
844 |
+
*/
|
845 |
+
DECL_EXPORT bm_status_t sg_memcpy_s2d(bm_handle_t handle, sg_device_mem_t dst, void *src);
|
846 |
+
|
847 |
+
/**
|
848 |
+
* @name bm_memcpy_s2d_partial_offset
|
849 |
+
* @brief To copy specified bytes of data from system memory to device memory
|
850 |
+
* with an offset in device memory address.
|
851 |
+
* @ingroup bmlib_runtime
|
852 |
+
*
|
853 |
+
* @param [in] handle The device handle
|
854 |
+
* @param [in] dst The destination memory (device memory descriptor)
|
855 |
+
* @param [in] src The source memory (system memory, a void* pointer)
|
856 |
+
* @param [in] size The size of data to copy (in bytes)
|
857 |
+
* @param [in] offset The offset of the device memory address
|
858 |
+
*
|
859 |
+
* @retval BM_SUCCESS Succeeds.
|
860 |
+
* Other code Fails.
|
861 |
+
*/
|
862 |
+
DECL_EXPORT bm_status_t bm_memcpy_s2d_partial_offset(bm_handle_t handle,
|
863 |
+
bm_device_mem_t dst, void *src,
|
864 |
+
unsigned int size,
|
865 |
+
unsigned int offset);
|
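/*
 * Example (non-normative sketch): update only bytes [256, 256 + 128) of a
 * previously allocated device buffer "mem" from a 128-byte host staging
 * buffer.
 *
 *   unsigned char staging[128];
 *   bm_memcpy_s2d_partial_offset(handle, mem, staging, sizeof(staging), 256);
 */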
866 |
+
|
867 |
+
/**
|
868 |
+
* @name sg_memcpy_s2d_partial_offset
|
869 |
+
* @brief To copy specified bytes of data from system memory to device memory
|
870 |
+
* with an offset in device memory address.
|
871 |
+
* @ingroup bmlib_runtime
|
872 |
+
*
|
873 |
+
* @param [in] handle The device handle
|
874 |
+
* @param [in] dst The destination memory (device memory descriptor)
|
875 |
+
* @param [in] src The source memory (system memory, a void* pointer)
|
876 |
+
* @param [in] size The size of data to copy (in bytes)
|
877 |
+
* @param [in] offset The offset of the device memory address
|
878 |
+
*
|
879 |
+
* @retval BM_SUCCESS Succeeds.
|
880 |
+
* Other code Fails.
|
881 |
+
*/
|
882 |
+
DECL_EXPORT bm_status_t sg_memcpy_s2d_partial_offset(bm_handle_t handle,
|
883 |
+
sg_device_mem_t dst, void *src,
|
884 |
+
unsigned long long size,
|
885 |
+
unsigned long long offset);
|
886 |
+
|
887 |
+
/**
|
888 |
+
* @name bm_memcpy_s2d_partial
|
889 |
+
* @brief To copy specified bytes of data from system memory to device memory
|
890 |
+
* @ingroup bmlib_runtime
|
891 |
+
*
|
892 |
+
* @param [in] handle The device handle
|
893 |
+
* @param [in] dst The destination memory (device memory descriptor)
|
894 |
+
* @param [in] src The source memory (system memory, a void* pointer)
|
895 |
+
* @param [in] size The size of data to copy (in bytes)
|
896 |
+
*
|
897 |
+
* @retval BM_SUCCESS Succeeds.
|
898 |
+
* Other code Fails.
|
899 |
+
*/
|
900 |
+
DECL_EXPORT bm_status_t bm_memcpy_s2d_partial(bm_handle_t handle, bm_device_mem_t dst,
|
901 |
+
void *src, unsigned int size);
|
902 |
+
|
903 |
+
/**
|
904 |
+
* @name sg_memcpy_s2d_partial
|
905 |
+
* @brief To copy specified bytes of data from system memory to device memory
|
906 |
+
* @ingroup bmlib_runtime
|
907 |
+
*
|
908 |
+
* @param [in] handle The device handle
|
909 |
+
* @param [in] dst The destination memory (device memory descriptor)
|
910 |
+
* @param [in] src The source memory (system memory, a void* pointer)
|
911 |
+
* @param [in] size The size of data to copy (in bytes)
|
912 |
+
*
|
913 |
+
* @retval BM_SUCCESS Succeeds.
|
914 |
+
* Other code Fails.
|
915 |
+
*/
|
916 |
+
DECL_EXPORT bm_status_t sg_memcpy_s2d_partial(bm_handle_t handle, sg_device_mem_t dst,
|
917 |
+
void *src, unsigned long long size);
|
918 |
+
|
919 |
+
/**
|
920 |
+
* @name bm_memcpy_d2s
|
921 |
+
* @brief To copy data from device memory to system memory
|
922 |
+
* @ingroup bmlib_runtime
|
923 |
+
*
|
924 |
+
* @param [in] handle The device handle
|
925 |
+
* @param [in] dst The destination memory (system memory, a void* pointer)
|
926 |
+
* @param [in] src The source memory (device memory descriptor)
|
927 |
+
*
|
928 |
+
* @retval BM_SUCCESS Succeeds.
|
929 |
+
* Other code Fails.
|
930 |
+
*/
|
931 |
+
DECL_EXPORT bm_status_t bm_memcpy_d2s(bm_handle_t handle, void *dst, bm_device_mem_t src);
|
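/*
 * Example (non-normative sketch): host -> device -> host round trip; the copy
 * size is taken from the descriptor filled in by the allocator.
 *
 *   float in[1024], out[1024];
 *   bm_device_mem_t mem;
 *   bm_malloc_device_byte(handle, &mem, sizeof(in));
 *   bm_memcpy_s2d(handle, mem, in);
 *   // ... run device-side work on mem here ...
 *   bm_memcpy_d2s(handle, out, mem);
 *   bm_free_device(handle, mem);
 */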
932 |
+
|
933 |
+
/**
|
934 |
+
* @name sg_memcpy_d2s
|
935 |
+
* @brief To copy data from device memory to system memory
|
936 |
+
* @ingroup bmlib_runtime
|
937 |
+
*
|
938 |
+
* @param [in] handle The device handle
|
939 |
+
* @param [in] dst The destination memory (system memory, a void* pointer)
|
940 |
+
* @param [in] src The source memory (device memory descriptor)
|
941 |
+
*
|
942 |
+
* @retval BM_SUCCESS Succeeds.
|
943 |
+
* Other code Fails.
|
944 |
+
*/
|
945 |
+
DECL_EXPORT bm_status_t sg_memcpy_d2s(bm_handle_t handle, void *dst, sg_device_mem_t src);
|
946 |
+
|
947 |
+
/**
|
948 |
+
* @name bm_memcpy_d2s_partial_offset
|
949 |
+
* @brief To copy specified bytes of data from device memory to system memory
|
950 |
+
* with an offset in device memory address.
|
951 |
+
* @ingroup bmlib_runtime
|
952 |
+
*
|
953 |
+
* @param [in] handle The device handle
|
954 |
+
* @param [in] dst The destination memory (system memory, a void* pointer)
|
955 |
+
* @param [in] src The source memory (device memory descriptor)
|
956 |
+
* @param [in] size The size of data to copy (in bytes)
|
957 |
+
* @param [in] offset The offset of the device memory address
|
958 |
+
*
|
959 |
+
* @retval BM_SUCCESS Succeeds.
|
960 |
+
* Other code Fails.
|
961 |
+
*/
|
962 |
+
DECL_EXPORT bm_status_t bm_memcpy_d2s_partial_offset(bm_handle_t handle, void *dst,
|
963 |
+
bm_device_mem_t src, unsigned int size,
|
964 |
+
unsigned int offset);
|
965 |
+
|
966 |
+
/**
|
967 |
+
* @name sg_memcpy_d2s_partial_offset
|
968 |
+
* @brief To copy specified bytes of data from device memory to system memory
|
969 |
+
* with an offset in device memory address.
|
970 |
+
* @ingroup bmlib_runtime
|
971 |
+
*
|
972 |
+
* @param [in] handle The device handle
|
973 |
+
* @param [in] dst The destination memory (system memory, a void* pointer)
|
974 |
+
* @param [in] src The source memory (device memory descriptor)
|
975 |
+
* @param [in] size The size of data to copy (in bytes)
|
976 |
+
* @param [in] offset The offset of the device memory address
|
977 |
+
*
|
978 |
+
* @retval BM_SUCCESS Succeeds.
|
979 |
+
* Other code Fails.
|
980 |
+
*/
|
981 |
+
DECL_EXPORT bm_status_t sg_memcpy_d2s_partial_offset(bm_handle_t handle, void *dst,
|
982 |
+
sg_device_mem_t src, unsigned long long size,
|
983 |
+
unsigned long long offset);
|
984 |
+
|
985 |
+
/**
|
986 |
+
* @name bm_memcpy_d2s_partial
|
987 |
+
* @brief To copy specified bytes of data from device memory to system memory
|
988 |
+
* @ingroup bmlib_runtime
|
989 |
+
*
|
990 |
+
* @param [in] handle The device handle
|
991 |
+
* @param [in] dst The destination memory (system memory, a void* pointer)
|
992 |
+
* @param [in] src The source memory (device memory descriptor)
|
993 |
+
* @param [in] size The size of data to copy (in bytes)
|
994 |
+
*
|
995 |
+
* @retval BM_SUCCESS Data transfer succeeds.
|
996 |
+
* Other code Data transfer fails.
|
997 |
+
*/
|
998 |
+
DECL_EXPORT bm_status_t bm_memcpy_d2s_partial(bm_handle_t handle, void *dst,
|
999 |
+
bm_device_mem_t src, unsigned int size);
|
1000 |
+
|
1001 |
+
/**
|
1002 |
+
* @name sg_memcpy_d2s_partial
|
1003 |
+
* @brief To copy specified bytes of data from device memory to system memory
|
1004 |
+
* @ingroup bmlib_runtime
|
1005 |
+
*
|
1006 |
+
* @param [in] handle The device handle
|
1007 |
+
* @param [in] dst The destination memory (system memory, a void* pointer)
|
1008 |
+
* @param [in] src The source memory (device memory descriptor)
|
1009 |
+
* @param [in] size The size of data to copy (in bytes)
|
1010 |
+
*
|
1011 |
+
* @retval BM_SUCCESS Data transfer succeeds.
|
1012 |
+
* Other code Data transfer fails.
|
1013 |
+
*/
|
1014 |
+
DECL_EXPORT bm_status_t sg_memcpy_d2s_partial(bm_handle_t handle, void *dst,
|
1015 |
+
sg_device_mem_t src, unsigned long long size);
|
1016 |
+
|
1017 |
+
/**
|
1018 |
+
* @name bm_memcpy_d2d
|
1019 |
+
* @brief To copy specified dwords of data from one piece of device memory
|
1020 |
+
* to another piece of device memory within one device. Both source
|
1021 |
+
* and destination offsets can be specified.
|
1022 |
+
* @ingroup bmlib_runtime
|
1023 |
+
*
|
1024 |
+
* @param [in] handle The device handle
|
1025 |
+
* @param [in] dst The destination device memory
|
1026 |
+
* @param [in] dst_offset The offset of destination device memory address
|
1027 |
+
* @param [in] src The source device memory
|
1028 |
+
* @param [in] src_offset The offset of source device memory address
|
1029 |
+
* @param [in] len Length of data to copy (in DWORD 4 bytes)
|
1030 |
+
*
|
1031 |
+
* @retval BM_SUCCESS Succeeds.
|
1032 |
+
* Other code Fails.
|
1033 |
+
*/
|
1034 |
+
DECL_EXPORT bm_status_t bm_memcpy_d2d(bm_handle_t handle, bm_device_mem_t dst,
|
1035 |
+
int dst_offset, bm_device_mem_t src, int src_offset,
|
1036 |
+
int len);
|
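/*
 * Example (non-normative sketch): copy 1024 bytes between two device buffers.
 * Note that len is counted in DWORDs (4 bytes), so 1024 bytes == 256 dwords;
 * bm_memcpy_d2d_byte below takes byte offsets and sizes instead.
 *
 *   bm_memcpy_d2d(handle, dst_mem, 0, src_mem, 0, 256);
 */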
1037 |
+
|
1038 |
+
/**
|
1039 |
+
* @name bm_memcpy_d2d_with_core
|
1040 |
+
* @brief To copy specified dwords of data from one piece of device memory
|
1041 |
+
* to another piece of device memory within one device. Both source
|
1042 |
+
* and destination offsets can be specified.
|
1043 |
+
* @ingroup bmlib_runtime
|
1044 |
+
*
|
1045 |
+
* @param [in] handle The device handle
|
1046 |
+
* @param [in] dst The destination device memory
|
1047 |
+
* @param [in] dst_offset The offset of destination device memory address
|
1048 |
+
* @param [in] src The source device memory
|
1049 |
+
* @param [in] src_offset The offset of source device memory address
|
1050 |
+
* @param [in] len Length of data to copy (in DWORD 4 bytes)
|
1051 |
+
* @param [in] core_id The core id to copy
|
1052 |
+
*
|
1053 |
+
* @retval BM_SUCCESS Succeeds.
|
1054 |
+
* Other code Fails.
|
1055 |
+
*/
|
1056 |
+
DECL_EXPORT bm_status_t bm_memcpy_d2d_with_core(bm_handle_t handle, bm_device_mem_t dst,
|
1057 |
+
int dst_offset, bm_device_mem_t src, int src_offset,
|
1058 |
+
int len, int core_id);
|
1059 |
+
|
1060 |
+
/**
|
1061 |
+
* @name bm_memcpy_d2d_byte
|
1062 |
+
* @brief To copy specified bytes of data from one piece of device memory
|
1063 |
+
* to another piece of device memory within one device. Both source
|
1064 |
+
* and destination offsets can be specified.
|
1065 |
+
* @ingroup bmlib_runtime
|
1066 |
+
*
|
1067 |
+
* @param [in] handle The device handle
|
1068 |
+
* @param [in] dst The destination device memory
|
1069 |
+
* @param [in] dst_offset The offset of destination device memory address (in bytes)
|
1070 |
+
* @param [in] src The source device memory
|
1071 |
+
* @param [in] src_offset The offset of source device memory address (in bytes)
|
1072 |
+
* @param [in] size Size of data to copy (in bytes)
|
1073 |
+
*
|
1074 |
+
* @retval BM_SUCCESS Succeeds.
|
1075 |
+
* Other code Fails.
|
1076 |
+
*/
|
1077 |
+
DECL_EXPORT bm_status_t bm_memcpy_d2d_byte(bm_handle_t handle, bm_device_mem_t dst,
|
1078 |
+
size_t dst_offset, bm_device_mem_t src,
|
1079 |
+
size_t src_offset, size_t size);
|
1080 |
+
|
1081 |
+
/**
|
1082 |
+
* @name bm_memcpy_d2d_byte_with_core
|
1083 |
+
* @brief To copy specified bytes of data from one piece of device memory
|
1084 |
+
* to another piece of device memory within one device. Both source
|
1085 |
+
* and destination offsets can be specified.
|
1086 |
+
* @ingroup bmlib_runtime
|
1087 |
+
*
|
1088 |
+
* @param [in] handle The device handle
|
1089 |
+
* @param [in] dst The destination device memory
|
1090 |
+
* @param [in] dst_offset The offset of destination device memory address (in bytes)
|
1091 |
+
* @param [in] src The source device memory
|
1092 |
+
* @param [in] src_offset The offset of source device memory address (in bytes)
|
1093 |
+
* @param [in] size Size of data to copy (in bytes)
|
1094 |
+
* @param [in] core_id The core id to copy
|
1095 |
+
*
|
1096 |
+
* @retval BM_SUCCESS Succeeds.
|
1097 |
+
* Other code Fails.
|
1098 |
+
*/
|
1099 |
+
DECL_EXPORT bm_status_t bm_memcpy_d2d_byte_with_core(bm_handle_t handle, bm_device_mem_t dst,
|
1100 |
+
size_t dst_offset, bm_device_mem_t src,
|
1101 |
+
size_t src_offset, size_t size, int core_id);
|
1102 |
+
|
1103 |
+
/**
|
1104 |
+
* @name bm_memcpy_d2d_stride
|
1105 |
+
* @brief To copy specified data from one piece of device memory
|
1106 |
+
* to another piece of device memory within one device. Both source
|
1107 |
+
* and destination offsets can be specified.
|
1108 |
+
* @ingroup bmlib_runtime
|
1109 |
+
*
|
1110 |
+
* @param [in] handle The device handle
|
1111 |
+
* @param [in] dst The destination device memory
|
1112 |
+
* @param [in] dst_stride The data stride of destination data
|
1113 |
+
* @param [in] src The source device memory
|
1114 |
+
* @param [in] src_stride The data stride of source data
|
1115 |
+
* @param [in] count Count of data to copy
|
1116 |
+
* @param [in] format_size Data format byte size, such as sizeof(uint8_t), sizeof(float), etc.
|
1117 |
+
* format_size only supports 1/2/4.
|
1118 |
+
*
|
1119 |
+
* dst_stride MUST be 1, EXCEPT: dst_stride == 4 && src_stride == 1 && format_size == 1
|
1120 |
+
*
|
1121 |
+
* @retval BM_SUCCESS Succeeds.
|
1122 |
+
* Other code Fails.
|
1123 |
+
*/
|
1124 |
+
DECL_EXPORT bm_status_t bm_memcpy_d2d_stride(bm_handle_t handle,
|
1125 |
+
bm_device_mem_t dst,
|
1126 |
+
int dst_stride,
|
1127 |
+
bm_device_mem_t src,
|
1128 |
+
int src_stride,
|
1129 |
+
int count,
|
1130 |
+
int format_size);
|
1131 |
+
|
1132 |
+
/**
|
1133 |
+
* @name bm_memcpy_d2d_stride_with_core
|
1134 |
+
* @brief To copy specified data from one piece of device memory
|
1135 |
+
* to another piece of device memory within one device. Both source
|
1136 |
+
* and destination offsets can be specified.
|
1137 |
+
* @ingroup bmlib_runtime
|
1138 |
+
*
|
1139 |
+
* @param [in] handle The device handle
|
1140 |
+
* @param [in] dst The destination device memory
|
1141 |
+
* @param [in] dst_stride The data stride of destination data
|
1142 |
+
* @param [in] src The source device memory
|
1143 |
+
* @param [in] src_stride The data stride of source data
|
1144 |
+
* @param [in] count Count of data to copy
|
1145 |
+
* @param [in] format_size Data format byte size, such as sizeof(uint8_t), sizeof(float), etc.
|
1146 |
+
* format_size only supports 1/2/4.
|
1147 |
+
* @param [in] core_id The core id to copy.
|
1148 |
+
*
|
1149 |
+
* dst_stride MUST be 1, EXCEPT: dst_stride == 4 && src_stride == 1 && format_size == 1
|
1150 |
+
*
|
1151 |
+
* @retval BM_SUCCESS Succeeds.
|
1152 |
+
* Other code Fails.
|
1153 |
+
*/
|
1154 |
+
DECL_EXPORT bm_status_t bm_memcpy_d2d_stride_with_core(bm_handle_t handle,
|
1155 |
+
bm_device_mem_t dst,
|
1156 |
+
int dst_stride,
|
1157 |
+
bm_device_mem_t src,
|
1158 |
+
int src_stride,
|
1159 |
+
int count,
|
1160 |
+
int format_size,
|
1161 |
+
int core_id);
|
1162 |
+
|
1163 |
+
/**
|
1164 |
+
* @name bm_memcpy_c2c
|
1165 |
+
* @brief To copy data from one chip to another chip.
|
1166 |
+
* (Used in multi-chip card scenario)
|
1167 |
+
* @ingroup bmlib_runtime
|
1168 |
+
*
|
1169 |
+
* @param [in] src_handle The source device handle
|
1170 |
+
* @param [in] dst_handle The destination device handle
|
1171 |
+
* @param [in] src The source device memory descriptor
|
1172 |
+
* @param [in] dst The destination device memory descriptor
|
1173 |
+
* @param [in] force_dst_cdma Whether to use the CDMA engine of the destination device
|
1174 |
+
* @retval BM_SUCCESS Succeeds.
|
1175 |
+
* Other code Fails.
|
1176 |
+
*/
|
1177 |
+
DECL_EXPORT bm_status_t bm_memcpy_c2c(bm_handle_t src_handle, bm_handle_t dst_handle,
|
1178 |
+
bm_device_mem_t src, bm_device_mem_t dst,
|
1179 |
+
bool force_dst_cdma);
|
1180 |
+
|
1181 |
+
/**
|
1182 |
+
* @name bm_memset_device
|
1183 |
+
* @brief To fill in specified device memory with the given value
|
1184 |
+
* @ingroup bmlib_runtime
|
1185 |
+
*
|
1186 |
+
* @param [in] handle The device handle
|
1187 |
+
* @param [in] value The value used to fill. (int type)
|
1188 |
+
* @param [in] mem The device memory which will be filled in
|
1189 |
+
* @retval BM_SUCCESS Succeeds.
|
1190 |
+
* Other code Fails.
|
1191 |
+
*/
|
1192 |
+
DECL_EXPORT bm_status_t bm_memset_device(bm_handle_t handle, const int value,
|
1193 |
+
bm_device_mem_t mem);
|
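/*
 * Example (non-normative sketch): zero-fill a device buffer, then fill it with
 * a 16-bit pattern via bm_memset_device_ext (declared just below), where mode
 * gives the number of valid bytes of *value (assumed here to be 1/2/4).
 *
 *   bm_memset_device(handle, 0, mem);
 *   unsigned short pattern = 0xABCD;
 *   bm_memset_device_ext(handle, &pattern, 2, mem);
 */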
1194 |
+
|
1195 |
+
/**
|
1196 |
+
* @name bm_memset_device_ext
|
1197 |
+
* @brief To fill in specified device memory with the given value and mode
|
1198 |
+
* @ingroup bmlib_runtime
|
1199 |
+
*
|
1200 |
+
* @param [in] handle The device handle
|
1201 |
+
* @param [in] value The pointer to the value used to fill
|
1202 |
+
* @param [in] mode The valid bytes of *value
|
1203 |
+
* @param [in] mem The device memory which will be filled in
|
1204 |
+
* @retval BM_SUCCESS Succeeds.
|
1205 |
+
* Other code Fails.
|
1206 |
+
*/
|
1207 |
+
DECL_EXPORT bm_status_t bm_memset_device_ext(bm_handle_t handle, void* value, int mode,
|
1208 |
+
bm_device_mem_t mem);
|
1209 |
+
|
1210 |
+
/**
|
1211 |
+
* @name bm_mem_convert_system_to_device_neuron
|
1212 |
+
* @brief To malloc a piece of device memory according to the shape of
|
1213 |
+
* neuron (in DWORDs, 4 bytes each); copy neuron from system memory to
|
1214 |
+
* device memory if need_copy is true.
|
1215 |
+
* @ingroup bmlib_runtime
|
1216 |
+
*
|
1217 |
+
* @param [in] handle The device handle
|
1218 |
+
* @param [in] dev_mem The device memory descriptor
|
1219 |
+
* @param [in] sys_mem The system memory descriptor
|
1220 |
+
* @param [in] need_copy If copy from system to device is needed
|
1221 |
+
* @param [in] n,c,h,w Neuron shape size
|
1222 |
+
*
|
1223 |
+
* @retval BM_SUCCESS Succeeds.
|
1224 |
+
* Other code Fails.
|
1225 |
+
*/
|
1226 |
+
DECL_EXPORT bm_status_t bm_mem_convert_system_to_device_neuron(bm_handle_t handle,
|
1227 |
+
struct bm_mem_desc *dev_mem,
|
1228 |
+
struct bm_mem_desc sys_mem,
|
1229 |
+
bool need_copy, int n, int c,
|
1230 |
+
int h, int w);
|
1231 |
+
|
1232 |
+
/**
|
1233 |
+
* @name bm_mem_convert_system_to_device_neuron_byte
|
1234 |
+
* @brief To malloc a piece of device memory according to the shape of
|
1235 |
+
* neuron (in bytes); copy neuron from system memory to
|
1236 |
+
* device memory if need_copy is true.
|
1237 |
+
* @ingroup bmlib_runtime
|
1238 |
+
*
|
1239 |
+
* @param [in] handle The device handle
|
1240 |
+
* @param [in] dev_mem The device memory descriptor
|
1241 |
+
* @param [in] sys_mem The system memory descriptor
|
1242 |
+
* @param [in] need_copy If copy from system to device is needed
|
1243 |
+
* @param [in] n,c,h,w Neuron shape size
|
1244 |
+
*
|
1245 |
+
* @retval BM_SUCCESS Succeeds.
|
1246 |
+
* Other code Fails.
|
1247 |
+
*/
|
1248 |
+
DECL_EXPORT bm_status_t bm_mem_convert_system_to_device_neuron_byte(
|
1249 |
+
bm_handle_t handle, struct bm_mem_desc *dev_mem, struct bm_mem_desc sys_mem,
|
1250 |
+
bool need_copy, int n, int c, int h, int w);
|
1251 |
+
|
1252 |
+
/**
|
1253 |
+
* @name bm_mem_convert_system_to_device_coeff
|
1254 |
+
* @brief To malloc a piece of device memory according to the size of
|
1255 |
+
* coefficient (in DWORD 4 bytes); copy coefficient from system
|
1256 |
+
* memory to device memory if need_copy is true.
|
1257 |
+
* @ingroup bmlib_runtime
|
1258 |
+
*
|
1259 |
+
* @param [in] handle The device handle
|
1260 |
+
* @param [in] dev_mem The device memory descriptor
|
1261 |
+
* @param [in] sys_mem The system memory descriptor
|
1262 |
+
* @param [in] need_copy If copy from system to device is needed
|
1263 |
+
* @param [in] coeff_count Coefficient size
|
1264 |
+
*
|
1265 |
+
* @retval BM_SUCCESS Succeeds.
|
1266 |
+
* Other code Fails.
|
1267 |
+
*/
|
1268 |
+
DECL_EXPORT bm_status_t bm_mem_convert_system_to_device_coeff(bm_handle_t handle,
|
1269 |
+
struct bm_mem_desc *dev_mem,
|
1270 |
+
struct bm_mem_desc sys_mem,
|
1271 |
+
bool need_copy,
|
1272 |
+
int coeff_count);
|
1273 |
+
/**
|
1274 |
+
* @name bm_mem_convert_system_to_device_coeff_byte
|
1275 |
+
* @brief To malloc a piece of device memory according to the size of
|
1276 |
+
* coefficient (in bytes); copy coefficient from system
|
1277 |
+
* memory to device memory if need_copy is true.
|
1278 |
+
* @ingroup bmlib_runtime
|
1279 |
+
*
|
1280 |
+
* @param [in] handle The device handle
|
1281 |
+
* @param [in] dev_mem The device memory descriptor
|
1282 |
+
* @param [in] sys_mem The system memory descriptor
|
1283 |
+
* @param [in] need_copy If copy from system to device is needed
|
1284 |
+
* @param [in] coeff_count Coefficient size
|
1285 |
+
*
|
1286 |
+
* @retval BM_SUCCESS Succeeds.
|
1287 |
+
* Other code Fails.
|
1288 |
+
*/
|
1289 |
+
DECL_EXPORT bm_status_t bm_mem_convert_system_to_device_coeff_byte(
|
1290 |
+
bm_handle_t handle, struct bm_mem_desc *dev_mem, struct bm_mem_desc sys_mem,
|
1291 |
+
bool need_copy, int coeff_count);
|
1292 |
+
|
1293 |
+
/*******************memory map functions *************************************/
|
1294 |
+
/**
|
1295 |
+
* @name bm_mem_mmap_device_mem
|
1296 |
+
* @brief To map a piece of device memory to user space with cache enabled.
|
1297 |
+
* (only valid in SoC mode; Not supported in PCIE mode).
|
1298 |
+
* @ingroup bmlib_runtime
|
1299 |
+
*
|
1300 |
+
* @param [in] handle The device handle
|
1301 |
+
* @param [in] dev_mem The device memory to map
|
1302 |
+
* @param [out] vmem The virtual address of the mapped device memory
|
1303 |
+
*
|
1304 |
+
* @retval BM_SUCCESS Succeeds.
|
1305 |
+
* Other code Fails.
|
1306 |
+
*/
|
1307 |
+
DECL_EXPORT bm_status_t bm_mem_mmap_device_mem(bm_handle_t handle, bm_device_mem_t *dmem,
|
1308 |
+
|
1309 |
+
unsigned long long *vmem);
|
1310 |
+
|
1311 |
+
/**
|
1312 |
+
* @name sg_mem_mmap_device_mem
|
1313 |
+
* @brief To map a piece of device memory to user space with cache enabled.
|
1314 |
+
* (only valid in SoC mode; Not supported in PCIE mode).
|
1315 |
+
* @ingroup bmlib_runtime
|
1316 |
+
*
|
1317 |
+
* @param [in] handle The device handle
|
1318 |
+
* @param [in] dev_mem The device memory to map
|
1319 |
+
* @param [out] vmem The virtual address of the mapped device memory
|
1320 |
+
*
|
1321 |
+
* @retval BM_SUCCESS Succeeds.
|
1322 |
+
* Other code Fails.
|
1323 |
+
*/
|
1324 |
+
DECL_EXPORT bm_status_t sg_mem_mmap_device_mem(bm_handle_t handle, sg_device_mem_t *dmem,
|
1325 |
+
unsigned long long *vmem);
|
1326 |
+
|
1327 |
+
|
1328 |
+
/**
|
1329 |
+
* @name bm_mem_mmap_device_mem_no_cache
|
1330 |
+
* @brief To map a piece of device memory to user space with cache disabled.
|
1331 |
+
* (only valid in SoC mode; Not supported in PCIE mode).
|
1332 |
+
* @ingroup bmlib_runtime
|
1333 |
+
*
|
1334 |
+
* @param [in] handle The device handle
|
1335 |
+
* @param [in] dev_mem The device memory to map
|
1336 |
+
* @param [out] vmem The virtual address of the mapped device memory
|
1337 |
+
*
|
1338 |
+
* @retval BM_SUCCESS Succeeds.
|
1339 |
+
* Other code Fails.
|
1340 |
+
*/
|
1341 |
+
DECL_EXPORT bm_status_t bm_mem_mmap_device_mem_no_cache(bm_handle_t handle, bm_device_mem_t *dmem,
|
1342 |
+
|
1343 |
+
unsigned long long *vmem);
|
1344 |
+
|
1345 |
+
/**
|
1346 |
+
* @name sg_mem_mmap_device_mem_no_cache
|
1347 |
+
* @brief To map a piece of device memory to user space with cache disabled.
|
1348 |
+
* (only valid in SoC mode; Not supported in PCIE mode).
|
1349 |
+
* @ingroup bmlib_runtime
|
1350 |
+
*
|
1351 |
+
* @param [in] handle The device handle
|
1352 |
+
* @param [in] dev_mem The device memory to map
|
1353 |
+
* @param [out] vmem The virtual address of the mapped device memory
|
1354 |
+
*
|
1355 |
+
* @retval BM_SUCCESS Succeeds.
|
1356 |
+
* Other code Fails.
|
1357 |
+
*/
|
1358 |
+
DECL_EXPORT bm_status_t sg_mem_mmap_device_mem_no_cache(bm_handle_t handle, sg_device_mem_t *dmem,
|
1359 |
+
unsigned long long *vmem);
|
1360 |
+
|
1361 |
+
/**
|
1362 |
+
* @name bm_mem_vir_to_phy
|
1363 |
+
* @brief To get the device memory address through the mapped virtual address.
|
1364 |
+
* (only valid in SoC mode; Not supported in PCIE mode).
|
1365 |
+
* @ingroup bmlib_runtime
|
1366 |
+
*
|
1367 |
+
* @param [in] handle The device handle
|
1368 |
+
* @param [in] vmem The virtual address of the mapped device memory
|
1369 |
+
* @param [out] dev_mem The device memory address
|
1370 |
+
*
|
1371 |
+
* @retval BM_SUCCESS Succeeds.
|
1372 |
+
* Other code Fails.
|
1373 |
+
*/
|
1374 |
+
DECL_EXPORT bm_status_t bm_mem_vir_to_phy(bm_handle_t handle, unsigned long long vmem,
|
1375 |
+
unsigned long long *device_mem);
|
1376 |
+
/**
|
1377 |
+
* @name bm_mem_invalidate_device_mem
|
1378 |
+
* @brief To invalidate a piece of mapped device memory to maintain
|
1379 |
+
* cache coherence
|
1380 |
+
* (only valid in SoC mode; Not supported in PCIE mode).
|
1381 |
+
* @ingroup bmlib_runtime
|
1382 |
+
*
|
1383 |
+
* @param [in] handle The device handle
|
1384 |
+
* @param [in] dmem The device memory to invalidate
|
1385 |
+
*
|
1386 |
+
* @retval BM_SUCCESS Succeeds.
|
1387 |
+
* Other code Fails.
|
1388 |
+
*/
|
1389 |
+
|
1390 |
+
DECL_EXPORT bm_status_t bm_mem_invalidate_device_mem(bm_handle_t handle,
|
1391 |
+
bm_device_mem_t *dmem);
|
1392 |
+
|
1393 |
+
/**
|
1394 |
+
* @name sg_mem_invalidate_device_mem
|
1395 |
+
* @brief To invalidate a piece of mapped device memory to maintain
|
1396 |
+
* cache coherence
|
1397 |
+
* (only valid in SoC mode; Not supported in PCIE mode).
|
1398 |
+
* @ingroup bmlib_runtime
|
1399 |
+
*
|
1400 |
+
* @param [in] handle The device handle
|
1401 |
+
* @param [in] dmem The device memory to invalidate
|
1402 |
+
*
|
1403 |
+
* @retval BM_SUCCESS Succeeds.
|
1404 |
+
* Other code Fails.
|
1405 |
+
*/
|
1406 |
+
|
1407 |
+
DECL_EXPORT bm_status_t sg_mem_invalidate_device_mem(bm_handle_t handle,
|
1408 |
+
sg_device_mem_t *dmem);
|
1409 |
+
|
1410 |
+
/**
|
1411 |
+
* @name bm_mem_invalidate_partial_device_mem
|
1412 |
+
* @brief To invalidate part of mapped device memory to maintain
|
1413 |
+
* cache coherence
|
1414 |
+
* (only valid in SoC mode; Not supported in PCIE mode).
|
1415 |
+
* @ingroup bmlib_runtime
|
1416 |
+
*
|
1417 |
+
* @param [in] handle The device handle
|
1418 |
+
* @param [in] dmem The device memory to invalidate
|
1419 |
+
* @param [in] offset The offset of device memory address
|
1420 |
+
* @param [in] len The length of memory to invalidate in bytes
|
1421 |
+
*
|
1422 |
+
* @retval BM_SUCCESS Succeeds.
|
1423 |
+
* Other code Fails.
|
1424 |
+
*/
|
1425 |
+
DECL_EXPORT bm_status_t bm_mem_invalidate_partial_device_mem(bm_handle_t handle,
|
1426 |
+
bm_device_mem_t *dmem,
|
1427 |
+
unsigned int offset,
|
1428 |
+
unsigned int len);
|
1429 |
+
|
1430 |
+
/**
|
1431 |
+
* @name sg_mem_invalidate_partial_device_mem
|
1432 |
+
* @brief To invalidate part of mapped device memory to maintain
|
1433 |
+
* cache coherence
|
1434 |
+
* (only valid in SoC mode; Not supported in PCIE mode).
|
1435 |
+
* @ingroup bmlib_runtime
|
1436 |
+
*
|
1437 |
+
* @param [in] handle The device handle
|
1438 |
+
* @param [in] dmem The device memory to invalidate
|
1439 |
+
* @param [in] offset The offset of device memory address
|
1440 |
+
* @param [in] len The length of memory to invalidate in bytes
|
1441 |
+
*
|
1442 |
+
* @retval BM_SUCCESS Succeeds.
|
1443 |
+
* Other code Fails.
|
1444 |
+
*/
|
1445 |
+
DECL_EXPORT bm_status_t sg_mem_invalidate_partial_device_mem(bm_handle_t handle,
|
1446 |
+
sg_device_mem_t *dmem,
|
1447 |
+
unsigned long long offset,
|
1448 |
+
unsigned long long len);
|
1449 |
+
|
1450 |
+
/**
|
1451 |
+
* @name bm_mem_flush_device_mem
|
1452 |
+
* @brief To flush a piece of mapped device memory to maintain
|
1453 |
+
* cache coherence
|
1454 |
+
* (only valid in SoC mode; Not supported in PCIE mode).
|
1455 |
+
* @ingroup bmlib_runtime
|
1456 |
+
*
|
1457 |
+
* @param [in] handle The device handle
|
1458 |
+
* @param [in] dmem The device memory to flush
|
1459 |
+
*
|
1460 |
+
* @retval BM_SUCCESS Succeeds.
|
1461 |
+
* Other code Fails.
|
1462 |
+
*/
|
1463 |
+
DECL_EXPORT bm_status_t bm_mem_flush_device_mem(bm_handle_t handle, bm_device_mem_t *dmem);
|
1464 |
+
|
1465 |
+
/**
|
1466 |
+
* @name sg_mem_flush_device_mem
|
1467 |
+
* @brief To flush a piece of mapped device memory to maintain
|
1468 |
+
* cache coherence
|
1469 |
+
* (only valid in SoC mode; Not supported in PCIE mode).
|
1470 |
+
* @ingroup bmlib_runtime
|
1471 |
+
*
|
1472 |
+
* @param [in] handle The device handle
|
1473 |
+
* @param [in] dmem The device memory to flush
|
1474 |
+
*
|
1475 |
+
* @retval BM_SUCCESS Succeeds.
|
1476 |
+
* Other code Fails.
|
1477 |
+
*/
|
1478 |
+
DECL_EXPORT bm_status_t sg_mem_flush_device_mem(bm_handle_t handle, sg_device_mem_t *dmem);
|
1479 |
+
|
1480 |
+
/**
|
1481 |
+
* @name bm_mem_flush_partial_device_mem
|
1482 |
+
* @brief To flush part of mapped device memory to maintain
|
1483 |
+
* cache coherence
|
1484 |
+
* (only valid in SoC mode; Not supported in PCIE mode).
|
1485 |
+
* @ingroup bmlib_runtime
|
1486 |
+
*
|
1487 |
+
* @param [in] handle The device handle
|
1488 |
+
* @param [in] dmem The device memory to flush
|
1489 |
+
* @param [in] offset The offset of device memory address
|
1490 |
+
* @param [in] len The length of memory to flush in bytes
|
1491 |
+
*
|
1492 |
+
* @retval BM_SUCCESS Succeeds.
|
1493 |
+
* Other code Fails.
|
1494 |
+
*/
|
1495 |
+
DECL_EXPORT bm_status_t bm_mem_flush_partial_device_mem(bm_handle_t handle,
|
1496 |
+
bm_device_mem_t *dmem,
|
1497 |
+
unsigned int offset,
|
1498 |
+
unsigned int len);
|
1499 |
+
|
1500 |
+
/**
|
1501 |
+
* @name sg_mem_flush_partial_device_mem
|
1502 |
+
* @brief To flush part of mapped device memory to maintain
|
1503 |
+
* cache coherence
|
1504 |
+
* (only valid in SoC mode; Not supported in PCIE mode).
|
1505 |
+
* @ingroup bmlib_runtime
|
1506 |
+
*
|
1507 |
+
* @param [in] handle The device handle
|
1508 |
+
* @param [in] dmem The device memory to flush
|
1509 |
+
* @param [in] offset The offset of device memory address
|
1510 |
+
* @param [in] len The length of memory to flush in bytes
|
1511 |
+
*
|
1512 |
+
* @retval BM_SUCCESS Succeeds.
|
1513 |
+
* Other code Fails.
|
1514 |
+
*/
|
1515 |
+
DECL_EXPORT bm_status_t sg_mem_flush_partial_device_mem(bm_handle_t handle,
|
1516 |
+
sg_device_mem_t *dmem,
|
1517 |
+
unsigned long long offset,
|
1518 |
+
unsigned long long len);
|
1519 |
+
|
1520 |
+
/**
|
1521 |
+
* @name bm_mem_unmap_device_mem
|
1522 |
+
* @brief To unmap a piece of mapped device memory
|
1523 |
+
* (only valid in SoC mode; Not supported in PCIE mode).
|
1524 |
+
* @ingroup bmlib_runtime
|
1525 |
+
*
|
1526 |
+
* @param [in] handle The device handle
|
1527 |
+
* @param [in] vmem The virtual address of the mapped device memory
|
1528 |
+
* @param [in] size The size of unmapped memory
|
1529 |
+
*
|
1530 |
+
* @retval BM_SUCCESS Succeeds.
|
1531 |
+
* Other code Fails.
|
1532 |
+
*/
|
1533 |
+
DECL_EXPORT bm_status_t bm_mem_unmap_device_mem(bm_handle_t handle, void *vmem, int size);
|
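/*
 * Example (non-normative sketch, SoC mode only): map a device buffer, write it
 * through the CPU, flush the cache so the device sees the data, then unmap.
 * Needs <string.h> and <stdint.h>; error checking omitted for brevity.
 *
 *   unsigned long long va = 0;
 *   bm_mem_mmap_device_mem(handle, &mem, &va);
 *   memset((void *)(uintptr_t)va, 0, bm_mem_get_device_size(mem));
 *   bm_mem_flush_device_mem(handle, &mem);     // CPU wrote: flush to device
 *   // after the device writes, invalidate before the CPU reads:
 *   // bm_mem_invalidate_device_mem(handle, &mem);
 *   bm_mem_unmap_device_mem(handle, (void *)(uintptr_t)va,
 *                           bm_mem_get_device_size(mem));
 */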
1534 |
+
|
1535 |
+
/**
|
1536 |
+
* @name sg_mem_unmap_device_mem
|
1537 |
+
* @brief To unmap a piece of mapped device memory
|
1538 |
+
* (only valid in SoC mode; Not supported in PCIE mode).
|
1539 |
+
* @ingroup bmlib_runtime
|
1540 |
+
*
|
1541 |
+
* @param [in] handle The device handle
|
1542 |
+
* @param [in] vmem The virtual address of the mapped device memory
|
1543 |
+
* @param [in] size The size of unmapped memory
|
1544 |
+
*
|
1545 |
+
* @retval BM_SUCCESS Succeeds.
|
1546 |
+
* Other code Fails.
|
1547 |
+
*/
|
1548 |
+
DECL_EXPORT bm_status_t sg_mem_unmap_device_mem(bm_handle_t handle, void *vmem, unsigned long long size);

/*******************api(kernel) functions *************************************/
/**
 * @name bm_flush
 * @brief To synchronize APIs of the current thread. The thread will block
 *        until all the outstanding APIs of the current thread are finished.
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 */
DECL_EXPORT void bm_flush(bm_handle_t handle);

/**
 * @name bm_device_sync
 * @brief To synchronize APIs of the device. The thread will block
 *        until all the outstanding APIs of the device are finished.
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_device_sync(bm_handle_t handle);

/**
 * @name bm_handle_sync
 * @brief To synchronize APIs of the handle. The thread will block
 *        until all the outstanding APIs of the handle are finished.
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_handle_sync(bm_handle_t handle);

/**
 * @name bm_handle_sync_from_core
 * @brief To synchronize APIs of the handle. The thread will block
 *        until all the outstanding APIs of the handle are finished.
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] core_id The core id
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_handle_sync_from_core(bm_handle_t handle, int core_id);

/**
 * @name bm_thread_sync
 * @brief To synchronize APIs of the current thread. The thread will block
 *        until all the outstanding APIs of the current thread are finished.
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_thread_sync(bm_handle_t handle);

/**
 * @name bm_thread_sync_from_core
 * @brief To synchronize APIs of the current thread. The thread will block
 *        until all the outstanding APIs of the current thread are finished.
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] core_id The core id
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_thread_sync_from_core(bm_handle_t handle, int core_id);
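
/*
 * Illustrative sketch (an assumption, not from this header): after launching
 * work asynchronously, a caller typically blocks on one of the sync calls
 * above -- per-thread sync on a single-core device, or per-core sync when a
 * specific core was used.
 *
 *   bm_thread_sync(handle);               // wait for this thread's APIs
 *   bm_thread_sync_from_core(handle, 0);  // or wait on core 0 only
 */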

/*******************trace and profile related functions ***********************/
typedef struct bm_profile {
#ifdef __linux__
  unsigned long cdma_in_time;
  unsigned long cdma_in_counter;
  unsigned long cdma_out_time;
  unsigned long cdma_out_counter;
  unsigned long tpu_process_time;
  unsigned long tpu1_process_time;
  unsigned long sent_api_counter;
  unsigned long completed_api_counter;
#else
  unsigned long long cdma_in_time;
  unsigned long long cdma_in_counter;
  unsigned long long cdma_out_time;
  unsigned long long cdma_out_counter;
  unsigned long long tpu_process_time;
  unsigned long long tpu1_process_time;
  unsigned long long sent_api_counter;
  unsigned long long completed_api_counter;
#endif
} bm_profile_t;
/**
 * @name bm_get_profile
 * @brief To get the profile data at the moment
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [out] profile The result profile data
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_get_profile(bm_handle_t handle, bm_profile_t *profile);
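
/*
 * Illustrative sketch: reading the profile counters. Field meanings follow
 * the struct above; the printf format is an assumption for the example.
 *
 *   bm_profile_t profile;
 *   if (bm_get_profile(handle, &profile) == BM_SUCCESS) {
 *     printf("sent=%llu completed=%llu\n",
 *            (unsigned long long)profile.sent_api_counter,
 *            (unsigned long long)profile.completed_api_counter);
 *   }
 */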

typedef struct bootloader_version {
  char *bl1_version;
  char *bl2_version;
  char *bl31_version;
  char *uboot_version;
} boot_loader_version;

/**
 * @name bm_get_boot_loader_version
 * @brief To get the boot_loader_version
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [out] version The result version data
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_get_boot_loader_version(bm_handle_t handle, boot_loader_version *version);

/**
 * @name bm_get_vpu_instant_usage
 * @brief To get vpu usage
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [out] vpu_usage The result vpu usage
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_get_vpu_instant_usage(bm_handle_t handle, int *vpu_usage);

/**
 * @name bm_get_jpu_core_usage
 * @brief To get the jpu usage
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [out] jpu_usage The result jpu usage
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_get_jpu_core_usage(bm_handle_t handle, int *jpu_usage);

/**
 * @name bm_get_vpp_instant_usage
 * @brief To get the vpp usage
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [out] vpp_usage The result vpp usage
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_get_vpp_instant_usage(bm_handle_t handle, int *vpp_usage);
/**
 * @name bm_get_last_api_process_time_us
 * @brief This function is deprecated.
 */
#ifdef __linux__
DECL_EXPORT bm_status_t bm_get_last_api_process_time_us(bm_handle_t handle,
                                                        unsigned long *time_us);
#else
DECL_EXPORT bm_status_t bm_get_last_api_process_time_us(bm_handle_t handle,
                                                        unsigned long long *time_us);
#endif
/*******************tpu clock and module reset related functions **************/

/**
 * @name bm_set_clk_tpu_freq
 * @brief To set the clock frequency of TPU (only valid in PCIE mode).
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] freq The TPU target frequency
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_set_clk_tpu_freq(bm_handle_t handle, int freq);

/**
 * @name bm_get_clk_tpu_freq
 * @brief To get the clock frequency of TPU
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [out] freq The current TPU frequency
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_get_clk_tpu_freq(bm_handle_t handle, int *freq);
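
/*
 * Illustrative sketch: query the current TPU clock, then request a new one
 * (the setter is PCIE mode only). The 550 value and the MHz unit are
 * assumptions for the example.
 *
 *   int freq = 0;
 *   if (bm_get_clk_tpu_freq(handle, &freq) == BM_SUCCESS)
 *     printf("current tpu freq: %d\n", freq);
 *   bm_set_clk_tpu_freq(handle, 550);
 */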

/*******************misc functions ********************************************/
struct bm_misc_info {
  int pcie_soc_mode;  /* 0---pcie; 1---soc */
  int ddr_ecc_enable; /* 0---disable; 1---enable */
  long long ddr0a_size;
  long long ddr0b_size;
  long long ddr1_size;
  long long ddr2_size;
  unsigned int chipid;
#define BM1682_CHIPID_BIT_MASK (0X1 << 0)
#define BM1684_CHIPID_BIT_MASK (0X1 << 1)
#define BM1686_CHIPID_BIT_MASK (0X1 << 2)
#ifdef __linux__
  unsigned long chipid_bit_mask;
#else
  unsigned long long chipid_bit_mask;
#endif
  unsigned int driver_version;
  int domain_bdf;
  int board_version; /* hardware board version: [23:16]-mcu sw version, [15:8]-board type, [7:0]-hw version */
  int a53_enable;
  int dyn_enable;
};

/**
 * @name bm_get_misc_info
 * @brief To get miscellaneous information of the device
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [out] pmisc_info The fetched misc info
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_get_misc_info(bm_handle_t handle, struct bm_misc_info *pmisc_info);
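
/*
 * Illustrative sketch: distinguishing SoC and PCIE deployments at runtime
 * using the misc info fields and chipid masks defined above.
 *
 *   struct bm_misc_info info;
 *   if (bm_get_misc_info(handle, &info) == BM_SUCCESS) {
 *     if (info.pcie_soc_mode == 1)
 *       printf("running in SoC mode\n");
 *     if (info.chipid_bit_mask & BM1684_CHIPID_BIT_MASK)
 *       printf("BM1684 family chip\n");
 *   }
 */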

/**
 * @name bm_get_chipid
 * @brief To get the chipid of the device. (0x1682 / 0x1684 / 0x168?)
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [out] p_chipid The chip id of the device
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_get_chipid(bm_handle_t handle, unsigned int *p_chipid);

#define BMLIB_LOG_QUIET   -8
#define BMLIB_LOG_PANIC    0
#define BMLIB_LOG_FATAL    8
#define BMLIB_LOG_ERROR   16
#define BMLIB_LOG_WARNING 24
#define BMLIB_LOG_INFO    32
#define BMLIB_LOG_VERBOSE 40
#define BMLIB_LOG_DEBUG   48
#define BMLIB_LOG_TRACE   56

/**
 * @name bmlib_log_get_level
 * @brief To get the bmlib log level
 * @ingroup bmlib_log
 *
 * @param void
 * @retval The current bmlib log level
 */
DECL_EXPORT int bmlib_log_get_level(void);

/**
 * @name bmlib_log_set_level
 * @brief To set the bmlib log level
 * @ingroup bmlib_log
 *
 * @param [in] level The bmlib log level to set
 * @retval void
 */
DECL_EXPORT void bmlib_log_set_level(int level);
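
/*
 * Illustrative sketch: raising bmlib verbosity for a debugging session and
 * restoring it afterwards, using the level macros defined above.
 *
 *   int old_level = bmlib_log_get_level();
 *   bmlib_log_set_level(BMLIB_LOG_DEBUG);
 *   // ... reproduce the problem ...
 *   bmlib_log_set_level(old_level);
 */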

/**
 * @name bmlib_log_set_callback
 * @brief To set a callback to get bmlib log
 * @ingroup bmlib_log
 *
 * @param [in] callback The callback function to get bmlib log
 * @retval void
 */
DECL_EXPORT void bmlib_log_set_callback(void (*callback)(const char*, int, const char*, va_list args));

/**
 * @name bm_set_debug_mode
 * @brief To set the debug mode of the TPU firmware log
 * @ingroup bmlib_log
 *
 * @param [in] handle The device handle
 * @param [in] mode The debug mode of the fw log, 0/1 for disable/enable log
 * @retval void
 */
DECL_EXPORT void bm_set_debug_mode(bm_handle_t handle, int mode);

/**
 * @name bmlib_api_dbg_callback
 * @brief To set a debug callback to get the firmware log
 * @ingroup bmlib_log
 *
 * @param [in] bmlib_api_dbg_callback callback to get the firmware log
 * @retval void
 */
typedef void (*bmlib_api_dbg_callback)(int, int, int, const char*);
// api, result, duration, log; the third int is reserved for api duration
DECL_EXPORT void bmlib_set_api_dbg_callback(bmlib_api_dbg_callback callback);

/**
 * @name bmcpu_get_cpu_status
 * @brief Get bmcpu status
 * @ingroup bmlib_log
 *
 * @param [in] handle The device handle
 * @retval BMCPU_RUNNING bmcpu is running.
 *         Other code Fails.
 */
DECL_EXPORT bm_cpu_status_t bmcpu_get_cpu_status(bm_handle_t handle);

/**
 * @name bmcpu_start_cpu
 * @brief Start cpu in pcie mode
 * @ingroup bmlib_log
 *
 * @param [in] handle The device handle
 * @param [in] boot_file Fip file
 * @param [in] core_file Itb file
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bmcpu_start_cpu(bm_handle_t handle, char *boot_file, char *core_file);

/**
 * @name bmcpu_open_process
 * @brief Open a process to do some work
 * @ingroup bmlib_log
 *
 * @param [in] handle The device handle
 * @param [in] flags Process flags
 * @param [in] timeout Timeout value in millisecond, -1 means default value of this device
 * @retval >= 0 process handle
 *         < 0 Fails.
 */
DECL_EXPORT int bmcpu_open_process(bm_handle_t handle, unsigned int flags, int timeout);

/**
 * @name bmcpu_load_library
 * @brief Load a shared library (.so) into a specific process
 * @ingroup bmlib_log
 *
 * @param [in] handle The device handle
 * @param [in] process_handle Process handle
 * @param [in] library_file Library file path
 * @param [in] timeout Timeout value in millisecond, -1 means default value of this device
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bmcpu_load_library(bm_handle_t handle, int process_handle, char *library_file, int timeout);

/**
 * @name bmcpu_unload_library
 * @brief Unload a shared library (.so) from a specific process
 * @ingroup bmlib_log
 *
 * @param [in] handle The device handle
 * @param [in] process_handle Process handle
 * @param [in] library_file Library file path
 * @param [in] timeout Timeout value in millisecond, -1 means default value of this device
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bmcpu_unload_library(bm_handle_t handle, int process_handle, char *library_file, int timeout);

/**
 * @name bmcpu_exec_function
 * @brief Execute a specific function in a specific process
 * @ingroup bmlib_log
 *
 * @param [in] handle The device handle
 * @param [in] process_handle Process handle
 * @param [in] function_name Function name
 * @param [in] function_param Function parameters
 * @param [in] param_size Parameters size in bytes
 * @param [in] timeout Timeout value in millisecond, -1 means default value of this device
 * @retval 0 success.
 *         >0 code fails from bmlib
 *         <0 code fails from function
 */
DECL_EXPORT int bmcpu_exec_function(bm_handle_t handle,
                                    int process_handle,
                                    char *function_name,
                                    void *function_param,
                                    unsigned int param_size,
                                    int timeout);

#define BMCPU_EXEC_OPT_NO_FLUSH_CACHE 1
/**
 * @name bmcpu_exec_function_ext
 * @brief Execute a specific function in a specific process
 * @ingroup bmlib_log
 *
 * @param [in] handle The device handle
 * @param [in] process_handle Process handle
 * @param [in] function_name Function name
 * @param [in] function_param Function parameters
 * @param [in] param_size Parameters size in bytes
 * @param [in] opt exec options
 * @param [in] timeout Timeout value in millisecond, -1 means default value of this device
 * @retval 0 success.
 *         >0 code fails from bmlib
 *         <0 code fails from function
 */
DECL_EXPORT int bmcpu_exec_function_ext(bm_handle_t handle,
                                        int process_handle,
                                        char *function_name,
                                        void *function_param,
                                        unsigned int param_size,
                                        unsigned int opt,
                                        int timeout);

/**
 * @name bmcpu_exec_function_async
 * @brief Execute a specific function in a specific process asynchronously;
 *        the user should use bmcpu_query_exec_function_result to query the result
 * @ingroup bmlib_log
 *
 * @param [in] handle The device handle
 * @param [in] process_handle Process handle
 * @param [in] function_name Function name
 * @param [in] function_param Function param
 * @param [in] param_size Param size in bytes
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bmcpu_exec_function_async(bm_handle_t handle,
                                                  int process_handle,
                                                  char *function_name,
                                                  void *function_param,
                                                  unsigned int param_size,
                                                  unsigned long long *api_handle);

/**
 * @name bmcpu_exec_function_async_ext
 * @brief Execute a specific function in a specific process asynchronously;
 *        the user should use bmcpu_query_exec_function_result to query the result
 * @ingroup bmlib_log
 *
 * @param [in] handle The device handle
 * @param [in] process_handle Process handle
 * @param [in] function_name Function name
 * @param [in] function_param Function param
 * @param [in] param_size Param size in bytes
 * @param [in] opt exec options
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bmcpu_exec_function_async_ext(bm_handle_t handle,
                                                      int process_handle,
                                                      char *function_name,
                                                      void *function_param,
                                                      unsigned int param_size,
                                                      unsigned int opt,
                                                      unsigned long long *api_handle);

/**
 * @name bmcpu_query_exec_function_result
 * @brief Query the result of a function called by bmcpu_exec_function_async
 * @ingroup bmlib_log
 *
 * @param [in] handle The device handle
 * @param [in] api_handle Api handle returned by bmcpu_exec_function_async
 * @param [in] timeout Timeout value in millisecond, -1 means default value of this device
 * @retval 0 success.
 *         >0 code fails from bmlib
 *         <0 code fails from function
 */
DECL_EXPORT int bmcpu_query_exec_function_result(bm_handle_t handle, unsigned long long api_handle, int timeout);
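
/*
 * Illustrative end-to-end sketch of the bmcpu flow declared above. The file
 * paths, library name, and function symbol are placeholders (assumptions),
 * and error handling is omitted for brevity.
 *
 *   bmcpu_start_cpu(handle, "fip.bin", "ramboot.itb");     // example paths
 *   int proc = bmcpu_open_process(handle, 0, -1);
 *   bmcpu_load_library(handle, proc, "libmywork.so", -1);  // hypothetical .so
 *   int my_param = 42;
 *   int ret = bmcpu_exec_function(handle, proc, "my_func", // hypothetical symbol
 *                                 &my_param, sizeof(my_param), -1);
 *   bmcpu_close_process(handle, proc, -1);
 */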

/**
 * @name bmcpu_map_phys_addr
 * @brief Map a physical address in a specific process
 * @ingroup bmlib_log
 *
 * @param [in] handle The device handle
 * @param [in] process_handle Process handle
 * @param [in] phys_addr Physical address
 * @param [in] size Map size in bytes
 * @param [in] timeout Timeout value in millisecond, -1 means default value of this device
 * @retval >0 virtual address
 *         0 fails
 */
DECL_EXPORT void *bmcpu_map_phys_addr(bm_handle_t handle, int process_handle, void *phys_addr, unsigned int size, int timeout);

/**
 * @name bmcpu_unmap_phys_addr
 * @brief Unmap a physical address in a specific process
 * @ingroup bmlib_log
 *
 * @param [in] handle The device handle
 * @param [in] process_handle Process handle
 * @param [in] phys_addr Physical address
 * @param [in] timeout Timeout value in millisecond, -1 means default value of this device
 * @retval <0 fail
 *         0 success
 */
DECL_EXPORT bm_status_t bmcpu_unmap_phys_addr(bm_handle_t handle, int process_handle, void *phys_addr, int timeout);

/**
 * @name bmcpu_close_process
 * @brief Close a process
 * @ingroup bmlib_log
 *
 * @param [in] handle The device handle
 * @param [in] process_handle Process handle
 * @param [in] timeout Timeout value in millisecond, -1 means default value of this device
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bmcpu_close_process(bm_handle_t handle, int process_handle, int timeout);

/**
 * @name bmcpu_reset_cpu
 * @brief Reset cpu in pcie mode
 * @ingroup bmlib_log
 *
 * @param [in] handle The device handle
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bmcpu_reset_cpu(bm_handle_t handle);

/**
 * @name bm_enable_perf_monitor
 * @brief enable the perf monitor to get gdma and tpu performance data
 * @ingroup bmlib_perf
 *
 * @param [in] handle The device handle
 * @param [in] perf_monitor The perf monitor to use
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_enable_perf_monitor(bm_handle_t handle, bm_perf_monitor_t *perf_monitor);

/**
 * @name bm_disable_perf_monitor
 * @brief disable the perf monitor that gets gdma and tpu performance data
 * @ingroup bmlib_perf
 *
 * @param [in] handle The device handle
 * @param [in] perf_monitor The perf monitor to use
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_disable_perf_monitor(bm_handle_t handle, bm_perf_monitor_t *perf_monitor);

/**
 * @name bmcpu_set_log
 * @brief Set cpu log options
 * @ingroup bmlib_log
 *
 * @param [in] handle The device handle
 * @param [in] log_level 0: DEBUG 1: INFO 2: WARN 3: ERROR 4: FATAL
 * @param [in] log_to_console 1: YES 0: NO
 * @param [in] timeout Timeout value in millisecond, -1 means default value of this device
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bmcpu_set_log(bm_handle_t handle, unsigned int log_level, unsigned int log_to_console, int timeout);

/**
 * @name bmcpu_get_log
 * @brief Get the cpu log file
 * @ingroup bmlib_log
 *
 * @param [in] handle The device handle
 * @param [in] process_handle Process handle
 * @param [in] log_file File path to save the log
 * @param [in] timeout Timeout value in millisecond, -1 means default value of this device
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bmcpu_get_log(bm_handle_t handle, int process_handle, char *log_file, int timeout);

/**
 * @name bmcpu_sync_time
 * @brief Sync device cpu time with host
 * @ingroup bmlib_log
 *
 * @param [in] handle The device handle
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bmcpu_sync_time(bm_handle_t handle);

/*******************trace and profile related functions ***********************/
struct bm_heap_stat {
  unsigned int mem_total;
  unsigned int mem_avail;
  unsigned int mem_used;
};

typedef struct bm_heap_stat_byte {
  unsigned int heap_id;
  unsigned long long mem_total;
  unsigned long long mem_avail;
  unsigned long long mem_used;
  unsigned long long mem_start_addr;
} bm_heap_stat_byte_t;

typedef struct bm_dev_stat {
  int mem_total;
  int mem_used;
  int tpu_util;
  int heap_num;
  struct bm_heap_stat heap_stat[4];
} bm_dev_stat_t;

/**
 * @name bm_get_stat
 * @brief To get the stat data at the moment
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [out] stat The result stat data
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_get_stat(bm_handle_t handle, bm_dev_stat_t *stat);
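
/*
 * Illustrative sketch: printing overall memory usage and per-heap stats from
 * the structs defined above. The printf formats are assumptions.
 *
 *   bm_dev_stat_t stat;
 *   if (bm_get_stat(handle, &stat) == BM_SUCCESS) {
 *     printf("mem used %d / total %d, tpu util %d%%\n",
 *            stat.mem_used, stat.mem_total, stat.tpu_util);
 *     for (int i = 0; i < stat.heap_num && i < 4; ++i)
 *       printf("heap %d: avail %u\n", i, stat.heap_stat[i].mem_avail);
 *   }
 */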

/**
 * @name bm_get_gmem_heap_id
 * @brief To get the heap id of allocated global memory
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] pmem The allocated global memory
 * @param [out] heapid The returned heap id
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */

DECL_EXPORT bm_status_t bm_get_gmem_heap_id(bm_handle_t handle, bm_device_mem_t *pmem, unsigned int *heapid);

/**
 * @name sg_get_gmem_heap_id
 * @brief To get the heap id of allocated global memory
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] pmem The allocated global memory
 * @param [out] heapid The returned heap id
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */

DECL_EXPORT bm_status_t sg_get_gmem_heap_id(bm_handle_t handle, sg_device_mem_t *pmem, unsigned int *heapid);

/**
 * @name bm_get_gmem_total_heap_num
 * @brief To get the total heap num of global memory
 * @ingroup bmlib_runtime
 *
 * @param [out] heap_num The returned total heap number
 * @param [in] handle The device handle
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_get_gmem_total_heap_num(bm_handle_t handle, unsigned int *heap_num);

/**
 * @name bm_get_gmem_heap_stat_byte_by_id
 * @brief To get the heap stat by heap id
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] heap_id The heap index to get heap status
 * @param [out] pheap_byte The returned heap status
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_get_gmem_heap_stat_byte_by_id(bm_handle_t handle, bm_heap_stat_byte_t *pheap_byte, unsigned int heap_id);

DECL_EXPORT bm_status_t bm_load_firmware(
    bm_handle_t handle,
    const char *firmware_tcm,
    const char *firmware_ddr);

#define bmkernel_load_firmware okkernel_load_firmware
DECL_EXPORT bm_status_t okkernel_load_firmware(
    bm_handle_t handle,
    const char *firmware_tcm,
    const char *firmware_ddr);

DECL_EXPORT bm_status_t okkernel_launch_async(
    bm_handle_t handle,
    const char *func_name,
    const void *args,
    unsigned int size);

DECL_EXPORT bm_status_t okkernel_launch_sync(
    bm_handle_t handle,
    const char *func_name,
    const void *args,
    unsigned int size);

DECL_EXPORT bm_status_t tpu_kernel_launch_sync(
    bm_handle_t handle,
    const char *func_name,
    const void *args,
    unsigned int size);

DECL_EXPORT bm_status_t okkernel_sync(bm_handle_t handle);
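
/*
 * Illustrative sketch (an assumption, not from this header): loading TPU
 * kernel firmware and then launching a named kernel function synchronously.
 * The firmware paths, function name, and argument struct are placeholders.
 *
 *   okkernel_load_firmware(handle, "tcm_firmware.bin", "ddr_firmware.bin");
 *   struct { int n; } args = { 16 };              // hypothetical kernel args
 *   tpu_kernel_launch_sync(handle, "my_kernel", &args, sizeof(args));
 */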

/**
 * @name bmkernel_launch
 * @brief send an api to the device and launch a function
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] args The api cmd struct pointer
 * @param [in] size The api cmd length
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bmkernel_launch(bm_handle_t handle, const void *args,
                                        unsigned int size);

/**
 * @name bmkernel_load_lookup_table
 * @brief load a lookup table into l2-sram
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] table The lookup table to load into l2-sram
 * @param [in] size The table size in bytes
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bmkernel_load_lookup_table(bm_handle_t handle, const void* table, unsigned int size);

/*******************device management api functions ***************************/
/**
 * @name bm_get_tpu_current
 * @brief get tpu current
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [out] tpuc The pointer for tpu current (mA)
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_get_tpu_current(bm_handle_t handle, unsigned int *tpuc);

/**
 * @name bm_get_board_max_power
 * @brief get the max power supported by the board
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [out] maxp The pointer for max power
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_get_board_max_power(bm_handle_t handle, unsigned int *maxp);

/**
 * @name bm_get_board_power
 * @brief get board power
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [out] boardp The pointer for board power
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_get_board_power(bm_handle_t handle, unsigned int *boardp);

/**
 * @name bm_get_fan_speed
 * @brief get board fan speed
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [out] fan The pointer for fan speed
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_get_fan_speed(bm_handle_t handle, unsigned int *fan);

/**
 * @name bm_get_ecc_correct_num
 * @brief get ecc_correct_num
 * @ingroup device management api
 *
 * @param [in] handle The device handle
 * @param [out] ecc_correct_num
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
#ifdef __linux__
DECL_EXPORT bm_status_t bm_get_ecc_correct_num(bm_handle_t handle, unsigned long *ecc_correct_num);
#else
DECL_EXPORT bm_status_t bm_get_ecc_correct_num(bm_handle_t handle, unsigned long long *ecc_correct_num);
#endif
/**
 * @name bm_get_12v_atx
 * @brief get atx_12v
 * @ingroup device management api
 *
 * @param [in] handle The device handle
 * @param [out] atx_12v
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_get_12v_atx(bm_handle_t handle, int *atx_12v);

/**
 * @name bm_get_product_sn
 * @brief get SE5 sn
 * @ingroup device management api
 *
 * @param [out] product_sn
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_get_product_sn(char *product_sn);

/**
 * @name bm_get_sn
 * @brief get sn
 * @ingroup device management api
 *
 * @param [in] handle The device handle
 * @param [out] sn
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_get_sn(bm_handle_t handle, char *sn);

/**
 * @name bm_get_status
 * @brief get chip status
 * @ingroup device management api
 *
 * @param [in] handle The device handle
 * @param [out] status The board error status; each bit represents an error state:
 *              status == 0x0, board is normal; status > 0, board is abnormal;
 *              bit0 == 1, tpu is hung
 *              bit1 == 1, pcie link abnormal
 *              bit2 == 1, board temperature is too high
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_get_status(bm_handle_t handle, int *status);

/**
 * @name bm_get_tpu_maxclk
 * @brief get tpu_maxclk
 * @ingroup device management api
 *
 * @param [in] handle The device handle
 * @param [out] tpu_maxclk
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_get_tpu_maxclk(bm_handle_t handle, unsigned int *tpu_maxclk);

/**
 * @name bm_get_tpu_minclk
 * @brief get tpu_minclk
 * @ingroup device management api
 *
 * @param [in] handle The device handle
 * @param [out] tpu_minclk
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_get_tpu_minclk(bm_handle_t handle, unsigned int *tpu_minclk);

/**
 * @name bm_get_driver_version
 * @brief get driver version
 * @ingroup device management api
 *
 * @param [in] handle The device handle
 * @param [out] driver_version
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_get_driver_version(bm_handle_t handle, int *driver_version);

/**
 * @name bm_get_board_name
 * @brief get device board name
 * @ingroup device management api
 *
 * @param [in] handle The device handle
 * @param [out] name The device board name
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_get_board_name(bm_handle_t handle, char *name);

/**
 * @name bm_get_board_temp
 * @brief get board temperature
 * @ingroup device management api
 *
 * @param [in] handle The device handle
 * @param [out] board_temp
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_get_board_temp(bm_handle_t handle, unsigned int *board_temp);

/**
 * @name bm_get_chip_temp
 * @brief get chip temperature
 * @ingroup device management api
 *
 * @param [in] handle The device handle
 * @param [out] chip_temp
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_get_chip_temp(bm_handle_t handle, unsigned int *chip_temp);

/**
 * @name bm_get_tpu_power
 * @brief get TPU power
 * @ingroup device management api
 *
 * @param [in] handle The device handle
 * @param [out] tpu_power
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_get_tpu_power(bm_handle_t handle, float *tpu_power);

/**
 * @name bm_get_tpu_volt
 * @brief get TPU voltage
 * @ingroup device management api
 *
 * @param [in] handle The device handle
 * @param [out] tpu_volt
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_get_tpu_volt(bm_handle_t handle, unsigned int *tpu_volt);

/**
 * @name bm_get_card_id
 * @brief get card id
 * @ingroup device management api
 *
 * @param [in] handle The device handle
 * @param [out] card_id
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_get_card_id(bm_handle_t handle, unsigned int *card_id);

/**
 * @name bm_get_card_num
 * @brief get card number
 * @ingroup device management api
 *
 * @param [out] card_num
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_get_card_num(unsigned int *card_num);

/**
 * @name bm_get_chip_num_from_card
 * @brief get chip number and start chip id from card
 * @ingroup device management api
 *
 * @param [in] card_id The card id
 * @param [out] chip_num
 * @param [out] dev_start_index
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_get_chip_num_from_card(unsigned int card_id, unsigned int *chip_num, unsigned int *dev_start_index);

/**
 * @name bm_get_dynfreq_status
 * @brief get chip dynamic freq status
 * @ingroup device management api
 *
 * @param [in] handle The device handle
 * @param [out] dynfreq_status
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_get_dynfreq_status(bm_handle_t handle, int *dynfreq_status);

/**
 * @name bm_change_dynfreq_status
 * @brief change (enable/disable) chip dynamic freq status
 * @ingroup device management api
 *
 * @param [in] handle The device handle
 * @param [in] new_status
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_change_dynfreq_status(bm_handle_t handle, int new_status);

/**
 * @name bm_get_tpu_scalar_num
 * @brief To get the core number of TPU scalar
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [out] core_num The core number of TPU scalar
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_get_tpu_scalar_num(bm_handle_t handle, unsigned int *core_num);

#define bm_get_tpu_core_num bm_get_tpu_scalar_num

#if defined(__cplusplus)
}
#endif

#endif /* BM_RUNTIME_H_ */
Baichuan2/src/include/bmruntime_interface.h
ADDED
@@ -0,0 +1,404 @@
/*****************************************************************************
 *
 * Copyright (c) 2016-2026 by Sophgo Technologies Inc. All rights reserved.
 *
 * The material in this file is confidential and contains trade secrets
 * of Sophgo Technologies Inc. This is proprietary information owned by
 * Sophgo Technologies Inc. No part of this work may be disclosed,
 * reproduced, copied, transmitted, or used in any way for any purpose,
 * without the express written permission of Sophgo Technologies Inc.
 *
 *****************************************************************************/

/*****************************************************************************
 * BMRuntime Interface is mainly for inference.
 * We can also use it for device computation from BMLang programming.
 * Note: please use the interface from bmlib_runtime.h for device memory operation.
 ****************************************************************************/

#ifndef BMRUNTIME_INTERFACE_H_
#define BMRUNTIME_INTERFACE_H_

#include "bmdef.h"

#ifdef _WIN32
#define DECL_EXPORT _declspec(dllexport)
#define DECL_IMPORT _declspec(dllimport)
#else
#define DECL_EXPORT
#define DECL_IMPORT
#endif

#if defined(__cplusplus)
extern "C" {
#endif

/* --------------------------------------------------------------------------*/
/* interface for basic data type */

/* get data type byte size */
DECL_EXPORT size_t bmrt_data_type_size(bm_data_type_t dtype);

/*
dims array to bm_shape_t;
shape and dims should not be NULL, num_dims should not be larger than BM_MAX_DIMS_NUM */
DECL_EXPORT void bmrt_shape(bm_shape_t* shape, const int* dims, int num_dims);

/*
number of shape elements; shape should not be NULL and num_dims should not be larger than
BM_MAX_DIMS_NUM */
DECL_EXPORT uint64_t bmrt_shape_count(const bm_shape_t* shape);
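
/*
 * Illustrative sketch: building a 4-D shape and counting its elements with
 * the two helpers above.
 *
 *   int dims[4] = {1, 3, 224, 224};
 *   bm_shape_t shape;
 *   bmrt_shape(&shape, dims, 4);
 *   uint64_t count = bmrt_shape_count(&shape);  // 1*3*224*224 = 150528
 */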

/* compare whether two shapes are the same */
DECL_EXPORT bool bmrt_shape_is_same(const bm_shape_t* left, const bm_shape_t* right);

/*
fill a tensor with data type and shape, with st_mode = 0 as default.
tensor and p_bmrt should not be NULL, shape count should not be 0.
it will alloc device mem to tensor->device_mem, so the user should call
bmrt_free_device(p_bmrt, tensor->device_mem) to free it.*/
DECL_EXPORT bool bmrt_tensor(bm_tensor_t* tensor, void* p_bmrt, bm_data_type_t dtype, bm_shape_t shape);

/*
fill a tensor with data type and shape, with st_mode = 0 as default.
tensor and p_bmrt should not be NULL, shape count should not be 0.
it will alloc device mem to tensor->device_mem on the devid-th device.*/
DECL_EXPORT bool bmrt_tensor_ex(bm_tensor_t* tensor, void* p_bmrt, int devid, bm_data_type_t dtype, bm_shape_t shape);

/* fill a tensor with existing device mem; tensor byte size should not be larger than device mem size */
DECL_EXPORT void bmrt_tensor_with_device(bm_tensor_t* tensor, bm_device_mem_t device_mem,
                                         bm_data_type_t dtype, bm_shape_t shape);

/* get tensor byte size, tensor should not be NULL */
DECL_EXPORT size_t bmrt_tensor_bytesize(const bm_tensor_t* tensor);

/* get tensor mem size allocated in device mem, tensor should not be NULL */
DECL_EXPORT size_t bmrt_tensor_device_size(const bm_tensor_t* tensor);

/* print net info for debug */
DECL_EXPORT void bmrt_print_network_info(const bm_net_info_t* net_info);

/* --------------------------------------------------------------------------*/
/**
 * @name bmrt_create
 * @brief To create the bmruntime with bm_handle.
 * @ingroup bmruntime
 *
 * This API creates the bmruntime. It returns a void* pointer, which is the
 * pointer of the bmruntime. The device id is set when getting bm_handle.
 *
 * @param [in] bm_handle bm handle. It must be initialized by using bmlib.
 *
 * @retval void* the pointer of the bmruntime
 */
DECL_EXPORT void* bmrt_create(bm_handle_t bm_handle);

/* --------------------------------------------------------------------------*/
/**
 * @name bmrt_create_ex
 * @brief To create the bmruntime with one or more bm_handle.
 * @ingroup bmruntime
 *
 * This API creates the bmruntime. It returns a void* pointer, which is the
 * pointer of the bmruntime.
 *
 * @param [in] bm_handles bm handles. They must be initialized by using bmlib.
 * @param [in] num_handles number of bm_handles.
 *
 * @retval void* the pointer of the bmruntime
 */
DECL_EXPORT void *bmrt_create_ex(bm_handle_t *bm_handles, int num_handles);

/**
 * @name bmrt_destroy
 * @brief To destroy the bmruntime pointer
 * @ingroup bmruntime
 *
 * This API destroys the bmruntime.
 *
 * @param [in] p_bmrt Bmruntime that had been created
 */
DECL_EXPORT void bmrt_destroy(void* p_bmrt);

/**
 * @name bmrt_get_bm_handle
 * @brief To get the BM runtime context.
 * @ingroup bmruntime
 *
 * This API gets the BM runtime context for using BMDNN, BMCV or BMLIB.
 *
 * @param [in] p_bmrt Bmruntime that had been created
 */
DECL_EXPORT void * bmrt_get_bm_handle(void* p_bmrt);

/**
 * @name bmrt_load_bmodel
 * @brief To load the bmodel which is created by the BM compiler
 * @ingroup bmruntime
 *
 * This API is to load a bmodel created by the BM compiler.
 * After loading the bmodel, we can run the inference of the neuron network.
 *
 * @param [in] p_bmrt Bmruntime that had been created
 * @param [in] bmodel_path Bmodel file path.
 *
 * @retval true Load context success.
 * @retval false Load context failed.
 */
DECL_EXPORT bool bmrt_load_bmodel(void* p_bmrt, const char *bmodel_path);

/**
 * @name bmrt_load_bmodel_data
 * @brief To load the bmodel which is created by the BM compiler from a buffer
 * @ingroup bmruntime
 *
 * This API is to load a bmodel created by the BM compiler.
 * After loading the bmodel, we can run the inference of the neuron network.
 * Different from bmrt_load_bmodel, the bmodel is data in host memory.
 *
 * @param [in] p_bmrt Bmruntime that had been created
 * @param [in] bmodel_data Bmodel data pointer to buffer
 * @param [in] size Bmodel data size
 *
 * @retval true Load context success.
 * @retval false Load context failed.
 */
DECL_EXPORT bool bmrt_load_bmodel_data(void* p_bmrt, const void * bmodel_data, size_t size);

/**
 * @name bmrt_show_neuron_network
 * @brief To print the names of all neuron networks
 * @ingroup bmruntime
 *
 * @param [in] p_bmrt Bmruntime that had been created
 */
DECL_EXPORT void bmrt_show_neuron_network(void* p_bmrt);

/**
 * @name bmrt_get_network_number
 * @brief To get the number of neuron networks in the bmruntime
 * @ingroup bmruntime
 *
 * @param [in] p_bmrt Bmruntime that had been created
 *
 * @retval int value The number of neuron networks.
 */
DECL_EXPORT int bmrt_get_network_number(void* p_bmrt);

/**
 * @name bmrt_get_network_names
 * @brief To get the names of all neuron networks in the bmruntime
 * @ingroup bmruntime
 *
 * @param [in] p_bmrt Bmruntime that had been created
 * @param [out] network_names The names of all neuron networks. It should be declared as
 *              (const char** networks_ = NULL) and used as the param &networks_. After
 *              this API, the user needs to free(networks_) when it is no longer needed.
 */
DECL_EXPORT void bmrt_get_network_names(void* p_bmrt, const char*** network_names);
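
/*
 * Illustrative sketch: enumerating the networks in a loaded bmodel, following
 * the ownership rule described above (the caller frees the name array).
 *
 *   const char **names = NULL;
 *   int num = bmrt_get_network_number(p_bmrt);
 *   bmrt_get_network_names(p_bmrt, &names);
 *   for (int i = 0; i < num; ++i)
 *     printf("net[%d] = %s\n", i, names[i]);
 *   free(names);
 */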

/**
 * @name bmrt_get_network_info
 * @brief To get network info by net name
 * @ingroup bmruntime
 *
 * @param [in] p_bmrt Bmruntime that had been created
 * @param [in] net_name Network name
 *
 * @retval bm_net_info_t* Pointer to net info; it need not be freed by the user.
 *         If the net name is not found, NULL is returned.
 */
DECL_EXPORT const bm_net_info_t* bmrt_get_network_info(void* p_bmrt, const char* net_name);

/**
 * @name bmrt_launch_tensor
 * @brief To launch the inference of the neuron network with setting input tensors
 * @ingroup bmruntime
 *
 * This API supports neuron networks that are static-compiled or dynamic-compiled.
 * After calling this API, inference on the TPU is launched and the CPU program will
 * not be blocked. bm_thread_sync should be called to make sure inference is finished.
 * This API supports multiple inputs and is thread-safe.
 *
 * @param [in] p_bmrt Bmruntime that had been created
 * @param [in] net_name The name of the neuron network
 * @param [in] input_tensors Array of input tensors, defined like bm_tensor_t input_tensors[input_num].
 *             The user should initialize each input tensor.
 * @param [in] input_num Input number
 * @param [out] output_tensors Array of output tensors, defined like bm_tensor_t output_tensors[output_num].
 *              This interface will alloc device mem to store the output data. The user should
 *              free each device mem by bm_free_device after the result data is no longer used.
 * @param [in] output_num Output number
 *
 * @retval true Launch success.
 * @retval false Launch failed.
 */
DECL_EXPORT bool bmrt_launch_tensor(void* p_bmrt, const char * net_name, const bm_tensor_t input_tensors[], int input_num,
                                    bm_tensor_t output_tensors[], int output_num);
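
/*
 * Illustrative end-to-end inference sketch using the APIs above together with
 * bmlib. The bmodel path and net name are placeholders, the net is assumed to
 * have one input and one output, and error handling is omitted. The
 * bm_net_info_t field accesses follow the definitions assumed from bmdef.h.
 *
 *   bm_handle_t handle;
 *   bm_dev_request(&handle, 0);
 *   void *p_bmrt = bmrt_create(handle);
 *   bmrt_load_bmodel(p_bmrt, "model.bmodel");                // placeholder path
 *   const bm_net_info_t *info = bmrt_get_network_info(p_bmrt, "net0"); // placeholder name
 *   bm_tensor_t in, out;
 *   bmrt_tensor(&in, p_bmrt, info->input_dtypes[0], info->stages[0].input_shapes[0]);
 *   bm_memcpy_s2d(handle, in.device_mem, host_input);        // host_input prepared by caller
 *   bmrt_launch_tensor(p_bmrt, "net0", &in, 1, &out, 1);
 *   bm_thread_sync(handle);                                  // wait for inference
 *   bm_memcpy_d2s(handle, host_output, out.device_mem);
 *   bm_free_device(handle, in.device_mem);
 *   bm_free_device(handle, out.device_mem);
 *   bmrt_destroy(p_bmrt);
 *   bm_dev_free(handle);
 */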

/**
 * @name bmrt_launch_tensor_ex
 * @brief To launch the inference of the neuron network with setting input tensors
 * @ingroup bmruntime
 *
 * This API supports neuron networks that are static-compiled or dynamic-compiled.
 * After calling this API, inference on the TPU is launched and the CPU program will
 * not be blocked. bm_thread_sync should be called to make sure inference is finished.
 * This API supports multiple inputs and is thread-safe.
 *
 * @param [in] p_bmrt Bmruntime that had been created
 * @param [in] net_name The name of the neuron network
 * @param [in] input_tensors Array of input tensors, defined like bm_tensor_t input_tensors[input_num].
 *             The user should initialize each input tensor.
 * @param [in] input_num Input number
 * @param [out] output_tensors Array of output tensors, defined like bm_tensor_t output_tensors[output_num].
 *              The user can set device_mem or stmode of the output tensors. If user_mem is true,
 *              this interface will use the device mem of output_tensors to store output data and
 *              will not alloc device mem; otherwise it will alloc device mem to store the output.
 *              If user_stmode is true, it will use the stmode in each output tensor; otherwise
 *              stmode will be BM_STORE_1N as default.
 * @param [in] output_num Output number
 * @param [in] user_mem whether device_mem of the output tensors is set
 * @param [in] user_stmode whether stmode of the output tensors is set
 *
 * @retval true Launch success.
 * @retval false Launch failed.
 */
DECL_EXPORT bool bmrt_launch_tensor_ex(void* p_bmrt, const char * net_name, const bm_tensor_t input_tensors[], int input_num,
                                       bm_tensor_t output_tensors[], int output_num, bool user_mem, bool user_stmode);

/**
 * @name bmrt_launch_data
 * @brief To launch the inference of the neuron network with setting input data in system memory
 * @ingroup bmruntime
 *
 * This API supports neuron networks that are static-compiled or dynamic-compiled.
 * After calling this API, inference on the TPU is launched and the CPU program will be blocked.
 * This API supports multiple inputs and is thread-safe.
 *
 * @param [in] p_bmrt Bmruntime that had been created
 * @param [in] net_name The name of the neuron network
 * @param [in] input_datas Array of input data, defined like void * input_datas[input_num].
 *             The user should initialize each data pointer as input.
 * @param [in] input_shapes Array of input shapes, defined like bm_shape_t input_shapes[input_num].
 *             The user should set each input shape.
 * @param [in] input_num Input number
 * @param [out] output_datas Array of output data, defined like void * output_datas[output_num].
 *              If the user doesn't alloc each output data, set user_mem to false and this api
 *              will alloc output mem; the user should free each output mem when the output data
 *              is no longer used. Alternatively, the user can alloc system memory for each
 *              output data and set user_mem = true.
 * @param [out] output_shapes Array of output shapes, defined like bm_shape_t output_shapes[output_num].
 *              It will store each output shape.
 * @param [in] output_num Output number
 * @param [in] user_mem whether output_datas[i] have allocated memory
 *
 * @retval true Launch success.
 * @retval false Launch failed.
 */
DECL_EXPORT bool bmrt_launch_data(void* p_bmrt, const char* net_name, void* const input_datas[],
                                  const bm_shape_t input_shapes[], int input_num, void * output_datas[],
                                  bm_shape_t output_shapes[], int output_num, bool user_mem);

/**
 * @name bmrt_trace
 * @brief To check the runtime environment and collect info for DEBUG
 * @ingroup bmruntime
 *
 * This API is to collect runtime info for DEBUG. Especially when a launch result suddenly goes
 * wrong, calling bmrt_trace will show whether device mems are broken, along with other check info.
 *
 * @param [in] p_bmrt Bmruntime that had been created
 */
DECL_EXPORT void bmrt_trace(void* p_bmrt);

/**
 * @name bmrt_launch_tensor_multi_cores
 * @brief To launch the inference of the neuron network with setting input tensors, with support for multi-core inference.
 * @ingroup bmruntime
 *
 * This API supports neuron networks that are static-compiled or dynamic-compiled.
 * After calling this API, inference on the TPU is launched and the CPU program will not
 * be blocked. bm_thread_sync_from_core should be called to make sure inference is finished.
 * This API supports multiple inputs and is thread-safe.
 *
 * @param [in] p_bmrt Bmruntime that had been created
 * @param [in] net_name The name of the neuron network
 * @param [in] input_tensors Array of input tensors, defined like bm_tensor_t input_tensors[input_num].
 *             The user should initialize each input tensor.
 * @param [in] input_num Input number
 * @param [out] output_tensors Array of output tensors, defined like bm_tensor_t output_tensors[output_num].
 *              The user can set device_mem or stmode of the output tensors. If user_mem is true,
 *              this interface will use the device mem of output_tensors to store output data and
 *              will not alloc device mem; otherwise it will alloc device mem to store the output.
 *              If user_stmode is true, it will use the stmode in each output tensor; otherwise
 *              stmode will be BM_STORE_1N as default.
 * @param [in] output_num Output number
 * @param [in] user_mem whether device_mem of the output tensors is set
 * @param [in] user_stmode whether stmode of the output tensors is set
 * @param [in] core_list list of core ids that will be used for inference
 * @param [in] core_num number of cores in the core list
 *
 * @retval true Launch success.
 * @retval false Launch failed.
 */
DECL_EXPORT bool bmrt_launch_tensor_multi_cores(
    void *p_bmrt,
    const char *net_name,
    const bm_tensor_t input_tensors[],
    int input_num,
    bm_tensor_t output_tensors[],
    int output_num,
    bool user_mem,
    bool user_stmode,
    const int *core_list,
    int core_num);

/**
 * @name bmrt_memcpy_s2d_parallel
 * @brief To copy data from system memory to multi-device memory in parallel
 * @ingroup bmruntime
 *
 * This API can only be used when p_bmrt was created with bmrt_create_ex on multiple devices.
 * After calling this API, datas[:tensor_num[0]] will be copied to the first device,
 * datas[tensor_num[0]:tensor_num[0]+tensor_num[1]] will be copied to the second device, and so on.
 * Copying data to different devices is done in parallel; copying to the same device is done in sequence.
 *
 * @param [in] p_bmrt Bmruntime that had been created with multi bm_handles
 * @param [in] tensors Array of tensors that will be copied to devices
 * @param [in] datas Array of data buffers allocated in system memory
 * @param [in] tensor_num Array of tensor numbers that will be copied to each device
 * @param [in] device_num Device number
 */
DECL_EXPORT bool bmrt_memcpy_s2d_parallel(
    void *p_bmrt,
    bm_tensor_t tensors[],
    void *datas[],
|
374 |
+
int tensor_num[],
|
375 |
+
int device_num);
|
376 |
+
|
377 |
+
/**
|
378 |
+
* @name bmrt_memcpy_d2s_parallel
|
379 |
+
* @brief To copy data from muti-devices memory to system memory in parallel
|
380 |
+
* @ingroup bmruntime
|
381 |
+
*
|
382 |
+
* This API only could be used when the p_bmrt is created with bmrt_create_ex on multi devices.
|
383 |
+
* After calling this API, tensors on the first device will be copied to datas[:tensor_num[0]] , and
|
384 |
+
* tensors on the second device will be copied to datas[tensor_num[0]:tensor_num[0]+tensor_num[1]] and so on.
|
385 |
+
* The process of copying data from different devices is done in parallel and from the same device is in sequence.
|
386 |
+
*
|
387 |
+
* @param [in] p_bmrt Bmruntime that had been created with multi bm_handles
|
388 |
+
* @param [in] datas Array of satas allocated in system memory
|
389 |
+
* @param [in] tensors Array of tensors that will be copied from devices
|
390 |
+
* @param [in] tensor_num Array of tensor_num that will be copied from each device
|
391 |
+
* @param [in] device_num Device number
|
392 |
+
*/
|
393 |
+
DECL_EXPORT bool bmrt_memcpy_d2s_parallel(
|
394 |
+
void *p_bmrt,
|
395 |
+
void *datas[],
|
396 |
+
bm_tensor_t tensors[],
|
397 |
+
int tensor_num[],
|
398 |
+
int device_num);
|
399 |
+
|
400 |
+
#if defined (__cplusplus)
|
401 |
+
}
|
402 |
+
#endif
|
403 |
+
|
404 |
+
#endif
|
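The launch APIs above are easiest to read next to a concrete call sequence. The sketch below shows the synchronous single-net flow that the demos in this repo follow (request device, create runtime, load bmodel, copy in, launch, sync, copy out). It is a minimal sketch, assuming a bmodel that contains a net named "demo_net" with one input and one output; that net name and layout are illustrative assumptions, not part of the API.

// Minimal sketch of the synchronous launch flow; "demo_net" is hypothetical.
#include <assert.h>
#include "bmruntime_interface.h"

void run_once(int devid, const char *bmodel, void *in_sys, void *out_sys) {
  bm_handle_t handle;
  assert(BM_SUCCESS == bm_dev_request(&handle, devid));
  void *p_bmrt = bmrt_create(handle);
  assert(bmrt_load_bmodel(p_bmrt, bmodel));

  const bm_net_info_t *net = bmrt_get_network_info(p_bmrt, "demo_net");
  bm_tensor_t in, out;
  assert(bmrt_tensor(&in, p_bmrt, net->input_dtypes[0],
                     net->stages[0].input_shapes[0]));
  assert(bmrt_tensor(&out, p_bmrt, net->output_dtypes[0],
                     net->stages[0].output_shapes[0]));

  bm_memcpy_s2d(handle, in.device_mem, in_sys);   // host -> device
  assert(bmrt_launch_tensor_ex(p_bmrt, "demo_net", &in, 1, &out, 1,
                               true /*user_mem*/, false /*user_stmode*/));
  bm_thread_sync(handle);                         // launch is asynchronous
  bm_memcpy_d2s(handle, out_sys, out.device_mem); // device -> host

  bm_free_device(handle, in.device_mem);
  bm_free_device(handle, out.device_mem);
  bmrt_destroy(p_bmrt);
  bm_dev_free(handle);
}

The same pattern, with per-layer nets and a KV cache, is what chat.cpp below builds on.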
Baichuan2/src/include/sentencepiece/sentencepiece_processor.h
ADDED
@@ -0,0 +1,727 @@
// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef SENTENCEPIECE_PROCESSOR_H_
#define SENTENCEPIECE_PROCESSOR_H_

#include <cstring>
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>

#ifndef SWIG
namespace absl {
using std::string_view;
}  // namespace absl
#endif  // SWIG

namespace sentencepiece {
namespace util {

enum class StatusCode : int {
  kOk = 0,
  kCancelled = 1,
  kUnknown = 2,
  kInvalidArgument = 3,
  kDeadlineExceeded = 4,
  kNotFound = 5,
  kAlreadyExists = 6,
  kPermissionDenied = 7,
  kResourceExhausted = 8,
  kFailedPrecondition = 9,
  kAborted = 10,
  kOutOfRange = 11,
  kUnimplemented = 12,
  kInternal = 13,
  kUnavailable = 14,
  kDataLoss = 15,
  kUnauthenticated = 16,
};

class Status {
 public:
  Status();
  ~Status();
  Status(StatusCode code, absl::string_view error_message);
  Status(const Status &s);
  void operator=(const Status &s);
  bool operator==(const Status &s) const;
  bool operator!=(const Status &s) const;
  inline bool ok() const { return rep_ == nullptr; }

  void set_error_message(const char *str);
  const char *error_message() const;
  const char *message() const { return error_message(); }
  StatusCode code() const;
  std::string ToString() const;

  void IgnoreError();

 private:
  struct Rep;
  std::unique_ptr<Rep> rep_;
};
}  // namespace util

// SentencePieceProcessor:
// Simple and language independent tokenizer and de-tokenizer for
// Neural Network Machine Translation.
//
// SentencePieceProcessor provides Encode() and Decode() methods,
// which correspond to tokenization and de-tokenization respectively.
//
// - Encode:
//   Given a raw source sentence, encode it into a sequence
//   of pieces or vocabulary ids.
//
// - Decode:
//   Given a sequence of pieces or vocabulary ids, decode it
//   into a de-tokenized raw sentence.
//
// SentencePieceProcessor provides a lossless data conversion
// that allows the original raw sentence to be perfectly reconstructed
// from the encoded data, i.e., Decode(Encode(input)) == input.
// This characteristic is useful, as it makes the de-tokenization
// completely language independent.
//
// Usage:
//   SentencePieceProcessor sp;
//   sp.Load("//path/to/model");
//
//   vector<string> sps;
//   sp.Encode("hello world.", &sps).IgnoreError();
//
//   vector<int> ids;
//   sp.Encode("hello world.", &ids).IgnoreError();
//
//   string detok;
//   sp.Decode(sps, &detok);
//   CHECK_EQ("hello world.", detok).IgnoreError();
//
//   sp.Decode(ids, &detok);
//   CHECK_EQ("hello world.", detok).IgnoreError();
//
// We can also use SentencePieceText which manages the byte-offsets
// between user input (output) and internal sentence pieces.
//
//   SentencePieceText spt;
//   sp.Encode("hello world.", &spt);
//   // Emits the byte range of each piece.
//   for (const auto &piece : spt.pieces()) {
//     LOG(INFO) << piece.begin() << " " << piece.end();
//   }
//
//   sp.Decode({0, 1, 2, 3..}, &spt);
//   for (const auto &piece : spt.pieces()) {
//     LOG(INFO) << piece.begin() << " " << piece.end();
//   }
//

class NBestSentencePieceText;
class ModelInterface;
class SentencePieceText;
class ModelProto;

namespace normalizer {
class Normalizer;
}  // namespace normalizer

#ifndef SWIGGO
namespace util {
// Redefine std::string for serialized_proto interface as Python's string is
// a Unicode string. We can enforce the return value to be a raw byte sequence
// with SWIG's typemap.
using bytes = std::string;
}  // namespace util
#endif  // SWIGGO

class NBestSentencePieceText;
class ModelInterface;
class SentencePieceText;
class SentencePieceText_SentencePiece;

// Wrapper class of SentencePieceText
// This wrapper only allows an immutable access to the proto and
// hides the actual implementation of protobuf.
// See sentencepiece.proto for the details of this class.
class ImmutableSentencePieceText_ImmutableSentencePiece {
 public:
  ImmutableSentencePieceText_ImmutableSentencePiece();
  ~ImmutableSentencePieceText_ImmutableSentencePiece() = default;

  const std::string &piece() const;
  const std::string &surface() const;
  uint32_t id() const;
  uint32_t begin() const;
  uint32_t end() const;

  friend class ImmutableSentencePieceText;

 private:
  explicit ImmutableSentencePieceText_ImmutableSentencePiece(
      const SentencePieceText_SentencePiece &sp);
  const SentencePieceText_SentencePiece *sp_ = nullptr;
};

class ImmutableSentencePieceText {
 public:
  ImmutableSentencePieceText();
  virtual ~ImmutableSentencePieceText();

  std::vector<ImmutableSentencePieceText_ImmutableSentencePiece> pieces() const;

  size_t pieces_size() const;
  ImmutableSentencePieceText_ImmutableSentencePiece pieces(int index) const;

  const std::string &text() const;
  float score() const;

  util::bytes SerializeAsString() const;

  // Returns the actual mutable proto.
  // Do not use this outside of SentencePieceProcessor, as
  // it returns the raw pointer managed by the shared_ptr.
  SentencePieceText *mutable_proto();

  // Converts the utf8 byte spans into Unicode char spans.
  void ConvertToUnicodeSpans();

  friend class ImmutableNBestSentencePieceText;

 private:
  explicit ImmutableSentencePieceText(const SentencePieceText &spt);
  const SentencePieceText *spt_ = nullptr;
  std::shared_ptr<SentencePieceText> rep_;
};

// Wrapper class of SentencePieceText
// This wrapper only allows an immutable access to the proto and
// hides the actual implementation of protobuf.
// See sentencepiece.proto for the details of this class.
class ImmutableNBestSentencePieceText {
 public:
  ImmutableNBestSentencePieceText();
  virtual ~ImmutableNBestSentencePieceText();

  std::vector<ImmutableSentencePieceText> nbests() const;

  size_t nbests_size() const;
  ImmutableSentencePieceText nbests(int index) const;

  util::bytes SerializeAsString() const;

  // Returns the actual mutable proto.
  // Do not use this outside of SentencePieceProcessor, as
  // it returns the raw pointer managed by the shared_ptr.
  NBestSentencePieceText *mutable_proto();

  void ConvertToUnicodeSpans();

 private:
  std::shared_ptr<NBestSentencePieceText> rep_;
};

class SentencePieceProcessor {
 public:
  SentencePieceProcessor();
  virtual ~SentencePieceProcessor();

  // Loads model from `filename`.
  // Returns false if `filename` cannot be loaded.
  virtual util::Status Load(absl::string_view filename);

  // Loads model from `filename`.
  // Crashes if `filename` cannot be loaded.
  virtual void LoadOrDie(absl::string_view filename);

  // Loads model from `model_proto`.
  // `model_proto` is copied.
  virtual util::Status Load(const ModelProto &model_proto);

  // Loads model from `model_proto`.
  // `model_proto` is moved.
  virtual util::Status Load(std::unique_ptr<ModelProto> model_proto);

  // Loads model from `serialized`, which is a string-serialized model proto.
  // Useful to load the model from a platform independent blob object.
  virtual util::Status LoadFromSerializedProto(absl::string_view serialized);

  // Returns the status. Encode/Decode methods are valid when status is OK.
  virtual util::Status status() const;

  // Sets encode extra_option sequence.
  virtual util::Status SetEncodeExtraOptions(absl::string_view extra_option);

  // Sets decode extra_option sequence.
  virtual util::Status SetDecodeExtraOptions(absl::string_view extra_option);

  //////////////////////////////////////////////////////////////
  // Vocabulary restriction.
  // Background:
  // https://github.com/rsennrich/subword-nmt#best-practice-advice-for-byte-pair-encoding-in-nmt

  // Restricts the vocabulary set.
  // The input sentences are encoded into the tokens in `valid_vocab`.
  virtual util::Status SetVocabulary(
      const std::vector<absl::string_view> &valid_vocab);

  // Reverts the vocabulary restriction.
  virtual util::Status ResetVocabulary();

  // Loads the valid vocabulary set from `filename` in TSV format.
  // Format: <token> <tab> <freq>.
  // Any token with frequency < threshold will be treated as OOV.
  virtual util::Status LoadVocabulary(absl::string_view filename,
                                      int threshold);

  //////////////////////////////////////////////////////////////
  // Simple Encode and Decode API.
  //
  // Given a UTF8 input, encodes it into a sequence of sentence pieces.
  virtual util::Status Encode(absl::string_view input,
                              std::vector<std::string> *pieces) const;

  // Given a UTF8 input, encodes it into a sequence of ids.
  virtual util::Status Encode(absl::string_view input,
                              std::vector<int> *ids) const;

  // Given a sequence of pieces, decodes it into a detokenized output.
  virtual util::Status Decode(const std::vector<std::string> &pieces,
                              std::string *detokenized) const;

  // Given a sequence of pieces, decodes it into a detokenized output.
  virtual util::Status Decode(const std::vector<absl::string_view> &pieces,
                              std::string *detokenized) const;

  // Given a sequence of ids, decodes it into a detokenized output.
  virtual util::Status Decode(const std::vector<int> &ids,
                              std::string *detokenized) const;

  //////////////////////////////////////////////////////////////
  // NBest API.
  //
  // Same as Encode, but returns nbest results.
  virtual util::Status NBestEncode(
      absl::string_view input, int nbest_size,
      std::vector<std::vector<std::string>> *pieces) const;

  // Same as Encode, but returns nbest results.
  virtual util::Status NBestEncode(absl::string_view input, int nbest_size,
                                   std::vector<std::vector<int>> *ids) const;

  //////////////////////////////////////////////////////////////
  // Sampling API.
  //
  // Unigram and BPE support sampling mode.
  // - Unigram (--model_type=unigram):
  //   `nbest_size`: When `nbest_size` is a positive value, approximately samples
  //   one segmentation from the nbest candidates. When `nbest_size` is a negative
  //   value, samples one segmentation from the hypotheses (Lattice) according to
  //   the generation probabilities using the forward-filtering and
  //   backward-sampling algorithm.
  //   `alpha`: Smoothing parameter (inverse temperature). The best segmentation
  //   (Viterbi segmentation) is more likely sampled when setting a larger alpha.
  //   When alpha is 0.0, one segmentation is uniformly sampled from the nbest or
  //   lattice. `nbest_size` and `alpha` correspond to parameters `l` and `alpha`
  //   in https://arxiv.org/abs/1804.10959 (nbest_size < 0 means l = infinity)
  //
  // - BPE (--model_type=bpe):
  //   `alpha`: The dropout probability `p` of bpe merge operations in
  //   https://arxiv.org/abs/1910.13267 Nbest-based sampling is not supported so
  //   the nbest_size parameter is ignored in BPE.
  virtual util::Status SampleEncode(absl::string_view input, int nbest_size,
                                    float alpha,
                                    std::vector<std::string> *pieces) const;

  // Same as above, but returns a sequence of ids.
  virtual util::Status SampleEncode(absl::string_view input, int nbest_size,
                                    float alpha, std::vector<int> *ids) const;

  //////////////////////////////////////////////////////////////
  // SampleEncodeAndScore API.
  //
  // Sample `samples` many tokenisations from the segmentation lattice.
  // These methods are only available in model_type=unigram.
  //
  // `alpha`: smoothing parameter (inverse temperature). The same as `alpha` in
  // the `Sample` method.
  // `wor`: If `wor` is true, the samples are taken without replacement, and the
  // scores are the inclusion probabilities of the elements in the sample;
  // otherwise the samples are taken with replacement and the scores are the
  // log-probs of the sample elements.
  // `include_best`: If `include_best` is true, the best tokenisation is always
  // included in the sample, and the remaining elements are sampled excluding
  // the best.
  virtual util::Status SampleEncodeAndScore(
      absl::string_view input, int num_samples, float alpha, bool wor,
      bool include_best,
      std::vector<std::pair<std::vector<std::string>, float>> *pieces) const;

  // Same as above, but returns a sequence of ids.
  virtual util::Status SampleEncodeAndScore(
      absl::string_view input, int num_samples, float alpha, bool wor,
      bool include_best,
      std::vector<std::pair<std::vector<int>, float>> *ids) const;

  //////////////////////////////////////////////////////////////
  // Entropy API.
  //
  // This is only available in model_type=unigram.
  // Calculates the entropy of the possible tokenisations.
  virtual util::Status CalculateEntropy(absl::string_view input, float alpha,
                                        float *entropy) const;

  //////////////////////////////////////////////////////////////
  // Advanced API returning SentencePieceText, which manages
  // utf8-byte alignments between user-input/detokenized text
  // and the internal sentencepiece sequence.
  //
  // Given a UTF8 input, encodes it into SentencePieceText.
  //
  // When using these APIs, the sentencepiece.pb.h header file must be included.
  // We can also use ImmutableSentencePieceText as follows.
  //
  // ImmutableSentencePieceText spt;
  // Encode("hello", spt.mutable_proto()).IgnoreError();
  // std::cout << spt.pieces_size() << std::endl;
  virtual util::Status Encode(absl::string_view input,
                              SentencePieceText *spt) const;

  virtual util::Status NBestEncode(absl::string_view input, int nbest_size,
                                   NBestSentencePieceText *nbest_spt) const;

  virtual util::Status SampleEncode(absl::string_view input, int nbest_size,
                                    float alpha, SentencePieceText *spt) const;

  virtual util::Status SampleEncodeAndScore(
      absl::string_view input, int num_samples, float alpha, bool wor,
      bool include_best, NBestSentencePieceText *samples_spt) const;

  // DEPRECATED: Remove this API and use std::vector<std::string_view>
  virtual util::Status Decode(const std::vector<std::string> &pieces,
                              SentencePieceText *spt) const;

  virtual util::Status Decode(const std::vector<absl::string_view> &pieces,
                              SentencePieceText *spt) const;

  virtual util::Status Decode(const std::vector<int> &ids,
                              SentencePieceText *spt) const;
#ifdef SWIG
#define SPP_SWIG_CHECK_AND_THROW \
  if (!status.ok()) throw status;
#else
#define SPP_SWIG_CHECK_AND_THROW \
  if (!status.ok()) {            \
  }
#endif  // SWIG

#define DEFINE_SPP_DIRECT_FUNC_IMPL(FuncName, OutType, ...) \
  OutType output;                                           \
  const auto status = FuncName(__VA_ARGS__, &output);       \
  SPP_SWIG_CHECK_AND_THROW;                                 \
  return output;

#define DEFINE_SPP_SERIALIZED_PROTO_IMPL(FuncName, OutType, ...)     \
  OutType output;                                                    \
  const auto status = FuncName(__VA_ARGS__, output.mutable_proto()); \
  SPP_SWIG_CHECK_AND_THROW;                                          \
  return output.SerializeAsString();

#define DEFINE_SPP_IMMUTABLE_PROTO_IMPL(FuncName, OutType, ...)      \
  OutType output;                                                    \
  const auto status = FuncName(__VA_ARGS__, output.mutable_proto()); \
  SPP_SWIG_CHECK_AND_THROW;                                          \
  return output;

  //////////////////////////////////////////////////////////////
  // Handy methods that return the result directly.
  // These functions ignore internal errors.
  virtual std::vector<std::string> EncodeAsPieces(
      absl::string_view input) const {
    DEFINE_SPP_DIRECT_FUNC_IMPL(Encode, std::vector<std::string>, input);
  }

  virtual std::vector<int> EncodeAsIds(absl::string_view input) const {
    DEFINE_SPP_DIRECT_FUNC_IMPL(Encode, std::vector<int>, input);
  }

  virtual std::vector<std::vector<std::string>> NBestEncodeAsPieces(
      absl::string_view input, int nbest_size) const {
    DEFINE_SPP_DIRECT_FUNC_IMPL(
        NBestEncode, std::vector<std::vector<std::string>>, input, nbest_size);
  }

  virtual std::vector<std::vector<int>> NBestEncodeAsIds(
      absl::string_view input, int nbest_size) const {
    DEFINE_SPP_DIRECT_FUNC_IMPL(NBestEncode, std::vector<std::vector<int>>,
                                input, nbest_size);
  }

  virtual std::vector<std::string> SampleEncodeAsPieces(absl::string_view input,
                                                        int nbest_size,
                                                        float alpha) const {
    DEFINE_SPP_DIRECT_FUNC_IMPL(SampleEncode, std::vector<std::string>, input,
                                nbest_size, alpha);
  }

  virtual std::vector<int> SampleEncodeAsIds(absl::string_view input,
                                             int nbest_size,
                                             float alpha) const {
    DEFINE_SPP_DIRECT_FUNC_IMPL(SampleEncode, std::vector<int>, input,
                                nbest_size, alpha);
  }

  virtual std::vector<std::pair<std::vector<std::string>, float>>
  SampleEncodeAndScoreAsPieces(absl::string_view input, int num_samples,
                               float alpha, bool wor, bool include_best) const {
    using _T = std::vector<std::pair<std::vector<std::string>, float>>;
    DEFINE_SPP_DIRECT_FUNC_IMPL(SampleEncodeAndScore, _T, input, num_samples,
                                alpha, wor, include_best);
  }

  virtual std::vector<std::pair<std::vector<int>, float>>
  SampleEncodeAndScoreAsIds(absl::string_view input, int num_samples,
                            float alpha, bool wor, bool include_best) const {
    using _T = std::vector<std::pair<std::vector<int>, float>>;
    DEFINE_SPP_DIRECT_FUNC_IMPL(SampleEncodeAndScore, _T, input, num_samples,
                                alpha, wor, include_best);
  }

  // DEPRECATED: Remove this API and use std::vector<std::string_view>
  virtual std::string DecodePieces(
      const std::vector<std::string> &pieces) const {
    DEFINE_SPP_DIRECT_FUNC_IMPL(Decode, std::string, pieces);
  }

  virtual std::string DecodePieces(
      const std::vector<absl::string_view> &pieces) const {
    DEFINE_SPP_DIRECT_FUNC_IMPL(Decode, std::string, pieces);
  }

  virtual std::string DecodeIds(const std::vector<int> &ids) const {
    DEFINE_SPP_DIRECT_FUNC_IMPL(Decode, std::string, ids);
  }

  virtual float CalculateEntropy(absl::string_view text, float alpha) const {
    DEFINE_SPP_DIRECT_FUNC_IMPL(CalculateEntropy, float, text, alpha);
  }

  //////////////////////////////////////////////////////////////
  // SerializedProto API. (DEPRECATED). Use the ImmutableProto API.
  // They are used in the Python interface. Returns a serialized proto.
  // In the python module, we can get access to the full Proto after
  // deserializing the returned byte sequence.
  virtual util::bytes EncodeAsSerializedProto(absl::string_view input) const {
    DEFINE_SPP_SERIALIZED_PROTO_IMPL(Encode, ImmutableSentencePieceText, input);
  }

  virtual util::bytes SampleEncodeAsSerializedProto(absl::string_view input,
                                                    int nbest_size,
                                                    float alpha) const {
    DEFINE_SPP_SERIALIZED_PROTO_IMPL(SampleEncode, ImmutableSentencePieceText,
                                     input, nbest_size, alpha);
  }

  virtual util::bytes NBestEncodeAsSerializedProto(absl::string_view input,
                                                   int nbest_size) const {
    DEFINE_SPP_SERIALIZED_PROTO_IMPL(
        NBestEncode, ImmutableNBestSentencePieceText, input, nbest_size);
  }

  virtual util::bytes SampleEncodeAndScoreAsSerializedProto(
      absl::string_view input, int num_samples, float alpha, bool wor,
      bool include_best) const {
    DEFINE_SPP_SERIALIZED_PROTO_IMPL(SampleEncodeAndScore,
                                     ImmutableNBestSentencePieceText, input,
                                     num_samples, alpha, wor, include_best);
  }

  // TODO(taku): Remove this API and use std::vector<std::string_view>
  virtual util::bytes DecodePiecesAsSerializedProto(
      const std::vector<std::string> &pieces) const {
    DEFINE_SPP_SERIALIZED_PROTO_IMPL(Decode, ImmutableSentencePieceText,
                                     pieces);
  }

  virtual util::bytes DecodePiecesAsSerializedProto(
      const std::vector<absl::string_view> &pieces) const {
    DEFINE_SPP_SERIALIZED_PROTO_IMPL(Decode, ImmutableSentencePieceText,
                                     pieces);
  }

  virtual util::bytes DecodeIdsAsSerializedProto(
      const std::vector<int> &ids) const {
    DEFINE_SPP_SERIALIZED_PROTO_IMPL(Decode, ImmutableSentencePieceText, ids);
  }

  //////////////////////////////////////////////////////////////
  // ImmutableProto API.
  virtual ImmutableSentencePieceText EncodeAsImmutableProto(
      absl::string_view input) const {
    DEFINE_SPP_IMMUTABLE_PROTO_IMPL(Encode, ImmutableSentencePieceText, input);
  }

  virtual ImmutableSentencePieceText SampleEncodeAsImmutableProto(
      absl::string_view input, int nbest_size, float alpha) const {
    DEFINE_SPP_IMMUTABLE_PROTO_IMPL(SampleEncode, ImmutableSentencePieceText,
                                    input, nbest_size, alpha);
  }

  virtual ImmutableNBestSentencePieceText NBestEncodeAsImmutableProto(
      absl::string_view input, int nbest_size) const {
    DEFINE_SPP_IMMUTABLE_PROTO_IMPL(
        NBestEncode, ImmutableNBestSentencePieceText, input, nbest_size);
  }

  virtual ImmutableNBestSentencePieceText SampleEncodeAndScoreAsImmutableProto(
      absl::string_view input, int num_samples, float alpha, bool wor,
      bool include_best) const {
    DEFINE_SPP_IMMUTABLE_PROTO_IMPL(SampleEncodeAndScore,
                                    ImmutableNBestSentencePieceText, input,
                                    num_samples, alpha, wor, include_best);
  }

  // TODO(taku): Remove this API and use std::vector<std::string_view>
  virtual ImmutableSentencePieceText DecodePiecesAsImmutableProto(
      const std::vector<std::string> &pieces) const {
    DEFINE_SPP_IMMUTABLE_PROTO_IMPL(Decode, ImmutableSentencePieceText, pieces);
  }

  virtual ImmutableSentencePieceText DecodePiecesAsImmutableProto(
      const std::vector<absl::string_view> &pieces) const {
    DEFINE_SPP_IMMUTABLE_PROTO_IMPL(Decode, ImmutableSentencePieceText, pieces);
  }

  virtual ImmutableSentencePieceText DecodeIdsAsImmutableProto(
      const std::vector<int> &ids) const {
    DEFINE_SPP_IMMUTABLE_PROTO_IMPL(Decode, ImmutableSentencePieceText, ids);
  }

#undef DEFINE_SPP_DIRECT_FUNC_IMPL
#undef DEFINE_SPP_SERIALIZED_PROTO_IMPL
#undef DEFINE_SPP_IMMUTABLE_PROTO_IMPL

  //////////////////////////////////////////////////////////////
  // Vocabulary management methods.
  //
  // Returns the size of sentence pieces, which is the same as
  // the size of the vocabulary for NMT.
  virtual int GetPieceSize() const;

  // Returns the vocab id of `piece`.
  // Returns UNK(0) if `piece` is unknown.
  virtual int PieceToId(absl::string_view piece) const;

  // Returns the string representation of the vocab with `id`.
  virtual const std::string &IdToPiece(int id) const;

  // Returns the score of `id`.
  // Usually the score is an emission log probability of a unigram language
  // model.
  virtual float GetScore(int id) const;

  // Returns true if `id` is an unknown symbol.
  virtual bool IsUnknown(int id) const;

  // Returns true if `id` is a control symbol.
  virtual bool IsControl(int id) const;

  // Returns true if `id` is an unused symbol.
  virtual bool IsUnused(int id) const;

  // Returns true if `id` is a byte symbol.
  virtual bool IsByte(int id) const;

  // Returns the reserved id.
  // Returns -1 if not defined.

  // Returns unknown (<unk>) id.
  virtual int unk_id() const;

  // Returns BOS (<s>) id.
  virtual int bos_id() const;

  // Returns EOS (</s>) id.
  virtual int eos_id() const;

  // Returns PAD (<pad>) id.
  virtual int pad_id() const;

  //////////////////////////////////////////////////////////////
  // Model management.
  //
  // Allows injection of a mock model instance. `model` is moved.
  void SetModel(std::unique_ptr<ModelInterface> &&model);

  // Allows injection of a normalizer instance. `normalizer` is moved.
  void SetNormalizer(std::unique_ptr<normalizer::Normalizer> &&normalizer);

  // Returns the immutable model proto. Useful to obtain extended
  // or experimental parameters encoded in model_proto.
  const ModelProto &model_proto() const;

  // Returns the immutable model proto as std::string.
  // Useful to save the state of this instance via Python's pickle object.
  util::bytes serialized_model_proto() const;

 private:
  enum ExtraOption { REVERSE, BOS, EOS, UNK_PIECE };

  util::Status ParseExtraOptions(absl::string_view extra_option,
                                 std::vector<ExtraOption> *extra_options) const;

  util::Status ApplyExtraOptions(const std::vector<ExtraOption> &extra_options,
                                 SentencePieceText *spt) const;

  util::Status PopulateSentencePieceText(
      absl::string_view input, absl::string_view normalized,
      const std::vector<size_t> &norm_to_orig,
      const std::vector<std::pair<absl::string_view, int>> &result,
      SentencePieceText *spt) const;

  std::unique_ptr<ModelInterface> model_;
  std::unique_ptr<normalizer::Normalizer> normalizer_;
  std::unique_ptr<normalizer::Normalizer> denormalizer_;

  // Underlying model protocol buffer. The same lifetime as model_.
  std::unique_ptr<ModelProto> model_proto_;

  std::vector<ExtraOption> encode_extra_options_;
  std::vector<ExtraOption> decode_extra_options_;
};

// Sets the seed value of the random generator.
// Do not set static_cast<unsigned int>(-1),
// as this seed is reserved for initializing from
// std::random_device.
void SetRandomGeneratorSeed(unsigned int seed);

// IO related functions to absorb model formats.
namespace io {
// Loads `model_proto` from `filename`.
// We can instantiate SentencePieceProcessor as follows:
//
//   auto model_proto = absl::make_unique<ModelProto>();
//   io::LoadModelProto("//path/spm.model", model_proto.get());
//   SentencePieceProcessor sp;
//   CHECK_OK(sp.Load(std::move(model_proto)));
util::Status LoadModelProto(absl::string_view, ModelProto *model_proto);

// Saves `model_proto` as `filename`.
util::Status SaveModelProto(absl::string_view, const ModelProto &model_proto);
}  // namespace io
}  // namespace sentencepiece
#endif  // SENTENCEPIECE_PROCESSOR_H_
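Before the prebuilt libraries that consume this header, a short round-trip sketch with explicit status checks may help; it mirrors how chat.cpp below loads its tokenizer. This is a minimal sketch, assuming a SentencePiece model file named "tokenizer.model" in the working directory (the Baichuan2 demo ships one under Baichuan2/model/).

// Minimal Encode/Decode round trip; the model path is a placeholder.
#include <iostream>
#include <string>
#include <vector>
#include "sentencepiece/sentencepiece_processor.h"

int main() {
  sentencepiece::SentencePieceProcessor sp;
  auto status = sp.Load("tokenizer.model");
  if (!status.ok()) {
    std::cerr << status.ToString() << std::endl;  // e.g. model not found
    return 1;
  }
  std::vector<int> ids;
  sp.Encode("hello world.", &ids).IgnoreError();  // text -> vocab ids
  std::string detok;
  sp.Decode(ids, &detok).IgnoreError();           // ids -> text, losslessly
  std::cout << detok << std::endl;                // prints "hello world."
  return 0;
}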
Baichuan2/src/lib_pcie/libbmlib.so
ADDED
Binary file (195 kB).

Baichuan2/src/lib_pcie/libbmrt.so
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:621e33823dca470275e09570324a567ce4a30fa6100ac9e52742bb9e1ee02f45
size 2966400

Baichuan2/src/lib_pcie/libbmrt.so.1.0
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:621e33823dca470275e09570324a567ce4a30fa6100ac9e52742bb9e1ee02f45
size 2966400

Baichuan2/src/lib_pcie/libsentencepiece.a
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:68811cd99e6e1a58572372f14f3b7a02cf98bc98f5d46d24c406be65a94b53e8
size 2858304

Baichuan2/src/lib_soc/libbmlib.so
ADDED
Binary file (191 kB).

Baichuan2/src/lib_soc/libbmrt.so
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cff807807fcc8c6a9d16353e389422d434ae2b79c8bc191266d0eb5a69b3d97d
size 2915352

Baichuan2/src/lib_soc/libbmrt.so.1.0
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cff807807fcc8c6a9d16353e389422d434ae2b79c8bc191266d0eb5a69b3d97d
size 2915352

Baichuan2/src/lib_soc/libsentencepiece.a
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5b1c1ece6c62265ee879cf5876d31e82580c3ee88c2cb627b8ac3eaf35695bde
size 3032062
Baichuan2/web_demo/CMakeLists.txt
ADDED
@@ -0,0 +1,36 @@
cmake_minimum_required(VERSION 2.8)
project(baichuan2)

if (NOT DEFINED TARGET_ARCH)
    set(TARGET_ARCH pcie)
endif()

set(CMAKE_INSTALL_PREFIX install)

if (${CMAKE_HOST_SYSTEM_PROCESSOR} STREQUAL "aarch64")
    add_definitions(-DSOC_TARGET)
    link_directories(${PROJECT_SOURCE_DIR}/../src/lib_soc)
    message("SoC mode, starting......")
elseif (${TARGET_ARCH} STREQUAL "pcie")
    add_definitions(-DPCIE_TARGET)
    link_directories(${PROJECT_SOURCE_DIR}/../src/lib_pcie)
    message("Pcie mode, starting......")
elseif (${TARGET_ARCH} STREQUAL "soc")
    add_definitions(-DSOC_TARGET)
    set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc)
    set(CMAKE_ASM_COMPILER aarch64-linux-gnu-gcc)
    set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++)
    link_directories(${PROJECT_SOURCE_DIR}/lib_soc)
    message("SoC mode, starting......")
endif()

include_directories(${PROJECT_SOURCE_DIR}/../src/include)

add_definitions(-DDEBUG --std=c++17 -fPIC -Wall -Werror)
set(CMAKE_BUILD_TYPE "Debug")

add_library(tpuchat SHARED chat.cpp)
target_link_libraries(tpuchat bmrt bmlib sentencepiece)
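Note that this CMakeLists builds chat.cpp into a shared library (libtpuchat.so) rather than an executable, so a host process can drive it through the extern "C" entry points defined in chat.cpp below. As a hedged sketch of that consumption path (the library and model paths and the device id are illustrative assumptions; the repo's own web demo drives the library from Python instead):

// Sketch of loading libtpuchat.so at runtime via dlopen/dlsym.
#include <dlfcn.h>
#include <cstdio>

int main() {
  void *lib = dlopen("./build/libtpuchat.so", RTLD_LAZY);
  if (!lib) { std::fprintf(stderr, "%s\n", dlerror()); return 1; }

  // Entry points exported by chat.cpp; the chat object is treated as opaque.
  typedef void *(*create_fn)(int, const char *, const char *);
  typedef const char *(*predict_fn)(void *, const char *);
  auto create = reinterpret_cast<create_fn>(
      dlsym(lib, "Baichuan2_with_devid_and_model"));
  auto first = reinterpret_cast<predict_fn>(
      dlsym(lib, "Baichuan2_predict_first_token"));
  if (!create || !first) { std::fprintf(stderr, "%s\n", dlerror()); return 1; }

  void *chat = create(0, "baichuan2-7b.bmodel", "tokenizer.model");
  std::printf("%s\n", first(chat, "hello"));  // first generated piece
  dlclose(lib);
  return 0;
}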
Baichuan2/web_demo/chat.cpp
ADDED
@@ -0,0 +1,419 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//===----------------------------------------------------------------------===//
|
2 |
+
//
|
3 |
+
// Copyright (C) 2023 Sophgo Technologies Inc. All rights reserved.
|
4 |
+
//
|
5 |
+
// TPU-MLIR is licensed under the 2-Clause BSD License except for the
|
6 |
+
// third-party components.
|
7 |
+
//
|
8 |
+
//===----------------------------------------------------------------------===//
|
9 |
+
|
10 |
+
#include <iostream>
|
11 |
+
#include <cstdlib>
|
12 |
+
#include <vector>
|
13 |
+
#include <assert.h>
|
14 |
+
#include <chrono>
|
15 |
+
#include <algorithm>
|
16 |
+
#include "memory.h"
|
17 |
+
#include "sentencepiece/sentencepiece_processor.h"
|
18 |
+
#include "bmruntime_interface.h"
|
19 |
+
#include <getopt.h>
|
20 |
+
|
21 |
+
static const int NUM_LAYERS = 32;
|
22 |
+
static const int MAX_LEN = 512;
|
23 |
+
static const float ATTENTION_MASK = -1000.;
|
24 |
+
|
25 |
+
static const std::string TOKENIZER_MODEL = "tokenizer.model";
|
26 |
+
|
27 |
+
// #define EXPORT_RESULTS
|
28 |
+
#ifdef EXPORT_RESULTS
|
29 |
+
#include "cnpy.h"
|
30 |
+
static cnpy::npz_t map;
|
31 |
+
|
32 |
+
template <typename T>
|
33 |
+
static void add_array(std::string name, bm_handle_t bm_handle,
|
34 |
+
const bm_device_mem_t &dst) {
|
35 |
+
std::vector<T> data(dst.size / sizeof(T));
|
36 |
+
bm_memcpy_d2s(bm_handle, data.data(), dst);
|
37 |
+
cnpy::npz_add_array(map, name, data);
|
38 |
+
}
|
39 |
+
|
40 |
+
static void save_array(std::string filename) {
|
41 |
+
cnpy::npz_save_all(filename, map);
|
42 |
+
}
|
43 |
+
#endif
|
44 |
+
|
45 |
+
class Baichuan2 {
|
46 |
+
public:
|
47 |
+
void init(int devid, const std::string model, const std::string tokenizer_path);
|
48 |
+
void chat();
|
49 |
+
void deinit();
|
50 |
+
std::string name;
|
51 |
+
std::string history = "";
|
52 |
+
int round = 0;
|
53 |
+
int token_length;
|
54 |
+
int EOS;
|
55 |
+
std::string predict_next_token();
|
56 |
+
std::string predict_first_token(const std::string &input_str);
|
57 |
+
|
58 |
+
private:
|
59 |
+
int forward_first(std::vector<int> &tokens);
|
60 |
+
int forward_next();
|
61 |
+
void load_sentencepiece(const std::string &tokenizer_path);
|
62 |
+
|
63 |
+
private:
|
64 |
+
std::vector<bm_handle_t> handles;
|
65 |
+
bm_handle_t bm_handle;
|
66 |
+
void *p_bmrt;
|
67 |
+
sentencepiece::SentencePieceProcessor sentencepiece;
|
68 |
+
const bm_net_info_t *net_blocks[NUM_LAYERS];
|
69 |
+
const bm_net_info_t *net_blocks_cache[NUM_LAYERS];
|
70 |
+
const bm_net_info_t *net_embed;
|
71 |
+
const bm_net_info_t *net_lm;
|
72 |
+
bm_tensor_t inputs_embed_512, outputs_embed_512;
|
73 |
+
bm_tensor_t inputs_lm, outputs_lm;
|
74 |
+
bm_tensor_t inputs_pid, next_pid, inputs_attention, next_attention;
|
75 |
+
bm_tensor_t past_key[NUM_LAYERS], past_value[NUM_LAYERS];
|
76 |
+
bm_tensor_t present_key[NUM_LAYERS], present_value[NUM_LAYERS];
|
77 |
+
bm_tensor_t present_key_cache, present_value_cache;
|
78 |
+
std::string name_embed;
|
79 |
+
std::string name_lm;
|
80 |
+
std::string name_blocks[NUM_LAYERS];
|
81 |
+
std::string name_blocks_cache[NUM_LAYERS];
|
82 |
+
};
|
83 |
+
|
84 |
+
void Baichuan2::load_sentencepiece(const std::string &model) {
|
85 |
+
printf("Load %s ... ", model.c_str());
|
86 |
+
auto status = sentencepiece.Load(model);
|
87 |
+
if (!status.ok()) {
|
88 |
+
std::cout << status.ToString() << std::endl;
|
89 |
+
exit(-1);
|
90 |
+
}
|
91 |
+
EOS = sentencepiece.eos_id();
|
92 |
+
printf("Done!\n");
|
93 |
+
}
|
94 |
+
|
95 |
+
void Baichuan2::init(int devid, const std::string model, const std::string tokenizer_path) {
|
96 |
+
load_sentencepiece(tokenizer_path);
|
97 |
+
// request bm_handle
|
98 |
+
bm_status_t status = bm_dev_request(&bm_handle, devid);
|
99 |
+
assert(BM_SUCCESS == status);
|
100 |
+
|
101 |
+
// create bmruntime
|
102 |
+
p_bmrt = bmrt_create(bm_handle);
|
103 |
+
assert(NULL != p_bmrt);
|
104 |
+
|
105 |
+
// load bmodel by file
|
106 |
+
printf("Model[%s] loading ....\n", model.c_str());
|
107 |
+
bool ret = bmrt_load_bmodel(p_bmrt, model.c_str());
|
108 |
+
assert(true == ret);
|
109 |
+
printf("Done!\n");
|
110 |
+
// net names
|
111 |
+
name_embed = "embedding";
|
112 |
+
name_lm = "lm_head";
|
113 |
+
for (int i = 0; i < NUM_LAYERS; i++) {
|
114 |
+
name_blocks[i] = "block_" + std::to_string(i);
|
115 |
+
name_blocks_cache[i] = "block_cache_" + std::to_string(i);
|
116 |
+
}
|
117 |
+
|
118 |
+
// net infos
|
119 |
+
net_embed = bmrt_get_network_info(p_bmrt, name_embed.c_str());
|
120 |
+
net_lm = bmrt_get_network_info(p_bmrt, name_lm.c_str());
|
121 |
+
for (int i = 0; i < NUM_LAYERS; i++) {
|
122 |
+
net_blocks[i] = bmrt_get_network_info(p_bmrt, name_blocks[i].c_str());
|
123 |
+
net_blocks_cache[i] =
|
124 |
+
bmrt_get_network_info(p_bmrt, name_blocks_cache[i].c_str());
|
125 |
+
}
|
126 |
+
|
127 |
+
// net device mem
|
128 |
+
ret = bmrt_tensor(&inputs_embed_512, p_bmrt, net_embed->input_dtypes[0],
|
129 |
+
net_embed->stages[1].input_shapes[0]);
|
130 |
+
assert(true == ret);
|
131 |
+
|
132 |
+
ret = bmrt_tensor(&outputs_embed_512, p_bmrt, net_embed->output_dtypes[0],
|
133 |
+
net_embed->stages[1].output_shapes[0]);
|
134 |
+
assert(true == ret);
|
135 |
+
|
136 |
+
ret = bmrt_tensor(&inputs_pid, p_bmrt, net_blocks[0]->input_dtypes[1],
|
137 |
+
net_blocks[0]->stages[0].input_shapes[1]);
|
138 |
+
assert(true == ret);
|
139 |
+
|
140 |
+
ret = bmrt_tensor(&inputs_attention, p_bmrt, net_blocks[0]->input_dtypes[2],
|
141 |
+
net_blocks[0]->stages[0].input_shapes[2]);
|
142 |
+
assert(true == ret);
|
143 |
+
|
144 |
+
ret = bmrt_tensor(&next_pid, p_bmrt, net_blocks_cache[0]->input_dtypes[1],
|
145 |
+
net_blocks_cache[0]->stages[0].input_shapes[1]);
|
146 |
+
assert(true == ret);
|
147 |
+
|
148 |
+
ret =
|
149 |
+
bmrt_tensor(&next_attention, p_bmrt, net_blocks_cache[0]->input_dtypes[2],
|
150 |
+
net_blocks_cache[0]->stages[0].input_shapes[2]);
|
151 |
+
assert(true == ret);
|
152 |
+
|
153 |
+
for (int i = 0; i < NUM_LAYERS; i++) {
|
154 |
+
ret = bmrt_tensor(&past_key[i], p_bmrt, net_blocks[0]->output_dtypes[1],
|
155 |
+
net_blocks[0]->stages[0].output_shapes[1]);
|
156 |
+
assert(true == ret);
|
157 |
+
ret = bmrt_tensor(&past_value[i], p_bmrt, net_blocks[0]->output_dtypes[2],
|
158 |
+
net_blocks[0]->stages[0].output_shapes[2]);
|
159 |
+
assert(true == ret);
|
160 |
+
ret = bmrt_tensor(&present_key[i], p_bmrt, net_blocks[0]->output_dtypes[1],
|
161 |
+
net_blocks[0]->stages[0].output_shapes[1]);
|
162 |
+
assert(true == ret);
|
163 |
+
ret = bmrt_tensor(&present_value[i], p_bmrt, net_blocks[0]->output_dtypes[2],
|
164 |
+
net_blocks[0]->stages[0].output_shapes[2]);
|
165 |
+
assert(true == ret);
|
166 |
+
}
|
167 |
+
ret = bmrt_tensor(&present_key_cache, p_bmrt, net_blocks_cache[0]->output_dtypes[1],
|
168 |
+
net_blocks_cache[0]->stages[0].output_shapes[1]);
|
169 |
+
assert(true == ret);
|
170 |
+
ret = bmrt_tensor(&present_value_cache, p_bmrt, net_blocks_cache[0]->output_dtypes[2],
|
171 |
+
net_blocks_cache[0]->stages[0].output_shapes[2]);
|
172 |
+
assert(true == ret);
|
173 |
+
|
174 |
+
ret = bmrt_tensor(&inputs_lm, p_bmrt, net_lm->input_dtypes[0],
|
175 |
+
net_lm->stages[0].input_shapes[0]);
|
176 |
+
assert(true == ret);
|
177 |
+
ret = bmrt_tensor(&outputs_lm, p_bmrt, net_lm->output_dtypes[0],
|
178 |
+
net_lm->stages[0].output_shapes[0]);
|
179 |
+
assert(true == ret);
|
180 |
+
}
|
181 |
+
|
182 |
+
void Baichuan2::deinit() {
|
183 |
+
bm_free_device(bm_handle, inputs_embed_512.device_mem);
|
184 |
+
bm_free_device(bm_handle, outputs_embed_512.device_mem);
|
185 |
+
bm_free_device(bm_handle, inputs_lm.device_mem);
|
186 |
+
bm_free_device(bm_handle, outputs_lm.device_mem);
|
187 |
+
bm_free_device(bm_handle, inputs_pid.device_mem);
|
188 |
+
bm_free_device(bm_handle, next_pid.device_mem);
|
189 |
+
bm_free_device(bm_handle, inputs_attention.device_mem);
|
190 |
+
bm_free_device(bm_handle, next_attention.device_mem);
|
191 |
+
bm_free_device(bm_handle, present_key_cache.device_mem);
|
192 |
+
bm_free_device(bm_handle, present_value_cache.device_mem);
|
193 |
+
for (int i = 0; i < NUM_LAYERS; i++) {
|
194 |
+
bm_free_device(bm_handle, past_key[i].device_mem);
|
195 |
+
bm_free_device(bm_handle, past_value[i].device_mem);
|
196 |
+
bm_free_device(bm_handle, present_key[i].device_mem);
|
197 |
+
bm_free_device(bm_handle, present_value[i].device_mem);
|
198 |
+
}
|
199 |
+
bmrt_destroy(p_bmrt);
|
200 |
+
for (auto h : handles) {
|
201 |
+
bm_dev_free(h);
|
202 |
+
}
|
203 |
+
}
|
204 |
+
|
205 |
+
|
206 |
+
|
207 |
+
int Baichuan2::forward_first(std::vector<int> &tokens) {
|
208 |
+
int input_ids[MAX_LEN] = {0}; // start token
|
209 |
+
int position_id[MAX_LEN] = {0};
|
210 |
+
float attention_mask[MAX_LEN * MAX_LEN] = {0};
|
211 |
+
token_length = tokens.size();
|
212 |
+
|
213 |
+
std::copy(tokens.begin(), tokens.end(), input_ids);
|
214 |
+
for (int i = 0; i < token_length; i++) {
|
215 |
+
position_id[i] = i;
|
216 |
+
}
|
217 |
+
|
218 |
+
for (int i = 0; i < MAX_LEN; i++) {
|
219 |
+
for (int j = 0; j < MAX_LEN; j++) {
|
220 |
+
if (j <= i && i < token_length) {
|
221 |
+
} else {
|
222 |
+
attention_mask[i * MAX_LEN + j] = ATTENTION_MASK;
|
223 |
+
}
|
224 |
+
}
|
225 |
+
}
|
226 |
+
|
227 |
+
// forward embeding
|
228 |
+
bm_memcpy_s2d(bm_handle, inputs_embed_512.device_mem, (void *)input_ids);
|
229 |
+
auto ret =
|
230 |
+
bmrt_launch_tensor_ex(p_bmrt, name_embed.c_str(), &inputs_embed_512, 1,
|
231 |
+
&outputs_embed_512, 1, true, false);
|
232 |
+
assert(ret);
|
233 |
+
bm_thread_sync(bm_handle);
|
234 |
+
|
235 |
+
// forward blocks
|
236 |
+
bm_memcpy_s2d(bm_handle, inputs_pid.device_mem, (void *)position_id);
|
237 |
+
bm_memcpy_s2d(bm_handle, inputs_attention.device_mem, (void *)attention_mask);
|
238 |
+
auto inputs_embed = outputs_embed_512;
|
239 |
+
inputs_embed.shape = net_blocks[0]->stages[0].input_shapes[0];
|
240 |
+
bm_tensor_t inputs_block[3] = {inputs_embed, inputs_pid, inputs_attention};
|
241 |
+
for (int i = 0; i < NUM_LAYERS; i++) {
|
242 |
+
bm_tensor_t outputs_block[3] = {inputs_embed, past_key[i], past_value[i]};
|
243 |
+
ret = bmrt_launch_tensor_ex(p_bmrt, name_blocks[i].c_str(), inputs_block, 3,
|
244 |
+
outputs_block, 3, true, false);
|
245 |
+
assert(ret);
|
246 |
+
bm_thread_sync(bm_handle);
|
247 |
+
}
|
248 |
+
int bytes = inputs_embed.device_mem.size / MAX_LEN;
|
249 |
+
bm_memcpy_d2d_byte(bm_handle, inputs_lm.device_mem, 0,
|
250 |
+
inputs_embed.device_mem, (token_length - 1) * bytes,
|
251 |
+
bytes);
|
252 |
+
ret = bmrt_launch_tensor_ex(p_bmrt, name_lm.c_str(), &inputs_lm, 1,
|
253 |
+
&outputs_lm, 1, true, false);
|
254 |
+
bm_thread_sync(bm_handle);
|
255 |
+
|
256 |
+
int token = 0;
|
257 |
+
bm_memcpy_d2s(bm_handle, (void *)&token, outputs_lm.device_mem);
|
258 |
+
return token;
|
259 |
+
}
|
260 |
+
|
261 |
+
int Baichuan2::forward_next() {
|
262 |
+
float attention_mask[MAX_LEN + 1] = {0};
|
263 |
+
for (int i = token_length - 1; i < MAX_LEN; i++) {
|
264 |
+
attention_mask[i] = ATTENTION_MASK;
|
265 |
+
}
|
266 |
+
int32_t position_id = token_length - 1;
|
267 |
+
// embedding
|
268 |
+
outputs_lm.shape = net_embed->stages[0].input_shapes[0];
|
269 |
+
auto ret = bmrt_launch_tensor_ex(p_bmrt, name_embed.c_str(), &outputs_lm, 1,
|
270 |
+
&inputs_lm, 1, true, false);
|
271 |
+
assert(ret);
|
272 |
+
bm_thread_sync(bm_handle);
|
273 |
+
|
274 |
+
// blocks
|
275 |
+
bm_memcpy_s2d(bm_handle, next_attention.device_mem, (void *)attention_mask);
|
276 |
+
bm_memcpy_s2d(bm_handle, next_pid.device_mem, (void *)&position_id);
|
277 |
+
auto inputs_embed = inputs_lm;
|
278 |
+
inputs_embed.shape = net_blocks_cache[0]->stages[0].input_shapes[0];
|
279 |
+
int bytes = bm_mem_get_device_size(present_key_cache.device_mem);
|
280 |
+
int token_offset = (token_length - 1) * bytes;
|
281 |
+
for (int i = 0; i < NUM_LAYERS; i++) {
|
282 |
+
bm_tensor_t inputs_block[5] = {inputs_embed, next_pid, next_attention,
|
283 |
+
past_key[i], past_value[i]};
|
284 |
+
bm_tensor_t outputs_block[3] = {inputs_embed, present_key_cache, present_value_cache};
|
285 |
+
ret = bmrt_launch_tensor_ex(p_bmrt, name_blocks_cache[i].c_str(),
|
286 |
+
inputs_block, 5, outputs_block, 3, true, false);
|
287 |
+
assert(ret);
|
288 |
+
bm_thread_sync(bm_handle);
|
289 |
+
bm_memcpy_d2d_byte(bm_handle, past_key[i].device_mem, token_offset,
|
290 |
+
present_key_cache.device_mem, 0,
|
291 |
+
bytes);
|
292 |
+
bm_memcpy_d2d_byte(bm_handle, past_value[i].device_mem, token_offset,
|
293 |
+
present_value_cache.device_mem, 0,
|
294 |
+
bytes);
|
295 |
+
}
|
296 |
+
outputs_lm.shape = net_lm->stages[0].output_shapes[0];
|
297 |
+
ret = bmrt_launch_tensor_ex(p_bmrt, name_lm.c_str(), &inputs_lm, 1,
|
298 |
+
&outputs_lm, 1, true, false);
assert(ret);
|
299 |
+
bm_thread_sync(bm_handle);
|
300 |
+
|
301 |
+
int token = 0;
|
302 |
+
bm_memcpy_d2s(bm_handle, (void *)&token, outputs_lm.device_mem);
|
303 |
+
return token;
|
304 |
+
}
|
305 |
+
|
306 |
+
|
307 |
+
std::string Baichuan2::predict_first_token(const std::string &input_str) {
|
308 |
+
history = input_str;
|
309 |
+
//int tok_num = 1;
|
310 |
+
std::vector<int> tokens;
|
311 |
+
sentencepiece.Encode(history, &tokens);
|
312 |
+
tokens.insert(tokens.begin(), 1);
|
313 |
+
if (tokens.empty()) {
|
314 |
+
round = 0;
|
315 |
+
history = "Sorry: your question is too wierd!!\n";
|
316 |
+
return history;
|
317 |
+
}
|
318 |
+
// make sure token not too large
|
319 |
+
if (tokens.size() > MAX_LEN - 10) {
|
320 |
+
// reset
|
321 |
+
if (round == 0) {
|
322 |
+
history = "Error: your question is too large!\n";
|
323 |
+
return history;
|
324 |
+
}
|
325 |
+
round = 0;
|
326 |
+
history = "";
|
327 |
+
return predict_first_token(input_str);
|
328 |
+
}
|
329 |
+
int token = forward_first(tokens);
|
330 |
+
int pre_token = 0;
|
331 |
+
std::string pre_word;
|
332 |
+
std::string word;
|
333 |
+
std::vector<int> pre_ids = {pre_token};
|
334 |
+
std::vector<int> ids = {pre_token,token};
|
335 |
+
sentencepiece.Decode(pre_ids, &pre_word);
|
336 |
+
sentencepiece.Decode(ids, &word);
|
337 |
+
std::string diff = word.substr(pre_word.size());
|
338 |
+
#ifdef PRINT
|
339 |
+
printf("token %d",token);
|
340 |
+
printf("diff %s",diff.c_str());
|
341 |
+
#endif
|
342 |
+
history += diff;
|
343 |
+
if (token_length < MAX_LEN) {
|
344 |
+
token_length++;
|
345 |
+
}
|
346 |
+
return diff;
|
347 |
+
}
|
348 |
+
|
349 |
+
std::string Baichuan2::predict_next_token() {
|
350 |
+
int pre_token;
|
351 |
+
pre_token = 0;
|
352 |
+
int token = forward_next();
|
353 |
+
if(token == EOS){
|
354 |
+
round = 0;
|
355 |
+
history = history.substr(history.size()/2);
|
356 |
+
return "_GETEOS_";
|
357 |
+
}
|
358 |
+
std::string pre_word;
|
359 |
+
std::string word;
|
360 |
+
std::vector<int> pre_ids = {pre_token};
|
361 |
+
std::vector<int> ids = {pre_token, token};
|
362 |
+
sentencepiece.Decode(pre_ids, &pre_word);
|
363 |
+
sentencepiece.Decode(ids, &word);
|
364 |
+
std::string diff = word.substr(pre_word.size());
|
365 |
+
#ifdef PRINT
|
366 |
+
printf("token %d",token);
|
367 |
+
printf("diff %s",diff.c_str());
|
368 |
+
#endif
|
369 |
+
history += diff;
|
370 |
+
if (token_length < MAX_LEN) {
|
371 |
+
token_length++;
|
372 |
+
}else{
|
373 |
+
round = 0;
|
374 |
+
return "_GETMAX_";
|
375 |
+
}
|
376 |
+
return diff;
|
377 |
+
}
|
378 |
+
|
379 |
+
|
380 |
+
extern "C" {
|
381 |
+
|
382 |
+
|
383 |
+
Baichuan2 *Baichuan2_with_devid_and_model(int devid, const char *bmodel_path, const char *tokenizer_path) {
|
384 |
+
Baichuan2 *chat = new Baichuan2();
|
385 |
+
chat->init(devid, bmodel_path, tokenizer_path);
|
386 |
+
return chat;
|
387 |
+
}
|
388 |
+
|
389 |
+
void Baichuan2_delete(Baichuan2 *chat) { delete chat; }
|
390 |
+
|
391 |
+
void Baichuan2_deinit(Baichuan2 *chat) {
|
392 |
+
chat->deinit();
|
393 |
+
}
|
394 |
+
|
395 |
+
const char *get_history(Baichuan2 *chat) {
|
396 |
+
std::string str = chat->history;
|
397 |
+
return strdup(str.c_str());
|
398 |
+
}
|
399 |
+
|
400 |
+
const char *set_history(Baichuan2 *chat, const char *history) {
|
401 |
+
chat->history = history;
|
402 |
+
return strdup(history);
|
403 |
+
}
|
404 |
+
|
405 |
+
const char *Baichuan2_predict_first_token(Baichuan2 *chat, const char *input_str) {
|
406 |
+
std::string str = chat->predict_first_token(input_str);
|
407 |
+
return strdup(str.c_str());
|
408 |
+
}
|
409 |
+
|
410 |
+
const char *Baichuan2_predict_next_token(Baichuan2 *chat) {
|
411 |
+
std::string str = chat->predict_next_token();
|
412 |
+
return strdup(str.c_str());
|
413 |
+
}
|
414 |
+
|
415 |
+
const int get_eos(Baichuan2 *chat){
|
416 |
+
const int res = chat->EOS;
|
417 |
+
return res;
|
418 |
+
}
|
419 |
+
}
|
Baichuan2/web_demo/chat.py
ADDED
@@ -0,0 +1,97 @@
1 |
+
# coding=utf-8
|
2 |
+
|
3 |
+
import ctypes
|
4 |
+
|
5 |
+
|
6 |
+
class TokenWord(ctypes.Structure):
|
7 |
+
_fields_ = [
|
8 |
+
("token", ctypes.c_int),
|
9 |
+
("word", ctypes.c_char * 2048) # 假设最大长度为 100,你可以根据实际情况调整
|
10 |
+
]
|
11 |
+
|
12 |
+
|
13 |
+
class TPUChatglm:
|
14 |
+
def __init__(self):
|
15 |
+
self.lib = ctypes.cdll.LoadLibrary('./build/libtpuchat.so')
|
16 |
+
device_id = 3
|
17 |
+
bmodel_path = "../model/baichuan2-7b-test_int8.bmodel"
|
18 |
+
token_path = "../model/tokenizer.model"
|
19 |
+
self.device_id = device_id
|
20 |
+
self.bmodel_path = bmodel_path
|
21 |
+
self.token_path = token_path
|
22 |
+
self.libset()
|
23 |
+
self.init()
|
24 |
+
|
25 |
+
def libset(self):
|
26 |
+
self.lib.Baichuan2_with_devid_and_model.argtypes = [ctypes.c_int, ctypes.c_char_p, ctypes.c_char_p]
|
27 |
+
self.lib.Baichuan2_with_devid_and_model.restype = ctypes.c_void_p
|
28 |
+
|
29 |
+
self.lib.Baichuan2_delete.argtypes = [ctypes.c_void_p]
|
30 |
+
|
31 |
+
# deinit
|
32 |
+
self.lib.Baichuan2_deinit.argtypes = [ctypes.c_void_p]
|
33 |
+
|
34 |
+
# Baichuan2_predict_first_token
|
35 |
+
self.lib.Baichuan2_predict_first_token.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
|
36 |
+
self.lib.Baichuan2_predict_first_token.restype = ctypes.c_char_p
|
37 |
+
|
38 |
+
# Baichuan2_predict_next_token
|
39 |
+
self.lib.Baichuan2_predict_next_token.argtypes = [ctypes.c_void_p]
|
40 |
+
self.lib.Baichuan2_predict_next_token.restype = ctypes.c_char_p
|
41 |
+
|
42 |
+
# get_eos
|
43 |
+
self.lib.get_eos.argtypes = [ctypes.c_void_p]
|
44 |
+
self.lib.get_eos.restype = ctypes.c_int
|
45 |
+
# get_history
|
46 |
+
self.lib.get_history.argtypes = [ctypes.c_void_p]
|
47 |
+
self.lib.get_history.restype = ctypes.c_char_p
|
48 |
+
# set history
|
49 |
+
self.lib.set_history.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
|
50 |
+
|
51 |
+
def init(self):
|
52 |
+
self.obj = self.lib.Baichuan2_with_devid_and_model(self.device_id, self.bmodel_path.encode('utf-8'),
|
53 |
+
self.token_path.encode('utf-8'))
|
54 |
+
|
55 |
+
def predict_first_token(self, context):
|
56 |
+
return self.lib.Baichuan2_predict_first_token(self.obj, context.encode('utf-8')).decode('utf-8')
|
57 |
+
|
58 |
+
def predict_next_token(self):
|
59 |
+
return self.lib.Baichuan2_predict_next_token(self.obj).decode('utf-8')
|
60 |
+
|
61 |
+
def predict(self, context):
|
62 |
+
|
63 |
+
first_token = self.predict_first_token(context)
|
64 |
+
# print(first_token, end='')
|
65 |
+
res = ''
|
66 |
+
while True:
|
67 |
+
next_token = self.predict_next_token()
|
68 |
+
if next_token == '_GETMAX_' or next_token == '_GETEOS_':
|
69 |
+
# print(next_token)
|
70 |
+
break
|
71 |
+
# print(next_token, end='')
|
72 |
+
res += next_token
|
73 |
+
return res
|
74 |
+
|
75 |
+
def stream_predict(self, query, history):
|
76 |
+
history.append((query, ''))
|
77 |
+
|
78 |
+
prompt = ''
|
79 |
+
# for i, (old_query, response) in enumerate(history):
|
80 |
+
# prompt += "[Round {}]\n\n问:{}\n\n答:{}\n\n".format(i + 1, old_query, response)
|
81 |
+
# prompt += "[Round {}]\n\n问:{}\n\n答:".format(len(history) + 1, query)
|
82 |
+
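# Baichuan2 chat template (assumed from Baichuan2's special tokens): <reserved_106> opens the user turn, <reserved_107> cues the assistant reply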
prompt = "<reserved_106>" + query + "<reserved_107>"
|
83 |
+
|
84 |
+
res = ''
|
85 |
+
first_token = self.predict_first_token(prompt)
|
86 |
+
res += first_token
|
87 |
+
|
88 |
+
while True:
|
89 |
+
next_token = self.predict_next_token()
|
90 |
+
if next_token == '_GETMAX_' or next_token == '_GETEOS_':
|
91 |
+
break
|
92 |
+
res += next_token
|
93 |
+
history[-1] = (query, res)
|
94 |
+
yield res, history
|
95 |
+
|
96 |
+
def get_config(self):
|
97 |
+
pass
|
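For orientation, a minimal usage sketch of the wrapper above (hypothetical; it assumes the `./build/libtpuchat.so` and model paths hard-coded in `__init__` exist on your machine):

```python
# hypothetical usage sketch for the TPUChatglm ctypes wrapper above
from chat import TPUChatglm

chat = TPUChatglm()                # loads ./build/libtpuchat.so and the bmodel
print(chat.predict("Hello"))       # blocking call: returns the full answer

history = []
for partial, history in chat.stream_predict("What is a TPU?", history):
    print(partial)                 # each yield is the answer accumulated so far
```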
Baichuan2/web_demo/web_demo.py
ADDED
@@ -0,0 +1,108 @@
1 |
+
import time
|
2 |
+
import gradio as gr
|
3 |
+
import mdtex2html
|
4 |
+
from chat import TPUChatglm
|
5 |
+
|
6 |
+
|
7 |
+
def postprocess(self, y):
|
8 |
+
if y is None:
|
9 |
+
return []
|
10 |
+
for i, (message, response) in enumerate(y):
|
11 |
+
y[i] = (
|
12 |
+
None if message is None else mdtex2html.convert(message),
|
13 |
+
None if response is None else mdtex2html.convert(response),
|
14 |
+
)
|
15 |
+
return y
|
16 |
+
|
17 |
+
|
18 |
+
gr.Chatbot.postprocess = postprocess
|
19 |
+
|
20 |
+
glm = TPUChatglm()
|
21 |
+
|
22 |
+
def parse_text(text):
|
23 |
+
"""copy from https://github.com/GaiZhenbiao/ChuanhuChatGPT/"""
|
24 |
+
lines = text.split("\n")
|
25 |
+
lines = [line for line in lines if line != ""]
|
26 |
+
count = 0
|
27 |
+
for i, line in enumerate(lines):
|
28 |
+
if "```" in line:
|
29 |
+
count += 1
|
30 |
+
items = line.split('`')
|
31 |
+
if count % 2 == 1:
|
32 |
+
lines[i] = f'<pre><code class="language-{items[-1]}">'
|
33 |
+
else:
|
34 |
+
lines[i] = f'<br></code></pre>'
|
35 |
+
else:
|
36 |
+
if i > 0:
|
37 |
+
if count % 2 == 1:
|
38 |
+
line = line.replace("`", "\`")
|
39 |
+
line = line.replace("<", "<")
|
40 |
+
line = line.replace(">", ">")
|
41 |
+
line = line.replace(" ", " ")
|
42 |
+
line = line.replace("*", "*")
|
43 |
+
line = line.replace("_", "_")
|
44 |
+
line = line.replace("-", "-")
|
45 |
+
line = line.replace(".", ".")
|
46 |
+
line = line.replace("!", "!")
|
47 |
+
line = line.replace("(", "(")
|
48 |
+
line = line.replace(")", ")")
|
49 |
+
line = line.replace("$", "$")
|
50 |
+
lines[i] = "<br>" + line
|
51 |
+
text = "".join(lines)
|
52 |
+
return text
|
53 |
+
|
54 |
+
|
55 |
+
def gen(input, history):
|
56 |
+
i = 0
|
57 |
+
history.append((input, ''))
|
58 |
+
res = ''
|
59 |
+
while i < 10:
|
60 |
+
i += 1
|
61 |
+
res += str(i)
|
62 |
+
time.sleep(0.05)
|
63 |
+
history[-1] = (input, res)
|
64 |
+
yield res, history
|
65 |
+
|
66 |
+
|
67 |
+
def predict(input, chatbot, max_length, top_p, temperature, history):
|
68 |
+
|
69 |
+
chatbot.append((parse_text(input), ""))
|
70 |
+
for response, history in glm.stream_predict(input, history):
|
71 |
+
chatbot[-1] = (parse_text(input), parse_text(response))
|
72 |
+
yield chatbot, history
|
73 |
+
|
74 |
+
|
75 |
+
def reset_user_input():
|
76 |
+
return gr.update(value='')
|
77 |
+
|
78 |
+
|
79 |
+
def reset_state():
|
80 |
+
return [], [], None
|
81 |
+
|
82 |
+
|
83 |
+
with gr.Blocks() as demo:
|
84 |
+
gr.HTML("""<h1 align="center">Baichuan2-7B TPU</h1>""")
|
85 |
+
|
86 |
+
chatbot = gr.Chatbot()
|
87 |
+
with gr.Row():
|
88 |
+
with gr.Column(scale=4):
|
89 |
+
with gr.Column(scale=12):
|
90 |
+
user_input = gr.Textbox(show_label=False, placeholder="Input...", lines=10).style(
|
91 |
+
container=False)
|
92 |
+
with gr.Column(min_width=32, scale=1):
|
93 |
+
submitBtn = gr.Button("Submit", variant="primary")
|
94 |
+
with gr.Column(scale=1):
|
95 |
+
emptyBtn = gr.Button("Clear History")
|
96 |
+
max_length = gr.Slider(0, 32768, value=8192, step=1.0, label="Maximum length", interactive=True)
|
97 |
+
top_p = gr.Slider(0, 1, value=0.8, step=0.01, label="Top P", interactive=True)
|
98 |
+
temperature = gr.Slider(0, 1, value=0.95, step=0.01, label="Temperature", interactive=True)
|
99 |
+
|
100 |
+
history = gr.State([])
|
101 |
+
|
102 |
+
submitBtn.click(predict, [user_input, chatbot, max_length, top_p, temperature, history],
|
103 |
+
[chatbot, history], show_progress=True)
|
104 |
+
submitBtn.click(reset_user_input, [], [user_input])
|
105 |
+
|
106 |
+
emptyBtn.click(reset_state, outputs=[chatbot, history], show_progress=True)
|
107 |
+
|
108 |
+
demo.queue().launch(share=True, server_name="0.0.0.0", inbrowser=True)
|
BaseModel/base_model.py
ADDED
@@ -0,0 +1,184 @@
1 |
+
import time
|
2 |
+
from transformers import AutoTokenizer
|
3 |
+
|
4 |
+
|
5 |
+
class BaseModel:
|
6 |
+
def __init__(self, args):
|
7 |
+
# parameters
|
8 |
+
self.EOS = None
|
9 |
+
self.SEQLEN = None
|
10 |
+
self.input_str = ""
|
11 |
+
self.system_prompt = ""
|
12 |
+
self.history = []
|
13 |
+
|
14 |
+
# devid
|
15 |
+
self.devices = [int(d) for d in args.devid.split(",")]
|
16 |
+
|
17 |
+
# load tokenizer
|
18 |
+
print("Load " + args.tokenizer_path + " ...")
|
19 |
+
self.tokenizer = AutoTokenizer.from_pretrained(
|
20 |
+
args.tokenizer_path, trust_remote_code=True
|
21 |
+
)
|
22 |
+
|
23 |
+
# warm up
|
24 |
+
self.tokenizer.decode([0])
|
25 |
+
print("Done!")
|
26 |
+
|
27 |
+
def chat(self):
|
28 |
+
"""
|
29 |
+
Start a chat session.
|
30 |
+
"""
|
31 |
+
# check
|
32 |
+
if not self.EOS:
|
33 |
+
raise NotImplementedError("Forget to set End of Sentence Token Id(EOS)")
|
34 |
+
if not self.SEQLEN:
|
35 |
+
raise NotImplementedError("Forget to set End of Sentence Token Id")
|
36 |
+
|
37 |
+
# Instruct
|
38 |
+
print(
|
39 |
+
"""\n===========================================================
|
40 |
+
1. If you want to quit, please enter one of [q, quit, exit]
|
41 |
+
2. To create a new chat session, please enter one of [clear, new]
|
42 |
+
==========================================================="""
|
43 |
+
)
|
44 |
+
# Stop Chatting with "exit" input
|
45 |
+
while True:
|
46 |
+
self.input_str = input("\nQuestion: ")
|
47 |
+
# Quit
|
48 |
+
if self.input_str in ["exit", "q", "quit"]:
|
49 |
+
break
|
50 |
+
# New Chat
|
51 |
+
elif self.input_str in ["clear", "new"]:
|
52 |
+
self.clear()
|
53 |
+
# Chat
|
54 |
+
else:
|
55 |
+
tokens = self.encode_tokens()
|
56 |
+
|
57 |
+
# check tokens
|
58 |
+
if not tokens:
|
59 |
+
print("Sorry: your question is empty!!")
|
60 |
+
return
|
61 |
+
if len(tokens) > self.SEQLEN:
|
62 |
+
print(
|
63 |
+
"The maximum question length should be shorter than {} but we get {} instead.".format(
|
64 |
+
self.SEQLEN, len(tokens)
|
65 |
+
)
|
66 |
+
)
|
67 |
+
return
|
68 |
+
|
69 |
+
print("\nAnswer: ", end="")
|
70 |
+
self.stream_answer(tokens)
|
71 |
+
|
72 |
+
def stream_answer(self, tokens):
|
73 |
+
"""
|
74 |
+
Stream the answer for the given tokens.
|
75 |
+
"""
|
76 |
+
tok_num = 0
|
77 |
+
self.answer_cur = ""
|
78 |
+
self.answer_token = []
|
79 |
+
|
80 |
+
# First token
|
81 |
+
first_start = time.time()
|
82 |
+
token = self.forward_first(tokens)
|
83 |
+
first_end = time.time()
|
84 |
+
# Following tokens
|
85 |
+
while token != self.EOS and self.model.token_length < self.SEQLEN:
|
86 |
+
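# decode [token] and [token, token], then strip the first copy so a leading space survives detokenization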
pre_word = self.decode_tokens([token])
|
87 |
+
word = self.decode_tokens([token, token])[len(pre_word):]
|
88 |
+
self.answer_token += [token]
|
89 |
+
print(word, flush=True, end="")
|
90 |
+
tok_num += 1
|
91 |
+
token = self.forward_next()
|
92 |
+
self.answer_cur = self.tokenizer.decode(self.answer_token)
|
93 |
+
|
94 |
+
# counting time
|
95 |
+
next_end = time.time()
|
96 |
+
first_duration = first_end - first_start
|
97 |
+
next_duration = next_end - first_end
|
98 |
+
tps = tok_num / next_duration
|
99 |
+
|
100 |
+
self.update_history()
|
101 |
+
|
102 |
+
print()
|
103 |
+
print(f"FTL: {first_duration:.3f} s")
|
104 |
+
print(f"TPS: {tps:.3f} token/s")
|
105 |
+
|
106 |
+
def stream_predict(self, query):
|
107 |
+
"""
|
108 |
+
Stream the prediction for the given query.
|
109 |
+
"""
|
110 |
+
self.answer_cur = ""
|
111 |
+
self.input_str = query
|
112 |
+
tokens = self.encode_tokens()
|
113 |
+
|
114 |
+
for answer_cur, history in self._generate_predictions(tokens):
|
115 |
+
yield answer_cur, history
|
116 |
+
|
117 |
+
def _generate_predictions(self, tokens):
|
118 |
+
"""
|
119 |
+
Generate predictions for the given tokens.
|
120 |
+
"""
|
121 |
+
# First token
|
122 |
+
next_token = self.forward_first(tokens)
|
123 |
+
output_tokens = [next_token]
|
124 |
+
|
125 |
+
# Following tokens
|
126 |
+
while True:
|
127 |
+
next_token = self.forward_next()
|
128 |
+
if next_token == self.EOS:
|
129 |
+
break
|
130 |
+
output_tokens += [next_token]
|
131 |
+
self.answer_cur = self.tokenizer.decode(output_tokens)
|
132 |
+
if self.model.token_length >= self.SEQLEN:
|
133 |
+
self.update_history()
|
134 |
+
yield self.answer_cur + "\n\n\nReached the maximum length; The history context has been cleared.", self.history
|
135 |
+
break
|
136 |
+
else:
|
137 |
+
yield self.answer_cur, self.history
|
138 |
+
|
139 |
+
self.update_history()
|
140 |
+
|
141 |
+
def forward_first(self, tokens):
|
142 |
+
"""
|
143 |
+
Forward the first token.
|
144 |
+
"""
|
145 |
+
token = self.model.forward_first(tokens)
|
146 |
+
return token
|
147 |
+
|
148 |
+
def forward_next(self):
|
149 |
+
"""
|
150 |
+
Forward the next token.
|
151 |
+
"""
|
152 |
+
token = self.model.forward_next()
|
153 |
+
return token
|
154 |
+
|
155 |
+
def decode_tokens(self, token):
|
156 |
+
"""
|
157 |
+
Decode the given token.
|
158 |
+
"""
|
159 |
+
word = self.tokenizer.decode(token, skip_special_tokens=True)
|
160 |
+
return word
|
161 |
+
|
162 |
+
def encode_tokens(self):
|
163 |
+
"""
|
164 |
+
Encode the input string to tokens.
|
165 |
+
"""
|
166 |
+
raise NotImplementedError
|
167 |
+
|
168 |
+
def load_model(self):
|
169 |
+
"""
|
170 |
+
Load the model.
|
171 |
+
"""
|
172 |
+
raise NotImplementedError
|
173 |
+
|
174 |
+
def clear(self):
|
175 |
+
"""
|
176 |
+
Clear the chat session.
|
177 |
+
"""
|
178 |
+
raise NotImplementedError
|
179 |
+
|
180 |
+
def update_history(self):
|
181 |
+
"""
|
182 |
+
Update chat history.
|
183 |
+
"""
|
184 |
+
raise NotImplementedError
|
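BaseModel leaves `encode_tokens`, `load_model`, `clear`, and `update_history` abstract. A minimal sketch of a concrete subclass, for orientation only — `DemoModel`, `chat.Model`, and `args.model_path` are hypothetical names, not part of this repository:

```python
# hypothetical subclass sketch; `chat.Model` and `args.model_path` are illustrative only
from base_model import BaseModel


class DemoModel(BaseModel):
    def __init__(self, args):
        super().__init__(args)
        self.EOS = self.tokenizer.eos_token_id   # must be set before chat()
        self.SEQLEN = 512                        # must match the compiled bmodel
        self.load_model(args)

    def load_model(self, args):
        import chat                              # assumed C++ runtime binding
        self.model = chat.Model()
        self.model.init(self.devices, args.model_path)

    def encode_tokens(self):
        return self.tokenizer.encode(self.system_prompt + self.input_str)

    def clear(self):
        self.history = []

    def update_history(self):
        self.history.append((self.input_str, self.answer_cur))
```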
ChatGLM2/README.md
ADDED
@@ -0,0 +1,160 @@
1 |
+
![](./assets/sophgo_chip.png)
|
2 |
+
|
3 |
+
# ChatGLM2
|
4 |
+
|
5 |
+
This project deploys the large language model [ChatGLM2-6B](https://huggingface.co/THUDM/chatglm2-6b) on BM1684X. The model is converted to a bmodel with the [TPU-MLIR](https://github.com/sophgo/tpu-mlir) compiler and deployed with C++ code to a BM1684X PCIE or SoC environment.
|
6 |
+
|
7 |
+
|
8 |
+
A walkthrough of `ChatGLM` was published on Zhihu to help readers understand the source code:
|
9 |
+
|
10 |
+
[ChatGLM2 pipeline analysis and TPU-MLIR deployment](https://zhuanlan.zhihu.com/p/641975976)
|
11 |
+
|
12 |
+
|
13 |
+
## Development Environment
|
14 |
+
|
15 |
+
|
16 |
+
1. Download Docker and start a container, as follows:
|
17 |
+
|
18 |
+
``` shell
|
19 |
+
docker pull sophgo/tpuc_dev:latest
|
20 |
+
|
21 |
+
# myname1234 is just an example, you can set your own name
|
22 |
+
docker run --privileged --name myname1234 -v $PWD:/workspace -it sophgo/tpuc_dev:latest
|
23 |
+
```
|
24 |
+
The rest of this document assumes the environment lives in the Docker container's `/workspace` directory.
|
25 |
+
|
26 |
+
|
27 |
+
2. Download `ChatGLM2-6B` from Huggingface; it is large, so this may take a while
|
28 |
+
|
29 |
+
``` shell
|
30 |
+
git lfs install
|
31 |
+
git clone git@hf.co:THUDM/chatglm2-6b
|
32 |
+
```
|
33 |
+
Then copy config.json and modeling_chatglm.py from ./models/ChatGLM2/compile/files/chatglm2-6b in this project into the downloaded folder, replacing the files of the same name (users who need a different sequence length should refer to the [FAQ](#faq); the default sequence length is 512)
|
34 |
+
|
35 |
+
3. Download the `TPU-MLIR` source and build it (you can also download a prebuilt release package and extract it directly)
|
36 |
+
|
37 |
+
Since mlir is still under maintenance at the moment, users compiling GLM-series models should download:
|
38 |
+
``` shell
|
39 |
+
pip3 install dfss
|
40 |
+
python3 -m dfss --url=open@sophgo.com:/ext_model_information/LLM/mlir_club/glm_mlir.tar.gz
|
41 |
+
tar -xf glm_mlir.tar.gz
|
42 |
+
source tpu-mlir_v1.6.45-gdc3e9f6b-20231220/envsetup.sh
|
43 |
+
```
|
44 |
+
|
45 |
+
Once mlir maintenance is complete, the following approach can be used instead:
|
46 |
+
``` shell
|
47 |
+
git clone git@github.com:sophgo/tpu-mlir.git
|
48 |
+
cd tpu-mlir
|
49 |
+
source ./envsetup.sh
|
50 |
+
./build.sh
|
51 |
+
```
|
52 |
+
|
53 |
+
## Compiling the Model
|
54 |
+
|
55 |
+
1. Export all onnx models. If a missing package is reported during the process, simply `pip3 install` that package
|
56 |
+
|
57 |
+
``` shell
|
58 |
+
cd compile
|
59 |
+
python3 export_onnx.py --model_path your_chatglm2-6b_path
|
60 |
+
```
|
61 |
+
At this point a large number of onnx models have been exported to the tmp directory.
|
62 |
+
|
63 |
+
2. Compile the onnx models
|
64 |
+
|
65 |
+
TPU-MLIR currently supports F16, INT8, and INT4 quantization of ChatGLM2, as well as multi-chip distributed inference. By default, F16 quantization and single-chip inference are used, producing the file `chatglm2-6b_f16_1dev.bmodel`
|
66 |
+
|
67 |
+
```shell
|
68 |
+
./compile.sh --name chatglm2-6b --mode inference_mode --num_device device_number
|
69 |
+
```
|
70 |
+
|
71 |
+
Where:
|
72 |
+
`--name` is the model name, here `chatglm2-6b`;
|
73 |
+
`--mode` is the data type used for inference; any of `f16, int8, int4` may be chosen, defaulting to `f16`;
|
74 |
+
`--num_device` is the number of chips used for inference; set it according to the devices you actually use, defaulting to `--num_device 1`. An example invocation is shown below.
|
75 |
+
|
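For example, a hypothetical INT8 two-chip build — the output name follows the `<name>_<mode>_<num_device>dev.bmodel` pattern used by compile.sh:

```shell
./compile.sh --name chatglm2-6b --mode int8 --num_device 2
# -> chatglm2-6b_int8_2dev.bmodel
```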
76 |
+
## Building the Program (C++ Version)
|
77 |
+
|
78 |
+
执行如下编译,(PCIE与SOC相同):
|
79 |
+
|
80 |
+
```shell
|
81 |
+
cd demo
|
82 |
+
mkdir build
|
83 |
+
cd build
|
84 |
+
cmake ..
|
85 |
+
make
|
86 |
+
```
|
87 |
+
|
88 |
+
The build produces the `chatglm` executable. Place `chatglm` in the demo directory, then specify the chip count and the bmodel path as shown below.
|
89 |
+
Run `chatglm`; by default it runs `chatglm2-6b_f16_1dev.bmodel` on a single chip:
|
90 |
+
```shell
|
91 |
+
./chatglm --model chatglm2-6b_f16_1dev.bmodel --tokenizer ../support/tokenizer/tokenizer.model --devid your_devid
|
92 |
+
```
|
93 |
+
Here `--devid` is the id of the TPU used for inference, defaulting to 0. For multi-chip inference (the compiled bmodel must also target multiple chips), add chips with `,`; for example, `--devid 2,3` runs inference on TPU2 and TPU3.
|
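A complete two-chip invocation might look like the following (assuming the bmodel was compiled with `--num_device 2`):

```shell
./chatglm --model chatglm2-6b_f16_2dev.bmodel --tokenizer ../support/tokenizer/tokenizer.model --devid 2,3
```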
94 |
+
|
95 |
+
## Sample Run
|
96 |
+
|
97 |
+
Below is a sample run with INT8 quantization on a single chip:
|
98 |
+
|
99 |
+
![](./assets/chatglm.jpg)
|
100 |
+
|
101 |
+
## FAQ
|
102 |
+
|
103 |
+
#### Where sentencepiece comes from
|
104 |
+
|
105 |
+
A prebuilt copy ships with the project, so there is no need to build it; for the curious, the steps are as follows.
|
106 |
+
|
107 |
+
Download [sentencepiece](https://github.com/google/sentencepiece) and build it to obtain `libsentencepiece.a`
|
108 |
+
|
109 |
+
```shell
|
110 |
+
git clone git@github.com:google/sentencepiece.git
|
111 |
+
cd sentencepiece
|
112 |
+
mkdir build
|
113 |
+
cd build
|
114 |
+
cmake ..
|
115 |
+
make -j
|
116 |
+
```
|
117 |
+
|
118 |
+
To build for the SoC environment, follow the demo's build approach and specify the cross compiler in the makefile
|
119 |
+
|
120 |
+
#### The demo program does not run
|
121 |
+
|
122 |
+
If the demo program fails after being copied to the runtime environment, e.g. with errors such as missing interfaces,
|
123 |
+
the cause is that the runtime environment ships different library versions. Copy the .so files under `./support/lib_pcie` (PCIE) or `./support/lib_soc` (SoC) in the demo to the runtime environment and link against those .so files.
|
124 |
+
|
125 |
+
|
126 |
+
#### Changes made to the source code:
|
127 |
+
|
128 |
+
Three changes were made in total:
|
129 |
+
- `seq_length` in `config.json` was set to 512;
|
130 |
+
- The following code in `modeling_chatglm.py`:
|
131 |
+
|
132 |
+
```python
|
133 |
+
if attention_mask is not None:
|
134 |
+
attention_scores = attention_scores.masked_fill(attention_mask, float("-inf"))
|
135 |
+
```
|
136 |
+
|
137 |
+
was changed to:
|
138 |
+
|
139 |
+
```python
|
140 |
+
if attention_mask is not None:
|
141 |
+
attention_scores = attention_scores + (attention_mask * -10000.0)
|
142 |
+
```
|
143 |
+
|
144 |
+
This change improves efficiency: `masked_fill` is slow, and on top of that its ONNX export has some bugs. A sketch of why the two forms are equivalent follows.
|
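A minimal standalone sketch (illustration only, not part of the model code) showing that the additive mask behaves like `masked_fill` once softmax is applied:

```python
import torch

scores = torch.randn(1, 1, 4, 4)
mask = torch.ones(4, 4).triu(diagonal=1).bool()  # True above the diagonal

a = scores.masked_fill(mask, float("-inf")).softmax(dim=-1)
b = (scores + mask.float() * -10000.0).softmax(dim=-1)
print(torch.allclose(a, b, atol=1e-4))  # True: -10000 acts like -inf after softmax
```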
145 |
+
|
146 |
+
- And the following code in `modeling_chatglm.py`:
|
147 |
+
|
148 |
+
```python
|
149 |
+
pytorch_major_version = int(torch.__version__.split('.')[0])
|
150 |
+
if pytorch_major_version >= 2:
|
151 |
+
```
|
152 |
+
|
153 |
+
was changed to:
|
154 |
+
|
155 |
+
```python
|
156 |
+
pytorch_major_version = int(torch.__version__.split('.')[0])
|
157 |
+
if False:
|
158 |
+
```
|
159 |
+
|
160 |
+
The reason is that ONNX export cannot handle the `torch.nn.functional.scaled_dot_product_attention` operator.
|
ChatGLM2/compile/compile.sh
ADDED
@@ -0,0 +1,179 @@
1 |
+
#!/bin/bash
|
2 |
+
set -ex
|
3 |
+
models=
|
4 |
+
mode="f16"
|
5 |
+
folder="tmp"
|
6 |
+
num_device=1
|
7 |
+
mode_args=""
|
8 |
+
device_args=""
|
9 |
+
quantize_args="--quantize F16"
|
10 |
+
name=""
|
11 |
+
num_layers=
|
12 |
+
out_model=$name.bmodel
|
13 |
+
|
14 |
+
while [[ $# -gt 0 ]]; do
|
15 |
+
key="$1"
|
16 |
+
|
17 |
+
case $key in
|
18 |
+
--mode)
|
19 |
+
mode="$2"
|
20 |
+
shift 2
|
21 |
+
;;
|
22 |
+
--num_device)
|
23 |
+
num_device="$2"
|
24 |
+
shift 2
|
25 |
+
;;
|
26 |
+
--name)
|
27 |
+
name="$2"
|
28 |
+
shift 2
|
29 |
+
;;
|
30 |
+
*)
|
31 |
+
echo "Invalid option: $key" >&2
|
32 |
+
exit 1
|
33 |
+
;;
|
34 |
+
:)
|
35 |
+
echo "Option -$OPTARG requires an argument." >&2
|
36 |
+
exit 1
|
37 |
+
;;
|
38 |
+
esac
|
39 |
+
done
|
40 |
+
|
41 |
+
if [ "$name" = "chatglm2-6b" ]; then
|
42 |
+
num_layers=27
|
43 |
+
echo "Compile ChatGLM2-6B"
|
44 |
+
else
|
45 |
+
>&2 echo -e "Error: Invalid name $name, the input name must be \033[31mchatglm2-6b\033[0m"
|
46 |
+
exit 1
|
47 |
+
fi
|
48 |
+
|
49 |
+
if [ x$mode == x"int8" ]; then
|
50 |
+
quantize_args="--quantize W8F16"
|
51 |
+
elif [ x$mode == x"f16" ]; then
|
52 |
+
quantize_args="--quantize F16"
|
53 |
+
elif [ x$mode == x"int4" ]; then
|
54 |
+
quantize_args="--quantize W4F16 --q_group_size 64"
|
55 |
+
else
|
56 |
+
echo "Error, unknown quantize mode"
|
57 |
+
exit 1
|
58 |
+
fi
|
59 |
+
|
60 |
+
if [ x$num_device != x1 ]; then
|
61 |
+
device_args="--num_device $num_device"
|
62 |
+
out_model=$name'_'$mode'_'$num_device'dev.bmodel'
|
63 |
+
else
|
64 |
+
out_model=$name'_'$mode'_1dev.bmodel'
|
65 |
+
fi
|
66 |
+
|
67 |
+
outdir=${folder}/embedding
|
68 |
+
mkdir -p $outdir
|
69 |
+
pushd $outdir
|
70 |
+
|
71 |
+
model_transform.py \
|
72 |
+
--model_name embedding \
|
73 |
+
--model_def ../onnx/embedding.onnx \
|
74 |
+
--mlir embedding.mlir
|
75 |
+
|
76 |
+
|
77 |
+
model_deploy.py \
|
78 |
+
--mlir embedding.mlir \
|
79 |
+
--quantize F16 \
|
80 |
+
--quant_input \
|
81 |
+
--quant_output \
|
82 |
+
--chip bm1684x \
|
83 |
+
$device_args \
|
84 |
+
--model embedding.bmodel
|
85 |
+
|
86 |
+
model_transform.py \
|
87 |
+
--model_name embedding_cache \
|
88 |
+
--model_def ../onnx/embedding.onnx \
|
89 |
+
--input_shapes [[1,1]] \
|
90 |
+
--mlir embedding_cache.mlir
|
91 |
+
|
92 |
+
|
93 |
+
model_deploy.py \
|
94 |
+
--mlir embedding_cache.mlir \
|
95 |
+
--quantize F16 \
|
96 |
+
--quant_input \
|
97 |
+
--quant_output \
|
98 |
+
--chip bm1684x \
|
99 |
+
$device_args \
|
100 |
+
--model embedding_cache.bmodel
|
101 |
+
|
102 |
+
rm *.npz
|
103 |
+
|
104 |
+
models=$models' '$outdir'/embedding.bmodel '$outdir'/embedding_cache.bmodel '
|
105 |
+
|
106 |
+
popd
|
107 |
+
|
108 |
+
echo $models
|
109 |
+
|
110 |
+
outdir=tmp/$mode"_"$num_device"dev"/lm_head
|
111 |
+
mkdir -p $outdir
|
112 |
+
pushd $outdir
|
113 |
+
|
114 |
+
model_transform.py \
|
115 |
+
--model_name lm_head \
|
116 |
+
--model_def ../../onnx/lm_head.onnx \
|
117 |
+
--mlir lm_head.mlir
|
118 |
+
|
119 |
+
model_deploy.py \
|
120 |
+
--mlir lm_head.mlir \
|
121 |
+
$quantize_args \
|
122 |
+
--quant_input \
|
123 |
+
--quant_output \
|
124 |
+
--chip bm1684x \
|
125 |
+
$device_args \
|
126 |
+
--model lm_head.bmodel
|
127 |
+
|
128 |
+
rm *.npz
|
129 |
+
|
130 |
+
models=${models}${outdir}'/lm_head.bmodel '
|
131 |
+
popd
|
132 |
+
|
133 |
+
echo $models
|
134 |
+
|
135 |
+
outdir=tmp/$mode"_"$num_device"dev"/block
|
136 |
+
mkdir -p $outdir
|
137 |
+
|
138 |
+
pushd $outdir
|
140 |
+
|
141 |
+
for ((i=0; i<=$num_layers; i++)); do
|
142 |
+
|
143 |
+
model_transform.py \
|
144 |
+
--model_name block_$i \
|
145 |
+
--model_def ../../onnx/block_$i.onnx \
|
146 |
+
--mlir block_$i.mlir
|
147 |
+
|
148 |
+
model_deploy.py \
|
149 |
+
--mlir block_$i.mlir \
|
150 |
+
$quantize_args \
|
151 |
+
--quant_input \
|
152 |
+
--quant_output \
|
153 |
+
--chip bm1684x \
|
154 |
+
$device_args \
|
155 |
+
--model block_$i.bmodel
|
156 |
+
|
157 |
+
model_transform.py \
|
158 |
+
--model_name block_cache_$i \
|
159 |
+
--model_def ../../onnx/block_cache_$i.onnx \
|
160 |
+
--mlir block_cache_$i.mlir
|
161 |
+
|
162 |
+
model_deploy.py \
|
163 |
+
--mlir block_cache_$i.mlir \
|
164 |
+
$quantize_args \
|
165 |
+
--quant_input \
|
166 |
+
--quant_output \
|
167 |
+
--chip bm1684x \
|
168 |
+
$device_args \
|
169 |
+
--model block_cache_$i.bmodel
|
170 |
+
|
171 |
+
rm *.npz
|
172 |
+
|
173 |
+
models=${models}${outdir}'/block_'$i'.bmodel '$outdir'/block_cache_'$i'.bmodel '
|
174 |
+
|
175 |
+
done
|
176 |
+
popd
|
177 |
+
echo $models
|
178 |
+
|
179 |
+
model_tool --combine $models -o $out_model
|
ChatGLM2/compile/export_onnx.py
ADDED
@@ -0,0 +1,176 @@
1 |
+
#!/usr/bin/env python3
|
2 |
+
# ==============================================================================
|
3 |
+
#
|
4 |
+
# Copyright (C) 2023 Sophgo Technologies Inc. All rights reserved.
|
5 |
+
#
|
6 |
+
# TPU-MLIR is licensed under the 2-Clause BSD License except for the
|
7 |
+
# third-party components.
|
8 |
+
#
|
9 |
+
# ==============================================================================
|
10 |
+
|
11 |
+
import os
|
12 |
+
import torch
|
13 |
+
import argparse
|
14 |
+
from tqdm import tqdm
|
15 |
+
from transformers import AutoModel, AutoTokenizer
|
16 |
+
|
17 |
+
parser = argparse.ArgumentParser(description='export onnx.')
|
18 |
+
parser.add_argument('--model_path', type=str, help='path to the torch model.')
|
19 |
+
|
20 |
+
args = parser.parse_args()
|
21 |
+
|
22 |
+
model_path = args.model_path
|
23 |
+
folder = f"./tmp/onnx"
|
24 |
+
|
25 |
+
origin_model = AutoModel.from_pretrained(
|
26 |
+
model_path, trust_remote_code=True).float().eval()
|
27 |
+
|
28 |
+
for param in origin_model.parameters():
|
29 |
+
param.requires_grad = False
|
30 |
+
|
31 |
+
config = origin_model.config
|
32 |
+
transformer = origin_model.transformer
|
33 |
+
layers = transformer.encoder.layers
|
34 |
+
|
35 |
+
SEQ_LENGTH = transformer.seq_length
|
36 |
+
NUM_LAYERS = config.num_layers
|
37 |
+
HIDDEN_SIZE = config.hidden_size
|
38 |
+
NUM_ATTENTION_HEADS = config.num_attention_heads
|
39 |
+
HEAD_DIM = HIDDEN_SIZE // NUM_ATTENTION_HEADS
|
40 |
+
|
41 |
+
print(f'Layers: {NUM_LAYERS}\nHidden size: {HIDDEN_SIZE}\n')
|
42 |
+
|
43 |
+
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
|
44 |
+
|
45 |
+
class Embedding(torch.nn.Module):
|
46 |
+
|
47 |
+
def __init__(self):
|
48 |
+
super().__init__()
|
49 |
+
|
50 |
+
def forward(self, input_ids):
|
51 |
+
return transformer.embedding.word_embeddings(input_ids)
|
52 |
+
|
53 |
+
|
54 |
+
class Block(torch.nn.Module):
|
55 |
+
|
56 |
+
def __init__(self, layer_id):
|
57 |
+
super().__init__()
|
58 |
+
self.layer_id = layer_id
|
59 |
+
self.layer = layers[layer_id]
|
60 |
+
|
61 |
+
def forward(self, hidden_states, position_ids, attention_mask):
|
62 |
+
rotary_pos_emb = transformer.rotary_pos_emb(SEQ_LENGTH)[position_ids]
|
63 |
+
rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous()
|
64 |
+
hidden_states, past_kv = self.layer(hidden_states,
|
65 |
+
attention_mask,
|
66 |
+
rotary_pos_emb=rotary_pos_emb)
|
67 |
+
return hidden_states, past_kv
|
68 |
+
|
69 |
+
|
70 |
+
class BlockCache(torch.nn.Module):
|
71 |
+
|
72 |
+
def __init__(self, layer_id):
|
73 |
+
super().__init__()
|
74 |
+
self.layer_id = layer_id
|
75 |
+
self.layer = layers[layer_id]
|
76 |
+
|
77 |
+
def forward(self, hidden_states, position_ids, attention_mask, past_k,
|
78 |
+
past_v):
|
79 |
+
rotary_pos_emb = transformer.rotary_pos_emb(SEQ_LENGTH)[position_ids]
|
80 |
+
rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous()
|
81 |
+
hidden_states, past_kv = self.layer(hidden_states,
|
82 |
+
attention_mask,
|
83 |
+
kv_cache=(past_k, past_v),
|
84 |
+
rotary_pos_emb=rotary_pos_emb)
|
85 |
+
present_k, present_v = past_kv
|
86 |
+
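# keep the returned K/V at a fixed SEQ_LENGTH (static shapes for compilation) by dropping the first cache position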
return hidden_states, present_k[1:], present_v[1:]
|
87 |
+
|
88 |
+
|
89 |
+
class LmHead(torch.nn.Module):
|
90 |
+
|
91 |
+
def __init__(self):
|
92 |
+
super().__init__()
|
93 |
+
|
94 |
+
def forward(self, hidden_states):
|
95 |
+
hidden_states = transformer.encoder.final_layernorm(hidden_states)
|
96 |
+
m_logits = transformer.output_layer(hidden_states)
|
97 |
+
_, token = torch.topk(m_logits, 1)
|
98 |
+
return token
|
99 |
+
|
100 |
+
|
101 |
+
def convert_block(layer_id):
|
102 |
+
model = Block(layer_id)
|
103 |
+
hidden_states = torch.randn((SEQ_LENGTH, 1, HIDDEN_SIZE))
|
104 |
+
position_ids = torch.tensor([range(SEQ_LENGTH)], dtype=torch.long)
|
105 |
+
attention_mask = -1000 * torch.ones((1, 1, SEQ_LENGTH, SEQ_LENGTH), dtype=torch.float32).triu(diagonal=1)
|
106 |
+
torch.onnx.export(
|
107 |
+
model, (hidden_states, position_ids, attention_mask),
|
108 |
+
f'{folder}/block_{layer_id}.onnx',
|
109 |
+
verbose=False,
|
110 |
+
input_names=['input_states', 'position_ids', 'attention_mask'],
|
111 |
+
output_names=['hidden_states', 'past_k', 'past_v'],
|
112 |
+
do_constant_folding=True,
|
113 |
+
opset_version=15)
|
114 |
+
|
115 |
+
|
116 |
+
def convert_block_cache(layer_id):
|
117 |
+
model = BlockCache(layer_id)
|
118 |
+
hidden_states = torch.randn((1, 1, HIDDEN_SIZE))
|
119 |
+
position_ids = torch.tensor([range(1)], dtype=torch.long)
|
120 |
+
attention_mask = -1000 * torch.ones((1, 1, 1, SEQ_LENGTH + 1), dtype=torch.float32).triu(diagonal=1)
|
121 |
+
past_k = torch.randn((SEQ_LENGTH, 1, 2, HEAD_DIM))
|
122 |
+
past_v = torch.randn((SEQ_LENGTH, 1, 2, HEAD_DIM))
|
123 |
+
|
124 |
+
torch.onnx.export(
|
125 |
+
model, (hidden_states, position_ids, attention_mask, past_k, past_v),
|
126 |
+
f'{folder}/block_cache_{layer_id}.onnx',
|
127 |
+
verbose=False,
|
128 |
+
input_names=[
|
129 |
+
'input_states', 'position_ids', 'attention_mask', 'history_k',
|
130 |
+
'history_v'
|
131 |
+
],
|
132 |
+
output_names=['hidden_states', 'past_k', 'past_v'],
|
133 |
+
do_constant_folding=True,
|
134 |
+
opset_version=15)
|
135 |
+
|
136 |
+
|
137 |
+
def convert_embedding():
|
138 |
+
model = Embedding()
|
139 |
+
input_ids = torch.tensor([range(SEQ_LENGTH)])
|
140 |
+
|
141 |
+
torch.onnx.export(model, (input_ids),
|
142 |
+
f'{folder}/embedding.onnx',
|
143 |
+
verbose=False,
|
144 |
+
input_names=['input_ids'],
|
145 |
+
output_names=['input_embed'],
|
146 |
+
do_constant_folding=True,
|
147 |
+
opset_version=15)
|
148 |
+
|
149 |
+
|
150 |
+
def convert_lm_head():
|
151 |
+
model = LmHead()
|
152 |
+
input = torch.randn(1, HIDDEN_SIZE)
|
153 |
+
|
154 |
+
torch.onnx.export(model, (input),
|
155 |
+
f'{folder}/lm_head.onnx',
|
156 |
+
verbose=False,
|
157 |
+
input_names=['hidden_states'],
|
158 |
+
output_names=['token'],
|
159 |
+
do_constant_folding=True,
|
160 |
+
opset_version=15)
|
161 |
+
|
162 |
+
# create folder to store onnx
|
163 |
+
if not os.path.exists(folder):
|
164 |
+
os.makedirs(folder)
|
165 |
+
|
166 |
+
# export models
|
167 |
+
print(f'Convert block & block_cache')
|
168 |
+
for i in tqdm(range(NUM_LAYERS)):
|
169 |
+
convert_block(i)
|
170 |
+
convert_block_cache(i)
|
171 |
+
|
172 |
+
print(f'Convert embedding')
|
173 |
+
convert_embedding()
|
174 |
+
|
175 |
+
print(f'Convert lm_head')
|
176 |
+
convert_lm_head()
|
ChatGLM2/compile/files/chatglm2-6b/config.json
ADDED
@@ -0,0 +1,42 @@
1 |
+
{
|
2 |
+
"_name_or_path": "THUDM/chatglm2-6b",
|
3 |
+
"model_type": "chatglm",
|
4 |
+
"architectures": [
|
5 |
+
"ChatGLMModel"
|
6 |
+
],
|
7 |
+
"auto_map": {
|
8 |
+
"AutoConfig": "configuration_chatglm.ChatGLMConfig",
|
9 |
+
"AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration",
|
10 |
+
"AutoModelForCausalLM": "modeling_chatglm.ChatGLMForConditionalGeneration",
|
11 |
+
"AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration",
|
12 |
+
"AutoModelForSequenceClassification": "modeling_chatglm.ChatGLMForSequenceClassification"
|
13 |
+
},
|
14 |
+
"add_bias_linear": false,
|
15 |
+
"add_qkv_bias": true,
|
16 |
+
"apply_query_key_layer_scaling": true,
|
17 |
+
"apply_residual_connection_post_layernorm": false,
|
18 |
+
"attention_dropout": 0.0,
|
19 |
+
"attention_softmax_in_fp32": true,
|
20 |
+
"bias_dropout_fusion": true,
|
21 |
+
"ffn_hidden_size": 13696,
|
22 |
+
"fp32_residual_connection": false,
|
23 |
+
"hidden_dropout": 0.0,
|
24 |
+
"hidden_size": 4096,
|
25 |
+
"kv_channels": 128,
|
26 |
+
"layernorm_epsilon": 1e-05,
|
27 |
+
"multi_query_attention": true,
|
28 |
+
"multi_query_group_num": 2,
|
29 |
+
"num_attention_heads": 32,
|
30 |
+
"num_layers": 28,
|
31 |
+
"original_rope": true,
|
32 |
+
"padded_vocab_size": 65024,
|
33 |
+
"post_layer_norm": true,
|
34 |
+
"rmsnorm": true,
|
35 |
+
"seq_length": 512,
|
36 |
+
"use_cache": true,
|
37 |
+
"torch_dtype": "float16",
|
38 |
+
"transformers_version": "4.27.1",
|
39 |
+
"tie_word_embeddings": false,
|
40 |
+
"eos_token_id": 2,
|
41 |
+
"pad_token_id": 0
|
42 |
+
}
|
ChatGLM2/compile/files/chatglm2-6b/modeling_chatglm.py
ADDED
@@ -0,0 +1,1285 @@
1 |
+
""" PyTorch ChatGLM model. """
|
2 |
+
|
3 |
+
import math
|
4 |
+
import copy
|
5 |
+
import warnings
|
6 |
+
import re
|
7 |
+
import sys
|
8 |
+
|
9 |
+
import torch
|
10 |
+
import torch.utils.checkpoint
|
11 |
+
import torch.nn.functional as F
|
12 |
+
from torch import nn
|
13 |
+
from torch.nn import CrossEntropyLoss, LayerNorm
|
14 |
+
from torch.nn import CrossEntropyLoss, LayerNorm, MSELoss, BCEWithLogitsLoss
|
15 |
+
from torch.nn.utils import skip_init
|
16 |
+
from typing import Optional, Tuple, Union, List, Callable, Dict, Any
|
17 |
+
|
18 |
+
from transformers.modeling_outputs import (
|
19 |
+
BaseModelOutputWithPast,
|
20 |
+
CausalLMOutputWithPast,
|
21 |
+
SequenceClassifierOutputWithPast,
|
22 |
+
)
|
23 |
+
from transformers.modeling_utils import PreTrainedModel
|
24 |
+
from transformers.utils import logging
|
25 |
+
from transformers.generation.logits_process import LogitsProcessor
|
26 |
+
from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList, GenerationConfig, ModelOutput
|
27 |
+
|
28 |
+
from .configuration_chatglm import ChatGLMConfig
|
29 |
+
|
30 |
+
# flags required to enable jit fusion kernels
|
31 |
+
|
32 |
+
if sys.platform != 'darwin':
|
33 |
+
torch._C._jit_set_profiling_mode(False)
|
34 |
+
torch._C._jit_set_profiling_executor(False)
|
35 |
+
torch._C._jit_override_can_fuse_on_cpu(True)
|
36 |
+
torch._C._jit_override_can_fuse_on_gpu(True)
|
37 |
+
|
38 |
+
logger = logging.get_logger(__name__)
|
39 |
+
|
40 |
+
_CHECKPOINT_FOR_DOC = "THUDM/ChatGLM2-6B"
|
41 |
+
_CONFIG_FOR_DOC = "ChatGLM6BConfig"
|
42 |
+
|
43 |
+
CHATGLM_6B_PRETRAINED_MODEL_ARCHIVE_LIST = [
|
44 |
+
"THUDM/chatglm2-6b",
|
45 |
+
# See all ChatGLM models at https://huggingface.co/models?filter=chatglm
|
46 |
+
]
|
47 |
+
|
48 |
+
|
49 |
+
def default_init(cls, *args, **kwargs):
|
50 |
+
return cls(*args, **kwargs)
|
51 |
+
|
52 |
+
|
53 |
+
class InvalidScoreLogitsProcessor(LogitsProcessor):
|
54 |
+
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
|
55 |
+
if torch.isnan(scores).any() or torch.isinf(scores).any():
|
56 |
+
scores.zero_()
|
57 |
+
scores[..., 5] = 5e4
|
58 |
+
return scores
|
59 |
+
|
60 |
+
|
61 |
+
class PrefixEncoder(torch.nn.Module):
|
62 |
+
"""
|
63 |
+
The torch.nn model to encode the prefix
|
64 |
+
Input shape: (batch-size, prefix-length)
|
65 |
+
Output shape: (batch-size, prefix-length, 2*layers*hidden)
|
66 |
+
"""
|
67 |
+
|
68 |
+
def __init__(self, config: ChatGLMConfig):
|
69 |
+
super().__init__()
|
70 |
+
self.prefix_projection = config.prefix_projection
|
71 |
+
if self.prefix_projection:
|
72 |
+
# Use a two-layer MLP to encode the prefix
|
73 |
+
kv_size = config.num_layers * config.kv_channels * config.multi_query_group_num * 2
|
74 |
+
self.embedding = torch.nn.Embedding(config.pre_seq_len, kv_size)
|
75 |
+
self.trans = torch.nn.Sequential(
|
76 |
+
torch.nn.Linear(kv_size, config.hidden_size),
|
77 |
+
torch.nn.Tanh(),
|
78 |
+
torch.nn.Linear(config.hidden_size, kv_size)
|
79 |
+
)
|
80 |
+
else:
|
81 |
+
self.embedding = torch.nn.Embedding(config.pre_seq_len,
|
82 |
+
config.num_layers * config.kv_channels * config.multi_query_group_num * 2)
|
83 |
+
|
84 |
+
def forward(self, prefix: torch.Tensor):
|
85 |
+
if self.prefix_projection:
|
86 |
+
prefix_tokens = self.embedding(prefix)
|
87 |
+
past_key_values = self.trans(prefix_tokens)
|
88 |
+
else:
|
89 |
+
past_key_values = self.embedding(prefix)
|
90 |
+
return past_key_values
|
91 |
+
|
92 |
+
|
93 |
+
def split_tensor_along_last_dim(
|
94 |
+
tensor: torch.Tensor,
|
95 |
+
num_partitions: int,
|
96 |
+
contiguous_split_chunks: bool = False,
|
97 |
+
) -> List[torch.Tensor]:
|
98 |
+
"""Split a tensor along its last dimension.
|
99 |
+
|
100 |
+
Arguments:
|
101 |
+
tensor: input tensor.
|
102 |
+
num_partitions: number of partitions to split the tensor
|
103 |
+
contiguous_split_chunks: If True, make each chunk contiguous
|
104 |
+
in memory.
|
105 |
+
|
106 |
+
Returns:
|
107 |
+
A list of Tensors
|
108 |
+
"""
|
109 |
+
# Get the size and dimension.
|
110 |
+
last_dim = tensor.dim() - 1
|
111 |
+
last_dim_size = tensor.size()[last_dim] // num_partitions
|
112 |
+
# Split.
|
113 |
+
tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
|
114 |
+
# Note: torch.split does not create contiguous tensors by default.
|
115 |
+
if contiguous_split_chunks:
|
116 |
+
return tuple(chunk.contiguous() for chunk in tensor_list)
|
117 |
+
|
118 |
+
return tensor_list
|
119 |
+
|
120 |
+
|
121 |
+
class RotaryEmbedding(nn.Module):
|
122 |
+
def __init__(self, dim, original_impl=False, device=None, dtype=None):
|
123 |
+
super().__init__()
|
124 |
+
inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, device=device).to(dtype=dtype) / dim))
|
125 |
+
self.register_buffer("inv_freq", inv_freq)
|
126 |
+
self.dim = dim
|
127 |
+
self.original_impl = original_impl
|
128 |
+
|
129 |
+
def forward_impl(
|
130 |
+
self, seq_len: int, n_elem: int, dtype: torch.dtype, device: torch.device, base: int = 10000
|
131 |
+
):
|
132 |
+
"""Enhanced Transformer with Rotary Position Embedding.
|
133 |
+
|
134 |
+
Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/
|
135 |
+
transformers/rope/__init__.py. MIT License:
|
136 |
+
https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license.
|
137 |
+
"""
|
138 |
+
# $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$
|
139 |
+
theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, dtype=dtype, device=device) / n_elem))
|
140 |
+
|
141 |
+
# Create position indexes `[0, 1, ..., seq_len - 1]`
|
142 |
+
seq_idx = torch.arange(seq_len, dtype=dtype, device=device)
|
143 |
+
|
144 |
+
# Calculate the product of position index and $\theta_i$
|
145 |
+
idx_theta = torch.outer(seq_idx, theta).float()
|
146 |
+
|
147 |
+
cache = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)], dim=-1)
|
148 |
+
|
149 |
+
# this is to mimic the behaviour of complex32, else we will get different results
|
150 |
+
if dtype in (torch.float16, torch.bfloat16, torch.int8):
|
151 |
+
cache = cache.bfloat16() if dtype == torch.bfloat16 else cache.half()
|
152 |
+
return cache
|
153 |
+
|
154 |
+
def forward(self, max_seq_len, offset=0):
|
155 |
+
return self.forward_impl(
|
156 |
+
max_seq_len, self.dim, dtype=self.inv_freq.dtype, device=self.inv_freq.device
|
157 |
+
)
|
158 |
+
|
159 |
+
|
160 |
+
@torch.jit.script
|
161 |
+
def apply_rotary_pos_emb(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Tensor:
|
162 |
+
# x: [sq, b, np, hn]
|
163 |
+
sq, b, np, hn = x.size(0), x.size(1), x.size(2), x.size(3)
|
164 |
+
rot_dim = rope_cache.shape[-2] * 2
|
165 |
+
x, x_pass = x[..., :rot_dim], x[..., rot_dim:]
|
166 |
+
# truncate to support variable sizes
|
167 |
+
rope_cache = rope_cache[:sq]
|
168 |
+
xshaped = x.reshape(sq, -1, np, rot_dim // 2, 2)
|
169 |
+
rope_cache = rope_cache.view(sq, -1, 1, xshaped.size(3), 2)
|
170 |
+
x_out2 = torch.stack(
|
171 |
+
[
|
172 |
+
xshaped[..., 0] * rope_cache[..., 0] - xshaped[..., 1] * rope_cache[..., 1],
|
173 |
+
xshaped[..., 1] * rope_cache[..., 0] + xshaped[..., 0] * rope_cache[..., 1],
|
174 |
+
],
|
175 |
+
-1,
|
176 |
+
)
|
177 |
+
x_out2 = x_out2.flatten(3)
|
178 |
+
return torch.cat((x_out2, x_pass), dim=-1)
|
179 |
+
|
180 |
+
|
181 |
+
class RMSNorm(torch.nn.Module):
|
182 |
+
def __init__(self, normalized_shape, eps=1e-5, device=None, dtype=None, **kwargs):
|
183 |
+
super().__init__()
|
184 |
+
self.weight = torch.nn.Parameter(torch.empty(normalized_shape, device=device, dtype=dtype))
|
185 |
+
self.eps = eps
|
186 |
+
|
187 |
+
def forward(self, hidden_states: torch.Tensor):
|
188 |
+
input_dtype = hidden_states.dtype
|
189 |
+
variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
|
190 |
+
hidden_states = hidden_states * torch.rsqrt(variance + self.eps)
|
191 |
+
|
192 |
+
return (self.weight * hidden_states).to(input_dtype)
|
193 |
+
|
194 |
+
|
195 |
+
class CoreAttention(torch.nn.Module):
|
196 |
+
def __init__(self, config: ChatGLMConfig, layer_number):
|
197 |
+
super(CoreAttention, self).__init__()
|
198 |
+
|
199 |
+
self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling
|
200 |
+
self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32
|
201 |
+
if self.apply_query_key_layer_scaling:
|
202 |
+
self.attention_softmax_in_fp32 = True
|
203 |
+
self.layer_number = max(1, layer_number)
|
204 |
+
|
205 |
+
projection_size = config.kv_channels * config.num_attention_heads
|
206 |
+
|
207 |
+
# Per attention head and per partition values.
|
208 |
+
self.hidden_size_per_partition = projection_size
|
209 |
+
self.hidden_size_per_attention_head = projection_size // config.num_attention_heads
|
210 |
+
self.num_attention_heads_per_partition = config.num_attention_heads
|
211 |
+
|
212 |
+
coeff = None
|
213 |
+
self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
|
214 |
+
if self.apply_query_key_layer_scaling:
|
215 |
+
coeff = self.layer_number
|
216 |
+
self.norm_factor *= coeff
|
217 |
+
self.coeff = coeff
|
218 |
+
|
219 |
+
self.attention_dropout = torch.nn.Dropout(config.attention_dropout)
|
220 |
+
|
221 |
+
def forward(self, query_layer, key_layer, value_layer, attention_mask):
|
222 |
+
pytorch_major_version = int(torch.__version__.split('.')[0])
|
223 |
+
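# deliberately disabled: the scaled_dot_product_attention branch cannot be exported to ONNX (see the README notes on source changes)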
if False:
|
224 |
+
query_layer, key_layer, value_layer = [k.permute(1, 2, 0, 3) for k in [query_layer, key_layer, value_layer]]
|
225 |
+
if attention_mask is None and query_layer.shape[2] == key_layer.shape[2]:
|
226 |
+
context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer,
|
227 |
+
is_causal=True)
|
228 |
+
else:
|
229 |
+
if attention_mask is not None:
|
230 |
+
attention_mask = ~attention_mask
|
231 |
+
context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer,
|
232 |
+
attention_mask)
|
233 |
+
context_layer = context_layer.permute(2, 0, 1, 3)
|
234 |
+
new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
|
235 |
+
context_layer = context_layer.reshape(*new_context_layer_shape)
|
236 |
+
else:
|
237 |
+
# Raw attention scores
|
238 |
+
|
239 |
+
# [b, np, sq, sk]
|
240 |
+
output_size = (query_layer.size(1), query_layer.size(2), query_layer.size(0), key_layer.size(0))
|
241 |
+
|
242 |
+
# [sq, b, np, hn] -> [sq, b * np, hn]
|
243 |
+
query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1)
|
244 |
+
# [sk, b, np, hn] -> [sk, b * np, hn]
|
245 |
+
key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1)
|
246 |
+
|
247 |
+
# preallocating input tensor: [b * np, sq, sk]
|
248 |
+
matmul_input_buffer = torch.empty(
|
249 |
+
output_size[0] * output_size[1], output_size[2], output_size[3], dtype=query_layer.dtype,
|
250 |
+
device=query_layer.device
|
251 |
+
)
|
252 |
+
|
253 |
+
# Raw attention scores. [b * np, sq, sk]
|
254 |
+
matmul_result = torch.baddbmm(
|
255 |
+
matmul_input_buffer,
|
256 |
+
query_layer.transpose(0, 1), # [b * np, sq, hn]
|
257 |
+
key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk]
|
258 |
+
beta=0.0,
|
259 |
+
alpha=(1.0 / self.norm_factor),
|
260 |
+
)
|
261 |
+
|
262 |
+
# change view to [b, np, sq, sk]
|
263 |
+
attention_scores = matmul_result.view(*output_size)
|
264 |
+
|
265 |
+
# ===========================
|
266 |
+
# Attention probs and dropout
|
267 |
+
# ===========================
|
268 |
+
|
269 |
+
# attention scores and attention mask [b, np, sq, sk]
|
270 |
+
if self.attention_softmax_in_fp32:
|
271 |
+
attention_scores = attention_scores.float()
|
272 |
+
if self.coeff is not None:
|
273 |
+
attention_scores = attention_scores * self.coeff
|
274 |
+
if attention_mask is None and attention_scores.shape[2] == attention_scores.shape[3]:
|
275 |
+
attention_mask = torch.ones(output_size[0], 1, output_size[2], output_size[3],
|
276 |
+
device=attention_scores.device, dtype=torch.bool)
|
277 |
+
attention_mask.tril_()
|
278 |
+
attention_mask = ~attention_mask
|
279 |
+
if attention_mask is not None:
|
280 |
+
attention_scores = attention_scores + attention_mask
|
281 |
+
attention_probs = F.softmax(attention_scores, dim=-1)
|
282 |
+
attention_probs = attention_probs.type_as(value_layer)
|
283 |
+
|
284 |
+
# This is actually dropping out entire tokens to attend to, which might
|
285 |
+
# seem a bit unusual, but is taken from the original Transformer paper.
|
286 |
+
attention_probs = self.attention_dropout(attention_probs)
|
287 |
+
# =========================
|
288 |
+
# Context layer. [sq, b, hp]
|
289 |
+
# =========================
|
290 |
+
|
291 |
+
# value_layer -> context layer.
|
292 |
+
# [sk, b, np, hn] --> [b, np, sq, hn]
|
293 |
+
|
294 |
+
# context layer shape: [b, np, sq, hn]
|
295 |
+
output_size = (value_layer.size(1), value_layer.size(2), query_layer.size(0), value_layer.size(3))
|
296 |
+
# change view [sk, b * np, hn]
|
297 |
+
value_layer = value_layer.view(value_layer.size(0), output_size[0] * output_size[1], -1)
|
298 |
+
# change view [b * np, sq, sk]
|
299 |
+
attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1)
|
300 |
+
# matmul: [b * np, sq, hn]
|
301 |
+
context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1))
|
302 |
+
# change view [b, np, sq, hn]
|
303 |
+
context_layer = context_layer.view(*output_size)
|
304 |
+
# [b, np, sq, hn] --> [sq, b, np, hn]
|
305 |
+
context_layer = context_layer.permute(2, 0, 1, 3).contiguous()
|
306 |
+
# [sq, b, np, hn] --> [sq, b, hp]
|
307 |
+
new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
|
308 |
+
context_layer = context_layer.view(*new_context_layer_shape)
|
309 |
+
|
310 |
+
return context_layer
|
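
The manual branch above is plain scaled dot-product attention spelled out with baddbmm/bmm. A minimal standalone sketch of the same computation (all shapes illustrative; norm_factor is taken as sqrt of the head size, matching the alpha=1.0/self.norm_factor scaling above):

import math
import torch

sq, b, np_, hn = 4, 1, 2, 8                      # toy [sq, b, np, hn] sizes
q, k, v = (torch.randn(sq, b, np_, hn) for _ in range(3))
norm_factor = math.sqrt(hn)

q2 = q.view(sq, b * np_, hn).transpose(0, 1)     # [b*np, sq, hn]
k2 = k.view(sq, b * np_, hn).transpose(0, 1)
scores = torch.bmm(q2, k2.transpose(1, 2)) / norm_factor   # [b*np, sq, sk]

causal = torch.ones(sq, sq, dtype=torch.bool).tril()
scores = scores.masked_fill(~causal, float("-inf"))        # mask future keys

probs = torch.softmax(scores, dim=-1)
v2 = v.view(sq, b * np_, hn).transpose(0, 1)
ctx = torch.bmm(probs, v2).view(b, np_, sq, hn).permute(2, 0, 1, 3)  # [sq, b, np, hn]
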
311 |
+
|
312 |
+
|
313 |
+
class SelfAttention(torch.nn.Module):
|
314 |
+
"""Parallel self-attention layer abstract class.
|
315 |
+
|
316 |
+
Self-attention layer takes input with size [s, b, h]
|
317 |
+
and returns output of the same size.
|
318 |
+
"""
|
319 |
+
|
320 |
+
def __init__(self, config: ChatGLMConfig, layer_number, device=None):
|
321 |
+
super(SelfAttention, self).__init__()
|
322 |
+
self.layer_number = max(1, layer_number)
|
323 |
+
|
324 |
+
self.projection_size = config.kv_channels * config.num_attention_heads
|
325 |
+
|
326 |
+
# Per attention head and per partition values.
|
327 |
+
self.hidden_size_per_attention_head = self.projection_size // config.num_attention_heads
|
328 |
+
self.num_attention_heads_per_partition = config.num_attention_heads
|
329 |
+
|
330 |
+
self.multi_query_attention = config.multi_query_attention
|
331 |
+
self.qkv_hidden_size = 3 * self.projection_size
|
332 |
+
if self.multi_query_attention:
|
333 |
+
self.num_multi_query_groups_per_partition = config.multi_query_group_num
|
334 |
+
self.qkv_hidden_size = (
|
335 |
+
self.projection_size + 2 * self.hidden_size_per_attention_head * config.multi_query_group_num
|
336 |
+
)
|
337 |
+
self.query_key_value = nn.Linear(config.hidden_size, self.qkv_hidden_size,
|
338 |
+
bias=config.add_bias_linear or config.add_qkv_bias,
|
339 |
+
device=device, **_config_to_kwargs(config)
|
340 |
+
)
|
341 |
+
|
342 |
+
self.core_attention = CoreAttention(config, self.layer_number)
|
343 |
+
|
344 |
+
# Output.
|
345 |
+
self.dense = nn.Linear(self.projection_size, config.hidden_size, bias=config.add_bias_linear,
|
346 |
+
device=device, **_config_to_kwargs(config)
|
347 |
+
)
|
348 |
+
|
349 |
+
def _allocate_memory(self, inference_max_sequence_len, batch_size, device=None, dtype=None):
|
350 |
+
if self.multi_query_attention:
|
351 |
+
num_attention_heads = self.num_multi_query_groups_per_partition
|
352 |
+
else:
|
353 |
+
num_attention_heads = self.num_attention_heads_per_partition
|
354 |
+
return torch.empty(
|
355 |
+
inference_max_sequence_len,
|
356 |
+
batch_size,
|
357 |
+
num_attention_heads,
|
358 |
+
self.hidden_size_per_attention_head,
|
359 |
+
dtype=dtype,
|
360 |
+
device=device,
|
361 |
+
)
|
362 |
+
|
363 |
+
def forward(
|
364 |
+
self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True
|
365 |
+
):
|
366 |
+
# hidden_states: [sq, b, h]
|
367 |
+
|
368 |
+
# =================================================
|
369 |
+
# Pre-allocate memory for key-values for inference.
|
370 |
+
# =================================================
|
371 |
+
# =====================
|
372 |
+
# Query, Key, and Value
|
373 |
+
# =====================
|
374 |
+
|
375 |
+
# Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)]
|
376 |
+
mixed_x_layer = self.query_key_value(hidden_states)
|
377 |
+
|
378 |
+
if self.multi_query_attention:
|
379 |
+
(query_layer, key_layer, value_layer) = mixed_x_layer.split(
|
380 |
+
[
|
381 |
+
self.num_attention_heads_per_partition * self.hidden_size_per_attention_head,
|
382 |
+
self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head,
|
383 |
+
self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head,
|
384 |
+
],
|
385 |
+
dim=-1,
|
386 |
+
)
|
387 |
+
query_layer = query_layer.view(
|
388 |
+
query_layer.size()[:-1] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)
|
389 |
+
)
|
390 |
+
key_layer = key_layer.view(
|
391 |
+
key_layer.size()[:-1] + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head)
|
392 |
+
)
|
393 |
+
value_layer = value_layer.view(
|
394 |
+
value_layer.size()[:-1]
|
395 |
+
+ (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head)
|
396 |
+
)
|
397 |
+
else:
|
398 |
+
new_tensor_shape = mixed_x_layer.size()[:-1] + \
|
399 |
+
(self.num_attention_heads_per_partition,
|
400 |
+
3 * self.hidden_size_per_attention_head)
|
401 |
+
mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)
|
402 |
+
|
403 |
+
# [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn]
|
404 |
+
(query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3)
|
405 |
+
|
406 |
+
# apply relative positional encoding (rotary embedding)
|
407 |
+
if rotary_pos_emb is not None:
|
408 |
+
query_layer = apply_rotary_pos_emb(query_layer, rotary_pos_emb)
|
409 |
+
key_layer = apply_rotary_pos_emb(key_layer, rotary_pos_emb)
|
410 |
+
|
411 |
+
# adjust key and value for inference
|
412 |
+
if kv_cache is not None:
|
413 |
+
cache_k, cache_v = kv_cache
|
414 |
+
key_layer = torch.cat((cache_k, key_layer), dim=0)
|
415 |
+
value_layer = torch.cat((cache_v, value_layer), dim=0)
|
416 |
+
if use_cache:
|
417 |
+
kv_cache = (key_layer, value_layer)
|
418 |
+
else:
|
419 |
+
kv_cache = None
|
420 |
+
|
421 |
+
if self.multi_query_attention:
|
422 |
+
key_layer = key_layer.unsqueeze(-2)
|
423 |
+
key_layer = key_layer.expand(
|
424 |
+
-1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1
|
425 |
+
)
|
426 |
+
key_layer = key_layer.contiguous().view(
|
427 |
+
key_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)
|
428 |
+
)
|
429 |
+
value_layer = value_layer.unsqueeze(-2)
|
430 |
+
value_layer = value_layer.expand(
|
431 |
+
-1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1
|
432 |
+
)
|
433 |
+
value_layer = value_layer.contiguous().view(
|
434 |
+
value_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)
|
435 |
+
)
|
436 |
+
|
437 |
+
# ==================================
|
438 |
+
# core attention computation
|
439 |
+
# ==================================
|
440 |
+
|
441 |
+
context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask)
|
442 |
+
|
443 |
+
# =================
|
444 |
+
# Output. [sq, b, h]
|
445 |
+
# =================
|
446 |
+
|
447 |
+
output = self.dense(context_layer)
|
448 |
+
|
449 |
+
return output, kv_cache
|
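
When multi_query_attention is on, the forward above broadcasts each of the few KV heads across the query heads that share it before core attention runs. The same expand/view in isolation (sizes hypothetical: 8 query heads sharing 2 KV groups):

import torch

sq, b, num_heads, num_groups, hn = 4, 1, 8, 2, 16
key = torch.randn(sq, b, num_groups, hn)              # [sq, b, group, hn]
key = key.unsqueeze(-2)                               # [sq, b, group, 1, hn]
key = key.expand(-1, -1, -1, num_heads // num_groups, -1)
key = key.contiguous().view(sq, b, num_heads, hn)     # [sq, b, np, hn]
assert key.shape == (sq, b, num_heads, hn)
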
450 |
+
|
451 |
+
|
452 |
+
def _config_to_kwargs(args):
|
453 |
+
common_kwargs = {
|
454 |
+
"dtype": args.torch_dtype,
|
455 |
+
}
|
456 |
+
return common_kwargs
|
457 |
+
|
458 |
+
|
459 |
+
class MLP(torch.nn.Module):
|
460 |
+
"""MLP.
|
461 |
+
|
462 |
+
MLP takes an input with hidden size h, projects it to a 4*h
|
463 |
+
hidden dimension, applies a nonlinear transformation, and projects the
|
464 |
+
state back to hidden size h.
|
465 |
+
"""
|
466 |
+
|
467 |
+
def __init__(self, config: ChatGLMConfig, device=None):
|
468 |
+
super(MLP, self).__init__()
|
469 |
+
|
470 |
+
self.add_bias = config.add_bias_linear
|
471 |
+
|
472 |
+
# Project to 4h. If using swiglu, double the output width; see https://arxiv.org/pdf/2002.05202.pdf
|
473 |
+
self.dense_h_to_4h = nn.Linear(
|
474 |
+
config.hidden_size,
|
475 |
+
config.ffn_hidden_size * 2,
|
476 |
+
bias=self.add_bias,
|
477 |
+
device=device,
|
478 |
+
**_config_to_kwargs(config)
|
479 |
+
)
|
480 |
+
|
481 |
+
def swiglu(x):
|
482 |
+
x = torch.chunk(x, 2, dim=-1)
|
483 |
+
return F.silu(x[0]) * x[1]
|
484 |
+
|
485 |
+
self.activation_func = swiglu
|
486 |
+
|
487 |
+
# Project back to h.
|
488 |
+
self.dense_4h_to_h = nn.Linear(
|
489 |
+
config.ffn_hidden_size,
|
490 |
+
config.hidden_size,
|
491 |
+
bias=self.add_bias,
|
492 |
+
device=device,
|
493 |
+
**_config_to_kwargs(config)
|
494 |
+
)
|
495 |
+
|
496 |
+
def forward(self, hidden_states):
|
497 |
+
# [s, b, 4hp]
|
498 |
+
intermediate_parallel = self.dense_h_to_4h(hidden_states)
|
499 |
+
intermediate_parallel = self.activation_func(intermediate_parallel)
|
500 |
+
# [s, b, h]
|
501 |
+
output = self.dense_4h_to_h(intermediate_parallel)
|
502 |
+
return output
|
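
To make the SwiGLU wiring concrete: dense_h_to_4h emits 2*ffn_hidden_size channels, swiglu splits them into a gate half and a value half, and dense_4h_to_h maps the product back to hidden_size. A shape walk-through with illustrative sizes:

import torch
import torch.nn.functional as F

hidden, ffn = 64, 128                              # illustrative sizes
x = torch.randn(3, 1, hidden)                      # [s, b, h]
w_in = torch.nn.Linear(hidden, ffn * 2, bias=False)
w_out = torch.nn.Linear(ffn, hidden, bias=False)

gate, value = torch.chunk(w_in(x), 2, dim=-1)      # two [s, b, ffn] halves
y = w_out(F.silu(gate) * value)                    # back to [s, b, h]
assert y.shape == x.shape
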
503 |
+
|
504 |
+
|
505 |
+
class GLMBlock(torch.nn.Module):
|
506 |
+
"""A single transformer layer.
|
507 |
+
|
508 |
+
Transformer layer takes input with size [s, b, h] and returns an
|
509 |
+
output of the same size.
|
510 |
+
"""
|
511 |
+
|
512 |
+
def __init__(self, config: ChatGLMConfig, layer_number, device=None):
|
513 |
+
super(GLMBlock, self).__init__()
|
514 |
+
self.layer_number = layer_number
|
515 |
+
|
516 |
+
self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm
|
517 |
+
|
518 |
+
self.fp32_residual_connection = config.fp32_residual_connection
|
519 |
+
|
520 |
+
LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm
|
521 |
+
# Layernorm on the input data.
|
522 |
+
self.input_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device,
|
523 |
+
dtype=config.torch_dtype)
|
524 |
+
|
525 |
+
# Self attention.
|
526 |
+
self.self_attention = SelfAttention(config, layer_number, device=device)
|
527 |
+
self.hidden_dropout = config.hidden_dropout
|
528 |
+
|
529 |
+
# Layernorm on the attention output
|
530 |
+
self.post_attention_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device,
|
531 |
+
dtype=config.torch_dtype)
|
532 |
+
|
533 |
+
# MLP
|
534 |
+
self.mlp = MLP(config, device=device)
|
535 |
+
|
536 |
+
def forward(
|
537 |
+
self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True,
|
538 |
+
):
|
539 |
+
# hidden_states: [s, b, h]
|
540 |
+
|
541 |
+
# Layer norm at the beginning of the transformer layer.
|
542 |
+
layernorm_output = self.input_layernorm(hidden_states)
|
543 |
+
# Self attention.
|
544 |
+
attention_output, kv_cache = self.self_attention(
|
545 |
+
layernorm_output,
|
546 |
+
attention_mask,
|
547 |
+
rotary_pos_emb,
|
548 |
+
kv_cache=kv_cache,
|
549 |
+
use_cache=use_cache
|
550 |
+
)
|
551 |
+
|
552 |
+
# Residual connection.
|
553 |
+
if self.apply_residual_connection_post_layernorm:
|
554 |
+
residual = layernorm_output
|
555 |
+
else:
|
556 |
+
residual = hidden_states
|
557 |
+
|
558 |
+
layernorm_input = torch.nn.functional.dropout(attention_output, p=self.hidden_dropout, training=self.training)
|
559 |
+
layernorm_input = residual + layernorm_input
|
560 |
+
|
561 |
+
# Layer norm post the self attention.
|
562 |
+
layernorm_output = self.post_attention_layernorm(layernorm_input)
|
563 |
+
|
564 |
+
# MLP.
|
565 |
+
mlp_output = self.mlp(layernorm_output)
|
566 |
+
|
567 |
+
# Second residual connection.
|
568 |
+
if self.apply_residual_connection_post_layernorm:
|
569 |
+
residual = layernorm_output
|
570 |
+
else:
|
571 |
+
residual = layernorm_input
|
572 |
+
|
573 |
+
output = torch.nn.functional.dropout(mlp_output, p=self.hidden_dropout, training=self.training)
|
574 |
+
output = residual + output
|
575 |
+
|
576 |
+
return output, kv_cache
|
577 |
+
|
578 |
+
|
579 |
+
class GLMTransformer(torch.nn.Module):
|
580 |
+
"""Transformer class."""
|
581 |
+
|
582 |
+
def __init__(self, config: ChatGLMConfig, device=None):
|
583 |
+
super(GLMTransformer, self).__init__()
|
584 |
+
|
585 |
+
self.fp32_residual_connection = config.fp32_residual_connection
|
586 |
+
self.post_layer_norm = config.post_layer_norm
|
587 |
+
|
588 |
+
# Number of layers.
|
589 |
+
self.num_layers = config.num_layers
|
590 |
+
|
591 |
+
# Transformer layers.
|
592 |
+
def build_layer(layer_number):
|
593 |
+
return GLMBlock(config, layer_number, device=device)
|
594 |
+
|
595 |
+
self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(self.num_layers)])
|
596 |
+
|
597 |
+
if self.post_layer_norm:
|
598 |
+
LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm
|
599 |
+
# Final layer norm before output.
|
600 |
+
self.final_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device,
|
601 |
+
dtype=config.torch_dtype)
|
602 |
+
|
603 |
+
self.gradient_checkpointing = False
|
604 |
+
|
605 |
+
def _get_layer(self, layer_number):
|
606 |
+
return self.layers[layer_number]
|
607 |
+
|
608 |
+
def forward(
|
609 |
+
self, hidden_states, attention_mask, rotary_pos_emb, kv_caches=None,
|
610 |
+
use_cache: Optional[bool] = True,
|
611 |
+
output_hidden_states: Optional[bool] = False,
|
612 |
+
):
|
613 |
+
if not kv_caches:
|
614 |
+
kv_caches = [None for _ in range(self.num_layers)]
|
615 |
+
presents = () if use_cache else None
|
616 |
+
if self.gradient_checkpointing and self.training:
|
617 |
+
if use_cache:
|
618 |
+
logger.warning_once(
|
619 |
+
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
|
620 |
+
)
|
621 |
+
use_cache = False
|
622 |
+
|
623 |
+
all_self_attentions = None
|
624 |
+
all_hidden_states = () if output_hidden_states else None
|
625 |
+
for index in range(self.num_layers):
|
626 |
+
if output_hidden_states:
|
627 |
+
all_hidden_states = all_hidden_states + (hidden_states,)
|
628 |
+
|
629 |
+
layer = self._get_layer(index)
|
630 |
+
if self.gradient_checkpointing and self.training:
|
631 |
+
layer_ret = torch.utils.checkpoint.checkpoint(
|
632 |
+
layer,
|
633 |
+
hidden_states,
|
634 |
+
attention_mask,
|
635 |
+
rotary_pos_emb,
|
636 |
+
kv_caches[index],
|
637 |
+
use_cache
|
638 |
+
)
|
639 |
+
else:
|
640 |
+
layer_ret = layer(
|
641 |
+
hidden_states,
|
642 |
+
attention_mask,
|
643 |
+
rotary_pos_emb,
|
644 |
+
kv_cache=kv_caches[index],
|
645 |
+
use_cache=use_cache
|
646 |
+
)
|
647 |
+
hidden_states, kv_cache = layer_ret
|
648 |
+
if use_cache:
|
649 |
+
presents = presents + (kv_cache,)
|
650 |
+
|
651 |
+
if output_hidden_states:
|
652 |
+
all_hidden_states = all_hidden_states + (hidden_states,)
|
653 |
+
|
654 |
+
# Final layer norm.
|
655 |
+
if self.post_layer_norm:
|
656 |
+
hidden_states = self.final_layernorm(hidden_states)
|
657 |
+
|
658 |
+
return hidden_states, presents, all_hidden_states, all_self_attentions
|
659 |
+
|
660 |
+
|
661 |
+
class ChatGLMPreTrainedModel(PreTrainedModel):
|
662 |
+
"""
|
663 |
+
An abstract class to handle weights initialization and
|
664 |
+
a simple interface for downloading and loading pretrained models.
|
665 |
+
"""
|
666 |
+
|
667 |
+
is_parallelizable = False
|
668 |
+
supports_gradient_checkpointing = True
|
669 |
+
config_class = ChatGLMConfig
|
670 |
+
base_model_prefix = "transformer"
|
671 |
+
_no_split_modules = ["GLMBlock"]
|
672 |
+
|
673 |
+
def _init_weights(self, module: nn.Module):
|
674 |
+
"""Initialize the weights."""
|
675 |
+
return
|
676 |
+
|
677 |
+
def get_masks(self, input_ids, past_key_values, padding_mask=None):
|
678 |
+
batch_size, seq_length = input_ids.shape
|
679 |
+
full_attention_mask = torch.ones(batch_size, seq_length, seq_length, device=input_ids.device)
|
680 |
+
full_attention_mask.tril_()
|
681 |
+
past_length = 0
|
682 |
+
if past_key_values:
|
683 |
+
past_length = past_key_values[0][0].shape[0]
|
684 |
+
if past_length:
|
685 |
+
full_attention_mask = torch.cat((torch.ones(batch_size, seq_length, past_length,
|
686 |
+
device=input_ids.device), full_attention_mask), dim=-1)
|
687 |
+
if padding_mask is not None:
|
688 |
+
full_attention_mask = full_attention_mask * padding_mask.unsqueeze(1)
|
689 |
+
if not past_length and padding_mask is not None:
|
690 |
+
full_attention_mask -= padding_mask.unsqueeze(-1) - 1
|
691 |
+
full_attention_mask = (full_attention_mask < 0.5).bool()
|
692 |
+
full_attention_mask.unsqueeze_(1)
|
693 |
+
return full_attention_mask
|
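
Note the inversion at the end of get_masks: because of the < 0.5 comparison, the returned mask is True where attention is blocked. With batch 1, seq_length 3, no cache and no padding, it effectively yields the strict upper triangle:

import torch

full = torch.ones(1, 3, 3).tril_()
full = (full < 0.5).unsqueeze(1)   # True = position is masked out
print(full[0, 0])
# tensor([[False,  True,  True],
#         [False, False,  True],
#         [False, False, False]])
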
694 |
+
|
695 |
+
def get_position_ids(self, input_ids, device):
|
696 |
+
batch_size, seq_length = input_ids.shape
|
697 |
+
position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1)
|
698 |
+
return position_ids
|
699 |
+
|
700 |
+
def _set_gradient_checkpointing(self, module, value=False):
|
701 |
+
if isinstance(module, GLMTransformer):
|
702 |
+
module.gradient_checkpointing = value
|
703 |
+
|
704 |
+
|
705 |
+
class Embedding(torch.nn.Module):
|
706 |
+
"""Language model embeddings."""
|
707 |
+
|
708 |
+
def __init__(self, config: ChatGLMConfig, device=None):
|
709 |
+
super(Embedding, self).__init__()
|
710 |
+
|
711 |
+
self.hidden_size = config.hidden_size
|
712 |
+
# Word embeddings (parallel).
|
713 |
+
self.word_embeddings = nn.Embedding(
|
714 |
+
config.padded_vocab_size,
|
715 |
+
self.hidden_size,
|
716 |
+
dtype=config.torch_dtype,
|
717 |
+
device=device
|
718 |
+
)
|
719 |
+
self.fp32_residual_connection = config.fp32_residual_connection
|
720 |
+
|
721 |
+
def forward(self, input_ids):
|
722 |
+
# Embeddings.
|
723 |
+
words_embeddings = self.word_embeddings(input_ids)
|
724 |
+
embeddings = words_embeddings
|
725 |
+
# Data format change to avoid explicit transposes: [b s h] --> [s b h].
|
726 |
+
embeddings = embeddings.transpose(0, 1).contiguous()
|
727 |
+
# If the fp32 residual connection flag is set, convert to float.
|
728 |
+
if self.fp32_residual_connection:
|
729 |
+
embeddings = embeddings.float()
|
730 |
+
return embeddings
|
731 |
+
|
732 |
+
|
733 |
+
class ChatGLMModel(ChatGLMPreTrainedModel):
|
734 |
+
def __init__(self, config: ChatGLMConfig, device=None, empty_init=True):
|
735 |
+
super().__init__(config)
|
736 |
+
if empty_init:
|
737 |
+
init_method = skip_init
|
738 |
+
else:
|
739 |
+
init_method = default_init
|
740 |
+
init_kwargs = {}
|
741 |
+
if device is not None:
|
742 |
+
init_kwargs["device"] = device
|
743 |
+
self.embedding = init_method(Embedding, config, **init_kwargs)
|
744 |
+
self.num_layers = config.num_layers
|
745 |
+
self.multi_query_group_num = config.multi_query_group_num
|
746 |
+
self.kv_channels = config.kv_channels
|
747 |
+
|
748 |
+
# Rotary positional embeddings
|
749 |
+
self.seq_length = config.seq_length
|
750 |
+
rotary_dim = (
|
751 |
+
config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels
|
752 |
+
)
|
753 |
+
|
754 |
+
self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, original_impl=config.original_rope, device=device,
|
755 |
+
dtype=config.torch_dtype)
|
756 |
+
self.encoder = init_method(GLMTransformer, config, **init_kwargs)
|
757 |
+
self.output_layer = init_method(nn.Linear, config.hidden_size, config.padded_vocab_size, bias=False,
|
758 |
+
dtype=config.torch_dtype, **init_kwargs)
|
759 |
+
self.pre_seq_len = config.pre_seq_len
|
760 |
+
self.prefix_projection = config.prefix_projection
|
761 |
+
if self.pre_seq_len is not None:
|
762 |
+
for param in self.parameters():
|
763 |
+
param.requires_grad = False
|
764 |
+
self.prefix_tokens = torch.arange(self.pre_seq_len).long()
|
765 |
+
self.prefix_encoder = PrefixEncoder(config)
|
766 |
+
self.dropout = torch.nn.Dropout(0.1)
|
767 |
+
|
768 |
+
def get_input_embeddings(self):
|
769 |
+
return self.embedding.word_embeddings
|
770 |
+
|
771 |
+
def get_prompt(self, batch_size, device, dtype=torch.half):
|
772 |
+
prefix_tokens = self.prefix_tokens.unsqueeze(0).expand(batch_size, -1).to(device)
|
773 |
+
past_key_values = self.prefix_encoder(prefix_tokens).type(dtype)
|
774 |
+
past_key_values = past_key_values.view(
|
775 |
+
batch_size,
|
776 |
+
self.pre_seq_len,
|
777 |
+
self.num_layers * 2,
|
778 |
+
self.multi_query_group_num,
|
779 |
+
self.kv_channels
|
780 |
+
)
|
781 |
+
# seq_len, b, nh, hidden_size
|
782 |
+
past_key_values = self.dropout(past_key_values)
|
783 |
+
past_key_values = past_key_values.permute([2, 1, 0, 3, 4]).split(2)
|
784 |
+
return past_key_values
|
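
get_prompt reshapes the learned prefix into num_layers (key, value) pairs laid out like a real KV cache. A shape sketch with toy sizes (all hypothetical):

import torch

batch, pre_seq_len, num_layers, groups, kv_channels = 1, 4, 2, 2, 8
flat = torch.randn(batch, pre_seq_len, num_layers * 2 * groups * kv_channels)
kv = flat.view(batch, pre_seq_len, num_layers * 2, groups, kv_channels)
kv = kv.permute([2, 1, 0, 3, 4]).split(2)   # num_layers chunks of (k, v)
assert len(kv) == num_layers
assert kv[0].shape == (2, pre_seq_len, batch, groups, kv_channels)
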
785 |
+
|
786 |
+
def forward(
|
787 |
+
self,
|
788 |
+
input_ids,
|
789 |
+
position_ids: Optional[torch.Tensor] = None,
|
790 |
+
attention_mask: Optional[torch.BoolTensor] = None,
|
791 |
+
full_attention_mask: Optional[torch.BoolTensor] = None,
|
792 |
+
past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
|
793 |
+
inputs_embeds: Optional[torch.Tensor] = None,
|
794 |
+
use_cache: Optional[bool] = None,
|
795 |
+
output_hidden_states: Optional[bool] = None,
|
796 |
+
return_dict: Optional[bool] = None,
|
797 |
+
):
|
798 |
+
output_hidden_states = (
|
799 |
+
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
800 |
+
)
|
801 |
+
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
802 |
+
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
803 |
+
|
804 |
+
batch_size, seq_length = input_ids.shape
|
805 |
+
|
806 |
+
if inputs_embeds is None:
|
807 |
+
inputs_embeds = self.embedding(input_ids)
|
808 |
+
|
809 |
+
if self.pre_seq_len is not None:
|
810 |
+
if past_key_values is None:
|
811 |
+
past_key_values = self.get_prompt(batch_size=batch_size, device=input_ids.device,
|
812 |
+
dtype=inputs_embeds.dtype)
|
813 |
+
if attention_mask is not None:
|
814 |
+
attention_mask = torch.cat([attention_mask.new_ones((batch_size, self.pre_seq_len)),
|
815 |
+
attention_mask], dim=-1)
|
816 |
+
|
817 |
+
if full_attention_mask is None:
|
818 |
+
if (attention_mask is not None and not attention_mask.all()) or (past_key_values and seq_length != 1):
|
819 |
+
full_attention_mask = self.get_masks(input_ids, past_key_values, padding_mask=attention_mask)
|
820 |
+
|
821 |
+
# Rotary positional embeddings
|
822 |
+
rotary_pos_emb = self.rotary_pos_emb(self.seq_length)
|
823 |
+
if position_ids is not None:
|
824 |
+
rotary_pos_emb = rotary_pos_emb[position_ids]
|
825 |
+
else:
|
826 |
+
rotary_pos_emb = rotary_pos_emb[None, :seq_length]
|
827 |
+
rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous()
|
828 |
+
|
829 |
+
# Run encoder.
|
830 |
+
hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder(
|
831 |
+
inputs_embeds, full_attention_mask, rotary_pos_emb=rotary_pos_emb,
|
832 |
+
kv_caches=past_key_values, use_cache=use_cache, output_hidden_states=output_hidden_states
|
833 |
+
)
|
834 |
+
|
835 |
+
if not return_dict:
|
836 |
+
return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)
|
837 |
+
|
838 |
+
return BaseModelOutputWithPast(
|
839 |
+
last_hidden_state=hidden_states,
|
840 |
+
past_key_values=presents,
|
841 |
+
hidden_states=all_hidden_states,
|
842 |
+
attentions=all_self_attentions,
|
843 |
+
)
|
844 |
+
|
845 |
+
def quantize(self, weight_bit_width: int):
|
846 |
+
from .quantization import quantize
|
847 |
+
quantize(self.encoder, weight_bit_width)
|
848 |
+
return self
|
849 |
+
|
850 |
+
|
851 |
+
class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
|
852 |
+
def __init__(self, config: ChatGLMConfig, empty_init=True, device=None):
|
853 |
+
super().__init__(config)
|
854 |
+
|
855 |
+
self.max_sequence_length = config.max_length
|
856 |
+
self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device)
|
857 |
+
self.config = config
|
858 |
+
self.quantized = False
|
859 |
+
|
860 |
+
if self.config.quantization_bit:
|
861 |
+
self.quantize(self.config.quantization_bit, empty_init=True)
|
862 |
+
|
863 |
+
def _update_model_kwargs_for_generation(
|
864 |
+
self,
|
865 |
+
outputs: ModelOutput,
|
866 |
+
model_kwargs: Dict[str, Any],
|
867 |
+
is_encoder_decoder: bool = False,
|
868 |
+
standardize_cache_format: bool = False,
|
869 |
+
) -> Dict[str, Any]:
|
870 |
+
# update past_key_values
|
871 |
+
model_kwargs["past_key_values"] = self._extract_past_from_model_output(
|
872 |
+
outputs, standardize_cache_format=standardize_cache_format
|
873 |
+
)
|
874 |
+
|
875 |
+
# update attention mask
|
876 |
+
if "attention_mask" in model_kwargs:
|
877 |
+
attention_mask = model_kwargs["attention_mask"]
|
878 |
+
model_kwargs["attention_mask"] = torch.cat(
|
879 |
+
[attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
|
880 |
+
)
|
881 |
+
|
882 |
+
# update position ids
|
883 |
+
if "position_ids" in model_kwargs:
|
884 |
+
position_ids = model_kwargs["position_ids"]
|
885 |
+
new_position_id = position_ids[..., -1:].clone()
|
886 |
+
new_position_id += 1
|
887 |
+
model_kwargs["position_ids"] = torch.cat(
|
888 |
+
[position_ids, new_position_id], dim=-1
|
889 |
+
)
|
890 |
+
|
891 |
+
model_kwargs["is_first_forward"] = False
|
892 |
+
return model_kwargs
|
893 |
+
|
894 |
+
def prepare_inputs_for_generation(
|
895 |
+
self,
|
896 |
+
input_ids: torch.LongTensor,
|
897 |
+
past_key_values: Optional[torch.Tensor] = None,
|
898 |
+
attention_mask: Optional[torch.Tensor] = None,
|
899 |
+
position_ids: Optional[torch.Tensor] = None,
|
900 |
+
use_cache: Optional[bool] = None,
|
901 |
+
is_first_forward: bool = True,
|
902 |
+
**kwargs
|
903 |
+
) -> dict:
|
904 |
+
# only last token for input_ids if past is not None
|
905 |
+
if position_ids is None:
|
906 |
+
position_ids = self.get_position_ids(input_ids, device=input_ids.device)
|
907 |
+
if not is_first_forward:
|
908 |
+
if past_key_values is not None:
|
909 |
+
position_ids = position_ids[..., -1:]
|
910 |
+
input_ids = input_ids[:, -1:]
|
911 |
+
return {
|
912 |
+
"input_ids": input_ids,
|
913 |
+
"past_key_values": past_key_values,
|
914 |
+
"position_ids": position_ids,
|
915 |
+
"attention_mask": attention_mask,
|
916 |
+
"return_last_logit": True,
|
917 |
+
"use_cache": use_cache
|
918 |
+
}
|
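
After the first forward, only the newest token and its position are fed in; everything earlier is carried by past_key_values. A sketch of the per-step bookkeeping that prepare_inputs_for_generation and _update_model_kwargs_for_generation perform together (token ids hypothetical):

import torch

input_ids = torch.tensor([[64790, 64792, 30910]])    # hypothetical ids
position_ids = torch.arange(input_ids.shape[1]).unsqueeze(0)

step_ids = input_ids[:, -1:]        # decode step: last token only
step_pos = position_ids[..., -1:]   # and its position

next_token = torch.tensor([[13]])   # whatever the model samples
input_ids = torch.cat([input_ids, next_token], dim=-1)
position_ids = torch.cat([position_ids, position_ids[..., -1:] + 1], dim=-1)
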
919 |
+
|
920 |
+
def forward(
|
921 |
+
self,
|
922 |
+
input_ids: Optional[torch.Tensor] = None,
|
923 |
+
position_ids: Optional[torch.Tensor] = None,
|
924 |
+
attention_mask: Optional[torch.Tensor] = None,
|
925 |
+
past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
|
926 |
+
inputs_embeds: Optional[torch.Tensor] = None,
|
927 |
+
labels: Optional[torch.Tensor] = None,
|
928 |
+
use_cache: Optional[bool] = None,
|
929 |
+
output_attentions: Optional[bool] = None,
|
930 |
+
output_hidden_states: Optional[bool] = None,
|
931 |
+
return_dict: Optional[bool] = None,
|
932 |
+
return_last_logit: Optional[bool] = False,
|
933 |
+
):
|
934 |
+
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
935 |
+
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
936 |
+
|
937 |
+
transformer_outputs = self.transformer(
|
938 |
+
input_ids=input_ids,
|
939 |
+
position_ids=position_ids,
|
940 |
+
attention_mask=attention_mask,
|
941 |
+
past_key_values=past_key_values,
|
942 |
+
inputs_embeds=inputs_embeds,
|
943 |
+
use_cache=use_cache,
|
944 |
+
output_hidden_states=output_hidden_states,
|
945 |
+
return_dict=return_dict,
|
946 |
+
)
|
947 |
+
|
948 |
+
hidden_states = transformer_outputs[0]
|
949 |
+
if return_last_logit:
|
950 |
+
hidden_states = hidden_states[-1:]
|
951 |
+
lm_logits = self.transformer.output_layer(hidden_states)
|
952 |
+
lm_logits = lm_logits.transpose(0, 1).contiguous()
|
953 |
+
|
954 |
+
loss = None
|
955 |
+
if labels is not None:
|
956 |
+
lm_logits = lm_logits.to(torch.float32)
|
957 |
+
|
958 |
+
# Shift so that tokens < n predict n
|
959 |
+
shift_logits = lm_logits[..., :-1, :].contiguous()
|
960 |
+
shift_labels = labels[..., 1:].contiguous()
|
961 |
+
# Flatten the tokens
|
962 |
+
loss_fct = CrossEntropyLoss(ignore_index=-100)
|
963 |
+
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
|
964 |
+
|
965 |
+
lm_logits = lm_logits.to(hidden_states.dtype)
|
966 |
+
loss = loss.to(hidden_states.dtype)
|
967 |
+
|
968 |
+
if not return_dict:
|
969 |
+
output = (lm_logits,) + transformer_outputs[1:]
|
970 |
+
return ((loss,) + output) if loss is not None else output
|
971 |
+
|
972 |
+
return CausalLMOutputWithPast(
|
973 |
+
loss=loss,
|
974 |
+
logits=lm_logits,
|
975 |
+
past_key_values=transformer_outputs.past_key_values,
|
976 |
+
hidden_states=transformer_outputs.hidden_states,
|
977 |
+
attentions=transformer_outputs.attentions,
|
978 |
+
)
|
979 |
+
|
980 |
+
@staticmethod
|
981 |
+
def _reorder_cache(
|
982 |
+
past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor
|
983 |
+
) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]:
|
984 |
+
"""
|
985 |
+
This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
|
986 |
+
[`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
|
987 |
+
beam_idx at every generation step.
|
988 |
+
|
989 |
+
Output shares the same memory storage as `past`.
|
990 |
+
"""
|
991 |
+
return tuple(
|
992 |
+
(
|
993 |
+
layer_past[0].index_select(1, beam_idx.to(layer_past[0].device)),
|
994 |
+
layer_past[1].index_select(1, beam_idx.to(layer_past[1].device)),
|
995 |
+
)
|
996 |
+
for layer_past in past
|
997 |
+
)
|
998 |
+
|
999 |
+
def process_response(self, response):
|
1000 |
+
response = response.strip()
|
1001 |
+
response = response.replace("[[训练时间]]", "2023年")  # fill the "[[训练时间]]" ("training time") placeholder with "2023年" ("2023")
|
1002 |
+
return response
|
1003 |
+
|
1004 |
+
def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = None):
|
1005 |
+
prompt = tokenizer.build_prompt(query, history=history)
|
1006 |
+
inputs = tokenizer([prompt], return_tensors="pt")
|
1007 |
+
inputs = inputs.to(self.device)
|
1008 |
+
return inputs
|
1009 |
+
|
1010 |
+
def build_stream_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = None):
|
1011 |
+
if history:
|
1012 |
+
prompt = "\n\n[Round {}]\n\n问:{}\n\n答:".format(len(history) + 1, query)  # "问" = question, "答" = answer; template kept verbatim
|
1013 |
+
input_ids = tokenizer.encode(prompt, add_special_tokens=False)
|
1014 |
+
input_ids = input_ids[1:]
|
1015 |
+
inputs = tokenizer.batch_encode_plus([(input_ids, None)], return_tensors="pt", add_special_tokens=False)
|
1016 |
+
else:
|
1017 |
+
prompt = "[Round {}]\n\n问:{}\n\n答:".format(len(history) + 1, query)
|
1018 |
+
inputs = tokenizer([prompt], return_tensors="pt")
|
1019 |
+
inputs = inputs.to(self.device)
|
1020 |
+
return inputs
|
1021 |
+
|
1022 |
+
@torch.inference_mode()
|
1023 |
+
def chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, max_length: int = 8192, num_beams=1,
|
1024 |
+
do_sample=True, top_p=0.8, temperature=0.8, logits_processor=None, **kwargs):
|
1025 |
+
if history is None:
|
1026 |
+
history = []
|
1027 |
+
if logits_processor is None:
|
1028 |
+
logits_processor = LogitsProcessorList()
|
1029 |
+
logits_processor.append(InvalidScoreLogitsProcessor())
|
1030 |
+
gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p,
|
1031 |
+
"temperature": temperature, "logits_processor": logits_processor, **kwargs}
|
1032 |
+
inputs = self.build_inputs(tokenizer, query, history=history)
|
1033 |
+
outputs = self.generate(**inputs, **gen_kwargs)
|
1034 |
+
outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):]
|
1035 |
+
response = tokenizer.decode(outputs)
|
1036 |
+
response = self.process_response(response)
|
1037 |
+
history = history + [(query, response)]
|
1038 |
+
return response, history
|
1039 |
+
|
1040 |
+
@torch.inference_mode()
|
1041 |
+
def stream_chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, past_key_values=None,
|
1042 |
+
max_length: int = 8192, do_sample=True, top_p=0.8, temperature=0.8, logits_processor=None,
|
1043 |
+
return_past_key_values=False, **kwargs):
|
1044 |
+
if history is None:
|
1045 |
+
history = []
|
1046 |
+
if logits_processor is None:
|
1047 |
+
logits_processor = LogitsProcessorList()
|
1048 |
+
logits_processor.append(InvalidScoreLogitsProcessor())
|
1049 |
+
gen_kwargs = {"max_length": max_length, "do_sample": do_sample, "top_p": top_p,
|
1050 |
+
"temperature": temperature, "logits_processor": logits_processor, **kwargs}
|
1051 |
+
if past_key_values is None and not return_past_key_values:
|
1052 |
+
inputs = self.build_inputs(tokenizer, query, history=history)
|
1053 |
+
else:
|
1054 |
+
inputs = self.build_stream_inputs(tokenizer, query, history=history)
|
1055 |
+
if past_key_values is not None:
|
1056 |
+
past_length = past_key_values[0][0].shape[0]
|
1057 |
+
if self.transformer.pre_seq_len is not None:
|
1058 |
+
past_length -= self.transformer.pre_seq_len
|
1059 |
+
inputs.position_ids += past_length
|
1060 |
+
attention_mask = inputs.attention_mask
|
1061 |
+
attention_mask = torch.cat((attention_mask.new_ones(1, past_length), attention_mask), dim=1)
|
1062 |
+
inputs['attention_mask'] = attention_mask
|
1063 |
+
for outputs in self.stream_generate(**inputs, past_key_values=past_key_values,
|
1064 |
+
return_past_key_values=return_past_key_values, **gen_kwargs):
|
1065 |
+
if return_past_key_values:
|
1066 |
+
outputs, past_key_values = outputs
|
1067 |
+
outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):]
|
1068 |
+
response = tokenizer.decode(outputs)
|
1069 |
+
if response and response[-1] != "�":  # skip partial yields that end in an incomplete UTF-8 sequence
|
1070 |
+
response = self.process_response(response)
|
1071 |
+
new_history = history + [(query, response)]
|
1072 |
+
if return_past_key_values:
|
1073 |
+
yield response, new_history, past_key_values
|
1074 |
+
else:
|
1075 |
+
yield response, new_history
|
1076 |
+
|
1077 |
+
@torch.inference_mode()
|
1078 |
+
def stream_generate(
|
1079 |
+
self,
|
1080 |
+
input_ids,
|
1081 |
+
generation_config: Optional[GenerationConfig] = None,
|
1082 |
+
logits_processor: Optional[LogitsProcessorList] = None,
|
1083 |
+
stopping_criteria: Optional[StoppingCriteriaList] = None,
|
1084 |
+
prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None,
|
1085 |
+
return_past_key_values=False,
|
1086 |
+
**kwargs,
|
1087 |
+
):
|
1088 |
+
batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1]
|
1089 |
+
|
1090 |
+
if generation_config is None:
|
1091 |
+
generation_config = self.generation_config
|
1092 |
+
generation_config = copy.deepcopy(generation_config)
|
1093 |
+
model_kwargs = generation_config.update(**kwargs)
|
1094 |
+
model_kwargs["use_cache"] = generation_config.use_cache
|
1095 |
+
bos_token_id, eos_token_id = generation_config.bos_token_id, generation_config.eos_token_id
|
1096 |
+
|
1097 |
+
if isinstance(eos_token_id, int):
|
1098 |
+
eos_token_id = [eos_token_id]
|
1099 |
+
|
1100 |
+
has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None
|
1101 |
+
if has_default_max_length and generation_config.max_new_tokens is None:
|
1102 |
+
warnings.warn(
|
1103 |
+
f"Using `max_length`'s default ({generation_config.max_length}) to control the generation length. "
|
1104 |
+
"This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we"
|
1105 |
+
" recommend using `max_new_tokens` to control the maximum length of the generation.",
|
1106 |
+
UserWarning,
|
1107 |
+
)
|
1108 |
+
elif generation_config.max_new_tokens is not None:
|
1109 |
+
generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length
|
1110 |
+
if not has_default_max_length:
|
1111 |
+
logger.warning(
|
1112 |
+
f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(="
|
1113 |
+
f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. "
|
1114 |
+
"Please refer to the documentation for more information. "
|
1115 |
+
"(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)",
|
1116 |
+
UserWarning,
|
1117 |
+
)
|
1118 |
+
|
1119 |
+
if input_ids_seq_length >= generation_config.max_length:
|
1120 |
+
input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids"
|
1121 |
+
logger.warning(
|
1122 |
+
f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to"
|
1123 |
+
f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider"
|
1124 |
+
" increasing `max_new_tokens`."
|
1125 |
+
)
|
1126 |
+
|
1127 |
+
# 2. Set generation parameters if not already defined
|
1128 |
+
logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
|
1129 |
+
stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
|
1130 |
+
|
1131 |
+
logits_processor = self._get_logits_processor(
|
1132 |
+
generation_config=generation_config,
|
1133 |
+
input_ids_seq_length=input_ids_seq_length,
|
1134 |
+
encoder_input_ids=input_ids,
|
1135 |
+
prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
|
1136 |
+
logits_processor=logits_processor,
|
1137 |
+
)
|
1138 |
+
|
1139 |
+
stopping_criteria = self._get_stopping_criteria(
|
1140 |
+
generation_config=generation_config, stopping_criteria=stopping_criteria
|
1141 |
+
)
|
1142 |
+
logits_warper = self._get_logits_warper(generation_config)
|
1143 |
+
|
1144 |
+
unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
|
1145 |
+
scores = None
|
1146 |
+
while True:
|
1147 |
+
model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
|
1148 |
+
# forward pass to get next token
|
1149 |
+
outputs = self(
|
1150 |
+
**model_inputs,
|
1151 |
+
return_dict=True,
|
1152 |
+
output_attentions=False,
|
1153 |
+
output_hidden_states=False,
|
1154 |
+
)
|
1155 |
+
|
1156 |
+
next_token_logits = outputs.logits[:, -1, :]
|
1157 |
+
|
1158 |
+
# pre-process distribution
|
1159 |
+
next_token_scores = logits_processor(input_ids, next_token_logits)
|
1160 |
+
next_token_scores = logits_warper(input_ids, next_token_scores)
|
1161 |
+
|
1162 |
+
# sample
|
1163 |
+
probs = nn.functional.softmax(next_token_scores, dim=-1)
|
1164 |
+
if generation_config.do_sample:
|
1165 |
+
next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
|
1166 |
+
else:
|
1167 |
+
next_tokens = torch.argmax(probs, dim=-1)
|
1168 |
+
|
1169 |
+
# update generated ids, model inputs, and length for next step
|
1170 |
+
input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
|
1171 |
+
model_kwargs = self._update_model_kwargs_for_generation(
|
1172 |
+
outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
|
1173 |
+
)
|
1174 |
+
unfinished_sequences = unfinished_sequences.mul((sum(next_tokens != i for i in eos_token_id)).long())
|
1175 |
+
if return_past_key_values:
|
1176 |
+
yield input_ids, outputs.past_key_values
|
1177 |
+
else:
|
1178 |
+
yield input_ids
|
1179 |
+
# stop when each sentence is finished, or if we exceed the maximum length
|
1180 |
+
if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
|
1181 |
+
break
|
1182 |
+
|
1183 |
+
def quantize(self, bits: int, empty_init=False, device=None, **kwargs):
|
1184 |
+
if bits == 0:
|
1185 |
+
return
|
1186 |
+
|
1187 |
+
from .quantization import quantize
|
1188 |
+
|
1189 |
+
if self.quantized:
|
1190 |
+
logger.info("Already quantized.")
|
1191 |
+
return self
|
1192 |
+
|
1193 |
+
self.quantized = True
|
1194 |
+
|
1195 |
+
self.config.quantization_bit = bits
|
1196 |
+
|
1197 |
+
self.transformer.encoder = quantize(self.transformer.encoder, bits, empty_init=empty_init, device=device,
|
1198 |
+
**kwargs)
|
1199 |
+
return self
|
1200 |
+
|
1201 |
+
|
1202 |
+
class ChatGLMForSequenceClassification(ChatGLMPreTrainedModel):
|
1203 |
+
def __init__(self, config: ChatGLMConfig, empty_init=True, device=None):
|
1204 |
+
super().__init__(config)
|
1205 |
+
|
1206 |
+
self.num_labels = config.num_labels
|
1207 |
+
self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device)
|
1208 |
+
|
1209 |
+
self.classifier_head = nn.Linear(config.hidden_size, config.num_labels, bias=True, dtype=torch.half)
|
1210 |
+
if config.classifier_dropout is not None:
|
1211 |
+
self.dropout = nn.Dropout(config.classifier_dropout)
|
1212 |
+
else:
|
1213 |
+
self.dropout = None
|
1214 |
+
self.config = config
|
1215 |
+
|
1216 |
+
if self.config.quantization_bit:
|
1217 |
+
self.quantize(self.config.quantization_bit, empty_init=True)
|
1218 |
+
|
1219 |
+
def forward(
|
1220 |
+
self,
|
1221 |
+
input_ids: Optional[torch.LongTensor] = None,
|
1222 |
+
position_ids: Optional[torch.LongTensor] = None,
|
1223 |
+
attention_mask: Optional[torch.Tensor] = None,
|
1224 |
+
full_attention_mask: Optional[torch.Tensor] = None,
|
1225 |
+
past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
|
1226 |
+
inputs_embeds: Optional[torch.LongTensor] = None,
|
1227 |
+
labels: Optional[torch.LongTensor] = None,
|
1228 |
+
use_cache: Optional[bool] = None,
|
1229 |
+
output_hidden_states: Optional[bool] = None,
|
1230 |
+
return_dict: Optional[bool] = None,
|
1231 |
+
) -> Union[Tuple[torch.Tensor, ...], SequenceClassifierOutputWithPast]:
|
1232 |
+
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
1233 |
+
|
1234 |
+
transformer_outputs = self.transformer(
|
1235 |
+
input_ids=input_ids,
|
1236 |
+
position_ids=position_ids,
|
1237 |
+
attention_mask=attention_mask,
|
1238 |
+
full_attention_mask=full_attention_mask,
|
1239 |
+
past_key_values=past_key_values,
|
1240 |
+
inputs_embeds=inputs_embeds,
|
1241 |
+
use_cache=use_cache,
|
1242 |
+
output_hidden_states=output_hidden_states,
|
1243 |
+
return_dict=return_dict,
|
1244 |
+
)
|
1245 |
+
|
1246 |
+
hidden_states = transformer_outputs[0]
|
1247 |
+
pooled_hidden_states = hidden_states[-1]
|
1248 |
+
if self.dropout is not None:
|
1249 |
+
pooled_hidden_states = self.dropout(pooled_hidden_states)
|
1250 |
+
logits = self.classifier_head(pooled_hidden_states)
|
1251 |
+
|
1252 |
+
loss = None
|
1253 |
+
if labels is not None:
|
1254 |
+
if self.config.problem_type is None:
|
1255 |
+
if self.num_labels == 1:
|
1256 |
+
self.config.problem_type = "regression"
|
1257 |
+
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
|
1258 |
+
self.config.problem_type = "single_label_classification"
|
1259 |
+
else:
|
1260 |
+
self.config.problem_type = "multi_label_classification"
|
1261 |
+
|
1262 |
+
if self.config.problem_type == "regression":
|
1263 |
+
loss_fct = MSELoss()
|
1264 |
+
if self.num_labels == 1:
|
1265 |
+
loss = loss_fct(logits.squeeze().float(), labels.squeeze())
|
1266 |
+
else:
|
1267 |
+
loss = loss_fct(logits.float(), labels)
|
1268 |
+
elif self.config.problem_type == "single_label_classification":
|
1269 |
+
loss_fct = CrossEntropyLoss()
|
1270 |
+
loss = loss_fct(logits.view(-1, self.num_labels).float(), labels.view(-1))
|
1271 |
+
elif self.config.problem_type == "multi_label_classification":
|
1272 |
+
loss_fct = BCEWithLogitsLoss()
|
1273 |
+
loss = loss_fct(logits.float(), labels.view(-1, self.num_labels))
|
1274 |
+
|
1275 |
+
if not return_dict:
|
1276 |
+
output = (logits,) + transformer_outputs[1:]
|
1277 |
+
return ((loss,) + output) if loss is not None else output
|
1278 |
+
|
1279 |
+
return SequenceClassifierOutputWithPast(
|
1280 |
+
loss=loss,
|
1281 |
+
logits=logits,
|
1282 |
+
past_key_values=transformer_outputs.past_key_values,
|
1283 |
+
hidden_states=transformer_outputs.hidden_states,
|
1284 |
+
attentions=transformer_outputs.attentions,
|
1285 |
+
)
|
ChatGLM2/demo/CMakeLists.txt
ADDED
@@ -0,0 +1,33 @@
1 |
+
cmake_minimum_required(VERSION 2.8)
|
2 |
+
project(chatglm)
|
3 |
+
|
4 |
+
set(CMAKE_EXPORT_COMPILE_COMMANDS ON CACHE INTERNAL "")
|
5 |
+
|
6 |
+
if (NOT DEFINED TARGET_ARCH)
|
7 |
+
set(TARGET_ARCH pcie)
|
8 |
+
endif()
|
9 |
+
|
10 |
+
include_directories(${PROJECT_SOURCE_DIR}/../support/include)
|
11 |
+
|
12 |
+
if (${CMAKE_HOST_SYSTEM_PROCESSOR} STREQUAL "aarch64")
|
13 |
+
add_definitions(-DSOC_TARGET)
|
14 |
+
link_directories(${PROJECT_SOURCE_DIR}/../support/lib_soc)
|
15 |
+
message("SoC mode, starting......")
|
16 |
+
elseif (${TARGET_ARCH} STREQUAL "pcie")
|
17 |
+
add_definitions(-DPCIE_TARGET)
|
18 |
+
link_directories(${PROJECT_SOURCE_DIR}/../support/lib_pcie)
|
19 |
+
message("PCIE mode, starting......")
|
20 |
+
elseif (${TARGET_ARCH} STREQUAL "soc")
|
21 |
+
add_definitions(-DSOC_TARGET)
|
22 |
+
set(CMAKE_C_COMPILER /opt/aarch64-linux-gnu-7.5.0/bin/aarch64-linux-gnu-gcc)
|
23 |
+
set(CMAKE_ASM_COMPILER /opt/aarch64-linux-gnu-7.5.0/bin/aarch64-linux-gnu-gcc)
|
24 |
+
set(CMAKE_CXX_COMPILER /opt/aarch64-linux-gnu-7.5.0/bin/aarch64-linux-gnu-g++)
|
25 |
+
link_directories(${PROJECT_SOURCE_DIR}/../support/lib_soc)
|
26 |
+
message("SoC mode, starting......")
|
27 |
+
endif()
|
28 |
+
|
29 |
+
add_definitions(-DDEBUG --std=c++17 -fPIC -Wall -Werror)
|
30 |
+
set(CMAKE_BUILD_TYPE "Debug")
|
31 |
+
|
32 |
+
add_executable(chatglm demo.cpp)
|
33 |
+
target_link_libraries(chatglm bmrt bmlib sentencepiece)
|
ChatGLM2/demo/demo.cpp
ADDED
@@ -0,0 +1,609 @@
1 |
+
//===----------------------------------------------------------------------===//
|
2 |
+
//
|
3 |
+
// Copyright (C) 2023 Sophgo Technologies Inc. All rights reserved.
|
4 |
+
//
|
5 |
+
// TPU-MLIR is licensed under the 2-Clause BSD License except for the
|
6 |
+
// third-party components.
|
7 |
+
//
|
8 |
+
//===----------------------------------------------------------------------===//
|
9 |
+
|
10 |
+
#include <iostream>
|
11 |
+
#include <cstdlib>
|
12 |
+
#include <vector>
|
13 |
+
#include <assert.h>
|
14 |
+
#include <chrono>
|
15 |
+
#include <algorithm>
|
16 |
+
#include "memory.h"
|
17 |
+
#include "sentencepiece/sentencepiece_processor.h"
|
18 |
+
#include "bmruntime_interface.h"
|
19 |
+
#include <getopt.h>
|
20 |
+
#include <stdio.h>
|
21 |
+
#include <inttypes.h>
|
22 |
+
|
23 |
+
static const uint16_t ATTENTION_MASK = 0xF0E2;  // fp16 bit pattern, roughly -1e4; used to mask out attention scores
|
24 |
+
|
25 |
+
class ChatGLM {
|
26 |
+
public:
|
27 |
+
void init(const std::vector<int> &devid, std::string model_path, std::string tokenizer_path);
|
28 |
+
void chat();
|
29 |
+
void deinit();
|
30 |
+
|
31 |
+
private:
|
32 |
+
void answer(const std::string &input_str);
|
33 |
+
void tokenizer_encode(const std::string &input_str, std::vector<int> &tokens);
|
34 |
+
int forward_first(std::vector<int> &tokens);
|
35 |
+
int forward_next(int cur_token);
|
36 |
+
void move2end(const bm_tensor_t &kv);
|
37 |
+
void load_sentencepiece(std::string tokenizer_path);
|
38 |
+
|
39 |
+
private:
|
40 |
+
std::vector<bm_handle_t> handles;
|
41 |
+
bm_handle_t bm_handle;
|
42 |
+
void *p_bmrt;
|
43 |
+
sentencepiece::SentencePieceProcessor sentencepiece;
|
44 |
+
const bm_net_info_t *net_embed;
|
45 |
+
const bm_net_info_t *net_embed_cache;
|
46 |
+
const bm_net_info_t *net_lm;
|
47 |
+
std::vector<const bm_net_info_t *> net_blocks;
|
48 |
+
std::vector<const bm_net_info_t *> net_blocks_cache;
|
49 |
+
std::vector<bm_tensor_t> inputs_embed_512, outputs_embed_512;
|
50 |
+
std::vector<bm_tensor_t> inputs_pid, next_pid, inputs_attention, next_attention;
|
51 |
+
std::vector<std::vector<bm_tensor_t>> past_key, past_value;
|
52 |
+
std::vector<bm_tensor_t> inputs_lm, outputs_lm;
|
53 |
+
std::string name_embed;
|
54 |
+
std::string name_embed_cache;
|
55 |
+
std::string name_lm;
|
56 |
+
std::vector<std::string> name_blocks;
|
57 |
+
std::vector<std::string> name_blocks_cache;
|
58 |
+
std::vector<std::pair<std::string, std::string>> history_vector;
|
59 |
+
std::vector<int> history_tokens;
|
60 |
+
std::string cur_answer = "";
|
61 |
+
|
62 |
+
int device_num;
|
63 |
+
int round = 0;
|
64 |
+
int token_length;
|
65 |
+
int EOS;
|
66 |
+
int SEQLEN;
|
67 |
+
int NUM_LAYERS;
|
68 |
+
};
|
69 |
+
|
70 |
+
void ChatGLM::load_sentencepiece(std::string tokenizer_path) {
|
71 |
+
printf("Load %s ... ", tokenizer_path.c_str());
|
72 |
+
auto status = sentencepiece.Load(tokenizer_path);
|
73 |
+
if (!status.ok()) {
|
74 |
+
std::cout << status.ToString() << std::endl;
|
75 |
+
exit(-1);
|
76 |
+
}
|
77 |
+
EOS = sentencepiece.eos_id();
|
78 |
+
printf("Done!\n");
|
79 |
+
}
|
80 |
+
|
81 |
+
void ChatGLM::init(const std::vector<int> &devices, std::string model_path, std::string tokenizer_path) {
|
82 |
+
device_num = devices.size();
|
83 |
+
load_sentencepiece(tokenizer_path);
|
84 |
+
// request bm_handle
|
85 |
+
std::cout << "Device [ ";
|
86 |
+
for (auto d : devices) {
|
87 |
+
std::cout << d << " ";
|
88 |
+
}
|
89 |
+
std::cout << "] loading ....\n";
|
90 |
+
for (auto d : devices) {
|
91 |
+
bm_handle_t h;
|
92 |
+
bm_status_t status = bm_dev_request(&h, d);
|
93 |
+
assert(BM_SUCCESS == status);
|
94 |
+
handles.push_back(h);
|
95 |
+
}
|
96 |
+
bm_handle = handles[0];
|
97 |
+
|
98 |
+
// create bmruntime
|
99 |
+
#ifdef SOC_TARGET
|
100 |
+
p_bmrt = bmrt_create(handles[0]);
|
101 |
+
#else
|
102 |
+
p_bmrt = bmrt_create_ex(handles.data(), handles.size());
|
103 |
+
#endif
|
104 |
+
assert(NULL != p_bmrt);
|
105 |
+
|
106 |
+
// load bmodel by file
|
107 |
+
printf("Model[%s] loading ....\n", model_path.c_str());
|
108 |
+
bool ret = bmrt_load_bmodel(p_bmrt, model_path.c_str());
|
109 |
+
assert(true == ret);
|
110 |
+
printf("Done!\n");
|
111 |
+
|
112 |
+
// set NUM_LAYERS
|
113 |
+
auto num_nets = bmrt_get_network_number(p_bmrt);
|
114 |
+
NUM_LAYERS = (num_nets - 2) / 2; // 3 shared nets + 2 nets per layer; integer division makes (2L+1)/2 == L
|
115 |
+
|
116 |
+
// net names
|
117 |
+
name_embed = "embedding";
|
118 |
+
name_embed_cache = "embedding_cache";
|
119 |
+
name_lm = "lm_head";
|
120 |
+
for (int i = 0; i < NUM_LAYERS; i++) {
|
121 |
+
name_blocks.emplace_back("block_" + std::to_string(i));
|
122 |
+
name_blocks_cache.emplace_back("block_cache_" + std::to_string(i));
|
123 |
+
}
|
124 |
+
|
125 |
+
// net infos
|
126 |
+
net_embed = bmrt_get_network_info(p_bmrt, name_embed.c_str());
|
127 |
+
net_embed_cache = bmrt_get_network_info(p_bmrt, name_embed_cache.c_str());
|
128 |
+
net_lm = bmrt_get_network_info(p_bmrt, name_lm.c_str());
|
129 |
+
for (int i = 0; i < NUM_LAYERS; i++) {
|
130 |
+
net_blocks.emplace_back(
|
131 |
+
bmrt_get_network_info(p_bmrt, name_blocks[i].c_str()));
|
132 |
+
net_blocks_cache.emplace_back(
|
133 |
+
bmrt_get_network_info(p_bmrt, name_blocks_cache[i].c_str()));
|
134 |
+
}
|
135 |
+
|
136 |
+
// set SEQLEN
|
137 |
+
SEQLEN = net_embed->stages[0].input_shapes[0].dims[1];
|
138 |
+
|
139 |
+
// resize (net_blocks and net_blocks_cache already hold NUM_LAYERS entries, so those two calls are no-ops)
|
140 |
+
net_blocks.resize(NUM_LAYERS);
|
141 |
+
net_blocks_cache.resize(NUM_LAYERS);
|
142 |
+
past_key.resize(NUM_LAYERS);
|
143 |
+
past_value.resize(NUM_LAYERS);
|
144 |
+
|
145 |
+
// net device mem
|
146 |
+
  inputs_embed_512.resize(net_embed->input_num);
  for (int i = 0; i < device_num; ++i) {
    ret = bmrt_tensor_ex(&inputs_embed_512[i], p_bmrt,
                         net_embed->input_loc_devices[i],
                         net_embed->input_dtypes[i],
                         net_embed->stages[0].input_shapes[i]);
    assert(true == ret);
  }

  outputs_embed_512.resize(net_embed->output_num);
  for (int i = 0; i < device_num; ++i) {
    ret = bmrt_tensor_ex(&outputs_embed_512[i], p_bmrt,
                         net_embed->output_loc_devices[i],
                         net_embed->output_dtypes[i],
                         net_embed->stages[0].output_shapes[i]);
    assert(true == ret);
  }

  inputs_pid.resize(device_num);
  inputs_attention.resize(device_num);
  int in_num = net_blocks[0]->input_num / device_num;
  for (int i = 0; i < device_num; ++i) {
    ret = bmrt_tensor_ex(&inputs_pid[i], p_bmrt,
                         net_blocks[0]->input_loc_devices[1 + i * in_num],
                         net_blocks[0]->input_dtypes[1 + i * in_num],
                         net_blocks[0]->stages[0].input_shapes[1 + i * in_num]);
    assert(true == ret);

    ret = bmrt_tensor_ex(&inputs_attention[i], p_bmrt,
                         net_blocks[0]->input_loc_devices[2 + i * in_num],
                         net_blocks[0]->input_dtypes[2 + i * in_num],
                         net_blocks[0]->stages[0].input_shapes[2 + i * in_num]);
    assert(true == ret);
  }

  next_pid.resize(device_num);
  next_attention.resize(device_num);
  int in_num_cache = net_blocks_cache[0]->input_num / device_num;
  for (int i = 0; i < device_num; ++i) {
    ret = bmrt_tensor_ex(&next_pid[i], p_bmrt,
                         net_blocks_cache[0]->input_loc_devices[1 + i * in_num_cache],
                         net_blocks_cache[0]->input_dtypes[1 + i * in_num_cache],
                         net_blocks_cache[0]->stages[0].input_shapes[1 + i * in_num_cache]);
    assert(true == ret);

    ret = bmrt_tensor_ex(&next_attention[i], p_bmrt,
                         net_blocks_cache[0]->input_loc_devices[2 + i * in_num_cache],
                         net_blocks_cache[0]->input_dtypes[2 + i * in_num_cache],
                         net_blocks_cache[0]->stages[0].input_shapes[2 + i * in_num_cache]);
    assert(true == ret);
  }

  int out_num = net_blocks[0]->output_num / device_num;
  for (int i = 0; i < NUM_LAYERS; i++) {
    past_key[i].resize(device_num);
    past_value[i].resize(device_num);
    for (int j = 0; j < device_num; j++) {
      ret = bmrt_tensor_ex(&past_key[i][j], p_bmrt,
                           net_blocks[0]->output_loc_devices[1 + j * out_num],
                           net_blocks[0]->output_dtypes[1 + j * out_num],
                           net_blocks[0]->stages[0].output_shapes[1 + j * out_num]);
      assert(true == ret);
      ret = bmrt_tensor_ex(&past_value[i][j], p_bmrt,
                           net_blocks[0]->output_loc_devices[2 + j * out_num],
                           net_blocks[0]->output_dtypes[2 + j * out_num],
                           net_blocks[0]->stages[0].output_shapes[2 + j * out_num]);
      assert(true == ret);
    }
  }

  inputs_lm.resize(device_num);
  outputs_lm.resize(device_num);
  for (int i = 0; i < device_num; ++i) {
    ret = bmrt_tensor_ex(&inputs_lm[i], p_bmrt, i, net_lm->input_dtypes[0],
                         net_lm->stages[0].input_shapes[0]);
    assert(true == ret);
    ret = bmrt_tensor_ex(&outputs_lm[i], p_bmrt, i, net_lm->output_dtypes[0],
                         net_lm->stages[0].output_shapes[0]);
    assert(true == ret);
  }
}

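// Note on the tensor indexing in init() above. This is a reading of this
// code, not vendor documentation: each block network packs its inputs per
// device, so with in_num = input_num / device_num, device i's slots are
// [0 + i * in_num] hidden states, [1 + i * in_num] position ids and
// [2 + i * in_num] attention mask; block outputs follow the same pattern,
// with [1 + j * out_num] present key and [2 + j * out_num] present value.
// A minimal sketch of a helper built on that assumption (hypothetical,
// shown for illustration only):
//
//   static void alloc_input_slot(bm_tensor_t &t, void *p_bmrt,
//                                const bm_net_info_t *net, int dev, int k,
//                                int devs) {
//     int idx = k + dev * (net->input_num / devs); // slot k of device dev
//     bool ok = bmrt_tensor_ex(&t, p_bmrt, net->input_loc_devices[idx],
//                              net->input_dtypes[idx],
//                              net->stages[0].input_shapes[idx]);
//     assert(ok);
//   }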
void ChatGLM::deinit() {
  for (int i = 0; i < device_num; ++i) {
    bm_free_device(handles[i], inputs_embed_512[i].device_mem);
    bm_free_device(handles[i], outputs_embed_512[i].device_mem);
    bm_free_device(handles[i], inputs_pid[i].device_mem);
    bm_free_device(handles[i], next_pid[i].device_mem);
    bm_free_device(handles[i], inputs_attention[i].device_mem);
    bm_free_device(handles[i], next_attention[i].device_mem);
    bm_free_device(handles[i], inputs_lm[i].device_mem);
    bm_free_device(handles[i], outputs_lm[i].device_mem);
  }
  for (int i = 0; i < NUM_LAYERS; i++) {
    for (int j = 0; j < device_num; j++) {
      bm_free_device(handles[j], past_key[i][j].device_mem);
      bm_free_device(handles[j], past_value[i][j].device_mem);
    }
  }
  bmrt_destroy(p_bmrt);
  for (auto h : handles) {
    bm_dev_free(h);
  }
}

// after each block's first forward pass, move the real result to the end of
// the cache memory
void ChatGLM::move2end(const bm_tensor_t &kv) {
  if (token_length >= SEQLEN) {
    return;
  }
  auto total_size = bm_mem_get_device_size(kv.device_mem);
  auto bytes = total_size / SEQLEN;      // bytes per token slot
  auto real_size = token_length * bytes; // bytes actually written so far
  auto mem =
      bm_mem_from_device(bm_mem_get_device_addr(kv.device_mem), real_size);
  auto buffer = new uint8_t[real_size];
  auto dst = new uint8_t[total_size];
  bm_memcpy_d2s(bm_handle, (void *)buffer, mem);            // fetch valid head
  memset(dst, 0, total_size - real_size);                   // zero the new head
  memcpy(dst + total_size - real_size, buffer, real_size);  // valid data at tail
  bm_memcpy_s2d(bm_handle, kv.device_mem, (void *)dst);
  delete[] buffer;
  delete[] dst;
}

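// Worked example of move2end() (illustrative): with SEQLEN = 8 and
// token_length = 3, a per-layer KV buffer laid out as
//   [k0 k1 k2  x  x  x  x  x]
// is rewritten to
//   [ 0  0  0  0  0 k0 k1 k2]
// i.e. the token_length valid entries are copied to the tail and the head is
// zero-filled. forward_next() masks exactly that zeroed head, so the
// right-aligned cache and the decode-phase attention mask stay consistent.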
int ChatGLM::forward_first(std::vector<int> &tokens) {
  std::vector<int> input_ids(SEQLEN, 0);
  std::vector<int> position_id(SEQLEN, 0);
  std::vector<uint16_t> attention_mask(SEQLEN * SEQLEN, 0);

  std::copy(tokens.begin(), tokens.end(), input_ids.data());

  token_length = tokens.size();
  for (int i = 0; i < token_length; i++) {
    position_id[i] = i;
  }
  // causal mask: (i, j) stays 0 (attend) only when j <= i and i < token_length
  for (int i = 0; i < SEQLEN; i++) {
    for (int j = 0; j < SEQLEN; j++) {
      if (!(j <= i && i < token_length)) {
        attention_mask[i * SEQLEN + j] = ATTENTION_MASK;
      }
    }
  }

  // forward embedding
  std::vector<int> input_nums(device_num, 1);
  std::vector<void *> datas(device_num, (void *)input_ids.data());
  bmrt_memcpy_s2d_parallel(p_bmrt, inputs_embed_512.data(), datas.data(),
                           input_nums.data(), device_num);
  auto ret =
      bmrt_launch_tensor_ex(p_bmrt, name_embed.c_str(),
                            inputs_embed_512.data(), inputs_embed_512.size(),
                            outputs_embed_512.data(), outputs_embed_512.size(),
                            true, false);
  assert(ret);
  bm_thread_sync(bm_handle);

  // forward blocks
  std::vector<void *> pos_id_datas(device_num, position_id.data());
  std::vector<void *> in_attn_datas(device_num, attention_mask.data());
  bmrt_memcpy_s2d_parallel(p_bmrt, inputs_pid.data(), pos_id_datas.data(),
                           input_nums.data(), device_num);
  bmrt_memcpy_s2d_parallel(p_bmrt, inputs_attention.data(), in_attn_datas.data(),
                           input_nums.data(), device_num);
  auto embed_512 = outputs_embed_512;
  std::vector<bm_tensor_t> inputs_block;
  std::vector<bm_tensor_t> outputs_block;
  for (int i = 0; i < device_num; ++i) {
    embed_512[i].shape = net_blocks[0]->stages[0].input_shapes[0];
    inputs_block.push_back(embed_512[i]);
    inputs_block.push_back(inputs_pid[i]);
    inputs_block.push_back(inputs_attention[i]);
    outputs_block.push_back(embed_512[i]);
    outputs_block.push_back(past_key[0][i]);
    outputs_block.push_back(past_value[0][i]);
  }

  for (int i = 0; i < NUM_LAYERS; i++) {
    for (int j = 0; j < device_num; ++j) {
      outputs_block[1 + j * 3] = past_key[i][j];
      outputs_block[2 + j * 3] = past_value[i][j];
    }
    ret = bmrt_launch_tensor_ex(p_bmrt, name_blocks[i].c_str(),
                                inputs_block.data(), inputs_block.size(),
                                outputs_block.data(), outputs_block.size(),
                                true, false);
    assert(ret);
    bm_thread_sync(bm_handle);
    for (int j = 0; j < device_num; ++j) {
      move2end(past_key[i][j]);
      move2end(past_value[i][j]);
    }
  }

  // forward lmhead
  int bytes = embed_512[0].device_mem.size / SEQLEN;
  bm_memcpy_d2d_byte(bm_handle, inputs_lm[0].device_mem, 0,
                     embed_512[0].device_mem, (token_length - 1) * bytes,
                     bytes);
  ret = bmrt_launch_tensor_ex(p_bmrt, name_lm.c_str(), &inputs_lm[0], 1,
                              &outputs_lm[0], 1, true, false);
  assert(ret);
  bm_thread_sync(bm_handle);

  int token = 0;
  bm_memcpy_d2s(bm_handle, (void *)&token, outputs_lm[0].device_mem);
  return token;
}

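// Worked example of the prefill mask built in forward_first() (illustrative):
// with SEQLEN = 4 and token_length = 2, position (i, j) stays 0 (attend) only
// when j <= i and i < token_length; everything else becomes ATTENTION_MASK:
//
//   row 0: [ .  M  M  M ]
//   row 1: [ .  .  M  M ]
//   row 2: [ M  M  M  M ]   <- rows at or beyond token_length are fully masked
//   row 3: [ M  M  M  M ]
//
// where M = ATTENTION_MASK and '.' = 0.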
int ChatGLM::forward_next(int cur_token) {
  std::vector<uint16_t> attention_mask(SEQLEN + 1, 0);
  for (int i = 0; i <= SEQLEN - token_length; i++) {
    attention_mask[i] = ATTENTION_MASK;
  }
  int32_t position_id = token_length - 1;

  // forward embedding
  std::vector<bm_tensor_t> inputs_embed;
  std::vector<void *> input_datas;
  std::vector<int> input_nums(device_num, 1);
  for (int i = 0; i < device_num; ++i) {
    inputs_embed.push_back(outputs_lm[i]); // token_id
    inputs_embed[i].shape = net_embed_cache->stages[0].input_shapes[0];
    input_datas.push_back((void *)(&cur_token));
  }
  bmrt_memcpy_s2d_parallel(p_bmrt, inputs_embed.data(), input_datas.data(),
                           input_nums.data(), device_num);
  auto ret = bmrt_launch_tensor_ex(p_bmrt, name_embed_cache.c_str(),
                                   inputs_embed.data(), inputs_embed.size(),
                                   inputs_lm.data(), inputs_lm.size(), true, false);
  assert(ret);
  bm_thread_sync(bm_handle);

  // forward blocks
  std::vector<void *> attn_datas(device_num, attention_mask.data());
  std::vector<void *> pid_datas(device_num, &position_id);
  bmrt_memcpy_s2d_parallel(p_bmrt, next_attention.data(), attn_datas.data(),
                           input_nums.data(), device_num);
  bmrt_memcpy_s2d_parallel(p_bmrt, next_pid.data(), pid_datas.data(),
                           input_nums.data(), device_num);

  // WARNING: make inputs_lm device_num
  std::vector<bm_tensor_t> embed_1 = inputs_lm;
  for (int i = 0; i < device_num; ++i) {
    embed_1[i].shape = net_blocks_cache[0]->stages[0].input_shapes[0];
  }
  std::vector<bm_tensor_t> inputs_block;
  std::vector<bm_tensor_t> outputs_block;
  for (int i = 0; i < device_num; ++i) {
    inputs_block.push_back(embed_1[i]);
    inputs_block.push_back(next_pid[i]);
    inputs_block.push_back(next_attention[i]);
    inputs_block.push_back(past_key[0][i]);
    inputs_block.push_back(past_value[0][i]);
    outputs_block.push_back(embed_1[i]);
    outputs_block.push_back(past_key[0][i]);
    outputs_block.push_back(past_value[0][i]);
  }

  for (int i = 0; i < NUM_LAYERS; i++) {
    for (int j = 0; j < device_num; ++j) {
      inputs_block[3 + j * 5] = past_key[i][j];
      inputs_block[4 + j * 5] = past_value[i][j];
      outputs_block[1 + j * 3] = past_key[i][j];
      outputs_block[2 + j * 3] = past_value[i][j];
    }
    ret = bmrt_launch_tensor_ex(p_bmrt, name_blocks_cache[i].c_str(),
                                inputs_block.data(), inputs_block.size(),
                                outputs_block.data(), outputs_block.size(),
                                true, false);
    assert(ret);
    bm_thread_sync(bm_handle);
  }

  // forward lmhead
  ret = bmrt_launch_tensor_ex(p_bmrt, name_lm.c_str(), &inputs_lm[0], 1,
                              &outputs_lm[0], 1, true, false);
  assert(ret);
  bm_thread_sync(bm_handle);

  int token = 0;
  bm_memcpy_d2s(bm_handle, (void *)&token, outputs_lm[0].device_mem);
  return token;
}

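// In forward_next() above, the decode mask has SEQLEN + 1 entries: indices
// 0 .. SEQLEN - token_length cover the zero-filled head of the right-aligned
// KV cache and are masked, while the tail covers the cached tokens plus the
// current one. A minimal sketch of the resulting generation loop (mirroring
// answer() further below):
//
//   int token = forward_first(tokens);             // prefill
//   while (token != EOS && token_length < SEQLEN) {
//     token_length++;                              // one new position
//     token = forward_next(token);                 // decode one token
//   }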
std::string build_prompt(std::string query,
                         std::vector<std::pair<std::string, std::string>> history = {}) {
  std::string prompt = "";
  int round_number = 1;
  for (const auto &item : history) {
    prompt += "[Round " + std::to_string(round_number) + "]\n\n问:" +
              item.first + "\n\n答:" + item.second + "\n\n";
    round_number++;
  }
  prompt += "[Round " + std::to_string(history.size() + 1) + "]\n\n问:" +
            query + "\n\n答:";
  return prompt;
}

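// Example output of build_prompt() (illustrative): with one history turn
// {"你好", "你好!"} and query "1+1=?", the returned prompt is:
//
//   [Round 1]
//
//   问:你好
//
//   答:你好!
//
//   [Round 2]
//
//   问:1+1=?
//
//   答: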
void ChatGLM::chat() {
  while (true) {
    std::cout << "\nQuestion: ";
    std::string input_str;
    std::getline(std::cin, input_str);
    if (input_str == "exit") {
      break;
    }
    std::cout << "\nAnswer: " << std::flush;
    answer(input_str);
    std::cout << std::endl;
  }
}

void ChatGLM::answer(const std::string &input_str) {
  // auto time_0 = std::chrono::system_clock::now();
  std::string query = build_prompt(input_str, history_vector);
  int tok_num = 0;
  std::vector<int> tokens;
  std::vector<int> prompt{64790, 64792};
  sentencepiece.Encode(query, &tokens);

  if (tokens.empty()) {
    printf("Sorry: your question is too weird!\n");
    return;
  }
  // tokens is not empty
  tokens.insert(tokens.begin(), prompt.begin(), prompt.end());

  // make sure the token count is not too large
  if ((int)tokens.size() > SEQLEN - 10) {
    // reset
    tokens.clear();
    cur_answer.clear();
    printf("Error: your question is too long!\n");
    return;
  }

  int pre_token = 0;
  auto t0 = std::chrono::system_clock::now();
  int token = forward_first(tokens);
  auto t1 = std::chrono::system_clock::now();
  while (token != EOS && token_length < SEQLEN) {
    std::string pre_word;
    std::string word;
    std::vector<int> pre_ids = {pre_token};
    std::vector<int> ids = {pre_token, token};
    sentencepiece.Decode(pre_ids, &pre_word);
    sentencepiece.Decode(ids, &word);
    std::string diff = word.substr(pre_word.size());
    cur_answer += diff;
    tokens.emplace_back(token);
    std::cout << diff << std::flush;
    if (token_length < SEQLEN) {
      token_length++;
    }
    tok_num++;
    token = forward_next(token);
  }
  auto t2 = std::chrono::system_clock::now();
  auto use0 = std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0);
  auto use1 = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1);
  printf("\n\nfirst token latency: %f s", (use0.count() * 1e-6));
  printf("\nspeed: %f token/s\n", tok_num / (use1.count() * 1e-6));

  if (token_length >= SEQLEN) {
    printf("Warning: reached the max sequence length!\n");
    history_vector.push_back({input_str, cur_answer});
    cur_answer.clear();

    // drop the first half of the history
    size_t half_size = history_vector.size() / 2;
    history_vector.erase(history_vector.begin(), history_vector.begin() + half_size);
  } else {
    history_vector.push_back({input_str, cur_answer});
    cur_answer.clear();
  }
}

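// Why answer() decodes token pairs: SentencePiece derives whitespace from
// context, so decoding a new token in isolation can drop its leading space.
// Decoding {pre_token} and {pre_token, token} and printing only the suffix
// recovers exactly the text the new token contributes. Here pre_token stays
// 0, so each step decodes {0, token} against {0}; the general form of the
// trick pairs the previous token with the current one (a minimal sketch):
//
//   std::string pre_word, word;
//   sentencepiece.Decode(std::vector<int>{prev}, &pre_word);
//   sentencepiece.Decode(std::vector<int>{prev, cur}, &word);
//   std::cout << word.substr(pre_word.size()) << std::flush;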
static void split(const std::string &s, const std::string &delim,
                  std::vector<std::string> &ret) {
  size_t last = 0;
  size_t index = s.find_first_of(delim, last);
  while (index != std::string::npos) {
    ret.push_back(s.substr(last, index - last));
    last = index + 1;
    index = s.find_first_of(delim, last);
  }
  if (last < s.length()) {
    ret.push_back(s.substr(last));
  }
}

static std::vector<int> parseCascadeDevices(const std::string &str) {
  std::vector<int> devices;
  std::vector<std::string> sub_str;
  split(str, ",", sub_str);
  for (auto &s : sub_str) {
    devices.push_back(std::atoi(s.c_str()));
  }
  return devices;
}

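// Usage example (illustrative): parseCascadeDevices("0,1") returns {0, 1},
// matching the "--devid 0,1" form parsed in processArguments() below. Note
// that std::atoi maps a malformed entry such as "x" to 0, silently selecting
// device 0.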
void Usage() {
  printf("Usage:\n"
         "  --help      : Show help info.\n"
         "  --model     : Set the model path.\n"
         "  --tokenizer : Set the tokenizer path.\n"
         "  --devid     : Set the devices to run the model on, e.g. 1,2;\n"
         "                defaults to 0 if not set.\n");
}

void processArguments(int argc, char *argv[], std::string &model_path,
                      std::string &tokenizer_path, std::vector<int> &devices) {
  struct option longOptions[] = {{"model", required_argument, nullptr, 'm'},
                                 {"tokenizer", required_argument, nullptr, 't'},
                                 {"devid", required_argument, nullptr, 'd'},
                                 {"help", no_argument, nullptr, 'h'},
                                 {nullptr, 0, nullptr, 0}};

  int optionIndex = 0;
  int option;

  while ((option = getopt_long(argc, argv, "m:t:d:h", longOptions,
                               &optionIndex)) != -1) {
    switch (option) {
    case 'm':
      model_path = optarg;
      break;
    case 't':
      tokenizer_path = optarg;
      break;
    case 'd':
      devices = parseCascadeDevices(optarg);
      break;
    case 'h':
      Usage();
      exit(EXIT_SUCCESS);
    case '?':
      Usage();
      exit(EXIT_FAILURE);
    default:
      exit(EXIT_FAILURE);
    }
  }
}

int main(int argc, char **argv) {
  // model and tokenizer paths are taken from the command line
  printf("Demo for ChatGLM on BM1684X, supporting ChatGLM1/2/3\n");
  std::string model_path;
  std::string tokenizer_path;
  std::vector<int> devices = {0};
  processArguments(argc, argv, model_path, tokenizer_path, devices);
  if (model_path.empty()) {
    Usage();
    exit(EXIT_FAILURE);
  }

  ChatGLM glm;
  printf("Init Environment ...\n");
  glm.init(devices, model_path, tokenizer_path);
  printf("==========================\n");
  glm.chat();
  glm.deinit();
  return 0;
}
ChatGLM2/run_demo.sh
ADDED
@@ -0,0 +1,27 @@
#!/bin/bash
set -ex

# download bmodel
if [ ! -d "../../bmodels" ]; then
  mkdir ../../bmodels
fi

if [ ! -f "../../bmodels/chatglm2-6b_int4_1dev.bmodel" ]; then
  pip3 install dfss
  python3 -m dfss --url=open@sophgo.com:/ext_model_information/LLM/LLM-TPU/chatglm2-6b_int4_1dev.bmodel
  mv chatglm2-6b_int4_1dev.bmodel ../../bmodels
else
  echo "Bmodel exists!"
fi

# build the demo binary if it is not built yet
if [ ! -f "./demo/chatglm" ]; then
  cd demo && rm -rf build && mkdir build && cd build
  cmake .. && make -j
  cp chatglm .. && cd ../..
else
  echo "chatglm binary exists!"
fi

# run demo
./demo/chatglm --model ../../bmodels/chatglm2-6b_int4_1dev.bmodel --tokenizer ./support/tokenizer/tokenizer.model --devid 0
ChatGLM2/support/include/bmdef.h
ADDED
@@ -0,0 +1,129 @@
/*****************************************************************************
 *
 * Copyright (c) 2016-2026 by Sophgo Technologies Inc. All rights reserved.
 *
 * The material in this file is confidential and contains trade secrets
 * of Sophgo Technologies Inc. This is proprietary information owned by
 * Sophgo Technologies Inc. No part of this work may be disclosed,
 * reproduced, copied, transmitted, or used in any way for any purpose,
 * without the express written permission of Sophgo Technologies Inc.
 *
 *****************************************************************************/

#ifndef __BMRUNTIME_DEFINE_H__
#define __BMRUNTIME_DEFINE_H__

#include "bmlib_runtime.h"
#include <stddef.h>
#include <stdint.h>

#if defined(__cplusplus)
extern "C" {
#endif

/* --------------------------------------------------------------------------*/
/* basic definitions */

/* bm_data_type_t holds the type for a scalar value */
typedef enum bm_data_type_e {
  BM_FLOAT32 = 0,
  BM_FLOAT16 = 1,
  BM_INT8 = 2,
  BM_UINT8 = 3,
  BM_INT16 = 4,
  BM_UINT16 = 5,
  BM_INT32 = 6,
  BM_UINT32 = 7,
  BM_BFLOAT16 = 8,
  BM_INT4 = 9,
  BM_UINT4 = 10,
} bm_data_type_t;

/* store mode definitions */
typedef enum bm_store_mode_e {
  BM_STORE_1N = 0, /* default, if not sure, use 0 */
  BM_STORE_2N = 1,
  BM_STORE_4N = 2,
} bm_store_mode_t;

/* bm_shape_t holds the shape info */
#define BM_MAX_DIMS_NUM 8
typedef struct bm_shape_s {
  int num_dims;
  int dims[BM_MAX_DIMS_NUM];
} bm_shape_t;

typedef struct bm_shape_ex_s {
  bm_shape_t shape;
  int elem_num;
} bm_shape_ex_t;

/*
bm_tensor_t holds a multi-dimensional array of elements of a single data type,
and the tensor is in device memory */
typedef struct bm_tensor_s {
  bm_data_type_t dtype;
  bm_shape_t shape;
  bm_device_mem_t device_mem;
  bm_store_mode_t st_mode; /* user can set 0 as default store mode */
} bm_tensor_t;

/* --------------------------------------------------------------------------*/
/* network information structure */

/* bm_stage_info_t holds input/output shapes and device mems; every network can contain one or more
 * stages */
typedef struct bm_stage_info_s {
  bm_shape_t *input_shapes;     /* input_shapes[0] / [1] / ... / [input_num-1] */
  bm_shape_t *output_shapes;    /* output_shapes[0] / [1] / ... / [output_num-1] */
  bm_device_mem_t *input_mems;  /* input_mems[0] / [1] / ... / [input_num-1] */
  bm_device_mem_t *output_mems; /* output_mems[0] / [1] / ... / [output_num-1] */
} bm_stage_info_t;

/* bm_net_info_t holds all information of one net.
 * scale for float type is 1.0 as default */
typedef struct bm_net_info_s {
  const char* name;              /* net name */
  bool is_dynamic;               /* dynamic or static */
  int input_num;                 /* number of inputs */
  char const** input_names;      /* input_names[0] / [1] / .../ [input_num-1] */
  bm_data_type_t* input_dtypes;  /* input_dtypes[0] / [1] / .../ [input_num-1] */
  float* input_scales;           /* input_scales[0] / [1] / .../ [input_num-1] */
  int output_num;                /* number of outputs */
  char const** output_names;     /* output_names[0] / [1] / .../ [output_num-1] */
  bm_data_type_t* output_dtypes; /* output_dtypes[0] / [1] / .../ [output_num-1] */
  float* output_scales;          /* output_scales[0] / [1] / .../ [output_num-1] */
  int stage_num;                 /* number of stages */
  bm_stage_info_t* stages;       /* stages[0] / [1] / ... / [stage_num-1] */
  size_t* max_input_bytes;       /* max_input_bytes[0] / [1] / ... / [input_num-1] */
  size_t* max_output_bytes;      /* max_output_bytes[0] / [1] / ... / [output_num-1] */
  int* input_zero_point;         /* input_zero_point[0] / [1] / .../ [input_num-1] */
  int* output_zero_point;        /* output_zero_point[0] / [1] / .../ [output_num-1] */
  int *input_loc_devices;        /* input_loc_devices[0] / [1] / .../ [input_num-1] */
  int *output_loc_devices;       /* output_loc_devices[0] / [1] / .../ [output_num-1] */
} bm_net_info_t;

typedef struct api_info_s {
  /// @brief api_id to be sent to driver
  int32_t api_id;
  /// @brief api data to be sent to driver
  uint8_t **api_data;
  /// @brief size of the api data to be sent to driver
  size_t api_data_size;
  /// @brief subsize of the api data to be sent to driver
  size_t *api_data_subsize;
  /// @brief offset of input tensors' addr in api_data
  uint32_t *input_addr_offset;
  /// @brief number of the offset of input tensors' addr in api_data
  size_t input_addr_offset_number;
  /// @brief offset of output tensors' addr in api_data
  uint32_t *output_addr_offset;
  /// @brief number of the offset of output tensors' addr in api_data
  size_t output_addr_offset_number;
} api_info_c;

#if defined(__cplusplus)
}
#endif

#endif /* __BMRUNTIME_DEFINE_H__ */
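These structures are what the demo's C++ code walks when sizing its tensors. A minimal sketch of dumping a network's first-stage input shapes, assuming the `bmrt_get_network_info` accessor declared in the accompanying bmruntime_interface.h (the net name passed in is whatever the bmodel exposes):

    #include "bmdef.h"
    #include "bmruntime_interface.h"
    #include <stdio.h>

    /* Print every input of one network's stage 0: index, name, dtype, shape. */
    void dump_inputs(void *p_bmrt, const char *net_name) {
      const bm_net_info_t *net = bmrt_get_network_info(p_bmrt, net_name);
      for (int i = 0; i < net->input_num; ++i) {
        const bm_shape_t *s = &net->stages[0].input_shapes[i];
        printf("input %d (%s) dtype=%d shape=[", i, net->input_names[i],
               net->input_dtypes[i]);
        for (int d = 0; d < s->num_dims; ++d)
          printf("%d%s", s->dims[d], d + 1 < s->num_dims ? "," : "");
        printf("]\n");
      }
    }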
ChatGLM2/support/include/bmlib_runtime.h
ADDED
@@ -0,0 +1,2581 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/*****************************************************************************
|
2 |
+
*
|
3 |
+
* Copyright (c) 2016-2026 by Bitmain Technologies Inc. All rights reserved.
|
4 |
+
*
|
5 |
+
* The material in this file is confidential and contains trade secrets
|
6 |
+
* of Bitmain Technologies Inc. This is proprietary information owned by
|
7 |
+
* Bitmain Technologies Inc. No part of this work may be disclosed,
|
8 |
+
* reproduced, copied, transmitted, or used in any way for any purpose,
|
9 |
+
* without the express written permission of Bitmain Technologies Inc.
|
10 |
+
*
|
11 |
+
*****************************************************************************/
|
12 |
+
|
13 |
+
/**************************************************************************
|
14 |
+
* bmlib_runtime defines interfaces that operate TPU devices.
|
15 |
+
* The functions can be divided into serveral categories.
|
16 |
+
* 1) device handle creation and destroy
|
17 |
+
* 2) memory help functions
|
18 |
+
* 3) global memory allocation and free
|
19 |
+
* 4) data transfer between host and device
|
20 |
+
* 5) data transfer within device memory
|
21 |
+
* 6) api send and synchronization
|
22 |
+
* 7) global memory map and coherence
|
23 |
+
* 8) trace and profile
|
24 |
+
* 9) power management
|
25 |
+
* 10) miscellaneous functions
|
26 |
+
*************************************************************************/
|
27 |
+
|
28 |
+
#ifndef BMLIB_RUNTIME_H_
|
29 |
+
#define BMLIB_RUNTIME_H_
|
30 |
+
#if defined(_WIN32) && !defined(__MINGW32__)
|
31 |
+
#include <vadefs.h>
|
32 |
+
#define DECL_EXPORT __declspec(dllexport)
|
33 |
+
#define DECL_IMPORT __declspec(dllimport)
|
34 |
+
#else
|
35 |
+
#include <stdbool.h>
|
36 |
+
#include <stddef.h>
|
37 |
+
#include <stdarg.h>
|
38 |
+
#define DECL_EXPORT
|
39 |
+
#define DECL_IMPORT
|
40 |
+
#endif
|
41 |
+
|
42 |
+
#if defined(__cplusplus)
|
43 |
+
extern "C" {
|
44 |
+
#endif
|
45 |
+
|
46 |
+
typedef enum {
|
47 |
+
MODULE_CDMA = 0,
|
48 |
+
MODULE_GDMA = 1,
|
49 |
+
MODULE_TPU = 2,
|
50 |
+
MODULE_SMMU = 3,
|
51 |
+
MODULE_SRAM = 4,
|
52 |
+
MODULE_END = 5
|
53 |
+
} MODULE_ID;
|
54 |
+
|
55 |
+
#define BM_MEM_ADDR_NULL (0xfffffffff)
|
56 |
+
|
57 |
+
#ifndef BM_MEM_DESC_T_
|
58 |
+
#define BM_MEM_DESC_T_
|
59 |
+
/* BM function return code definitions */
|
60 |
+
typedef enum {
|
61 |
+
BM_SUCCESS = 0,
|
62 |
+
BM_ERR_DEVNOTREADY = 1, /* Device not ready yet */
|
63 |
+
BM_ERR_FAILURE = 2, /* General failure */
|
64 |
+
BM_ERR_TIMEOUT = 3, /* Timeout */
|
65 |
+
BM_ERR_PARAM = 4, /* Parameters invalid */
|
66 |
+
BM_ERR_NOMEM = 5, /* Not enough memory */
|
67 |
+
BM_ERR_DATA = 6, /* Data error */
|
68 |
+
BM_ERR_BUSY = 7, /* Busy */
|
69 |
+
BM_ERR_NOFEATURE = 8, /* Not supported yet */
|
70 |
+
BM_NOT_SUPPORTED = 9
|
71 |
+
} bm_status_t;
|
72 |
+
|
73 |
+
/* BM memory type definitions */
|
74 |
+
typedef enum {
|
75 |
+
BM_MEM_TYPE_DEVICE = 0,
|
76 |
+
BM_MEM_TYPE_HOST = 1,
|
77 |
+
BM_MEM_TYPE_SYSTEM = 2,
|
78 |
+
BM_MEM_TYPE_INT8_DEVICE = 3,
|
79 |
+
BM_MEM_TYPE_INVALID = 4
|
80 |
+
} bm_mem_type_t;
|
81 |
+
|
82 |
+
typedef enum {
|
83 |
+
PERF_MONITOR_GDMA = 0,
|
84 |
+
PERF_MONITOR_TPU = 1
|
85 |
+
} PERF_MONITOR_ID;
|
86 |
+
|
87 |
+
typedef enum {
|
88 |
+
BMCPU_IDLE = 0,
|
89 |
+
BMCPU_RUNNING = 1,
|
90 |
+
BMCPU_FAULT = 2
|
91 |
+
} bm_cpu_status_t;
|
92 |
+
|
93 |
+
/*
|
94 |
+
* bm performace monitor
|
95 |
+
*/
|
96 |
+
typedef struct bm_perf_monitor {
|
97 |
+
long long buffer_start_addr; /*buffer address to store perf data*/
|
98 |
+
int buffer_size; /*buffer size*/
|
99 |
+
PERF_MONITOR_ID monitor_id; /*PERF_MONITOR_GDMA or PERF_MONITOR_TPU*/
|
100 |
+
} bm_perf_monitor_t;
|
101 |
+
|
102 |
+
typedef union {
|
103 |
+
struct {
|
104 |
+
bm_mem_type_t mem_type : 3;
|
105 |
+
unsigned int gmem_heapid : 3;
|
106 |
+
unsigned int reserved : 26;
|
107 |
+
} u;
|
108 |
+
unsigned int rawflags;
|
109 |
+
} bm_mem_flags_t;
|
110 |
+
|
111 |
+
/* BM memory descriptor definition*/
|
112 |
+
typedef struct bm_mem_desc {
|
113 |
+
union {
|
114 |
+
struct {
|
115 |
+
#ifdef __linux__
|
116 |
+
unsigned long device_addr;
|
117 |
+
#else
|
118 |
+
unsigned long long device_addr;
|
119 |
+
#endif
|
120 |
+
unsigned int reserved;
|
121 |
+
int dmabuf_fd;
|
122 |
+
} device;
|
123 |
+
|
124 |
+
struct {
|
125 |
+
void *system_addr;
|
126 |
+
unsigned int reserved0;
|
127 |
+
int reserved1;
|
128 |
+
} system;
|
129 |
+
} u;
|
130 |
+
|
131 |
+
bm_mem_flags_t flags;
|
132 |
+
unsigned int size;
|
133 |
+
} bm_mem_desc_t;
|
134 |
+
|
135 |
+
typedef struct bm_mem_desc bm_device_mem_t;
|
136 |
+
typedef struct bm_mem_desc bm_system_mem_t;
|
137 |
+
|
138 |
+
typedef struct sg_mem_desc {
|
139 |
+
union {
|
140 |
+
struct {
|
141 |
+
#ifdef __linux__
|
142 |
+
unsigned long device_addr;
|
143 |
+
#else
|
144 |
+
unsigned long long device_addr;
|
145 |
+
#endif
|
146 |
+
unsigned int reserved;
|
147 |
+
int dmabuf_fd;
|
148 |
+
} device;
|
149 |
+
|
150 |
+
struct {
|
151 |
+
void *system_addr;
|
152 |
+
unsigned int reserved0;
|
153 |
+
int reserved1;
|
154 |
+
} system;
|
155 |
+
} u;
|
156 |
+
|
157 |
+
bm_mem_flags_t flags;
|
158 |
+
unsigned long long size;
|
159 |
+
} sg_mem_desc_t;
|
160 |
+
|
161 |
+
typedef struct sg_mem_desc sg_device_mem_t;
|
162 |
+
typedef struct sg_mem_desc sg_system_mem_t;
|
163 |
+
#endif
|
164 |
+
|
165 |
+
struct bm_context;
|
166 |
+
typedef struct bm_context *bm_handle_t;
|
167 |
+
|
168 |
+
#define MD5SUM_LEN 16
|
169 |
+
#define LIB_MAX_NAME_LEN 64
|
170 |
+
#define FUNC_MAX_NAME_LEN 64
|
171 |
+
|
172 |
+
typedef struct bm_module
|
173 |
+
{
|
174 |
+
// void *lib_handle;
|
175 |
+
char lib_name[LIB_MAX_NAME_LEN];
|
176 |
+
unsigned char md5[MD5SUM_LEN];
|
177 |
+
}bm_module;
|
178 |
+
|
179 |
+
typedef struct bm_module *tpu_kernel_module_t;
|
180 |
+
typedef int tpu_kernel_function_t;
|
181 |
+
|
182 |
+
/**
|
183 |
+
* @name tpu_kernel_load_module_file
|
184 |
+
* @brief To load dyn file
|
185 |
+
* @ingroup bmlib_runtime
|
186 |
+
*
|
187 |
+
* @param [in] handle The device handle
|
188 |
+
* @param [in] module_file dyn file
|
189 |
+
* @retval dyn lib ptr
|
190 |
+
*/
|
191 |
+
tpu_kernel_module_t tpu_kernel_load_module_file(bm_handle_t handle, const char *module_file);
|
192 |
+
|
193 |
+
/**
|
194 |
+
* @name tpu_kernel_load_module_file_key
|
195 |
+
* @brief To load dyn file with key
|
196 |
+
* @ingroup bmlib_runtime
|
197 |
+
*
|
198 |
+
* @param [in] handle The device handle
|
199 |
+
* @param [in] module_file dyn file
|
200 |
+
* @param [in] key identification str
|
201 |
+
* @param [in] size key size
|
202 |
+
* @retval dyn lib ptr
|
203 |
+
*/
|
204 |
+
tpu_kernel_module_t tpu_kernel_load_module_file_key(bm_handle_t handle, const char *module_file, const char *key, int size);
|
205 |
+
|
206 |
+
/**
|
207 |
+
* @name tpu_kernel_unload_module
|
208 |
+
* @brief To unload dyn file
|
209 |
+
* @ingroup bmlib_runtime
|
210 |
+
*
|
211 |
+
* @param [in] handle The device handle
|
212 |
+
* @param [in] p_module dyn lib ptr
|
213 |
+
* @retval BM_SUCCESS Succeeds.
|
214 |
+
* Other code Fails.
|
215 |
+
*/
|
216 |
+
bm_status_t tpu_kernel_unload_module(bm_handle_t handle, tpu_kernel_module_t p_module);
|
217 |
+
|
218 |
+
/**
|
219 |
+
* @name tpu_kernel_free_module
|
220 |
+
* @brief To free p_module when not use
|
221 |
+
* @ingroup bmlib_runtime
|
222 |
+
*
|
223 |
+
* @param [in] handle The device handle
|
224 |
+
* @param [in] p_module dyn lib ptr
|
225 |
+
* @retval BM_SUCCESS Succeeds.
|
226 |
+
* Other code Fails.
|
227 |
+
*/
|
228 |
+
bm_status_t tpu_kernel_free_module(bm_handle_t handle, tpu_kernel_module_t p_module);
|
229 |
+
|
230 |
+
/**
|
231 |
+
* @name tpu_kernel_load_module
|
232 |
+
* @brief To load dyn module
|
233 |
+
* @ingroup bmlib_runtime
|
234 |
+
*
|
235 |
+
* @param [in] handle The device handle
|
236 |
+
* @param [in] data dyn module
|
237 |
+
* @param [in] length dyn module size
|
238 |
+
* @retval dyn lib ptr
|
239 |
+
*/
|
240 |
+
tpu_kernel_module_t tpu_kernel_load_module(bm_handle_t handle, const char *data, size_t length);
|
241 |
+
|
242 |
+
/**
|
243 |
+
* @name tpu_kernel_get_function
|
244 |
+
* @brief To get function from lib
|
245 |
+
* @ingroup bmlib_runtime
|
246 |
+
*
|
247 |
+
* @param [in] handle The device handle
|
248 |
+
* @param [in] module dyn module
|
249 |
+
* @param [in] function funtion name
|
250 |
+
* @retval function id
|
251 |
+
*/
|
252 |
+
tpu_kernel_function_t tpu_kernel_get_function(bm_handle_t handle, tpu_kernel_module_t module, const char *function);
|
253 |
+
|
254 |
+
/**
|
255 |
+
* @name tpu_kernel_launch
|
256 |
+
* @brief To launch function with sync
|
257 |
+
* @ingroup bmlib_runtime
|
258 |
+
*
|
259 |
+
* @param [in] handle The device handle
|
260 |
+
* @param [in] function function id
|
261 |
+
* @param [in] args funtion args
|
262 |
+
* @param [in] size args size
|
263 |
+
* @retval BM_SUCCESS Succeeds.
|
264 |
+
* Other code Fails.
|
265 |
+
*/
|
266 |
+
bm_status_t tpu_kernel_launch(bm_handle_t handle, tpu_kernel_function_t function, void *args, size_t size);
|
267 |
+
|
268 |
+
/**
|
269 |
+
* @name tpu_kernel_launch_async
|
270 |
+
* @brief To launch function with async
|
271 |
+
* @ingroup bmlib_runtime
|
272 |
+
*
|
273 |
+
* @param [in] handle The device handle
|
274 |
+
* @param [in] function function id
|
275 |
+
* @param [in] args funtion args
|
276 |
+
* @param [in] size args size
|
277 |
+
* @retval BM_SUCCESS Succeeds.
|
278 |
+
* Other code Fails.
|
279 |
+
*/
|
280 |
+
bm_status_t tpu_kernel_launch_async(bm_handle_t handle, tpu_kernel_function_t function, void *args, size_t size);
|
281 |
+
|
282 |
+
/**
|
283 |
+
* @name tpu_kernel_launch_async_multi_cores
|
284 |
+
* @brief To launch function with async for multi cores
|
285 |
+
* @ingroup bmlib_runtime
|
286 |
+
*
|
287 |
+
* @param [in] handle The device handle
|
288 |
+
* @param [in] func_name function name
|
289 |
+
* @param [in] api_param funtion params
|
290 |
+
* @param [in] api_size params size
|
291 |
+
* @param [in] core_list list of core ids
|
292 |
+
* @param [in] core_num number of cores
|
293 |
+
* @retval BM_SUCCESS Succeeds.
|
294 |
+
* Other code Fails.
|
295 |
+
*/
|
296 |
+
bm_status_t tpu_kernel_launch_async_multi_cores(bm_handle_t handle, const char *func_name, const void *api_param,
|
297 |
+
size_t api_size, const int* core_list, const int core_num);
|
298 |
+
|
299 |
+
/**
|
300 |
+
* @name tpu_kernel_launch_sync_multi_cores
|
301 |
+
* @brief To launch function with sync for multi cores
|
302 |
+
* @ingroup bmlib_runtime
|
303 |
+
*
|
304 |
+
* @param [in] handle The device handle
|
305 |
+
* @param [in] func_name function name
|
306 |
+
* @param [in] api_param funtion params
|
307 |
+
* @param [in] api_size params size
|
308 |
+
* @param [in] core_list list of core ids
|
309 |
+
* @param [in] core_num number of cores
|
310 |
+
* @retval BM_SUCCESS Succeeds.
|
311 |
+
* Other code Fails.
|
312 |
+
*/
|
313 |
+
bm_status_t tpu_kernel_launch_sync_multi_cores(bm_handle_t handle, const char *func_name, const void *api_param,
|
314 |
+
size_t api_size, const int* core_list, const int core_num);
|
315 |
+
|
316 |
+
/**
|
317 |
+
* @name tpu_kernel_sync
|
318 |
+
* @brief To sync
|
319 |
+
* @ingroup bmlib_runtime
|
320 |
+
*
|
321 |
+
* @param [in] handle The device handle
|
322 |
+
* @retval BM_SUCCESS Succeeds.
|
323 |
+
* Other code Fails.
|
324 |
+
*/
|
325 |
+
bm_status_t tpu_kernel_sync(bm_handle_t handle);
|
326 |
+
void show_md5(unsigned char md5[]);
|
327 |
+
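/*
 * Usage sketch for the TPU-kernel API above (illustrative; the module path
 * and function name are placeholders, not part of this header):
 *
 *   tpu_kernel_module_t mod =
 *       tpu_kernel_load_module_file(handle, "libbm1684x_kernel_module.so");
 *   tpu_kernel_function_t func =
 *       tpu_kernel_get_function(handle, mod, "my_kernel_func");
 *   bm_status_t st = tpu_kernel_launch(handle, func, &args, sizeof(args));
 *   // BM_SUCCESS expected; tpu_kernel_launch_async followed by
 *   // tpu_kernel_sync is the non-blocking variant
 *   tpu_kernel_unload_module(handle, mod);
 */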
DECL_EXPORT void bmlib_log(const char *tag, int level, const char *fmt, ...);

#ifndef USING_CMODEL
#define BM_CHECK_RET(call)                                                    \
  do {                                                                        \
    bm_status_t ret = (bm_status_t)call;                                      \
    if (ret != BM_SUCCESS) {                                                  \
      bmlib_log("BM_CHECK", 16, "BM_CHECK_RET fail %s: %s: %d\n", __FILE__,   \
                __func__, __LINE__);                                          \
      return ret;                                                             \
    }                                                                         \
  } while (0)
#else
#define BM_CHECK_RET(call)                                                    \
  do {                                                                        \
    bm_status_t ret = call;                                                   \
    if (ret != BM_SUCCESS) {                                                  \
      bmlib_log("BM_CHECK", 16, "BM_CHECK_RET failed %d\n", ret);             \
      ASSERT(0);                                                              \
      exit(-ret);                                                             \
    }                                                                         \
  } while (0)
#endif

/******************* handle related functions *********************************/
/**
 * @name bm_dev_getcount
 * @brief To get the number of sophon devices in the system.
 *        If N is returned, the valid devids are [0, N-1].
 * @ingroup bmlib_runtime
 *
 * @param [out] count The resulting number of sophon devices
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_dev_getcount(int *count);

/**
 * @name bm_dev_query
 * @brief To query if a device is present
 * @ingroup bmlib_runtime
 *
 * @param [in] devid The id of the device to query
 * @retval BM_SUCCESS Device is present
 *         Other code Device is not present
 */
DECL_EXPORT bm_status_t bm_dev_query(int devid);

/**
 * @name bm_dev_request
 * @brief To create a handle for the given device
 * @ingroup bmlib_runtime
 *
 * @param [out] handle The created handle
 * @param [in] devid Specify on which device to create the handle
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_dev_request(bm_handle_t *handle, int devid);

/**
 * @name bm_get_devid
 * @brief To get the device index for the given handle
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The given handle
 * @retval int device index that the handle points to.
 */
DECL_EXPORT int bm_get_devid(bm_handle_t handle);

/**
 * @name bm_dev_free
 * @brief To free a handle
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The handle to free
 */
DECL_EXPORT void bm_dev_free(bm_handle_t handle);

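/*
 * Usage sketch for the handle functions above (illustrative):
 *
 *   int count = 0;
 *   bm_dev_getcount(&count);              // number of devices present
 *   bm_handle_t handle;
 *   if (count > 0 && bm_dev_request(&handle, 0) == BM_SUCCESS) {
 *     // ... use the device through this handle ...
 *     bm_dev_free(handle);
 *   }
 */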
/******************* memory help functions ************************************/
/**
 * @name bm_mem_get_type
 * @brief To get a memory descriptor's type
 * @ingroup bmlib_runtime
 *
 * @param [in] mem The memory descriptor queried
 * @retval BM_MEM_TYPE_DEVICE Device global memory
 * @retval BM_MEM_TYPE_SYSTEM Host user memory
 */
DECL_EXPORT bm_mem_type_t bm_mem_get_type(struct bm_mem_desc mem);

/**
 * @name sg_mem_get_type
 * @brief To get a memory descriptor's type
 * @ingroup bmlib_runtime
 *
 * @param [in] mem The memory descriptor queried
 * @retval BM_MEM_TYPE_DEVICE Device global memory
 * @retval BM_MEM_TYPE_SYSTEM Host user memory
 */
DECL_EXPORT bm_mem_type_t sg_mem_get_type(struct sg_mem_desc mem);

/**
 * @name bm_mem_get_device_addr
 * @brief To get a device memory descriptor's address
 * @ingroup bmlib_runtime
 *
 * @param [in] mem The device memory descriptor queried
 * @retval unsigned long long The device memory address
 */
DECL_EXPORT unsigned long long bm_mem_get_device_addr(struct bm_mem_desc mem);

/**
 * @name sg_mem_get_device_addr
 * @brief To get a device memory descriptor's address
 * @ingroup bmlib_runtime
 *
 * @param [in] mem The device memory descriptor queried
 * @retval unsigned long long The device memory address
 */
DECL_EXPORT unsigned long long sg_mem_get_device_addr(struct sg_mem_desc mem);

/**
 * @name bm_mem_set_device_addr
 * @brief To set a device memory descriptor's address
 * @ingroup bmlib_runtime
 *
 * @param [in] pmem The device memory descriptor pointer
 * @param [in] addr The new device address of the device memory
 */
DECL_EXPORT void bm_mem_set_device_addr(struct bm_mem_desc* pmem, unsigned long long addr);

/**
 * @name sg_mem_set_device_addr
 * @brief To set a device memory descriptor's address
 * @ingroup bmlib_runtime
 *
 * @param [in] pmem The device memory descriptor pointer
 * @param [in] addr The new device address of the device memory
 */
DECL_EXPORT void sg_mem_set_device_addr(struct sg_mem_desc* pmem, unsigned long long addr);

/**
 * @name bm_mem_get_device_size
 * @brief To get a device memory descriptor's size
 * @ingroup bmlib_runtime
 *
 * @param [in] mem The device memory descriptor queried
 * @retval unsigned int The device memory's size in bytes
 */
DECL_EXPORT unsigned int bm_mem_get_device_size(struct bm_mem_desc mem);

/**
 * @name sg_mem_get_device_size
 * @brief To get a device memory descriptor's size
 * @ingroup bmlib_runtime
 *
 * @param [in] mem The device memory descriptor queried
 * @retval unsigned long long The device memory's size in bytes
 */
DECL_EXPORT unsigned long long sg_mem_get_device_size(struct sg_mem_desc mem);

/**
 * @name bm_mem_set_device_size
 * @brief To set a device memory descriptor's size
 * @ingroup bmlib_runtime
 *
 * @param [out] pmem The device memory descriptor pointer
 * @param [in] size The new device memory size (in bytes) of the device memory
 */
DECL_EXPORT void bm_mem_set_device_size(struct bm_mem_desc* pmem, unsigned int size);

/**
 * @name sg_mem_set_device_size
 * @brief To set a device memory descriptor's size
 * @ingroup bmlib_runtime
 *
 * @param [out] pmem The device memory descriptor pointer
 * @param [in] size The new device memory size (in bytes) of the device memory
 */
DECL_EXPORT void sg_mem_set_device_size(struct sg_mem_desc* pmem, unsigned long long size);

/**
 * @name bm_set_device_mem
 * @brief To fill in a device memory descriptor with size and address
 * @ingroup bmlib_runtime
 *
 * @param [in] pmem The device memory descriptor pointer
 * @param [in] size The device memory descriptor's size
 * @param [in] addr The device memory descriptor's address
 */
DECL_EXPORT void bm_set_device_mem(bm_device_mem_t* pmem, unsigned int size,
                                   unsigned long long addr);

/**
 * @name sg_set_device_mem
 * @brief To fill in a device memory descriptor with size and address
 * @ingroup bmlib_runtime
 *
 * @param [in] pmem The device memory descriptor pointer
 * @param [in] size The device memory descriptor's size
 * @param [in] addr The device memory descriptor's address
 */
DECL_EXPORT void sg_set_device_mem(sg_device_mem_t* pmem, unsigned long long size,
                                   unsigned long long addr);

/**
 * @name bm_mem_from_device
 * @brief To create a device memory descriptor from address and size
 * @ingroup bmlib_runtime
 *
 * @param [in] device_addr The device memory address
 * @param [in] len The device memory size
 * @retval bm_device_mem_t The device memory descriptor created
 */
DECL_EXPORT bm_device_mem_t bm_mem_from_device(unsigned long long device_addr,
                                               unsigned int len);

/**
 * @name sg_mem_from_device
 * @brief To create a device memory descriptor from address and size
 * @ingroup bmlib_runtime
 *
 * @param [in] device_addr The device memory address
 * @param [in] len The device memory size
 * @retval sg_device_mem_t The device memory descriptor created
 */
DECL_EXPORT sg_device_mem_t sg_mem_from_device(unsigned long long device_addr,
                                               unsigned long long len);

/**
 * @name bm_mem_get_system_addr
 * @brief To get a system memory descriptor's address
 * @ingroup bmlib_runtime
 *
 * @param [in] mem The system memory descriptor
 * @retval void * The system memory descriptor's address
 */
DECL_EXPORT void *bm_mem_get_system_addr(struct bm_mem_desc mem);

/**
 * @name sg_mem_get_system_addr
 * @brief To get a system memory descriptor's address
 * @ingroup bmlib_runtime
 *
 * @param [in] mem The system memory descriptor
 * @retval void * The system memory descriptor's address
 */
DECL_EXPORT void *sg_mem_get_system_addr(struct sg_mem_desc mem);

/**
 * @name bm_mem_set_system_addr
 * @brief To set a system memory descriptor's address
 * @ingroup bmlib_runtime
 *
 * @param [in] pmem The system memory descriptor pointer
 * @param [in] addr The system memory address
 */
DECL_EXPORT void bm_mem_set_system_addr(struct bm_mem_desc* pmem, void *addr);

/**
 * @name sg_mem_set_system_addr
 * @brief To set a system memory descriptor's address
 * @ingroup bmlib_runtime
 *
 * @param [in] pmem The system memory descriptor pointer
 * @param [in] addr The system memory address
 */
DECL_EXPORT void sg_mem_set_system_addr(struct sg_mem_desc* pmem, void *addr);

/**
 * @name bm_mem_from_system
 * @brief To create a system memory descriptor with the given system address
 * @ingroup bmlib_runtime
 *
 * @param [in] system_addr The system address in the descriptor
 * @retval bm_system_mem_t The system memory descriptor created
 */
DECL_EXPORT bm_system_mem_t bm_mem_from_system(void *system_addr);

/******************* memory alloc and free functions **************************/
/**
 * @name bm_mem_null
 * @brief Return an illegal device memory descriptor
 * @ingroup bmlib_runtime
 *
 * @retval bm_device_mem_t An invalid device memory descriptor
 */
DECL_EXPORT bm_device_mem_t bm_mem_null(void);
#define BM_MEM_NULL (bm_mem_null())

/**
 * @name bm_malloc_neuron_device
 * @brief To malloc device memory according to a tensor shape
 *        (each neuron is 32 bits)
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [out] pmem The resulting device memory descriptor
 * @param [in] n, c, h, w The shape of the input tensor
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_malloc_neuron_device(bm_handle_t handle, bm_device_mem_t *pmem,
                                                int n, int c, int h, int w);

/**
 * @name sg_malloc_neuron_device
 * @brief To malloc device memory according to a tensor shape
 *        (each neuron is 32 bits)
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [out] pmem The resulting device memory descriptor
 * @param [in] n, c, h, w The shape of the input tensor
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t sg_malloc_neuron_device(bm_handle_t handle, sg_device_mem_t *pmem,
                                                unsigned long long n, unsigned long long c,
                                                unsigned long long h, unsigned long long w);

/**
 * @name bm_malloc_device_dword
 * @brief To malloc device memory in size of dword (32 bits)
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [out] pmem The resulting device memory descriptor
 * @param [in] count The number of dwords (32 bits) to allocate
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_malloc_device_dword(bm_handle_t handle, bm_device_mem_t *pmem,
                                               int count);

/**
 * @name sg_malloc_device_dword
 * @brief To malloc device memory in size of dword (32 bits)
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [out] pmem The resulting device memory descriptor
 * @param [in] count The number of dwords (32 bits) to allocate
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t sg_malloc_device_dword(bm_handle_t handle, sg_device_mem_t *pmem,
                                               unsigned long long count);

/**
 * @name bm_malloc_device_byte
 * @brief To malloc device memory in size of byte
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [out] pmem The resulting device memory descriptor
 * @param [in] size The number of bytes to allocate
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_malloc_device_byte(bm_handle_t handle, bm_device_mem_t *pmem,
                                              unsigned int size);

|
691 |
+
/**
|
692 |
+
* @name sg_malloc_device_byte
|
693 |
+
* @brief To malloc device memory in size of byte
|
694 |
+
* @ingroup bmlib_runtime
|
695 |
+
*
|
696 |
+
* @param [in] handle The device handle
|
697 |
+
* @param [out] pmem The result device memory descriptor
|
698 |
+
* @param [in] size The number of bytes to allocate
|
699 |
+
* @retval BM_SUCCESS Succeeds.
|
700 |
+
* Other code Fails.
|
701 |
+
*/
|
702 |
+
DECL_EXPORT bm_status_t sg_malloc_device_byte(bm_handle_t handle, sg_device_mem_t *pmem,
|
703 |
+
unsigned long long size);
|
704 |
+
|
705 |
+
/**
|
706 |
+
* @name bm_malloc_device_byte_heap
|
707 |
+
* @brief To malloc device memory in size of byte within the specified heap
|
708 |
+
* @ingroup bmlib_runtime
|
709 |
+
*
|
710 |
+
* @param [in] handle The device handle
|
711 |
+
* @param [out] pmem The result device memory descriptor
|
712 |
+
* @param [in] heap_id The heap where to allocate 0/1/2
|
713 |
+
* @param [in] size The number of bytes to allocate
|
714 |
+
* @retval BM_SUCCESS Succeeds.
|
715 |
+
* Other code Fails.
|
716 |
+
*/
|
717 |
+
DECL_EXPORT bm_status_t bm_malloc_device_byte_heap(bm_handle_t handle, bm_device_mem_t *pmem,
|
718 |
+
int heap_id, unsigned int size);
|
719 |
+
|
720 |
+
/**
|
721 |
+
* @name sg_malloc_device_byte_heap
|
722 |
+
* @brief To malloc device memory in size of byte within the specified heap
|
723 |
+
* @ingroup bmlib_runtime
|
724 |
+
*
|
725 |
+
* @param [in] handle The device handle
|
726 |
+
* @param [out] pmem The result device memory descriptor
|
727 |
+
* @param [in] heap_id The heap where to allocate 0/1/2
|
728 |
+
* @param [in] size The number of bytes to allocate
|
729 |
+
* @retval BM_SUCCESS Succeeds.
|
730 |
+
* Other code Fails.
|
731 |
+
*/
|
732 |
+
DECL_EXPORT bm_status_t sg_malloc_device_byte_heap(bm_handle_t handle, sg_device_mem_t *pmem,
|
733 |
+
int heap_id, unsigned long long size);
|
734 |
+
|
735 |
+
/**
|
736 |
+
* @name bm_malloc_device_byte_heap_mask
|
737 |
+
* @brief To malloc device memory in size of byte within the specified heaps
|
738 |
+
* @ingroup bmlib_runtime
|
739 |
+
*
|
740 |
+
* @param [in] handle The device handle
|
741 |
+
* @param [out] pmem The result device memory descriptor
|
742 |
+
* @param [in] heap_id_mask The mask which heaps allocate from. each bit indicate one heap
|
743 |
+
* @param [in] size The number of bytes to allocate
|
744 |
+
* @retval BM_SUCCESS Succeeds.
|
745 |
+
* Other code Fails.
|
746 |
+
*/
|
747 |
+
DECL_EXPORT bm_status_t bm_malloc_device_byte_heap_mask(bm_handle_t handle, bm_device_mem_t *pmem,
|
748 |
+
int heap_id_mask, unsigned int size);
|
749 |
+
|
750 |
+
/**
|
751 |
+
* @name sg_malloc_device_byte_heap_mask
|
752 |
+
* @brief To malloc device memory in size of byte within the specified heaps
|
753 |
+
* @ingroup bmlib_runtime
|
754 |
+
*
|
755 |
+
* @param [in] handle The device handle
|
756 |
+
* @param [out] pmem The result device memory descriptor
|
757 |
+
* @param [in] heap_id_mask The mask which heaps allocate from. each bit indicate one heap
|
758 |
+
* @param [in] size The number of bytes to allocate
|
759 |
+
* @retval BM_SUCCESS Succeeds.
|
760 |
+
* Other code Fails.
|
761 |
+
*/
|
762 |
+
DECL_EXPORT bm_status_t sg_malloc_device_byte_heap_mask(bm_handle_t handle, sg_device_mem_t *pmem,
|
763 |
+
int heap_id_mask, unsigned long long size);
|
764 |
+
|
765 |
+
/**
|
766 |
+
* @name bm_free_device
|
767 |
+
* @brief To free device memory
|
768 |
+
* @ingroup bmlib_runtime
|
769 |
+
*
|
770 |
+
* @param [in] handle The device handle
|
771 |
+
* @param [in] mem The device memory descriptor to free
|
772 |
+
*/
|
773 |
+
DECL_EXPORT void bm_free_device(bm_handle_t handle, bm_device_mem_t mem);
|
774 |
+
|
775 |
+
/**
|
776 |
+
* @name sg_free_device
|
777 |
+
* @brief To free device memory
|
778 |
+
* @ingroup bmlib_runtime
|
779 |
+
*
|
780 |
+
* @param [in] handle The device handle
|
781 |
+
* @param [in] mem The device memory descriptor to free
|
782 |
+
*/
|
783 |
+
DECL_EXPORT void sg_free_device(bm_handle_t handle, sg_device_mem_t mem);
|
784 |
+
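
A short allocate/free sketch; `bm_dev_request` and `bm_dev_free` are the handle functions declared earlier in this header:

```c
#include <stdio.h>
#include "bmlib_runtime.h"

int main(void) {
    bm_handle_t handle;
    if (bm_dev_request(&handle, 0) != BM_SUCCESS)
        return 1;

    /* Allocate 4 MiB of device memory from the default heap, then free it. */
    bm_device_mem_t mem;
    if (bm_malloc_device_byte(handle, &mem, 4 * 1024 * 1024) == BM_SUCCESS)
        bm_free_device(handle, mem);
    else
        fprintf(stderr, "device malloc failed\n");

    bm_dev_free(handle);
    return 0;
}
```
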

/**
 * @name bm_gmem_arm_reserved_request
 * @brief To obtain the address of global memory reserved for arm926
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @retval unsigned long long The absolute address of gmem reserved for arm926
 */
DECL_EXPORT unsigned long long bm_gmem_arm_reserved_request(bm_handle_t handle);

/**
 * @name bm_gmem_arm_reserved_release
 * @brief To release the global memory reserved for arm926
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 */
DECL_EXPORT void bm_gmem_arm_reserved_release(bm_handle_t handle);

/*******************memory copy functions *************************************/
/**
 * @name bm_memcpy_s2d
 * @brief To copy data from system memory to device memory
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] dst The destination memory (device memory descriptor)
 * @param [in] src The source memory (system memory, a void* pointer)
 *
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_memcpy_s2d(bm_handle_t handle, bm_device_mem_t dst, void *src);

/**
 * @name bm_memcpy_p2p
 * @brief To copy data from one chip to another chip
 * @ingroup bmlib_runtime
 *
 * @param [in] handle_src The source device handle
 * @param [in] src The source memory (device memory descriptor)
 * @param [in] handle_dst The destination device handle
 * @param [in] dst The destination memory (device memory descriptor)
 *
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_memcpy_p2p(bm_handle_t handle_src, bm_device_mem_t src,
                                      bm_handle_t handle_dst, bm_device_mem_t dst);

/**
 * @name sg_memcpy_s2d
 * @brief To copy data from system memory to device memory
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] dst The destination memory (device memory descriptor)
 * @param [in] src The source memory (system memory, a void* pointer)
 *
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t sg_memcpy_s2d(bm_handle_t handle, sg_device_mem_t dst, void *src);

/**
 * @name bm_memcpy_s2d_partial_offset
 * @brief To copy specified bytes of data from system memory to device memory
 *        with an offset in device memory address.
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] dst The destination memory (device memory descriptor)
 * @param [in] src The source memory (system memory, a void* pointer)
 * @param [in] size The size of data to copy (in bytes)
 * @param [in] offset The offset of the device memory address
 *
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_memcpy_s2d_partial_offset(bm_handle_t handle,
                                                     bm_device_mem_t dst, void *src,
                                                     unsigned int size,
                                                     unsigned int offset);

/**
 * @name sg_memcpy_s2d_partial_offset
 * @brief To copy specified bytes of data from system memory to device memory
 *        with an offset in device memory address.
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] dst The destination memory (device memory descriptor)
 * @param [in] src The source memory (system memory, a void* pointer)
 * @param [in] size The size of data to copy (in bytes)
 * @param [in] offset The offset of the device memory address
 *
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t sg_memcpy_s2d_partial_offset(bm_handle_t handle,
                                                     sg_device_mem_t dst, void *src,
                                                     unsigned long long size,
                                                     unsigned long long offset);

/**
 * @name bm_memcpy_s2d_partial
 * @brief To copy specified bytes of data from system memory to device memory
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] dst The destination memory (device memory descriptor)
 * @param [in] src The source memory (system memory, a void* pointer)
 * @param [in] size The size of data to copy (in bytes)
 *
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_memcpy_s2d_partial(bm_handle_t handle, bm_device_mem_t dst,
                                              void *src, unsigned int size);

/**
 * @name sg_memcpy_s2d_partial
 * @brief To copy specified bytes of data from system memory to device memory
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] dst The destination memory (device memory descriptor)
 * @param [in] src The source memory (system memory, a void* pointer)
 * @param [in] size The size of data to copy (in bytes)
 *
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t sg_memcpy_s2d_partial(bm_handle_t handle, sg_device_mem_t dst,
                                              void *src, unsigned long long size);

/**
 * @name bm_memcpy_d2s
 * @brief To copy data from device memory to system memory
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] dst The destination memory (system memory, a void* pointer)
 * @param [in] src The source memory (device memory descriptor)
 *
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_memcpy_d2s(bm_handle_t handle, void *dst, bm_device_mem_t src);

/**
 * @name sg_memcpy_d2s
 * @brief To copy data from device memory to system memory
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] dst The destination memory (system memory, a void* pointer)
 * @param [in] src The source memory (device memory descriptor)
 *
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t sg_memcpy_d2s(bm_handle_t handle, void *dst, sg_device_mem_t src);

/**
 * @name bm_memcpy_d2s_partial_offset
 * @brief To copy specified bytes of data from device memory to system memory
 *        with an offset in device memory address.
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] dst The destination memory (system memory, a void* pointer)
 * @param [in] src The source memory (device memory descriptor)
 * @param [in] size The size of data to copy (in bytes)
 * @param [in] offset The offset of the device memory address
 *
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_memcpy_d2s_partial_offset(bm_handle_t handle, void *dst,
                                                     bm_device_mem_t src, unsigned int size,
                                                     unsigned int offset);

/**
 * @name sg_memcpy_d2s_partial_offset
 * @brief To copy specified bytes of data from device memory to system memory
 *        with an offset in device memory address.
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] dst The destination memory (system memory, a void* pointer)
 * @param [in] src The source memory (device memory descriptor)
 * @param [in] size The size of data to copy (in bytes)
 * @param [in] offset The offset of the device memory address
 *
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t sg_memcpy_d2s_partial_offset(bm_handle_t handle, void *dst,
                                                     sg_device_mem_t src, unsigned long long size,
                                                     unsigned long long offset);

/**
 * @name bm_memcpy_d2s_partial
 * @brief To copy specified bytes of data from device memory to system memory
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] dst The destination memory (system memory, a void* pointer)
 * @param [in] src The source memory (device memory descriptor)
 * @param [in] size The size of data to copy (in bytes)
 *
 * @retval BM_SUCCESS Data transfer succeeds.
 *         Other code Data transfer fails.
 */
DECL_EXPORT bm_status_t bm_memcpy_d2s_partial(bm_handle_t handle, void *dst,
                                              bm_device_mem_t src, unsigned int size);

/**
 * @name sg_memcpy_d2s_partial
 * @brief To copy specified bytes of data from device memory to system memory
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] dst The destination memory (system memory, a void* pointer)
 * @param [in] src The source memory (device memory descriptor)
 * @param [in] size The size of data to copy (in bytes)
 *
 * @retval BM_SUCCESS Data transfer succeeds.
 *         Other code Data transfer fails.
 */
DECL_EXPORT bm_status_t sg_memcpy_d2s_partial(bm_handle_t handle, void *dst,
                                              sg_device_mem_t src, unsigned long long size);
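
A round-trip sketch combining the copy functions above with the byte allocator (host to device, back to host, then verify):

```c
#include <string.h>
#include "bmlib_runtime.h"

/* Round-trip sketch: host -> device -> host, then verify the data. */
int roundtrip(bm_handle_t handle) {
    int src[256], dst[256] = {0};
    for (int i = 0; i < 256; ++i) src[i] = i;

    bm_device_mem_t dev;
    if (bm_malloc_device_byte(handle, &dev, sizeof(src)) != BM_SUCCESS)
        return -1;

    int ok = bm_memcpy_s2d(handle, dev, src) == BM_SUCCESS &&
             bm_memcpy_d2s(handle, dst, dev) == BM_SUCCESS &&
             memcmp(src, dst, sizeof(src)) == 0;

    bm_free_device(handle, dev);
    return ok ? 0 : -1;
}
```
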

/**
 * @name bm_memcpy_d2d
 * @brief To copy specified dwords of data from one piece of device memory
 *        to another piece of device memory within one device. Both source
 *        and destination offsets can be specified.
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] dst The destination device memory
 * @param [in] dst_offset The offset of destination device memory address
 * @param [in] src The source device memory
 * @param [in] src_offset The offset of source device memory address
 * @param [in] len Length of data to copy (in DWORDs, 4 bytes each)
 *
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_memcpy_d2d(bm_handle_t handle, bm_device_mem_t dst,
                                      int dst_offset, bm_device_mem_t src, int src_offset,
                                      int len);

/**
 * @name bm_memcpy_d2d_with_core
 * @brief To copy specified dwords of data from one piece of device memory
 *        to another piece of device memory within one device. Both source
 *        and destination offsets can be specified.
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] dst The destination device memory
 * @param [in] dst_offset The offset of destination device memory address
 * @param [in] src The source device memory
 * @param [in] src_offset The offset of source device memory address
 * @param [in] len Length of data to copy (in DWORDs, 4 bytes each)
 * @param [in] core_id The core id to copy
 *
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_memcpy_d2d_with_core(bm_handle_t handle, bm_device_mem_t dst,
                                                int dst_offset, bm_device_mem_t src, int src_offset,
                                                int len, int core_id);

/**
 * @name bm_memcpy_d2d_byte
 * @brief To copy specified bytes of data from one piece of device memory
 *        to another piece of device memory within one device. Both source
 *        and destination offsets can be specified.
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] dst The destination device memory
 * @param [in] dst_offset The offset of destination device memory address (in bytes)
 * @param [in] src The source device memory
 * @param [in] src_offset The offset of source device memory address (in bytes)
 * @param [in] size Size of data to copy (in bytes)
 *
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_memcpy_d2d_byte(bm_handle_t handle, bm_device_mem_t dst,
                                           size_t dst_offset, bm_device_mem_t src,
                                           size_t src_offset, size_t size);

/**
 * @name bm_memcpy_d2d_byte_with_core
 * @brief To copy specified bytes of data from one piece of device memory
 *        to another piece of device memory within one device. Both source
 *        and destination offsets can be specified.
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] dst The destination device memory
 * @param [in] dst_offset The offset of destination device memory address (in bytes)
 * @param [in] src The source device memory
 * @param [in] src_offset The offset of source device memory address (in bytes)
 * @param [in] size Size of data to copy (in bytes)
 * @param [in] core_id The core id to copy
 *
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_memcpy_d2d_byte_with_core(bm_handle_t handle, bm_device_mem_t dst,
                                                     size_t dst_offset, bm_device_mem_t src,
                                                     size_t src_offset, size_t size, int core_id);

/**
 * @name bm_memcpy_d2d_stride
 * @brief To copy specified data from one piece of device memory
 *        to another piece of device memory within one device. Both source
 *        and destination strides can be specified.
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] dst The destination device memory
 * @param [in] dst_stride The data stride of destination data
 * @param [in] src The source device memory
 * @param [in] src_stride The data stride of source data
 * @param [in] count Count of data to copy
 * @param [in] format_size Data format byte size, such as sizeof(uint8_t), sizeof(float), etc.
 *             format_size only supports 1/2/4.
 *
 * dst_stride MUST be 1, EXCEPT: dst_stride == 4 && src_stride == 1 && format_size == 1
 *
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_memcpy_d2d_stride(bm_handle_t handle,
                                             bm_device_mem_t dst,
                                             int dst_stride,
                                             bm_device_mem_t src,
                                             int src_stride,
                                             int count,
                                             int format_size);

/**
 * @name bm_memcpy_d2d_stride_with_core
 * @brief To copy specified data from one piece of device memory
 *        to another piece of device memory within one device. Both source
 *        and destination strides can be specified.
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] dst The destination device memory
 * @param [in] dst_stride The data stride of destination data
 * @param [in] src The source device memory
 * @param [in] src_stride The data stride of source data
 * @param [in] count Count of data to copy
 * @param [in] format_size Data format byte size, such as sizeof(uint8_t), sizeof(float), etc.
 *             format_size only supports 1/2/4.
 * @param [in] core_id The core id to copy.
 *
 * dst_stride MUST be 1, EXCEPT: dst_stride == 4 && src_stride == 1 && format_size == 1
 *
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_memcpy_d2d_stride_with_core(bm_handle_t handle,
                                                       bm_device_mem_t dst,
                                                       int dst_stride,
                                                       bm_device_mem_t src,
                                                       int src_stride,
                                                       int count,
                                                       int format_size,
                                                       int core_id);

/**
 * @name bm_memcpy_c2c
 * @brief To copy data from one chip to another chip.
 *        (Used in multi-chip card scenario)
 * @ingroup bmlib_runtime
 *
 * @param [in] src_handle The source device handle
 * @param [in] dst_handle The destination device handle
 * @param [in] src The source device memory descriptor
 * @param [in] dst The destination device memory descriptor
 * @param [in] force_dst_cdma Whether to use the CDMA engine of the destination device
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_memcpy_c2c(bm_handle_t src_handle, bm_handle_t dst_handle,
                                      bm_device_mem_t src, bm_device_mem_t dst,
                                      bool force_dst_cdma);
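
A small sketch of a pure device-side copy with `bm_memcpy_d2d_byte`, staying entirely in device memory:

```c
#include <stddef.h>
#include "bmlib_runtime.h"

/* Device-to-device sketch: duplicate the first `n_bytes` of `src` into
 * `dst` without staging through host memory. */
bm_status_t copy_on_device(bm_handle_t handle,
                           bm_device_mem_t dst, bm_device_mem_t src,
                           size_t n_bytes) {
    /* dst_offset = 0, src_offset = 0, both in bytes */
    return bm_memcpy_d2d_byte(handle, dst, 0, src, 0, n_bytes);
}
```
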

/**
 * @name bm_memset_device
 * @brief To fill the specified device memory with the given value
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] value The value used to fill (int type)
 * @param [in] mem The device memory which will be filled
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_memset_device(bm_handle_t handle, const int value,
                                         bm_device_mem_t mem);

/**
 * @name bm_memset_device_ext
 * @brief To fill the specified device memory with the given value and mode
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] value The pointer to the value used to fill
 * @param [in] mode The valid bytes of *value
 * @param [in] mem The device memory which will be filled
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_memset_device_ext(bm_handle_t handle, void* value, int mode,
                                             bm_device_mem_t mem);
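
For example, zero-filling a buffer (a sketch; per the description above, the 32-bit `value` pattern is replicated across the descriptor):

```c
#include "bmlib_runtime.h"

/* Zero-fill a device buffer: the 32-bit `value` pattern is replicated
 * across the whole descriptor. */
bm_status_t zero_device_buffer(bm_handle_t handle, bm_device_mem_t mem) {
    return bm_memset_device(handle, 0, mem);
}
```
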

/**
 * @name bm_mem_convert_system_to_device_neuron
 * @brief To malloc a piece of device memory according to the shape of
 *        neuron (in DWORDs, 4 bytes each); copy the neuron from system memory
 *        to device memory if need_copy is true.
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] dev_mem The device memory descriptor
 * @param [in] sys_mem The system memory descriptor
 * @param [in] need_copy Whether copying from system to device is needed
 * @param [in] n, c, h, w Neuron shape size
 *
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_mem_convert_system_to_device_neuron(bm_handle_t handle,
                                                               struct bm_mem_desc *dev_mem,
                                                               struct bm_mem_desc sys_mem,
                                                               bool need_copy, int n, int c,
                                                               int h, int w);

/**
 * @name bm_mem_convert_system_to_device_neuron_byte
 * @brief To malloc a piece of device memory according to the shape of
 *        neuron (in bytes); copy the neuron from system memory to
 *        device memory if need_copy is true.
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] dev_mem The device memory descriptor
 * @param [in] sys_mem The system memory descriptor
 * @param [in] need_copy Whether copying from system to device is needed
 * @param [in] n, c, h, w Neuron shape size
 *
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_mem_convert_system_to_device_neuron_byte(
    bm_handle_t handle, struct bm_mem_desc *dev_mem, struct bm_mem_desc sys_mem,
    bool need_copy, int n, int c, int h, int w);

/**
 * @name bm_mem_convert_system_to_device_coeff
 * @brief To malloc a piece of device memory according to the size of
 *        the coefficient (in DWORDs, 4 bytes each); copy the coefficient from
 *        system memory to device memory if need_copy is true.
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] dev_mem The device memory descriptor
 * @param [in] sys_mem The system memory descriptor
 * @param [in] need_copy Whether copying from system to device is needed
 * @param [in] coeff_count Coefficient size
 *
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_mem_convert_system_to_device_coeff(bm_handle_t handle,
                                                              struct bm_mem_desc *dev_mem,
                                                              struct bm_mem_desc sys_mem,
                                                              bool need_copy,
                                                              int coeff_count);

/**
 * @name bm_mem_convert_system_to_device_coeff_byte
 * @brief To malloc a piece of device memory according to the size of
 *        the coefficient (in bytes); copy the coefficient from system
 *        memory to device memory if need_copy is true.
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] dev_mem The device memory descriptor
 * @param [in] sys_mem The system memory descriptor
 * @param [in] need_copy Whether copying from system to device is needed
 * @param [in] coeff_count Coefficient size
 *
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_mem_convert_system_to_device_coeff_byte(
    bm_handle_t handle, struct bm_mem_desc *dev_mem, struct bm_mem_desc sys_mem,
    bool need_copy, int coeff_count);

/*******************memory map functions *************************************/
/**
 * @name bm_mem_mmap_device_mem
 * @brief To map a piece of device memory to user space with cache enabled.
 *        (only valid in SoC mode; Not supported in PCIE mode).
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] dmem The device memory to map
 * @param [out] vmem The virtual address of the mapped device memory
 *
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_mem_mmap_device_mem(bm_handle_t handle, bm_device_mem_t *dmem,
                                               unsigned long long *vmem);

/**
 * @name sg_mem_mmap_device_mem
 * @brief To map a piece of device memory to user space with cache enabled.
 *        (only valid in SoC mode; Not supported in PCIE mode).
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] dmem The device memory to map
 * @param [out] vmem The virtual address of the mapped device memory
 *
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t sg_mem_mmap_device_mem(bm_handle_t handle, sg_device_mem_t *dmem,
                                               unsigned long long *vmem);

/**
 * @name bm_mem_mmap_device_mem_no_cache
 * @brief To map a piece of device memory to user space with cache disabled.
 *        (only valid in SoC mode; Not supported in PCIE mode).
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] dmem The device memory to map
 * @param [out] vmem The virtual address of the mapped device memory
 *
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_mem_mmap_device_mem_no_cache(bm_handle_t handle, bm_device_mem_t *dmem,
                                                        unsigned long long *vmem);

/**
 * @name sg_mem_mmap_device_mem_no_cache
 * @brief To map a piece of device memory to user space with cache disabled.
 *        (only valid in SoC mode; Not supported in PCIE mode).
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] dmem The device memory to map
 * @param [out] vmem The virtual address of the mapped device memory
 *
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t sg_mem_mmap_device_mem_no_cache(bm_handle_t handle, sg_device_mem_t *dmem,
                                                        unsigned long long *vmem);

/**
 * @name bm_mem_vir_to_phy
 * @brief To get the device memory address through the mapped virtual address.
 *        (only valid in SoC mode; Not supported in PCIE mode).
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] vmem The virtual address of the mapped device memory
 * @param [out] device_mem The device memory address
 *
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_mem_vir_to_phy(bm_handle_t handle, unsigned long long vmem,
                                          unsigned long long *device_mem);
/**
 * @name bm_mem_invalidate_device_mem
 * @brief To invalidate a piece of mapped device memory to maintain
 *        cache coherence
 *        (only valid in SoC mode; Not supported in PCIE mode).
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] dmem The device memory to invalidate
 *
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_mem_invalidate_device_mem(bm_handle_t handle,
                                                     bm_device_mem_t *dmem);

/**
 * @name sg_mem_invalidate_device_mem
 * @brief To invalidate a piece of mapped device memory to maintain
 *        cache coherence
 *        (only valid in SoC mode; Not supported in PCIE mode).
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] dmem The device memory to invalidate
 *
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t sg_mem_invalidate_device_mem(bm_handle_t handle,
                                                     sg_device_mem_t *dmem);

/**
 * @name bm_mem_invalidate_partial_device_mem
 * @brief To invalidate part of mapped device memory to maintain
 *        cache coherence
 *        (only valid in SoC mode; Not supported in PCIE mode).
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] dmem The device memory to invalidate
 * @param [in] offset The offset of device memory address
 * @param [in] len The length of memory to invalidate in bytes
 *
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_mem_invalidate_partial_device_mem(bm_handle_t handle,
                                                             bm_device_mem_t *dmem,
                                                             unsigned int offset,
                                                             unsigned int len);

/**
 * @name sg_mem_invalidate_partial_device_mem
 * @brief To invalidate part of mapped device memory to maintain
 *        cache coherence
 *        (only valid in SoC mode; Not supported in PCIE mode).
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] dmem The device memory to invalidate
 * @param [in] offset The offset of device memory address
 * @param [in] len The length of memory to invalidate in bytes
 *
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t sg_mem_invalidate_partial_device_mem(bm_handle_t handle,
                                                             sg_device_mem_t *dmem,
                                                             unsigned long long offset,
                                                             unsigned long long len);

/**
 * @name bm_mem_flush_device_mem
 * @brief To flush a piece of mapped device memory to maintain
 *        cache coherence
 *        (only valid in SoC mode; Not supported in PCIE mode).
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] dmem The device memory to flush
 *
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_mem_flush_device_mem(bm_handle_t handle, bm_device_mem_t *dmem);

/**
 * @name sg_mem_flush_device_mem
 * @brief To flush a piece of mapped device memory to maintain
 *        cache coherence
 *        (only valid in SoC mode; Not supported in PCIE mode).
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] dmem The device memory to flush
 *
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t sg_mem_flush_device_mem(bm_handle_t handle, sg_device_mem_t *dmem);

/**
 * @name bm_mem_flush_partial_device_mem
 * @brief To flush part of mapped device memory to maintain
 *        cache coherence
 *        (only valid in SoC mode; Not supported in PCIE mode).
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] dmem The device memory to flush
 * @param [in] offset The offset of device memory address
 * @param [in] len The length of memory to flush in bytes
 *
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_mem_flush_partial_device_mem(bm_handle_t handle,
                                                        bm_device_mem_t *dmem,
                                                        unsigned int offset,
                                                        unsigned int len);

/**
 * @name sg_mem_flush_partial_device_mem
 * @brief To flush part of mapped device memory to maintain
 *        cache coherence
 *        (only valid in SoC mode; Not supported in PCIE mode).
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] dmem The device memory to flush
 * @param [in] offset The offset of device memory address
 * @param [in] len The length of memory to flush in bytes
 *
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t sg_mem_flush_partial_device_mem(bm_handle_t handle,
                                                        sg_device_mem_t *dmem,
                                                        unsigned long long offset,
                                                        unsigned long long len);

/**
 * @name bm_mem_unmap_device_mem
 * @brief To unmap a piece of mapped device memory
 *        (only valid in SoC mode; Not supported in PCIE mode).
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] vmem The virtual address of the mapped device memory
 * @param [in] size The size of the memory to unmap
 *
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_mem_unmap_device_mem(bm_handle_t handle, void *vmem, int size);

/**
 * @name sg_mem_unmap_device_mem
 * @brief To unmap a piece of mapped device memory
 *        (only valid in SoC mode; Not supported in PCIE mode).
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] vmem The virtual address of the mapped device memory
 * @param [in] size The size of the memory to unmap
 *
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t sg_mem_unmap_device_mem(bm_handle_t handle, void *vmem, unsigned long long size);
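
A SoC-mode sketch tying the map/flush/unmap calls together; the `uintptr_t` cast of the returned address is an assumption about how the mapping is meant to be consumed:

```c
#include <stdint.h>
#include <string.h>
#include "bmlib_runtime.h"

/* SoC mode only: map a device buffer, write through the cached mapping,
 * flush so the device sees the data, then unmap. */
bm_status_t fill_via_mmap(bm_handle_t handle, bm_device_mem_t *dmem,
                          const void *data, unsigned int size) {
    unsigned long long vmem = 0;
    bm_status_t ret = bm_mem_mmap_device_mem(handle, dmem, &vmem);
    if (ret != BM_SUCCESS)
        return ret;

    memcpy((void *)(uintptr_t)vmem, data, size);   /* write via CPU cache */
    ret = bm_mem_flush_device_mem(handle, dmem);   /* write back to device */

    bm_mem_unmap_device_mem(handle, (void *)(uintptr_t)vmem, (int)size);
    return ret;
}
```
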

/*******************api(kernel) functions *************************************/
/**
 * @name bm_flush
 * @brief To synchronize APIs of the current thread. The thread will block
 *        until all the outstanding APIs of the current thread are finished.
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 */
DECL_EXPORT void bm_flush(bm_handle_t handle);

/**
 * @name bm_device_sync
 * @brief To synchronize APIs of the device. The thread will block
 *        until all the outstanding APIs of the device are finished.
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_device_sync(bm_handle_t handle);

/**
 * @name bm_handle_sync
 * @brief To synchronize APIs of the handle. The thread will block
 *        until all the outstanding APIs of the handle are finished.
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_handle_sync(bm_handle_t handle);

/**
 * @name bm_handle_sync_from_core
 * @brief To synchronize APIs of the handle. The thread will block
 *        until all the outstanding APIs of the handle are finished.
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] core_id The core id
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_handle_sync_from_core(bm_handle_t handle, int core_id);

/**
 * @name bm_thread_sync
 * @brief To synchronize APIs of the current thread. The thread will block
 *        until all the outstanding APIs of the current thread are finished.
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_thread_sync(bm_handle_t handle);

/**
 * @name bm_thread_sync_from_core
 * @brief To synchronize APIs of the current thread. The thread will block
 *        until all the outstanding APIs of the current thread are finished.
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] core_id The core id
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_thread_sync_from_core(bm_handle_t handle, int core_id);

/*******************trace and profile related functions ***********************/
typedef struct bm_profile {
#ifdef __linux__
  unsigned long cdma_in_time;
  unsigned long cdma_in_counter;
  unsigned long cdma_out_time;
  unsigned long cdma_out_counter;
  unsigned long tpu_process_time;
  unsigned long tpu1_process_time;
  unsigned long sent_api_counter;
  unsigned long completed_api_counter;
#else
  unsigned long long cdma_in_time;
  unsigned long long cdma_in_counter;
  unsigned long long cdma_out_time;
  unsigned long long cdma_out_counter;
  unsigned long long tpu_process_time;
  unsigned long long tpu1_process_time;
  unsigned long long sent_api_counter;
  unsigned long long completed_api_counter;
#endif
} bm_profile_t;
/**
 * @name bm_get_profile
 * @brief To get the profile data at the moment
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [out] profile The result profile data
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_get_profile(bm_handle_t handle, bm_profile_t *profile);
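
A sketch reading the API counters from a `bm_profile_t` snapshot:

```c
#include <stdio.h>
#include "bmlib_runtime.h"

/* Print the API counters from the current profile snapshot. */
void dump_profile(bm_handle_t handle) {
    bm_profile_t p;
    if (bm_get_profile(handle, &p) == BM_SUCCESS)
        printf("APIs sent: %llu, completed: %llu\n",
               (unsigned long long)p.sent_api_counter,
               (unsigned long long)p.completed_api_counter);
}
```
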

typedef struct bootloader_version {
  char *bl1_version;
  char *bl2_version;
  char *bl31_version;
  char *uboot_version;
} boot_loader_version;

/**
 * @name bm_get_boot_loader_version
 * @brief To get the boot loader version
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [out] version The result version data
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_get_boot_loader_version(bm_handle_t handle, boot_loader_version *version);

/**
 * @name bm_get_vpu_instant_usage
 * @brief To get the vpu usage
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [out] vpu_usage The result vpu usage
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_get_vpu_instant_usage(bm_handle_t handle, int *vpu_usage);

/**
 * @name bm_get_jpu_core_usage
 * @brief To get the jpu usage
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [out] jpu_usage The result jpu usage
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_get_jpu_core_usage(bm_handle_t handle, int *jpu_usage);

/**
 * @name bm_get_vpp_instant_usage
 * @brief To get the vpp usage
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [out] vpp_usage The result vpp usage
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_get_vpp_instant_usage(bm_handle_t handle, int *vpp_usage);
/**
 * @name bm_get_last_api_process_time_us
 * @brief This function is deprecated.
 */
#ifdef __linux__
DECL_EXPORT bm_status_t bm_get_last_api_process_time_us(bm_handle_t handle,
                                                        unsigned long *time_us);
#else
DECL_EXPORT bm_status_t bm_get_last_api_process_time_us(bm_handle_t handle,
                                                        unsigned long long *time_us);
#endif
/*******************tpu clock and module reset related functions **************/

/**
 * @name bm_set_clk_tpu_freq
 * @brief To set the clock frequency of TPU (only valid in PCIE mode).
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [in] freq The TPU target frequency
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_set_clk_tpu_freq(bm_handle_t handle, int freq);

/**
 * @name bm_get_clk_tpu_freq
 * @brief To get the clock frequency of TPU
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [out] freq The current TPU frequency
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_get_clk_tpu_freq(bm_handle_t handle, int *freq);
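
A sketch reading the current TPU clock; the unit of `freq` is not stated here, so it is printed raw:

```c
#include <stdio.h>
#include "bmlib_runtime.h"

/* Read the current TPU clock; setting it (bm_set_clk_tpu_freq) is
 * PCIE-mode only per the note above. */
void show_tpu_freq(bm_handle_t handle) {
    int freq = 0;
    if (bm_get_clk_tpu_freq(handle, &freq) == BM_SUCCESS)
        printf("TPU clock: %d\n", freq);
}
```
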

/*******************misc functions ********************************************/
struct bm_misc_info {
  int pcie_soc_mode;  /* 0---pcie; 1---soc */
  int ddr_ecc_enable; /* 0---disable; 1---enable */
  long long ddr0a_size;
  long long ddr0b_size;
  long long ddr1_size;
  long long ddr2_size;
  unsigned int chipid;
#define BM1682_CHIPID_BIT_MASK (0X1 << 0)
#define BM1684_CHIPID_BIT_MASK (0X1 << 1)
#define BM1686_CHIPID_BIT_MASK (0X1 << 2)
#ifdef __linux__
  unsigned long chipid_bit_mask;
#else
  unsigned long long chipid_bit_mask;
#endif
  unsigned int driver_version;
  int domain_bdf;
  int board_version; /* hardware board version: [23:16]-mcu sw version, [15:8]-board type, [7:0]-hw version */
  int a53_enable;
  int dyn_enable;
};

/**
 * @name bm_get_misc_info
 * @brief To get miscellaneous information of the device
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [out] pmisc_info The fetched misc info
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_get_misc_info(bm_handle_t handle, struct bm_misc_info *pmisc_info);

/**
 * @name bm_get_chipid
 * @brief To get the chipid of the device. (0x1682 / 0x1684 / 0x168?)
 * @ingroup bmlib_runtime
 *
 * @param [in] handle The device handle
 * @param [out] p_chipid The chip id of the device
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bm_get_chipid(bm_handle_t handle, unsigned int *p_chipid);
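
A sketch querying the chip id and the PCIE/SoC running mode:

```c
#include <stdio.h>
#include "bmlib_runtime.h"

/* Query the chip id and whether the device runs in PCIE or SoC mode. */
void show_device_info(bm_handle_t handle) {
    unsigned int chipid = 0;
    struct bm_misc_info info;

    if (bm_get_chipid(handle, &chipid) == BM_SUCCESS)
        printf("chipid: 0x%x\n", chipid);
    if (bm_get_misc_info(handle, &info) == BM_SUCCESS)
        printf("mode: %s\n", info.pcie_soc_mode ? "soc" : "pcie");
}
```
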

#define BMLIB_LOG_QUIET    -8
#define BMLIB_LOG_PANIC     0
#define BMLIB_LOG_FATAL     8
#define BMLIB_LOG_ERROR    16
#define BMLIB_LOG_WARNING  24
#define BMLIB_LOG_INFO     32
#define BMLIB_LOG_VERBOSE  40
#define BMLIB_LOG_DEBUG    48
#define BMLIB_LOG_TRACE    56

/**
 * @name bmlib_log_get_level
 * @brief To get the bmlib log level
 * @ingroup bmlib_log
 *
 * @param void
 * @retval The current bmlib log level
 */
DECL_EXPORT int bmlib_log_get_level(void);

/**
 * @name bmlib_log_set_level
 * @brief To set the bmlib log level
 * @ingroup bmlib_log
 *
 * @param [in] level The bmlib log level to set
 * @retval void
 */
DECL_EXPORT void bmlib_log_set_level(int level);

/**
 * @name bmlib_log_set_callback
 * @brief To set a callback to receive the bmlib log
 * @ingroup bmlib_log
 *
 * @param [in] callback The callback function to get the bmlib log
 * @retval void
 */
DECL_EXPORT void bmlib_log_set_callback(void (*callback)(const char*, int, const char*, va_list args));

/**
 * @name bm_set_debug_mode
 * @brief To set the debug mode for the TPU firmware log
 * @ingroup bmlib_log
 *
 * @param [in] handle The device handle
 * @param [in] mode The debug mode of the firmware log, 0/1 for disable/enable
 * @retval void
 */
DECL_EXPORT void bm_set_debug_mode(bm_handle_t handle, int mode);

/**
 * @name bmlib_api_dbg_callback
 * @brief To set a debug callback to receive the firmware log
 * @ingroup bmlib_log
 *
 * @param [in] callback The callback to get the firmware log
 * @retval void
 */
typedef void (*bmlib_api_dbg_callback)(int, int, int, const char*);
// api, result, duration, log; the third int is reserved for api duration in the future
DECL_EXPORT void bmlib_set_api_dbg_callback(bmlib_api_dbg_callback callback);
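
A sketch wiring up the log level and callback; the meaning of the callback's first two arguments (tag and level) is an assumption, since the header does not document them:

```c
#include <stdarg.h>
#include <stdio.h>
#include "bmlib_runtime.h"

/* Assumed argument order: tag/module, level, printf-style format + args. */
static void my_log_sink(const char *tag, int level, const char *fmt, va_list args) {
    fprintf(stderr, "[bmlib %s %d] ", tag, level);
    vfprintf(stderr, fmt, args);
}

void setup_bmlib_logging(void) {
    bmlib_log_set_level(BMLIB_LOG_WARNING);  /* keep WARNING and worse */
    bmlib_log_set_callback(my_log_sink);
}
```
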

/**
 * @name bmcpu_get_cpu_status
 * @brief Get the bmcpu status
 * @ingroup bmlib_log
 *
 * @param [in] handle The device handle
 * @retval BMCPU_RUNNING bmcpu is running.
 *         Other code Fails.
 */
DECL_EXPORT bm_cpu_status_t bmcpu_get_cpu_status(bm_handle_t handle);

/**
 * @name bmcpu_start_cpu
 * @brief Start the CPU in PCIE mode
 * @ingroup bmlib_log
 *
 * @param [in] handle The device handle
 * @param [in] boot_file Fip file
 * @param [in] core_file Itb file
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bmcpu_start_cpu(bm_handle_t handle, char *boot_file, char *core_file);

/**
 * @name bmcpu_open_process
 * @brief Open a process to do some work
 * @ingroup bmlib_log
 *
 * @param [in] handle The device handle
 * @param [in] flags Process flags
 * @param [in] timeout Timeout value in milliseconds, -1 means the default value of this device
 * @retval >= 0 Process handle
 *         < 0  Fails.
 */
DECL_EXPORT int bmcpu_open_process(bm_handle_t handle, unsigned int flags, int timeout);

/**
 * @name bmcpu_load_library
 * @brief Load a shared library (.so) into a specific process
 * @ingroup bmlib_log
 *
 * @param [in] handle The device handle
 * @param [in] process_handle Process handle
 * @param [in] library_file Library file path
 * @param [in] timeout Timeout value in milliseconds, -1 means the default value of this device
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bmcpu_load_library(bm_handle_t handle, int process_handle, char *library_file, int timeout);

/**
 * @name bmcpu_unload_library
 * @brief Unload a shared library (.so) from a specific process
 * @ingroup bmlib_log
 *
 * @param [in] handle The device handle
 * @param [in] process_handle Process handle
 * @param [in] library_file Library file path
 * @param [in] timeout Timeout value in milliseconds, -1 means the default value of this device
 * @retval BM_SUCCESS Succeeds.
 *         Other code Fails.
 */
DECL_EXPORT bm_status_t bmcpu_unload_library(bm_handle_t handle, int process_handle, char *library_file, int timeout);

/**
 * @name bmcpu_exec_function
 * @brief Execute a specific function in a specific process
 * @ingroup bmlib_log
 *
 * @param [in] handle The device handle
 * @param [in] process_handle Process handle
 * @param [in] function_name Function name
 * @param [in] function_param Function parameters
 * @param [in] param_size Parameters size in bytes
 * @param [in] timeout Timeout value in milliseconds, -1 means the default value of this device
 * @retval 0 Success.
 *         >0 Code fails from bmlib
 *         <0 Code fails from the function
 */
DECL_EXPORT int bmcpu_exec_function(bm_handle_t handle,
                                    int process_handle,
                                    char *function_name,
                                    void *function_param,
                                    unsigned int param_size,
                                    int timeout);
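
A hedged end-to-end sketch combining the process and library calls above with `bmcpu_exec_function`; the library path and entry name are illustrative only:

```c
#include "bmlib_runtime.h"

/* Open an on-chip CPU process, load a user library into it, and call an
 * entry point. "libmywork.so" and "my_entry" are illustrative names. */
int run_on_bmcpu(bm_handle_t handle) {
    char lib[] = "libmywork.so";
    char entry[] = "my_entry";

    int proc = bmcpu_open_process(handle, 0, -1);   /* -1: default timeout */
    if (proc < 0)
        return proc;

    if (bmcpu_load_library(handle, proc, lib, -1) != BM_SUCCESS)
        return -1;

    int arg = 42;   /* copied to the remote function as its parameter block */
    return bmcpu_exec_function(handle, proc, entry, &arg, sizeof(arg), -1);
}
```
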
|
1946 |
+
#define BMCPU_EXEC_OPT_NO_FLUSH_CACHE 1
|
1947 |
+
/**
|
1948 |
+
* @name bmcpu_exec_function_ext
|
1949 |
+
* @brief Execute specific function in specific process
|
1950 |
+
* @ingroup bmlib_log
|
1951 |
+
*
|
1952 |
+
* @param [in] handle The device handle
|
1953 |
+
* @param [in] process_handle Process handle
|
1954 |
+
* @param [in] function_name Function name
|
1955 |
+
* @param [in] function_param Function parameters
|
1956 |
+
* @param [in] param_size Parameters size in bytes
|
1957 |
+
* @param [in] opt exec options
|
1958 |
+
* @param [in] timeout Timeout value in millisecond, -1 means default value of this device
|
1959 |
+
* @retval 0 success.
|
1960 |
+
* >0 code fails from bmlib
|
1961 |
+
* <0 code fails from function
|
1962 |
+
*/
|
1963 |
+
DECL_EXPORT int bmcpu_exec_function_ext(bm_handle_t handle,
|
1964 |
+
int process_handle,
|
1965 |
+
char *function_name,
|
1966 |
+
void *function_param,
|
1967 |
+
unsigned int param_size,
|
1968 |
+
unsigned int opt,
|
1969 |
+
int timeout);
|
1970 |
+
|
1971 |
+
/**
|
1972 |
+
* @name bmcpu_exec_function_async
|
1973 |
+
* @brief Execute specific function in specific process asynchronous
|
1974 |
+
* user should use bm_query_exec_function_result to query result
|
1975 |
+
* @ingroup bmlib_log
|
1976 |
+
*
|
1977 |
+
* @param [in] handle The device handle
|
1978 |
+
* @param [in] process_handle Process handle
|
1979 |
+
* @param [in] function_name Function name
|
1980 |
+
* @param [in] function_param Function param
|
1981 |
+
* @param [in] param_size Param size in bytes
|
1982 |
+
* @retval BM_SUCCESS Succeeds.
|
1983 |
+
* Other code Fails.
|
1984 |
+
*/
|
1985 |
+
DECL_EXPORT bm_status_t bmcpu_exec_function_async(bm_handle_t handle,
|
1986 |
+
int process_handle,
|
1987 |
+
char *function_name,
|
1988 |
+
void *function_param,
|
1989 |
+
unsigned int param_size,
|
1990 |
+
unsigned long long *api_handle);
|
1991 |
+
|
1992 |
+
/**
|
1993 |
+
* @name bmcpu_exec_function_async_ext
|
1994 |
+
* @brief Execute specific function in specific process asynchronous
|
1995 |
+
* user should use bm_query_exec_function_result to query result
|
1996 |
+
* @ingroup bmlib_log
|
1997 |
+
*
|
1998 |
+
* @param [in] handle The device handle
|
1999 |
+
* @param [in] process_handle Process handle
|
2000 |
+
* @param [in] function_name Function name
|
2001 |
+
* @param [in] function_param Function param
|
2002 |
+
* @param [in] param_size Param size in bytes
|
2003 |
+
* @param [in] opt exec options
|
2004 |
+
* @retval BM_SUCCESS Succeeds.
|
2005 |
+
* Other code Fails.
|
2006 |
+
*/
|
2007 |
+
DECL_EXPORT bm_status_t bmcpu_exec_function_async_ext(bm_handle_t handle,
|
2008 |
+
int process_handle,
|
2009 |
+
char *function_name,
|
2010 |
+
void *function_param,
|
2011 |
+
unsigned int param_size,
|
2012 |
+
unsigned int opt,
|
2013 |
+
unsigned long long *api_handle);
|
2014 |
+
|
2015 |
+
/**
|
2016 |
+
* @name bmcpu_query_exec_function_result
|
2017 |
+
* @brief Query result from function called by bm_exec_function
|
2018 |
+
* @ingroup bmlib_log
|
2019 |
+
*
|
2020 |
+
* @param [in] handle The device handle
|
2021 |
+
* @param [in] api_handle Api handle return by bm_exec_function_async
|
2022 |
+
* @param [in] timeout Timeout value in millisecond, -1 means default value of this device
|
2023 |
+
* @retval 0 success.
|
2024 |
+
* >0 code fails from bmlib
|
2025 |
+
* <0 code fails from function
|
2026 |
+
*/
|
2027 |
+
DECL_EXPORT int bmcpu_query_exec_function_result(bm_handle_t handle, unsigned long long api_handle, int timeout);
|
2028 |
+
|
2029 |
+
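A minimal sketch of the asynchronous pattern above, assuming a device handle and process handle obtained earlier; the function name "my_func" and the parameter layout are hypothetical and must match the device-side implementation:

unsigned long long api_handle = 0;
int param = 42;  /* hypothetical parameter block */
if (bmcpu_exec_function_async(handle, process_handle, (char *)"my_func",
                              &param, sizeof(param), &api_handle) == BM_SUCCESS) {
    /* ... other host-side work runs while the device executes ... */
    int ret = bmcpu_query_exec_function_result(handle, api_handle, -1);
    /* ret == 0: success; ret > 0: bmlib error; ret < 0: error code from the device function */
}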
/**
 * @name    bmcpu_map_phys_addr
 * @brief   Map a physical address in a specific process
 * @ingroup bmlib_log
 *
 * @param [in] handle          The device handle
 * @param [in] process_handle  Process handle
 * @param [in] phys_addr       Physical address
 * @param [in] size            Map size in bytes
 * @param [in] timeout         Timeout value in millisecond, -1 means default value of this device
 * @retval non-NULL  virtual address
 *         NULL      fails
 */
DECL_EXPORT void *bmcpu_map_phys_addr(bm_handle_t handle, int process_handle, void *phys_addr, unsigned int size, int timeout);

/**
 * @name    bmcpu_unmap_phys_addr
 * @brief   Unmap a physical address in a specific process
 * @ingroup bmlib_log
 *
 * @param [in] handle          The device handle
 * @param [in] process_handle  Process handle
 * @param [in] phys_addr       Physical address
 * @param [in] timeout         Timeout value in millisecond, -1 means default value of this device
 * @retval 0   success
 *         <0  fail
 */
DECL_EXPORT bm_status_t bmcpu_unmap_phys_addr(bm_handle_t handle, int process_handle, void *phys_addr, int timeout);

/**
 * @name    bmcpu_close_process
 * @brief   Close a process
 * @ingroup bmlib_log
 *
 * @param [in] handle          The device handle
 * @param [in] process_handle  Process handle
 * @param [in] timeout         Timeout value in millisecond, -1 means default value of this device
 * @retval BM_SUCCESS  Succeeds.
 *         Other code  Fails.
 */
DECL_EXPORT bm_status_t bmcpu_close_process(bm_handle_t handle, int process_handle, int timeout);
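Taken together, the process APIs above form a simple lifecycle: open a process, load a library into it, execute a function from that library, and close the process. A minimal sketch, where the library path and entry name are hypothetical placeholders:

int process_handle = bmcpu_open_process(handle, 0, -1);
if (process_handle >= 0) {
    bmcpu_load_library(handle, process_handle, (char *)"/data/libmykernel.so", -1);  /* hypothetical path */
    int arg = 0;
    int ret = bmcpu_exec_function(handle, process_handle, (char *)"my_entry", &arg, sizeof(arg), -1);
    (void)ret;  /* 0 on success, >0 bmlib error, <0 error from the function */
    bmcpu_close_process(handle, process_handle, -1);
}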
/**
 * @name    bmcpu_reset_cpu
 * @brief   Reset the cpu in pcie mode
 * @ingroup bmlib_log
 *
 * @param [in] handle  The device handle
 * @retval BM_SUCCESS  Succeeds.
 *         Other code  Fails.
 */
DECL_EXPORT bm_status_t bmcpu_reset_cpu(bm_handle_t handle);

/**
 * @name    bm_enable_perf_monitor
 * @brief   Enable the perf monitor to get gdma and tpu performance data
 * @ingroup bmlib_perf
 *
 * @param [in] handle        The device handle
 * @param [in] perf_monitor  The perf monitor to be enabled
 * @retval BM_SUCCESS  Succeeds.
 *         Other code  Fails.
 */
DECL_EXPORT bm_status_t bm_enable_perf_monitor(bm_handle_t handle, bm_perf_monitor_t *perf_monitor);

/**
 * @name    bm_disable_perf_monitor
 * @brief   Disable the perf monitor to get gdma and tpu performance data
 * @ingroup bmlib_perf
 *
 * @param [in] handle        The device handle
 * @param [in] perf_monitor  The perf monitor to be disabled
 * @retval BM_SUCCESS  Succeeds.
 *         Other code  Fails.
 */
DECL_EXPORT bm_status_t bm_disable_perf_monitor(bm_handle_t handle, bm_perf_monitor_t *perf_monitor);

/**
 * @name    bmcpu_set_log
 * @brief   Set cpu log options
 * @ingroup bmlib_log
 *
 * @param [in] handle          The device handle
 * @param [in] log_level       0: DEBUG  1: INFO  2: WARN  3: ERROR  4: FATAL
 * @param [in] log_to_console  1: YES  0: NO
 * @param [in] timeout         Timeout value in millisecond, -1 means default value of this device
 * @retval BM_SUCCESS  Succeeds.
 *         Other code  Fails.
 */
DECL_EXPORT bm_status_t bmcpu_set_log(bm_handle_t handle, unsigned int log_level, unsigned int log_to_console, int timeout);

/**
 * @name    bmcpu_get_log
 * @brief   Get the cpu log file
 * @ingroup bmlib_log
 *
 * @param [in] handle          The device handle
 * @param [in] process_handle  Process handle
 * @param [in] log_file        Path where the log is saved as a file
 * @param [in] timeout         Timeout value in millisecond, -1 means default value of this device
 * @retval BM_SUCCESS  Succeeds.
 *         Other code  Fails.
 */
DECL_EXPORT bm_status_t bmcpu_get_log(bm_handle_t handle, int process_handle, char *log_file, int timeout);

/**
 * @name    bmcpu_sync_time
 * @brief   Sync the device cpu time with the host
 * @ingroup bmlib_log
 *
 * @param [in] handle  The device handle
 * @retval BM_SUCCESS  Succeeds.
 *         Other code  Fails.
 */
DECL_EXPORT bm_status_t bmcpu_sync_time(bm_handle_t handle);

/******************* trace and profile related functions **********************/
struct bm_heap_stat {
  unsigned int mem_total;
  unsigned int mem_avail;
  unsigned int mem_used;
};

typedef struct bm_heap_stat_byte {
  unsigned int heap_id;
  unsigned long long mem_total;
  unsigned long long mem_avail;
  unsigned long long mem_used;
  unsigned long long mem_start_addr;
} bm_heap_stat_byte_t;

typedef struct bm_dev_stat {
  int mem_total;
  int mem_used;
  int tpu_util;
  int heap_num;
  struct bm_heap_stat heap_stat[4];
} bm_dev_stat_t;
/**
 * @name    bm_get_stat
 * @brief   To get the stat data at the moment
 * @ingroup bmlib_runtime
 *
 * @param [in]  handle  The device handle
 * @param [out] stat    The result stat data
 * @retval BM_SUCCESS  Succeeds.
 *         Other code  Fails.
 */
DECL_EXPORT bm_status_t bm_get_stat(bm_handle_t handle, bm_dev_stat_t *stat);
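A short sketch of polling device statistics with bm_get_stat and the structs above (assumes <stdio.h>; the memory units are whatever the driver reports, which is an assumption here):

bm_dev_stat_t stat;
if (bm_get_stat(handle, &stat) == BM_SUCCESS) {
    printf("mem used/total: %d/%d, tpu util: %d%%\n", stat.mem_used, stat.mem_total, stat.tpu_util);
    for (int i = 0; i < stat.heap_num && i < 4; ++i)
        printf("heap %d: total=%u avail=%u used=%u\n", i,
               stat.heap_stat[i].mem_total, stat.heap_stat[i].mem_avail, stat.heap_stat[i].mem_used);
}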
/**
 * @name    bm_get_gmem_heap_id
 * @brief   To get the heap id of allocated global memory
 * @ingroup bmlib_runtime
 *
 * @param [in]  handle  The device handle
 * @param [in]  pmem    The allocated global memory
 * @param [out] heapid  The resulting heap id
 * @retval BM_SUCCESS  Succeeds.
 *         Other code  Fails.
 */
DECL_EXPORT bm_status_t bm_get_gmem_heap_id(bm_handle_t handle, bm_device_mem_t *pmem, unsigned int *heapid);

/**
 * @name    sg_get_gmem_heap_id
 * @brief   To get the heap id of allocated global memory
 * @ingroup bmlib_runtime
 *
 * @param [in]  handle  The device handle
 * @param [in]  pmem    The allocated global memory
 * @param [out] heapid  The resulting heap id
 * @retval BM_SUCCESS  Succeeds.
 *         Other code  Fails.
 */
DECL_EXPORT bm_status_t sg_get_gmem_heap_id(bm_handle_t handle, sg_device_mem_t *pmem, unsigned int *heapid);

/**
 * @name    bm_get_gmem_total_heap_num
 * @brief   To get the total heap num of global memory
 * @ingroup bmlib_runtime
 *
 * @param [in]  handle    The device handle
 * @param [out] heap_num  The resulting total heap num
 * @retval BM_SUCCESS  Succeeds.
 *         Other code  Fails.
 */
DECL_EXPORT bm_status_t bm_get_gmem_total_heap_num(bm_handle_t handle, unsigned int *heap_num);

/**
 * @name    bm_get_gmem_heap_stat_byte_by_id
 * @brief   To get the heap stat by heap id
 * @ingroup bmlib_runtime
 *
 * @param [in]  handle      The device handle
 * @param [in]  heap_id     The heap index to get the heap status for
 * @param [out] pheap_byte  The resulting heap status
 * @retval BM_SUCCESS  Succeeds.
 *         Other code  Fails.
 */
DECL_EXPORT bm_status_t bm_get_gmem_heap_stat_byte_by_id(bm_handle_t handle, bm_heap_stat_byte_t *pheap_byte, unsigned int heap_id);
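The two heap queries above combine naturally into an enumeration loop, for example (assumes <stdio.h>):

unsigned int heap_num = 0;
if (bm_get_gmem_total_heap_num(handle, &heap_num) == BM_SUCCESS) {
    for (unsigned int i = 0; i < heap_num; ++i) {
        bm_heap_stat_byte_t hs;
        if (bm_get_gmem_heap_stat_byte_by_id(handle, &hs, i) == BM_SUCCESS)
            printf("heap %u: total=%llu avail=%llu used=%llu start=0x%llx\n",
                   hs.heap_id, hs.mem_total, hs.mem_avail, hs.mem_used, hs.mem_start_addr);
    }
}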
DECL_EXPORT bm_status_t bm_load_firmware(
    bm_handle_t  handle,
    const char  *firmware_tcm,
    const char  *firmware_ddr);

#define bmkernel_load_firmware okkernel_load_firmware
DECL_EXPORT bm_status_t okkernel_load_firmware(
    bm_handle_t  handle,
    const char  *firmware_tcm,
    const char  *firmware_ddr);

DECL_EXPORT bm_status_t okkernel_launch_async(
    bm_handle_t  handle,
    const char  *func_name,
    const void  *args,
    unsigned int size);

DECL_EXPORT bm_status_t okkernel_launch_sync(
    bm_handle_t  handle,
    const char  *func_name,
    const void  *args,
    unsigned int size);

DECL_EXPORT bm_status_t tpu_kernel_launch_sync(
    bm_handle_t  handle,
    const char  *func_name,
    const void  *args,
    unsigned int size);

DECL_EXPORT bm_status_t okkernel_sync(bm_handle_t handle);
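A hedged sketch of the kernel-launch flow declared above: load the firmware once, then launch a named device function synchronously. The firmware file names, the kernel name "my_kernel", and the argument layout are all hypothetical; the argument struct must match what the device-side kernel expects:

if (okkernel_load_firmware(handle, "firmware_tcm.bin", "firmware_ddr.bin") == BM_SUCCESS) {  /* hypothetical files */
    struct { unsigned long long input_addr, output_addr; int len; } args = {0, 0, 0};  /* hypothetical layout */
    tpu_kernel_launch_sync(handle, "my_kernel", &args, sizeof(args));
}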
/**
 * @name    bmkernel_launch
 * @brief   Send an api to the device and launch a function
 * @ingroup bmlib_runtime
 *
 * @param [in] handle  The device handle
 * @param [in] args    Api command struct pointer
 * @param [in] size    Api command length in bytes
 * @retval BM_SUCCESS  Succeeds.
 *         Other code  Fails.
 */
DECL_EXPORT bm_status_t bmkernel_launch(bm_handle_t handle, const void *args,
                                        unsigned int size);

/**
 * @name    bmkernel_load_lookup_table
 * @brief   Load a lookup table into l2-sram
 * @ingroup bmlib_runtime
 *
 * @param [in] handle  The device handle
 * @param [in] table   The table to be loaded into l2-sram
 * @param [in] size    Table size in bytes
 * @retval BM_SUCCESS  Succeeds.
 *         Other code  Fails.
 */
DECL_EXPORT bm_status_t bmkernel_load_lookup_table(bm_handle_t handle, const void* table, unsigned int size);

/******************* device management api functions ********************************************/
/**
 * @name    bm_get_tpu_current
 * @brief   Get the tpu current
 * @ingroup bmlib_runtime
 *
 * @param [in]  handle  The device handle
 * @param [out] tpuc    The pointer for the tpu current (mA)
 * @retval BM_SUCCESS  Succeeds.
 *         Other code  Fails.
 */
DECL_EXPORT bm_status_t bm_get_tpu_current(bm_handle_t handle, unsigned int *tpuc);

/**
 * @name    bm_get_board_max_power
 * @brief   Get the max power supported by the board
 * @ingroup bmlib_runtime
 *
 * @param [in]  handle  The device handle
 * @param [out] maxp    The pointer for maxp
 * @retval BM_SUCCESS  Succeeds.
 *         Other code  Fails.
 */
DECL_EXPORT bm_status_t bm_get_board_max_power(bm_handle_t handle, unsigned int *maxp);

/**
 * @name    bm_get_board_power
 * @brief   Get the board power
 * @ingroup bmlib_runtime
 *
 * @param [in]  handle  The device handle
 * @param [out] boardp  The pointer for boardp
 * @retval BM_SUCCESS  Succeeds.
 *         Other code  Fails.
 */
DECL_EXPORT bm_status_t bm_get_board_power(bm_handle_t handle, unsigned int *boardp);

/**
 * @name    bm_get_fan_speed
 * @brief   Get the board fan speed
 * @ingroup bmlib_runtime
 *
 * @param [in]  handle  The device handle
 * @param [out] fan     The pointer for the fan speed
 * @retval BM_SUCCESS  Succeeds.
 *         Other code  Fails.
 */
DECL_EXPORT bm_status_t bm_get_fan_speed(bm_handle_t handle, unsigned int *fan);

/**
 * @name    bm_get_ecc_correct_num
 * @brief   Get ecc_correct_num
 * @ingroup device management api
 *
 * @param [in]  handle           The device handle
 * @param [out] ecc_correct_num
 * @retval BM_SUCCESS  Succeeds.
 *         Other code  Fails.
 */
#ifdef __linux__
DECL_EXPORT bm_status_t bm_get_ecc_correct_num(bm_handle_t handle, unsigned long *ecc_correct_num);
#else
DECL_EXPORT bm_status_t bm_get_ecc_correct_num(bm_handle_t handle, unsigned long long *ecc_correct_num);
#endif
/**
 * @name    bm_get_12v_atx
 * @brief   Get atx_12v
 * @ingroup device management api
 *
 * @param [in]  handle   The device handle
 * @param [out] atx_12v
 * @retval BM_SUCCESS  Succeeds.
 *         Other code  Fails.
 */
DECL_EXPORT bm_status_t bm_get_12v_atx(bm_handle_t handle, int *atx_12v);

/**
 * @name    bm_get_product_sn
 * @brief   Get the SE5 sn
 * @ingroup device management api
 *
 * @param [out] product_sn
 * @retval BM_SUCCESS  Succeeds.
 *         Other code  Fails.
 */
DECL_EXPORT bm_status_t bm_get_product_sn(char *product_sn);

/**
 * @name    bm_get_sn
 * @brief   Get the sn
 * @ingroup device management api
 *
 * @param [in]  handle  The device handle
 * @param [out] sn
 * @retval BM_SUCCESS  Succeeds.
 *         Other code  Fails.
 */
DECL_EXPORT bm_status_t bm_get_sn(bm_handle_t handle, char *sn);

/**
 * @name    bm_get_status
 * @brief   Get the chip status
 * @ingroup device management api
 *
 * @param [in]  handle  The device handle
 * @param [out] status  The board error status, each bit represents an error state:
 *                      status == 0x0, the board is normal; status > 0, the board is abnormal;
 *                      bit0 == 1, the tpu is hung
 *                      bit1 == 1, the pcie link is abnormal
 *                      bit2 == 1, the board temperature is too high
 * @retval BM_SUCCESS  Succeeds.
 *         Other code  Fails.
 */
DECL_EXPORT bm_status_t bm_get_status(bm_handle_t handle, int *status);

/**
 * @name    bm_get_tpu_maxclk
 * @brief   Get tpu_maxclk
 * @ingroup device management api
 *
 * @param [in]  handle      The device handle
 * @param [out] tpu_maxclk
 * @retval BM_SUCCESS  Succeeds.
 *         Other code  Fails.
 */
DECL_EXPORT bm_status_t bm_get_tpu_maxclk(bm_handle_t handle, unsigned int *tpu_maxclk);

/**
 * @name    bm_get_tpu_minclk
 * @brief   Get tpu_minclk
 * @ingroup device management api
 *
 * @param [in]  handle      The device handle
 * @param [out] tpu_minclk
 * @retval BM_SUCCESS  Succeeds.
 *         Other code  Fails.
 */
DECL_EXPORT bm_status_t bm_get_tpu_minclk(bm_handle_t handle, unsigned int *tpu_minclk);

/**
 * @name    bm_get_driver_version
 * @brief   Get the driver version
 * @ingroup device management api
 *
 * @param [in]  handle          The device handle
 * @param [out] driver_version
 * @retval BM_SUCCESS  Succeeds.
 *         Other code  Fails.
 */
DECL_EXPORT bm_status_t bm_get_driver_version(bm_handle_t handle, int *driver_version);

/**
 * @name    bm_get_board_name
 * @brief   Get the device board name
 * @ingroup device management api
 *
 * @param [in]  handle  The device handle
 * @param [out] name    The board name
 * @retval BM_SUCCESS  Succeeds.
 *         Other code  Fails.
 */
DECL_EXPORT bm_status_t bm_get_board_name(bm_handle_t handle, char *name);

/**
 * @name    bm_get_board_temp
 * @brief   Get the board temperature
 * @ingroup device management api
 *
 * @param [in]  handle      The device handle
 * @param [out] board_temp
 * @retval BM_SUCCESS  Succeeds.
 *         Other code  Fails.
 */
DECL_EXPORT bm_status_t bm_get_board_temp(bm_handle_t handle, unsigned int *board_temp);

/**
 * @name    bm_get_chip_temp
 * @brief   Get the chip temperature
 * @ingroup device management api
 *
 * @param [in]  handle     The device handle
 * @param [out] chip_temp
 * @retval BM_SUCCESS  Succeeds.
 *         Other code  Fails.
 */
DECL_EXPORT bm_status_t bm_get_chip_temp(bm_handle_t handle, unsigned int *chip_temp);

/**
 * @name    bm_get_tpu_power
 * @brief   Get the TPU power
 * @ingroup device management api
 *
 * @param [in]  handle     The device handle
 * @param [out] tpu_power
 * @retval BM_SUCCESS  Succeeds.
 *         Other code  Fails.
 */
DECL_EXPORT bm_status_t bm_get_tpu_power(bm_handle_t handle, float *tpu_power);

/**
 * @name    bm_get_tpu_volt
 * @brief   Get the TPU voltage
 * @ingroup device management api
 *
 * @param [in]  handle    The device handle
 * @param [out] tpu_volt
 * @retval BM_SUCCESS  Succeeds.
 *         Other code  Fails.
 */
DECL_EXPORT bm_status_t bm_get_tpu_volt(bm_handle_t handle, unsigned int *tpu_volt);
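A telemetry sketch pulling several of the sensors declared above in one pass (the units in the comments are assumptions; the header itself only states mA for the tpu current):

unsigned int board_temp = 0, chip_temp = 0, tpu_volt = 0, boardp = 0;
float tpu_power = 0.f;
bm_get_board_temp(handle, &board_temp);  /* temperature, presumably degrees Celsius */
bm_get_chip_temp(handle, &chip_temp);
bm_get_board_power(handle, &boardp);
bm_get_tpu_power(handle, &tpu_power);
bm_get_tpu_volt(handle, &tpu_volt);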
/**
 * @name    bm_get_card_id
 * @brief   Get the card id
 * @ingroup device management api
 *
 * @param [in]  handle   The device handle
 * @param [out] card_id
 * @retval BM_SUCCESS  Succeeds.
 *         Other code  Fails.
 */
DECL_EXPORT bm_status_t bm_get_card_id(bm_handle_t handle, unsigned int *card_id);

/**
 * @name    bm_get_card_num
 * @brief   Get the number of cards
 * @ingroup device management api
 *
 * @param [out] card_num
 * @retval BM_SUCCESS  Succeeds.
 *         Other code  Fails.
 */
DECL_EXPORT bm_status_t bm_get_card_num(unsigned int *card_num);

/**
 * @name    bm_get_chip_num_from_card
 * @brief   Get the chip number and the start chip id of a card
 * @ingroup device management api
 *
 * @param [in]  card_id          The card id
 * @param [out] chip_num
 * @param [out] dev_start_index
 * @retval BM_SUCCESS  Succeeds.
 *         Other code  Fails.
 */
DECL_EXPORT bm_status_t bm_get_chip_num_from_card(unsigned int card_id, unsigned int *chip_num, unsigned int *dev_start_index);
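Since bm_get_card_num and bm_get_chip_num_from_card take no device handle, a host can map out the whole card/chip topology before opening any device:

unsigned int card_num = 0;
if (bm_get_card_num(&card_num) == BM_SUCCESS) {
    for (unsigned int c = 0; c < card_num; ++c) {
        unsigned int chip_num = 0, start = 0;
        if (bm_get_chip_num_from_card(c, &chip_num, &start) == BM_SUCCESS) {
            /* devices [start, start + chip_num) belong to card c */
        }
    }
}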
/**
 * @name    bm_get_dynfreq_status
 * @brief   Get the chip dynamic freq status
 * @ingroup device management api
 *
 * @param [in]  handle          The device handle
 * @param [out] dynfreq_status
 * @retval BM_SUCCESS  Succeeds.
 *         Other code  Fails.
 */
DECL_EXPORT bm_status_t bm_get_dynfreq_status(bm_handle_t handle, int *dynfreq_status);

/**
 * @name    bm_change_dynfreq_status
 * @brief   Change (enable/disable) the chip dynamic freq status
 * @ingroup device management api
 *
 * @param [in] handle      The device handle
 * @param [in] new_status
 * @retval BM_SUCCESS  Succeeds.
 *         Other code  Fails.
 */
DECL_EXPORT bm_status_t bm_change_dynfreq_status(bm_handle_t handle, int new_status);

/**
 * @name    bm_get_tpu_scalar_num
 * @brief   To get the core number of the TPU scalar
 * @ingroup bmlib_runtime
 *
 * @param [in]  handle    The device handle
 * @param [out] core_num  The core number of the TPU scalar
 * @retval BM_SUCCESS  Succeeds.
 *         Other code  Fails.
 */
DECL_EXPORT bm_status_t bm_get_tpu_scalar_num(bm_handle_t handle, unsigned int *core_num);

#define bm_get_tpu_core_num bm_get_tpu_scalar_num

#if defined(__cplusplus)
}
#endif

#endif /* BM_RUNTIME_H_ */
ChatGLM2/support/include/bmruntime_interface.h
ADDED
@@ -0,0 +1,404 @@
/*****************************************************************************
 *
 *    Copyright (c) 2016-2026 by Sophgo Technologies Inc. All rights reserved.
 *
 *    The material in this file is confidential and contains trade secrets
 *    of Sophgo Technologies Inc. This is proprietary information owned by
 *    Sophgo Technologies Inc. No part of this work may be disclosed,
 *    reproduced, copied, transmitted, or used in any way for any purpose,
 *    without the express written permission of Sophgo Technologies Inc.
 *
 *****************************************************************************/

/*****************************************************************************
 * BMRuntime Interface is mainly for inference.
 * We can also use it for device computation from BMLang programming.
 * Note: please use the interfaces from bmlib_runtime.h for device memory operations.
 ****************************************************************************/

#ifndef BMRUNTIME_INTERFACE_H_
#define BMRUNTIME_INTERFACE_H_

#include "bmdef.h"

#ifdef _WIN32
#define DECL_EXPORT _declspec(dllexport)
#define DECL_IMPORT _declspec(dllimport)
#else
#define DECL_EXPORT
#define DECL_IMPORT
#endif

#if defined(__cplusplus)
extern "C" {
#endif

/* --------------------------------------------------------------------------*/
/* interface for basic data types */

/* get data type byte size */
DECL_EXPORT size_t bmrt_data_type_size(bm_data_type_t dtype);

/*
dims array to bm_shape_t,
shape and dims should not be NULL, num_dims should not be larger than BM_MAX_DIMS_NUM */
DECL_EXPORT void bmrt_shape(bm_shape_t* shape, const int* dims, int num_dims);

/*
number of shape elements, shape should not be NULL and num_dims should not be larger than
BM_MAX_DIMS_NUM */
DECL_EXPORT uint64_t bmrt_shape_count(const bm_shape_t* shape);

/* compare whether two shapes are the same */
DECL_EXPORT bool bmrt_shape_is_same(const bm_shape_t* left, const bm_shape_t* right);

/*
fill a tensor with data type and shape, and st_mode = 0 as default.
tensor and p_bmrt should not be NULL, shape count should not be 0.
it will alloc device mem for tensor->device_mem, so the user should call bmrt_free_device(p_bmrt,
tensor->device_mem) to free it.*/
DECL_EXPORT bool bmrt_tensor(bm_tensor_t* tensor, void* p_bmrt, bm_data_type_t dtype, bm_shape_t shape);

/*
fill a tensor with data type and shape, and st_mode = 0 as default.
tensor and p_bmrt should not be NULL, shape count should not be 0.
it will alloc device mem for tensor->device_mem on the devid-th device.*/
DECL_EXPORT bool bmrt_tensor_ex(bm_tensor_t* tensor, void* p_bmrt, int devid, bm_data_type_t dtype, bm_shape_t shape);

/* fill a tensor with an existing device mem, the tensor byte size should not be larger than the device mem size */
DECL_EXPORT void bmrt_tensor_with_device(bm_tensor_t* tensor, bm_device_mem_t device_mem,
                                         bm_data_type_t dtype, bm_shape_t shape);

/* get tensor byte size, tensor should not be NULL */
DECL_EXPORT size_t bmrt_tensor_bytesize(const bm_tensor_t* tensor);

/* get tensor mem size allocated in device mem, tensor should not be NULL */
DECL_EXPORT size_t bmrt_tensor_device_size(const bm_tensor_t* tensor);

/* print net info for debug */
DECL_EXPORT void bmrt_print_network_info(const bm_net_info_t* net_info);
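A quick sketch of the shape helpers above; BM_FLOAT32 is assumed to be the fp32 entry of bm_data_type_t from bmdef.h:

int dims[4] = {1, 3, 224, 224};
bm_shape_t shape;
bmrt_shape(&shape, dims, 4);
uint64_t elems = bmrt_shape_count(&shape);             /* 1*3*224*224 = 150528 */
size_t bytes = elems * bmrt_data_type_size(BM_FLOAT32);  /* byte size of an fp32 tensor */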
/* --------------------------------------------------------------------------*/
/**
 * @name    bmrt_create
 * @brief   To create the bmruntime with a bm_handle.
 * @ingroup bmruntime
 *
 * This API creates the bmruntime. It returns a void* pointer which is the pointer
 * of the bmruntime. The device id is set when the bm_handle is obtained.
 *
 * @param [in] bm_handle  bm handle. It must be initialized by using bmlib.
 *
 * @retval void*  the pointer of the bmruntime
 */
DECL_EXPORT void* bmrt_create(bm_handle_t bm_handle);

/* --------------------------------------------------------------------------*/
/**
 * @name    bmrt_create_ex
 * @brief   To create the bmruntime with one or more bm_handles.
 * @ingroup bmruntime
 *
 * This API creates the bmruntime. It returns a void* pointer which is the pointer
 * of the bmruntime.
 *
 * @param [in] bm_handles   bm handles. They must be initialized by using bmlib.
 * @param [in] num_handles  number of bm_handles.
 *
 * @retval void*  the pointer of the bmruntime
 */
DECL_EXPORT void *bmrt_create_ex(bm_handle_t *bm_handles, int num_handles);

/**
 * @name    bmrt_destroy
 * @brief   To destroy the bmruntime pointer
 * @ingroup bmruntime
 *
 * This API destroys the bmruntime.
 *
 * @param [in] p_bmrt  Bmruntime that had been created
 */
DECL_EXPORT void bmrt_destroy(void* p_bmrt);

/**
 * @name    bmrt_get_bm_handle
 * @brief   To get the BM runtime context.
 * @ingroup bmruntime
 *
 * This API gets the BM runtime context for using BMDNN, BMCV or BMLIB.
 *
 * @param [in] p_bmrt  Bmruntime that had been created
 */
DECL_EXPORT void * bmrt_get_bm_handle(void* p_bmrt);

/**
 * @name    bmrt_load_bmodel
 * @brief   To load a bmodel which is created by the BM compiler
 * @ingroup bmruntime
 *
 * This API loads a bmodel created by the BM compiler.
 * After loading the bmodel, we can run inference of the neural network.
 *
 * @param [in] p_bmrt       Bmruntime that had been created
 * @param [in] bmodel_path  Bmodel file path.
 *
 * @retval true   Load context success.
 * @retval false  Load context failed.
 */
DECL_EXPORT bool bmrt_load_bmodel(void* p_bmrt, const char *bmodel_path);

/**
 * @name    bmrt_load_bmodel_data
 * @brief   To load a bmodel which is created by the BM compiler from a buffer
 * @ingroup bmruntime
 *
 * This API loads a bmodel created by the BM compiler.
 * After loading the bmodel, we can run inference of the neural network.
 * Different from bmrt_load_bmodel, the bmodel here is data in host memory.
 *
 * @param [in] p_bmrt       Bmruntime that had been created
 * @param [in] bmodel_data  Pointer to the bmodel data buffer
 * @param [in] size         Bmodel data size
 *
 * @retval true   Load context success.
 * @retval false  Load context failed.
 */
DECL_EXPORT bool bmrt_load_bmodel_data(void* p_bmrt, const void * bmodel_data, size_t size);
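The typical bring-up sequence for the APIs above, sketched under the assumption that bm_dev_request/bm_dev_free from bmlib_runtime.h are used to obtain and release the device handle; the bmodel path is a placeholder:

bm_handle_t bm_handle;
if (bm_dev_request(&bm_handle, 0) == BM_SUCCESS) {       /* device 0 */
    void *p_bmrt = bmrt_create(bm_handle);
    if (p_bmrt && bmrt_load_bmodel(p_bmrt, "my_model.bmodel")) {  /* hypothetical path */
        /* ... run inference here ... */
    }
    bmrt_destroy(p_bmrt);
    bm_dev_free(bm_handle);
}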
/**
 * @name    bmrt_show_neuron_network
 * @brief   To print the names of all neural networks
 * @ingroup bmruntime
 *
 * @param [in] p_bmrt  Bmruntime that had been created
 */
DECL_EXPORT void bmrt_show_neuron_network(void* p_bmrt);

/**
 * @name    bmrt_get_network_number
 * @brief   To get the number of neural networks in the bmruntime
 * @ingroup bmruntime
 *
 * @param [in] p_bmrt  Bmruntime that had been created
 *
 * @retval int value  The number of neural networks.
 */
DECL_EXPORT int bmrt_get_network_number(void* p_bmrt);

/**
 * @name    bmrt_get_network_names
 * @brief   To get the names of all neural networks in the bmruntime
 * @ingroup bmruntime
 *
 * @param [in]  p_bmrt         Bmruntime that had been created
 * @param [out] network_names  The names of all neural networks. It should be declared as (const char** networks_ = NULL),
 *                             and passed as &networks_. After this API, the user needs to free(networks_) when it is
 *                             no longer needed.
 */
DECL_EXPORT void bmrt_get_network_names(void* p_bmrt, const char*** network_names);
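Enumerating the networks and releasing the name array exactly as the comment above requires (assumes <stdio.h> and <stdlib.h>):

const char **networks_ = NULL;
int num = bmrt_get_network_number(p_bmrt);
bmrt_get_network_names(p_bmrt, &networks_);
for (int i = 0; i < num; ++i)
    printf("net %d: %s\n", i, networks_[i]);
free(networks_);  /* per the doc above, only the array itself is freed */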
/**
 * @name    bmrt_get_network_info
 * @brief   To get network info by net name
 * @ingroup bmruntime
 *
 * @param [in] p_bmrt    Bmruntime that had been created
 * @param [in] net_name  Network name
 *
 * @retval bm_net_info_t*  Pointer to the net info; it need not be freed by the user. If the net name is not found, NULL is returned.
 */
DECL_EXPORT const bm_net_info_t* bmrt_get_network_info(void* p_bmrt, const char* net_name);

/**
 * @name    bmrt_launch_tensor
 * @brief   To launch inference of the neural network with the given input tensors
 * @ingroup bmruntime
 *
 * This API supports neural networks that are static-compiled or dynamic-compiled.
 * After calling this API, inference on the TPU is launched and the CPU program is not
 * blocked. bm_thread_sync should be called to make sure inference is finished.
 * This API supports multiple inputs and is multi-thread safe.
 *
 * @param [in]  p_bmrt          Bmruntime that had been created
 * @param [in]  net_name        The name of the neural network
 * @param [in]  input_tensors   Array of input tensors, defined like bm_tensor_t input_tensors[input_num].
 *                              The user should initialize each input tensor.
 * @param [in]  input_num       Input number
 * @param [out] output_tensors  Array of output tensors, defined like bm_tensor_t output_tensors[output_num].
 *                              This interface allocs device mem to store the output data. The user should free each
 *                              device mem with bm_free_device after the result data is no longer used.
 * @param [in]  output_num      Output number
 *
 * @retval true   Launch success.
 * @retval false  Launch failed.
 */
DECL_EXPORT bool bmrt_launch_tensor(void* p_bmrt, const char * net_name, const bm_tensor_t input_tensors[], int input_num,
                                    bm_tensor_t output_tensors[], int output_num);
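A minimal non-blocking launch sketch. The field names input_dtypes and stages[0].input_shapes are assumed from bm_net_info_t as defined in bmdef.h; bm_memcpy_s2d, bm_thread_sync, bm_memcpy_d2s and bm_free_device come from bmlib_runtime.h; "my_net" and the host buffers are placeholders sized for the net's tensors:

const bm_net_info_t *info = bmrt_get_network_info(p_bmrt, "my_net");
if (info) {
    bm_tensor_t in, out;
    bmrt_tensor(&in, p_bmrt, info->input_dtypes[0], info->stages[0].input_shapes[0]);
    bm_memcpy_s2d(bm_handle, in.device_mem, host_input);        /* host -> device */
    if (bmrt_launch_tensor(p_bmrt, "my_net", &in, 1, &out, 1)) {
        bm_thread_sync(bm_handle);                              /* wait for inference to finish */
        bm_memcpy_d2s(bm_handle, host_output, out.device_mem);  /* device -> host */
        bm_free_device(bm_handle, out.device_mem);              /* runtime-allocated output */
    }
    bm_free_device(bm_handle, in.device_mem);
}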
/**
 * @name    bmrt_launch_tensor_ex
 * @brief   To launch inference of the neural network with the given input tensors
 * @ingroup bmruntime
 *
 * This API supports neural networks that are static-compiled or dynamic-compiled.
 * After calling this API, inference on the TPU is launched and the CPU program is not
 * blocked. bm_thread_sync should be called to make sure inference is finished.
 * This API supports multiple inputs and is multi-thread safe.
 *
 * @param [in]  p_bmrt          Bmruntime that had been created
 * @param [in]  net_name        The name of the neural network
 * @param [in]  input_tensors   Array of input tensors, defined like bm_tensor_t input_tensors[input_num].
 *                              The user should initialize each input tensor.
 * @param [in]  input_num       Input number
 * @param [out] output_tensors  Array of output tensors, defined like bm_tensor_t output_tensors[output_num].
 *                              The user can set device_mem or stmode of the output tensors. If user_mem is true, this
 *                              interface uses the device mem of output_tensors to store the output data and does not
 *                              alloc device mem; otherwise it allocs device mem to store the output. If user_stmode
 *                              is true, the stmode in each output tensor is used; otherwise stmode is BM_STORE_1N
 *                              as default.
 * @param [in]  output_num      Output number
 * @param [in]  user_mem        whether the device_mem of the output tensors is set
 * @param [in]  user_stmode     whether the stmode of the output tensors is set
 *
 * @retval true   Launch success.
 * @retval false  Launch failed.
 */
DECL_EXPORT bool bmrt_launch_tensor_ex(void* p_bmrt, const char * net_name, const bm_tensor_t input_tensors[], int input_num,
                                       bm_tensor_t output_tensors[], int output_num, bool user_mem, bool user_stmode);

/**
 * @name    bmrt_launch_data
 * @brief   To launch inference of the neural network with input data in system memory
 * @ingroup bmruntime
 *
 * This API supports neural networks that are static-compiled or dynamic-compiled.
 * After calling this API, inference on the TPU is launched and the CPU
 * program is blocked.
 * This API supports multiple inputs and is multi-thread safe.
 *
 * @param [in]  p_bmrt         Bmruntime that had been created
 * @param [in]  net_name       The name of the neural network
 * @param [in]  input_datas    Array of input data, defined like void * input_datas[input_num]. The user should
 *                             initialize each data pointer as input.
 * @param [in]  input_shapes   Array of input shapes, defined like bm_shape_t input_shapes[input_num].
 *                             The user should set each input shape.
 * @param [in]  input_num      Input number
 * @param [out] output_datas   Array of output data, defined like void * output_datas[output_num].
 *                             If the user doesn't alloc each output data, set user_mem to false and this api allocs
 *                             the output mem; the user should free each output mem when the output data is no longer
 *                             used. The user can also alloc system memory for each output data and set user_mem = true.
 * @param [out] output_shapes  Array of output shapes, defined like bm_shape_t output_shapes[output_num].
 *                             It stores each output shape.
 * @param [in]  output_num     Output number
 * @param [in]  user_mem       whether output_datas[i] has allocated memory
 *
 * @retval true   Launch success.
 * @retval false  Launch failed.
 */
DECL_EXPORT bool bmrt_launch_data(void* p_bmrt, const char* net_name, void* const input_datas[],
                                  const bm_shape_t input_shapes[], int input_num, void * output_datas[],
                                  bm_shape_t output_shapes[], int output_num, bool user_mem);
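When everything lives in system memory, bmrt_launch_data is the blocking one-call alternative. A sketch with a single input and a single runtime-allocated output; whether the runtime-allocated output buffer is released with free() is an assumption to verify against the runtime documentation:

int dims[4] = {1, 3, 224, 224};
bm_shape_t in_shape;
bmrt_shape(&in_shape, dims, 4);
void *input_datas[1]  = { host_input };   /* placeholder host buffer */
void *output_datas[1] = { NULL };         /* user_mem = false: runtime allocates */
bm_shape_t output_shapes[1];
if (bmrt_launch_data(p_bmrt, "my_net", input_datas, &in_shape, 1,
                     output_datas, output_shapes, 1, false)) {
    /* consume output_datas[0], shaped as output_shapes[0] */
    free(output_datas[0]);                /* assumption: caller releases with free() */
}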
/**
 * @name    bmrt_trace
 * @brief   To check the runtime environment and collect info for DEBUG
 * @ingroup bmruntime
 *
 * This API collects runtime info for DEBUG. Especially when a launch suddenly produces wrong
 * results, calling bmrt_trace shows whether device mems are broken, along with other check info.
 *
 * @param [in] p_bmrt  Bmruntime that had been created
 */
DECL_EXPORT void bmrt_trace(void* p_bmrt);

/**
 * @name    bmrt_launch_tensor_multi_cores
 * @brief   To launch inference of the neural network with the given input tensors, with support for multi-core inference.
 * @ingroup bmruntime
 *
 * This API supports neural networks that are static-compiled or dynamic-compiled.
 * After calling this API, inference on the TPU is launched and the CPU program is not
 * blocked. bm_thread_sync_from_core should be called to make sure inference is finished.
 * This API supports multiple inputs and is multi-thread safe.
 *
 * @param [in]  p_bmrt          Bmruntime that had been created
 * @param [in]  net_name        The name of the neural network
 * @param [in]  input_tensors   Array of input tensors, defined like bm_tensor_t input_tensors[input_num].
 *                              The user should initialize each input tensor.
 * @param [in]  input_num       Input number
 * @param [out] output_tensors  Array of output tensors, defined like bm_tensor_t output_tensors[output_num].
 *                              The user can set device_mem or stmode of the output tensors. If user_mem is true, this
 *                              interface uses the device mem of output_tensors to store the output data and does not
 *                              alloc device mem; otherwise it allocs device mem to store the output. If user_stmode
 *                              is true, the stmode in each output tensor is used; otherwise stmode is BM_STORE_1N
 *                              as default.
 * @param [in]  output_num      Output number
 * @param [in]  user_mem        whether the device_mem of the output tensors is set
 * @param [in]  user_stmode     whether the stmode of the output tensors is set
 * @param [in]  core_list       list of core ids to be used for inference
 * @param [in]  core_num        number of cores in the core list
 *
 * @retval true   Launch success.
 * @retval false  Launch failed.
 */
DECL_EXPORT bool bmrt_launch_tensor_multi_cores(
    void *p_bmrt,
    const char *net_name,
    const bm_tensor_t input_tensors[],
    int input_num,
    bm_tensor_t output_tensors[],
    int output_num,
    bool user_mem,
    bool user_stmode,
    const int *core_list,
    int core_num);
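A multi-core variant of the launch sketch earlier; the core ids {0, 1} are placeholders, and the exact signature of bm_thread_sync_from_core is an assumption based only on the comment above (handle plus core id):

int core_list[2] = {0, 1};
if (bmrt_launch_tensor_multi_cores(p_bmrt, "my_net", &in, 1, &out, 1,
                                   false, false, core_list, 2)) {
    bm_thread_sync_from_core(bm_handle, core_list[0]);  /* assumed signature */
}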
/**
 * @name    bmrt_memcpy_s2d_parallel
 * @brief   To copy data from system memory to multi-device memory in parallel
 * @ingroup bmruntime
 *
 * This API can only be used when p_bmrt was created with bmrt_create_ex on multiple devices.
 * After calling this API, datas[:tensor_num[0]] will be copied to the first device,
 * datas[tensor_num[0]:tensor_num[0]+tensor_num[1]] will be copied to the second device, and so on.
 * Copying data to different devices is done in parallel; copying to the same device is done in sequence.
 *
 * @param [in] p_bmrt      Bmruntime that had been created with multiple bm_handles
 * @param [in] tensors     Array of tensors that will be copied to the devices
 * @param [in] datas       Array of data buffers allocated in system memory
 * @param [in] tensor_num  Array of the number of tensors to be copied to each device
 * @param [in] device_num  Device number
 */
DECL_EXPORT bool bmrt_memcpy_s2d_parallel(
    void *p_bmrt,
    bm_tensor_t tensors[],
    void *datas[],
    int tensor_num[],
    int device_num);

/**
 * @name    bmrt_memcpy_d2s_parallel
 * @brief   To copy data from multi-device memory to system memory in parallel
 * @ingroup bmruntime
 *
 * This API can only be used when p_bmrt was created with bmrt_create_ex on multiple devices.
 * After calling this API, tensors on the first device will be copied to datas[:tensor_num[0]],
 * tensors on the second device will be copied to datas[tensor_num[0]:tensor_num[0]+tensor_num[1]], and so on.
 * Copying data from different devices is done in parallel; copying from the same device is done in sequence.
 *
 * @param [in] p_bmrt      Bmruntime that had been created with multiple bm_handles
 * @param [in] datas       Array of data buffers allocated in system memory
 * @param [in] tensors     Array of tensors that will be copied from the devices
 * @param [in] tensor_num  Array of the number of tensors to be copied from each device
 * @param [in] device_num  Device number
 */
DECL_EXPORT bool bmrt_memcpy_d2s_parallel(
    void *p_bmrt,
    void *datas[],
    bm_tensor_t tensors[],
    int tensor_num[],
    int device_num);
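A sketch of feeding two devices in one call with the parallel copy above, assuming p_bmrt was created with bmrt_create_ex over two handles and the tensors were created with bmrt_tensor_ex on devices 0 and 1; the host buffers are placeholders:

bm_tensor_t tensors[2];  /* tensors[0] on device 0, tensors[1] on device 1 (via bmrt_tensor_ex) */
void *datas[2]    = { host_buf0, host_buf1 };
int tensor_num[2] = { 1, 1 };                 /* one tensor per device */
bmrt_memcpy_s2d_parallel(p_bmrt, tensors, datas, tensor_num, 2);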
#if defined (__cplusplus)
}
#endif

#endif
ChatGLM2/support/include/sentencepiece/sentencepiece_processor.h
ADDED
@@ -0,0 +1,727 @@
// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.!

#ifndef SENTENCEPIECE_PROCESSOR_H_
#define SENTENCEPIECE_PROCESSOR_H_

#include <cstring>
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>

#ifndef SWIG
namespace absl {
using std::string_view;
}  // namespace absl
#endif  // SWIG

namespace sentencepiece {
namespace util {

enum class StatusCode : int {
  kOk = 0,
  kCancelled = 1,
  kUnknown = 2,
  kInvalidArgument = 3,
  kDeadlineExceeded = 4,
  kNotFound = 5,
  kAlreadyExists = 6,
  kPermissionDenied = 7,
  kResourceExhausted = 8,
  kFailedPrecondition = 9,
  kAborted = 10,
  kOutOfRange = 11,
  kUnimplemented = 12,
  kInternal = 13,
  kUnavailable = 14,
  kDataLoss = 15,
  kUnauthenticated = 16,
};

class Status {
 public:
  Status();
  ~Status();
  Status(StatusCode code, absl::string_view error_message);
  Status(const Status &s);
  void operator=(const Status &s);
  bool operator==(const Status &s) const;
  bool operator!=(const Status &s) const;
  inline bool ok() const { return rep_ == nullptr; }

  void set_error_message(const char *str);
  const char *error_message() const;
  const char *message() const { return error_message(); }
  StatusCode code() const;
  std::string ToString() const;

  void IgnoreError();

 private:
  struct Rep;
  std::unique_ptr<Rep> rep_;
};
}  // namespace util
// SentencePieceProcessor:
// Simple and language independent tokenizer and de-tokenizer for
// Neural Network Machine Translation.
//
// SentencePieceProcessor provides Encode() and Decode() methods,
// which correspond to tokenization and de-tokenization respectively.
//
// - Encode:
//   Given a raw source sentence, encode it into a sequence
//   of pieces or vocabulary ids.
//
// - Decode:
//   Given a sequence of pieces or vocabulary ids, decode it
//   into a de-tokenized raw sentence.
//
// SentencePieceProcessor provides a lossless data conversion
// that allows the original raw sentence to be perfectly reconstructed
// from the encoded data, i.e., Decode(Encode(input)) == input.
// This characteristic is useful, as we can make the de-tokenization
// completely language independent.
//
// Usage:
//   SentencePieceProcessor sp;
//   sp.Load("//path/to/model");
//
//   vector<string> sps;
//   sp.Encode("hello world.", &sps).IgnoreError();
//
//   vector<int> ids;
//   sp.Encode("hello world.", &ids).IgnoreError();
//
//   string detok;
//   sp.Decode(sps, &detok);
//   CHECK_EQ("hello world.", detok).IgnoreError();
//
//   sp.Decode(ids, &detok);
//   CHECK_EQ("hello world.", detok).IgnoreError();
//
// We can also use SentencePieceText which manages the byte-offsets
// between user input (output) and internal sentence pieces.
//
//   SentencePieceText spt;
//   sp.Encode("hello world.", &spt);
//   // Emits the byte range of each piece.
//   for (const auto &piece : spt.pieces()) {
//     LOG(INFO) << piece.begin() << " " << piece.end();
//   }
//
//   sp.Decode({0, 1, 2, 3..}, &spt);
//   for (const auto &piece : spt.pieces()) {
//     LOG(INFO) << piece.begin() << " " << piece.end();
//   }
//
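Beyond the comment block above, here is a compact status-checked round trip as a standalone program; the model path is a placeholder, and decoding the ids should reproduce the input:

#include <iostream>
#include <string>
#include <vector>
#include "sentencepiece_processor.h"

int main() {
  sentencepiece::SentencePieceProcessor sp;
  const auto status = sp.Load("tokenizer.model");  // placeholder model path
  if (!status.ok()) {
    std::cerr << status.ToString() << std::endl;   // e.g. model file not found
    return 1;
  }
  std::vector<int> ids;
  sp.Encode("hello world.", &ids).IgnoreError();   // text -> ids
  std::string detok;
  sp.Decode(ids, &detok).IgnoreError();            // ids -> text
  std::cout << detok << std::endl;                 // prints "hello world."
  return 0;
}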
class NBestSentencePieceText;
class ModelInterface;
class SentencePieceText;
class ModelProto;

namespace normalizer {
class Normalizer;
}  // namespace normalizer

#ifndef SWIGGO
namespace util {
// Redefine std::string for the serialized_proto interface, as Python's string is
// a Unicode string. We can enforce the return value to be a raw byte sequence
// with SWIG's typemap.
using bytes = std::string;
}  // namespace util
#endif  // SWIGGO

class NBestSentencePieceText;
class ModelInterface;
class SentencePieceText;
class SentencePieceText_SentencePiece;

// Wrapper class of SentencePieceText
// This wrapper only allows an immutable access to the proto and
// hides the actual implementation of protobuf.
// See sentencepiece.proto for the details of this class.
class ImmutableSentencePieceText_ImmutableSentencePiece {
 public:
  ImmutableSentencePieceText_ImmutableSentencePiece();
  ~ImmutableSentencePieceText_ImmutableSentencePiece() = default;

  const std::string &piece() const;
  const std::string &surface() const;
  uint32_t id() const;
  uint32_t begin() const;
  uint32_t end() const;

  friend class ImmutableSentencePieceText;

 private:
  explicit ImmutableSentencePieceText_ImmutableSentencePiece(
      const SentencePieceText_SentencePiece &sp);
  const SentencePieceText_SentencePiece *sp_ = nullptr;
};

class ImmutableSentencePieceText {
 public:
  ImmutableSentencePieceText();
  virtual ~ImmutableSentencePieceText();

  std::vector<ImmutableSentencePieceText_ImmutableSentencePiece> pieces() const;

  size_t pieces_size() const;
  ImmutableSentencePieceText_ImmutableSentencePiece pieces(int index) const;

  const std::string &text() const;
  float score() const;

  util::bytes SerializeAsString() const;

  // Returns the actual mutable proto.
  // Do not use this outside of SentencePieceProcessor, as
  // it returns the raw pointer managed by the shared_ptr.
  SentencePieceText *mutable_proto();

  // Converts the utf8 byte spans into Unicode char spans.
  void ConvertToUnicodeSpans();

  friend class ImmutableNBestSentencePieceText;

 private:
  explicit ImmutableSentencePieceText(const SentencePieceText &spt);
  const SentencePieceText *spt_ = nullptr;
  std::shared_ptr<SentencePieceText> rep_;
};

// Wrapper class of NBestSentencePieceText
// This wrapper only allows an immutable access to the proto and
// hides the actual implementation of protobuf.
// See sentencepiece.proto for the details of this class.
class ImmutableNBestSentencePieceText {
 public:
  ImmutableNBestSentencePieceText();
  virtual ~ImmutableNBestSentencePieceText();

  std::vector<ImmutableSentencePieceText> nbests() const;

  size_t nbests_size() const;
  ImmutableSentencePieceText nbests(int index) const;

  util::bytes SerializeAsString() const;

  // Returns the actual mutable proto.
  // Do not use this outside of SentencePieceProcessor, as
  // it returns the raw pointer managed by the shared_ptr.
  NBestSentencePieceText *mutable_proto();

  void ConvertToUnicodeSpans();

 private:
  std::shared_ptr<NBestSentencePieceText> rep_;
};

class SentencePieceProcessor {
 public:
  SentencePieceProcessor();
  virtual ~SentencePieceProcessor();

  // Loads model from `filename`.
  // Returns an error status if `filename` cannot be loaded.
  virtual util::Status Load(absl::string_view filename);

  // Loads model from `filename`.
  // Crashes if `filename` cannot be loaded.
  virtual void LoadOrDie(absl::string_view filename);

  // Loads model from `model_proto`.
  // `model_proto` is copied.
  virtual util::Status Load(const ModelProto &model_proto);

  // Loads model from `model_proto`.
  // `model_proto` is moved.
  virtual util::Status Load(std::unique_ptr<ModelProto> model_proto);

  // Loads model from `serialized`, which is a string-serialized model proto.
  // Useful to load the model from a platform independent blob object.
  virtual util::Status LoadFromSerializedProto(absl::string_view serialized);

  // Returns the status. Encode/Decode methods are valid when status is OK.
  virtual util::Status status() const;

  // Sets encode extra_option sequence.
  virtual util::Status SetEncodeExtraOptions(absl::string_view extra_option);

  // Sets decode extra_option sequence.
  virtual util::Status SetDecodeExtraOptions(absl::string_view extra_option);

  //////////////////////////////////////////////////////////////
  // Vocabulary restriction.
  // Background:
  // https://github.com/rsennrich/subword-nmt#best-practice-advice-for-byte-pair-encoding-in-nmt

  // Restricts the vocabulary set.
  // The input sentences are encoded into the tokens in `valid_vocab`.
  virtual util::Status SetVocabulary(
      const std::vector<absl::string_view> &valid_vocab);

  // Reverts the vocabulary restriction.
  virtual util::Status ResetVocabulary();

  // Loads the valid vocabulary set from `filename` in TSV format.
  // Format: <token> <tab> <freq>.
  // Any token with frequency < threshold will be treated as OOV.
  virtual util::Status LoadVocabulary(absl::string_view filename,
                                      int threshold);

  //////////////////////////////////////////////////////////////
  // Simple Encode and Decode API.
  //
  // Given a UTF8 input, encodes it into a sequence of sentence pieces.
  virtual util::Status Encode(absl::string_view input,
                              std::vector<std::string> *pieces) const;

  // Given a UTF8 input, encodes it into a sequence of ids.
  virtual util::Status Encode(absl::string_view input,
                              std::vector<int> *ids) const;

  // Given a sequence of pieces, decodes it into a detokenized output.
  virtual util::Status Decode(const std::vector<std::string> &pieces,
                              std::string *detokenized) const;

  // Given a sequence of pieces, decodes it into a detokenized output.
  virtual util::Status Decode(const std::vector<absl::string_view> &pieces,
                              std::string *detokenized) const;

  // Given a sequence of ids, decodes it into a detokenized output.
|
310 |
+
virtual util::Status Decode(const std::vector<int> &ids,
|
311 |
+
std::string *detokenized) const;
|
312 |
+
|
313 |
+
//////////////////////////////////////////////////////////////
|
314 |
+
// NBest API.
|
315 |
+
//
|
316 |
+
// Same as Encode, but returns nbest results.
|
317 |
+
virtual util::Status NBestEncode(
|
318 |
+
absl::string_view input, int nbest_size,
|
319 |
+
std::vector<std::vector<std::string>> *pieces) const;
|
320 |
+
|
321 |
+
// Same as Encode, but returns nbest results.
|
322 |
+
virtual util::Status NBestEncode(absl::string_view input, int nbest_size,
|
323 |
+
std::vector<std::vector<int>> *ids) const;
|
324 |
+
|
325 |
+
//////////////////////////////////////////////////////////////
|
326 |
+
// Sampling API.
|
327 |
+
//
|
328 |
+
// Unigram and BPE support sampling mode.
|
329 |
+
// - Unigram (--model_type=unigram):
|
330 |
+
// `nbest_size`: When `nbest_size` is positive value, approximately samples
|
331 |
+
// one segmentation from nbest candidates. When `nbest_size` is negative
|
332 |
+
// value, samples one segmentation from the hypotheses (Lattice) according to
|
333 |
+
// the generation probabilities using forward-filtering and backward-sampling
|
334 |
+
// algorithm.
|
335 |
+
// `alpha`: Smoothing parameter (inverse temperature). The best segmentation
|
336 |
+
// (Viterbi segmentation) is more likely sampled when setting larger alpha.
|
337 |
+
// When alpha is 0.0, one segmentation is uniformly sampled from the nbest or
|
338 |
+
// lattice. `nbest_size` and `alpha` correspond to parameters `l` and `alpha`
|
339 |
+
// in https://arxiv.org/abs/1804.10959 (nbest_size < 0 means l = infinity)
|
340 |
+
//
|
341 |
+
// - BPE (--model_type=bpe):
|
342 |
+
// `alpha`: The dropout probability `p` of bpe merge operations in
|
343 |
+
// https://arxiv.org/abs/1910.13267 Nbest-based sampling is not supported so
|
344 |
+
// nbest_size parameter is ignored in BPE.
|
345 |
+
virtual util::Status SampleEncode(absl::string_view input, int nbest_size,
|
346 |
+
float alpha,
|
347 |
+
std::vector<std::string> *pieces) const;
|
348 |
+
|
349 |
+
// Same as above, but returns a sequence of ids.
|
350 |
+
virtual util::Status SampleEncode(absl::string_view input, int nbest_size,
|
351 |
+
float alpha, std::vector<int> *ids) const;
|
352 |
+
|
353 |
+
//////////////////////////////////////////////////////////////
|
354 |
+
// SampleEncodeAndScore API.
|
355 |
+
//
|
356 |
+
// Sample `samples` many tokenisations from the segmentation lattice.
|
357 |
+
// These methods are only available in model_type=unigram.
|
358 |
+
//
|
359 |
+
// `alpha`: smoothing parameter (inverse temperature). The same as `alpha` in
|
360 |
+
// `Sample` method.
|
361 |
+
// 'wor`: If `wor` is true, the samples are taken without replacement, and the
|
362 |
+
// scores are the inclusion probabilities of the elements in the sample;
|
363 |
+
// otherwise the samples are taken with replacement and the scores are the
|
364 |
+
// log-probs of sample elements
|
365 |
+
// `include_best`: If `include_best` is true, the best tokenisation is always
|
366 |
+
// included in the sample, and the remaining elements are sampled excluding
|
367 |
+
// the best.
|
368 |
+
virtual util::Status SampleEncodeAndScore(
|
369 |
+
absl::string_view input, int num_samples, float alpha, bool wor,
|
370 |
+
bool include_best,
|
371 |
+
std::vector<std::pair<std::vector<std::string>, float>> *pieces) const;
|
372 |
+
|
373 |
+
// Same as above, but returns a sequence of ids.
|
374 |
+
virtual util::Status SampleEncodeAndScore(
|
375 |
+
absl::string_view input, int num_samples, float alpha, bool wor,
|
376 |
+
bool include_best,
|
377 |
+
std::vector<std::pair<std::vector<int>, float>> *ids) const;
|
378 |
+
|
379 |
+
//////////////////////////////////////////////////////////////
|
380 |
+
// Entropy API.
|
381 |
+
//
|
382 |
+
// This only available in model_type=unigram.
|
383 |
+
// Calculate entropy of possible tokenisations
|
384 |
+
virtual util::Status CalculateEntropy(absl::string_view input, float alpha,
|
385 |
+
float *entropy) const;
|
386 |
+
|
387 |
+
//////////////////////////////////////////////////////////////
|
388 |
+
// Advanced API returning SentencePieceText, which manages
|
389 |
+
// utf8-byte alignments between user-input/detokenized text
|
390 |
+
// and internal sentencepiece sequence.
|
391 |
+
//
|
392 |
+
// Given a UTF8 input, encodes it into SentencePieceText.
|
393 |
+
//
|
394 |
+
// When using these APIs, sentencepiece.pb.h header files must be included.
|
395 |
+
// We can also use ImutableSentencePieceText as follows.
|
396 |
+
//
|
397 |
+
// ImmutableSentencePieceText spt;
|
398 |
+
// Encode("hello", spt.mutable_proto()).IgnoreError();
|
399 |
+
// std::cout << spt.pieces_size() << std::endl;
|
400 |
+
virtual util::Status Encode(absl::string_view input,
|
401 |
+
SentencePieceText *spt) const;
|
402 |
+
|
403 |
+
virtual util::Status NBestEncode(absl::string_view input, int nbest_size,
|
404 |
+
NBestSentencePieceText *nbest_spt) const;
|
405 |
+
|
406 |
+
virtual util::Status SampleEncode(absl::string_view input, int nbest_size,
|
407 |
+
float alpha, SentencePieceText *spt) const;
|
408 |
+
|
409 |
+
virtual util::Status SampleEncodeAndScore(
|
410 |
+
absl::string_view input, int num_samples, float alpha, bool wor,
|
411 |
+
bool include_best, NBestSentencePieceText *samples_spt) const;
|
412 |
+
|
413 |
+
// DEPRECATED: Remove this API and use std::vector<std::string_view>
|
414 |
+
virtual util::Status Decode(const std::vector<std::string> &pieces,
|
415 |
+
SentencePieceText *spt) const;
|
416 |
+
|
417 |
+
virtual util::Status Decode(const std::vector<absl::string_view> &pieces,
|
418 |
+
SentencePieceText *spt) const;
|
419 |
+
|
420 |
+
virtual util::Status Decode(const std::vector<int> &ids,
|
421 |
+
SentencePieceText *spt) const;
|
422 |
+
#ifdef SWIG
|
423 |
+
#define SPP_SWIG_CHECK_AND_THROW \
|
424 |
+
if (!status.ok()) throw status;
|
425 |
+
#else
|
426 |
+
#define SPP_SWIG_CHECK_AND_THROW \
|
427 |
+
if (!status.ok()) { \
|
428 |
+
}
|
429 |
+
#endif // SWIG
|
430 |
+
|
431 |
+
#define DEFINE_SPP_DIRECT_FUNC_IMPL(FuncName, OutType, ...) \
|
432 |
+
OutType output; \
|
433 |
+
const auto status = FuncName(__VA_ARGS__, &output); \
|
434 |
+
SPP_SWIG_CHECK_AND_THROW; \
|
435 |
+
return output;
|
436 |
+
|
437 |
+
#define DEFINE_SPP_SERIALIZED_PROTO_IMPL(FuncName, OutType, ...) \
|
438 |
+
OutType output; \
|
439 |
+
const auto status = FuncName(__VA_ARGS__, output.mutable_proto()); \
|
440 |
+
SPP_SWIG_CHECK_AND_THROW; \
|
441 |
+
return output.SerializeAsString();
|
442 |
+
|
443 |
+
#define DEFINE_SPP_IMMUTABLE_PROTO_IMPL(FuncName, OutType, ...) \
|
444 |
+
OutType output; \
|
445 |
+
const auto status = FuncName(__VA_ARGS__, output.mutable_proto()); \
|
446 |
+
SPP_SWIG_CHECK_AND_THROW; \
|
447 |
+
return output;
|
448 |
+
|
449 |
+
//////////////////////////////////////////////////////////////
|
450 |
+
// Handy methods that return the result directly.
|
451 |
+
// These functions ignore internal errors.
|
452 |
+
virtual std::vector<std::string> EncodeAsPieces(
|
453 |
+
absl::string_view input) const {
|
454 |
+
DEFINE_SPP_DIRECT_FUNC_IMPL(Encode, std::vector<std::string>, input);
|
455 |
+
}
|
456 |
+
|
457 |
+
virtual std::vector<int> EncodeAsIds(absl::string_view input) const {
|
458 |
+
DEFINE_SPP_DIRECT_FUNC_IMPL(Encode, std::vector<int>, input);
|
459 |
+
}
|
460 |
+
|
461 |
+
virtual std::vector<std::vector<std::string>> NBestEncodeAsPieces(
|
462 |
+
absl::string_view input, int nbest_size) const {
|
463 |
+
DEFINE_SPP_DIRECT_FUNC_IMPL(
|
464 |
+
NBestEncode, std::vector<std::vector<std::string>>, input, nbest_size);
|
465 |
+
}
|
466 |
+
|
467 |
+
virtual std::vector<std::vector<int>> NBestEncodeAsIds(
|
468 |
+
absl::string_view input, int nbest_size) const {
|
469 |
+
DEFINE_SPP_DIRECT_FUNC_IMPL(NBestEncode, std::vector<std::vector<int>>,
|
470 |
+
input, nbest_size);
|
471 |
+
}
|
472 |
+
|
473 |
+
virtual std::vector<std::string> SampleEncodeAsPieces(absl::string_view input,
|
474 |
+
int nbest_size,
|
475 |
+
float alpha) const {
|
476 |
+
DEFINE_SPP_DIRECT_FUNC_IMPL(SampleEncode, std::vector<std::string>, input,
|
477 |
+
nbest_size, alpha);
|
478 |
+
}
|
479 |
+
|
480 |
+
virtual std::vector<int> SampleEncodeAsIds(absl::string_view input,
|
481 |
+
int nbest_size,
|
482 |
+
float alpha) const {
|
483 |
+
DEFINE_SPP_DIRECT_FUNC_IMPL(SampleEncode, std::vector<int>, input,
|
484 |
+
nbest_size, alpha);
|
485 |
+
}
|
486 |
+
|
487 |
+
virtual std::vector<std::pair<std::vector<std::string>, float>>
|
488 |
+
SampleEncodeAndScoreAsPieces(absl::string_view input, int num_samples,
|
489 |
+
float alpha, bool wor, bool include_best) const {
|
490 |
+
using _T = std::vector<std::pair<std::vector<std::string>, float>>;
|
491 |
+
DEFINE_SPP_DIRECT_FUNC_IMPL(SampleEncodeAndScore, _T, input, num_samples,
|
492 |
+
alpha, wor, include_best);
|
493 |
+
}
|
494 |
+
|
495 |
+
virtual std::vector<std::pair<std::vector<int>, float>>
|
496 |
+
SampleEncodeAndScoreAsIds(absl::string_view input, int num_samples,
|
497 |
+
float alpha, bool wor, bool include_best) const {
|
498 |
+
using _T = std::vector<std::pair<std::vector<int>, float>>;
|
499 |
+
DEFINE_SPP_DIRECT_FUNC_IMPL(SampleEncodeAndScore, _T, input, num_samples,
|
500 |
+
alpha, wor, include_best);
|
501 |
+
}
|
502 |
+
|
503 |
+
// DEPRECATED: Remove this API and use std::vector<std::string_view>
|
504 |
+
virtual std::string DecodePieces(
|
505 |
+
const std::vector<std::string> &pieces) const {
|
506 |
+
DEFINE_SPP_DIRECT_FUNC_IMPL(Decode, std::string, pieces);
|
507 |
+
}
|
508 |
+
|
509 |
+
virtual std::string DecodePieces(
|
510 |
+
const std::vector<absl::string_view> &pieces) const {
|
511 |
+
DEFINE_SPP_DIRECT_FUNC_IMPL(Decode, std::string, pieces);
|
512 |
+
}
|
513 |
+
|
514 |
+
virtual std::string DecodeIds(const std::vector<int> &ids) const {
|
515 |
+
DEFINE_SPP_DIRECT_FUNC_IMPL(Decode, std::string, ids);
|
516 |
+
}
|
517 |
+
|
518 |
+
virtual float CalculateEntropy(absl::string_view text, float alpha) const {
|
519 |
+
DEFINE_SPP_DIRECT_FUNC_IMPL(CalculateEntropy, float, text, alpha);
|
520 |
+
}
|
521 |
+
|
522 |
+
//////////////////////////////////////////////////////////////
|
523 |
+
// SerializedProto API. (DEPRECATED). Use ImmutableProto API.
|
524 |
+
// They are used in Python interface. Returns serialized proto.
|
525 |
+
// In python module, we can get access to the full Proto after
|
526 |
+
// deserialzing the returned byte sequence.
|
527 |
+
virtual util::bytes EncodeAsSerializedProto(absl::string_view input) const {
|
528 |
+
DEFINE_SPP_SERIALIZED_PROTO_IMPL(Encode, ImmutableSentencePieceText, input);
|
529 |
+
}
|
530 |
+
|
531 |
+
virtual util::bytes SampleEncodeAsSerializedProto(absl::string_view input,
|
532 |
+
int nbest_size,
|
533 |
+
float alpha) const {
|
534 |
+
DEFINE_SPP_SERIALIZED_PROTO_IMPL(SampleEncode, ImmutableSentencePieceText,
|
535 |
+
input, nbest_size, alpha);
|
536 |
+
}
|
537 |
+
|
538 |
+
virtual util::bytes NBestEncodeAsSerializedProto(absl::string_view input,
|
539 |
+
int nbest_size) const {
|
540 |
+
DEFINE_SPP_SERIALIZED_PROTO_IMPL(
|
541 |
+
NBestEncode, ImmutableNBestSentencePieceText, input, nbest_size);
|
542 |
+
}
|
543 |
+
|
544 |
+
virtual util::bytes SampleEncodeAndScoreAsSerializedProto(
|
545 |
+
absl::string_view input, int num_samples, float alpha, bool wor,
|
546 |
+
bool include_best) const {
|
547 |
+
DEFINE_SPP_SERIALIZED_PROTO_IMPL(SampleEncodeAndScore,
|
548 |
+
ImmutableNBestSentencePieceText, input,
|
549 |
+
num_samples, alpha, wor, include_best);
|
550 |
+
}
|
551 |
+
|
552 |
+
// TODO(taku): Remove this API and use std::vector<std::string_view>
|
553 |
+
virtual util::bytes DecodePiecesAsSerializedProto(
|
554 |
+
const std::vector<std::string> &pieces) const {
|
555 |
+
DEFINE_SPP_SERIALIZED_PROTO_IMPL(Decode, ImmutableSentencePieceText,
|
556 |
+
pieces);
|
557 |
+
}
|
558 |
+
|
559 |
+
virtual util::bytes DecodePiecesAsSerializedProto(
|
560 |
+
const std::vector<absl::string_view> &pieces) const {
|
561 |
+
DEFINE_SPP_SERIALIZED_PROTO_IMPL(Decode, ImmutableSentencePieceText,
|
562 |
+
pieces);
|
563 |
+
}
|
564 |
+
|
565 |
+
virtual util::bytes DecodeIdsAsSerializedProto(
|
566 |
+
const std::vector<int> &ids) const {
|
567 |
+
DEFINE_SPP_SERIALIZED_PROTO_IMPL(Decode, ImmutableSentencePieceText, ids);
|
568 |
+
}
|
569 |
+
|
570 |
+
//////////////////////////////////////////////////////////////
|
571 |
+
// ImmutableProto API.
|
572 |
+
virtual ImmutableSentencePieceText EncodeAsImmutableProto(
|
573 |
+
absl::string_view input) const {
|
574 |
+
DEFINE_SPP_IMMUTABLE_PROTO_IMPL(Encode, ImmutableSentencePieceText, input);
|
575 |
+
}
|
576 |
+
|
577 |
+
virtual ImmutableSentencePieceText SampleEncodeAsImmutableProto(
|
578 |
+
absl::string_view input, int nbest_size, float alpha) const {
|
579 |
+
DEFINE_SPP_IMMUTABLE_PROTO_IMPL(SampleEncode, ImmutableSentencePieceText,
|
580 |
+
input, nbest_size, alpha);
|
581 |
+
}
|
582 |
+
|
583 |
+
virtual ImmutableNBestSentencePieceText NBestEncodeAsImmutableProto(
|
584 |
+
absl::string_view input, int nbest_size) const {
|
585 |
+
DEFINE_SPP_IMMUTABLE_PROTO_IMPL(
|
586 |
+
NBestEncode, ImmutableNBestSentencePieceText, input, nbest_size);
|
587 |
+
}
|
588 |
+
|
589 |
+
virtual ImmutableNBestSentencePieceText SampleEncodeAndScoreAsImmutableProto(
|
590 |
+
absl::string_view input, int num_samples, float alpha, bool wor,
|
591 |
+
bool include_best) const {
|
592 |
+
DEFINE_SPP_IMMUTABLE_PROTO_IMPL(SampleEncodeAndScore,
|
593 |
+
ImmutableNBestSentencePieceText, input,
|
594 |
+
num_samples, alpha, wor, include_best);
|
595 |
+
}
|
596 |
+
|
597 |
+
// TODO(taku): Remove this API and use std::vector<std::string_view>
|
598 |
+
virtual ImmutableSentencePieceText DecodePiecesAsImmutableProto(
|
599 |
+
const std::vector<std::string> &pieces) const {
|
600 |
+
DEFINE_SPP_IMMUTABLE_PROTO_IMPL(Decode, ImmutableSentencePieceText, pieces);
|
601 |
+
}
|
602 |
+
|
603 |
+
virtual ImmutableSentencePieceText DecodePiecesAsImmutableProto(
|
604 |
+
const std::vector<absl::string_view> &pieces) const {
|
605 |
+
DEFINE_SPP_IMMUTABLE_PROTO_IMPL(Decode, ImmutableSentencePieceText, pieces);
|
606 |
+
}
|
607 |
+
|
608 |
+
virtual ImmutableSentencePieceText DecodeIdsAsImmutableProto(
|
609 |
+
const std::vector<int> &ids) const {
|
610 |
+
DEFINE_SPP_IMMUTABLE_PROTO_IMPL(Decode, ImmutableSentencePieceText, ids);
|
611 |
+
}
|
612 |
+
|
613 |
+
#undef DEFINE_SPP_DIRECT_FUNC_IMPL
|
614 |
+
#undef DEFINE_SPP_SERIALIZED_PROTO_IMPL
|
615 |
+
#undef DEFINE_SPP_IMMUTABLE_PROTO_IMPL
|
616 |
+
|
617 |
+
//////////////////////////////////////////////////////////////
|
618 |
+
// Vocabulary management methods.
|
619 |
+
//
|
620 |
+
// Returns the size of sentence pieces, which is the same as
|
621 |
+
// the size of vocabulary for NMT.
|
622 |
+
virtual int GetPieceSize() const;
|
623 |
+
|
624 |
+
// Returns the vocab id of `piece`.
|
625 |
+
// Returns UNK(0) if `piece` is unknown.
|
626 |
+
virtual int PieceToId(absl::string_view piece) const;
|
627 |
+
|
628 |
+
// Returns the string representation of vocab with `id`.
|
629 |
+
virtual const std::string &IdToPiece(int id) const;
|
630 |
+
|
631 |
+
// Returns the score of `id`.
|
632 |
+
// Usually score is an emission log probability of unigram language
|
633 |
+
// model.
|
634 |
+
virtual float GetScore(int id) const;
|
635 |
+
|
636 |
+
// Returns true if `id` is unknown symbol.
|
637 |
+
virtual bool IsUnknown(int id) const;
|
638 |
+
|
639 |
+
// Returns true if `id` is control symbol.
|
640 |
+
virtual bool IsControl(int id) const;
|
641 |
+
|
642 |
+
// Returns true if `id` is unused symbol.
|
643 |
+
virtual bool IsUnused(int id) const;
|
644 |
+
|
645 |
+
// Returns true if `id` is byte symbol.
|
646 |
+
virtual bool IsByte(int id) const;
|
647 |
+
|
648 |
+
// Returns the reserved id.
|
649 |
+
// Returns -1 if not defined.
|
650 |
+
|
651 |
+
// Returns unknown (<unk>) id.
|
652 |
+
virtual int unk_id() const;
|
653 |
+
|
654 |
+
// Returns BOS (<s>) id.
|
655 |
+
virtual int bos_id() const;
|
656 |
+
|
657 |
+
// Returns EOS (</s>) id.
|
658 |
+
virtual int eos_id() const;
|
659 |
+
|
660 |
+
// Returns PAD (<pad>) id.
|
661 |
+
virtual int pad_id() const;
|
662 |
+
|
663 |
+
//////////////////////////////////////////////////////////////
|
664 |
+
// Model management.
|
665 |
+
//
|
666 |
+
// Allows injection of a mock model instance. `model` is moved.
|
667 |
+
void SetModel(std::unique_ptr<ModelInterface> &&model);
|
668 |
+
|
669 |
+
// Allows injection of a normalizer instance. `normalizer` is moved.
|
670 |
+
void SetNormalizer(std::unique_ptr<normalizer::Normalizer> &&normalizer);
|
671 |
+
|
672 |
+
// Returns immutable model proto. Useful to obtain extended
|
673 |
+
// or experimental parameters encoded in model_proto.
|
674 |
+
const ModelProto &model_proto() const;
|
675 |
+
|
676 |
+
// returns immutable model proto as std::string.
|
677 |
+
// Useful to save the state of this instance via Python's pickle object.
|
678 |
+
util::bytes serialized_model_proto() const;
|
679 |
+
|
680 |
+
private:
|
681 |
+
enum ExtraOption { REVERSE, BOS, EOS, UNK_PIECE };
|
682 |
+
|
683 |
+
util::Status ParseExtraOptions(absl::string_view extra_option,
|
684 |
+
std::vector<ExtraOption> *extra_options) const;
|
685 |
+
|
686 |
+
util::Status ApplyExtraOptions(const std::vector<ExtraOption> &extra_options,
|
687 |
+
SentencePieceText *spt) const;
|
688 |
+
|
689 |
+
util::Status PopulateSentencePieceText(
|
690 |
+
absl::string_view input, absl::string_view normalized,
|
691 |
+
const std::vector<size_t> &norm_to_orig,
|
692 |
+
const std::vector<std::pair<absl::string_view, int>> &result,
|
693 |
+
SentencePieceText *spt) const;
|
694 |
+
|
695 |
+
std::unique_ptr<ModelInterface> model_;
|
696 |
+
std::unique_ptr<normalizer::Normalizer> normalizer_;
|
697 |
+
std::unique_ptr<normalizer::Normalizer> denormalizer_;
|
698 |
+
|
699 |
+
// Underlying model protocol buffer. The same lifetime as model_.
|
700 |
+
std::unique_ptr<ModelProto> model_proto_;
|
701 |
+
|
702 |
+
std::vector<ExtraOption> encode_extra_options_;
|
703 |
+
std::vector<ExtraOption> decode_extra_options_;
|
704 |
+
};
|
705 |
+
|
706 |
+
// Set seed value of random generator.
|
707 |
+
// Do not set static_cast<unique_int>(-1),
|
708 |
+
// as this seed is reserved for initializing from
|
709 |
+
// std::random_device.
|
710 |
+
void SetRandomGeneratorSeed(unsigned int seed);
|
711 |
+
|
712 |
+
// IO related functions to absorb model formats.
|
713 |
+
namespace io {
|
714 |
+
// Loads `model_proto` from `filename`.
|
715 |
+
// We can instantiate SentencePieceProcessor as follows:
|
716 |
+
//
|
717 |
+
// auto model_proto = absl::make_unique<ModelProto>();
|
718 |
+
// io::LoadModelProto("//path/spm.model", model_proto.get());
|
719 |
+
// SentencePieceProcessor sp;
|
720 |
+
// CHECK_OK(sp.Load(std::move(model_proto)));
|
721 |
+
util::Status LoadModelProto(absl::string_view, ModelProto *model_proto);
|
722 |
+
|
723 |
+
// Saves `model_proto` as `filename`.
|
724 |
+
util::Status SaveModelProto(absl::string_view, const ModelProto &model_proto);
|
725 |
+
} // namespace io
|
726 |
+
} // namespace sentencepiece
|
727 |
+
#endif // SENTENCEPIECE_PROCESSOR_H_
|
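The declarations above are the full public surface that the bundled libsentencepiece.a exposes to the ChatGLM2 demo, and, as the SWIG guards suggest, the same surface is what the `sentencepiece` pip package wraps for Python. A minimal sketch of the Encode/Decode, sampling, and vocabulary-management methods, assuming the pip package is installed and pointing it at the tokenizer.model shipped under ChatGLM2/support/tokenizer/ (not an official part of this repo's tooling):

import sentencepiece as spm

# Load the model; this reports an error if the file cannot be loaded
# (the C++ LoadOrDie variant crashes instead).
sp = spm.SentencePieceProcessor()
sp.Load("ChatGLM2/support/tokenizer/tokenizer.model")

# Simple Encode and Decode API.
pieces = sp.EncodeAsPieces("Hello world")  # e.g. ['▁Hello', '▁world']
ids = sp.EncodeAsIds("Hello world")
print(sp.DecodeIds(ids))                   # detokenizes back for plain text

# Sampling API: nbest_size/alpha behave as documented above (nbest or
# lattice sampling for unigram models, merge dropout for BPE models).
sampled = sp.SampleEncodeAsPieces("Hello world", -1, 0.1)

# Vocabulary management.
print(sp.GetPieceSize(), sp.unk_id(), sp.bos_id(), sp.eos_id(), sp.pad_id())
print(sp.PieceToId(pieces[0]), sp.IdToPiece(ids[0]))

The `*AsPieces`/`*AsIds` wrappers are exactly the `DEFINE_SPP_DIRECT_FUNC_IMPL` expansions: each calls the `util::Status`-returning overload, checks (or, under SWIG, rethrows) the status, and returns the output by value.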
ChatGLM2/support/lib_pcie/libbmlib.so
ADDED
Binary file (195 kB).

ChatGLM2/support/lib_pcie/libbmrt.so
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:621e33823dca470275e09570324a567ce4a30fa6100ac9e52742bb9e1ee02f45
size 2966400

ChatGLM2/support/lib_pcie/libbmrt.so.1.0
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:621e33823dca470275e09570324a567ce4a30fa6100ac9e52742bb9e1ee02f45
size 2966400

ChatGLM2/support/lib_pcie/libsentencepiece.a
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:68811cd99e6e1a58572372f14f3b7a02cf98bc98f5d46d24c406be65a94b53e8
size 2858304

ChatGLM2/support/lib_soc/libbmlib.so
ADDED
Binary file (191 kB).

ChatGLM2/support/lib_soc/libbmrt.so
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cff807807fcc8c6a9d16353e389422d434ae2b79c8bc191266d0eb5a69b3d97d
size 2915352

ChatGLM2/support/lib_soc/libbmrt.so.1.0
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cff807807fcc8c6a9d16353e389422d434ae2b79c8bc191266d0eb5a69b3d97d
size 2915352

ChatGLM2/support/lib_soc/libsentencepiece.a
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5b1c1ece6c62265ee879cf5876d31e82580c3ee88c2cb627b8ac3eaf35695bde
size 3032062
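Each of the library stubs above is a Git LFS pointer rather than the binary itself: three text lines carrying the spec version, the SHA-256 of the real blob, and its size in bytes; `git lfs pull` replaces them with the actual files. As a small sketch of inspecting such a pointer (the `read_lfs_pointer` helper is hypothetical, not part of this repo):

def read_lfs_pointer(path: str) -> dict:
    """Parse a Git LFS pointer file into its key/value fields."""
    fields = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    fields["size"] = int(fields["size"])  # size of the real blob, in bytes
    return fields

# read_lfs_pointer("ChatGLM2/support/lib_pcie/libbmrt.so") ->
# {'version': 'https://git-lfs.github.com/spec/v1',
#  'oid': 'sha256:621e33823dca470275e09570324a567ce4a30fa6100ac9e52742bb9e1ee02f45',
#  'size': 2966400}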
ChatGLM2/support/tokenizer/tokenization_chatglm.py
ADDED
@@ -0,0 +1,257 @@
import os
import torch
from typing import List, Optional, Union, Dict
from sentencepiece import SentencePieceProcessor
from transformers import PreTrainedTokenizer
from transformers.utils import logging, PaddingStrategy
from transformers.tokenization_utils_base import EncodedInput, BatchEncoding


class SPTokenizer:
    def __init__(self, model_path: str):
        # reload tokenizer
        assert os.path.isfile(model_path), model_path
        self.sp_model = SentencePieceProcessor(model_file=model_path)

        # BOS / EOS token IDs
        self.n_words: int = self.sp_model.vocab_size()
        self.bos_id: int = self.sp_model.bos_id()
        self.eos_id: int = self.sp_model.eos_id()
        self.pad_id: int = self.sp_model.unk_id()
        assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()

        special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"]
        self.special_tokens = {}
        self.index_special_tokens = {}
        for token in special_tokens:
            self.special_tokens[token] = self.n_words
            self.index_special_tokens[self.n_words] = token
            self.n_words += 1

    def tokenize(self, s: str):
        return self.sp_model.EncodeAsPieces(s)

    def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]:
        assert type(s) is str
        t = self.sp_model.encode(s)
        if bos:
            t = [self.bos_id] + t
        if eos:
            t = t + [self.eos_id]
        return t

    def decode(self, t: List[int]) -> str:
        return self.sp_model.decode(t)

    def decode_tokens(self, tokens: List[str]) -> str:
        text = self.sp_model.DecodePieces(tokens)
        return text

    def convert_token_to_id(self, token):
        """ Converts a token (str) to an id using the vocab. """
        if token in self.special_tokens:
            return self.special_tokens[token]
        return self.sp_model.PieceToId(token)

    def convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        if index in self.index_special_tokens or index in [self.eos_id, self.bos_id, self.pad_id] or index < 0:
            return ""
        return self.sp_model.IdToPiece(index)


class ChatGLMTokenizer(PreTrainedTokenizer):
    vocab_files_names = {"vocab_file": "tokenizer.model"}

    model_input_names = ["input_ids", "attention_mask", "position_ids"]

    def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs):
        self.name = "GLMTokenizer"

        self.vocab_file = vocab_file
        self.tokenizer = SPTokenizer(vocab_file)
        self.special_tokens = {
            "<bos>": self.tokenizer.bos_id,
            "<eos>": self.tokenizer.eos_id,
            "<pad>": self.tokenizer.pad_id
        }
        super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs)

    def get_command(self, token):
        if token in self.special_tokens:
            return self.special_tokens[token]
        assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}"
        return self.tokenizer.special_tokens[token]

    @property
    def unk_token(self) -> str:
        return "<unk>"

    @property
    def pad_token(self) -> str:
        return "<unk>"

    @property
    def pad_token_id(self):
        return self.get_command("<pad>")

    @property
    def eos_token(self) -> str:
        return "</s>"

    @property
    def eos_token_id(self):
        return self.get_command("<eos>")

    @property
    def vocab_size(self):
        return self.tokenizer.n_words

    def get_vocab(self):
        """ Returns vocab as a dict """
        vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text, **kwargs):
        return self.tokenizer.tokenize(text)

    def _convert_token_to_id(self, token):
        """ Converts a token (str) to an id using the vocab. """
        return self.tokenizer.convert_token_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        return self.tokenizer.convert_id_to_token(index)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        return self.tokenizer.decode_tokens(tokens)

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.
            filename_prefix (`str`, *optional*):
                An optional prefix to add to the name of the saved files.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(
                save_directory, self.vocab_files_names["vocab_file"]
            )
        else:
            vocab_file = save_directory

        with open(self.vocab_file, 'rb') as fin:
            proto_str = fin.read()

        with open(vocab_file, "wb") as writer:
            writer.write(proto_str)

        return (vocab_file,)

    def get_prefix_tokens(self):
        prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")]
        return prefix_tokens

    def build_prompt(self, query, history=None):
        if history is None:
            history = []
        prompt = ""
        for i, (old_query, response) in enumerate(history):
            prompt += "[Round {}]\n\n问:{}\n\n答:{}\n\n".format(i + 1, old_query, response)
        prompt += "[Round {}]\n\n问:{}\n\n答:".format(len(history) + 1, query)
        return prompt

    def build_inputs_with_special_tokens(
            self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A BERT sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        prefix_tokens = self.get_prefix_tokens()
        token_ids_0 = prefix_tokens + token_ids_0
        if token_ids_1 is not None:
            token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("<eos>")]
        return token_ids_0

    def _pad(
            self,
            encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
            max_length: Optional[int] = None,
            padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
            pad_to_multiple_of: Optional[int] = None,
            return_attention_mask: Optional[bool] = None,
    ) -> dict:
        """
        Pad encoded inputs (on left/right and up to predefined length or max length in the batch).

        Args:
            encoded_inputs:
                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
            max_length: maximum length of the returned list and optionally padding length (see below).
                Will truncate by taking into account the special tokens.
            padding_strategy: PaddingStrategy to use for padding.

                - PaddingStrategy.LONGEST: Pad to the longest sequence in the batch
                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                - PaddingStrategy.DO_NOT_PAD: Do not pad
                The tokenizer padding sides are defined in self.padding_side:

                    - 'left': pads on the left of the sequences
                    - 'right': pads on the right of the sequences
            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta).
            return_attention_mask:
                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
        """
        # Load from model defaults
        assert self.padding_side == "left"

        required_input = encoded_inputs[self.model_input_names[0]]
        seq_length = len(required_input)

        if padding_strategy == PaddingStrategy.LONGEST:
            max_length = len(required_input)

        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of

        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length

        # Initialize attention mask if not present.
        if "attention_mask" not in encoded_inputs:
            encoded_inputs["attention_mask"] = [1] * seq_length

        if "position_ids" not in encoded_inputs:
            encoded_inputs["position_ids"] = list(range(seq_length))

        if needs_to_be_padded:
            difference = max_length - len(required_input)

            if "attention_mask" in encoded_inputs:
                encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
            if "position_ids" in encoded_inputs:
                encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"]
            encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input

        return encoded_inputs
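Read together, these methods define the demo's input pipeline: `build_prompt` renders the chat history into the `[Round N]` template, encoding runs `_tokenize` followed by `build_inputs_with_special_tokens` (which prepends the `[gMASK]` and `sop` ids), and `_pad` pads on the left so that, in a batch, every prompt ends exactly where generation starts. A minimal sketch, assuming this file and the `tokenizer.model` below sit in the working directory and a transformers version contemporary with this code:

from tokenization_chatglm import ChatGLMTokenizer

tokenizer = ChatGLMTokenizer(vocab_file="tokenizer.model")

history = [("What is the capital of France?", "Paris.")]
prompt = tokenizer.build_prompt("How large is it?", history=history)
# prompt == "[Round 1]\n\n问:What is the capital of France?\n\n答:Paris.\n\n" \
#           "[Round 2]\n\n问:How large is it?\n\n答:"

# Encoding prepends the prefix tokens, so input_ids start with [gMASK], sop:
inputs = tokenizer(prompt)
assert inputs["input_ids"][:2] == tokenizer.get_prefix_tokens()

# Left padding: pad ids are prepended to input_ids, and zeros to the
# attention_mask and position_ids; nothing is appended on the right.
batch = tokenizer([prompt], padding="max_length", max_length=128)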
ChatGLM2/support/tokenizer/tokenizer.model
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2
size 1018370