#include <bits/stdc++.h> |
#include <bmlib_runtime.h> |
#include <bmruntime_interface.h> |
#include <getopt.h> |
#include <cstdio> |
#include "include/tokenizer.h" |
// Number of transformer blocks; init() resolves the networks "block_<i>" and
// "block_cache_<i>" for every i below this bound.
static constexpr int NUM_LAYERS = 40;

// Sequence capacity: sizes the prefill mask / position buffers and is the
// divisor used to derive the per-token stride of device-side caches.
static constexpr int MAX_LEN = 512;

// NOTE(review): not referenced in this file -- the mask buffers are filled
// with the literal -1000 instead. Confirm which value is intended.
static constexpr float ATTENTION_MASK = -10000.0f;

// NOTE(review): not referenced in this file.
static constexpr int num_heads = 48;

// Expands to nothing, so a use such as FYEL("text") reduces to the
// parenthesised literal ("text"). Presumably a placeholder for a colour
// escape wrapper -- confirm.
#define FYEL
/// Returns the number of whole milliseconds elapsed since `last`.
///
/// The current time is read from the same clock that produced `last`
/// (`Clock::now()`), so the function works for system_clock, steady_clock and
/// high_resolution_clock time points alike, independent of which clock
/// high_resolution_clock aliases on the current standard library.
///
/// @param last  Reference time point; may be const or a temporary.
/// @return      Elapsed time in milliseconds (negative if `last` lies in the
///              future of `Clock`).
template <typename Clock, typename Duration>
inline long long get_elapsed(
    const std::chrono::time_point<Clock, Duration>& last) {
  const auto elapsed = Clock::now() - last;
  return std::chrono::duration_cast<std::chrono::milliseconds>(elapsed)
      .count();
}
struct WizardCoder { |
WizardCoder() {} |
WizardCoder(const WizardCoder&) = delete; |
WizardCoder& operator=(const WizardCoder&) = delete; |
WizardCoder(WizardCoder&&) noexcept = default; |
WizardCoder& operator=(WizardCoder&&) noexcept = default; |
GPT2Tokenizer tokenizer; |
std::vector<bm_handle_t> handles; |
std::vector<int> dev_ids; |
int num_device; |
bm_handle_t handle; |
void* bmrt; |
struct WizardCoderEmbedding { |
bm_tensor_t input_ids_512, input_pos_512; |
bm_tensor_t input_ids_1, input_pos_1; |
bm_tensor_t hidden_states_512, hidden_states_1; |
} embedding; |
struct WizardCoderBlock { |
std::vector<bm_tensor_t> input_states; |
std::vector<bm_tensor_t> attention_mask; |
std::vector<bm_tensor_t> hidden_states; |
std::vector<bm_tensor_t> past_layers; |
}; |
struct WizardCoderBlockCache { |
std::vector<bm_tensor_t> input_states; |
std::vector<bm_tensor_t> past_cache; |
std::vector<bm_tensor_t> attention_mask; |
std::vector<bm_tensor_t> hidden_states; |
std::vector<bm_tensor_t> current_cache; |
}; |
std::vector<WizardCoderBlock> blocks; |
std::vector<WizardCoderBlockCache> blocks_cache; |
struct WizardCoderLmHead { |
bm_tensor_t hidden_states; |
bm_tensor_t token; |
} lm_head; |
int token_length; |
std::unordered_map<std::string_view, const bm_net_info_t*> networks; |
void move2end(const bm_tensor_t& cache); |
int forward_first(const std::vector<int>& token_ids); |
int forward_next(); |
void deinit(); |
void stream_generate(const std::vector<int>& input_ids, int max_new_length); |
std::string generate(const std::vector<int>& input_ids, int max_new_length); |
std::string build_prompt(std::string_view) const; |
void init(std::string_view, std::string_view, const std::vector<int>&); |
void answer(std::string_view, int max_new_length = 500); |
void chat(); |
}; |
void WizardCoder::init( |
std::string_view model_path, |
std::string_view vocab_path, |
const std::vector<int>& devids) { |
auto tokenizer = GPT2Tokenizer::from_pretrained(vocab_path); |
if (!tokenizer) { |
std::cerr << "No tokenizer\n"; |
} |
this->tokenizer = std::move(tokenizer.value()); |
num_device = devids.size(); |
blocks.resize(NUM_LAYERS); |
blocks_cache.resize(NUM_LAYERS); |
for (auto&& block : blocks) { |
block.attention_mask.resize(num_device); |
block.hidden_states.resize(num_device); |
block.input_states.resize(num_device); |
block.past_layers.resize(num_device); |
} |
for (auto&& block_cache : blocks_cache) { |
block_cache.current_cache.resize(num_device); |
block_cache.past_cache.resize(num_device); |
block_cache.attention_mask.resize(num_device); |
block_cache.hidden_states.resize(num_device); |
block_cache.input_states.resize(num_device); |
} |
for (auto id : devids) { |
bm_handle_t handle; |
if (bm_dev_request(&handle, id) != BM_SUCCESS) { |
std::cerr << "Error in bm_dev_request\n"; |
return; |
} |
handles.push_back(handle); |
} |
handle = handles[0]; |
auto& handle = handles[0]; |
if (!(bmrt = bmrt_create_ex(&handle, num_device))) { |
std::cerr << "Error in bmrt_create_ex\n"; |
return; |
} |
if (!bmrt_load_bmodel(bmrt, model_path.data())) { |
std::cerr << "Error in bmrt_load_bmodel\n"; |
return; |
} |
const char** network_names{nullptr}; |
bmrt_get_network_names(bmrt, &network_names); |
int num = bmrt_get_network_number(bmrt); |
for (int i = 0; i < num; i++) { |
networks[network_names[i]] = |
bmrt_get_network_info(bmrt, network_names[i]); |
} |
[&]() { |
bmrt_tensor( |
&embedding.input_ids_512, |
bmrt, |
networks["embedding"]->input_dtypes[0], |
networks["embedding"]->stages[1].input_shapes[0]); |
bmrt_tensor( |
&embedding.input_pos_512, |
bmrt, |
networks["embedding"]->input_dtypes[1], |
networks["embedding"]->stages[1].input_shapes[1]); |
bmrt_tensor( |
&embedding.hidden_states_512, |
bmrt, |
networks["embedding"]->output_dtypes[0], |
networks["embedding"]->stages[1].output_shapes[0]); |
bmrt_tensor( |
&embedding.input_ids_1, |
bmrt, |
networks["embedding"]->input_dtypes[0], |
networks["embedding"]->stages[0].input_shapes[0]); |
bmrt_tensor( |
&embedding.input_pos_1, |
bmrt, |
networks["embedding"]->input_dtypes[1], |
networks["embedding"]->stages[0].input_shapes[1]); |
bmrt_tensor( |
&embedding.hidden_states_1, |
bmrt, |
networks["embedding"]->output_dtypes[0], |
networks["embedding"]->stages[0].output_shapes[0]); |
}(); |
[&]() { |
for (int i = 0; i < NUM_LAYERS; i++) { |
auto name = std::string{"block_"} + std::to_string(i); |
auto block_net = bmrt_get_network_info(bmrt, name.c_str()); |
int in_num = block_net->input_num / num_device; |
int out_num = block_net->output_num / num_device; |
auto& block = blocks[i]; |
for (int j = 0; j < num_device; j++) { |
bmrt_tensor_ex( |
&block.input_states[j], |
bmrt, |
block_net->input_loc_devices[j * in_num + 0], |
block_net->input_dtypes[j * in_num + 0], |
block_net->stages[0].input_shapes[j * in_num + 0]); |
bmrt_tensor_ex( |
&block.attention_mask[j], |
bmrt, |
block_net->input_loc_devices[j * in_num + 1], |
block_net->input_dtypes[j * in_num + 1], |
block_net->stages[0].input_shapes[j * in_num + 1]); |
bmrt_tensor_ex( |
&block.hidden_states[j], |
bmrt, |
block_net->output_loc_devices[j * out_num + 0], |
block_net->output_dtypes[j * out_num + 0], |
block_net->stages[0].output_shapes[j * out_num + 0]); |
bmrt_tensor_ex( |
&block.past_layers[j], |
bmrt, |
block_net->output_loc_devices[j * out_num + 1], |
block_net->output_dtypes[j * out_num + 1], |
block_net->stages[0].output_shapes[j * out_num + 1]); |
} |
} |
}(); |
[&]() { |
for (int i = 0; i < NUM_LAYERS; i++) { |
auto name = std::string{"block_cache_"} + std::to_string(i); |
auto block_net = bmrt_get_network_info(bmrt, name.c_str()); |
int in_num = block_net->input_num / num_device; |
int out_num = block_net->output_num / num_device; |
auto& block = blocks_cache[i]; |
for (int j = 0; j < num_device; j++) { |
bmrt_tensor_ex( |
&block.input_states[j], |
bmrt, |
block_net->input_loc_devices[j * in_num + 0], |
block_net->input_dtypes[j * in_num + 0], |
block_net->stages[0].input_shapes[j * in_num + 0]); |
bmrt_tensor_ex( |
&block.past_cache[j], |
bmrt, |
block_net->input_loc_devices[j * in_num + 1], |
block_net->input_dtypes[j * in_num + 1], |
block_net->stages[0].input_shapes[j * in_num + 1]); |
bmrt_tensor_ex( |
&block.attention_mask[j], |
bmrt, |
block_net->input_loc_devices[j * in_num + 2], |
block_net->input_dtypes[j * in_num + 2], |
block_net->stages[0].input_shapes[j * in_num + 2]); |
bmrt_tensor_ex( |
&block.hidden_states[j], |
bmrt, |
block_net->output_loc_devices[j * out_num + 0], |
block_net->output_dtypes[j * out_num + 0], |
block_net->stages[0].output_shapes[j * out_num + 0]); |
bmrt_tensor_ex( |
&block.current_cache[j], |
bmrt, |
block_net->output_loc_devices[j * out_num + 1], |
block_net->output_dtypes[j * out_num + 1], |
block_net->stages[0].output_shapes[j * out_num + 1]); |
} |
} |
}(); |
[&]() { |
auto lm_head = bmrt_get_network_info(bmrt, "lm_head"); |
bmrt_tensor( |
&this->lm_head.hidden_states, |
bmrt, |
lm_head->input_dtypes[0], |
lm_head->stages[0].input_shapes[0]); |
bmrt_tensor( |
&this->lm_head.token, |
bmrt, |
lm_head->output_dtypes[0], |
lm_head->stages[0].output_shapes[0]); |
}(); |
return; |
} |
int WizardCoder::forward_first(const std::vector<int>& token_ids) { |
token_length = token_ids.size(); |
auto attention_mask = std::make_unique<float[]>(MAX_LEN * MAX_LEN); |
auto position_id = std::make_unique<int[]>(MAX_LEN); |
for (int i = 0; i < MAX_LEN; i++) { |
for (int j = i + 1; j < MAX_LEN; j++) |
attention_mask[j + i * MAX_LEN] = -1000.0; |
if (i < token_length) position_id[i] = i; |
} |
std::vector<int> one_input_nums{1}; |
std::vector<int> num_device_inputs_nums(num_device, 1); |
std::vector<void*> pos_id_data{position_id.get()}; |
std::vector<void*> tok_id_data{(void*)token_ids.data()}; |
std::vector<void*> attention_mask_data( |
num_device, (void*)attention_mask.get()); |
bmrt_memcpy_s2d_parallel( |
bmrt, |
&embedding.input_ids_512, |
tok_id_data.data(), |
one_input_nums.data(), |
1); |
bmrt_memcpy_s2d_parallel( |
bmrt, |
&embedding.input_pos_512, |
pos_id_data.data(), |
one_input_nums.data(), |
1); |
bmrt_memcpy_s2d_parallel( |
bmrt, |
blocks.begin()->attention_mask.data(), |
attention_mask_data.data(), |
num_device_inputs_nums.data(), |
num_device); |
bm_tensor_t input_blocks[] = { |
embedding.input_ids_512, embedding.input_pos_512}; |
bmrt_launch_tensor_ex( |
bmrt, |
"embedding", |
input_blocks, |
2, |
&embedding.hidden_states_512, |
1, |
true, |
false); |
bm_thread_sync(handle); |
std::vector<bm_tensor_t> inputs_block; |
std::vector<bm_tensor_t> outputs_block; |
for (int i = 0; i < num_device; i++) { |
embedding.hidden_states_512.shape = blocks[0].input_states[0].shape; |
inputs_block.push_back(embedding.hidden_states_512); |
inputs_block.push_back(blocks[0].attention_mask[i]); |
outputs_block.push_back(embedding.hidden_states_512); |
outputs_block.push_back(blocks[0].past_layers[i]); |
} |
for (int i = 0; i < NUM_LAYERS; i++) { |
auto name = std::string{"block_"} + std::to_string(i); |
for (int j = 0; j < num_device; j++) { |
outputs_block[1] = blocks[i].past_layers[j]; |
} |
bmrt_launch_tensor_ex( |
bmrt, |
name.c_str(), |
inputs_block.data(), |
inputs_block.size(), |
outputs_block.data(), |
outputs_block.size(), |
true, |
false); |
bm_thread_sync(handle); |
for (int j = 0; j < num_device; j++) { |
move2end(blocks[i].past_layers[j]); |
} |
bm_thread_sync(handle); |
} |
auto bytes = |
bm_mem_get_device_size(embedding.hidden_states_512.device_mem) / |
bm_memcpy_d2d_byte( |
handle, |
lm_head.hidden_states.device_mem, |
0, |
embedding.hidden_states_512.device_mem, |
(token_length - 1) * bytes, |
bytes); |
bmrt_launch_tensor_ex( |
bmrt, |
"lm_head", |
&lm_head.hidden_states, |
1, |
&lm_head.token, |
1, |
true, |
false); |
int token = 0; |
bm_memcpy_d2s(handle, &token, lm_head.token.device_mem); |
++token_length; |
return token; |
} |
void WizardCoder::move2end(const bm_tensor_t& cache) { |
auto sz = bm_mem_get_device_size(cache.device_mem); |
auto bytes = sz / MAX_LEN; |
auto len = token_length * bytes; |
bm_memcpy_d2d(handle, cache.device_mem, sz - len, cache.device_mem, 0, len); |
} |
int WizardCoder::forward_next() { |
int pid = token_length - 1; |
std::vector<void*> input_pid_data{&pid}; |
std::vector<int> embedding_inputs_num{1}; |
bmrt_memcpy_s2d_parallel( |
bmrt, |
&embedding.input_pos_1, |
input_pid_data.data(), |
embedding_inputs_num.data(), |
1); |
bmrt_tensor_with_device( |
&embedding.input_ids_1, |
lm_head.token.device_mem, |
embedding.input_ids_1.dtype, |
embedding.input_ids_1.shape); |
bm_tensor_t input_blocks[] = {embedding.input_ids_1, embedding.input_pos_1}; |
bmrt_launch_tensor_ex( |
bmrt, |
"embedding", |
input_blocks, |
2, |
&embedding.hidden_states_1, |
1, |
true, |
false); |
bm_thread_sync(handle); |
auto attention_mask = std::make_unique<float[]>(1 + MAX_LEN); |
for (int i = 0; i < MAX_LEN - token_length + 1; i++) |
attention_mask[i] = -1000; |
std::vector<int> input_nums(num_device, 1); |
std::vector<void*> attention_mask_data(num_device, attention_mask.get()); |
bmrt_memcpy_s2d_parallel( |
bmrt, |
blocks_cache.begin()->attention_mask.data(), |
attention_mask_data.data(), |
input_nums.data(), |
num_device); |
std::vector<bm_tensor_t> inputs_block; |
std::vector<bm_tensor_t> outputs_block; |
for (int i = 0; i < num_device; i++) { |
inputs_block.push_back(embedding.hidden_states_1); |
inputs_block.push_back(blocks[0].past_layers[i]); |
inputs_block.push_back(blocks_cache[0].attention_mask[i]); |
outputs_block.push_back(embedding.hidden_states_1); |
outputs_block.push_back(blocks_cache[0].current_cache[i]); |
} |
for (int i = 0; i < NUM_LAYERS; i++) { |
auto name = std::string{"block_cache_"} + std::to_string(i); |
for (int j = 0; j < num_device; j++) { |
inputs_block[1] = blocks[i].past_layers[j]; |
outputs_block[1] = blocks_cache[i].current_cache[j]; |
} |
bmrt_launch_tensor_ex( |
bmrt, |
name.c_str(), |
inputs_block.data(), |
inputs_block.size(), |
outputs_block.data(), |
outputs_block.size(), |
true, |
false); |
bm_thread_sync(handle); |
auto totalsize = bm_mem_get_device_size( |
blocks_cache[0].current_cache[0].device_mem) / |
513; |
for (int j = 0; j < num_device; j++) { |
bm_memcpy_d2d( |
handle, |
blocks[i].past_layers[j].device_mem, |
0, |
blocks_cache[i].current_cache[j].device_mem, |
totalsize, |
totalsize * 512); |
} |
} |
bmrt_launch_tensor_ex( |
bmrt, |
"lm_head", |
&embedding.hidden_states_1, |
1, |
&lm_head.token, |
1, |
true, |
false); |
bm_thread_sync(handle); |
int token = 0; |
++token_length; |
bm_memcpy_d2s(handle, &token, lm_head.token.device_mem); |
return token; |
} |
/// Wraps `input_str` in the fixed instruction / response template:
/// preamble, "### Instruction:" header, the user text, then the
/// "### Response:" header the model is expected to continue from.
std::string WizardCoder::build_prompt(std::string_view input_str) const {
  std::string prompt{
      "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n"
      "### Instruction:\n"};
  prompt.append(input_str);
  prompt += "\n\n### Response:";
  return prompt;
}
void WizardCoder::stream_generate( |
const std::vector<int>& input_ids, |
int max_new_length) { |
int cnt = 1; |
auto const input_token_len = input_ids.size(); |
auto start_time = std::chrono::high_resolution_clock::now(); |
auto token = forward_first(input_ids); |
auto FTL = get_elapsed(start_time); |
start_time = std::chrono::high_resolution_clock::now(); |
while (++cnt < max_new_length && cnt + input_token_len <= MAX_LEN) { |
auto result = tokenizer.decode_id(token, true); |
if (result == "<|endoftext|>") break; |
std::cout << result << std::flush; |
token = forward_next(); |
} |
auto total = get_elapsed(start_time); |
std::cout << FYEL("\n\nInference Time: ") << (total + FTL) |
<< FYEL(" ms\nToken: ") << cnt << FYEL(" FTL: ") << FTL |
<< FYEL(" ms\nRate: ") << (cnt - 1) * 1000.0 / total |
<< FYEL(" Token/Sec\n"); |
} |
void WizardCoder::answer(std::string_view input_str, int max_new_length) { |
auto prompt = build_prompt(input_str); |
auto input_ids = tokenizer.encode(prompt); |
stream_generate(input_ids, max_new_length); |
} |
void WizardCoder::chat() { |
while (true) { |
std::cout << "\nQuestion: "; |
std::string input_str; |
std::getline(std::cin, input_str); |
if (input_str == "exit") { |
break; |
} |
std::cout << "\nAnswer: " << std::flush; |
answer(input_str); |
std::cout << std::endl; |
} |
} |
/// Splits `s` at every character that occurs in `delim` and appends the
/// pieces to `ret` (existing elements are kept).
///
/// Empty pieces between or before separators are preserved; a trailing empty
/// piece (input ending in a separator, or empty input) is not appended.
static void split(
    const std::string& s,
    const std::string& delim,
    std::vector<std::string>& ret) {
  std::string::size_type token_begin = 0;
  for (auto hit = s.find_first_of(delim, token_begin);
       hit != std::string::npos;
       hit = s.find_first_of(delim, token_begin)) {
    ret.emplace_back(s, token_begin, hit - token_begin);
    token_begin = hit + 1;
  }
  // Whatever follows the last separator, unless nothing is left.
  if (token_begin < s.length()) {
    ret.emplace_back(s, token_begin, std::string::npos);
  }
}
/// Parses a comma-separated list of device ids such as "0,1,2".
///
/// Every field must be a complete, non-negative decimal integer that fits in
/// an int; unlike std::atoi, malformed fields are not silently turned into 0
/// and out-of-range values do not invoke undefined behaviour.
///
/// @param str  Comma-separated device ids.
/// @return     The ids in input order.
/// @throws std::invalid_argument if a field is empty, non-numeric, negative
///         or out of range.
static std::vector<int> parseCascadeDevices(const std::string& str) {
  std::vector<std::string> fields;
  split(str, ",", fields);

  std::vector<int> devices;
  devices.reserve(fields.size());
  for (const auto& field : fields) {
    errno = 0;
    char* end = nullptr;
    const long value = std::strtol(field.c_str(), &end, 10);
    // The whole field must have been consumed and at least one digit read.
    const bool fully_parsed = end != field.c_str() && *end == '\0';
    if (!fully_parsed || errno == ERANGE || value < 0 ||
        value > std::numeric_limits<int>::max()) {
      throw std::invalid_argument("invalid device id: '" + field + "'");
    }
    devices.push_back(static_cast<int>(value));
  }
  return devices;
}
void processArguments( |
int argc, |
char* argv[], |
std::string& model_path, |
std::string& vocab_path, |
std::vector<int>& devices) { |
struct option longOptions[] = { |
{"model", required_argument, nullptr, 'm'}, |
{"devid", required_argument, nullptr, 'd'}, |
{"vocab", required_argument, nullptr, 'v'}, |
{nullptr, 0, nullptr, 0}}; |
int optionIndex = 0; |
int option; |
while ((option = getopt_long( |
argc, argv, "m:d:v:", longOptions, &optionIndex)) != -1) { |
switch (option) { |
case 'm': |
model_path = optarg; |
break; |
case 'd': |
devices = parseCascadeDevices(optarg); |
break; |
case 'v': |
vocab_path = optarg; |
break; |
case '?': |
default: |
} |
} |
} |
int main(int argc, char** argv) { |
printf("Demo for Wizardcoder-15B in BM1684X\n"); |
std::string model_path = "wizardcoder-15b_int4_1dev.bmodel"; |
std::string vocab_path = "../vocab/vocab.json"; |
std::vector<int> devices = {0}; |
processArguments(argc, argv, model_path, vocab_path, devices); |
printf("Init Environment ...\n"); |
WizardCoder model; |
model.init(model_path, vocab_path, devices); |
model.chat(); |
return 0; |
} |