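// WizardCoder-15B inference demo for the Sophgo BM1684X, driven through the
// bmruntime C API. The model is split into an embedding network, 40
// transformer blocks (with separate prefill and cached-decode variants), and
// an lm_head, all loaded from a single .bmodel file.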
#include <bits/stdc++.h>
#include <bmlib_runtime.h>
#include <bmruntime_interface.h>
#include <getopt.h>
#include <cstdio>
#include "include/tokenizer.h"
static const int NUM_LAYERS = 40;  // transformer blocks in WizardCoder-15B
static const int MAX_LEN = 512;  // sequence length the bmodel was compiled for
static const float ATTENTION_MASK = -10000.f;  // additive bias for masked positions
static const int num_heads = 48;  // attention heads in WizardCoder-15B
// Yellow ANSI highlight for the timing statistics printed after generation.
#define FYEL(x) "\033[0;33m" x "\033[0m"
// Milliseconds elapsed since `last`, a time point taken from
// std::chrono::high_resolution_clock.
inline long long get_elapsed(
const std::chrono::high_resolution_clock::time_point& last) {
auto now = std::chrono::high_resolution_clock::now();
return std::chrono::duration_cast<std::chrono::milliseconds>(now - last)
.count();
}
struct WizardCoder {
WizardCoder() {}
WizardCoder(const WizardCoder&) = delete;
WizardCoder& operator=(const WizardCoder&) = delete;
WizardCoder(WizardCoder&&) noexcept = default;
WizardCoder& operator=(WizardCoder&&) noexcept = default;
GPT2Tokenizer tokenizer;
std::vector<bm_handle_t> handles;
std::vector<int> dev_ids;
int num_device;
bm_handle_t handle;
void* bmrt;
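// I/O tensors for the "embedding" network. Stage 1 of the bmodel handles
// the full 512-token prefill; stage 0 handles single-token decode steps.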
struct WizardCoderEmbedding {
bm_tensor_t input_ids_512, input_pos_512;
bm_tensor_t input_ids_1, input_pos_1;
bm_tensor_t hidden_states_512, hidden_states_1;
} embedding;
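// Per-device I/O for one prefill block: hidden states and mask in,
// updated hidden states and the layer's KV cache out.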
struct WizardCoderBlock {
std::vector<bm_tensor_t> input_states;
std::vector<bm_tensor_t> attention_mask;
std::vector<bm_tensor_t> hidden_states;
std::vector<bm_tensor_t> past_layers;
};
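// Per-device I/O for one cached-decode block: the single-token hidden
// state, the past KV cache, and a mask in; the new hidden state and the
// freshly produced KV entry out.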
struct WizardCoderBlockCache {
std::vector<bm_tensor_t> input_states;
std::vector<bm_tensor_t> past_cache;
std::vector<bm_tensor_t> attention_mask;
std::vector<bm_tensor_t> hidden_states;
std::vector<bm_tensor_t> current_cache;
};
std::vector<WizardCoderBlock> blocks;
std::vector<WizardCoderBlockCache> blocks_cache;
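// lm_head maps the last token's hidden state to the next token id.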
struct WizardCoderLmHead {
bm_tensor_t hidden_states;
bm_tensor_t token;
} lm_head;
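// Current sequence length: prompt tokens plus tokens generated so far.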
int token_length;
std::unordered_map<std::string_view, const bm_net_info_t*> networks;
void move2end(const bm_tensor_t& cache);
int forward_first(const std::vector<int>& token_ids);
int forward_next();
void deinit();
void stream_generate(const std::vector<int>& input_ids, int max_new_length);
std::string generate(const std::vector<int>& input_ids, int max_new_length);
std::string build_prompt(std::string_view) const;
void init(std::string_view, std::string_view, const std::vector<int>&);
void answer(std::string_view, int max_new_length = 500);
void chat();
};
void WizardCoder::init(
std::string_view model_path,
std::string_view vocab_path,
const std::vector<int>& devids) {
auto tokenizer = GPT2Tokenizer::from_pretrained(vocab_path);
if (!tokenizer) {
std::cerr << "No tokenizer\n";
return;
}
this->tokenizer = std::move(tokenizer.value());
num_device = devids.size();
blocks.resize(NUM_LAYERS);
blocks_cache.resize(NUM_LAYERS);
for (auto&& block : blocks) {
block.attention_mask.resize(num_device);
block.hidden_states.resize(num_device);
block.input_states.resize(num_device);
block.past_layers.resize(num_device);
}
for (auto&& block_cache : blocks_cache) {
block_cache.current_cache.resize(num_device);
block_cache.past_cache.resize(num_device);
block_cache.attention_mask.resize(num_device);
block_cache.hidden_states.resize(num_device);
block_cache.input_states.resize(num_device);
}
for (auto id : devids) {
bm_handle_t handle;
if (bm_dev_request(&handle, id) != BM_SUCCESS) {
std::cerr << "Error in bm_dev_request\n";
return;
}
handles.push_back(handle);
}
handle = handles[0];
// bmrt_create_ex expects the whole array of device handles.
if (!(bmrt = bmrt_create_ex(handles.data(), num_device))) {
std::cerr << "Error in bmrt_create_ex\n";
return;
}
if (!bmrt_load_bmodel(bmrt, model_path.data())) {
std::cerr << "Error in bmrt_load_bmodel\n";
return;
}
const char** network_names{nullptr};
bmrt_get_network_names(bmrt, &network_names);
int num = bmrt_get_network_number(bmrt);
for (int i = 0; i < num; i++) {
networks[network_names[i]] =
bmrt_get_network_info(bmrt, network_names[i]);
}
free(network_names);  // the name array itself is heap-allocated for the caller
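// Allocate embedding I/O tensors for both stages (512-token prefill and
// 1-token decode).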
[&]() {
bmrt_tensor(
&embedding.input_ids_512,
bmrt,
networks["embedding"]->input_dtypes[0],
networks["embedding"]->stages[1].input_shapes[0]);
bmrt_tensor(
&embedding.input_pos_512,
bmrt,
networks["embedding"]->input_dtypes[1],
networks["embedding"]->stages[1].input_shapes[1]);
bmrt_tensor(
&embedding.hidden_states_512,
bmrt,
networks["embedding"]->output_dtypes[0],
networks["embedding"]->stages[1].output_shapes[0]);
bmrt_tensor(
&embedding.input_ids_1,
bmrt,
networks["embedding"]->input_dtypes[0],
networks["embedding"]->stages[0].input_shapes[0]);
bmrt_tensor(
&embedding.input_pos_1,
bmrt,
networks["embedding"]->input_dtypes[1],
networks["embedding"]->stages[0].input_shapes[1]);
bmrt_tensor(
&embedding.hidden_states_1,
bmrt,
networks["embedding"]->output_dtypes[0],
networks["embedding"]->stages[0].output_shapes[0]);
}();
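// Allocate per-device I/O tensors for the 40 prefill blocks, placing each
// tensor on the device that the corresponding network slice runs on.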
[&]() {
for (int i = 0; i < NUM_LAYERS; i++) {
auto name = std::string{"block_"} + std::to_string(i);
auto block_net = bmrt_get_network_info(bmrt, name.c_str());
int in_num = block_net->input_num / num_device;
int out_num = block_net->output_num / num_device;
auto& block = blocks[i];
for (int j = 0; j < num_device; j++) {
bmrt_tensor_ex(
&block.input_states[j],
bmrt,
block_net->input_loc_devices[j * in_num + 0],
block_net->input_dtypes[j * in_num + 0],
block_net->stages[0].input_shapes[j * in_num + 0]);
bmrt_tensor_ex(
&block.attention_mask[j],
bmrt,
block_net->input_loc_devices[j * in_num + 1],
block_net->input_dtypes[j * in_num + 1],
block_net->stages[0].input_shapes[j * in_num + 1]);
bmrt_tensor_ex(
&block.hidden_states[j],
bmrt,
block_net->output_loc_devices[j * out_num + 0],
block_net->output_dtypes[j * out_num + 0],
block_net->stages[0].output_shapes[j * out_num + 0]);
bmrt_tensor_ex(
&block.past_layers[j],
bmrt,
block_net->output_loc_devices[j * out_num + 1],
block_net->output_dtypes[j * out_num + 1],
block_net->stages[0].output_shapes[j * out_num + 1]);
}
}
}();
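// Same allocation for the cached-decode variants of the blocks.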
[&]() {
for (int i = 0; i < NUM_LAYERS; i++) {
auto name = std::string{"block_cache_"} + std::to_string(i);
auto block_net = bmrt_get_network_info(bmrt, name.c_str());
int in_num = block_net->input_num / num_device;
int out_num = block_net->output_num / num_device;
auto& block = blocks_cache[i];
for (int j = 0; j < num_device; j++) {
bmrt_tensor_ex(
&block.input_states[j],
bmrt,
block_net->input_loc_devices[j * in_num + 0],
block_net->input_dtypes[j * in_num + 0],
block_net->stages[0].input_shapes[j * in_num + 0]);
bmrt_tensor_ex(
&block.past_cache[j],
bmrt,
block_net->input_loc_devices[j * in_num + 1],
block_net->input_dtypes[j * in_num + 1],
block_net->stages[0].input_shapes[j * in_num + 1]);
bmrt_tensor_ex(
&block.attention_mask[j],
bmrt,
block_net->input_loc_devices[j * in_num + 2],
block_net->input_dtypes[j * in_num + 2],
block_net->stages[0].input_shapes[j * in_num + 2]);
bmrt_tensor_ex(
&block.hidden_states[j],
bmrt,
block_net->output_loc_devices[j * out_num + 0],
block_net->output_dtypes[j * out_num + 0],
block_net->stages[0].output_shapes[j * out_num + 0]);
bmrt_tensor_ex(
&block.current_cache[j],
bmrt,
block_net->output_loc_devices[j * out_num + 1],
block_net->output_dtypes[j * out_num + 1],
block_net->stages[0].output_shapes[j * out_num + 1]);
}
}
}();
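// Allocate lm_head I/O tensors.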
[&]() {
auto lm_head_net = bmrt_get_network_info(bmrt, "lm_head");
bmrt_tensor(
&lm_head.hidden_states,
bmrt,
lm_head_net->input_dtypes[0],
lm_head_net->stages[0].input_shapes[0]);
bmrt_tensor(
&lm_head.token,
bmrt,
lm_head_net->output_dtypes[0],
lm_head_net->stages[0].output_shapes[0]);
}();
return;
}
int WizardCoder::forward_first(const std::vector<int>& token_ids) {
token_length = token_ids.size();
// Pad the prompt into a full MAX_LEN buffer so the 512-token embedding
// stage never reads past the end of the caller's vector.
std::vector<int> input_ids(MAX_LEN, 0);
std::copy(token_ids.begin(), token_ids.end(), input_ids.begin());
// Causal mask: every future position (j > i) gets a large negative bias.
auto attention_mask = std::make_unique<float[]>(MAX_LEN * MAX_LEN);
auto position_id = std::make_unique<int[]>(MAX_LEN);
for (int i = 0; i < MAX_LEN; i++) {
for (int j = i + 1; j < MAX_LEN; j++)
attention_mask[j + i * MAX_LEN] = ATTENTION_MASK;
if (i < token_length) position_id[i] = i;
}
std::vector<int> one_input_nums{1};
std::vector<int> num_device_inputs_nums(num_device, 1);
std::vector<void*> pos_id_data{position_id.get()};
std::vector<void*> tok_id_data{input_ids.data()};
std::vector<void*> attention_mask_data(
num_device, (void*)attention_mask.get());
bmrt_memcpy_s2d_parallel(
bmrt,
&embedding.input_ids_512,
tok_id_data.data(),
one_input_nums.data(),
1);
bmrt_memcpy_s2d_parallel(
bmrt,
&embedding.input_pos_512,
pos_id_data.data(),
one_input_nums.data(),
1);
bmrt_memcpy_s2d_parallel(
bmrt,
blocks.begin()->attention_mask.data(),
attention_mask_data.data(),
num_device_inputs_nums.data(),
num_device);
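// Run the embedding network over the whole padded prompt.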
bm_tensor_t input_blocks[] = {
embedding.input_ids_512, embedding.input_pos_512};
bmrt_launch_tensor_ex(
bmrt,
"embedding",
input_blocks,
2,
&embedding.hidden_states_512,
1,
true,
false);
bm_thread_sync(handle);
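// Assemble per-device I/O lists once; the prompt's hidden-state buffer is
// reused in place as each block's input and output.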
std::vector<bm_tensor_t> inputs_block;
std::vector<bm_tensor_t> outputs_block;
for (int i = 0; i < num_device; i++) {
embedding.hidden_states_512.shape = blocks[0].input_states[0].shape;
inputs_block.push_back(embedding.hidden_states_512);
inputs_block.push_back(blocks[0].attention_mask[i]);
outputs_block.push_back(embedding.hidden_states_512);
outputs_block.push_back(blocks[0].past_layers[i]);
}
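// Run all prefill blocks, swapping in each layer's KV-cache output slot,
// then shift the valid cache rows to the end of the 512-slot buffer.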
for (int i = 0; i < NUM_LAYERS; i++) {
auto name = std::string{"block_"} + std::to_string(i);
for (int j = 0; j < num_device; j++) {
// Each device contributes a (hidden, kv) pair; index its kv slot.
outputs_block[2 * j + 1] = blocks[i].past_layers[j];
}
bmrt_launch_tensor_ex(
bmrt,
name.c_str(),
inputs_block.data(),
inputs_block.size(),
outputs_block.data(),
outputs_block.size(),
true,
false);
bm_thread_sync(handle);
for (int j = 0; j < num_device; j++) {
move2end(blocks[i].past_layers[j]);
}
bm_thread_sync(handle);
}
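// Copy the hidden state of the last prompt token into lm_head's input and
// run lm_head to get the next token id.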
auto bytes =
bm_mem_get_device_size(embedding.hidden_states_512.device_mem) /
MAX_LEN;
bm_memcpy_d2d_byte(
handle,
lm_head.hidden_states.device_mem,
0,
embedding.hidden_states_512.device_mem,
(token_length - 1) * bytes,
bytes);
bmrt_launch_tensor_ex(
bmrt,
"lm_head",
&lm_head.hidden_states,
1,
&lm_head.token,
1,
true,
false);
bm_thread_sync(handle);
int token = 0;
bm_memcpy_d2s(handle, &token, lm_head.token.device_mem);
++token_length;
return token;
}
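// Shift a 512-slot KV-cache tensor so its token_length valid rows end up at
// the back of the buffer, where the decode blocks expect them.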
void WizardCoder::move2end(const bm_tensor_t& cache) {
auto sz = bm_mem_get_device_size(cache.device_mem);
auto bytes = sz / MAX_LEN;
auto len = token_length * bytes;
bm_memcpy_d2d(handle, cache.device_mem, sz - len, cache.device_mem, 0, len);
}
int WizardCoder::forward_next() {
int pid = token_length - 1;
std::vector<void*> input_pid_data{&pid};
std::vector<int> embedding_inputs_num{1};
bmrt_memcpy_s2d_parallel(
bmrt,
&embedding.input_pos_1,
input_pid_data.data(),
embedding_inputs_num.data(),
1);
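// Feed lm_head's previous output straight back in as the next input id,
// without copying the token through host memory.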
bmrt_tensor_with_device(
&embedding.input_ids_1,
lm_head.token.device_mem,
embedding.input_ids_1.dtype,
embedding.input_ids_1.shape);
bm_tensor_t input_blocks[] = {embedding.input_ids_1, embedding.input_pos_1};
bmrt_launch_tensor_ex(
bmrt,
"embedding",
input_blocks,
2,
&embedding.hidden_states_1,
1,
true,
false);
bm_thread_sync(handle);
// Decode-step mask: valid cache entries sit at the end of the 512-slot
// buffer, so mask the (MAX_LEN - token_length + 1) stale slots in front.
auto attention_mask = std::make_unique<float[]>(MAX_LEN + 1);
for (int i = 0; i < MAX_LEN - token_length + 1; i++)
attention_mask[i] = ATTENTION_MASK;
std::vector<int> input_nums(num_device, 1);
std::vector<void*> attention_mask_data(num_device, attention_mask.get());
bmrt_memcpy_s2d_parallel(
bmrt,
blocks_cache.begin()->attention_mask.data(),
attention_mask_data.data(),
input_nums.data(),
num_device);
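// Per-device I/O for the cache blocks: hidden state, past KV, and mask in;
// hidden state and the new KV entry out.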
std::vector<bm_tensor_t> inputs_block;
std::vector<bm_tensor_t> outputs_block;
for (int i = 0; i < num_device; i++) {
inputs_block.push_back(embedding.hidden_states_1);
inputs_block.push_back(blocks[0].past_layers[i]);
inputs_block.push_back(blocks_cache[0].attention_mask[i]);
outputs_block.push_back(embedding.hidden_states_1);
outputs_block.push_back(blocks_cache[0].current_cache[i]);
}
for (int i = 0; i < NUM_LAYERS; i++) {
auto name = std::string{"block_cache_"} + std::to_string(i);
for (int j = 0; j < num_device; j++) {
// Three inputs and two outputs per device; index each device's slots.
inputs_block[3 * j + 1] = blocks[i].past_layers[j];
outputs_block[2 * j + 1] = blocks_cache[i].current_cache[j];
}
bmrt_launch_tensor_ex(
bmrt,
name.c_str(),
inputs_block.data(),
inputs_block.size(),
outputs_block.data(),
outputs_block.size(),
true,
false);
bm_thread_sync(handle);
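// The cache output holds 513 rows (512 past + 1 new). Drop the oldest
// row by copying the last 512 back over the past-cache buffer.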
auto totalsize = bm_mem_get_device_size(
blocks_cache[0].current_cache[0].device_mem) /
513;
for (int j = 0; j < num_device; j++) {
bm_memcpy_d2d(
handle,
blocks[i].past_layers[j].device_mem,
0,
blocks_cache[i].current_cache[j].device_mem,
totalsize,
totalsize * 512);
}
}
bmrt_launch_tensor_ex(
bmrt,
"lm_head",
&embedding.hidden_states_1,
1,
&lm_head.token,
1,
true,
false);
bm_thread_sync(handle);
int token = 0;
++token_length;
bm_memcpy_d2s(handle, &token, lm_head.token.device_mem);
return token;
}
std::string WizardCoder::build_prompt(std::string_view input_str) const {
return "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n"
"### Instruction:\n" +
std::string{input_str} + "\n\n### Response:";
}
void WizardCoder::stream_generate(
const std::vector<int>& input_ids,
int max_new_length) {
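// Streaming generation loop: one prefill pass, then single-token decode
// steps until EOS, the length budget, or the 512-token context runs out.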
int cnt = 1;
auto const input_token_len = input_ids.size();
auto start_time = std::chrono::high_resolution_clock::now();
auto token = forward_first(input_ids);
auto FTL = get_elapsed(start_time);
start_time = std::chrono::high_resolution_clock::now();
while (++cnt < max_new_length && cnt + input_token_len <= MAX_LEN) {
auto result = tokenizer.decode_id(token, true);
if (result == "<|endoftext|>") break;
std::cout << result << std::flush;
token = forward_next();
}
auto total = get_elapsed(start_time);
std::cout << FYEL("\n\nInference Time: ") << (total + FTL)
<< FYEL(" ms\nToken: ") << cnt << FYEL(" FTL: ") << FTL
<< FYEL(" ms\nRate: ") << (cnt - 1) * 1000.0 / total
<< FYEL(" Token/Sec\n");
}
void WizardCoder::answer(std::string_view input_str, int max_new_length) {
auto prompt = build_prompt(input_str);
auto input_ids = tokenizer.encode(prompt);
stream_generate(input_ids, max_new_length);
}
void WizardCoder::chat() {
while (true) {
std::cout << "\nQuestion: ";
std::string input_str;
std::getline(std::cin, input_str);
if (input_str == "exit") {
break;
}
std::cout << "\nAnswer: " << std::flush;
answer(input_str);
std::cout << std::endl;
}
}
static void split(
const std::string& s,
const std::string& delim,
std::vector<std::string>& ret) {
size_t last = 0;
size_t index = s.find_first_of(delim, last);
while (index != std::string::npos) {
ret.push_back(s.substr(last, index - last));
last = index + 1;
index = s.find_first_of(delim, last);
}
if (last < s.length()) {
ret.push_back(s.substr(last));
}
}
static std::vector<int> parseCascadeDevices(const std::string& str) {
std::vector<int> devices;
std::vector<std::string> sub_str;
split(str, ",", sub_str);
for (auto& s : sub_str) {
devices.push_back(std::atoi(s.c_str()));
}
return devices;
}
void processArguments(
int argc,
char* argv[],
std::string& model_path,
std::string& vocab_path,
std::vector<int>& devices) {
struct option longOptions[] = {
{"model", required_argument, nullptr, 'm'},
{"devid", required_argument, nullptr, 'd'},
{"vocab", required_argument, nullptr, 'v'},
{nullptr, 0, nullptr, 0}};
int optionIndex = 0;
int option;
while ((option = getopt_long(
argc, argv, "m:d:v:", longOptions, &optionIndex)) != -1) {
switch (option) {
case 'm':
model_path = optarg;
break;
case 'd':
devices = parseCascadeDevices(optarg);
break;
case 'v':
vocab_path = optarg;
break;
case '?':
exit(EXIT_FAILURE);
default:
exit(EXIT_FAILURE);
}
}
}
int main(int argc, char** argv) {
printf("Demo for Wizardcoder-15B in BM1684X\n");
std::string model_path = "wizardcoder-15b_int4_1dev.bmodel";
std::string vocab_path = "../vocab/vocab.json";
std::vector<int> devices = {0};
processArguments(argc, argv, model_path, vocab_path, devices);
printf("Init Environment ...\n");
WizardCoder model;
model.init(model_path, vocab_path, devices);
model.chat();
return 0;
}