#pragma once #include "llama.h" #include #include #include #ifdef __GNUC__ #ifdef __MINGW32__ #define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) #else #define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) #endif #else #define LLAMA_ATTRIBUTE_FORMAT(...) #endif // // logging // LLAMA_ATTRIBUTE_FORMAT(2, 3) void llama_log_internal (ggml_log_level level, const char * format, ...); void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data); #define LLAMA_LOG(...) llama_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__) #define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__) #define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__) #define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) #define LLAMA_LOG_DEBUG(...) llama_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__) #define LLAMA_LOG_CONT(...) llama_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__) // // helpers // struct time_meas { time_meas(int64_t & t_acc, bool disable = false) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {} ~time_meas() { if (t_start_us >= 0) { t_acc += ggml_time_us() - t_start_us; } } const int64_t t_start_us; int64_t & t_acc; }; static void replace_all(std::string & s, const std::string & search, const std::string & replace) { if (search.empty()) { return; } std::string builder; builder.reserve(s.length()); size_t pos = 0; size_t last_pos = 0; while ((pos = s.find(search, last_pos)) != std::string::npos) { builder.append(s, last_pos, pos - last_pos); builder.append(replace); last_pos = pos + search.length(); } builder.append(s, last_pos, std::string::npos); s = std::move(builder); } const std::vector> & llama_internal_get_tensor_map( struct llama_context * ctx ); // the ring buffer works similarly to std::deque, but with a fixed capacity template struct ring_buffer { ring_buffer(size_t cap) : capacity(cap), data(cap) {} T & front() { if (sz == 0) { throw std::runtime_error("ring buffer is empty"); } return data[first]; } const T & front() const { if (sz == 0) { throw std::runtime_error("ring buffer is empty"); } return data[first]; } T & back() { if (sz == 0) { throw std::runtime_error("ring buffer is empty"); } return data[pos]; } const T & back() const { if (sz == 0) { throw std::runtime_error("ring buffer is empty"); } return data[pos]; } void push_back(const T & value) { if (capacity == 0) { throw std::runtime_error("ring buffer: capacity is zero"); } if (sz == capacity) { // advance the start when buffer is full first = (first + 1) % capacity; } else { sz++; } data[pos] = value; pos = (pos + 1) % capacity; } T pop_front() { if (sz == 0) { throw std::runtime_error("ring buffer is empty"); } T value = data[first]; first = (first + 1) % capacity; sz--; return value; } //T & operator[](size_t i) { // if (i >= sz) { // throw std::runtime_error("ring buffer: index out of bounds"); // } // return data[(first + i) % capacity]; //} //const T & at(size_t i) const { // if (i >= sz) { // throw std::runtime_error("ring buffer: index out of bounds"); // } // return data[(first + i) % capacity]; //} const T & rat(size_t i) const { if (i >= sz) { throw std::runtime_error("ring buffer: index out of bounds"); } return data[(first + sz - i - 1) % capacity]; } std::vector to_vector() const { std::vector result; result.reserve(sz); for (size_t i = 0; i < sz; i++) { result.push_back(data[(first + i) % capacity]); } return result; } void clear() { // here only reset the status of the buffer sz = 0; first = 0; pos = 0; } bool empty() const { return sz == 0; } size_t size() const { return sz; } size_t capacity = 0; size_t sz = 0; size_t first = 0; size_t pos = 0; std::vector data; };