Spaces:
Runtime error
Runtime error
// Internal header to be included only by llama.cpp. | |
// Contains wrappers around OS interfaces. | |
__attribute__((format(gnu_printf, 1, 2))) | |
__attribute__((format(printf, 1, 2))) | |
static std::string format(const char * fmt, ...) { | |
va_list ap, ap2; | |
va_start(ap, fmt); | |
va_copy(ap2, ap); | |
int size = vsnprintf(NULL, 0, fmt, ap); | |
LLAMA_ASSERT(size >= 0 && size < INT_MAX); | |
std::vector<char> buf(size + 1); | |
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); | |
LLAMA_ASSERT(size2 == size); | |
va_end(ap2); | |
va_end(ap); | |
return std::string(buf.data(), size); | |
} | |
struct llama_file { | |
// use FILE * so we don't have to re-open the file to mmap | |
FILE * fp; | |
size_t size; | |
llama_file(const char * fname, const char * mode) { | |
fp = std::fopen(fname, mode); | |
if (fp == NULL) { | |
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); | |
} | |
seek(0, SEEK_END); | |
size = tell(); | |
seek(0, SEEK_SET); | |
} | |
size_t tell() const { | |
__int64 ret = _ftelli64(fp); | |
long ret = std::ftell(fp); | |
LLAMA_ASSERT(ret != -1); // this really shouldn't fail | |
return (size_t) ret; | |
} | |
void seek(size_t offset, int whence) { | |
int ret = _fseeki64(fp, (__int64) offset, whence); | |
int ret = std::fseek(fp, (long) offset, whence); | |
LLAMA_ASSERT(ret == 0); // same | |
} | |
void read_raw(void * ptr, size_t len) const { | |
if (len == 0) { | |
return; | |
} | |
errno = 0; | |
std::size_t ret = std::fread(ptr, len, 1, fp); | |
if (ferror(fp)) { | |
throw std::runtime_error(format("read error: %s", strerror(errno))); | |
} | |
if (ret != 1) { | |
throw std::runtime_error(std::string("unexpectedly reached end of file")); | |
} | |
} | |
std::uint32_t read_u32() { | |
std::uint32_t ret; | |
read_raw(&ret, sizeof(ret)); | |
return ret; | |
} | |
std::string read_string(std::uint32_t len) { | |
std::vector<char> chars(len); | |
read_raw(chars.data(), len); | |
return std::string(chars.data(), len); | |
} | |
void write_raw(const void * ptr, size_t len) const { | |
if (len == 0) { | |
return; | |
} | |
errno = 0; | |
size_t ret = std::fwrite(ptr, len, 1, fp); | |
if (ret != 1) { | |
throw std::runtime_error(format("write error: %s", strerror(errno))); | |
} | |
} | |
void write_u32(std::uint32_t val) { | |
write_raw(&val, sizeof(val)); | |
} | |
~llama_file() { | |
if (fp) { | |
std::fclose(fp); | |
} | |
} | |
}; | |
static std::string llama_format_win_err(DWORD err) { | |
LPSTR buf; | |
size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, | |
NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL); | |
if (!size) { | |
return "FormatMessageA failed"; | |
} | |
std::string ret(buf, size); | |
LocalFree(buf); | |
return ret; | |
} | |
struct llama_mmap { | |
void * addr; | |
size_t size; | |
llama_mmap(const llama_mmap &) = delete; | |
static constexpr bool SUPPORTED = true; | |
llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) { | |
size = file->size; | |
int fd = fileno(file->fp); | |
int flags = MAP_PRIVATE; | |
// prefetch/readahead impairs performance on NUMA systems | |
if (numa) { prefetch = 0; } | |
if (prefetch) { flags |= MAP_POPULATE; } | |
addr = mmap(NULL, file->size, PROT_READ | PROT_WRITE, flags, fd, 0); | |
if (addr == MAP_FAILED) { | |
throw std::runtime_error(format("mmap failed: %s", strerror(errno))); | |
} | |
if (prefetch > 0) { | |
// Advise the kernel to preload the mapped memory | |
if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) { | |
fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", | |
strerror(errno)); | |
} | |
} | |
if (numa) { | |
// advise the kernel not to use readahead | |
// (because the next page might not belong on the same node) | |
if (madvise(addr, file->size, MADV_RANDOM)) { | |
fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n", | |
strerror(errno)); | |
} | |
} | |
} | |
~llama_mmap() { | |
munmap(addr, size); | |
} | |
static constexpr bool SUPPORTED = true; | |
llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) { | |
(void) numa; | |
size = file->size; | |
HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp)); | |
HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL); | |
DWORD error = GetLastError(); | |
if (hMapping == NULL) { | |
throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str())); | |
} | |
addr = MapViewOfFile(hMapping, FILE_MAP_COPY, 0, 0, 0); | |
error = GetLastError(); | |
CloseHandle(hMapping); | |
if (addr == NULL) { | |
throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str())); | |
} | |
if (prefetch) { | |
// Advise the kernel to preload the mapped memory | |
WIN32_MEMORY_RANGE_ENTRY range; | |
range.VirtualAddress = addr; | |
range.NumberOfBytes = (SIZE_T)size; | |
if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) { | |
fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n", | |
llama_format_win_err(GetLastError()).c_str()); | |
} | |
} | |
printf("\nPrefetchVirtualMemory skipped in failsafe mode."); | |
} | |
~llama_mmap() { | |
if (!UnmapViewOfFile(addr)) { | |
fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n", | |
llama_format_win_err(GetLastError()).c_str()); | |
} | |
} | |
static constexpr bool SUPPORTED = false; | |
llama_mmap(struct llama_file *, bool prefetch = true, bool numa = false) { | |
(void) prefetch; | |
(void) numa; | |
throw std::runtime_error(std::string("mmap not supported")); | |
} | |
}; | |
// Represents some region of memory being locked using mlock or VirtualLock; | |
// will automatically unlock on destruction. | |
struct llama_mlock { | |
void * addr = NULL; | |
size_t size = 0; | |
bool failed_already = false; | |
llama_mlock() {} | |
llama_mlock(const llama_mlock &) = delete; | |
~llama_mlock() { | |
if (size) { | |
raw_unlock(addr, size); | |
} | |
} | |
void init(void * ptr) { | |
LLAMA_ASSERT(addr == NULL && size == 0); | |
addr = ptr; | |
} | |
void grow_to(size_t target_size) { | |
LLAMA_ASSERT(addr); | |
if (failed_already) { | |
return; | |
} | |
size_t granularity = lock_granularity(); | |
target_size = (target_size + granularity - 1) & ~(granularity - 1); | |
if (target_size > size) { | |
if (raw_lock((uint8_t *) addr + size, target_size - size)) { | |
size = target_size; | |
} else { | |
failed_already = true; | |
} | |
} | |
} | |
static constexpr bool SUPPORTED = true; | |
size_t lock_granularity() { | |
return (size_t) sysconf(_SC_PAGESIZE); | |
} | |
bool raw_lock(const void * addr, size_t size) { | |
if (!mlock(addr, size)) { | |
return true; | |
} else { | |
char* errmsg = std::strerror(errno); | |
bool suggest = (errno == ENOMEM); | |
// Check if the resource limit is fine after all | |
struct rlimit lock_limit; | |
if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) | |
suggest = false; | |
if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) | |
suggest = false; | |
fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s", | |
size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : ""); | |
return false; | |
} | |
} | |
void raw_unlock(void * addr, size_t size) { | |
if (munlock(addr, size)) { | |
fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno)); | |
} | |
} | |
static constexpr bool SUPPORTED = true; | |
size_t lock_granularity() { | |
SYSTEM_INFO si; | |
GetSystemInfo(&si); | |
return (size_t) si.dwPageSize; | |
} | |
bool raw_lock(void * ptr, size_t len) { | |
for (int tries = 1; ; tries++) { | |
if (VirtualLock(ptr, len)) { | |
return true; | |
} | |
if (tries == 2) { | |
fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n", | |
len, size, llama_format_win_err(GetLastError()).c_str()); | |
return false; | |
} | |
// It failed but this was only the first try; increase the working | |
// set size and try again. | |
SIZE_T min_ws_size, max_ws_size; | |
if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) { | |
fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n", | |
llama_format_win_err(GetLastError()).c_str()); | |
return false; | |
} | |
// Per MSDN: "The maximum number of pages that a process can lock | |
// is equal to the number of pages in its minimum working set minus | |
// a small overhead." | |
// Hopefully a megabyte is enough overhead: | |
size_t increment = len + 1048576; | |
// The minimum must be <= the maximum, so we need to increase both: | |
min_ws_size += increment; | |
max_ws_size += increment; | |
if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) { | |
fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n", | |
llama_format_win_err(GetLastError()).c_str()); | |
return false; | |
} | |
} | |
} | |
void raw_unlock(void * ptr, size_t len) { | |
if (!VirtualUnlock(ptr, len)) { | |
fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n", | |
llama_format_win_err(GetLastError()).c_str()); | |
} | |
} | |
static constexpr bool SUPPORTED = false; | |
size_t lock_granularity() { | |
return (size_t) 65536; | |
} | |
bool raw_lock(const void * addr, size_t len) { | |
fprintf(stderr, "warning: mlock not supported on this system\n"); | |
return false; | |
} | |
void raw_unlock(const void * addr, size_t len) {} | |
}; | |
// Replacement for std::vector<uint8_t> that doesn't require zero-initialization. | |
struct llama_buffer { | |
uint8_t * addr = NULL; | |
size_t size = 0; | |
llama_buffer() = default; | |
void resize(size_t len) { | |
free(addr); | |
int result = posix_memalign((void **) &addr, getpagesize(), len); | |
if (result == 0) { | |
memset(addr, 0, len); | |
} | |
else { | |
addr = NULL; | |
} | |
delete[] addr; | |
addr = new uint8_t[len]; | |
size = len; | |
} | |
~llama_buffer() { | |
free(addr); | |
delete[] addr; | |
addr = NULL; | |
} | |
// disable copy and move | |
llama_buffer(const llama_buffer&) = delete; | |
llama_buffer(llama_buffer&&) = delete; | |
llama_buffer& operator=(const llama_buffer&) = delete; | |
llama_buffer& operator=(llama_buffer&&) = delete; | |
}; | |
struct llama_ctx_buffer { | |
uint8_t * addr = NULL; | |
bool is_cuda; | |
size_t size = 0; | |
llama_ctx_buffer() = default; | |
void resize(size_t size) { | |
free(); | |
addr = (uint8_t *) ggml_cuda_host_malloc(size); | |
if (addr) { | |
is_cuda = true; | |
} | |
else { | |
// fall back to pageable memory | |
addr = new uint8_t[size]; | |
is_cuda = false; | |
} | |
this->size = size; | |
} | |
void free() { | |
if (addr) { | |
if (is_cuda) { | |
ggml_cuda_host_free(addr); | |
} | |
else { | |
delete[] addr; | |
} | |
} | |
addr = NULL; | |
} | |
~llama_ctx_buffer() { | |
free(); | |
} | |
// disable copy and move | |
llama_ctx_buffer(const llama_ctx_buffer&) = delete; | |
llama_ctx_buffer(llama_ctx_buffer&&) = delete; | |
llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete; | |
llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete; | |
}; | |
typedef llama_buffer llama_ctx_buffer; | |