|
#ifndef LM_BINARY_FORMAT_H |
|
#define LM_BINARY_FORMAT_H |
|
|
|
#include "lm/config.hh"
#include "lm/model_type.hh"
#include "lm/read_arpa.hh"

#include "util/file_piece.hh"
#include "util/mmap.hh"
#include "util/scoped.hh"

#include <cassert>
#include <cstddef>
#include <string>
#include <vector>

#include <stdint.h>
|
|
|
namespace lm { |
|
namespace ngram { |
|
|
|
/// Table of six C-string pointers; only declared here, the definition lives in
/// another translation unit.
/// NOTE(review): presumably one human-readable name per ModelType value,
/// indexed by the enum -- confirm against the definition.
extern const char *kModelNames[6];
|
|
|
|
|
|
|
|
|
|
|
/// Declared here, defined elsewhere.
/// NOTE(review): judging by the signature, this inspects the file at path
/// `file`, returns whether it is a binary language model, and on success
/// stores the detected type in `recognized` -- confirm against the
/// implementation (including whether `recognized` is left untouched on false).
bool RecognizeBinary(const char *file, ModelType &recognized);
|
|
|
struct FixedWidthParameters { |
|
unsigned char order; |
|
float probing_multiplier; |
|
|
|
ModelType model_type; |
|
|
|
bool has_vocabulary; |
|
unsigned int search_version; |
|
}; |
|
|
|
|
|
/// Rounds `a` up to the nearest multiple of 8; a value that is already a
/// multiple of 8 is returned unchanged (1 -> 8, 8 -> 8, 9 -> 16).
/// The result has type std::ptrdiff_t.  The expansion consists only of casts
/// and integer arithmetic, so it can be used where a constant expression is
/// required as long as `a` is itself constant.
/// Precondition: a >= 1.  For a == 0 the `(a)-1` step either truncates toward
/// zero (signed) or wraps around (unsigned), so the result is not 0.
#define ALIGN8(a) ((std::ptrdiff_t(((a)-1)/8)+1)*8)
|
|
|
|
|
struct Parameters { |
|
FixedWidthParameters fixed; |
|
std::vector<uint64_t> counts; |
|
}; |
|
|
|
class BinaryFormat { |
|
public: |
|
explicit BinaryFormat(const Config &config); |
|
|
|
|
|
|
|
void InitializeBinary(int fd, ModelType model_type, unsigned int search_version, Parameters ¶ms); |
|
|
|
void ReadForConfig(void *to, std::size_t amount, uint64_t offset_excluding_header) const; |
|
|
|
void *LoadBinary(std::size_t size); |
|
|
|
uint64_t VocabStringReadingOffset() const { |
|
assert(vocab_string_offset_ != kInvalidOffset); |
|
return vocab_string_offset_; |
|
} |
|
|
|
|
|
|
|
void *SetupJustVocab(std::size_t memory_size, uint8_t order); |
|
|
|
void *GrowForSearch(std::size_t memory_size, std::size_t vocab_pad, void *&vocab_base); |
|
|
|
void WriteVocabWords(const std::string &buffer, void *&vocab_base, void *&search_base); |
|
|
|
void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts); |
|
|
|
private: |
|
void MapFile(void *&vocab_base, void *&search_base); |
|
|
|
|
|
const Config::WriteMethod write_method_; |
|
const char *write_mmap_; |
|
util::LoadMethod load_method_; |
|
|
|
|
|
util::scoped_fd file_; |
|
|
|
|
|
util::scoped_memory mapping_; |
|
|
|
|
|
|
|
|
|
util::scoped_memory memory_vocab_, memory_search_; |
|
|
|
|
|
|
|
std::size_t header_size_, vocab_size_, vocab_pad_; |
|
|
|
uint64_t vocab_string_offset_; |
|
|
|
static const uint64_t kInvalidOffset = (uint64_t)-1; |
|
}; |
|
|
|
/// Declared here, defined elsewhere.
/// NOTE(review): judging by the name, reports whether the file open on
/// descriptor `fd` is in the binary model format -- confirm against the
/// implementation, including whether the descriptor's read position is
/// affected.
bool IsBinaryFormat(int fd);
|
|
|
} |
|
} |
|
#endif |
|
|