|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include "lm/quantize.hh" |
|
|
|
#include "lm/binary_format.hh" |
|
#include "lm/lm_exception.hh" |
|
#include "util/file.hh" |
|
|
|
#include <algorithm> |
|
#include <numeric> |
|
|
|
namespace lm { |
|
namespace ngram { |
|
|
|
namespace {

// Computes `bins` quantization centers from `values` using equal-population
// buckets.
//
// `values` is sorted ascending in place and then cut into `bins` consecutive
// slices whose end offsets are values.size() * (i + 1) / bins.  Each center is
// the arithmetic mean of its slice.  A slice is empty when there are fewer
// values than bins; an empty slice repeats the previous center, or receives
// -infinity when it is the very first slice.
//
// values:  samples to quantize; reordered (sorted) by this call.
// centers: output array with room for at least `bins` floats.
// bins:    number of centers to write; nothing is written when it is 0.
void MakeBins(std::vector<float> &values, float *centers, uint32_t bins) {
  std::sort(values.begin(), values.end());
  const uint64_t total = values.size();
  std::vector<float>::const_iterator start = values.begin();
  for (uint32_t i = 0; i < bins; ++i) {
    // Widen before adding/multiplying so the boundary is computed in 64 bits.
    const std::vector<float>::const_iterator finish =
        values.begin() + (total * (static_cast<uint64_t>(i) + 1)) / bins;
    if (finish == start) {
      // Zero-length bucket: there is nothing to average, so copy the previous
      // center.  The first bucket has no predecessor and gets -infinity.
      centers[i] = i ? centers[i - 1] : -std::numeric_limits<float>::infinity();
    } else {
      // Sum and divide in double.  A float divisor cannot represent bucket
      // sizes above 2^24 exactly, which would skew the mean of huge buckets.
      const double sum = std::accumulate(start, finish, 0.0);
      centers[i] = static_cast<float>(sum / static_cast<double>(finish - start));
    }
    start = finish;
  }
}

// Version number of the on-disk quantization header.  FinishedLoading stores it
// as the first header byte and UpdateConfigFromBinary rejects files whose first
// header byte differs from it.
const char kSeparatelyQuantizeVersion = 2;

}  // namespace
|
|
|
void SeparatelyQuantize::UpdateConfigFromBinary(const BinaryFormat &file, uint64_t offset, Config &config) { |
|
unsigned char buffer[3]; |
|
file.ReadForConfig(buffer, 3, offset); |
|
char version = buffer[0]; |
|
config.prob_bits = buffer[1]; |
|
config.backoff_bits = buffer[2]; |
|
if (version != kSeparatelyQuantizeVersion) UTIL_THROW(FormatLoadException, "This file has quantization version " << (unsigned)version << " but the code expects version " << (unsigned)kSeparatelyQuantizeVersion); |
|
} |
|
|
|
void SeparatelyQuantize::SetupMemory(void *base, unsigned char order, const Config &config) { |
|
prob_bits_ = config.prob_bits; |
|
backoff_bits_ = config.backoff_bits; |
|
|
|
if (config.prob_bits == 0) UTIL_THROW(ConfigException, "You can't quantize probability to zero"); |
|
if (config.backoff_bits == 0) UTIL_THROW(ConfigException, "You can't quantize backoff to zero"); |
|
if (config.prob_bits > 25) UTIL_THROW(ConfigException, "For efficiency reasons, quantizing probability supports at most 25 bits. Currently you have requested " << static_cast<unsigned>(config.prob_bits) << " bits."); |
|
if (config.backoff_bits > 25) UTIL_THROW(ConfigException, "For efficiency reasons, quantizing backoff supports at most 25 bits. Currently you have requested " << static_cast<unsigned>(config.backoff_bits) << " bits."); |
|
|
|
actual_base_ = static_cast<uint8_t*>(base); |
|
float *start = reinterpret_cast<float*>(actual_base_ + 8); |
|
for (unsigned char i = 0; i < order - 2; ++i) { |
|
tables_[i][0] = Bins(prob_bits_, start); |
|
start += (1ULL << prob_bits_); |
|
tables_[i][1] = Bins(backoff_bits_, start); |
|
start += (1ULL << backoff_bits_); |
|
} |
|
longest_ = tables_[order - 2][0] = Bins(prob_bits_, start); |
|
} |
|
|
|
void SeparatelyQuantize::Train(uint8_t order, std::vector<float> &prob, std::vector<float> &backoff) { |
|
TrainProb(order, prob); |
|
|
|
|
|
float *centers = tables_[order - 2][1].Populate(); |
|
*(centers++) = kNoExtensionBackoff; |
|
*(centers++) = kExtensionBackoff; |
|
MakeBins(backoff, centers, (1ULL << backoff_bits_) - 2); |
|
} |
|
|
|
void SeparatelyQuantize::TrainProb(uint8_t order, std::vector<float> &prob) { |
|
float *centers = tables_[order - 2][0].Populate(); |
|
MakeBins(prob, centers, (1ULL << prob_bits_)); |
|
} |
|
|
|
// Writes the three-byte quantization header at the start of the memory given
// to SetupMemory: byte 0 = kSeparatelyQuantizeVersion, byte 1 = probability
// bits, byte 2 = backoff bits.  This is the layout UpdateConfigFromBinary reads.
void SeparatelyQuantize::FinishedLoading(const Config &config) {
  uint8_t *const header = actual_base_;
  header[0] = kSeparatelyQuantizeVersion;
  header[1] = config.prob_bits;
  header[2] = config.backoff_bits;
}
|
|
|
} |
|
} |
|
|