|
#ifndef LM_FILTER_FORMAT_H |
|
#define LM_FILTER_FORMAT_H |
|
|
|
#include "arpa_io.hh" |
|
#include "count_io.hh" |
|
|
|
#include <boost/lexical_cast.hpp> |
|
#include <boost/ptr_container/ptr_vector.hpp> |
|
|
|
#include <iosfwd> |
|
|
|
namespace lm { |
|
|
|
template <class Single> class MultipleOutput { |
|
private: |
|
typedef boost::ptr_vector<Single> Singles; |
|
typedef typename Singles::iterator SinglesIterator; |
|
|
|
public: |
|
MultipleOutput(const char *prefix, size_t number) { |
|
files_.reserve(number); |
|
std::string tmp; |
|
for (unsigned int i = 0; i < number; ++i) { |
|
tmp = prefix; |
|
tmp += boost::lexical_cast<std::string>(i); |
|
files_.push_back(new Single(tmp.c_str())); |
|
} |
|
} |
|
|
|
void AddNGram(const StringPiece &line) { |
|
for (SinglesIterator i = files_.begin(); i != files_.end(); ++i) |
|
i->AddNGram(line); |
|
} |
|
|
|
template <class Iterator> void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line) { |
|
for (SinglesIterator i = files_.begin(); i != files_.end(); ++i) |
|
i->AddNGram(begin, end, line); |
|
} |
|
|
|
void SingleAddNGram(size_t offset, const StringPiece &line) { |
|
files_[offset].AddNGram(line); |
|
} |
|
|
|
template <class Iterator> void SingleAddNGram(size_t offset, const Iterator &begin, const Iterator &end, const StringPiece &line) { |
|
files_[offset].AddNGram(begin, end, line); |
|
} |
|
|
|
protected: |
|
Singles files_; |
|
}; |
|
|
|
/**
 * MultipleOutput specialised for ARPAOutput: in addition to the inherited
 * n-gram forwarding, each member below relays one call to every owned output.
 */
class MultipleARPAOutput : public MultipleOutput<ARPAOutput> {
  private:
    typedef boost::ptr_vector<ARPAOutput>::iterator FileIterator;

  public:
    MultipleARPAOutput(const char *prefix, size_t number)
      : MultipleOutput<ARPAOutput>(prefix, number) {}

    /** Calls ReserveForCounts(reserve) on every output. */
    void ReserveForCounts(std::streampos reserve) {
      FileIterator out = files_.begin();
      while (out != files_.end()) {
        out->ReserveForCounts(reserve);
        ++out;
      }
    }

    /** Calls BeginLength(length) on every output. */
    void BeginLength(unsigned int length) {
      FileIterator out = files_.begin();
      while (out != files_.end()) {
        out->BeginLength(length);
        ++out;
      }
    }

    /** Calls EndLength(length) on every output. */
    void EndLength(unsigned int length) {
      FileIterator out = files_.begin();
      while (out != files_.end()) {
        out->EndLength(length);
        ++out;
      }
    }

    /** Calls Finish() on every output. */
    void Finish() {
      FileIterator out = files_.begin();
      while (out != files_.end()) {
        out->Finish();
        ++out;
      }
    }
};
|
|
|
template <class Filter, class Output> class DispatchInput { |
|
public: |
|
DispatchInput(Filter &filter, Output &output) : filter_(filter), output_(output) {} |
|
|
|
|
|
|
|
|
|
|
|
void AddNGram(const StringPiece &ngram, const StringPiece &line) { |
|
filter_.AddNGram(ngram, line, output_); |
|
} |
|
|
|
protected: |
|
Filter &filter_; |
|
Output &output_; |
|
}; |
|
|
|
/**
 * DispatchInput extended with the ARPA section callbacks.  All callbacks are
 * relayed to the bound output; EndLength additionally flushes the filter
 * before the output is told the section has ended.
 */
template <class Filter, class Output> class DispatchARPAInput : public DispatchInput<Filter, Output> {
  private:
    typedef DispatchInput<Filter, Output> B;

  public:
    DispatchARPAInput(Filter &filter, Output &output) : B(filter, output) {}

    /** Relays to the output's ReserveForCounts. */
    void ReserveForCounts(std::streampos reserve) {
      this->output_.ReserveForCounts(reserve);
    }

    /** Relays to the output's BeginLength. */
    void BeginLength(unsigned int length) {
      this->output_.BeginLength(length);
    }

    /** Flushes the filter first, then relays to the output's EndLength. */
    void EndLength(unsigned int length) {
      this->filter_.Flush();
      this->output_.EndLength(length);
    }

    /** Relays to the output's Finish. */
    void Finish() {
      this->output_.Finish();
    }
};
|
|
|
struct ARPAFormat { |
|
typedef ARPAOutput Output; |
|
typedef MultipleARPAOutput Multiple; |
|
static void Copy(util::FilePiece &in, Output &out) { |
|
ReadARPA(in, out); |
|
} |
|
template <class Filter, class Out> static void RunFilter(util::FilePiece &in, Filter &filter, Out &output) { |
|
DispatchARPAInput<Filter, Out> dispatcher(filter, output); |
|
ReadARPA(in, dispatcher); |
|
} |
|
}; |
|
|
|
struct CountFormat { |
|
typedef CountOutput Output; |
|
typedef MultipleOutput<Output> Multiple; |
|
static void Copy(util::FilePiece &in, Output &out) { |
|
ReadCount(in, out); |
|
} |
|
template <class Filter, class Out> static void RunFilter(util::FilePiece &in, Filter &filter, Out &output) { |
|
DispatchInput<Filter, Out> dispatcher(filter, output); |
|
ReadCount(in, dispatcher); |
|
} |
|
}; |
|
|
|
|
|
|
|
|
|
|
|
class InputBuffer { |
|
public: |
|
InputBuffer() : actual_(0) {} |
|
|
|
void Reserve(size_t size) { lines_.reserve(size); } |
|
|
|
template <class Output> void AddNGram(const StringPiece &ngram, const StringPiece &line, Output &output) { |
|
if (lines_.size() == actual_) lines_.resize(lines_.size() + 1); |
|
|
|
std::string &copied = lines_[actual_].line; |
|
copied.assign(line.data(), line.size()); |
|
lines_[actual_].ngram.set(copied.data() + (ngram.data() - line.data()), ngram.size()); |
|
++actual_; |
|
} |
|
|
|
template <class Filter, class Output> void CallFilter(Filter &filter, Output &output) const { |
|
for (std::vector<Line>::const_iterator i = lines_.begin(); i != lines_.begin() + actual_; ++i) { |
|
filter.AddNGram(i->ngram, i->line, output); |
|
} |
|
} |
|
|
|
void Clear() { actual_ = 0; } |
|
bool Empty() { return actual_ == 0; } |
|
size_t Size() { return actual_; } |
|
|
|
private: |
|
struct Line { |
|
std::string line; |
|
StringPiece ngram; |
|
}; |
|
|
|
size_t actual_; |
|
|
|
std::vector<Line> lines_; |
|
}; |
|
|
|
class BinaryOutputBuffer { |
|
public: |
|
BinaryOutputBuffer() {} |
|
|
|
void Reserve(size_t size) { |
|
lines_.reserve(size); |
|
} |
|
|
|
void AddNGram(const StringPiece &line) { |
|
lines_.push_back(line); |
|
} |
|
|
|
template <class Output> void Flush(Output &output) { |
|
for (std::vector<StringPiece>::const_iterator i = lines_.begin(); i != lines_.end(); ++i) { |
|
output.AddNGram(*i); |
|
} |
|
lines_.clear(); |
|
} |
|
|
|
private: |
|
std::vector<StringPiece> lines_; |
|
}; |
|
|
|
class MultipleOutputBuffer { |
|
public: |
|
MultipleOutputBuffer() : last_(NULL) {} |
|
|
|
void Reserve(size_t size) { |
|
annotated_.reserve(size); |
|
} |
|
|
|
void AddNGram(const StringPiece &line) { |
|
annotated_.resize(annotated_.size() + 1); |
|
annotated_.back().line = line; |
|
} |
|
|
|
void SingleAddNGram(size_t offset, const StringPiece &line) { |
|
if ((line.data() == last_.data()) && (line.length() == last_.length())) { |
|
annotated_.back().systems.push_back(offset); |
|
} else { |
|
annotated_.resize(annotated_.size() + 1); |
|
annotated_.back().systems.push_back(offset); |
|
annotated_.back().line = line; |
|
last_ = line; |
|
} |
|
} |
|
|
|
template <class Output> void Flush(Output &output) { |
|
for (std::vector<Annotated>::const_iterator i = annotated_.begin(); i != annotated_.end(); ++i) { |
|
if (i->systems.empty()) { |
|
output.AddNGram(i->line); |
|
} else { |
|
for (std::vector<size_t>::const_iterator j = i->systems.begin(); j != i->systems.end(); ++j) { |
|
output.SingleAddNGram(*j, i->line); |
|
} |
|
} |
|
} |
|
annotated_.clear(); |
|
} |
|
|
|
private: |
|
struct Annotated { |
|
|
|
|
|
std::vector<size_t> systems; |
|
StringPiece line; |
|
}; |
|
|
|
StringPiece last_; |
|
|
|
std::vector<Annotated> annotated_; |
|
}; |
|
|
|
} |
|
|
|
#endif |
|
|