GNorm2-docker / CRF /encoder.cpp
Steven Tang
Commit all
69fb171
raw
history blame
13.2 kB
//
// CRF++ -- Yet Another CRF toolkit
//
// $Id: encoder.cpp 1601 2007-03-31 09:47:18Z taku $;
//
// Copyright(C) 2005-2007 Taku Kudo <taku@chasen.org>
//
#include <fstream>
#include "param.h"
#include "encoder.h"
#include "timer.h"
#include "tagger.h"
#include "lbfgs.h"
#include "common.h"
#include "feature_index.h"
#include "scoped_ptr.h"
#include "thread.h"
namespace {
bool toLower(std::string *s) {
for (size_t i = 0; i < s->size(); ++i) {
char c = (*s)[i];
if ((c >= 'A') && (c <= 'Z')) {
c += 'a' - 'A';
(*s)[i] = c;
}
}
return true;
}
}
namespace CRFPP {
class CRFEncoderThread: public thread {
public:
TaggerImpl **x;
unsigned short start_i;
unsigned short thread_num;
int zeroone;
int err;
size_t size;
double obj;
std::vector<double> expected;
void run() {
obj = 0.0;
err = zeroone = 0;
std::fill(expected.begin(), expected.end(), 0.0);
for (size_t i = start_i; i < size; i += thread_num) {
obj += x[i]->gradient(&expected[0]);
int error_num = x[i]->eval();
err += error_num;
if (error_num) ++zeroone;
}
}
};
bool runMIRA(const std::vector<TaggerImpl* > &x,
EncoderFeatureIndex *feature_index,
double *alpha,
size_t maxitr,
float C,
double eta,
unsigned short shrinking_size,
unsigned short thread_num) {
std::vector<unsigned char> shrink(x.size());
std::vector<float> upper_bound(x.size());
std::vector<double> expected(feature_index->size());
std::fill(upper_bound.begin(), upper_bound.end(), 0.0);
std::fill(shrink.begin(), shrink.end(), 0);
int converge = 0;
int all = 0;
for (size_t i = 0; i < x.size(); ++i) all += x[i]->size();
for (size_t itr = 0; itr < maxitr; ++itr) {
int zeroone = 0;
int err = 0;
int active_set = 0;
int upper_active_set = 0;
double max_kkt_violation = 0.0;
feature_index->clear();
for (size_t i = 0; i < x.size(); ++i) {
if (shrink[i] >= shrinking_size) continue;
++active_set;
std::fill(expected.begin(), expected.end(), 0.0);
double cost_diff = x[i]->collins(&expected[0]);
int error_num = x[i]->eval();
err += error_num;
if (error_num) ++zeroone;
if (error_num == 0) {
++shrink[i];
} else {
shrink[i] = 0;
double s = 0.0;
for (size_t k = 0; k < expected.size(); ++k)
s += expected[k] * expected[k];
double mu = _max(0.0, (error_num - cost_diff) / s);
if (upper_bound[i] + mu > C) {
mu = C - upper_bound[i];
++upper_active_set;
} else {
max_kkt_violation = _max(error_num - cost_diff,
max_kkt_violation);
}
if (mu > 1e-10) {
upper_bound[i] += mu;
upper_bound[i] = _min(C, upper_bound[i]);
for (size_t k = 0; k < expected.size(); ++k)
alpha[k] += mu * expected[k];
}
}
}
double obj = 0.0;
for (size_t i = 0; i < feature_index->size(); ++i)
obj += alpha[i] * alpha[i];
std::cout << "iter=" << itr
<< " terr=" << 1.0 * err / all
<< " serr=" << 1.0 * zeroone / x.size()
<< " act=" << active_set
<< " uact=" << upper_active_set
<< " obj=" << obj
<< " kkt=" << max_kkt_violation << std::endl;
if (max_kkt_violation <= 0.0) {
std::fill(shrink.begin(), shrink.end(), 0);
converge++;
} else {
converge = 0;
}
if (itr > maxitr || converge == 2) break; // 2 is ad-hoc
}
return true;
}
bool runCRF(const std::vector<TaggerImpl* > &x,
EncoderFeatureIndex *feature_index,
double *alpha,
size_t maxitr,
float C,
double eta,
unsigned short shrinking_size,
unsigned short thread_num,
bool orthant) {
double old_obj = 1e+37;
int converge = 0;
LBFGS lbfgs;
std::vector<CRFEncoderThread> thread(thread_num);
for (size_t i = 0; i < thread_num; i++) {
thread[i].start_i = i;
thread[i].size = x.size();
thread[i].thread_num = thread_num;
thread[i].x = const_cast<TaggerImpl **>(&x[0]);
thread[i].expected.resize(feature_index->size());
}
size_t all = 0;
for (size_t i = 0; i < x.size(); ++i) all += x[i]->size();
for (size_t itr = 0; itr < maxitr; ++itr) {
feature_index->clear();
for (size_t i = 0; i < thread_num; ++i) thread[i].start();
for (size_t i = 0; i < thread_num; ++i) thread[i].join();
for (size_t i = 1; i < thread_num; ++i) {
thread[0].obj += thread[i].obj;
thread[0].err += thread[i].err;
thread[0].zeroone += thread[i].zeroone;
}
for (size_t i = 1; i < thread_num; ++i) {
for (size_t k = 0; k < feature_index->size(); ++k)
thread[0].expected[k] += thread[i].expected[k];
}
size_t num_nonzero = 0;
if (orthant) { // L1
for (size_t k = 0; k < feature_index->size(); ++k) {
thread[0].obj += std::abs(alpha[k] / C);
if (alpha[k] != 0.0) ++num_nonzero;
}
} else {
num_nonzero = feature_index->size();
for (size_t k = 0; k < feature_index->size(); ++k) {
thread[0].obj += (alpha[k] * alpha[k] /(2.0 * C));
thread[0].expected[k] += alpha[k] / C;
}
}
double diff = (itr == 0 ? 1.0 :
std::abs(old_obj - thread[0].obj)/old_obj);
std::cout << "iter=" << itr
<< " terr=" << 1.0 * thread[0].err / all
<< " serr=" << 1.0 * thread[0].zeroone / x.size()
<< " act=" << num_nonzero
<< " obj=" << thread[0].obj
<< " diff=" << diff << std::endl;
old_obj = thread[0].obj;
if (diff < eta)
converge++;
else
converge = 0;
if (itr > maxitr || converge == 3) break; // 3 is ad-hoc
if (lbfgs.optimize(feature_index->size(),
&alpha[0],
thread[0].obj,
&thread[0].expected[0], orthant, C) <= 0)
return false;
}
return true;
}
bool Encoder::convert(const char* textfilename,
const char *binaryfilename) {
EncoderFeatureIndex feature_index(1);
CHECK_FALSE(feature_index.convert(textfilename, binaryfilename))
<< feature_index.what();
return true;
}
bool Encoder::learn(const char *templfile,
const char *trainfile,
const char *modelfile,
bool textmodelfile,
size_t maxitr,
size_t freq,
double eta,
double C,
unsigned short thread_num,
unsigned short shrinking_size,
int algorithm) {
std::cout << COPYRIGHT << std::endl;
CHECK_FALSE(eta > 0.0) << "eta must be > 0.0";
CHECK_FALSE(C >= 0.0) << "C must be >= 0.0";
CHECK_FALSE(shrinking_size >= 1) << "shrinking-size must be >= 1";
CHECK_FALSE(thread_num > 0) << "thread must be > 0";
#ifndef CRFPP_USE_THREAD
CHECK_FALSE(thread_num == 1)
<< "This architecture doesn't support multi-thrading";
#endif
CHECK_FALSE(algorithm == CRF_L2 || algorithm == CRF_L1 ||
(algorithm == MIRA && thread_num == 1))
<< "MIRA doesn't support multi-thrading";
EncoderFeatureIndex feature_index(thread_num);
std::vector<TaggerImpl* > x;
std::cout.setf(std::ios::fixed, std::ios::floatfield);
std::cout.precision(5);
#define WHAT_ERROR(msg) do { \
for (std::vector<TaggerImpl *>::iterator it = x.begin(); \
it != x.end(); ++it) \
delete *it; \
std::cerr << msg << std::endl; \
return false; } while (0)
CHECK_FALSE(feature_index.open(templfile, trainfile))
<< feature_index.what();
{
progress_timer pg;
std::ifstream ifs(trainfile);
CHECK_FALSE(ifs) << "cannot open: " << trainfile;
std::cout << "reading training data: " << std::flush;
size_t line = 0;
while (ifs) {
TaggerImpl *_x = new TaggerImpl();
_x->open(&feature_index);
if (!_x->read(&ifs) || !_x->shrink())
WHAT_ERROR(_x->what());
if (!_x->empty())
x.push_back(_x);
else
delete _x;
_x->set_thread_id(line % thread_num);
if (++line % 100 == 0) std::cout << line << ".. " << std::flush;
}
ifs.close();
std::cout << "\nDone!";
}
feature_index.shrink(freq);
std::vector <double> alpha(feature_index.size()); // parameter
std::fill(alpha.begin(), alpha.end(), 0.0);
feature_index.set_alpha(&alpha[0]);
std::cout << "Number of sentences: " << x.size() << std::endl;
std::cout << "Number of features: " << feature_index.size() << std::endl;
std::cout << "Number of thread(s): " << thread_num << std::endl;
std::cout << "Freq: " << freq << std::endl;
std::cout << "eta: " << eta << std::endl;
std::cout << "C: " << C << std::endl;
std::cout << "shrinking size: " << shrinking_size
<< std::endl;
progress_timer pg;
switch (algorithm) {
case MIRA:
if (!runMIRA(x, &feature_index, &alpha[0],
maxitr, C, eta, shrinking_size, thread_num))
WHAT_ERROR("MIRA execute error");
break;
case CRF_L2:
if (!runCRF(x, &feature_index, &alpha[0],
maxitr, C, eta, shrinking_size, thread_num, false))
WHAT_ERROR("CRF_L2 execute error");
break;
case CRF_L1:
if (!runCRF(x, &feature_index, &alpha[0],
maxitr, C, eta, shrinking_size, thread_num, true))
WHAT_ERROR("CRF_L1 execute error");
break;
}
for (std::vector<TaggerImpl *>::iterator it = x.begin();
it != x.end(); ++it)
delete *it;
if (!feature_index.save(modelfile, textmodelfile))
WHAT_ERROR(feature_index.what());
std::cout << "\nDone!";
return true;
}
}
int crfpp_learn(int argc, char **argv) {
static const CRFPP::Option long_options[] = {
{"freq", 'f', "1", "INT",
"use features that occuer no less than INT(default 1)" },
{"maxiter" , 'm', "100000", "INT",
"set INT for max iterations in LBFGS routine(default 10k)" },
{"cost", 'c', "1.0", "FLOAT",
"set FLOAT for cost parameter(default 1.0)" },
{"eta", 'e', "0.0001", "FLOAT",
"set FLOAT for termination criterion(default 0.0001)" },
{"convert", 'C', 0, 0,
"convert text model to binary model" },
{"textmodel", 't', 0, 0,
"build also text model file for debugging" },
{"algorithm", 'a', "CRF", "(CRF|MIRA)", "select training algorithm" },
{"thread", 'p', "1", "INT", "number of threads(default 1)" },
{"shrinking-size", 'H', "20", "INT",
"set INT for number of iterations variable needs to "
" be optimal before considered for shrinking. (default 20)" },
{"version", 'v', 0, 0, "show the version and exit" },
{"help", 'h', 0, 0, "show this help and exit" },
{0, 0, 0, 0, 0}
};
CRFPP::Param param;
param.open(argc, argv, long_options);
if (!param.help_version()) return 0;
const bool convert = param.get<bool>("convert");
const std::vector<std::string> &rest = param.rest_args();
if (param.get<bool>("help") ||
(convert && rest.size() != 2) || (!convert && rest.size() != 3)) {
std::cout << param.help();
return 0;
}
const size_t freq = param.get<int>("freq");
const size_t maxiter = param.get<int>("maxiter");
const double C = param.get<float>("cost");
const double eta = param.get<float>("eta");
const bool textmodel = param.get<bool>("textmodel");
const unsigned short thread = param.get<unsigned short>("thread");
const unsigned short shrinking_size = param.get<unsigned short>("shrinking-size");
std::string salgo = param.get<std::string>("algorithm");
toLower(&salgo);
int algorithm = CRFPP::Encoder::MIRA;
if (salgo == "crf" || salgo == "crf-l2") {
algorithm = CRFPP::Encoder::CRF_L2;
} else if (salgo == "crf-l1") {
algorithm = CRFPP::Encoder::CRF_L1;
} else if (salgo == "mira") {
algorithm = CRFPP::Encoder::MIRA;
} else {
std::cerr << "unknown alogrithm: " << salgo << std::endl;
return -1;
}
CRFPP::Encoder encoder;
if (convert) {
if (!encoder.convert(rest[0].c_str(), rest[1].c_str())) {
std::cerr << encoder.what() << std::endl;
return -1;
}
} else {
if (!encoder.learn(rest[0].c_str(),
rest[1].c_str(),
rest[2].c_str(),
textmodel,
maxiter, freq, eta, C, thread, shrinking_size,
algorithm)) {
std::cerr << encoder.what() << std::endl;
return -1;
}
}
return 0;
}