// Name of this executable; used as the prefix of the usage text and of the
// ": error: ..." diagnostics written to cerr.
63 #define PROG_NAME "run-model"
// Default for --max-examples (maximum number of examples to read).
// NOTE(review): -1 presumably means "no limit" -- confirm against the reader.
65 #define DEFAULT_MAX_EXAMPLES -1
// Default for --max-candidates (maximum number of candidates to read).
// NOTE(review): -1 presumably means "no limit" -- confirm against the reader.
66 #define DEFAULT_MAX_CANDIDATES -1
// Default configuration string for constructing a new Model instance when
// --model-config is not supplied.
67 #define DEFAULT_MODEL_CONFIG "PerceptronModel(name(\"MyPerceptronModel\"))"
// Default for -r: the interval at which the CandidateSetReader reports how
// many candidate sets it has read.
68 #define DEFAULT_REPORTING_INTERVAL 1000
// Default for --compactify-interval: the interval after which feature uid's
// are compactified and unused symbols removed (streaming training only).
69 #define DEFAULT_COMPACTIFY_INTERVAL 10000
// Default for --use-weighted-loss: whether losses on devtest examples are
// weighted by the number of tokens in the reference.
70 #define DEFAULT_USE_WEIGHTED_LOSS true
// Expands the string value of the specified argument using the STR macro
// (STR is defined outside this view; presumably it stringizes its argument).
75 #define XSTR(arg) STR(arg)
80 using namespace reranker;
84 PROG_NAME " --config <master config file>\n",
85 "\t-m|--model-file <model file> [--model-config <model config>]\n"
86 "\t[-t|--train <training input file>+ [-i <input model file>] [--mapper] ]\n",
87 "\t-d|--devtest <devtest input file>+\n",
88 "\t[-o|--output <candidate set output file>]\n",
89 "\t[-h <hyp output file>] [--scores <score output file>]\n",
90 "\t[--train-config <training feature extractor config file>]\n",
91 "\t[--dev-config <devtest feature extractor config file>]\n",
92 "\t[--compactify-feature-uids]\n",
93 "\t[-s|--streaming [--compactify-interval <interval>] ] [-u]\n",
95 "\t[--min-epochs <min epochs>] [--max-epochs <max epochs>]\n",
96 "\t[--max-examples <max num examples>]\n",
97 "\t[--max-candidates <max num candidates>]\n",
98 "\t[-r <reporting interval>] [ --use-weighted-loss[=][true|false] ]\n",
100 "\t<master config file> is a file in the interpreted factory language\n",
101 "\t\tcapable of specifying all options to this executable (see\n",
102 "\t\tconfig/default.infact for an example of the default options)\n",
103 "\t<model file> is the name of the file to which to write out a\n",
104 "\t\tnewly-trained model when training (one or more\n",
105 "\t\t<training input file>'s specified), or the name of a file\n",
106 "\t\tfrom which to load a serialized model when decoding\n",
107 "\t<input model file> is an optional input model file as a starting\n",
108 "\t\tmodel when training\n",
109 "\t<model config> is the optional configuration string for constructing\n",
110 "\t\ta new Model instance\n",
112 "\t<training input file> is the name of a stream of serialized\n",
113 "\t\tCandidateSet instances, or \"-\" for input from standard input\n",
114 "\t--mapper specifies to train a single epoch and output features to\n",
115 "\t\tstandard output\n",
116 "\t<devtest input file> is the name of a stream of serialized\n",
117 "\t\tCandidateSet instances, or \"-\" for input from standard input\n",
118 "\t\t(required unless training in mapper mode)\n",
119 "\t<candidate set output file> is the name of the file to which to output\n",
120 "\t\tcandidate sets that have been scored by the model (in\n",
121 "\t\tdecoding mode)\n",
122 "\t<training feature extractor config file> is the name of a configuration\n",
123 "\t\tfile to be read by the ExecutiveFeatureExtractor instance\n"
124 "\t\textracting features on training examples\n",
125 "\t<devtest feature extractor config file> is the name of a configuration\n",
126 "\t\tfile to be read by the ExecutiveFeatureExtractor instance\n",
127 "\t\textracting features on devtest examples\n",
128 "\t--compactify-feature-uids specifies to re-map all feature uids to the\n",
129 "\t\t[0,n-1] interval, where n is the number of non-zero features\n",
130 "\t--streaming specifies to train in streaming mode (i.e., do not\n",
131 "\t\tread in all training instances into memory)\n",
132 "\t--compactify-interval specifies the interval after which to compactify\n",
133 "\t\tfeature uid's and remove unused symbols (only available when\n",
134 "\t\ttraining in streaming mode; defaults to "
136 "\t-u specifies that the input files are uncompressed\n",
137 "\t--no-base64 specifies not to use base64 encoding/decoding\n",
138 "\t--max-examples specifies the maximum number of examples to read from\n",
140 "\t--max-candidates specifies the maximum number of candidates to read\n",
142 "\t-r specifies the interval at which the CandidateSetReader reports how\n",
143 "\t\tmany candidate sets it has read (defaults to "
145 "\t--use-weighted-loss specifies whether to weight losses on devtest\n",
146 "\t\texamples by the number of tokens in the reference, where, e.g.,\n",
147 "\t\tweighted loss is appropriate for computing WER, but not BLEU\n",
154 int usage_msg_len =
sizeof(
usage_msg)/
sizeof(
const char *);
155 for (
int i = 0; i < usage_msg_len; ++i) {
163 cerr <<
PROG_NAME <<
": error: " << err_msg << endl;
175 shared_ptr<ExecutiveFeatureExtractor> efe,
176 vector<shared_ptr<CandidateSet> > &examples) {
177 bool reset_counters =
true;
178 for (vector<string>::const_iterator file_it = files.begin();
179 file_it != files.end();
181 csr.
Read(*file_it, compressed, use_base64, reset_counters, examples);
183 if (efe.get() != NULL) {
185 for (vector<shared_ptr<CandidateSet> >::iterator it = examples.begin();
186 it != examples.end();
188 efe->Extract(*(*it));
196 string master_config_file;
199 string input_model_file;
201 vector<string> training_files;
202 vector<string> devtest_files;
203 bool mapper_mode =
false;
205 string hyp_output_file;
206 string score_output_file;
207 string training_feature_extractor_config_file;
208 string devtest_feature_extractor_config_file;
209 bool compressed =
true;
210 bool use_base64 =
true;
211 bool streaming =
false;
213 string use_weighted_loss_arg_prefix =
"--use-weighted-loss";
214 size_t use_weighted_loss_arg_prefix_len =
215 use_weighted_loss_arg_prefix.length();
216 bool compactify_feature_uids =
false;
224 shared_ptr<Model> model;
225 shared_ptr<ExecutiveFeatureExtractor> training_efe;
226 shared_ptr<ExecutiveFeatureExtractor> devtest_efe;
232 int master_config_arg_idx = -1;
233 for (
int i = 1; i < argc; ++i) {
234 string arg = argv[i];
235 if (arg ==
"--config") {
237 string(
"no master configuration file specified with ") + arg;
241 master_config_file = argv[++i];
242 master_config_arg_idx = i - 1;
245 if (master_config_file !=
"") {
247 cerr <<
"Reading options from \"" << master_config_file <<
"\"." << endl;
248 i.
Eval(master_config_file);
252 i.
Get(
"model_file", &model_file);
253 i.
Get(
"model", &model);
254 i.
Get(
"mapper_mode", &mapper_mode);
255 i.
Get(
"training_files", &training_files);
256 i.
Get(
"devtest_files", &devtest_files);
257 i.
Get(
"output_file", &output_file);
258 i.
Get(
"hyp_output_file", &hyp_output_file);
259 i.
Get(
"training_efe", &training_efe);
260 i.
Get(
"devtest_efe", &devtest_efe);
261 i.
Get(
"compactify_feature_uids", &compactify_feature_uids);
262 i.
Get(
"compactify_interval", &compactify_interval);
263 i.
Get(
"streaming", &streaming);
264 i.
Get(
"compressed", &compressed);
265 i.
Get(
"use_base64", &use_base64);
266 i.
Get(
"min_epochs", &min_epochs);
267 i.
Get(
"max_epochs", &max_epochs);
268 i.
Get(
"max_examples", &max_examples);
269 i.
Get(
"max_candidates", &max_candidates);
270 i.
Get(
"reporting_interval", &reporting_interval);
271 i.
Get(
"use_weighted_loss", &use_weighted_loss);
275 for (
int i = 1; i < argc; ++i) {
276 if (i == master_config_arg_idx) {
280 string arg = argv[i];
281 if (arg ==
"-m" || arg ==
"-model" || arg ==
"--model") {
282 string err_msg = string(
"no model file specified with ") + arg;
286 model_file = argv[++i];
287 }
else if (arg ==
"-i" || arg ==
"--i") {
288 string err_msg = string(
"no input model file specified with ") + arg;
292 input_model_file = argv[++i];
293 }
else if (arg ==
"-model-config" || arg ==
"--model-config") {
295 string(
"no model configuration string specified with ") + arg;
299 model_config = argv[++i];
300 }
else if (arg ==
"-t" || arg ==
"-train" || arg ==
"--train") {
301 string err_msg = string(
"no input files specified with ") + arg;
307 for ( ; i < argc; ++i) {
308 if (argv[i][0] ==
'-' && strlen(argv[i]) > 1) {
312 training_files.push_back(argv[i]);
314 }
else if (arg ==
"-mapper" || arg ==
"--mapper") {
316 }
else if (arg ==
"-d" || arg ==
"-devtest" || arg ==
"--devtest") {
317 string err_msg = string(
"no input files specified with ") + arg;
323 for ( ; i < argc; ++i) {
324 if (argv[i][0] ==
'-') {
328 devtest_files.push_back(argv[i]);
330 }
else if (arg ==
"-o" || arg ==
"-output" || arg ==
"--output") {
331 string err_msg = string(
"no output file specified with ") + arg;
335 output_file = argv[++i];
336 }
else if (arg ==
"-h") {
338 string(
"no hypothesis output file specified with ") + arg;
342 hyp_output_file = argv[++i];
343 }
else if (arg ==
"-scores" || arg ==
"--scores") {
345 string(
"no score output file specified with ") + arg;
349 score_output_file = argv[++i];
350 }
else if (arg ==
"-train-config" || arg ==
"--train-config") {
352 string(
"no feature extractor config file specified with ") + arg;
356 training_feature_extractor_config_file = argv[++i];
357 }
else if (arg ==
"-dev-config" || arg ==
"--dev-config") {
359 string(
"no feature extractor config file specified with ") + arg;
363 devtest_feature_extractor_config_file = argv[++i];
364 }
else if (arg ==
"-compactify-feature-uids" ||
365 arg ==
"--compactify-feature-uids") {
366 compactify_feature_uids =
true;
367 }
else if (arg ==
"-s" || arg ==
"-streaming" || arg ==
"--streaming") {
369 }
else if (arg ==
"--compactify-interval") {
370 string err_msg = string(
"no interval specified with ") + arg;
374 compactify_interval = atoi(argv[++i]);
375 }
else if (arg ==
"-u") {
377 }
else if (arg ==
"--no-base64") {
379 }
else if (arg ==
"-min-epochs" || arg ==
"--min-epochs") {
380 string err_msg = string(
"no arg specified with ") + arg;
384 min_epochs = atoi(argv[++i]);
385 }
else if (arg ==
"-max-epochs" || arg ==
"--max-epochs") {
386 string err_msg = string(
"no arg specified with ") + arg;
390 max_epochs = atoi(argv[++i]);
391 }
else if (arg ==
"-max-examples" || arg ==
"--max-examples") {
392 string err_msg = string(
"no arg specified with ") + arg;
396 max_examples = atoi(argv[++i]);
397 }
else if (arg ==
"-max-candidates" || arg ==
"--max-candidates") {
398 string err_msg = string(
"no arg specified with ") + arg;
402 max_candidates = atoi(argv[++i]);
403 }
else if (arg ==
"-r") {
404 string err_msg = string(
"no arg specified with ") + arg;
408 reporting_interval = atoi(argv[++i]);
409 }
else if (arg.substr(0, use_weighted_loss_arg_prefix_len) ==
410 use_weighted_loss_arg_prefix) {
411 string use_weighted_loss_str;
412 if (arg.length() > use_weighted_loss_arg_prefix_len &&
413 arg[use_weighted_loss_arg_prefix_len] ==
'=') {
414 use_weighted_loss_str =
415 arg.substr(use_weighted_loss_arg_prefix_len + 1);
418 string(
"no \"true\" or \"false\" arg specified with ") + arg;
422 use_weighted_loss_str = argv[++i];
424 if (use_weighted_loss_str !=
"true" &&
425 use_weighted_loss_str !=
"false") {
426 cerr <<
PROG_NAME <<
": error: must specify \"true\" or \"false\""
427 <<
" with --use-weighted-loss" << endl;
431 if (use_weighted_loss_str !=
"true") {
432 use_weighted_loss =
false;
434 }
else if (arg.size() > 0 && arg[0] ==
'-') {
435 cerr <<
PROG_NAME <<
": error: unrecognized option: " << arg << endl;
441 bool training = training_files.size() > 0;
444 if (model_file ==
"") {
445 cerr <<
PROG_NAME <<
": error: must specify model file" << endl;
449 if (!mapper_mode && devtest_files.size() == 0) {
450 cerr <<
PROG_NAME <<
": error: must specify devtest input files when "
451 <<
"not in mapper mode" << endl;
455 if (output_file !=
"" && training) {
456 cerr <<
PROG_NAME <<
": error: cannot specify output file when training"
461 if (hyp_output_file !=
"" && training) {
463 <<
": error: cannot specify hypothesis output file when training"
468 bool reading_from_stdin =
false;
469 for (vector<string>::const_iterator training_file_it = training_files.begin();
470 training_file_it != training_files.end();
471 ++training_file_it) {
472 if (*training_file_it ==
"-") {
473 reading_from_stdin =
true;
477 if (training_files.size() > 1 && reading_from_stdin) {
478 cerr <<
PROG_NAME <<
": error: cannot read from standard input and "
479 <<
"specify other training files" << endl;
483 if (!training && input_model_file !=
"") {
484 cerr <<
PROG_NAME <<
": error: can only specify <input model file> "
485 <<
"when in training mode" << endl;
491 if (training_feature_extractor_config_file !=
"") {
492 training_efe = ExecutiveFeatureExtractor::InitFromSpec(
493 training_feature_extractor_config_file);
495 if (devtest_feature_extractor_config_file !=
"") {
496 devtest_efe = ExecutiveFeatureExtractor::InitFromSpec(
497 devtest_feature_extractor_config_file);
505 if (!training || input_model_file !=
"") {
508 string model_file_to_load = training ? input_model_file : model_file;
511 model = model_reader.
Read(model_file_to_load, compressed, use_base64);
514 ifstream model_config_is(model_config.c_str());
515 if (model_config_is) {
516 cerr <<
"Reading model config from file \"" << model_config <<
"\"."
526 if (model.get() == NULL) {
531 shared_ptr<ModelProtoWriter> model_writer =
532 proto_writer_factory.
CreateOrDie(model->proto_writer_spec(),
533 "model proto writer");
534 if (model_writer.get() == NULL) {
544 model->set_use_weighted_loss(use_weighted_loss);
545 model->set_min_epochs(min_epochs);
546 model->set_max_epochs(max_epochs);
548 vector<shared_ptr<CandidateSet> > training_examples;
549 vector<shared_ptr<CandidateSet> > devtest_examples;
550 if (!streaming && !mapper_mode) {
551 cerr <<
"Loading devtest examples." << endl;
553 devtest_efe, devtest_examples);
554 if (devtest_examples.size() == 0) {
555 cerr <<
"Could not read any devtest examples. Exiting." << endl;
561 CandidateSetVectorIt;
566 if (training_files.size() > 0) {
567 cerr <<
"Training." << endl;
575 compressed, use_base64);
582 compressed, use_base64);
591 training_efe, training_examples);
592 if (training_examples.size() == 0) {
593 cerr <<
"Could not read any training examples from training files."
594 <<
" Exiting." << endl;
599 training_it =
new CandidateSetVectorIt(training_examples);
600 devtest_it =
new CandidateSetVectorIt(devtest_examples);
607 model->TrainOneEpoch(*training_it);
609 model->Train(*training_it, *devtest_it);
614 if (compactify_feature_uids) {
615 cerr <<
"Compactifying feature uid's...";
617 model->CompactifyFeatureUids();
618 cerr <<
"done." << endl;
622 cerr <<
"Writing out model to file \"" << model_file <<
"\"...";
624 confusion_learning::ModelMessage model_message;
625 model_writer->Write(model.get(), &model_message,
false);
627 ConfusionProtoIO* proto_writer;
629 cerr <<
"Writing ModelMessage (without features) and FeatureMessage "
630 <<
"instances to standard output." << endl;
631 proto_writer =
new ConfusionProtoIO(model_file, ConfusionProtoIO::WRITESTD,
633 cout << ModelInfoReducer::kModelMessageFeatureName <<
"\t";
635 proto_writer =
new ConfusionProtoIO(model_file, ConfusionProtoIO::WRITE,
636 compressed, use_base64);
638 proto_writer->Write(model_message);
640 bool output_best_epoch = !mapper_mode;
641 bool output_key = mapper_mode;
642 model_writer->WriteFeatures(model.get(),
643 *(proto_writer->outputstream()),
645 model->num_training_errors(),
648 cerr <<
"done." << endl;
650 CandidateSetVectorIt devtest_examples_it(devtest_examples);
652 model->Evaluate(devtest_examples_it);
654 if (output_file !=
"") {
657 csw.
Write(devtest_examples, output_file, compressed, use_base64);
659 bool output_hyps = hyp_output_file !=
"";
660 bool output_scores = score_output_file !=
"";
661 if (output_hyps || output_scores) {
662 ofstream hyp_os(hyp_output_file.c_str());
663 ofstream score_os(score_output_file.c_str());
664 devtest_examples_it.Reset();
665 while (devtest_examples_it.HasNext()) {
666 CandidateSet &candidate_set = devtest_examples_it.Next();
672 cand_it != candidate_set.
end();
674 score_os << (*cand_it)->score() <<
"\n";
687 google::protobuf::ShutdownProtobufLibrary();
Provides the reranker::PerceptronModel reranker class.
void ClearStrings()
Invokes CandidateSetProtoReader::ClearStrings on the internal CandidateSetProtoReader instance...
Provides the reranker::Candidate class for representing a candidate hypothesis from an initial model...
An interface specifying iteration over CandidateSet instances, using Java-style semantics (sorry...
Provides the ModelReader class, which can create Model instances from a file.
An implementation of the CandidateSetIterator interface that iterates over CandidateSet instances tha...
A simple class for tokenizing a stream of tokens for the formally specified language used to construc...
shared_ptr< Model > Read(const string &filename, bool compressed, bool use_base64)
const Candidate & GetBestScoring() const
Provides the reranker::Symbols interface as well as the reranker::StaticSymbolTable implementation...
#define DEFAULT_MODEL_CONFIG
const string & raw_data() const
Returns the raw data (typically the sentence) for this candidate.
void set_verbosity(int verbosity)
Sets the verbosity of this reader (mostly for debugging purposes).
void Eval(const string &filename)
Evaluates the statements in the specified text file.
void Write(vector< shared_ptr< CandidateSet > > &examples, const string &filename, bool compressed, bool use_base64)
Writes a stream of CandidateSet instances to the specified file or to standard output.
void read_and_extract_features(const vector< string > &files, CandidateSetReader &csr, bool compressed, bool use_base64, shared_ptr< ExecutiveFeatureExtractor > efe, vector< shared_ptr< CandidateSet > > &examples)
#define XSTR(arg)
Expands the string value of the specified argument using the STR macro.
#define DEFAULT_REPORTING_INTERVAL
An end-of-epoch hook for writing out the best model so far to file after each epoch (if the best mode...
Factory for dynamically created instance of the specified type.
void set_verbosity(int verbosity)
Sets the verbosity of this writer (mostly for debugging purposes).
A class for writing streams of training or test instances, where each training or test instance is a ...
Class for reading streams of training or test instances, where each training or test instance is a re...
Reducer classes for trainer.
void TearDown()
A free-floating function (within the reranker namespace) that frees statically allocated objects...
Provides an interpreter for assigning primitives and Factory-constructible objects to named variables...
A class to hold a set of candidates, either for training or test.
bool check_for_required_arg(int argc, int i, string err_msg)
void Read(const string &filename, bool compressed, bool use_base64, bool reset_counters, vector< shared_ptr< CandidateSet > > &examples)
Reads a stream of CandidateSet instances from the specified file or from standard input...
shared_ptr< T > CreateOrDie(StreamTokenizer &st, Environment *env=NULL)
Dynamically creates an object, whose type and initialization are contained in a specification string...
#define DEFAULT_COMPACTIFY_INTERVAL
Provides an interpreter for assigning primitives and Factory-constructible objects to named variables...
#define DEFAULT_MAX_EXAMPLES
vector< shared_ptr< Candidate > >::const_iterator const_iterator
#define DEFAULT_USE_WEIGHTED_LOSS
An implementation of the CandidateSetIterator interface that is backed by an arbitrary C++ collection...
const_iterator begin() const
int main(int argc, char **argv)
A class for reading streams of training or test instances, where each training or test instance is a ...
Interface for serializer for reranker::Model instances to ModelMessage instances. ...
Class for writing streams of training or test instances, where each training or test instance is a re...
const_iterator end() const
Class to hold a single training instance for a reranker, which is a set of examples, typically the n-best output of some input process, possibly including a gold-standard feature vector.
#define DEFAULT_MAX_CANDIDATES
Knows how to create Model instances that have been serialized to a file.
Reranker model interface.
bool Get(const string &varname, T *value) const
Retrieves the value of the specified variable.